From f7ec7668a2a452f23f197ee2670a4d799be20ed4 Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Thu, 5 Jun 2025 16:04:37 +0400 Subject: [PATCH 001/124] pageserver, tests: prepare test_basebackup_cache for --timelines-onto-safekeepers (#12143) ## Problem - `test_basebackup_cache` fails in https://github.com/neondatabase/neon/pull/11712 because after the timelines on safekeepers are managed by storage controller, they do contain proper start_lsn and the compute_ctl tool sends the first basebackup request with this LSN. - `Failed to prepare basebackup` log messages during timeline initialization, because the timeline is not yet in the global timeline map. - Relates to https://github.com/neondatabase/cloud/issues/29353 ## Summary of changes - Account for `timeline_onto_safekeepers` storcon's option in the test. - Do not trigger basebackup prepare during the timeline initialization. --- pageserver/src/tenant/timeline.rs | 7 ++++++ test_runner/regress/test_basebackup.py | 33 +++++++++++++++++++------- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 6798606141..3522af2de0 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2506,6 +2506,13 @@ impl Timeline { // Preparing basebackup doesn't make sense for shards other than shard zero. return; } + if !self.is_active() { + // May happen during initial timeline creation. + // Such timeline is not in the global timeline map yet, + // so basebackup cache will not be able to find it. + // TODO(diko): We can prepare such timelines in finish_creation(). + return; + } let res = self .basebackup_prepare_sender diff --git a/test_runner/regress/test_basebackup.py b/test_runner/regress/test_basebackup.py index b083c394c7..2d42be4051 100644 --- a/test_runner/regress/test_basebackup.py +++ b/test_runner/regress/test_basebackup.py @@ -26,6 +26,10 @@ def test_basebackup_cache(neon_env_builder: NeonEnvBuilder): ps = env.pageserver ps_http = ps.http_client() + storcon_managed_timelines = (env.storage_controller_config or {}).get( + "timelines_onto_safekeepers", False + ) + # 1. Check that we always hit the cache after compute restart. for i in range(3): ep.start() @@ -33,15 +37,26 @@ def test_basebackup_cache(neon_env_builder: NeonEnvBuilder): def check_metrics(i=i): metrics = ps_http.get_metrics() - # Never miss. - # The first time compute_ctl sends `get_basebackup` with lsn=None, we do not cache such requests. - # All other requests should be a hit - assert ( - metrics.query_one( - "pageserver_basebackup_cache_read_total", {"result": "miss"} - ).value - == 0 - ) + if storcon_managed_timelines: + # We do not cache the initial basebackup yet, + # so the first compute startup should be a miss. + assert ( + metrics.query_one( + "pageserver_basebackup_cache_read_total", {"result": "miss"} + ).value + == 1 + ) + else: + # If the timeline is not initialized on safekeeprs, + # the compute_ctl sends `get_basebackup` with lsn=None for the first startup. + # We do not use cache for such requests, so it's niether a hit nor a miss. + assert ( + metrics.query_one( + "pageserver_basebackup_cache_read_total", {"result": "miss"} + ).value + == 0 + ) + # All but the first requests are hits. 
assert ( metrics.query_one("pageserver_basebackup_cache_read_total", {"result": "hit"}).value From 6ae4b89000b7dd80e5f950023e4f29c384dc926c Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Thu, 5 Jun 2025 14:17:28 +0200 Subject: [PATCH 002/124] feat(compute_ctl): Implement graceful compute monitor exit (#11911) ## Problem After introducing a naive downtime calculation for the Postgres process inside compute in https://github.com/neondatabase/neon/pull/11346, I noticed that some amount of computes regularly report short downtime. After checking some particular cases, it looks like all of them report downtime close to the end of the life of the compute, i.e., when the control plane calls a `/terminate` and we are waiting for Postgres to exit. Compute monitor also produces a lot of error logs because Postgres stops accepting connections, but it's expected during the termination process. ## Summary of changes Regularly check the compute status inside the main compute monitor loop and exit gracefully when the compute is in some terminal or soon-to-be-terminal state. --------- Co-authored-by: Tristan Partin --- compute_tools/src/monitor.rs | 66 ++++++++++++++++++++++++++++++++---- 1 file changed, 59 insertions(+), 7 deletions(-) diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index 3311ee47b3..bacaf05cd5 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -13,6 +13,12 @@ use crate::metrics::{PG_CURR_DOWNTIME_MS, PG_TOTAL_DOWNTIME_MS}; const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500); +/// Struct to store runtime state of the compute monitor thread. +/// In theory, this could be a part of `Compute`, but i) +/// this state is expected to be accessed only by single thread, +/// so we don't need to care about locking; ii) `Compute` is +/// already quite big. Thus, it seems to be a good idea to keep +/// all the activity/health monitoring parts here. struct ComputeMonitor { compute: Arc, @@ -70,12 +76,36 @@ impl ComputeMonitor { ) } + /// Check if compute is in some terminal or soon-to-be-terminal + /// state, then return `true`, signalling the caller that it + /// should exit gracefully. Otherwise, return `false`. + fn check_interrupts(&mut self) -> bool { + let compute_status = self.compute.get_status(); + if matches!( + compute_status, + ComputeStatus::Terminated | ComputeStatus::TerminationPending | ComputeStatus::Failed + ) { + info!( + "compute is in {} status, stopping compute monitor", + compute_status + ); + return true; + } + + false + } + /// Spin in a loop and figure out the last activity time in the Postgres. - /// Then update it in the shared state. This function never errors out. + /// Then update it in the shared state. This function currently never + /// errors out explicitly, but there is a graceful termination path. + /// Every time we receive an error trying to check Postgres, we use + /// [`ComputeMonitor::check_interrupts()`] because it could be that + /// compute is being terminated already, then we can exit gracefully + /// to not produce errors' noise in the log. /// NB: the only expected panic is at `Mutex` unwrap(), all other errors /// should be handled gracefully. 
#[instrument(skip_all)] - pub fn run(&mut self) { + pub fn run(&mut self) -> anyhow::Result<()> { // Suppose that `connstr` doesn't change let connstr = self.compute.params.connstr.clone(); let conf = self @@ -93,6 +123,10 @@ impl ComputeMonitor { info!("starting compute monitor for {}", connstr); loop { + if self.check_interrupts() { + break; + } + match &mut client { Ok(cli) => { if cli.is_closed() { @@ -100,6 +134,10 @@ impl ComputeMonitor { downtime_info = self.downtime_info(), "connection to Postgres is closed, trying to reconnect" ); + if self.check_interrupts() { + break; + } + self.report_down(); // Connection is closed, reconnect and try again. @@ -111,15 +149,19 @@ impl ComputeMonitor { self.compute.update_last_active(self.last_active); } Err(e) => { + error!( + downtime_info = self.downtime_info(), + "could not check Postgres: {}", e + ); + if self.check_interrupts() { + break; + } + // Although we have many places where we can return errors in `check()`, // normally it shouldn't happen. I.e., we will likely return error if // connection got broken, query timed out, Postgres returned invalid data, etc. // In all such cases it's suspicious, so let's report this as downtime. self.report_down(); - error!( - downtime_info = self.downtime_info(), - "could not check Postgres: {}", e - ); // Reconnect to Postgres just in case. During tests, I noticed // that queries in `check()` can fail with `connection closed`, @@ -136,6 +178,10 @@ impl ComputeMonitor { downtime_info = self.downtime_info(), "could not connect to Postgres: {}, retrying", e ); + if self.check_interrupts() { + break; + } + self.report_down(); // Establish a new connection and try again. @@ -147,6 +193,9 @@ impl ComputeMonitor { self.last_checked = Utc::now(); thread::sleep(MONITOR_CHECK_INTERVAL); } + + // Graceful termination path + Ok(()) } #[instrument(skip_all)] @@ -429,7 +478,10 @@ pub fn launch_monitor(compute: &Arc) -> thread::JoinHandle<()> { .spawn(move || { let span = span!(Level::INFO, "compute_monitor"); let _enter = span.enter(); - monitor.run(); + match monitor.run() { + Ok(_) => info!("compute monitor thread terminated gracefully"), + Err(err) => error!("compute monitor thread terminated abnormally {:?}", err), + } }) .expect("cannot launch compute monitor thread") } From f64eb0cbaf8edf8b9cf83533146295fb0f1a0a45 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Thu, 5 Jun 2025 17:02:38 +0200 Subject: [PATCH 003/124] Remove the Flaky Test computed-columns from postgis v16 (#12132) ## Problem The `computed_columns` test assumes that computed columns are always faster than the request itself. However, this is not always the case on Neon, which can lead to flaky results. ## Summary of changes The `computed_columns` test is excluded from the PostGIS test for PostgreSQL v16, accompanied by related patch refactoring. 
--- .../ext-src/postgis-src/neon-test.sh | 9 ++--- ...de-test.patch => postgis-common-v16.patch} | 16 +++++++++ .../postgis-src/postgis-common-v17.patch | 35 +++++++++++++++++++ .../postgis-src/postgis-regular-v16.patch | 14 +------- .../postgis-src/postgis-regular-v17.patch | 12 +------ .../ext-src/postgis-src/regular-test.sh | 4 +-- 6 files changed, 58 insertions(+), 32 deletions(-) rename docker-compose/ext-src/postgis-src/{postgis-no-upgrade-test.patch => postgis-common-v16.patch} (61%) create mode 100644 docker-compose/ext-src/postgis-src/postgis-common-v17.patch diff --git a/docker-compose/ext-src/postgis-src/neon-test.sh b/docker-compose/ext-src/postgis-src/neon-test.sh index 2866649a1b..13df1ec9d1 100755 --- a/docker-compose/ext-src/postgis-src/neon-test.sh +++ b/docker-compose/ext-src/postgis-src/neon-test.sh @@ -1,9 +1,6 @@ -#!/bin/bash +#!/bin/sh set -ex cd "$(dirname "$0")" -if [[ ${PG_VERSION} = v17 ]]; then - sed -i '/computed_columns/d' regress/core/tests.mk -fi -patch -p1 =" 120),1) +- TESTS += \ +- $(top_srcdir)/regress/core/computed_columns +-endif +- + ifeq ($(shell expr "$(POSTGIS_GEOS_VERSION)" ">=" 30700),1) + # GEOS-3.7 adds: + # ST_FrechetDistance diff --git a/regress/runtest.mk b/regress/runtest.mk index c051f03..010e493 100644 --- a/regress/runtest.mk diff --git a/docker-compose/ext-src/postgis-src/postgis-common-v17.patch b/docker-compose/ext-src/postgis-src/postgis-common-v17.patch new file mode 100644 index 0000000000..0b8978281e --- /dev/null +++ b/docker-compose/ext-src/postgis-src/postgis-common-v17.patch @@ -0,0 +1,35 @@ +diff --git a/regress/core/tests.mk b/regress/core/tests.mk +index 9e05244..90987df 100644 +--- a/regress/core/tests.mk ++++ b/regress/core/tests.mk +@@ -143,8 +143,7 @@ TESTS += \ + $(top_srcdir)/regress/core/oriented_envelope \ + $(top_srcdir)/regress/core/point_coordinates \ + $(top_srcdir)/regress/core/out_geojson \ +- $(top_srcdir)/regress/core/wrapx \ +- $(top_srcdir)/regress/core/computed_columns ++ $(top_srcdir)/regress/core/wrapx + + # Slow slow tests + TESTS_SLOW = \ +diff --git a/regress/runtest.mk b/regress/runtest.mk +index 4b95b7e..449d5a2 100644 +--- a/regress/runtest.mk ++++ b/regress/runtest.mk +@@ -24,16 +24,6 @@ check-regress: + + @POSTGIS_TOP_BUILD_DIR=$(abs_top_builddir) $(PERL) $(top_srcdir)/regress/run_test.pl $(RUNTESTFLAGS) $(RUNTESTFLAGS_INTERNAL) $(TESTS) + +- @if echo "$(RUNTESTFLAGS)" | grep -vq -- --upgrade; then \ +- echo "Running upgrade test as RUNTESTFLAGS did not contain that"; \ +- POSTGIS_TOP_BUILD_DIR=$(abs_top_builddir) $(PERL) $(top_srcdir)/regress/run_test.pl \ +- --upgrade \ +- $(RUNTESTFLAGS) \ +- $(RUNTESTFLAGS_INTERNAL) \ +- $(TESTS); \ +- else \ +- echo "Skipping upgrade test as RUNTESTFLAGS already requested upgrades"; \ +- fi + + check-long: + $(PERL) $(top_srcdir)/regress/run_test.pl $(RUNTESTFLAGS) $(TESTS) $(TESTS_SLOW) diff --git a/docker-compose/ext-src/postgis-src/postgis-regular-v16.patch b/docker-compose/ext-src/postgis-src/postgis-regular-v16.patch index 2fd214c534..e7f01ad288 100644 --- a/docker-compose/ext-src/postgis-src/postgis-regular-v16.patch +++ b/docker-compose/ext-src/postgis-src/postgis-regular-v16.patch @@ -125,7 +125,7 @@ index 7a36b65..ad78fc7 100644 DROP SCHEMA tm CASCADE; + diff --git a/regress/core/tests.mk b/regress/core/tests.mk -index 3abd7bc..94903c3 100644 +index 64a9254..94903c3 100644 --- a/regress/core/tests.mk +++ b/regress/core/tests.mk @@ -23,7 +23,6 @@ current_dir := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -160,18 +160,6 @@ index 3abd7bc..94903c3 
100644 $(top_srcdir)/regress/core/wkb \ $(top_srcdir)/regress/core/wkt \ $(top_srcdir)/regress/core/wmsservers \ -@@ -144,11 +140,6 @@ TESTS_SLOW = \ - $(top_srcdir)/regress/core/concave_hull_hard \ - $(top_srcdir)/regress/core/knn_recheck - --ifeq ($(shell expr "$(POSTGIS_PGSQL_VERSION)" ">=" 120),1) -- TESTS += \ -- $(top_srcdir)/regress/core/computed_columns --endif -- - ifeq ($(shell expr "$(POSTGIS_GEOS_VERSION)" ">=" 30700),1) - # GEOS-3.7 adds: - # ST_FrechetDistance diff --git a/regress/loader/tests.mk b/regress/loader/tests.mk index 1fc77ac..c3cb9de 100644 --- a/regress/loader/tests.mk diff --git a/docker-compose/ext-src/postgis-src/postgis-regular-v17.patch b/docker-compose/ext-src/postgis-src/postgis-regular-v17.patch index f4a9d83478..ae76e559df 100644 --- a/docker-compose/ext-src/postgis-src/postgis-regular-v17.patch +++ b/docker-compose/ext-src/postgis-src/postgis-regular-v17.patch @@ -125,7 +125,7 @@ index 7a36b65..ad78fc7 100644 DROP SCHEMA tm CASCADE; + diff --git a/regress/core/tests.mk b/regress/core/tests.mk -index 9e05244..a63a3e1 100644 +index 90987df..74fe3f1 100644 --- a/regress/core/tests.mk +++ b/regress/core/tests.mk @@ -16,14 +16,13 @@ POSTGIS_PGSQL_VERSION=170 @@ -168,16 +168,6 @@ index 9e05244..a63a3e1 100644 $(top_srcdir)/regress/core/wkb \ $(top_srcdir)/regress/core/wkt \ $(top_srcdir)/regress/core/wmsservers \ -@@ -143,8 +139,7 @@ TESTS += \ - $(top_srcdir)/regress/core/oriented_envelope \ - $(top_srcdir)/regress/core/point_coordinates \ - $(top_srcdir)/regress/core/out_geojson \ -- $(top_srcdir)/regress/core/wrapx \ -- $(top_srcdir)/regress/core/computed_columns -+ $(top_srcdir)/regress/core/wrapx - - # Slow slow tests - TESTS_SLOW = \ diff --git a/regress/loader/tests.mk b/regress/loader/tests.mk index ac4f8ad..4bad4fc 100644 --- a/regress/loader/tests.mk diff --git a/docker-compose/ext-src/postgis-src/regular-test.sh b/docker-compose/ext-src/postgis-src/regular-test.sh index 4b0b929946..1b1683b3f1 100755 --- a/docker-compose/ext-src/postgis-src/regular-test.sh +++ b/docker-compose/ext-src/postgis-src/regular-test.sh @@ -10,8 +10,8 @@ psql -d contrib_regression -c "ALTER DATABASE contrib_regression SET TimeZone='U -c "CREATE EXTENSION postgis_tiger_geocoder CASCADE" \ -c "CREATE EXTENSION postgis_raster SCHEMA public" \ -c "CREATE EXTENSION postgis_sfcgal SCHEMA public" -patch -p1 Date: Thu, 5 Jun 2025 20:53:14 +0200 Subject: [PATCH 004/124] neon_local timeline import: create timelines on safekeepers (#12138) neon_local's timeline import subcommand creates timelines manually, but doesn't create them on the safekeepers. If a test then tries to open an endpoint to read from the timeline, it will error in the new world with `--timelines-onto-safekeepers`. Therefore, if that flag is enabled, create the timelines on the safekeepers. Note that this import functionality is different from the fast import feature (https://github.com/neondatabase/neon/issues/10188, #11801). 
Part of #11670 As well as part of #11712 --- Cargo.lock | 1 + control_plane/Cargo.toml | 1 + control_plane/src/bin/neon_local.rs | 41 ++++++++++++++++++- control_plane/src/pageserver.rs | 12 ++++++ control_plane/src/safekeeper.rs | 63 ++++++++++++----------------- libs/safekeeper_api/src/models.rs | 2 +- safekeeper/client/src/mgmt_api.rs | 10 ++++- test_runner/regress/test_import.py | 3 ++ 8 files changed, 91 insertions(+), 42 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 588a63b6a3..5f71af118c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1445,6 +1445,7 @@ dependencies = [ "regex", "reqwest", "safekeeper_api", + "safekeeper_client", "scopeguard", "serde", "serde_json", diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 62c039047f..bbaa3f12b9 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -36,6 +36,7 @@ pageserver_api.workspace = true pageserver_client.workspace = true postgres_backend.workspace = true safekeeper_api.workspace = true +safekeeper_client.workspace = true postgres_connection.workspace = true storage_broker.workspace = true http-utils.workspace = true diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index ef6985d697..76e33e4bff 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -45,7 +45,7 @@ use pageserver_api::models::{ use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId}; use postgres_backend::AuthType; use postgres_connection::parse_host_port; -use safekeeper_api::membership::SafekeeperGeneration; +use safekeeper_api::membership::{SafekeeperGeneration, SafekeeperId}; use safekeeper_api::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT, @@ -1255,6 +1255,45 @@ async fn handle_timeline(cmd: &TimelineCmd, env: &mut local_env::LocalEnv) -> Re pageserver .timeline_import(tenant_id, timeline_id, base, pg_wal, args.pg_version) .await?; + if env.storage_controller.timelines_onto_safekeepers { + println!("Creating timeline on safekeeper ..."); + let timeline_info = pageserver + .timeline_info( + TenantShardId::unsharded(tenant_id), + timeline_id, + pageserver_client::mgmt_api::ForceAwaitLogicalSize::No, + ) + .await?; + let default_sk = SafekeeperNode::from_env(env, env.safekeepers.first().unwrap()); + let default_host = default_sk + .conf + .listen_addr + .clone() + .unwrap_or_else(|| "localhost".to_string()); + let mconf = safekeeper_api::membership::Configuration { + generation: SafekeeperGeneration::new(1), + members: safekeeper_api::membership::MemberSet { + m: vec![SafekeeperId { + host: default_host, + id: default_sk.conf.id, + pg_port: default_sk.conf.pg_port, + }], + }, + new_members: None, + }; + let pg_version = args.pg_version * 10000; + let req = safekeeper_api::models::TimelineCreateRequest { + tenant_id, + timeline_id, + mconf, + pg_version, + system_id: None, + wal_seg_size: None, + start_lsn: timeline_info.last_record_lsn, + commit_lsn: None, + }; + default_sk.create_timeline(&req).await?; + } env.register_branch_mapping(branch_name.to_string(), tenant_id, timeline_id)?; println!("Done"); } diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 0cf7ca184d..3b7c4ec39f 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -635,4 +635,16 @@ impl PageServerNode { Ok(()) } + pub async fn timeline_info( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + 
force_await_logical_size: mgmt_api::ForceAwaitLogicalSize, + ) -> anyhow::Result { + let timeline_info = self + .http_client + .timeline_info(tenant_shard_id, timeline_id, force_await_logical_size) + .await?; + Ok(timeline_info) + } } diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index eec2c997e6..28d369a315 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -6,7 +6,6 @@ //! .neon/safekeepers/ //! ``` use std::error::Error as _; -use std::future::Future; use std::io::Write; use std::path::PathBuf; use std::time::Duration; @@ -14,9 +13,9 @@ use std::{io, result}; use anyhow::Context; use camino::Utf8PathBuf; -use http_utils::error::HttpErrorBody; use postgres_connection::PgConnectionConfig; -use reqwest::{IntoUrl, Method}; +use safekeeper_api::models::TimelineCreateRequest; +use safekeeper_client::mgmt_api; use thiserror::Error; use utils::auth::{Claims, Scope}; use utils::id::NodeId; @@ -35,25 +34,14 @@ pub enum SafekeeperHttpError { type Result = result::Result; -pub(crate) trait ResponseErrorMessageExt: Sized { - fn error_from_body(self) -> impl Future> + Send; -} - -impl ResponseErrorMessageExt for reqwest::Response { - async fn error_from_body(self) -> Result { - let status = self.status(); - if !(status.is_client_error() || status.is_server_error()) { - return Ok(self); - } - - // reqwest does not export its error construction utility functions, so let's craft the message ourselves - let url = self.url().to_owned(); - Err(SafekeeperHttpError::Response( - match self.json::().await { - Ok(err_body) => format!("Error: {}", err_body.msg), - Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url), - }, - )) +fn err_from_client_err(err: mgmt_api::Error) -> SafekeeperHttpError { + use mgmt_api::Error::*; + match err { + ApiError(_, str) => SafekeeperHttpError::Response(str), + Cancelled => SafekeeperHttpError::Response("Cancelled".to_owned()), + ReceiveBody(err) => SafekeeperHttpError::Transport(err), + ReceiveErrorBody(err) => SafekeeperHttpError::Response(err), + Timeout(str) => SafekeeperHttpError::Response(format!("timeout: {str}")), } } @@ -70,9 +58,8 @@ pub struct SafekeeperNode { pub pg_connection_config: PgConnectionConfig, pub env: LocalEnv, - pub http_client: reqwest::Client, + pub http_client: mgmt_api::Client, pub listen_addr: String, - pub http_base_url: String, } impl SafekeeperNode { @@ -82,13 +69,14 @@ impl SafekeeperNode { } else { "127.0.0.1".to_string() }; + let jwt = None; + let http_base_url = format!("http://{}:{}", listen_addr, conf.http_port); SafekeeperNode { id: conf.id, conf: conf.clone(), pg_connection_config: Self::safekeeper_connection_config(&listen_addr, conf.pg_port), env: env.clone(), - http_client: env.create_http_client(), - http_base_url: format!("http://{}:{}/v1", listen_addr, conf.http_port), + http_client: mgmt_api::Client::new(env.create_http_client(), http_base_url, jwt), listen_addr, } } @@ -278,20 +266,19 @@ impl SafekeeperNode { ) } - fn http_request(&self, method: Method, url: U) -> reqwest::RequestBuilder { - // TODO: authentication - //if self.env.auth_type == AuthType::NeonJWT { - // builder = builder.bearer_auth(&self.env.safekeeper_auth_token) - //} - self.http_client.request(method, url) + pub async fn check_status(&self) -> Result<()> { + self.http_client + .status() + .await + .map_err(err_from_client_err)?; + Ok(()) } - pub async fn check_status(&self) -> Result<()> { - self.http_request(Method::GET, format!("{}/{}", self.http_base_url, "status")) - .send() - 
.await? - .error_from_body() - .await?; + pub async fn create_timeline(&self, req: &TimelineCreateRequest) -> Result<()> { + self.http_client + .create_timeline(req) + .await + .map_err(err_from_client_err)?; Ok(()) } } diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 8658dc4011..fd05f6fda3 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -13,7 +13,7 @@ use utils::pageserver_feedback::PageserverFeedback; use crate::membership::Configuration; use crate::{ServerInfo, Term}; -#[derive(Debug, Serialize)] +#[derive(Debug, Serialize, Deserialize)] pub struct SafekeeperStatus { pub id: NodeId, } diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs index b364ac8e48..2e46a7b529 100644 --- a/safekeeper/client/src/mgmt_api.rs +++ b/safekeeper/client/src/mgmt_api.rs @@ -8,8 +8,8 @@ use std::error::Error as _; use http_utils::error::HttpErrorBody; use reqwest::{IntoUrl, Method, StatusCode}; use safekeeper_api::models::{ - self, PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest, - TimelineStatus, + self, PullTimelineRequest, PullTimelineResponse, SafekeeperStatus, SafekeeperUtilization, + TimelineCreateRequest, TimelineStatus, }; use utils::id::{NodeId, TenantId, TimelineId}; use utils::logging::SecretString; @@ -183,6 +183,12 @@ impl Client { self.get(&uri).await } + pub async fn status(&self) -> Result { + let uri = format!("{}/v1/status", self.mgmt_api_endpoint); + let resp = self.get(&uri).await?; + resp.json().await.map_err(Error::ReceiveBody) + } + pub async fn utilization(&self) -> Result { let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint); let resp = self.get(&uri).await?; diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 55737c35f0..e1070a81e6 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -87,6 +87,9 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build # Set up pageserver for import neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) + neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": True, + } env = neon_env_builder.init_start() env.pageserver.tenant_create(tenant) From b23e75ebfe6b6991cdff94d7ce1f627997db797e Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Fri, 6 Jun 2025 14:50:54 +0800 Subject: [PATCH 005/124] test(pageserver): ensure offload cleans up metrics (#12127) Add a test to ensure timeline metrics are fully cleaned up after offloading. 
Signed-off-by: Alex Chi Z --- test_runner/regress/test_tenants.py | 61 ++++++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index d08692500f..c54dd8b38d 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -12,7 +12,7 @@ from typing import TYPE_CHECKING import pytest import requests -from fixtures.common_types import Lsn, TenantId, TimelineId +from fixtures.common_types import Lsn, TenantId, TimelineArchivalState, TimelineId from fixtures.log_helper import log from fixtures.metrics import ( PAGESERVER_GLOBAL_METRICS, @@ -299,6 +299,65 @@ def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilde assert post_detach_samples == set() +def test_pageserver_metrics_removed_after_offload(neon_env_builder: NeonEnvBuilder): + """Tests that when a timeline is offloaded, the tenant specific metrics are not left behind""" + + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) + + neon_env_builder.num_safekeepers = 3 + + env = neon_env_builder.init_start() + tenant_1, _ = env.create_tenant() + + timeline_1 = env.create_timeline("test_metrics_removed_after_offload_1", tenant_id=tenant_1) + timeline_2 = env.create_timeline("test_metrics_removed_after_offload_2", tenant_id=tenant_1) + + endpoint_tenant1 = env.endpoints.create_start( + "test_metrics_removed_after_offload_1", tenant_id=tenant_1 + ) + endpoint_tenant2 = env.endpoints.create_start( + "test_metrics_removed_after_offload_2", tenant_id=tenant_1 + ) + + for endpoint in [endpoint_tenant1, endpoint_tenant2]: + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + cur.execute("CREATE TABLE t(key int primary key, value text)") + cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") + cur.execute("SELECT sum(key) FROM t") + assert cur.fetchone() == (5000050000,) + endpoint.stop() + + def get_ps_metric_samples_for_timeline( + tenant_id: TenantId, timeline_id: TimelineId + ) -> list[Sample]: + ps_metrics = env.pageserver.http_client().get_metrics() + samples = [] + for metric_name in ps_metrics.metrics: + for sample in ps_metrics.query_all( + name=metric_name, + filter={"tenant_id": str(tenant_id), "timeline_id": str(timeline_id)}, + ): + samples.append(sample) + return samples + + for timeline in [timeline_1, timeline_2]: + pre_offload_samples = set( + [x.name for x in get_ps_metric_samples_for_timeline(tenant_1, timeline)] + ) + assert len(pre_offload_samples) > 0, f"expected at least one sample for {timeline}" + env.pageserver.http_client().timeline_archival_config( + tenant_1, + timeline, + state=TimelineArchivalState.ARCHIVED, + ) + env.pageserver.http_client().timeline_offload(tenant_1, timeline) + post_offload_samples = set( + [x.name for x in get_ps_metric_samples_for_timeline(tenant_1, timeline)] + ) + assert post_offload_samples == set() + + def test_pageserver_with_empty_tenants(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() From fe31baf9859d46f1eb9bf884fe480a1755ab01e9 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Fri, 6 Jun 2025 17:38:58 +0800 Subject: [PATCH 006/124] feat(build): add aws cli into the docker image (#12161) ## Problem Makes it easier to debug AWS permission issues (i.e., storage scrubber) ## Summary of changes Install awscliv2 into the docker image. 
Signed-off-by: Alex Chi Z --- Dockerfile | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/Dockerfile b/Dockerfile index 3b7962dcf9..0b7ef491fd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -110,6 +110,19 @@ RUN set -e \ # System postgres for use with client libraries (e.g. in storage controller) postgresql-15 \ openssl \ + unzip \ + curl \ + && ARCH=$(uname -m) \ + && if [ "$ARCH" = "x86_64" ]; then \ + curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"; \ + elif [ "$ARCH" = "aarch64" ]; then \ + curl "https://awscli.amazonaws.com/awscli-exe-linux-aarch64.zip" -o "awscliv2.zip"; \ + else \ + echo "Unsupported architecture: $ARCH" && exit 1; \ + fi \ + && unzip awscliv2.zip \ + && ./aws/install \ + && rm -rf aws awscliv2.zip \ && rm -f /etc/apt/apt.conf.d/80-retries \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \ && useradd -d /data neon \ From c511786548c8f09048b09a33b0e560fe2e518a5f Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 6 Jun 2025 12:01:58 +0200 Subject: [PATCH 007/124] pageserver: move `spawn_grpc` to `GrpcPageServiceHandler::spawn` (#12147) Mechanical move, no logic changes. --- pageserver/src/bin/pageserver.rs | 3 +- pageserver/src/page_service.rs | 188 ++++++++++++++++--------------- 2 files changed, 97 insertions(+), 94 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index a1a95ad2d1..5cd865f53e 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -23,6 +23,7 @@ use pageserver::deletion_queue::DeletionQueue; use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; use pageserver::feature_resolver::FeatureResolver; use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; +use pageserver::page_service::GrpcPageServiceHandler; use pageserver::task_mgr::{ BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME, }; @@ -814,7 +815,7 @@ fn start_pageserver( // necessary? let mut page_service_grpc = None; if let Some(grpc_listener) = grpc_listener { - page_service_grpc = Some(page_service::spawn_grpc( + page_service_grpc = Some(GrpcPageServiceHandler::spawn( tenant_manager.clone(), grpc_auth, otel_guard.as_ref().map(|g| g.dispatch.clone()), diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 4a1ddf09b5..d47f6bd095 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -169,99 +169,6 @@ pub fn spawn( Listener { cancel, task } } -/// Spawns a gRPC server for the page service. -/// -/// TODO: move this onto GrpcPageServiceHandler::spawn(). -/// TODO: this doesn't support TLS. We need TLS reloading via ReloadingCertificateResolver, so we -/// need to reimplement the TCP+TLS accept loop ourselves. -pub fn spawn_grpc( - tenant_manager: Arc, - auth: Option>, - perf_trace_dispatch: Option, - get_vectored_concurrent_io: GetVectoredConcurrentIo, - listener: std::net::TcpListener, -) -> anyhow::Result { - let cancel = CancellationToken::new(); - let ctx = RequestContextBuilder::new(TaskKind::PageRequestHandler) - .download_behavior(DownloadBehavior::Download) - .perf_span_dispatch(perf_trace_dispatch) - .detached_child(); - let gate = Gate::default(); - - // Set up the TCP socket. We take a preconfigured TcpListener to bind the - // port early during startup. 
- let incoming = { - let _runtime = COMPUTE_REQUEST_RUNTIME.enter(); // required by TcpListener::from_std - listener.set_nonblocking(true)?; - tonic::transport::server::TcpIncoming::from(tokio::net::TcpListener::from_std(listener)?) - .with_nodelay(Some(GRPC_TCP_NODELAY)) - .with_keepalive(Some(GRPC_TCP_KEEPALIVE_TIME)) - }; - - // Set up the gRPC server. - // - // TODO: consider tuning window sizes. - let mut server = tonic::transport::Server::builder() - .http2_keepalive_interval(Some(GRPC_HTTP2_KEEPALIVE_INTERVAL)) - .http2_keepalive_timeout(Some(GRPC_HTTP2_KEEPALIVE_TIMEOUT)) - .max_concurrent_streams(Some(GRPC_MAX_CONCURRENT_STREAMS)); - - // Main page service stack. Uses a mix of Tonic interceptors and Tower layers: - // - // * Interceptors: can inspect and modify the gRPC request. Sync code only, runs before service. - // - // * Layers: allow async code, can run code after the service response. However, only has access - // to the raw HTTP request/response, not the gRPC types. - let page_service_handler = GrpcPageServiceHandler { - tenant_manager, - ctx, - gate_guard: gate.enter().expect("gate was just created"), - get_vectored_concurrent_io, - }; - - let observability_layer = ObservabilityLayer; - let mut tenant_interceptor = TenantMetadataInterceptor; - let mut auth_interceptor = TenantAuthInterceptor::new(auth); - - let page_service = tower::ServiceBuilder::new() - // Create tracing span and record request start time. - .layer(observability_layer) - // Intercept gRPC requests. - .layer(tonic::service::InterceptorLayer::new(move |mut req| { - // Extract tenant metadata. - req = tenant_interceptor.call(req)?; - // Authenticate tenant JWT token. - req = auth_interceptor.call(req)?; - Ok(req) - })) - .service(proto::PageServiceServer::new(page_service_handler)); - let server = server.add_service(page_service); - - // Reflection service for use with e.g. grpcurl. - let reflection_service = tonic_reflection::server::Builder::configure() - .register_encoded_file_descriptor_set(proto::FILE_DESCRIPTOR_SET) - .build_v1()?; - let server = server.add_service(reflection_service); - - // Spawn server task. - let task_cancel = cancel.clone(); - let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( - "grpc listener", - async move { - let result = server - .serve_with_incoming_shutdown(incoming, task_cancel.cancelled()) - .await; - if result.is_ok() { - // TODO: revisit shutdown logic once page service is implemented. - gate.close().await; - } - result - }, - )); - - Ok(CancellableTask { task, cancel }) -} - impl Listener { pub async fn stop_accepting(self) -> Connections { self.cancel.cancel(); @@ -3366,6 +3273,101 @@ pub struct GrpcPageServiceHandler { } impl GrpcPageServiceHandler { + /// Spawns a gRPC server for the page service. + /// + /// TODO: this doesn't support TLS. We need TLS reloading via ReloadingCertificateResolver, so we + /// need to reimplement the TCP+TLS accept loop ourselves. + pub fn spawn( + tenant_manager: Arc, + auth: Option>, + perf_trace_dispatch: Option, + get_vectored_concurrent_io: GetVectoredConcurrentIo, + listener: std::net::TcpListener, + ) -> anyhow::Result { + let cancel = CancellationToken::new(); + let ctx = RequestContextBuilder::new(TaskKind::PageRequestHandler) + .download_behavior(DownloadBehavior::Download) + .perf_span_dispatch(perf_trace_dispatch) + .detached_child(); + let gate = Gate::default(); + + // Set up the TCP socket. We take a preconfigured TcpListener to bind the + // port early during startup. 
+ let incoming = { + let _runtime = COMPUTE_REQUEST_RUNTIME.enter(); // required by TcpListener::from_std + listener.set_nonblocking(true)?; + tonic::transport::server::TcpIncoming::from(tokio::net::TcpListener::from_std( + listener, + )?) + .with_nodelay(Some(GRPC_TCP_NODELAY)) + .with_keepalive(Some(GRPC_TCP_KEEPALIVE_TIME)) + }; + + // Set up the gRPC server. + // + // TODO: consider tuning window sizes. + let mut server = tonic::transport::Server::builder() + .http2_keepalive_interval(Some(GRPC_HTTP2_KEEPALIVE_INTERVAL)) + .http2_keepalive_timeout(Some(GRPC_HTTP2_KEEPALIVE_TIMEOUT)) + .max_concurrent_streams(Some(GRPC_MAX_CONCURRENT_STREAMS)); + + // Main page service stack. Uses a mix of Tonic interceptors and Tower layers: + // + // * Interceptors: can inspect and modify the gRPC request. Sync code only, runs before service. + // + // * Layers: allow async code, can run code after the service response. However, only has access + // to the raw HTTP request/response, not the gRPC types. + let page_service_handler = GrpcPageServiceHandler { + tenant_manager, + ctx, + gate_guard: gate.enter().expect("gate was just created"), + get_vectored_concurrent_io, + }; + + let observability_layer = ObservabilityLayer; + let mut tenant_interceptor = TenantMetadataInterceptor; + let mut auth_interceptor = TenantAuthInterceptor::new(auth); + + let page_service = tower::ServiceBuilder::new() + // Create tracing span and record request start time. + .layer(observability_layer) + // Intercept gRPC requests. + .layer(tonic::service::InterceptorLayer::new(move |mut req| { + // Extract tenant metadata. + req = tenant_interceptor.call(req)?; + // Authenticate tenant JWT token. + req = auth_interceptor.call(req)?; + Ok(req) + })) + // Run the page service. + .service(proto::PageServiceServer::new(page_service_handler)); + let server = server.add_service(page_service); + + // Reflection service for use with e.g. grpcurl. + let reflection_service = tonic_reflection::server::Builder::configure() + .register_encoded_file_descriptor_set(proto::FILE_DESCRIPTOR_SET) + .build_v1()?; + let server = server.add_service(reflection_service); + + // Spawn server task. + let task_cancel = cancel.clone(); + let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( + "grpc listener", + async move { + let result = server + .serve_with_incoming_shutdown(incoming, task_cancel.cancelled()) + .await; + if result.is_ok() { + // TODO: revisit shutdown logic once page service is implemented. + gate.close().await; + } + result + }, + )); + + Ok(CancellableTask { task, cancel }) + } + /// Errors if the request is executed on a non-zero shard. Only shard 0 has a complete view of /// relations and their sizes, as well as SLRU segments and similar data. #[allow(clippy::result_large_err)] From 590301df08b4eb3e8afc7afa7e3a91b6ab5dc420 Mon Sep 17 00:00:00 2001 From: Alexander Sarantcev <99037063+ephemeralsad@users.noreply.github.com> Date: Fri, 6 Jun 2025 14:16:55 +0400 Subject: [PATCH 008/124] storcon: Introduce deletion tombstones to support flaky node scenario (#12096) ## Problem Removed nodes can re-add themselves on restart if not properly tombstoned. We need a mechanism (e.g. soft-delete flag) to prevent this, especially in cases where the node is unreachable. More details there: #12036 ## Summary of changes - Introduced `NodeLifecycle` enum to represent node lifecycle states. - Added a string representation of `NodeLifecycle` to the `nodes` table. - Implemented node removal using a tombstone mechanism. 
- Introduced `/debug/v1/tombstone*` handlers to manage the tombstone state. --- control_plane/storcon_cli/src/main.rs | 41 +++++++ libs/pageserver_api/src/controller_api.rs | 29 +++++ .../down.sql | 1 + .../up.sql | 1 + storage_controller/src/http.rs | 50 ++++++++ storage_controller/src/node.rs | 6 +- storage_controller/src/persistence.rs | 115 +++++++++++++++--- storage_controller/src/schema.rs | 1 + storage_controller/src/service.rs | 57 ++++++++- test_runner/fixtures/neon_fixtures.py | 16 +++ .../regress/test_storage_controller.py | 52 ++++++++ 11 files changed, 345 insertions(+), 24 deletions(-) create mode 100644 storage_controller/migrations/2025-06-01-201442_add_lifecycle_to_nodes/down.sql create mode 100644 storage_controller/migrations/2025-06-01-201442_add_lifecycle_to_nodes/up.sql diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 19c686dcfd..1a9e944e07 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -61,10 +61,16 @@ enum Command { #[arg(long)] scheduling: Option, }, + // Set a node status as deleted. NodeDelete { #[arg(long)] node_id: NodeId, }, + /// Delete a tombstone of node from the storage controller. + NodeDeleteTombstone { + #[arg(long)] + node_id: NodeId, + }, /// Modify a tenant's policies in the storage controller TenantPolicy { #[arg(long)] @@ -82,6 +88,8 @@ enum Command { }, /// List nodes known to the storage controller Nodes {}, + /// List soft deleted nodes known to the storage controller + NodeTombstones {}, /// List tenants known to the storage controller Tenants { /// If this field is set, it will list the tenants on a specific node @@ -900,6 +908,39 @@ async fn main() -> anyhow::Result<()> { .dispatch::<(), ()>(Method::DELETE, format!("control/v1/node/{node_id}"), None) .await?; } + Command::NodeDeleteTombstone { node_id } => { + storcon_client + .dispatch::<(), ()>( + Method::DELETE, + format!("debug/v1/tombstone/{node_id}"), + None, + ) + .await?; + } + Command::NodeTombstones {} => { + let mut resp = storcon_client + .dispatch::<(), Vec>( + Method::GET, + "debug/v1/tombstone".to_string(), + None, + ) + .await?; + + resp.sort_by(|a, b| a.listen_http_addr.cmp(&b.listen_http_addr)); + + let mut table = comfy_table::Table::new(); + table.set_header(["Id", "Hostname", "AZ", "Scheduling", "Availability"]); + for node in resp { + table.add_row([ + format!("{}", node.id), + node.listen_http_addr, + node.availability_zone_id, + format!("{:?}", node.scheduling), + format!("{:?}", node.availability), + ]); + } + println!("{table}"); + } Command::TenantSetTimeBasedEviction { tenant_id, period, diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index c5b49edba0..ae792cc81c 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -344,6 +344,35 @@ impl Default for ShardSchedulingPolicy { } } +#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)] +pub enum NodeLifecycle { + Active, + Deleted, +} + +impl FromStr for NodeLifecycle { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + match s { + "active" => Ok(Self::Active), + "deleted" => Ok(Self::Deleted), + _ => Err(anyhow::anyhow!("Unknown node lifecycle '{s}'")), + } + } +} + +impl From for String { + fn from(value: NodeLifecycle) -> String { + use NodeLifecycle::*; + match value { + Active => "active", + Deleted => "deleted", + } + .to_string() + } +} + #[derive(Serialize, Deserialize, Clone, Copy, 
Eq, PartialEq, Debug)] pub enum NodeSchedulingPolicy { Active, diff --git a/storage_controller/migrations/2025-06-01-201442_add_lifecycle_to_nodes/down.sql b/storage_controller/migrations/2025-06-01-201442_add_lifecycle_to_nodes/down.sql new file mode 100644 index 0000000000..a09acb916b --- /dev/null +++ b/storage_controller/migrations/2025-06-01-201442_add_lifecycle_to_nodes/down.sql @@ -0,0 +1 @@ +ALTER TABLE nodes DROP COLUMN lifecycle; diff --git a/storage_controller/migrations/2025-06-01-201442_add_lifecycle_to_nodes/up.sql b/storage_controller/migrations/2025-06-01-201442_add_lifecycle_to_nodes/up.sql new file mode 100644 index 0000000000..e03a0cadba --- /dev/null +++ b/storage_controller/migrations/2025-06-01-201442_add_lifecycle_to_nodes/up.sql @@ -0,0 +1 @@ +ALTER TABLE nodes ADD COLUMN lifecycle VARCHAR NOT NULL DEFAULT 'active'; diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 2b1c0db12f..705b81077e 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -907,6 +907,42 @@ async fn handle_node_delete(req: Request) -> Result, ApiErr json_response(StatusCode::OK, state.service.node_delete(node_id).await?) } +async fn handle_tombstone_list(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + + let state = get_state(&req); + let mut nodes = state.service.tombstone_list().await?; + nodes.sort_by_key(|n| n.get_id()); + let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::>(); + + json_response(StatusCode::OK, api_nodes) +} + +async fn handle_tombstone_delete(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + + let state = get_state(&req); + let node_id: NodeId = parse_request_param(&req, "node_id")?; + json_response( + StatusCode::OK, + state.service.tombstone_delete(node_id).await?, + ) +} + async fn handle_node_configure(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; @@ -2062,6 +2098,20 @@ pub fn make_router( .post("/debug/v1/node/:node_id/drop", |r| { named_request_span(r, handle_node_drop, RequestName("debug_v1_node_drop")) }) + .delete("/debug/v1/tombstone/:node_id", |r| { + named_request_span( + r, + handle_tombstone_delete, + RequestName("debug_v1_tombstone_delete"), + ) + }) + .get("/debug/v1/tombstone", |r| { + named_request_span( + r, + handle_tombstone_list, + RequestName("debug_v1_tombstone_list"), + ) + }) .post("/debug/v1/tenant/:tenant_id/import", |r| { named_request_span( r, diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index e180c49b43..8e0f1873e5 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -2,7 +2,7 @@ use std::str::FromStr; use std::time::Duration; use pageserver_api::controller_api::{ - AvailabilityZone, NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, + AvailabilityZone, NodeAvailability, NodeDescribeResponse, NodeLifecycle, NodeRegisterRequest, NodeSchedulingPolicy, TenantLocateResponseShard, }; use pageserver_api::shard::TenantShardId; @@ -29,6 +29,7 @@ pub(crate) struct Node { availability: NodeAvailability, scheduling: NodeSchedulingPolicy, + lifecycle: NodeLifecycle, listen_http_addr: String, listen_http_port: u16, @@ 
-228,6 +229,7 @@ impl Node { listen_pg_addr, listen_pg_port, scheduling: NodeSchedulingPolicy::Active, + lifecycle: NodeLifecycle::Active, availability: NodeAvailability::Offline, availability_zone_id, use_https, @@ -239,6 +241,7 @@ impl Node { NodePersistence { node_id: self.id.0 as i64, scheduling_policy: self.scheduling.into(), + lifecycle: self.lifecycle.into(), listen_http_addr: self.listen_http_addr.clone(), listen_http_port: self.listen_http_port as i32, listen_https_port: self.listen_https_port.map(|x| x as i32), @@ -263,6 +266,7 @@ impl Node { availability: NodeAvailability::Offline, scheduling: NodeSchedulingPolicy::from_str(&np.scheduling_policy) .expect("Bad scheduling policy in DB"), + lifecycle: NodeLifecycle::from_str(&np.lifecycle).expect("Bad lifecycle in DB"), listen_http_addr: np.listen_http_addr, listen_http_port: np.listen_http_port as u16, listen_https_port: np.listen_https_port.map(|x| x as u16), diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 052c0f02eb..2edfe3a338 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -19,7 +19,7 @@ use futures::FutureExt; use futures::future::BoxFuture; use itertools::Itertools; use pageserver_api::controller_api::{ - AvailabilityZone, MetadataHealthRecord, NodeSchedulingPolicy, PlacementPolicy, + AvailabilityZone, MetadataHealthRecord, NodeLifecycle, NodeSchedulingPolicy, PlacementPolicy, SafekeeperDescribeResponse, ShardSchedulingPolicy, SkSchedulingPolicy, }; use pageserver_api::models::{ShardImportStatus, TenantConfig}; @@ -102,6 +102,7 @@ pub(crate) enum DatabaseOperation { UpdateNode, DeleteNode, ListNodes, + ListTombstones, BeginShardSplit, CompleteShardSplit, AbortShardSplit, @@ -357,6 +358,8 @@ impl Persistence { } /// When a node is first registered, persist it before using it for anything + /// If the provided node_id already exists, it will be error. + /// The common case is when a node marked for deletion wants to register. pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> { let np = &node.to_persistent(); self.with_measured_conn(DatabaseOperation::InsertNode, move |conn| { @@ -373,19 +376,41 @@ impl Persistence { /// At startup, populate the list of nodes which our shards may be placed on pub(crate) async fn list_nodes(&self) -> DatabaseResult> { - let nodes: Vec = self + use crate::schema::nodes::dsl::*; + + let result: Vec = self .with_measured_conn(DatabaseOperation::ListNodes, move |conn| { Box::pin(async move { Ok(crate::schema::nodes::table + .filter(lifecycle.ne(String::from(NodeLifecycle::Deleted))) .load::(conn) .await?) }) }) .await?; - tracing::info!("list_nodes: loaded {} nodes", nodes.len()); + tracing::info!("list_nodes: loaded {} nodes", result.len()); - Ok(nodes) + Ok(result) + } + + pub(crate) async fn list_tombstones(&self) -> DatabaseResult> { + use crate::schema::nodes::dsl::*; + + let result: Vec = self + .with_measured_conn(DatabaseOperation::ListTombstones, move |conn| { + Box::pin(async move { + Ok(crate::schema::nodes::table + .filter(lifecycle.eq(String::from(NodeLifecycle::Deleted))) + .load::(conn) + .await?) 
+ }) + }) + .await?; + + tracing::info!("list_tombstones: loaded {} nodes", result.len()); + + Ok(result) } pub(crate) async fn update_node( @@ -404,6 +429,7 @@ impl Persistence { Box::pin(async move { let updated = diesel::update(nodes) .filter(node_id.eq(input_node_id.0 as i64)) + .filter(lifecycle.ne(String::from(NodeLifecycle::Deleted))) .set(values) .execute(conn) .await?; @@ -447,6 +473,57 @@ impl Persistence { .await } + /// Tombstone is a special state where the node is not deleted from the database, + /// but it is not available for usage. + /// The main reason for it is to prevent the flaky node to register. + pub(crate) async fn set_tombstone(&self, del_node_id: NodeId) -> DatabaseResult<()> { + use crate::schema::nodes::dsl::*; + self.update_node( + del_node_id, + lifecycle.eq(String::from(NodeLifecycle::Deleted)), + ) + .await + } + + pub(crate) async fn delete_node(&self, del_node_id: NodeId) -> DatabaseResult<()> { + use crate::schema::nodes::dsl::*; + self.with_measured_conn(DatabaseOperation::DeleteNode, move |conn| { + Box::pin(async move { + // You can hard delete a node only if it has a tombstone. + // So we need to check if the node has lifecycle set to deleted. + let node_to_delete = nodes + .filter(node_id.eq(del_node_id.0 as i64)) + .first::(conn) + .await + .optional()?; + + if let Some(np) = node_to_delete { + let lc = NodeLifecycle::from_str(&np.lifecycle).map_err(|e| { + DatabaseError::Logical(format!( + "Node {} has invalid lifecycle: {}", + del_node_id, e + )) + })?; + + if lc != NodeLifecycle::Deleted { + return Err(DatabaseError::Logical(format!( + "Node {} was not soft deleted before, cannot hard delete it", + del_node_id + ))); + } + + diesel::delete(nodes) + .filter(node_id.eq(del_node_id.0 as i64)) + .execute(conn) + .await?; + } + + Ok(()) + }) + }) + .await + } + /// At startup, load the high level state for shards, such as their config + policy. This will /// be enriched at runtime with state discovered on pageservers. /// @@ -543,21 +620,6 @@ impl Persistence { .await } - pub(crate) async fn delete_node(&self, del_node_id: NodeId) -> DatabaseResult<()> { - use crate::schema::nodes::dsl::*; - self.with_measured_conn(DatabaseOperation::DeleteNode, move |conn| { - Box::pin(async move { - diesel::delete(nodes) - .filter(node_id.eq(del_node_id.0 as i64)) - .execute(conn) - .await?; - - Ok(()) - }) - }) - .await - } - /// When a tenant invokes the /re-attach API, this function is responsible for doing an efficient /// batched increment of the generations of all tenants whose generation_pageserver is equal to /// the node that called /re-attach. 
@@ -571,6 +633,20 @@ impl Persistence { let updated = self .with_measured_conn(DatabaseOperation::ReAttach, move |conn| { Box::pin(async move { + // Check if the node is not marked as deleted + let deleted_node: i64 = nodes + .filter(node_id.eq(input_node_id.0 as i64)) + .filter(lifecycle.eq(String::from(NodeLifecycle::Deleted))) + .count() + .get_result(conn) + .await?; + if deleted_node > 0 { + return Err(DatabaseError::Logical(format!( + "Node {} is marked as deleted, re-attach is not allowed", + input_node_id + ))); + } + let rows_updated = diesel::update(tenant_shards) .filter(generation_pageserver.eq(input_node_id.0 as i64)) .set(generation.eq(generation + 1)) @@ -2048,6 +2124,7 @@ pub(crate) struct NodePersistence { pub(crate) listen_pg_port: i32, pub(crate) availability_zone_id: String, pub(crate) listen_https_port: Option, + pub(crate) lifecycle: String, } /// Tenant metadata health status that are stored durably. diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs index 20be9bb5ca..f5807cfcd2 100644 --- a/storage_controller/src/schema.rs +++ b/storage_controller/src/schema.rs @@ -33,6 +33,7 @@ diesel::table! { listen_pg_port -> Int4, availability_zone_id -> Varchar, listen_https_port -> Nullable, + lifecycle -> Varchar, } } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 790797bae2..cb29993e8c 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -166,6 +166,7 @@ enum NodeOperations { Register, Configure, Delete, + DeleteTombstone, } /// The leadership status for the storage controller process. @@ -6909,7 +6910,7 @@ impl Service { /// detaching or deleting it on pageservers. We do not try and re-schedule any /// tenants that were on this node. pub(crate) async fn node_drop(&self, node_id: NodeId) -> Result<(), ApiError> { - self.persistence.delete_node(node_id).await?; + self.persistence.set_tombstone(node_id).await?; let mut locked = self.inner.write().unwrap(); @@ -7033,9 +7034,10 @@ impl Service { // That is safe because in Service::spawn we only use generation_pageserver if it refers to a node // that exists. - // 2. Actually delete the node from the database and from in-memory state + // 2. Actually delete the node from in-memory state and set tombstone to the database + // for preventing the node to register again. tracing::info!("Deleting node from database"); - self.persistence.delete_node(node_id).await?; + self.persistence.set_tombstone(node_id).await?; Ok(()) } @@ -7054,6 +7056,35 @@ impl Service { Ok(nodes) } + pub(crate) async fn tombstone_list(&self) -> Result, ApiError> { + self.persistence + .list_tombstones() + .await? 
+ .into_iter() + .map(|np| Node::from_persistent(np, false)) + .collect::, _>>() + .map_err(ApiError::InternalServerError) + } + + pub(crate) async fn tombstone_delete(&self, node_id: NodeId) -> Result<(), ApiError> { + let _node_lock = trace_exclusive_lock( + &self.node_op_locks, + node_id, + NodeOperations::DeleteTombstone, + ) + .await; + + if matches!(self.get_node(node_id).await, Err(ApiError::NotFound(_))) { + self.persistence.delete_node(node_id).await?; + Ok(()) + } else { + Err(ApiError::Conflict(format!( + "Node {} is in use, consider using tombstone API first", + node_id + ))) + } + } + pub(crate) async fn get_node(&self, node_id: NodeId) -> Result { self.inner .read() @@ -7224,7 +7255,25 @@ impl Service { }; match registration_status { - RegistrationStatus::New => self.persistence.insert_node(&new_node).await?, + RegistrationStatus::New => { + self.persistence.insert_node(&new_node).await.map_err(|e| { + if matches!( + e, + crate::persistence::DatabaseError::Query( + diesel::result::Error::DatabaseError( + diesel::result::DatabaseErrorKind::UniqueViolation, + _, + ) + ) + ) { + // The node can be deleted by tombstone API, and not show up in the list of nodes. + // If you see this error, check tombstones first. + ApiError::Conflict(format!("Node {} is already exists", new_node.get_id())) + } else { + ApiError::from(e) + } + })?; + } RegistrationStatus::NeedUpdate => { self.persistence .update_node_on_registration( diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index db3f080261..5223e34baf 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2054,6 +2054,14 @@ class NeonStorageController(MetricsGetter, LogUtils): headers=self.headers(TokenScope.ADMIN), ) + def tombstone_delete(self, node_id): + log.info(f"tombstone_delete({node_id})") + self.request( + "DELETE", + f"{self.api}/debug/v1/tombstone/{node_id}", + headers=self.headers(TokenScope.ADMIN), + ) + def node_drain(self, node_id): log.info(f"node_drain({node_id})") self.request( @@ -2110,6 +2118,14 @@ class NeonStorageController(MetricsGetter, LogUtils): ) return response.json() + def tombstone_list(self): + response = self.request( + "GET", + f"{self.api}/debug/v1/tombstone", + headers=self.headers(TokenScope.ADMIN), + ) + return response.json() + def tenant_shard_dump(self): """ Debug listing API: dumps the internal map of tenant shards diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 346ef0951d..5e0dd780c3 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -3093,6 +3093,58 @@ def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvB wait_until(reconfigure_node_again) +def test_ps_unavailable_after_delete(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_pageservers = 3 + + env = neon_env_builder.init_start() + + def assert_nodes_count(n: int): + nodes = env.storage_controller.node_list() + assert len(nodes) == n + + # Nodes count must remain the same before deletion + assert_nodes_count(3) + + ps = env.pageservers[0] + env.storage_controller.node_delete(ps.id) + + # After deletion, the node count must be reduced + assert_nodes_count(2) + + # Running pageserver CLI init in a separate thread + with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: + log.info("Restarting tombstoned pageserver...") + ps.stop() + ps_start_fut = executor.submit(lambda: 
ps.start(await_active=False)) + + # After the deleted pageserver restarts, the node count must remain the same + assert_nodes_count(2) + + tombstones = env.storage_controller.tombstone_list() + assert len(tombstones) == 1 and tombstones[0]["id"] == ps.id + + env.storage_controller.tombstone_delete(ps.id) + + tombstones = env.storage_controller.tombstone_list() + assert len(tombstones) == 0 + + # Wait for the pageserver start operation to complete. + # If it fails with an exception, we try restarting the pageserver since the failure + # may be due to the storage controller refusing to register the node. + # However, if we get a TimeoutError that means the pageserver is completely hung, + # which is an unexpected failure mode that we'll let propagate up. + try: + ps_start_fut.result(timeout=20) + except TimeoutError: + raise + except Exception: + log.info("Restarting deleted pageserver...") + ps.restart() + + # Finally, the node can be registered again after the tombstone is deleted + wait_until(lambda: assert_nodes_count(3)) + + def test_storage_controller_timeline_crud_race(neon_env_builder: NeonEnvBuilder): """ The storage controller is meant to handle the case where a timeline CRUD operation races From 4d99b6ff4d1e5ab87f198421bae8bab3948c6b66 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 6 Jun 2025 11:29:55 +0100 Subject: [PATCH 009/124] [proxy] separate compute connect from compute authentication (#12145) ## Problem PGLB/Neonkeeper needs to separate the concerns of connecting to compute and authenticating to compute. Additionally, the code within `connect_to_compute` is rather messy, spending effort on recovering the authentication info after wake_compute. ## Summary of changes Split `ConnCfg` into `ConnectInfo` and `AuthInfo`. `wake_compute` only returns `ConnectInfo` and `AuthInfo` is determined separately from the `handshake`/`authenticate` process. Additionally, `ConnectInfo::connect_raw` is in charge of establishing the TLS connection, and the `postgres_client::Config::connect_raw` is configured to use `NoTls`, which forces it to skip TLS negotiation. This should just work.
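
To illustrate the intended shape of the split, here is a minimal, self-contained sketch with simplified stand-in types (not the actual proxy code; the real `ConnectInfo`/`AuthInfo` definitions are in the diff below):

```rust
// Sketch only: "where to connect" (ConnectInfo) and "how to authenticate" (AuthInfo)
// are carried separately and only combined when a connection is actually established.
#[derive(Clone, Debug)]
struct ConnectInfo {
    host: String,
    port: u16,
    require_tls: bool, // stand-in for SslMode
}

#[derive(Debug)]
struct AuthInfo {
    password: Option<Vec<u8>>,            // e.g. console-redirect password auth
    server_params: Vec<(String, String)>, // startup params applied at connect time
}

impl ConnectInfo {
    // Base config: describes only how to reach the compute node.
    fn to_config(&self) -> Vec<(String, String)> {
        vec![
            ("host".into(), self.host.clone()),
            ("port".into(), self.port.to_string()),
            ("tls".into(), self.require_tls.to_string()),
        ]
    }
}

impl AuthInfo {
    // Layer authentication material and startup params on top of the base config.
    fn enrich(&self, mut cfg: Vec<(String, String)>) -> Vec<(String, String)> {
        if self.password.is_some() {
            cfg.push(("auth".into(), "password".into()));
        }
        cfg.extend(self.server_params.iter().cloned());
        cfg
    }
}

fn main() {
    // wake_compute would yield only a ConnectInfo; the AuthInfo comes out of the
    // handshake/authenticate step and is merged in right before connecting.
    let conn = ConnectInfo {
        host: "compute.local".into(), // hypothetical host
        port: 5432,
        require_tls: true,
    };
    let auth = AuthInfo {
        password: None,
        server_params: vec![("user".into(), "alice".into()), ("database".into(), "db".into())],
    };
    println!("{:?}", auth.enrich(conn.to_config()));
}
```

The real types additionally carry `host_addr` and an `SslMode`, and `ConnectInfo::connect_raw` performs the TCP+TLS handshake itself before handing the stream to `postgres_client` with `NoTls`.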
--- .../proxy/tokio-postgres2/src/cancel_query.rs | 2 +- libs/proxy/tokio-postgres2/src/config.rs | 3 +- libs/proxy/tokio-postgres2/src/connect.rs | 2 +- libs/proxy/tokio-postgres2/src/tls.rs | 4 +- proxy/src/auth/backend/classic.rs | 5 - proxy/src/auth/backend/console_redirect.rs | 59 +++-- proxy/src/auth/backend/local.rs | 10 +- proxy/src/auth/backend/mod.rs | 9 - proxy/src/auth/flow.rs | 7 - proxy/src/cancellation.rs | 7 +- proxy/src/{compute.rs => compute/mod.rs} | 202 ++++++++++-------- proxy/src/compute/tls.rs | 63 ++++++ proxy/src/console_redirect_proxy.rs | 6 +- .../control_plane/client/cplane_proxy_v1.rs | 24 +-- proxy/src/control_plane/client/mock.rs | 46 ++-- proxy/src/control_plane/mod.rs | 30 +-- proxy/src/pglb/connect_compute.rs | 47 ++-- proxy/src/pqproto.rs | 77 ++++--- proxy/src/proxy/mod.rs | 16 +- proxy/src/proxy/retry.rs | 4 +- proxy/src/proxy/tests/mod.rs | 26 ++- proxy/src/serverless/backend.rs | 39 ++-- proxy/src/serverless/conn_pool.rs | 4 +- proxy/src/tls/postgres_rustls.rs | 46 ++-- 24 files changed, 382 insertions(+), 356 deletions(-) rename proxy/src/{compute.rs => compute/mod.rs} (68%) create mode 100644 proxy/src/compute/tls.rs diff --git a/libs/proxy/tokio-postgres2/src/cancel_query.rs b/libs/proxy/tokio-postgres2/src/cancel_query.rs index 0bdad0b554..4c2a5ef50f 100644 --- a/libs/proxy/tokio-postgres2/src/cancel_query.rs +++ b/libs/proxy/tokio-postgres2/src/cancel_query.rs @@ -10,7 +10,7 @@ use crate::{Error, cancel_query_raw, connect_socket}; pub(crate) async fn cancel_query( config: Option, ssl_mode: SslMode, - mut tls: T, + tls: T, process_id: i32, secret_key: i32, ) -> Result<(), Error> diff --git a/libs/proxy/tokio-postgres2/src/config.rs b/libs/proxy/tokio-postgres2/src/config.rs index 978d348741..243a5bc725 100644 --- a/libs/proxy/tokio-postgres2/src/config.rs +++ b/libs/proxy/tokio-postgres2/src/config.rs @@ -17,7 +17,6 @@ use crate::{Client, Connection, Error}; /// TLS configuration. #[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)] -#[non_exhaustive] pub enum SslMode { /// Do not use TLS. Disable, @@ -231,7 +230,7 @@ impl Config { /// Requires the `runtime` Cargo feature (enabled by default). pub async fn connect( &self, - tls: T, + tls: &T, ) -> Result<(Client, Connection), Error> where T: MakeTlsConnect, diff --git a/libs/proxy/tokio-postgres2/src/connect.rs b/libs/proxy/tokio-postgres2/src/connect.rs index 39a0a87c74..f7bc863337 100644 --- a/libs/proxy/tokio-postgres2/src/connect.rs +++ b/libs/proxy/tokio-postgres2/src/connect.rs @@ -13,7 +13,7 @@ use crate::tls::{MakeTlsConnect, TlsConnect}; use crate::{Client, Config, Connection, Error, RawConnection}; pub async fn connect( - mut tls: T, + tls: &T, config: &Config, ) -> Result<(Client, Connection), Error> where diff --git a/libs/proxy/tokio-postgres2/src/tls.rs b/libs/proxy/tokio-postgres2/src/tls.rs index 41b51368ff..f9cbcf4991 100644 --- a/libs/proxy/tokio-postgres2/src/tls.rs +++ b/libs/proxy/tokio-postgres2/src/tls.rs @@ -47,7 +47,7 @@ pub trait MakeTlsConnect { /// Creates a new `TlsConnect`or. /// /// The domain name is provided for certificate verification and SNI. - fn make_tls_connect(&mut self, domain: &str) -> Result; + fn make_tls_connect(&self, domain: &str) -> Result; } /// An asynchronous function wrapping a stream in a TLS session. 
@@ -85,7 +85,7 @@ impl MakeTlsConnect for NoTls { type TlsConnect = NoTls; type Error = NoTlsError; - fn make_tls_connect(&mut self, _: &str) -> Result { + fn make_tls_connect(&self, _: &str) -> Result { Ok(NoTls) } } diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index 8445368740..f35b3ecc05 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -18,11 +18,6 @@ pub(super) async fn authenticate( secret: AuthSecret, ) -> auth::Result { let scram_keys = match secret { - #[cfg(any(test, feature = "testing"))] - AuthSecret::Md5(_) => { - debug!("auth endpoint chooses MD5"); - return Err(auth::AuthError::MalformedPassword("MD5 not supported")); - } AuthSecret::Scram(secret) => { debug!("auth endpoint chooses SCRAM"); diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index c388848926..455d96c90a 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -6,10 +6,9 @@ use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, info_span}; -use super::ComputeCredentialKeys; -use crate::auth::IpPattern; use crate::auth::backend::ComputeUserInfo; use crate::cache::Cached; +use crate::compute::AuthInfo; use crate::config::AuthenticationConfig; use crate::context::RequestContext; use crate::control_plane::client::cplane_proxy_v1; @@ -98,15 +97,11 @@ impl ConsoleRedirectBackend { ctx: &RequestContext, auth_config: &'static AuthenticationConfig, client: &mut PqStream, - ) -> auth::Result<( - ConsoleRedirectNodeInfo, - ComputeUserInfo, - Option>, - )> { + ) -> auth::Result<(ConsoleRedirectNodeInfo, AuthInfo, ComputeUserInfo)> { authenticate(ctx, auth_config, &self.console_uri, client) .await - .map(|(node_info, user_info, ip_allowlist)| { - (ConsoleRedirectNodeInfo(node_info), user_info, ip_allowlist) + .map(|(node_info, auth_info, user_info)| { + (ConsoleRedirectNodeInfo(node_info), auth_info, user_info) }) } } @@ -121,10 +116,6 @@ impl ComputeConnectBackend for ConsoleRedirectNodeInfo { ) -> Result { Ok(Cached::new_uncached(self.0.clone())) } - - fn get_keys(&self) -> &ComputeCredentialKeys { - &ComputeCredentialKeys::None - } } async fn authenticate( @@ -132,7 +123,7 @@ async fn authenticate( auth_config: &'static AuthenticationConfig, link_uri: &reqwest::Url, client: &mut PqStream, -) -> auth::Result<(NodeInfo, ComputeUserInfo, Option>)> { +) -> auth::Result<(NodeInfo, AuthInfo, ComputeUserInfo)> { ctx.set_auth_method(crate::context::AuthMethod::ConsoleRedirect); // registering waiter can fail if we get unlucky with rng. @@ -192,10 +183,24 @@ async fn authenticate( client.write_message(BeMessage::NoticeResponse("Connecting to database.")); - // This config should be self-contained, because we won't - // take username or dbname from client's startup message. - let mut config = compute::ConnCfg::new(db_info.host.to_string(), db_info.port); - config.dbname(&db_info.dbname).user(&db_info.user); + // Backwards compatibility. pg_sni_proxy uses "--" in domain names + // while direct connections do not. Once we migrate to pg_sni_proxy + // everywhere, we can remove this. 
+ let ssl_mode = if db_info.host.contains("--") { + // we need TLS connection with SNI info to properly route it + SslMode::Require + } else { + SslMode::Disable + }; + + let conn_info = compute::ConnectInfo { + host: db_info.host.into(), + port: db_info.port, + ssl_mode, + host_addr: None, + }; + let auth_info = + AuthInfo::for_console_redirect(&db_info.dbname, &db_info.user, db_info.password.as_deref()); let user: RoleName = db_info.user.into(); let user_info = ComputeUserInfo { @@ -209,26 +214,12 @@ async fn authenticate( ctx.set_project(db_info.aux.clone()); info!("woken up a compute node"); - // Backwards compatibility. pg_sni_proxy uses "--" in domain names - // while direct connections do not. Once we migrate to pg_sni_proxy - // everywhere, we can remove this. - if db_info.host.contains("--") { - // we need TLS connection with SNI info to properly route it - config.ssl_mode(SslMode::Require); - } else { - config.ssl_mode(SslMode::Disable); - } - - if let Some(password) = db_info.password { - config.password(password.as_ref()); - } - Ok(( NodeInfo { - config, + conn_info, aux: db_info.aux, }, + auth_info, user_info, - db_info.allowed_ips, )) } diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs index 7a6dceb194..2224f492b8 100644 --- a/proxy/src/auth/backend/local.rs +++ b/proxy/src/auth/backend/local.rs @@ -1,11 +1,12 @@ use std::net::SocketAddr; use arc_swap::ArcSwapOption; +use postgres_client::config::SslMode; use tokio::sync::Semaphore; use super::jwt::{AuthRule, FetchAuthRules}; use crate::auth::backend::jwt::FetchAuthRulesError; -use crate::compute::ConnCfg; +use crate::compute::ConnectInfo; use crate::compute_ctl::ComputeCtlApi; use crate::context::RequestContext; use crate::control_plane::NodeInfo; @@ -29,7 +30,12 @@ impl LocalBackend { api: http::Endpoint::new(compute_ctl, http::new_client()), }, node_info: NodeInfo { - config: ConnCfg::new(postgres_addr.ip().to_string(), postgres_addr.port()), + conn_info: ConnectInfo { + host_addr: Some(postgres_addr.ip()), + host: postgres_addr.ip().to_string().into(), + port: postgres_addr.port(), + ssl_mode: SslMode::Disable, + }, // TODO(conrad): make this better reflect compute info rather than endpoint info. 
aux: MetricsAuxInfo { endpoint_id: EndpointIdTag::get_interner().get_or_intern("local"), diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index f978f655c4..edc1ae06d9 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -168,8 +168,6 @@ impl ComputeUserInfo { #[cfg_attr(test, derive(Debug))] pub(crate) enum ComputeCredentialKeys { - #[cfg(any(test, feature = "testing"))] - Password(Vec), AuthKeys(AuthKeys), JwtPayload(Vec), None, @@ -419,13 +417,6 @@ impl ComputeConnectBackend for Backend<'_, ComputeCredentials> { Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())), } } - - fn get_keys(&self) -> &ComputeCredentialKeys { - match self { - Self::ControlPlane(_, creds) => &creds.keys, - Self::Local(_) => &ComputeCredentialKeys::None, - } - } } #[cfg(test)] diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 8fbc4577e9..c825d5bf4b 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -169,13 +169,6 @@ pub(crate) async fn validate_password_and_exchange( secret: AuthSecret, ) -> super::Result> { match secret { - #[cfg(any(test, feature = "testing"))] - AuthSecret::Md5(_) => { - // test only - Ok(sasl::Outcome::Success(ComputeCredentialKeys::Password( - password.to_owned(), - ))) - } // perform scram authentication as both client and server to validate the keys AuthSecret::Scram(scram_secret) => { let outcome = crate::scram::exchange(pool, endpoint, &scram_secret, password).await?; diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index d26641db46..cce4c1d3a0 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -24,7 +24,6 @@ use crate::pqproto::CancelKeyData; use crate::rate_limiter::LeakyBucketRateLimiter; use crate::redis::keys::KeyPrefix; use crate::redis::kv_ops::RedisKVClient; -use crate::tls::postgres_rustls::MakeRustlsConnect; type IpSubnetKey = IpNet; @@ -497,10 +496,8 @@ impl CancelClosure { ) -> Result<(), CancelError> { let socket = TcpStream::connect(self.socket_addr).await?; - let mut mk_tls = - crate::tls::postgres_rustls::MakeRustlsConnect::new(compute_config.tls.clone()); - let tls = >::make_tls_connect( - &mut mk_tls, + let tls = <_ as MakeTlsConnect>::make_tls_connect( + compute_config, &self.hostname, ) .map_err(|e| CancelError::IO(std::io::Error::other(e.to_string())))?; diff --git a/proxy/src/compute.rs b/proxy/src/compute/mod.rs similarity index 68% rename from proxy/src/compute.rs rename to proxy/src/compute/mod.rs index 2899f25129..0dacd15547 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute/mod.rs @@ -1,21 +1,24 @@ +mod tls; + use std::fmt::Debug; use std::io; -use std::net::SocketAddr; -use std::time::Duration; +use std::net::{IpAddr, SocketAddr}; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; +use postgres_client::config::{AuthKeys, SslMode}; +use postgres_client::maybe_tls_stream::MaybeTlsStream; use postgres_client::tls::MakeTlsConnect; -use postgres_client::{CancelToken, RawConnection}; +use postgres_client::{CancelToken, NoTls, RawConnection}; use postgres_protocol::message::backend::NoticeResponseBody; -use rustls::pki_types::InvalidDnsNameError; use thiserror::Error; use tokio::net::{TcpStream, lookup_host}; use tracing::{debug, error, info, warn}; -use crate::auth::backend::ComputeUserInfo; +use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; use crate::auth::parse_endpoint_param; use crate::cancellation::CancelClosure; +use crate::compute::tls::TlsError; use 
crate::config::ComputeConfig; use crate::context::RequestContext; use crate::control_plane::client::ApiLockError; @@ -25,7 +28,6 @@ use crate::error::{ReportableError, UserFacingError}; use crate::metrics::{Metrics, NumDbConnectionsGuard}; use crate::pqproto::StartupMessageParams; use crate::proxy::neon_option; -use crate::tls::postgres_rustls::MakeRustlsConnect; use crate::types::Host; pub const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node"; @@ -38,10 +40,7 @@ pub(crate) enum ConnectionError { Postgres(#[from] postgres_client::Error), #[error("{COULD_NOT_CONNECT}: {0}")] - CouldNotConnect(#[from] io::Error), - - #[error("{COULD_NOT_CONNECT}: {0}")] - TlsError(#[from] InvalidDnsNameError), + TlsError(#[from] TlsError), #[error("{COULD_NOT_CONNECT}: {0}")] WakeComputeError(#[from] WakeComputeError), @@ -73,7 +72,7 @@ impl UserFacingError for ConnectionError { ConnectionError::TooManyConnectionAttempts(_) => { "Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned() } - _ => COULD_NOT_CONNECT.to_owned(), + ConnectionError::TlsError(_) => COULD_NOT_CONNECT.to_owned(), } } } @@ -85,7 +84,6 @@ impl ReportableError for ConnectionError { crate::error::ErrorKind::Postgres } ConnectionError::Postgres(_) => crate::error::ErrorKind::Compute, - ConnectionError::CouldNotConnect(_) => crate::error::ErrorKind::Compute, ConnectionError::TlsError(_) => crate::error::ErrorKind::Compute, ConnectionError::WakeComputeError(e) => e.get_error_kind(), ConnectionError::TooManyConnectionAttempts(e) => e.get_error_kind(), @@ -96,34 +94,85 @@ impl ReportableError for ConnectionError { /// A pair of `ClientKey` & `ServerKey` for `SCRAM-SHA-256`. pub(crate) type ScramKeys = postgres_client::config::ScramKeys<32>; -/// A config for establishing a connection to compute node. -/// Eventually, `postgres_client` will be replaced with something better. -/// Newtype allows us to implement methods on top of it. #[derive(Clone)] -pub(crate) struct ConnCfg(Box); +pub enum Auth { + /// Only used during console-redirect. + Password(Vec), + /// Used by sql-over-http, ws, tcp. + Scram(Box), +} + +/// A config for authenticating to the compute node. +pub(crate) struct AuthInfo { + /// None for local-proxy, as we use trust-based localhost auth. + /// Some for sql-over-http, ws, tcp, and in most cases for console-redirect. + /// Might be None for console-redirect, but that's only a consequence of testing environments ATM. + auth: Option, + server_params: StartupMessageParams, + + /// Console redirect sets user and database, we shouldn't re-use those from the params. + skip_db_user: bool, +} + +/// Contains only the data needed to establish a secure connection to compute. +#[derive(Clone)] +pub struct ConnectInfo { + pub host_addr: Option, + pub host: Host, + pub port: u16, + pub ssl_mode: SslMode, +} /// Creation and initialization routines. -impl ConnCfg { - pub(crate) fn new(host: String, port: u16) -> Self { - Self(Box::new(postgres_client::Config::new(host, port))) - } - - /// Reuse password or auth keys from the other config. 
- pub(crate) fn reuse_password(&mut self, other: Self) { - if let Some(password) = other.get_password() { - self.password(password); - } - - if let Some(keys) = other.get_auth_keys() { - self.auth_keys(keys); +impl AuthInfo { + pub(crate) fn for_console_redirect(db: &str, user: &str, pw: Option<&str>) -> Self { + let mut server_params = StartupMessageParams::default(); + server_params.insert("database", db); + server_params.insert("user", user); + Self { + auth: pw.map(|pw| Auth::Password(pw.as_bytes().to_owned())), + server_params, + skip_db_user: true, } } - pub(crate) fn get_host(&self) -> Host { - match self.0.get_host() { - postgres_client::config::Host::Tcp(s) => s.into(), + pub(crate) fn with_auth_keys(keys: &ComputeCredentialKeys) -> Self { + Self { + auth: match keys { + ComputeCredentialKeys::AuthKeys(AuthKeys::ScramSha256(auth_keys)) => { + Some(Auth::Scram(Box::new(*auth_keys))) + } + ComputeCredentialKeys::JwtPayload(_) | ComputeCredentialKeys::None => None, + }, + server_params: StartupMessageParams::default(), + skip_db_user: false, } } +} + +impl ConnectInfo { + pub fn to_postgres_client_config(&self) -> postgres_client::Config { + let mut config = postgres_client::Config::new(self.host.to_string(), self.port); + config.ssl_mode(self.ssl_mode); + if let Some(host_addr) = self.host_addr { + config.set_host_addr(host_addr); + } + config + } +} + +impl AuthInfo { + fn enrich(&self, mut config: postgres_client::Config) -> postgres_client::Config { + match &self.auth { + Some(Auth::Scram(keys)) => config.auth_keys(AuthKeys::ScramSha256(**keys)), + Some(Auth::Password(pw)) => config.password(pw), + None => &mut config, + }; + for (k, v) in self.server_params.iter() { + config.set_param(k, v); + } + config + } /// Apply startup message params to the connection config. pub(crate) fn set_startup_params( @@ -132,27 +181,26 @@ impl ConnCfg { arbitrary_params: bool, ) { if !arbitrary_params { - self.set_param("client_encoding", "UTF8"); + self.server_params.insert("client_encoding", "UTF8"); } for (k, v) in params.iter() { match k { // Only set `user` if it's not present in the config. // Console redirect auth flow takes username from the console's response. - "user" if self.user_is_set() => {} - "database" if self.db_is_set() => {} + "user" | "database" if self.skip_db_user => {} "options" => { if let Some(options) = filtered_options(v) { - self.set_param(k, &options); + self.server_params.insert(k, &options); } } "user" | "database" | "application_name" | "replication" => { - self.set_param(k, v); + self.server_params.insert(k, v); } // if we allow arbitrary params, then we forward them through. // this is a flag for a period of backwards compatibility k if arbitrary_params => { - self.set_param(k, v); + self.server_params.insert(k, v); } _ => {} } @@ -160,25 +208,13 @@ impl ConnCfg { } } -impl std::ops::Deref for ConnCfg { - type Target = postgres_client::Config; - - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -/// For now, let's make it easier to setup the config. -impl std::ops::DerefMut for ConnCfg { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.0 - } -} - -impl ConnCfg { - /// Establish a raw TCP connection to the compute node. - async fn connect_raw(&self, timeout: Duration) -> io::Result<(SocketAddr, TcpStream, &str)> { - use postgres_client::config::Host; +impl ConnectInfo { + /// Establish a raw TCP+TLS connection to the compute node. 
+ async fn connect_raw( + &self, + config: &ComputeConfig, + ) -> Result<(SocketAddr, MaybeTlsStream), TlsError> { + let timeout = config.timeout; // wrap TcpStream::connect with timeout let connect_with_timeout = |addrs| { @@ -208,34 +244,32 @@ impl ConnCfg { // We can't reuse connection establishing logic from `postgres_client` here, // because it has no means for extracting the underlying socket which we // require for our business. - let port = self.0.get_port(); - let host = self.0.get_host(); + let port = self.port; + let host = &*self.host; - let host = match host { - Host::Tcp(host) => host.as_str(), - }; - - let addrs = match self.0.get_host_addr() { + let addrs = match self.host_addr { Some(addr) => vec![SocketAddr::new(addr, port)], None => lookup_host((host, port)).await?.collect(), }; match connect_once(&*addrs).await { - Ok((sockaddr, stream)) => Ok((sockaddr, stream, host)), + Ok((sockaddr, stream)) => Ok(( + sockaddr, + tls::connect_tls(stream, self.ssl_mode, config, host).await?, + )), Err(err) => { warn!("couldn't connect to compute node at {host}:{port}: {err}"); - Err(err) + Err(TlsError::Connection(err)) } } } } -type RustlsStream = >::Stream; +type RustlsStream = >::Stream; pub(crate) struct PostgresConnection { /// Socket connected to a compute node. - pub(crate) stream: - postgres_client::maybe_tls_stream::MaybeTlsStream, + pub(crate) stream: MaybeTlsStream, /// PostgreSQL connection parameters. pub(crate) params: std::collections::HashMap, /// Query cancellation token. @@ -248,28 +282,23 @@ pub(crate) struct PostgresConnection { _guage: NumDbConnectionsGuard<'static>, } -impl ConnCfg { +impl ConnectInfo { /// Connect to a corresponding compute node. pub(crate) async fn connect( &self, ctx: &RequestContext, aux: MetricsAuxInfo, + auth: &AuthInfo, config: &ComputeConfig, user_info: ComputeUserInfo, ) -> Result { - let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); - let (socket_addr, stream, host) = self.connect_raw(config.timeout).await?; - drop(pause); + let mut tmp_config = auth.enrich(self.to_postgres_client_config()); + // we setup SSL early in `ConnectInfo::connect_raw`. + tmp_config.ssl_mode(SslMode::Disable); - let mut mk_tls = crate::tls::postgres_rustls::MakeRustlsConnect::new(config.tls.clone()); - let tls = >::make_tls_connect( - &mut mk_tls, - host, - )?; - - // connect_raw() will not use TLS if sslmode is "disable" let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); - let connection = self.0.connect_raw(stream, tls).await?; + let (socket_addr, stream) = self.connect_raw(config).await?; + let connection = tmp_config.connect_raw(stream, NoTls).await?; drop(pause); let RawConnection { @@ -282,13 +311,14 @@ impl ConnCfg { tracing::Span::current().record("pid", tracing::field::display(process_id)); tracing::Span::current().record("compute_id", tracing::field::display(&aux.compute_id)); - let stream = stream.into_inner(); + let MaybeTlsStream::Raw(stream) = stream.into_inner(); // TODO: lots of useful info but maybe we can move it elsewhere (eg traces?) 
info!( cold_start_info = ctx.cold_start_info().as_str(), - "connected to compute node at {host} ({socket_addr}) sslmode={:?}, latency={}, query_id={}", - self.0.get_ssl_mode(), + "connected to compute node at {} ({socket_addr}) sslmode={:?}, latency={}, query_id={}", + self.host, + self.ssl_mode, ctx.get_proxy_latency(), ctx.get_testodrome_id().unwrap_or_default(), ); @@ -299,11 +329,11 @@ impl ConnCfg { socket_addr, CancelToken { socket_config: None, - ssl_mode: self.0.get_ssl_mode(), + ssl_mode: self.ssl_mode, process_id, secret_key, }, - host.to_string(), + self.host.to_string(), user_info, ); diff --git a/proxy/src/compute/tls.rs b/proxy/src/compute/tls.rs new file mode 100644 index 0000000000..000d75fca5 --- /dev/null +++ b/proxy/src/compute/tls.rs @@ -0,0 +1,63 @@ +use futures::FutureExt; +use postgres_client::config::SslMode; +use postgres_client::maybe_tls_stream::MaybeTlsStream; +use postgres_client::tls::{MakeTlsConnect, TlsConnect}; +use rustls::pki_types::InvalidDnsNameError; +use thiserror::Error; +use tokio::io::{AsyncRead, AsyncWrite}; + +use crate::pqproto::request_tls; +use crate::proxy::retry::CouldRetry; + +#[derive(Debug, Error)] +pub enum TlsError { + #[error(transparent)] + Dns(#[from] InvalidDnsNameError), + #[error(transparent)] + Connection(#[from] std::io::Error), + #[error("TLS required but not provided")] + Required, +} + +impl CouldRetry for TlsError { + fn could_retry(&self) -> bool { + match self { + TlsError::Dns(_) => false, + TlsError::Connection(err) => err.could_retry(), + // perhaps compute didn't realise it supports TLS? + TlsError::Required => true, + } + } +} + +pub async fn connect_tls( + mut stream: S, + mode: SslMode, + tls: &T, + host: &str, +) -> Result, TlsError> +where + S: AsyncRead + AsyncWrite + Unpin + Send, + T: MakeTlsConnect< + S, + Error = InvalidDnsNameError, + TlsConnect: TlsConnect, + >, +{ + match mode { + SslMode::Disable => return Ok(MaybeTlsStream::Raw(stream)), + SslMode::Prefer | SslMode::Require => {} + } + + if !request_tls(&mut stream).await? 
{ + if SslMode::Require == mode { + return Err(TlsError::Required); + } + + return Ok(MaybeTlsStream::Raw(stream)); + } + + Ok(MaybeTlsStream::Tls( + tls.make_tls_connect(host)?.connect(stream).boxed().await?, + )) +} diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index f2484b54b8..324dcf5824 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -210,20 +210,20 @@ pub(crate) async fn handle_client( ctx.set_db_options(params.clone()); - let (node_info, user_info, _ip_allowlist) = match backend + let (node_info, mut auth_info, user_info) = match backend .authenticate(ctx, &config.authentication_config, &mut stream) .await { Ok(auth_result) => auth_result, Err(e) => Err(stream.throw_error(e, Some(ctx)).await)?, }; + auth_info.set_startup_params(¶ms, true); let node = connect_to_compute( ctx, &TcpMechanism { user_info, - params_compat: true, - params: ¶ms, + auth: auth_info, locks: &config.connect_compute_locks, }, &node_info, diff --git a/proxy/src/control_plane/client/cplane_proxy_v1.rs b/proxy/src/control_plane/client/cplane_proxy_v1.rs index da548d6b2c..cf2d9fba14 100644 --- a/proxy/src/control_plane/client/cplane_proxy_v1.rs +++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs @@ -261,24 +261,18 @@ impl NeonControlPlaneClient { Some(_) => SslMode::Require, None => SslMode::Disable, }; - let host_name = match body.server_name { - Some(host) => host, - None => host.to_owned(), + let host = match body.server_name { + Some(host) => host.into(), + None => host.into(), }; - // Don't set anything but host and port! This config will be cached. - // We'll set username and such later using the startup message. - // TODO: add more type safety (in progress). - let mut config = compute::ConnCfg::new(host_name, port); - - if let Some(addr) = host_addr { - config.set_host_addr(addr); - } - - config.ssl_mode(ssl_mode); - let node = NodeInfo { - config, + conn_info: compute::ConnectInfo { + host_addr, + host, + port, + ssl_mode, + }, aux: body.aux, }; diff --git a/proxy/src/control_plane/client/mock.rs b/proxy/src/control_plane/client/mock.rs index ece7153fce..aeea57f2fc 100644 --- a/proxy/src/control_plane/client/mock.rs +++ b/proxy/src/control_plane/client/mock.rs @@ -6,6 +6,7 @@ use std::str::FromStr; use std::sync::Arc; use futures::TryFutureExt; +use postgres_client::config::SslMode; use thiserror::Error; use tokio_postgres::Client; use tracing::{Instrument, error, info, info_span, warn}; @@ -14,6 +15,7 @@ use crate::auth::IpPattern; use crate::auth::backend::ComputeUserInfo; use crate::auth::backend::jwt::AuthRule; use crate::cache::Cached; +use crate::compute::ConnectInfo; use crate::context::RequestContext; use crate::control_plane::errors::{ ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError, @@ -24,9 +26,9 @@ use crate::control_plane::{ RoleAccessControl, }; use crate::intern::RoleNameInt; +use crate::scram; use crate::types::{BranchId, EndpointId, ProjectId, RoleName}; use crate::url::ApiUrl; -use crate::{compute, scram}; #[derive(Debug, Error)] enum MockApiError { @@ -87,8 +89,7 @@ impl MockControlPlane { .await? 
{ info!("got a secret: {entry}"); // safe since it's not a prod scenario - let secret = scram::ServerSecret::parse(&entry).map(AuthSecret::Scram); - secret.or_else(|| parse_md5(&entry).map(AuthSecret::Md5)) + scram::ServerSecret::parse(&entry).map(AuthSecret::Scram) } else { warn!("user '{role}' does not exist"); None @@ -170,25 +171,23 @@ impl MockControlPlane { async fn do_wake_compute(&self) -> Result { let port = self.endpoint.port().unwrap_or(5432); - let mut config = match self.endpoint.host_str() { - None => { - let mut config = compute::ConnCfg::new("localhost".to_string(), port); - config.set_host_addr(IpAddr::V4(Ipv4Addr::LOCALHOST)); - config - } - Some(host) => { - let mut config = compute::ConnCfg::new(host.to_string(), port); - if let Ok(addr) = IpAddr::from_str(host) { - config.set_host_addr(addr); - } - config - } + let conn_info = match self.endpoint.host_str() { + None => ConnectInfo { + host_addr: Some(IpAddr::V4(Ipv4Addr::LOCALHOST)), + host: "localhost".into(), + port, + ssl_mode: SslMode::Disable, + }, + Some(host) => ConnectInfo { + host_addr: IpAddr::from_str(host).ok(), + host: host.into(), + port, + ssl_mode: SslMode::Disable, + }, }; - config.ssl_mode(postgres_client::config::SslMode::Disable); - let node = NodeInfo { - config, + conn_info, aux: MetricsAuxInfo { endpoint_id: (&EndpointId::from("endpoint")).into(), project_id: (&ProjectId::from("project")).into(), @@ -266,12 +265,3 @@ impl super::ControlPlaneApi for MockControlPlane { self.do_wake_compute().map_ok(Cached::new_uncached).await } } - -fn parse_md5(input: &str) -> Option<[u8; 16]> { - let text = input.strip_prefix("md5")?; - - let mut bytes = [0u8; 16]; - hex::decode_to_slice(text, &mut bytes).ok()?; - - Some(bytes) -} diff --git a/proxy/src/control_plane/mod.rs b/proxy/src/control_plane/mod.rs index 7ff093d9dc..ad10cf4257 100644 --- a/proxy/src/control_plane/mod.rs +++ b/proxy/src/control_plane/mod.rs @@ -11,8 +11,8 @@ pub(crate) mod errors; use std::sync::Arc; +use crate::auth::backend::ComputeUserInfo; use crate::auth::backend::jwt::AuthRule; -use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; use crate::auth::{AuthError, IpPattern, check_peer_addr_is_in_list}; use crate::cache::{Cached, TimedLru}; use crate::config::ComputeConfig; @@ -39,10 +39,6 @@ pub mod mgmt; /// Auth secret which is managed by the cloud. #[derive(Clone, Eq, PartialEq, Debug)] pub(crate) enum AuthSecret { - #[cfg(any(test, feature = "testing"))] - /// Md5 hash of user's password. - Md5([u8; 16]), - /// [SCRAM](crate::scram) authentication info. Scram(scram::ServerSecret), } @@ -63,13 +59,9 @@ pub(crate) struct AuthInfo { } /// Info for establishing a connection to a compute node. -/// This is what we get after auth succeeded, but not before! #[derive(Clone)] pub(crate) struct NodeInfo { - /// Compute node connection params. - /// It's sad that we have to clone this, but this will improve - /// once we migrate to a bespoke connection logic. - pub(crate) config: compute::ConnCfg, + pub(crate) conn_info: compute::ConnectInfo, /// Labels for proxy's metrics. 
pub(crate) aux: MetricsAuxInfo, @@ -79,26 +71,14 @@ impl NodeInfo { pub(crate) async fn connect( &self, ctx: &RequestContext, + auth: &compute::AuthInfo, config: &ComputeConfig, user_info: ComputeUserInfo, ) -> Result { - self.config - .connect(ctx, self.aux.clone(), config, user_info) + self.conn_info + .connect(ctx, self.aux.clone(), auth, config, user_info) .await } - - pub(crate) fn reuse_settings(&mut self, other: Self) { - self.config.reuse_password(other.config); - } - - pub(crate) fn set_keys(&mut self, keys: &ComputeCredentialKeys) { - match keys { - #[cfg(any(test, feature = "testing"))] - ComputeCredentialKeys::Password(password) => self.config.password(password), - ComputeCredentialKeys::AuthKeys(auth_keys) => self.config.auth_keys(*auth_keys), - ComputeCredentialKeys::JwtPayload(_) | ComputeCredentialKeys::None => &mut self.config, - }; - } } #[derive(Copy, Clone, Default)] diff --git a/proxy/src/pglb/connect_compute.rs b/proxy/src/pglb/connect_compute.rs index 1d6ca5fbb3..1807cdff0e 100644 --- a/proxy/src/pglb/connect_compute.rs +++ b/proxy/src/pglb/connect_compute.rs @@ -2,8 +2,8 @@ use async_trait::async_trait; use tokio::time; use tracing::{debug, info, warn}; -use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; -use crate::compute::{self, COULD_NOT_CONNECT, PostgresConnection}; +use crate::auth::backend::ComputeUserInfo; +use crate::compute::{self, AuthInfo, COULD_NOT_CONNECT, PostgresConnection}; use crate::config::{ComputeConfig, RetryConfig}; use crate::context::RequestContext; use crate::control_plane::errors::WakeComputeError; @@ -13,7 +13,6 @@ use crate::error::ReportableError; use crate::metrics::{ ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType, }; -use crate::pqproto::StartupMessageParams; use crate::proxy::retry::{CouldRetry, ShouldRetryWakeCompute, retry_after, should_retry}; use crate::proxy::wake_compute::wake_compute; use crate::types::Host; @@ -48,8 +47,6 @@ pub(crate) trait ConnectMechanism { node_info: &control_plane::CachedNodeInfo, config: &ComputeConfig, ) -> Result; - - fn update_connect_config(&self, conf: &mut compute::ConnCfg); } #[async_trait] @@ -58,24 +55,17 @@ pub(crate) trait ComputeConnectBackend { &self, ctx: &RequestContext, ) -> Result; - - fn get_keys(&self) -> &ComputeCredentialKeys; } -pub(crate) struct TcpMechanism<'a> { - pub(crate) params_compat: bool, - - /// KV-dictionary with PostgreSQL connection params. 
- pub(crate) params: &'a StartupMessageParams, - +pub(crate) struct TcpMechanism { + pub(crate) auth: AuthInfo, /// connect_to_compute concurrency lock pub(crate) locks: &'static ApiLocks, - pub(crate) user_info: ComputeUserInfo, } #[async_trait] -impl ConnectMechanism for TcpMechanism<'_> { +impl ConnectMechanism for TcpMechanism { type Connection = PostgresConnection; type ConnectError = compute::ConnectionError; type Error = compute::ConnectionError; @@ -90,13 +80,12 @@ impl ConnectMechanism for TcpMechanism<'_> { node_info: &control_plane::CachedNodeInfo, config: &ComputeConfig, ) -> Result { - let host = node_info.config.get_host(); - let permit = self.locks.get_permit(&host).await?; - permit.release_result(node_info.connect(ctx, config, self.user_info.clone()).await) - } - - fn update_connect_config(&self, config: &mut compute::ConnCfg) { - config.set_startup_params(self.params, self.params_compat); + let permit = self.locks.get_permit(&node_info.conn_info.host).await?; + permit.release_result( + node_info + .connect(ctx, &self.auth, config, self.user_info.clone()) + .await, + ) } } @@ -114,12 +103,9 @@ where M::Error: From, { let mut num_retries = 0; - let mut node_info = + let node_info = wake_compute(&mut num_retries, ctx, user_info, wake_compute_retry_config).await?; - node_info.set_keys(user_info.get_keys()); - mechanism.update_connect_config(&mut node_info.config); - // try once let err = match mechanism.connect_once(ctx, &node_info, compute).await { Ok(res) => { @@ -155,14 +141,9 @@ where } else { // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node debug!("compute node's state has likely changed; requesting a wake-up"); - let old_node_info = invalidate_cache(node_info); + invalidate_cache(node_info); // TODO: increment num_retries? - let mut node_info = - wake_compute(&mut num_retries, ctx, user_info, wake_compute_retry_config).await?; - node_info.reuse_settings(old_node_info); - - mechanism.update_connect_config(&mut node_info.config); - node_info + wake_compute(&mut num_retries, ctx, user_info, wake_compute_retry_config).await? }; // now that we have a new node, try connect to it repeatedly. diff --git a/proxy/src/pqproto.rs b/proxy/src/pqproto.rs index 43074bf208..ad99eecda5 100644 --- a/proxy/src/pqproto.rs +++ b/proxy/src/pqproto.rs @@ -8,7 +8,7 @@ use std::io::{self, Cursor}; use bytes::{Buf, BufMut}; use itertools::Itertools; use rand::distributions::{Distribution, Standard}; -use tokio::io::{AsyncRead, AsyncReadExt}; +use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt}; use zerocopy::{FromBytes, Immutable, IntoBytes, big_endian}; pub type ErrorCode = [u8; 5]; @@ -53,6 +53,28 @@ impl fmt::Debug for ProtocolVersion { } } +/// +const MAX_STARTUP_PACKET_LENGTH: usize = 10000; +const RESERVED_INVALID_MAJOR_VERSION: u16 = 1234; +/// +const CANCEL_REQUEST_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5678); +/// +const NEGOTIATE_SSL_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5679); +/// +const NEGOTIATE_GSS_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5680); + +/// This first reads the startup message header, is 8 bytes. +/// The first 4 bytes is a big-endian message length, and the next 4 bytes is a version number. +/// +/// The length value is inclusive of the header. For example, +/// an empty message will always have length 8. 
+#[derive(Clone, Copy, FromBytes, IntoBytes, Immutable)] +#[repr(C)] +struct StartupHeader { + len: big_endian::U32, + version: ProtocolVersion, +} + /// read the type from the stream using zerocopy. /// /// not cancel safe. @@ -66,32 +88,38 @@ macro_rules! read { }}; } +/// Returns true if TLS is supported. +/// +/// This is not cancel safe. +pub async fn request_tls(stream: &mut S) -> io::Result +where + S: AsyncRead + AsyncWrite + Unpin, +{ + let payload = StartupHeader { + len: 8.into(), + version: NEGOTIATE_SSL_CODE, + }; + stream.write_all(payload.as_bytes()).await?; + stream.flush().await?; + + // we expect back either `S` or `N` as a single byte. + let mut res = *b"0"; + stream.read_exact(&mut res).await?; + + debug_assert!( + res == *b"S" || res == *b"N", + "unexpected SSL negotiation response: {}", + char::from(res[0]), + ); + + // S for SSL. + Ok(res == *b"S") +} + pub async fn read_startup(stream: &mut S) -> io::Result where S: AsyncRead + Unpin, { - /// - const MAX_STARTUP_PACKET_LENGTH: usize = 10000; - const RESERVED_INVALID_MAJOR_VERSION: u16 = 1234; - /// - const CANCEL_REQUEST_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5678); - /// - const NEGOTIATE_SSL_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5679); - /// - const NEGOTIATE_GSS_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5680); - - /// This first reads the startup message header, is 8 bytes. - /// The first 4 bytes is a big-endian message length, and the next 4 bytes is a version number. - /// - /// The length value is inclusive of the header. For example, - /// an empty message will always have length 8. - #[derive(Clone, Copy, FromBytes, IntoBytes, Immutable)] - #[repr(C)] - struct StartupHeader { - len: big_endian::U32, - version: ProtocolVersion, - } - let header = read!(stream => StartupHeader); // @@ -564,9 +592,8 @@ mod tests { use tokio::io::{AsyncWriteExt, duplex}; use zerocopy::IntoBytes; - use crate::pqproto::{FeStartupPacket, read_message, read_startup}; - use super::ProtocolVersion; + use crate::pqproto::{FeStartupPacket, read_message, read_startup}; #[tokio::test] async fn reject_large_startup() { diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 0e138cc0c7..0e00c4f97e 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -358,21 +358,19 @@ pub(crate) async fn handle_client( } }; - let compute_user_info = match &user_info { - auth::Backend::ControlPlane(_, info) => &info.info, + let creds = match &user_info { + auth::Backend::ControlPlane(_, creds) => creds, auth::Backend::Local(_) => unreachable!("local proxy does not run tcp proxy service"), }; - let params_compat = compute_user_info - .options - .get(NeonOptions::PARAMS_COMPAT) - .is_some(); + let params_compat = creds.info.options.get(NeonOptions::PARAMS_COMPAT).is_some(); + let mut auth_info = compute::AuthInfo::with_auth_keys(&creds.keys); + auth_info.set_startup_params(¶ms, params_compat); let res = connect_to_compute( ctx, &TcpMechanism { - user_info: compute_user_info.clone(), - params_compat, - params: ¶ms, + user_info: creds.info.clone(), + auth: auth_info, locks: &config.connect_compute_locks, }, &user_info, diff --git a/proxy/src/proxy/retry.rs b/proxy/src/proxy/retry.rs index 01e603ec14..0f19944afa 100644 --- a/proxy/src/proxy/retry.rs +++ b/proxy/src/proxy/retry.rs @@ -100,9 +100,9 @@ impl CouldRetry for compute::ConnectionError { fn could_retry(&self) -> bool { match self { compute::ConnectionError::Postgres(err) => err.could_retry(), - compute::ConnectionError::CouldNotConnect(err) => 
err.could_retry(), + compute::ConnectionError::TlsError(err) => err.could_retry(), compute::ConnectionError::WakeComputeError(err) => err.could_retry(), - _ => false, + compute::ConnectionError::TooManyConnectionAttempts(_) => false, } } } diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index e5db0013a7..028247a97d 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -8,7 +8,7 @@ use std::time::Duration; use anyhow::{Context, bail}; use async_trait::async_trait; use http::StatusCode; -use postgres_client::config::SslMode; +use postgres_client::config::{AuthKeys, ScramKeys, SslMode}; use postgres_client::tls::{MakeTlsConnect, NoTls}; use retry::{ShouldRetryWakeCompute, retry_after}; use rstest::rstest; @@ -29,7 +29,6 @@ use crate::control_plane::{self, CachedNodeInfo, NodeInfo, NodeInfoCache}; use crate::error::ErrorKind; use crate::pglb::connect_compute::ConnectMechanism; use crate::tls::client_config::compute_client_config_with_certs; -use crate::tls::postgres_rustls::MakeRustlsConnect; use crate::tls::server_config::CertResolver; use crate::types::{BranchId, EndpointId, ProjectId}; use crate::{sasl, scram}; @@ -72,13 +71,14 @@ struct ClientConfig<'a> { hostname: &'a str, } -type TlsConnect = >::TlsConnect; +type TlsConnect = >::TlsConnect; impl ClientConfig<'_> { fn make_tls_connect(self) -> anyhow::Result> { - let mut mk = MakeRustlsConnect::new(self.config); - let tls = MakeTlsConnect::::make_tls_connect(&mut mk, self.hostname)?; - Ok(tls) + Ok(crate::tls::postgres_rustls::make_tls_connect( + &self.config, + self.hostname, + )?) } } @@ -497,8 +497,6 @@ impl ConnectMechanism for TestConnectMechanism { x => panic!("expecting action {x:?}, connect is called instead"), } } - - fn update_connect_config(&self, _conf: &mut compute::ConnCfg) {} } impl TestControlPlaneClient for TestConnectMechanism { @@ -557,7 +555,12 @@ impl TestControlPlaneClient for TestConnectMechanism { fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo { let node = NodeInfo { - config: compute::ConnCfg::new("test".to_owned(), 5432), + conn_info: compute::ConnectInfo { + host: "test".into(), + port: 5432, + ssl_mode: SslMode::Disable, + host_addr: None, + }, aux: MetricsAuxInfo { endpoint_id: (&EndpointId::from("endpoint")).into(), project_id: (&ProjectId::from("project")).into(), @@ -581,7 +584,10 @@ fn helper_create_connect_info( user: "user".into(), options: NeonOptions::parse_options_raw(""), }, - keys: ComputeCredentialKeys::Password("password".into()), + keys: ComputeCredentialKeys::AuthKeys(AuthKeys::ScramSha256(ScramKeys { + client_key: [0; 32], + server_key: [0; 32], + })), }, ) } diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 748e0ce6f2..a0e782dab0 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -23,7 +23,6 @@ use super::local_conn_pool::{self, EXT_NAME, EXT_SCHEMA, EXT_VERSION, LocalConnP use crate::auth::backend::local::StaticAuthRules; use crate::auth::backend::{ComputeCredentials, ComputeUserInfo}; use crate::auth::{self, AuthError}; -use crate::compute; use crate::compute_ctl::{ ComputeCtlError, ExtensionInstallRequest, Privilege, SetRoleGrantsRequest, }; @@ -305,12 +304,13 @@ impl PoolingBackend { tracing::Span::current().record("conn_id", display(conn_id)); info!(%conn_id, "local_pool: opening a new connection '{conn_info}'"); - let mut node_info = local_backend.node_info.clone(); - let (key, jwk) = create_random_jwk(); - let config = node_info - 
.config + let mut config = local_backend + .node_info + .conn_info + .to_postgres_client_config(); + config .user(&conn_info.user_info.user) .dbname(&conn_info.dbname) .set_param( @@ -322,7 +322,7 @@ impl PoolingBackend { ); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); - let (client, connection) = config.connect(postgres_client::NoTls).await?; + let (client, connection) = config.connect(&postgres_client::NoTls).await?; drop(pause); let pid = client.get_process_id(); @@ -336,7 +336,7 @@ impl PoolingBackend { connection, key, conn_id, - node_info.aux.clone(), + local_backend.node_info.aux.clone(), ); { @@ -512,19 +512,16 @@ impl ConnectMechanism for TokioMechanism { node_info: &CachedNodeInfo, compute_config: &ComputeConfig, ) -> Result { - let host = node_info.config.get_host(); - let permit = self.locks.get_permit(&host).await?; + let permit = self.locks.get_permit(&node_info.conn_info.host).await?; - let mut config = (*node_info.config).clone(); + let mut config = node_info.conn_info.to_postgres_client_config(); let config = config .user(&self.conn_info.user_info.user) .dbname(&self.conn_info.dbname) .connect_timeout(compute_config.timeout); - let mk_tls = - crate::tls::postgres_rustls::MakeRustlsConnect::new(compute_config.tls.clone()); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); - let res = config.connect(mk_tls).await; + let res = config.connect(compute_config).await; drop(pause); let (client, connection) = permit.release_result(res)?; @@ -548,8 +545,6 @@ impl ConnectMechanism for TokioMechanism { node_info.aux.clone(), )) } - - fn update_connect_config(&self, _config: &mut compute::ConnCfg) {} } struct HyperMechanism { @@ -573,20 +568,20 @@ impl ConnectMechanism for HyperMechanism { node_info: &CachedNodeInfo, config: &ComputeConfig, ) -> Result { - let host_addr = node_info.config.get_host_addr(); - let host = node_info.config.get_host(); - let permit = self.locks.get_permit(&host).await?; + let host_addr = node_info.conn_info.host_addr; + let host = &node_info.conn_info.host; + let permit = self.locks.get_permit(host).await?; let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); - let tls = if node_info.config.get_ssl_mode() == SslMode::Disable { + let tls = if node_info.conn_info.ssl_mode == SslMode::Disable { None } else { Some(&config.tls) }; - let port = node_info.config.get_port(); - let res = connect_http2(host_addr, &host, port, config.timeout, tls).await; + let port = node_info.conn_info.port; + let res = connect_http2(host_addr, host, port, config.timeout, tls).await; drop(pause); let (client, connection) = permit.release_result(res)?; @@ -609,8 +604,6 @@ impl ConnectMechanism for HyperMechanism { node_info.aux.clone(), )) } - - fn update_connect_config(&self, _config: &mut compute::ConnCfg) {} } async fn connect_http2( diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 87176ff7d6..dd8cf052c5 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -23,12 +23,12 @@ use super::conn_pool_lib::{ Client, ClientDataEnum, ClientInnerCommon, ClientInnerExt, ConnInfo, EndpointConnPool, GlobalConnPool, }; +use crate::config::ComputeConfig; use crate::context::RequestContext; use crate::control_plane::messages::MetricsAuxInfo; use crate::metrics::Metrics; -use crate::tls::postgres_rustls::MakeRustlsConnect; -type TlsStream = >::Stream; +type TlsStream = >::Stream; #[derive(Debug, Clone)] pub(crate) struct ConnInfoWithAuth { diff --git 
a/proxy/src/tls/postgres_rustls.rs b/proxy/src/tls/postgres_rustls.rs index 013b307f0b..9269ad8a06 100644 --- a/proxy/src/tls/postgres_rustls.rs +++ b/proxy/src/tls/postgres_rustls.rs @@ -2,10 +2,11 @@ use std::convert::TryFrom; use std::sync::Arc; use postgres_client::tls::MakeTlsConnect; -use rustls::ClientConfig; -use rustls::pki_types::ServerName; +use rustls::pki_types::{InvalidDnsNameError, ServerName}; use tokio::io::{AsyncRead, AsyncWrite}; +use crate::config::ComputeConfig; + mod private { use std::future::Future; use std::io; @@ -123,36 +124,27 @@ mod private { } } -/// A `MakeTlsConnect` implementation using `rustls`. -/// -/// That way you can connect to PostgreSQL using `rustls` as the TLS stack. -#[derive(Clone)] -pub struct MakeRustlsConnect { - pub config: Arc, -} - -impl MakeRustlsConnect { - /// Creates a new `MakeRustlsConnect` from the provided `ClientConfig`. - #[must_use] - pub fn new(config: Arc) -> Self { - Self { config } - } -} - -impl MakeTlsConnect for MakeRustlsConnect +impl MakeTlsConnect for ComputeConfig where S: AsyncRead + AsyncWrite + Unpin + Send + 'static, { type Stream = private::RustlsStream; type TlsConnect = private::RustlsConnect; - type Error = rustls::pki_types::InvalidDnsNameError; + type Error = InvalidDnsNameError; - fn make_tls_connect(&mut self, hostname: &str) -> Result { - ServerName::try_from(hostname).map(|dns_name| { - private::RustlsConnect(private::RustlsConnectData { - hostname: dns_name.to_owned(), - connector: Arc::clone(&self.config).into(), - }) - }) + fn make_tls_connect(&self, hostname: &str) -> Result { + make_tls_connect(&self.tls, hostname) } } + +pub fn make_tls_connect( + tls: &Arc, + hostname: &str, +) -> Result { + ServerName::try_from(hostname).map(|dns_name| { + private::RustlsConnect(private::RustlsConnectData { + hostname: dns_name.to_owned(), + connector: tls.clone().into(), + }) + }) +} From 470c7d5e0e5f70fefeca9a3e9b7dbd380a78acc1 Mon Sep 17 00:00:00 2001 From: Mikhail Date: Fri, 6 Jun 2025 12:48:01 +0100 Subject: [PATCH 010/124] endpoint_storage: default listen port, allow inline config (#12152) Related: https://github.com/neondatabase/cloud/issues/27195 --- endpoint_storage/src/main.rs | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/endpoint_storage/src/main.rs b/endpoint_storage/src/main.rs index 3d1f05575d..399a4ec31e 100644 --- a/endpoint_storage/src/main.rs +++ b/endpoint_storage/src/main.rs @@ -3,7 +3,8 @@ //! This service is deployed either as a separate component or as part of compute image //! for large computes. 
mod app; -use anyhow::Context; +use anyhow::{Context, bail}; +use std::net::{IpAddr, Ipv4Addr, SocketAddr}; use tracing::info; use utils::logging; @@ -12,9 +13,14 @@ const fn max_upload_file_limit() -> usize { 100 * 1024 * 1024 } +const fn listen() -> SocketAddr { + SocketAddr::new(IpAddr::V4(Ipv4Addr::new(0, 0, 0, 0)), 51243) +} + #[derive(serde::Deserialize)] #[serde(tag = "type")] struct Config { + #[serde(default = "listen")] listen: std::net::SocketAddr, pemfile: camino::Utf8PathBuf, #[serde(flatten)] @@ -31,13 +37,21 @@ async fn main() -> anyhow::Result<()> { logging::Output::Stdout, )?; - let config: String = std::env::args().skip(1).take(1).collect(); - if config.is_empty() { - anyhow::bail!("Usage: endpoint_storage config.json") - } - info!("Reading config from {config}"); - let config = std::fs::read_to_string(config.clone())?; - let config: Config = serde_json::from_str(&config).context("parsing config")?; + // Allow passing either a filename or an inline config (for the k8s helm chart) + let args: Vec<String> = std::env::args().skip(1).collect(); + let config: Config = if args.len() == 1 && args[0].ends_with(".json") { + info!("Reading config from {}", args[0]); + let config = std::fs::read_to_string(args[0].clone())?; + serde_json::from_str(&config).context("parsing config")? + } else if !args.is_empty() && args[0].starts_with("--config=") { + info!("Reading inline config"); + let config = args.join(" "); + let config = config.strip_prefix("--config=").unwrap(); + serde_json::from_str(config).context("parsing config")? + } else { + bail!("Usage: endpoint_storage config.json or endpoint_storage --config=JSON"); + }; + info!("Reading pemfile from {}", config.pemfile.clone()); let pemfile = std::fs::read(config.pemfile.clone())?; info!("Loading public key from {}", config.pemfile.clone()); From df7e301a5401ac1da2792b00ace7323f913b4fc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 6 Jun 2025 13:54:07 +0200 Subject: [PATCH 011/124] safekeeper: special error if a timeline has been deleted (#12155) We might delete timelines on safekeepers before deleting them on pageservers. This should be an exceptional situation, but can occur. As the first step to improve behaviour here, emit a special error that is less scary/obscure than "was not found in global map". It is for example emitted when the pageserver tries to run `IDENTIFY_SYSTEM` on a timeline that has been deleted on the safekeeper. Found when analyzing the failure of `test_scrubber_physical_gc_timeline_deletion` when enabling `--timelines-onto-safekeepers` on the pytests. Due to safekeeper restarts, there is no hard guarantee that we will keep issuing this error, so we need to think of something better if we start encountering this in staging/prod. But I would say that the introduction of `--timelines-onto-safekeepers` in the pytests and into staging won't change much about this: we are already deleting timelines from there. In `test_scrubber_physical_gc_timeline_deletion`, we'd previously just have been leaking the timeline on the safekeepers.
Part of #11712 --- safekeeper/src/timeline.rs | 2 ++ safekeeper/src/timelines_global_map.rs | 8 +++++++- test_runner/regress/test_safekeeper_deletion.py | 2 ++ test_runner/regress/test_wal_acceptor.py | 2 ++ 4 files changed, 13 insertions(+), 1 deletion(-) diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 588bd4f2c9..2bee41537f 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -395,6 +395,8 @@ pub enum TimelineError { Cancelled(TenantTimelineId), #[error("Timeline {0} was not found in global map")] NotFound(TenantTimelineId), + #[error("Timeline {0} has been deleted")] + Deleted(TenantTimelineId), #[error("Timeline {0} creation is in progress")] CreationInProgress(TenantTimelineId), #[error("Timeline {0} exists on disk, but wasn't loaded on startup")] diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index e3f7d88f7c..6e41ada1b3 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -78,7 +78,13 @@ impl GlobalTimelinesState { Some(GlobalMapTimeline::CreationInProgress) => { Err(TimelineError::CreationInProgress(*ttid)) } - None => Err(TimelineError::NotFound(*ttid)), + None => { + if self.has_tombstone(ttid) { + Err(TimelineError::Deleted(*ttid)) + } else { + Err(TimelineError::NotFound(*ttid)) + } + } } } diff --git a/test_runner/regress/test_safekeeper_deletion.py b/test_runner/regress/test_safekeeper_deletion.py index b681a86103..bc79969e9a 100644 --- a/test_runner/regress/test_safekeeper_deletion.py +++ b/test_runner/regress/test_safekeeper_deletion.py @@ -30,6 +30,7 @@ def test_safekeeper_delete_timeline(neon_env_builder: NeonEnvBuilder, auth_enabl env.pageserver.allowed_errors.extend( [ ".*Timeline .* was not found in global map.*", + ".*Timeline .* has been deleted.*", ".*Timeline .* was cancelled and cannot be used anymore.*", ] ) @@ -198,6 +199,7 @@ def test_safekeeper_delete_timeline_under_load(neon_env_builder: NeonEnvBuilder) env.pageserver.allowed_errors.extend( [ ".*Timeline.*was cancelled.*", + ".*Timeline.*has been deleted.*", ".*Timeline.*was not found.*", ] ) diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 6a7c7a8bef..b9183286af 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -433,6 +433,7 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder): env.pageserver.allowed_errors.extend( [ ".*Timeline .* was not found in global map.*", + ".*Timeline .* has been deleted.*", ".*Timeline .* was cancelled and cannot be used anymore.*", ] ) @@ -1934,6 +1935,7 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder): env.pageserver.allowed_errors.extend( [ ".*Timeline .* was not found in global map.*", + ".*Timeline .* has been deleted.*", ".*Timeline .* was cancelled and cannot be used anymore.*", ] ) From 6dd84041a1b93e8033abe75e93867db11069e91d Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 6 Jun 2025 13:49:29 +0100 Subject: [PATCH 012/124] refactor and simplify the invalidation notification structure (#12154) The current cache invalidation messages are far too specific. They should be more generic since it only ends up triggering a `GetEndpointAccessControl` message anyway. Mappings: * `/allowed_ips_updated`, `/block_public_or_vpc_access_updated`, and `/allowed_vpc_endpoints_updated_for_projects` -> `/project_settings_update`. * `/allowed_vpc_endpoints_updated_for_org` -> `/account_settings_update`. 
* `/password_updated` -> `/role_setting_update`. I've also introduced `/endpoint_settings_update`. All message types support singular or multiple entries, which allows us to simplify things both on our side and on cplane side. I'm opening a PR to cplane to apply the above mappings, but for now using the old phrases to allow both to roll out independently. This change is inspired by my need to add yet another cached entry to `GetEndpointAccessControl` for https://github.com/neondatabase/cloud/issues/28333 --- proxy/src/cache/project_info.rs | 8 + proxy/src/metrics.rs | 10 +- proxy/src/redis/notifications.rs | 241 +++++++++++++++++-------------- 3 files changed, 145 insertions(+), 114 deletions(-) diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index 81c88e3ddd..9a4be2f904 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -18,6 +18,7 @@ use crate::types::{EndpointId, RoleName}; #[async_trait] pub(crate) trait ProjectInfoCache { + fn invalidate_endpoint_access(&self, endpoint_id: EndpointIdInt); fn invalidate_endpoint_access_for_project(&self, project_id: ProjectIdInt); fn invalidate_endpoint_access_for_org(&self, account_id: AccountIdInt); fn invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, role_name: RoleNameInt); @@ -100,6 +101,13 @@ pub struct ProjectInfoCacheImpl { #[async_trait] impl ProjectInfoCache for ProjectInfoCacheImpl { + fn invalidate_endpoint_access(&self, endpoint_id: EndpointIdInt) { + info!("invalidating endpoint access for `{endpoint_id}`"); + if let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) { + endpoint_info.invalidate_endpoint(); + } + } + fn invalidate_endpoint_access_for_project(&self, project_id: ProjectIdInt) { info!("invalidating endpoint access for project `{project_id}`"); let endpoints = self diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 4b22c912eb..4c340edfd5 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -610,11 +610,11 @@ pub enum RedisEventsCount { BranchCreated, ProjectCreated, CancelSession, - PasswordUpdate, - AllowedIpsUpdate, - AllowedVpcEndpointIdsUpdateForProjects, - AllowedVpcEndpointIdsUpdateForAllProjectsInOrg, - BlockPublicOrVpcAccessUpdate, + InvalidateRole, + InvalidateEndpoint, + InvalidateProject, + InvalidateProjects, + InvalidateOrg, } pub struct ThreadPoolWorkers(usize); diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index a9d6b40603..6c8260027f 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -3,12 +3,12 @@ use std::sync::Arc; use futures::StreamExt; use redis::aio::PubSub; -use serde::{Deserialize, Serialize}; +use serde::Deserialize; use tokio_util::sync::CancellationToken; use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; use crate::cache::project_info::ProjectInfoCache; -use crate::intern::{AccountIdInt, ProjectIdInt, RoleNameInt}; +use crate::intern::{AccountIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt}; use crate::metrics::{Metrics, RedisErrors, RedisEventsCount}; const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; @@ -27,42 +27,37 @@ struct NotificationHeader<'a> { topic: &'a str, } -#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] +#[derive(Clone, Debug, Deserialize, Eq, PartialEq)] #[serde(tag = "topic", content = "data")] -pub(crate) enum Notification { +enum Notification { #[serde( - rename = "/allowed_ips_updated", + rename = "/account_settings_update", + alias = 
"/allowed_vpc_endpoints_updated_for_org", deserialize_with = "deserialize_json_string" )] - AllowedIpsUpdate { - allowed_ips_update: AllowedIpsUpdate, - }, + AccountSettingsUpdate(InvalidateAccount), + #[serde( - rename = "/block_public_or_vpc_access_updated", + rename = "/endpoint_settings_update", deserialize_with = "deserialize_json_string" )] - BlockPublicOrVpcAccessUpdated { - block_public_or_vpc_access_updated: BlockPublicOrVpcAccessUpdated, - }, + EndpointSettingsUpdate(InvalidateEndpoint), + #[serde( - rename = "/allowed_vpc_endpoints_updated_for_org", + rename = "/project_settings_update", + alias = "/allowed_ips_updated", + alias = "/block_public_or_vpc_access_updated", + alias = "/allowed_vpc_endpoints_updated_for_projects", deserialize_with = "deserialize_json_string" )] - AllowedVpcEndpointsUpdatedForOrg { - allowed_vpc_endpoints_updated_for_org: AllowedVpcEndpointsUpdatedForOrg, - }, + ProjectSettingsUpdate(InvalidateProject), + #[serde( - rename = "/allowed_vpc_endpoints_updated_for_projects", + rename = "/role_setting_update", + alias = "/password_updated", deserialize_with = "deserialize_json_string" )] - AllowedVpcEndpointsUpdatedForProjects { - allowed_vpc_endpoints_updated_for_projects: AllowedVpcEndpointsUpdatedForProjects, - }, - #[serde( - rename = "/password_updated", - deserialize_with = "deserialize_json_string" - )] - PasswordUpdate { password_update: PasswordUpdate }, + RoleSettingUpdate(InvalidateRole), #[serde( other, @@ -72,28 +67,56 @@ pub(crate) enum Notification { UnknownTopic, } -#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] -pub(crate) struct AllowedIpsUpdate { - project_id: ProjectIdInt, +#[derive(Clone, Debug, Deserialize, Eq, PartialEq)] +#[serde(rename_all = "snake_case")] +enum InvalidateEndpoint { + EndpointId(EndpointIdInt), + EndpointIds(Vec), +} +impl std::ops::Deref for InvalidateEndpoint { + type Target = [EndpointIdInt]; + fn deref(&self) -> &Self::Target { + match self { + Self::EndpointId(id) => std::slice::from_ref(id), + Self::EndpointIds(ids) => ids, + } + } } -#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] -pub(crate) struct BlockPublicOrVpcAccessUpdated { - project_id: ProjectIdInt, +#[derive(Clone, Debug, Deserialize, Eq, PartialEq)] +#[serde(rename_all = "snake_case")] +enum InvalidateProject { + ProjectId(ProjectIdInt), + ProjectIds(Vec), +} +impl std::ops::Deref for InvalidateProject { + type Target = [ProjectIdInt]; + fn deref(&self) -> &Self::Target { + match self { + Self::ProjectId(id) => std::slice::from_ref(id), + Self::ProjectIds(ids) => ids, + } + } } -#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] -pub(crate) struct AllowedVpcEndpointsUpdatedForOrg { - account_id: AccountIdInt, +#[derive(Clone, Debug, Deserialize, Eq, PartialEq)] +#[serde(rename_all = "snake_case")] +enum InvalidateAccount { + AccountId(AccountIdInt), + AccountIds(Vec), +} +impl std::ops::Deref for InvalidateAccount { + type Target = [AccountIdInt]; + fn deref(&self) -> &Self::Target { + match self { + Self::AccountId(id) => std::slice::from_ref(id), + Self::AccountIds(ids) => ids, + } + } } -#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] -pub(crate) struct AllowedVpcEndpointsUpdatedForProjects { - project_ids: Vec, -} - -#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] -pub(crate) struct PasswordUpdate { +#[derive(Clone, Debug, Deserialize, Eq, PartialEq)] +struct InvalidateRole { project_id: ProjectIdInt, role_name: RoleNameInt, } @@ -177,41 +200,29 @@ impl MessageHandler { 
tracing::debug!(?msg, "received a message"); match msg { - Notification::AllowedIpsUpdate { .. } - | Notification::PasswordUpdate { .. } - | Notification::BlockPublicOrVpcAccessUpdated { .. } - | Notification::AllowedVpcEndpointsUpdatedForOrg { .. } - | Notification::AllowedVpcEndpointsUpdatedForProjects { .. } => { + Notification::RoleSettingUpdate { .. } + | Notification::EndpointSettingsUpdate { .. } + | Notification::ProjectSettingsUpdate { .. } + | Notification::AccountSettingsUpdate { .. } => { invalidate_cache(self.cache.clone(), msg.clone()); - if matches!(msg, Notification::AllowedIpsUpdate { .. }) { - Metrics::get() - .proxy - .redis_events_count - .inc(RedisEventsCount::AllowedIpsUpdate); - } else if matches!(msg, Notification::PasswordUpdate { .. }) { - Metrics::get() - .proxy - .redis_events_count - .inc(RedisEventsCount::PasswordUpdate); - } else if matches!( - msg, - Notification::AllowedVpcEndpointsUpdatedForProjects { .. } - ) { - Metrics::get() - .proxy - .redis_events_count - .inc(RedisEventsCount::AllowedVpcEndpointIdsUpdateForProjects); - } else if matches!(msg, Notification::AllowedVpcEndpointsUpdatedForOrg { .. }) { - Metrics::get() - .proxy - .redis_events_count - .inc(RedisEventsCount::AllowedVpcEndpointIdsUpdateForAllProjectsInOrg); - } else if matches!(msg, Notification::BlockPublicOrVpcAccessUpdated { .. }) { - Metrics::get() - .proxy - .redis_events_count - .inc(RedisEventsCount::BlockPublicOrVpcAccessUpdate); + + let m = &Metrics::get().proxy.redis_events_count; + match msg { + Notification::RoleSettingUpdate { .. } => { + m.inc(RedisEventsCount::InvalidateRole); + } + Notification::EndpointSettingsUpdate { .. } => { + m.inc(RedisEventsCount::InvalidateEndpoint); + } + Notification::ProjectSettingsUpdate { .. } => { + m.inc(RedisEventsCount::InvalidateProject); + } + Notification::AccountSettingsUpdate { .. } => { + m.inc(RedisEventsCount::InvalidateOrg); + } + Notification::UnknownTopic => {} } + // TODO: add additional metrics for the other event types. // It might happen that the invalid entry is on the way to be cached. 
@@ -233,30 +244,23 @@ impl MessageHandler { fn invalidate_cache(cache: Arc, msg: Notification) { match msg { - Notification::AllowedIpsUpdate { - allowed_ips_update: AllowedIpsUpdate { project_id }, - } - | Notification::BlockPublicOrVpcAccessUpdated { - block_public_or_vpc_access_updated: BlockPublicOrVpcAccessUpdated { project_id }, - } => cache.invalidate_endpoint_access_for_project(project_id), - Notification::AllowedVpcEndpointsUpdatedForOrg { - allowed_vpc_endpoints_updated_for_org: AllowedVpcEndpointsUpdatedForOrg { account_id }, - } => cache.invalidate_endpoint_access_for_org(account_id), - Notification::AllowedVpcEndpointsUpdatedForProjects { - allowed_vpc_endpoints_updated_for_projects: - AllowedVpcEndpointsUpdatedForProjects { project_ids }, - } => { - for project in project_ids { - cache.invalidate_endpoint_access_for_project(project); - } - } - Notification::PasswordUpdate { - password_update: - PasswordUpdate { - project_id, - role_name, - }, - } => cache.invalidate_role_secret_for_project(project_id, role_name), + Notification::EndpointSettingsUpdate(ids) => ids + .iter() + .for_each(|&id| cache.invalidate_endpoint_access(id)), + + Notification::AccountSettingsUpdate(ids) => ids + .iter() + .for_each(|&id| cache.invalidate_endpoint_access_for_org(id)), + + Notification::ProjectSettingsUpdate(ids) => ids + .iter() + .for_each(|&id| cache.invalidate_endpoint_access_for_project(id)), + + Notification::RoleSettingUpdate(InvalidateRole { + project_id, + role_name, + }) => cache.invalidate_role_secret_for_project(project_id, role_name), + Notification::UnknownTopic => unreachable!(), } } @@ -353,11 +357,32 @@ mod tests { let result: Notification = serde_json::from_str(&text)?; assert_eq!( result, - Notification::AllowedIpsUpdate { - allowed_ips_update: AllowedIpsUpdate { - project_id: (&project_id).into() - } - } + Notification::ProjectSettingsUpdate(InvalidateProject::ProjectId((&project_id).into())) + ); + + Ok(()) + } + + #[test] + fn parse_multiple_projects() -> anyhow::Result<()> { + let project_id1: ProjectId = "new_project1".into(); + let project_id2: ProjectId = "new_project2".into(); + let data = format!("{{\"project_ids\": [\"{project_id1}\",\"{project_id2}\"]}}"); + let text = json!({ + "type": "message", + "topic": "/allowed_vpc_endpoints_updated_for_projects", + "data": data, + "extre_fields": "something" + }) + .to_string(); + + let result: Notification = serde_json::from_str(&text)?; + assert_eq!( + result, + Notification::ProjectSettingsUpdate(InvalidateProject::ProjectIds(vec![ + (&project_id1).into(), + (&project_id2).into() + ])) ); Ok(()) @@ -379,12 +404,10 @@ mod tests { let result: Notification = serde_json::from_str(&text)?; assert_eq!( result, - Notification::PasswordUpdate { - password_update: PasswordUpdate { - project_id: (&project_id).into(), - role_name: (&role_name).into(), - } - } + Notification::RoleSettingUpdate(InvalidateRole { + project_id: (&project_id).into(), + role_name: (&role_name).into(), + }) ); Ok(()) From 3c7235669a3655edf12a47fc3d3b19e75826e6bd Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 6 Jun 2025 17:55:14 +0200 Subject: [PATCH 013/124] pageserver: don't delete parent shard files until split is committed (#12146) ## Problem If a shard split fails and must roll back, the tenant may hit a cold start as the parent shard's files have already been removed from local disk. External contribution with minor adjustments, see https://neondb.slack.com/archives/C08TE3203RQ/p1748246398269309. 
## Summary of changes Keep the parent shard's files on local disk until the split has been committed, such that they are available if the spilt is rolled back. If all else fails, the files will be removed on the next Pageserver restart. This should also be fine in a mixed version: * New storcon, old Pageserver: the Pageserver will delete the files during the split, storcon will log an error when the cleanup detach fails. * Old storcon, new Pageserver: the Pageserver will leave the parent's files around until the next Pageserver restart. The change looks good to me, but shard splits are delicate so I'd like some extra eyes on this. --- pageserver/src/tenant/mgr.rs | 50 ++++++++--- pageserver/src/tenant/timeline.rs | 4 +- storage_controller/src/service.rs | 18 +++- test_runner/regress/test_sharding.py | 87 +++++++++++++++++++ .../regress/test_storage_controller.py | 4 +- 5 files changed, 146 insertions(+), 17 deletions(-) diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 86aef9b42c..186e0f4cdb 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1671,7 +1671,12 @@ impl TenantManager { } } - // Phase 5: Shut down the parent shard, and erase it from disk + // Phase 5: Shut down the parent shard. We leave it on disk in case the split fails and we + // have to roll back to the parent shard, avoiding a cold start. It will be cleaned up once + // the storage controller commits the split, or if all else fails, on the next restart. + // + // TODO: We don't flush the ephemeral layer here, because the split is likely to succeed and + // catching up the parent should be reasonably quick. Consider using FreezeAndFlush instead. let (_guard, progress) = completion::channel(); match parent.shutdown(progress, ShutdownMode::Hard).await { Ok(()) => {} @@ -1679,11 +1684,6 @@ impl TenantManager { other.wait().await; } } - let local_tenant_directory = self.conf.tenant_path(&tenant_shard_id); - let tmp_path = safe_rename_tenant_dir(&local_tenant_directory) - .await - .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?; - self.background_purges.spawn(tmp_path); fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!( "failpoint" @@ -1846,42 +1846,70 @@ impl TenantManager { shutdown_all_tenants0(self.tenants).await } + /// Detaches a tenant, and removes its local files asynchronously. + /// + /// File removal is idempotent: even if the tenant has already been removed, this will still + /// remove any local files. This is used during shard splits, where we leave the parent shard's + /// files around in case we have to roll back the split. pub(crate) async fn detach_tenant( &self, conf: &'static PageServerConf, tenant_shard_id: TenantShardId, deletion_queue_client: &DeletionQueueClient, ) -> Result<(), TenantStateError> { - let tmp_path = self + if let Some(tmp_path) = self .detach_tenant0(conf, tenant_shard_id, deletion_queue_client) - .await?; - self.background_purges.spawn(tmp_path); + .await? + { + self.background_purges.spawn(tmp_path); + } Ok(()) } + /// Detaches a tenant. This renames the tenant directory to a temporary path and returns it, + /// allowing the caller to delete it asynchronously. Returns None if the dir is already removed. 
async fn detach_tenant0( &self, conf: &'static PageServerConf, tenant_shard_id: TenantShardId, deletion_queue_client: &DeletionQueueClient, - ) -> Result { + ) -> Result, TenantStateError> { let tenant_dir_rename_operation = |tenant_id_to_clean: TenantShardId| async move { let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean); + if !tokio::fs::try_exists(&local_tenant_directory).await? { + // If the tenant directory doesn't exist, it's already cleaned up. + return Ok(None); + } safe_rename_tenant_dir(&local_tenant_directory) .await .with_context(|| { format!("local tenant directory {local_tenant_directory:?} rename") }) + .map(Some) }; - let removal_result = remove_tenant_from_memory( + let mut removal_result = remove_tenant_from_memory( self.tenants, tenant_shard_id, tenant_dir_rename_operation(tenant_shard_id), ) .await; + // If the tenant was not found, it was likely already removed. Attempt to remove the tenant + // directory on disk anyway. For example, during shard splits, we shut down and remove the + // parent shard, but leave its directory on disk in case we have to roll back the split. + // + // TODO: it would be better to leave the parent shard attached until the split is committed. + // This will be needed by the gRPC page service too, such that a compute can continue to + // read from the parent shard until it's notified about the new child shards. See: + // . + if let Err(TenantStateError::SlotError(TenantSlotError::NotFound(_))) = removal_result { + removal_result = tenant_dir_rename_operation(tenant_shard_id) + .await + .map_err(TenantStateError::Other); + } + // Flush pending deletions, so that they have a good chance of passing validation // before this tenant is potentially re-attached elsewhere. deletion_queue_client.flush_advisory(); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 3522af2de0..0ff005fbb9 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1055,8 +1055,8 @@ pub(crate) enum WaitLsnWaiter<'a> { /// Argument to [`Timeline::shutdown`]. #[derive(Debug, Clone, Copy)] pub(crate) enum ShutdownMode { - /// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then - /// also to remote storage. This method can easily take multiple seconds for a busy timeline. + /// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk. This method can + /// take multiple seconds for a busy timeline. /// /// While we are flushing, we continue to accept read I/O for LSNs ingested before /// the call to [`Timeline::shutdown`]. diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index cb29993e8c..06318a01b5 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1108,7 +1108,8 @@ impl Service { observed } - /// Used during [`Self::startup_reconcile`]: detach a list of unknown-to-us tenants from pageservers. + /// Used during [`Self::startup_reconcile`] and shard splits: detach a list of unknown-to-us + /// tenants from pageservers. 
/// /// This is safe to run in the background, because if we don't have this TenantShardId in our map of /// tenants, then it is probably something incompletely deleted before: we will not fight with any @@ -6211,7 +6212,11 @@ impl Service { } } - pausable_failpoint!("shard-split-pre-complete"); + fail::fail_point!("shard-split-pre-complete", |_| Err(ApiError::Conflict( + "failpoint".to_string() + ))); + + pausable_failpoint!("shard-split-pre-complete-pause"); // TODO: if the pageserver restarted concurrently with our split API call, // the actual generation of the child shard might differ from the generation @@ -6233,6 +6238,15 @@ impl Service { let (response, child_locations, waiters) = self.tenant_shard_split_commit_inmem(tenant_id, new_shard_count, new_stripe_size); + // Notify all page servers to detach and clean up the old shards because they will no longer + // be needed. This is best-effort: if it fails, it will be cleaned up on a subsequent + // Pageserver re-attach/startup. + let shards_to_cleanup = targets + .iter() + .map(|target| (target.parent_id, target.node.get_id())) + .collect(); + self.cleanup_locations(shards_to_cleanup).await; + // Send compute notifications for all the new shards let mut failed_notifications = Vec::new(); for (child_id, child_ps, stripe_size) in child_locations { diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 4c9887fb92..522e257ea5 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1836,3 +1836,90 @@ def test_sharding_gc( shard_gc_cutoff_lsn = Lsn(shard_index["metadata_bytes"]["latest_gc_cutoff_lsn"]) log.info(f"Shard {shard_number} cutoff LSN: {shard_gc_cutoff_lsn}") assert shard_gc_cutoff_lsn == shard_0_gc_cutoff_lsn + + +def test_split_ps_delete_old_shard_after_commit(neon_env_builder: NeonEnvBuilder): + """ + Check that PageServer only deletes old shards after the split is committed such that it doesn't + have to download a lot of files during abort. + """ + DBNAME = "regression" + + init_shard_count = 4 + neon_env_builder.num_pageservers = init_shard_count + stripe_size = 32 + + env = neon_env_builder.init_start( + initial_tenant_shard_count=init_shard_count, initial_tenant_shard_stripe_size=stripe_size + ) + + env.storage_controller.allowed_errors.extend( + [ + # All split failures log a warning when they enqueue the abort operation + ".*Enqueuing background abort.*", + # Tolerate any error logs that mention a failpoint + ".*failpoint.*", + ] + ) + + endpoint = env.endpoints.create("main") + endpoint.respec(skip_pg_catalog_updates=False) + endpoint.start() + + # Write some initial data. + endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") + endpoint.safe_psql("CREATE TABLE usertable ( YCSB_KEY INT, FIELD0 TEXT);") + + for _ in range(1000): + endpoint.safe_psql( + "INSERT INTO usertable SELECT random(), repeat('a', 1000);", log_query=False + ) + + # Record how many bytes we've downloaded before the split. + def collect_downloaded_bytes() -> list[float | None]: + downloaded_bytes = [] + for page_server in env.pageservers: + metric = page_server.http_client().get_metric_value( + "pageserver_remote_ondemand_downloaded_bytes_total" + ) + downloaded_bytes.append(metric) + return downloaded_bytes + + downloaded_bytes_before = collect_downloaded_bytes() + + # Attempt to split the tenant, but fail the split before it completes. 
+ env.storage_controller.configure_failpoints(("shard-split-pre-complete", "return(1)")) + with pytest.raises(StorageControllerApiException): + env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=16) + + # Wait until split is aborted. + def check_split_is_aborted(): + tenants = env.storage_controller.tenant_list() + assert len(tenants) == 1 + shards = tenants[0]["shards"] + assert len(shards) == 4 + for shard in shards: + assert not shard["is_splitting"] + assert not shard["is_reconciling"] + + # Make sure all new shards have been deleted. + valid_shards = 0 + for ps in env.pageservers: + for tenant_dir in os.listdir(ps.workdir / "tenants"): + try: + tenant_shard_id = TenantShardId.parse(tenant_dir) + valid_shards += 1 + assert tenant_shard_id.shard_count == 4 + except ValueError: + log.info(f"{tenant_dir} is not valid tenant shard id") + assert valid_shards >= 4 + + wait_until(check_split_is_aborted) + + endpoint.safe_psql("SELECT count(*) from usertable;", log_query=False) + + # Make sure we didn't download anything following the aborted split. + downloaded_bytes_after = collect_downloaded_bytes() + + assert downloaded_bytes_before == downloaded_bytes_after + endpoint.stop_and_destroy() diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 5e0dd780c3..8f3aa010e3 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -2956,7 +2956,7 @@ def test_storage_controller_leadership_transfer_during_split( env.storage_controller.allowed_errors.extend( [".*Unexpected child shard count.*", ".*Enqueuing background abort.*"] ) - pause_failpoint = "shard-split-pre-complete" + pause_failpoint = "shard-split-pre-complete-pause" env.storage_controller.configure_failpoints((pause_failpoint, "pause")) split_fut = executor.submit( @@ -3003,7 +3003,7 @@ def test_storage_controller_leadership_transfer_during_split( env.storage_controller.request( "PUT", f"http://127.0.0.1:{storage_controller_1_port}/debug/v1/failpoints", - json=[{"name": "shard-split-pre-complete", "actions": "off"}], + json=[{"name": pause_failpoint, "actions": "off"}], headers=env.storage_controller.headers(TokenScope.ADMIN), ) From 7efd4554ab8e905cfd0cbf3e8a59f34fdfee7345 Mon Sep 17 00:00:00 2001 From: Mikhail Date: Fri, 6 Jun 2025 19:08:02 +0100 Subject: [PATCH 014/124] endpoint_storage: allow bypassing s3 write check on startup (#12165) Related: https://github.com/neondatabase/cloud/issues/27195 --- Cargo.lock | 1 + endpoint_storage/Cargo.toml | 1 + endpoint_storage/src/main.rs | 36 ++++++++++++++++++++++++------------ 3 files changed, 26 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5f71af118c..3ee261e885 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2055,6 +2055,7 @@ dependencies = [ "axum-extra", "camino", "camino-tempfile", + "clap", "futures", "http-body-util", "itertools 0.10.5", diff --git a/endpoint_storage/Cargo.toml b/endpoint_storage/Cargo.toml index b2c9d51551..c2e21d02e2 100644 --- a/endpoint_storage/Cargo.toml +++ b/endpoint_storage/Cargo.toml @@ -8,6 +8,7 @@ anyhow.workspace = true axum-extra.workspace = true axum.workspace = true camino.workspace = true +clap.workspace = true futures.workspace = true jsonwebtoken.workspace = true prometheus.workspace = true diff --git a/endpoint_storage/src/main.rs b/endpoint_storage/src/main.rs index 399a4ec31e..23b7343ff3 100644 --- a/endpoint_storage/src/main.rs +++ b/endpoint_storage/src/main.rs @@ -3,7 +3,8 @@ //! 
This service is deployed either as a separate component or as part of compute image //! for large computes. mod app; -use anyhow::{Context, bail}; +use anyhow::Context; +use clap::Parser; use std::net::{IpAddr, Ipv4Addr, SocketAddr}; use tracing::info; use utils::logging; @@ -17,6 +18,18 @@ const fn listen() -> SocketAddr { SocketAddr::new(IpAddr::V4(Ipv4Addr::new(0, 0, 0, 0)), 51243) } +#[derive(Parser)] +struct Args { + #[arg(exclusive = true)] + config_file: Option, + #[arg(long, default_value = "false", requires = "config")] + /// to allow testing k8s helm chart where we don't have s3 credentials + no_s3_check_on_startup: bool, + #[arg(long, value_name = "FILE")] + /// inline config mode for k8s helm chart + config: Option, +} + #[derive(serde::Deserialize)] #[serde(tag = "type")] struct Config { @@ -37,19 +50,16 @@ async fn main() -> anyhow::Result<()> { logging::Output::Stdout, )?; - // Allow either passing filename or inline config (for k8s helm chart) - let args: Vec = std::env::args().skip(1).collect(); - let config: Config = if args.len() == 1 && args[0].ends_with(".json") { - info!("Reading config from {}", args[0]); - let config = std::fs::read_to_string(args[0].clone())?; + let args = Args::parse(); + let config: Config = if let Some(config_path) = args.config_file { + info!("Reading config from {config_path}"); + let config = std::fs::read_to_string(config_path)?; serde_json::from_str(&config).context("parsing config")? - } else if !args.is_empty() && args[0].starts_with("--config=") { + } else if let Some(config) = args.config { info!("Reading inline config"); - let config = args.join(" "); - let config = config.strip_prefix("--config=").unwrap(); - serde_json::from_str(config).context("parsing config")? + serde_json::from_str(&config).context("parsing config")? } else { - bail!("Usage: endpoint_storage config.json or endpoint_storage --config=JSON"); + anyhow::bail!("Supply either config file path or --config=inline-config"); }; info!("Reading pemfile from {}", config.pemfile.clone()); @@ -62,7 +72,9 @@ async fn main() -> anyhow::Result<()> { let storage = remote_storage::GenericRemoteStorage::from_config(&config.storage_config).await?; let cancel = tokio_util::sync::CancellationToken::new(); - app::check_storage_permissions(&storage, cancel.clone()).await?; + if !args.no_s3_check_on_startup { + app::check_storage_permissions(&storage, cancel.clone()).await?; + } let proxy = std::sync::Arc::new(endpoint_storage::Storage { auth, From 76f95f06d8eac5cd500a867b61fba00c536f6963 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 9 Jun 2025 17:57:36 +0800 Subject: [PATCH 015/124] feat(pageserver): add global timeline count metrics (#12159) ## Problem We are getting tenants with a lot of branches and num of timelines is a good indicator of pageserver loads. I added this metrics to help us better plan pageserver capacities. ## Summary of changes Add `pageserver_timeline_states_count` with two labels: active + offloaded. 
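As a standalone illustration of the RAII-guard idea used for the `offloaded` label, here is a sketch written against the plain `prometheus` crate for brevity; the real code goes through the pageserver's own metrics macros, as the diff below shows:

```rust
use once_cell::sync::Lazy;
use prometheus::{IntGaugeVec, register_int_gauge_vec};

static TIMELINE_STATES: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "pageserver_timeline_states_count",
        "Count of timelines per state",
        &["state"]
    )
    .expect("failed to register gauge")
});

/// Bumps the per-state gauge on creation and decrements it on drop, so the
/// count cannot leak on early returns or error paths.
struct TimelineStateGuard(&'static str);

impl TimelineStateGuard {
    fn new(state: &'static str) -> Self {
        // e.g. scrape output: pageserver_timeline_states_count{state="offloaded"} 7
        TIMELINE_STATES.with_label_values(&[state]).inc();
        Self(state)
    }
}

impl Drop for TimelineStateGuard {
    fn drop(&mut self) {
        TIMELINE_STATES.with_label_values(&[self.0]).dec();
    }
}
```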
Signed-off-by: Alex Chi Z --- pageserver/src/metrics.rs | 13 +++++++++++++ pageserver/src/tenant.rs | 28 +++++++++++++++++++++++++++- 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 3eb70ffac2..3b3522c36a 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1053,6 +1053,15 @@ pub(crate) static TENANT_STATE_METRIC: Lazy = Lazy::new(|| { .expect("Failed to register pageserver_tenant_states_count metric") }); +pub(crate) static TIMELINE_STATE_METRIC: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_timeline_states_count", + "Count of timelines per state", + &["state"] + ) + .expect("Failed to register pageserver_timeline_states_count metric") +}); + /// A set of broken tenants. /// /// These are expected to be so rare that a set is fine. Set as in a new timeseries per each broken @@ -3325,6 +3334,8 @@ impl TimelineMetrics { &timeline_id, ); + TIMELINE_STATE_METRIC.with_label_values(&["active"]).inc(); + TimelineMetrics { tenant_id, shard_id, @@ -3479,6 +3490,8 @@ impl TimelineMetrics { return; } + TIMELINE_STATE_METRIC.with_label_values(&["active"]).dec(); + let tenant_id = &self.tenant_id; let timeline_id = &self.timeline_id; let shard_id = &self.shard_id; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index f9fdc143b4..98a6bc2387 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -89,7 +89,8 @@ use crate::l0_flush::L0FlushGlobalState; use crate::metrics::{ BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, CONCURRENT_INITDBS, INITDB_RUN_TIME, INITDB_SEMAPHORE_ACQUISITION_TIME, TENANT, TENANT_OFFLOADED_TIMELINES, - TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, remove_tenant_metrics, + TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, TIMELINE_STATE_METRIC, + remove_tenant_metrics, }; use crate::task_mgr::TaskKind; use crate::tenant::config::LocationMode; @@ -544,6 +545,28 @@ pub struct OffloadedTimeline { /// Part of the `OffloadedTimeline` object's lifecycle: this needs to be set before we drop it pub deleted_from_ancestor: AtomicBool, + + _metrics_guard: OffloadedTimelineMetricsGuard, +} + +/// Increases the offloaded timeline count metric when created, and decreases when dropped. 
+struct OffloadedTimelineMetricsGuard; + +impl OffloadedTimelineMetricsGuard { + fn new() -> Self { + TIMELINE_STATE_METRIC + .with_label_values(&["offloaded"]) + .inc(); + Self + } +} + +impl Drop for OffloadedTimelineMetricsGuard { + fn drop(&mut self) { + TIMELINE_STATE_METRIC + .with_label_values(&["offloaded"]) + .dec(); + } } impl OffloadedTimeline { @@ -576,6 +599,8 @@ impl OffloadedTimeline { delete_progress: timeline.delete_progress.clone(), deleted_from_ancestor: AtomicBool::new(false), + + _metrics_guard: OffloadedTimelineMetricsGuard::new(), }) } fn from_manifest(tenant_shard_id: TenantShardId, manifest: &OffloadedTimelineManifest) -> Self { @@ -595,6 +620,7 @@ impl OffloadedTimeline { archived_at, delete_progress: TimelineDeleteProgress::default(), deleted_from_ancestor: AtomicBool::new(false), + _metrics_guard: OffloadedTimelineMetricsGuard::new(), } } fn manifest(&self) -> OffloadedTimelineManifest { From d759fcb8bdb143a4d9c52a6fb58d79c5939a5d5e Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 9 Jun 2025 21:01:30 +0300 Subject: [PATCH 016/124] Increase wait LFC prewarm timeout (#12174) ## Problem See https://github.com/neondatabase/neon/issues/12171 ## Summary of changes Increase LFC prewarm wait timeout to 1 minute Co-authored-by: Konstantin Knizhnik --- test_runner/fixtures/endpoint/http.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index 4b4b98aa6c..6d37dd1cb1 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -77,7 +77,7 @@ class EndpointHttpClient(requests.Session): status, err = json["status"], json.get("error") assert status == "completed", f"{status}, error {err}" - wait_until(prewarmed) + wait_until(prewarmed, timeout=60) def offload_lfc(self): url = f"http://localhost:{self.external_port}/lfc/offload" From f42d44342dfc67b38ff9aa57b4a541933fa671f3 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 10 Jun 2025 08:32:03 +0300 Subject: [PATCH 017/124] Increase statement timeout for test_pageserver_restarts_under_workload test (#12139) \## Problem See https://github.com/neondatabase/neon/issues/12119#issuecomment-2942586090 Page server restarts with interval 1 seconds increases time of vacuum especially off prefetch is enabled and so cause test failure because of statement timeout expiration. ## Summary of changes Increase statement timeout to 360 seconds. --------- Co-authored-by: Konstantin Knizhnik Co-authored-by: Alexander Lakhin --- .../regress/test_pageserver_restarts_under_workload.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/test_runner/regress/test_pageserver_restarts_under_workload.py b/test_runner/regress/test_pageserver_restarts_under_workload.py index 9f19c887a4..6b33b3e046 100644 --- a/test_runner/regress/test_pageserver_restarts_under_workload.py +++ b/test_runner/regress/test_pageserver_restarts_under_workload.py @@ -16,7 +16,7 @@ if TYPE_CHECKING: # Test restarting page server, while safekeeper and compute node keep # running. 
-def test_pageserver_restarts_under_worload(neon_simple_env: NeonEnv, pg_bin: PgBin): +def test_pageserver_restarts_under_workload(neon_simple_env: NeonEnv, pg_bin: PgBin): env = neon_simple_env env.create_branch("test_pageserver_restarts") endpoint = env.endpoints.create_start("test_pageserver_restarts") @@ -28,7 +28,11 @@ def test_pageserver_restarts_under_worload(neon_simple_env: NeonEnv, pg_bin: PgB pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", f"-s{scale}", connstr]) pg_bin.run_capture(["pgbench", f"-T{n_restarts}", connstr]) - thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True) + thread = threading.Thread( + target=run_pgbench, + args=(endpoint.connstr(options="-cstatement_timeout=360s"),), + daemon=True, + ) thread.start() for _ in range(n_restarts): From 7a68699abbb1af4ccae405bf517abc7e615324c3 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 10 Jun 2025 13:32:58 +0800 Subject: [PATCH 018/124] feat(pageserver): support azure time-travel recovery (in an okay way) (#12140) ## Problem part of https://github.com/neondatabase/neon/issues/7546 Add Azure time travel recovery support. The tricky thing is how Azure handles deletes in its blob version API. For the following sequence: ``` upload file_1 = a upload file_1 = b delete file_1 upload file_1 = c ``` The "delete file_1" won't be stored as a version (as AWS did). Therefore, we can never rollback to a state where file_1 is temporarily invisible. If we roll back to the time before file_1 gets created for the first time, it will be removed correctly. However, this is fine for pageservers, because (1) having extra files in the tenant storage is usually fine (2) for things like timelines/X/index_part-Y.json, it will only be deleted once, so it can always be recovered to a correct state. Therefore, I don't expect any issues when this functionality is used on pageserver recovery. TODO: unit tests for time-travel recovery. ## Summary of changes Add Azure blob storage time-travel recovery support. 
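To make the per-key restore rule concrete, here is a self-contained sketch of the decision described above; the types and field names are illustrative, not the `RemoteStorage` API. On Azure, the delete-marker entry is exactly the information that can be missing from the listing, so a key that was deleted and later re-uploaded gets a version promoted where a strict point-in-time restore would have deleted it — which is the caveat discussed above.

```rust
use std::time::SystemTime;

// Illustrative types only; the real code walks the `Version`/`VersionKind`
// values returned by the remote_storage listing.
struct Ver {
    modified: SystemTime,
    is_delete_marker: bool,
}

enum Action {
    Keep,                  // no changes after the restore point
    PromoteVersion(usize), // copy versions[i] back on top of the key
    Delete,                // key should not exist at the restore point
}

/// `versions` must be sorted by `modified`, oldest first.
fn decide(versions: &[Ver], restore_to: SystemTime) -> Action {
    // Index of the first version strictly newer than the restore point.
    let idx = versions.partition_point(|v| v.modified <= restore_to);
    if idx == versions.len() {
        Action::Keep
    } else if idx == 0 {
        Action::Delete
    } else if versions[idx - 1].is_delete_marker {
        Action::Delete
    } else {
        Action::PromoteVersion(idx - 1)
    }
}
```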
Signed-off-by: Alex Chi Z --- libs/remote_storage/src/azure_blob.rs | 217 ++++++++++++++++++++++---- 1 file changed, 189 insertions(+), 28 deletions(-) diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 5363e935e3..f64cd9e206 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -10,7 +10,7 @@ use std::sync::Arc; use std::time::{Duration, SystemTime}; use std::{env, io}; -use anyhow::{Context, Result}; +use anyhow::{Context, Result, anyhow}; use azure_core::request_options::{IfMatchCondition, MaxResults, Metadata, Range}; use azure_core::{Continuable, HttpClient, RetryOptions, TransportOptions}; use azure_storage::StorageCredentials; @@ -37,6 +37,7 @@ use crate::metrics::{AttemptOutcome, RequestKind, start_measuring_requests}; use crate::{ ConcurrencyLimiter, Download, DownloadError, DownloadKind, DownloadOpts, Listing, ListingMode, ListingObject, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel, + Version, VersionKind, }; pub struct AzureBlobStorage { @@ -405,6 +406,39 @@ impl AzureBlobStorage { pub fn container_name(&self) -> &str { &self.container_name } + + async fn list_versions_with_permit( + &self, + _permit: &tokio::sync::SemaphorePermit<'_>, + prefix: Option<&RemotePath>, + mode: ListingMode, + max_keys: Option, + cancel: &CancellationToken, + ) -> Result { + let customize_builder = |mut builder: ListBlobsBuilder| { + builder = builder.include_versions(true); + // We do not return this info back to `VersionListing` yet. + builder = builder.include_deleted(true); + builder + }; + let kind = RequestKind::ListVersions; + + let mut stream = std::pin::pin!(self.list_streaming_for_fn( + prefix, + mode, + max_keys, + cancel, + kind, + customize_builder + )); + let mut combined: crate::VersionListing = + stream.next().await.expect("At least one item required")?; + while let Some(list) = stream.next().await { + let list = list?; + combined.versions.extend(list.versions.into_iter()); + } + Ok(combined) + } } trait ListingCollector { @@ -488,27 +522,10 @@ impl RemoteStorage for AzureBlobStorage { max_keys: Option, cancel: &CancellationToken, ) -> std::result::Result { - let customize_builder = |mut builder: ListBlobsBuilder| { - builder = builder.include_versions(true); - builder - }; let kind = RequestKind::ListVersions; - - let mut stream = std::pin::pin!(self.list_streaming_for_fn( - prefix, - mode, - max_keys, - cancel, - kind, - customize_builder - )); - let mut combined: crate::VersionListing = - stream.next().await.expect("At least one item required")?; - while let Some(list) = stream.next().await { - let list = list?; - combined.versions.extend(list.versions.into_iter()); - } - Ok(combined) + let permit = self.permit(kind, cancel).await?; + self.list_versions_with_permit(&permit, prefix, mode, max_keys, cancel) + .await } async fn head_object( @@ -803,14 +820,158 @@ impl RemoteStorage for AzureBlobStorage { async fn time_travel_recover( &self, - _prefix: Option<&RemotePath>, - _timestamp: SystemTime, - _done_if_after: SystemTime, - _cancel: &CancellationToken, + prefix: Option<&RemotePath>, + timestamp: SystemTime, + done_if_after: SystemTime, + cancel: &CancellationToken, ) -> Result<(), TimeTravelError> { - // TODO use Azure point in time recovery feature for this - // https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview - Err(TimeTravelError::Unimplemented) + let msg = "PLEASE NOTE: Azure Blob storage time-travel recovery may not 
work as expected " + .to_string() + + "for some specific files. If a file gets deleted but then overwritten and we want to recover " + + "to the time during the file was not present, this functionality will recover the file. Only " + + "use the functionality for services that can tolerate this. For example, recovering a state of the " + + "pageserver tenants."; + tracing::error!("{}", msg); + + let kind = RequestKind::TimeTravel; + let permit = self.permit(kind, cancel).await?; + + let mode = ListingMode::NoDelimiter; + let version_listing = self + .list_versions_with_permit(&permit, prefix, mode, None, cancel) + .await + .map_err(|err| match err { + DownloadError::Other(e) => TimeTravelError::Other(e), + DownloadError::Cancelled => TimeTravelError::Cancelled, + other => TimeTravelError::Other(other.into()), + })?; + let versions_and_deletes = version_listing.versions; + + tracing::info!( + "Built list for time travel with {} versions and deletions", + versions_and_deletes.len() + ); + + // Work on the list of references instead of the objects directly, + // otherwise we get lifetime errors in the sort_by_key call below. + let mut versions_and_deletes = versions_and_deletes.iter().collect::>(); + + versions_and_deletes.sort_by_key(|vd| (&vd.key, &vd.last_modified)); + + let mut vds_for_key = HashMap::<_, Vec<_>>::new(); + + for vd in &versions_and_deletes { + let Version { key, .. } = &vd; + let version_id = vd.version_id().map(|v| v.0.as_str()); + if version_id == Some("null") { + return Err(TimeTravelError::Other(anyhow!( + "Received ListVersions response for key={key} with version_id='null', \ + indicating either disabled versioning, or legacy objects with null version id values" + ))); + } + tracing::trace!("Parsing version key={key} kind={:?}", vd.kind); + + vds_for_key.entry(key).or_default().push(vd); + } + + let warn_threshold = 3; + let max_retries = 10; + let is_permanent = |e: &_| matches!(e, TimeTravelError::Cancelled); + + for (key, versions) in vds_for_key { + let last_vd = versions.last().unwrap(); + let key = self.relative_path_to_name(key); + if last_vd.last_modified > done_if_after { + tracing::debug!("Key {key} has version later than done_if_after, skipping"); + continue; + } + // the version we want to restore to. + let version_to_restore_to = + match versions.binary_search_by_key(×tamp, |tpl| tpl.last_modified) { + Ok(v) => v, + Err(e) => e, + }; + if version_to_restore_to == versions.len() { + tracing::debug!("Key {key} has no changes since timestamp, skipping"); + continue; + } + let mut do_delete = false; + if version_to_restore_to == 0 { + // All versions more recent, so the key didn't exist at the specified time point. + tracing::debug!( + "All {} versions more recent for {key}, deleting", + versions.len() + ); + do_delete = true; + } else { + match &versions[version_to_restore_to - 1] { + Version { + kind: VersionKind::Version(version_id), + .. + } => { + let source_url = format!( + "{}/{}?versionid={}", + self.client + .url() + .map_err(|e| TimeTravelError::Other(anyhow!("{e}")))?, + key, + version_id.0 + ); + tracing::debug!( + "Promoting old version {} for {key} at {}...", + version_id.0, + source_url + ); + backoff::retry( + || async { + let blob_client = self.client.blob_client(key.clone()); + let op = blob_client.copy(Url::from_str(&source_url).unwrap()); + tokio::select! 
{ + res = op => res.map_err(|e| TimeTravelError::Other(e.into())), + _ = cancel.cancelled() => Err(TimeTravelError::Cancelled), + } + }, + is_permanent, + warn_threshold, + max_retries, + "copying object version for time_travel_recover", + cancel, + ) + .await + .ok_or_else(|| TimeTravelError::Cancelled) + .and_then(|x| x)?; + tracing::info!(?version_id, %key, "Copied old version in Azure blob storage"); + } + Version { + kind: VersionKind::DeletionMarker, + .. + } => { + do_delete = true; + } + } + }; + if do_delete { + if matches!(last_vd.kind, VersionKind::DeletionMarker) { + // Key has since been deleted (but there was some history), no need to do anything + tracing::debug!("Key {key} already deleted, skipping."); + } else { + tracing::debug!("Deleting {key}..."); + + self.delete(&RemotePath::from_string(&key).unwrap(), cancel) + .await + .map_err(|e| { + // delete_oid0 will use TimeoutOrCancel + if TimeoutOrCancel::caused_by_cancel(&e) { + TimeTravelError::Cancelled + } else { + TimeTravelError::Other(e) + } + })?; + } + } + } + + Ok(()) } } From 40d7583906643fe21829fb0ce99de05b335b0f23 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 10 Jun 2025 15:10:41 +0800 Subject: [PATCH 019/124] feat(pageserver): use hostname as feature flag resolver property (#12141) ## Problem part of https://github.com/neondatabase/neon/issues/11813 ## Summary of changes Collect pageserver hostname property so that we can use it in the PostHog UI. Not sure if this is the best way to do that -- open to suggestions. --------- Signed-off-by: Alex Chi Z --- libs/remote_storage/src/s3_bucket.rs | 1 + pageserver/src/feature_resolver.rs | 31 +++++++++++++++++++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index d98ff552ee..004aad447e 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -1022,6 +1022,7 @@ impl RemoteStorage for S3Bucket { let Version { key, .. } = &vd; let version_id = vd.version_id().map(|v| v.0.as_str()); if version_id == Some("null") { + // TODO: check the behavior of using the SDK on a non-versioned container return Err(TimeTravelError::Other(anyhow!( "Received ListVersions response for key={key} with version_id='null', \ indicating either disabled versioning, or legacy objects with null version id values" diff --git a/pageserver/src/feature_resolver.rs b/pageserver/src/feature_resolver.rs index 50de3b691c..84edd68011 100644 --- a/pageserver/src/feature_resolver.rs +++ b/pageserver/src/feature_resolver.rs @@ -1,5 +1,6 @@ use std::{collections::HashMap, sync::Arc, time::Duration}; +use pageserver_api::config::NodeMetadata; use posthog_client_lite::{ CaptureEvent, FeatureResolverBackgroundLoop, PostHogClientConfig, PostHogEvaluationError, PostHogFlagFilterPropertyValue, @@ -86,7 +87,35 @@ impl FeatureResolver { } } } - // TODO: add pageserver URL. 
+ // TODO: move this to a background task so that we don't block startup in case of slow disk + let metadata_path = conf.metadata_path(); + match std::fs::read_to_string(&metadata_path) { + Ok(metadata_str) => match serde_json::from_str::(&metadata_str) { + Ok(metadata) => { + properties.insert( + "hostname".to_string(), + PostHogFlagFilterPropertyValue::String(metadata.http_host), + ); + if let Some(cplane_region) = metadata.other.get("region_id") { + if let Some(cplane_region) = cplane_region.as_str() { + // This region contains the cell number + properties.insert( + "neon_region".to_string(), + PostHogFlagFilterPropertyValue::String( + cplane_region.to_string(), + ), + ); + } + } + } + Err(e) => { + tracing::warn!("Failed to parse metadata.json: {}", e); + } + }, + Err(e) => { + tracing::warn!("Failed to read metadata.json: {}", e); + } + } Arc::new(properties) }; let fake_tenants = { From 73be6bb7365762c525dae5ac489ac4d26895a383 Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Tue, 10 Jun 2025 11:16:42 +0400 Subject: [PATCH 020/124] fix(compute): use proper safekeeper in VotesCollectedMset (#12175) ## Problem `VotesCollectedMset` uses the wrong safekeeper to update truncateLsn. This led to some failed assert later in the code during running safekeeper migration tests. - Relates to https://github.com/neondatabase/neon/issues/11823 ## Summary of changes Use proper safekeeper to update truncateLsn in VotesCollectedMset --- pgxn/neon/walproposer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 91d39345e2..ba6e4a54ff 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -1135,7 +1135,7 @@ VotesCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInf wp->propTermStartLsn = sk->voteResponse.flushLsn; wp->donor = sk; } - wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn); + wp->truncateLsn = Max(sk->voteResponse.truncateLsn, wp->truncateLsn); if (n_votes > 0) appendStringInfoString(s, ", "); From 58327ef74d86951eedcf2079116dbd3dd073d15e Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 10 Jun 2025 09:46:29 +0100 Subject: [PATCH 021/124] [proxy] fix sql-over-http password setting (#12177) ## Problem Looks like our sql-over-http tests get to rely on "trust" authentication, so the path that made sure the authkeys data was set was never being hit. ## Summary of changes Slight refactor to WakeComputeBackends, as well as making sure auth keys are propagated. Fix tests to ensure passwords are tested. 
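The reason the bug was invisible, in `pg_hba.conf` terms (illustrative lines; the test fixture below switches its host rules to the second form):

```
# With "trust" the compute accepts any credentials, so a proxy that forgot to
# forward SCRAM keys still connects; with "scram-sha-256" that bug fails the
# handshake immediately.
host all all 127.0.0.1/32 trust
host all all 127.0.0.1/32 scram-sha-256
```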
--- proxy/src/auth/backend/console_redirect.rs | 4 +-- proxy/src/auth/backend/mod.rs | 6 ++--- proxy/src/compute/mod.rs | 4 +-- proxy/src/pglb/connect_compute.rs | 4 +-- proxy/src/proxy/mod.rs | 8 +++--- proxy/src/proxy/tests/mod.rs | 22 +++++---------- proxy/src/proxy/wake_compute.rs | 4 +-- proxy/src/serverless/backend.rs | 27 ++++++++++--------- test_runner/fixtures/neon_fixtures.py | 10 +++++++ test_runner/regress/test_proxy_allowed_ips.py | 16 ++++++++--- 10 files changed, 59 insertions(+), 46 deletions(-) diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index 455d96c90a..a7133b22e5 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -14,7 +14,7 @@ use crate::context::RequestContext; use crate::control_plane::client::cplane_proxy_v1; use crate::control_plane::{self, CachedNodeInfo, NodeInfo}; use crate::error::{ReportableError, UserFacingError}; -use crate::pglb::connect_compute::ComputeConnectBackend; +use crate::pglb::connect_compute::WakeComputeBackend; use crate::pqproto::BeMessage; use crate::proxy::NeonOptions; use crate::stream::PqStream; @@ -109,7 +109,7 @@ impl ConsoleRedirectBackend { pub struct ConsoleRedirectNodeInfo(pub(super) NodeInfo); #[async_trait] -impl ComputeConnectBackend for ConsoleRedirectNodeInfo { +impl WakeComputeBackend for ConsoleRedirectNodeInfo { async fn wake_compute( &self, _ctx: &RequestContext, diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index edc1ae06d9..a153ea4d42 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -25,7 +25,7 @@ use crate::control_plane::{ RoleAccessControl, }; use crate::intern::EndpointIdInt; -use crate::pglb::connect_compute::ComputeConnectBackend; +use crate::pglb::connect_compute::WakeComputeBackend; use crate::pqproto::BeMessage; use crate::proxy::NeonOptions; use crate::rate_limiter::EndpointRateLimiter; @@ -407,13 +407,13 @@ impl Backend<'_, ComputeUserInfo> { } #[async_trait::async_trait] -impl ComputeConnectBackend for Backend<'_, ComputeCredentials> { +impl WakeComputeBackend for Backend<'_, ComputeUserInfo> { async fn wake_compute( &self, ctx: &RequestContext, ) -> Result { match self { - Self::ControlPlane(api, creds) => api.wake_compute(ctx, &creds.info).await, + Self::ControlPlane(api, info) => api.wake_compute(ctx, info).await, Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())), } } diff --git a/proxy/src/compute/mod.rs b/proxy/src/compute/mod.rs index 0dacd15547..aae1fea07d 100644 --- a/proxy/src/compute/mod.rs +++ b/proxy/src/compute/mod.rs @@ -136,11 +136,11 @@ impl AuthInfo { } } - pub(crate) fn with_auth_keys(keys: &ComputeCredentialKeys) -> Self { + pub(crate) fn with_auth_keys(keys: ComputeCredentialKeys) -> Self { Self { auth: match keys { ComputeCredentialKeys::AuthKeys(AuthKeys::ScramSha256(auth_keys)) => { - Some(Auth::Scram(Box::new(*auth_keys))) + Some(Auth::Scram(Box::new(auth_keys))) } ComputeCredentialKeys::JwtPayload(_) | ComputeCredentialKeys::None => None, }, diff --git a/proxy/src/pglb/connect_compute.rs b/proxy/src/pglb/connect_compute.rs index 1807cdff0e..4984588f75 100644 --- a/proxy/src/pglb/connect_compute.rs +++ b/proxy/src/pglb/connect_compute.rs @@ -50,7 +50,7 @@ pub(crate) trait ConnectMechanism { } #[async_trait] -pub(crate) trait ComputeConnectBackend { +pub(crate) trait WakeComputeBackend { async fn wake_compute( &self, ctx: &RequestContext, @@ -91,7 +91,7 @@ impl ConnectMechanism for TcpMechanism 
{ /// Try to connect to the compute node, retrying if necessary. #[tracing::instrument(skip_all)] -pub(crate) async fn connect_to_compute( +pub(crate) async fn connect_to_compute( ctx: &RequestContext, mechanism: &M, user_info: &B, diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 0e00c4f97e..4a294c1e82 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -358,12 +358,12 @@ pub(crate) async fn handle_client( } }; - let creds = match &user_info { - auth::Backend::ControlPlane(_, creds) => creds, + let (cplane, creds) = match user_info { + auth::Backend::ControlPlane(cplane, creds) => (cplane, creds), auth::Backend::Local(_) => unreachable!("local proxy does not run tcp proxy service"), }; let params_compat = creds.info.options.get(NeonOptions::PARAMS_COMPAT).is_some(); - let mut auth_info = compute::AuthInfo::with_auth_keys(&creds.keys); + let mut auth_info = compute::AuthInfo::with_auth_keys(creds.keys); auth_info.set_startup_params(¶ms, params_compat); let res = connect_to_compute( @@ -373,7 +373,7 @@ pub(crate) async fn handle_client( auth: auth_info, locks: &config.connect_compute_locks, }, - &user_info, + &auth::Backend::ControlPlane(cplane, creds.info), config.wake_compute_retry_config, &config.connect_to_compute, ) diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 028247a97d..de85bf9df2 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -8,7 +8,7 @@ use std::time::Duration; use anyhow::{Context, bail}; use async_trait::async_trait; use http::StatusCode; -use postgres_client::config::{AuthKeys, ScramKeys, SslMode}; +use postgres_client::config::SslMode; use postgres_client::tls::{MakeTlsConnect, NoTls}; use retry::{ShouldRetryWakeCompute, retry_after}; use rstest::rstest; @@ -19,9 +19,7 @@ use tracing_test::traced_test; use super::retry::CouldRetry; use super::*; -use crate::auth::backend::{ - ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, MaybeOwned, -}; +use crate::auth::backend::{ComputeUserInfo, MaybeOwned}; use crate::config::{ComputeConfig, RetryConfig}; use crate::control_plane::client::{ControlPlaneClient, TestControlPlaneClient}; use crate::control_plane::messages::{ControlPlaneErrorMessage, Details, MetricsAuxInfo, Status}; @@ -575,19 +573,13 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn fn helper_create_connect_info( mechanism: &TestConnectMechanism, -) -> auth::Backend<'static, ComputeCredentials> { +) -> auth::Backend<'static, ComputeUserInfo> { auth::Backend::ControlPlane( MaybeOwned::Owned(ControlPlaneClient::Test(Box::new(mechanism.clone()))), - ComputeCredentials { - info: ComputeUserInfo { - endpoint: "endpoint".into(), - user: "user".into(), - options: NeonOptions::parse_options_raw(""), - }, - keys: ComputeCredentialKeys::AuthKeys(AuthKeys::ScramSha256(ScramKeys { - client_key: [0; 32], - server_key: [0; 32], - })), + ComputeUserInfo { + endpoint: "endpoint".into(), + user: "user".into(), + options: NeonOptions::parse_options_raw(""), }, ) } diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index 06c2da58db..fd59800745 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -8,7 +8,7 @@ use crate::error::ReportableError; use crate::metrics::{ ConnectOutcome, ConnectionFailuresBreakdownGroup, Metrics, RetriesMetricGroup, RetryType, }; -use crate::pglb::connect_compute::ComputeConnectBackend; +use crate::pglb::connect_compute::WakeComputeBackend; use 
crate::proxy::retry::{retry_after, should_retry}; // Use macro to retain original callsite. @@ -23,7 +23,7 @@ macro_rules! log_wake_compute_error { }; } -pub(crate) async fn wake_compute( +pub(crate) async fn wake_compute( num_retries: &mut u32, ctx: &RequestContext, api: &B, diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index a0e782dab0..d74e3cad3d 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -21,7 +21,7 @@ use super::conn_pool_lib::{Client, ConnInfo, EndpointConnPool, GlobalConnPool}; use super::http_conn_pool::{self, HttpConnPool, Send, poll_http2_client}; use super::local_conn_pool::{self, EXT_NAME, EXT_SCHEMA, EXT_VERSION, LocalConnPool}; use crate::auth::backend::local::StaticAuthRules; -use crate::auth::backend::{ComputeCredentials, ComputeUserInfo}; +use crate::auth::backend::{ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo}; use crate::auth::{self, AuthError}; use crate::compute_ctl::{ ComputeCtlError, ExtensionInstallRequest, Privilege, SetRoleGrantsRequest, @@ -180,7 +180,7 @@ impl PoolingBackend { let conn_id = uuid::Uuid::new_v4(); tracing::Span::current().record("conn_id", display(conn_id)); info!(%conn_id, "pool: opening a new connection '{conn_info}'"); - let backend = self.auth_backend.as_ref().map(|()| keys); + let backend = self.auth_backend.as_ref().map(|()| keys.info); crate::pglb::connect_compute::connect_to_compute( ctx, &TokioMechanism { @@ -188,6 +188,7 @@ impl PoolingBackend { conn_info, pool: self.pool.clone(), locks: &self.config.connect_compute_locks, + keys: keys.keys, }, &backend, self.config.wake_compute_retry_config, @@ -214,16 +215,13 @@ impl PoolingBackend { let conn_id = uuid::Uuid::new_v4(); tracing::Span::current().record("conn_id", display(conn_id)); debug!(%conn_id, "pool: opening a new connection '{conn_info}'"); - let backend = self.auth_backend.as_ref().map(|()| ComputeCredentials { - info: ComputeUserInfo { - user: conn_info.user_info.user.clone(), - endpoint: EndpointId::from(format!( - "{}{LOCAL_PROXY_SUFFIX}", - conn_info.user_info.endpoint.normalize() - )), - options: conn_info.user_info.options.clone(), - }, - keys: crate::auth::backend::ComputeCredentialKeys::None, + let backend = self.auth_backend.as_ref().map(|()| ComputeUserInfo { + user: conn_info.user_info.user.clone(), + endpoint: EndpointId::from(format!( + "{}{LOCAL_PROXY_SUFFIX}", + conn_info.user_info.endpoint.normalize() + )), + options: conn_info.user_info.options.clone(), }); crate::pglb::connect_compute::connect_to_compute( ctx, @@ -495,6 +493,7 @@ struct TokioMechanism { pool: Arc>>, conn_info: ConnInfo, conn_id: uuid::Uuid, + keys: ComputeCredentialKeys, /// connect_to_compute concurrency lock locks: &'static ApiLocks, @@ -520,6 +519,10 @@ impl ConnectMechanism for TokioMechanism { .dbname(&self.conn_info.dbname) .connect_timeout(compute_config.timeout); + if let ComputeCredentialKeys::AuthKeys(auth_keys) = self.keys { + config.auth_keys(auth_keys); + } + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let res = config.connect(compute_config).await; drop(pause); diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 5223e34baf..84caf9e2af 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4046,6 +4046,16 @@ def static_proxy( "CREATE TABLE neon_control_plane.endpoints (endpoint_id VARCHAR(255) PRIMARY KEY, allowed_ips VARCHAR(255))" ) + vanilla_pg.stop() + vanilla_pg.edit_hba( + [ 
+ "local all all trust", + "host all all 127.0.0.1/32 scram-sha-256", + "host all all ::1/128 scram-sha-256", + ] + ) + vanilla_pg.start() + proxy_port = port_distributor.get_port() mgmt_port = port_distributor.get_port() http_port = port_distributor.get_port() diff --git a/test_runner/regress/test_proxy_allowed_ips.py b/test_runner/regress/test_proxy_allowed_ips.py index 7384326385..5ac74585b9 100644 --- a/test_runner/regress/test_proxy_allowed_ips.py +++ b/test_runner/regress/test_proxy_allowed_ips.py @@ -19,11 +19,15 @@ TABLE_NAME = "neon_control_plane.endpoints" async def test_proxy_psql_allowed_ips(static_proxy: NeonProxy, vanilla_pg: VanillaPostgres): # Shouldn't be able to connect to this project vanilla_pg.safe_psql( - f"INSERT INTO {TABLE_NAME} (endpoint_id, allowed_ips) VALUES ('private-project', '8.8.8.8')" + f"INSERT INTO {TABLE_NAME} (endpoint_id, allowed_ips) VALUES ('private-project', '8.8.8.8')", + user="proxy", + password="password", ) # Should be able to connect to this project vanilla_pg.safe_psql( - f"INSERT INTO {TABLE_NAME} (endpoint_id, allowed_ips) VALUES ('generic-project', '::1,127.0.0.1')" + f"INSERT INTO {TABLE_NAME} (endpoint_id, allowed_ips) VALUES ('generic-project', '::1,127.0.0.1')", + user="proxy", + password="password", ) def check_cannot_connect(**kwargs): @@ -60,7 +64,9 @@ async def test_proxy_http_allowed_ips(static_proxy: NeonProxy, vanilla_pg: Vanil # Shouldn't be able to connect to this project vanilla_pg.safe_psql( - f"INSERT INTO {TABLE_NAME} (endpoint_id, allowed_ips) VALUES ('proxy', '8.8.8.8')" + f"INSERT INTO {TABLE_NAME} (endpoint_id, allowed_ips) VALUES ('proxy', '8.8.8.8')", + user="proxy", + password="password", ) def query(status: int, query: str, *args): @@ -75,6 +81,8 @@ async def test_proxy_http_allowed_ips(static_proxy: NeonProxy, vanilla_pg: Vanil query(400, "select 1;") # ip address is not allowed # Should be able to connect to this project vanilla_pg.safe_psql( - f"UPDATE {TABLE_NAME} SET allowed_ips = '8.8.8.8,127.0.0.1' WHERE endpoint_id = 'proxy'" + f"UPDATE {TABLE_NAME} SET allowed_ips = '8.8.8.8,127.0.0.1' WHERE endpoint_id = 'proxy'", + user="proxy", + password="password", ) query(200, "select 1;") # should work now From 02f94edb60a7f4d290cbaf424436245a179b1859 Mon Sep 17 00:00:00 2001 From: Trung Dinh Date: Tue, 10 Jun 2025 02:26:40 -0700 Subject: [PATCH 022/124] Remove global static TENANTS (#12169) ## Problem There is this TODO in code: https://github.com/neondatabase/neon/blob/main/pageserver/src/tenant/mgr.rs#L300-L302 This is an old TODO by @jcsp. ## Summary of changes This PR addresses the TODO. Specifically, it removes a global static `TENANTS`. Instead the `TenantManager` now directly manages the tenant map. Enhancing abstraction. Essentially, this PR moves all module-level methods to inside the implementation of `TenantManager`. 
--- pageserver/src/bin/pageserver.rs | 7 +- pageserver/src/tenant/mgr.rs | 725 ++++++++++++++++--------------- 2 files changed, 388 insertions(+), 344 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 5cd865f53e..417503089a 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -573,7 +573,8 @@ fn start_pageserver( tokio::sync::mpsc::unbounded_channel(); let deletion_queue_client = deletion_queue.new_client(); let background_purges = mgr::BackgroundPurges::default(); - let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr( + + let tenant_manager = mgr::init( conf, background_purges.clone(), TenantSharedResources { @@ -584,10 +585,10 @@ fn start_pageserver( basebackup_prepare_sender, feature_resolver, }, - order, shutdown_pageserver.clone(), - ))?; + ); let tenant_manager = Arc::new(tenant_manager); + BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(tenant_manager.clone(), order))?; let basebackup_cache = BasebackupCache::spawn( BACKGROUND_RUNTIME.handle(), diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 186e0f4cdb..4aa459e923 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -12,7 +12,6 @@ use anyhow::Context; use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf}; use futures::StreamExt; use itertools::Itertools; -use once_cell::sync::Lazy; use pageserver_api::key::Key; use pageserver_api::models::{DetachBehavior, LocationConfigMode}; use pageserver_api::shard::{ @@ -103,7 +102,7 @@ pub(crate) enum TenantsMap { /// [`init_tenant_mgr`] is not done yet. Initializing, /// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded. - /// New tenants can be added using [`tenant_map_acquire_slot`]. + /// New tenants can be added using [`TenantManager::tenant_map_acquire_slot`]. Open(BTreeMap), /// The pageserver has entered shutdown mode via [`TenantManager::shutdown`]. /// Existing tenants are still accessible, but no new tenants can be created. @@ -284,9 +283,6 @@ impl BackgroundPurges { } } -static TENANTS: Lazy> = - Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing)); - /// Responsible for storing and mutating the collection of all tenants /// that this pageserver has state for. /// @@ -297,10 +293,7 @@ static TENANTS: Lazy> = /// and attached modes concurrently. pub struct TenantManager { conf: &'static PageServerConf, - // TODO: currently this is a &'static pointing to TENANTs. When we finish refactoring - // out of that static variable, the TenantManager can own this. - // See https://github.com/neondatabase/neon/issues/5796 - tenants: &'static std::sync::RwLock, + tenants: std::sync::RwLock, resources: TenantSharedResources, // Long-running operations that happen outside of a [`Tenant`] lifetime should respect this token. @@ -479,21 +472,43 @@ pub(crate) enum DeleteTenantError { Other(#[from] anyhow::Error), } -/// Initialize repositories with locally available timelines. +/// Initialize repositories at `Initializing` state. +pub fn init( + conf: &'static PageServerConf, + background_purges: BackgroundPurges, + resources: TenantSharedResources, + cancel: CancellationToken, +) -> TenantManager { + TenantManager { + conf, + tenants: std::sync::RwLock::new(TenantsMap::Initializing), + resources, + cancel, + background_purges, + } +} + +/// Transition repositories from `Initializing` state to `Open` state with locally available timelines. 
/// Timelines that are only partially available locally (remote storage has more data than this pageserver) /// are scheduled for download and added to the tenant once download is completed. #[instrument(skip_all)] pub async fn init_tenant_mgr( - conf: &'static PageServerConf, - background_purges: BackgroundPurges, - resources: TenantSharedResources, + tenant_manager: Arc, init_order: InitializationOrder, - cancel: CancellationToken, -) -> anyhow::Result { +) -> anyhow::Result<()> { + debug_assert!(matches!( + *tenant_manager.tenants.read().unwrap(), + TenantsMap::Initializing + )); let mut tenants = BTreeMap::new(); let ctx = RequestContext::todo_child(TaskKind::Startup, DownloadBehavior::Warn); + let conf = tenant_manager.conf; + let resources = &tenant_manager.resources; + let cancel = &tenant_manager.cancel; + let background_purges = &tenant_manager.background_purges; + // Initialize dynamic limits that depend on system resources let system_memory = sysinfo::System::new_with_specifics(sysinfo::RefreshKind::new().with_memory()) @@ -512,7 +527,7 @@ pub async fn init_tenant_mgr( let tenant_configs = init_load_tenant_configs(conf).await; // Determine which tenants are to be secondary or attached, and in which generation - let tenant_modes = init_load_generations(conf, &tenant_configs, &resources, &cancel).await?; + let tenant_modes = init_load_generations(conf, &tenant_configs, resources, cancel).await?; tracing::info!( "Attaching {} tenants at startup, warming up {} at a time", @@ -669,18 +684,10 @@ pub async fn init_tenant_mgr( info!("Processed {} local tenants at startup", tenants.len()); - let mut tenants_map = TENANTS.write().unwrap(); - assert!(matches!(&*tenants_map, &TenantsMap::Initializing)); + let mut tenant_map = tenant_manager.tenants.write().unwrap(); + *tenant_map = TenantsMap::Open(tenants); - *tenants_map = TenantsMap::Open(tenants); - - Ok(TenantManager { - conf, - tenants: &TENANTS, - resources, - cancel: CancellationToken::new(), - background_purges, - }) + Ok(()) } /// Wrapper for Tenant::spawn that checks invariants before running @@ -719,142 +726,6 @@ fn tenant_spawn( ) } -async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { - let mut join_set = JoinSet::new(); - - #[cfg(all(debug_assertions, not(test)))] - { - // Check that our metrics properly tracked the size of the tenants map. This is a convenient location to check, - // as it happens implicitly at the end of tests etc. - let m = tenants.read().unwrap(); - debug_assert_eq!(METRICS.slots_total(), m.len() as u64); - } - - // Atomically, 1. create the shutdown tasks and 2. prevent creation of new tenants. 
- let (total_in_progress, total_attached) = { - let mut m = tenants.write().unwrap(); - match &mut *m { - TenantsMap::Initializing => { - *m = TenantsMap::ShuttingDown(BTreeMap::default()); - info!("tenants map is empty"); - return; - } - TenantsMap::Open(tenants) => { - let mut shutdown_state = BTreeMap::new(); - let mut total_in_progress = 0; - let mut total_attached = 0; - - for (tenant_shard_id, v) in std::mem::take(tenants).into_iter() { - match v { - TenantSlot::Attached(t) => { - shutdown_state.insert(tenant_shard_id, TenantSlot::Attached(t.clone())); - join_set.spawn( - async move { - let res = { - let (_guard, shutdown_progress) = completion::channel(); - t.shutdown(shutdown_progress, ShutdownMode::FreezeAndFlush).await - }; - - if let Err(other_progress) = res { - // join the another shutdown in progress - other_progress.wait().await; - } - - // we cannot afford per tenant logging here, because if s3 is degraded, we are - // going to log too many lines - debug!("tenant successfully stopped"); - } - .instrument(info_span!("shutdown", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())), - ); - - total_attached += 1; - } - TenantSlot::Secondary(state) => { - // We don't need to wait for this individually per-tenant: the - // downloader task will be waited on eventually, this cancel - // is just to encourage it to drop out if it is doing work - // for this tenant right now. - state.cancel.cancel(); - - shutdown_state.insert(tenant_shard_id, TenantSlot::Secondary(state)); - } - TenantSlot::InProgress(notify) => { - // InProgress tenants are not visible in TenantsMap::ShuttingDown: we will - // wait for their notifications to fire in this function. - join_set.spawn(async move { - notify.wait().await; - }); - - total_in_progress += 1; - } - } - } - *m = TenantsMap::ShuttingDown(shutdown_state); - (total_in_progress, total_attached) - } - TenantsMap::ShuttingDown(_) => { - error!( - "already shutting down, this function isn't supposed to be called more than once" - ); - return; - } - } - }; - - let started_at = std::time::Instant::now(); - - info!( - "Waiting for {} InProgress tenants and {} Attached tenants to shut down", - total_in_progress, total_attached - ); - - let total = join_set.len(); - let mut panicked = 0; - let mut buffering = true; - const BUFFER_FOR: std::time::Duration = std::time::Duration::from_millis(500); - let mut buffered = std::pin::pin!(tokio::time::sleep(BUFFER_FOR)); - - while !join_set.is_empty() { - tokio::select! { - Some(joined) = join_set.join_next() => { - match joined { - Ok(()) => {}, - Err(join_error) if join_error.is_cancelled() => { - unreachable!("we are not cancelling any of the tasks"); - } - Err(join_error) if join_error.is_panic() => { - // cannot really do anything, as this panic is likely a bug - panicked += 1; - } - Err(join_error) => { - warn!("unknown kind of JoinError: {join_error}"); - } - } - if !buffering { - // buffer so that every 500ms since the first update (or starting) we'll log - // how far away we are; this is because we will get SIGKILL'd at 10s, and we - // are not able to log *then*. 
- buffering = true; - buffered.as_mut().reset(tokio::time::Instant::now() + BUFFER_FOR); - } - }, - _ = &mut buffered, if buffering => { - buffering = false; - info!(remaining = join_set.len(), total, elapsed_ms = started_at.elapsed().as_millis(), "waiting for tenants to shutdown"); - } - } - } - - if panicked > 0 { - warn!( - panicked, - total, "observed panicks while shutting down tenants" - ); - } - - // caller will log how long we took -} - #[derive(thiserror::Error, Debug)] pub(crate) enum UpsertLocationError { #[error("Bad config request: {0}")] @@ -1056,7 +927,8 @@ impl TenantManager { // the tenant is inaccessible to the outside world while we are doing this, but that is sensible: // the state is ill-defined while we're in transition. Transitions are async, but fast: we do // not do significant I/O, and shutdowns should be prompt via cancellation tokens. - let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any) + let mut slot_guard = self + .tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any) .map_err(|e| match e { TenantSlotError::NotFound(_) => { unreachable!("Called with mode Any") @@ -1223,6 +1095,75 @@ impl TenantManager { } } + fn tenant_map_acquire_slot( + &self, + tenant_shard_id: &TenantShardId, + mode: TenantSlotAcquireMode, + ) -> Result { + use TenantSlotAcquireMode::*; + METRICS.tenant_slot_writes.inc(); + + let mut locked = self.tenants.write().unwrap(); + let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()); + let _guard = span.enter(); + + let m = match &mut *locked { + TenantsMap::Initializing => return Err(TenantMapError::StillInitializing.into()), + TenantsMap::ShuttingDown(_) => return Err(TenantMapError::ShuttingDown.into()), + TenantsMap::Open(m) => m, + }; + + use std::collections::btree_map::Entry; + + let entry = m.entry(*tenant_shard_id); + + match entry { + Entry::Vacant(v) => match mode { + MustExist => { + tracing::debug!("Vacant && MustExist: return NotFound"); + Err(TenantSlotError::NotFound(*tenant_shard_id)) + } + _ => { + let (completion, barrier) = utils::completion::channel(); + let inserting = TenantSlot::InProgress(barrier); + METRICS.slot_inserted(&inserting); + v.insert(inserting); + tracing::debug!("Vacant, inserted InProgress"); + Ok(SlotGuard::new( + *tenant_shard_id, + None, + completion, + &self.tenants, + )) + } + }, + Entry::Occupied(mut o) => { + // Apply mode-driven checks + match (o.get(), mode) { + (TenantSlot::InProgress(_), _) => { + tracing::debug!("Occupied, failing for InProgress"); + Err(TenantSlotError::InProgress) + } + _ => { + // Happy case: the slot was not in any state that violated our mode + let (completion, barrier) = utils::completion::channel(); + let in_progress = TenantSlot::InProgress(barrier); + METRICS.slot_inserted(&in_progress); + let old_value = o.insert(in_progress); + METRICS.slot_removed(&old_value); + tracing::debug!("Occupied, replaced with InProgress"); + Ok(SlotGuard::new( + *tenant_shard_id, + Some(old_value), + completion, + &self.tenants, + )) + } + } + } + } + } + /// Resetting a tenant is equivalent to detaching it, then attaching it again with the same /// LocationConf that was last used to attach it. Optionally, the local file cache may be /// dropped before re-attaching. 
@@ -1239,7 +1180,8 @@ impl TenantManager { drop_cache: bool, ctx: &RequestContext, ) -> anyhow::Result<()> { - let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?; + let mut slot_guard = + self.tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?; let Some(old_slot) = slot_guard.get_old_value() else { anyhow::bail!("Tenant not found when trying to reset"); }; @@ -1388,7 +1330,8 @@ impl TenantManager { Ok(()) } - let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?; + let slot_guard = + self.tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?; match &slot_guard.old_value { Some(TenantSlot::Attached(tenant)) => { // Legacy deletion flow: the tenant remains attached, goes to Stopping state, and @@ -1539,7 +1482,7 @@ impl TenantManager { // Phase 2: Put the parent shard to InProgress and grab a reference to the parent Tenant drop(tenant); let mut parent_slot_guard = - tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?; + self.tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?; let parent = match parent_slot_guard.get_old_value() { Some(TenantSlot::Attached(t)) => t, Some(TenantSlot::Secondary(_)) => anyhow::bail!("Tenant location in secondary mode"), @@ -1843,7 +1786,145 @@ impl TenantManager { pub(crate) async fn shutdown(&self) { self.cancel.cancel(); - shutdown_all_tenants0(self.tenants).await + self.shutdown_all_tenants0().await + } + + async fn shutdown_all_tenants0(&self) { + let mut join_set = JoinSet::new(); + + #[cfg(all(debug_assertions, not(test)))] + { + // Check that our metrics properly tracked the size of the tenants map. This is a convenient location to check, + // as it happens implicitly at the end of tests etc. + let m = self.tenants.read().unwrap(); + debug_assert_eq!(METRICS.slots_total(), m.len() as u64); + } + + // Atomically, 1. create the shutdown tasks and 2. prevent creation of new tenants. + let (total_in_progress, total_attached) = { + let mut m = self.tenants.write().unwrap(); + match &mut *m { + TenantsMap::Initializing => { + *m = TenantsMap::ShuttingDown(BTreeMap::default()); + info!("tenants map is empty"); + return; + } + TenantsMap::Open(tenants) => { + let mut shutdown_state = BTreeMap::new(); + let mut total_in_progress = 0; + let mut total_attached = 0; + + for (tenant_shard_id, v) in std::mem::take(tenants).into_iter() { + match v { + TenantSlot::Attached(t) => { + shutdown_state + .insert(tenant_shard_id, TenantSlot::Attached(t.clone())); + join_set.spawn( + async move { + let res = { + let (_guard, shutdown_progress) = completion::channel(); + t.shutdown(shutdown_progress, ShutdownMode::FreezeAndFlush).await + }; + + if let Err(other_progress) = res { + // join the another shutdown in progress + other_progress.wait().await; + } + + // we cannot afford per tenant logging here, because if s3 is degraded, we are + // going to log too many lines + debug!("tenant successfully stopped"); + } + .instrument(info_span!("shutdown", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())), + ); + + total_attached += 1; + } + TenantSlot::Secondary(state) => { + // We don't need to wait for this individually per-tenant: the + // downloader task will be waited on eventually, this cancel + // is just to encourage it to drop out if it is doing work + // for this tenant right now. 
+ state.cancel.cancel(); + + shutdown_state + .insert(tenant_shard_id, TenantSlot::Secondary(state)); + } + TenantSlot::InProgress(notify) => { + // InProgress tenants are not visible in TenantsMap::ShuttingDown: we will + // wait for their notifications to fire in this function. + join_set.spawn(async move { + notify.wait().await; + }); + + total_in_progress += 1; + } + } + } + *m = TenantsMap::ShuttingDown(shutdown_state); + (total_in_progress, total_attached) + } + TenantsMap::ShuttingDown(_) => { + error!( + "already shutting down, this function isn't supposed to be called more than once" + ); + return; + } + } + }; + + let started_at = std::time::Instant::now(); + + info!( + "Waiting for {} InProgress tenants and {} Attached tenants to shut down", + total_in_progress, total_attached + ); + + let total = join_set.len(); + let mut panicked = 0; + let mut buffering = true; + const BUFFER_FOR: std::time::Duration = std::time::Duration::from_millis(500); + let mut buffered = std::pin::pin!(tokio::time::sleep(BUFFER_FOR)); + + while !join_set.is_empty() { + tokio::select! { + Some(joined) = join_set.join_next() => { + match joined { + Ok(()) => {}, + Err(join_error) if join_error.is_cancelled() => { + unreachable!("we are not cancelling any of the tasks"); + } + Err(join_error) if join_error.is_panic() => { + // cannot really do anything, as this panic is likely a bug + panicked += 1; + } + Err(join_error) => { + warn!("unknown kind of JoinError: {join_error}"); + } + } + if !buffering { + // buffer so that every 500ms since the first update (or starting) we'll log + // how far away we are; this is because we will get SIGKILL'd at 10s, and we + // are not able to log *then*. + buffering = true; + buffered.as_mut().reset(tokio::time::Instant::now() + BUFFER_FOR); + } + }, + _ = &mut buffered, if buffering => { + buffering = false; + info!(remaining = join_set.len(), total, elapsed_ms = started_at.elapsed().as_millis(), "waiting for tenants to shutdown"); + } + } + } + + if panicked > 0 { + warn!( + panicked, + total, "observed panicks while shutting down tenants" + ); + } + + // caller will log how long we took } /// Detaches a tenant, and removes its local files asynchronously. @@ -1889,12 +1970,12 @@ impl TenantManager { .map(Some) }; - let mut removal_result = remove_tenant_from_memory( - self.tenants, - tenant_shard_id, - tenant_dir_rename_operation(tenant_shard_id), - ) - .await; + let mut removal_result = self + .remove_tenant_from_memory( + tenant_shard_id, + tenant_dir_rename_operation(tenant_shard_id), + ) + .await; // If the tenant was not found, it was likely already removed. Attempt to remove the tenant // directory on disk anyway. 
For example, during shard splits, we shut down and remove the @@ -1948,17 +2029,16 @@ impl TenantManager { ) -> Result, detach_ancestor::Error> { use detach_ancestor::Error; - let slot_guard = - tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist).map_err( - |e| { - use TenantSlotError::*; + let slot_guard = self + .tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist) + .map_err(|e| { + use TenantSlotError::*; - match e { - MapState(TenantMapError::ShuttingDown) => Error::ShuttingDown, - NotFound(_) | InProgress | MapState(_) => Error::DetachReparent(e.into()), - } - }, - )?; + match e { + MapState(TenantMapError::ShuttingDown) => Error::ShuttingDown, + NotFound(_) | InProgress | MapState(_) => Error::DetachReparent(e.into()), + } + })?; let tenant = { let old_slot = slot_guard @@ -2291,6 +2371,80 @@ impl TenantManager { other => ApiError::InternalServerError(anyhow::anyhow!(other)), }) } + + /// Stops and removes the tenant from memory, if it's not [`TenantState::Stopping`] already, bails otherwise. + /// Allows to remove other tenant resources manually, via `tenant_cleanup`. + /// If the cleanup fails, tenant will stay in memory in [`TenantState::Broken`] state, and another removal + async fn remove_tenant_from_memory( + &self, + tenant_shard_id: TenantShardId, + tenant_cleanup: F, + ) -> Result + where + F: std::future::Future>, + { + let mut slot_guard = + self.tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?; + + // allow pageserver shutdown to await for our completion + let (_guard, progress) = completion::channel(); + + // The SlotGuard allows us to manipulate the Tenant object without fear of some + // concurrent API request doing something else for the same tenant ID. + let attached_tenant = match slot_guard.get_old_value() { + Some(TenantSlot::Attached(tenant)) => { + // whenever we remove a tenant from memory, we don't want to flush and wait for upload + let shutdown_mode = ShutdownMode::Hard; + + // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so + // that we can continue safely to cleanup. + match tenant.shutdown(progress, shutdown_mode).await { + Ok(()) => {} + Err(_other) => { + // if pageserver shutdown or other detach/ignore is already ongoing, we don't want to + // wait for it but return an error right away because these are distinct requests. + slot_guard.revert(); + return Err(TenantStateError::IsStopping(tenant_shard_id)); + } + } + Some(tenant) + } + Some(TenantSlot::Secondary(secondary_state)) => { + tracing::info!("Shutting down in secondary mode"); + secondary_state.shutdown().await; + None + } + Some(TenantSlot::InProgress(_)) => { + // Acquiring a slot guarantees its old value was not InProgress + unreachable!(); + } + None => None, + }; + + match tenant_cleanup + .await + .with_context(|| format!("Failed to run cleanup for tenant {tenant_shard_id}")) + { + Ok(hook_value) => { + // Success: drop the old TenantSlot::Attached. 
+ slot_guard + .drop_old_value() + .expect("We just called shutdown"); + + Ok(hook_value) + } + Err(e) => { + // If we had a Tenant, set it to Broken and put it back in the TenantsMap + if let Some(attached_tenant) = attached_tenant { + attached_tenant.set_broken(e.to_string()).await; + } + // Leave the broken tenant in the map + slot_guard.revert(); + + Err(TenantStateError::Other(e)) + } + } + } } #[derive(Debug, thiserror::Error)] @@ -2455,7 +2609,7 @@ pub(crate) enum TenantMapError { /// this tenant to retry later, or wait for the InProgress state to end. /// /// This structure enforces the important invariant that we do not have overlapping -/// tasks that will try use local storage for a the same tenant ID: we enforce that +/// tasks that will try to use local storage for a the same tenant ID: we enforce that /// the previous contents of a slot have been shut down before the slot can be /// left empty or used for something else /// @@ -2468,7 +2622,7 @@ pub(crate) enum TenantMapError { /// The `old_value` may be dropped before the SlotGuard is dropped, by calling /// `drop_old_value`. It is an error to call this without shutting down /// the conents of `old_value`. -pub(crate) struct SlotGuard { +pub(crate) struct SlotGuard<'a> { tenant_shard_id: TenantShardId, old_value: Option, upserted: bool, @@ -2476,19 +2630,23 @@ pub(crate) struct SlotGuard { /// [`TenantSlot::InProgress`] carries the corresponding Barrier: it will /// release any waiters as soon as this SlotGuard is dropped. completion: utils::completion::Completion, + + tenants: &'a std::sync::RwLock, } -impl SlotGuard { +impl<'a> SlotGuard<'a> { fn new( tenant_shard_id: TenantShardId, old_value: Option, completion: utils::completion::Completion, + tenants: &'a std::sync::RwLock, ) -> Self { Self { tenant_shard_id, old_value, upserted: false, completion, + tenants, } } @@ -2512,8 +2670,8 @@ impl SlotGuard { )); } - let replaced = { - let mut locked = TENANTS.write().unwrap(); + let replaced: Option = { + let mut locked = self.tenants.write().unwrap(); if let TenantSlot::InProgress(_) = new_value { // It is never expected to try and upsert InProgress via this path: it should @@ -2621,7 +2779,7 @@ impl SlotGuard { } } -impl Drop for SlotGuard { +impl<'a> Drop for SlotGuard<'a> { fn drop(&mut self) { if self.upserted { return; @@ -2629,7 +2787,7 @@ impl Drop for SlotGuard { // Our old value is already shutdown, or it never existed: it is safe // for us to fully release the TenantSlot back into an empty state - let mut locked = TENANTS.write().unwrap(); + let mut locked = self.tenants.write().unwrap(); let m = match &mut *locked { TenantsMap::Initializing => { @@ -2711,151 +2869,6 @@ enum TenantSlotAcquireMode { MustExist, } -fn tenant_map_acquire_slot( - tenant_shard_id: &TenantShardId, - mode: TenantSlotAcquireMode, -) -> Result { - tenant_map_acquire_slot_impl(tenant_shard_id, &TENANTS, mode) -} - -fn tenant_map_acquire_slot_impl( - tenant_shard_id: &TenantShardId, - tenants: &std::sync::RwLock, - mode: TenantSlotAcquireMode, -) -> Result { - use TenantSlotAcquireMode::*; - METRICS.tenant_slot_writes.inc(); - - let mut locked = tenants.write().unwrap(); - let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()); - let _guard = span.enter(); - - let m = match &mut *locked { - TenantsMap::Initializing => return Err(TenantMapError::StillInitializing.into()), - TenantsMap::ShuttingDown(_) => return Err(TenantMapError::ShuttingDown.into()), - TenantsMap::Open(m) 
=> m, - }; - - use std::collections::btree_map::Entry; - - let entry = m.entry(*tenant_shard_id); - - match entry { - Entry::Vacant(v) => match mode { - MustExist => { - tracing::debug!("Vacant && MustExist: return NotFound"); - Err(TenantSlotError::NotFound(*tenant_shard_id)) - } - _ => { - let (completion, barrier) = utils::completion::channel(); - let inserting = TenantSlot::InProgress(barrier); - METRICS.slot_inserted(&inserting); - v.insert(inserting); - tracing::debug!("Vacant, inserted InProgress"); - Ok(SlotGuard::new(*tenant_shard_id, None, completion)) - } - }, - Entry::Occupied(mut o) => { - // Apply mode-driven checks - match (o.get(), mode) { - (TenantSlot::InProgress(_), _) => { - tracing::debug!("Occupied, failing for InProgress"); - Err(TenantSlotError::InProgress) - } - _ => { - // Happy case: the slot was not in any state that violated our mode - let (completion, barrier) = utils::completion::channel(); - let in_progress = TenantSlot::InProgress(barrier); - METRICS.slot_inserted(&in_progress); - let old_value = o.insert(in_progress); - METRICS.slot_removed(&old_value); - tracing::debug!("Occupied, replaced with InProgress"); - Ok(SlotGuard::new( - *tenant_shard_id, - Some(old_value), - completion, - )) - } - } - } - } -} - -/// Stops and removes the tenant from memory, if it's not [`TenantState::Stopping`] already, bails otherwise. -/// Allows to remove other tenant resources manually, via `tenant_cleanup`. -/// If the cleanup fails, tenant will stay in memory in [`TenantState::Broken`] state, and another removal -/// operation would be needed to remove it. -async fn remove_tenant_from_memory( - tenants: &std::sync::RwLock, - tenant_shard_id: TenantShardId, - tenant_cleanup: F, -) -> Result -where - F: std::future::Future>, -{ - let mut slot_guard = - tenant_map_acquire_slot_impl(&tenant_shard_id, tenants, TenantSlotAcquireMode::MustExist)?; - - // allow pageserver shutdown to await for our completion - let (_guard, progress) = completion::channel(); - - // The SlotGuard allows us to manipulate the Tenant object without fear of some - // concurrent API request doing something else for the same tenant ID. - let attached_tenant = match slot_guard.get_old_value() { - Some(TenantSlot::Attached(tenant)) => { - // whenever we remove a tenant from memory, we don't want to flush and wait for upload - let shutdown_mode = ShutdownMode::Hard; - - // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so - // that we can continue safely to cleanup. - match tenant.shutdown(progress, shutdown_mode).await { - Ok(()) => {} - Err(_other) => { - // if pageserver shutdown or other detach/ignore is already ongoing, we don't want to - // wait for it but return an error right away because these are distinct requests. - slot_guard.revert(); - return Err(TenantStateError::IsStopping(tenant_shard_id)); - } - } - Some(tenant) - } - Some(TenantSlot::Secondary(secondary_state)) => { - tracing::info!("Shutting down in secondary mode"); - secondary_state.shutdown().await; - None - } - Some(TenantSlot::InProgress(_)) => { - // Acquiring a slot guarantees its old value was not InProgress - unreachable!(); - } - None => None, - }; - - match tenant_cleanup - .await - .with_context(|| format!("Failed to run cleanup for tenant {tenant_shard_id}")) - { - Ok(hook_value) => { - // Success: drop the old TenantSlot::Attached. 
- slot_guard - .drop_old_value() - .expect("We just called shutdown"); - - Ok(hook_value) - } - Err(e) => { - // If we had a Tenant, set it to Broken and put it back in the TenantsMap - if let Some(attached_tenant) = attached_tenant { - attached_tenant.set_broken(e.to_string()).await; - } - // Leave the broken tenant in the map - slot_guard.revert(); - - Err(TenantStateError::Other(e)) - } - } -} - use http_utils::error::ApiError; use pageserver_api::models::TimelineGcRequest; @@ -2866,11 +2879,15 @@ mod tests { use std::collections::BTreeMap; use std::sync::Arc; + use storage_broker::BrokerClientChannel; use tracing::Instrument; use super::super::harness::TenantHarness; use super::TenantsMap; - use crate::tenant::mgr::TenantSlot; + use crate::tenant::{ + TenantSharedResources, + mgr::{BackgroundPurges, TenantManager, TenantSlot}, + }; #[tokio::test(start_paused = true)] async fn shutdown_awaits_in_progress_tenant() { @@ -2891,23 +2908,47 @@ mod tests { let _e = span.enter(); let tenants = BTreeMap::from([(id, TenantSlot::Attached(t.clone()))]); - let tenants = Arc::new(std::sync::RwLock::new(TenantsMap::Open(tenants))); // Invoke remove_tenant_from_memory with a cleanup hook that blocks until we manually // permit it to proceed: that will stick the tenant in InProgress + let (basebackup_prepare_sender, _) = tokio::sync::mpsc::unbounded_channel::< + crate::basebackup_cache::BasebackupPrepareRequest, + >(); + + let tenant_manager = TenantManager { + tenants: std::sync::RwLock::new(TenantsMap::Open(tenants)), + conf: h.conf, + resources: TenantSharedResources { + broker_client: BrokerClientChannel::connect_lazy("foobar.com") + .await + .unwrap(), + remote_storage: h.remote_storage.clone(), + deletion_queue_client: h.deletion_queue.new_client(), + l0_flush_global_state: crate::l0_flush::L0FlushGlobalState::new( + h.conf.l0_flush.clone(), + ), + basebackup_prepare_sender, + feature_resolver: crate::feature_resolver::FeatureResolver::new_disabled(), + }, + cancel: tokio_util::sync::CancellationToken::new(), + background_purges: BackgroundPurges::default(), + }; + + let tenant_manager = Arc::new(tenant_manager); + let (until_cleanup_completed, can_complete_cleanup) = utils::completion::channel(); let (until_cleanup_started, cleanup_started) = utils::completion::channel(); let mut remove_tenant_from_memory_task = { + let tenant_manager = tenant_manager.clone(); let jh = tokio::spawn({ - let tenants = tenants.clone(); async move { let cleanup = async move { drop(until_cleanup_started); can_complete_cleanup.wait().await; anyhow::Ok(()) }; - super::remove_tenant_from_memory(&tenants, id, cleanup).await + tenant_manager.remove_tenant_from_memory(id, cleanup).await } .instrument(h.span()) }); @@ -2920,9 +2961,11 @@ mod tests { let mut shutdown_task = { let (until_shutdown_started, shutdown_started) = utils::completion::channel(); + let tenant_manager = tenant_manager.clone(); + let shutdown_task = tokio::spawn(async move { drop(until_shutdown_started); - super::shutdown_all_tenants0(&tenants).await; + tenant_manager.shutdown_all_tenants0().await; }); shutdown_started.wait().await; From 21949137ed0db4429f79d8532103ded958f87634 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 10 Jun 2025 13:09:46 +0300 Subject: [PATCH 023/124] Return last ring index instead of min_ring_index in prefetch_register_bufferv (#12039) ## Problem See https://github.com/neondatabase/neon/issues/12018 Now `prefetch_register_bufferv` calculates min_ring_index of all vector requests. 
But because of pump prefetch state or connection failure, previous slots can be already proceeded and reused. ## Summary of changes Instead of returning minimal index, this function should return last slot index. Actually result of this function is used only in two places. A first place just fort checking (and this check is redundant because the same check is done in `prefetch_register_bufferv` itself. And in the second place where index of filled slot is actually used, there is just one request. Sp fortunately this bug can cause only assert failure in debug build. --------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/communicator.c | 47 ++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/pgxn/neon/communicator.c b/pgxn/neon/communicator.c index 2655a45bcc..7c84be7d15 100644 --- a/pgxn/neon/communicator.c +++ b/pgxn/neon/communicator.c @@ -1092,13 +1092,15 @@ communicator_prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, MyPState->ring_last <= ring_index); } -/* internal version. Returns the ring index */ +/* Internal version. Returns the ring index of the last block (result of this function is used only +* when nblocks==1) +*/ static uint64 prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, BlockNumber nblocks, const bits8 *mask, bool is_prefetch) { - uint64 min_ring_index; + uint64 last_ring_index; PrefetchRequest hashkey; #ifdef USE_ASSERT_CHECKING bool any_hits = false; @@ -1122,13 +1124,12 @@ Retry: MyPState->ring_unused - MyPState->ring_receive; MyNeonCounters->getpage_prefetches_buffered = MyPState->n_responses_buffered; + last_ring_index = UINT64_MAX; - min_ring_index = UINT64_MAX; for (int i = 0; i < nblocks; i++) { PrefetchRequest *slot = NULL; PrfHashEntry *entry = NULL; - uint64 ring_index; neon_request_lsns *lsns; if (PointerIsValid(mask) && BITMAP_ISSET(mask, i)) @@ -1152,12 +1153,12 @@ Retry: if (entry != NULL) { slot = entry->slot; - ring_index = slot->my_ring_index; - Assert(slot == GetPrfSlot(ring_index)); + last_ring_index = slot->my_ring_index; + Assert(slot == GetPrfSlot(last_ring_index)); Assert(slot->status != PRFS_UNUSED); - Assert(MyPState->ring_last <= ring_index && - ring_index < MyPState->ring_unused); + Assert(MyPState->ring_last <= last_ring_index && + last_ring_index < MyPState->ring_unused); Assert(BufferTagsEqual(&slot->buftag, &hashkey.buftag)); /* @@ -1169,9 +1170,9 @@ Retry: if (!neon_prefetch_response_usable(lsns, slot)) { /* Wait for the old request to finish and discard it */ - if (!prefetch_wait_for(ring_index)) + if (!prefetch_wait_for(last_ring_index)) goto Retry; - prefetch_set_unused(ring_index); + prefetch_set_unused(last_ring_index); entry = NULL; slot = NULL; pgBufferUsage.prefetch.expired += 1; @@ -1188,13 +1189,12 @@ Retry: */ if (slot->status == PRFS_TAG_REMAINS) { - prefetch_set_unused(ring_index); + prefetch_set_unused(last_ring_index); entry = NULL; slot = NULL; } else { - min_ring_index = Min(min_ring_index, ring_index); /* The buffered request is good enough, return that index */ if (is_prefetch) pgBufferUsage.prefetch.duplicates++; @@ -1283,12 +1283,12 @@ Retry: * The next buffer pointed to by `ring_unused` is now definitely empty, so * we can insert the new request to it. 
*/ - ring_index = MyPState->ring_unused; + last_ring_index = MyPState->ring_unused; - Assert(MyPState->ring_last <= ring_index && - ring_index <= MyPState->ring_unused); + Assert(MyPState->ring_last <= last_ring_index && + last_ring_index <= MyPState->ring_unused); - slot = GetPrfSlotNoCheck(ring_index); + slot = GetPrfSlotNoCheck(last_ring_index); Assert(slot->status == PRFS_UNUSED); @@ -1298,11 +1298,9 @@ Retry: */ slot->buftag = hashkey.buftag; slot->shard_no = get_shard_number(&tag); - slot->my_ring_index = ring_index; + slot->my_ring_index = last_ring_index; slot->flags = 0; - min_ring_index = Min(min_ring_index, ring_index); - if (is_prefetch) MyNeonCounters->getpage_prefetch_requests_total++; else @@ -1315,11 +1313,12 @@ Retry: MyPState->ring_unused - MyPState->ring_receive; Assert(any_hits); + Assert(last_ring_index != UINT64_MAX); - Assert(GetPrfSlot(min_ring_index)->status == PRFS_REQUESTED || - GetPrfSlot(min_ring_index)->status == PRFS_RECEIVED); - Assert(MyPState->ring_last <= min_ring_index && - min_ring_index < MyPState->ring_unused); + Assert(GetPrfSlot(last_ring_index)->status == PRFS_REQUESTED || + GetPrfSlot(last_ring_index)->status == PRFS_RECEIVED); + Assert(MyPState->ring_last <= last_ring_index && + last_ring_index < MyPState->ring_unused); if (flush_every_n_requests > 0 && MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests) @@ -1335,7 +1334,7 @@ Retry: MyPState->ring_flush = MyPState->ring_unused; } - return min_ring_index; + return last_ring_index; } static bool From e38193c5307eaf3aa6ecd1bff5cb7979282890f6 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Tue, 10 Jun 2025 11:23:03 +0000 Subject: [PATCH 024/124] proxy: Move connect_to_compute back to proxy (#12181) It's mostly responsible for waking, retrying, and caching. 
A new, thin wrapper around compute_once will be PGLB's entry point --- proxy/src/auth/backend/console_redirect.rs | 2 +- proxy/src/auth/backend/mod.rs | 2 +- proxy/src/binary/pg_sni_router.rs | 5 ++--- proxy/src/console_redirect_proxy.rs | 7 +++---- proxy/src/lib.rs | 1 + proxy/src/pglb/mod.rs | 1 - proxy/src/{pglb => proxy}/connect_compute.rs | 14 +++----------- proxy/src/proxy/mod.rs | 20 ++++---------------- proxy/src/proxy/tests/mod.rs | 2 +- proxy/src/proxy/wake_compute.rs | 7 ++++++- proxy/src/serverless/backend.rs | 6 +++--- proxy/src/serverless/mod.rs | 2 +- proxy/src/serverless/sql_over_http.rs | 3 ++- proxy/src/util.rs | 14 ++++++++++++++ 14 files changed, 42 insertions(+), 44 deletions(-) rename proxy/src/{pglb => proxy}/connect_compute.rs (94%) create mode 100644 proxy/src/util.rs diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index a7133b22e5..8440d198df 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -14,9 +14,9 @@ use crate::context::RequestContext; use crate::control_plane::client::cplane_proxy_v1; use crate::control_plane::{self, CachedNodeInfo, NodeInfo}; use crate::error::{ReportableError, UserFacingError}; -use crate::pglb::connect_compute::WakeComputeBackend; use crate::pqproto::BeMessage; use crate::proxy::NeonOptions; +use crate::proxy::wake_compute::WakeComputeBackend; use crate::stream::PqStream; use crate::types::RoleName; use crate::{auth, compute, waiters}; diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index a153ea4d42..7ceb1e6814 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -25,9 +25,9 @@ use crate::control_plane::{ RoleAccessControl, }; use crate::intern::EndpointIdInt; -use crate::pglb::connect_compute::WakeComputeBackend; use crate::pqproto::BeMessage; use crate::proxy::NeonOptions; +use crate::proxy::wake_compute::WakeComputeBackend; use crate::rate_limiter::EndpointRateLimiter; use crate::stream::Stream; use crate::types::{EndpointCacheKey, EndpointId, RoleName}; diff --git a/proxy/src/binary/pg_sni_router.rs b/proxy/src/binary/pg_sni_router.rs index a4f517fead..481bd8501c 100644 --- a/proxy/src/binary/pg_sni_router.rs +++ b/proxy/src/binary/pg_sni_router.rs @@ -28,10 +28,9 @@ use crate::context::RequestContext; use crate::metrics::{Metrics, ThreadPoolMetrics}; use crate::pqproto::FeStartupPacket; use crate::protocol2::ConnectionInfo; -use crate::proxy::{ - ErrorSource, TlsRequired, copy_bidirectional_client_compute, run_until_cancelled, -}; +use crate::proxy::{ErrorSource, TlsRequired, copy_bidirectional_client_compute}; use crate::stream::{PqStream, Stream}; +use crate::util::run_until_cancelled; project_git_version!(GIT_VERSION); diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 324dcf5824..5331ea41fd 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -11,13 +11,12 @@ use crate::config::{ProxyConfig, ProxyProtocolV2}; use crate::context::RequestContext; use crate::error::ReportableError; use crate::metrics::{Metrics, NumClientConnectionsGuard}; -use crate::pglb::connect_compute::{TcpMechanism, connect_to_compute}; use crate::pglb::handshake::{HandshakeData, handshake}; use crate::pglb::passthrough::ProxyPassthrough; use crate::protocol2::{ConnectHeader, ConnectionInfo, read_proxy_protocol}; -use crate::proxy::{ - ClientRequestError, ErrorSource, prepare_client_connection, run_until_cancelled, 
-}; +use crate::proxy::connect_compute::{TcpMechanism, connect_to_compute}; +use crate::proxy::{ClientRequestError, ErrorSource, prepare_client_connection}; +use crate::util::run_until_cancelled; pub async fn task_main( config: &'static ProxyConfig, diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index d65d056585..026c6aeba9 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -106,4 +106,5 @@ mod tls; mod types; mod url; mod usage_metrics; +mod util; mod waiters; diff --git a/proxy/src/pglb/mod.rs b/proxy/src/pglb/mod.rs index 4b107142a7..cb82524cf6 100644 --- a/proxy/src/pglb/mod.rs +++ b/proxy/src/pglb/mod.rs @@ -1,4 +1,3 @@ -pub mod connect_compute; pub mod copy_bidirectional; pub mod handshake; pub mod inprocess; diff --git a/proxy/src/pglb/connect_compute.rs b/proxy/src/proxy/connect_compute.rs similarity index 94% rename from proxy/src/pglb/connect_compute.rs rename to proxy/src/proxy/connect_compute.rs index 4984588f75..92ed84f50f 100644 --- a/proxy/src/pglb/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -8,19 +8,19 @@ use crate::config::{ComputeConfig, RetryConfig}; use crate::context::RequestContext; use crate::control_plane::errors::WakeComputeError; use crate::control_plane::locks::ApiLocks; -use crate::control_plane::{self, CachedNodeInfo, NodeInfo}; +use crate::control_plane::{self, NodeInfo}; use crate::error::ReportableError; use crate::metrics::{ ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType, }; use crate::proxy::retry::{CouldRetry, ShouldRetryWakeCompute, retry_after, should_retry}; -use crate::proxy::wake_compute::wake_compute; +use crate::proxy::wake_compute::{WakeComputeBackend, wake_compute}; use crate::types::Host; /// If we couldn't connect, a cached connection info might be to blame /// (e.g. the compute node's address might've changed at the wrong time). /// Invalidate the cache entry (if any) to prevent subsequent errors. 
-#[tracing::instrument(name = "invalidate_cache", skip_all)] +#[tracing::instrument(skip_all)] pub(crate) fn invalidate_cache(node_info: control_plane::CachedNodeInfo) -> NodeInfo { let is_cached = node_info.cached(); if is_cached { @@ -49,14 +49,6 @@ pub(crate) trait ConnectMechanism { ) -> Result; } -#[async_trait] -pub(crate) trait WakeComputeBackend { - async fn wake_compute( - &self, - ctx: &RequestContext, - ) -> Result; -} - pub(crate) struct TcpMechanism { pub(crate) auth: AuthInfo, /// connect_to_compute concurrency lock diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 4a294c1e82..4211406f6c 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -1,8 +1,10 @@ #[cfg(test)] mod tests; +pub(crate) mod connect_compute; pub(crate) mod retry; pub(crate) mod wake_compute; + use std::sync::Arc; use futures::FutureExt; @@ -21,15 +23,16 @@ use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig}; use crate::context::RequestContext; use crate::error::{ReportableError, UserFacingError}; use crate::metrics::{Metrics, NumClientConnectionsGuard}; -use crate::pglb::connect_compute::{TcpMechanism, connect_to_compute}; pub use crate::pglb::copy_bidirectional::{ErrorSource, copy_bidirectional_client_compute}; use crate::pglb::handshake::{HandshakeData, HandshakeError, handshake}; use crate::pglb::passthrough::ProxyPassthrough; use crate::pqproto::{BeMessage, CancelKeyData, StartupMessageParams}; use crate::protocol2::{ConnectHeader, ConnectionInfo, ConnectionInfoExtra, read_proxy_protocol}; +use crate::proxy::connect_compute::{TcpMechanism, connect_to_compute}; use crate::rate_limiter::EndpointRateLimiter; use crate::stream::{PqStream, Stream}; use crate::types::EndpointCacheKey; +use crate::util::run_until_cancelled; use crate::{auth, compute}; const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; @@ -46,21 +49,6 @@ impl ReportableError for TlsRequired { impl UserFacingError for TlsRequired {} -pub async fn run_until_cancelled( - f: F, - cancellation_token: &CancellationToken, -) -> Option { - match futures::future::select( - std::pin::pin!(f), - std::pin::pin!(cancellation_token.cancelled()), - ) - .await - { - futures::future::Either::Left((f, _)) => Some(f), - futures::future::Either::Right(((), _)) => None, - } -} - pub async fn task_main( config: &'static ProxyConfig, auth_backend: &'static auth::Backend<'static, ()>, diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index de85bf9df2..12de5cbc09 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -25,7 +25,7 @@ use crate::control_plane::client::{ControlPlaneClient, TestControlPlaneClient}; use crate::control_plane::messages::{ControlPlaneErrorMessage, Details, MetricsAuxInfo, Status}; use crate::control_plane::{self, CachedNodeInfo, NodeInfo, NodeInfoCache}; use crate::error::ErrorKind; -use crate::pglb::connect_compute::ConnectMechanism; +use crate::proxy::connect_compute::ConnectMechanism; use crate::tls::client_config::compute_client_config_with_certs; use crate::tls::server_config::CertResolver; use crate::types::{BranchId, EndpointId, ProjectId}; diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index fd59800745..b8edf9fd5c 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -1,3 +1,4 @@ +use async_trait::async_trait; use tracing::{error, info}; use crate::config::RetryConfig; @@ -8,7 +9,6 @@ use crate::error::ReportableError; use crate::metrics::{ 
ConnectOutcome, ConnectionFailuresBreakdownGroup, Metrics, RetriesMetricGroup, RetryType, }; -use crate::pglb::connect_compute::WakeComputeBackend; use crate::proxy::retry::{retry_after, should_retry}; // Use macro to retain original callsite. @@ -23,6 +23,11 @@ macro_rules! log_wake_compute_error { }; } +#[async_trait] +pub(crate) trait WakeComputeBackend { + async fn wake_compute(&self, ctx: &RequestContext) -> Result; +} + pub(crate) async fn wake_compute( num_retries: &mut u32, ctx: &RequestContext, diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index d74e3cad3d..316e038344 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -34,7 +34,7 @@ use crate::control_plane::errors::{GetAuthInfoError, WakeComputeError}; use crate::control_plane::locks::ApiLocks; use crate::error::{ErrorKind, ReportableError, UserFacingError}; use crate::intern::EndpointIdInt; -use crate::pglb::connect_compute::ConnectMechanism; +use crate::proxy::connect_compute::ConnectMechanism; use crate::proxy::retry::{CouldRetry, ShouldRetryWakeCompute}; use crate::rate_limiter::EndpointRateLimiter; use crate::types::{EndpointId, Host, LOCAL_PROXY_SUFFIX}; @@ -181,7 +181,7 @@ impl PoolingBackend { tracing::Span::current().record("conn_id", display(conn_id)); info!(%conn_id, "pool: opening a new connection '{conn_info}'"); let backend = self.auth_backend.as_ref().map(|()| keys.info); - crate::pglb::connect_compute::connect_to_compute( + crate::proxy::connect_compute::connect_to_compute( ctx, &TokioMechanism { conn_id, @@ -223,7 +223,7 @@ impl PoolingBackend { )), options: conn_info.user_info.options.clone(), }); - crate::pglb::connect_compute::connect_to_compute( + crate::proxy::connect_compute::connect_to_compute( ctx, &HyperMechanism { conn_id, diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index f6f681ac45..ed33bf1246 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -50,10 +50,10 @@ use crate::context::RequestContext; use crate::ext::TaskExt; use crate::metrics::Metrics; use crate::protocol2::{ConnectHeader, ConnectionInfo, read_proxy_protocol}; -use crate::proxy::run_until_cancelled; use crate::rate_limiter::EndpointRateLimiter; use crate::serverless::backend::PoolingBackend; use crate::serverless::http_util::{api_error_into_response, json_response}; +use crate::util::run_until_cancelled; pub(crate) const SERVERLESS_DRIVER_SNI: &str = "api"; pub(crate) const AUTH_BROKER_SNI: &str = "apiauth"; diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index eb80ac9ad0..b2eb801f5c 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -41,10 +41,11 @@ use crate::error::{ErrorKind, ReportableError, UserFacingError}; use crate::http::{ReadBodyError, read_body_with_limit}; use crate::metrics::{HttpDirection, Metrics, SniGroup, SniKind}; use crate::pqproto::StartupMessageParams; -use crate::proxy::{NeonOptions, run_until_cancelled}; +use crate::proxy::NeonOptions; use crate::serverless::backend::HttpConnError; use crate::types::{DbName, RoleName}; use crate::usage_metrics::{MetricCounter, MetricCounterRecorder}; +use crate::util::run_until_cancelled; #[derive(serde::Deserialize)] #[serde(rename_all = "camelCase")] diff --git a/proxy/src/util.rs b/proxy/src/util.rs new file mode 100644 index 0000000000..7fc2d9fbdb --- /dev/null +++ b/proxy/src/util.rs @@ -0,0 +1,14 @@ +use std::pin::pin; + +use futures::future::{Either, select}; +use 
tokio_util::sync::CancellationToken; + +pub async fn run_until_cancelled( + f: F, + cancellation_token: &CancellationToken, +) -> Option { + match select(pin!(f), pin!(cancellation_token.cancelled())).await { + Either::Left((f, _)) => Some(f), + Either::Right(((), _)) => None, + } +} From 67b94c59921a738396bd0a4a34b6ac9428237c0e Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 10 Jun 2025 15:26:08 +0100 Subject: [PATCH 025/124] [proxy] per endpoint configuration for rate limits (#12148) https://github.com/neondatabase/cloud/issues/28333 Adds a new `rate_limit` response type to EndpointAccessControl, uses it for rate limiting, and adds a generic invalidation for the cache. --- proxy/src/auth/backend/mod.rs | 13 ++++--- proxy/src/cache/project_info.rs | 4 ++ .../control_plane/client/cplane_proxy_v1.rs | 3 ++ proxy/src/control_plane/client/mock.rs | 4 +- proxy/src/control_plane/client/mod.rs | 3 +- proxy/src/control_plane/messages.rs | 27 ++++++++++++- proxy/src/control_plane/mod.rs | 39 ++++++++++++++++++- proxy/src/rate_limiter/leaky_bucket.rs | 3 +- proxy/src/rate_limiter/limiter.rs | 3 +- proxy/src/serverless/backend.rs | 13 ++++--- 10 files changed, 91 insertions(+), 21 deletions(-) diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 7ceb1e6814..2e3013ead0 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -14,12 +14,13 @@ use serde::{Deserialize, Serialize}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{debug, info}; -use crate::auth::{self, AuthError, ComputeUserInfoMaybeEndpoint, validate_password_and_exchange}; +use crate::auth::{self, ComputeUserInfoMaybeEndpoint, validate_password_and_exchange}; use crate::cache::Cached; use crate::config::AuthenticationConfig; use crate::context::RequestContext; use crate::control_plane::client::ControlPlaneClient; use crate::control_plane::errors::GetAuthInfoError; +use crate::control_plane::messages::EndpointRateLimitConfig; use crate::control_plane::{ self, AccessBlockerFlags, AuthSecret, CachedNodeInfo, ControlPlaneApi, EndpointAccessControl, RoleAccessControl, @@ -230,11 +231,8 @@ async fn auth_quirks( config.is_vpc_acccess_proxy, )?; - let endpoint = EndpointIdInt::from(&info.endpoint); - let rate_limit_config = None; - if !endpoint_rate_limiter.check(endpoint, rate_limit_config, 1) { - return Err(AuthError::too_many_connections()); - } + access_controls.connection_attempt_rate_limit(ctx, &info.endpoint, &endpoint_rate_limiter)?; + let role_access = api .get_role_access_control(ctx, &info.endpoint, &info.user) .await?; @@ -401,6 +399,7 @@ impl Backend<'_, ComputeUserInfo> { allowed_ips: Arc::new(vec![]), allowed_vpce: Arc::new(vec![]), flags: AccessBlockerFlags::default(), + rate_limits: EndpointRateLimitConfig::default(), }), } } @@ -439,6 +438,7 @@ mod tests { use crate::auth::{ComputeUserInfoMaybeEndpoint, IpPattern}; use crate::config::AuthenticationConfig; use crate::context::RequestContext; + use crate::control_plane::messages::EndpointRateLimitConfig; use crate::control_plane::{ self, AccessBlockerFlags, CachedNodeInfo, EndpointAccessControl, RoleAccessControl, }; @@ -477,6 +477,7 @@ mod tests { allowed_ips: Arc::new(self.ips.clone()), allowed_vpce: Arc::new(self.vpc_endpoint_ids.clone()), flags: self.access_blocker_flags, + rate_limits: EndpointRateLimitConfig::default(), }) } diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index 9a4be2f904..d37c107323 100644 --- a/proxy/src/cache/project_info.rs +++ 
b/proxy/src/cache/project_info.rs @@ -364,6 +364,7 @@ mod tests { use std::sync::Arc; use super::*; + use crate::control_plane::messages::EndpointRateLimitConfig; use crate::control_plane::{AccessBlockerFlags, AuthSecret}; use crate::scram::ServerSecret; use crate::types::ProjectId; @@ -399,6 +400,7 @@ mod tests { allowed_ips: allowed_ips.clone(), allowed_vpce: Arc::new(vec![]), flags: AccessBlockerFlags::default(), + rate_limits: EndpointRateLimitConfig::default(), }, RoleAccessControl { secret: secret1.clone(), @@ -414,6 +416,7 @@ mod tests { allowed_ips: allowed_ips.clone(), allowed_vpce: Arc::new(vec![]), flags: AccessBlockerFlags::default(), + rate_limits: EndpointRateLimitConfig::default(), }, RoleAccessControl { secret: secret2.clone(), @@ -439,6 +442,7 @@ mod tests { allowed_ips: allowed_ips.clone(), allowed_vpce: Arc::new(vec![]), flags: AccessBlockerFlags::default(), + rate_limits: EndpointRateLimitConfig::default(), }, RoleAccessControl { secret: secret3.clone(), diff --git a/proxy/src/control_plane/client/cplane_proxy_v1.rs b/proxy/src/control_plane/client/cplane_proxy_v1.rs index cf2d9fba14..8c76d034f7 100644 --- a/proxy/src/control_plane/client/cplane_proxy_v1.rs +++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs @@ -146,6 +146,7 @@ impl NeonControlPlaneClient { public_access_blocked: block_public_connections, vpc_access_blocked: block_vpc_connections, }, + rate_limits: body.rate_limits, }) } .inspect_err(|e| tracing::debug!(error = ?e)) @@ -312,6 +313,7 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { allowed_ips: Arc::new(auth_info.allowed_ips), allowed_vpce: Arc::new(auth_info.allowed_vpc_endpoint_ids), flags: auth_info.access_blocker_flags, + rate_limits: auth_info.rate_limits, }; let role_control = RoleAccessControl { secret: auth_info.secret, @@ -357,6 +359,7 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { allowed_ips: Arc::new(auth_info.allowed_ips), allowed_vpce: Arc::new(auth_info.allowed_vpc_endpoint_ids), flags: auth_info.access_blocker_flags, + rate_limits: auth_info.rate_limits, }; let role_control = RoleAccessControl { secret: auth_info.secret, diff --git a/proxy/src/control_plane/client/mock.rs b/proxy/src/control_plane/client/mock.rs index aeea57f2fc..b84dba6b09 100644 --- a/proxy/src/control_plane/client/mock.rs +++ b/proxy/src/control_plane/client/mock.rs @@ -20,7 +20,7 @@ use crate::context::RequestContext; use crate::control_plane::errors::{ ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError, }; -use crate::control_plane::messages::MetricsAuxInfo; +use crate::control_plane::messages::{EndpointRateLimitConfig, MetricsAuxInfo}; use crate::control_plane::{ AccessBlockerFlags, AuthInfo, AuthSecret, CachedNodeInfo, EndpointAccessControl, NodeInfo, RoleAccessControl, @@ -130,6 +130,7 @@ impl MockControlPlane { project_id: None, account_id: None, access_blocker_flags: AccessBlockerFlags::default(), + rate_limits: EndpointRateLimitConfig::default(), }) } @@ -233,6 +234,7 @@ impl super::ControlPlaneApi for MockControlPlane { allowed_ips: Arc::new(info.allowed_ips), allowed_vpce: Arc::new(info.allowed_vpc_endpoint_ids), flags: info.access_blocker_flags, + rate_limits: info.rate_limits, }) } diff --git a/proxy/src/control_plane/client/mod.rs b/proxy/src/control_plane/client/mod.rs index 9b9d1e25ea..4e5f5c7899 100644 --- a/proxy/src/control_plane/client/mod.rs +++ b/proxy/src/control_plane/client/mod.rs @@ -10,6 +10,7 @@ use clashmap::ClashMap; use tokio::time::Instant; use tracing::{debug, info}; +use 
super::{EndpointAccessControl, RoleAccessControl}; use crate::auth::backend::ComputeUserInfo; use crate::auth::backend::jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError}; use crate::cache::endpoints::EndpointsCache; @@ -22,8 +23,6 @@ use crate::metrics::ApiLockMetrics; use crate::rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token}; use crate::types::EndpointId; -use super::{EndpointAccessControl, RoleAccessControl}; - #[non_exhaustive] #[derive(Clone)] pub enum ControlPlaneClient { diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs index ec4554eab5..f0314f91f0 100644 --- a/proxy/src/control_plane/messages.rs +++ b/proxy/src/control_plane/messages.rs @@ -227,12 +227,35 @@ pub(crate) struct UserFacingMessage { #[derive(Deserialize)] pub(crate) struct GetEndpointAccessControl { pub(crate) role_secret: Box, - pub(crate) allowed_ips: Option>, - pub(crate) allowed_vpc_endpoint_ids: Option>, + pub(crate) project_id: Option, pub(crate) account_id: Option, + + pub(crate) allowed_ips: Option>, + pub(crate) allowed_vpc_endpoint_ids: Option>, pub(crate) block_public_connections: Option, pub(crate) block_vpc_connections: Option, + + #[serde(default)] + pub(crate) rate_limits: EndpointRateLimitConfig, +} + +#[derive(Copy, Clone, Deserialize, Default)] +pub struct EndpointRateLimitConfig { + pub connection_attempts: ConnectionAttemptsLimit, +} + +#[derive(Copy, Clone, Deserialize, Default)] +pub struct ConnectionAttemptsLimit { + pub tcp: Option, + pub ws: Option, + pub http: Option, +} + +#[derive(Copy, Clone, Deserialize)] +pub struct LeakyBucketSetting { + pub rps: f64, + pub burst: f64, } /// Response which holds compute node's `host:port` pair. diff --git a/proxy/src/control_plane/mod.rs b/proxy/src/control_plane/mod.rs index ad10cf4257..ed83e98bfe 100644 --- a/proxy/src/control_plane/mod.rs +++ b/proxy/src/control_plane/mod.rs @@ -11,6 +11,8 @@ pub(crate) mod errors; use std::sync::Arc; +use messages::EndpointRateLimitConfig; + use crate::auth::backend::ComputeUserInfo; use crate::auth::backend::jwt::AuthRule; use crate::auth::{AuthError, IpPattern, check_peer_addr_is_in_list}; @@ -18,8 +20,9 @@ use crate::cache::{Cached, TimedLru}; use crate::config::ComputeConfig; use crate::context::RequestContext; use crate::control_plane::messages::{ControlPlaneErrorMessage, MetricsAuxInfo}; -use crate::intern::{AccountIdInt, ProjectIdInt}; +use crate::intern::{AccountIdInt, EndpointIdInt, ProjectIdInt}; use crate::protocol2::ConnectionInfoExtra; +use crate::rate_limiter::{EndpointRateLimiter, LeakyBucketConfig}; use crate::types::{EndpointCacheKey, EndpointId, RoleName}; use crate::{compute, scram}; @@ -56,6 +59,8 @@ pub(crate) struct AuthInfo { pub(crate) account_id: Option, /// Are public connections or VPC connections blocked? pub(crate) access_blocker_flags: AccessBlockerFlags, + /// The rate limits for this endpoint. + pub(crate) rate_limits: EndpointRateLimitConfig, } /// Info for establishing a connection to a compute node. 
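As a rough illustration of the `rate_limits` plumbing added above (not part of the patch itself): a minimal, self-contained sketch of how a control-plane response with or without the optional `rate_limits` field deserializes, assuming `serde` (with derive) and `serde_json` as dependencies and using simplified re-declarations of the structs added in `messages.rs`. The JSON payloads are invented examples, not recorded control-plane responses.

```rust
// Sketch only: simplified re-declarations of the structs from this patch,
// to show how an endpoint access-control response deserializes with and
// without the new, optional `rate_limits` field.
use serde::Deserialize;

#[derive(Copy, Clone, Debug, Default, Deserialize)]
struct EndpointRateLimitConfig {
    connection_attempts: ConnectionAttemptsLimit,
}

#[derive(Copy, Clone, Debug, Default, Deserialize)]
struct ConnectionAttemptsLimit {
    tcp: Option<LeakyBucketSetting>,
    ws: Option<LeakyBucketSetting>,
    http: Option<LeakyBucketSetting>,
}

#[derive(Copy, Clone, Debug, Deserialize)]
struct LeakyBucketSetting {
    rps: f64,
    burst: f64,
}

#[derive(Debug, Deserialize)]
struct GetEndpointAccessControl {
    role_secret: Box<str>,
    // `#[serde(default)]` keeps the change backward compatible: responses
    // that omit `rate_limits` fall back to the empty default config.
    #[serde(default)]
    rate_limits: EndpointRateLimitConfig,
}

fn main() {
    // Invented example: response carrying a TCP connection-attempt limit.
    let with_limits = r#"{
        "role_secret": "scram-secret",
        "rate_limits": {
            "connection_attempts": { "tcp": { "rps": 10.0, "burst": 20.0 } }
        }
    }"#;
    // Invented example: response from a control plane that omits rate limits.
    let without_limits = r#"{ "role_secret": "scram-secret" }"#;

    let a: GetEndpointAccessControl = serde_json::from_str(with_limits).unwrap();
    let b: GetEndpointAccessControl = serde_json::from_str(without_limits).unwrap();

    assert!(a.rate_limits.connection_attempts.tcp.is_some());
    assert!(b.rate_limits.connection_attempts.tcp.is_none());
    println!("tcp limit: {:?}", a.rate_limits.connection_attempts.tcp);
}
```

When `rate_limits` is absent, the default (empty) config means `connection_attempt_rate_limit` ends up passing `None` to the limiter, which matches what the old call sites did before this patch.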
@@ -101,6 +106,8 @@ pub struct EndpointAccessControl { pub allowed_ips: Arc>, pub allowed_vpce: Arc>, pub flags: AccessBlockerFlags, + + pub rate_limits: EndpointRateLimitConfig, } impl EndpointAccessControl { @@ -139,6 +146,36 @@ impl EndpointAccessControl { Ok(()) } + + pub fn connection_attempt_rate_limit( + &self, + ctx: &RequestContext, + endpoint: &EndpointId, + rate_limiter: &EndpointRateLimiter, + ) -> Result<(), AuthError> { + let endpoint = EndpointIdInt::from(endpoint); + + let limits = &self.rate_limits.connection_attempts; + let config = match ctx.protocol() { + crate::metrics::Protocol::Http => limits.http, + crate::metrics::Protocol::Ws => limits.ws, + crate::metrics::Protocol::Tcp => limits.tcp, + crate::metrics::Protocol::SniRouter => return Ok(()), + }; + let config = config.and_then(|config| { + if config.rps <= 0.0 || config.burst <= 0.0 { + return None; + } + + Some(LeakyBucketConfig::new(config.rps, config.burst)) + }); + + if !rate_limiter.check(endpoint, config, 1) { + return Err(AuthError::too_many_connections()); + } + + Ok(()) + } } /// This will allocate per each call, but the http requests alone diff --git a/proxy/src/rate_limiter/leaky_bucket.rs b/proxy/src/rate_limiter/leaky_bucket.rs index 0c79b5e92f..f7e54ebfe7 100644 --- a/proxy/src/rate_limiter/leaky_bucket.rs +++ b/proxy/src/rate_limiter/leaky_bucket.rs @@ -69,9 +69,8 @@ pub struct LeakyBucketConfig { pub max: f64, } -#[cfg(test)] impl LeakyBucketConfig { - pub(crate) fn new(rps: f64, max: f64) -> Self { + pub fn new(rps: f64, max: f64) -> Self { assert!(rps > 0.0, "rps must be positive"); assert!(max > 0.0, "max must be positive"); Self { rps, max } diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index 9d700c1b52..0cd539188a 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -12,11 +12,10 @@ use rand::{Rng, SeedableRng}; use tokio::time::{Duration, Instant}; use tracing::info; +use super::LeakyBucketConfig; use crate::ext::LockExt; use crate::intern::EndpointIdInt; -use super::LeakyBucketConfig; - pub struct GlobalRateLimiter { data: Vec, info: Vec, diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 316e038344..26269d0a6e 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -68,17 +68,20 @@ impl PoolingBackend { self.config.authentication_config.is_vpc_acccess_proxy, )?; - let ep = EndpointIdInt::from(&user_info.endpoint); - let rate_limit_config = None; - if !self.endpoint_rate_limiter.check(ep, rate_limit_config, 1) { - return Err(AuthError::too_many_connections()); - } + access_control.connection_attempt_rate_limit( + ctx, + &user_info.endpoint, + &self.endpoint_rate_limiter, + )?; + let role_access = backend.get_role_secret(ctx).await?; let Some(secret) = role_access.secret else { // If we don't have an authentication secret, for the http flow we can just return an error. 
info!("authentication info not found"); return Err(AuthError::password_failed(&*user_info.user)); }; + + let ep = EndpointIdInt::from(&user_info.endpoint); let auth_outcome = crate::auth::validate_password_and_exchange( &self.config.authentication_config.thread_pool, ep, From 3f16ca2c185e820706e75fe7a8162ffbd3349b60 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Tue, 10 Jun 2025 17:59:51 +0200 Subject: [PATCH 026/124] Respect limits for projects for the Random Operations test (#12184) ## Problem The project limits were not respected, resulting in errors. ## Summary of changes Now limits are checked before running an action, and if the action is not possible to run, another random action will be run. --------- Co-authored-by: Peter Bendel --- test_runner/fixtures/neon_api.py | 12 ++++++ test_runner/random_ops/test_random_ops.py | 52 +++++++++++++++++------ 2 files changed, 51 insertions(+), 13 deletions(-) diff --git a/test_runner/fixtures/neon_api.py b/test_runner/fixtures/neon_api.py index 0cf5945458..9d85b9a332 100644 --- a/test_runner/fixtures/neon_api.py +++ b/test_runner/fixtures/neon_api.py @@ -129,6 +129,18 @@ class NeonAPI: return cast("dict[str, Any]", resp.json()) + def get_project_limits(self, project_id: str) -> dict[str, Any]: + resp = self.__request( + "GET", + f"/projects/{project_id}/limits", + headers={ + "Accept": "application/json", + "Content-Type": "application/json", + }, + ) + + return cast("dict[str, Any]", resp.json()) + def delete_project( self, project_id: str, diff --git a/test_runner/random_ops/test_random_ops.py b/test_runner/random_ops/test_random_ops.py index 645c9b7b9d..d3815c40bb 100644 --- a/test_runner/random_ops/test_random_ops.py +++ b/test_runner/random_ops/test_random_ops.py @@ -45,6 +45,8 @@ class NeonEndpoint: if self.branch.connect_env: self.connect_env = self.branch.connect_env.copy() self.connect_env["PGHOST"] = self.host + if self.type == "read_only": + self.project.read_only_endpoints_total += 1 def delete(self): self.project.delete_endpoint(self.id) @@ -228,8 +230,13 @@ class NeonProject: self.benchmarks: dict[str, subprocess.Popen[Any]] = {} self.restore_num: int = 0 self.restart_pgbench_on_console_errors: bool = False + self.limits: dict[str, Any] = self.get_limits()["limits"] + self.read_only_endpoints_total: int = 0 - def delete(self): + def get_limits(self) -> dict[str, Any]: + return self.neon_api.get_project_limits(self.id) + + def delete(self) -> None: self.neon_api.delete_project(self.id) def create_branch(self, parent_id: str | None = None) -> NeonBranch | None: @@ -282,6 +289,7 @@ class NeonProject: self.neon_api.delete_endpoint(self.id, endpoint_id) self.endpoints[endpoint_id].branch.endpoints.pop(endpoint_id) self.endpoints.pop(endpoint_id) + self.read_only_endpoints_total -= 1 self.wait() def start_benchmark(self, target: str, clients: int = 10) -> subprocess.Popen[Any]: @@ -369,49 +377,64 @@ def setup_class( print(f"::warning::Retried on 524 error {neon_api.retries524} times") if neon_api.retries4xx > 0: print(f"::warning::Retried on 4xx error {neon_api.retries4xx} times") - log.info("Removing the project") + log.info("Removing the project %s", project.id) project.delete() -def do_action(project: NeonProject, action: str) -> None: +def do_action(project: NeonProject, action: str) -> bool: """ Runs the action """ log.info("Action: %s", action) if action == "new_branch": log.info("Trying to create a new branch") + if 0 <= project.limits["max_branches"] <= len(project.branches): + 
log.info( + "Maximum branch limit exceeded (%s of %s)", + len(project.branches), + project.limits["max_branches"], + ) + return False parent = project.branches[ random.choice(list(set(project.branches.keys()) - project.reset_branches)) ] log.info("Parent: %s", parent) child = parent.create_child_branch() if child is None: - return + return False log.info("Created branch %s", child) child.start_benchmark() elif action == "delete_branch": if project.leaf_branches: - target = random.choice(list(project.leaf_branches.values())) + target: NeonBranch = random.choice(list(project.leaf_branches.values())) log.info("Trying to delete branch %s", target) target.delete() else: log.info("Leaf branches not found, skipping") + return False elif action == "new_ro_endpoint": + if 0 <= project.limits["max_read_only_endpoints"] <= project.read_only_endpoints_total: + log.info( + "Maximum read only endpoint limit exceeded (%s of %s)", + project.read_only_endpoints_total, + project.limits["max_read_only_endpoints"], + ) + return False ep = random.choice( [br for br in project.branches.values() if br.id not in project.reset_branches] ).create_ro_endpoint() log.info("Created the RO endpoint with id %s branch: %s", ep.id, ep.branch.id) ep.start_benchmark() elif action == "delete_ro_endpoint": + if project.read_only_endpoints_total == 0: + log.info("no read_only endpoints present, skipping") + return False ro_endpoints: list[NeonEndpoint] = [ endpoint for endpoint in project.endpoints.values() if endpoint.type == "read_only" ] - if ro_endpoints: - target_ep: NeonEndpoint = random.choice(ro_endpoints) - target_ep.delete() - log.info("endpoint %s deleted", target_ep.id) - else: - log.info("no read_only endpoints present, skipping") + target_ep: NeonEndpoint = random.choice(ro_endpoints) + target_ep.delete() + log.info("endpoint %s deleted", target_ep.id) elif action == "restore_random_time": if project.leaf_branches: br: NeonBranch = random.choice(list(project.leaf_branches.values())) @@ -419,8 +442,10 @@ def do_action(project: NeonProject, action: str) -> None: br.restore_random_time() else: log.info("No leaf branches found") + return False else: raise ValueError(f"The action {action} is unknown") + return True @pytest.mark.timeout(7200) @@ -457,8 +482,9 @@ def test_api_random( pg_bin.run(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=project.main_branch.connect_env) for _ in range(num_operations): log.info("Starting action #%s", _ + 1) - do_action( + while not do_action( project, random.choices([a[0] for a in ACTIONS], weights=[w[1] for w in ACTIONS])[0] - ) + ): + log.info("Retrying...") project.check_all_benchmarks() assert True From 1b935b1958a7f508807f1bd241e715f33cdc386e Mon Sep 17 00:00:00 2001 From: Mikhail Date: Tue, 10 Jun 2025 20:25:32 +0100 Subject: [PATCH 027/124] endpoint_storage: add ?from_endpoint= to /lfc/prewarm (#12195) Related: https://github.com/neondatabase/cloud/issues/24225 Add optional from_endpoint parameter to allow prewarming from other endpoint --- Cargo.lock | 16 +++++++++++++ Cargo.toml | 2 +- compute_tools/src/compute.rs | 2 +- compute_tools/src/compute_prewarm.rs | 30 +++++++++++++++---------- compute_tools/src/http/routes/lfc.rs | 13 +++++++++-- test_runner/fixtures/endpoint/http.py | 6 +++-- test_runner/regress/test_lfc_prewarm.py | 3 ++- 7 files changed, 53 insertions(+), 19 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3ee261e885..d8bf04e87f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -753,6 +753,7 @@ dependencies = [ "axum", "axum-core", "bytes", + "form_urlencoded", 
"futures-util", "headers", "http 1.1.0", @@ -761,6 +762,8 @@ dependencies = [ "mime", "pin-project-lite", "serde", + "serde_html_form", + "serde_path_to_error", "tower 0.5.2", "tower-layer", "tower-service", @@ -6422,6 +6425,19 @@ dependencies = [ "syn 2.0.100", ] +[[package]] +name = "serde_html_form" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d2de91cf02bbc07cde38891769ccd5d4f073d22a40683aa4bc7a95781aaa2c4" +dependencies = [ + "form_urlencoded", + "indexmap 2.9.0", + "itoa", + "ryu", + "serde", +] + [[package]] name = "serde_json" version = "1.0.125" diff --git a/Cargo.toml b/Cargo.toml index a040010fb7..666ead7352 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -71,7 +71,7 @@ aws-credential-types = "1.2.0" aws-sigv4 = { version = "1.2", features = ["sign-http"] } aws-types = "1.3" axum = { version = "0.8.1", features = ["ws"] } -axum-extra = { version = "0.10.0", features = ["typed-header"] } +axum-extra = { version = "0.10.0", features = ["typed-header", "query"] } base64 = "0.13.0" bincode = "1.3" bindgen = "0.71" diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index bd6ed910be..f15538b157 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -785,7 +785,7 @@ impl ComputeNode { self.spawn_extension_stats_task(); if pspec.spec.autoprewarm { - self.prewarm_lfc(); + self.prewarm_lfc(None); } Ok(()) } diff --git a/compute_tools/src/compute_prewarm.rs b/compute_tools/src/compute_prewarm.rs index a6a84b3f1f..1c7a7bef60 100644 --- a/compute_tools/src/compute_prewarm.rs +++ b/compute_tools/src/compute_prewarm.rs @@ -25,11 +25,16 @@ struct EndpointStoragePair { } const KEY: &str = "lfc_state"; -impl TryFrom<&crate::compute::ParsedSpec> for EndpointStoragePair { - type Error = anyhow::Error; - fn try_from(pspec: &crate::compute::ParsedSpec) -> Result { - let Some(ref endpoint_id) = pspec.spec.endpoint_id else { - bail!("pspec.endpoint_id missing") +impl EndpointStoragePair { + /// endpoint_id is set to None while prewarming from other endpoint, see replica promotion + /// If not None, takes precedence over pspec.spec.endpoint_id + fn from_spec_and_endpoint( + pspec: &crate::compute::ParsedSpec, + endpoint_id: Option, + ) -> Result { + let endpoint_id = endpoint_id.as_ref().or(pspec.spec.endpoint_id.as_ref()); + let Some(ref endpoint_id) = endpoint_id else { + bail!("pspec.endpoint_id missing, other endpoint_id not provided") }; let Some(ref base_uri) = pspec.endpoint_storage_addr else { bail!("pspec.endpoint_storage_addr missing") @@ -84,7 +89,7 @@ impl ComputeNode { } /// Returns false if there is a prewarm request ongoing, true otherwise - pub fn prewarm_lfc(self: &Arc) -> bool { + pub fn prewarm_lfc(self: &Arc, from_endpoint: Option) -> bool { crate::metrics::LFC_PREWARM_REQUESTS.inc(); { let state = &mut self.state.lock().unwrap().lfc_prewarm_state; @@ -97,7 +102,7 @@ impl ComputeNode { let cloned = self.clone(); spawn(async move { - let Err(err) = cloned.prewarm_impl().await else { + let Err(err) = cloned.prewarm_impl(from_endpoint).await else { cloned.state.lock().unwrap().lfc_prewarm_state = LfcPrewarmState::Completed; return; }; @@ -109,13 +114,14 @@ impl ComputeNode { true } - fn endpoint_storage_pair(&self) -> Result { + /// from_endpoint: None for endpoint managed by this compute_ctl + fn endpoint_storage_pair(&self, from_endpoint: Option) -> Result { let state = self.state.lock().unwrap(); - state.pspec.as_ref().unwrap().try_into() + 
EndpointStoragePair::from_spec_and_endpoint(state.pspec.as_ref().unwrap(), from_endpoint) } - async fn prewarm_impl(&self) -> Result<()> { - let EndpointStoragePair { url, token } = self.endpoint_storage_pair()?; + async fn prewarm_impl(&self, from_endpoint: Option) -> Result<()> { + let EndpointStoragePair { url, token } = self.endpoint_storage_pair(from_endpoint)?; info!(%url, "requesting LFC state from endpoint storage"); let request = Client::new().get(&url).bearer_auth(token); @@ -173,7 +179,7 @@ impl ComputeNode { } async fn offload_lfc_impl(&self) -> Result<()> { - let EndpointStoragePair { url, token } = self.endpoint_storage_pair()?; + let EndpointStoragePair { url, token } = self.endpoint_storage_pair(None)?; info!(%url, "requesting LFC state from postgres"); let mut compressed = Vec::new(); diff --git a/compute_tools/src/http/routes/lfc.rs b/compute_tools/src/http/routes/lfc.rs index 07bcc6bfb7..e98bd781a2 100644 --- a/compute_tools/src/http/routes/lfc.rs +++ b/compute_tools/src/http/routes/lfc.rs @@ -2,6 +2,7 @@ use crate::compute_prewarm::LfcPrewarmStateWithProgress; use crate::http::JsonResponse; use axum::response::{IntoResponse, Response}; use axum::{Json, http::StatusCode}; +use axum_extra::extract::OptionalQuery; use compute_api::responses::LfcOffloadState; type Compute = axum::extract::State>; @@ -16,8 +17,16 @@ pub(in crate::http) async fn offload_state(compute: Compute) -> Json Response { - if compute.prewarm_lfc() { +#[derive(serde::Deserialize)] +pub struct PrewarmQuery { + pub from_endpoint: String, +} + +pub(in crate::http) async fn prewarm( + compute: Compute, + OptionalQuery(query): OptionalQuery, +) -> Response { + if compute.prewarm_lfc(query.map(|q| q.from_endpoint)) { StatusCode::ACCEPTED.into_response() } else { JsonResponse::error( diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index 6d37dd1cb1..e2d405227b 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -69,8 +69,10 @@ class EndpointHttpClient(requests.Session): json: dict[str, str] = res.json() return json - def prewarm_lfc(self): - self.post(f"http://localhost:{self.external_port}/lfc/prewarm").raise_for_status() + def prewarm_lfc(self, from_endpoint_id: str | None = None): + url: str = f"http://localhost:{self.external_port}/lfc/prewarm" + params = {"from_endpoint": from_endpoint_id} if from_endpoint_id else dict() + self.post(url, params=params).raise_for_status() def prewarmed(): json = self.prewarm_lfc_status() diff --git a/test_runner/regress/test_lfc_prewarm.py b/test_runner/regress/test_lfc_prewarm.py index 82e1e9fcba..40a9b29296 100644 --- a/test_runner/regress/test_lfc_prewarm.py +++ b/test_runner/regress/test_lfc_prewarm.py @@ -188,7 +188,8 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMet pg_cur.execute("select pg_reload_conf()") if query is LfcQueryMethod.COMPUTE_CTL: - http_client.prewarm_lfc() + # Same thing as prewarm_lfc(), testing other method + http_client.prewarm_lfc(endpoint.endpoint_id) else: pg_cur.execute("select prewarm_local_cache(%s)", (lfc_state,)) From 24038033bf46f48c6dbe160150d620212ab3040a Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 11 Jun 2025 16:16:58 +0300 Subject: [PATCH 028/124] Remove default from DROP FUNCTION (#12202) ## Problem DROP FUNCTION doesn't allow to specify default for parameters. 
## Summary of changes Remove DEFAULT clause from pgxn/neon/neon--1.6--1.5.sql Co-authored-by: Konstantin Knizhnik --- pgxn/neon/neon--1.6--1.5.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgxn/neon/neon--1.6--1.5.sql b/pgxn/neon/neon--1.6--1.5.sql index 57512980f5..50c62238a3 100644 --- a/pgxn/neon/neon--1.6--1.5.sql +++ b/pgxn/neon/neon--1.6--1.5.sql @@ -2,6 +2,6 @@ DROP FUNCTION IF EXISTS get_prewarm_info(out total_pages integer, out prewarmed_ DROP FUNCTION IF EXISTS get_local_cache_state(max_chunks integer); -DROP FUNCTION IF EXISTS prewarm_local_cache(state bytea, n_workers integer default 1); +DROP FUNCTION IF EXISTS prewarm_local_cache(state bytea, n_workers integer); From 28e882a80f2bc48355ec4eac403746127382979b Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 11 Jun 2025 17:16:30 +0100 Subject: [PATCH 029/124] pageserver: warn on long layer manager locking intervals (#12194) ## Problem We hold the layer map for too long on occasion. ## Summary of changes This should help us identify the places where it's happening from. Related https://github.com/neondatabase/neon/issues/12182 --- pageserver/src/http/routes.rs | 6 +- pageserver/src/tenant.rs | 35 ++-- pageserver/src/tenant/mgr.rs | 6 +- .../src/tenant/storage_layer/delta_layer.rs | 5 +- .../src/tenant/storage_layer/layer/tests.rs | 19 +-- pageserver/src/tenant/timeline.rs | 130 ++++++++++----- pageserver/src/tenant/timeline/analysis.rs | 6 +- pageserver/src/tenant/timeline/compaction.rs | 71 +++++--- .../src/tenant/timeline/detach_ancestor.rs | 16 +- .../src/tenant/timeline/eviction_task.rs | 3 +- .../src/tenant/timeline/import_pgdata.rs | 6 +- .../src/tenant/timeline/import_pgdata/flow.rs | 13 +- .../src/tenant/timeline/layer_manager.rs | 151 ++++++++++++++++++ 13 files changed, 375 insertions(+), 92 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index c8a2a0209f..626986f580 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -73,6 +73,7 @@ use crate::tenant::remote_timeline_client::{ use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::{IoConcurrency, LayerAccessStatsReset, LayerName}; +use crate::tenant::timeline::layer_manager::LayerManagerLockHolder; use crate::tenant::timeline::offload::{OffloadError, offload_timeline}; use crate::tenant::timeline::{ CompactFlags, CompactOptions, CompactRequest, CompactionError, MarkInvisibleRequest, Timeline, @@ -1451,7 +1452,10 @@ async fn timeline_layer_scan_disposable_keys( let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download) .with_scope_timeline(&timeline); - let guard = timeline.layers.read().await; + let guard = timeline + .layers + .read(LayerManagerLockHolder::GetLayerMapInfo) + .await; let Some(layer) = guard.try_get_from_key(&layer_name.clone().into()) else { return Err(ApiError::NotFound( anyhow::anyhow!("Layer {tenant_shard_id}/{timeline_id}/{layer_name} not found").into(), diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 98a6bc2387..cfecf5561c 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -51,6 +51,7 @@ use secondary::heatmap::{HeatMapTenant, HeatMapTimeline}; use storage_broker::BrokerClientChannel; use timeline::compaction::{CompactionOutcome, GcCompactionQueue}; use timeline::import_pgdata::ImportingTimeline; +use timeline::layer_manager::LayerManagerLockHolder; use timeline::offload::{OffloadError, offload_timeline}; use timeline::{ 
CompactFlags, CompactOptions, CompactionError, PreviousHeatmap, ShutdownMode, import_pgdata, @@ -1315,7 +1316,7 @@ impl TenantShard { ancestor.is_some() || timeline .layers - .read() + .read(LayerManagerLockHolder::LoadLayerMap) .await .layer_map() .expect( @@ -2643,7 +2644,7 @@ impl TenantShard { } let layer_names = tline .layers - .read() + .read(LayerManagerLockHolder::Testing) .await .layer_map() .unwrap() @@ -3158,7 +3159,12 @@ impl TenantShard { for timeline in &compact { // Collect L0 counts. Can't await while holding lock above. - if let Ok(lm) = timeline.layers.read().await.layer_map() { + if let Ok(lm) = timeline + .layers + .read(LayerManagerLockHolder::Compaction) + .await + .layer_map() + { l0_counts.insert(timeline.timeline_id, lm.level0_deltas().len()); } } @@ -4900,7 +4906,7 @@ impl TenantShard { } let layer_names = tline .layers - .read() + .read(LayerManagerLockHolder::Testing) .await .layer_map() .unwrap() @@ -6970,7 +6976,7 @@ mod tests { .await?; make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?; - let layer_map = tline.layers.read().await; + let layer_map = tline.layers.read(LayerManagerLockHolder::Testing).await; let level0_deltas = layer_map .layer_map()? .level0_deltas() @@ -7206,7 +7212,7 @@ mod tests { let lsn = Lsn(0x10); let inserted = bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?; - let guard = tline.layers.read().await; + let guard = tline.layers.read(LayerManagerLockHolder::Testing).await; let lm = guard.layer_map()?; lm.dump(true, &ctx).await?; @@ -8234,12 +8240,23 @@ mod tests { tline.freeze_and_flush().await?; // force create a delta layer } - let before_num_l0_delta_files = - tline.layers.read().await.layer_map()?.level0_deltas().len(); + let before_num_l0_delta_files = tline + .layers + .read(LayerManagerLockHolder::Testing) + .await + .layer_map()? + .level0_deltas() + .len(); tline.compact(&cancel, EnumSet::default(), &ctx).await?; - let after_num_l0_delta_files = tline.layers.read().await.layer_map()?.level0_deltas().len(); + let after_num_l0_delta_files = tline + .layers + .read(LayerManagerLockHolder::Testing) + .await + .layer_map()? 
+ .level0_deltas() + .len(); assert!( after_num_l0_delta_files < before_num_l0_delta_files, diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 4aa459e923..766f846827 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -51,6 +51,7 @@ use crate::tenant::config::{ use crate::tenant::span::debug_assert_current_span_has_tenant_id; use crate::tenant::storage_layer::inmemory_layer; use crate::tenant::timeline::ShutdownMode; +use crate::tenant::timeline::layer_manager::LayerManagerLockHolder; use crate::tenant::{ AttachedTenantConf, GcError, LoadConfigError, SpawnMode, TenantShard, TenantState, }; @@ -1658,7 +1659,10 @@ impl TenantManager { let parent_timelines = timelines.keys().cloned().collect::>(); for timeline in timelines.values() { tracing::info!(timeline_id=%timeline.timeline_id, "Loading list of layers to hardlink"); - let layers = timeline.layers.read().await; + let layers = timeline + .layers + .read(LayerManagerLockHolder::GetLayerMapInfo) + .await; for layer in layers.likely_resident_layers() { let relative_path = layer diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 2c1b27c8d5..e82a28bb4c 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -1635,6 +1635,7 @@ pub(crate) mod test { use crate::tenant::disk_btree::tests::TestDisk; use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; use crate::tenant::storage_layer::{Layer, ResidentLayer}; + use crate::tenant::timeline::layer_manager::LayerManagerLockHolder; use crate::tenant::{TenantShard, Timeline}; /// Construct an index for a fictional delta layer and and then @@ -2002,7 +2003,7 @@ pub(crate) mod test { let initdb_layer = timeline .layers - .read() + .read(crate::tenant::timeline::layer_manager::LayerManagerLockHolder::Testing) .await .likely_resident_layers() .next() @@ -2078,7 +2079,7 @@ pub(crate) mod test { let new_layer = timeline .layers - .read() + .read(LayerManagerLockHolder::Testing) .await .likely_resident_layers() .find(|&x| x != &initdb_layer) diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index b6fd4678d6..2f2ff0f273 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -10,6 +10,7 @@ use super::*; use crate::context::DownloadBehavior; use crate::tenant::harness::{TenantHarness, test_img}; use crate::tenant::storage_layer::{IoConcurrency, LayerVisibilityHint}; +use crate::tenant::timeline::layer_manager::LayerManagerLockHolder; /// Used in tests to advance a future to wanted await point, and not futher. const ADVANCE: std::time::Duration = std::time::Duration::from_secs(3600); @@ -59,7 +60,7 @@ async fn smoke_test() { // there to avoid the timeline being illegally empty let (layer, dummy_layer) = { let mut layers = { - let layers = timeline.layers.read().await; + let layers = timeline.layers.read(LayerManagerLockHolder::Testing).await; layers.likely_resident_layers().cloned().collect::>() }; @@ -215,7 +216,7 @@ async fn smoke_test() { // Simulate GC removing our test layer. 
{ - let mut g = timeline.layers.write().await; + let mut g = timeline.layers.write(LayerManagerLockHolder::Testing).await; let layers = &[layer]; g.open_mut().unwrap().finish_gc_timeline(layers); @@ -261,7 +262,7 @@ async fn evict_and_wait_on_wanted_deleted() { let layer = { let mut layers = { - let layers = timeline.layers.read().await; + let layers = timeline.layers.read(LayerManagerLockHolder::Testing).await; layers.likely_resident_layers().cloned().collect::>() }; @@ -305,7 +306,7 @@ async fn evict_and_wait_on_wanted_deleted() { // assert that once we remove the `layer` from the layer map and drop our reference, // the deletion of the layer in remote_storage happens. { - let mut layers = timeline.layers.write().await; + let mut layers = timeline.layers.write(LayerManagerLockHolder::Testing).await; layers.open_mut().unwrap().finish_gc_timeline(&[layer]); } @@ -347,7 +348,7 @@ fn read_wins_pending_eviction() { let layer = { let mut layers = { - let layers = timeline.layers.read().await; + let layers = timeline.layers.read(LayerManagerLockHolder::Testing).await; layers.likely_resident_layers().cloned().collect::>() }; @@ -480,7 +481,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) { let layer = { let mut layers = { - let layers = timeline.layers.read().await; + let layers = timeline.layers.read(LayerManagerLockHolder::Testing).await; layers.likely_resident_layers().cloned().collect::>() }; @@ -655,7 +656,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() { let layer = { let mut layers = { - let layers = timeline.layers.read().await; + let layers = timeline.layers.read(LayerManagerLockHolder::Testing).await; layers.likely_resident_layers().cloned().collect::>() }; @@ -741,7 +742,7 @@ async fn evict_and_wait_does_not_wait_for_download() { let layer = { let mut layers = { - let layers = timeline.layers.read().await; + let layers = timeline.layers.read(LayerManagerLockHolder::Testing).await; layers.likely_resident_layers().cloned().collect::>() }; @@ -862,7 +863,7 @@ async fn eviction_cancellation_on_drop() { let (evicted_layer, not_evicted) = { let mut layers = { - let mut guard = timeline.layers.write().await; + let mut guard = timeline.layers.write(LayerManagerLockHolder::Testing).await; let layers = guard.likely_resident_layers().cloned().collect::>(); // remove the layers from layermap guard.open_mut().unwrap().finish_gc_timeline(&layers); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 0ff005fbb9..a1969ecae6 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -35,7 +35,11 @@ use fail::fail_point; use futures::stream::FuturesUnordered; use futures::{FutureExt, StreamExt}; use handle::ShardTimelineId; -use layer_manager::Shutdown; +use layer_manager::{ + LayerManagerLockHolder, LayerManagerReadGuard, LayerManagerWriteGuard, LockedLayerManager, + Shutdown, +}; + use offload::OffloadError; use once_cell::sync::Lazy; use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL; @@ -82,7 +86,6 @@ use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; use self::delete::DeleteTimelineFlow; pub(super) use self::eviction_task::EvictionTaskTenantState; use self::eviction_task::EvictionTaskTimelineState; -use self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; use super::remote_timeline_client::RemoteTimelineClient; @@ -181,13 +184,13 @@ impl std::fmt::Display for 
ImageLayerCreationMode { /// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things. /// Can be removed after all refactors are done. -fn drop_rlock(rlock: tokio::sync::RwLockReadGuard) { +fn drop_layer_manager_rlock(rlock: LayerManagerReadGuard<'_>) { drop(rlock) } /// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things. /// Can be removed after all refactors are done. -fn drop_wlock(rlock: tokio::sync::RwLockWriteGuard<'_, T>) { +fn drop_layer_manager_wlock(rlock: LayerManagerWriteGuard<'_>) { drop(rlock) } @@ -241,7 +244,7 @@ pub struct Timeline { /// /// In the future, we'll be able to split up the tuple of LayerMap and `LayerFileManager`, /// so that e.g. on-demand-download/eviction, and layer spreading, can operate just on `LayerFileManager`. - pub(crate) layers: tokio::sync::RwLock, + pub(crate) layers: LockedLayerManager, last_freeze_at: AtomicLsn, // Atomic would be more appropriate here. @@ -1535,7 +1538,10 @@ impl Timeline { /// This method makes no distinction between local and remote layers. /// Hence, the result **does not represent local filesystem usage**. pub(crate) async fn layer_size_sum(&self) -> u64 { - let guard = self.layers.read().await; + let guard = self + .layers + .read(LayerManagerLockHolder::GetLayerMapInfo) + .await; guard.layer_size_sum() } @@ -1845,7 +1851,7 @@ impl Timeline { // time, and this was missed. // if write_guard.is_none() { return; } - let Ok(layers_guard) = self.layers.try_read() else { + let Ok(layers_guard) = self.layers.try_read(LayerManagerLockHolder::TryFreezeLayer) else { // Don't block if the layer lock is busy return; }; @@ -2158,7 +2164,7 @@ impl Timeline { if let ShutdownMode::FreezeAndFlush = mode { let do_flush = if let Some((open, frozen)) = self .layers - .read() + .read(LayerManagerLockHolder::Shutdown) .await .layer_map() .map(|lm| (lm.open_layer.is_some(), lm.frozen_layers.len())) @@ -2262,7 +2268,10 @@ impl Timeline { // Allow any remaining in-memory layers to do cleanup -- until that, they hold the gate // open. let mut write_guard = self.write_lock.lock().await; - self.layers.write().await.shutdown(&mut write_guard); + self.layers + .write(LayerManagerLockHolder::Shutdown) + .await + .shutdown(&mut write_guard); } // Finally wait until any gate-holders are complete. @@ -2365,7 +2374,10 @@ impl Timeline { &self, reset: LayerAccessStatsReset, ) -> Result { - let guard = self.layers.read().await; + let guard = self + .layers + .read(LayerManagerLockHolder::GetLayerMapInfo) + .await; let layer_map = guard.layer_map()?; let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1); if let Some(open_layer) = &layer_map.open_layer { @@ -3232,7 +3244,7 @@ impl Timeline { /// Initialize with an empty layer map. Used when creating a new timeline. 
pub(super) fn init_empty_layer_map(&self, start_lsn: Lsn) { - let mut layers = self.layers.try_write().expect( + let mut layers = self.layers.try_write(LayerManagerLockHolder::Init).expect( "in the context where we call this function, no other task has access to the object", ); layers @@ -3252,7 +3264,10 @@ impl Timeline { use init::Decision::*; use init::{Discovered, DismissedLayer}; - let mut guard = self.layers.write().await; + let mut guard = self + .layers + .write(LayerManagerLockHolder::LoadLayerMap) + .await; let timer = self.metrics.load_layer_map_histo.start_timer(); @@ -3869,7 +3884,10 @@ impl Timeline { &self, layer_name: &LayerName, ) -> Result, layer_manager::Shutdown> { - let guard = self.layers.read().await; + let guard = self + .layers + .read(LayerManagerLockHolder::GetLayerMapInfo) + .await; let layer = guard .layer_map()? .iter_historic_layers() @@ -3902,7 +3920,10 @@ impl Timeline { return None; } - let guard = self.layers.read().await; + let guard = self + .layers + .read(LayerManagerLockHolder::GenerateHeatmap) + .await; // Firstly, if there's any heatmap left over from when this location // was a secondary, take that into account. Keep layers that are: @@ -4000,7 +4021,10 @@ impl Timeline { } pub(super) async fn generate_unarchival_heatmap(&self, end_lsn: Lsn) -> PreviousHeatmap { - let guard = self.layers.read().await; + let guard = self + .layers + .read(LayerManagerLockHolder::GenerateHeatmap) + .await; let now = SystemTime::now(); let mut heatmap_layers = Vec::default(); @@ -4342,7 +4366,7 @@ impl Timeline { query: &VersionedKeySpaceQuery, ) -> Result { let mut fringe = LayerFringe::new(); - let guard = self.layers.read().await; + let guard = self.layers.read(LayerManagerLockHolder::GetPage).await; match query { VersionedKeySpaceQuery::Uniform { keyspace, lsn } => { @@ -4445,7 +4469,7 @@ impl Timeline { // required for correctness, but avoids visiting extra layers // which turns out to be a perf bottleneck in some cases. if !unmapped_keyspace.is_empty() { - let guard = timeline.layers.read().await; + let guard = timeline.layers.read(LayerManagerLockHolder::GetPage).await; guard.update_search_fringe(&unmapped_keyspace, cont_lsn, &mut fringe)?; // It's safe to drop the layer map lock after planning the next round of reads. @@ -4555,7 +4579,10 @@ impl Timeline { _guard: &tokio::sync::MutexGuard<'_, Option>, ctx: &RequestContext, ) -> anyhow::Result> { - let mut guard = self.layers.write().await; + let mut guard = self + .layers + .write(LayerManagerLockHolder::GetLayerForWrite) + .await; let last_record_lsn = self.get_last_record_lsn(); ensure!( @@ -4597,7 +4624,10 @@ impl Timeline { write_lock: &mut tokio::sync::MutexGuard<'_, Option>, ) -> Result { let frozen = { - let mut guard = self.layers.write().await; + let mut guard = self + .layers + .write(LayerManagerLockHolder::TryFreezeLayer) + .await; guard .open_mut()? .try_freeze_in_memory_layer(at, &self.last_freeze_at, write_lock, &self.metrics) @@ -4638,7 +4668,12 @@ impl Timeline { ctx: &RequestContext, ) { // Subscribe to L0 delta layer updates, for compaction backpressure. - let mut watch_l0 = match self.layers.read().await.layer_map() { + let mut watch_l0 = match self + .layers + .read(LayerManagerLockHolder::FlushLoop) + .await + .layer_map() + { Ok(lm) => lm.watch_level0_deltas(), Err(Shutdown) => return, }; @@ -4675,7 +4710,7 @@ impl Timeline { // Fetch the next layer to flush, if any. 
let (layer, l0_count, frozen_count, frozen_size) = { - let layers = self.layers.read().await; + let layers = self.layers.read(LayerManagerLockHolder::FlushLoop).await; let Ok(lm) = layers.layer_map() else { info!("dropping out of flush loop for timeline shutdown"); return; @@ -4971,7 +5006,10 @@ impl Timeline { // in-memory layer from the map now. The flushed layer is stored in // the mapping in `create_delta_layer`. { - let mut guard = self.layers.write().await; + let mut guard = self + .layers + .write(LayerManagerLockHolder::FlushFrozenLayer) + .await; guard.open_mut()?.finish_flush_l0_layer( delta_layer_to_add.as_ref(), @@ -5186,7 +5224,7 @@ impl Timeline { async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool { let threshold = self.get_image_creation_threshold(); - let guard = self.layers.read().await; + let guard = self.layers.read(LayerManagerLockHolder::Compaction).await; let Ok(layers) = guard.layer_map() else { return false; }; @@ -5604,7 +5642,7 @@ impl Timeline { if let ImageLayerCreationMode::Force = mode { // When forced to create image layers, we might try and create them where they already // exist. This mode is only used in tests/debug. - let layers = self.layers.read().await; + let layers = self.layers.read(LayerManagerLockHolder::Compaction).await; if layers.contains_key(&PersistentLayerKey { key_range: img_range.clone(), lsn_range: PersistentLayerDesc::image_layer_lsn_range(lsn), @@ -5729,7 +5767,7 @@ impl Timeline { let image_layers = batch_image_writer.finish(self, ctx).await?; - let mut guard = self.layers.write().await; + let mut guard = self.layers.write(LayerManagerLockHolder::Compaction).await; // FIXME: we could add the images to be uploaded *before* returning from here, but right // now they are being scheduled outside of write lock; current way is inconsistent with @@ -5737,7 +5775,7 @@ impl Timeline { guard .open_mut()? .track_new_image_layers(&image_layers, &self.metrics); - drop_wlock(guard); + drop_layer_manager_wlock(guard); let duration = timer.stop_and_record(); // Creating image layers may have caused some previously visible layers to be covered @@ -6107,7 +6145,7 @@ impl Timeline { layers_to_remove: &[Layer], ) -> Result<(), CompactionError> { let mut guard = tokio::select! { - guard = self.layers.write() => guard, + guard = self.layers.write(LayerManagerLockHolder::Compaction) => guard, _ = self.cancel.cancelled() => { return Err(CompactionError::ShuttingDown); } @@ -6156,7 +6194,7 @@ impl Timeline { self.remote_client .schedule_compaction_update(&remove_layers, new_deltas)?; - drop_wlock(guard); + drop_layer_manager_wlock(guard); Ok(()) } @@ -6166,7 +6204,7 @@ impl Timeline { mut replace_layers: Vec<(Layer, ResidentLayer)>, mut drop_layers: Vec, ) -> Result<(), CompactionError> { - let mut guard = self.layers.write().await; + let mut guard = self.layers.write(LayerManagerLockHolder::Compaction).await; // Trim our lists in case our caller (compaction) raced with someone else (GC) removing layers: we want // to avoid double-removing, and avoid rewriting something that was removed. @@ -6517,7 +6555,10 @@ impl Timeline { // 5. 
newer on-disk image layers cover the layer's whole key range // // TODO holding a write lock is too agressive and avoidable - let mut guard = self.layers.write().await; + let mut guard = self + .layers + .write(LayerManagerLockHolder::GarbageCollection) + .await; let layers = guard.layer_map()?; 'outer: for l in layers.iter_historic_layers() { result.layers_total += 1; @@ -6819,7 +6860,10 @@ impl Timeline { use pageserver_api::models::DownloadRemoteLayersTaskState; let remaining = { - let guard = self.layers.read().await; + let guard = self + .layers + .read(LayerManagerLockHolder::GetLayerMapInfo) + .await; let Ok(lm) = guard.layer_map() else { // technically here we could look into iterating accessible layers, but downloading // all layers of a shutdown timeline makes no sense regardless. @@ -6925,7 +6969,7 @@ impl Timeline { impl Timeline { /// Returns non-remote layers for eviction. pub(crate) async fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo { - let guard = self.layers.read().await; + let guard = self.layers.read(LayerManagerLockHolder::Eviction).await; let mut max_layer_size: Option = None; let resident_layers = guard @@ -7026,7 +7070,7 @@ impl Timeline { let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?; info!("force created image layer {}", image_layer.local_path()); { - let mut guard = self.layers.write().await; + let mut guard = self.layers.write(LayerManagerLockHolder::Testing).await; guard .open_mut() .unwrap() @@ -7089,7 +7133,7 @@ impl Timeline { let delta_layer = Layer::finish_creating(self.conf, self, desc, &path)?; info!("force created delta layer {}", delta_layer.local_path()); { - let mut guard = self.layers.write().await; + let mut guard = self.layers.write(LayerManagerLockHolder::Testing).await; guard .open_mut() .unwrap() @@ -7184,7 +7228,7 @@ impl Timeline { // Link the layer to the layer map { - let mut guard = self.layers.write().await; + let mut guard = self.layers.write(LayerManagerLockHolder::Testing).await; let layer_map = guard.open_mut().unwrap(); layer_map.force_insert_in_memory_layer(Arc::new(layer)); } @@ -7201,7 +7245,7 @@ impl Timeline { io_concurrency: IoConcurrency, ) -> anyhow::Result> { let mut all_data = Vec::new(); - let guard = self.layers.read().await; + let guard = self.layers.read(LayerManagerLockHolder::Testing).await; for layer in guard.layer_map()?.iter_historic_layers() { if !layer.is_delta() && layer.image_layer_lsn() == lsn { let layer = guard.get_from_desc(&layer); @@ -7230,7 +7274,7 @@ impl Timeline { self: &Arc, ) -> anyhow::Result> { let mut layers = Vec::new(); - let guard = self.layers.read().await; + let guard = self.layers.read(LayerManagerLockHolder::Testing).await; for layer in guard.layer_map()?.iter_historic_layers() { layers.push(layer.key()); } @@ -7342,7 +7386,7 @@ impl TimelineWriter<'_> { let l0_count = self .tl .layers - .read() + .read(LayerManagerLockHolder::GetLayerMapInfo) .await .layer_map()? .level0_deltas() @@ -7561,6 +7605,7 @@ mod tests { use crate::tenant::harness::{TenantHarness, test_img}; use crate::tenant::layer_map::LayerMap; use crate::tenant::storage_layer::{Layer, LayerName, LayerVisibilityHint}; + use crate::tenant::timeline::layer_manager::LayerManagerLockHolder; use crate::tenant::timeline::{DeltaLayerTestDesc, EvictionError}; use crate::tenant::{PreviousHeatmap, Timeline}; @@ -7668,7 +7713,7 @@ mod tests { // Evict all the layers and stash the old heatmap in the timeline. // This simulates a migration to a cold secondary location. 
- let guard = timeline.layers.read().await; + let guard = timeline.layers.read(LayerManagerLockHolder::Testing).await; let mut all_layers = Vec::new(); let forever = std::time::Duration::from_secs(120); for layer in guard.likely_resident_layers() { @@ -7790,7 +7835,7 @@ mod tests { }))); // Evict all the layers in the previous heatmap - let guard = timeline.layers.read().await; + let guard = timeline.layers.read(LayerManagerLockHolder::Testing).await; let forever = std::time::Duration::from_secs(120); for layer in guard.likely_resident_layers() { layer.evict_and_wait(forever).await.unwrap(); @@ -7853,7 +7898,10 @@ mod tests { } async fn find_some_layer(timeline: &Timeline) -> Layer { - let layers = timeline.layers.read().await; + let layers = timeline + .layers + .read(LayerManagerLockHolder::GetLayerMapInfo) + .await; let desc = layers .layer_map() .unwrap() diff --git a/pageserver/src/tenant/timeline/analysis.rs b/pageserver/src/tenant/timeline/analysis.rs index 96864ec44b..90c70086ed 100644 --- a/pageserver/src/tenant/timeline/analysis.rs +++ b/pageserver/src/tenant/timeline/analysis.rs @@ -4,6 +4,7 @@ use std::ops::Range; use utils::lsn::Lsn; use super::Timeline; +use crate::tenant::timeline::layer_manager::LayerManagerLockHolder; #[derive(serde::Serialize)] pub(crate) struct RangeAnalysis { @@ -24,7 +25,10 @@ impl Timeline { let num_of_l0; let all_layer_files = { - let guard = self.layers.read().await; + let guard = self + .layers + .read(LayerManagerLockHolder::GetLayerMapInfo) + .await; num_of_l0 = guard.layer_map().unwrap().level0_deltas().len(); guard.all_persistent_layers() }; diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 72ca0f9cc1..0ec2292ee8 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -9,7 +9,7 @@ use std::ops::{Deref, Range}; use std::sync::Arc; use std::time::{Duration, Instant}; -use super::layer_manager::LayerManager; +use super::layer_manager::{LayerManagerLockHolder, LayerManagerReadGuard}; use super::{ CompactFlags, CompactOptions, CompactionError, CreateImageLayersError, DurationRecorder, GetVectoredError, ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration, @@ -62,7 +62,7 @@ use crate::tenant::storage_layer::{ use crate::tenant::tasks::log_compaction_error; use crate::tenant::timeline::{ DeltaLayerWriter, ImageLayerCreationOutcome, ImageLayerWriter, IoConcurrency, Layer, - ResidentLayer, drop_rlock, + ResidentLayer, drop_layer_manager_rlock, }; use crate::tenant::{DeltaLayer, MaybeOffloaded}; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; @@ -314,7 +314,10 @@ impl GcCompactionQueue { .unwrap_or(Lsn::INVALID); let layers = { - let guard = timeline.layers.read().await; + let guard = timeline + .layers + .read(LayerManagerLockHolder::GetLayerMapInfo) + .await; let layer_map = guard.layer_map()?; layer_map.iter_historic_layers().collect_vec() }; @@ -408,7 +411,10 @@ impl GcCompactionQueue { timeline: &Arc, lsn: Lsn, ) -> Result { - let guard = timeline.layers.read().await; + let guard = timeline + .layers + .read(LayerManagerLockHolder::GetLayerMapInfo) + .await; let layer_map = guard.layer_map()?; let layers = layer_map.iter_historic_layers().collect_vec(); let mut size = 0; @@ -851,7 +857,7 @@ impl KeyHistoryRetention { } let layer_generation; { - let guard = tline.layers.read().await; + let guard = tline.layers.read(LayerManagerLockHolder::Compaction).await; if !guard.contains_key(key) { return false; } @@ 
-1282,7 +1288,10 @@ impl Timeline { // We do the repartition on the L0-L1 boundary. All data below the boundary // are compacted by L0 with low read amplification, thus making the `repartition` // function run fast. - let guard = self.layers.read().await; + let guard = self + .layers + .read(LayerManagerLockHolder::GetLayerMapInfo) + .await; guard .all_persistent_layers() .iter() @@ -1461,7 +1470,7 @@ impl Timeline { let latest_gc_cutoff = self.get_applied_gc_cutoff_lsn(); let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.time; - let layers = self.layers.read().await; + let layers = self.layers.read(LayerManagerLockHolder::Compaction).await; let layers_iter = layers.layer_map()?.iter_historic_layers(); let (layers_total, mut layers_checked) = (layers_iter.len(), 0); for layer_desc in layers_iter { @@ -1722,7 +1731,10 @@ impl Timeline { // are implicitly left visible, because LayerVisibilityHint's default is Visible, and we never modify it here. // Note that L0 deltas _can_ be covered by image layers, but we consider them 'visible' because we anticipate that // they will be subject to L0->L1 compaction in the near future. - let layer_manager = self.layers.read().await; + let layer_manager = self + .layers + .read(LayerManagerLockHolder::GetLayerMapInfo) + .await; let layer_map = layer_manager.layer_map()?; let readable_points = { @@ -1775,7 +1787,7 @@ impl Timeline { }; let begin = tokio::time::Instant::now(); - let phase1_layers_locked = self.layers.read().await; + let phase1_layers_locked = self.layers.read(LayerManagerLockHolder::Compaction).await; let now = tokio::time::Instant::now(); stats.read_lock_acquisition_micros = DurationRecorder::Recorded(RecordedDuration(now - begin), now); @@ -1803,7 +1815,7 @@ impl Timeline { /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment. async fn compact_level0_phase1<'a>( self: &'a Arc, - guard: tokio::sync::RwLockReadGuard<'a, LayerManager>, + guard: LayerManagerReadGuard<'a>, mut stats: CompactLevel0Phase1StatsBuilder, target_file_size: u64, force_compaction_ignore_threshold: bool, @@ -2029,7 +2041,7 @@ impl Timeline { holes }; stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now(); - drop_rlock(guard); + drop_layer_manager_rlock(guard); if self.cancel.is_cancelled() { return Err(CompactionError::ShuttingDown); @@ -2469,7 +2481,7 @@ impl Timeline { // Find the top of the historical layers let end_lsn = { - let guard = self.layers.read().await; + let guard = self.layers.read(LayerManagerLockHolder::Compaction).await; let layers = guard.layer_map()?; let l0_deltas = layers.level0_deltas(); @@ -3008,7 +3020,7 @@ impl Timeline { } split_key_ranges.sort(); let all_layers = { - let guard = self.layers.read().await; + let guard = self.layers.read(LayerManagerLockHolder::Compaction).await; let layer_map = guard.layer_map()?; layer_map.iter_historic_layers().collect_vec() }; @@ -3185,7 +3197,10 @@ impl Timeline { // 1. If a layer is in the selection, all layers below it are in the selection. // 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection. 
let job_desc = { - let guard = self.layers.read().await; + let guard = self + .layers + .read(LayerManagerLockHolder::GarbageCollection) + .await; let layers = guard.layer_map()?; let gc_info = self.gc_info.read().unwrap(); let mut retain_lsns_below_horizon = Vec::new(); @@ -3956,7 +3971,10 @@ impl Timeline { // First, do a sanity check to ensure the newly-created layer map does not contain overlaps. let all_layers = { - let guard = self.layers.read().await; + let guard = self + .layers + .read(LayerManagerLockHolder::GarbageCollection) + .await; let layer_map = guard.layer_map()?; layer_map.iter_historic_layers().collect_vec() }; @@ -4020,7 +4038,10 @@ impl Timeline { let update_guard = self.gc_compaction_layer_update_lock.write().await; // Acquiring the update guard ensures current read operations end and new read operations are blocked. // TODO: can we use `latest_gc_cutoff` Rcu to achieve the same effect? - let mut guard = self.layers.write().await; + let mut guard = self + .layers + .write(LayerManagerLockHolder::GarbageCollection) + .await; guard .open_mut()? .finish_gc_compaction(&layer_selection, &compact_to, &self.metrics); @@ -4088,7 +4109,11 @@ impl TimelineAdaptor { pub async fn flush_updates(&mut self) -> Result<(), CompactionError> { let layers_to_delete = { - let guard = self.timeline.layers.read().await; + let guard = self + .timeline + .layers + .read(LayerManagerLockHolder::Compaction) + .await; self.layers_to_delete .iter() .map(|x| guard.get_from_desc(x)) @@ -4133,7 +4158,11 @@ impl CompactionJobExecutor for TimelineAdaptor { ) -> anyhow::Result>> { self.flush_updates().await?; - let guard = self.timeline.layers.read().await; + let guard = self + .timeline + .layers + .read(LayerManagerLockHolder::Compaction) + .await; let layer_map = guard.layer_map()?; let result = layer_map @@ -4172,7 +4201,11 @@ impl CompactionJobExecutor for TimelineAdaptor { // this is a lot more complex than a simple downcast... if layer.is_delta() { let l = { - let guard = self.timeline.layers.read().await; + let guard = self + .timeline + .layers + .read(LayerManagerLockHolder::Compaction) + .await; guard.get_from_desc(layer) }; let result = l.download_and_keep_resident(ctx).await?; diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 40eda8c785..f47ce5408b 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -19,7 +19,7 @@ use utils::id::TimelineId; use utils::lsn::Lsn; use utils::sync::gate::GateError; -use super::layer_manager::LayerManager; +use super::layer_manager::{LayerManager, LayerManagerLockHolder}; use super::{FlushLayerError, Timeline}; use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::TaskKind; @@ -199,7 +199,10 @@ pub(crate) async fn generate_tombstone_image_layer( let image_lsn = ancestor_lsn; { - let layers = detached.layers.read().await; + let layers = detached + .layers + .read(LayerManagerLockHolder::DetachAncestor) + .await; for layer in layers.all_persistent_layers() { if !layer.is_delta && layer.lsn_range.start == image_lsn @@ -423,7 +426,7 @@ pub(super) async fn prepare( // we do not need to start from our layers, because they can only be layers that come // *after* ancestor_lsn let layers = tokio::select! 
{ - guard = ancestor.layers.read() => guard, + guard = ancestor.layers.read(LayerManagerLockHolder::DetachAncestor) => guard, _ = detached.cancel.cancelled() => { return Err(ShuttingDown); } @@ -869,7 +872,12 @@ async fn remote_copy( // Double check that the file is orphan (probably from an earlier attempt), then delete it let key = file_name.clone().into(); - if adoptee.layers.read().await.contains_key(&key) { + if adoptee + .layers + .read(LayerManagerLockHolder::DetachAncestor) + .await + .contains_key(&key) + { // We are supposed to filter out such cases before coming to this function return Err(Error::Prepare(anyhow::anyhow!( "layer file {file_name} already present and inside layer map" diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index b1b0d32c9b..1328c3ac12 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -33,6 +33,7 @@ use crate::tenant::size::CalculateSyntheticSizeError; use crate::tenant::storage_layer::LayerVisibilityHint; use crate::tenant::tasks::{BackgroundLoopKind, BackgroundLoopSemaphorePermit, sleep_random}; use crate::tenant::timeline::EvictionError; +use crate::tenant::timeline::layer_manager::LayerManagerLockHolder; use crate::tenant::{LogicalSizeCalculationCause, TenantShard}; #[derive(Default)] @@ -208,7 +209,7 @@ impl Timeline { let mut js = tokio::task::JoinSet::new(); { - let guard = self.layers.read().await; + let guard = self.layers.read(LayerManagerLockHolder::Eviction).await; guard .likely_resident_layers() diff --git a/pageserver/src/tenant/timeline/import_pgdata.rs b/pageserver/src/tenant/timeline/import_pgdata.rs index 606ad09ef1..817d76ce2f 100644 --- a/pageserver/src/tenant/timeline/import_pgdata.rs +++ b/pageserver/src/tenant/timeline/import_pgdata.rs @@ -15,6 +15,7 @@ use super::{Timeline, TimelineDeleteProgress}; use crate::context::RequestContext; use crate::controller_upcall_client::{StorageControllerUpcallApi, StorageControllerUpcallClient}; use crate::tenant::metadata::TimelineMetadata; +use crate::tenant::timeline::layer_manager::LayerManagerLockHolder; mod flow; mod importbucket_client; @@ -163,7 +164,10 @@ async fn prepare_import( info!("wipe the slate clean"); { // TODO: do we need to hold GC lock for this? 
- let mut guard = timeline.layers.write().await; + let mut guard = timeline + .layers + .write(LayerManagerLockHolder::ImportPgData) + .await; assert!( guard.layer_map()?.open_layer.is_none(), "while importing, there should be no in-memory layer" // this just seems like a good place to assert it diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs index e003bb6810..ed679a9bdc 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs @@ -56,6 +56,7 @@ use crate::pgdatadir_mapping::{ }; use crate::task_mgr::TaskKind; use crate::tenant::storage_layer::{AsLayerDesc, ImageLayerWriter, Layer}; +use crate::tenant::timeline::layer_manager::LayerManagerLockHolder; pub async fn run( timeline: Arc, @@ -984,7 +985,10 @@ impl ChunkProcessingJob { let (desc, path) = writer.finish(ctx).await?; { - let guard = timeline.layers.read().await; + let guard = timeline + .layers + .read(LayerManagerLockHolder::ImportPgData) + .await; let existing_layer = guard.try_get_from_key(&desc.key()); if let Some(layer) = existing_layer { if layer.metadata().generation == timeline.generation { @@ -1007,7 +1011,10 @@ impl ChunkProcessingJob { // certain that the existing layer is identical to the new one, so in that case // we replace the old layer with the one we just generated. - let mut guard = timeline.layers.write().await; + let mut guard = timeline + .layers + .write(LayerManagerLockHolder::ImportPgData) + .await; let existing_layer = guard .try_get_from_key(&resident_layer.layer_desc().key()) @@ -1036,7 +1043,7 @@ impl ChunkProcessingJob { } } - crate::tenant::timeline::drop_wlock(guard); + crate::tenant::timeline::drop_layer_manager_wlock(guard); timeline .remote_client diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index ae898260d2..e419b6f8ad 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -1,5 +1,8 @@ use std::collections::HashMap; +use std::mem::ManuallyDrop; +use std::ops::{Deref, DerefMut}; use std::sync::Arc; +use std::time::Duration; use anyhow::{Context, bail, ensure}; use itertools::Itertools; @@ -20,6 +23,154 @@ use crate::tenant::storage_layer::{ PersistentLayerKey, ReadableLayerWeak, ResidentLayer, }; +/// Warn if the lock was held for longer than this threshold. +/// It's very generous and we should bring this value down over time. 
+const LAYER_MANAGER_LOCK_WARN_THRESHOLD: Duration = Duration::from_secs(5); + +/// Describes the operation that is holding the layer manager lock +#[derive(Debug, Clone, Copy, strum_macros::Display)] +#[strum(serialize_all = "kebab_case")] +pub(crate) enum LayerManagerLockHolder { + GetLayerMapInfo, + GenerateHeatmap, + GetPage, + Init, + LoadLayerMap, + GetLayerForWrite, + TryFreezeLayer, + FlushFrozenLayer, + FlushLoop, + Compaction, + GarbageCollection, + Shutdown, + ImportPgData, + DetachAncestor, + Eviction, + #[cfg(test)] + Testing, +} + +/// Wrapper for the layer manager that tracks the amount of time during which +/// it was held under read or write lock +#[derive(Default)] +pub(crate) struct LockedLayerManager { + locked: tokio::sync::RwLock, +} + +pub(crate) struct LayerManagerReadGuard<'a> { + guard: ManuallyDrop>, + acquired_at: std::time::Instant, + holder: LayerManagerLockHolder, +} + +pub(crate) struct LayerManagerWriteGuard<'a> { + guard: ManuallyDrop>, + acquired_at: std::time::Instant, + holder: LayerManagerLockHolder, +} + +impl Drop for LayerManagerReadGuard<'_> { + fn drop(&mut self) { + // Drop the lock first, before potentially warning if it was held for too long. + // SAFETY: ManuallyDrop in Drop implementation + unsafe { ManuallyDrop::drop(&mut self.guard) }; + + let held_for = self.acquired_at.elapsed(); + if held_for >= LAYER_MANAGER_LOCK_WARN_THRESHOLD { + tracing::warn!( + holder=%self.holder, + "Layer manager read lock held for {}s", + held_for.as_secs_f64(), + ); + } + } +} + +impl Drop for LayerManagerWriteGuard<'_> { + fn drop(&mut self) { + // Drop the lock first, before potentially warning if it was held for too long. + // SAFETY: ManuallyDrop in Drop implementation + unsafe { ManuallyDrop::drop(&mut self.guard) }; + + let held_for = self.acquired_at.elapsed(); + if held_for >= LAYER_MANAGER_LOCK_WARN_THRESHOLD { + tracing::warn!( + holder=%self.holder, + "Layer manager write lock held for {}s", + held_for.as_secs_f64(), + ); + } + } +} + +impl Deref for LayerManagerReadGuard<'_> { + type Target = LayerManager; + + fn deref(&self) -> &Self::Target { + self.guard.deref() + } +} + +impl Deref for LayerManagerWriteGuard<'_> { + type Target = LayerManager; + + fn deref(&self) -> &Self::Target { + self.guard.deref() + } +} + +impl DerefMut for LayerManagerWriteGuard<'_> { + fn deref_mut(&mut self) -> &mut Self::Target { + self.guard.deref_mut() + } +} + +impl LockedLayerManager { + pub(crate) async fn read(&self, holder: LayerManagerLockHolder) -> LayerManagerReadGuard { + let guard = ManuallyDrop::new(self.locked.read().await); + LayerManagerReadGuard { + guard, + acquired_at: std::time::Instant::now(), + holder, + } + } + + pub(crate) fn try_read( + &self, + holder: LayerManagerLockHolder, + ) -> Result { + let guard = ManuallyDrop::new(self.locked.try_read()?); + + Ok(LayerManagerReadGuard { + guard, + acquired_at: std::time::Instant::now(), + holder, + }) + } + + pub(crate) async fn write(&self, holder: LayerManagerLockHolder) -> LayerManagerWriteGuard { + let guard = ManuallyDrop::new(self.locked.write().await); + LayerManagerWriteGuard { + guard, + acquired_at: std::time::Instant::now(), + holder, + } + } + + pub(crate) fn try_write( + &self, + holder: LayerManagerLockHolder, + ) -> Result { + let guard = ManuallyDrop::new(self.locked.try_write()?); + + Ok(LayerManagerWriteGuard { + guard, + acquired_at: std::time::Instant::now(), + holder, + }) + } +} + /// Provides semantic APIs to manipulate the layer map. 
pub(crate) enum LayerManager { /// Open as in not shutdown layer manager; we still have in-memory layers and we can manipulate From 54fdcfdfa8401989fa1d62e500df5b78aee30269 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 11 Jun 2025 21:09:05 +0000 Subject: [PATCH 030/124] build(deps): bump requests from 2.32.3 to 2.32.4 in the pip group across 1 directory (#12180) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 10 +++++----- pyproject.toml | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/poetry.lock b/poetry.lock index 21a2664555..f9b6f83366 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3051,19 +3051,19 @@ files = [ [[package]] name = "requests" -version = "2.32.3" +version = "2.32.4" description = "Python HTTP for Humans." optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, - {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, + {file = "requests-2.32.4-py3-none-any.whl", hash = "sha256:27babd3cda2a6d50b30443204ee89830707d396671944c998b5975b031ac2b2c"}, + {file = "requests-2.32.4.tar.gz", hash = "sha256:27d0316682c8a29834d3264820024b62a36942083d52caf2f14c0591336d3422"}, ] [package.dependencies] certifi = ">=2017.4.17" -charset-normalizer = ">=2,<4" +charset_normalizer = ">=2,<4" idna = ">=2.5,<4" urllib3 = ">=1.21.1,<3" @@ -3846,4 +3846,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.1" python-versions = "^3.11" -content-hash = "7ab1e7b975af34b3271b7c6018fa22a261d3f73c7c0a0403b6b2bb86b5fbd36e" +content-hash = "bd93313f110110aa53b24a3ed47ba2d7f60e2c658a79cdff7320fed1bb1b57b5" diff --git a/pyproject.toml b/pyproject.toml index c6dfdc223c..e7e314d144 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ pytest = "^7.4.4" psycopg2-binary = "^2.9.10" typing-extensions = "^4.12.2" PyJWT = {version = "^2.1.0", extras = ["crypto"]} -requests = "^2.32.3" +requests = "^2.32.4" pytest-xdist = "^3.3.1" asyncpg = "^0.30.0" aiopg = "^1.4.0" From 3d5e2bf685dc36e2c5b64b19f3e4d139a5bb10ea Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Thu, 12 Jun 2025 12:09:57 +0400 Subject: [PATCH 031/124] storcon: add tenant_timeline_locate handler (#12203) ## Problem Compatibility tests may be run against a compatibility snapshot generated with `--timelines-onto-safekeepers=false`. We need to start the compute without a generation (or with 0 generation) if the timeline is not storcon-managed, otherwise the compute will hang. This handler is needed to check if the timeline is storcon-managed. It's also needed for better test coverage of safekeeper migration code. 
- Relates to https://github.com/neondatabase/neon/pull/11712 ## Summary of changes - Implement `tenant_timeline_locate` handler in storcon to get safekeeper info from storcon's DB --- storage_controller/src/http.rs | 35 ++++++++++++++++ .../src/service/safekeeper_service.rs | 41 ++++++++++++++++++- test_runner/fixtures/neon_fixtures.py | 11 +++++ 3 files changed, 86 insertions(+), 1 deletion(-) diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 705b81077e..7051a3326d 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -1398,6 +1398,31 @@ async fn handle_timeline_import(req: Request) -> Result, Ap ) } +async fn handle_tenant_timeline_locate( + service: Arc, + req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + + check_permissions(&req, Scope::Admin)?; + maybe_rate_limit(&req, tenant_id).await; + + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; + + json_response( + StatusCode::OK, + service + .tenant_timeline_locate(tenant_id, timeline_id) + .await?, + ) +} + async fn handle_tenants_dump(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; @@ -2139,6 +2164,16 @@ pub fn make_router( ) }, ) + .get( + "/debug/v1/tenant/:tenant_id/timeline/:timeline_id/locate", + |r| { + tenant_service_handler( + r, + handle_tenant_timeline_locate, + RequestName("v1_tenant_timeline_locate"), + ) + }, + ) .get("/debug/v1/scheduler", |r| { named_request_span(r, handle_scheduler_dump, RequestName("debug_v1_scheduler")) }) diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs index 1f673fe445..61b9ec6b6d 100644 --- a/storage_controller/src/service/safekeeper_service.rs +++ b/storage_controller/src/service/safekeeper_service.rs @@ -17,7 +17,7 @@ use pageserver_api::controller_api::{ SafekeeperDescribeResponse, SkSchedulingPolicy, TimelineImportRequest, }; use pageserver_api::models::{SafekeeperInfo, SafekeepersInfo, TimelineInfo}; -use safekeeper_api::membership::{MemberSet, SafekeeperId}; +use safekeeper_api::membership::{MemberSet, SafekeeperGeneration, SafekeeperId}; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use utils::id::{NodeId, TenantId, TimelineId}; @@ -26,6 +26,13 @@ use utils::lsn::Lsn; use super::Service; +#[derive(serde::Serialize, serde::Deserialize, Clone)] +pub struct TimelineLocateResponse { + pub generation: SafekeeperGeneration, + pub sk_set: Vec, + pub new_sk_set: Option>, +} + impl Service { /// Timeline creation on safekeepers /// @@ -396,6 +403,38 @@ impl Service { Ok(()) } + /// Locate safekeepers for a timeline. + /// Return the generation, sk_set and new_sk_set if present. + /// If the timeline is not storcon-managed, return NotFound. 
+ pub(crate) async fn tenant_timeline_locate( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result { + let timeline = self + .persistence + .get_timeline(tenant_id, timeline_id) + .await?; + + let Some(timeline) = timeline else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Timeline {}/{} not found", tenant_id, timeline_id).into(), + )); + }; + + Ok(TimelineLocateResponse { + generation: SafekeeperGeneration::new(timeline.generation as u32), + sk_set: timeline + .sk_set + .iter() + .map(|id| NodeId(*id as u64)) + .collect(), + new_sk_set: timeline + .new_sk_set + .map(|sk_set| sk_set.iter().map(|id| NodeId(*id as u64)).collect()), + }) + } + /// Perform timeline deletion on safekeepers. Will return success: we persist the deletion into the reconciler. pub(super) async fn tenant_timeline_delete_safekeepers( self: &Arc, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 84caf9e2af..df34573b12 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2223,6 +2223,17 @@ class NeonStorageController(MetricsGetter, LogUtils): shards: list[dict[str, Any]] = body["shards"] return shards + def timeline_locate(self, tenant_id: TenantId, timeline_id: TimelineId): + """ + :return: dict {"generation": int, "sk_set": [int], "new_sk_set": [int]} + """ + response = self.request( + "GET", + f"{self.api}/debug/v1/tenant/{tenant_id}/timeline/{timeline_id}/locate", + headers=self.headers(TokenScope.ADMIN), + ) + return response.json() + def tenant_describe(self, tenant_id: TenantId): """ :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int, preferred_az_id: str} From 60dfdf39c77706e4038ecec252ac9e1cd134269a Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Thu, 12 Jun 2025 12:10:22 +0400 Subject: [PATCH 032/124] tests: prepare test_tenant_delete_stale_shards for --timelines-onto-safekeepers (#12198) ## Problem The test creates an endpoint and deletes its tenant. The compute cannot stop gracefully because it tries to write a checkpoint shutdown record into the WAL, but the timeline had been already deleted from safekeepers. - Relates to https://github.com/neondatabase/neon/pull/11712 ## Summary of changes Stop the compute before deleting a tenant --- test_runner/regress/test_tenant_delete.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 8379908631..a0ff9a3ae2 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -430,6 +430,7 @@ def test_tenant_delete_stale_shards(neon_env_builder: NeonEnvBuilder, pg_bin: Pg workload.init() workload.write_rows(256) workload.validate() + workload.stop() assert_prefix_not_empty( neon_env_builder.pageserver_remote_storage, From ad88ec9257b84d6633be86c5a970709146331ad4 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 12 Jun 2025 16:39:54 +0800 Subject: [PATCH 033/124] fix(pageserver): extend layer manager read guard threshold (#12211) ## Problem Follow up of https://github.com/neondatabase/neon/pull/12194 to make the benchmarks run without warnings. ## Summary of changes Extend read guard hold timeout to 30s. 
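For reference, a minimal standalone sketch of the warn-on-long-hold guard pattern that the layer manager lock wrapper is built around, with the simplifications called out: it uses `std::sync::Mutex` instead of `tokio::sync::RwLock`, a plain string instead of `LayerManagerLockHolder`, and it logs before releasing the inner guard, whereas the real wrapper uses `ManuallyDrop` to release the lock first. None of the names below are the pageserver API.

```rust
use std::sync::{Mutex, MutexGuard};
use std::time::{Duration, Instant};

// Illustrative threshold; the patch uses 5s for write guards and 30s for read guards.
const WARN_THRESHOLD: Duration = Duration::from_secs(30);

struct TimedGuard<'a, T> {
    // Held purely to keep the lock; released when this struct is dropped.
    _guard: MutexGuard<'a, T>,
    acquired_at: Instant,
    holder: &'static str,
}

impl<T> Drop for TimedGuard<'_, T> {
    fn drop(&mut self) {
        let held_for = self.acquired_at.elapsed();
        if held_for >= WARN_THRESHOLD {
            eprintln!(
                "{} held the layer lock for {:.1}s",
                self.holder,
                held_for.as_secs_f64()
            );
        }
    }
}

fn lock_timed<'a, T>(m: &'a Mutex<T>, holder: &'static str) -> TimedGuard<'a, T> {
    TimedGuard {
        _guard: m.lock().unwrap(),
        acquired_at: Instant::now(),
        holder,
    }
}

fn main() {
    let layers = Mutex::new(Vec::<u32>::new());
    let _guard = lock_timed(&layers, "compaction");
    // ... long-running work; a warning naming the holder is printed on drop
    // if the guard was held past the threshold.
}
```

With the `LayerManagerLockHolder` tag attached at every call site, the warning identifies which operation sat on the lock.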
Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline/layer_manager.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index e419b6f8ad..2eccf48579 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -26,6 +26,7 @@ use crate::tenant::storage_layer::{ /// Warn if the lock was held for longer than this threshold. /// It's very generous and we should bring this value down over time. const LAYER_MANAGER_LOCK_WARN_THRESHOLD: Duration = Duration::from_secs(5); +const LAYER_MANAGER_LOCK_READ_WARN_THRESHOLD: Duration = Duration::from_secs(30); /// Describes the operation that is holding the layer manager lock #[derive(Debug, Clone, Copy, strum_macros::Display)] @@ -76,7 +77,7 @@ impl Drop for LayerManagerReadGuard<'_> { unsafe { ManuallyDrop::drop(&mut self.guard) }; let held_for = self.acquired_at.elapsed(); - if held_for >= LAYER_MANAGER_LOCK_WARN_THRESHOLD { + if held_for >= LAYER_MANAGER_LOCK_READ_WARN_THRESHOLD { tracing::warn!( holder=%self.holder, "Layer manager read lock held for {}s", From 1dce65308dd36872e573561f6d959103b790ec03 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Thu, 12 Jun 2025 16:12:47 +0000 Subject: [PATCH 034/124] Update base64 to 0.22 (#12215) ## Problem Base64 0.13 is outdated. ## Summary of changes Update base64 to 0.22. Affects mostly proxy and proxy libs. Also upgrade serde_with to remove another dep on base64 0.13 from dep tree. --- Cargo.lock | 27 ++++++++----------- Cargo.toml | 4 +-- control_plane/src/endpoint.rs | 6 +++-- libs/proxy/postgres-protocol2/Cargo.toml | 2 +- .../src/authentication/sasl.rs | 15 ++++++++--- .../postgres-protocol2/src/password/mod.rs | 8 +++--- proxy/src/auth/backend/jwt.rs | 20 +++++++------- proxy/src/sasl/channel_binding.rs | 11 +++++--- proxy/src/scram/exchange.rs | 4 ++- proxy/src/scram/messages.rs | 13 +++++---- proxy/src/scram/mod.rs | 4 ++- proxy/src/scram/secret.rs | 8 +++--- proxy/src/serverless/local_conn_pool.rs | 6 +++-- proxy/src/tls/mod.rs | 4 ++- workspace_hack/Cargo.toml | 3 +-- 15 files changed, 76 insertions(+), 59 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d8bf04e87f..54a4f946d3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -903,12 +903,6 @@ version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" -[[package]] -name = "base64" -version = "0.20.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ea22880d78093b0cbe17c89f64a7d457941e65759157ec6cb31a31d652b05e5" - [[package]] name = "base64" version = "0.21.7" @@ -1300,7 +1294,7 @@ dependencies = [ "aws-smithy-types", "axum", "axum-extra", - "base64 0.13.1", + "base64 0.22.1", "bytes", "camino", "cfg-if", @@ -1426,7 +1420,7 @@ name = "control_plane" version = "0.1.0" dependencies = [ "anyhow", - "base64 0.13.1", + "base64 0.22.1", "camino", "clap", "comfy-table", @@ -4818,7 +4812,7 @@ dependencies = [ name = "postgres-protocol2" version = "0.1.0" dependencies = [ - "base64 0.20.0", + "base64 0.22.1", "byteorder", "bytes", "fallible-iterator", @@ -5190,7 +5184,7 @@ dependencies = [ "aws-config", "aws-sdk-iam", "aws-sigv4", - "base64 0.13.1", + "base64 0.22.1", "bstr", "bytes", "camino", @@ -6494,15 +6488,17 @@ dependencies = [ [[package]] name = "serde_with" -version = "2.3.3" +version = "3.12.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "07ff71d2c147a7b57362cead5e22f772cd52f6ab31cfcd9edcd7f6aeb2a0afbe" +checksum = "d6b6f7f2fcb69f747921f79f3926bd1e203fce4fef62c268dd3abfb6d86029aa" dependencies = [ - "base64 0.13.1", + "base64 0.22.1", "chrono", "hex", "indexmap 1.9.3", + "indexmap 2.9.0", "serde", + "serde_derive", "serde_json", "serde_with_macros", "time", @@ -6510,9 +6506,9 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "2.3.3" +version = "3.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "881b6f881b17d13214e5d494c939ebab463d01264ce1811e9d4ac3a882e7695f" +checksum = "8d00caa5193a3c8362ac2b73be6b9e768aa5a4b2f721d8f4b339600c3cb51f8e" dependencies = [ "darling", "proc-macro2", @@ -8583,7 +8579,6 @@ dependencies = [ "anyhow", "axum", "axum-core", - "base64 0.13.1", "base64 0.21.7", "base64ct", "bytes", diff --git a/Cargo.toml b/Cargo.toml index 666ead7352..2f4fcbc249 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -72,7 +72,7 @@ aws-sigv4 = { version = "1.2", features = ["sign-http"] } aws-types = "1.3" axum = { version = "0.8.1", features = ["ws"] } axum-extra = { version = "0.10.0", features = ["typed-header", "query"] } -base64 = "0.13.0" +base64 = "0.22" bincode = "1.3" bindgen = "0.71" bit_field = "0.10.2" @@ -171,7 +171,7 @@ sentry = { version = "0.37", default-features = false, features = ["backtrace", serde = { version = "1.0", features = ["derive"] } serde_json = "1" serde_path_to_error = "0.1" -serde_with = { version = "2.0", features = [ "base64" ] } +serde_with = { version = "3", features = [ "base64" ] } serde_assert = "0.5.0" sha2 = "0.10.2" signal-hook = "0.3" diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 774a0053f8..bc3a3a4e38 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -45,6 +45,8 @@ use std::sync::Arc; use std::time::{Duration, Instant}; use anyhow::{Context, Result, anyhow, bail}; +use base64::Engine; +use base64::prelude::BASE64_URL_SAFE_NO_PAD; use compute_api::requests::{ COMPUTE_AUDIENCE, ComputeClaims, ComputeClaimsScope, ConfigurationRequest, }; @@ -164,7 +166,7 @@ impl ComputeControlPlane { public_key_use: Some(PublicKeyUse::Signature), key_operations: Some(vec![KeyOperations::Verify]), key_algorithm: Some(KeyAlgorithm::EdDSA), - key_id: Some(base64::encode_config(key_hash, base64::URL_SAFE_NO_PAD)), + key_id: Some(BASE64_URL_SAFE_NO_PAD.encode(key_hash)), x509_url: None::, x509_chain: None::>, x509_sha1_fingerprint: None::, @@ -173,7 +175,7 @@ impl ComputeControlPlane { algorithm: AlgorithmParameters::OctetKeyPair(OctetKeyPairParameters { key_type: OctetKeyPairType::OctetKeyPair, curve: EllipticCurve::Ed25519, - x: base64::encode_config(public_key, base64::URL_SAFE_NO_PAD), + x: BASE64_URL_SAFE_NO_PAD.encode(public_key), }), }], }) diff --git a/libs/proxy/postgres-protocol2/Cargo.toml b/libs/proxy/postgres-protocol2/Cargo.toml index 7ebb05eec1..9c8f8f3531 100644 --- a/libs/proxy/postgres-protocol2/Cargo.toml +++ b/libs/proxy/postgres-protocol2/Cargo.toml @@ -5,7 +5,7 @@ edition = "2024" license = "MIT/Apache-2.0" [dependencies] -base64 = "0.20" +base64.workspace = true byteorder.workspace = true bytes.workspace = true fallible-iterator.workspace = true diff --git a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs index 2daf9a80d4..a7bf3da20a 100644 --- a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs +++ 
b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs @@ -3,6 +3,8 @@ use std::fmt::Write; use std::{io, iter, mem, str}; +use base64::Engine as _; +use base64::prelude::BASE64_STANDARD; use hmac::{Hmac, Mac}; use rand::{self, Rng}; use sha2::digest::FixedOutput; @@ -226,7 +228,7 @@ impl ScramSha256 { let (client_key, server_key) = match password { Credentials::Password(password) => { - let salt = match base64::decode(parsed.salt) { + let salt = match BASE64_STANDARD.decode(parsed.salt) { Ok(salt) => salt, Err(e) => return Err(io::Error::new(io::ErrorKind::InvalidInput, e)), }; @@ -255,7 +257,7 @@ impl ScramSha256 { let mut cbind_input = vec![]; cbind_input.extend(channel_binding.gs2_header().as_bytes()); cbind_input.extend(channel_binding.cbind_data()); - let cbind_input = base64::encode(&cbind_input); + let cbind_input = BASE64_STANDARD.encode(&cbind_input); self.message.clear(); write!(&mut self.message, "c={},r={}", cbind_input, parsed.nonce).unwrap(); @@ -272,7 +274,12 @@ impl ScramSha256 { *proof ^= signature; } - write!(&mut self.message, ",p={}", base64::encode(client_proof)).unwrap(); + write!( + &mut self.message, + ",p={}", + BASE64_STANDARD.encode(client_proof) + ) + .unwrap(); self.state = State::Finish { server_key, @@ -306,7 +313,7 @@ impl ScramSha256 { ServerFinalMessage::Verifier(verifier) => verifier, }; - let verifier = match base64::decode(verifier) { + let verifier = match BASE64_STANDARD.decode(verifier) { Ok(verifier) => verifier, Err(e) => return Err(io::Error::new(io::ErrorKind::InvalidInput, e)), }; diff --git a/libs/proxy/postgres-protocol2/src/password/mod.rs b/libs/proxy/postgres-protocol2/src/password/mod.rs index 4cd9bfb060..e00ca1e34c 100644 --- a/libs/proxy/postgres-protocol2/src/password/mod.rs +++ b/libs/proxy/postgres-protocol2/src/password/mod.rs @@ -6,6 +6,8 @@ //! side. This is good because it ensures the cleartext password won't //! end up in logs pg_stat displays, etc. 
+use base64::Engine as _; +use base64::prelude::BASE64_STANDARD; use hmac::{Hmac, Mac}; use rand::RngCore; use sha2::digest::FixedOutput; @@ -83,8 +85,8 @@ pub(crate) async fn scram_sha_256_salt( format!( "SCRAM-SHA-256${}:{}${}:{}", SCRAM_DEFAULT_ITERATIONS, - base64::encode(salt), - base64::encode(stored_key), - base64::encode(server_key) + BASE64_STANDARD.encode(salt), + BASE64_STANDARD.encode(stored_key), + BASE64_STANDARD.encode(server_key) ) } diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index a48f67199a..5edc878243 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -4,6 +4,8 @@ use std::sync::Arc; use std::time::{Duration, SystemTime}; use arc_swap::ArcSwapOption; +use base64::Engine as _; +use base64::prelude::BASE64_URL_SAFE_NO_PAD; use clashmap::ClashMap; use jose_jwk::crypto::KeyInfo; use reqwest::{Client, redirect}; @@ -347,17 +349,17 @@ impl JwkCacheEntryLock { .split_once('.') .ok_or(JwtEncodingError::InvalidCompactForm)?; - let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD)?; + let header = BASE64_URL_SAFE_NO_PAD.decode(header)?; let header = serde_json::from_slice::>(&header)?; - let payloadb = base64::decode_config(payload, base64::URL_SAFE_NO_PAD)?; + let payloadb = BASE64_URL_SAFE_NO_PAD.decode(payload)?; let payload = serde_json::from_slice::>(&payloadb)?; if let Some(iss) = &payload.issuer { ctx.set_jwt_issuer(iss.as_ref().to_owned()); } - let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD)?; + let sig = BASE64_URL_SAFE_NO_PAD.decode(signature)?; let kid = header.key_id.ok_or(JwtError::MissingKeyId)?; @@ -796,7 +798,6 @@ mod tests { use std::net::SocketAddr; use std::time::SystemTime; - use base64::URL_SAFE_NO_PAD; use bytes::Bytes; use http::Response; use http_body_util::Full; @@ -871,9 +872,8 @@ mod tests { key_id: Some(Cow::Owned(kid)), }; - let header = - base64::encode_config(serde_json::to_string(&header).unwrap(), URL_SAFE_NO_PAD); - let body = base64::encode_config(serde_json::to_string(&body).unwrap(), URL_SAFE_NO_PAD); + let header = BASE64_URL_SAFE_NO_PAD.encode(serde_json::to_string(&header).unwrap()); + let body = BASE64_URL_SAFE_NO_PAD.encode(serde_json::to_string(&body).unwrap()); format!("{header}.{body}") } @@ -883,7 +883,7 @@ mod tests { let payload = build_jwt_payload(kid, jose_jwa::Signing::Es256); let sig: Signature = SigningKey::from(key).sign(payload.as_bytes()); - let sig = base64::encode_config(sig.to_bytes(), URL_SAFE_NO_PAD); + let sig = BASE64_URL_SAFE_NO_PAD.encode(sig.to_bytes()); format!("{payload}.{sig}") } @@ -893,7 +893,7 @@ mod tests { let payload = build_custom_jwt_payload(kid, body, jose_jwa::Signing::Es256); let sig: Signature = SigningKey::from(key).sign(payload.as_bytes()); - let sig = base64::encode_config(sig.to_bytes(), URL_SAFE_NO_PAD); + let sig = BASE64_URL_SAFE_NO_PAD.encode(sig.to_bytes()); format!("{payload}.{sig}") } @@ -904,7 +904,7 @@ mod tests { let payload = build_jwt_payload(kid, jose_jwa::Signing::Rs256); let sig = SigningKey::::new(key).sign(payload.as_bytes()); - let sig = base64::encode_config(sig.to_bytes(), URL_SAFE_NO_PAD); + let sig = BASE64_URL_SAFE_NO_PAD.encode(sig.to_bytes()); format!("{payload}.{sig}") } diff --git a/proxy/src/sasl/channel_binding.rs b/proxy/src/sasl/channel_binding.rs index fdd011448e..e548cf3a83 100644 --- a/proxy/src/sasl/channel_binding.rs +++ b/proxy/src/sasl/channel_binding.rs @@ -1,5 +1,8 @@ //! Definition and parser for channel binding flag (a part of the `GS2` header). 
+use base64::Engine as _; +use base64::prelude::BASE64_STANDARD; + /// Channel binding flag (possibly with params). #[derive(Debug, PartialEq, Eq)] pub(crate) enum ChannelBinding { @@ -55,7 +58,7 @@ impl ChannelBinding { let mut cbind_input = vec![]; write!(&mut cbind_input, "p={mode},,",).unwrap(); cbind_input.extend_from_slice(get_cbind_data(mode)?); - base64::encode(&cbind_input).into() + BASE64_STANDARD.encode(&cbind_input).into() } }) } @@ -70,9 +73,9 @@ mod tests { use ChannelBinding::*; let cases = [ - (NotSupportedClient, base64::encode("n,,")), - (NotSupportedServer, base64::encode("y,,")), - (Required("foo"), base64::encode("p=foo,,bar")), + (NotSupportedClient, BASE64_STANDARD.encode("n,,")), + (NotSupportedServer, BASE64_STANDARD.encode("y,,")), + (Required("foo"), BASE64_STANDARD.encode("p=foo,,bar")), ]; for (cb, input) in cases { diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index abd5aeae5b..3ba8a79368 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -2,6 +2,8 @@ use std::convert::Infallible; +use base64::Engine as _; +use base64::prelude::BASE64_STANDARD; use hmac::{Hmac, Mac}; use sha2::Sha256; @@ -105,7 +107,7 @@ pub(crate) async fn exchange( secret: &ServerSecret, password: &[u8], ) -> sasl::Result> { - let salt = base64::decode(&secret.salt_base64)?; + let salt = BASE64_STANDARD.decode(&secret.salt_base64)?; let client_key = derive_client_key(pool, endpoint, password, &salt, secret.iterations).await; if secret.is_password_invalid(&client_key).into() { diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs index e071417dab..42039f099c 100644 --- a/proxy/src/scram/messages.rs +++ b/proxy/src/scram/messages.rs @@ -3,6 +3,9 @@ use std::fmt; use std::ops::Range; +use base64::Engine as _; +use base64::prelude::BASE64_STANDARD; + use super::base64_decode_array; use super::key::{SCRAM_KEY_LEN, ScramKey}; use super::signature::SignatureBuilder; @@ -88,7 +91,7 @@ impl<'a> ClientFirstMessage<'a> { let mut message = String::new(); write!(&mut message, "r={}", self.nonce).unwrap(); - base64::encode_config_buf(nonce, base64::STANDARD, &mut message); + BASE64_STANDARD.encode_string(nonce, &mut message); let combined_nonce = 2..message.len(); write!(&mut message, ",s={salt_base64},i={iterations}").unwrap(); @@ -142,11 +145,7 @@ impl<'a> ClientFinalMessage<'a> { server_key: &ScramKey, ) -> String { let mut buf = String::from("v="); - base64::encode_config_buf( - signature_builder.build(server_key), - base64::STANDARD, - &mut buf, - ); + BASE64_STANDARD.encode_string(signature_builder.build(server_key), &mut buf); buf } @@ -251,7 +250,7 @@ mod tests { "iiYEfS3rOgn8S3rtpSdrOsHtPLWvIkdgmHxA0hf3JNOAG4dU" ); assert_eq!( - base64::encode(msg.proof), + BASE64_STANDARD.encode(msg.proof), "SRpfsIVS4Gk11w1LqQ4QvCUBZYQmqXNSDEcHqbQ3CHI=" ); } diff --git a/proxy/src/scram/mod.rs b/proxy/src/scram/mod.rs index 4f764c6087..5f627e062c 100644 --- a/proxy/src/scram/mod.rs +++ b/proxy/src/scram/mod.rs @@ -15,6 +15,8 @@ mod secret; mod signature; pub mod threadpool; +use base64::Engine as _; +use base64::prelude::BASE64_STANDARD; pub(crate) use exchange::{Exchange, exchange}; use hmac::{Hmac, Mac}; pub(crate) use key::ScramKey; @@ -32,7 +34,7 @@ pub(crate) const METHODS_WITHOUT_PLUS: &[&str] = &[SCRAM_SHA_256]; fn base64_decode_array(input: impl AsRef<[u8]>) -> Option<[u8; N]> { let mut bytes = [0u8; N]; - let size = base64::decode_config_slice(input, base64::STANDARD, &mut bytes).ok()?; + let size = BASE64_STANDARD.decode_slice(input, 
&mut bytes).ok()?; if size != N { return None; } diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs index 8c6a08d432..f03617f34d 100644 --- a/proxy/src/scram/secret.rs +++ b/proxy/src/scram/secret.rs @@ -1,5 +1,7 @@ //! Tools for SCRAM server secret management. +use base64::Engine as _; +use base64::prelude::BASE64_STANDARD; use subtle::{Choice, ConstantTimeEq}; use super::base64_decode_array; @@ -56,7 +58,7 @@ impl ServerSecret { // iteration count 1 for our generated passwords going forward. // PG16 users can set iteration count=1 already today. iterations: 1, - salt_base64: base64::encode(nonce), + salt_base64: BASE64_STANDARD.encode(nonce), stored_key: ScramKey::default(), server_key: ScramKey::default(), doomed: true, @@ -88,7 +90,7 @@ mod tests { assert_eq!(parsed.iterations, iterations); assert_eq!(parsed.salt_base64, salt); - assert_eq!(base64::encode(parsed.stored_key), stored_key); - assert_eq!(base64::encode(parsed.server_key), server_key); + assert_eq!(BASE64_STANDARD.encode(parsed.stored_key), stored_key); + assert_eq!(BASE64_STANDARD.encode(parsed.server_key), server_key); } } diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index bb5637cd5f..c367615fb8 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -16,6 +16,8 @@ use std::sync::atomic::AtomicUsize; use std::task::{Poll, ready}; use std::time::Duration; +use base64::Engine as _; +use base64::prelude::BASE64_URL_SAFE_NO_PAD; use ed25519_dalek::{Signature, Signer, SigningKey}; use futures::Future; use futures::future::poll_fn; @@ -346,7 +348,7 @@ fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String { jwt.push_str("eyJhbGciOiJFZERTQSJ9."); // encode the jwt payload in-place - base64::encode_config_buf(payload, base64::URL_SAFE_NO_PAD, &mut jwt); + BASE64_URL_SAFE_NO_PAD.encode_string(payload, &mut jwt); // create the signature from the encoded header || payload let sig: Signature = sk.sign(jwt.as_bytes()); @@ -354,7 +356,7 @@ fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String { jwt.push('.'); // encode the jwt signature in-place - base64::encode_config_buf(sig.to_bytes(), base64::URL_SAFE_NO_PAD, &mut jwt); + BASE64_URL_SAFE_NO_PAD.encode_string(sig.to_bytes(), &mut jwt); debug_assert_eq!( jwt.len(), diff --git a/proxy/src/tls/mod.rs b/proxy/src/tls/mod.rs index 7fe71abf48..f576214255 100644 --- a/proxy/src/tls/mod.rs +++ b/proxy/src/tls/mod.rs @@ -3,6 +3,8 @@ pub mod postgres_rustls; pub mod server_config; use anyhow::Context; +use base64::Engine as _; +use base64::prelude::BASE64_STANDARD; use rustls::pki_types::CertificateDer; use sha2::{Digest, Sha256}; use tracing::{error, info}; @@ -58,7 +60,7 @@ impl TlsServerEndPoint { let oid = certificate.signature_algorithm.oid; if SHA256_OIDS.contains(&oid) { let tls_server_end_point: [u8; 32] = Sha256::new().chain_update(cert).finalize().into(); - info!(%subject, tls_server_end_point = %base64::encode(tls_server_end_point), "determined channel binding"); + info!(%subject, tls_server_end_point = %BASE64_STANDARD.encode(tls_server_end_point), "determined channel binding"); Ok(Self::Sha256(tls_server_end_point)) } else { error!(%subject, "unknown channel binding"); diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 2b07889871..b74df50f86 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -20,8 +20,7 @@ anstream = { version = "0.6" } anyhow = { version = "1", features = ["backtrace"] } axum = { version = "0.8", features = 
["ws"] } axum-core = { version = "0.5", default-features = false, features = ["tracing"] } -base64-594e8ee84c453af0 = { package = "base64", version = "0.13", features = ["alloc"] } -base64-647d43efb71741da = { package = "base64", version = "0.21" } +base64 = { version = "0.21" } base64ct = { version = "1", default-features = false, features = ["std"] } bytes = { version = "1", features = ["serde"] } camino = { version = "1", default-features = false, features = ["serde1"] } From db24ba95d1991a90c50d25f322eb07435cf31936 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 12 Jun 2025 18:15:02 +0100 Subject: [PATCH 035/124] pagserver: always persist shard identity (#12217) ## Problem The location config (which includes the stripe size) is stored on pageserver disk. For unsharded tenants we [do not include the shard identity in the serialized description](https://github.com/neondatabase/neon/blob/ad88ec9257b84d6633be86c5a970709146331ad4/pageserver/src/tenant/config.rs#L64-L66). When the pageserver restarts, it reads that configuration and will use the stripe size from there and rely on storcon input from reattach for generation and mode. The default deserialization is ShardIdentity::unsharded. This has the new default stripe size of 2048. Hence, for unsharded tenants we can be running with a stripe size different from that the one in the storcon observed state. This is not a problem until we shard split without specifying a stripe size (i.e. manual splits via the UI or storcon_cli). When that happens the new shards will use the 2048 stripe size until storcon realises and switches them back. At that point it's too late, since we've ingested data with the wrong stripe sizes. ## Summary of changes Ideally, we would always have the full shard identity on disk. To achieve this over two releases we do: 1. Always persist the shard identity in the location config on the PS. 2. Storage controller includes the stripe size to use in the re attach response. After the first release, we will start persisting correct stripe sizes for any tenant shard that the storage controller explicitly sends a location_conf. After the second release, the re-attach change kicks in and we'll persist the shard identity for all shards. --- libs/pageserver_api/src/upcall_api.rs | 6 +++- pageserver/src/tenant/config.rs | 11 ++++++-- pageserver/src/tenant/mgr.rs | 40 ++++++++++++++++----------- storage_controller/src/service.rs | 2 ++ 4 files changed, 40 insertions(+), 19 deletions(-) diff --git a/libs/pageserver_api/src/upcall_api.rs b/libs/pageserver_api/src/upcall_api.rs index 4dce5f7817..e2de02eea0 100644 --- a/libs/pageserver_api/src/upcall_api.rs +++ b/libs/pageserver_api/src/upcall_api.rs @@ -9,7 +9,7 @@ use utils::id::{NodeId, TimelineId}; use crate::controller_api::NodeRegisterRequest; use crate::models::{LocationConfigMode, ShardImportStatus}; -use crate::shard::TenantShardId; +use crate::shard::{ShardStripeSize, TenantShardId}; /// Upcall message sent by the pageserver to the configured `control_plane_api` on /// startup. 
@@ -36,6 +36,10 @@ pub struct ReAttachResponseTenant { /// Default value only for backward compat: this field should be set #[serde(default = "default_mode")] pub mode: LocationConfigMode, + + // Default value only for backward compat: this field should be set + #[serde(default = "ShardStripeSize::default")] + pub stripe_size: ShardStripeSize, } #[derive(Serialize, Deserialize)] pub struct ReAttachResponse { diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index bf82fc8df8..030b43a020 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -61,8 +61,8 @@ pub(crate) struct LocationConf { /// The detailed shard identity. This structure is already scoped within /// a TenantShardId, but we need the full ShardIdentity to enable calculating /// key->shard mappings. + // TODO(vlad): Remove this default once all configs have a shard identity on disk. #[serde(default = "ShardIdentity::unsharded")] - #[serde(skip_serializing_if = "ShardIdentity::is_unsharded")] pub(crate) shard: ShardIdentity, /// The pan-cluster tenant configuration, the same on all locations @@ -149,7 +149,12 @@ impl LocationConf { /// For use when attaching/re-attaching: update the generation stored in this /// structure. If we were in a secondary state, promote to attached (posession /// of a fresh generation implies this). - pub(crate) fn attach_in_generation(&mut self, mode: AttachmentMode, generation: Generation) { + pub(crate) fn attach_in_generation( + &mut self, + mode: AttachmentMode, + generation: Generation, + stripe_size: ShardStripeSize, + ) { match &mut self.mode { LocationMode::Attached(attach_conf) => { attach_conf.generation = generation; @@ -163,6 +168,8 @@ impl LocationConf { }) } } + + self.shard.stripe_size = stripe_size; } pub(crate) fn try_from(conf: &'_ models::LocationConfig) -> anyhow::Result { diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 766f846827..76937dd959 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -129,7 +129,7 @@ pub(crate) enum ShardSelector { /// /// This represents the subset of a LocationConfig that we receive during re-attach. 
pub(crate) enum TenantStartupMode { - Attached((AttachmentMode, Generation)), + Attached((AttachmentMode, Generation, ShardStripeSize)), Secondary, } @@ -143,15 +143,21 @@ impl TenantStartupMode { match (rart.mode, rart.r#gen) { (LocationConfigMode::Detached, _) => None, (LocationConfigMode::Secondary, _) => Some(Self::Secondary), - (LocationConfigMode::AttachedMulti, Some(g)) => { - Some(Self::Attached((AttachmentMode::Multi, Generation::new(g)))) - } - (LocationConfigMode::AttachedSingle, Some(g)) => { - Some(Self::Attached((AttachmentMode::Single, Generation::new(g)))) - } - (LocationConfigMode::AttachedStale, Some(g)) => { - Some(Self::Attached((AttachmentMode::Stale, Generation::new(g)))) - } + (LocationConfigMode::AttachedMulti, Some(g)) => Some(Self::Attached(( + AttachmentMode::Multi, + Generation::new(g), + rart.stripe_size, + ))), + (LocationConfigMode::AttachedSingle, Some(g)) => Some(Self::Attached(( + AttachmentMode::Single, + Generation::new(g), + rart.stripe_size, + ))), + (LocationConfigMode::AttachedStale, Some(g)) => Some(Self::Attached(( + AttachmentMode::Stale, + Generation::new(g), + rart.stripe_size, + ))), _ => { tracing::warn!( "Received invalid re-attach state for tenant {}: {rart:?}", @@ -319,9 +325,11 @@ fn emergency_generations( Some(( *tid, match &lc.mode { - LocationMode::Attached(alc) => { - TenantStartupMode::Attached((alc.attach_mode, alc.generation)) - } + LocationMode::Attached(alc) => TenantStartupMode::Attached(( + alc.attach_mode, + alc.generation, + ShardStripeSize::default(), + )), LocationMode::Secondary(_) => TenantStartupMode::Secondary, }, )) @@ -365,7 +373,7 @@ async fn init_load_generations( .iter() .flat_map(|(id, start_mode)| { match start_mode { - TenantStartupMode::Attached((_mode, generation)) => Some(generation), + TenantStartupMode::Attached((_mode, generation, _stripe_size)) => Some(generation), TenantStartupMode::Secondary => None, } .map(|gen_| (*id, *gen_)) @@ -585,7 +593,7 @@ pub async fn init_tenant_mgr( location_conf.mode = LocationMode::Secondary(DEFAULT_SECONDARY_CONF); } } - Some(TenantStartupMode::Attached((attach_mode, generation))) => { + Some(TenantStartupMode::Attached((attach_mode, generation, stripe_size))) => { let old_gen_higher = match &location_conf.mode { LocationMode::Attached(AttachedLocationConfig { generation: old_generation, @@ -609,7 +617,7 @@ pub async fn init_tenant_mgr( // local disk content: demote to secondary rather than detaching. location_conf.mode = LocationMode::Secondary(DEFAULT_SECONDARY_CONF); } else { - location_conf.attach_in_generation(*attach_mode, *generation); + location_conf.attach_in_generation(*attach_mode, *generation, *stripe_size); } } } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 06318a01b5..5e59d776ab 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -2267,6 +2267,7 @@ impl Service { // fail, and start from scratch, so it doesn't make sense for us to try and preserve // the stale/multi states at this point. 
mode: LocationConfigMode::AttachedSingle, + stripe_size: shard.shard.stripe_size, }); shard.generation = std::cmp::max(shard.generation, Some(new_gen)); @@ -2300,6 +2301,7 @@ impl Service { id: *tenant_shard_id, r#gen: None, mode: LocationConfigMode::Secondary, + stripe_size: shard.shard.stripe_size, }); // We must not update observed, because we have no guarantee that our From 43acabd4c2baf9bb8359fa3965cffedc7fd8d8b9 Mon Sep 17 00:00:00 2001 From: Ivan Efremov Date: Thu, 12 Jun 2025 22:46:02 +0300 Subject: [PATCH 036/124] [proxy]: Improve backoff strategy for redis reconnection (#12218) Sometimes during a failed redis connection attempt at the init stage proxy pod can continuously restart. This, in turn, can aggravate the problem if redis is overloaded. Solves the #11114 --- proxy/src/binary/proxy.rs | 55 +++++++++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 16 deletions(-) diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index 757c1e988b..6ab6df5610 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -11,11 +11,13 @@ use anyhow::Context; use anyhow::{bail, ensure}; use arc_swap::ArcSwapOption; use futures::future::Either; +use itertools::{Itertools, Position}; +use rand::{Rng, thread_rng}; use remote_storage::RemoteStorageConfig; use tokio::net::TcpListener; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; -use tracing::{Instrument, info, warn}; +use tracing::{Instrument, error, info, warn}; use utils::sentry_init::init_sentry; use utils::{project_build_tag, project_git_version}; @@ -314,7 +316,7 @@ pub async fn run() -> anyhow::Result<()> { let jemalloc = match crate::jemalloc::MetricRecorder::new() { Ok(t) => Some(t), Err(e) => { - tracing::error!(error = ?e, "could not start jemalloc metrics loop"); + error!(error = ?e, "could not start jemalloc metrics loop"); None } }; @@ -520,23 +522,44 @@ pub async fn run() -> anyhow::Result<()> { } } + // Try to connect to Redis 3 times with 1 + (0..0.1) second interval. + // This prevents immediate exit and pod restart, + // which can cause hammering of the redis in case of connection issues. if let Some(mut redis_kv_client) = redis_kv_client { - maintenance_tasks.spawn(async move { - redis_kv_client.try_connect().await?; - handle_cancel_messages( - &mut redis_kv_client, - rx_cancel, - args.cancellation_batch_size, - ) - .await?; + for attempt in (0..3).with_position() { + match redis_kv_client.try_connect().await { + Ok(()) => { + info!("Connected to Redis KV client"); + maintenance_tasks.spawn(async move { + handle_cancel_messages( + &mut redis_kv_client, + rx_cancel, + args.cancellation_batch_size, + ) + .await?; - drop(redis_kv_client); + drop(redis_kv_client); - // `handle_cancel_messages` was terminated due to the tx_cancel - // being dropped. this is not worthy of an error, and this task can only return `Err`, - // so let's wait forever instead. - std::future::pending().await - }); + // `handle_cancel_messages` was terminated due to the tx_cancel + // being dropped. this is not worthy of an error, and this task can only return `Err`, + // so let's wait forever instead. 
+ std::future::pending().await + }); + break; + } + Err(e) => { + error!("Failed to connect to Redis KV client: {e}"); + if matches!(attempt, Position::Last(_)) { + bail!( + "Failed to connect to Redis KV client after {} attempts", + attempt.into_inner() + ); + } + let jitter = thread_rng().gen_range(0..100); + tokio::time::sleep(Duration::from_millis(1000 + jitter)).await; + } + } + } } if let Some(regional_redis_client) = regional_redis_client { From 8917676e868a89b0f83a2dbd3428f230b5fd85fd Mon Sep 17 00:00:00 2001 From: Trung Dinh Date: Thu, 12 Jun 2025 23:07:18 -0700 Subject: [PATCH 037/124] Improve logging for gc-compaction (#12219) ## Problem * Inside `compact_with_gc_inner`, there is a similar log line: https://github.com/neondatabase/neon/blob/db24ba95d1991a90c50d25f322eb07435cf31936/pageserver/src/tenant/timeline/compaction.rs#L3181-L3187 * Also, I think it would be useful when debugging to have the ability to select a particular sub-compaction job (e.g., `1/100`) to see all the logs for that job. ## Summary of changes * Attach a span to the `compact_with_gc_inner`. CC: @skyzh --- pageserver/src/tenant/timeline/compaction.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 0ec2292ee8..5307d3836f 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -3124,12 +3124,12 @@ impl Timeline { .await?; let jobs_len = jobs.len(); for (idx, job) in jobs.into_iter().enumerate() { - info!( - "running enhanced gc bottom-most compaction, sub-compaction {}/{}", - idx + 1, - jobs_len - ); + let sub_compaction_progress = format!("{}/{}", idx + 1, jobs_len); self.compact_with_gc_inner(cancel, job, ctx, yield_for_l0) + .instrument(info_span!( + "sub_compaction", + sub_compaction_progress = sub_compaction_progress + )) .await?; } if jobs_len == 0 { From e83f1d8ba5b34bcfd9be05420ae24c26d49c934c Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Fri, 13 Jun 2025 10:28:18 +0400 Subject: [PATCH 038/124] tests: prepare test_historic_storage_formats for --timelines-onto-safekeepers (#12214) ## Problem `test_historic_storage_formats` uses `/tenant_import` to import historic data. Tenant import does not create timelines onto safekeepers, because they might already exist on some safekeeper set. If it does, then we may end up with two different quorums accepting WAL for the same timeline. If the tenant import is used in a real deployment, the administrator is responsible for looking for the proper safekeeper set and migrate timelines into storcon-managed timelines. - Relates to https://github.com/neondatabase/neon/pull/11712 ## Summary of changes - Create timelines onto safekeepers manually after tenant import in `test_historic_storage_formats` - Add a note to tenant import that timelines will be not storcon-managed after the import. 
--- storage_controller/src/service.rs | 2 ++ test_runner/regress/test_compatibility.py | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 5e59d776ab..8800d792f2 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -6651,6 +6651,8 @@ impl Service { /// This is for debug/support only: assuming tenant data is already present in S3, we "create" a /// tenant with a very high generation number so that it will see the existing data. + /// It does not create timelines on safekeepers, because they might already exist on some + /// safekeeper set. So, the timelines are not storcon-managed after the import. pub(crate) async fn tenant_import( &self, tenant_id: TenantId, diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 784afbba82..31e38144e3 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -18,6 +18,7 @@ from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, PgBin, + Safekeeper, flush_ep_to_pageserver, ) from fixtures.pageserver.http import PageserverApiException @@ -26,6 +27,7 @@ from fixtures.pageserver.utils import ( ) from fixtures.pg_version import PgVersion from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage +from fixtures.safekeeper.http import MembershipConfiguration from fixtures.workload import Workload if TYPE_CHECKING: @@ -542,6 +544,24 @@ def test_historic_storage_formats( # All our artifacts should contain at least one timeline assert len(timelines) > 0 + # Import tenant does not create the timeline on safekeepers, + # because it is a debug handler and the timeline may have already been + # created on some set of safekeepers. + # Create the timeline on safekeepers manually. + # TODO(diko): when we have the script/storcon handler to migrate + # the timeline to storcon, we can replace this code with it. + mconf = MembershipConfiguration( + generation=1, + members=Safekeeper.sks_to_safekeeper_ids([env.safekeepers[0]]), + new_members=None, + ) + members_sks = Safekeeper.mconf_sks(env, mconf) + + for timeline in timelines: + Safekeeper.create_timeline( + dataset.tenant_id, timeline["timeline_id"], env.pageserver, mconf, members_sks + ) + # TODO: ensure that the snapshots we're importing contain a sensible variety of content, at the very # least they should include a mixture of deltas and image layers. Preferably they should also # contain some "exotic" stuff like aux files from logical replication. From 3046c307dab8632429bfb1c4dd270ed766b3bfea Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Fri, 13 Jun 2025 15:22:02 +0800 Subject: [PATCH 039/124] feat(posthog_client): support feature flag secure API (#12201) ## Problem Part of #11813 PostHog has two endpoints to retrieve feature flags: the old project ID one that uses personal API token, and the new one using a special feature flag secure token that can only retrieve feature flag. The new API I added in this patch is not documented in the PostHog API doc but it's used in their Python SDK. ## Summary of changes Add support for "feature flag secure token API". The API has no way of providing a project ID so we verify if the retrieved spec is consistent with the project ID specified by comparing the `team_id` field. 
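As a rough illustration of the endpoint selection this introduces, here is a self-contained helper; the URL shapes and the `phs_` prefix check are taken from the patch, while the free function itself is hypothetical and not part of the client:

```rust
/// Keys starting with "phs_" are feature-flag-secure tokens and can only use
/// the undocumented, project-less endpoint; any other server API key keeps
/// using the per-project local_evaluation endpoint.
fn local_evaluation_url(private_api_url: &str, project_id: &str, server_api_key: &str) -> String {
    if server_api_key.starts_with("phs_") {
        format!("{private_api_url}/api/feature_flag/local_evaluation")
    } else {
        format!("{private_api_url}/api/projects/{project_id}/feature_flags/local_evaluation")
    }
}

fn main() {
    // Placeholder values for illustration only.
    let url = local_evaluation_url("https://posthog.example.com", "12345", "phs_secret");
    assert_eq!(url, "https://posthog.example.com/api/feature_flag/local_evaluation");
}
```

Since the secure endpoint carries no project ID in the request, the response is only accepted after its `team_id` matches the configured project ID, which is what the `FeatureStore::new_with_flags` change enforces.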
--------- Signed-off-by: Alex Chi Z --- .../src/background_loop.rs | 13 +++- libs/posthog_client_lite/src/lib.rs | 59 +++++++++++++++---- 2 files changed, 58 insertions(+), 14 deletions(-) diff --git a/libs/posthog_client_lite/src/background_loop.rs b/libs/posthog_client_lite/src/background_loop.rs index a404c76da9..693d62efc4 100644 --- a/libs/posthog_client_lite/src/background_loop.rs +++ b/libs/posthog_client_lite/src/background_loop.rs @@ -55,9 +55,16 @@ impl FeatureResolverBackgroundLoop { continue; } }; - let feature_store = FeatureStore::new_with_flags(resp.flags); - this.feature_store.store(Arc::new(feature_store)); - tracing::info!("Feature flag updated"); + let project_id = this.posthog_client.config.project_id.parse::().ok(); + match FeatureStore::new_with_flags(resp.flags, project_id) { + Ok(feature_store) => { + this.feature_store.store(Arc::new(feature_store)); + tracing::info!("Feature flag updated"); + } + Err(e) => { + tracing::warn!("Cannot process feature flag spec: {}", e); + } + } } tracing::info!("PostHog feature resolver stopped"); } diff --git a/libs/posthog_client_lite/src/lib.rs b/libs/posthog_client_lite/src/lib.rs index f607b1be0a..730878fb58 100644 --- a/libs/posthog_client_lite/src/lib.rs +++ b/libs/posthog_client_lite/src/lib.rs @@ -39,6 +39,9 @@ pub struct LocalEvaluationResponse { #[derive(Deserialize)] pub struct LocalEvaluationFlag { + #[allow(dead_code)] + id: u64, + team_id: u64, key: String, filters: LocalEvaluationFlagFilters, active: bool, @@ -107,17 +110,32 @@ impl FeatureStore { } } - pub fn new_with_flags(flags: Vec) -> Self { + pub fn new_with_flags( + flags: Vec, + project_id: Option, + ) -> Result { let mut store = Self::new(); - store.set_flags(flags); - store + store.set_flags(flags, project_id)?; + Ok(store) } - pub fn set_flags(&mut self, flags: Vec) { + pub fn set_flags( + &mut self, + flags: Vec, + project_id: Option, + ) -> Result<(), &'static str> { self.flags.clear(); for flag in flags { + if let Some(project_id) = project_id { + if flag.team_id != project_id { + return Err( + "Retrieved a spec with different project id, wrong config? Discarding the feature flags.", + ); + } + } self.flags.insert(flag.key.clone(), flag); } + Ok(()) } /// Generate a consistent hash for a user ID (e.g., tenant ID). @@ -534,6 +552,13 @@ impl PostHogClient { }) } + /// Check if the server API key is a feature flag secure API key. This key can only be + /// used to fetch the feature flag specs and can only be used on a undocumented API + /// endpoint. + fn is_feature_flag_secure_api_key(&self) -> bool { + self.config.server_api_key.starts_with("phs_") + } + /// Fetch the feature flag specs from the server. 
/// /// This is unfortunately an undocumented API at: @@ -547,10 +572,22 @@ impl PostHogClient { ) -> anyhow::Result { // BASE_URL/api/projects/:project_id/feature_flags/local_evaluation // with bearer token of self.server_api_key - let url = format!( - "{}/api/projects/{}/feature_flags/local_evaluation", - self.config.private_api_url, self.config.project_id - ); + // OR + // BASE_URL/api/feature_flag/local_evaluation/ + // with bearer token of feature flag specific self.server_api_key + let url = if self.is_feature_flag_secure_api_key() { + // The new feature local evaluation secure API token + format!( + "{}/api/feature_flag/local_evaluation", + self.config.private_api_url + ) + } else { + // The old personal API token + format!( + "{}/api/projects/{}/feature_flags/local_evaluation", + self.config.private_api_url, self.config.project_id + ) + }; let response = self .client .get(url) @@ -803,7 +840,7 @@ mod tests { fn evaluate_multivariate() { let mut store = FeatureStore::new(); let response: LocalEvaluationResponse = serde_json::from_str(data()).unwrap(); - store.set_flags(response.flags); + store.set_flags(response.flags, None).unwrap(); // This lacks the required properties and cannot be evaluated. let variant = @@ -873,7 +910,7 @@ mod tests { let mut store = FeatureStore::new(); let response: LocalEvaluationResponse = serde_json::from_str(data()).unwrap(); - store.set_flags(response.flags); + store.set_flags(response.flags, None).unwrap(); // This lacks the required properties and cannot be evaluated. let variant = store.evaluate_boolean_inner("boolean-flag", 1.00, &HashMap::new()); @@ -929,7 +966,7 @@ mod tests { let mut store = FeatureStore::new(); let response: LocalEvaluationResponse = serde_json::from_str(data()).unwrap(); - store.set_flags(response.flags); + store.set_flags(response.flags, None).unwrap(); // This lacks the required properties and cannot be evaluated. let variant = From 8a68d463f673b1b6408a841fd134ba0588f79abc Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Fri, 13 Jun 2025 16:41:10 +0800 Subject: [PATCH 040/124] feat(pagectl): no max key limit if time travel recover locally (#12222) ## Problem We would easily hit this limit for a tenant running for enough long time. ## Summary of changes Remove the max key limit for time-travel recovery if the command is running locally. 
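To make the semantics of the new trailing parameter concrete, here is a small self-contained sketch of how an `Option<NonZeroU32>` cap behaves. The helper below is illustrative and not the `RemoteStorage` API; the assumption is that the local `pagectl` path passes `None`, while server-side callers can keep a cap comparable to the previous hard-coded 100,000.

```rust
use std::num::NonZeroU32;

/// Collect at most `limit` items, or everything when no limit is given.
fn collect_with_limit<I: Iterator>(iter: I, limit: Option<NonZeroU32>) -> Vec<I::Item> {
    match limit {
        Some(n) => iter.take(n.get() as usize).collect(),
        None => iter.collect(),
    }
}

fn main() {
    let versions = 0..1_000_000u32;
    // A bounded caller stops building the in-memory version list at the cap...
    assert_eq!(collect_with_limit(versions.clone(), NonZeroU32::new(100_000)).len(), 100_000);
    // ...while `None` processes every version, trading memory for completeness.
    assert_eq!(collect_with_limit(versions, None).len(), 1_000_000);
}
```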
Signed-off-by: Alex Chi Z --- libs/remote_storage/src/azure_blob.rs | 1 + libs/remote_storage/src/lib.rs | 10 ++++++---- libs/remote_storage/src/local_fs.rs | 1 + libs/remote_storage/src/s3_bucket.rs | 10 ++-------- libs/remote_storage/src/simulate_failures.rs | 3 ++- libs/remote_storage/tests/test_real_s3.rs | 6 +++--- pageserver/ctl/src/main.rs | 4 +++- .../src/tenant/remote_timeline_client/upload.rs | 17 ++++++++++++++++- 8 files changed, 34 insertions(+), 18 deletions(-) diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index f64cd9e206..e9c24ac723 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -824,6 +824,7 @@ impl RemoteStorage for AzureBlobStorage { timestamp: SystemTime, done_if_after: SystemTime, cancel: &CancellationToken, + _complexity_limit: Option, ) -> Result<(), TimeTravelError> { let msg = "PLEASE NOTE: Azure Blob storage time-travel recovery may not work as expected " .to_string() diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index b265d37a62..9e445dd72f 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -440,6 +440,7 @@ pub trait RemoteStorage: Send + Sync + 'static { timestamp: SystemTime, done_if_after: SystemTime, cancel: &CancellationToken, + complexity_limit: Option, ) -> Result<(), TimeTravelError>; } @@ -651,22 +652,23 @@ impl GenericRemoteStorage> { timestamp: SystemTime, done_if_after: SystemTime, cancel: &CancellationToken, + complexity_limit: Option, ) -> Result<(), TimeTravelError> { match self { Self::LocalFs(s) => { - s.time_travel_recover(prefix, timestamp, done_if_after, cancel) + s.time_travel_recover(prefix, timestamp, done_if_after, cancel, complexity_limit) .await } Self::AwsS3(s) => { - s.time_travel_recover(prefix, timestamp, done_if_after, cancel) + s.time_travel_recover(prefix, timestamp, done_if_after, cancel, complexity_limit) .await } Self::AzureBlob(s) => { - s.time_travel_recover(prefix, timestamp, done_if_after, cancel) + s.time_travel_recover(prefix, timestamp, done_if_after, cancel, complexity_limit) .await } Self::Unreliable(s) => { - s.time_travel_recover(prefix, timestamp, done_if_after, cancel) + s.time_travel_recover(prefix, timestamp, done_if_after, cancel, complexity_limit) .await } } diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 6607b55f1a..8320d7afdc 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -610,6 +610,7 @@ impl RemoteStorage for LocalFs { _timestamp: SystemTime, _done_if_after: SystemTime, _cancel: &CancellationToken, + _complexity_limit: Option, ) -> Result<(), TimeTravelError> { Err(TimeTravelError::Unimplemented) } diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 004aad447e..8a2e5bd10e 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -981,22 +981,16 @@ impl RemoteStorage for S3Bucket { timestamp: SystemTime, done_if_after: SystemTime, cancel: &CancellationToken, + complexity_limit: Option, ) -> Result<(), TimeTravelError> { let kind = RequestKind::TimeTravel; let permit = self.permit(kind, cancel).await?; tracing::trace!("Target time: {timestamp:?}, done_if_after {done_if_after:?}"); - // Limit the number of versions deletions, mostly so that we don't - // keep requesting forever if the list is too long, as we'd put the - // list in RAM. 
- // Building a list of 100k entries that reaches the limit roughly takes - // 40 seconds, and roughly corresponds to tenants of 2 TiB physical size. - const COMPLEXITY_LIMIT: Option = NonZeroU32::new(100_000); - let mode = ListingMode::NoDelimiter; let version_listing = self - .list_versions_with_permit(&permit, prefix, mode, COMPLEXITY_LIMIT, cancel) + .list_versions_with_permit(&permit, prefix, mode, complexity_limit, cancel) .await .map_err(|err| match err { DownloadError::Other(e) => TimeTravelError::Other(e), diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index 894cf600be..f9856a5856 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -240,11 +240,12 @@ impl RemoteStorage for UnreliableWrapper { timestamp: SystemTime, done_if_after: SystemTime, cancel: &CancellationToken, + complexity_limit: Option, ) -> Result<(), TimeTravelError> { self.attempt(RemoteOp::TimeTravelRecover(prefix.map(|p| p.to_owned()))) .map_err(TimeTravelError::Other)?; self.inner - .time_travel_recover(prefix, timestamp, done_if_after, cancel) + .time_travel_recover(prefix, timestamp, done_if_after, cancel, complexity_limit) .await } } diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index d38e13fd05..6b893edf75 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -157,7 +157,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: // No changes after recovery to t2 (no-op) let t_final = time_point().await; ctx.client - .time_travel_recover(None, t2, t_final, &cancel) + .time_travel_recover(None, t2, t_final, &cancel, None) .await?; let t2_files_recovered = list_files(&ctx.client, &cancel).await?; println!("after recovery to t2: {t2_files_recovered:?}"); @@ -173,7 +173,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: // after recovery to t1: path1 is back, path2 has the old content let t_final = time_point().await; ctx.client - .time_travel_recover(None, t1, t_final, &cancel) + .time_travel_recover(None, t1, t_final, &cancel, None) .await?; let t1_files_recovered = list_files(&ctx.client, &cancel).await?; println!("after recovery to t1: {t1_files_recovered:?}"); @@ -189,7 +189,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: // after recovery to t0: everything is gone except for path1 let t_final = time_point().await; ctx.client - .time_travel_recover(None, t0, t_final, &cancel) + .time_travel_recover(None, t0, t_final, &cancel, None) .await?; let t0_files_recovered = list_files(&ctx.client, &cancel).await?; println!("after recovery to t0: {t0_files_recovered:?}"); diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index 1d81b839a8..3cd4faaf2e 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -176,9 +176,11 @@ async fn main() -> anyhow::Result<()> { let config = RemoteStorageConfig::from_toml_str(&cmd.config_toml_str)?; let storage = remote_storage::GenericRemoteStorage::from_config(&config).await; let cancel = CancellationToken::new(); + // Complexity limit: as we are running this command locally, we should have a lot of memory available, and we do not + // need to limit the number of versions we are going to delete. 
storage .unwrap() - .time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel) + .time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel, None) .await?; } Commands::Key(dkc) => dkc.execute(), diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index 89f6136530..ffb4717d9f 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -1,6 +1,7 @@ //! Helper functions to upload files to remote storage with a RemoteStorage use std::io::{ErrorKind, SeekFrom}; +use std::num::NonZeroU32; use std::time::SystemTime; use anyhow::{Context, bail}; @@ -228,11 +229,25 @@ pub(crate) async fn time_travel_recover_tenant( let timelines_path = super::remote_timelines_path(tenant_shard_id); prefixes.push(timelines_path); } + + // Limit the number of versions deletions, mostly so that we don't + // keep requesting forever if the list is too long, as we'd put the + // list in RAM. + // Building a list of 100k entries that reaches the limit roughly takes + // 40 seconds, and roughly corresponds to tenants of 2 TiB physical size. + const COMPLEXITY_LIMIT: Option = NonZeroU32::new(100_000); + for prefix in &prefixes { backoff::retry( || async { storage - .time_travel_recover(Some(prefix), timestamp, done_if_after, cancel) + .time_travel_recover( + Some(prefix), + timestamp, + done_if_after, + cancel, + COMPLEXITY_LIMIT, + ) .await }, |e| !matches!(e, TimeTravelError::Other(_)), From 385324ee8a65b6287118aea01dae307cac056a97 Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Fri, 13 Jun 2025 12:49:31 +0400 Subject: [PATCH 041/124] pageserver: fix post-merge PR comments on basebackup cache (#12216) ## Problem This PR addresses all but the direct IO post-merge comments on basebackup cache implementation. - Follow up on https://github.com/neondatabase/neon/pull/11989#pullrequestreview-2867966119 - Part of https://github.com/neondatabase/cloud/issues/29353 ## Summary of changes - Clean up the tmp directory by recreating it. - Recreate the tmp directory on startup. - Add comments why it's safe to not fsync the inode after renaming. --- pageserver/src/basebackup_cache.rs | 53 ++++++++++++++++++------------ 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/pageserver/src/basebackup_cache.rs b/pageserver/src/basebackup_cache.rs index 3a8ec555f7..7dde3e02fe 100644 --- a/pageserver/src/basebackup_cache.rs +++ b/pageserver/src/basebackup_cache.rs @@ -1,5 +1,6 @@ use std::{collections::HashMap, sync::Arc}; +use anyhow::Context; use async_compression::tokio::write::GzipEncoder; use camino::{Utf8Path, Utf8PathBuf}; use metrics::core::{AtomicU64, GenericCounter}; @@ -167,14 +168,17 @@ impl BasebackupCache { .join(Self::entry_filename(tenant_id, timeline_id, lsn)) } + fn tmp_dir(&self) -> Utf8PathBuf { + self.data_dir.join("tmp") + } + fn entry_tmp_path( &self, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn, ) -> Utf8PathBuf { - self.data_dir - .join("tmp") + self.tmp_dir() .join(Self::entry_filename(tenant_id, timeline_id, lsn)) } @@ -194,15 +198,18 @@ impl BasebackupCache { Some((tenant_id, timeline_id, lsn)) } - async fn cleanup(&self) -> anyhow::Result<()> { - // Cleanup tmp directory. - let tmp_dir = self.data_dir.join("tmp"); - let mut tmp_dir = tokio::fs::read_dir(&tmp_dir).await?; - while let Some(dir_entry) = tmp_dir.next_entry().await? 
{ - if let Err(e) = tokio::fs::remove_file(dir_entry.path()).await { - tracing::warn!("Failed to remove basebackup cache tmp file: {:#}", e); - } + // Recreate the tmp directory to clear all files in it. + async fn clean_tmp_dir(&self) -> anyhow::Result<()> { + let tmp_dir = self.tmp_dir(); + if tmp_dir.exists() { + tokio::fs::remove_dir_all(&tmp_dir).await?; } + tokio::fs::create_dir_all(&tmp_dir).await?; + Ok(()) + } + + async fn cleanup(&self) -> anyhow::Result<()> { + self.clean_tmp_dir().await?; // Remove outdated entries. let entries_old = self.entries.lock().unwrap().clone(); @@ -241,16 +248,14 @@ impl BasebackupCache { } async fn on_startup(&self) -> anyhow::Result<()> { - // Create data_dir and tmp directory if they do not exist. - tokio::fs::create_dir_all(&self.data_dir.join("tmp")) + // Create data_dir if it does not exist. + tokio::fs::create_dir_all(&self.data_dir) .await - .map_err(|e| { - anyhow::anyhow!( - "Failed to create basebackup cache data_dir {:?}: {:?}", - self.data_dir, - e - ) - })?; + .context("Failed to create basebackup cache data directory")?; + + self.clean_tmp_dir() + .await + .context("Failed to clean tmp directory")?; // Read existing entries from the data_dir and add them to in-memory state. let mut entries = HashMap::new(); @@ -451,6 +456,11 @@ impl BasebackupCache { } // Move the tmp file to the final location atomically. + // The tmp file is fsynced, so it's guaranteed that we will not have a partial file + // in the main directory. + // It's not necessary to fsync the inode after renaming, because the worst case is that + // the rename operation will be rolled back on the disk failure, the entry will disappear + // from the main directory, and the entry access will cause a cache miss. let entry_path = self.entry_path(tenant_shard_id.tenant_id, timeline_id, req_lsn); tokio::fs::rename(&entry_tmp_path, &entry_path).await?; @@ -468,16 +478,17 @@ impl BasebackupCache { } /// Prepares a basebackup in a temporary file. + /// Guarantees that the tmp file is fsynced before returning. async fn prepare_basebackup_tmp( &self, - emptry_tmp_path: &Utf8Path, + entry_tmp_path: &Utf8Path, timeline: &Arc, req_lsn: Lsn, ) -> anyhow::Result<()> { let ctx = RequestContext::new(TaskKind::BasebackupCache, DownloadBehavior::Download); let ctx = ctx.with_scope_timeline(timeline); - let file = tokio::fs::File::create(emptry_tmp_path).await?; + let file = tokio::fs::File::create(entry_tmp_path).await?; let mut writer = BufWriter::new(file); let mut encoder = GzipEncoder::with_quality( From d63815fa40ff6550596bb873f9722a6412a3444b Mon Sep 17 00:00:00 2001 From: Alexander Sarantcev <99037063+ephemeralsad@users.noreply.github.com> Date: Fri, 13 Jun 2025 17:34:29 +0400 Subject: [PATCH 042/124] Fix ChaosInjector shard eligibility bug (#12231) ## Problem ChaosInjector is intended to skip non-active scheduling policies, but the current logic skips active shards instead. ## Summary of changes - Fixed shard eligibility condition to correctly allow chaos injection for shards with an Active scheduling policy. 
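To make the inversion concrete, here is a tiny self-contained sketch (the `Paused` variant is a placeholder; only `Active` is taken from the real `ShardSchedulingPolicy`):

```rust
enum ShardSchedulingPolicy {
    Active,
    Paused, // placeholder for the non-active policies; not the real variant set
}

// Fixed predicate: a shard is eligible for chaos injection only while Active.
fn chaos_eligible(policy: &ShardSchedulingPolicy) -> bool {
    matches!(policy, ShardSchedulingPolicy::Active)
}

fn main() {
    assert!(chaos_eligible(&ShardSchedulingPolicy::Active));
    assert!(!chaos_eligible(&ShardSchedulingPolicy::Paused));
    // The previous `!matches!(...)` form produced exactly the opposite results,
    // skipping Active shards and injecting chaos into everything else.
    println!("ok");
}
```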
--- storage_controller/src/service/chaos_injector.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage_controller/src/service/chaos_injector.rs b/storage_controller/src/service/chaos_injector.rs index 9c7a9e3798..4087de200a 100644 --- a/storage_controller/src/service/chaos_injector.rs +++ b/storage_controller/src/service/chaos_injector.rs @@ -107,7 +107,7 @@ impl ChaosInjector { // - Skip shards doing a graceful migration already, so that we allow these to run to // completion rather than only exercising the first part and then cancelling with // some other chaos. - !matches!(shard.get_scheduling_policy(), ShardSchedulingPolicy::Active) + matches!(shard.get_scheduling_policy(), ShardSchedulingPolicy::Active) && shard.get_preferred_node().is_none() } From d14689741509d70843a85d364d1d42a204d43b59 Mon Sep 17 00:00:00 2001 From: Alexander Sarantcev <99037063+ephemeralsad@users.noreply.github.com> Date: Fri, 13 Jun 2025 21:47:09 +0400 Subject: [PATCH 043/124] Fix reconciles metrics typo (#12235) ## Problem Need to fix naming `safkeeper` -> `safekeeper` ## Summary of changes - `storage_controller_safkeeper_reconciles_*` renamed to `storage_controller_safekeeper_reconciles_*` --- storage_controller/src/metrics.rs | 4 ++-- storage_controller/src/service/safekeeper_reconciler.rs | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index ccdbcad139..e6ee2d8729 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -141,11 +141,11 @@ pub(crate) struct StorageControllerMetricGroup { measured::CounterVec, /// Indicator of safekeeper reconciler queue depth, broken down by safekeeper, excluding ongoing reconciles. - pub(crate) storage_controller_safkeeper_reconciles_queued: + pub(crate) storage_controller_safekeeper_reconciles_queued: measured::GaugeVec, /// Indicator of completed safekeeper reconciles, broken down by safekeeper. - pub(crate) storage_controller_safkeeper_reconciles_complete: + pub(crate) storage_controller_safekeeper_reconciles_complete: measured::CounterVec, } diff --git a/storage_controller/src/service/safekeeper_reconciler.rs b/storage_controller/src/service/safekeeper_reconciler.rs index fbf0b5c4e3..a3c5082be6 100644 --- a/storage_controller/src/service/safekeeper_reconciler.rs +++ b/storage_controller/src/service/safekeeper_reconciler.rs @@ -230,7 +230,7 @@ impl ReconcilerHandle { // increase it before putting into the queue. 
let queued_gauge = &METRICS_REGISTRY .metrics_group - .storage_controller_safkeeper_reconciles_queued; + .storage_controller_safekeeper_reconciles_queued; let label_group = SafekeeperReconcilerLabelGroup { sk_az: &sk_az, sk_node_id: &sk_node_id, @@ -306,7 +306,7 @@ impl SafekeeperReconciler { let queued_gauge = &METRICS_REGISTRY .metrics_group - .storage_controller_safkeeper_reconciles_queued; + .storage_controller_safekeeper_reconciles_queued; queued_gauge.set( SafekeeperReconcilerLabelGroup { sk_az: &req.safekeeper.skp.availability_zone_id, @@ -547,7 +547,7 @@ impl SafekeeperReconcilerInner { let complete_counter = &METRICS_REGISTRY .metrics_group - .storage_controller_safkeeper_reconciles_complete; + .storage_controller_safekeeper_reconciles_complete; complete_counter.inc(SafekeeperReconcilerLabelGroup { sk_az: &req.safekeeper.skp.availability_zone_id, sk_node_id: &req.safekeeper.get_id().to_string(), From 5303c715892e552b101612cf915149cba7d6782d Mon Sep 17 00:00:00 2001 From: Alexander Sarantcev <99037063+ephemeralsad@users.noreply.github.com> Date: Fri, 13 Jun 2025 22:18:51 +0400 Subject: [PATCH 044/124] Move comment above metrics handler (#12236) ## Problem Comment is in incorrect place: `/metrics` code is above its description comment. ## Summary of changes - `/metrics` code is now below the comment --- storage_controller/src/http.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 7051a3326d..346595aa11 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -2070,10 +2070,10 @@ pub fn make_router( router .data(Arc::new(HttpState::new(service, auth, build_info))) + // Non-prefixed generic endpoints (status, metrics, profiling) .get("/metrics", |r| { named_request_span(r, measured_metrics_handler, RequestName("metrics")) }) - // Non-prefixed generic endpoints (status, metrics, profiling) .get("/status", |r| { named_request_span(r, handle_status, RequestName("status")) }) From c243521ae58cfe75d3481562f3fdede9bf665a77 Mon Sep 17 00:00:00 2001 From: Alexander Sarantcev <99037063+ephemeralsad@users.noreply.github.com> Date: Mon, 16 Jun 2025 09:51:57 +0400 Subject: [PATCH 045/124] Fix reconcile_long_running metric comment (#12234) ## Problem Comment for `storage_controller_reconcile_long_running` metric was copy-pasted and not updated in #9207 ## Summary of changes - Fixed comment --- storage_controller/src/metrics.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index e6ee2d8729..5b27e01afd 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -136,7 +136,8 @@ pub(crate) struct StorageControllerMetricGroup { pub(crate) storage_controller_leadership_status: measured::GaugeVec, - /// HTTP request status counters for handled requests + /// Indicator of stucked (long-running) reconciles, broken down by tenant, shard and sequence. + /// The metric is automatically removed once the reconciliation completes. pub(crate) storage_controller_reconcile_long_running: measured::CounterVec, From 818e5130f1d31dc67624465be13788523302b40b Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 16 Jun 2025 02:45:50 -0700 Subject: [PATCH 046/124] page_api: add a few derives (#12253) ## Problem The `page_api` domain types are missing a few derives. ## Summary of changes Add `Clone`, `Copy`, and `Debug` derives for all types where appropriate. 
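As a rough, self-contained illustration of what the derives buy us (the struct below is a local stand-in, not the real `page_api` type):

```rust
// Mirrors the shape of a small request type; the real ones live in
// pageserver/page_api/src/model.rs.
#[derive(Clone, Copy, Debug)]
struct GetRelSizeRequestLike {
    read_lsn: u64,
    rel_oid: u32,
}

fn main() {
    let req = GetRelSizeRequestLike { read_lsn: 0x16B9188, rel_oid: 16384 };
    let retry = req; // implicit Copy: `req` remains usable, no clone() needed
    println!("sending {req:?}"); // Debug formatting for logs and error messages
    println!("retrying {retry:?}");
}
```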
--- pageserver/page_api/src/model.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pageserver/page_api/src/model.rs b/pageserver/page_api/src/model.rs index 1a08d04cc1..664ac0e6c4 100644 --- a/pageserver/page_api/src/model.rs +++ b/pageserver/page_api/src/model.rs @@ -26,7 +26,7 @@ use utils::lsn::Lsn; use crate::proto; /// A protocol error. Typically returned via try_from() or try_into(). -#[derive(thiserror::Error, Debug)] +#[derive(thiserror::Error, Clone, Debug)] pub enum ProtocolError { #[error("field '{0}' has invalid value '{1}'")] Invalid(&'static str, String), @@ -487,6 +487,7 @@ impl From for i32 { // Fetches the size of a relation at a given LSN, as # of blocks. Only valid on shard 0, other // shards will error. +#[derive(Clone, Copy, Debug)] pub struct GetRelSizeRequest { pub read_lsn: ReadLsn, pub rel: RelTag, @@ -530,6 +531,7 @@ impl From for proto::GetRelSizeResponse { } /// Requests an SLRU segment. Only valid on shard 0, other shards will error. +#[derive(Clone, Copy, Debug)] pub struct GetSlruSegmentRequest { pub read_lsn: ReadLsn, pub kind: SlruKind, From fc136eec8f638bb50f04add5566dd8e822ad9625 Mon Sep 17 00:00:00 2001 From: Trung Dinh Date: Mon, 16 Jun 2025 03:29:42 -0700 Subject: [PATCH 047/124] pagectl: add dump layer local (#12245) ## Problem In our environment, we don't always have access to the pagectl tool on the pageserver. We have to download the page files to local env to introspect them. Hence, it'll be useful to be able to parse the local files using `pagectl`. ## Summary of changes * Add `dump-layer-local` to `pagectl` that takes a local path as argument and returns the layer content: ``` cargo run -p pagectl layer dump-layer-local ~/Desktop/000000067F000040490002800000FFFFFFFF-030000000000000000000000000000000002__00003E7A53EDE611-00003E7AF27BFD19-v1-00000001 ``` * Bonus: Fix a bug in `pageserver/ctl/src/draw_timeline_dir.rs` in which we don't filter out temporary files. --- pageserver/ctl/src/draw_timeline_dir.rs | 8 +++- pageserver/ctl/src/layers.rs | 49 +++++++++++++++---------- 2 files changed, 35 insertions(+), 22 deletions(-) diff --git a/pageserver/ctl/src/draw_timeline_dir.rs b/pageserver/ctl/src/draw_timeline_dir.rs index 80ca414543..881ebd49a7 100644 --- a/pageserver/ctl/src/draw_timeline_dir.rs +++ b/pageserver/ctl/src/draw_timeline_dir.rs @@ -20,7 +20,7 @@ //! //! # local timeline dir //! ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \ -//! grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg +//! grep "__" | cargo run --release --bin pagectl draw-timeline > out.svg //! //! # Layer map dump from `/v1/tenant/$TENANT/timeline/$TIMELINE/layer` //! 
(jq -r '.historic_layers[] | .layer_file_name' | cargo run -p pagectl draw-timeline) < layer-map.json > out.svg @@ -81,7 +81,11 @@ fn build_coordinate_compression_map(coords: Vec) -> BTreeMap (Range, Range) { let split: Vec<&str> = name.split("__").collect(); let keys: Vec<&str> = split[0].split('-').collect(); - let mut lsns: Vec<&str> = split[1].split('-').collect(); + + // Remove the temporary file extension, e.g., remove the `.d20a.___temp` part from the following filename: + // 000000067F000040490000404A00441B0000-000000067F000040490000404A00441B4000__000043483A34CE00.d20a.___temp + let lsns = split[1].split('.').collect::>()[0]; + let mut lsns: Vec<&str> = lsns.split('-').collect(); // The current format of the layer file name: 000000067F0000000400000B150100000000-000000067F0000000400000D350100000000__00000000014B7AC8-v1-00000001 diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index 79f56a5a51..42b3e4a9ba 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -13,7 +13,7 @@ use pageserver::{page_cache, virtual_file}; use pageserver_api::key::Key; use utils::id::{TenantId, TimelineId}; -use crate::layer_map_analyzer::parse_filename; +use crate::layer_map_analyzer::{LayerFile, parse_filename}; #[derive(Subcommand)] pub(crate) enum LayerCmd { @@ -38,6 +38,8 @@ pub(crate) enum LayerCmd { /// The id from list-layer command id: usize, }, + /// Dump all information of a layer file locally + DumpLayerLocal { path: PathBuf }, RewriteSummary { layer_file_path: Utf8PathBuf, #[clap(long)] @@ -131,15 +133,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { } for (idx, layer_file) in to_print { - println!( - "[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}", - idx, - layer_file.key_range.start, - layer_file.key_range.end, - layer_file.lsn_range.start, - layer_file.lsn_range.end, - layer_file.is_delta, - ); + print_layer_file(idx, &layer_file); } Ok(()) } @@ -159,16 +153,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { let layer = layer?; if let Ok(layer_file) = parse_filename(&layer.file_name().into_string().unwrap()) { if *id == idx { - // TODO(chi): dedup code - println!( - "[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}", - idx, - layer_file.key_range.start, - layer_file.key_range.end, - layer_file.lsn_range.start, - layer_file.lsn_range.end, - layer_file.is_delta, - ); + print_layer_file(idx, &layer_file); if layer_file.is_delta { read_delta_file(layer.path(), &ctx).await?; @@ -183,6 +168,18 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { } Ok(()) } + LayerCmd::DumpLayerLocal { path } => { + if let Ok(layer_file) = parse_filename(path.file_name().unwrap().to_str().unwrap()) { + print_layer_file(0, &layer_file); + + if layer_file.is_delta { + read_delta_file(path, &ctx).await?; + } else { + read_image_file(path, &ctx).await?; + } + } + Ok(()) + } LayerCmd::RewriteSummary { layer_file_path, new_tenant_id, @@ -247,3 +244,15 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { } } } + +fn print_layer_file(idx: usize, layer_file: &LayerFile) { + println!( + "[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}", + idx, + layer_file.key_range.start, + layer_file.key_range.end, + layer_file.lsn_range.start, + layer_file.lsn_range.end, + layer_file.is_delta, + ); +} From 118e13438df173b98c83bea853e346ebbe00eab3 Mon Sep 17 00:00:00 2001 From: Alexander Lakhin Date: Mon, 16 Jun 2025 16:29:39 +0300 Subject: [PATCH 048/124] Add "Build and Test Fully" workflow (#11931) ## Problem We don't test debug builds for v14..v16 in the regular "Build 
and Test" runs to perform the testing faster, but it means we can't detect assertion failures in those versions. (See https://github.com/neondatabase/neon/issues/11891, https://github.com/neondatabase/neon/issues/11997) ## Summary of changes Add a new workflow to test all build types and all versions on all architectures. --- .github/workflows/_build-and-test-locally.yml | 11 +- .../workflows/build_and_run_selected_test.yml | 1 + .github/workflows/build_and_test_fully.yml | 151 ++++++++++++++++++ test_runner/regress/test_compatibility.py | 9 ++ 4 files changed, 169 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/build_and_test_fully.yml diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 663afa2c8b..4729aea4f6 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -38,6 +38,11 @@ on: required: false default: 1 type: number + rerun_failed: + description: 'rerun failed tests to ignore flaky tests' + required: false + default: true + type: boolean defaults: run: @@ -379,7 +384,7 @@ jobs: - name: Pytest regression tests continue-on-error: ${{ matrix.lfc_state == 'with-lfc' && inputs.build-type == 'debug' }} uses: ./.github/actions/run-python-test-set - timeout-minutes: ${{ inputs.sanitizers != 'enabled' && 75 || 180 }} + timeout-minutes: ${{ (inputs.build-type == 'release' && inputs.sanitizers != 'enabled') && 75 || 180 }} with: build_type: ${{ inputs.build-type }} test_selection: regress @@ -387,14 +392,14 @@ jobs: run_with_real_s3: true real_s3_bucket: neon-github-ci-tests real_s3_region: eu-central-1 - rerun_failed: ${{ inputs.test-run-count == 1 }} + rerun_failed: ${{ inputs.rerun_failed }} pg_version: ${{ matrix.pg_version }} sanitizers: ${{ inputs.sanitizers }} aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} # `--session-timeout` is equal to (timeout-minutes - 10 minutes) * 60 seconds. # Attempt to stop tests gracefully to generate test reports # until they are forcibly stopped by the stricter `timeout-minutes` limit. 
- extra_params: --session-timeout=${{ inputs.sanitizers != 'enabled' && 3000 || 10200 }} --count=${{ inputs.test-run-count }} + extra_params: --session-timeout=${{ (inputs.build-type == 'release' && inputs.sanitizers != 'enabled') && 3000 || 10200 }} --count=${{ inputs.test-run-count }} ${{ inputs.test-selection != '' && format('-k "{0}"', inputs.test-selection) || '' }} env: TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} diff --git a/.github/workflows/build_and_run_selected_test.yml b/.github/workflows/build_and_run_selected_test.yml index 7f1eb991c4..0f53d44a13 100644 --- a/.github/workflows/build_and_run_selected_test.yml +++ b/.github/workflows/build_and_run_selected_test.yml @@ -58,6 +58,7 @@ jobs: test-cfg: ${{ inputs.pg-versions }} test-selection: ${{ inputs.test-selection }} test-run-count: ${{ fromJson(inputs.run-count) }} + rerun_failed: false secrets: inherit create-test-report: diff --git a/.github/workflows/build_and_test_fully.yml b/.github/workflows/build_and_test_fully.yml new file mode 100644 index 0000000000..7d6543ee26 --- /dev/null +++ b/.github/workflows/build_and_test_fully.yml @@ -0,0 +1,151 @@ +name: Build and Test Fully + +on: + schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '0 3 * * *' # run once a day, timezone is utc + workflow_dispatch: + +defaults: + run: + shell: bash -euxo pipefail {0} + +concurrency: + # Allow only one workflow per any non-`main` branch. + group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} + cancel-in-progress: true + +env: + RUST_BACKTRACE: 1 + COPT: '-Werror' + +jobs: + tag: + runs-on: [ self-hosted, small ] + container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/base:pinned + outputs: + build-tag: ${{steps.build-tag.outputs.tag}} + + steps: + # Need `fetch-depth: 0` to count the number of commits in the branch + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + fetch-depth: 0 + + - name: Get build tag + run: | + echo run:$GITHUB_RUN_ID + echo ref:$GITHUB_REF_NAME + echo rev:$(git rev-list --count HEAD) + if [[ "$GITHUB_REF_NAME" == "main" ]]; then + echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT + elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT + elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then + echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT + elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then + echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT + else + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release', 'release-proxy', 'release-compute'" + echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT + fi + shell: bash + id: build-tag + + build-build-tools-image: + uses: ./.github/workflows/build-build-tools-image.yml + secrets: inherit + + build-and-test-locally: + needs: [ tag, build-build-tools-image ] + strategy: + fail-fast: false + matrix: + arch: [ x64, arm64 ] + 
build-type: [ debug, release ] + uses: ./.github/workflows/_build-and-test-locally.yml + with: + arch: ${{ matrix.arch }} + build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm + build-tag: ${{ needs.tag.outputs.build-tag }} + build-type: ${{ matrix.build-type }} + rerun_failed: false + test-cfg: '[{"pg_version":"v14", "lfc_state": "with-lfc"}, + {"pg_version":"v15", "lfc_state": "with-lfc"}, + {"pg_version":"v16", "lfc_state": "with-lfc"}, + {"pg_version":"v17", "lfc_state": "with-lfc"}, + {"pg_version":"v14", "lfc_state": "without-lfc"}, + {"pg_version":"v15", "lfc_state": "without-lfc"}, + {"pg_version":"v16", "lfc_state": "without-lfc"}, + {"pg_version":"v17", "lfc_state": "withouts-lfc"}]' + secrets: inherit + + + create-test-report: + needs: [ build-and-test-locally, build-build-tools-image ] + if: ${{ !cancelled() }} + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: write + pull-requests: write + outputs: + report-url: ${{ steps.create-allure-report.outputs.report-url }} + + runs-on: [ self-hosted, small ] + container: + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + options: --init + + steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Create Allure report + if: ${{ !cancelled() }} + id: create-allure-report + uses: ./.github/actions/allure-report-generate + with: + store-test-results-into-db: true + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + env: + REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} + + - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + if: ${{ !cancelled() }} + with: + # Retry script for 5XX server errors: https://github.com/actions/github-script#retries + retries: 5 + script: | + const report = { + reportUrl: "${{ steps.create-allure-report.outputs.report-url }}", + reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}", + } + + const coverage = {} + + const script = require("./scripts/comment-test-report.js") + await script({ + github, + context, + fetch, + report, + coverage, + }) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 31e38144e3..bc9b534095 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -127,6 +127,12 @@ check_ondisk_data_compatibility_if_enabled = pytest.mark.skipif( reason="CHECK_ONDISK_DATA_COMPATIBILITY env is not set", ) +skip_old_debug_versions = pytest.mark.skipif( + os.getenv("BUILD_TYPE", "debug") == "debug" + and os.getenv("DEFAULT_PG_VERSION") in [PgVersion.V14, PgVersion.V15, PgVersion.V16], + reason="compatibility snaphots not available for old versions of debug builds", +) + @pytest.mark.xdist_group("compatibility") @pytest.mark.order(before="test_forward_compatibility") @@ -197,6 +203,7 @@ ingest_lag_log_line = ".*ingesting record with timestamp lagging more than wait_ @check_ondisk_data_compatibility_if_enabled +@skip_old_debug_versions @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") def test_backward_compatibility( @@ -224,6 +231,7 @@ def test_backward_compatibility( 
@check_ondisk_data_compatibility_if_enabled +@skip_old_debug_versions @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") def test_forward_compatibility( @@ -593,6 +601,7 @@ def test_historic_storage_formats( @check_ondisk_data_compatibility_if_enabled +@skip_old_debug_versions @pytest.mark.xdist_group("compatibility") @pytest.mark.parametrize( **fixtures.utils.allpairs_versions(), From 632cde7f136da8e7962153a6570b2903d8e6bd75 Mon Sep 17 00:00:00 2001 From: Suhas Thalanki <54014218+thesuhas@users.noreply.github.com> Date: Mon, 16 Jun 2025 12:30:41 -0700 Subject: [PATCH 049/124] schema and github workflow for validation of compute manifest (#12069) Adds a schema to validate the manifest.yaml described in [this RFC](https://github.com/neondatabase/neon/blob/main/docs/rfcs/038-independent-compute-release.md) and a github workflow to test this. --- .github/workflows/build_and_test.yml | 22 +++ compute/.gitignore | 3 + compute/Makefile | 8 + compute/manifest.schema.json | 209 +++++++++++++++++++++++++++ compute/manifest.yaml | 8 +- compute/package-lock.json | 37 +++++ compute/package.json | 7 + 7 files changed, 290 insertions(+), 4 deletions(-) create mode 100644 compute/manifest.schema.json create mode 100644 compute/package-lock.json create mode 100644 compute/package.json diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 9f2fa3d52c..7faaed49c1 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -199,6 +199,28 @@ jobs: build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm secrets: inherit + validate-compute-manifest: + runs-on: ubuntu-22.04 + needs: [ meta, check-permissions ] + # We do need to run this in `.*-rc-pr` because of hotfixes. + if: ${{ contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} + steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Set up Node.js + uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0 + with: + node-version: '24' + + - name: Validate manifest against schema + run: | + make -C compute manifest-schema-validation + build-and-test-locally: needs: [ meta, build-build-tools-image ] # We do need to run this in `.*-rc-pr` because of hotfixes. 
diff --git a/compute/.gitignore b/compute/.gitignore index 70980d335a..aab2afaa4e 100644 --- a/compute/.gitignore +++ b/compute/.gitignore @@ -3,3 +3,6 @@ etc/neon_collector.yml etc/neon_collector_autoscaling.yml etc/sql_exporter.yml etc/sql_exporter_autoscaling.yml + +# Node.js dependencies +node_modules/ diff --git a/compute/Makefile b/compute/Makefile index 0036196160..c53d040887 100644 --- a/compute/Makefile +++ b/compute/Makefile @@ -48,3 +48,11 @@ jsonnetfmt-test: .PHONY: jsonnetfmt-format jsonnetfmt-format: jsonnetfmt --in-place $(jsonnet_files) + +.PHONY: manifest-schema-validation +manifest-schema-validation: node_modules + node_modules/.bin/jsonschema validate -d https://json-schema.org/draft/2020-12/schema manifest.schema.json manifest.yaml + +node_modules: package.json + npm install + touch node_modules diff --git a/compute/manifest.schema.json b/compute/manifest.schema.json new file mode 100644 index 0000000000..a25055b45a --- /dev/null +++ b/compute/manifest.schema.json @@ -0,0 +1,209 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "Neon Compute Manifest Schema", + "description": "Schema for Neon compute node configuration manifest", + "type": "object", + "properties": { + "pg_settings": { + "type": "object", + "properties": { + "common": { + "type": "object", + "properties": { + "client_connection_check_interval": { + "type": "string", + "description": "Check for client disconnection interval in milliseconds" + }, + "effective_io_concurrency": { + "type": "string", + "description": "Effective IO concurrency setting" + }, + "fsync": { + "type": "string", + "enum": ["on", "off"], + "description": "Whether to force fsync to disk" + }, + "hot_standby": { + "type": "string", + "enum": ["on", "off"], + "description": "Whether hot standby is enabled" + }, + "idle_in_transaction_session_timeout": { + "type": "string", + "description": "Timeout for idle transactions in milliseconds" + }, + "listen_addresses": { + "type": "string", + "description": "Addresses to listen on" + }, + "log_connections": { + "type": "string", + "enum": ["on", "off"], + "description": "Whether to log connections" + }, + "log_disconnections": { + "type": "string", + "enum": ["on", "off"], + "description": "Whether to log disconnections" + }, + "log_temp_files": { + "type": "string", + "description": "Size threshold for logging temporary files in KB" + }, + "log_error_verbosity": { + "type": "string", + "enum": ["terse", "verbose", "default"], + "description": "Error logging verbosity level" + }, + "log_min_error_statement": { + "type": "string", + "description": "Minimum error level for statement logging" + }, + "maintenance_io_concurrency": { + "type": "string", + "description": "Maintenance IO concurrency setting" + }, + "max_connections": { + "type": "string", + "description": "Maximum number of connections" + }, + "max_replication_flush_lag": { + "type": "string", + "description": "Maximum replication flush lag" + }, + "max_replication_slots": { + "type": "string", + "description": "Maximum number of replication slots" + }, + "max_replication_write_lag": { + "type": "string", + "description": "Maximum replication write lag" + }, + "max_wal_senders": { + "type": "string", + "description": "Maximum number of WAL senders" + }, + "max_wal_size": { + "type": "string", + "description": "Maximum WAL size" + }, + "neon.unstable_extensions": { + "type": "string", + "description": "List of unstable extensions" + }, + "neon.protocol_version": { + "type": "string", + "description": "Neon 
protocol version" + }, + "password_encryption": { + "type": "string", + "description": "Password encryption method" + }, + "restart_after_crash": { + "type": "string", + "enum": ["on", "off"], + "description": "Whether to restart after crash" + }, + "superuser_reserved_connections": { + "type": "string", + "description": "Number of reserved connections for superuser" + }, + "synchronous_standby_names": { + "type": "string", + "description": "Names of synchronous standby servers" + }, + "wal_keep_size": { + "type": "string", + "description": "WAL keep size" + }, + "wal_level": { + "type": "string", + "description": "WAL level" + }, + "wal_log_hints": { + "type": "string", + "enum": ["on", "off"], + "description": "Whether to log hints in WAL" + }, + "wal_sender_timeout": { + "type": "string", + "description": "WAL sender timeout in milliseconds" + } + }, + "required": [ + "client_connection_check_interval", + "effective_io_concurrency", + "fsync", + "hot_standby", + "idle_in_transaction_session_timeout", + "listen_addresses", + "log_connections", + "log_disconnections", + "log_temp_files", + "log_error_verbosity", + "log_min_error_statement", + "maintenance_io_concurrency", + "max_connections", + "max_replication_flush_lag", + "max_replication_slots", + "max_replication_write_lag", + "max_wal_senders", + "max_wal_size", + "neon.unstable_extensions", + "neon.protocol_version", + "password_encryption", + "restart_after_crash", + "superuser_reserved_connections", + "synchronous_standby_names", + "wal_keep_size", + "wal_level", + "wal_log_hints", + "wal_sender_timeout" + ] + }, + "replica": { + "type": "object", + "properties": { + "hot_standby": { + "type": "string", + "enum": ["on", "off"], + "description": "Whether hot standby is enabled for replicas" + } + }, + "required": ["hot_standby"] + }, + "per_version": { + "type": "object", + "patternProperties": { + "^1[4-7]$": { + "type": "object", + "properties": { + "common": { + "type": "object", + "properties": { + "io_combine_limit": { + "type": "string", + "description": "IO combine limit" + } + } + }, + "replica": { + "type": "object", + "properties": { + "recovery_prefetch": { + "type": "string", + "enum": ["on", "off"], + "description": "Whether to enable recovery prefetch for PostgreSQL replicas" + } + } + } + } + } + } + } + }, + "required": ["common", "replica", "per_version"] + } + }, + "required": ["pg_settings"] +} diff --git a/compute/manifest.yaml b/compute/manifest.yaml index f1cd20c497..4425241d8a 100644 --- a/compute/manifest.yaml +++ b/compute/manifest.yaml @@ -105,17 +105,17 @@ pg_settings: # Neon hot standby ignores pages that are not in the shared_buffers recovery_prefetch: "off" 16: - common: + common: {} replica: # prefetching of blocks referenced in WAL doesn't make sense for us # Neon hot standby ignores pages that are not in the shared_buffers recovery_prefetch: "off" 15: - common: + common: {} replica: # prefetching of blocks referenced in WAL doesn't make sense for us # Neon hot standby ignores pages that are not in the shared_buffers recovery_prefetch: "off" 14: - common: - replica: + common: {} + replica: {} diff --git a/compute/package-lock.json b/compute/package-lock.json new file mode 100644 index 0000000000..693a37cfcb --- /dev/null +++ b/compute/package-lock.json @@ -0,0 +1,37 @@ +{ + "name": "neon-compute", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "neon-compute", + "dependencies": { + "@sourcemeta/jsonschema": "9.3.4" + } + }, + "node_modules/@sourcemeta/jsonschema": { + 
"version": "9.3.4", + "resolved": "https://registry.npmjs.org/@sourcemeta/jsonschema/-/jsonschema-9.3.4.tgz", + "integrity": "sha512-hkujfkZAIGXUs4U//We9faZW8LZ4/H9LqagRYsFSulH/VLcKPNhZyCTGg7AhORuzm27zqENvKpnX4g2FzudYFw==", + "cpu": [ + "x64", + "arm64" + ], + "license": "AGPL-3.0", + "os": [ + "darwin", + "linux", + "win32" + ], + "bin": { + "jsonschema": "cli.js" + }, + "engines": { + "node": ">=16" + }, + "funding": { + "url": "https://github.com/sponsors/sourcemeta" + } + } + } +} diff --git a/compute/package.json b/compute/package.json new file mode 100644 index 0000000000..581384dc13 --- /dev/null +++ b/compute/package.json @@ -0,0 +1,7 @@ +{ + "name": "neon-compute", + "private": true, + "dependencies": { + "@sourcemeta/jsonschema": "9.3.4" + } +} \ No newline at end of file From f669e184771cfd44d84075fbb191bb1eabbfca3c Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 16 Jun 2025 14:38:26 -0500 Subject: [PATCH 050/124] Remove TODO comment related to default_transaction_read_only (#12261) This code has been deployed for a while, so let's remove the TODO, and remove the option passed from the control plane. Link: https://github.com/neondatabase/cloud/pull/30274 Signed-off-by: Tristan Partin --- compute_tools/src/compute.rs | 5 ----- 1 file changed, 5 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index f15538b157..50c254224a 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -354,11 +354,6 @@ impl ComputeNode { // that can affect `compute_ctl` and prevent it from properly configuring the database schema. // Unset them via connection string options before connecting to the database. // N.B. keep it in sync with `ZENITH_OPTIONS` in `get_maintenance_client()`. - // - // TODO(ololobus): we currently pass `-c default_transaction_read_only=off` from control plane - // as well. After rolling out this code, we can remove this parameter from control plane. - // In the meantime, double-passing is fine, the last value is applied. - // See: const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0"; let options = match conn_conf.get_options() { Some(options) => format!("{} {}", options, EXTRA_OPTIONS), From 01ccb34118d3d2b0481ea02d9db38fdb4a659f86 Mon Sep 17 00:00:00 2001 From: Alexander Lakhin Date: Tue, 17 Jun 2025 11:08:43 +0300 Subject: [PATCH 051/124] Don't rerun failed tests in 'Build and Test with Sanitizers' workflow (#12259) ## Problem We could easily miss a sanitizer-detected defect, if it occurred due to some race condition, as we just rerun the test and if it succeeds, the overall test run is considered successful. It was more reasonable before, when we had much more unstable tests in main, but now we can track all test failures. ## Summary of changes Don't rerun failed tests. 
--- .github/workflows/_build-and-test-locally.yml | 4 ++-- .github/workflows/build_and_run_selected_test.yml | 2 +- .github/workflows/build_and_test_fully.yml | 2 +- .github/workflows/build_and_test_with_sanitizers.yml | 1 + 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 4729aea4f6..3f66f41ef2 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -38,7 +38,7 @@ on: required: false default: 1 type: number - rerun_failed: + rerun-failed: description: 'rerun failed tests to ignore flaky tests' required: false default: true @@ -392,7 +392,7 @@ jobs: run_with_real_s3: true real_s3_bucket: neon-github-ci-tests real_s3_region: eu-central-1 - rerun_failed: ${{ inputs.rerun_failed }} + rerun_failed: ${{ inputs.rerun-failed }} pg_version: ${{ matrix.pg_version }} sanitizers: ${{ inputs.sanitizers }} aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} diff --git a/.github/workflows/build_and_run_selected_test.yml b/.github/workflows/build_and_run_selected_test.yml index 0f53d44a13..6d3541d1b6 100644 --- a/.github/workflows/build_and_run_selected_test.yml +++ b/.github/workflows/build_and_run_selected_test.yml @@ -58,7 +58,7 @@ jobs: test-cfg: ${{ inputs.pg-versions }} test-selection: ${{ inputs.test-selection }} test-run-count: ${{ fromJson(inputs.run-count) }} - rerun_failed: false + rerun-failed: false secrets: inherit create-test-report: diff --git a/.github/workflows/build_and_test_fully.yml b/.github/workflows/build_and_test_fully.yml index 7d6543ee26..dd1d63b02b 100644 --- a/.github/workflows/build_and_test_fully.yml +++ b/.github/workflows/build_and_test_fully.yml @@ -79,7 +79,7 @@ jobs: build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm build-tag: ${{ needs.tag.outputs.build-tag }} build-type: ${{ matrix.build-type }} - rerun_failed: false + rerun-failed: false test-cfg: '[{"pg_version":"v14", "lfc_state": "with-lfc"}, {"pg_version":"v15", "lfc_state": "with-lfc"}, {"pg_version":"v16", "lfc_state": "with-lfc"}, diff --git a/.github/workflows/build_and_test_with_sanitizers.yml b/.github/workflows/build_and_test_with_sanitizers.yml index c54448dedc..32fb3c7c15 100644 --- a/.github/workflows/build_and_test_with_sanitizers.yml +++ b/.github/workflows/build_and_test_with_sanitizers.yml @@ -79,6 +79,7 @@ jobs: build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm build-tag: ${{ needs.tag.outputs.build-tag }} build-type: ${{ matrix.build-type }} + rerun-failed: false test-cfg: '[{"pg_version":"v17"}]' sanitizers: enabled secrets: inherit From 1a5f7ce6ad4bf96548e65e597d8c01456cd91c25 Mon Sep 17 00:00:00 2001 From: Aleksandr Sarantsev <99037063+ephemeralsad@users.noreply.github.com> Date: Tue, 17 Jun 2025 14:09:55 +0400 Subject: [PATCH 052/124] storcon: Exclude another secondaries while optimizing secondary (#12251) ## Problem If the node intent includes more than one secondary, we can generate a replace optimization using a candidate node that is already a secondary location. ## Summary of changes - Exclude all other secondary nodes from the scoring process to ensure optimal candidate selection. 
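A rough worked example of the scenario (node IDs are invented; the real exclusion logic is in the diff below):

```rust
fn main() {
    // Intent: attached on node 1, secondaries on nodes 2 and 3; we are choosing
    // a replacement location for the secondary currently on node 2.
    let attached: Option<u64> = Some(1);
    let secondaries = vec![2u64, 3];
    let optimizing = 2u64;

    // Previously only the attached node was excluded, so node 3 (already a
    // secondary) could be proposed as the "new" secondary. With the fix, every
    // other existing secondary is excluded as well.
    let mut exclude: Vec<u64> = attached.into_iter().collect();
    exclude.extend(secondaries.iter().copied().filter(|&n| n != optimizing));

    assert_eq!(exclude, vec![1, 3]);
    println!("scoring candidates with {exclude:?} excluded");
}
```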
--- storage_controller/src/tenant_shard.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index c7b2628ec4..d6046e062e 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -1184,11 +1184,19 @@ impl TenantShard { for secondary in self.intent.get_secondary() { // Make sure we don't try to migrate a secondary to our attached location: this case happens // easily in environments without multiple AZs. - let exclude = match self.intent.attached { + let mut exclude = match self.intent.attached { Some(attached) => vec![attached], None => vec![], }; + // Exclude all other secondaries from the scheduling process to avoid replacing + // one existing secondary with another existing secondary. + for another_secondary in self.intent.secondary.iter() { + if another_secondary != secondary { + exclude.push(*another_secondary); + } + } + let replacement = match &self.policy { PlacementPolicy::Attached(_) => { // Secondaries for an attached shard should be scheduled using `SecondaryShardTag` From 143500dc4f308d1ec44bb198340e1ca314ba1f92 Mon Sep 17 00:00:00 2001 From: Aleksandr Sarantsev <99037063+ephemeralsad@users.noreply.github.com> Date: Tue, 17 Jun 2025 14:10:10 +0400 Subject: [PATCH 053/124] storcon: Improve stably_attached readability (#12249) ## Problem The `stably_attached` function is hard to read due to deeply nested conditionals ## Summary of Changes - Refactored `stably_attached` to use early returns and the `?` operator for improved readability --- storage_controller/src/tenant_shard.rs | 35 ++++++++++---------------- 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index d6046e062e..acd18734cf 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -1356,28 +1356,19 @@ impl TenantShard { /// Reconciliation may still be needed for other aspects of state such as secondaries (see [`Self::dirty`]): this /// funciton should not be used to decide whether to reconcile. pub(crate) fn stably_attached(&self) -> Option { - if let Some(attach_intent) = self.intent.attached { - match self.observed.locations.get(&attach_intent) { - Some(loc) => match &loc.conf { - Some(conf) => match conf.mode { - LocationConfigMode::AttachedMulti - | LocationConfigMode::AttachedSingle - | LocationConfigMode::AttachedStale => { - // Our intent and observed state agree that this node is in an attached state. - Some(attach_intent) - } - // Our observed config is not an attached state - _ => None, - }, - // Our observed state is None, i.e. in flux - None => None, - }, - // We have no observed state for this node - None => None, - } - } else { - // Our intent is not to attach - None + // We have an intent to attach for this node + let attach_intent = self.intent.attached?; + // We have an observed state for this node + let location = self.observed.locations.get(&attach_intent)?; + // Our observed state is not None, i.e. not in flux + let location_config = location.conf.as_ref()?; + + // Check if our intent and observed state agree that this node is in an attached state. 
+ match location_config.mode { + LocationConfigMode::AttachedMulti + | LocationConfigMode::AttachedSingle + | LocationConfigMode::AttachedStale => Some(attach_intent), + _ => None, } } From d81353b2d1263ce38b5381a914d1f0c7eef3c191 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 17 Jun 2025 05:37:43 -0700 Subject: [PATCH 054/124] pageserver: gRPC base backup fixes (#12243) ## Problem The gRPC base backup implementation has a few issues: chunks are not properly bounded, and it's not possible to omit the LSN. Touches #11728. ## Summary of changes * Properly bound chunks by using a limited writer. * Use an `Option` rather than a `ReadLsn` (the latter requires an LSN). --- pageserver/page_api/proto/page_service.proto | 6 +- pageserver/page_api/src/model.rs | 23 +++---- pageserver/src/page_service.rs | 69 ++++++++++---------- 3 files changed, 48 insertions(+), 50 deletions(-) diff --git a/pageserver/page_api/proto/page_service.proto b/pageserver/page_api/proto/page_service.proto index 44976084bf..7d01dec4ab 100644 --- a/pageserver/page_api/proto/page_service.proto +++ b/pageserver/page_api/proto/page_service.proto @@ -102,10 +102,10 @@ message CheckRelExistsResponse { bool exists = 1; } -// Requests a base backup at a given LSN. +// Requests a base backup. message GetBaseBackupRequest { - // The LSN to fetch a base backup at. - ReadLsn read_lsn = 1; + // The LSN to fetch the base backup at. 0 or absent means the latest LSN known to the Pageserver. + uint64 lsn = 1; // If true, logical replication slots will not be created. bool replica = 2; } diff --git a/pageserver/page_api/src/model.rs b/pageserver/page_api/src/model.rs index 664ac0e6c4..799f48712f 100644 --- a/pageserver/page_api/src/model.rs +++ b/pageserver/page_api/src/model.rs @@ -182,33 +182,28 @@ impl From for proto::CheckRelExistsResponse { } } -/// Requests a base backup at a given LSN. +/// Requests a base backup. #[derive(Clone, Copy, Debug)] pub struct GetBaseBackupRequest { - /// The LSN to fetch a base backup at. - pub read_lsn: ReadLsn, + /// The LSN to fetch a base backup at. If None, uses the latest LSN known to the Pageserver. + pub lsn: Option, /// If true, logical replication slots will not be created. pub replica: bool, } -impl TryFrom for GetBaseBackupRequest { - type Error = ProtocolError; - - fn try_from(pb: proto::GetBaseBackupRequest) -> Result { - Ok(Self { - read_lsn: pb - .read_lsn - .ok_or(ProtocolError::Missing("read_lsn"))? 
- .try_into()?, +impl From for GetBaseBackupRequest { + fn from(pb: proto::GetBaseBackupRequest) -> Self { + Self { + lsn: (pb.lsn != 0).then_some(Lsn(pb.lsn)), replica: pb.replica, - }) + } } } impl From for proto::GetBaseBackupRequest { fn from(request: GetBaseBackupRequest) -> Self { Self { - read_lsn: Some(request.read_lsn.into()), + lsn: request.lsn.unwrap_or_default().0, replica: request.replica, } } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index d47f6bd095..0521f5c556 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -14,7 +14,7 @@ use std::{io, str}; use anyhow::{Context as _, anyhow, bail}; use async_compression::tokio::write::GzipEncoder; -use bytes::{Buf, BytesMut}; +use bytes::{Buf as _, BufMut as _, BytesMut}; use futures::future::BoxFuture; use futures::{FutureExt, Stream}; use itertools::Itertools; @@ -3601,42 +3601,44 @@ impl proto::PageService for GrpcPageServiceHandler { let timeline = self.get_request_timeline(&req).await?; let ctx = self.ctx.with_scope_timeline(&timeline); - // Validate the request, decorate the span, and wait for the LSN to arrive. - // - // TODO: this requires a read LSN, is that ok? + // Validate the request and decorate the span. Self::ensure_shard_zero(&timeline)?; if timeline.is_archived() == Some(true) { return Err(tonic::Status::failed_precondition("timeline is archived")); } - let req: page_api::GetBaseBackupRequest = req.into_inner().try_into()?; + let req: page_api::GetBaseBackupRequest = req.into_inner().into(); - span_record!(lsn=%req.read_lsn); + span_record!(lsn=?req.lsn); - let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); - timeline - .wait_lsn( - req.read_lsn.request_lsn, - WaitLsnWaiter::PageService, - WaitLsnTimeout::Default, - &ctx, - ) - .await?; - timeline - .check_lsn_is_in_scope(req.read_lsn.request_lsn, &latest_gc_cutoff_lsn) - .map_err(|err| { - tonic::Status::invalid_argument(format!("invalid basebackup LSN: {err}")) - })?; + // Wait for the LSN to arrive, if given. + if let Some(lsn) = req.lsn { + let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); + timeline + .wait_lsn( + lsn, + WaitLsnWaiter::PageService, + WaitLsnTimeout::Default, + &ctx, + ) + .await?; + timeline + .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn) + .map_err(|err| { + tonic::Status::invalid_argument(format!("invalid basebackup LSN: {err}")) + })?; + } // Spawn a task to run the basebackup. // - // TODO: do we need to support full base backups, for debugging? + // TODO: do we need to support full base backups, for debugging? This also requires passing + // the prev_lsn parameter. let span = Span::current(); let (mut simplex_read, mut simplex_write) = tokio::io::simplex(CHUNK_SIZE); let jh = tokio::spawn(async move { let result = basebackup::send_basebackup_tarball( &mut simplex_write, &timeline, - Some(req.read_lsn.request_lsn), + req.lsn, None, false, req.replica, @@ -3652,20 +3654,21 @@ impl proto::PageService for GrpcPageServiceHandler { // Emit chunks of size CHUNK_SIZE. let chunks = async_stream::try_stream! { - let mut chunk = BytesMut::with_capacity(CHUNK_SIZE); loop { - let n = simplex_read.read_buf(&mut chunk).await.map_err(|err| { - tonic::Status::internal(format!("failed to read basebackup chunk: {err}")) - })?; - - // If we read 0 bytes, either the chunk is full or the stream is closed. 
- if n == 0 { - if chunk.is_empty() { - break; + let mut chunk = BytesMut::with_capacity(CHUNK_SIZE).limit(CHUNK_SIZE); + loop { + let n = simplex_read.read_buf(&mut chunk).await.map_err(|err| { + tonic::Status::internal(format!("failed to read basebackup chunk: {err}")) + })?; + if n == 0 { + break; // full chunk or closed stream } - yield proto::GetBaseBackupResponseChunk::from(chunk.clone().freeze()); - chunk.clear(); } + let chunk = chunk.into_inner().freeze(); + if chunk.is_empty() { + break; + } + yield proto::GetBaseBackupResponseChunk::from(chunk); } // Wait for the basebackup task to exit and check for errors. jh.await.map_err(|err| { From 48052477b471aaeff1b01d2af62cce40aa02b1d0 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 17 Jun 2025 06:27:10 -0700 Subject: [PATCH 055/124] storcon: register Pageserver gRPC address (#12268) ## Problem Pageservers now expose a gRPC API on a separate address and port. This must be registered with the storage controller such that it can be plumbed through to the compute via cplane. Touches #11926. ## Summary of changes This patch registers the gRPC address and port with the storage controller: * Add gRPC address to `nodes` database table and `NodePersistence`, with a Diesel migration. * Add gRPC address in `NodeMetadata`, `NodeRegisterRequest`, `NodeDescribeResponse`, and `TenantLocateResponseShard`. * Add gRPC address flags to `storcon_cli node-register`. These changes are backwards-compatible, since all structs will ignore unknown fields during deserialization. --- control_plane/src/pageserver.rs | 17 ++++++++-- control_plane/storcon_cli/src/main.rs | 8 +++++ libs/pageserver_api/src/config.rs | 23 +++++++++++-- libs/pageserver_api/src/config/tests.rs | 31 +++++++++++++++++ libs/pageserver_api/src/controller_api.rs | 6 ++++ pageserver/src/controller_upcall_client.rs | 11 ++---- .../down.sql | 1 + .../up.sql | 1 + storage_controller/src/node.rs | 34 ++++++++++++++++--- storage_controller/src/persistence.rs | 2 ++ storage_controller/src/scheduler.rs | 2 ++ storage_controller/src/schema.rs | 2 ++ storage_controller/src/service.rs | 10 ++++++ 13 files changed, 131 insertions(+), 17 deletions(-) create mode 100644 storage_controller/migrations/2025-06-17-082247_pageserver_grpc_addr/down.sql create mode 100644 storage_controller/migrations/2025-06-17-082247_pageserver_grpc_addr/up.sql diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 3b7c4ec39f..a683d2daec 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -16,6 +16,7 @@ use std::time::Duration; use anyhow::{Context, bail}; use camino::Utf8PathBuf; +use pageserver_api::config::{DEFAULT_GRPC_LISTEN_PORT, DEFAULT_HTTP_LISTEN_PORT}; use pageserver_api::models::{self, TenantInfo, TimelineInfo}; use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; @@ -252,9 +253,10 @@ impl PageServerNode { // the storage controller let metadata_path = datadir.join("metadata.json"); - let (_http_host, http_port) = + let http_host = "localhost".to_string(); + let (_, http_port) = parse_host_port(&self.conf.listen_http_addr).expect("Unable to parse listen_http_addr"); - let http_port = http_port.unwrap_or(9898); + let http_port = http_port.unwrap_or(DEFAULT_HTTP_LISTEN_PORT); let https_port = match self.conf.listen_https_addr.as_ref() { Some(https_addr) => { @@ -265,6 +267,13 @@ impl PageServerNode { None => None, }; + let (mut grpc_host, mut grpc_port) = (None, None); + if let Some(grpc_addr) = &self.conf.listen_grpc_addr { + 
let (_, port) = parse_host_port(grpc_addr).expect("Unable to parse listen_grpc_addr"); + grpc_host = Some("localhost".to_string()); + grpc_port = Some(port.unwrap_or(DEFAULT_GRPC_LISTEN_PORT)); + } + // Intentionally hand-craft JSON: this acts as an implicit format compat test // in case the pageserver-side structure is edited, and reflects the real life // situation: the metadata is written by some other script. @@ -273,7 +282,9 @@ impl PageServerNode { serde_json::to_vec(&pageserver_api::config::NodeMetadata { postgres_host: "localhost".to_string(), postgres_port: self.pg_connection_config.port(), - http_host: "localhost".to_string(), + grpc_host, + grpc_port, + http_host, http_port, https_port, other: HashMap::from([( diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 1a9e944e07..507190b1e0 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -36,6 +36,10 @@ enum Command { listen_pg_addr: String, #[arg(long)] listen_pg_port: u16, + #[arg(long)] + listen_grpc_addr: Option, + #[arg(long)] + listen_grpc_port: Option, #[arg(long)] listen_http_addr: String, @@ -418,6 +422,8 @@ async fn main() -> anyhow::Result<()> { node_id, listen_pg_addr, listen_pg_port, + listen_grpc_addr, + listen_grpc_port, listen_http_addr, listen_http_port, listen_https_port, @@ -431,6 +437,8 @@ async fn main() -> anyhow::Result<()> { node_id, listen_pg_addr, listen_pg_port, + listen_grpc_addr, + listen_grpc_port, listen_http_addr, listen_http_port, listen_https_port, diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 30b0612082..f6001d68c4 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -12,6 +12,7 @@ pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LI pub const DEFAULT_GRPC_LISTEN_PORT: u16 = 51051; // storage-broker already uses 50051 use std::collections::HashMap; +use std::fmt::Display; use std::num::{NonZeroU64, NonZeroUsize}; use std::str::FromStr; use std::time::Duration; @@ -24,16 +25,17 @@ use utils::logging::LogFormat; use crate::models::{ImageCompressionAlgorithm, LsnLease}; // Certain metadata (e.g. externally-addressable name, AZ) is delivered -// as a separate structure. This information is not neeed by the pageserver +// as a separate structure. This information is not needed by the pageserver // itself, it is only used for registering the pageserver with the control // plane and/or storage controller. -// #[derive(PartialEq, Eq, Debug, serde::Serialize, serde::Deserialize)] pub struct NodeMetadata { #[serde(rename = "host")] pub postgres_host: String, #[serde(rename = "port")] pub postgres_port: u16, + pub grpc_host: Option, + pub grpc_port: Option, pub http_host: String, pub http_port: u16, pub https_port: Option, @@ -44,6 +46,23 @@ pub struct NodeMetadata { pub other: HashMap, } +impl Display for NodeMetadata { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "postgresql://{}:{} ", + self.postgres_host, self.postgres_port + )?; + if let Some(grpc_host) = &self.grpc_host { + let grpc_port = self.grpc_port.unwrap_or_default(); + write!(f, "grpc://{grpc_host}:{grpc_port} ")?; + } + write!(f, "http://{}:{} ", self.http_host, self.http_port)?; + write!(f, "other:{:?}", self.other)?; + Ok(()) + } +} + /// PostHog integration config. 
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub struct PostHogConfig { diff --git a/libs/pageserver_api/src/config/tests.rs b/libs/pageserver_api/src/config/tests.rs index 9e61873273..7137df969a 100644 --- a/libs/pageserver_api/src/config/tests.rs +++ b/libs/pageserver_api/src/config/tests.rs @@ -14,6 +14,8 @@ fn test_node_metadata_v1_backward_compatibilty() { NodeMetadata { postgres_host: "localhost".to_string(), postgres_port: 23, + grpc_host: None, + grpc_port: None, http_host: "localhost".to_string(), http_port: 42, https_port: None, @@ -37,6 +39,35 @@ fn test_node_metadata_v2_backward_compatibilty() { NodeMetadata { postgres_host: "localhost".to_string(), postgres_port: 23, + grpc_host: None, + grpc_port: None, + http_host: "localhost".to_string(), + http_port: 42, + https_port: Some(123), + other: HashMap::new(), + } + ) +} + +#[test] +fn test_node_metadata_v3_backward_compatibilty() { + let v3 = serde_json::to_vec(&serde_json::json!({ + "host": "localhost", + "port": 23, + "grpc_host": "localhost", + "grpc_port": 51, + "http_host": "localhost", + "http_port": 42, + "https_port": 123, + })); + + assert_eq!( + serde_json::from_slice::(&v3.unwrap()).unwrap(), + NodeMetadata { + postgres_host: "localhost".to_string(), + postgres_port: 23, + grpc_host: Some("localhost".to_string()), + grpc_port: Some(51), http_host: "localhost".to_string(), http_port: 42, https_port: Some(123), diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index ae792cc81c..55495dd68e 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -52,6 +52,8 @@ pub struct NodeRegisterRequest { pub listen_pg_addr: String, pub listen_pg_port: u16, + pub listen_grpc_addr: Option, + pub listen_grpc_port: Option, pub listen_http_addr: String, pub listen_http_port: u16, @@ -101,6 +103,8 @@ pub struct TenantLocateResponseShard { pub listen_pg_addr: String, pub listen_pg_port: u16, + pub listen_grpc_addr: Option, + pub listen_grpc_port: Option, pub listen_http_addr: String, pub listen_http_port: u16, @@ -152,6 +156,8 @@ pub struct NodeDescribeResponse { pub listen_pg_addr: String, pub listen_pg_port: u16, + pub listen_grpc_addr: Option, + pub listen_grpc_port: Option, } #[derive(Serialize, Deserialize, Debug)] diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs index dc38ea616c..f1f9aaf43c 100644 --- a/pageserver/src/controller_upcall_client.rs +++ b/pageserver/src/controller_upcall_client.rs @@ -159,14 +159,7 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient { Ok(m) => { // Since we run one time at startup, be generous in our logging and // dump all metadata. 
- tracing::info!( - "Loaded node metadata: postgres {}:{}, http {}:{}, other fields: {:?}", - m.postgres_host, - m.postgres_port, - m.http_host, - m.http_port, - m.other - ); + tracing::info!("Loaded node metadata: {m}"); let az_id = { let az_id_from_metadata = m @@ -195,6 +188,8 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient { node_id: conf.id, listen_pg_addr: m.postgres_host, listen_pg_port: m.postgres_port, + listen_grpc_addr: m.grpc_host, + listen_grpc_port: m.grpc_port, listen_http_addr: m.http_host, listen_http_port: m.http_port, listen_https_port: m.https_port, diff --git a/storage_controller/migrations/2025-06-17-082247_pageserver_grpc_addr/down.sql b/storage_controller/migrations/2025-06-17-082247_pageserver_grpc_addr/down.sql new file mode 100644 index 0000000000..f9f2ebb070 --- /dev/null +++ b/storage_controller/migrations/2025-06-17-082247_pageserver_grpc_addr/down.sql @@ -0,0 +1 @@ +ALTER TABLE nodes DROP listen_grpc_addr, listen_grpc_port; diff --git a/storage_controller/migrations/2025-06-17-082247_pageserver_grpc_addr/up.sql b/storage_controller/migrations/2025-06-17-082247_pageserver_grpc_addr/up.sql new file mode 100644 index 0000000000..8291864b16 --- /dev/null +++ b/storage_controller/migrations/2025-06-17-082247_pageserver_grpc_addr/up.sql @@ -0,0 +1 @@ +ALTER TABLE nodes ADD listen_grpc_addr VARCHAR NULL, ADD listen_grpc_port INTEGER NULL; diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index 8e0f1873e5..cba007d75f 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -37,6 +37,8 @@ pub(crate) struct Node { listen_pg_addr: String, listen_pg_port: u16, + listen_grpc_addr: Option, + listen_grpc_port: Option, availability_zone_id: AvailabilityZone, @@ -100,8 +102,8 @@ impl Node { self.id == register_req.node_id && self.listen_http_addr == register_req.listen_http_addr && self.listen_http_port == register_req.listen_http_port - // Note: listen_https_port may change. See [`Self::need_update`] for mode details. - // && self.listen_https_port == register_req.listen_https_port + // Note: HTTPS and gRPC addresses may change, to allow for migrations. See + // [`Self::need_update`] for more details. && self.listen_pg_addr == register_req.listen_pg_addr && self.listen_pg_port == register_req.listen_pg_port && self.availability_zone_id == register_req.availability_zone_id @@ -109,9 +111,10 @@ impl Node { // Do we need to update an existing record in DB on this registration request? pub(crate) fn need_update(&self, register_req: &NodeRegisterRequest) -> bool { - // listen_https_port is checked here because it may change during migration to https. - // After migration, this check may be moved to registration_match. + // These are checked here, since they may change before we're fully migrated. 
self.listen_https_port != register_req.listen_https_port + || self.listen_grpc_addr != register_req.listen_grpc_addr + || self.listen_grpc_port != register_req.listen_grpc_port } /// For a shard located on this node, populate a response object @@ -125,6 +128,8 @@ impl Node { listen_https_port: self.listen_https_port, listen_pg_addr: self.listen_pg_addr.clone(), listen_pg_port: self.listen_pg_port, + listen_grpc_addr: self.listen_grpc_addr.clone(), + listen_grpc_port: self.listen_grpc_port, } } @@ -211,6 +216,8 @@ impl Node { listen_https_port: Option, listen_pg_addr: String, listen_pg_port: u16, + listen_grpc_addr: Option, + listen_grpc_port: Option, availability_zone_id: AvailabilityZone, use_https: bool, ) -> anyhow::Result { @@ -221,6 +228,10 @@ impl Node { ); } + if listen_grpc_addr.is_some() != listen_grpc_port.is_some() { + anyhow::bail!("cannot create node {id}: must specify both gRPC address and port"); + } + Ok(Self { id, listen_http_addr, @@ -228,6 +239,8 @@ impl Node { listen_https_port, listen_pg_addr, listen_pg_port, + listen_grpc_addr, + listen_grpc_port, scheduling: NodeSchedulingPolicy::Active, lifecycle: NodeLifecycle::Active, availability: NodeAvailability::Offline, @@ -247,6 +260,8 @@ impl Node { listen_https_port: self.listen_https_port.map(|x| x as i32), listen_pg_addr: self.listen_pg_addr.clone(), listen_pg_port: self.listen_pg_port as i32, + listen_grpc_addr: self.listen_grpc_addr.clone(), + listen_grpc_port: self.listen_grpc_port.map(|port| port as i32), availability_zone_id: self.availability_zone_id.0.clone(), } } @@ -260,6 +275,13 @@ impl Node { ); } + if np.listen_grpc_addr.is_some() != np.listen_grpc_port.is_some() { + anyhow::bail!( + "can't load node {}: must specify both gRPC address and port", + np.node_id + ); + } + Ok(Self { id: NodeId(np.node_id as u64), // At startup we consider a node offline until proven otherwise. @@ -272,6 +294,8 @@ impl Node { listen_https_port: np.listen_https_port.map(|x| x as u16), listen_pg_addr: np.listen_pg_addr, listen_pg_port: np.listen_pg_port as u16, + listen_grpc_addr: np.listen_grpc_addr, + listen_grpc_port: np.listen_grpc_port.map(|port| port as u16), availability_zone_id: AvailabilityZone(np.availability_zone_id), use_https, cancel: CancellationToken::new(), @@ -361,6 +385,8 @@ impl Node { listen_https_port: self.listen_https_port, listen_pg_addr: self.listen_pg_addr.clone(), listen_pg_port: self.listen_pg_port, + listen_grpc_addr: self.listen_grpc_addr.clone(), + listen_grpc_port: self.listen_grpc_port, } } } diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 2edfe3a338..4300dd32a9 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -2125,6 +2125,8 @@ pub(crate) struct NodePersistence { pub(crate) availability_zone_id: String, pub(crate) listen_https_port: Option, pub(crate) lifecycle: String, + pub(crate) listen_grpc_addr: Option, + pub(crate) listen_grpc_port: Option, } /// Tenant metadata health status that are stored durably. 
diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 773373391e..3fa25443da 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -945,6 +945,8 @@ pub(crate) mod test_utils { None, format!("pghost-{i}"), 5432 + i as u16, + Some(format!("grpchost-{i}")), + Some(51051 + i as u16), az_iter .next() .cloned() diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs index f5807cfcd2..312f7e0b0e 100644 --- a/storage_controller/src/schema.rs +++ b/storage_controller/src/schema.rs @@ -34,6 +34,8 @@ diesel::table! { availability_zone_id -> Varchar, listen_https_port -> Nullable, lifecycle -> Varchar, + listen_grpc_addr -> Nullable, + listen_grpc_port -> Nullable, } } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 8800d792f2..55f91cea09 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1683,6 +1683,8 @@ impl Service { None, "".to_string(), 123, + None, + None, AvailabilityZone("test_az".to_string()), false, ) @@ -7254,6 +7256,12 @@ impl Service { )); } + if register_req.listen_grpc_addr.is_some() != register_req.listen_grpc_port.is_some() { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "must specify both gRPC address and port" + ))); + } + // Ordering: we must persist the new node _before_ adding it to in-memory state. // This ensures that before we use it for anything or expose it via any external // API, it is guaranteed to be available after a restart. @@ -7264,6 +7272,8 @@ impl Service { register_req.listen_https_port, register_req.listen_pg_addr, register_req.listen_pg_port, + register_req.listen_grpc_addr, + register_req.listen_grpc_port, register_req.availability_zone_id.clone(), self.config.use_https_pageserver_api, ); From 4a8f3508f98053fb485160dba40efdce63c4d19e Mon Sep 17 00:00:00 2001 From: Aleksandr Sarantsev <99037063+ephemeralsad@users.noreply.github.com> Date: Tue, 17 Jun 2025 17:33:01 +0400 Subject: [PATCH 056/124] storcon: Add safekeeper request label group (#12239) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem The metrics `storage_controller_safekeeper_request_error` and `storage_controller_safekeeper_request_latency` currently use `pageserver_id` as a label. This can be misleading, as the metrics are about safekeeper requests. We want to replace this with a more accurate label — either `safekeeper_id` or `node_id`. ## Summary of changes - Introduced `SafekeeperRequestLabelGroup` with `safekeeper_id`. - Updated the affected metrics to use the new label group. - Fixed incorrect metric usage in safekeeper_client.rs ## Follow-up - Review usage of these metrics in alerting rules and existing Grafana dashboards to ensure this change does not break something. 
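For context, the sketch below shows roughly what a `measured_request!` call site does for a safekeeper request once the new label group is in place: build labels keyed by `safekeeper_id`, invoke the request, then record latency and any error under those same labels. All names here are illustrative stand-ins; only the label fields mirror `SafekeeperRequestLabelGroup`.

```rust
// Illustrative sketch only: hypothetical helpers stand in for the real
// `measured` registry handles used by the storage controller.
use std::future::Future;
use std::time::{Duration, Instant};

struct SafekeeperRequestLabels<'a> {
    safekeeper_id: &'a str,
    path: &'a str,
    method: &'a str,
}

// Stand-ins for storage_controller_safekeeper_request_{latency,error}.
fn observe_latency(_labels: &SafekeeperRequestLabels<'_>, _elapsed: Duration) {}
fn count_error(_labels: &SafekeeperRequestLabels<'_>) {}

async fn measured_safekeeper_request<T, E, F>(
    labels: SafekeeperRequestLabels<'_>,
    invoke: F,
) -> Result<T, E>
where
    F: Future<Output = Result<T, E>>,
{
    let started = Instant::now();
    let res = invoke.await;
    observe_latency(&labels, started.elapsed());
    if res.is_err() {
        count_error(&labels);
    }
    res
}
```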
--- storage_controller/src/metrics.rs | 14 ++++++++++++-- storage_controller/src/safekeeper_client.rs | 8 ++++---- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index 5b27e01afd..07713c3fbc 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -97,7 +97,7 @@ pub(crate) struct StorageControllerMetricGroup { /// Count of HTTP requests to the safekeeper that resulted in an error, /// broken down by the safekeeper node id, request name and method pub(crate) storage_controller_safekeeper_request_error: - measured::CounterVec, + measured::CounterVec, /// Latency of HTTP requests to the pageserver, broken down by pageserver /// node id, request name and method. This include both successful and unsuccessful @@ -111,7 +111,7 @@ pub(crate) struct StorageControllerMetricGroup { /// requests. #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] pub(crate) storage_controller_safekeeper_request_latency: - measured::HistogramVec, + measured::HistogramVec, /// Count of pass-through HTTP requests to the pageserver that resulted in an error, /// broken down by the pageserver node id, request name and method @@ -219,6 +219,16 @@ pub(crate) struct PageserverRequestLabelGroup<'a> { pub(crate) method: Method, } +#[derive(measured::LabelGroup, Clone)] +#[label(set = SafekeeperRequestLabelGroupSet)] +pub(crate) struct SafekeeperRequestLabelGroup<'a> { + #[label(dynamic_with = lasso::ThreadedRodeo, default)] + pub(crate) safekeeper_id: &'a str, + #[label(dynamic_with = lasso::ThreadedRodeo, default)] + pub(crate) path: &'a str, + pub(crate) method: Method, +} + #[derive(measured::LabelGroup)] #[label(set = DatabaseQueryErrorLabelGroupSet)] pub(crate) struct DatabaseQueryErrorLabelGroup { diff --git a/storage_controller/src/safekeeper_client.rs b/storage_controller/src/safekeeper_client.rs index 1f3ea96d96..bcf223c731 100644 --- a/storage_controller/src/safekeeper_client.rs +++ b/storage_controller/src/safekeeper_client.rs @@ -5,7 +5,7 @@ use safekeeper_client::mgmt_api::{Client, Result}; use utils::id::{NodeId, TenantId, TimelineId}; use utils::logging::SecretString; -use crate::metrics::PageserverRequestLabelGroup; +use crate::metrics::SafekeeperRequestLabelGroup; /// Thin wrapper around [`safekeeper_client::mgmt_api::Client`]. It allows the storage /// controller to collect metrics in a non-intrusive manner. @@ -19,8 +19,8 @@ pub(crate) struct SafekeeperClient { macro_rules! measured_request { ($name:literal, $method:expr, $node_id: expr, $invoke:expr) => {{ - let labels = PageserverRequestLabelGroup { - pageserver_id: $node_id, + let labels = SafekeeperRequestLabelGroup { + safekeeper_id: $node_id, path: $name, method: $method, }; @@ -35,7 +35,7 @@ macro_rules! measured_request { if res.is_err() { let error_counters = &crate::metrics::METRICS_REGISTRY .metrics_group - .storage_controller_pageserver_request_error; + .storage_controller_safekeeper_request_error; error_counters.inc(labels) } From edf51688bc6a6f6389ea9537e57cbff73a98424d Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 17 Jun 2025 07:39:42 -0700 Subject: [PATCH 057/124] neon_local: support gRPC connstrings for endpoints (#12271) ## Problem `neon_local` should support endpoints using gRPC, by providing `grpc://` connstrings with the Pageservers' gRPC ports. Requires #12268. Touches #11926. ## Summary of changes * Add `--grpc` switch for `neon_local endpoint create`. 
* Generate `grpc://` connstrings for endpoints when enabled. Computes don't actually support `grpc://` connstrings yet, but will soon. gRPC is configured when the endpoint is created, not when it's started, such that it continues to use gRPC across restarts and reconfigurations. In particular, this is necessary for the storage controller's local notify hook, which can't easily plumb through gRPC configuration from the start/reconfigure commands but has access to the endpoint's configuration. --- control_plane/src/bin/neon_local.rs | 89 +++++++++++++++++++------- control_plane/src/endpoint.rs | 62 +++++++++++------- storage_controller/src/compute_hook.rs | 39 ++++++----- 3 files changed, 128 insertions(+), 62 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 76e33e4bff..01ca28fce0 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -18,7 +18,7 @@ use clap::Parser; use compute_api::requests::ComputeClaimsScope; use compute_api::spec::ComputeMode; use control_plane::broker::StorageBroker; -use control_plane::endpoint::ComputeControlPlane; +use control_plane::endpoint::{ComputeControlPlane, PageserverProtocol}; use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_ADDR, EndpointStorage}; use control_plane::local_env; use control_plane::local_env::{ @@ -605,6 +605,14 @@ struct EndpointCreateCmdArgs { #[clap(long, help = "Postgres version")] pg_version: u32, + /// Use gRPC to communicate with Pageservers, by generating grpc:// connstrings. + /// + /// Specified on creation such that it's retained across reconfiguration and restarts. + /// + /// NB: not yet supported by computes. + #[clap(long)] + grpc: bool, + #[clap( long, help = "If set, the node will be a hot replica on the specified timeline", @@ -1451,6 +1459,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res args.internal_http_port, args.pg_version, mode, + args.grpc, !args.update_catalog, false, )?; @@ -1491,13 +1500,20 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id { let conf = env.get_pageserver_conf(pageserver_id).unwrap(); - let parsed = parse_host_port(&conf.listen_pg_addr).expect("Bad config"); - ( - vec![(parsed.0, parsed.1.unwrap_or(5432))], - // If caller is telling us what pageserver to use, this is not a tenant which is - // full managed by storage controller, therefore not sharded. - DEFAULT_STRIPE_SIZE, - ) + // Use gRPC if requested. + let pageserver = if endpoint.grpc { + let grpc_addr = conf.listen_grpc_addr.as_ref().expect("bad config"); + let (host, port) = parse_host_port(grpc_addr)?; + let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT); + (PageserverProtocol::Grpc, host, port) + } else { + let (host, port) = parse_host_port(&conf.listen_pg_addr)?; + let port = port.unwrap_or(5432); + (PageserverProtocol::Libpq, host, port) + }; + // If caller is telling us what pageserver to use, this is not a tenant which is + // fully managed by storage controller, therefore not sharded. + (vec![pageserver], DEFAULT_STRIPE_SIZE) } else { // Look up the currently attached location of the tenant, and its striping metadata, // to pass these on to postgres. 
@@ -1516,11 +1532,20 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res .await?; } - anyhow::Ok(( - Host::parse(&shard.listen_pg_addr) - .expect("Storage controller reported bad hostname"), - shard.listen_pg_port, - )) + let pageserver = if endpoint.grpc { + ( + PageserverProtocol::Grpc, + Host::parse(&shard.listen_grpc_addr.expect("no gRPC address"))?, + shard.listen_grpc_port.expect("no gRPC port"), + ) + } else { + ( + PageserverProtocol::Libpq, + Host::parse(&shard.listen_pg_addr)?, + shard.listen_pg_port, + ) + }; + anyhow::Ok(pageserver) }), ) .await?; @@ -1575,11 +1600,19 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res .get(endpoint_id.as_str()) .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?; let pageservers = if let Some(ps_id) = args.endpoint_pageserver_id { - let pageserver = PageServerNode::from_env(env, env.get_pageserver_conf(ps_id)?); - vec![( - pageserver.pg_connection_config.host().clone(), - pageserver.pg_connection_config.port(), - )] + let conf = env.get_pageserver_conf(ps_id)?; + // Use gRPC if requested. + let pageserver = if endpoint.grpc { + let grpc_addr = conf.listen_grpc_addr.as_ref().expect("bad config"); + let (host, port) = parse_host_port(grpc_addr)?; + let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT); + (PageserverProtocol::Grpc, host, port) + } else { + let (host, port) = parse_host_port(&conf.listen_pg_addr)?; + let port = port.unwrap_or(5432); + (PageserverProtocol::Libpq, host, port) + }; + vec![pageserver] } else { let storage_controller = StorageController::from_env(env); storage_controller @@ -1588,11 +1621,21 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res .shards .into_iter() .map(|shard| { - ( - Host::parse(&shard.listen_pg_addr) - .expect("Storage controller reported malformed host"), - shard.listen_pg_port, - ) + // Use gRPC if requested. + if endpoint.grpc { + ( + PageserverProtocol::Grpc, + Host::parse(&shard.listen_grpc_addr.expect("no gRPC address")) + .expect("bad hostname"), + shard.listen_grpc_port.expect("no gRPC port"), + ) + } else { + ( + PageserverProtocol::Libpq, + Host::parse(&shard.listen_pg_addr).expect("bad hostname"), + shard.listen_pg_port, + ) + } }) .collect::>() }; diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index bc3a3a4e38..03156c1809 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -37,6 +37,7 @@ //! ``` //! use std::collections::BTreeMap; +use std::fmt::Display; use std::net::{IpAddr, Ipv4Addr, SocketAddr, TcpStream}; use std::path::PathBuf; use std::process::Command; @@ -76,7 +77,6 @@ use utils::id::{NodeId, TenantId, TimelineId}; use crate::local_env::LocalEnv; use crate::postgresql_conf::PostgresConf; -use crate::storage_controller::StorageController; // contents of a endpoint.json file #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] @@ -89,6 +89,7 @@ pub struct EndpointConf { external_http_port: u16, internal_http_port: u16, pg_version: u32, + grpc: bool, skip_pg_catalog_updates: bool, reconfigure_concurrency: usize, drop_subscriptions_before_start: bool, @@ -192,6 +193,7 @@ impl ComputeControlPlane { internal_http_port: Option, pg_version: u32, mode: ComputeMode, + grpc: bool, skip_pg_catalog_updates: bool, drop_subscriptions_before_start: bool, ) -> Result> { @@ -226,6 +228,7 @@ impl ComputeControlPlane { // we also skip catalog updates in the cloud. 
skip_pg_catalog_updates, drop_subscriptions_before_start, + grpc, reconfigure_concurrency: 1, features: vec![], cluster: None, @@ -244,6 +247,7 @@ impl ComputeControlPlane { internal_http_port, pg_port, pg_version, + grpc, skip_pg_catalog_updates, drop_subscriptions_before_start, reconfigure_concurrency: 1, @@ -298,6 +302,8 @@ pub struct Endpoint { pub tenant_id: TenantId, pub timeline_id: TimelineId, pub mode: ComputeMode, + /// If true, the endpoint should use gRPC to communicate with Pageservers. + pub grpc: bool, // port and address of the Postgres server and `compute_ctl`'s HTTP APIs pub pg_address: SocketAddr, @@ -333,7 +339,7 @@ pub enum EndpointStatus { RunningNoPidfile, } -impl std::fmt::Display for EndpointStatus { +impl Display for EndpointStatus { fn fmt(&self, writer: &mut std::fmt::Formatter) -> std::fmt::Result { let s = match self { Self::Running => "running", @@ -345,6 +351,29 @@ impl std::fmt::Display for EndpointStatus { } } +/// Protocol used to connect to a Pageserver. +#[derive(Clone, Copy, Debug)] +pub enum PageserverProtocol { + Libpq, + Grpc, +} + +impl PageserverProtocol { + /// Returns the URL scheme for the protocol, used in connstrings. + pub fn scheme(&self) -> &'static str { + match self { + Self::Libpq => "postgresql", + Self::Grpc => "grpc", + } + } +} + +impl Display for PageserverProtocol { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(self.scheme()) + } +} + impl Endpoint { fn from_dir_entry(entry: std::fs::DirEntry, env: &LocalEnv) -> Result { if !entry.file_type()?.is_dir() { @@ -380,6 +409,7 @@ impl Endpoint { mode: conf.mode, tenant_id: conf.tenant_id, pg_version: conf.pg_version, + grpc: conf.grpc, skip_pg_catalog_updates: conf.skip_pg_catalog_updates, reconfigure_concurrency: conf.reconfigure_concurrency, drop_subscriptions_before_start: conf.drop_subscriptions_before_start, @@ -608,10 +638,10 @@ impl Endpoint { } } - fn build_pageserver_connstr(pageservers: &[(Host, u16)]) -> String { + fn build_pageserver_connstr(pageservers: &[(PageserverProtocol, Host, u16)]) -> String { pageservers .iter() - .map(|(host, port)| format!("postgresql://no_user@{host}:{port}")) + .map(|(scheme, host, port)| format!("{scheme}://no_user@{host}:{port}")) .collect::>() .join(",") } @@ -656,7 +686,7 @@ impl Endpoint { endpoint_storage_addr: String, safekeepers_generation: Option, safekeepers: Vec, - pageservers: Vec<(Host, u16)>, + pageservers: Vec<(PageserverProtocol, Host, u16)>, remote_ext_base_url: Option<&String>, shard_stripe_size: usize, create_test_user: bool, @@ -941,10 +971,12 @@ impl Endpoint { pub async fn reconfigure( &self, - mut pageservers: Vec<(Host, u16)>, + pageservers: Vec<(PageserverProtocol, Host, u16)>, stripe_size: Option, safekeepers: Option>, ) -> Result<()> { + anyhow::ensure!(!pageservers.is_empty(), "no pageservers provided"); + let (mut spec, compute_ctl_config) = { let config_path = self.endpoint_path().join("config.json"); let file = std::fs::File::open(config_path)?; @@ -956,25 +988,7 @@ impl Endpoint { let postgresql_conf = self.read_postgresql_conf()?; spec.cluster.postgresql_conf = Some(postgresql_conf); - // If we weren't given explicit pageservers, query the storage controller - if pageservers.is_empty() { - let storage_controller = StorageController::from_env(&self.env); - let locate_result = storage_controller.tenant_locate(self.tenant_id).await?; - pageservers = locate_result - .shards - .into_iter() - .map(|shard| { - ( - Host::parse(&shard.listen_pg_addr) - .expect("Storage controller 
reported bad hostname"), - shard.listen_pg_port, - ) - }) - .collect::>(); - } - let pageserver_connstr = Self::build_pageserver_connstr(&pageservers); - assert!(!pageserver_connstr.is_empty()); spec.pageserver_connstring = Some(pageserver_connstr); if stripe_size.is_some() { spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize); diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index 57709302e1..a4482a4dac 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -5,10 +5,11 @@ use std::sync::Arc; use std::time::Duration; use anyhow::Context; -use control_plane::endpoint::{ComputeControlPlane, EndpointStatus}; +use control_plane::endpoint::{ComputeControlPlane, EndpointStatus, PageserverProtocol}; use control_plane::local_env::LocalEnv; use futures::StreamExt; use hyper::StatusCode; +use pageserver_api::config::DEFAULT_GRPC_LISTEN_PORT; use pageserver_api::controller_api::AvailabilityZone; use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShardId}; use postgres_connection::parse_host_port; @@ -420,23 +421,31 @@ impl ComputeHook { preferred_az: _preferred_az, } = reconfigure_request; - let compute_pageservers = shards - .iter() - .map(|shard| { - let ps_conf = env - .get_pageserver_conf(shard.node_id) - .expect("Unknown pageserver"); - let (pg_host, pg_port) = parse_host_port(&ps_conf.listen_pg_addr) - .expect("Unable to parse listen_pg_addr"); - (pg_host, pg_port.unwrap_or(5432)) - }) - .collect::>(); - for (endpoint_name, endpoint) in &cplane.endpoints { if endpoint.tenant_id == *tenant_id && endpoint.status() == EndpointStatus::Running { - tracing::info!("Reconfiguring endpoint {}", endpoint_name,); + tracing::info!("Reconfiguring endpoint {endpoint_name}"); + + let pageservers = shards + .iter() + .map(|shard| { + let ps_conf = env + .get_pageserver_conf(shard.node_id) + .expect("Unknown pageserver"); + if endpoint.grpc { + let addr = ps_conf.listen_grpc_addr.as_ref().expect("no gRPC address"); + let (host, port) = parse_host_port(addr).expect("invalid gRPC address"); + let port = port.unwrap_or(DEFAULT_GRPC_LISTEN_PORT); + (PageserverProtocol::Grpc, host, port) + } else { + let (host, port) = parse_host_port(&ps_conf.listen_pg_addr) + .expect("Unable to parse listen_pg_addr"); + (PageserverProtocol::Libpq, host, port.unwrap_or(5432)) + } + }) + .collect::>(); + endpoint - .reconfigure(compute_pageservers.clone(), *stripe_size, None) + .reconfigure(pageservers, *stripe_size, None) .await .map_err(NotifyError::NeonLocal)?; } From dee73f0cb494267ea2326842bbe093f9f1f1424a Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Tue, 17 Jun 2025 19:08:59 +0400 Subject: [PATCH 058/124] pageserver: implement max_total_size_bytes limit for basebackup cache (#12230) ## Problem The cache was introduced as a hackathon project and the only supported limit was the number of entries. The basebackup entry size may vary. We need to have more control over disk space usage to ship it to production. - Part of https://github.com/neondatabase/cloud/issues/29353 ## Summary of changes - Store the size of entries in the cache and use it to limit `max_total_size_bytes` - Add the size of the cache in bytes to metrics. 
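The accounting behind these limits is deliberately pessimistic: the entry count and total size are increased as soon as a basebackup file lands on disk, and only decreased once its removal is confirmed, so failed deletions keep counting against the budget. A minimal sketch of that bookkeeping (field and method names are illustrative, not the actual ones):

```rust
// Minimal sketch of pessimistic size accounting; each cache entry is assumed
// to record its on-disk size in bytes.
struct SizeBudget {
    entry_count: usize,
    total_size_bytes: u64,
    max_size_entries: usize,
    max_total_size_bytes: u64,
}

impl SizeBudget {
    /// Preparing another basebackup is skipped once either limit is reached.
    fn has_room(&self) -> bool {
        self.entry_count < self.max_size_entries
            && self.total_size_bytes < self.max_total_size_bytes
    }

    /// Charge a newly written entry against the budget.
    fn charge(&mut self, size_bytes: u64) {
        self.entry_count += 1;
        self.total_size_bytes += size_bytes;
    }

    /// Release budget only after the file is confirmed removed; if removal
    /// fails, the counters stay inflated so the limits remain conservative.
    fn release(&mut self, size_bytes: u64) {
        self.entry_count -= 1;
        self.total_size_bytes -= size_bytes;
    }
}
```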
--- libs/pageserver_api/src/config.rs | 13 +- pageserver/src/basebackup_cache.rs | 337 ++++++++++++++++--------- pageserver/src/metrics.rs | 10 +- test_runner/regress/test_basebackup.py | 5 + 4 files changed, 241 insertions(+), 124 deletions(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index f6001d68c4..2d7a06a72f 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -356,16 +356,21 @@ pub struct TimelineImportConfig { pub struct BasebackupCacheConfig { #[serde(with = "humantime_serde")] pub cleanup_period: Duration, - // FIXME: Support max_size_bytes. - // pub max_size_bytes: usize, - pub max_size_entries: i64, + /// Maximum total size of basebackup cache entries on disk in bytes. + /// The cache may slightly exceed this limit because we do not know + /// the exact size of the cache entry untill it's written to disk. + pub max_total_size_bytes: u64, + // TODO(diko): support max_entry_size_bytes. + // pub max_entry_size_bytes: u64, + pub max_size_entries: usize, } impl Default for BasebackupCacheConfig { fn default() -> Self { Self { cleanup_period: Duration::from_secs(60), - // max_size_bytes: 1024 * 1024 * 1024, // 1 GiB + max_total_size_bytes: 1024 * 1024 * 1024, // 1 GiB + // max_entry_size_bytes: 16 * 1024 * 1024, // 16 MiB max_size_entries: 1000, } } diff --git a/pageserver/src/basebackup_cache.rs b/pageserver/src/basebackup_cache.rs index 7dde3e02fe..24f6413380 100644 --- a/pageserver/src/basebackup_cache.rs +++ b/pageserver/src/basebackup_cache.rs @@ -19,7 +19,10 @@ use utils::{ use crate::{ basebackup::send_basebackup_tarball, context::{DownloadBehavior, RequestContext}, - metrics::{BASEBACKUP_CACHE_ENTRIES, BASEBACKUP_CACHE_PREPARE, BASEBACKUP_CACHE_READ}, + metrics::{ + BASEBACKUP_CACHE_ENTRIES, BASEBACKUP_CACHE_PREPARE, BASEBACKUP_CACHE_READ, + BASEBACKUP_CACHE_SIZE, + }, task_mgr::TaskKind, tenant::{ Timeline, @@ -36,8 +39,13 @@ pub struct BasebackupPrepareRequest { pub type BasebackupPrepareSender = UnboundedSender; pub type BasebackupPrepareReceiver = UnboundedReceiver; -type BasebackupRemoveEntrySender = UnboundedSender; -type BasebackupRemoveEntryReceiver = UnboundedReceiver; +#[derive(Clone)] +struct CacheEntry { + /// LSN at which the basebackup was taken. + lsn: Lsn, + /// Size of the basebackup archive in bytes. + size_bytes: u64, +} /// BasebackupCache stores cached basebackup archives for timelines on local disk. /// @@ -53,21 +61,12 @@ type BasebackupRemoveEntryReceiver = UnboundedReceiver; /// and ~1 RPS for get requests. 
pub struct BasebackupCache { data_dir: Utf8PathBuf, - config: BasebackupCacheConfig, - tenant_manager: Arc, - remove_entry_sender: BasebackupRemoveEntrySender, - entries: std::sync::Mutex>, - - cancel: CancellationToken, + entries: std::sync::Mutex>, read_hit_count: GenericCounter, read_miss_count: GenericCounter, read_err_count: GenericCounter, - - prepare_ok_count: GenericCounter, - prepare_skip_count: GenericCounter, - prepare_err_count: GenericCounter, } impl BasebackupCache { @@ -83,35 +82,32 @@ impl BasebackupCache { tenant_manager: Arc, cancel: CancellationToken, ) -> Arc { - let (remove_entry_sender, remove_entry_receiver) = tokio::sync::mpsc::unbounded_channel(); - - let enabled = config.is_some(); - let cache = Arc::new(BasebackupCache { data_dir, - config: config.unwrap_or_default(), - tenant_manager, - remove_entry_sender, entries: std::sync::Mutex::new(HashMap::new()), - cancel, - read_hit_count: BASEBACKUP_CACHE_READ.with_label_values(&["hit"]), read_miss_count: BASEBACKUP_CACHE_READ.with_label_values(&["miss"]), read_err_count: BASEBACKUP_CACHE_READ.with_label_values(&["error"]), - - prepare_ok_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["ok"]), - prepare_skip_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["skip"]), - prepare_err_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["error"]), }); - if enabled { - runtime_handle.spawn( - cache - .clone() - .background(prepare_receiver, remove_entry_receiver), - ); + if let Some(config) = config { + let background = BackgroundTask { + c: cache.clone(), + + config, + tenant_manager, + cancel, + + entry_count: 0, + total_size_bytes: 0, + + prepare_ok_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["ok"]), + prepare_skip_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["skip"]), + prepare_err_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["error"]), + }; + runtime_handle.spawn(background.run(prepare_receiver)); } cache @@ -129,7 +125,7 @@ impl BasebackupCache { ) -> Option { // Fast path. Check if the entry exists using the in-memory state. let tti = TenantTimelineId::new(tenant_id, timeline_id); - if self.entries.lock().unwrap().get(&tti) != Some(&lsn) { + if self.entries.lock().unwrap().get(&tti).map(|e| e.lsn) != Some(lsn) { self.read_miss_count.inc(); return None; } @@ -167,9 +163,41 @@ impl BasebackupCache { self.data_dir .join(Self::entry_filename(tenant_id, timeline_id, lsn)) } +} +/// The background task that does the job to prepare basebackups +/// and manage the cache entries on disk. +/// It is a separate struct from BasebackupCache to allow holding +/// a mutable reference to this state without a mutex lock, +/// while BasebackupCache is referenced by the clients. +struct BackgroundTask { + c: Arc, + + config: BasebackupCacheConfig, + tenant_manager: Arc, + cancel: CancellationToken, + + /// Number of the entries in the cache. + /// This counter is used for metrics and applying cache limits. + /// It generally should be equal to c.entries.len(), but it's calculated + /// pessimistically for abnormal situations: if we encountered some errors + /// during removing the entry from disk, we won't decrement this counter to + /// make sure that we don't exceed the limit with "trashed" files on the disk. + /// It will also count files in the data_dir that are not valid cache entries. + entry_count: usize, + /// Total size of all the entries on the disk. + /// This counter is used for metrics and applying cache limits. 
+ /// Similar to entry_count, it is calculated pessimistically for abnormal situations. + total_size_bytes: u64, + + prepare_ok_count: GenericCounter, + prepare_skip_count: GenericCounter, + prepare_err_count: GenericCounter, +} + +impl BackgroundTask { fn tmp_dir(&self) -> Utf8PathBuf { - self.data_dir.join("tmp") + self.c.data_dir.join("tmp") } fn entry_tmp_path( @@ -179,7 +207,7 @@ impl BasebackupCache { lsn: Lsn, ) -> Utf8PathBuf { self.tmp_dir() - .join(Self::entry_filename(tenant_id, timeline_id, lsn)) + .join(BasebackupCache::entry_filename(tenant_id, timeline_id, lsn)) } fn parse_entry_filename(filename: &str) -> Option<(TenantId, TimelineId, Lsn)> { @@ -208,11 +236,11 @@ impl BasebackupCache { Ok(()) } - async fn cleanup(&self) -> anyhow::Result<()> { + async fn cleanup(&mut self) -> anyhow::Result<()> { self.clean_tmp_dir().await?; - // Remove outdated entries. - let entries_old = self.entries.lock().unwrap().clone(); + // Leave only up-to-date entries. + let entries_old = self.c.entries.lock().unwrap().clone(); let mut entries_new = HashMap::new(); for (tenant_shard_id, tenant_slot) in self.tenant_manager.list() { if !tenant_shard_id.is_shard_zero() { @@ -225,31 +253,32 @@ impl BasebackupCache { for timeline in tenant.list_timelines() { let tti = TenantTimelineId::new(tenant_id, timeline.timeline_id); - if let Some(&entry_lsn) = entries_old.get(&tti) { - if timeline.get_last_record_lsn() <= entry_lsn { - entries_new.insert(tti, entry_lsn); + if let Some(entry) = entries_old.get(&tti) { + if timeline.get_last_record_lsn() <= entry.lsn { + entries_new.insert(tti, entry.clone()); } } } } - for (&tti, &lsn) in entries_old.iter() { + // Try to remove all entries that are not up-to-date. + for (&tti, entry) in entries_old.iter() { if !entries_new.contains_key(&tti) { - self.remove_entry_sender - .send(self.entry_path(tti.tenant_id, tti.timeline_id, lsn)) - .unwrap(); + self.try_remove_entry(tti.tenant_id, tti.timeline_id, entry) + .await; } } - BASEBACKUP_CACHE_ENTRIES.set(entries_new.len() as i64); - *self.entries.lock().unwrap() = entries_new; + // Note: BackgroundTask is the only writer for self.c.entries, + // so it couldn't have been modified concurrently. + *self.c.entries.lock().unwrap() = entries_new; Ok(()) } - async fn on_startup(&self) -> anyhow::Result<()> { + async fn on_startup(&mut self) -> anyhow::Result<()> { // Create data_dir if it does not exist. - tokio::fs::create_dir_all(&self.data_dir) + tokio::fs::create_dir_all(&self.c.data_dir) .await .context("Failed to create basebackup cache data directory")?; @@ -258,8 +287,8 @@ impl BasebackupCache { .context("Failed to clean tmp directory")?; // Read existing entries from the data_dir and add them to in-memory state. - let mut entries = HashMap::new(); - let mut dir = tokio::fs::read_dir(&self.data_dir).await?; + let mut entries = HashMap::::new(); + let mut dir = tokio::fs::read_dir(&self.c.data_dir).await?; while let Some(dir_entry) = dir.next_entry().await? { let filename = dir_entry.file_name(); @@ -268,33 +297,43 @@ impl BasebackupCache { continue; } + let size_bytes = dir_entry + .metadata() + .await + .map_err(|e| { + anyhow::anyhow!("Failed to read metadata for file {:?}: {:?}", filename, e) + })? 
+ .len(); + + self.entry_count += 1; + BASEBACKUP_CACHE_ENTRIES.set(self.entry_count as u64); + + self.total_size_bytes += size_bytes; + BASEBACKUP_CACHE_SIZE.set(self.total_size_bytes); + let parsed = Self::parse_entry_filename(filename.to_string_lossy().as_ref()); let Some((tenant_id, timeline_id, lsn)) = parsed else { tracing::warn!("Invalid basebackup cache file name: {:?}", filename); continue; }; + let cur_entry = CacheEntry { lsn, size_bytes }; + let tti = TenantTimelineId::new(tenant_id, timeline_id); use std::collections::hash_map::Entry::*; match entries.entry(tti) { Occupied(mut entry) => { - let entry_lsn = *entry.get(); + let found_entry = entry.get(); // Leave only the latest entry, remove the old one. - if lsn < entry_lsn { - self.remove_entry_sender.send(self.entry_path( - tenant_id, - timeline_id, - lsn, - ))?; - } else if lsn > entry_lsn { - self.remove_entry_sender.send(self.entry_path( - tenant_id, - timeline_id, - entry_lsn, - ))?; - entry.insert(lsn); + if cur_entry.lsn < found_entry.lsn { + self.try_remove_entry(tenant_id, timeline_id, &cur_entry) + .await; + } else if cur_entry.lsn > found_entry.lsn { + self.try_remove_entry(tenant_id, timeline_id, found_entry) + .await; + entry.insert(cur_entry); } else { // Two different filenames parsed to the same timline_id and LSN. // Should never happen. @@ -305,22 +344,17 @@ impl BasebackupCache { } } Vacant(entry) => { - entry.insert(lsn); + entry.insert(cur_entry); } } } - BASEBACKUP_CACHE_ENTRIES.set(entries.len() as i64); - *self.entries.lock().unwrap() = entries; + *self.c.entries.lock().unwrap() = entries; Ok(()) } - async fn background( - self: Arc, - mut prepare_receiver: BasebackupPrepareReceiver, - mut remove_entry_receiver: BasebackupRemoveEntryReceiver, - ) { + async fn run(mut self, mut prepare_receiver: BasebackupPrepareReceiver) { // Panic in the background is a safe fallback. // It will drop receivers and the cache will be effectively disabled. self.on_startup() @@ -343,11 +377,6 @@ impl BasebackupCache { continue; } } - Some(req) = remove_entry_receiver.recv() => { - if let Err(e) = tokio::fs::remove_file(req).await { - tracing::warn!("Failed to remove basebackup cache file: {:#}", e); - } - } _ = cleanup_ticker.tick() => { self.cleanup().await.unwrap_or_else(|e| { tracing::warn!("Failed to clean up basebackup cache: {:#}", e); @@ -361,6 +390,67 @@ impl BasebackupCache { } } + /// Try to remove an entry from disk. + /// The caller is responsible for removing the entry from the in-memory state. + /// Updates size counters and corresponding metrics. + /// Ignores the filesystem errors as not-so-important, but the size counters + /// are not decremented in this case, so the file will continue to be counted + /// towards the size limits. + async fn try_remove_entry( + &mut self, + tenant_id: TenantId, + timeline_id: TimelineId, + entry: &CacheEntry, + ) { + let entry_path = self.c.entry_path(tenant_id, timeline_id, entry.lsn); + + match tokio::fs::remove_file(&entry_path).await { + Ok(_) => {} + Err(e) if e.kind() == std::io::ErrorKind::NotFound => {} + Err(e) => { + tracing::warn!( + "Failed to remove basebackup cache file for tenant {} timeline {} LSN {}: {:#}", + tenant_id, + timeline_id, + entry.lsn, + e + ); + return; + } + } + + self.entry_count -= 1; + BASEBACKUP_CACHE_ENTRIES.set(self.entry_count as u64); + + self.total_size_bytes -= entry.size_bytes; + BASEBACKUP_CACHE_SIZE.set(self.total_size_bytes); + } + + /// Insert the cache entry into in-memory state and update the size counters. 
+ /// Assumes that the file for the entry already exists on disk. + /// If the entry already exists with previous LSN, it will be removed. + async fn upsert_entry( + &mut self, + tenant_id: TenantId, + timeline_id: TimelineId, + entry: CacheEntry, + ) { + let tti = TenantTimelineId::new(tenant_id, timeline_id); + + self.entry_count += 1; + BASEBACKUP_CACHE_ENTRIES.set(self.entry_count as u64); + + self.total_size_bytes += entry.size_bytes; + BASEBACKUP_CACHE_SIZE.set(self.total_size_bytes); + + let old_entry = self.c.entries.lock().unwrap().insert(tti, entry); + + if let Some(old_entry) = old_entry { + self.try_remove_entry(tenant_id, timeline_id, &old_entry) + .await; + } + } + /// Prepare a basebackup for the given timeline. /// /// If the basebackup already exists with a higher LSN or the timeline already @@ -369,7 +459,7 @@ impl BasebackupCache { /// The basebackup is prepared in a temporary directory and then moved to the final /// location to make the operation atomic. async fn prepare_basebackup( - &self, + &mut self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, req_lsn: Lsn, @@ -383,30 +473,44 @@ impl BasebackupCache { let tti = TenantTimelineId::new(tenant_shard_id.tenant_id, timeline_id); + // TODO(diko): I don't think we will hit the limit, + // but if we do, it makes sense to try to evict oldest entries. here + if self.entry_count >= self.config.max_size_entries { + tracing::info!( + %tenant_shard_id, + %timeline_id, + %req_lsn, + "Basebackup cache is full (max_size_entries), skipping basebackup", + ); + self.prepare_skip_count.inc(); + return Ok(()); + } + + if self.total_size_bytes >= self.config.max_total_size_bytes { + tracing::info!( + %tenant_shard_id, + %timeline_id, + %req_lsn, + "Basebackup cache is full (max_total_size_bytes), skipping basebackup", + ); + self.prepare_skip_count.inc(); + return Ok(()); + } + { - let entries = self.entries.lock().unwrap(); - if let Some(&entry_lsn) = entries.get(&tti) { - if entry_lsn >= req_lsn { + let entries = self.c.entries.lock().unwrap(); + if let Some(entry) = entries.get(&tti) { + if entry.lsn >= req_lsn { tracing::info!( %timeline_id, %req_lsn, - %entry_lsn, + %entry.lsn, "Basebackup entry already exists for timeline with higher LSN, skipping basebackup", ); self.prepare_skip_count.inc(); return Ok(()); } } - - if entries.len() as i64 >= self.config.max_size_entries { - tracing::info!( - %timeline_id, - %req_lsn, - "Basebackup cache is full, skipping basebackup", - ); - self.prepare_skip_count.inc(); - return Ok(()); - } } let tenant = self @@ -442,18 +546,21 @@ impl BasebackupCache { .prepare_basebackup_tmp(&entry_tmp_path, &timeline, req_lsn) .await; - if let Err(err) = res { - tracing::info!("Failed to prepare basebackup tmp file: {:#}", err); - // Try to clean up tmp file. If we fail, the background clean up task will take care of it. - match tokio::fs::remove_file(&entry_tmp_path).await { - Ok(_) => {} - Err(e) if e.kind() == std::io::ErrorKind::NotFound => {} - Err(e) => { - tracing::info!("Failed to remove basebackup tmp file: {:?}", e); + let entry = match res { + Ok(entry) => entry, + Err(err) => { + tracing::info!("Failed to prepare basebackup tmp file: {:#}", err); + // Try to clean up tmp file. If we fail, the background clean up task will take care of it. 
+ match tokio::fs::remove_file(&entry_tmp_path).await { + Ok(_) => {} + Err(e) if e.kind() == std::io::ErrorKind::NotFound => {} + Err(e) => { + tracing::info!("Failed to remove basebackup tmp file: {:?}", e); + } } + return Err(err); } - return Err(err); - } + }; // Move the tmp file to the final location atomically. // The tmp file is fsynced, so it's guaranteed that we will not have a partial file @@ -461,17 +568,13 @@ impl BasebackupCache { // It's not necessary to fsync the inode after renaming, because the worst case is that // the rename operation will be rolled back on the disk failure, the entry will disappear // from the main directory, and the entry access will cause a cache miss. - let entry_path = self.entry_path(tenant_shard_id.tenant_id, timeline_id, req_lsn); + let entry_path = self + .c + .entry_path(tenant_shard_id.tenant_id, timeline_id, req_lsn); tokio::fs::rename(&entry_tmp_path, &entry_path).await?; - let mut entries = self.entries.lock().unwrap(); - if let Some(old_lsn) = entries.insert(tti, req_lsn) { - // Remove the old entry if it exists. - self.remove_entry_sender - .send(self.entry_path(tenant_shard_id.tenant_id, timeline_id, old_lsn)) - .unwrap(); - } - BASEBACKUP_CACHE_ENTRIES.set(entries.len() as i64); + self.upsert_entry(tenant_shard_id.tenant_id, timeline_id, entry) + .await; self.prepare_ok_count.inc(); Ok(()) @@ -484,7 +587,7 @@ impl BasebackupCache { entry_tmp_path: &Utf8Path, timeline: &Arc, req_lsn: Lsn, - ) -> anyhow::Result<()> { + ) -> anyhow::Result { let ctx = RequestContext::new(TaskKind::BasebackupCache, DownloadBehavior::Download); let ctx = ctx.with_scope_timeline(timeline); @@ -524,6 +627,12 @@ impl BasebackupCache { writer.flush().await?; writer.into_inner().sync_all().await?; - Ok(()) + // TODO(diko): we can count it via Writer wrapper instead of a syscall. + let size_bytes = tokio::fs::metadata(entry_tmp_path).await?.len(); + + Ok(CacheEntry { + lsn: req_lsn, + size_bytes, + }) } } diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 3b3522c36a..bf54614baa 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -4428,18 +4428,16 @@ pub(crate) static BASEBACKUP_CACHE_PREPARE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub(crate) static BASEBACKUP_CACHE_ENTRIES: Lazy = Lazy::new(|| { - register_int_gauge!( +pub(crate) static BASEBACKUP_CACHE_ENTRIES: Lazy = Lazy::new(|| { + register_uint_gauge!( "pageserver_basebackup_cache_entries_total", "Number of entries in the basebackup cache" ) .expect("failed to define a metric") }); -// FIXME: Support basebackup cache size metrics. -#[allow(dead_code)] -pub(crate) static BASEBACKUP_CACHE_SIZE: Lazy = Lazy::new(|| { - register_int_gauge!( +pub(crate) static BASEBACKUP_CACHE_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge!( "pageserver_basebackup_cache_size_bytes", "Total size of all basebackup cache entries on disk in bytes" ) diff --git a/test_runner/regress/test_basebackup.py b/test_runner/regress/test_basebackup.py index 2d42be4051..d1b10ec85d 100644 --- a/test_runner/regress/test_basebackup.py +++ b/test_runner/regress/test_basebackup.py @@ -69,6 +69,11 @@ def test_basebackup_cache(neon_env_builder: NeonEnvBuilder): ).value == i + 1 ) + # There should be only one basebackup file in the cache. + assert metrics.query_one("pageserver_basebackup_cache_entries_total").value == 1 + # The size of one basebackup for new DB is ~20KB. 
+ size_bytes = metrics.query_one("pageserver_basebackup_cache_size_bytes").value + assert 10 * 1024 <= size_bytes <= 100 * 1024 wait_until(check_metrics) From f2e96b232349ca867d66084bf18791315c6a8aaf Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Tue, 17 Jun 2025 19:16:07 +0400 Subject: [PATCH 059/124] tests: prepare test_compatibility.py for --timelines-onto-safekeepers (#12204) ## Problem Compatibility tests may be run against a compatibility snapshot generated with --timelines-onto-safekeepers=false. We need to start the compute without a generation (or with 0 generation) if the timeline is not storcon-managed, otherwise the compute will hang. - Follow up on https://github.com/neondatabase/neon/pull/12203 - Relates to https://github.com/neondatabase/neon/pull/11712 ## Summary of changes - Handle compatibility snapshot generated with no `--timelines-onot-safekeepers` properly --- test_runner/regress/test_compatibility.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index bc9b534095..16ab2bb359 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -19,6 +19,7 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, Safekeeper, + StorageControllerApiException, flush_ep_to_pageserver, ) from fixtures.pageserver.http import PageserverApiException @@ -301,7 +302,20 @@ def test_forward_compatibility( def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, repo_dir: Path): ep = env.endpoints.create("main") ep_env = {"LD_LIBRARY_PATH": str(env.pg_distrib_dir / f"v{env.pg_version}/lib")} - ep.start(env=ep_env) + + # If the compatibility snapshot was created with --timelines-onto-safekeepers=false, + # we should not pass safekeeper_generation to the endpoint because the compute + # will not be able to start. + # Zero generation is INVALID_GENERATION. + generation = 0 + try: + res = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline) + generation = res["generation"] + except StorageControllerApiException as e: + if e.status_code != 404 or not re.search(r"Timeline .* not found", str(e)): + raise e + + ep.start(env=ep_env, safekeeper_generation=generation) connstr = ep.connstr() @@ -351,7 +365,7 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r ) # Timeline exists again: restart the endpoint - ep.start(env=ep_env) + ep.start(env=ep_env, safekeeper_generation=generation) pg_bin.run_capture( ["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump-from-wal.sql'}"] From a4c76740c062828709f540edb42b56828731f350 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 17 Jun 2025 08:41:17 -0700 Subject: [PATCH 060/124] pageserver: emit gRPC GetPage errors as responses (#12255) ## Problem When converting `proto::GetPageRequest` into `page_api::GetPageRequest` and validating the request, errors are returned as `tonic::Status`. This will tear down the GetPage stream, which is disruptive and unnecessary. ## Summary of changes Emit invalid request errors as `GetPageResponse` with an appropriate `status_code` instead. Also move the conversion from `tonic::Status` to `GetPageResponse` out into the stream handler. 
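Put differently, the stream handler now tries to express every failure as a per-request response first, and only terminates the stream when the error code has no per-request equivalent. A sketch of that calling pattern (the crate alias and surrounding plumbing are assumptions; only `GetPageResponse::try_from_status` comes from this change):

```rust
use pageserver_page_api as page_api; // assumed crate alias for pageserver/page_api

/// Sketch: fold one batch result into a stream item. Per-request failures
/// (e.g. NotFound, InvalidArgument, ResourceExhausted) become a
/// GetPageResponse with a non-OK status_code so the stream stays open;
/// codes like Cancelled or Unavailable propagate as tonic::Status and
/// terminate the stream.
fn into_stream_item(
    result: Result<page_api::GetPageResponse, tonic::Status>,
    request_id: page_api::RequestID,
) -> Result<page_api::GetPageResponse, tonic::Status> {
    match result {
        Ok(response) => Ok(response),
        Err(status) => page_api::GetPageResponse::try_from_status(status, request_id),
    }
}
```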
--- pageserver/page_api/src/model.rs | 66 ++++++++++++++++++++++++++ pageserver/src/page_service.rs | 80 ++++++-------------------------- 2 files changed, 81 insertions(+), 65 deletions(-) diff --git a/pageserver/page_api/src/model.rs b/pageserver/page_api/src/model.rs index 799f48712f..a01bba0572 100644 --- a/pageserver/page_api/src/model.rs +++ b/pageserver/page_api/src/model.rs @@ -417,6 +417,39 @@ impl From for proto::GetPageResponse { } } +impl GetPageResponse { + /// Attempts to represent a tonic::Status as a GetPageResponse if appropriate. Returning a + /// tonic::Status will terminate the GetPage stream, so per-request errors are emitted as a + /// GetPageResponse with a non-OK status code instead. + #[allow(clippy::result_large_err)] + pub fn try_from_status( + status: tonic::Status, + request_id: RequestID, + ) -> Result { + // We shouldn't see an OK status here, because we're emitting an error. + debug_assert_ne!(status.code(), tonic::Code::Ok); + if status.code() == tonic::Code::Ok { + return Err(tonic::Status::internal(format!( + "unexpected OK status: {status:?}", + ))); + } + + // If we can't convert the tonic::Code to a GetPageStatusCode, this is not a per-request + // error and we should return a tonic::Status to terminate the stream. + let Ok(status_code) = status.code().try_into() else { + return Err(status); + }; + + // Return a GetPageResponse for the status. + Ok(Self { + request_id, + status_code, + reason: Some(status.message().to_string()), + page_images: Vec::new(), + }) + } +} + /// A GetPage response status code. /// /// These are effectively equivalent to gRPC statuses. However, we use a bidirectional stream @@ -480,6 +513,39 @@ impl From for i32 { } } +impl TryFrom for GetPageStatusCode { + type Error = tonic::Code; + + fn try_from(code: tonic::Code) -> Result { + use tonic::Code; + + let status_code = match code { + Code::Ok => Self::Ok, + + // These are per-request errors, which should be returned as GetPageResponses. + Code::AlreadyExists => Self::InvalidRequest, + Code::DataLoss => Self::InternalError, + Code::FailedPrecondition => Self::InvalidRequest, + Code::InvalidArgument => Self::InvalidRequest, + Code::Internal => Self::InternalError, + Code::NotFound => Self::NotFound, + Code::OutOfRange => Self::InvalidRequest, + Code::ResourceExhausted => Self::SlowDown, + + // These should terminate the stream by returning a tonic::Status. + Code::Aborted + | Code::Cancelled + | Code::DeadlineExceeded + | Code::PermissionDenied + | Code::Unauthenticated + | Code::Unavailable + | Code::Unimplemented + | Code::Unknown => return Err(code), + }; + Ok(status_code) + } +} + // Fetches the size of a relation at a given LSN, as # of blocks. Only valid on shard 0, other // shards will error. #[derive(Clone, Copy, Debug)] diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 0521f5c556..79c4c0faa9 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -623,60 +623,6 @@ enum PageStreamError { BadRequest(Cow<'static, str>), } -impl PageStreamError { - /// Converts a PageStreamError into a proto::GetPageResponse with the appropriate status - /// code, or a gRPC status if it should terminate the stream (e.g. shutdown). This is a - /// convenience method for use from a get_pages gRPC stream. 
- #[allow(clippy::result_large_err)] - fn into_get_page_response( - self, - request_id: page_api::RequestID, - ) -> Result { - use page_api::GetPageStatusCode; - use tonic::Code; - - // We dispatch to Into first, and then map it to a GetPageResponse. - let status: tonic::Status = self.into(); - let status_code = match status.code() { - // We shouldn't see an OK status here, because we're emitting an error. - Code::Ok => { - debug_assert_ne!(status.code(), Code::Ok); - return Err(tonic::Status::internal(format!( - "unexpected OK status: {status:?}", - ))); - } - - // These are per-request errors, returned as GetPageResponses. - Code::AlreadyExists => GetPageStatusCode::InvalidRequest, - Code::DataLoss => GetPageStatusCode::InternalError, - Code::FailedPrecondition => GetPageStatusCode::InvalidRequest, - Code::InvalidArgument => GetPageStatusCode::InvalidRequest, - Code::Internal => GetPageStatusCode::InternalError, - Code::NotFound => GetPageStatusCode::NotFound, - Code::OutOfRange => GetPageStatusCode::InvalidRequest, - Code::ResourceExhausted => GetPageStatusCode::SlowDown, - - // These should terminate the stream. - Code::Aborted => return Err(status), - Code::Cancelled => return Err(status), - Code::DeadlineExceeded => return Err(status), - Code::PermissionDenied => return Err(status), - Code::Unauthenticated => return Err(status), - Code::Unavailable => return Err(status), - Code::Unimplemented => return Err(status), - Code::Unknown => return Err(status), - }; - - Ok(page_api::GetPageResponse { - request_id, - status_code, - reason: Some(status.message().to_string()), - page_images: Vec::new(), - } - .into()) - } -} - impl From for tonic::Status { fn from(err: PageStreamError) -> Self { use tonic::Code; @@ -3438,8 +3384,8 @@ impl GrpcPageServiceHandler { /// Processes a GetPage batch request, via the GetPages bidirectional streaming RPC. /// - /// NB: errors will terminate the stream. Per-request errors should return a GetPageResponse - /// with an appropriate status code instead. + /// NB: errors returned from here are intercepted in get_pages(), and may be converted to a + /// GetPageResponse with an appropriate status code to avoid terminating the stream. /// /// TODO: get_vectored() currently enforces a batch limit of 32. Postgres will typically send /// batches up to effective_io_concurrency = 100. Either we have to accept large batches, or @@ -3456,7 +3402,7 @@ impl GrpcPageServiceHandler { let ctx = ctx.with_scope_page_service_pagestream(&timeline); // Validate the request, decorate the span, and convert it to a Pagestream request. 
-        let req: page_api::GetPageRequest = req.try_into()?;
+        let req = page_api::GetPageRequest::try_from(req)?;
 
         span_record!(
             req_id = %req.request_id,
@@ -3467,7 +3413,7 @@ impl GrpcPageServiceHandler {
         );
 
         let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); // hold guard
-        let effective_lsn = match PageServerHandler::effective_request_lsn(
+        let effective_lsn = PageServerHandler::effective_request_lsn(
             &timeline,
             timeline.get_last_record_lsn(),
             req.read_lsn.request_lsn,
@@ -3475,10 +3421,7 @@ impl GrpcPageServiceHandler {
                 .not_modified_since_lsn
                 .unwrap_or(req.read_lsn.request_lsn),
             &latest_gc_cutoff_lsn,
-        ) {
-            Ok(lsn) => lsn,
-            Err(err) => return err.into_get_page_response(req.request_id),
-        };
+        )?;
 
         let mut batch = SmallVec::with_capacity(req.block_numbers.len());
         for blkno in req.block_numbers {
@@ -3535,7 +3478,7 @@ impl GrpcPageServiceHandler {
                     "unexpected response: {resp:?}"
                 )));
             }
-            Err(err) => return err.err.into_get_page_response(req.request_id),
+            Err(err) => return Err(err.err.into()),
         };
     }
 
@@ -3745,9 +3688,16 @@ impl proto::PageService for GrpcPageServiceHandler {
             .await?
             .downgrade();
         while let Some(req) = reqs.message().await? {
-            yield Self::get_page(&ctx, &timeline, req, io_concurrency.clone())
+            let req_id = req.request_id;
+            let result = Self::get_page(&ctx, &timeline, req, io_concurrency.clone())
                 .instrument(span.clone()) // propagate request span
-                .await?
+                .await;
+            yield match result {
+                Ok(resp) => resp,
+                // Convert per-request errors to GetPageResponses as appropriate, or terminate
+                // the stream with a tonic::Status.
+                Err(err) => page_api::GetPageResponse::try_from_status(err, req_id)?.into(),
+            }
         }
     };
 
From dfa055f4be7042fa8e49a86c31bffef29dad6c48 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik
Date: Tue, 17 Jun 2025 18:44:50 +0300
Subject: [PATCH 061/124] Support event trigger for Neon users (#10624)

## Problem

https://github.com/neondatabase/neon/issues/7570
Event triggers are supported only for superusers.

## Summary of changes

Temporarily switch to superuser when an event trigger is created, and disable
execution of users' event triggers under superuser.
--------- Co-authored-by: Dimitri Fontaine Co-authored-by: Konstantin Knizhnik --- pgxn/neon/Makefile | 2 +- pgxn/neon/control_plane_connector.h | 6 - pgxn/neon/neon.c | 4 +- ...l_plane_connector.c => neon_ddl_handler.c} | 576 +++++++++++++++++- pgxn/neon/neon_ddl_handler.h | 6 + test_runner/regress/test_pg_regress.py | 8 +- .../expected/neon-event-triggers.out | 90 +++ test_runner/sql_regress/parallel_schedule | 1 + .../sql_regress/sql/neon-event-triggers.sql | 96 +++ 9 files changed, 754 insertions(+), 35 deletions(-) delete mode 100644 pgxn/neon/control_plane_connector.h rename pgxn/neon/{control_plane_connector.c => neon_ddl_handler.c} (57%) create mode 100644 pgxn/neon/neon_ddl_handler.h create mode 100644 test_runner/sql_regress/expected/neon-event-triggers.out create mode 100644 test_runner/sql_regress/sql/neon-event-triggers.sql diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index 8bcc6bf924..9bce0e798a 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -21,7 +21,7 @@ OBJS = \ unstable_extensions.o \ walproposer.o \ walproposer_pg.o \ - control_plane_connector.o \ + neon_ddl_handler.o \ walsender_hooks.o PG_CPPFLAGS = -I$(libpq_srcdir) diff --git a/pgxn/neon/control_plane_connector.h b/pgxn/neon/control_plane_connector.h deleted file mode 100644 index 7eed449200..0000000000 --- a/pgxn/neon/control_plane_connector.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef CONTROL_PLANE_CONNECTOR_H -#define CONTROL_PLANE_CONNECTOR_H - -void InitControlPlaneConnector(void); - -#endif diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 5b4ced7cf0..8a405f4129 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -33,9 +33,9 @@ #include "extension_server.h" #include "file_cache.h" #include "neon.h" +#include "neon_ddl_handler.h" #include "neon_lwlsncache.h" #include "neon_perf_counters.h" -#include "control_plane_connector.h" #include "logical_replication_monitor.h" #include "unstable_extensions.h" #include "walsender_hooks.h" @@ -454,7 +454,7 @@ _PG_init(void) InitUnstableExtensionsSupport(); InitLogicalReplicationMonitor(); - InitControlPlaneConnector(); + InitDDLHandler(); pg_init_extension_server(); diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/neon_ddl_handler.c similarity index 57% rename from pgxn/neon/control_plane_connector.c rename to pgxn/neon/neon_ddl_handler.c index 47ed37da06..dba28c0ed6 100644 --- a/pgxn/neon/control_plane_connector.c +++ b/pgxn/neon/neon_ddl_handler.c @@ -1,6 +1,6 @@ /*------------------------------------------------------------------------- * - * control_plane_connector.c + * neon_ddl_handler.c * Captures updates to roles/databases using ProcessUtility_hook and * sends them to the control ProcessUtility_hook. The changes are sent * via HTTP to the URL specified by the GUC neon.console_url when the @@ -13,18 +13,30 @@ * accumulate changes. On subtransaction commit, the top of the stack * is merged with the table below it. 
* + * Support event triggers for neon_superuser + * + * IDENTIFICATION + * contrib/neon/neon_dll_handler.c + * *------------------------------------------------------------------------- */ #include "postgres.h" #include +#include #include "access/xact.h" +#include "catalog/pg_authid.h" +#include "catalog/pg_proc.h" #include "commands/defrem.h" +#include "commands/event_trigger.h" +#include "commands/user.h" #include "fmgr.h" #include "libpq/crypt.h" #include "miscadmin.h" +#include "nodes/makefuncs.h" +#include "parser/parse_func.h" #include "tcop/pquery.h" #include "tcop/utility.h" #include "utils/acl.h" @@ -32,11 +44,16 @@ #include "utils/hsearch.h" #include "utils/memutils.h" #include "utils/jsonb.h" +#include +#include -#include "control_plane_connector.h" +#include "neon_ddl_handler.h" #include "neon_utils.h" static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL; +static fmgr_hook_type next_fmgr_hook = NULL; +static needs_fmgr_hook_type next_needs_fmgr_hook = NULL; +static bool neon_event_triggers = true; static const char *jwt_token = NULL; @@ -773,6 +790,7 @@ HandleDropRole(DropRoleStmt *stmt) } } + static void HandleRename(RenameStmt *stmt) { @@ -782,6 +800,460 @@ HandleRename(RenameStmt *stmt) return HandleRoleRename(stmt); } + +/* + * Support for Event Triggers. + * + * In vanilla only superuser can create Event Triggers. + * + * We allow it for neon_superuser by temporary switching to superuser. But as + * far as event trigger can fire in superuser context we should protect + * superuser from execution of arbitrary user's code. + * + * The idea was taken from Supabase PR series starting at + * https://github.com/supabase/supautils/pull/98 + */ + +static bool +neon_needs_fmgr_hook(Oid functionId) { + + return (next_needs_fmgr_hook && (*next_needs_fmgr_hook) (functionId)) + || get_func_rettype(functionId) == EVENT_TRIGGEROID; +} + +static void +LookupFuncOwnerSecDef(Oid functionId, Oid *funcOwner, bool *is_secdef) +{ + Form_pg_proc procForm; + HeapTuple proc_tup = SearchSysCache1(PROCOID, ObjectIdGetDatum(functionId)); + + if (!HeapTupleIsValid(proc_tup)) + ereport(ERROR, + (errmsg("cache lookup failed for function %u", functionId))); + + procForm = (Form_pg_proc) GETSTRUCT(proc_tup); + + *funcOwner = procForm->proowner; + *is_secdef = procForm->prosecdef; + + ReleaseSysCache(proc_tup); +} + + +PG_FUNCTION_INFO_V1(noop); +Datum noop(__attribute__ ((unused)) PG_FUNCTION_ARGS) { PG_RETURN_VOID();} + +static void +force_noop(FmgrInfo *finfo) +{ + finfo->fn_addr = (PGFunction) noop; + finfo->fn_oid = InvalidOid; /* not a known function OID anymore */ + finfo->fn_nargs = 0; /* no arguments for noop */ + finfo->fn_strict = false; + finfo->fn_retset = false; + finfo->fn_stats = 0; /* no stats collection */ + finfo->fn_extra = NULL; /* clear out old context data */ + finfo->fn_mcxt = CurrentMemoryContext; + finfo->fn_expr = NULL; /* no parse tree */ +} + + +/* + * Skip executing Event Triggers execution for superusers, because Event + * Triggers are SECURITY DEFINER and user provided code could then attempt + * privilege escalation. + * + * Also skip executing Event Triggers when GUC neon.event_triggers has been + * set to false. This might be necessary to be able to connect again after a + * LOGIN Event Trigger has been installed that would prevent connections as + * neon_superuser. 
+ */ +static void +neon_fmgr_hook(FmgrHookEventType event, FmgrInfo *flinfo, Datum *private) +{ + /* + * It can be other needs_fmgr_hook which cause our hook to be invoked for + * non-trigger function, so recheck that is is trigger function. + */ + if (flinfo->fn_oid != InvalidOid && + get_func_rettype(flinfo->fn_oid) != EVENT_TRIGGEROID) + { + if (next_fmgr_hook) + (*next_fmgr_hook) (event, flinfo, private); + + return; + } + + /* + * The neon_superuser role can use the GUC neon.event_triggers to disable + * firing Event Trigger. + * + * SET neon.event_triggers TO false; + * + * This only applies to the neon_superuser role though, and only allows + * skipping Event Triggers owned by neon_superuser, which we check by + * proxy of the Event Trigger function being owned by neon_superuser. + * + * A role that is created in role neon_superuser should be allowed to also + * benefit from the neon_event_triggers GUC, and will be considered the + * same as the neon_superuser role. + */ + if (event == FHET_START + && !neon_event_triggers + && is_neon_superuser()) + { + Oid neon_superuser_oid = get_role_oid("neon_superuser", false); + + /* Find the Function Attributes (owner Oid, security definer) */ + const char *fun_owner_name = NULL; + Oid fun_owner = InvalidOid; + bool fun_is_secdef = false; + + LookupFuncOwnerSecDef(flinfo->fn_oid, &fun_owner, &fun_is_secdef); + fun_owner_name = GetUserNameFromId(fun_owner, false); + + if (RoleIsNeonSuperuser(fun_owner_name) + || has_privs_of_role(fun_owner, neon_superuser_oid)) + { + elog(WARNING, + "Skipping Event Trigger: neon.event_triggers is false"); + + /* + * we can't skip execution directly inside the fmgr_hook so instead we + * change the event trigger function to a noop function. + */ + force_noop(flinfo); + } + } + + /* + * Fire Event Trigger if both function owner and current user are + * superuser, or none of them are. + */ + else if (event == FHET_START + /* still enable it to pass pg_regress tests */ + && !RegressTestMode) + { + /* + * Get the current user oid as of before SECURITY DEFINER change of + * CurrentUserId, and that would be SessionUserId. + */ + Oid current_role_oid = GetSessionUserId(); + bool role_is_super = superuser_arg(current_role_oid); + + /* Find the Function Attributes (owner Oid, security definer) */ + Oid function_owner = InvalidOid; + bool function_is_secdef = false; + bool function_is_owned_by_super = false; + + LookupFuncOwnerSecDef(flinfo->fn_oid, &function_owner, &function_is_secdef); + + function_is_owned_by_super = superuser_arg(function_owner); + + /* + * 1. Refuse to run SECURITY DEFINER function that belongs to a + * superuser when the current user is not a superuser itself. + */ + if (!role_is_super + && function_is_owned_by_super + && function_is_secdef) + { + char *func_name = get_func_name(flinfo->fn_oid); + + ereport(WARNING, + (errmsg("Skipping Event Trigger"), + errdetail("Event Trigger function \"%s\" is owned by \"%s\" " + "and is SECURITY DEFINER", + func_name, + GetUserNameFromId(function_owner, false)))); + + /* + * we can't skip execution directly inside the fmgr_hook so + * instead we change the event trigger function to a noop + * function. + */ + force_noop(flinfo); + } + + /* + * 2. Refuse to run functions that belongs to a non-superuser when the + * current user is a superuser. 
+ * + * We could run a SECURITY DEFINER user-function here and be safe with + * privilege escalation risks, but superuser roles are only used for + * infrastructure maintenance operations, where we prefer to skip + * running user-defined code. + */ + else if (role_is_super && !function_is_owned_by_super) + { + char *func_name = get_func_name(flinfo->fn_oid); + + ereport(WARNING, + (errmsg("Skipping Event Trigger"), + errdetail("Event Trigger function \"%s\" " + "is owned by non-superuser role \"%s\", " + "and current_user \"%s\" is superuser", + func_name, + GetUserNameFromId(function_owner, false), + GetUserNameFromId(current_role_oid, false)))); + + /* + * we can't skip execution directly inside the fmgr_hook so + * instead we change the event trigger function to a noop + * function. + */ + force_noop(flinfo); + } + + } + + if (next_fmgr_hook) + (*next_fmgr_hook) (event, flinfo, private); +} + +static Oid prev_role_oid = 0; +static int prev_role_sec_context = 0; +static bool switched_to_superuser = false; + +/* + * Switch tp superuser if not yet superuser. + * Returns false if already switched to superuser. + */ +static bool +switch_to_superuser(void) +{ + Oid superuser_oid; + + if (switched_to_superuser) + return false; + switched_to_superuser = true; + + superuser_oid = get_role_oid("cloud_admin", true /*missing_ok*/); + if (superuser_oid == InvalidOid) + superuser_oid = BOOTSTRAP_SUPERUSERID; + + GetUserIdAndSecContext(&prev_role_oid, &prev_role_sec_context); + SetUserIdAndSecContext(superuser_oid, prev_role_sec_context | + SECURITY_LOCAL_USERID_CHANGE | + SECURITY_RESTRICTED_OPERATION); + return true; +} + +static void +switch_to_original_role(void) +{ + SetUserIdAndSecContext(prev_role_oid, prev_role_sec_context); + switched_to_superuser = false; +} + +/* + * ALTER ROLE ... SUPERUSER; + * + * Used internally to give superuser to a non-privileged role to allow + * ownership of superuser-only objects such as Event Trigger. + * + * ALTER ROLE foo SUPERUSER; + * ALTER EVENT TRIGGER ... OWNED BY foo; + * ALTER ROLE foo NOSUPERUSER; + * + * Now the EVENT TRIGGER is owned by foo, who can DROP it without having to be + * superuser again. + */ +static void +alter_role_super(const char* rolename, bool make_super) +{ + AlterRoleStmt *alter_stmt = makeNode(AlterRoleStmt); + + DefElem *defel_superuser = +#if PG_MAJORVERSION_NUM <= 14 + makeDefElem("superuser", (Node *) makeInteger(make_super), -1); +#else + makeDefElem("superuser", (Node *) makeBoolean(make_super), -1); +#endif + + RoleSpec *rolespec = makeNode(RoleSpec); + rolespec->roletype = ROLESPEC_CSTRING; + rolespec->rolename = pstrdup(rolename); + rolespec->location = -1; + + alter_stmt->role = rolespec; + alter_stmt->options = list_make1(defel_superuser); + +#if PG_MAJORVERSION_NUM < 15 + AlterRole(alter_stmt); +#else + /* ParseState *pstate, AlterRoleStmt *stmt */ + AlterRole(NULL, alter_stmt); +#endif + + CommandCounterIncrement(); +} + + +/* + * Changes the OWNER of an Event Trigger. + * + * Event Triggers can only be owned by superusers, so this ALTER ROLE with + * SUPERUSER and then removes the property. 
+ */ +static void +alter_event_trigger_owner(const char *obj_name, Oid role_oid) +{ + char* role_name = GetUserNameFromId(role_oid, false); + + alter_role_super(role_name, true); + + AlterEventTriggerOwner(obj_name, role_oid); + CommandCounterIncrement(); + + alter_role_super(role_name, false); +} + + +/* + * Neon processing of the CREATE EVENT TRIGGER requires special attention and + * is worth having its own ProcessUtility_hook for that. + */ +static void +ProcessCreateEventTrigger( + PlannedStmt *pstmt, + const char *queryString, + bool readOnlyTree, + ProcessUtilityContext context, + ParamListInfo params, + QueryEnvironment *queryEnv, + DestReceiver *dest, + QueryCompletion *qc) +{ + Node *parseTree = pstmt->utilityStmt; + bool sudo = false; + + /* We double-check that after local variable declaration block */ + CreateEventTrigStmt *stmt = (CreateEventTrigStmt *) parseTree; + + /* + * We are going to change the current user privileges (sudo) and might + * need after execution cleanup. For that we want to capture the UserId + * before changing it for our sudo implementation. + */ + const Oid current_user_id = GetUserId(); + bool current_user_is_super = superuser_arg(current_user_id); + + if (nodeTag(parseTree) != T_CreateEventTrigStmt) + { + ereport(ERROR, + errcode(ERRCODE_INTERNAL_ERROR), + errmsg("ProcessCreateEventTrigger called for the wrong command")); + } + + /* + * Allow neon_superuser to create Event Trigger, while keeping the + * ownership of the object. + * + * For that we give superuser membership to the role for the execution of + * the command. + */ + if (IsTransactionState() && is_neon_superuser()) + { + /* Find the Event Trigger function Oid */ + Oid func_oid = LookupFuncName(stmt->funcname, 0, NULL, false); + + /* Find the Function Owner Oid */ + Oid func_owner = InvalidOid; + bool is_secdef = false; + bool function_is_owned_by_super = false; + + LookupFuncOwnerSecDef(func_oid, &func_owner, &is_secdef); + + function_is_owned_by_super = superuser_arg(func_owner); + + if(!current_user_is_super && function_is_owned_by_super) + { + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("Permission denied to execute " + "a function owned by a superuser role"), + errdetail("current user \"%s\" is not a superuser " + "and Event Trigger function \"%s\" " + "is owned by a superuser", + GetUserNameFromId(current_user_id, false), + NameListToString(stmt->funcname)))); + } + + if(current_user_is_super && !function_is_owned_by_super) + { + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("Permission denied to execute " + "a function owned by a non-superuser role"), + errdetail("current user \"%s\" is a superuser " + "and function \"%s\" is " + "owned by a non-superuser", + GetUserNameFromId(current_user_id, false), + NameListToString(stmt->funcname)))); + } + + sudo = switch_to_superuser(); + } + + PG_TRY(); + { + if (PreviousProcessUtilityHook) + { + PreviousProcessUtilityHook( + pstmt, + queryString, + readOnlyTree, + context, + params, + queryEnv, + dest, + qc); + } + else + { + standard_ProcessUtility( + pstmt, + queryString, + readOnlyTree, + context, + params, + queryEnv, + dest, + qc); + } + + /* + * Now that the Event Trigger has been installed via our sudo + * mechanism, if the original role was not a superuser then change + * the event trigger ownership back to the original role. + * + * That way [ ALTER | DROP ] EVENT TRIGGER commands just work. 
+ */ + if (IsTransactionState() && is_neon_superuser()) + { + if (!current_user_is_super) + { + /* + * Change event trigger owner to the current role (making + * it a privileged role during the ALTER OWNER command). + */ + alter_event_trigger_owner(stmt->trigname, current_user_id); + } + } + } + PG_FINALLY(); + { + if (sudo) + switch_to_original_role(); + } + PG_END_TRY(); +} + + +/* + * Neon hooks for DDLs (handling privileges, limiting features, etc). + */ static void NeonProcessUtility( PlannedStmt *pstmt, @@ -795,6 +1267,27 @@ NeonProcessUtility( { Node *parseTree = pstmt->utilityStmt; + /* + * The process utility hook for CREATE EVENT TRIGGER is its own + * implementation and warrant being addressed separately from here. + */ + if (nodeTag(parseTree) == T_CreateEventTrigStmt) + { + ProcessCreateEventTrigger( + pstmt, + queryString, + readOnlyTree, + context, + params, + queryEnv, + dest, + qc); + return; + } + + /* + * Other commands that need Neon specific implementations are handled here: + */ switch (nodeTag(parseTree)) { case T_CreatedbStmt: @@ -833,37 +1326,82 @@ NeonProcessUtility( if (PreviousProcessUtilityHook) { PreviousProcessUtilityHook( - pstmt, - queryString, - readOnlyTree, - context, - params, - queryEnv, - dest, - qc); + pstmt, + queryString, + readOnlyTree, + context, + params, + queryEnv, + dest, + qc); } else { standard_ProcessUtility( - pstmt, - queryString, - readOnlyTree, - context, - params, - queryEnv, - dest, - qc); + pstmt, + queryString, + readOnlyTree, + context, + params, + queryEnv, + dest, + qc); } } +/* + * Only neon_superuser is granted privilege to edit neon.event_triggers GUC. + */ +static void +neon_event_triggers_assign_hook(bool newval, void *extra) +{ + /* MyDatabaseId == InvalidOid || !OidIsValid(GetUserId()) */ + + if (IsTransactionState() && !is_neon_superuser()) + { + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied to set neon.event_triggers"), + errdetail("Only \"neon_superuser\" is allowed to set the GUC"))); + } +} + + void -InitControlPlaneConnector() +InitDDLHandler() { PreviousProcessUtilityHook = ProcessUtility_hook; ProcessUtility_hook = NeonProcessUtility; + + next_needs_fmgr_hook = needs_fmgr_hook; + needs_fmgr_hook = neon_needs_fmgr_hook; + + next_fmgr_hook = fmgr_hook; + fmgr_hook = neon_fmgr_hook; + RegisterXactCallback(NeonXactCallback, NULL); RegisterSubXactCallback(NeonSubXactCallback, NULL); + /* + * The GUC neon.event_triggers should provide the same effect as the + * Postgres GUC event_triggers, but the neon one is PGC_USERSET. + * + * This allows using the GUC in the connection string and work out of a + * LOGIN Event Trigger that would break database access, all without + * having to edit and reload the Postgres configuration file. 
+ */ + DefineCustomBoolVariable( + "neon.event_triggers", + "Enable firing of event triggers", + NULL, + &neon_event_triggers, + true, + PGC_USERSET, + 0, + NULL, + neon_event_triggers_assign_hook, + NULL); + DefineCustomStringVariable( "neon.console_url", "URL of the Neon Console, which will be forwarded changes to dbs and roles", diff --git a/pgxn/neon/neon_ddl_handler.h b/pgxn/neon/neon_ddl_handler.h new file mode 100644 index 0000000000..de18ed3d82 --- /dev/null +++ b/pgxn/neon/neon_ddl_handler.h @@ -0,0 +1,6 @@ +#ifndef CONTROL_DDL_HANDLER_H +#define CONTROL_DDL_HANDLER_H + +void InitDDLHandler(void); + +#endif diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 474002353b..3695ece66b 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -306,13 +306,7 @@ def test_sql_regress( ) # Connect to postgres and create a database called "regression". - endpoint = env.endpoints.create_start( - "main", - config_lines=[ - # Enable the test mode, so that we don't need to patch the test cases. - "neon.regress_test_mode = true", - ], - ) + endpoint = env.endpoints.create_start("main") endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") # Create some local directories for pg_regress to run in. diff --git a/test_runner/sql_regress/expected/neon-event-triggers.out b/test_runner/sql_regress/expected/neon-event-triggers.out new file mode 100644 index 0000000000..3a62e67316 --- /dev/null +++ b/test_runner/sql_regress/expected/neon-event-triggers.out @@ -0,0 +1,90 @@ +create or replace function admin_proc() + returns event_trigger + language plpgsql as +$$ +begin + raise notice 'admin event trigger is executed for %', current_user; +end; +$$; +create role neon_superuser; +create role neon_admin login inherit createrole createdb in role neon_superuser; +grant create on schema public to neon_admin; +create database neondb with owner neon_admin; +grant all privileges on database neondb to neon_superuser; +create role neon_user; +grant create on schema public to neon_user; +create event trigger on_ddl1 on ddl_command_end +execute procedure admin_proc(); +set role neon_user; +-- check that non-privileged user can not change neon.event_triggers +set neon.event_triggers to false; +ERROR: permission denied to set neon.event_triggers +DETAIL: Only "neon_superuser" is allowed to set the GUC +-- Non-privileged neon user should not be able to create event trigers +create event trigger on_ddl2 on ddl_command_end +execute procedure admin_proc(); +ERROR: permission denied to create event trigger "on_ddl2" +HINT: Must be superuser to create an event trigger. 
+set role neon_admin; +-- neon_superuser should be able to create event trigers +create or replace function neon_proc() + returns event_trigger + language plpgsql as +$$ +begin + raise notice 'neon event trigger is executed for %', current_user; +end; +$$; +NOTICE: admin event trigger is executed for neon_admin +create event trigger on_ddl2 on ddl_command_end +execute procedure neon_proc(); +\c neondb neon_admin +create or replace function neondb_proc() + returns event_trigger + language plpgsql as +$$ +begin + raise notice 'neondb event trigger is executed for %', current_user; +end; +$$; +create or replace function neondb_secdef_proc() + returns event_trigger + language plpgsql + SECURITY DEFINER +as +$$ +begin + raise notice 'neondb secdef event trigger is executed for %', current_user; +end; +$$; +-- neon_admin (neon_superuser member) should be able to create event triggers +create event trigger on_ddl3 on ddl_command_end +execute procedure neondb_proc(); +create event trigger on_ddl4 on ddl_command_end +execute procedure neondb_secdef_proc(); +-- Check that event trigger is fired for neon_admin +create table t1(x integer); +NOTICE: neondb event trigger is executed for neon_admin +NOTICE: neondb secdef event trigger is executed for neon_admin +-- Check that event trigger can be skipped +set neon.event_triggers to false; +create table t2(x integer); +WARNING: Skipping Event Trigger: neon.event_triggers is false +WARNING: Skipping Event Trigger: neon.event_triggers is false +\c regression cloud_admin +-- Check that event triggers are not fired for superuser +create table t3(x integer); +NOTICE: admin event trigger is executed for cloud_admin +WARNING: Skipping Event Trigger +DETAIL: Event Trigger function "neon_proc" is owned by non-superuser role "neon_admin", and current_user "cloud_admin" is superuser +\c neondb cloud_admin +-- Check that user-defined event triggers are not fired for superuser +create table t4(x integer); +WARNING: Skipping Event Trigger +DETAIL: Event Trigger function "neondb_proc" is owned by non-superuser role "neon_admin", and current_user "cloud_admin" is superuser +WARNING: Skipping Event Trigger +DETAIL: Event Trigger function "neondb_secdef_proc" is owned by non-superuser role "neon_admin", and current_user "cloud_admin" is superuser +\c neondb neon_admin +-- Check that neon_admin can drop event triggers +drop event trigger on_ddl3; +drop event trigger on_ddl4; diff --git a/test_runner/sql_regress/parallel_schedule b/test_runner/sql_regress/parallel_schedule index d9508d1c90..d1bd7226ed 100644 --- a/test_runner/sql_regress/parallel_schedule +++ b/test_runner/sql_regress/parallel_schedule @@ -9,3 +9,4 @@ test: neon-rel-truncate test: neon-clog test: neon-test-utils test: neon-vacuum-full +test: neon-event-triggers diff --git a/test_runner/sql_regress/sql/neon-event-triggers.sql b/test_runner/sql_regress/sql/neon-event-triggers.sql new file mode 100644 index 0000000000..75365455dc --- /dev/null +++ b/test_runner/sql_regress/sql/neon-event-triggers.sql @@ -0,0 +1,96 @@ +create or replace function admin_proc() + returns event_trigger + language plpgsql as +$$ +begin + raise notice 'admin event trigger is executed for %', current_user; +end; +$$; + +create role neon_superuser; +create role neon_admin login inherit createrole createdb in role neon_superuser; +grant create on schema public to neon_admin; +create database neondb with owner neon_admin; +grant all privileges on database neondb to neon_superuser; + +create role neon_user; +grant create on schema public 
to neon_user; + +create event trigger on_ddl1 on ddl_command_end +execute procedure admin_proc(); + +set role neon_user; + +-- check that non-privileged user can not change neon.event_triggers +set neon.event_triggers to false; + +-- Non-privileged neon user should not be able to create event trigers +create event trigger on_ddl2 on ddl_command_end +execute procedure admin_proc(); + +set role neon_admin; + +-- neon_superuser should be able to create event trigers +create or replace function neon_proc() + returns event_trigger + language plpgsql as +$$ +begin + raise notice 'neon event trigger is executed for %', current_user; +end; +$$; + +create event trigger on_ddl2 on ddl_command_end +execute procedure neon_proc(); + +\c neondb neon_admin + +create or replace function neondb_proc() + returns event_trigger + language plpgsql as +$$ +begin + raise notice 'neondb event trigger is executed for %', current_user; +end; +$$; + +create or replace function neondb_secdef_proc() + returns event_trigger + language plpgsql + SECURITY DEFINER +as +$$ +begin + raise notice 'neondb secdef event trigger is executed for %', current_user; +end; +$$; + +-- neon_admin (neon_superuser member) should be able to create event triggers +create event trigger on_ddl3 on ddl_command_end +execute procedure neondb_proc(); + +create event trigger on_ddl4 on ddl_command_end +execute procedure neondb_secdef_proc(); + +-- Check that event trigger is fired for neon_admin +create table t1(x integer); + +-- Check that event trigger can be skipped +set neon.event_triggers to false; +create table t2(x integer); + +\c regression cloud_admin + +-- Check that event triggers are not fired for superuser +create table t3(x integer); + +\c neondb cloud_admin + +-- Check that user-defined event triggers are not fired for superuser +create table t4(x integer); + +\c neondb neon_admin + +-- Check that neon_admin can drop event triggers +drop event trigger on_ddl3; +drop event trigger on_ddl4; From a5cac52e263438dbe952c5771551d3660db94a6d Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 17 Jun 2025 17:35:20 +0100 Subject: [PATCH 062/124] compute-image: add a patch for onnxruntime (#12274) ## Problem The checksum for eigen (a dependency for onnxruntime) has changed which breaks compute image build. ## Summary of changes - Add a patch for onnxruntime which backports changes from https://github.com/microsoft/onnxruntime/commit/f57db79743c4d1a3553aa05cf95bcd10966030e6 (we keep the current version) Ref https://github.com/microsoft/onnxruntime/issues/24861 --- compute/compute-node.Dockerfile | 4 +++- compute/patches/onnxruntime.patch | 15 +++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) create mode 100644 compute/patches/onnxruntime.patch diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 248f52088b..0aa8c5d670 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1144,10 +1144,12 @@ USER root FROM build-deps AS pgrag-src ARG PG_VERSION - WORKDIR /ext-src +COPY compute/patches/onnxruntime.patch . + RUN wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar.gz -O onnxruntime.tar.gz && \ mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . 
&& \
+    patch -p1 < /ext-src/onnxruntime.patch && \
     echo "#nothing to test here" > neon-test.sh
 
 RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.1.2.tar.gz -O pgrag.tar.gz && \
diff --git a/compute/patches/onnxruntime.patch b/compute/patches/onnxruntime.patch
new file mode 100644
index 0000000000..2347547e73
--- /dev/null
+++ b/compute/patches/onnxruntime.patch
@@ -0,0 +1,15 @@
+diff --git a/cmake/deps.txt b/cmake/deps.txt
+index d213b09034..229de2ebf0 100644
+--- a/cmake/deps.txt
++++ b/cmake/deps.txt
+@@ -22,7 +22,9 @@ dlpack;https://github.com/dmlc/dlpack/archive/refs/tags/v0.6.zip;4d565dd2e5b3132
+ # it contains changes on top of 3.4.0 which are required to fix build issues.
+ # Until the 3.4.1 release this is the best option we have.
+ # Issue link: https://gitlab.com/libeigen/eigen/-/issues/2744
+-eigen;https://gitlab.com/libeigen/eigen/-/archive/e7248b26a1ed53fa030c5c459f7ea095dfd276ac/eigen-e7248b26a1ed53fa030c5c459f7ea095dfd276ac.zip;be8be39fdbc6e60e94fa7870b280707069b5b81a
++# Moved to github mirror to avoid gitlab issues.
++# Issue link: https://github.com/bazelbuild/bazel-central-registry/issues/4355
++eigen;https://github.com/eigen-mirror/eigen/archive/e7248b26a1ed53fa030c5c459f7ea095dfd276ac/eigen-e7248b26a1ed53fa030c5c459f7ea095dfd276ac.zip;61418a349000ba7744a3ad03cf5071f22ebf860a
+ flatbuffers;https://github.com/google/flatbuffers/archive/refs/tags/v23.5.26.zip;59422c3b5e573dd192fead2834d25951f1c1670c
+ fp16;https://github.com/Maratyszcza/FP16/archive/0a92994d729ff76a58f692d3028ca1b64b145d91.zip;b985f6985a05a1c03ff1bb71190f66d8f98a1494
+ fxdiv;https://github.com/Maratyszcza/FXdiv/archive/63058eff77e11aa15bf531df5dd34395ec3017c8.zip;a5658f4036402dbca7cebee32be57fb8149811e1
From 7d4f662fbf6d3eccf87ecab9d4e528f85ed099d4 Mon Sep 17 00:00:00 2001
From: Mikhail
Date: Tue, 17 Jun 2025 18:46:35 +0100
Subject: [PATCH 063/124] upgrade default neon version to 1.6 (#12185)

Changes for 1.6 were merged and deployed two months ago
https://github.com/neondatabase/neon/blob/main/pgxn/neon/neon--1.6--1.5.sql.
In order to deploy https://github.com/neondatabase/neon/pull/12183, we need 1.6 to be default, otherwise we can't use prewarm API on read-only replica (`ALTER EXTENSION` won't work) and we need it for promotion --- pgxn/neon/neon.control | 2 +- test_runner/regress/test_lfc_prewarm.py | 7 ++----- test_runner/regress/test_neon_extension.py | 8 ++++---- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/pgxn/neon/neon.control b/pgxn/neon/neon.control index af69116e21..51193f63c8 100644 --- a/pgxn/neon/neon.control +++ b/pgxn/neon/neon.control @@ -1,6 +1,6 @@ # neon extension comment = 'cloud storage for PostgreSQL' -default_version = '1.5' +default_version = '1.6' module_pathname = '$libdir/neon' relocatable = true trusted = true diff --git a/test_runner/regress/test_lfc_prewarm.py b/test_runner/regress/test_lfc_prewarm.py index 40a9b29296..e1058cd644 100644 --- a/test_runner/regress/test_lfc_prewarm.py +++ b/test_runner/regress/test_lfc_prewarm.py @@ -59,7 +59,7 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, query: LfcQueryMethod): pg_conn = endpoint.connect() pg_cur = pg_conn.cursor() - pg_cur.execute("create extension neon version '1.6'") + pg_cur.execute("create extension neon") pg_cur.execute("create database lfc") lfc_conn = endpoint.connect(dbname="lfc") @@ -84,11 +84,8 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, query: LfcQueryMethod): endpoint.stop() endpoint.start() - # wait until compute_ctl completes downgrade of extension to default version - time.sleep(1) pg_conn = endpoint.connect() pg_cur = pg_conn.cursor() - pg_cur.execute("alter extension neon update to '1.6'") lfc_conn = endpoint.connect(dbname="lfc") lfc_cur = lfc_conn.cursor() @@ -144,7 +141,7 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMet pg_conn = endpoint.connect() pg_cur = pg_conn.cursor() - pg_cur.execute("create extension neon version '1.6'") + pg_cur.execute("create extension neon") pg_cur.execute("CREATE DATABASE lfc") lfc_conn = endpoint.connect(dbname="lfc") diff --git a/test_runner/regress/test_neon_extension.py b/test_runner/regress/test_neon_extension.py index e79ab458ca..6bcd15d463 100644 --- a/test_runner/regress/test_neon_extension.py +++ b/test_runner/regress/test_neon_extension.py @@ -29,7 +29,7 @@ def test_neon_extension(neon_env_builder: NeonEnvBuilder): # IMPORTANT: # If the version has changed, the test should be updated. # Ensure that the default version is also updated in the neon.control file - assert cur.fetchone() == ("1.5",) + assert cur.fetchone() == ("1.6",) cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE") res = cur.fetchall() log.info(res) @@ -53,10 +53,10 @@ def test_neon_extension_compatibility(neon_env_builder: NeonEnvBuilder): # IMPORTANT: # If the version has changed, the test should be updated. 
# Ensure that the default version is also updated in the neon.control file - assert cur.fetchone() == ("1.5",) + assert cur.fetchone() == ("1.6",) cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE") - all_versions = ["1.5", "1.4", "1.3", "1.2", "1.1", "1.0"] - current_version = "1.5" + all_versions = ["1.6", "1.5", "1.4", "1.3", "1.2", "1.1", "1.0"] + current_version = "1.6" for idx, begin_version in enumerate(all_versions): for target_version in all_versions[idx + 1 :]: if current_version != begin_version: From 83069f6ca10de1bc66bfb71fe870811138adfb22 Mon Sep 17 00:00:00 2001 From: Suhas Thalanki <54014218+thesuhas@users.noreply.github.com> Date: Tue, 17 Jun 2025 15:56:05 -0700 Subject: [PATCH 064/124] fix: terminate pgbouncer on compute suspend (#12153) ## Problem PgBouncer does not terminate connections on a suspend: https://github.com/neondatabase/cloud/issues/16282 ## Summary of changes 1. Adds a pid file to store the pid of PgBouncer 2. Terminates connections on a compute suspend --------- Co-authored-by: Alexey Kondratov --- compute/etc/pgbouncer.ini | 2 + compute_tools/src/bin/compute_ctl.rs | 14 +++-- compute_tools/src/compute.rs | 60 ++++++++++++++++++- compute_tools/src/http/routes/terminate.rs | 2 +- compute_tools/src/lib.rs | 1 + compute_tools/src/pgbouncer.rs | 1 + control_plane/src/bin/neon_local.rs | 8 +++ control_plane/src/endpoint.rs | 5 ++ .../compute_wrapper/shell/compute.sh | 1 + test_runner/fixtures/neon_cli.py | 3 + 10 files changed, 90 insertions(+), 7 deletions(-) create mode 100644 compute_tools/src/pgbouncer.rs diff --git a/compute/etc/pgbouncer.ini b/compute/etc/pgbouncer.ini index 9d68cbb8d5..fbcdfd4a87 100644 --- a/compute/etc/pgbouncer.ini +++ b/compute/etc/pgbouncer.ini @@ -21,6 +21,8 @@ unix_socket_dir=/tmp/ unix_socket_mode=0777 ; required for pgbouncer_exporter ignore_startup_parameters=extra_float_digits +; pidfile for graceful termination +pidfile=/tmp/pgbouncer.pid ;; Disable connection logging. It produces a lot of logs that no one looks at, ;; and we can get similar log entries from the proxy too. We had incidents in diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 8b502a058e..d7ff381f1b 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -124,6 +124,10 @@ struct Cli { /// Interval in seconds for collecting installed extensions statistics #[arg(long, default_value = "3600")] pub installed_extensions_collection_interval: u64, + + /// Run in development mode, skipping VM-specific operations like process termination + #[arg(long, action = clap::ArgAction::SetTrue)] + pub dev: bool, } impl Cli { @@ -159,7 +163,7 @@ fn main() -> Result<()> { .build()?; let _rt_guard = runtime.enter(); - runtime.block_on(init())?; + runtime.block_on(init(cli.dev))?; // enable core dumping for all child processes setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?; @@ -198,13 +202,13 @@ fn main() -> Result<()> { deinit_and_exit(exit_code); } -async fn init() -> Result<()> { +async fn init(dev_mode: bool) -> Result<()> { init_tracing_and_logging(DEFAULT_LOG_LEVEL).await?; let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?; thread::spawn(move || { for sig in signals.forever() { - handle_exit_signal(sig); + handle_exit_signal(sig, dev_mode); } }); @@ -263,9 +267,9 @@ fn deinit_and_exit(exit_code: Option) -> ! { /// When compute_ctl is killed, send also termination signal to sync-safekeepers /// to prevent leakage. 
TODO: it is better to convert compute_ctl to async and /// wait for termination which would be easy then. -fn handle_exit_signal(sig: i32) { +fn handle_exit_signal(sig: i32, dev_mode: bool) { info!("received {sig} termination signal"); - forward_termination_signal(); + forward_termination_signal(dev_mode); exit(1); } diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 50c254224a..c591d9711a 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -35,6 +35,7 @@ use url::Url; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; use utils::measured_stream::MeasuredReader; +use utils::pid_file; use crate::configurator::launch_configurator; use crate::disk_quota::set_disk_quota; @@ -44,6 +45,7 @@ use crate::lsn_lease::launch_lsn_lease_bg_task_for_static; use crate::metrics::COMPUTE_CTL_UP; use crate::monitor::launch_monitor; use crate::pg_helpers::*; +use crate::pgbouncer::*; use crate::rsyslog::{ PostgresLogsRsyslogConfig, configure_audit_rsyslog, configure_postgres_logs_export, launch_pgaudit_gc, @@ -2246,12 +2248,68 @@ pub async fn installed_extensions(conf: tokio_postgres::Config) -> Result<()> { Ok(()) } -pub fn forward_termination_signal() { +pub fn forward_termination_signal(dev_mode: bool) { let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst); if ss_pid != 0 { let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32); kill(ss_pid, Signal::SIGTERM).ok(); } + + if !dev_mode { + info!("not in dev mode, terminating pgbouncer"); + + // Terminate pgbouncer with SIGKILL + match pid_file::read(PGBOUNCER_PIDFILE.into()) { + Ok(pid_file::PidFileRead::LockedByOtherProcess(pid)) => { + info!("sending SIGKILL to pgbouncer process pid: {}", pid); + if let Err(e) = kill(pid, Signal::SIGKILL) { + error!("failed to terminate pgbouncer: {}", e); + } + } + // pgbouncer does not lock the pid file, so we read and kill the process directly + Ok(pid_file::PidFileRead::NotHeldByAnyProcess(_)) => { + if let Ok(pid_str) = std::fs::read_to_string(PGBOUNCER_PIDFILE) { + if let Ok(pid) = pid_str.trim().parse::() { + info!( + "sending SIGKILL to pgbouncer process pid: {} (from unlocked pid file)", + pid + ); + if let Err(e) = kill(Pid::from_raw(pid), Signal::SIGKILL) { + error!("failed to terminate pgbouncer: {}", e); + } + } + } else { + info!("pgbouncer pid file exists but process not running"); + } + } + Ok(pid_file::PidFileRead::NotExist) => { + info!("pgbouncer pid file not found, process may not be running"); + } + Err(e) => { + error!("error reading pgbouncer pid file: {}", e); + } + } + } + + // Terminate local_proxy + match pid_file::read("/etc/local_proxy/pid".into()) { + Ok(pid_file::PidFileRead::LockedByOtherProcess(pid)) => { + info!("sending SIGTERM to local_proxy process pid: {}", pid); + if let Err(e) = kill(pid, Signal::SIGTERM) { + error!("failed to terminate local_proxy: {}", e); + } + } + Ok(pid_file::PidFileRead::NotHeldByAnyProcess(_)) => { + info!("local_proxy PID file exists but process not running"); + } + Ok(pid_file::PidFileRead::NotExist) => { + info!("local_proxy PID file not found, process may not be running"); + } + Err(e) => { + error!("error reading local_proxy PID file: {}", e); + } + } + let pg_pid = PG_PID.load(Ordering::SeqCst); if pg_pid != 0 { let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32); diff --git a/compute_tools/src/http/routes/terminate.rs b/compute_tools/src/http/routes/terminate.rs index 2c24d4ad6b..92a89c0ee7 100644 --- a/compute_tools/src/http/routes/terminate.rs +++ 
b/compute_tools/src/http/routes/terminate.rs @@ -26,7 +26,7 @@ pub(in crate::http) async fn terminate(State(compute): State>) drop(state); } - forward_termination_signal(); + forward_termination_signal(false); info!("sent signal and notified waiters"); // Spawn a blocking thread to wait for compute to become Terminated. diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index 7218067a8a..3899a1ca76 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -22,6 +22,7 @@ mod migration; pub mod monitor; pub mod params; pub mod pg_helpers; +pub mod pgbouncer; pub mod rsyslog; pub mod spec; mod spec_apply; diff --git a/compute_tools/src/pgbouncer.rs b/compute_tools/src/pgbouncer.rs new file mode 100644 index 0000000000..189dfabac9 --- /dev/null +++ b/compute_tools/src/pgbouncer.rs @@ -0,0 +1 @@ +pub const PGBOUNCER_PIDFILE: &str = "/tmp/pgbouncer.pid"; diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 01ca28fce0..aeabf4a519 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -672,6 +672,13 @@ struct EndpointStartCmdArgs { #[clap(short = 't', long, value_parser= humantime::parse_duration, help = "timeout until we fail the command")] #[arg(default_value = "90s")] start_timeout: Duration, + + #[clap( + long, + help = "Run in development mode, skipping VM-specific operations like process termination", + action = clap::ArgAction::SetTrue + )] + dev: bool, } #[derive(clap::Args)] @@ -1590,6 +1597,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res stripe_size.0 as usize, args.create_test_user, args.start_timeout, + args.dev, ) .await?; } diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 03156c1809..2df71df57d 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -691,6 +691,7 @@ impl Endpoint { shard_stripe_size: usize, create_test_user: bool, start_timeout: Duration, + dev: bool, ) -> Result<()> { if self.status() == EndpointStatus::Running { anyhow::bail!("The endpoint is already running"); @@ -861,6 +862,10 @@ impl Endpoint { cmd.args(["--remote-ext-base-url", remote_ext_base_url]); } + if dev { + cmd.arg("--dev"); + } + let child = cmd.spawn()?; // set up a scopeguard to kill & wait for the child in case we panic or bail below let child = scopeguard::guard(child, |mut child| { diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh index c8ca812bf9..1e62e91fd0 100755 --- a/docker-compose/compute_wrapper/shell/compute.sh +++ b/docker-compose/compute_wrapper/shell/compute.sh @@ -95,3 +95,4 @@ echo "Start compute node" -b /usr/local/bin/postgres \ --compute-id "compute-${RANDOM}" \ --config "${CONFIG_FILE}" + --dev diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index bb07e2b6d1..48a1a36e66 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -564,6 +564,7 @@ class NeonLocalCli(AbstractNeonCli): basebackup_request_tries: int | None = None, timeout: str | None = None, env: dict[str, str] | None = None, + dev: bool = False, ) -> subprocess.CompletedProcess[str]: args = [ "endpoint", @@ -589,6 +590,8 @@ class NeonLocalCli(AbstractNeonCli): args.extend(["--create-test-user"]) if timeout is not None: args.extend(["--start-timeout", str(timeout)]) + if dev: + args.extend(["--dev"]) res = self.raw_cli(args, extra_env_vars) res.check_returncode() From 
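As a footnote to the termination flow above: the pattern is simply "read the pid file if it
exists, parse the pid, signal the process, and treat a missing file as already stopped". The
sketch below models this with the standard library only and shells out to the external `kill`
binary; the real compute_ctl code uses the shared `utils::pid_file` helper and
`nix::sys::signal::kill`, as shown in the diff, and the two paths (`/tmp/pgbouncer.pid`,
`/etc/local_proxy/pid`) are the ones from the patch.

```rust
// Std-only sketch of "terminate whatever the pid file points at, and treat a
// missing pid file as already stopped". The paths match the patch above; the
// real compute_ctl code uses the `utils::pid_file` helper and
// `nix::sys::signal::kill` instead of shelling out to `kill`.

use std::io::ErrorKind;
use std::path::Path;
use std::process::Command;

fn kill_from_pidfile(pidfile: &Path, signal: &str) -> std::io::Result<()> {
    let contents = match std::fs::read_to_string(pidfile) {
        Ok(s) => s,
        Err(e) if e.kind() == ErrorKind::NotFound => {
            // Not an error during shutdown: the process simply is not running.
            eprintln!("{} not found, nothing to terminate", pidfile.display());
            return Ok(());
        }
        Err(e) => return Err(e),
    };

    let pid: i32 = contents
        .trim()
        .parse()
        .map_err(|e| std::io::Error::new(ErrorKind::InvalidData, format!("bad pid: {e}")))?;

    // Shelling out keeps this sketch dependency-free; production code sends the
    // signal directly (see the nix-based calls in the diff above).
    let status = Command::new("kill")
        .arg(format!("-{signal}"))
        .arg(pid.to_string())
        .status()?;
    if !status.success() {
        eprintln!("kill -{signal} {pid} exited with {status}");
    }
    Ok(())
}

fn main() -> std::io::Result<()> {
    kill_from_pidfile(Path::new("/tmp/pgbouncer.pid"), "KILL")?;
    kill_from_pidfile(Path::new("/etc/local_proxy/pid"), "TERM")?;
    Ok(())
}
```

Treating a missing pid file as success is deliberate: during a compute suspend the goal is only
to guarantee the sidecar processes are gone, not to report whether they were ever started.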
04013929cb0a50a7fd70a7b1a82c59128a207f9f Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 18 Jun 2025 08:48:39 +0200 Subject: [PATCH 065/124] pageserver: support full gRPC basebackups (#12269) ## Problem Full basebackups are used in tests, and may be useful for debugging as well, so we should support them in the gRPC API. Touches #11728. ## Summary of changes Add `GetBaseBackupRequest::full` to generate full base backups. The libpq implementation also allows specifying `prev_lsn` for full backups, i.e. the end LSN of the previous WAL record. This is omitted in the gRPC API, since it's not used by any tests, and presumably of limited value since it's autodetected. We can add it later if we find that we need it. --- pageserver/page_api/proto/page_service.proto | 2 ++ pageserver/page_api/src/model.rs | 4 ++++ pageserver/src/page_service.rs | 5 +---- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pageserver/page_api/proto/page_service.proto b/pageserver/page_api/proto/page_service.proto index 7d01dec4ab..81953a710f 100644 --- a/pageserver/page_api/proto/page_service.proto +++ b/pageserver/page_api/proto/page_service.proto @@ -108,6 +108,8 @@ message GetBaseBackupRequest { uint64 lsn = 1; // If true, logical replication slots will not be created. bool replica = 2; + // If true, include relation files in the base backup. Mainly for debugging and tests. + bool full = 3; } // Base backup response chunk, returned as an ordered stream. diff --git a/pageserver/page_api/src/model.rs b/pageserver/page_api/src/model.rs index a01bba0572..ef7f89473f 100644 --- a/pageserver/page_api/src/model.rs +++ b/pageserver/page_api/src/model.rs @@ -189,6 +189,8 @@ pub struct GetBaseBackupRequest { pub lsn: Option, /// If true, logical replication slots will not be created. pub replica: bool, + /// If true, include relation files in the base backup. Mainly for debugging and tests. + pub full: bool, } impl From for GetBaseBackupRequest { @@ -196,6 +198,7 @@ impl From for GetBaseBackupRequest { Self { lsn: (pb.lsn != 0).then_some(Lsn(pb.lsn)), replica: pb.replica, + full: pb.full, } } } @@ -205,6 +208,7 @@ impl From for proto::GetBaseBackupRequest { Self { lsn: request.lsn.unwrap_or_default().0, replica: request.replica, + full: request.full, } } } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 79c4c0faa9..ff435451c3 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -3572,9 +3572,6 @@ impl proto::PageService for GrpcPageServiceHandler { } // Spawn a task to run the basebackup. - // - // TODO: do we need to support full base backups, for debugging? This also requires passing - // the prev_lsn parameter. let span = Span::current(); let (mut simplex_read, mut simplex_write) = tokio::io::simplex(CHUNK_SIZE); let jh = tokio::spawn(async move { @@ -3583,7 +3580,7 @@ impl proto::PageService for GrpcPageServiceHandler { &timeline, req.lsn, None, - false, + req.full, req.replica, &ctx, ) From 3af6b3a2bf0e2f656cbc936580e49bd19ded5226 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 18 Jun 2025 12:49:42 +0300 Subject: [PATCH 066/124] Avoid redownloading rust toolchain on Postgres changes (#12265) Create a separate stage for downloading the Rust toolchain for pgrx, so that it can be cached independently of the pg-build layer. Before this, the 'pg-build-nonroot=with-cargo' layer was unnecessarily rebuilt every time there was a change in PostgreSQL sources. 
Furthermore, this allows using the same cached layer for building the compute images of all Postgres versions. --- compute/compute-node.Dockerfile | 37 +++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 0aa8c5d670..685ac564b7 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -149,8 +149,10 @@ RUN case $DEBIAN_VERSION in \ ninja-build git autoconf automake libtool build-essential bison flex libreadline-dev \ zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget ca-certificates pkg-config libssl-dev \ libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd curl unzip g++ \ + libclang-dev \ $VERSION_INSTALLS \ - && apt clean && rm -rf /var/lib/apt/lists/* + && apt clean && rm -rf /var/lib/apt/lists/* && \ + useradd -ms /bin/bash nonroot -b /home ######################################################################################### # @@ -1057,17 +1059,10 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) && \ ######################################################################################### # -# Layer "pg build with nonroot user and cargo installed" -# This layer is base and common for layers with `pgrx` +# Layer "build-deps with Rust toolchain installed" # ######################################################################################### -FROM pg-build AS pg-build-nonroot-with-cargo -ARG PG_VERSION - -RUN apt update && \ - apt install --no-install-recommends --no-install-suggests -y curl libclang-dev && \ - apt clean && rm -rf /var/lib/apt/lists/* && \ - useradd -ms /bin/bash nonroot -b /home +FROM build-deps AS build-deps-with-cargo ENV HOME=/home/nonroot ENV PATH="/home/nonroot/.cargo/bin:$PATH" @@ -1082,13 +1077,29 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ rm rustup-init +######################################################################################### +# +# Layer "pg-build with Rust toolchain installed" +# This layer is base and common for layers with `pgrx` +# +######################################################################################### +FROM pg-build AS pg-build-with-cargo +ARG PG_VERSION + +ENV HOME=/home/nonroot +ENV PATH="/home/nonroot/.cargo/bin:$PATH" +USER nonroot +WORKDIR /home/nonroot + +COPY --from=build-deps-with-cargo /home/nonroot /home/nonroot + ######################################################################################### # # Layer "rust extensions" # This layer is used to build `pgrx` deps # ######################################################################################### -FROM pg-build-nonroot-with-cargo AS rust-extensions-build +FROM pg-build-with-cargo AS rust-extensions-build ARG PG_VERSION RUN case "${PG_VERSION:?}" in \ @@ -1110,7 +1121,7 @@ USER root # and eventually get merged with `rust-extensions-build` # ######################################################################################### -FROM pg-build-nonroot-with-cargo AS rust-extensions-build-pgrx12 +FROM pg-build-with-cargo AS rust-extensions-build-pgrx12 ARG PG_VERSION RUN cargo install --locked --version 0.12.9 cargo-pgrx && \ @@ -1127,7 +1138,7 @@ USER root # and eventually get merged with `rust-extensions-build` # ######################################################################################### -FROM pg-build-nonroot-with-cargo AS rust-extensions-build-pgrx14 
+FROM pg-build-with-cargo AS rust-extensions-build-pgrx14 ARG PG_VERSION RUN cargo install --locked --version 0.14.1 cargo-pgrx && \ From 67fbc0582e5a55a8b40e24d3719278e5227207eb Mon Sep 17 00:00:00 2001 From: Dimitri Fontaine Date: Wed, 18 Jun 2025 12:01:05 +0200 Subject: [PATCH 067/124] Validate safekeeper_connstrings when parsing compute specs. (#11906) This check API only checks the safekeeper_connstrings at the moment, and the validation is limited to checking that there is at least one entry and no duplicates. ## Problem If the compute_ctl service is started with an empty list of safekeepers, then hard-to-debug errors may happen at runtime, whereas it would be much easier to catch them early. ## Summary of changes Add an entry point in the compute_ctl API to validate the configuration for safekeeper_connstrings. --------- Co-authored-by: Heikki Linnakangas --- compute_tools/src/compute.rs | 67 ++++- compute_tools/tests/README.md | 6 + compute_tools/tests/cluster_spec.json | 245 ++++++++++++++++++ .../regress/test_compute_reconfigure.py | 56 ++++ 4 files changed, 372 insertions(+), 2 deletions(-) create mode 100644 compute_tools/tests/README.md create mode 100644 compute_tools/tests/cluster_spec.json diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index c591d9711a..9744cc2dac 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -217,6 +217,46 @@ pub struct ParsedSpec { pub endpoint_storage_token: Option, } +impl ParsedSpec { + pub fn validate(&self) -> Result<(), String> { + // Only Primary nodes use safekeeper_connstrings, and at the moment + // this method only validates that part of the specs. + if self.spec.mode != ComputeMode::Primary { + return Ok(()); + } + + // While it seems like a good idea to check for an odd number of entries in + // the safekeepers connection string, changes to the list of safekeepers might + // involve appending a new server to a list of 3, in which case a list of 4 + // entries is okay in production. + // + // Still, we want unique entries, and at least one entry in the vector. + if self.safekeeper_connstrings.is_empty() { + return Err(String::from("safekeeper_connstrings is empty")); + } + + // check for uniqueness of the connection strings in the set + let mut connstrings = self.safekeeper_connstrings.clone(); + + connstrings.sort(); + let mut previous = &connstrings[0]; + + for current in connstrings.iter().skip(1) { + // duplicate entry?
+ if current == previous { + return Err(format!( + "duplicate entry in safekeeper_connstrings: {}!", + current, + )); + } + + previous = current; + } + + Ok(()) + } +} + impl TryFrom for ParsedSpec { type Error = String; fn try_from(spec: ComputeSpec) -> Result { @@ -246,6 +286,7 @@ impl TryFrom for ParsedSpec { } else { spec.safekeeper_connstrings.clone() }; + let storage_auth_token = spec.storage_auth_token.clone(); let tenant_id: TenantId = if let Some(tenant_id) = spec.tenant_id { tenant_id @@ -280,7 +321,7 @@ impl TryFrom for ParsedSpec { .clone() .or_else(|| spec.cluster.settings.find("neon.endpoint_storage_token")); - Ok(ParsedSpec { + let res = ParsedSpec { spec, pageserver_connstr, safekeeper_connstrings, @@ -289,7 +330,11 @@ impl TryFrom for ParsedSpec { timeline_id, endpoint_storage_addr, endpoint_storage_token, - }) + }; + + // Now check validity of the parsed specification + res.validate()?; + Ok(res) } } @@ -2342,3 +2387,21 @@ impl JoinSetExt for tokio::task::JoinSet { }) } } + +#[cfg(test)] +mod tests { + use std::fs::File; + + use super::*; + + #[test] + fn duplicate_safekeeper_connstring() { + let file = File::open("tests/cluster_spec.json").unwrap(); + let spec: ComputeSpec = serde_json::from_reader(file).unwrap(); + + match ParsedSpec::try_from(spec.clone()) { + Ok(_p) => panic!("Failed to detect duplicate entry"), + Err(e) => assert!(e.starts_with("duplicate entry in safekeeper_connstrings:")), + }; + } +} diff --git a/compute_tools/tests/README.md b/compute_tools/tests/README.md new file mode 100644 index 0000000000..adeb9ef4b6 --- /dev/null +++ b/compute_tools/tests/README.md @@ -0,0 +1,6 @@ +### Test files + +The file `cluster_spec.json` has been copied over from libs/compute_api +tests, with some edits: + + - the neon.safekeepers setting contains a duplicate value diff --git a/compute_tools/tests/cluster_spec.json b/compute_tools/tests/cluster_spec.json new file mode 100644 index 0000000000..5655a94de4 --- /dev/null +++ b/compute_tools/tests/cluster_spec.json @@ -0,0 +1,245 @@ +{ + "format_version": 1.0, + + "timestamp": "2021-05-23T18:25:43.511Z", + "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8b", + + "cluster": { + "cluster_id": "test-cluster-42", + "name": "Zenith Test", + "state": "restarted", + "roles": [ + { + "name": "postgres", + "encrypted_password": "6b1d16b78004bbd51fa06af9eda75972", + "options": null + }, + { + "name": "alexk", + "encrypted_password": null, + "options": null + }, + { + "name": "zenith \"new\"", + "encrypted_password": "5b1d16b78004bbd51fa06af9eda75972", + "options": null + }, + { + "name": "zen", + "encrypted_password": "9b1d16b78004bbd51fa06af9eda75972" + }, + { + "name": "\"name\";\\n select 1;", + "encrypted_password": "5b1d16b78004bbd51fa06af9eda75972" + }, + { + "name": "MyRole", + "encrypted_password": "5b1d16b78004bbd51fa06af9eda75972" + } + ], + "databases": [ + { + "name": "DB2", + "owner": "alexk", + "options": [ + { + "name": "LC_COLLATE", + "value": "C", + "vartype": "string" + }, + { + "name": "LC_CTYPE", + "value": "C", + "vartype": "string" + }, + { + "name": "TEMPLATE", + "value": "template0", + "vartype": "enum" + } + ] + }, + { + "name": "zenith", + "owner": "MyRole" + }, + { + "name": "zen", + "owner": "zen" + } + ], + "settings": [ + { + "name": "fsync", + "value": "off", + "vartype": "bool" + }, + { + "name": "wal_level", + "value": "logical", + "vartype": "enum" + }, + { + "name": "hot_standby", + "value": "on", + "vartype": "bool" + }, + { + "name": "prewarm_lfc_on_startup", + "value": "off", + "vartype": 
"bool" + }, + { + "name": "neon.safekeepers", + "value": "127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501,127.0.0.1:6502", + "vartype": "string" + }, + { + "name": "wal_log_hints", + "value": "on", + "vartype": "bool" + }, + { + "name": "log_connections", + "value": "on", + "vartype": "bool" + }, + { + "name": "shared_buffers", + "value": "32768", + "vartype": "integer" + }, + { + "name": "port", + "value": "55432", + "vartype": "integer" + }, + { + "name": "max_connections", + "value": "100", + "vartype": "integer" + }, + { + "name": "max_wal_senders", + "value": "10", + "vartype": "integer" + }, + { + "name": "listen_addresses", + "value": "0.0.0.0", + "vartype": "string" + }, + { + "name": "wal_sender_timeout", + "value": "0", + "vartype": "integer" + }, + { + "name": "password_encryption", + "value": "md5", + "vartype": "enum" + }, + { + "name": "maintenance_work_mem", + "value": "65536", + "vartype": "integer" + }, + { + "name": "max_parallel_workers", + "value": "8", + "vartype": "integer" + }, + { + "name": "max_worker_processes", + "value": "8", + "vartype": "integer" + }, + { + "name": "neon.tenant_id", + "value": "b0554b632bd4d547a63b86c3630317e8", + "vartype": "string" + }, + { + "name": "max_replication_slots", + "value": "10", + "vartype": "integer" + }, + { + "name": "neon.timeline_id", + "value": "2414a61ffc94e428f14b5758fe308e13", + "vartype": "string" + }, + { + "name": "shared_preload_libraries", + "value": "neon", + "vartype": "string" + }, + { + "name": "synchronous_standby_names", + "value": "walproposer", + "vartype": "string" + }, + { + "name": "neon.pageserver_connstring", + "value": "host=127.0.0.1 port=6400", + "vartype": "string" + }, + { + "name": "test.escaping", + "value": "here's a backslash \\ and a quote ' and a double-quote \" hooray", + "vartype": "string" + } + ] + }, + "delta_operations": [ + { + "action": "delete_db", + "name": "zenith_test" + }, + { + "action": "rename_db", + "name": "DB", + "new_name": "DB2" + }, + { + "action": "delete_role", + "name": "zenith2" + }, + { + "action": "rename_role", + "name": "zenith new", + "new_name": "zenith \"new\"" + } + ], + "remote_extensions": { + "library_index": { + "postgis-3": "postgis", + "libpgrouting-3.4": "postgis", + "postgis_raster-3": "postgis", + "postgis_sfcgal-3": "postgis", + "postgis_topology-3": "postgis", + "address_standardizer-3": "postgis" + }, + "extension_data": { + "postgis": { + "archive_path": "5834329303/v15/extensions/postgis.tar.zst", + "control_data": { + "postgis.control": "# postgis extension\ncomment = ''PostGIS geometry and geography spatial types and functions''\ndefault_version = ''3.3.2''\nmodule_pathname = ''$libdir/postgis-3''\nrelocatable = false\ntrusted = true\n", + "pgrouting.control": "# pgRouting Extension\ncomment = ''pgRouting Extension''\ndefault_version = ''3.4.2''\nmodule_pathname = ''$libdir/libpgrouting-3.4''\nrelocatable = true\nrequires = ''plpgsql''\nrequires = ''postgis''\ntrusted = true\n", + "postgis_raster.control": "# postgis_raster extension\ncomment = ''PostGIS raster types and functions''\ndefault_version = ''3.3.2''\nmodule_pathname = ''$libdir/postgis_raster-3''\nrelocatable = false\nrequires = postgis\ntrusted = true\n", + "postgis_sfcgal.control": "# postgis topology extension\ncomment = ''PostGIS SFCGAL functions''\ndefault_version = ''3.3.2''\nrelocatable = true\nrequires = postgis\ntrusted = true\n", + "postgis_topology.control": "# postgis topology extension\ncomment = ''PostGIS topology spatial types and functions''\ndefault_version = 
''3.3.2''\nrelocatable = false\nschema = topology\nrequires = postgis\ntrusted = true\n", + "address_standardizer.control": "# address_standardizer extension\ncomment = ''Used to parse an address into constituent elements. Generally used to support geocoding address normalization step.''\ndefault_version = ''3.3.2''\nrelocatable = true\ntrusted = true\n", + "postgis_tiger_geocoder.control": "# postgis tiger geocoder extension\ncomment = ''PostGIS tiger geocoder and reverse geocoder''\ndefault_version = ''3.3.2''\nrelocatable = false\nschema = tiger\nrequires = ''postgis,fuzzystrmatch''\nsuperuser= false\ntrusted = true\n", + "address_standardizer_data_us.control": "# address standardizer us dataset\ncomment = ''Address Standardizer US dataset example''\ndefault_version = ''3.3.2''\nrelocatable = true\ntrusted = true\n" + } + } + }, + "custom_extensions": [], + "public_extensions": ["postgis"] + }, + "pgbouncer_settings": { + "default_pool_size": "42", + "pool_mode": "session" + } +} diff --git a/test_runner/regress/test_compute_reconfigure.py b/test_runner/regress/test_compute_reconfigure.py index b533d45b1e..cc792333ba 100644 --- a/test_runner/regress/test_compute_reconfigure.py +++ b/test_runner/regress/test_compute_reconfigure.py @@ -9,6 +9,8 @@ from fixtures.utils import wait_until if TYPE_CHECKING: from fixtures.neon_fixtures import NeonEnv +from fixtures.log_helper import log + def test_compute_reconfigure(neon_simple_env: NeonEnv): """ @@ -85,3 +87,57 @@ def test_compute_reconfigure(neon_simple_env: NeonEnv): samples = metrics.query_all("compute_ctl_up", {"build_tag": build_tag}) assert len(samples) == 1 assert samples[0].value == 1 + + +def test_compute_safekeeper_connstrings_duplicate(neon_simple_env: NeonEnv): + """ + Test that we catch duplicate entries in neon.safekeepers. + """ + env = neon_simple_env + + endpoint = env.endpoints.create_start("main") + + # grab the current value of neon.safekeepers + sk_list = [] + with endpoint.cursor() as cursor: + cursor.execute("SHOW neon.safekeepers;") + row = cursor.fetchone() + assert row is not None + + log.info(f' initial neon.safekeepers: "{row}"') + + # build a safekeepers list with a duplicate + sk_list.append(row[0]) + sk_list.append(row[0]) + + safekeepers = ",".join(sk_list) + log.info(f'reconfigure neon.safekeepers: "{safekeepers}"') + + # introduce duplicate entry in neon.safekeepers, on purpose + endpoint.respec_deep( + **{ + "spec": { + "skip_pg_catalog_updates": True, + "cluster": { + "settings": [ + { + "name": "neon.safekeepers", + "vartype": "string", + "value": safekeepers, + } + ] + }, + }, + } + ) + + try: + endpoint.reconfigure() + + # Check that in logs we see that it was actually reconfigured, + # not restarted or something else. + endpoint.log_contains("INFO request{method=POST uri=/configure") + + except Exception as e: + # we expect a failure here + log.info(f"RAISED: {e}") From 5a045e7d52eee5bf39eb24219f6e4782cddb0a4c Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 18 Jun 2025 15:03:14 +0300 Subject: [PATCH 068/124] Move pagestream_api to separate module (#12272) For general readability.
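For code outside the crate, only the import path changes; the pagestream types keep their names and fields. A minimal sketch of what a caller looks like after the move, mirroring the round-trip test that moves along with the code (the wrapper function and its name are illustrative only, not part of this patch):

```rust
use pageserver_api::pagestream_api::{
    PagestreamFeMessage, PagestreamGetPageRequest, PagestreamRequest,
};
use pageserver_api::reltag::RelTag;
use utils::lsn::Lsn;

// Build and serialize a GetPage request, as the test tools do.
fn example_getpage_message() -> bytes::Bytes {
    let msg = PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
        hdr: PagestreamRequest {
            reqid: 0,
            request_lsn: Lsn(4),
            not_modified_since: Lsn(3),
        },
        rel: RelTag {
            forknum: 1,
            spcnode: 2,
            dbnode: 3,
            relnode: 4,
        },
        blkno: 7,
    });
    // PagestreamFeMessage::serialize() always uses protocol version 3.
    msg.serialize()
}
```

Call sites such as pagebench, the page_service client, and pageserver/src/page_service.rs below change only their `use` statements accordingly.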
--- libs/pageserver_api/src/lib.rs | 1 + libs/pageserver_api/src/models.rs | 780 ----------------- libs/pageserver_api/src/pagestream_api.rs | 792 ++++++++++++++++++ pageserver/client/src/page_service.rs | 2 +- .../pagebench/src/cmd/getpage_latest_lsn.rs | 2 +- .../src/bin/test_helper_slow_client_reads.rs | 22 +- pageserver/src/page_service.rs | 26 +- 7 files changed, 819 insertions(+), 806 deletions(-) create mode 100644 libs/pageserver_api/src/pagestream_api.rs diff --git a/libs/pageserver_api/src/lib.rs b/libs/pageserver_api/src/lib.rs index ff705e79cd..6c91d61508 100644 --- a/libs/pageserver_api/src/lib.rs +++ b/libs/pageserver_api/src/lib.rs @@ -5,6 +5,7 @@ pub mod controller_api; pub mod key; pub mod keyspace; pub mod models; +pub mod pagestream_api; pub mod record; pub mod reltag; pub mod shard; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 881f24b86c..2fc32c8f49 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -5,16 +5,12 @@ pub mod utilization; use core::ops::Range; use std::collections::HashMap; use std::fmt::Display; -use std::io::{BufRead, Read}; use std::num::{NonZeroU32, NonZeroU64, NonZeroUsize}; use std::str::FromStr; use std::time::{Duration, SystemTime}; -use byteorder::{BigEndian, ReadBytesExt}; -use bytes::{Buf, BufMut, Bytes, BytesMut}; #[cfg(feature = "testing")] use camino::Utf8PathBuf; -use postgres_ffi::BLCKSZ; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use serde_with::serde_as; pub use utilization::PageserverUtilization; @@ -24,7 +20,6 @@ use utils::{completion, serde_system_time}; use crate::config::Ratio; use crate::key::{CompactKey, Key}; -use crate::reltag::RelTag; use crate::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId}; /// The state of a tenant in this pageserver. @@ -1907,219 +1902,6 @@ pub struct ScanDisposableKeysResponse { pub not_disposable_count: usize, } -// Wrapped in libpq CopyData -#[derive(PartialEq, Eq, Debug)] -pub enum PagestreamFeMessage { - Exists(PagestreamExistsRequest), - Nblocks(PagestreamNblocksRequest), - GetPage(PagestreamGetPageRequest), - DbSize(PagestreamDbSizeRequest), - GetSlruSegment(PagestreamGetSlruSegmentRequest), - #[cfg(feature = "testing")] - Test(PagestreamTestRequest), -} - -// Wrapped in libpq CopyData -#[derive(Debug, strum_macros::EnumProperty)] -pub enum PagestreamBeMessage { - Exists(PagestreamExistsResponse), - Nblocks(PagestreamNblocksResponse), - GetPage(PagestreamGetPageResponse), - Error(PagestreamErrorResponse), - DbSize(PagestreamDbSizeResponse), - GetSlruSegment(PagestreamGetSlruSegmentResponse), - #[cfg(feature = "testing")] - Test(PagestreamTestResponse), -} - -// Keep in sync with `pagestore_client.h` -#[repr(u8)] -enum PagestreamFeMessageTag { - Exists = 0, - Nblocks = 1, - GetPage = 2, - DbSize = 3, - GetSlruSegment = 4, - /* future tags above this line */ - /// For testing purposes, not available in production. - #[cfg(feature = "testing")] - Test = 99, -} - -// Keep in sync with `pagestore_client.h` -#[repr(u8)] -enum PagestreamBeMessageTag { - Exists = 100, - Nblocks = 101, - GetPage = 102, - Error = 103, - DbSize = 104, - GetSlruSegment = 105, - /* future tags above this line */ - /// For testing purposes, not available in production. 
- #[cfg(feature = "testing")] - Test = 199, -} - -impl TryFrom for PagestreamFeMessageTag { - type Error = u8; - fn try_from(value: u8) -> Result { - match value { - 0 => Ok(PagestreamFeMessageTag::Exists), - 1 => Ok(PagestreamFeMessageTag::Nblocks), - 2 => Ok(PagestreamFeMessageTag::GetPage), - 3 => Ok(PagestreamFeMessageTag::DbSize), - 4 => Ok(PagestreamFeMessageTag::GetSlruSegment), - #[cfg(feature = "testing")] - 99 => Ok(PagestreamFeMessageTag::Test), - _ => Err(value), - } - } -} - -impl TryFrom for PagestreamBeMessageTag { - type Error = u8; - fn try_from(value: u8) -> Result { - match value { - 100 => Ok(PagestreamBeMessageTag::Exists), - 101 => Ok(PagestreamBeMessageTag::Nblocks), - 102 => Ok(PagestreamBeMessageTag::GetPage), - 103 => Ok(PagestreamBeMessageTag::Error), - 104 => Ok(PagestreamBeMessageTag::DbSize), - 105 => Ok(PagestreamBeMessageTag::GetSlruSegment), - #[cfg(feature = "testing")] - 199 => Ok(PagestreamBeMessageTag::Test), - _ => Err(value), - } - } -} - -// A GetPage request contains two LSN values: -// -// request_lsn: Get the page version at this point in time. Lsn::Max is a special value that means -// "get the latest version present". It's used by the primary server, which knows that no one else -// is writing WAL. 'not_modified_since' must be set to a proper value even if request_lsn is -// Lsn::Max. Standby servers use the current replay LSN as the request LSN. -// -// not_modified_since: Hint to the pageserver that the client knows that the page has not been -// modified between 'not_modified_since' and the request LSN. It's always correct to set -// 'not_modified_since equal' to 'request_lsn' (unless Lsn::Max is used as the 'request_lsn'), but -// passing an earlier LSN can speed up the request, by allowing the pageserver to process the -// request without waiting for 'request_lsn' to arrive. -// -// The now-defunct V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was -// sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and -// 'latest' was set to true. The V2 interface was added because there was no correct way for a -// standby to request a page at a particular non-latest LSN, and also include the -// 'not_modified_since' hint. That led to an awkward choice of either using an old LSN in the -// request, if the standby knows that the page hasn't been modified since, and risk getting an error -// if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could -// require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2 -// interface allows sending both LSNs, and let the pageserver do the right thing. There was no -// difference in the responses between V1 and V2. -// -// V3 version of protocol adds request ID to all requests. This request ID is also included in response -// as well as other fields from requests, which allows to verify that we receive response for our request. -// We copy fields from request to response to make checking more reliable: request ID is formed from process ID -// and local counter, so in principle there can be duplicated requests IDs if process PID is reused. 
-// -#[derive(Debug, PartialEq, Eq, Clone, Copy)] -pub enum PagestreamProtocolVersion { - V2, - V3, -} - -pub type RequestId = u64; - -#[derive(Debug, Default, PartialEq, Eq, Clone, Copy)] -pub struct PagestreamRequest { - pub reqid: RequestId, - pub request_lsn: Lsn, - pub not_modified_since: Lsn, -} - -#[derive(Debug, PartialEq, Eq, Clone, Copy)] -pub struct PagestreamExistsRequest { - pub hdr: PagestreamRequest, - pub rel: RelTag, -} - -#[derive(Debug, PartialEq, Eq, Clone, Copy)] -pub struct PagestreamNblocksRequest { - pub hdr: PagestreamRequest, - pub rel: RelTag, -} - -#[derive(Debug, Default, PartialEq, Eq, Clone, Copy)] -pub struct PagestreamGetPageRequest { - pub hdr: PagestreamRequest, - pub rel: RelTag, - pub blkno: u32, -} - -#[derive(Debug, PartialEq, Eq, Clone, Copy)] -pub struct PagestreamDbSizeRequest { - pub hdr: PagestreamRequest, - pub dbnode: u32, -} - -#[derive(Debug, PartialEq, Eq, Clone, Copy)] -pub struct PagestreamGetSlruSegmentRequest { - pub hdr: PagestreamRequest, - pub kind: u8, - pub segno: u32, -} - -#[derive(Debug)] -pub struct PagestreamExistsResponse { - pub req: PagestreamExistsRequest, - pub exists: bool, -} - -#[derive(Debug)] -pub struct PagestreamNblocksResponse { - pub req: PagestreamNblocksRequest, - pub n_blocks: u32, -} - -#[derive(Debug)] -pub struct PagestreamGetPageResponse { - pub req: PagestreamGetPageRequest, - pub page: Bytes, -} - -#[derive(Debug)] -pub struct PagestreamGetSlruSegmentResponse { - pub req: PagestreamGetSlruSegmentRequest, - pub segment: Bytes, -} - -#[derive(Debug)] -pub struct PagestreamErrorResponse { - pub req: PagestreamRequest, - pub message: String, -} - -#[derive(Debug)] -pub struct PagestreamDbSizeResponse { - pub req: PagestreamDbSizeRequest, - pub db_size: i64, -} - -#[cfg(feature = "testing")] -#[derive(Debug, PartialEq, Eq, Clone)] -pub struct PagestreamTestRequest { - pub hdr: PagestreamRequest, - pub batch_key: u64, - pub message: String, -} - -#[cfg(feature = "testing")] -#[derive(Debug)] -pub struct PagestreamTestResponse { - pub req: PagestreamTestRequest, -} - // This is a cut-down version of TenantHistorySize from the pageserver crate, omitting fields // that require pageserver-internal types. It is sufficient to get the total size. #[derive(Serialize, Deserialize, Debug)] @@ -2131,506 +1913,6 @@ pub struct TenantHistorySize { pub size: Option, } -impl PagestreamFeMessage { - /// Serialize a compute -> pageserver message. This is currently only used in testing - /// tools. Always uses protocol version 3. 
- pub fn serialize(&self) -> Bytes { - let mut bytes = BytesMut::new(); - - match self { - Self::Exists(req) => { - bytes.put_u8(PagestreamFeMessageTag::Exists as u8); - bytes.put_u64(req.hdr.reqid); - bytes.put_u64(req.hdr.request_lsn.0); - bytes.put_u64(req.hdr.not_modified_since.0); - bytes.put_u32(req.rel.spcnode); - bytes.put_u32(req.rel.dbnode); - bytes.put_u32(req.rel.relnode); - bytes.put_u8(req.rel.forknum); - } - - Self::Nblocks(req) => { - bytes.put_u8(PagestreamFeMessageTag::Nblocks as u8); - bytes.put_u64(req.hdr.reqid); - bytes.put_u64(req.hdr.request_lsn.0); - bytes.put_u64(req.hdr.not_modified_since.0); - bytes.put_u32(req.rel.spcnode); - bytes.put_u32(req.rel.dbnode); - bytes.put_u32(req.rel.relnode); - bytes.put_u8(req.rel.forknum); - } - - Self::GetPage(req) => { - bytes.put_u8(PagestreamFeMessageTag::GetPage as u8); - bytes.put_u64(req.hdr.reqid); - bytes.put_u64(req.hdr.request_lsn.0); - bytes.put_u64(req.hdr.not_modified_since.0); - bytes.put_u32(req.rel.spcnode); - bytes.put_u32(req.rel.dbnode); - bytes.put_u32(req.rel.relnode); - bytes.put_u8(req.rel.forknum); - bytes.put_u32(req.blkno); - } - - Self::DbSize(req) => { - bytes.put_u8(PagestreamFeMessageTag::DbSize as u8); - bytes.put_u64(req.hdr.reqid); - bytes.put_u64(req.hdr.request_lsn.0); - bytes.put_u64(req.hdr.not_modified_since.0); - bytes.put_u32(req.dbnode); - } - - Self::GetSlruSegment(req) => { - bytes.put_u8(PagestreamFeMessageTag::GetSlruSegment as u8); - bytes.put_u64(req.hdr.reqid); - bytes.put_u64(req.hdr.request_lsn.0); - bytes.put_u64(req.hdr.not_modified_since.0); - bytes.put_u8(req.kind); - bytes.put_u32(req.segno); - } - #[cfg(feature = "testing")] - Self::Test(req) => { - bytes.put_u8(PagestreamFeMessageTag::Test as u8); - bytes.put_u64(req.hdr.reqid); - bytes.put_u64(req.hdr.request_lsn.0); - bytes.put_u64(req.hdr.not_modified_since.0); - bytes.put_u64(req.batch_key); - let message = req.message.as_bytes(); - bytes.put_u64(message.len() as u64); - bytes.put_slice(message); - } - } - - bytes.into() - } - - pub fn parse( - body: &mut R, - protocol_version: PagestreamProtocolVersion, - ) -> anyhow::Result { - // these correspond to the NeonMessageTag enum in pagestore_client.h - // - // TODO: consider using protobuf or serde bincode for less error prone - // serialization. - let msg_tag = body.read_u8()?; - let (reqid, request_lsn, not_modified_since) = match protocol_version { - PagestreamProtocolVersion::V2 => ( - 0, - Lsn::from(body.read_u64::()?), - Lsn::from(body.read_u64::()?), - ), - PagestreamProtocolVersion::V3 => ( - body.read_u64::()?, - Lsn::from(body.read_u64::()?), - Lsn::from(body.read_u64::()?), - ), - }; - - match PagestreamFeMessageTag::try_from(msg_tag) - .map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? 
- { - PagestreamFeMessageTag::Exists => { - Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest { - hdr: PagestreamRequest { - reqid, - request_lsn, - not_modified_since, - }, - rel: RelTag { - spcnode: body.read_u32::()?, - dbnode: body.read_u32::()?, - relnode: body.read_u32::()?, - forknum: body.read_u8()?, - }, - })) - } - PagestreamFeMessageTag::Nblocks => { - Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { - hdr: PagestreamRequest { - reqid, - request_lsn, - not_modified_since, - }, - rel: RelTag { - spcnode: body.read_u32::()?, - dbnode: body.read_u32::()?, - relnode: body.read_u32::()?, - forknum: body.read_u8()?, - }, - })) - } - PagestreamFeMessageTag::GetPage => { - Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - hdr: PagestreamRequest { - reqid, - request_lsn, - not_modified_since, - }, - rel: RelTag { - spcnode: body.read_u32::()?, - dbnode: body.read_u32::()?, - relnode: body.read_u32::()?, - forknum: body.read_u8()?, - }, - blkno: body.read_u32::()?, - })) - } - PagestreamFeMessageTag::DbSize => { - Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { - hdr: PagestreamRequest { - reqid, - request_lsn, - not_modified_since, - }, - dbnode: body.read_u32::()?, - })) - } - PagestreamFeMessageTag::GetSlruSegment => Ok(PagestreamFeMessage::GetSlruSegment( - PagestreamGetSlruSegmentRequest { - hdr: PagestreamRequest { - reqid, - request_lsn, - not_modified_since, - }, - kind: body.read_u8()?, - segno: body.read_u32::()?, - }, - )), - #[cfg(feature = "testing")] - PagestreamFeMessageTag::Test => Ok(PagestreamFeMessage::Test(PagestreamTestRequest { - hdr: PagestreamRequest { - reqid, - request_lsn, - not_modified_since, - }, - batch_key: body.read_u64::()?, - message: { - let len = body.read_u64::()?; - let mut buf = vec![0; len as usize]; - body.read_exact(&mut buf)?; - String::from_utf8(buf)? 
- }, - })), - } - } -} - -impl PagestreamBeMessage { - pub fn serialize(&self, protocol_version: PagestreamProtocolVersion) -> Bytes { - let mut bytes = BytesMut::new(); - - use PagestreamBeMessageTag as Tag; - match protocol_version { - PagestreamProtocolVersion::V2 => { - match self { - Self::Exists(resp) => { - bytes.put_u8(Tag::Exists as u8); - bytes.put_u8(resp.exists as u8); - } - - Self::Nblocks(resp) => { - bytes.put_u8(Tag::Nblocks as u8); - bytes.put_u32(resp.n_blocks); - } - - Self::GetPage(resp) => { - bytes.put_u8(Tag::GetPage as u8); - bytes.put(&resp.page[..]) - } - - Self::Error(resp) => { - bytes.put_u8(Tag::Error as u8); - bytes.put(resp.message.as_bytes()); - bytes.put_u8(0); // null terminator - } - Self::DbSize(resp) => { - bytes.put_u8(Tag::DbSize as u8); - bytes.put_i64(resp.db_size); - } - - Self::GetSlruSegment(resp) => { - bytes.put_u8(Tag::GetSlruSegment as u8); - bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32); - bytes.put(&resp.segment[..]); - } - - #[cfg(feature = "testing")] - Self::Test(resp) => { - bytes.put_u8(Tag::Test as u8); - bytes.put_u64(resp.req.batch_key); - let message = resp.req.message.as_bytes(); - bytes.put_u64(message.len() as u64); - bytes.put_slice(message); - } - } - } - PagestreamProtocolVersion::V3 => { - match self { - Self::Exists(resp) => { - bytes.put_u8(Tag::Exists as u8); - bytes.put_u64(resp.req.hdr.reqid); - bytes.put_u64(resp.req.hdr.request_lsn.0); - bytes.put_u64(resp.req.hdr.not_modified_since.0); - bytes.put_u32(resp.req.rel.spcnode); - bytes.put_u32(resp.req.rel.dbnode); - bytes.put_u32(resp.req.rel.relnode); - bytes.put_u8(resp.req.rel.forknum); - bytes.put_u8(resp.exists as u8); - } - - Self::Nblocks(resp) => { - bytes.put_u8(Tag::Nblocks as u8); - bytes.put_u64(resp.req.hdr.reqid); - bytes.put_u64(resp.req.hdr.request_lsn.0); - bytes.put_u64(resp.req.hdr.not_modified_since.0); - bytes.put_u32(resp.req.rel.spcnode); - bytes.put_u32(resp.req.rel.dbnode); - bytes.put_u32(resp.req.rel.relnode); - bytes.put_u8(resp.req.rel.forknum); - bytes.put_u32(resp.n_blocks); - } - - Self::GetPage(resp) => { - bytes.put_u8(Tag::GetPage as u8); - bytes.put_u64(resp.req.hdr.reqid); - bytes.put_u64(resp.req.hdr.request_lsn.0); - bytes.put_u64(resp.req.hdr.not_modified_since.0); - bytes.put_u32(resp.req.rel.spcnode); - bytes.put_u32(resp.req.rel.dbnode); - bytes.put_u32(resp.req.rel.relnode); - bytes.put_u8(resp.req.rel.forknum); - bytes.put_u32(resp.req.blkno); - bytes.put(&resp.page[..]) - } - - Self::Error(resp) => { - bytes.put_u8(Tag::Error as u8); - bytes.put_u64(resp.req.reqid); - bytes.put_u64(resp.req.request_lsn.0); - bytes.put_u64(resp.req.not_modified_since.0); - bytes.put(resp.message.as_bytes()); - bytes.put_u8(0); // null terminator - } - Self::DbSize(resp) => { - bytes.put_u8(Tag::DbSize as u8); - bytes.put_u64(resp.req.hdr.reqid); - bytes.put_u64(resp.req.hdr.request_lsn.0); - bytes.put_u64(resp.req.hdr.not_modified_since.0); - bytes.put_u32(resp.req.dbnode); - bytes.put_i64(resp.db_size); - } - - Self::GetSlruSegment(resp) => { - bytes.put_u8(Tag::GetSlruSegment as u8); - bytes.put_u64(resp.req.hdr.reqid); - bytes.put_u64(resp.req.hdr.request_lsn.0); - bytes.put_u64(resp.req.hdr.not_modified_since.0); - bytes.put_u8(resp.req.kind); - bytes.put_u32(resp.req.segno); - bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32); - bytes.put(&resp.segment[..]); - } - - #[cfg(feature = "testing")] - Self::Test(resp) => { - bytes.put_u8(Tag::Test as u8); - bytes.put_u64(resp.req.hdr.reqid); - 
bytes.put_u64(resp.req.hdr.request_lsn.0); - bytes.put_u64(resp.req.hdr.not_modified_since.0); - bytes.put_u64(resp.req.batch_key); - let message = resp.req.message.as_bytes(); - bytes.put_u64(message.len() as u64); - bytes.put_slice(message); - } - } - } - } - bytes.into() - } - - pub fn deserialize(buf: Bytes) -> anyhow::Result { - let mut buf = buf.reader(); - let msg_tag = buf.read_u8()?; - - use PagestreamBeMessageTag as Tag; - let ok = - match Tag::try_from(msg_tag).map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? { - Tag::Exists => { - let reqid = buf.read_u64::()?; - let request_lsn = Lsn(buf.read_u64::()?); - let not_modified_since = Lsn(buf.read_u64::()?); - let rel = RelTag { - spcnode: buf.read_u32::()?, - dbnode: buf.read_u32::()?, - relnode: buf.read_u32::()?, - forknum: buf.read_u8()?, - }; - let exists = buf.read_u8()? != 0; - Self::Exists(PagestreamExistsResponse { - req: PagestreamExistsRequest { - hdr: PagestreamRequest { - reqid, - request_lsn, - not_modified_since, - }, - rel, - }, - exists, - }) - } - Tag::Nblocks => { - let reqid = buf.read_u64::()?; - let request_lsn = Lsn(buf.read_u64::()?); - let not_modified_since = Lsn(buf.read_u64::()?); - let rel = RelTag { - spcnode: buf.read_u32::()?, - dbnode: buf.read_u32::()?, - relnode: buf.read_u32::()?, - forknum: buf.read_u8()?, - }; - let n_blocks = buf.read_u32::()?; - Self::Nblocks(PagestreamNblocksResponse { - req: PagestreamNblocksRequest { - hdr: PagestreamRequest { - reqid, - request_lsn, - not_modified_since, - }, - rel, - }, - n_blocks, - }) - } - Tag::GetPage => { - let reqid = buf.read_u64::()?; - let request_lsn = Lsn(buf.read_u64::()?); - let not_modified_since = Lsn(buf.read_u64::()?); - let rel = RelTag { - spcnode: buf.read_u32::()?, - dbnode: buf.read_u32::()?, - relnode: buf.read_u32::()?, - forknum: buf.read_u8()?, - }; - let blkno = buf.read_u32::()?; - let mut page = vec![0; 8192]; // TODO: use MaybeUninit - buf.read_exact(&mut page)?; - Self::GetPage(PagestreamGetPageResponse { - req: PagestreamGetPageRequest { - hdr: PagestreamRequest { - reqid, - request_lsn, - not_modified_since, - }, - rel, - blkno, - }, - page: page.into(), - }) - } - Tag::Error => { - let reqid = buf.read_u64::()?; - let request_lsn = Lsn(buf.read_u64::()?); - let not_modified_since = Lsn(buf.read_u64::()?); - let mut msg = Vec::new(); - buf.read_until(0, &mut msg)?; - let cstring = std::ffi::CString::from_vec_with_nul(msg)?; - let rust_str = cstring.to_str()?; - Self::Error(PagestreamErrorResponse { - req: PagestreamRequest { - reqid, - request_lsn, - not_modified_since, - }, - message: rust_str.to_owned(), - }) - } - Tag::DbSize => { - let reqid = buf.read_u64::()?; - let request_lsn = Lsn(buf.read_u64::()?); - let not_modified_since = Lsn(buf.read_u64::()?); - let dbnode = buf.read_u32::()?; - let db_size = buf.read_i64::()?; - Self::DbSize(PagestreamDbSizeResponse { - req: PagestreamDbSizeRequest { - hdr: PagestreamRequest { - reqid, - request_lsn, - not_modified_since, - }, - dbnode, - }, - db_size, - }) - } - Tag::GetSlruSegment => { - let reqid = buf.read_u64::()?; - let request_lsn = Lsn(buf.read_u64::()?); - let not_modified_since = Lsn(buf.read_u64::()?); - let kind = buf.read_u8()?; - let segno = buf.read_u32::()?; - let n_blocks = buf.read_u32::()?; - let mut segment = vec![0; n_blocks as usize * BLCKSZ as usize]; - buf.read_exact(&mut segment)?; - Self::GetSlruSegment(PagestreamGetSlruSegmentResponse { - req: PagestreamGetSlruSegmentRequest { - hdr: PagestreamRequest { - reqid, - request_lsn, - 
not_modified_since, - }, - kind, - segno, - }, - segment: segment.into(), - }) - } - #[cfg(feature = "testing")] - Tag::Test => { - let reqid = buf.read_u64::()?; - let request_lsn = Lsn(buf.read_u64::()?); - let not_modified_since = Lsn(buf.read_u64::()?); - let batch_key = buf.read_u64::()?; - let len = buf.read_u64::()?; - let mut msg = vec![0; len as usize]; - buf.read_exact(&mut msg)?; - let message = String::from_utf8(msg)?; - Self::Test(PagestreamTestResponse { - req: PagestreamTestRequest { - hdr: PagestreamRequest { - reqid, - request_lsn, - not_modified_since, - }, - batch_key, - message, - }, - }) - } - }; - let remaining = buf.into_inner(); - if !remaining.is_empty() { - anyhow::bail!( - "remaining bytes in msg with tag={msg_tag}: {}", - remaining.len() - ); - } - Ok(ok) - } - - pub fn kind(&self) -> &'static str { - match self { - Self::Exists(_) => "Exists", - Self::Nblocks(_) => "Nblocks", - Self::GetPage(_) => "GetPage", - Self::Error(_) => "Error", - Self::DbSize(_) => "DbSize", - Self::GetSlruSegment(_) => "GetSlruSegment", - #[cfg(feature = "testing")] - Self::Test(_) => "Test", - } - } -} - #[derive(Debug, Serialize, Deserialize)] pub struct PageTraceEvent { pub key: CompactKey, @@ -2656,68 +1938,6 @@ mod tests { use super::*; - #[test] - fn test_pagestream() { - // Test serialization/deserialization of PagestreamFeMessage - let messages = vec![ - PagestreamFeMessage::Exists(PagestreamExistsRequest { - hdr: PagestreamRequest { - reqid: 0, - request_lsn: Lsn(4), - not_modified_since: Lsn(3), - }, - rel: RelTag { - forknum: 1, - spcnode: 2, - dbnode: 3, - relnode: 4, - }, - }), - PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { - hdr: PagestreamRequest { - reqid: 0, - request_lsn: Lsn(4), - not_modified_since: Lsn(4), - }, - rel: RelTag { - forknum: 1, - spcnode: 2, - dbnode: 3, - relnode: 4, - }, - }), - PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - hdr: PagestreamRequest { - reqid: 0, - request_lsn: Lsn(4), - not_modified_since: Lsn(3), - }, - rel: RelTag { - forknum: 1, - spcnode: 2, - dbnode: 3, - relnode: 4, - }, - blkno: 7, - }), - PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { - hdr: PagestreamRequest { - reqid: 0, - request_lsn: Lsn(4), - not_modified_since: Lsn(3), - }, - dbnode: 7, - }), - ]; - for msg in messages { - let bytes = msg.serialize(); - let reconstructed = - PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V3) - .unwrap(); - assert!(msg == reconstructed); - } - } - #[test] fn test_tenantinfo_serde() { // Test serialization/deserialization of TenantInfo diff --git a/libs/pageserver_api/src/pagestream_api.rs b/libs/pageserver_api/src/pagestream_api.rs new file mode 100644 index 0000000000..fba64c82d9 --- /dev/null +++ b/libs/pageserver_api/src/pagestream_api.rs @@ -0,0 +1,792 @@ +//! Rust definitions of the libpq-based pagestream API +//! +//! 
See also the C implementation of the same API in pgxn/neon/pagestore_client.h + +use std::io::{BufRead, Read}; + +use crate::reltag::RelTag; + +use byteorder::{BigEndian, ReadBytesExt}; +use bytes::{Buf, BufMut, Bytes, BytesMut}; +use postgres_ffi::BLCKSZ; +use utils::lsn::Lsn; + +// Wrapped in libpq CopyData +#[derive(PartialEq, Eq, Debug)] +pub enum PagestreamFeMessage { + Exists(PagestreamExistsRequest), + Nblocks(PagestreamNblocksRequest), + GetPage(PagestreamGetPageRequest), + DbSize(PagestreamDbSizeRequest), + GetSlruSegment(PagestreamGetSlruSegmentRequest), + #[cfg(feature = "testing")] + Test(PagestreamTestRequest), +} + +// Wrapped in libpq CopyData +#[derive(Debug, strum_macros::EnumProperty)] +pub enum PagestreamBeMessage { + Exists(PagestreamExistsResponse), + Nblocks(PagestreamNblocksResponse), + GetPage(PagestreamGetPageResponse), + Error(PagestreamErrorResponse), + DbSize(PagestreamDbSizeResponse), + GetSlruSegment(PagestreamGetSlruSegmentResponse), + #[cfg(feature = "testing")] + Test(PagestreamTestResponse), +} + +// Keep in sync with `pagestore_client.h` +#[repr(u8)] +enum PagestreamFeMessageTag { + Exists = 0, + Nblocks = 1, + GetPage = 2, + DbSize = 3, + GetSlruSegment = 4, + /* future tags above this line */ + /// For testing purposes, not available in production. + #[cfg(feature = "testing")] + Test = 99, +} + +// Keep in sync with `pagestore_client.h` +#[repr(u8)] +enum PagestreamBeMessageTag { + Exists = 100, + Nblocks = 101, + GetPage = 102, + Error = 103, + DbSize = 104, + GetSlruSegment = 105, + /* future tags above this line */ + /// For testing purposes, not available in production. + #[cfg(feature = "testing")] + Test = 199, +} + +impl TryFrom for PagestreamFeMessageTag { + type Error = u8; + fn try_from(value: u8) -> Result { + match value { + 0 => Ok(PagestreamFeMessageTag::Exists), + 1 => Ok(PagestreamFeMessageTag::Nblocks), + 2 => Ok(PagestreamFeMessageTag::GetPage), + 3 => Ok(PagestreamFeMessageTag::DbSize), + 4 => Ok(PagestreamFeMessageTag::GetSlruSegment), + #[cfg(feature = "testing")] + 99 => Ok(PagestreamFeMessageTag::Test), + _ => Err(value), + } + } +} + +impl TryFrom for PagestreamBeMessageTag { + type Error = u8; + fn try_from(value: u8) -> Result { + match value { + 100 => Ok(PagestreamBeMessageTag::Exists), + 101 => Ok(PagestreamBeMessageTag::Nblocks), + 102 => Ok(PagestreamBeMessageTag::GetPage), + 103 => Ok(PagestreamBeMessageTag::Error), + 104 => Ok(PagestreamBeMessageTag::DbSize), + 105 => Ok(PagestreamBeMessageTag::GetSlruSegment), + #[cfg(feature = "testing")] + 199 => Ok(PagestreamBeMessageTag::Test), + _ => Err(value), + } + } +} + +// A GetPage request contains two LSN values: +// +// request_lsn: Get the page version at this point in time. Lsn::Max is a special value that means +// "get the latest version present". It's used by the primary server, which knows that no one else +// is writing WAL. 'not_modified_since' must be set to a proper value even if request_lsn is +// Lsn::Max. Standby servers use the current replay LSN as the request LSN. +// +// not_modified_since: Hint to the pageserver that the client knows that the page has not been +// modified between 'not_modified_since' and the request LSN. It's always correct to set +// 'not_modified_since equal' to 'request_lsn' (unless Lsn::Max is used as the 'request_lsn'), but +// passing an earlier LSN can speed up the request, by allowing the pageserver to process the +// request without waiting for 'request_lsn' to arrive. 
+// +// The now-defunct V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was +// sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and +// 'latest' was set to true. The V2 interface was added because there was no correct way for a +// standby to request a page at a particular non-latest LSN, and also include the +// 'not_modified_since' hint. That led to an awkward choice of either using an old LSN in the +// request, if the standby knows that the page hasn't been modified since, and risk getting an error +// if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could +// require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2 +// interface allows sending both LSNs, and let the pageserver do the right thing. There was no +// difference in the responses between V1 and V2. +// +// V3 version of protocol adds request ID to all requests. This request ID is also included in response +// as well as other fields from requests, which allows to verify that we receive response for our request. +// We copy fields from request to response to make checking more reliable: request ID is formed from process ID +// and local counter, so in principle there can be duplicated requests IDs if process PID is reused. +// +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +pub enum PagestreamProtocolVersion { + V2, + V3, +} + +pub type RequestId = u64; + +#[derive(Debug, Default, PartialEq, Eq, Clone, Copy)] +pub struct PagestreamRequest { + pub reqid: RequestId, + pub request_lsn: Lsn, + pub not_modified_since: Lsn, +} + +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +pub struct PagestreamExistsRequest { + pub hdr: PagestreamRequest, + pub rel: RelTag, +} + +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +pub struct PagestreamNblocksRequest { + pub hdr: PagestreamRequest, + pub rel: RelTag, +} + +#[derive(Debug, Default, PartialEq, Eq, Clone, Copy)] +pub struct PagestreamGetPageRequest { + pub hdr: PagestreamRequest, + pub rel: RelTag, + pub blkno: u32, +} + +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +pub struct PagestreamDbSizeRequest { + pub hdr: PagestreamRequest, + pub dbnode: u32, +} + +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +pub struct PagestreamGetSlruSegmentRequest { + pub hdr: PagestreamRequest, + pub kind: u8, + pub segno: u32, +} + +#[derive(Debug)] +pub struct PagestreamExistsResponse { + pub req: PagestreamExistsRequest, + pub exists: bool, +} + +#[derive(Debug)] +pub struct PagestreamNblocksResponse { + pub req: PagestreamNblocksRequest, + pub n_blocks: u32, +} + +#[derive(Debug)] +pub struct PagestreamGetPageResponse { + pub req: PagestreamGetPageRequest, + pub page: Bytes, +} + +#[derive(Debug)] +pub struct PagestreamGetSlruSegmentResponse { + pub req: PagestreamGetSlruSegmentRequest, + pub segment: Bytes, +} + +#[derive(Debug)] +pub struct PagestreamErrorResponse { + pub req: PagestreamRequest, + pub message: String, +} + +#[derive(Debug)] +pub struct PagestreamDbSizeResponse { + pub req: PagestreamDbSizeRequest, + pub db_size: i64, +} + +#[cfg(feature = "testing")] +#[derive(Debug, PartialEq, Eq, Clone)] +pub struct PagestreamTestRequest { + pub hdr: PagestreamRequest, + pub batch_key: u64, + pub message: String, +} + +#[cfg(feature = "testing")] +#[derive(Debug)] +pub struct PagestreamTestResponse { + pub req: PagestreamTestRequest, +} + +impl PagestreamFeMessage { + /// Serialize a compute -> pageserver message. 
This is currently only used in testing + /// tools. Always uses protocol version 3. + pub fn serialize(&self) -> Bytes { + let mut bytes = BytesMut::new(); + + match self { + Self::Exists(req) => { + bytes.put_u8(PagestreamFeMessageTag::Exists as u8); + bytes.put_u64(req.hdr.reqid); + bytes.put_u64(req.hdr.request_lsn.0); + bytes.put_u64(req.hdr.not_modified_since.0); + bytes.put_u32(req.rel.spcnode); + bytes.put_u32(req.rel.dbnode); + bytes.put_u32(req.rel.relnode); + bytes.put_u8(req.rel.forknum); + } + + Self::Nblocks(req) => { + bytes.put_u8(PagestreamFeMessageTag::Nblocks as u8); + bytes.put_u64(req.hdr.reqid); + bytes.put_u64(req.hdr.request_lsn.0); + bytes.put_u64(req.hdr.not_modified_since.0); + bytes.put_u32(req.rel.spcnode); + bytes.put_u32(req.rel.dbnode); + bytes.put_u32(req.rel.relnode); + bytes.put_u8(req.rel.forknum); + } + + Self::GetPage(req) => { + bytes.put_u8(PagestreamFeMessageTag::GetPage as u8); + bytes.put_u64(req.hdr.reqid); + bytes.put_u64(req.hdr.request_lsn.0); + bytes.put_u64(req.hdr.not_modified_since.0); + bytes.put_u32(req.rel.spcnode); + bytes.put_u32(req.rel.dbnode); + bytes.put_u32(req.rel.relnode); + bytes.put_u8(req.rel.forknum); + bytes.put_u32(req.blkno); + } + + Self::DbSize(req) => { + bytes.put_u8(PagestreamFeMessageTag::DbSize as u8); + bytes.put_u64(req.hdr.reqid); + bytes.put_u64(req.hdr.request_lsn.0); + bytes.put_u64(req.hdr.not_modified_since.0); + bytes.put_u32(req.dbnode); + } + + Self::GetSlruSegment(req) => { + bytes.put_u8(PagestreamFeMessageTag::GetSlruSegment as u8); + bytes.put_u64(req.hdr.reqid); + bytes.put_u64(req.hdr.request_lsn.0); + bytes.put_u64(req.hdr.not_modified_since.0); + bytes.put_u8(req.kind); + bytes.put_u32(req.segno); + } + #[cfg(feature = "testing")] + Self::Test(req) => { + bytes.put_u8(PagestreamFeMessageTag::Test as u8); + bytes.put_u64(req.hdr.reqid); + bytes.put_u64(req.hdr.request_lsn.0); + bytes.put_u64(req.hdr.not_modified_since.0); + bytes.put_u64(req.batch_key); + let message = req.message.as_bytes(); + bytes.put_u64(message.len() as u64); + bytes.put_slice(message); + } + } + + bytes.into() + } + + pub fn parse( + body: &mut R, + protocol_version: PagestreamProtocolVersion, + ) -> anyhow::Result { + // these correspond to the NeonMessageTag enum in pagestore_client.h + // + // TODO: consider using protobuf or serde bincode for less error prone + // serialization. + let msg_tag = body.read_u8()?; + let (reqid, request_lsn, not_modified_since) = match protocol_version { + PagestreamProtocolVersion::V2 => ( + 0, + Lsn::from(body.read_u64::()?), + Lsn::from(body.read_u64::()?), + ), + PagestreamProtocolVersion::V3 => ( + body.read_u64::()?, + Lsn::from(body.read_u64::()?), + Lsn::from(body.read_u64::()?), + ), + }; + + match PagestreamFeMessageTag::try_from(msg_tag) + .map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? 
+ { + PagestreamFeMessageTag::Exists => { + Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + rel: RelTag { + spcnode: body.read_u32::()?, + dbnode: body.read_u32::()?, + relnode: body.read_u32::()?, + forknum: body.read_u8()?, + }, + })) + } + PagestreamFeMessageTag::Nblocks => { + Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + rel: RelTag { + spcnode: body.read_u32::()?, + dbnode: body.read_u32::()?, + relnode: body.read_u32::()?, + forknum: body.read_u8()?, + }, + })) + } + PagestreamFeMessageTag::GetPage => { + Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + rel: RelTag { + spcnode: body.read_u32::()?, + dbnode: body.read_u32::()?, + relnode: body.read_u32::()?, + forknum: body.read_u8()?, + }, + blkno: body.read_u32::()?, + })) + } + PagestreamFeMessageTag::DbSize => { + Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + dbnode: body.read_u32::()?, + })) + } + PagestreamFeMessageTag::GetSlruSegment => Ok(PagestreamFeMessage::GetSlruSegment( + PagestreamGetSlruSegmentRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + kind: body.read_u8()?, + segno: body.read_u32::()?, + }, + )), + #[cfg(feature = "testing")] + PagestreamFeMessageTag::Test => Ok(PagestreamFeMessage::Test(PagestreamTestRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + batch_key: body.read_u64::()?, + message: { + let len = body.read_u64::()?; + let mut buf = vec![0; len as usize]; + body.read_exact(&mut buf)?; + String::from_utf8(buf)? 
+ }, + })), + } + } +} + +impl PagestreamBeMessage { + pub fn serialize(&self, protocol_version: PagestreamProtocolVersion) -> Bytes { + let mut bytes = BytesMut::new(); + + use PagestreamBeMessageTag as Tag; + match protocol_version { + PagestreamProtocolVersion::V2 => { + match self { + Self::Exists(resp) => { + bytes.put_u8(Tag::Exists as u8); + bytes.put_u8(resp.exists as u8); + } + + Self::Nblocks(resp) => { + bytes.put_u8(Tag::Nblocks as u8); + bytes.put_u32(resp.n_blocks); + } + + Self::GetPage(resp) => { + bytes.put_u8(Tag::GetPage as u8); + bytes.put(&resp.page[..]) + } + + Self::Error(resp) => { + bytes.put_u8(Tag::Error as u8); + bytes.put(resp.message.as_bytes()); + bytes.put_u8(0); // null terminator + } + Self::DbSize(resp) => { + bytes.put_u8(Tag::DbSize as u8); + bytes.put_i64(resp.db_size); + } + + Self::GetSlruSegment(resp) => { + bytes.put_u8(Tag::GetSlruSegment as u8); + bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32); + bytes.put(&resp.segment[..]); + } + + #[cfg(feature = "testing")] + Self::Test(resp) => { + bytes.put_u8(Tag::Test as u8); + bytes.put_u64(resp.req.batch_key); + let message = resp.req.message.as_bytes(); + bytes.put_u64(message.len() as u64); + bytes.put_slice(message); + } + } + } + PagestreamProtocolVersion::V3 => { + match self { + Self::Exists(resp) => { + bytes.put_u8(Tag::Exists as u8); + bytes.put_u64(resp.req.hdr.reqid); + bytes.put_u64(resp.req.hdr.request_lsn.0); + bytes.put_u64(resp.req.hdr.not_modified_since.0); + bytes.put_u32(resp.req.rel.spcnode); + bytes.put_u32(resp.req.rel.dbnode); + bytes.put_u32(resp.req.rel.relnode); + bytes.put_u8(resp.req.rel.forknum); + bytes.put_u8(resp.exists as u8); + } + + Self::Nblocks(resp) => { + bytes.put_u8(Tag::Nblocks as u8); + bytes.put_u64(resp.req.hdr.reqid); + bytes.put_u64(resp.req.hdr.request_lsn.0); + bytes.put_u64(resp.req.hdr.not_modified_since.0); + bytes.put_u32(resp.req.rel.spcnode); + bytes.put_u32(resp.req.rel.dbnode); + bytes.put_u32(resp.req.rel.relnode); + bytes.put_u8(resp.req.rel.forknum); + bytes.put_u32(resp.n_blocks); + } + + Self::GetPage(resp) => { + bytes.put_u8(Tag::GetPage as u8); + bytes.put_u64(resp.req.hdr.reqid); + bytes.put_u64(resp.req.hdr.request_lsn.0); + bytes.put_u64(resp.req.hdr.not_modified_since.0); + bytes.put_u32(resp.req.rel.spcnode); + bytes.put_u32(resp.req.rel.dbnode); + bytes.put_u32(resp.req.rel.relnode); + bytes.put_u8(resp.req.rel.forknum); + bytes.put_u32(resp.req.blkno); + bytes.put(&resp.page[..]) + } + + Self::Error(resp) => { + bytes.put_u8(Tag::Error as u8); + bytes.put_u64(resp.req.reqid); + bytes.put_u64(resp.req.request_lsn.0); + bytes.put_u64(resp.req.not_modified_since.0); + bytes.put(resp.message.as_bytes()); + bytes.put_u8(0); // null terminator + } + Self::DbSize(resp) => { + bytes.put_u8(Tag::DbSize as u8); + bytes.put_u64(resp.req.hdr.reqid); + bytes.put_u64(resp.req.hdr.request_lsn.0); + bytes.put_u64(resp.req.hdr.not_modified_since.0); + bytes.put_u32(resp.req.dbnode); + bytes.put_i64(resp.db_size); + } + + Self::GetSlruSegment(resp) => { + bytes.put_u8(Tag::GetSlruSegment as u8); + bytes.put_u64(resp.req.hdr.reqid); + bytes.put_u64(resp.req.hdr.request_lsn.0); + bytes.put_u64(resp.req.hdr.not_modified_since.0); + bytes.put_u8(resp.req.kind); + bytes.put_u32(resp.req.segno); + bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32); + bytes.put(&resp.segment[..]); + } + + #[cfg(feature = "testing")] + Self::Test(resp) => { + bytes.put_u8(Tag::Test as u8); + bytes.put_u64(resp.req.hdr.reqid); + 
bytes.put_u64(resp.req.hdr.request_lsn.0); + bytes.put_u64(resp.req.hdr.not_modified_since.0); + bytes.put_u64(resp.req.batch_key); + let message = resp.req.message.as_bytes(); + bytes.put_u64(message.len() as u64); + bytes.put_slice(message); + } + } + } + } + bytes.into() + } + + pub fn deserialize(buf: Bytes) -> anyhow::Result { + let mut buf = buf.reader(); + let msg_tag = buf.read_u8()?; + + use PagestreamBeMessageTag as Tag; + let ok = + match Tag::try_from(msg_tag).map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? { + Tag::Exists => { + let reqid = buf.read_u64::()?; + let request_lsn = Lsn(buf.read_u64::()?); + let not_modified_since = Lsn(buf.read_u64::()?); + let rel = RelTag { + spcnode: buf.read_u32::()?, + dbnode: buf.read_u32::()?, + relnode: buf.read_u32::()?, + forknum: buf.read_u8()?, + }; + let exists = buf.read_u8()? != 0; + Self::Exists(PagestreamExistsResponse { + req: PagestreamExistsRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + rel, + }, + exists, + }) + } + Tag::Nblocks => { + let reqid = buf.read_u64::()?; + let request_lsn = Lsn(buf.read_u64::()?); + let not_modified_since = Lsn(buf.read_u64::()?); + let rel = RelTag { + spcnode: buf.read_u32::()?, + dbnode: buf.read_u32::()?, + relnode: buf.read_u32::()?, + forknum: buf.read_u8()?, + }; + let n_blocks = buf.read_u32::()?; + Self::Nblocks(PagestreamNblocksResponse { + req: PagestreamNblocksRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + rel, + }, + n_blocks, + }) + } + Tag::GetPage => { + let reqid = buf.read_u64::()?; + let request_lsn = Lsn(buf.read_u64::()?); + let not_modified_since = Lsn(buf.read_u64::()?); + let rel = RelTag { + spcnode: buf.read_u32::()?, + dbnode: buf.read_u32::()?, + relnode: buf.read_u32::()?, + forknum: buf.read_u8()?, + }; + let blkno = buf.read_u32::()?; + let mut page = vec![0; 8192]; // TODO: use MaybeUninit + buf.read_exact(&mut page)?; + Self::GetPage(PagestreamGetPageResponse { + req: PagestreamGetPageRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + rel, + blkno, + }, + page: page.into(), + }) + } + Tag::Error => { + let reqid = buf.read_u64::()?; + let request_lsn = Lsn(buf.read_u64::()?); + let not_modified_since = Lsn(buf.read_u64::()?); + let mut msg = Vec::new(); + buf.read_until(0, &mut msg)?; + let cstring = std::ffi::CString::from_vec_with_nul(msg)?; + let rust_str = cstring.to_str()?; + Self::Error(PagestreamErrorResponse { + req: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + message: rust_str.to_owned(), + }) + } + Tag::DbSize => { + let reqid = buf.read_u64::()?; + let request_lsn = Lsn(buf.read_u64::()?); + let not_modified_since = Lsn(buf.read_u64::()?); + let dbnode = buf.read_u32::()?; + let db_size = buf.read_i64::()?; + Self::DbSize(PagestreamDbSizeResponse { + req: PagestreamDbSizeRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + dbnode, + }, + db_size, + }) + } + Tag::GetSlruSegment => { + let reqid = buf.read_u64::()?; + let request_lsn = Lsn(buf.read_u64::()?); + let not_modified_since = Lsn(buf.read_u64::()?); + let kind = buf.read_u8()?; + let segno = buf.read_u32::()?; + let n_blocks = buf.read_u32::()?; + let mut segment = vec![0; n_blocks as usize * BLCKSZ as usize]; + buf.read_exact(&mut segment)?; + Self::GetSlruSegment(PagestreamGetSlruSegmentResponse { + req: PagestreamGetSlruSegmentRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + 
not_modified_since, + }, + kind, + segno, + }, + segment: segment.into(), + }) + } + #[cfg(feature = "testing")] + Tag::Test => { + let reqid = buf.read_u64::()?; + let request_lsn = Lsn(buf.read_u64::()?); + let not_modified_since = Lsn(buf.read_u64::()?); + let batch_key = buf.read_u64::()?; + let len = buf.read_u64::()?; + let mut msg = vec![0; len as usize]; + buf.read_exact(&mut msg)?; + let message = String::from_utf8(msg)?; + Self::Test(PagestreamTestResponse { + req: PagestreamTestRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + batch_key, + message, + }, + }) + } + }; + let remaining = buf.into_inner(); + if !remaining.is_empty() { + anyhow::bail!( + "remaining bytes in msg with tag={msg_tag}: {}", + remaining.len() + ); + } + Ok(ok) + } + + pub fn kind(&self) -> &'static str { + match self { + Self::Exists(_) => "Exists", + Self::Nblocks(_) => "Nblocks", + Self::GetPage(_) => "GetPage", + Self::Error(_) => "Error", + Self::DbSize(_) => "DbSize", + Self::GetSlruSegment(_) => "GetSlruSegment", + #[cfg(feature = "testing")] + Self::Test(_) => "Test", + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_pagestream() { + // Test serialization/deserialization of PagestreamFeMessage + let messages = vec![ + PagestreamFeMessage::Exists(PagestreamExistsRequest { + hdr: PagestreamRequest { + reqid: 0, + request_lsn: Lsn(4), + not_modified_since: Lsn(3), + }, + rel: RelTag { + forknum: 1, + spcnode: 2, + dbnode: 3, + relnode: 4, + }, + }), + PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { + hdr: PagestreamRequest { + reqid: 0, + request_lsn: Lsn(4), + not_modified_since: Lsn(4), + }, + rel: RelTag { + forknum: 1, + spcnode: 2, + dbnode: 3, + relnode: 4, + }, + }), + PagestreamFeMessage::GetPage(PagestreamGetPageRequest { + hdr: PagestreamRequest { + reqid: 0, + request_lsn: Lsn(4), + not_modified_since: Lsn(3), + }, + rel: RelTag { + forknum: 1, + spcnode: 2, + dbnode: 3, + relnode: 4, + }, + blkno: 7, + }), + PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { + hdr: PagestreamRequest { + reqid: 0, + request_lsn: Lsn(4), + not_modified_since: Lsn(3), + }, + dbnode: 7, + }), + ]; + for msg in messages { + let bytes = msg.serialize(); + let reconstructed = + PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V3) + .unwrap(); + assert!(msg == reconstructed); + } + } +} diff --git a/pageserver/client/src/page_service.rs b/pageserver/client/src/page_service.rs index ef35ac2f48..085c0e6543 100644 --- a/pageserver/client/src/page_service.rs +++ b/pageserver/client/src/page_service.rs @@ -2,7 +2,7 @@ use std::sync::{Arc, Mutex}; use futures::stream::{SplitSink, SplitStream}; use futures::{SinkExt, StreamExt}; -use pageserver_api::models::{ +use pageserver_api::pagestream_api::{ PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse, }; use pageserver_api::reltag::RelTag; diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index 3f3b6e396e..3a68a77279 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -12,7 +12,7 @@ use bytes::Bytes; use camino::Utf8PathBuf; use pageserver_api::key::Key; use pageserver_api::keyspace::KeySpaceAccum; -use pageserver_api::models::{PagestreamGetPageRequest, PagestreamRequest}; +use pageserver_api::pagestream_api::{PagestreamGetPageRequest, PagestreamRequest}; use pageserver_api::reltag::RelTag; 
use pageserver_api::shard::TenantShardId; use pageserver_page_api::proto; diff --git a/pageserver/src/bin/test_helper_slow_client_reads.rs b/pageserver/src/bin/test_helper_slow_client_reads.rs index 0215dd06fb..be8e081945 100644 --- a/pageserver/src/bin/test_helper_slow_client_reads.rs +++ b/pageserver/src/bin/test_helper_slow_client_reads.rs @@ -2,7 +2,9 @@ use std::io::{Read, Write, stdin, stdout}; use std::time::Duration; use clap::Parser; -use pageserver_api::models::{PagestreamRequest, PagestreamTestRequest}; +use pageserver_api::pagestream_api::{ + PagestreamFeMessage, PagestreamRequest, PagestreamTestRequest, +}; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; @@ -28,17 +30,15 @@ async fn main() -> anyhow::Result<()> { let mut msg = 0; loop { msg += 1; - let fut = sender.send(pageserver_api::models::PagestreamFeMessage::Test( - PagestreamTestRequest { - hdr: PagestreamRequest { - reqid: 0, - request_lsn: Lsn(23), - not_modified_since: Lsn(23), - }, - batch_key: 42, - message: format!("message {}", msg), + let fut = sender.send(PagestreamFeMessage::Test(PagestreamTestRequest { + hdr: PagestreamRequest { + reqid: 0, + request_lsn: Lsn(23), + not_modified_since: Lsn(23), }, - )); + batch_key: 42, + message: format!("message {}", msg), + })); let Ok(res) = tokio::time::timeout(Duration::from_secs(10), fut).await else { eprintln!("pipe seems full"); break; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index ff435451c3..57087dc6c3 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -25,12 +25,13 @@ use pageserver_api::config::{ PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy, }; use pageserver_api::key::rel_block_to_key; -use pageserver_api::models::{ - self, PageTraceEvent, PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, +use pageserver_api::models::{PageTraceEvent, TenantState}; +use pageserver_api::pagestream_api::{ + self, PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse, PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetSlruSegmentRequest, PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest, PagestreamNblocksResponse, - PagestreamProtocolVersion, PagestreamRequest, TenantState, + PagestreamProtocolVersion, PagestreamRequest, }; use pageserver_api::reltag::SlruKind; use pageserver_api::shard::TenantShardId; @@ -712,7 +713,7 @@ struct BatchedGetPageRequest { #[cfg(feature = "testing")] struct BatchedTestRequest { - req: models::PagestreamTestRequest, + req: pagestream_api::PagestreamTestRequest, timer: SmgrOpTimer, } @@ -726,13 +727,13 @@ enum BatchedFeMessage { span: Span, timer: SmgrOpTimer, shard: WeakHandle, - req: models::PagestreamExistsRequest, + req: PagestreamExistsRequest, }, Nblocks { span: Span, timer: SmgrOpTimer, shard: WeakHandle, - req: models::PagestreamNblocksRequest, + req: PagestreamNblocksRequest, }, GetPage { span: Span, @@ -744,13 +745,13 @@ enum BatchedFeMessage { span: Span, timer: SmgrOpTimer, shard: WeakHandle, - req: models::PagestreamDbSizeRequest, + req: PagestreamDbSizeRequest, }, GetSlruSegment { span: Span, timer: SmgrOpTimer, shard: WeakHandle, - req: models::PagestreamGetSlruSegmentRequest, + req: PagestreamGetSlruSegmentRequest, }, #[cfg(feature = "testing")] Test { @@ -2443,10 +2444,9 @@ impl PageServerHandler { .map(|(req, res)| { res.map(|page| { ( - 
PagestreamBeMessage::GetPage(models::PagestreamGetPageResponse { - req: req.req, - page, - }), + PagestreamBeMessage::GetPage( + pagestream_api::PagestreamGetPageResponse { req: req.req, page }, + ), req.timer, req.ctx, ) @@ -2513,7 +2513,7 @@ impl PageServerHandler { .map(|(req, res)| { res.map(|()| { ( - PagestreamBeMessage::Test(models::PagestreamTestResponse { + PagestreamBeMessage::Test(pagestream_api::PagestreamTestResponse { req: req.req.clone(), }), req.timer, From e95f2f9a671d276afd72e669bbece122f305b39a Mon Sep 17 00:00:00 2001 From: Mikhail Date: Wed, 18 Jun 2025 13:25:19 +0100 Subject: [PATCH 069/124] compute_ctl: return LSN in /terminate (#12240) - Add optional `?mode=fast|immediate` to `/terminate`, `fast` is default. Immediate avoids waiting 30 seconds before returning from `terminate`. - Add `TerminateMode` to `ComputeStatus::TerminationPending` - Use `/terminate?mode=immediate` in `neon_local` instead of `pg_ctl stop` for `test_replica_promotes`. - Change `test_replica_promotes` to check returned LSN - Annotate `finish_sync_safekeepers` as `noreturn`. https://github.com/neondatabase/cloud/issues/29807 --- compute_tools/src/compute.rs | 33 ++++++++--- compute_tools/src/http/routes/terminate.rs | 39 ++++++++----- compute_tools/src/monitor.rs | 4 +- control_plane/src/bin/neon_local.rs | 21 ++++--- control_plane/src/endpoint.rs | 58 ++++++++++++++++--- libs/compute_api/src/responses.rs | 19 +++++- libs/desim/src/executor.rs | 4 +- libs/walproposer/src/api_bindings.rs | 2 +- libs/walproposer/src/walproposer.rs | 4 +- pgxn/neon/walproposer.h | 3 +- pgxn/neon/walproposer_pg.c | 2 +- .../tests/walproposer_sim/walproposer_api.rs | 2 +- test_runner/fixtures/neon_cli.py | 8 ++- test_runner/fixtures/neon_fixtures.py | 8 ++- test_runner/regress/test_replica_promotes.py | 29 ++++++++-- 15 files changed, 176 insertions(+), 60 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 9744cc2dac..d42e3cc860 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -163,6 +163,10 @@ pub struct ComputeState { pub lfc_prewarm_state: LfcPrewarmState, pub lfc_offload_state: LfcOffloadState, + /// WAL flush LSN that is set after terminating Postgres and syncing safekeepers if + /// mode == ComputeMode::Primary. None otherwise + pub terminate_flush_lsn: Option, + pub metrics: ComputeMetrics, } @@ -178,6 +182,7 @@ impl ComputeState { metrics: ComputeMetrics::default(), lfc_prewarm_state: LfcPrewarmState::default(), lfc_offload_state: LfcOffloadState::default(), + terminate_flush_lsn: None, } } @@ -531,12 +536,21 @@ impl ComputeNode { // Reap the postgres process delay_exit |= this.cleanup_after_postgres_exit()?; + // /terminate returns LSN. If we don't sleep at all, connection will break and we + // won't get result. If we sleep too much, tests will take significantly longer + // and Github Action run will error out + let sleep_duration = if delay_exit { + Duration::from_secs(30) + } else { + Duration::from_millis(300) + }; + // If launch failed, keep serving HTTP requests for a while, so the cloud // control plane can get the actual error. 
if delay_exit { info!("giving control plane 30s to collect the error before shutdown"); - std::thread::sleep(Duration::from_secs(30)); } + std::thread::sleep(sleep_duration); Ok(exit_code) } @@ -908,20 +922,25 @@ impl ComputeNode { // Maybe sync safekeepers again, to speed up next startup let compute_state = self.state.lock().unwrap().clone(); let pspec = compute_state.pspec.as_ref().expect("spec must be set"); - if matches!(pspec.spec.mode, compute_api::spec::ComputeMode::Primary) { + let lsn = if matches!(pspec.spec.mode, compute_api::spec::ComputeMode::Primary) { info!("syncing safekeepers on shutdown"); let storage_auth_token = pspec.storage_auth_token.clone(); let lsn = self.sync_safekeepers(storage_auth_token)?; - info!("synced safekeepers at lsn {lsn}"); - } + info!(%lsn, "synced safekeepers"); + Some(lsn) + } else { + info!("not primary, not syncing safekeepers"); + None + }; let mut delay_exit = false; let mut state = self.state.lock().unwrap(); - if state.status == ComputeStatus::TerminationPending { + state.terminate_flush_lsn = lsn; + if let ComputeStatus::TerminationPending { mode } = state.status { state.status = ComputeStatus::Terminated; self.state_changed.notify_all(); // we were asked to terminate gracefully, don't exit to avoid restart - delay_exit = true + delay_exit = mode == compute_api::responses::TerminateMode::Fast } drop(state); @@ -1792,7 +1811,7 @@ impl ComputeNode { // exit loop ComputeStatus::Failed - | ComputeStatus::TerminationPending + | ComputeStatus::TerminationPending { .. } | ComputeStatus::Terminated => break 'cert_update, // wait diff --git a/compute_tools/src/http/routes/terminate.rs b/compute_tools/src/http/routes/terminate.rs index 92a89c0ee7..32d90a5990 100644 --- a/compute_tools/src/http/routes/terminate.rs +++ b/compute_tools/src/http/routes/terminate.rs @@ -1,29 +1,39 @@ -use std::sync::Arc; - +use crate::compute::{ComputeNode, forward_termination_signal}; +use crate::http::JsonResponse; use axum::extract::State; -use axum::response::{IntoResponse, Response}; -use compute_api::responses::ComputeStatus; +use axum::response::Response; +use axum_extra::extract::OptionalQuery; +use compute_api::responses::{ComputeStatus, TerminateResponse}; use http::StatusCode; +use serde::Deserialize; +use std::sync::Arc; use tokio::task; use tracing::info; -use crate::compute::{ComputeNode, forward_termination_signal}; -use crate::http::JsonResponse; +#[derive(Deserialize, Default)] +pub struct TerminateQuery { + mode: compute_api::responses::TerminateMode, +} /// Terminate the compute. 
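/// Illustrative flow for the new query parameter (a sketch based on this patch,
/// not verbatim upstream docs): `POST /terminate?mode=immediate` skips the
/// 30-second grace sleep and, once Postgres has exited and (on a primary)
/// safekeepers have been synced, returns a JSON `TerminateResponse` whose `lsn`
/// field holds the flush LSN; replicas do not sync safekeepers, so their `lsn`
/// is null. `mode=fast` (the default) keeps the delayed exit so the control
/// plane can still collect a startup error.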
-pub(in crate::http) async fn terminate(State(compute): State>) -> Response { +pub(in crate::http) async fn terminate( + State(compute): State>, + OptionalQuery(terminate): OptionalQuery, +) -> Response { + let mode = terminate.unwrap_or_default().mode; { let mut state = compute.state.lock().unwrap(); if state.status == ComputeStatus::Terminated { - return StatusCode::CREATED.into_response(); + return JsonResponse::success(StatusCode::CREATED, state.terminate_flush_lsn); } if !matches!(state.status, ComputeStatus::Empty | ComputeStatus::Running) { return JsonResponse::invalid_status(state.status); } - - state.set_status(ComputeStatus::TerminationPending, &compute.state_changed); - drop(state); + state.set_status( + ComputeStatus::TerminationPending { mode }, + &compute.state_changed, + ); } forward_termination_signal(false); @@ -34,7 +44,7 @@ pub(in crate::http) async fn terminate(State(compute): State>) // be able to serve other requests while some particular request // is waiting for compute to finish configuration. let c = compute.clone(); - task::spawn_blocking(move || { + let lsn = task::spawn_blocking(move || { let mut state = c.state.lock().unwrap(); while state.status != ComputeStatus::Terminated { state = c.state_changed.wait(state).unwrap(); @@ -44,11 +54,10 @@ pub(in crate::http) async fn terminate(State(compute): State>) state.status ); } + state.terminate_flush_lsn }) .await .unwrap(); - info!("terminated Postgres"); - - StatusCode::OK.into_response() + JsonResponse::success(StatusCode::OK, TerminateResponse { lsn }) } diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index bacaf05cd5..8a2f6addad 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -83,7 +83,9 @@ impl ComputeMonitor { let compute_status = self.compute.get_status(); if matches!( compute_status, - ComputeStatus::Terminated | ComputeStatus::TerminationPending | ComputeStatus::Failed + ComputeStatus::Terminated + | ComputeStatus::TerminationPending { .. 
} + | ComputeStatus::Failed ) { info!( "compute is in {} status, stopping compute monitor", diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index aeabf4a519..21f55336aa 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -18,7 +18,7 @@ use clap::Parser; use compute_api::requests::ComputeClaimsScope; use compute_api::spec::ComputeMode; use control_plane::broker::StorageBroker; -use control_plane::endpoint::{ComputeControlPlane, PageserverProtocol}; +use control_plane::endpoint::{ComputeControlPlane, EndpointTerminateMode, PageserverProtocol}; use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_ADDR, EndpointStorage}; use control_plane::local_env; use control_plane::local_env::{ @@ -711,10 +711,9 @@ struct EndpointStopCmdArgs { )] destroy: bool, - #[clap(long, help = "Postgres shutdown mode, passed to \"pg_ctl -m \"")] - #[arg(value_parser(["smart", "fast", "immediate"]))] - #[arg(default_value = "fast")] - mode: String, + #[clap(long, help = "Postgres shutdown mode")] + #[clap(default_value = "fast")] + mode: EndpointTerminateMode, } #[derive(clap::Args)] @@ -1658,7 +1657,10 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res .endpoints .get(endpoint_id) .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?; - endpoint.stop(&args.mode, args.destroy)?; + match endpoint.stop(args.mode, args.destroy).await?.lsn { + Some(lsn) => println!("{lsn}"), + None => println!("null"), + } } EndpointCmd::GenerateJwt(args) => { let endpoint = { @@ -2090,11 +2092,16 @@ async fn handle_stop_all(args: &StopCmdArgs, env: &local_env::LocalEnv) -> Resul } async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { + let mode = if immediate { + EndpointTerminateMode::Immediate + } else { + EndpointTerminateMode::Fast + }; // Stop all endpoints match ComputeControlPlane::load(env.clone()) { Ok(cplane) => { for (_k, node) in cplane.endpoints { - if let Err(e) = node.stop(if immediate { "immediate" } else { "fast" }, false) { + if let Err(e) = node.stop(mode, false).await { eprintln!("postgres stop failed: {e:#}"); } } diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 2df71df57d..ae81e7abbe 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -52,7 +52,8 @@ use compute_api::requests::{ COMPUTE_AUDIENCE, ComputeClaims, ComputeClaimsScope, ConfigurationRequest, }; use compute_api::responses::{ - ComputeConfig, ComputeCtlConfig, ComputeStatus, ComputeStatusResponse, TlsConfig, + ComputeConfig, ComputeCtlConfig, ComputeStatus, ComputeStatusResponse, TerminateResponse, + TlsConfig, }; use compute_api::spec::{ Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent, @@ -341,13 +342,33 @@ pub enum EndpointStatus { impl Display for EndpointStatus { fn fmt(&self, writer: &mut std::fmt::Formatter) -> std::fmt::Result { - let s = match self { + writer.write_str(match self { Self::Running => "running", Self::Stopped => "stopped", Self::Crashed => "crashed", Self::RunningNoPidfile => "running, no pidfile", - }; - write!(writer, "{}", s) + }) + } +} + +#[derive(Default, Clone, Copy, clap::ValueEnum)] +pub enum EndpointTerminateMode { + #[default] + /// Use pg_ctl stop -m fast + Fast, + /// Use pg_ctl stop -m immediate + Immediate, + /// Use /terminate?mode=immediate + ImmediateTerminate, +} + +impl std::fmt::Display for EndpointTerminateMode { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> 
std::fmt::Result { + f.write_str(match &self { + EndpointTerminateMode::Fast => "fast", + EndpointTerminateMode::Immediate => "immediate", + EndpointTerminateMode::ImmediateTerminate => "immediate-terminate", + }) } } @@ -918,7 +939,7 @@ impl Endpoint { ComputeStatus::Empty | ComputeStatus::ConfigurationPending | ComputeStatus::Configuration - | ComputeStatus::TerminationPending + | ComputeStatus::TerminationPending { .. } | ComputeStatus::Terminated => { bail!("unexpected compute status: {:?}", state.status) } @@ -1040,8 +1061,27 @@ impl Endpoint { } } - pub fn stop(&self, mode: &str, destroy: bool) -> Result<()> { - self.pg_ctl(&["-m", mode, "stop"], &None)?; + pub async fn stop( + &self, + mode: EndpointTerminateMode, + destroy: bool, + ) -> Result { + // pg_ctl stop is fast but doesn't allow us to collect LSN. /terminate is + // slow, and test runs time out. Solution: special mode "immediate-terminate" + // which uses /terminate + let response = if let EndpointTerminateMode::ImmediateTerminate = mode { + let ip = self.external_http_address.ip(); + let port = self.external_http_address.port(); + let url = format!("http://{ip}:{port}/terminate?mode=immediate"); + let token = self.generate_jwt(Some(ComputeClaimsScope::Admin))?; + let request = reqwest::Client::new().post(url).bearer_auth(token); + let response = request.send().await.context("/terminate")?; + let text = response.text().await.context("/terminate result")?; + serde_json::from_str(&text).with_context(|| format!("deserializing {text}"))? + } else { + self.pg_ctl(&["-m", &mode.to_string(), "stop"], &None)?; + TerminateResponse { lsn: None } + }; // Also wait for the compute_ctl process to die. It might have some // cleanup work to do after postgres stops, like syncing safekeepers, @@ -1051,7 +1091,7 @@ impl Endpoint { // waiting. Sometimes we do *not* want this cleanup: tests intentionally // do stop when majority of safekeepers is down, so sync-safekeepers // would hang otherwise. This could be a separate flag though. - let send_sigterm = destroy || mode == "immediate"; + let send_sigterm = destroy || !matches!(mode, EndpointTerminateMode::Fast); self.wait_for_compute_ctl_to_exit(send_sigterm)?; if destroy { println!( @@ -1060,7 +1100,7 @@ impl Endpoint { ); std::fs::remove_dir_all(self.endpoint_path())?; } - Ok(()) + Ok(response) } pub fn connstr(&self, user: &str, db_name: &str) -> String { diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index 24d371c6eb..5cad849e3d 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -83,6 +83,16 @@ pub struct ComputeStatusResponse { pub error: Option, } +#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq, Default)] +#[serde(rename_all = "snake_case")] +pub enum TerminateMode { + #[default] + /// wait 30s till returning from /terminate to allow control plane to get the error + Fast, + /// return from /terminate immediately as soon as all components are terminated + Immediate, +} + #[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)] #[serde(rename_all = "snake_case")] pub enum ComputeStatus { @@ -103,11 +113,16 @@ pub enum ComputeStatus { // control-plane to terminate it. 
Failed, // Termination requested - TerminationPending, + TerminationPending { mode: TerminateMode }, // Terminated Postgres Terminated, } +#[derive(Deserialize, Serialize)] +pub struct TerminateResponse { + pub lsn: Option, +} + impl Display for ComputeStatus { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { @@ -117,7 +132,7 @@ impl Display for ComputeStatus { ComputeStatus::Running => f.write_str("running"), ComputeStatus::Configuration => f.write_str("configuration"), ComputeStatus::Failed => f.write_str("failed"), - ComputeStatus::TerminationPending => f.write_str("termination-pending"), + ComputeStatus::TerminationPending { .. } => f.write_str("termination-pending"), ComputeStatus::Terminated => f.write_str("terminated"), } } diff --git a/libs/desim/src/executor.rs b/libs/desim/src/executor.rs index df8b071c06..51b11ff97e 100644 --- a/libs/desim/src/executor.rs +++ b/libs/desim/src/executor.rs @@ -419,13 +419,13 @@ pub fn now() -> u64 { with_thread_context(|ctx| ctx.clock.get().unwrap().now()) } -pub fn exit(code: i32, msg: String) { +pub fn exit(code: i32, msg: String) -> ! { with_thread_context(|ctx| { ctx.allow_panic.store(true, Ordering::SeqCst); let mut result = ctx.result.lock(); *result = (code, msg); panic!("exit"); - }); + }) } pub(crate) fn get_thread_ctx() -> Arc { diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index 4d6cbae9a9..b89f1877fd 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -311,7 +311,7 @@ extern "C" fn get_redo_start_lsn(wp: *mut WalProposer) -> XLogRecPtr { } } -extern "C-unwind" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) { +unsafe extern "C-unwind" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) -> ! { unsafe { let callback_data = (*(*wp).config).callback_data; let api = callback_data as *mut Box; diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index e95494297c..c853658ddf 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -144,7 +144,7 @@ pub trait ApiImpl { todo!() } - fn finish_sync_safekeepers(&self, _lsn: u64) { + fn finish_sync_safekeepers(&self, _lsn: u64) -> ! { todo!() } @@ -469,7 +469,7 @@ mod tests { true } - fn finish_sync_safekeepers(&self, lsn: u64) { + fn finish_sync_safekeepers(&self, lsn: u64) -> ! { self.sync_channel.send(lsn).unwrap(); panic!("sync safekeepers finished at lsn={}", lsn); } diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 08087e5a55..4b223b6b18 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -679,8 +679,7 @@ typedef struct walproposer_api * Finish sync safekeepers with the given LSN. This function should not * return and should exit the program. */ - void (*finish_sync_safekeepers) (WalProposer *wp, XLogRecPtr lsn); - + void (*finish_sync_safekeepers) (WalProposer *wp, XLogRecPtr lsn) __attribute__((noreturn)) ; /* * Called after every AppendResponse from the safekeeper. 
Used to * propagate backpressure feedback and to confirm WAL persistence (has diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 3d6a92ad79..185fc83ace 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -1890,7 +1890,7 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32 return rc; } -static void +static void __attribute__((noreturn)) walprop_pg_finish_sync_safekeepers(WalProposer *wp, XLogRecPtr lsn) { fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(lsn)); diff --git a/safekeeper/tests/walproposer_sim/walproposer_api.rs b/safekeeper/tests/walproposer_sim/walproposer_api.rs index 82e7a32881..c2604c4bdc 100644 --- a/safekeeper/tests/walproposer_sim/walproposer_api.rs +++ b/safekeeper/tests/walproposer_sim/walproposer_api.rs @@ -499,7 +499,7 @@ impl ApiImpl for SimulationApi { true } - fn finish_sync_safekeepers(&self, lsn: u64) { + fn finish_sync_safekeepers(&self, lsn: u64) -> ! { debug!("finish_sync_safekeepers, lsn={}", lsn); executor::exit(0, Lsn(lsn).to_string()); } diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index 48a1a36e66..e177145294 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -620,7 +620,7 @@ class NeonLocalCli(AbstractNeonCli): destroy=False, check_return_code=True, mode: str | None = None, - ) -> subprocess.CompletedProcess[str]: + ) -> tuple[Lsn | None, subprocess.CompletedProcess[str]]: args = [ "endpoint", "stop", @@ -632,7 +632,11 @@ class NeonLocalCli(AbstractNeonCli): if endpoint_id is not None: args.append(endpoint_id) - return self.raw_cli(args, check_return_code=check_return_code) + proc = self.raw_cli(args, check_return_code=check_return_code) + log.debug(f"endpoint stop stdout: {proc.stdout}") + lsn_str = proc.stdout.split()[-1] + lsn: Lsn | None = None if lsn_str == "null" else Lsn(lsn_str) + return lsn, proc def mappings_map_branch( self, name: str, tenant_id: TenantId, timeline_id: TimelineId diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index df34573b12..970175a631 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4192,6 +4192,8 @@ class Endpoint(PgProtocol, LogUtils): self._running = threading.Semaphore(0) self.__jwt: str | None = None + self.terminate_flush_lsn: Lsn | None = None + def http_client(self, retries: Retry | None = None) -> EndpointHttpClient: assert self.__jwt is not None return EndpointHttpClient( @@ -4494,9 +4496,10 @@ class Endpoint(PgProtocol, LogUtils): running = self._running.acquire(blocking=False) if running: assert self.endpoint_id is not None - self.env.neon_cli.endpoint_stop( + lsn, _ = self.env.neon_cli.endpoint_stop( self.endpoint_id, check_return_code=self.check_stop_result, mode=mode ) + self.terminate_flush_lsn = lsn if sks_wait_walreceiver_gone is not None: for sk in sks_wait_walreceiver_gone[0]: @@ -4514,9 +4517,10 @@ class Endpoint(PgProtocol, LogUtils): running = self._running.acquire(blocking=False) if running: assert self.endpoint_id is not None - self.env.neon_cli.endpoint_stop( + lsn, _ = self.env.neon_cli.endpoint_stop( self.endpoint_id, True, check_return_code=self.check_stop_result, mode=mode ) + self.terminate_flush_lsn = lsn self.endpoint_id = None return self diff --git a/test_runner/regress/test_replica_promotes.py b/test_runner/regress/test_replica_promotes.py index e378d37635..4486901bae 100644 --- a/test_runner/regress/test_replica_promotes.py +++ 
b/test_runner/regress/test_replica_promotes.py @@ -4,13 +4,25 @@ File with secondary->primary promotion testing. This far, only contains a test that we don't break and that the data is persisted. """ +from typing import cast + import psycopg2 +from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import Endpoint, NeonEnv, wait_replica_caughtup from fixtures.pg_version import PgVersion from pytest import raises +def stop_and_check_lsn(ep: Endpoint, expected_lsn: Lsn | None): + ep.stop(mode="immediate-terminate") + lsn = ep.terminate_flush_lsn + if expected_lsn is not None: + assert lsn >= expected_lsn, f"{expected_lsn=} < {lsn=}" + else: + assert lsn == expected_lsn, f"{expected_lsn=} != {lsn=}" + + def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion): """ Test that a replica safely promotes, and can commit data updates which @@ -37,7 +49,9 @@ def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion): pg_current_wal_flush_lsn() """ ) - log.info(f"Primary: Current LSN after workload is {primary_cur.fetchone()}") + lsn_triple = cast("tuple[str, str, str]", primary_cur.fetchone()) + log.info(f"Primary: Current LSN after workload is {lsn_triple}") + expected_primary_lsn: Lsn = Lsn(lsn_triple[2]) primary_cur.execute("show neon.safekeepers") safekeepers = primary_cur.fetchall()[0][0] @@ -57,7 +71,7 @@ def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion): secondary_cur.execute("select count(*) from t") assert secondary_cur.fetchone() == (100,) - primary.stop_and_destroy(mode="immediate") + stop_and_check_lsn(primary, expected_primary_lsn) # Reconnect to the secondary to make sure we get a read-write connection promo_conn = secondary.connect() @@ -109,9 +123,10 @@ def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion): # wait_for_last_flush_lsn(env, secondary, env.initial_tenant, env.initial_timeline) - secondary.stop_and_destroy() + # secondaries don't sync safekeepers on finish so LSN will be None + stop_and_check_lsn(secondary, None) - primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary2") with primary.connect() as new_primary: new_primary_cur = new_primary.cursor() @@ -122,7 +137,9 @@ def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion): pg_current_wal_flush_lsn() """ ) - log.info(f"New primary: Boot LSN is {new_primary_cur.fetchone()}") + lsn_triple = cast("tuple[str, str, str]", new_primary_cur.fetchone()) + expected_primary_lsn = Lsn(lsn_triple[2]) + log.info(f"New primary: Boot LSN is {lsn_triple}") new_primary_cur.execute("select count(*) from t") assert new_primary_cur.fetchone() == (200,) @@ -130,4 +147,4 @@ def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion): new_primary_cur.execute("select count(*) from t") assert new_primary_cur.fetchone() == (300,) - primary.stop(mode="immediate") + stop_and_check_lsn(primary, expected_primary_lsn) From 7e711ede4448e2a83f42719df33e19c9c0e2606d Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Wed, 18 Jun 2025 14:40:25 +0200 Subject: [PATCH 070/124] Increase tenant size for large tenant oltp workload (#12260) ## Problem - We run the large tenant oltp workload with a fixed size (larger than existing customers' workloads). Our customer's workloads are continuously growing and our testing should stay ahead of the customers' production workloads. 
- we want to touch all tables in the tenant's database (updates) so that we simulate a continuous change in layer files like in a real production workload - our current oltp benchmark uses a mixture of read and write transactions, however we also want a separate test run with read-only transactions only ## Summary of changes - modify the existing workload to have a separate run with pgbench custom scripts that are read-only - create a new workload that - grows all large tables in each run (for the reuse branch in the large oltp tenant's project) - updates a percentage of rows in all large tables in each run (to enforce table bloat and auto-vacuum runs and layer rebuild in pageservers Each run of the new workflow increases the logical database size about 16 GB. We start with 6 runs per day which will give us about 96-100 GB growth per day. --------- Co-authored-by: Alexander Lakhin --- .github/workflows/large_oltp_benchmark.yml | 11 +- .github/workflows/large_oltp_growth.yml | 175 ++++++++++++++++++ .../grow_action_blocks.sql | 22 +++ .../grow_action_kwargs.sql | 11 ++ .../grow_device_fingerprint_event.sql | 56 ++++++ .../large_synthetic_oltp/grow_edges.sql | 10 + .../grow_hotel_rate_mapping.sql | 21 +++ .../grow_ocr_pipeline_results_version.sql | 31 ++++ .../grow_priceline_raw_response.sql | 18 ++ .../grow_relabled_transactions.sql | 26 +++ .../grow_state_values.sql | 9 + .../large_synthetic_oltp/grow_values.sql | 30 +++ .../large_synthetic_oltp/grow_vertices.sql | 26 +++ ...oding_body_tracking_category_selection.sql | 9 + .../update_action_blocks.sql | 9 + .../update_action_kwargs.sql | 9 + .../update_denormalized_approval_workflow.sql | 10 + .../update_device_fingerprint_event.sql | 9 + .../large_synthetic_oltp/update_edges.sql | 9 + .../update_heron_transaction_enriched_log.sql | 9 + ..._heron_transaction_enrichment_requests.sql | 9 + .../update_hotel_rate_mapping.sql | 9 + .../update_incoming_webhooks.sql | 9 + .../update_manual_transaction.sql | 9 + .../update_ml_receipt_matching_log.sql | 9 + .../update_ocr_pipeine_results_version.sql | 9 + .../update_orc_pipeline_step_results.sql | 9 + ...date_orc_pipeline_step_results_version.sql | 9 + .../update_priceline_raw_response.sql | 9 + .../update_quickbooks_transactions.sql | 9 + .../update_raw_finicity_transaction.sql | 15 ++ .../update_relabeled_transactions.sql | 9 + .../update_state_values.sql | 9 + .../update_stripe_authorization_event_log.sql | 9 + .../update_transaction.sql | 9 + .../large_synthetic_oltp/update_values.sql | 9 + .../large_synthetic_oltp/update_vertices.sql | 9 + .../test_perf_oltp_large_tenant.py | 14 +- 38 files changed, 679 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/large_oltp_growth.yml create mode 100644 test_runner/performance/large_synthetic_oltp/grow_action_blocks.sql create mode 100644 test_runner/performance/large_synthetic_oltp/grow_action_kwargs.sql create mode 100644 test_runner/performance/large_synthetic_oltp/grow_device_fingerprint_event.sql create mode 100644 test_runner/performance/large_synthetic_oltp/grow_edges.sql create mode 100644 test_runner/performance/large_synthetic_oltp/grow_hotel_rate_mapping.sql create mode 100644 test_runner/performance/large_synthetic_oltp/grow_ocr_pipeline_results_version.sql create mode 100644 test_runner/performance/large_synthetic_oltp/grow_priceline_raw_response.sql create mode 100644 test_runner/performance/large_synthetic_oltp/grow_relabled_transactions.sql create mode 100644 
test_runner/performance/large_synthetic_oltp/grow_state_values.sql create mode 100644 test_runner/performance/large_synthetic_oltp/grow_values.sql create mode 100644 test_runner/performance/large_synthetic_oltp/grow_vertices.sql create mode 100644 test_runner/performance/large_synthetic_oltp/update_accounting_coding_body_tracking_category_selection.sql create mode 100644 test_runner/performance/large_synthetic_oltp/update_action_blocks.sql create mode 100644 test_runner/performance/large_synthetic_oltp/update_action_kwargs.sql create mode 100644 test_runner/performance/large_synthetic_oltp/update_denormalized_approval_workflow.sql create mode 100644 test_runner/performance/large_synthetic_oltp/update_device_fingerprint_event.sql create mode 100644 test_runner/performance/large_synthetic_oltp/update_edges.sql create mode 100644 test_runner/performance/large_synthetic_oltp/update_heron_transaction_enriched_log.sql create mode 100644 test_runner/performance/large_synthetic_oltp/update_heron_transaction_enrichment_requests.sql create mode 100644 test_runner/performance/large_synthetic_oltp/update_hotel_rate_mapping.sql create mode 100644 test_runner/performance/large_synthetic_oltp/update_incoming_webhooks.sql create mode 100644 test_runner/performance/large_synthetic_oltp/update_manual_transaction.sql create mode 100644 test_runner/performance/large_synthetic_oltp/update_ml_receipt_matching_log.sql create mode 100644 test_runner/performance/large_synthetic_oltp/update_ocr_pipeine_results_version.sql create mode 100644 test_runner/performance/large_synthetic_oltp/update_orc_pipeline_step_results.sql create mode 100644 test_runner/performance/large_synthetic_oltp/update_orc_pipeline_step_results_version.sql create mode 100644 test_runner/performance/large_synthetic_oltp/update_priceline_raw_response.sql create mode 100644 test_runner/performance/large_synthetic_oltp/update_quickbooks_transactions.sql create mode 100644 test_runner/performance/large_synthetic_oltp/update_raw_finicity_transaction.sql create mode 100644 test_runner/performance/large_synthetic_oltp/update_relabeled_transactions.sql create mode 100644 test_runner/performance/large_synthetic_oltp/update_state_values.sql create mode 100644 test_runner/performance/large_synthetic_oltp/update_stripe_authorization_event_log.sql create mode 100644 test_runner/performance/large_synthetic_oltp/update_transaction.sql create mode 100644 test_runner/performance/large_synthetic_oltp/update_values.sql create mode 100644 test_runner/performance/large_synthetic_oltp/update_vertices.sql diff --git a/.github/workflows/large_oltp_benchmark.yml b/.github/workflows/large_oltp_benchmark.yml index 42dcc8e918..050b9047c7 100644 --- a/.github/workflows/large_oltp_benchmark.yml +++ b/.github/workflows/large_oltp_benchmark.yml @@ -33,11 +33,19 @@ jobs: fail-fast: false # allow other variants to continue even if one fails matrix: include: + # test only read-only custom scripts in new branch without database maintenance + - target: new_branch + custom_scripts: select_any_webhook_with_skew.sql@300 select_recent_webhook.sql@397 select_prefetch_webhook.sql@3 + test_maintenance: false + # test all custom scripts in new branch with database maintenance - target: new_branch custom_scripts: insert_webhooks.sql@200 select_any_webhook_with_skew.sql@300 select_recent_webhook.sql@397 select_prefetch_webhook.sql@3 IUD_one_transaction.sql@100 + test_maintenance: true + # test all custom scripts in reuse branch with database maintenance - target: reuse_branch 
custom_scripts: insert_webhooks.sql@200 select_any_webhook_with_skew.sql@300 select_recent_webhook.sql@397 select_prefetch_webhook.sql@3 IUD_one_transaction.sql@100 - max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results + test_maintenance: true + max-parallel: 1 # we want to run each benchmark sequentially to not have noisy neighbors on shared storage (PS, SK) permissions: contents: write statuses: write @@ -145,6 +153,7 @@ jobs: PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" - name: Benchmark database maintenance + if: ${{ matrix.test_maintenance == 'true' }} uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} diff --git a/.github/workflows/large_oltp_growth.yml b/.github/workflows/large_oltp_growth.yml new file mode 100644 index 0000000000..8ca640d6ef --- /dev/null +++ b/.github/workflows/large_oltp_growth.yml @@ -0,0 +1,175 @@ +name: large oltp growth +# workflow to grow the reuse branch of large oltp benchmark continuously (about 16 GB per run) + +on: + # uncomment to run on push for debugging your PR + # push: + # branches: [ bodobolero/increase_large_oltp_workload ] + + schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '0 6 * * *' # 06:00 UTC + - cron: '0 8 * * *' # 08:00 UTC + - cron: '0 10 * * *' # 10:00 UTC + - cron: '0 12 * * *' # 12:00 UTC + - cron: '0 14 * * *' # 14:00 UTC + - cron: '0 16 * * *' # 16:00 UTC + workflow_dispatch: # adds ability to run this manually + +defaults: + run: + shell: bash -euxo pipefail {0} + +concurrency: + # Allow only one workflow globally because we need dedicated resources which only exist once + group: large-oltp-growth + cancel-in-progress: true + +permissions: + contents: read + +jobs: + oltp: + strategy: + fail-fast: false # allow other variants to continue even if one fails + matrix: + include: + # for now only grow the reuse branch, not the other branches. 
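+          # Each entry below is space-joined into TEST_PGBENCH_CUSTOM_SCRIPTS via
+          # `join(matrix.custom_scripts, ' ')` in the env block further down. For
+          # illustration (an assumption about the test harness, not spelled out here):
+          # the perf test is expected to hand each entry to pgbench as a custom script,
+          # i.e. `pgbench -f <script>[@weight]`; no `@weight` is given in this list, so
+          # each grow_*/update_* script would run with pgbench's default weight of 1,
+          # unlike the weighted mix used in large_oltp_benchmark.yml.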
+ - target: reuse_branch + custom_scripts: + - grow_action_blocks.sql + - grow_action_kwargs.sql + - grow_device_fingerprint_event.sql + - grow_edges.sql + - grow_hotel_rate_mapping.sql + - grow_ocr_pipeline_results_version.sql + - grow_priceline_raw_response.sql + - grow_relabled_transactions.sql + - grow_state_values.sql + - grow_values.sql + - grow_vertices.sql + - update_accounting_coding_body_tracking_category_selection.sql + - update_action_blocks.sql + - update_action_kwargs.sql + - update_denormalized_approval_workflow.sql + - update_device_fingerprint_event.sql + - update_edges.sql + - update_heron_transaction_enriched_log.sql + - update_heron_transaction_enrichment_requests.sql + - update_hotel_rate_mapping.sql + - update_incoming_webhooks.sql + - update_manual_transaction.sql + - update_ml_receipt_matching_log.sql + - update_ocr_pipeine_results_version.sql + - update_orc_pipeline_step_results.sql + - update_orc_pipeline_step_results_version.sql + - update_priceline_raw_response.sql + - update_quickbooks_transactions.sql + - update_raw_finicity_transaction.sql + - update_relabeled_transactions.sql + - update_state_values.sql + - update_stripe_authorization_event_log.sql + - update_transaction.sql + - update_values.sql + - update_vertices.sql + max-parallel: 1 # we want to run each growth workload sequentially (for now there is just one) + permissions: + contents: write + statuses: write + id-token: write # aws-actions/configure-aws-credentials + env: + TEST_PG_BENCH_DURATIONS_MATRIX: "1h" + TEST_PGBENCH_CUSTOM_SCRIPTS: ${{ join(matrix.custom_scripts, ' ') }} + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + PG_VERSION: 16 # pre-determined by pre-determined project + TEST_OUTPUT: /tmp/test_output + BUILD_TYPE: remote + PLATFORM: ${{ matrix.target }} + + runs-on: [ self-hosted, us-east-2, x64 ] + container: + image: ghcr.io/neondatabase/build-tools:pinned-bookworm + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + options: --init + + steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Configure AWS credentials # necessary to download artefacts + uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + + - name: Set up Connection String + id: set-up-connstr + run: | + case "${{ matrix.target }}" in + reuse_branch) + CONNSTR=${{ secrets.BENCHMARK_LARGE_OLTP_REUSE_CONNSTR }} + ;; + *) + echo >&2 "Unknown target=${{ matrix.target }}" + exit 1 + ;; + esac + + CONNSTR_WITHOUT_POOLER="${CONNSTR//-pooler/}" + + echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT + echo "connstr_without_pooler=${CONNSTR_WITHOUT_POOLER}" >> $GITHUB_OUTPUT + + - name: pgbench with custom-scripts + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance + run_in_parallel: false + save_perf_report: true + extra_params: -m remote_cluster --timeout 7200 -k 
test_perf_oltp_large_tenant_growth + pg_version: ${{ env.PG_VERSION }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + + - name: Create Allure report + id: create-allure-report + if: ${{ !cancelled() }} + uses: ./.github/actions/allure-report-generate + with: + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + + - name: Post to a Slack channel + if: ${{ github.event.schedule && failure() }} + uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1 + with: + channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream + slack-message: | + Periodic large oltp tenant growth increase: ${{ job.status }} + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + <${{ steps.create-allure-report.outputs.report-url }}|Allure report> + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/test_runner/performance/large_synthetic_oltp/grow_action_blocks.sql b/test_runner/performance/large_synthetic_oltp/grow_action_blocks.sql new file mode 100644 index 0000000000..0860b76331 --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/grow_action_blocks.sql @@ -0,0 +1,22 @@ +-- add 100000 rows or approximately 11 MB to the action_blocks table +-- takes about 1 second +INSERT INTO workflows.action_blocks ( + id, + uuid, + created_at, + status, + function_signature, + reference_id, + blocking, + run_synchronously +) +SELECT + id, + uuid_generate_v4(), + now() - (random() * interval '100 days'), -- Random date within the last 100 days + 'CONDITIONS_NOT_MET', + 'function_signature_' || id, -- Create a unique function signature using id + CASE WHEN random() > 0.5 THEN 'reference_' || id ELSE NULL END, -- 50% chance of being NULL + true, + CASE WHEN random() > 0.5 THEN true ELSE false END -- Random boolean value +FROM generate_series(1, 100000) AS id; \ No newline at end of file diff --git a/test_runner/performance/large_synthetic_oltp/grow_action_kwargs.sql b/test_runner/performance/large_synthetic_oltp/grow_action_kwargs.sql new file mode 100644 index 0000000000..8a2b7c398a --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/grow_action_kwargs.sql @@ -0,0 +1,11 @@ +-- add 100000 rows or approximately 10 MB to the action_kwargs table +-- takes about 5 minutes +INSERT INTO workflows.action_kwargs (created_at, key, uuid, value_id, state_value_id, action_block_id) +SELECT + now(), -- Using the default value for `created_at` + 'key_' || gs.id, -- Generating a unique key based on the id + uuid_generate_v4(), -- Generating a new UUID for each row + CASE WHEN gs.id % 2 = 0 THEN gs.id ELSE NULL END, -- Setting value_id for even ids + CASE WHEN gs.id % 2 <> 0 THEN gs.id ELSE NULL END, -- Setting state_value_id for odd ids + 1 -- Setting action_block_id as 1 for simplicity +FROM generate_series(1, 100000) AS gs(id); \ No newline at end of file diff --git a/test_runner/performance/large_synthetic_oltp/grow_device_fingerprint_event.sql b/test_runner/performance/large_synthetic_oltp/grow_device_fingerprint_event.sql new file mode 100644 index 0000000000..1ef38451b7 --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/grow_device_fingerprint_event.sql @@ -0,0 +1,56 @@ +-- add 100000 rows or approx. 
30 MB to the device_fingerprint_event table +-- takes about 4 minutes +INSERT INTO authentication.device_fingerprint_event ( + uuid, + created_at, + identity_uuid, + fingerprint_request_id, + fingerprint_id, + confidence_score, + ip_address, + url, + client_referrer, + last_seen_at, + raw_fingerprint_response, + session_uuid, + fingerprint_response, + browser_version, + browser_name, + device, + operating_system, + operating_system_version, + user_agent, + ip_address_location_city, + ip_address_location_region, + ip_address_location_country_code, + ip_address_location_latitude, + ip_address_location_longitude, + is_incognito +) +SELECT + gen_random_uuid(), -- Generates a random UUID for primary key + now() - (random() * interval '10 days'), -- Random timestamp within the last 10 days + gen_random_uuid(), -- Random UUID for identity + md5(gs::text), -- Simulates unique fingerprint request ID using `md5` hash of series number + md5((gs + 10000)::text), -- Simulates unique fingerprint ID + round(CAST(random() AS numeric), 2), -- Generates a random score between 0 and 1, cast `random()` to numeric + '192.168.' || (random() * 255)::int || '.' || (random() * 255)::int, -- Random IP address + 'https://example.com/' || (gs % 1000), -- Random URL with series number suffix + CASE WHEN random() < 0.5 THEN NULL ELSE 'https://referrer.com/' || (gs % 100)::text END, -- Random referrer, 50% chance of being NULL + now() - (random() * interval '5 days'), -- Last seen timestamp within the last 5 days + NULL, -- Keeping raw_fingerprint_response NULL for simplicity + CASE WHEN random() < 0.3 THEN gen_random_uuid() ELSE NULL END, -- Session UUID, 30% chance of NULL + NULL, -- Keeping fingerprint_response NULL for simplicity + CASE WHEN random() < 0.5 THEN '93.0' ELSE '92.0' END, -- Random browser version + CASE WHEN random() < 0.5 THEN 'Firefox' ELSE 'Chrome' END, -- Random browser name + CASE WHEN random() < 0.5 THEN 'Desktop' ELSE 'Mobile' END, -- Random device type + 'Windows', -- Static value for operating system + '10.0', -- Static value for operating system version + 'Mozilla/5.0', -- Static value for user agent + 'City ' || (gs % 1000)::text, -- Random city name + 'Region ' || (gs % 100)::text, -- Random region name + 'US', -- Static country code + random() * 180 - 90, -- Random latitude between -90 and 90 + random() * 360 - 180, -- Random longitude between -180 and 180 + random() < 0.1 -- 10% chance of being incognito +FROM generate_series(1, 100000) AS gs; \ No newline at end of file diff --git a/test_runner/performance/large_synthetic_oltp/grow_edges.sql b/test_runner/performance/large_synthetic_oltp/grow_edges.sql new file mode 100644 index 0000000000..17f289fe5b --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/grow_edges.sql @@ -0,0 +1,10 @@ +-- add 100000 rows or approximately 11 MB to the edges table +-- takes about 1 minute +INSERT INTO workflows.edges (created_at, workflow_id, uuid, from_vertex_id, to_vertex_id) +SELECT + now() - (random() * interval '365 days'), -- Random `created_at` timestamp in the last year + (random() * 100)::int + 1, -- Random `workflow_id` between 1 and 100 + uuid_generate_v4(), -- Generate a new UUID for each row + (random() * 100000)::bigint + 1, -- Random `from_vertex_id` between 1 and 100,000 + (random() * 100000)::bigint + 1 -- Random `to_vertex_id` between 1 and 100,000 +FROM generate_series(1, 100000) AS gs; -- Generate 100,000 sequential IDs \ No newline at end of file diff --git 
a/test_runner/performance/large_synthetic_oltp/grow_hotel_rate_mapping.sql b/test_runner/performance/large_synthetic_oltp/grow_hotel_rate_mapping.sql new file mode 100644 index 0000000000..1e79f94eab --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/grow_hotel_rate_mapping.sql @@ -0,0 +1,21 @@ +-- add 100000 rows or approximately 10 MB to the hotel_rate_mapping table +-- takes about 1 second +INSERT INTO booking_inventory.hotel_rate_mapping ( + uuid, + created_at, + updated_at, + hotel_rate_id, + remote_id, + source +) +SELECT + uuid_generate_v4(), -- Unique UUID for each row + now(), -- Created at timestamp + now(), -- Updated at timestamp + 'rate_' || gs AS hotel_rate_id, -- Unique hotel_rate_id + 'remote_' || gs AS remote_id, -- Unique remote_id + CASE WHEN gs % 3 = 0 THEN 'source_1' + WHEN gs % 3 = 1 THEN 'source_2' + ELSE 'source_3' + END AS source -- Distributing sources among three options +FROM generate_series(1, 100000) AS gs; \ No newline at end of file diff --git a/test_runner/performance/large_synthetic_oltp/grow_ocr_pipeline_results_version.sql b/test_runner/performance/large_synthetic_oltp/grow_ocr_pipeline_results_version.sql new file mode 100644 index 0000000000..21ebac74d2 --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/grow_ocr_pipeline_results_version.sql @@ -0,0 +1,31 @@ +-- add 100000 rows or approximately 20 MB to the ocr_pipeline_results_version table +-- takes about 1 second +INSERT INTO ocr.ocr_pipeline_results_version ( + id, transaction_id, operation_type, created_at, updated_at, s3_filename, completed_at, result, + end_transaction_id, pipeline_type, is_async, callback, callback_kwargs, input, error, file_type, s3_bucket_name, pipeline_kwargs +) +SELECT + gs.aid, -- id + gs.aid, -- transaction_id (same as id for simplicity) + (gs.aid % 5)::smallint + 1, -- operation_type (cyclic values from 1 to 5) + now() - interval '1 day' * (random() * 30), -- created_at (random timestamp within the last 30 days) + now() - interval '1 day' * (random() * 30), -- updated_at (random timestamp within the last 30 days) + 's3_file_' || gs.aid || '.txt', -- s3_filename (synthetic filename) + now() - interval '1 day' * (random() * 30), -- completed_at (random timestamp within the last 30 days) + '{}'::jsonb, -- result (empty JSON object) + NULL, -- end_transaction_id (NULL) + CASE (gs.aid % 3) -- pipeline_type (cyclic text values) + WHEN 0 THEN 'OCR' + WHEN 1 THEN 'PDF' + ELSE 'Image' + END, + gs.aid % 2 = 0, -- is_async (alternating between true and false) + 'http://callback/' || gs.aid, -- callback (synthetic URL) + '{}'::jsonb, -- callback_kwargs (empty JSON object) + 'Input text ' || gs.aid, -- input (synthetic input text) + NULL, -- error (NULL) + 'pdf', -- file_type (default to 'pdf') + 'bucket_' || gs.aid % 10, -- s3_bucket_name (synthetic bucket names) + '{}'::jsonb -- pipeline_kwargs (empty JSON object) +FROM + generate_series(1, 100000) AS gs(aid); \ No newline at end of file diff --git a/test_runner/performance/large_synthetic_oltp/grow_priceline_raw_response.sql b/test_runner/performance/large_synthetic_oltp/grow_priceline_raw_response.sql new file mode 100644 index 0000000000..28c4f1a7fb --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/grow_priceline_raw_response.sql @@ -0,0 +1,18 @@ +-- add 100000 rows or approx. 
20 MB to the priceline_raw_response table +-- takes about 20 seconds +INSERT INTO booking_inventory.priceline_raw_response ( + uuid, created_at, updated_at, url, base_url, path, method, params, request, response +) +SELECT + gen_random_uuid(), -- Generate random UUIDs + now() - (random() * interval '30 days'), -- Random creation time within the past 30 days + now() - (random() * interval '30 days'), -- Random update time within the past 30 days + 'https://example.com/resource/' || gs, -- Construct a unique URL for each row + 'https://example.com', -- Base URL for all rows + '/resource/' || gs, -- Path for each row + CASE WHEN gs % 2 = 0 THEN 'GET' ELSE 'POST' END, -- Alternate between GET and POST methods + 'id=' || gs, -- Simple parameter pattern for each row + '{}'::jsonb, -- Empty JSON object for request + jsonb_build_object('status', 'success', 'data', gs) -- Construct a valid JSON response +FROM + generate_series(1, 100000) AS gs; \ No newline at end of file diff --git a/test_runner/performance/large_synthetic_oltp/grow_relabled_transactions.sql b/test_runner/performance/large_synthetic_oltp/grow_relabled_transactions.sql new file mode 100644 index 0000000000..0b1aa2d2bd --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/grow_relabled_transactions.sql @@ -0,0 +1,26 @@ +-- add 100000 rows or approx. 1 MB to the relabeled_transactions table +-- takes about 1 second +INSERT INTO heron.relabeled_transactions ( + id, + created_at, + universal_transaction_id, + raw_result, + category, + category_confidence, + merchant, + batch_id +) +SELECT + gs.aid AS id, + now() - (gs.aid % 1000) * interval '1 second' AS created_at, + 'txn_' || gs.aid AS universal_transaction_id, + '{}'::jsonb AS raw_result, + CASE WHEN gs.aid % 5 = 0 THEN 'grocery' + WHEN gs.aid % 5 = 1 THEN 'electronics' + WHEN gs.aid % 5 = 2 THEN 'clothing' + WHEN gs.aid % 5 = 3 THEN 'utilities' + ELSE NULL END AS category, + ROUND(RANDOM()::numeric, 2) AS category_confidence, + CASE WHEN gs.aid % 2 = 0 THEN 'Merchant_' || gs.aid % 20 ELSE NULL END AS merchant, + gs.aid % 100 + 1 AS batch_id +FROM generate_series(1, 100000) AS gs(aid); \ No newline at end of file diff --git a/test_runner/performance/large_synthetic_oltp/grow_state_values.sql b/test_runner/performance/large_synthetic_oltp/grow_state_values.sql new file mode 100644 index 0000000000..8a8ce146be --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/grow_state_values.sql @@ -0,0 +1,9 @@ +-- add 100000 rows or approx.10 MB to the state_values table +-- takes about 14 seconds +INSERT INTO workflows.state_values (key, workflow_id, state_type, value_id) +SELECT + 'key_' || gs::text, -- Key: Generate as 'key_1', 'key_2', etc. + (gs - 1) / 1000 + 1, -- workflow_id: Distribute over a range (1000 workflows) + 'STATIC', -- state_type: Use constant 'STATIC' as defined in schema + gs::bigint -- value_id: Use the same as the series value +FROM generate_series(1, 100000) AS gs; -- Generate 100,000 rows \ No newline at end of file diff --git a/test_runner/performance/large_synthetic_oltp/grow_values.sql b/test_runner/performance/large_synthetic_oltp/grow_values.sql new file mode 100644 index 0000000000..3afdafdf86 --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/grow_values.sql @@ -0,0 +1,30 @@ +-- add 100000 rows or approx. 
24 MB to the values table +-- takes about 126 seconds +INSERT INTO workflows.values ( + id, + type, + int_value, + string_value, + child_type, + bool_value, + uuid, + numeric_value, + workflow_id, + jsonb_value, + parent_value_id +) +SELECT + gs AS id, + 'TYPE_A' AS type, + CASE WHEN selector = 1 THEN gs ELSE NULL END AS int_value, + CASE WHEN selector = 2 THEN 'string_value_' || gs::text ELSE NULL END AS string_value, + 'CHILD_TYPE_A' AS child_type, -- Always non-null + CASE WHEN selector = 3 THEN (gs % 2 = 0) ELSE NULL END AS bool_value, + uuid_generate_v4() AS uuid, -- Always non-null + CASE WHEN selector = 4 THEN gs * 1.0 ELSE NULL END AS numeric_value, + (array[1, 2, 3, 4, 5])[gs % 5 + 1] AS workflow_id, -- Use only existing workflow IDs + CASE WHEN selector = 5 THEN ('{"key":' || gs::text || '}')::jsonb ELSE NULL END AS jsonb_value, + (gs % 100) + 1 AS parent_value_id -- Always non-null +FROM + generate_series(1, 100000) AS gs, + (SELECT floor(random() * 5 + 1)::int AS selector) AS s; \ No newline at end of file diff --git a/test_runner/performance/large_synthetic_oltp/grow_vertices.sql b/test_runner/performance/large_synthetic_oltp/grow_vertices.sql new file mode 100644 index 0000000000..87a2410e8a --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/grow_vertices.sql @@ -0,0 +1,26 @@ +-- add 100000 rows or approx. 18 MB to the vertices table +-- takes about 90 seconds +INSERT INTO workflows.vertices( + uuid, + created_at, + condition_block_id, + operator, + has_been_visited, + reference_id, + workflow_id, + meta_data, + -- id, + action_block_id +) +SELECT + uuid_generate_v4() AS uuid, + now() AS created_at, + CASE WHEN (gs % 2 = 0) THEN gs % 10 ELSE NULL END AS condition_block_id, -- Every alternate row has a condition_block_id + 'operator_' || (gs % 10) AS operator, -- Cyclical operator values (e.g., operator_0, operator_1) + false AS has_been_visited, + 'ref_' || gs AS reference_id, -- Unique reference_id for each row + (gs % 1000) + 1 AS workflow_id, -- Random workflow_id values between 1 and 1000 + '{}'::jsonb AS meta_data, -- Empty JSON metadata + -- gs AS id, -- default from sequence to get unique ID + CASE WHEN (gs % 2 = 1) THEN gs ELSE NULL END AS action_block_id -- Complementary to condition_block_id +FROM generate_series(1, 100000) AS gs; \ No newline at end of file diff --git a/test_runner/performance/large_synthetic_oltp/update_accounting_coding_body_tracking_category_selection.sql b/test_runner/performance/large_synthetic_oltp/update_accounting_coding_body_tracking_category_selection.sql new file mode 100644 index 0000000000..78688fc8ba --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/update_accounting_coding_body_tracking_category_selection.sql @@ -0,0 +1,9 @@ +-- update approximately 2000 rows or 200 kb in the accounting_coding_body_tracking_category_selection table +-- takes about 1 second +UPDATE accounting.accounting_coding_body_tracking_category_selection +SET created_at = now() +WHERE ctid in ( + SELECT ctid + FROM accounting.accounting_coding_body_tracking_category_selection + TABLESAMPLE SYSTEM (0.0005) +); diff --git a/test_runner/performance/large_synthetic_oltp/update_action_blocks.sql b/test_runner/performance/large_synthetic_oltp/update_action_blocks.sql new file mode 100644 index 0000000000..ad1ee6c749 --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/update_action_blocks.sql @@ -0,0 +1,9 @@ +-- update approximately 9000 rows or 1 MB in the action_blocks table +-- takes about 1 second +UPDATE 
workflows.action_blocks +SET run_synchronously = NOT run_synchronously +WHERE ctid in ( + SELECT ctid + FROM workflows.action_blocks + TABLESAMPLE SYSTEM (0.001) +); diff --git a/test_runner/performance/large_synthetic_oltp/update_action_kwargs.sql b/test_runner/performance/large_synthetic_oltp/update_action_kwargs.sql new file mode 100644 index 0000000000..b939c0ff2d --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/update_action_kwargs.sql @@ -0,0 +1,9 @@ +-- update approximately 5000 rows or 1 MB in the action_kwargs table +-- takes about 1 second +UPDATE workflows.action_kwargs +SET created_at = now() +WHERE ctid in ( + SELECT ctid + FROM workflows.action_kwargs + TABLESAMPLE SYSTEM (0.0002) +); diff --git a/test_runner/performance/large_synthetic_oltp/update_denormalized_approval_workflow.sql b/test_runner/performance/large_synthetic_oltp/update_denormalized_approval_workflow.sql new file mode 100644 index 0000000000..671ddbc2d4 --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/update_denormalized_approval_workflow.sql @@ -0,0 +1,10 @@ +-- update approximately 3000 rows or 500 KB in the denormalized_approval_workflow table +-- takes about 1 second +UPDATE approvals_v2.denormalized_approval_workflow +SET created_at = now() +WHERE ctid in ( + SELECT ctid + FROM approvals_v2.denormalized_approval_workflow + TABLESAMPLE SYSTEM (0.0005) +); + diff --git a/test_runner/performance/large_synthetic_oltp/update_device_fingerprint_event.sql b/test_runner/performance/large_synthetic_oltp/update_device_fingerprint_event.sql new file mode 100644 index 0000000000..20baf12887 --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/update_device_fingerprint_event.sql @@ -0,0 +1,9 @@ +-- update approximately 2000 rows or 1 MB in the device_fingerprint_event table +-- takes about 5 seconds +UPDATE authentication.device_fingerprint_event +SET is_incognito = NOT is_incognito +WHERE ctid in ( + SELECT ctid + FROM authentication.device_fingerprint_event + TABLESAMPLE SYSTEM (0.001) +); diff --git a/test_runner/performance/large_synthetic_oltp/update_edges.sql b/test_runner/performance/large_synthetic_oltp/update_edges.sql new file mode 100644 index 0000000000..d79da78de3 --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/update_edges.sql @@ -0,0 +1,9 @@ +-- update approximately 4000 rows or 600 kb in the edges table +-- takes about 1 second +UPDATE workflows.edges +SET created_at = now() +WHERE ctid in ( + SELECT ctid + FROM workflows.edges + TABLESAMPLE SYSTEM (0.0005) +); diff --git a/test_runner/performance/large_synthetic_oltp/update_heron_transaction_enriched_log.sql b/test_runner/performance/large_synthetic_oltp/update_heron_transaction_enriched_log.sql new file mode 100644 index 0000000000..5bcc885736 --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/update_heron_transaction_enriched_log.sql @@ -0,0 +1,9 @@ +-- update approximately 10000 rows or 200 KB in the heron_transaction_enriched_log table +-- takes about 1 minutes +UPDATE heron.heron_transaction_enriched_log +SET created_at = now() +WHERE ctid in ( + SELECT ctid + FROM heron.heron_transaction_enriched_log + TABLESAMPLE SYSTEM (0.0005) +); diff --git a/test_runner/performance/large_synthetic_oltp/update_heron_transaction_enrichment_requests.sql b/test_runner/performance/large_synthetic_oltp/update_heron_transaction_enrichment_requests.sql new file mode 100644 index 0000000000..02cf0ca420 --- /dev/null +++ 
b/test_runner/performance/large_synthetic_oltp/update_heron_transaction_enrichment_requests.sql @@ -0,0 +1,9 @@ +-- update approximately 4000 rows or 1 MB in the heron_transaction_enrichment_requests table +-- takes about 2 minutes +UPDATE heron.heron_transaction_enrichment_requests +SET created_at = now() +WHERE ctid in ( + SELECT ctid + FROM heron.heron_transaction_enrichment_requests + TABLESAMPLE SYSTEM (0.0002) +); diff --git a/test_runner/performance/large_synthetic_oltp/update_hotel_rate_mapping.sql b/test_runner/performance/large_synthetic_oltp/update_hotel_rate_mapping.sql new file mode 100644 index 0000000000..3210b6dff8 --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/update_hotel_rate_mapping.sql @@ -0,0 +1,9 @@ +-- update approximately 6000 rows or 600 kb in the hotel_rate_mapping table +-- takes about 1 second +UPDATE booking_inventory.hotel_rate_mapping +SET created_at = now() +WHERE ctid in ( + SELECT ctid + FROM booking_inventory.hotel_rate_mapping + TABLESAMPLE SYSTEM (0.0005) +); diff --git a/test_runner/performance/large_synthetic_oltp/update_incoming_webhooks.sql b/test_runner/performance/large_synthetic_oltp/update_incoming_webhooks.sql new file mode 100644 index 0000000000..ea284eb47c --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/update_incoming_webhooks.sql @@ -0,0 +1,9 @@ +-- update approximately 2000 rows or 1 MB in the incoming_webhooks table +-- takes about 5 seconds +UPDATE webhook.incoming_webhooks +SET is_body_encrypted = NOT is_body_encrypted +WHERE ctid in ( + SELECT ctid + FROM webhook.incoming_webhooks + TABLESAMPLE SYSTEM (0.0002) +); diff --git a/test_runner/performance/large_synthetic_oltp/update_manual_transaction.sql b/test_runner/performance/large_synthetic_oltp/update_manual_transaction.sql new file mode 100644 index 0000000000..190bc625e2 --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/update_manual_transaction.sql @@ -0,0 +1,9 @@ +-- update approximately 1000 rows or 200 kb in the manual_transaction table +-- takes about 2 seconds +UPDATE banking.manual_transaction +SET created_at = now() +WHERE ctid in ( + SELECT ctid + FROM banking.manual_transaction + TABLESAMPLE SYSTEM (0.0005) +); diff --git a/test_runner/performance/large_synthetic_oltp/update_ml_receipt_matching_log.sql b/test_runner/performance/large_synthetic_oltp/update_ml_receipt_matching_log.sql new file mode 100644 index 0000000000..810021b09d --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/update_ml_receipt_matching_log.sql @@ -0,0 +1,9 @@ +-- update approximately 1000 rows or 100 kb in the ml_receipt_matching_log table +-- takes about 1 second +UPDATE receipt.ml_receipt_matching_log +SET is_shadow_mode = NOT is_shadow_mode +WHERE ctid in ( + SELECT ctid + FROM receipt.ml_receipt_matching_log + TABLESAMPLE SYSTEM (0.0005) +); diff --git a/test_runner/performance/large_synthetic_oltp/update_ocr_pipeine_results_version.sql b/test_runner/performance/large_synthetic_oltp/update_ocr_pipeine_results_version.sql new file mode 100644 index 0000000000..a1da8fdb07 --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/update_ocr_pipeine_results_version.sql @@ -0,0 +1,9 @@ +-- update approximately 2000 rows or 400 kb in the ocr_pipeline_results_version table +-- takes about 1 second +UPDATE ocr.ocr_pipeline_results_version +SET is_async = NOT is_async +WHERE ctid in ( + SELECT ctid + FROM ocr.ocr_pipeline_results_version + TABLESAMPLE SYSTEM (0.0005) +); diff --git 
a/test_runner/performance/large_synthetic_oltp/update_orc_pipeline_step_results.sql b/test_runner/performance/large_synthetic_oltp/update_orc_pipeline_step_results.sql new file mode 100644 index 0000000000..b7bb4932bd --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/update_orc_pipeline_step_results.sql @@ -0,0 +1,9 @@ +-- update approximately 3000 rows or 1 MB in the ocr_pipeline_step_results table +-- takes about 11 seconds +UPDATE ocr.ocr_pipeline_step_results +SET created_at = now() +WHERE ctid in ( + SELECT ctid + FROM ocr.ocr_pipeline_step_results + TABLESAMPLE SYSTEM (0.0005) +); diff --git a/test_runner/performance/large_synthetic_oltp/update_orc_pipeline_step_results_version.sql b/test_runner/performance/large_synthetic_oltp/update_orc_pipeline_step_results_version.sql new file mode 100644 index 0000000000..83e9765d22 --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/update_orc_pipeline_step_results_version.sql @@ -0,0 +1,9 @@ +-- update approximately 5000 rows or 1 MB in the ocr_pipeline_step_results_version table +-- takes about 40 seconds +UPDATE ocr.ocr_pipeline_step_results_version +SET created_at = now() +WHERE ctid in ( + SELECT ctid + FROM ocr.ocr_pipeline_step_results_version + TABLESAMPLE SYSTEM (0.0005) +); diff --git a/test_runner/performance/large_synthetic_oltp/update_priceline_raw_response.sql b/test_runner/performance/large_synthetic_oltp/update_priceline_raw_response.sql new file mode 100644 index 0000000000..a434c6cb63 --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/update_priceline_raw_response.sql @@ -0,0 +1,9 @@ +-- update approximately 5000 rows or 1 MB in the priceline_raw_response table +-- takes about 1 second +UPDATE booking_inventory.priceline_raw_response +SET created_at = now() +WHERE ctid in ( + SELECT ctid + FROM booking_inventory.priceline_raw_response + TABLESAMPLE SYSTEM (0.0005) +); diff --git a/test_runner/performance/large_synthetic_oltp/update_quickbooks_transactions.sql b/test_runner/performance/large_synthetic_oltp/update_quickbooks_transactions.sql new file mode 100644 index 0000000000..a783246c4c --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/update_quickbooks_transactions.sql @@ -0,0 +1,9 @@ +-- update approximately 5000 rows or 1 MB in the quickbooks_transactions table +-- takes about 30 seconds +UPDATE accounting.quickbooks_transactions +SET created_at = now() +WHERE ctid in ( + SELECT ctid + FROM accounting.quickbooks_transactions + TABLESAMPLE SYSTEM (0.0005) +); diff --git a/test_runner/performance/large_synthetic_oltp/update_raw_finicity_transaction.sql b/test_runner/performance/large_synthetic_oltp/update_raw_finicity_transaction.sql new file mode 100644 index 0000000000..91fb1bc789 --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/update_raw_finicity_transaction.sql @@ -0,0 +1,15 @@ +-- update approximately 6000 rows or 600 kb in the raw_finicity_transaction table +-- takes about 1 second +UPDATE banking.raw_finicity_transaction +SET raw_data = + jsonb_set( + raw_data, + '{updated}', + to_jsonb(now()), + true + ) +WHERE ctid IN ( + SELECT ctid + FROM banking.raw_finicity_transaction + TABLESAMPLE SYSTEM (0.0005) +); diff --git a/test_runner/performance/large_synthetic_oltp/update_relabeled_transactions.sql b/test_runner/performance/large_synthetic_oltp/update_relabeled_transactions.sql new file mode 100644 index 0000000000..87b402f9e7 --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/update_relabeled_transactions.sql @@ -0,0 +1,9 @@ +-- 
update approximately 8000 rows or 1 MB in the relabeled_transactions table +-- takes about 1 second +UPDATE heron.relabeled_transactions +SET created_at = now() +WHERE ctid in ( + SELECT ctid + FROM heron.relabeled_transactions + TABLESAMPLE SYSTEM (0.0005) +); diff --git a/test_runner/performance/large_synthetic_oltp/update_state_values.sql b/test_runner/performance/large_synthetic_oltp/update_state_values.sql new file mode 100644 index 0000000000..2365ea3d6b --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/update_state_values.sql @@ -0,0 +1,9 @@ +-- update approximately 8000 rows or 1 MB in the state_values table +-- takes about 2 minutes +UPDATE workflows.state_values +SET state_type = now()::text +WHERE ctid in ( + SELECT ctid + FROM workflows.state_values + TABLESAMPLE SYSTEM (0.0002) +); \ No newline at end of file diff --git a/test_runner/performance/large_synthetic_oltp/update_stripe_authorization_event_log.sql b/test_runner/performance/large_synthetic_oltp/update_stripe_authorization_event_log.sql new file mode 100644 index 0000000000..5328db9fb8 --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/update_stripe_authorization_event_log.sql @@ -0,0 +1,9 @@ +-- update approximately 4000 rows or 1 MB in the stripe_authorization_event_log table +-- takes about 5 minutes +UPDATE stripe.stripe_authorization_event_log +SET approved = NOT approved +WHERE ctid in ( + SELECT ctid + FROM stripe.stripe_authorization_event_log + TABLESAMPLE SYSTEM (0.0002) +); \ No newline at end of file diff --git a/test_runner/performance/large_synthetic_oltp/update_transaction.sql b/test_runner/performance/large_synthetic_oltp/update_transaction.sql new file mode 100644 index 0000000000..83bec52065 --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/update_transaction.sql @@ -0,0 +1,9 @@ +-- update approximately 2000 rows or 301 MB in the transaction table +-- takes about 90 seconds +UPDATE transaction.transaction +SET is_last = NOT is_last +WHERE ctid in ( + SELECT ctid + FROM transaction.transaction + TABLESAMPLE SYSTEM (0.0002) +); diff --git a/test_runner/performance/large_synthetic_oltp/update_values.sql b/test_runner/performance/large_synthetic_oltp/update_values.sql new file mode 100644 index 0000000000..e5d576dae5 --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/update_values.sql @@ -0,0 +1,9 @@ +-- update approximately 2500 rows or 1 MB in the values table +-- takes about 3 minutes +UPDATE workflows.values +SET bool_value = NOT bool_value +WHERE ctid in ( + SELECT ctid + FROM workflows.values + TABLESAMPLE SYSTEM (0.0002) +) AND bool_value IS NOT NULL; diff --git a/test_runner/performance/large_synthetic_oltp/update_vertices.sql b/test_runner/performance/large_synthetic_oltp/update_vertices.sql new file mode 100644 index 0000000000..714c38965b --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/update_vertices.sql @@ -0,0 +1,9 @@ +-- update approximately 10000 rows or 2 MB in the vertices table +-- takes about 1 minute +UPDATE workflows.vertices +SET has_been_visited = NOT has_been_visited +WHERE ctid in ( + SELECT ctid + FROM workflows.vertices + TABLESAMPLE SYSTEM (0.0002) +); \ No newline at end of file diff --git a/test_runner/performance/test_perf_oltp_large_tenant.py b/test_runner/performance/test_perf_oltp_large_tenant.py index b45394d627..bd00f6b65f 100644 --- a/test_runner/performance/test_perf_oltp_large_tenant.py +++ b/test_runner/performance/test_perf_oltp_large_tenant.py @@ -31,7 +31,9 @@ def get_custom_scripts( 
return rv -def run_test_pgbench(env: PgCompare, custom_scripts: str, duration: int): +def run_test_pgbench( + env: PgCompare, custom_scripts: str, duration: int, clients: int = 500, jobs: int = 100 +): password = env.pg.default_options.get("password", None) options = env.pg.default_options.get("options", "") # drop password from the connection string by passing password=None and set password separately @@ -46,8 +48,8 @@ def run_test_pgbench(env: PgCompare, custom_scripts: str, duration: int): "-n", # no explicit vacuum before the test - we want to rely on auto-vacuum "-M", "prepared", - "--client=500", - "--jobs=100", + f"--client={clients}", + f"--jobs={jobs}", f"-T{duration}", "-P60", # progress every minute "--progress-timestamp", @@ -164,6 +166,12 @@ def test_perf_oltp_large_tenant_pgbench( run_test_pgbench(remote_compare, custom_scripts, duration) +@pytest.mark.parametrize("duration", get_durations_matrix()) +@pytest.mark.remote_cluster +def test_perf_oltp_large_tenant_growth(remote_compare: PgCompare, duration: int): + run_test_pgbench(remote_compare, " ".join(get_custom_scripts()), duration, 35, 35) + + @pytest.mark.remote_cluster def test_perf_oltp_large_tenant_maintenance(remote_compare: PgCompare): # run analyze, vacuum, re-index after the test and measure and report its duration From e6a404c66d18fc966d93e3f3284fb587d29c5e68 Mon Sep 17 00:00:00 2001 From: Aleksandr Sarantsev <99037063+ephemeralsad@users.noreply.github.com> Date: Wed, 18 Jun 2025 17:27:41 +0400 Subject: [PATCH 071/124] Fix flaky test_sharding_split_failures (#12199) ## Problem `test_sharding_failures` is flaky due to interference from the `background_reconcile` process. The details are in the issue https://github.com/neondatabase/neon/issues/12029. ## Summary of changes - Use `reconcile_until_idle` to ensure a stable state before running test assertions - Added error tolerance in `reconcile_until_idle` test function (Failure cases: 1, 3, 19, 20) - Ignore the `Keeping extra secondaries` warning message since it i retryable (Failure case: 2) - Deduplicated code in `assert_rolled_back` and `assert_split_done` - Added a log message before printing plenty of Node `X` seen on pageserver `Y` --- storage_controller/src/service.rs | 23 +++++---- test_runner/fixtures/neon_fixtures.py | 1 + test_runner/regress/test_sharding.py | 69 +++++++++++---------------- 3 files changed, 45 insertions(+), 48 deletions(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 55f91cea09..14c81ccf59 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -8778,15 +8778,22 @@ impl Service { let waiter_count = waiters.len(); match self.await_waiters(waiters, RECONCILE_TIMEOUT).await { Ok(()) => {} - Err(ReconcileWaitError::Failed(_, reconcile_error)) - if matches!(*reconcile_error, ReconcileError::Cancel) => - { - // Ignore reconciler cancel errors: this reconciler might have shut down - // because some other change superceded it. We will return a nonzero number, - // so the caller knows they might have to call again to quiesce the system. - } Err(e) => { - return Err(e); + if let ReconcileWaitError::Failed(_, reconcile_error) = &e { + match **reconcile_error { + ReconcileError::Cancel + | ReconcileError::Remote(mgmt_api::Error::Cancelled) => { + // Ignore reconciler cancel errors: this reconciler might have shut down + // because some other change superceded it. We will return a nonzero number, + // so the caller knows they might have to call again to quiesce the system. 
+ } + _ => { + return Err(e); + } + } + } else { + return Err(e); + } } }; diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 970175a631..236c03e361 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2360,6 +2360,7 @@ class NeonStorageController(MetricsGetter, LogUtils): delay_max = max_interval while n > 0: n = self.reconcile_all() + if n == 0: break elif time.time() - start_at > timeout_secs: diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 522e257ea5..93c621f564 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1337,7 +1337,7 @@ def test_sharding_split_failures( # Create bystander tenants with various shard counts. They should not be affected by the aborted # splits. Regression test for https://github.com/neondatabase/cloud/issues/28589. bystanders = {} # id → shard_count - for bystander_shard_count in [1, 2, 4, 8]: + for bystander_shard_count in [1, 2, 4]: id, _ = env.create_tenant(shard_count=bystander_shard_count) bystanders[id] = bystander_shard_count @@ -1358,6 +1358,8 @@ def test_sharding_split_failures( ".*Reconcile error.*Cancelled.*", # While parent shard's client is stopped during split, flush loop updating LSNs will emit this warning ".*Failed to schedule metadata upload after updating disk_consistent_lsn.*", + # We didn't identify a secondary to remove. + ".*Keeping extra secondaries.*", ] ) @@ -1388,51 +1390,36 @@ def test_sharding_split_failures( with pytest.raises(failure.expect_exception()): env.storage_controller.tenant_shard_split(tenant_id, shard_count=4) + def assert_shard_count(shard_count: int, exclude_ps_id: int | None = None) -> None: + secondary_count = 0 + attached_count = 0 + log.info(f"Iterating over {len(env.pageservers)} pageservers to check shard count") + for ps in env.pageservers: + if exclude_ps_id is not None and ps.id == exclude_ps_id: + continue + + locations = ps.http_client().tenant_list_locations()["tenant_shards"] + for loc in locations: + tenant_shard_id = TenantShardId.parse(loc[0]) + if tenant_shard_id.tenant_id != tenant_id: + continue # skip bystanders + log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}") + assert tenant_shard_id.shard_count == shard_count + if loc[1]["mode"] == "Secondary": + secondary_count += 1 + else: + attached_count += 1 + assert secondary_count == shard_count + assert attached_count == shard_count + # We expect that the overall operation will fail, but some split requests # will have succeeded: the net result should be to return to a clean state, including # detaching any child shards. 
def assert_rolled_back(exclude_ps_id=None) -> None: - secondary_count = 0 - attached_count = 0 - for ps in env.pageservers: - if exclude_ps_id is not None and ps.id == exclude_ps_id: - continue - - locations = ps.http_client().tenant_list_locations()["tenant_shards"] - for loc in locations: - tenant_shard_id = TenantShardId.parse(loc[0]) - if tenant_shard_id.tenant_id != tenant_id: - continue # skip bystanders - log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}") - assert tenant_shard_id.shard_count == initial_shard_count - if loc[1]["mode"] == "Secondary": - secondary_count += 1 - else: - attached_count += 1 - - assert secondary_count == initial_shard_count - assert attached_count == initial_shard_count + assert_shard_count(initial_shard_count, exclude_ps_id) def assert_split_done(exclude_ps_id: int | None = None) -> None: - secondary_count = 0 - attached_count = 0 - for ps in env.pageservers: - if exclude_ps_id is not None and ps.id == exclude_ps_id: - continue - - locations = ps.http_client().tenant_list_locations()["tenant_shards"] - for loc in locations: - tenant_shard_id = TenantShardId.parse(loc[0]) - if tenant_shard_id.tenant_id != tenant_id: - continue # skip bystanders - log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}") - assert tenant_shard_id.shard_count == split_shard_count - if loc[1]["mode"] == "Secondary": - secondary_count += 1 - else: - attached_count += 1 - assert attached_count == split_shard_count - assert secondary_count == split_shard_count + assert_shard_count(split_shard_count, exclude_ps_id) def finish_split(): # Having failed+rolled back, we should be able to split again @@ -1468,6 +1455,7 @@ def test_sharding_split_failures( # The split should appear to be rolled back from the point of view of all pageservers # apart from the one that is offline + env.storage_controller.reconcile_until_idle(timeout_secs=60, max_interval=2) wait_until(lambda: assert_rolled_back(exclude_ps_id=failure.pageserver_id)) finish_split() @@ -1482,6 +1470,7 @@ def test_sharding_split_failures( log.info("Clearing failure...") failure.clear(env) + env.storage_controller.reconcile_until_idle(timeout_secs=60, max_interval=2) wait_until(assert_rolled_back) # Having rolled back, the tenant should be working From d8d62fb7cb99f22d783f0e13a983129698b1603d Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 18 Jun 2025 16:05:13 +0200 Subject: [PATCH 072/124] test_runner: add gRPC support (#12279) ## Problem `test_runner` integration tests should support gRPC. Touches #11926. ## Summary of changes * Enable gRPC for Pageservers, with dynamic port allocations. * Add a `grpc` parameter for endpoint creation, plumbed through to `neon_local endpoint create`. No tests actually use gRPC yet, since computes don't support it yet. 
--- test_runner/fixtures/neon_cli.py | 3 +++ test_runner/fixtures/neon_fixtures.py | 17 ++++++++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index e177145294..1b09e5bdd0 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -497,6 +497,7 @@ class NeonLocalCli(AbstractNeonCli): tenant_id: TenantId, pg_version: PgVersion, endpoint_id: str | None = None, + grpc: bool | None = None, hot_standby: bool = False, lsn: Lsn | None = None, pageserver_id: int | None = None, @@ -521,6 +522,8 @@ class NeonLocalCli(AbstractNeonCli): args.extend(["--external-http-port", str(external_http_port)]) if internal_http_port is not None: args.extend(["--internal-http-port", str(internal_http_port)]) + if grpc: + args.append("--grpc") if endpoint_id is not None: args.append(endpoint_id) if hot_standby: diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 236c03e361..8cf1020adb 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1228,6 +1228,7 @@ class NeonEnv: ): pageserver_port = PageserverPort( pg=self.port_distributor.get_port(), + grpc=self.port_distributor.get_port(), http=self.port_distributor.get_port(), https=self.port_distributor.get_port() if config.use_https_pageserver_api else None, ) @@ -1243,13 +1244,14 @@ class NeonEnv: ps_cfg: dict[str, Any] = { "id": ps_id, "listen_pg_addr": f"localhost:{pageserver_port.pg}", + "listen_grpc_addr": f"localhost:{pageserver_port.grpc}", "listen_http_addr": f"localhost:{pageserver_port.http}", "listen_https_addr": f"localhost:{pageserver_port.https}" if config.use_https_pageserver_api else None, "pg_auth_type": pg_auth_type, - "http_auth_type": http_auth_type, "grpc_auth_type": grpc_auth_type, + "http_auth_type": http_auth_type, "availability_zone": availability_zone, # Disable pageserver disk syncs in tests: when running tests concurrently, this avoids # the pageserver taking a long time to start up due to syncfs flushing other tests' data @@ -1762,6 +1764,7 @@ def neon_env_builder( @dataclass class PageserverPort: pg: int + grpc: int http: int https: int | None = None @@ -4207,6 +4210,7 @@ class Endpoint(PgProtocol, LogUtils): self, branch_name: str, endpoint_id: str | None = None, + grpc: bool | None = None, hot_standby: bool = False, lsn: Lsn | None = None, config_lines: list[str] | None = None, @@ -4231,6 +4235,7 @@ class Endpoint(PgProtocol, LogUtils): endpoint_id=self.endpoint_id, tenant_id=self.tenant_id, lsn=lsn, + grpc=grpc, hot_standby=hot_standby, pg_port=self.pg_port, external_http_port=self.external_http_port, @@ -4530,6 +4535,7 @@ class Endpoint(PgProtocol, LogUtils): self, branch_name: str, endpoint_id: str | None = None, + grpc: bool | None = None, hot_standby: bool = False, lsn: Lsn | None = None, config_lines: list[str] | None = None, @@ -4547,6 +4553,7 @@ class Endpoint(PgProtocol, LogUtils): branch_name=branch_name, endpoint_id=endpoint_id, config_lines=config_lines, + grpc=grpc, hot_standby=hot_standby, lsn=lsn, pageserver_id=pageserver_id, @@ -4634,6 +4641,7 @@ class EndpointFactory: endpoint_id: str | None = None, tenant_id: TenantId | None = None, lsn: Lsn | None = None, + grpc: bool | None = None, hot_standby: bool = False, config_lines: list[str] | None = None, remote_ext_base_url: str | None = None, @@ -4653,6 +4661,7 @@ class EndpointFactory: return ep.create_start( branch_name=branch_name, endpoint_id=endpoint_id, + 
grpc=grpc, hot_standby=hot_standby, config_lines=config_lines, lsn=lsn, @@ -4667,6 +4676,7 @@ class EndpointFactory: endpoint_id: str | None = None, tenant_id: TenantId | None = None, lsn: Lsn | None = None, + grpc: bool | None = None, hot_standby: bool = False, config_lines: list[str] | None = None, pageserver_id: int | None = None, @@ -4689,6 +4699,7 @@ class EndpointFactory: branch_name=branch_name, endpoint_id=endpoint_id, lsn=lsn, + grpc=grpc, hot_standby=hot_standby, config_lines=config_lines, pageserver_id=pageserver_id, @@ -4713,6 +4724,7 @@ class EndpointFactory: self, origin: Endpoint, endpoint_id: str | None = None, + grpc: bool | None = None, config_lines: list[str] | None = None, ) -> Endpoint: branch_name = origin.branch_name @@ -4724,6 +4736,7 @@ class EndpointFactory: endpoint_id=endpoint_id, tenant_id=origin.tenant_id, lsn=None, + grpc=grpc, hot_standby=True, config_lines=config_lines, ) @@ -4732,6 +4745,7 @@ class EndpointFactory: self, origin: Endpoint, endpoint_id: str | None = None, + grpc: bool | None = None, config_lines: list[str] | None = None, ) -> Endpoint: branch_name = origin.branch_name @@ -4743,6 +4757,7 @@ class EndpointFactory: endpoint_id=endpoint_id, tenant_id=origin.tenant_id, lsn=None, + grpc=grpc, hot_standby=True, config_lines=config_lines, ) From 830ef35ed387a6143e9d94165fb43eabc45cda9f Mon Sep 17 00:00:00 2001 From: Elizabeth Murray <52375559+bizwark@users.noreply.github.com> Date: Wed, 18 Jun 2025 08:51:49 -0700 Subject: [PATCH 073/124] Domain client for Pageserver GRPC. (#12111) Add domain client for new communicator GRPC types. --- Cargo.lock | 3 + pageserver/page_api/Cargo.toml | 3 + pageserver/page_api/src/client.rs | 191 ++++++++++++++++++++++++++++++ pageserver/page_api/src/lib.rs | 2 + 4 files changed, 199 insertions(+) create mode 100644 pageserver/page_api/src/client.rs diff --git a/Cargo.lock b/Cargo.lock index 54a4f946d3..e6b8399b5e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4465,11 +4465,14 @@ dependencies = [ name = "pageserver_page_api" version = "0.1.0" dependencies = [ + "anyhow", "bytes", + "futures", "pageserver_api", "postgres_ffi", "prost 0.13.5", "thiserror 1.0.69", + "tokio", "tonic 0.13.1", "tonic-build", "utils", diff --git a/pageserver/page_api/Cargo.toml b/pageserver/page_api/Cargo.toml index e643b5749b..8b13b9e1db 100644 --- a/pageserver/page_api/Cargo.toml +++ b/pageserver/page_api/Cargo.toml @@ -5,11 +5,14 @@ edition.workspace = true license.workspace = true [dependencies] +anyhow.workspace = true bytes.workspace = true +futures.workspace = true pageserver_api.workspace = true postgres_ffi.workspace = true prost.workspace = true thiserror.workspace = true +tokio.workspace = true tonic.workspace = true utils.workspace = true workspace_hack.workspace = true diff --git a/pageserver/page_api/src/client.rs b/pageserver/page_api/src/client.rs new file mode 100644 index 0000000000..057a1d4ad6 --- /dev/null +++ b/pageserver/page_api/src/client.rs @@ -0,0 +1,191 @@ +use std::convert::TryInto; + +use bytes::Bytes; +use futures::TryStreamExt; +use futures::{Stream, StreamExt}; +use tonic::metadata::AsciiMetadataValue; +use tonic::metadata::errors::InvalidMetadataValue; +use tonic::transport::Channel; +use tonic::{Request, Streaming}; + +use utils::id::TenantId; +use utils::id::TimelineId; +use utils::shard::ShardIndex; + +use anyhow::Result; + +use crate::model; +use crate::proto; + +/// +/// AuthInterceptor adds tenant, timeline, and auth header to the channel. These +/// headers are required at the pageserver. 
+/// +#[derive(Clone)] +struct AuthInterceptor { + tenant_id: AsciiMetadataValue, + timeline_id: AsciiMetadataValue, + shard_id: AsciiMetadataValue, + auth_header: Option, // including "Bearer " prefix +} + +impl AuthInterceptor { + fn new( + tenant_id: TenantId, + timeline_id: TimelineId, + auth_token: Option, + shard_id: ShardIndex, + ) -> Result { + let tenant_ascii: AsciiMetadataValue = tenant_id.to_string().try_into()?; + let timeline_ascii: AsciiMetadataValue = timeline_id.to_string().try_into()?; + let shard_ascii: AsciiMetadataValue = shard_id.to_string().try_into()?; + + let auth_header: Option = match auth_token { + Some(token) => Some(format!("Bearer {token}").try_into()?), + None => None, + }; + + Ok(Self { + tenant_id: tenant_ascii, + shard_id: shard_ascii, + timeline_id: timeline_ascii, + auth_header, + }) + } +} + +impl tonic::service::Interceptor for AuthInterceptor { + fn call(&mut self, mut req: tonic::Request<()>) -> Result, tonic::Status> { + req.metadata_mut() + .insert("neon-tenant-id", self.tenant_id.clone()); + req.metadata_mut() + .insert("neon-shard-id", self.shard_id.clone()); + req.metadata_mut() + .insert("neon-timeline-id", self.timeline_id.clone()); + if let Some(auth_header) = &self.auth_header { + req.metadata_mut() + .insert("authorization", auth_header.clone()); + } + Ok(req) + } +} +#[derive(Clone)] +pub struct Client { + client: proto::PageServiceClient< + tonic::service::interceptor::InterceptedService, + >, +} + +impl Client { + pub async fn new + Send + Sync + 'static>( + into_endpoint: T, + tenant_id: TenantId, + timeline_id: TimelineId, + shard_id: ShardIndex, + auth_header: Option, + ) -> anyhow::Result { + let endpoint: tonic::transport::Endpoint = into_endpoint + .try_into() + .map_err(|_e| anyhow::anyhow!("failed to convert endpoint"))?; + let channel = endpoint.connect().await?; + let auth = AuthInterceptor::new(tenant_id, timeline_id, auth_header, shard_id) + .map_err(|e| anyhow::anyhow!(e.to_string()))?; + let client = proto::PageServiceClient::with_interceptor(channel, auth); + + Ok(Self { client }) + } + + /// Returns whether a relation exists. + pub async fn check_rel_exists( + &mut self, + req: model::CheckRelExistsRequest, + ) -> Result { + let proto_req = proto::CheckRelExistsRequest::from(req); + + let response = self.client.check_rel_exists(proto_req).await?; + + let proto_resp = response.into_inner(); + Ok(proto_resp.into()) + } + + /// Fetches a base backup. + pub async fn get_base_backup( + &mut self, + req: model::GetBaseBackupRequest, + ) -> Result>, tonic::Status> { + let proto_req = proto::GetBaseBackupRequest::from(req); + + let response_stream: Streaming = + self.client.get_base_backup(proto_req).await?.into_inner(); + + // TODO: Consider dechunking internally + let domain_stream = response_stream.map(|chunk_res| { + chunk_res.and_then(|proto_chunk| { + proto_chunk.try_into().map_err(|e| { + tonic::Status::internal(format!("Failed to convert response chunk: {}", e)) + }) + }) + }); + + Ok(domain_stream) + } + + /// Returns the total size of a database, as # of bytes. + pub async fn get_db_size( + &mut self, + req: model::GetDbSizeRequest, + ) -> Result { + let proto_req = proto::GetDbSizeRequest::from(req); + + let response = self.client.get_db_size(proto_req).await?; + Ok(response.into_inner().into()) + } + + /// Fetches pages. + /// + /// This is implemented as a bidirectional streaming RPC for performance. 
+ /// Per-request errors are often returned as status_code instead of errors, + /// to avoid tearing down the entire stream via tonic::Status. + pub async fn get_pages( + &mut self, + inbound: ReqSt, + ) -> Result< + impl Stream> + Send + 'static, + tonic::Status, + > + where + ReqSt: Stream + Send + 'static, + { + let outbound_proto = inbound.map(|domain_req| domain_req.into()); + + let req_new = Request::new(outbound_proto); + + let response_stream: Streaming = + self.client.get_pages(req_new).await?.into_inner(); + + let domain_stream = response_stream.map_ok(model::GetPageResponse::from); + + Ok(domain_stream) + } + + /// Returns the size of a relation, as # of blocks. + pub async fn get_rel_size( + &mut self, + req: model::GetRelSizeRequest, + ) -> Result { + let proto_req = proto::GetRelSizeRequest::from(req); + let response = self.client.get_rel_size(proto_req).await?; + let proto_resp = response.into_inner(); + Ok(proto_resp.into()) + } + + /// Fetches an SLRU segment. + pub async fn get_slru_segment( + &mut self, + req: model::GetSlruSegmentRequest, + ) -> Result { + let proto_req = proto::GetSlruSegmentRequest::from(req); + let response = self.client.get_slru_segment(proto_req).await?; + Ok(response.into_inner().try_into()?) + } +} diff --git a/pageserver/page_api/src/lib.rs b/pageserver/page_api/src/lib.rs index f515f27f3e..e78f6ce206 100644 --- a/pageserver/page_api/src/lib.rs +++ b/pageserver/page_api/src/lib.rs @@ -18,6 +18,8 @@ pub mod proto { pub use page_service_server::{PageService, PageServiceServer}; } +mod client; +pub use client::Client; mod model; pub use model::*; From 762905cf8da71df53888ace624f36ad702bd2e40 Mon Sep 17 00:00:00 2001 From: Mikhail Date: Wed, 18 Jun 2025 18:45:20 +0100 Subject: [PATCH 074/124] endpoint storage: parse config with type:LocalFs|AwsS3|AzureContainer (#12282) https://github.com/neondatabase/cloud/issues/27195 --- endpoint_storage/src/main.rs | 6 +++--- libs/remote_storage/src/config.rs | 22 ++++++++++++++++++++++ libs/remote_storage/src/lib.rs | 10 ++++++++++ 3 files changed, 35 insertions(+), 3 deletions(-) diff --git a/endpoint_storage/src/main.rs b/endpoint_storage/src/main.rs index 23b7343ff3..c96cef2083 100644 --- a/endpoint_storage/src/main.rs +++ b/endpoint_storage/src/main.rs @@ -31,13 +31,12 @@ struct Args { } #[derive(serde::Deserialize)] -#[serde(tag = "type")] struct Config { #[serde(default = "listen")] listen: std::net::SocketAddr, pemfile: camino::Utf8PathBuf, #[serde(flatten)] - storage_config: remote_storage::RemoteStorageConfig, + storage_kind: remote_storage::TypedRemoteStorageKind, #[serde(default = "max_upload_file_limit")] max_upload_file_limit: usize, } @@ -70,7 +69,8 @@ async fn main() -> anyhow::Result<()> { let listener = tokio::net::TcpListener::bind(config.listen).await.unwrap(); info!("listening on {}", listener.local_addr().unwrap()); - let storage = remote_storage::GenericRemoteStorage::from_config(&config.storage_config).await?; + let storage = + remote_storage::GenericRemoteStorage::from_storage_kind(config.storage_kind).await?; let cancel = tokio_util::sync::CancellationToken::new(); if !args.no_s3_check_on_startup { app::check_storage_permissions(&storage, cancel.clone()).await?; diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index 52978be5b4..5bc1f678ae 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -87,6 +87,28 @@ pub enum RemoteStorageKind { AzureContainer(AzureConfig), } +#[derive(Deserialize)] +#[serde(tag = "type")] +/// 
Version of RemoteStorageKind which deserializes with type: LocalFs | AwsS3 | AzureContainer +/// Needed for endpoint storage service +pub enum TypedRemoteStorageKind { + LocalFs { local_path: Utf8PathBuf }, + AwsS3(S3Config), + AzureContainer(AzureConfig), +} + +impl From for RemoteStorageKind { + fn from(value: TypedRemoteStorageKind) -> Self { + match value { + TypedRemoteStorageKind::LocalFs { local_path } => { + RemoteStorageKind::LocalFs { local_path } + } + TypedRemoteStorageKind::AwsS3(v) => RemoteStorageKind::AwsS3(v), + TypedRemoteStorageKind::AzureContainer(v) => RemoteStorageKind::AzureContainer(v), + } + } +} + /// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write). #[derive(Clone, PartialEq, Eq, Deserialize, Serialize)] pub struct S3Config { diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 9e445dd72f..ed416b2811 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -31,6 +31,7 @@ use anyhow::Context; pub use azure_core::Etag; use bytes::Bytes; use camino::{Utf8Path, Utf8PathBuf}; +pub use config::TypedRemoteStorageKind; pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel}; use futures::StreamExt; use futures::stream::Stream; @@ -676,6 +677,15 @@ impl GenericRemoteStorage> { } impl GenericRemoteStorage { + pub async fn from_storage_kind(kind: TypedRemoteStorageKind) -> anyhow::Result { + Self::from_config(&RemoteStorageConfig { + storage: kind.into(), + timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, + small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT, + }) + .await + } + pub async fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result { let timeout = storage_config.timeout; From 20f4febce1221e1d619e5f4eef63720ef89d05da Mon Sep 17 00:00:00 2001 From: Suhas Thalanki <54014218+thesuhas@users.noreply.github.com> Date: Wed, 18 Jun 2025 12:31:22 -0700 Subject: [PATCH 075/124] fix: additional changes to terminate pgbouncer on compute suspend (#12153) (#12284) Addressed [retrospective comments ](https://github.com/neondatabase/neon/pull/12153#discussion_r2154197503) to https://github.com/neondatabase/neon/pull/12153 --- compute_tools/src/compute.rs | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index d42e3cc860..7a7f2dfedc 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -2320,8 +2320,6 @@ pub fn forward_termination_signal(dev_mode: bool) { } if !dev_mode { - info!("not in dev mode, terminating pgbouncer"); - // Terminate pgbouncer with SIGKILL match pid_file::read(PGBOUNCER_PIDFILE.into()) { Ok(pid_file::PidFileRead::LockedByOtherProcess(pid)) => { @@ -2353,25 +2351,27 @@ pub fn forward_termination_signal(dev_mode: bool) { error!("error reading pgbouncer pid file: {}", e); } } - } - // Terminate local_proxy - match pid_file::read("/etc/local_proxy/pid".into()) { - Ok(pid_file::PidFileRead::LockedByOtherProcess(pid)) => { - info!("sending SIGTERM to local_proxy process pid: {}", pid); - if let Err(e) = kill(pid, Signal::SIGTERM) { - error!("failed to terminate local_proxy: {}", e); + // Terminate local_proxy + match pid_file::read("/etc/local_proxy/pid".into()) { + Ok(pid_file::PidFileRead::LockedByOtherProcess(pid)) => { + info!("sending SIGTERM to local_proxy process pid: {}", pid); + if let Err(e) = kill(pid, Signal::SIGTERM) { + error!("failed to terminate local_proxy: {}", e); + } + } + 
Ok(pid_file::PidFileRead::NotHeldByAnyProcess(_)) => { + info!("local_proxy PID file exists but process not running"); + } + Ok(pid_file::PidFileRead::NotExist) => { + info!("local_proxy PID file not found, process may not be running"); + } + Err(e) => { + error!("error reading local_proxy PID file: {}", e); } } - Ok(pid_file::PidFileRead::NotHeldByAnyProcess(_)) => { - info!("local_proxy PID file exists but process not running"); - } - Ok(pid_file::PidFileRead::NotExist) => { - info!("local_proxy PID file not found, process may not be running"); - } - Err(e) => { - error!("error reading local_proxy PID file: {}", e); - } + } else { + info!("Skipping pgbouncer and local_proxy termination because in dev mode"); } let pg_pid = PG_PID.load(Ordering::SeqCst); From 3f676df3d519107ee09b54772652be38734608c1 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 19 Jun 2025 12:53:18 +0300 Subject: [PATCH 076/124] pageserver: fix initial layer visibility calculation (#12206) ## Problem GC info is an input to updating layer visibility. Currently, gc info is updated on timeline activation and visibility is computed on tenant attach, so we ignore branch points and compute visibility by taking all layers into account. Side note: gc info is also updated when timelines are created and dropped. That doesn't help because we create the timelines in topological order from the root. Hence the root timeline goes first, without context of where the branch points are. The impact of this in prod is that shards need to rehydrate layers after live migration since the non-visible ones were excluded from the heatmap. ## Summary of Changes Move the visibility calculation into tenant attachment instead of activation. --- pageserver/src/tenant.rs | 30 +++++++++++++++++++++++------- pageserver/src/tenant/timeline.rs | 6 +----- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index cfecf5561c..dc6926f299 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1859,6 +1859,29 @@ impl TenantShard { } } + // At this point we've initialized all timelines and are tracking them. + // Now compute the layer visibility for all (not offloaded) timelines. + let compute_visiblity_for = { + let timelines_accessor = self.timelines.lock().unwrap(); + let mut timelines_offloaded_accessor = self.timelines_offloaded.lock().unwrap(); + + timelines_offloaded_accessor.extend(offloaded_timelines_list.into_iter()); + + // Before activation, populate each Timeline's GcInfo with information about its children + self.initialize_gc_info(&timelines_accessor, &timelines_offloaded_accessor, None); + + timelines_accessor.values().cloned().collect::>() + }; + + for tl in compute_visiblity_for { + tl.update_layer_visibility().await.with_context(|| { + format!( + "failed initial timeline visibility computation {} for tenant {}", + tl.timeline_id, self.tenant_shard_id + ) + })?; + } + // Walk through deleted timelines, resume deletion for (timeline_id, index_part, remote_timeline_client) in timelines_to_resume_deletions { remote_timeline_client @@ -1878,10 +1901,6 @@ impl TenantShard { .context("resume_deletion") .map_err(LoadLocalTimelineError::ResumeDeletion)?; } - { - let mut offloaded_timelines_accessor = self.timelines_offloaded.lock().unwrap(); - offloaded_timelines_accessor.extend(offloaded_timelines_list.into_iter()); - } // Stash the preloaded tenant manifest, and upload a new manifest if changed. 
// @@ -3449,9 +3468,6 @@ impl TenantShard { .values() .filter(|timeline| !(timeline.is_broken() || timeline.is_stopping())); - // Before activation, populate each Timeline's GcInfo with information about its children - self.initialize_gc_info(&timelines_accessor, &timelines_offloaded_accessor, None); - // Spawn gc and compaction loops. The loops will shut themselves // down when they notice that the tenant is inactive. tasks::start_background_loops(self, background_jobs_can_start); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index a1969ecae6..b8bfc4f936 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3422,10 +3422,6 @@ impl Timeline { // TenantShard::create_timeline will wait for these uploads to happen before returning, or // on retry. - // Now that we have the full layer map, we may calculate the visibility of layers within it (a global scan) - drop(guard); // drop write lock, update_layer_visibility will take a read lock. - self.update_layer_visibility().await?; - info!( "loaded layer map with {} layers at {}, total physical size: {}", num_layers, disk_consistent_lsn, total_physical_size @@ -5939,7 +5935,7 @@ impl Drop for Timeline { if let Ok(mut gc_info) = ancestor.gc_info.write() { if !gc_info.remove_child_not_offloaded(self.timeline_id) { tracing::error!(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id, - "Couldn't remove retain_lsn entry from offloaded timeline's parent: already removed"); + "Couldn't remove retain_lsn entry from timeline's parent on drop: already removed"); } } } From 6f4ffdb48b029bb56f02abb5b15551790083eb01 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 19 Jun 2025 11:54:34 +0200 Subject: [PATCH 077/124] pageserver: add gRPC compression (#12280) ## Problem The gRPC page service should support compression. Requires #12111. Touches #11728. Touches https://github.com/neondatabase/cloud/issues/25679. ## Summary of changes Add support for gzip and zstd compression in the server, and a client parameter to enable compression. This will need further benchmarking under realistic network conditions. 
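For illustration only (not part of this patch): a minimal sketch of how a caller could opt in through the new `compression` parameter on the page_api client. The endpoint address, the `connect_compressed` helper, and the `ShardIndex::unsharded()` call are assumptions for the sketch; only the `CompressionEncoding::Gzip`/`Zstd` variants are confirmed by the change itself, and passing `None` keeps compression disabled.

```rust
// Hedged sketch: construct a page_api gRPC client with zstd compression enabled.
// Endpoint URL and helper name are placeholders, not taken from this patch.
use pageserver_page_api as page_api;
use tonic::codec::CompressionEncoding;
use utils::id::{TenantId, TimelineId};
use utils::shard::ShardIndex;

async fn connect_compressed(
    endpoint: String, // e.g. "http://pageserver:51051" (placeholder address)
    tenant_id: TenantId,
    timeline_id: TimelineId,
) -> anyhow::Result<page_api::Client> {
    page_api::Client::new(
        endpoint,
        tenant_id,
        timeline_id,
        ShardIndex::unsharded(),         // assumed constructor for an unsharded tenant
        None,                            // no auth token
        Some(CompressionEncoding::Zstd), // or Gzip; None leaves compression off
    )
    .await
}
```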
--- Cargo.lock | 2 ++ Cargo.toml | 2 +- pageserver/page_api/src/client.rs | 11 ++++++++++- pageserver/src/page_service.rs | 10 ++++++++-- 4 files changed, 21 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e6b8399b5e..b996f9d384 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7561,6 +7561,7 @@ dependencies = [ "axum", "base64 0.22.1", "bytes", + "flate2", "h2 0.4.4", "http 1.1.0", "http-body 1.0.0", @@ -7580,6 +7581,7 @@ dependencies = [ "tower-layer", "tower-service", "tracing", + "zstd", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 2f4fcbc249..076d8d0b60 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -199,7 +199,7 @@ tokio-tar = "0.3" tokio-util = { version = "0.7.10", features = ["io", "rt"] } toml = "0.8" toml_edit = "0.22" -tonic = { version = "0.13.1", default-features = false, features = ["channel", "codegen", "prost", "router", "server", "tls-ring", "tls-native-roots"] } +tonic = { version = "0.13.1", default-features = false, features = ["channel", "codegen", "gzip", "prost", "router", "server", "tls-ring", "tls-native-roots", "zstd"] } tonic-reflection = { version = "0.13.1", features = ["server"] } tower = { version = "0.5.2", default-features = false } tower-http = { version = "0.6.2", features = ["auth", "request-id", "trace"] } diff --git a/pageserver/page_api/src/client.rs b/pageserver/page_api/src/client.rs index 057a1d4ad6..274f036f3d 100644 --- a/pageserver/page_api/src/client.rs +++ b/pageserver/page_api/src/client.rs @@ -83,6 +83,7 @@ impl Client { timeline_id: TimelineId, shard_id: ShardIndex, auth_header: Option, + compression: Option, ) -> anyhow::Result { let endpoint: tonic::transport::Endpoint = into_endpoint .try_into() @@ -90,7 +91,15 @@ impl Client { let channel = endpoint.connect().await?; let auth = AuthInterceptor::new(tenant_id, timeline_id, auth_header, shard_id) .map_err(|e| anyhow::anyhow!(e.to_string()))?; - let client = proto::PageServiceClient::with_interceptor(channel, auth); + let mut client = proto::PageServiceClient::with_interceptor(channel, auth); + + if let Some(compression) = compression { + // TODO: benchmark this (including network latency). + // TODO: consider enabling compression by default. + client = client + .accept_compressed(compression) + .send_compressed(compression); + } Ok(Self { client }) } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 57087dc6c3..800b47f235 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -3286,7 +3286,14 @@ impl GrpcPageServiceHandler { Ok(req) })) // Run the page service. - .service(proto::PageServiceServer::new(page_service_handler)); + .service( + proto::PageServiceServer::new(page_service_handler) + // Support both gzip and zstd compression. The client decides what to use. + .accept_compressed(tonic::codec::CompressionEncoding::Gzip) + .accept_compressed(tonic::codec::CompressionEncoding::Zstd) + .send_compressed(tonic::codec::CompressionEncoding::Gzip) + .send_compressed(tonic::codec::CompressionEncoding::Zstd), + ); let server = server.add_service(page_service); // Reflection service for use with e.g. grpcurl. @@ -3532,7 +3539,6 @@ impl proto::PageService for GrpcPageServiceHandler { Ok(tonic::Response::new(resp.into())) } - // TODO: ensure clients use gzip compression for the stream. 
#[instrument(skip_all, fields(lsn))] async fn get_base_backup( &self, From fa954671b2876301188fc6249a90f81e384ec6d3 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 19 Jun 2025 13:00:01 +0300 Subject: [PATCH 078/124] Remove unnecessary Postgres libs from the storage docker image (#12286) Since commit 87ad50c925, storage_controller has used diesel_async, which in turn uses tokio-postgres as the Postgres client, which doesn't require libpq. Thus we no longer need libpq in the storage image. --- Dockerfile | 8 -------- 1 file changed, 8 deletions(-) diff --git a/Dockerfile b/Dockerfile index 0b7ef491fd..f72d7d9bbc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,8 +5,6 @@ ARG REPOSITORY=ghcr.io/neondatabase ARG IMAGE=build-tools ARG TAG=pinned -ARG DEFAULT_PG_VERSION=17 -ARG STABLE_PG_VERSION=16 ARG DEBIAN_VERSION=bookworm ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim @@ -63,14 +61,11 @@ FROM $REPOSITORY/$IMAGE:$TAG AS build WORKDIR /home/nonroot ARG GIT_VERSION=local ARG BUILD_TAG -ARG STABLE_PG_VERSION COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v17/include/postgresql/server pg_install/v17/include/postgresql/server -COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib -COPY --from=pg-build /home/nonroot/pg_install/v17/lib pg_install/v17/lib COPY --from=plan /home/nonroot/recipe.json recipe.json ARG ADDITIONAL_RUSTFLAGS="" @@ -97,7 +92,6 @@ RUN set -e \ # Build final image # FROM $BASE_IMAGE_SHA -ARG DEFAULT_PG_VERSION WORKDIR /data RUN set -e \ @@ -107,8 +101,6 @@ RUN set -e \ libreadline-dev \ libseccomp-dev \ ca-certificates \ - # System postgres for use with client libraries (e.g. in storage controller) - postgresql-15 \ openssl \ unzip \ curl \ From 2ca6665f4add906114abed2ffa8db36565277414 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 19 Jun 2025 13:24:09 +0300 Subject: [PATCH 079/124] Remove outdated 'clean' Makefile targest (#12288) We have been bad at keeping them up-to-date, several contrib modules and neon extensions were missing from the clean rules. Give up trying, and remove the targets altogether. In practice, it's straightforward to just do `rm -rf pg_install/build`, so the clean-targets are hardly worth the maintenance effort. I kept `make distclean` though. The rule for that is simple enough. 
--- Makefile | 48 ------------------------------------------------ 1 file changed, 48 deletions(-) diff --git a/Makefile b/Makefile index 0911465fb8..5130e17e59 100644 --- a/Makefile +++ b/Makefile @@ -167,13 +167,6 @@ postgres-%: postgres-configure-% \ +@echo "Compiling test_decoding $*" $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/test_decoding install -.PHONY: postgres-clean-% -postgres-clean-%: - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$* MAKELEVEL=0 clean - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache clean - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect clean - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/interfaces/libpq clean - .PHONY: postgres-check-% postgres-check-%: postgres-% $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$* MAKELEVEL=0 check @@ -206,21 +199,6 @@ neon-pg-ext-%: postgres-% -C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \ -f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install -.PHONY: neon-pg-clean-ext-% -neon-pg-clean-ext-%: - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \ - -C $(POSTGRES_INSTALL_DIR)/build/neon-$* \ - -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \ - -C $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* \ - -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile clean - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \ - -C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \ - -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile clean - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \ - -C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \ - -f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile clean - # Build walproposer as a static library. walproposer source code is located # in the pgxn/neon directory. # @@ -253,12 +231,6 @@ ifeq ($(UNAME_S),Linux) pg_crc32c.o endif -.PHONY: walproposer-lib-clean -walproposer-lib-clean: - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v17/bin/pg_config \ - -C $(POSTGRES_INSTALL_DIR)/build/walproposer-lib \ - -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean - .PHONY: neon-pg-ext neon-pg-ext: \ neon-pg-ext-v14 \ @@ -266,13 +238,6 @@ neon-pg-ext: \ neon-pg-ext-v16 \ neon-pg-ext-v17 -.PHONY: neon-pg-clean-ext -neon-pg-clean-ext: \ - neon-pg-clean-ext-v14 \ - neon-pg-clean-ext-v15 \ - neon-pg-clean-ext-v16 \ - neon-pg-clean-ext-v17 - # shorthand to build all Postgres versions .PHONY: postgres postgres: \ @@ -288,13 +253,6 @@ postgres-headers: \ postgres-headers-v16 \ postgres-headers-v17 -.PHONY: postgres-clean -postgres-clean: \ - postgres-clean-v14 \ - postgres-clean-v15 \ - postgres-clean-v16 \ - postgres-clean-v17 - .PHONY: postgres-check postgres-check: \ postgres-check-v14 \ @@ -302,12 +260,6 @@ postgres-check: \ postgres-check-v16 \ postgres-check-v17 -# This doesn't remove the effects of 'configure'. -.PHONY: clean -clean: postgres-clean neon-pg-clean-ext - $(MAKE) -C compute clean - $(CARGO_CMD_PREFIX) cargo clean - # This removes everything .PHONY: distclean distclean: From 1950ccfe33d895541217cc9784d12a941ecd571a Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 19 Jun 2025 13:31:27 +0300 Subject: [PATCH 080/124] Eliminate dependency from pageserver_api to postgres_ffi (#12273) Introduce a separate `postgres_ffi_types` crate which contains a few types and functions that were used in the API. `postgres_ffi_types` is a much small crate than `postgres_ffi`, and it doesn't depend on bindgen or the Postgres C headers. Move NeonWalRecord and Value types to wal_decoder crate. 
They are only used in the pageserver-safekeeper "ingest" API. The rest of the ingest API types are defined in wal_decoder, so move these there as well. --- Cargo.lock | 13 ++++- Cargo.toml | 2 + libs/pageserver_api/Cargo.toml | 2 +- libs/pageserver_api/src/key.rs | 6 +- libs/pageserver_api/src/keyspace.rs | 11 ++-- libs/pageserver_api/src/lib.rs | 2 - libs/pageserver_api/src/pagestream_api.rs | 14 +++-- libs/pageserver_api/src/reltag.rs | 6 +- libs/pageserver_api/src/shard.rs | 2 +- libs/postgres_ffi/Cargo.toml | 1 + libs/postgres_ffi/src/pg_constants.rs | 6 +- libs/postgres_ffi/src/relfile_utils.rs | 55 +++---------------- libs/postgres_ffi_types/Cargo.toml | 11 ++++ libs/postgres_ffi_types/src/constants.rs | 8 +++ libs/postgres_ffi_types/src/forknum.rs | 36 ++++++++++++ libs/postgres_ffi_types/src/lib.rs | 13 +++++ libs/wal_decoder/Cargo.toml | 1 + libs/wal_decoder/src/decoder.rs | 2 +- libs/wal_decoder/src/models.rs | 3 + .../src => wal_decoder/src/models}/record.rs | 0 .../src => wal_decoder/src/models}/value.rs | 2 +- libs/wal_decoder/src/serialized_batch.rs | 6 +- pageserver/Cargo.toml | 1 + pageserver/benches/bench_ingest.rs | 2 +- pageserver/benches/bench_walredo.rs | 2 +- pageserver/src/basebackup.rs | 8 +-- pageserver/src/import_datadir.rs | 4 +- pageserver/src/page_service.rs | 2 +- pageserver/src/pgdatadir_mapping.rs | 11 ++-- pageserver/src/tenant.rs | 10 ++-- pageserver/src/tenant/storage_layer.rs | 4 +- .../storage_layer/batch_split_writer.rs | 2 +- .../src/tenant/storage_layer/delta_layer.rs | 9 ++- .../tenant/storage_layer/filter_iterator.rs | 3 +- .../src/tenant/storage_layer/image_layer.rs | 4 +- .../src/tenant/storage_layer/layer/tests.rs | 2 +- .../tenant/storage_layer/merge_iterator.rs | 9 +-- pageserver/src/tenant/timeline.rs | 12 ++-- pageserver/src/tenant/timeline/compaction.rs | 4 +- .../src/tenant/timeline/import_pgdata/flow.rs | 6 +- pageserver/src/walingest.rs | 4 +- pageserver/src/walredo.rs | 4 +- pageserver/src/walredo/apply_neon.rs | 4 +- pageserver/src/walredo/process.rs | 2 +- 44 files changed, 183 insertions(+), 128 deletions(-) create mode 100644 libs/postgres_ffi_types/Cargo.toml create mode 100644 libs/postgres_ffi_types/src/constants.rs create mode 100644 libs/postgres_ffi_types/src/forknum.rs create mode 100644 libs/postgres_ffi_types/src/lib.rs rename libs/{pageserver_api/src => wal_decoder/src/models}/record.rs (100%) rename libs/{pageserver_api/src => wal_decoder/src/models}/value.rs (99%) diff --git a/Cargo.lock b/Cargo.lock index b996f9d384..5ab26b02fa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4334,6 +4334,7 @@ dependencies = [ "postgres_backend", "postgres_connection", "postgres_ffi", + "postgres_ffi_types", "postgres_initdb", "posthog_client_lite", "pprof", @@ -4403,7 +4404,7 @@ dependencies = [ "nix 0.30.1", "once_cell", "postgres_backend", - "postgres_ffi", + "postgres_ffi_types", "rand 0.8.5", "remote_storage", "reqwest", @@ -4892,6 +4893,7 @@ dependencies = [ "memoffset 0.9.0", "once_cell", "postgres", + "postgres_ffi_types", "pprof", "regex", "serde", @@ -4900,6 +4902,14 @@ dependencies = [ "utils", ] +[[package]] +name = "postgres_ffi_types" +version = "0.1.0" +dependencies = [ + "thiserror 1.0.69", + "workspace_hack", +] + [[package]] name = "postgres_initdb" version = "0.1.0" @@ -8161,6 +8171,7 @@ dependencies = [ "futures", "pageserver_api", "postgres_ffi", + "postgres_ffi_types", "pprof", "prost 0.13.5", "remote_storage", diff --git a/Cargo.toml b/Cargo.toml index 076d8d0b60..2a6acc132e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ 
-22,6 +22,7 @@ members = [ "libs/http-utils", "libs/pageserver_api", "libs/postgres_ffi", + "libs/postgres_ffi_types", "libs/safekeeper_api", "libs/desim", "libs/neon-shmem", @@ -259,6 +260,7 @@ pageserver_page_api = { path = "./pageserver/page_api" } postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" } postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" } postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" } +postgres_ffi_types = { version = "0.1", path = "./libs/postgres_ffi_types/" } postgres_initdb = { path = "./libs/postgres_initdb" } posthog_client_lite = { version = "0.1", path = "./libs/posthog_client_lite" } pq_proto = { version = "0.1", path = "./libs/pq_proto/" } diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 25f29b8ecd..5a9a74b93d 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -17,7 +17,7 @@ anyhow.workspace = true bytes.workspace = true byteorder.workspace = true utils.workspace = true -postgres_ffi.workspace = true +postgres_ffi_types.workspace = true enum-map.workspace = true strum.workspace = true strum_macros.workspace = true diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index c14975167b..102bbee879 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -4,8 +4,8 @@ use std::ops::Range; use anyhow::{Result, bail}; use byteorder::{BE, ByteOrder}; use bytes::Bytes; -use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; -use postgres_ffi::{Oid, RepOriginId}; +use postgres_ffi_types::forknum::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; +use postgres_ffi_types::{Oid, RepOriginId}; use serde::{Deserialize, Serialize}; use utils::const_assert; @@ -194,7 +194,7 @@ impl Key { /// will be rejected on the write path. #[allow(dead_code)] pub fn is_valid_key_on_write_path_strong(&self) -> bool { - use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; + use postgres_ffi_types::constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; if !self.is_i128_representable() { return false; } diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index 79e3ef553b..1b48d3c462 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -1,7 +1,6 @@ use std::ops::Range; use itertools::Itertools; -use postgres_ffi::BLCKSZ; use crate::key::Key; use crate::shard::{ShardCount, ShardIdentity}; @@ -269,9 +268,13 @@ impl KeySpace { /// Partition a key space into roughly chunks of roughly 'target_size' bytes /// in each partition. /// - pub fn partition(&self, shard_identity: &ShardIdentity, target_size: u64) -> KeyPartitioning { - // Assume that each value is 8k in size. 
- let target_nblocks = (target_size / BLCKSZ as u64) as u32; + pub fn partition( + &self, + shard_identity: &ShardIdentity, + target_size: u64, + block_size: u64, + ) -> KeyPartitioning { + let target_nblocks = (target_size / block_size) as u32; let mut parts = Vec::new(); let mut current_part = Vec::new(); diff --git a/libs/pageserver_api/src/lib.rs b/libs/pageserver_api/src/lib.rs index 6c91d61508..52aed7a2c2 100644 --- a/libs/pageserver_api/src/lib.rs +++ b/libs/pageserver_api/src/lib.rs @@ -6,11 +6,9 @@ pub mod key; pub mod keyspace; pub mod models; pub mod pagestream_api; -pub mod record; pub mod reltag; pub mod shard; /// Public API types pub mod upcall_api; -pub mod value; pub mod config; diff --git a/libs/pageserver_api/src/pagestream_api.rs b/libs/pageserver_api/src/pagestream_api.rs index fba64c82d9..862da8268a 100644 --- a/libs/pageserver_api/src/pagestream_api.rs +++ b/libs/pageserver_api/src/pagestream_api.rs @@ -8,9 +8,15 @@ use crate::reltag::RelTag; use byteorder::{BigEndian, ReadBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; -use postgres_ffi::BLCKSZ; use utils::lsn::Lsn; +/// Block size. +/// +/// XXX: We assume 8k block size in the SLRU fetch API. It's not great to hardcode +/// that in the protocol, because Postgres supports different block sizes as a compile +/// time option. +const BLCKSZ: usize = 8192; + // Wrapped in libpq CopyData #[derive(PartialEq, Eq, Debug)] pub enum PagestreamFeMessage { @@ -443,7 +449,7 @@ impl PagestreamBeMessage { Self::GetSlruSegment(resp) => { bytes.put_u8(Tag::GetSlruSegment as u8); - bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32); + bytes.put_u32((resp.segment.len() / BLCKSZ) as u32); bytes.put(&resp.segment[..]); } @@ -520,7 +526,7 @@ impl PagestreamBeMessage { bytes.put_u64(resp.req.hdr.not_modified_since.0); bytes.put_u8(resp.req.kind); bytes.put_u32(resp.req.segno); - bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32); + bytes.put_u32((resp.segment.len() / BLCKSZ) as u32); bytes.put(&resp.segment[..]); } @@ -662,7 +668,7 @@ impl PagestreamBeMessage { let kind = buf.read_u8()?; let segno = buf.read_u32::<BigEndian>()?; let n_blocks = buf.read_u32::<BigEndian>()?; - let mut segment = vec![0; n_blocks as usize * BLCKSZ as usize]; + let mut segment = vec![0; n_blocks as usize * BLCKSZ]; buf.read_exact(&mut segment)?; Self::GetSlruSegment(PagestreamGetSlruSegmentResponse { req: PagestreamGetSlruSegmentRequest { diff --git a/libs/pageserver_api/src/reltag.rs b/libs/pageserver_api/src/reltag.rs index e0dd4fdfe8..d0e37dffae 100644 --- a/libs/pageserver_api/src/reltag.rs +++ b/libs/pageserver_api/src/reltag.rs @@ -1,9 +1,9 @@ use std::cmp::Ordering; use std::fmt; -use postgres_ffi::Oid; -use postgres_ffi::pg_constants::GLOBALTABLESPACE_OID; -use postgres_ffi::relfile_utils::{MAIN_FORKNUM, forkname_to_number, forknumber_to_name}; +use postgres_ffi_types::Oid; +use postgres_ffi_types::constants::GLOBALTABLESPACE_OID; +use postgres_ffi_types::forknum::{MAIN_FORKNUM, forkname_to_number, forknumber_to_name}; use serde::{Deserialize, Serialize}; /// diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index feb59f5070..9c16be93e8 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -35,7 +35,7 @@ use std::hash::{Hash, Hasher}; #[doc(inline)] pub use ::utils::shard::*; -use postgres_ffi::relfile_utils::INIT_FORKNUM; +use postgres_ffi_types::forknum::INIT_FORKNUM; use serde::{Deserialize, Serialize}; use crate::key::Key; diff --git a/libs/postgres_ffi/Cargo.toml
b/libs/postgres_ffi/Cargo.toml index b7a376841d..67adfdd3c3 100644 --- a/libs/postgres_ffi/Cargo.toml +++ b/libs/postgres_ffi/Cargo.toml @@ -16,6 +16,7 @@ memoffset.workspace = true pprof.workspace = true thiserror.workspace = true serde.workspace = true +postgres_ffi_types.workspace = true utils.workspace = true tracing.workspace = true diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index b0bdd8a8da..f61b9a71c2 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -11,11 +11,7 @@ use crate::{BLCKSZ, PageHeaderData}; -// -// From pg_tablespace_d.h -// -pub const DEFAULTTABLESPACE_OID: u32 = 1663; -pub const GLOBALTABLESPACE_OID: u32 = 1664; +// Note: There are a few more widely-used constants in the postgres_ffi_types::constants module. // From storage_xlog.h pub const XLOG_SMGR_CREATE: u8 = 0x10; diff --git a/libs/postgres_ffi/src/relfile_utils.rs b/libs/postgres_ffi/src/relfile_utils.rs index aa0e625b47..38f94b7221 100644 --- a/libs/postgres_ffi/src/relfile_utils.rs +++ b/libs/postgres_ffi/src/relfile_utils.rs @@ -4,50 +4,7 @@ use once_cell::sync::OnceCell; use regex::Regex; -// -// Fork numbers, from relpath.h -// -pub const MAIN_FORKNUM: u8 = 0; -pub const FSM_FORKNUM: u8 = 1; -pub const VISIBILITYMAP_FORKNUM: u8 = 2; -pub const INIT_FORKNUM: u8 = 3; - -#[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)] -pub enum FilePathError { - #[error("invalid relation fork name")] - InvalidForkName, - #[error("invalid relation data file name")] - InvalidFileName, -} - -impl From<core::num::ParseIntError> for FilePathError { - fn from(_e: core::num::ParseIntError) -> Self { - FilePathError::InvalidFileName - } -} - -/// Convert Postgres relation file's fork suffix to fork number. -pub fn forkname_to_number(forkname: Option<&str>) -> Result<u8, FilePathError> { - match forkname { - // "main" is not in filenames, it's implicit if the fork name is not present - None => Ok(MAIN_FORKNUM), - Some("fsm") => Ok(FSM_FORKNUM), - Some("vm") => Ok(VISIBILITYMAP_FORKNUM), - Some("init") => Ok(INIT_FORKNUM), - Some(_) => Err(FilePathError::InvalidForkName), - } -} - -/// Convert Postgres fork number to the right suffix of the relation data file. -pub fn forknumber_to_name(forknum: u8) -> Option<&'static str> { - match forknum { - MAIN_FORKNUM => None, - FSM_FORKNUM => Some("fsm"), - VISIBILITYMAP_FORKNUM => Some("vm"), - INIT_FORKNUM => Some("init"), - _ => Some("UNKNOWN FORKNUM"), - } -} +use postgres_ffi_types::forknum::*; /// Parse a filename of a relation file. Returns (relfilenode, forknum, segno) tuple. /// @@ -75,7 +32,9 @@ pub fn parse_relfilename(fname: &str) -> Result<(u32, u8, u32), FilePathError> { .ok_or(FilePathError::InvalidFileName)?; let relnode_str = caps.name("relnode").unwrap().as_str(); - let relnode = relnode_str.parse::<u32>()?; + let relnode = relnode_str + .parse::<u32>() + .map_err(|_e| FilePathError::InvalidFileName)?; let forkname = caps.name("forkname").map(|f| f.as_str()); let forknum = forkname_to_number(forkname)?; @@ -84,7 +43,11 @@ pub fn parse_relfilename(fname: &str) -> Result<(u32, u8, u32), FilePathError> { let segno = if segno_match.is_none() { 0 } else { - segno_match.unwrap().as_str().parse::<u32>()? + segno_match + .unwrap() + .as_str() + .parse::<u32>() + .map_err(|_e| FilePathError::InvalidFileName)?
}; Ok((relnode, forknum, segno)) diff --git a/libs/postgres_ffi_types/Cargo.toml b/libs/postgres_ffi_types/Cargo.toml new file mode 100644 index 0000000000..50c6fc7874 --- /dev/null +++ b/libs/postgres_ffi_types/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "postgres_ffi_types" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +thiserror.workspace = true +workspace_hack = { version = "0.1", path = "../../workspace_hack" } + +[dev-dependencies] diff --git a/libs/postgres_ffi_types/src/constants.rs b/libs/postgres_ffi_types/src/constants.rs new file mode 100644 index 0000000000..c1a004c5ab --- /dev/null +++ b/libs/postgres_ffi_types/src/constants.rs @@ -0,0 +1,8 @@ +//! Misc constants, copied from PostgreSQL headers. +//! +//! Any constants included here must be the same in all PostgreSQL versions and unlikely to change +//! in the future either! + +// From pg_tablespace_d.h +pub const DEFAULTTABLESPACE_OID: u32 = 1663; +pub const GLOBALTABLESPACE_OID: u32 = 1664; diff --git a/libs/postgres_ffi_types/src/forknum.rs b/libs/postgres_ffi_types/src/forknum.rs new file mode 100644 index 0000000000..9b225d8ce5 --- /dev/null +++ b/libs/postgres_ffi_types/src/forknum.rs @@ -0,0 +1,36 @@ +// Fork numbers, from relpath.h +pub const MAIN_FORKNUM: u8 = 0; +pub const FSM_FORKNUM: u8 = 1; +pub const VISIBILITYMAP_FORKNUM: u8 = 2; +pub const INIT_FORKNUM: u8 = 3; + +#[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)] +pub enum FilePathError { + #[error("invalid relation fork name")] + InvalidForkName, + #[error("invalid relation data file name")] + InvalidFileName, +} + +/// Convert Postgres relation file's fork suffix to fork number. +pub fn forkname_to_number(forkname: Option<&str>) -> Result<u8, FilePathError> { + match forkname { + // "main" is not in filenames, it's implicit if the fork name is not present + None => Ok(MAIN_FORKNUM), + Some("fsm") => Ok(FSM_FORKNUM), + Some("vm") => Ok(VISIBILITYMAP_FORKNUM), + Some("init") => Ok(INIT_FORKNUM), + Some(_) => Err(FilePathError::InvalidForkName), + } +} + +/// Convert Postgres fork number to the right suffix of the relation data file. +pub fn forknumber_to_name(forknum: u8) -> Option<&'static str> { + match forknum { + MAIN_FORKNUM => None, + FSM_FORKNUM => Some("fsm"), + VISIBILITYMAP_FORKNUM => Some("vm"), + INIT_FORKNUM => Some("init"), + _ => Some("UNKNOWN FORKNUM"), + } +} diff --git a/libs/postgres_ffi_types/src/lib.rs b/libs/postgres_ffi_types/src/lib.rs new file mode 100644 index 0000000000..84ef499b9f --- /dev/null +++ b/libs/postgres_ffi_types/src/lib.rs @@ -0,0 +1,13 @@ +//! This package contains some PostgreSQL constants and datatypes that are the same in all versions +//! of PostgreSQL and unlikely to change in the future either. These could be derived from the +//! PostgreSQL headers with 'bindgen', but in order to avoid proliferating the dependency to bindgen +//! and the PostgreSQL C headers to all services, we prefer to have this small stand-alone crate for +//! them instead. +//! +//! Be mindful in what you add here, as these types are deeply ingrained in the APIs.
+ +pub mod constants; +pub mod forknum; + +pub type Oid = u32; +pub type RepOriginId = u16; diff --git a/libs/wal_decoder/Cargo.toml b/libs/wal_decoder/Cargo.toml index cb0ef4b00d..600ef091f5 100644 --- a/libs/wal_decoder/Cargo.toml +++ b/libs/wal_decoder/Cargo.toml @@ -14,6 +14,7 @@ bytes.workspace = true pageserver_api.workspace = true prost.workspace = true postgres_ffi.workspace = true +postgres_ffi_types.workspace = true serde.workspace = true thiserror.workspace = true tokio = { workspace = true, features = ["io-util"] } diff --git a/libs/wal_decoder/src/decoder.rs b/libs/wal_decoder/src/decoder.rs index cb0835e894..9980a1f369 100644 --- a/libs/wal_decoder/src/decoder.rs +++ b/libs/wal_decoder/src/decoder.rs @@ -8,8 +8,8 @@ use pageserver_api::key::rel_block_to_key; use pageserver_api::reltag::{RelTag, SlruKind}; use pageserver_api::shard::ShardIdentity; use postgres_ffi::pg_constants; -use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM; use postgres_ffi::walrecord::*; +use postgres_ffi_types::forknum::VISIBILITYMAP_FORKNUM; use utils::lsn::Lsn; use crate::models::*; diff --git a/libs/wal_decoder/src/models.rs b/libs/wal_decoder/src/models.rs index 7e1934c6c3..94a00c0e53 100644 --- a/libs/wal_decoder/src/models.rs +++ b/libs/wal_decoder/src/models.rs @@ -25,6 +25,9 @@ //! | //! |--> write to KV store within the pageserver +pub mod record; +pub mod value; + use bytes::Bytes; use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::walrecord::{ diff --git a/libs/pageserver_api/src/record.rs b/libs/wal_decoder/src/models/record.rs similarity index 100% rename from libs/pageserver_api/src/record.rs rename to libs/wal_decoder/src/models/record.rs diff --git a/libs/pageserver_api/src/value.rs b/libs/wal_decoder/src/models/value.rs similarity index 99% rename from libs/pageserver_api/src/value.rs rename to libs/wal_decoder/src/models/value.rs index e9000939c3..3b4f896a45 100644 --- a/libs/pageserver_api/src/value.rs +++ b/libs/wal_decoder/src/models/value.rs @@ -10,7 +10,7 @@ use bytes::Bytes; use serde::{Deserialize, Serialize}; -use crate::record::NeonWalRecord; +use crate::models::record::NeonWalRecord; #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub enum Value { diff --git a/libs/wal_decoder/src/serialized_batch.rs b/libs/wal_decoder/src/serialized_batch.rs index b451d6d8e0..4123f7d0ac 100644 --- a/libs/wal_decoder/src/serialized_batch.rs +++ b/libs/wal_decoder/src/serialized_batch.rs @@ -1,4 +1,4 @@ -//! This module implements batch type for serialized [`pageserver_api::value::Value`] +//! This module implements batch type for serialized [`crate::models::value::Value`] //! instances. Each batch contains a raw buffer (serialized values) //! and a list of metadata for each (key, LSN) tuple present in the batch. //! 
@@ -10,10 +10,8 @@ use std::collections::{BTreeSet, HashMap}; use bytes::{Bytes, BytesMut}; use pageserver_api::key::{CompactKey, Key, rel_block_to_key}; use pageserver_api::keyspace::KeySpace; -use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::RelTag; use pageserver_api::shard::ShardIdentity; -use pageserver_api::value::Value; use postgres_ffi::walrecord::{DecodedBkpBlock, DecodedWALRecord}; use postgres_ffi::{BLCKSZ, page_is_new, page_set_lsn, pg_constants}; use serde::{Deserialize, Serialize}; @@ -21,6 +19,8 @@ use utils::bin_ser::BeSer; use utils::lsn::Lsn; use crate::models::InterpretedWalRecord; +use crate::models::record::NeonWalRecord; +use crate::models::value::Value; static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 9591c729e8..606ba9ad8c 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -56,6 +56,7 @@ pin-project-lite.workspace = true postgres_backend.workspace = true postgres_connection.workspace = true postgres_ffi.workspace = true +postgres_ffi_types.workspace = true postgres_initdb.workspace = true postgres-protocol.workspace = true postgres-types.workspace = true diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index eaadfe14ae..681d135e09 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -13,11 +13,11 @@ use pageserver::{page_cache, virtual_file}; use pageserver_api::key::Key; use pageserver_api::models::virtual_file::IoMode; use pageserver_api::shard::TenantShardId; -use pageserver_api::value::Value; use strum::IntoEnumIterator; use tokio_util::sync::CancellationToken; use utils::bin_ser::BeSer; use utils::id::{TenantId, TimelineId}; +use wal_decoder::models::value::Value; use wal_decoder::serialized_batch::SerializedValueBatch; // A very cheap hash for generating non-sequential keys. diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index 215682d90c..36d0d9c974 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -67,12 +67,12 @@ use once_cell::sync::Lazy; use pageserver::config::PageServerConf; use pageserver::walredo::{PostgresRedoManager, RedoAttemptType}; use pageserver_api::key::Key; -use pageserver_api::record::NeonWalRecord; use pageserver_api::shard::TenantShardId; use tokio::sync::Barrier; use tokio::task::JoinSet; use utils::id::TenantId; use utils::lsn::Lsn; +use wal_decoder::models::record::NeonWalRecord; fn bench(c: &mut Criterion) { macro_rules! 
bench_group { diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 2a0548b811..fe136b8bbd 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -18,13 +18,12 @@ use bytes::{BufMut, Bytes, BytesMut}; use fail::fail_point; use pageserver_api::key::{Key, rel_block_to_key}; use pageserver_api::reltag::{RelTag, SlruKind}; -use postgres_ffi::pg_constants::{ - DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID, PG_HBA, PGDATA_SPECIAL_FILES, -}; -use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM}; +use postgres_ffi::pg_constants::{PG_HBA, PGDATA_SPECIAL_FILES}; use postgres_ffi::{ BLCKSZ, PG_TLI, RELSEG_SIZE, WAL_SEGMENT_SIZE, XLogFileName, dispatch_pgversion, pg_constants, }; +use postgres_ffi_types::constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; +use postgres_ffi_types::forknum::{INIT_FORKNUM, MAIN_FORKNUM}; use tokio::io; use tokio::io::AsyncWrite; use tokio_tar::{Builder, EntryType, Header}; @@ -372,6 +371,7 @@ where .partition( self.timeline.get_shard_identity(), self.timeline.conf.max_get_vectored_keys.get() as u64 * BLCKSZ as u64, + BLCKSZ as u64, ); let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar); diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 911449c7c5..96fe0c1078 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -520,7 +520,7 @@ async fn import_file( } if file_path.starts_with("global") { - let spcnode = postgres_ffi::pg_constants::GLOBALTABLESPACE_OID; + let spcnode = postgres_ffi_types::constants::GLOBALTABLESPACE_OID; let dbnode = 0; match file_name.as_ref() { @@ -553,7 +553,7 @@ async fn import_file( } } } else if file_path.starts_with("base") { - let spcnode = pg_constants::DEFAULTTABLESPACE_OID; + let spcnode = postgres_ffi_types::constants::DEFAULTTABLESPACE_OID; let dbnode: u32 = file_path .iter() .nth(1) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 800b47f235..642b447e5f 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -41,7 +41,7 @@ use postgres_backend::{ AuthType, PostgresBackend, PostgresBackendReader, QueryError, is_expected_io_error, }; use postgres_ffi::BLCKSZ; -use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; +use postgres_ffi_types::constants::DEFAULTTABLESPACE_OID; use pq_proto::framed::ConnectionError; use pq_proto::{BeMessage, FeMessage, FeStartupPacket, RowDescriptor}; use smallvec::{SmallVec, smallvec}; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 633d62210d..58af2548ee 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -23,12 +23,11 @@ use pageserver_api::key::{ }; use pageserver_api::keyspace::{KeySpaceRandomAccum, SparseKeySpace}; use pageserver_api::models::RelSizeMigration; -use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use pageserver_api::shard::ShardIdentity; -use pageserver_api::value::Value; -use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; -use postgres_ffi::{BLCKSZ, Oid, RepOriginId, TimestampTz, TransactionId}; +use postgres_ffi::{BLCKSZ, TimestampTz, TransactionId}; +use postgres_ffi_types::forknum::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; +use postgres_ffi_types::{Oid, RepOriginId}; use serde::{Deserialize, Serialize}; use strum::IntoEnumIterator; use tokio_util::sync::CancellationToken; @@ -36,6 +35,8 @@ use tracing::{debug, info, info_span, trace, 
warn}; use utils::bin_ser::{BeSer, DeserializeError}; use utils::lsn::Lsn; use utils::pausable_failpoint; +use wal_decoder::models::record::NeonWalRecord; +use wal_decoder::models::value::Value; use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; use super::tenant::{PageReconstructError, Timeline}; @@ -720,6 +721,7 @@ impl Timeline { let batches = keyspace.partition( self.get_shard_identity(), self.conf.max_get_vectored_keys.get() as u64 * BLCKSZ as u64, + BLCKSZ as u64, ); let io_concurrency = IoConcurrency::spawn_from_conf( @@ -960,6 +962,7 @@ impl Timeline { let batches = keyspace.partition( self.get_shard_identity(), self.conf.max_get_vectored_keys.get() as u64 * BLCKSZ as u64, + BLCKSZ as u64, ); let io_concurrency = IoConcurrency::spawn_from_conf( diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index dc6926f299..d2c2fdef93 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -496,7 +496,7 @@ impl WalRedoManager { key: pageserver_api::key::Key, lsn: Lsn, base_img: Option<(Lsn, bytes::Bytes)>, - records: Vec<(Lsn, pageserver_api::record::NeonWalRecord)>, + records: Vec<(Lsn, wal_decoder::models::record::NeonWalRecord)>, pg_version: u32, redo_attempt_type: RedoAttemptType, ) -> Result { @@ -5852,10 +5852,10 @@ pub(crate) mod harness { use once_cell::sync::OnceCell; use pageserver_api::key::Key; use pageserver_api::models::ShardParameters; - use pageserver_api::record::NeonWalRecord; use pageserver_api::shard::ShardIndex; use utils::id::TenantId; use utils::logging; + use wal_decoder::models::record::NeonWalRecord; use super::*; use crate::deletion_queue::mock::MockDeletionQueue; @@ -6110,9 +6110,6 @@ mod tests { #[cfg(feature = "testing")] use pageserver_api::keyspace::KeySpaceRandomAccum; use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings}; - #[cfg(feature = "testing")] - use pageserver_api::record::NeonWalRecord; - use pageserver_api::value::Value; use pageserver_compaction::helpers::overlaps_with; #[cfg(feature = "testing")] use rand::SeedableRng; @@ -6133,6 +6130,9 @@ mod tests { use timeline::{CompactOptions, DeltaLayerTestDesc, VersionedKeySpaceQuery}; use utils::id::TenantId; use utils::shard::{ShardCount, ShardNumber}; + #[cfg(feature = "testing")] + use wal_decoder::models::record::NeonWalRecord; + use wal_decoder::models::value::Value; use super::*; use crate::DEFAULT_PG_VERSION; diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 9d15e7c4de..e65d444f76 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -34,11 +34,11 @@ pub use layer_name::{DeltaLayerName, ImageLayerName, LayerName}; use pageserver_api::config::GetVectoredConcurrentIo; use pageserver_api::key::Key; use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; -use pageserver_api::record::NeonWalRecord; -use pageserver_api::value::Value; use tracing::{Instrument, info_span, trace}; use utils::lsn::Lsn; use utils::sync::gate::GateGuard; +use wal_decoder::models::record::NeonWalRecord; +use wal_decoder::models::value::Value; use self::inmemory_layer::InMemoryLayerFileId; use super::PageReconstructError; diff --git a/pageserver/src/tenant/storage_layer/batch_split_writer.rs b/pageserver/src/tenant/storage_layer/batch_split_writer.rs index 51f2e909a2..1d50a5f3a0 100644 --- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs +++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs @@ -4,11 +4,11 @@ use std::sync::Arc; use 
bytes::Bytes; use pageserver_api::key::{KEY_SIZE, Key}; -use pageserver_api::value::Value; use tokio_util::sync::CancellationToken; use utils::id::TimelineId; use utils::lsn::Lsn; use utils::shard::TenantShardId; +use wal_decoder::models::value::Value; use super::errors::PutError; use super::layer::S3_UPLOAD_LIMIT; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index e82a28bb4c..ba763d4c3f 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -44,7 +44,6 @@ use pageserver_api::key::{DBDIR_KEY, KEY_SIZE, Key}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::ImageCompressionAlgorithm; use pageserver_api::shard::TenantShardId; -use pageserver_api::value::Value; use serde::{Deserialize, Serialize}; use tokio::sync::OnceCell; use tokio_epoll_uring::IoBuf; @@ -54,6 +53,7 @@ use utils::bin_ser::BeSer; use utils::bin_ser::SerializeError; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; +use wal_decoder::models::value::Value; use super::errors::PutError; use super::{ @@ -1306,7 +1306,7 @@ impl DeltaLayerInner { // is it an image or will_init walrecord? // FIXME: this could be handled by threading the BlobRef to the // VectoredReadBuilder - let will_init = pageserver_api::value::ValueBytes::will_init(&data) + let will_init = wal_decoder::models::value::ValueBytes::will_init(&data) .inspect_err(|_e| { #[cfg(feature = "testing")] tracing::error!(data=?utils::Hex(&data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value"); @@ -1369,7 +1369,7 @@ impl DeltaLayerInner { format!(" img {} bytes", img.len()) } Value::WalRecord(rec) => { - let wal_desc = pageserver_api::record::describe_wal_record(&rec)?; + let wal_desc = wal_decoder::models::record::describe_wal_record(&rec)?; format!( " rec {} bytes will_init: {} {}", buf.len(), @@ -1624,7 +1624,6 @@ pub(crate) mod test { use bytes::Bytes; use itertools::MinMaxResult; - use pageserver_api::value::Value; use rand::prelude::{SeedableRng, SliceRandom, StdRng}; use rand::{Rng, RngCore}; @@ -1988,7 +1987,7 @@ pub(crate) mod test { #[tokio::test] async fn copy_delta_prefix_smoke() { use bytes::Bytes; - use pageserver_api::record::NeonWalRecord; + use wal_decoder::models::record::NeonWalRecord; let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke") .await diff --git a/pageserver/src/tenant/storage_layer/filter_iterator.rs b/pageserver/src/tenant/storage_layer/filter_iterator.rs index 1a330ecfc2..d345195446 100644 --- a/pageserver/src/tenant/storage_layer/filter_iterator.rs +++ b/pageserver/src/tenant/storage_layer/filter_iterator.rs @@ -4,8 +4,8 @@ use std::sync::Arc; use anyhow::bail; use pageserver_api::key::Key; use pageserver_api::keyspace::{KeySpace, SparseKeySpace}; -use pageserver_api::value::Value; use utils::lsn::Lsn; +use wal_decoder::models::value::Value; use super::PersistentLayerKey; use super::merge_iterator::{MergeIterator, MergeIteratorItem}; @@ -126,7 +126,6 @@ mod tests { #[tokio::test] async fn filter_keyspace_iterator() { use bytes::Bytes; - use pageserver_api::value::Value; let harness = TenantHarness::create("filter_iterator_filter_keyspace_iterator") .await diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 740f53f928..d6f5f48a6e 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -42,7 
+42,6 @@ use pageserver_api::config::MaxVectoredReadBytes; use pageserver_api::key::{DBDIR_KEY, KEY_SIZE, Key}; use pageserver_api::keyspace::KeySpace; use pageserver_api::shard::{ShardIdentity, TenantShardId}; -use pageserver_api::value::Value; use serde::{Deserialize, Serialize}; use tokio::sync::OnceCell; use tokio_stream::StreamExt; @@ -52,6 +51,7 @@ use utils::bin_ser::BeSer; use utils::bin_ser::SerializeError; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; +use wal_decoder::models::value::Value; use super::errors::PutError; use super::layer_name::ImageLayerName; @@ -1232,10 +1232,10 @@ mod test { use itertools::Itertools; use pageserver_api::key::Key; use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}; - use pageserver_api::value::Value; use utils::generation::Generation; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; + use wal_decoder::models::value::Value; use super::{ImageLayerIterator, ImageLayerWriter}; use crate::DEFAULT_PG_VERSION; diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index 2f2ff0f273..313c133fa2 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -824,7 +824,7 @@ async fn evict_and_wait_does_not_wait_for_download() { #[tokio::test(start_paused = true)] async fn eviction_cancellation_on_drop() { use bytes::Bytes; - use pageserver_api::value::Value; + use wal_decoder::models::value::Value; // this is the runtime on which Layer spawns the blocking tasks on let handle = tokio::runtime::Handle::current(); diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index ea3dea50c3..c15abcdf3f 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -4,8 +4,8 @@ use std::sync::Arc; use anyhow::bail; use pageserver_api::key::Key; -use pageserver_api::value::Value; use utils::lsn::Lsn; +use wal_decoder::models::value::Value; use super::delta_layer::{DeltaLayerInner, DeltaLayerIterator}; use super::image_layer::{ImageLayerInner, ImageLayerIterator}; @@ -402,9 +402,9 @@ impl<'a> MergeIterator<'a> { mod tests { use itertools::Itertools; use pageserver_api::key::Key; - #[cfg(feature = "testing")] - use pageserver_api::record::NeonWalRecord; use utils::lsn::Lsn; + #[cfg(feature = "testing")] + use wal_decoder::models::record::NeonWalRecord; use super::*; use crate::DEFAULT_PG_VERSION; @@ -436,7 +436,6 @@ mod tests { #[tokio::test] async fn merge_in_between() { use bytes::Bytes; - use pageserver_api::value::Value; let harness = TenantHarness::create("merge_iterator_merge_in_between") .await @@ -501,7 +500,6 @@ mod tests { #[tokio::test] async fn delta_merge() { use bytes::Bytes; - use pageserver_api::value::Value; let harness = TenantHarness::create("merge_iterator_delta_merge") .await @@ -578,7 +576,6 @@ mod tests { #[tokio::test] async fn delta_image_mixed_merge() { use bytes::Bytes; - use pageserver_api::value::Value; let harness = TenantHarness::create("merge_iterator_delta_image_mixed_merge") .await diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b8bfc4f936..a0e9d8f06a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -56,8 +56,6 @@ use pageserver_api::models::{ }; use pageserver_api::reltag::{BlockNumber, RelTag}; use pageserver_api::shard::{ShardIdentity, 
ShardIndex, ShardNumber, TenantShardId}; -#[cfg(test)] -use pageserver_api::value::Value; use postgres_connection::PgConnectionConfig; use postgres_ffi::v14::xlog_utils; use postgres_ffi::{WAL_SEGMENT_SIZE, to_pg_timestamp}; @@ -81,6 +79,8 @@ use utils::seqwait::SeqWait; use utils::simple_rcu::{Rcu, RcuReadGuard}; use utils::sync::gate::{Gate, GateGuard}; use utils::{completion, critical, fs_ext, pausable_failpoint}; +#[cfg(test)] +use wal_decoder::models::value::Value; use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; use self::delete::DeleteTimelineFlow; @@ -5207,7 +5207,11 @@ impl Timeline { } let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?; - let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size); + let dense_partitioning = dense_ks.partition( + &self.shard_identity, + partition_size, + postgres_ffi::BLCKSZ as u64, + ); let sparse_partitioning = SparseKeyPartitioning { parts: vec![sparse_ks], }; // no partitioning for metadata keys for now @@ -7590,11 +7594,11 @@ mod tests { use std::sync::Arc; use pageserver_api::key::Key; - use pageserver_api::value::Value; use std::iter::Iterator; use tracing::Instrument; use utils::id::TimelineId; use utils::lsn::Lsn; + use wal_decoder::models::value::Value; use super::HeatMapTimeline; use crate::context::RequestContextBuilder; diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 5307d3836f..6039c002f7 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -29,9 +29,7 @@ use pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE; use pageserver_api::key::{KEY_SIZE, Key}; use pageserver_api::keyspace::{KeySpace, ShardedRange}; use pageserver_api::models::{CompactInfoResponse, CompactKeyRange}; -use pageserver_api::record::NeonWalRecord; use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; -use pageserver_api::value::Value; use pageserver_compaction::helpers::{fully_contains, overlaps_with}; use pageserver_compaction::interface::*; use serde::Serialize; @@ -41,6 +39,8 @@ use tracing::{Instrument, debug, error, info, info_span, trace, warn}; use utils::critical; use utils::id::TimelineId; use utils::lsn::Lsn; +use wal_decoder::models::record::NeonWalRecord; +use wal_decoder::models::value::Value; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs index ed679a9bdc..d471e9fc69 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs @@ -36,8 +36,8 @@ use pageserver_api::keyspace::{ShardedRange, singleton_range}; use pageserver_api::models::{ShardImportProgress, ShardImportProgressV1, ShardImportStatus}; use pageserver_api::reltag::{RelTag, SlruKind}; use pageserver_api::shard::ShardIdentity; +use postgres_ffi::BLCKSZ; use postgres_ffi::relfile_utils::parse_relfilename; -use postgres_ffi::{BLCKSZ, pg_constants}; use remote_storage::RemotePath; use tokio::sync::Semaphore; use tokio_stream::StreamExt; @@ -558,7 +558,7 @@ impl PgDataDir { PgDataDirDb::new( storage, &basedir.join(dboid.to_string()), - pg_constants::DEFAULTTABLESPACE_OID, + postgres_ffi_types::constants::DEFAULTTABLESPACE_OID, dboid, &datadir_path, ) @@ -571,7 +571,7 @@ impl PgDataDir { PgDataDirDb::new( storage, 
&datadir_path.join("global"), - postgres_ffi::pg_constants::GLOBALTABLESPACE_OID, + postgres_ffi_types::constants::GLOBALTABLESPACE_OID, 0, &datadir_path, ) diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index c1a3b79915..ebffaf70e2 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -28,20 +28,20 @@ use std::time::{Duration, Instant, SystemTime}; use bytes::{Buf, Bytes}; use pageserver_api::key::{Key, rel_block_to_key}; -use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use pageserver_api::shard::ShardIdentity; -use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::walrecord::*; use postgres_ffi::{ TimestampTz, TransactionId, dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, fsm_logical_to_physical, pg_constants, }; +use postgres_ffi_types::forknum::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; use tracing::*; use utils::bin_ser::{DeserializeError, SerializeError}; use utils::lsn::Lsn; use utils::rate_limit::RateLimit; use utils::{critical, failpoint_support}; +use wal_decoder::models::record::NeonWalRecord; use wal_decoder::models::*; use crate::ZERO_PAGE; diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index ed8a954369..1498f3c83d 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -32,12 +32,12 @@ use anyhow::Context; use bytes::{Bytes, BytesMut}; use pageserver_api::key::Key; use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus}; -use pageserver_api::record::NeonWalRecord; use pageserver_api::shard::TenantShardId; use tracing::*; use utils::lsn::Lsn; use utils::sync::gate::GateError; use utils::sync::heavier_once_cell; +use wal_decoder::models::record::NeonWalRecord; use crate::config::PageServerConf; use crate::metrics::{ @@ -571,11 +571,11 @@ mod tests { use bytes::Bytes; use pageserver_api::key::Key; - use pageserver_api::record::NeonWalRecord; use pageserver_api::shard::TenantShardId; use tracing::Instrument; use utils::id::TenantId; use utils::lsn::Lsn; + use wal_decoder::models::record::NeonWalRecord; use super::PostgresRedoManager; use crate::config::PageServerConf; diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs index a3840f1f6f..0783c77622 100644 --- a/pageserver/src/walredo/apply_neon.rs +++ b/pageserver/src/walredo/apply_neon.rs @@ -2,16 +2,16 @@ use anyhow::Context; use byteorder::{ByteOrder, LittleEndian}; use bytes::BytesMut; use pageserver_api::key::Key; -use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::SlruKind; -use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM; use postgres_ffi::v14::nonrelfile_utils::{ mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset, transaction_id_set_status, }; use postgres_ffi::{BLCKSZ, pg_constants}; +use postgres_ffi_types::forknum::VISIBILITYMAP_FORKNUM; use tracing::*; use utils::lsn::Lsn; +use wal_decoder::models::record::NeonWalRecord; /// Can this request be served by neon redo functions /// or we need to pass it to wal-redo postgres process? 
diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs index 6d4a38d4ff..3dec0593bf 100644 --- a/pageserver/src/walredo/process.rs +++ b/pageserver/src/walredo/process.rs @@ -10,7 +10,6 @@ use std::time::Duration; use anyhow::Context; use bytes::Bytes; -use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::RelTag; use pageserver_api::shard::TenantShardId; use postgres_ffi::BLCKSZ; @@ -18,6 +17,7 @@ use tokio::io::{AsyncReadExt, AsyncWriteExt}; use tracing::{Instrument, debug, error, instrument}; use utils::lsn::Lsn; use utils::poison::Poison; +use wal_decoder::models::record::NeonWalRecord; use self::no_leak_child::NoLeakChild; use crate::config::PageServerConf; From ec1452a5597bdb1284b0183444f5f6b437aa16ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 19 Jun 2025 13:17:01 +0200 Subject: [PATCH 081/124] Switch on --timelines-onto-safekeepers in integration tests (#11712) Switch on the `--timelines-onto-safekeepers` param in integration tests. Some changes that were needed to enable this but which I put into other PRs to not clutter up this one: * #11786 * #11854 * #12129 * #12138 Further fixes that were needed for this: * https://github.com/neondatabase/neon/pull/11801 * https://github.com/neondatabase/neon/pull/12143 * https://github.com/neondatabase/neon/pull/12204 Not strictly needed, but helpful: * https://github.com/neondatabase/neon/pull/12155 Part of #11670 Closes #11424 --- control_plane/src/local_env.rs | 2 +- test_runner/fixtures/neon_fixtures.py | 7 +- test_runner/regress/test_branching.py | 14 ++++ test_runner/regress/test_normal_work.py | 5 ++ test_runner/regress/test_ondemand_download.py | 6 ++ .../regress/test_storage_controller.py | 21 ++++-- test_runner/regress/test_storage_scrubber.py | 5 ++ .../regress/test_timeline_detach_ancestor.py | 19 +++++- .../regress/test_timeline_gc_blocking.py | 2 + test_runner/regress/test_wal_acceptor.py | 66 +++++++++++++++---- .../regress/test_wal_acceptor_async.py | 12 ++++ test_runner/regress/test_wal_receiver.py | 7 ++ 12 files changed, 141 insertions(+), 25 deletions(-) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 47b77f0720..1b231151ce 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -236,7 +236,7 @@ impl Default for NeonStorageControllerConf { heartbeat_interval: Self::DEFAULT_HEARTBEAT_INTERVAL, long_reconcile_threshold: None, use_https_pageserver_api: false, - timelines_onto_safekeepers: false, + timelines_onto_safekeepers: true, use_https_safekeeper_api: false, use_local_compute_notifications: true, } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 8cf1020adb..050d61055e 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -489,7 +489,9 @@ class NeonEnvBuilder: self.config_init_force: str | None = None self.top_output_dir = top_output_dir self.control_plane_hooks_api: str | None = None - self.storage_controller_config: dict[Any, Any] | None = None + self.storage_controller_config: dict[Any, Any] | None = { + "timelines_onto_safekeepers": True, + } # Flag to enable https listener in pageserver, generate local ssl certs, # and force storage controller to use https for pageserver api. 
@@ -4909,6 +4911,9 @@ class Safekeeper(LogUtils): log.info(f"finished pulling timeline from {src_ids} to {self.id}") return res + def safekeeper_id(self) -> SafekeeperId: + return SafekeeperId(self.id, "localhost", self.port.pg_tenant_only) + @property def data_dir(self) -> Path: return self.env.repo_dir / "safekeepers" / f"sk{self.id}" diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 9ce618b2ad..fa5c9aa693 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -11,6 +11,7 @@ from fixtures.common_types import Lsn, TimelineId from fixtures.log_helper import log from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import wait_until_tenant_active +from fixtures.safekeeper.http import MembershipConfiguration, TimelineCreateRequest from fixtures.utils import query_scalar from performance.test_perf_pgbench import get_scales_matrix from requests import RequestException @@ -164,6 +165,19 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE ps_http.configure_failpoints(("before-upload-index-pausable", "pause")) env.pageserver.tenant_create(env.initial_tenant) + sk = env.safekeepers[0] + assert sk + sk.http_client().timeline_create( + TimelineCreateRequest( + env.initial_tenant, + env.initial_timeline, + MembershipConfiguration(generation=1, members=[sk.safekeeper_id()], new_members=None), + int(env.pg_version), + Lsn(0), + None, + ) + ) + initial_branch = "initial_branch" def start_creating_timeline(): diff --git a/test_runner/regress/test_normal_work.py b/test_runner/regress/test_normal_work.py index 44590ea4b9..3335cf686c 100644 --- a/test_runner/regress/test_normal_work.py +++ b/test_runner/regress/test_normal_work.py @@ -64,6 +64,11 @@ def test_normal_work( """ neon_env_builder.num_safekeepers = num_safekeepers + + if safekeeper_proto_version == 2: + neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": False, + } env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 2590a3fe9d..2b71662669 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -671,6 +671,12 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu """ neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + # On the new mode, the test runs into a cancellation issue, i.e. the walproposer can't shut down + # as it is hang-waiting on the timeline_checkpoint call in WalIngest::new. 
+ neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": False, + } + # turn off background tasks so that they don't interfere with the downloads env = neon_env_builder.init_start( initial_tenant_conf={ diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 8f3aa010e3..74ba74645e 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -88,6 +88,12 @@ def test_storage_controller_smoke( neon_env_builder.control_plane_hooks_api = compute_reconfigure_listener.control_plane_hooks_api env = neon_env_builder.init_configs() + # These bubble up from safekeepers + for ps in env.pageservers: + ps.allowed_errors.extend( + [".*Timeline.* has been deleted.*", ".*Timeline.*was cancelled and cannot be used"] + ) + # Start services by hand so that we can skip a pageserver (this will start + register later) env.broker.start() env.storage_controller.start() @@ -3455,7 +3461,7 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): assert target.get_safekeeper(fake_id) is None - assert len(target.get_safekeepers()) == 0 + start_sks = target.get_safekeepers() sk_0 = env.safekeepers[0] @@ -3477,7 +3483,7 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): inserted = target.get_safekeeper(fake_id) assert inserted is not None - assert target.get_safekeepers() == [inserted] + assert target.get_safekeepers() == start_sks + [inserted] assert eq_safekeeper_records(body, inserted) # error out if pk is changed (unexpected) @@ -3489,7 +3495,7 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): assert exc.value.status_code == 400 inserted_again = target.get_safekeeper(fake_id) - assert target.get_safekeepers() == [inserted_again] + assert target.get_safekeepers() == start_sks + [inserted_again] assert inserted_again is not None assert eq_safekeeper_records(inserted, inserted_again) @@ -3498,7 +3504,7 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): body["version"] += 1 target.on_safekeeper_deploy(fake_id, body) inserted_now = target.get_safekeeper(fake_id) - assert target.get_safekeepers() == [inserted_now] + assert target.get_safekeepers() == start_sks + [inserted_now] assert inserted_now is not None assert eq_safekeeper_records(body, inserted_now) @@ -3507,7 +3513,7 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): body["https_port"] = 123 target.on_safekeeper_deploy(fake_id, body) inserted_now = target.get_safekeeper(fake_id) - assert target.get_safekeepers() == [inserted_now] + assert target.get_safekeepers() == start_sks + [inserted_now] assert inserted_now is not None assert eq_safekeeper_records(body, inserted_now) env.storage_controller.consistency_check() @@ -3516,7 +3522,7 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): body["https_port"] = None target.on_safekeeper_deploy(fake_id, body) inserted_now = target.get_safekeeper(fake_id) - assert target.get_safekeepers() == [inserted_now] + assert target.get_safekeepers() == start_sks + [inserted_now] assert inserted_now is not None assert eq_safekeeper_records(body, inserted_now) env.storage_controller.consistency_check() @@ -3635,6 +3641,9 @@ def test_timeline_delete_mid_live_migration(neon_env_builder: NeonEnvBuilder, mi env = neon_env_builder.init_configs() env.start() + for ps in env.pageservers: + ps.allowed_errors.append(".*Timeline.* has 
been deleted.*") + tenant_id = TenantId.generate() timeline_id = TimelineId.generate() env.storage_controller.tenant_create(tenant_id, placement_policy={"Attached": 1}) diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 03cd133ccb..e29cb801d5 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -341,6 +341,11 @@ def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder env = neon_env_builder.init_configs() env.start() + for ps in env.pageservers: + ps.allowed_errors.extend( + [".*Timeline.* has been deleted.*", ".*Timeline.*was cancelled and cannot be used"] + ) + tenant_id = TenantId.generate() timeline_id = TimelineId.generate() env.create_tenant( diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index f0810270b1..c58f78aeb1 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -21,7 +21,10 @@ from fixtures.neon_fixtures import ( last_flush_lsn_upload, wait_for_last_flush_lsn, ) -from fixtures.pageserver.http import HistoricLayerInfo, PageserverApiException +from fixtures.pageserver.http import ( + HistoricLayerInfo, + PageserverApiException, +) from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_timeline_detail_404 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind from fixtures.utils import assert_pageserver_backups_equal, skip_in_debug_build, wait_until @@ -413,6 +416,7 @@ def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder, snapshots "read_only": True, }, ) + sk = env.safekeepers[0] assert sk with pytest.raises(requests.exceptions.HTTPError, match="Not Found"): @@ -504,8 +508,15 @@ def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder, snapshots assert len(lineage.get("original_ancestor", [])) == 0 assert len(lineage.get("reparenting_history", [])) == 0 - for name, _, _, rows, starts in expected_result: - with env.endpoints.create_start(name, tenant_id=env.initial_tenant) as ep: + for branch_name, queried_timeline, _, rows, starts in expected_result: + details = client.timeline_detail(env.initial_tenant, queried_timeline) + log.info(f"reading data from branch {branch_name}") + # specifying the lsn makes the endpoint read-only and not connect to safekeepers + with env.endpoints.create( + branch_name, + lsn=Lsn(details["last_record_lsn"]), + ) as ep: + ep.start(safekeeper_generation=1) assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows assert ep.safe_psql(f"SELECT count(*) FROM audit WHERE starts = {starts}")[0][0] == 1 @@ -1088,6 +1099,7 @@ def test_timeline_detach_ancestor_interrupted_by_deletion( for ps in env.pageservers: ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + ps.allowed_errors.append(".*Timeline.* has been deleted.*") pageservers = dict((int(p.id), p) for p in env.pageservers) @@ -1209,6 +1221,7 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv for ps in env.pageservers: ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + ps.allowed_errors.append(".*Timeline.* has been deleted.*") pageservers = dict((int(p.id), p) for p in env.pageservers) diff --git a/test_runner/regress/test_timeline_gc_blocking.py b/test_runner/regress/test_timeline_gc_blocking.py index 9a710f5b80..8ef64a0742 100644 --- a/test_runner/regress/test_timeline_gc_blocking.py +++ 
b/test_runner/regress/test_timeline_gc_blocking.py @@ -24,6 +24,8 @@ def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder, sharded: bool initial_tenant_conf={"gc_period": "1s", "lsn_lease_length": "0s"}, initial_tenant_shard_count=2 if sharded else None, ) + for ps in env.pageservers: + ps.allowed_errors.append(".*Timeline.* has been deleted.*") if sharded: http = env.storage_controller.pageserver_api() diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index b9183286af..ea120c1814 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -229,7 +229,7 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): # Test timeline_list endpoint. http_cli = env.safekeepers[0].http_client() - assert len(http_cli.timeline_list()) == 3 + assert len(http_cli.timeline_list()) == 4 # Check that dead minority doesn't prevent the commits: execute insert n_inserts @@ -740,8 +740,8 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): env = neon_env_builder.init_start() tenant_id = env.initial_tenant - timeline_id = env.create_branch("test_timeline_status") - endpoint = env.endpoints.create_start("test_timeline_status") + timeline_id = env.initial_timeline + endpoint = env.endpoints.create_start("main") wa = env.safekeepers[0] @@ -1292,6 +1292,12 @@ def test_lagging_sk(neon_env_builder: NeonEnvBuilder): # it works without compute at all. def test_peer_recovery(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 + + # timelines should be created the old way + neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": False, + } + env = neon_env_builder.init_start() tenant_id = env.initial_tenant @@ -1532,6 +1538,11 @@ def test_safekeeper_without_pageserver( def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): + # timelines should be created the old way manually until we have migration support + neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": False, + } + def execute_payload(endpoint: Endpoint): with closing(endpoint.connect()) as conn: with conn.cursor() as cur: @@ -1661,6 +1672,15 @@ def test_pull_timeline(neon_env_builder: NeonEnvBuilder, live_sk_change: bool): res = env.safekeepers[3].pull_timeline( [env.safekeepers[0], env.safekeepers[2]], tenant_id, timeline_id ) + sk_id_1 = env.safekeepers[0].safekeeper_id() + sk_id_3 = env.safekeepers[2].safekeeper_id() + sk_id_4 = env.safekeepers[3].safekeeper_id() + new_conf = MembershipConfiguration( + generation=2, members=[sk_id_1, sk_id_3, sk_id_4], new_members=None + ) + for i in [0, 2, 3]: + env.safekeepers[i].http_client().membership_switch(tenant_id, timeline_id, new_conf) + log.info("Finished pulling timeline") log.info(res) @@ -1705,13 +1725,15 @@ def test_pull_timeline_gc(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage()) env = neon_env_builder.init_start() - tenant_id = env.initial_tenant - timeline_id = env.initial_timeline (src_sk, dst_sk) = (env.safekeepers[0], env.safekeepers[2]) + dst_sk.stop() + + [tenant_id, timeline_id] = env.create_tenant() + log.info("use only first 2 safekeepers, 3rd will be seeded") - endpoint = env.endpoints.create("main") + endpoint = env.endpoints.create("main", tenant_id=tenant_id) endpoint.active_safekeepers = [1, 2] endpoint.start() endpoint.safe_psql("create table t(key int, value text)") @@ -1723,6 +1745,7 
@@ def test_pull_timeline_gc(neon_env_builder: NeonEnvBuilder): src_http = src_sk.http_client() # run pull_timeline which will halt before downloading files src_http.configure_failpoints(("sk-snapshot-after-list-pausable", "pause")) + dst_sk.start() pt_handle = PropagatingThread( target=dst_sk.pull_timeline, args=([src_sk], tenant_id, timeline_id) ) @@ -1782,23 +1805,27 @@ def test_pull_timeline_term_change(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage()) env = neon_env_builder.init_start() tenant_id = env.initial_tenant - timeline_id = env.initial_timeline (src_sk, dst_sk) = (env.safekeepers[0], env.safekeepers[2]) + dst_sk.stop() + src_http = src_sk.http_client() + src_http.configure_failpoints(("sk-snapshot-after-list-pausable", "pause")) + + timeline_id = env.create_branch("pull_timeline_term_changes") + + # run pull_timeline which will halt before downloading files log.info("use only first 2 safekeepers, 3rd will be seeded") - ep = env.endpoints.create("main") + ep = env.endpoints.create("pull_timeline_term_changes") ep.active_safekeepers = [1, 2] ep.start() ep.safe_psql("create table t(key int, value text)") ep.safe_psql("insert into t select generate_series(1, 1000), 'pear'") - src_http = src_sk.http_client() - # run pull_timeline which will halt before downloading files - src_http.configure_failpoints(("sk-snapshot-after-list-pausable", "pause")) pt_handle = PropagatingThread( target=dst_sk.pull_timeline, args=([src_sk], tenant_id, timeline_id) ) + dst_sk.start() pt_handle.start() src_sk.wait_until_paused("sk-snapshot-after-list-pausable") @@ -1807,7 +1834,7 @@ def test_pull_timeline_term_change(neon_env_builder: NeonEnvBuilder): # restart compute to bump term ep.stop() - ep = env.endpoints.create("main") + ep = env.endpoints.create("pull_timeline_term_changes") ep.active_safekeepers = [1, 2] ep.start() ep.safe_psql("insert into t select generate_series(1, 100), 'pear'") @@ -1929,6 +1956,11 @@ def test_pull_timeline_while_evicted(neon_env_builder: NeonEnvBuilder): @run_only_on_default_postgres("tests only safekeeper API") def test_membership_api(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 1 + # timelines should be created the old way + neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": False, + } + env = neon_env_builder.init_start() # These are expected after timeline deletion on safekeepers. @@ -2009,6 +2041,12 @@ def test_explicit_timeline_creation(neon_env_builder: NeonEnvBuilder): created manually, later storcon will do that. 
""" neon_env_builder.num_safekeepers = 3 + + # timelines should be created the old way manually + neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": False, + } + env = neon_env_builder.init_start() tenant_id = env.initial_tenant @@ -2064,7 +2102,7 @@ def test_idle_reconnections(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() tenant_id = env.initial_tenant - timeline_id = env.create_branch("test_idle_reconnections") + timeline_id = env.initial_timeline def collect_stats() -> dict[str, float]: # we need to collect safekeeper_pg_queries_received_total metric from all safekeepers @@ -2095,7 +2133,7 @@ def test_idle_reconnections(neon_env_builder: NeonEnvBuilder): collect_stats() - endpoint = env.endpoints.create_start("test_idle_reconnections") + endpoint = env.endpoints.create_start("main") # just write something to the timeline endpoint.safe_psql("create table t(i int)") collect_stats() diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index d8a7dc2a2b..1bad387a90 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -590,6 +590,13 @@ async def run_wal_truncation(env: NeonEnv, safekeeper_proto_version: int): @pytest.mark.parametrize("safekeeper_proto_version", [2, 3]) def test_wal_truncation(neon_env_builder: NeonEnvBuilder, safekeeper_proto_version: int): neon_env_builder.num_safekeepers = 3 + if safekeeper_proto_version == 2: + # On the legacy protocol, we don't support generations, which are part of + # `timelines_onto_safekeepers` + neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": False, + } + env = neon_env_builder.init_start() asyncio.run(run_wal_truncation(env, safekeeper_proto_version)) @@ -713,6 +720,11 @@ async def run_quorum_sanity(env: NeonEnv): # we don't. def test_quorum_sanity(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 4 + + # The test fails basically always on the new mode. + neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": False, + } env = neon_env_builder.init_start() asyncio.run(run_quorum_sanity(env)) diff --git a/test_runner/regress/test_wal_receiver.py b/test_runner/regress/test_wal_receiver.py index 0252b590cc..d281c055b0 100644 --- a/test_runner/regress/test_wal_receiver.py +++ b/test_runner/regress/test_wal_receiver.py @@ -16,6 +16,13 @@ if TYPE_CHECKING: # Checks that pageserver's walreceiver state is printed in the logs during WAL wait timeout. # Ensures that walreceiver does not run without any data inserted and only starts after the insertion. def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder): + # we assert below that the walreceiver is not active before data writes. + # with manually created timelines, it is active. 
+ # FIXME: remove this test once we remove timelines_onto_safekeepers + neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": False, + } + # Trigger WAL wait timeout faster neon_env_builder.pageserver_config_override = "wait_lsn_timeout = '1s'" env = neon_env_builder.init_start() From a6d4de25cd9ac335e08f9e89cd2b2b49b09ef774 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 19 Jun 2025 14:20:02 +0000 Subject: [PATCH 082/124] build(deps): bump urllib3 from 1.26.19 to 2.5.0 in the pip group across 1 directory (#12289) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/poetry.lock b/poetry.lock index f9b6f83366..1bc5077eb7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -746,23 +746,23 @@ xray = ["mypy-boto3-xray (>=1.26.0,<1.27.0)"] [[package]] name = "botocore" -version = "1.34.11" +version = "1.34.162" description = "Low-level, data-driven core of boto 3." optional = false -python-versions = ">= 3.8" +python-versions = ">=3.8" groups = ["main"] files = [ - {file = "botocore-1.34.11-py3-none-any.whl", hash = "sha256:1ff1398b6ea670e1c01ac67a33af3da854f8e700d3528289c04f319c330d8250"}, - {file = "botocore-1.34.11.tar.gz", hash = "sha256:51905c3d623c60df5dc5794387de7caf886d350180a01a3dfa762e903edb45a9"}, + {file = "botocore-1.34.162-py3-none-any.whl", hash = "sha256:2d918b02db88d27a75b48275e6fb2506e9adaaddbec1ffa6a8a0898b34e769be"}, + {file = "botocore-1.34.162.tar.gz", hash = "sha256:adc23be4fb99ad31961236342b7cbf3c0bfc62532cd02852196032e8c0d682f3"}, ] [package.dependencies] jmespath = ">=0.7.1,<2.0.0" python-dateutil = ">=2.1,<3.0.0" -urllib3 = {version = ">=1.25.4,<2.1", markers = "python_version >= \"3.10\""} +urllib3 = {version = ">=1.25.4,<2.2.0 || >2.2.0,<3", markers = "python_version >= \"3.10\""} [package.extras] -crt = ["awscrt (==0.19.19)"] +crt = ["awscrt (==0.21.2)"] [[package]] name = "botocore-stubs" @@ -3422,20 +3422,21 @@ files = [ [[package]] name = "urllib3" -version = "1.26.19" +version = "2.5.0" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" +python-versions = ">=3.9" groups = ["main"] files = [ - {file = "urllib3-1.26.19-py2.py3-none-any.whl", hash = "sha256:37a0344459b199fce0e80b0d3569837ec6b6937435c5244e7fd73fa6006830f3"}, - {file = "urllib3-1.26.19.tar.gz", hash = "sha256:3e3d753a8618b86d7de333b4223005f68720bcd6a7d2bcb9fbd2229ec7c1e429"}, + {file = "urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc"}, + {file = "urllib3-2.5.0.tar.gz", hash = "sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760"}, ] [package.extras] -brotli = ["brotli (==1.0.9) ; os_name != \"nt\" and python_version < \"3\" and platform_python_implementation == \"CPython\"", "brotli (>=1.0.9) ; python_version >= \"3\" and platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; (os_name != \"nt\" or python_version >= \"3\") and platform_python_implementation != \"CPython\"", "brotlipy (>=0.6.0) ; os_name == \"nt\" and python_version < \"3\""] -secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress ; python_version == \"2.7\"", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"] -socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] +brotli = ["brotli (>=1.0.9) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; platform_python_implementation != \"CPython\""] +h2 = ["h2 (>=4,<5)"] +socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] +zstd = ["zstandard (>=0.18.0)"] [[package]] name = "websockets" From dc1625cd8e5c5fcc478a017261e28702ae9052e4 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 19 Jun 2025 17:40:57 +0200 Subject: [PATCH 083/124] pagebench: add `basebackup` gRPC support (#12250) ## Problem Pagebench does not support gRPC for `basebackup` benchmarks. Requires #12243. Touches #11728. ## Summary of changes Add gRPC support via gRPC connstrings, e.g. `pagebench basebackup --page-service-connstring grpc://localhost:51051`. Also change `--gzip-probability` to `--no-compression`, since this must be specified per-client for gRPC. 
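For illustration only (not part of this patch), the transport is then selected purely through the connection-string scheme; the flags and addresses below are the defaults and examples introduced in this patch:

```
# libpq transport (default connstring); gzip stays enabled unless --no-compression is passed
pagebench basebackup --page-service-connstring postgresql://postgres@localhost:64000

# gRPC transport; compression can be disabled per client with the new flag
pagebench basebackup --page-service-connstring grpc://localhost:51051 --no-compression
```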
--- pageserver/page_api/src/client.rs | 2 +- pageserver/pagebench/src/cmd/basebackup.rs | 179 +++++++++++++++------ 2 files changed, 131 insertions(+), 50 deletions(-) diff --git a/pageserver/page_api/src/client.rs b/pageserver/page_api/src/client.rs index 274f036f3d..aa4774c056 100644 --- a/pageserver/page_api/src/client.rs +++ b/pageserver/page_api/src/client.rs @@ -121,7 +121,7 @@ impl Client { pub async fn get_base_backup( &mut self, req: model::GetBaseBackupRequest, - ) -> Result>, tonic::Status> { + ) -> Result> + 'static, tonic::Status> { let proto_req = proto::GetBaseBackupRequest::from(req); let response_stream: Streaming = diff --git a/pageserver/pagebench/src/cmd/basebackup.rs b/pageserver/pagebench/src/cmd/basebackup.rs index 43ad92980c..8015db528d 100644 --- a/pageserver/pagebench/src/cmd/basebackup.rs +++ b/pageserver/pagebench/src/cmd/basebackup.rs @@ -1,20 +1,29 @@ use std::collections::HashMap; use std::num::NonZeroUsize; use std::ops::Range; -use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; +use std::pin::Pin; +use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; use std::time::Instant; -use anyhow::Context; +use anyhow::anyhow; +use futures::TryStreamExt as _; use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api::ForceAwaitLogicalSize; use pageserver_client::page_service::BasebackupRequest; +use pageserver_page_api as page_api; use rand::prelude::*; +use reqwest::Url; +use tokio::io::AsyncRead; use tokio::sync::Barrier; use tokio::task::JoinSet; +use tokio_util::compat::{TokioAsyncReadCompatExt as _, TokioAsyncWriteCompatExt as _}; +use tokio_util::io::StreamReader; +use tonic::async_trait; use tracing::{info, instrument}; use utils::id::TenantTimelineId; use utils::lsn::Lsn; +use utils::shard::ShardIndex; use crate::util::tokio_thread_local_stats::AllThreadLocalStats; use crate::util::{request_stats, tokio_thread_local_stats}; @@ -24,14 +33,15 @@ use crate::util::{request_stats, tokio_thread_local_stats}; pub(crate) struct Args { #[clap(long, default_value = "http://localhost:9898")] mgmt_api_endpoint: String, - #[clap(long, default_value = "postgres://postgres@localhost:64000")] + /// The Pageserver to connect to. Use postgresql:// for libpq, or grpc:// for gRPC. 
+ #[clap(long, default_value = "postgresql://postgres@localhost:64000")] page_service_connstring: String, #[clap(long)] pageserver_jwt: Option, #[clap(long, default_value = "1")] num_clients: NonZeroUsize, - #[clap(long, default_value = "1.0")] - gzip_probability: f64, + #[clap(long)] + no_compression: bool, #[clap(long)] runtime: Option, #[clap(long)] @@ -146,12 +156,23 @@ async fn main_impl( let mut work_senders = HashMap::new(); let mut tasks = Vec::new(); - for tl in &timelines { + let connurl = Url::parse(&args.page_service_connstring)?; + for &tl in &timelines { let (sender, receiver) = tokio::sync::mpsc::channel(1); // TODO: not sure what the implications of this are work_senders.insert(tl, sender); - tasks.push(tokio::spawn(client( - args, - *tl, + + let client: Box = match connurl.scheme() { + "postgresql" | "postgres" => Box::new( + LibpqClient::new(&args.page_service_connstring, tl, !args.no_compression).await?, + ), + "grpc" => Box::new( + GrpcClient::new(&args.page_service_connstring, tl, !args.no_compression).await?, + ), + scheme => return Err(anyhow!("invalid scheme {scheme}")), + }; + + tasks.push(tokio::spawn(run_worker( + client, Arc::clone(&start_work_barrier), receiver, Arc::clone(&all_work_done_barrier), @@ -166,13 +187,7 @@ async fn main_impl( let mut rng = rand::thread_rng(); let target = all_targets.choose(&mut rng).unwrap(); let lsn = target.lsn_range.clone().map(|r| rng.gen_range(r)); - ( - target.timeline, - Work { - lsn, - gzip: rng.gen_bool(args.gzip_probability), - }, - ) + (target.timeline, Work { lsn }) }; let sender = work_senders.get(&timeline).unwrap(); // TODO: what if this blocks? @@ -216,13 +231,11 @@ async fn main_impl( #[derive(Copy, Clone)] struct Work { lsn: Option, - gzip: bool, } #[instrument(skip_all)] -async fn client( - args: &'static Args, - timeline: TenantTimelineId, +async fn run_worker( + mut client: Box, start_work_barrier: Arc, mut work: tokio::sync::mpsc::Receiver, all_work_done_barrier: Arc, @@ -230,37 +243,14 @@ async fn client( ) { start_work_barrier.wait().await; - let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone()) - .await - .unwrap(); - - while let Some(Work { lsn, gzip }) = work.recv().await { + while let Some(Work { lsn }) = work.recv().await { let start = Instant::now(); - let copy_out_stream = client - .basebackup(&BasebackupRequest { - tenant_id: timeline.tenant_id, - timeline_id: timeline.timeline_id, - lsn, - gzip, - }) - .await - .with_context(|| format!("start basebackup for {timeline}")) - .unwrap(); + let stream = client.basebackup(lsn).await.unwrap(); - use futures::StreamExt; - let size = Arc::new(AtomicUsize::new(0)); - copy_out_stream - .for_each({ - |r| { - let size = Arc::clone(&size); - async move { - let size = Arc::clone(&size); - size.fetch_add(r.unwrap().len(), Ordering::Relaxed); - } - } - }) - .await; - info!("basebackup size is {} bytes", size.load(Ordering::Relaxed)); + let size = futures::io::copy(stream.compat(), &mut tokio::io::sink().compat_write()) + .await + .unwrap(); + info!("basebackup size is {size} bytes"); let elapsed = start.elapsed(); live_stats.inc(); STATS.with(|stats| { @@ -270,3 +260,94 @@ async fn client( all_work_done_barrier.wait().await; } + +/// A basebackup client. This allows switching out the client protocol implementation. +#[async_trait] +trait Client: Send { + async fn basebackup( + &mut self, + lsn: Option, + ) -> anyhow::Result>>; +} + +/// A libpq-based Pageserver client. 
+struct LibpqClient { + inner: pageserver_client::page_service::Client, + ttid: TenantTimelineId, + compression: bool, +} + +impl LibpqClient { + async fn new( + connstring: &str, + ttid: TenantTimelineId, + compression: bool, + ) -> anyhow::Result { + Ok(Self { + inner: pageserver_client::page_service::Client::new(connstring.to_string()).await?, + ttid, + compression, + }) + } +} + +#[async_trait] +impl Client for LibpqClient { + async fn basebackup( + &mut self, + lsn: Option, + ) -> anyhow::Result>> { + let req = BasebackupRequest { + tenant_id: self.ttid.tenant_id, + timeline_id: self.ttid.timeline_id, + lsn, + gzip: self.compression, + }; + let stream = self.inner.basebackup(&req).await?; + Ok(Box::pin(StreamReader::new( + stream.map_err(std::io::Error::other), + ))) + } +} + +/// A gRPC Pageserver client. +struct GrpcClient { + inner: page_api::Client, +} + +impl GrpcClient { + async fn new( + connstring: &str, + ttid: TenantTimelineId, + compression: bool, + ) -> anyhow::Result { + let inner = page_api::Client::new( + connstring.to_string(), + ttid.tenant_id, + ttid.timeline_id, + ShardIndex::unsharded(), + None, + compression.then_some(tonic::codec::CompressionEncoding::Zstd), + ) + .await?; + Ok(Self { inner }) + } +} + +#[async_trait] +impl Client for GrpcClient { + async fn basebackup( + &mut self, + lsn: Option, + ) -> anyhow::Result>> { + let req = page_api::GetBaseBackupRequest { + lsn, + replica: false, + full: false, + }; + let stream = self.inner.get_base_backup(req).await?; + Ok(Box::pin(StreamReader::new( + stream.map_err(std::io::Error::other), + ))) + } +} From 15d079cd41c579555b7a42b7d36429ca63c7d8a2 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 20 Jun 2025 10:31:40 +0200 Subject: [PATCH 084/124] pagebench: improve `getpage-latest-lsn` gRPC support (#12293) This improves `pagebench getpage-latest-lsn` gRPC support by: * Using `page_api::Client`. * Removing `--protocol`, and using the `page-server-connstring` scheme instead. * Adding `--compression` to enable zstd compression. --- Cargo.lock | 3 + pageserver/page_api/Cargo.toml | 2 + pageserver/page_api/src/model.rs | 2 +- pageserver/pagebench/Cargo.toml | 1 + pageserver/pagebench/src/cmd/basebackup.rs | 10 +- .../pagebench/src/cmd/getpage_latest_lsn.rs | 115 ++++++++++-------- 6 files changed, 79 insertions(+), 54 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5ab26b02fa..8cc51350ef 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4255,6 +4255,7 @@ dependencies = [ "tokio-util", "tonic 0.13.1", "tracing", + "url", "utils", "workspace_hack", ] @@ -4472,6 +4473,8 @@ dependencies = [ "pageserver_api", "postgres_ffi", "prost 0.13.5", + "strum", + "strum_macros", "thiserror 1.0.69", "tokio", "tonic 0.13.1", diff --git a/pageserver/page_api/Cargo.toml b/pageserver/page_api/Cargo.toml index 8b13b9e1db..c5283c2b09 100644 --- a/pageserver/page_api/Cargo.toml +++ b/pageserver/page_api/Cargo.toml @@ -11,6 +11,8 @@ futures.workspace = true pageserver_api.workspace = true postgres_ffi.workspace = true prost.workspace = true +strum.workspace = true +strum_macros.workspace = true thiserror.workspace = true tokio.workspace = true tonic.workspace = true diff --git a/pageserver/page_api/src/model.rs b/pageserver/page_api/src/model.rs index ef7f89473f..6efa742799 100644 --- a/pageserver/page_api/src/model.rs +++ b/pageserver/page_api/src/model.rs @@ -459,7 +459,7 @@ impl GetPageResponse { /// These are effectively equivalent to gRPC statuses. 
However, we use a bidirectional stream /// (potentially shared by many backends), and a gRPC status response would terminate the stream so /// we send GetPageResponse messages with these codes instead. -#[derive(Clone, Copy, Debug)] +#[derive(Clone, Copy, Debug, PartialEq, strum_macros::Display)] pub enum GetPageStatusCode { /// Unknown status. For forwards compatibility: used when an older client version receives a new /// status code from a newer server version. diff --git a/pageserver/pagebench/Cargo.toml b/pageserver/pagebench/Cargo.toml index 5e4af88e69..f5dfc0db25 100644 --- a/pageserver/pagebench/Cargo.toml +++ b/pageserver/pagebench/Cargo.toml @@ -25,6 +25,7 @@ tokio.workspace = true tokio-stream.workspace = true tokio-util.workspace = true tonic.workspace = true +url.workspace = true pageserver_client.workspace = true pageserver_api.workspace = true diff --git a/pageserver/pagebench/src/cmd/basebackup.rs b/pageserver/pagebench/src/cmd/basebackup.rs index 8015db528d..e028174c1d 100644 --- a/pageserver/pagebench/src/cmd/basebackup.rs +++ b/pageserver/pagebench/src/cmd/basebackup.rs @@ -13,7 +13,6 @@ use pageserver_client::mgmt_api::ForceAwaitLogicalSize; use pageserver_client::page_service::BasebackupRequest; use pageserver_page_api as page_api; use rand::prelude::*; -use reqwest::Url; use tokio::io::AsyncRead; use tokio::sync::Barrier; use tokio::task::JoinSet; @@ -21,6 +20,7 @@ use tokio_util::compat::{TokioAsyncReadCompatExt as _, TokioAsyncWriteCompatExt use tokio_util::io::StreamReader; use tonic::async_trait; use tracing::{info, instrument}; +use url::Url; use utils::id::TenantTimelineId; use utils::lsn::Lsn; use utils::shard::ShardIndex; @@ -156,12 +156,16 @@ async fn main_impl( let mut work_senders = HashMap::new(); let mut tasks = Vec::new(); - let connurl = Url::parse(&args.page_service_connstring)?; + let scheme = match Url::parse(&args.page_service_connstring) { + Ok(url) => url.scheme().to_lowercase().to_string(), + Err(url::ParseError::RelativeUrlWithoutBase) => "postgresql".to_string(), + Err(err) => return Err(anyhow!("invalid connstring: {err}")), + }; for &tl in &timelines { let (sender, receiver) = tokio::sync::mpsc::channel(1); // TODO: not sure what the implications of this are work_senders.insert(tl, sender); - let client: Box = match connurl.scheme() { + let client: Box = match scheme.as_str() { "postgresql" | "postgres" => Box::new( LibpqClient::new(&args.page_service_connstring, tl, !args.no_compression).await?, ), diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index 3a68a77279..a297819e9b 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -10,33 +10,31 @@ use anyhow::Context; use async_trait::async_trait; use bytes::Bytes; use camino::Utf8PathBuf; +use futures::{Stream, StreamExt as _}; use pageserver_api::key::Key; use pageserver_api::keyspace::KeySpaceAccum; use pageserver_api::pagestream_api::{PagestreamGetPageRequest, PagestreamRequest}; use pageserver_api::reltag::RelTag; use pageserver_api::shard::TenantShardId; -use pageserver_page_api::proto; +use pageserver_page_api as page_api; use rand::prelude::*; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::info; +use url::Url; use utils::id::TenantTimelineId; use utils::lsn::Lsn; +use utils::shard::ShardIndex; use crate::util::tokio_thread_local_stats::AllThreadLocalStats; use crate::util::{request_stats, tokio_thread_local_stats}; 
-#[derive(clap::ValueEnum, Clone, Debug)] -enum Protocol { - Libpq, - Grpc, -} - /// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace. #[derive(clap::Parser)] pub(crate) struct Args { #[clap(long, default_value = "http://localhost:9898")] mgmt_api_endpoint: String, + /// Pageserver connection string. Supports postgresql:// and grpc:// protocols. #[clap(long, default_value = "postgres://postgres@localhost:64000")] page_service_connstring: String, #[clap(long)] @@ -45,8 +43,9 @@ pub(crate) struct Args { num_clients: NonZeroUsize, #[clap(long)] runtime: Option, - #[clap(long, value_enum, default_value = "libpq")] - protocol: Protocol, + /// If true, enable compression (only for gRPC). + #[clap(long)] + compression: bool, /// Each client sends requests at the given rate. /// /// If a request takes too long and we should be issuing a new request already, @@ -325,18 +324,32 @@ async fn main_impl( .unwrap(); Box::pin(async move { - let client: Box = match args.protocol { - Protocol::Libpq => Box::new( - LibpqClient::new(args.page_service_connstring.clone(), worker_id.timeline) - .await - .unwrap(), + let scheme = match Url::parse(&args.page_service_connstring) { + Ok(url) => url.scheme().to_lowercase().to_string(), + Err(url::ParseError::RelativeUrlWithoutBase) => "postgresql".to_string(), + Err(err) => panic!("invalid connstring: {err}"), + }; + let client: Box = match scheme.as_str() { + "postgresql" | "postgres" => { + assert!(!args.compression, "libpq does not support compression"); + Box::new( + LibpqClient::new(&args.page_service_connstring, worker_id.timeline) + .await + .unwrap(), + ) + } + + "grpc" => Box::new( + GrpcClient::new( + &args.page_service_connstring, + worker_id.timeline, + args.compression, + ) + .await + .unwrap(), ), - Protocol::Grpc => Box::new( - GrpcClient::new(args.page_service_connstring.clone(), worker_id.timeline) - .await - .unwrap(), - ), + scheme => panic!("unsupported scheme {scheme}"), }; run_worker(args, client, ss, cancel, rps_period, ranges, weights).await }) @@ -543,8 +556,8 @@ struct LibpqClient { } impl LibpqClient { - async fn new(connstring: String, ttid: TenantTimelineId) -> anyhow::Result { - let inner = pageserver_client::page_service::Client::new(connstring) + async fn new(connstring: &str, ttid: TenantTimelineId) -> anyhow::Result { + let inner = pageserver_client::page_service::Client::new(connstring.to_string()) .await? .pagestream(ttid.tenant_id, ttid.timeline_id) .await?; @@ -600,34 +613,36 @@ impl Client for LibpqClient { } } -/// A gRPC client using the raw, no-frills gRPC client. +/// A gRPC Pageserver client. struct GrpcClient { - req_tx: tokio::sync::mpsc::Sender, - resp_rx: tonic::Streaming, + req_tx: tokio::sync::mpsc::Sender, + resp_rx: Pin> + Send>>, } impl GrpcClient { - async fn new(connstring: String, ttid: TenantTimelineId) -> anyhow::Result { - let mut client = pageserver_page_api::proto::PageServiceClient::connect(connstring).await?; + async fn new( + connstring: &str, + ttid: TenantTimelineId, + compression: bool, + ) -> anyhow::Result { + let mut client = page_api::Client::new( + connstring.to_string(), + ttid.tenant_id, + ttid.timeline_id, + ShardIndex::unsharded(), + None, + compression.then_some(tonic::codec::CompressionEncoding::Zstd), + ) + .await?; // The channel has a buffer size of 1, since 0 is not allowed. It does not matter, since the // benchmark will control the queue depth (i.e. in-flight requests) anyway, and requests are // buffered by Tonic and the OS too. 
let (req_tx, req_rx) = tokio::sync::mpsc::channel(1); let req_stream = tokio_stream::wrappers::ReceiverStream::new(req_rx); - let mut req = tonic::Request::new(req_stream); - let metadata = req.metadata_mut(); - metadata.insert("neon-tenant-id", ttid.tenant_id.to_string().try_into()?); - metadata.insert("neon-timeline-id", ttid.timeline_id.to_string().try_into()?); - metadata.insert("neon-shard-id", "0000".try_into()?); + let resp_rx = Box::pin(client.get_pages(req_stream).await?); - let resp = client.get_pages(req).await?; - let resp_stream = resp.into_inner(); - - Ok(Self { - req_tx, - resp_rx: resp_stream, - }) + Ok(Self { req_tx, resp_rx }) } } @@ -641,27 +656,27 @@ impl Client for GrpcClient { rel: RelTag, blks: Vec, ) -> anyhow::Result<()> { - let req = proto::GetPageRequest { + let req = page_api::GetPageRequest { request_id: req_id, - request_class: proto::GetPageClass::Normal as i32, - read_lsn: Some(proto::ReadLsn { - request_lsn: req_lsn.0, - not_modified_since_lsn: mod_lsn.0, - }), - rel: Some(rel.into()), - block_number: blks, + request_class: page_api::GetPageClass::Normal, + read_lsn: page_api::ReadLsn { + request_lsn: req_lsn, + not_modified_since_lsn: Some(mod_lsn), + }, + rel, + block_numbers: blks, }; self.req_tx.send(req).await?; Ok(()) } async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec)> { - let resp = self.resp_rx.message().await?.unwrap(); + let resp = self.resp_rx.next().await.unwrap().unwrap(); anyhow::ensure!( - resp.status_code == proto::GetPageStatusCode::Ok as i32, + resp.status_code == page_api::GetPageStatusCode::Ok, "unexpected status code: {}", - resp.status_code + resp.status_code, ); - Ok((resp.request_id, resp.page_image)) + Ok((resp.request_id, resp.page_images)) } } From 8b197de7ffcb2b96cfb4b237e5f220f11e4770d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 20 Jun 2025 12:33:11 +0200 Subject: [PATCH 085/124] Increase upload timeout for test_tenant_s3_restore (#12297) Increase the upload timeout of the test to avoid hitting timeouts (which we sometimes do). Fixes https://github.com/neondatabase/neon/issues/12212 --- test_runner/regress/test_s3_restore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/regress/test_s3_restore.py b/test_runner/regress/test_s3_restore.py index 082808f9ff..2d7be1f9d1 100644 --- a/test_runner/regress/test_s3_restore.py +++ b/test_runner/regress/test_s3_restore.py @@ -74,7 +74,7 @@ def test_tenant_s3_restore( last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) last_flush_lsns.append(last_flush_lsn) ps_http.timeline_checkpoint(tenant_id, timeline_id) - wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn) + wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn, timeout=60) log.info(f"{timeline} timeline {timeline_id} {last_flush_lsn=}") parent = timeline From a298d2c29b4b68e07cdfb50faa1a74b6df599d45 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 20 Jun 2025 12:48:01 +0100 Subject: [PATCH 086/124] [proxy] replace the batch cancellation queue, shorten the TTL for cancel keys (#11943) See #11942 Idea: * if connections are short lived, they can get enqueued and then also remove themselves later if they never made it to redis. This reduces the load on the queue. * short lived connections (<10m, most?) will only issue 1 command, we remove the delete command and rely on ttl. 
* we can enqueue as many commands as we want, as we can always cancel the enqueue, thanks to the ~~intrusive linked lists~~ `BTreeMap`. --- .../proxy/tokio-postgres2/src/cancel_query.rs | 14 +- .../proxy/tokio-postgres2/src/cancel_token.rs | 21 +- libs/proxy/tokio-postgres2/src/client.rs | 11 +- libs/proxy/tokio-postgres2/src/lib.rs | 2 +- proxy/src/batch.rs | 146 +++++++ proxy/src/binary/local_proxy.rs | 2 +- proxy/src/binary/proxy.rs | 29 +- proxy/src/cancellation.rs | 405 ++++++------------ proxy/src/compute/mod.rs | 12 +- proxy/src/console_redirect_proxy.rs | 31 +- proxy/src/lib.rs | 1 + proxy/src/pglb/passthrough.rs | 46 +- proxy/src/proxy/mod.rs | 30 +- proxy/src/redis/keys.rs | 49 +-- proxy/src/redis/kv_ops.rs | 36 +- proxy/src/serverless/websocket.rs | 2 +- 16 files changed, 410 insertions(+), 427 deletions(-) create mode 100644 proxy/src/batch.rs diff --git a/libs/proxy/tokio-postgres2/src/cancel_query.rs b/libs/proxy/tokio-postgres2/src/cancel_query.rs index 4c2a5ef50f..94fbf333ed 100644 --- a/libs/proxy/tokio-postgres2/src/cancel_query.rs +++ b/libs/proxy/tokio-postgres2/src/cancel_query.rs @@ -1,5 +1,3 @@ -use std::io; - use tokio::net::TcpStream; use crate::client::SocketConfig; @@ -8,7 +6,7 @@ use crate::tls::MakeTlsConnect; use crate::{Error, cancel_query_raw, connect_socket}; pub(crate) async fn cancel_query( - config: Option, + config: SocketConfig, ssl_mode: SslMode, tls: T, process_id: i32, @@ -17,16 +15,6 @@ pub(crate) async fn cancel_query( where T: MakeTlsConnect, { - let config = match config { - Some(config) => config, - None => { - return Err(Error::connect(io::Error::new( - io::ErrorKind::InvalidInput, - "unknown host", - ))); - } - }; - let hostname = match &config.host { Host::Tcp(host) => &**host, }; diff --git a/libs/proxy/tokio-postgres2/src/cancel_token.rs b/libs/proxy/tokio-postgres2/src/cancel_token.rs index f6526395ee..c5566b4ad9 100644 --- a/libs/proxy/tokio-postgres2/src/cancel_token.rs +++ b/libs/proxy/tokio-postgres2/src/cancel_token.rs @@ -7,11 +7,16 @@ use crate::config::SslMode; use crate::tls::{MakeTlsConnect, TlsConnect}; use crate::{Error, cancel_query, cancel_query_raw}; -/// The capability to request cancellation of in-progress queries on a -/// connection. -#[derive(Clone, Serialize, Deserialize)] +/// A cancellation token that allows easy cancellation of a query. +#[derive(Clone)] pub struct CancelToken { - pub socket_config: Option, + pub socket_config: SocketConfig, + pub raw: RawCancelToken, +} + +/// A raw cancellation token that allows cancellation of a query, given a fresh connection to postgres. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RawCancelToken { pub ssl_mode: SslMode, pub process_id: i32, pub secret_key: i32, @@ -36,14 +41,16 @@ impl CancelToken { { cancel_query::cancel_query( self.socket_config.clone(), - self.ssl_mode, + self.raw.ssl_mode, tls, - self.process_id, - self.secret_key, + self.raw.process_id, + self.raw.secret_key, ) .await } +} +impl RawCancelToken { /// Like `cancel_query`, but uses a stream which is already connected to the server rather than opening a new /// connection itself. 
pub async fn cancel_query_raw(&self, stream: S, tls: T) -> Result<(), Error> diff --git a/libs/proxy/tokio-postgres2/src/client.rs b/libs/proxy/tokio-postgres2/src/client.rs index a7edfc076a..41b22e35b6 100644 --- a/libs/proxy/tokio-postgres2/src/client.rs +++ b/libs/proxy/tokio-postgres2/src/client.rs @@ -12,6 +12,7 @@ use postgres_protocol2::message::frontend; use serde::{Deserialize, Serialize}; use tokio::sync::mpsc; +use crate::cancel_token::RawCancelToken; use crate::codec::{BackendMessages, FrontendMessage}; use crate::config::{Host, SslMode}; use crate::query::RowStream; @@ -331,10 +332,12 @@ impl Client { /// connection associated with this client. pub fn cancel_token(&self) -> CancelToken { CancelToken { - socket_config: Some(self.socket_config.clone()), - ssl_mode: self.ssl_mode, - process_id: self.process_id, - secret_key: self.secret_key, + socket_config: self.socket_config.clone(), + raw: RawCancelToken { + ssl_mode: self.ssl_mode, + process_id: self.process_id, + secret_key: self.secret_key, + }, } } diff --git a/libs/proxy/tokio-postgres2/src/lib.rs b/libs/proxy/tokio-postgres2/src/lib.rs index 9556070ed5..791c93b972 100644 --- a/libs/proxy/tokio-postgres2/src/lib.rs +++ b/libs/proxy/tokio-postgres2/src/lib.rs @@ -3,7 +3,7 @@ use postgres_protocol2::message::backend::ReadyForQueryBody; -pub use crate::cancel_token::CancelToken; +pub use crate::cancel_token::{CancelToken, RawCancelToken}; pub use crate::client::{Client, SocketConfig}; pub use crate::config::Config; pub use crate::connect_raw::RawConnection; diff --git a/proxy/src/batch.rs b/proxy/src/batch.rs new file mode 100644 index 0000000000..61bdf2b747 --- /dev/null +++ b/proxy/src/batch.rs @@ -0,0 +1,146 @@ +//! Batch processing system based on intrusive linked lists. +//! +//! Enqueuing a batch job requires no allocations, with +//! direct support for cancelling jobs early. +use std::collections::BTreeMap; +use std::pin::pin; +use std::sync::Mutex; + +use futures::future::Either; +use scopeguard::ScopeGuard; +use tokio::sync::oneshot::error::TryRecvError; + +use crate::ext::LockExt; + +pub trait QueueProcessing: Send + 'static { + type Req: Send + 'static; + type Res: Send; + + /// Get the desired batch size. + fn batch_size(&self, queue_size: usize) -> usize; + + /// This applies a full batch of events. + /// Must respond with a full batch of replies. + /// + /// If this apply can error, it's expected that errors be forwarded to each Self::Res. + /// + /// Batching does not need to happen atomically. + fn apply(&mut self, req: Vec) -> impl Future> + Send; +} + +pub struct BatchQueue { + processor: tokio::sync::Mutex

, + inner: Mutex>, +} + +struct BatchJob { + req: P::Req, + res: tokio::sync::oneshot::Sender, +} + +impl BatchQueue

{ + pub fn new(p: P) -> Self { + Self { + processor: tokio::sync::Mutex::new(p), + inner: Mutex::new(BatchQueueInner { + version: 0, + queue: BTreeMap::new(), + }), + } + } + + pub async fn call(&self, req: P::Req) -> P::Res { + let (id, mut rx) = self.inner.lock_propagate_poison().register_job(req); + let guard = scopeguard::guard(id, move |id| { + let mut inner = self.inner.lock_propagate_poison(); + if inner.queue.remove(&id).is_some() { + tracing::debug!("batched task cancelled before completion"); + } + }); + + let resp = loop { + // try become the leader, or try wait for success. + let mut processor = match futures::future::select(rx, pin!(self.processor.lock())).await + { + // we got the resp. + Either::Left((resp, _)) => break resp.ok(), + // we are the leader. + Either::Right((p, rx_)) => { + rx = rx_; + p + } + }; + + let (reqs, resps) = self.inner.lock_propagate_poison().get_batch(&processor); + + // apply a batch. + let values = processor.apply(reqs).await; + + // send response values. + for (tx, value) in std::iter::zip(resps, values) { + // sender hung up but that's fine. + drop(tx.send(value)); + } + + match rx.try_recv() { + Ok(resp) => break Some(resp), + Err(TryRecvError::Closed) => break None, + // edge case - there was a race condition where + // we became the leader but were not in the batch. + // + // Example: + // thread 1: register job id=1 + // thread 2: register job id=2 + // thread 2: processor.lock().await + // thread 1: processor.lock().await + // thread 2: becomes leader, batch_size=1, jobs=[1]. + Err(TryRecvError::Empty) => {} + } + }; + + // already removed. + ScopeGuard::into_inner(guard); + + resp.expect("no response found. batch processer should not panic") + } +} + +struct BatchQueueInner { + version: u64, + queue: BTreeMap>, +} + +impl BatchQueueInner

{ + fn register_job(&mut self, req: P::Req) -> (u64, tokio::sync::oneshot::Receiver) { + let (tx, rx) = tokio::sync::oneshot::channel(); + + let id = self.version; + + // Overflow concern: + // This is a u64, and we might enqueue 2^16 tasks per second. + // This gives us 2^48 seconds (9 million years). + // Even if this does overflow, it will not break, but some + // jobs with the higher version might never get prioritised. + self.version += 1; + + self.queue.insert(id, BatchJob { req, res: tx }); + + (id, rx) + } + + fn get_batch(&mut self, p: &P) -> (Vec, Vec>) { + let batch_size = p.batch_size(self.queue.len()); + let mut reqs = Vec::with_capacity(batch_size); + let mut resps = Vec::with_capacity(batch_size); + + while reqs.len() < batch_size { + let Some((_, job)) = self.queue.pop_first() else { + break; + }; + reqs.push(job.req); + resps.push(job.res); + } + + (reqs, resps) + } +} diff --git a/proxy/src/binary/local_proxy.rs b/proxy/src/binary/local_proxy.rs index ba10fce7b4..e3be454713 100644 --- a/proxy/src/binary/local_proxy.rs +++ b/proxy/src/binary/local_proxy.rs @@ -201,7 +201,7 @@ pub async fn run() -> anyhow::Result<()> { auth_backend, http_listener, shutdown.clone(), - Arc::new(CancellationHandler::new(&config.connect_to_compute, None)), + Arc::new(CancellationHandler::new(&config.connect_to_compute)), endpoint_rate_limiter, ); diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index 6ab6df5610..9215dbf73f 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -23,7 +23,8 @@ use utils::{project_build_tag, project_git_version}; use crate::auth::backend::jwt::JwkCache; use crate::auth::backend::{ConsoleRedirectBackend, MaybeOwned}; -use crate::cancellation::{CancellationHandler, handle_cancel_messages}; +use crate::batch::BatchQueue; +use crate::cancellation::{CancellationHandler, CancellationProcessor}; use crate::config::{ self, AuthenticationConfig, CacheOptions, ComputeConfig, HttpConfig, ProjectInfoCacheOptions, ProxyConfig, ProxyProtocolV2, remote_storage_from_toml, @@ -392,13 +393,7 @@ pub async fn run() -> anyhow::Result<()> { .as_ref() .map(|redis_publisher| RedisKVClient::new(redis_publisher.clone(), redis_rps_limit)); - // channel size should be higher than redis client limit to avoid blocking - let cancel_ch_size = args.cancellation_ch_size; - let (tx_cancel, rx_cancel) = tokio::sync::mpsc::channel(cancel_ch_size); - let cancellation_handler = Arc::new(CancellationHandler::new( - &config.connect_to_compute, - Some(tx_cancel), - )); + let cancellation_handler = Arc::new(CancellationHandler::new(&config.connect_to_compute)); let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( RateBucketInfo::to_leaky_bucket(&args.endpoint_rps_limit) @@ -530,21 +525,11 @@ pub async fn run() -> anyhow::Result<()> { match redis_kv_client.try_connect().await { Ok(()) => { info!("Connected to Redis KV client"); - maintenance_tasks.spawn(async move { - handle_cancel_messages( - &mut redis_kv_client, - rx_cancel, - args.cancellation_batch_size, - ) - .await?; + cancellation_handler.init_tx(BatchQueue::new(CancellationProcessor { + client: redis_kv_client, + batch_size: args.cancellation_batch_size, + })); - drop(redis_kv_client); - - // `handle_cancel_messages` was terminated due to the tx_cancel - // being dropped. this is not worthy of an error, and this task can only return `Err`, - // so let's wait forever instead. 
- std::future::pending().await - }); break; } Err(e) => { diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index cce4c1d3a0..036f36c7f6 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -1,19 +1,23 @@ +use std::convert::Infallible; use std::net::{IpAddr, SocketAddr}; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; +use std::time::Duration; -use anyhow::{Context, anyhow}; +use anyhow::anyhow; +use futures::FutureExt; use ipnet::{IpNet, Ipv4Net, Ipv6Net}; -use postgres_client::CancelToken; +use postgres_client::RawCancelToken; use postgres_client::tls::MakeTlsConnect; use redis::{Cmd, FromRedisValue, Value}; use serde::{Deserialize, Serialize}; use thiserror::Error; use tokio::net::TcpStream; -use tokio::sync::{mpsc, oneshot}; -use tracing::{debug, error, info, warn}; +use tokio::time::timeout; +use tracing::{debug, error, info}; use crate::auth::AuthError; use crate::auth::backend::ComputeUserInfo; +use crate::batch::{BatchQueue, QueueProcessing}; use crate::config::ComputeConfig; use crate::context::RequestContext; use crate::control_plane::ControlPlaneApi; @@ -27,46 +31,36 @@ use crate::redis::kv_ops::RedisKVClient; type IpSubnetKey = IpNet; -const CANCEL_KEY_TTL: i64 = 1_209_600; // 2 weeks cancellation key expire time +const CANCEL_KEY_TTL: std::time::Duration = std::time::Duration::from_secs(600); +const CANCEL_KEY_REFRESH: std::time::Duration = std::time::Duration::from_secs(570); // Message types for sending through mpsc channel pub enum CancelKeyOp { StoreCancelKey { - key: String, - field: String, - value: String, - resp_tx: Option>>, - _guard: CancelChannelSizeGuard<'static>, - expire: i64, // TTL for key + key: CancelKeyData, + value: Box, + expire: std::time::Duration, }, GetCancelData { - key: String, - resp_tx: oneshot::Sender>>, - _guard: CancelChannelSizeGuard<'static>, - }, - RemoveCancelKey { - key: String, - field: String, - resp_tx: Option>>, - _guard: CancelChannelSizeGuard<'static>, + key: CancelKeyData, }, } pub struct Pipeline { inner: redis::Pipeline, - replies: Vec, + replies: usize, } impl Pipeline { fn with_capacity(n: usize) -> Self { Self { inner: redis::Pipeline::with_capacity(n), - replies: Vec::with_capacity(n), + replies: 0, } } - async fn execute(&mut self, client: &mut RedisKVClient) { - let responses = self.replies.len(); + async fn execute(self, client: &mut RedisKVClient) -> Vec> { + let responses = self.replies; let batch_size = self.inner.len(); match client.query(&self.inner).await { @@ -76,176 +70,73 @@ impl Pipeline { batch_size, responses, "successfully completed cancellation jobs", ); - for (value, reply) in std::iter::zip(values, self.replies.drain(..)) { - reply.send_value(value); - } + values.into_iter().map(Ok).collect() } Ok(value) => { error!(batch_size, ?value, "unexpected redis return value"); - for reply in self.replies.drain(..) { - reply.send_err(anyhow!("incorrect response type from redis")); - } + std::iter::repeat_with(|| Err(anyhow!("incorrect response type from redis"))) + .take(responses) + .collect() } Err(err) => { - for reply in self.replies.drain(..) 
{ - reply.send_err(anyhow!("could not send cmd to redis: {err}")); - } + std::iter::repeat_with(|| Err(anyhow!("could not send cmd to redis: {err}"))) + .take(responses) + .collect() } } - - self.inner.clear(); - self.replies.clear(); } - fn add_command_with_reply(&mut self, cmd: Cmd, reply: CancelReplyOp) { + fn add_command_with_reply(&mut self, cmd: Cmd) { self.inner.add_command(cmd); - self.replies.push(reply); + self.replies += 1; } fn add_command_no_reply(&mut self, cmd: Cmd) { self.inner.add_command(cmd).ignore(); } - - fn add_command(&mut self, cmd: Cmd, reply: Option) { - match reply { - Some(reply) => self.add_command_with_reply(cmd, reply), - None => self.add_command_no_reply(cmd), - } - } } impl CancelKeyOp { - fn register(self, pipe: &mut Pipeline) { + fn register(&self, pipe: &mut Pipeline) { #[allow(clippy::used_underscore_binding)] match self { - CancelKeyOp::StoreCancelKey { - key, - field, - value, - resp_tx, - _guard, - expire, - } => { - let reply = - resp_tx.map(|resp_tx| CancelReplyOp::StoreCancelKey { resp_tx, _guard }); - pipe.add_command(Cmd::hset(&key, field, value), reply); - pipe.add_command_no_reply(Cmd::expire(key, expire)); + CancelKeyOp::StoreCancelKey { key, value, expire } => { + let key = KeyPrefix::Cancel(*key).build_redis_key(); + pipe.add_command_with_reply(Cmd::hset(&key, "data", &**value)); + pipe.add_command_no_reply(Cmd::expire(&key, expire.as_secs() as i64)); } - CancelKeyOp::GetCancelData { - key, - resp_tx, - _guard, - } => { - let reply = CancelReplyOp::GetCancelData { resp_tx, _guard }; - pipe.add_command_with_reply(Cmd::hgetall(key), reply); - } - CancelKeyOp::RemoveCancelKey { - key, - field, - resp_tx, - _guard, - } => { - let reply = - resp_tx.map(|resp_tx| CancelReplyOp::RemoveCancelKey { resp_tx, _guard }); - pipe.add_command(Cmd::hdel(key, field), reply); + CancelKeyOp::GetCancelData { key } => { + let key = KeyPrefix::Cancel(*key).build_redis_key(); + pipe.add_command_with_reply(Cmd::hget(key, "data")); } } } } -// Message types for sending through mpsc channel -pub enum CancelReplyOp { - StoreCancelKey { - resp_tx: oneshot::Sender>, - _guard: CancelChannelSizeGuard<'static>, - }, - GetCancelData { - resp_tx: oneshot::Sender>>, - _guard: CancelChannelSizeGuard<'static>, - }, - RemoveCancelKey { - resp_tx: oneshot::Sender>, - _guard: CancelChannelSizeGuard<'static>, - }, +pub struct CancellationProcessor { + pub client: RedisKVClient, + pub batch_size: usize, } -impl CancelReplyOp { - fn send_err(self, e: anyhow::Error) { - match self { - CancelReplyOp::StoreCancelKey { resp_tx, _guard } => { - resp_tx - .send(Err(e)) - .inspect_err(|_| tracing::debug!("could not send reply")) - .ok(); - } - CancelReplyOp::GetCancelData { resp_tx, _guard } => { - resp_tx - .send(Err(e)) - .inspect_err(|_| tracing::debug!("could not send reply")) - .ok(); - } - CancelReplyOp::RemoveCancelKey { resp_tx, _guard } => { - resp_tx - .send(Err(e)) - .inspect_err(|_| tracing::debug!("could not send reply")) - .ok(); - } - } +impl QueueProcessing for CancellationProcessor { + type Req = (CancelChannelSizeGuard<'static>, CancelKeyOp); + type Res = anyhow::Result; + + fn batch_size(&self, _queue_size: usize) -> usize { + self.batch_size } - fn send_value(self, v: redis::Value) { - match self { - CancelReplyOp::StoreCancelKey { resp_tx, _guard } => { - let send = - FromRedisValue::from_owned_redis_value(v).context("could not parse value"); - resp_tx - .send(send) - .inspect_err(|_| tracing::debug!("could not send reply")) - .ok(); - } - CancelReplyOp::GetCancelData 
{ resp_tx, _guard } => { - let send = - FromRedisValue::from_owned_redis_value(v).context("could not parse value"); - resp_tx - .send(send) - .inspect_err(|_| tracing::debug!("could not send reply")) - .ok(); - } - CancelReplyOp::RemoveCancelKey { resp_tx, _guard } => { - let send = - FromRedisValue::from_owned_redis_value(v).context("could not parse value"); - resp_tx - .send(send) - .inspect_err(|_| tracing::debug!("could not send reply")) - .ok(); - } - } - } -} - -// Running as a separate task to accept messages through the rx channel -pub async fn handle_cancel_messages( - client: &mut RedisKVClient, - mut rx: mpsc::Receiver, - batch_size: usize, -) -> anyhow::Result<()> { - let mut batch = Vec::with_capacity(batch_size); - let mut pipeline = Pipeline::with_capacity(batch_size); - - loop { - if rx.recv_many(&mut batch, batch_size).await == 0 { - warn!("shutting down cancellation queue"); - break Ok(()); - } + async fn apply(&mut self, batch: Vec) -> Vec { + let mut pipeline = Pipeline::with_capacity(batch.len()); let batch_size = batch.len(); debug!(batch_size, "running cancellation jobs"); - for msg in batch.drain(..) { - msg.register(&mut pipeline); + for (_, op) in &batch { + op.register(&mut pipeline); } - pipeline.execute(client).await; + pipeline.execute(&mut self.client).await } } @@ -256,7 +147,7 @@ pub struct CancellationHandler { compute_config: &'static ComputeConfig, // rate limiter of cancellation requests limiter: Arc>>, - tx: Option>, // send messages to the redis KV client task + tx: OnceLock>, // send messages to the redis KV client task } #[derive(Debug, Error)] @@ -296,13 +187,10 @@ impl ReportableError for CancelError { } impl CancellationHandler { - pub fn new( - compute_config: &'static ComputeConfig, - tx: Option>, - ) -> Self { + pub fn new(compute_config: &'static ComputeConfig) -> Self { Self { compute_config, - tx, + tx: OnceLock::new(), limiter: Arc::new(std::sync::Mutex::new( LeakyBucketRateLimiter::::new_with_shards( LeakyBucketRateLimiter::::DEFAULT, @@ -312,7 +200,14 @@ impl CancellationHandler { } } - pub(crate) fn get_key(self: &Arc) -> Session { + pub fn init_tx(&self, queue: BatchQueue) { + self.tx + .set(queue) + .map_err(|_| {}) + .expect("cancellation queue should be registered once"); + } + + pub(crate) fn get_key(self: Arc) -> Session { // we intentionally generate a random "backend pid" and "secret key" here. // we use the corresponding u64 as an identifier for the // actual endpoint+pid+secret for postgres/pgbouncer. 
@@ -322,14 +217,10 @@ impl CancellationHandler { let key: CancelKeyData = rand::random(); - let prefix_key: KeyPrefix = KeyPrefix::Cancel(key); - let redis_key = prefix_key.build_redis_key(); - debug!("registered new query cancellation key {key}"); Session { key, - redis_key, - cancellation_handler: Arc::clone(self), + cancellation_handler: self, } } @@ -337,62 +228,43 @@ impl CancellationHandler { &self, key: CancelKeyData, ) -> Result, CancelError> { - let prefix_key: KeyPrefix = KeyPrefix::Cancel(key); - let redis_key = prefix_key.build_redis_key(); + let guard = Metrics::get() + .proxy + .cancel_channel_size + .guard(RedisMsgKind::HGet); + let op = CancelKeyOp::GetCancelData { key }; - let (resp_tx, resp_rx) = tokio::sync::oneshot::channel(); - let op = CancelKeyOp::GetCancelData { - key: redis_key, - resp_tx, - _guard: Metrics::get() - .proxy - .cancel_channel_size - .guard(RedisMsgKind::HGetAll), - }; - - let Some(tx) = &self.tx else { + let Some(tx) = self.tx.get() else { tracing::warn!("cancellation handler is not available"); return Err(CancelError::InternalError); }; - tx.try_send(op) + const TIMEOUT: Duration = Duration::from_secs(5); + let result = timeout(TIMEOUT, tx.call((guard, op))) + .await + .map_err(|_| { + tracing::warn!("timed out waiting to receive GetCancelData response"); + CancelError::RateLimit + })? .map_err(|e| { - tracing::warn!("failed to send GetCancelData for {key}: {e}"); - }) - .map_err(|()| CancelError::InternalError)?; + tracing::warn!("failed to receive GetCancelData response: {e}"); + CancelError::InternalError + })?; - let result = resp_rx.await.map_err(|e| { + let cancel_state_str = String::from_owned_redis_value(result).map_err(|e| { tracing::warn!("failed to receive GetCancelData response: {e}"); CancelError::InternalError })?; - let cancel_state_str: Option = match result { - Ok(mut state) => { - if state.len() == 1 { - Some(state.remove(0).1) - } else { - tracing::warn!("unexpected number of entries in cancel state: {state:?}"); - return Err(CancelError::InternalError); - } - } - Err(e) => { - tracing::warn!("failed to receive cancel state from redis: {e}"); - return Err(CancelError::InternalError); - } - }; + let cancel_closure: CancelClosure = + serde_json::from_str(&cancel_state_str).map_err(|e| { + tracing::warn!("failed to deserialize cancel state: {e}"); + CancelError::InternalError + })?; - let cancel_state: Option = match cancel_state_str { - Some(state) => { - let cancel_closure: CancelClosure = serde_json::from_str(&state).map_err(|e| { - tracing::warn!("failed to deserialize cancel state: {e}"); - CancelError::InternalError - })?; - Some(cancel_closure) - } - None => None, - }; - Ok(cancel_state) + Ok(Some(cancel_closure)) } + /// Try to cancel a running query for the corresponding connection. /// If the cancellation key is not found, it will be published to Redis. /// check_allowed - if true, check if the IP is allowed to cancel the query. @@ -467,10 +339,10 @@ impl CancellationHandler { /// This should've been a [`std::future::Future`], but /// it's impossible to name a type of an unboxed future /// (we'd need something like `#![feature(type_alias_impl_trait)]`). 
-#[derive(Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct CancelClosure { socket_addr: SocketAddr, - cancel_token: CancelToken, + cancel_token: RawCancelToken, hostname: String, // for pg_sni router user_info: ComputeUserInfo, } @@ -478,7 +350,7 @@ pub struct CancelClosure { impl CancelClosure { pub(crate) fn new( socket_addr: SocketAddr, - cancel_token: CancelToken, + cancel_token: RawCancelToken, hostname: String, user_info: ComputeUserInfo, ) -> Self { @@ -491,7 +363,7 @@ impl CancelClosure { } /// Cancels the query running on user's compute node. pub(crate) async fn try_cancel_query( - self, + &self, compute_config: &ComputeConfig, ) -> Result<(), CancelError> { let socket = TcpStream::connect(self.socket_addr).await?; @@ -512,7 +384,6 @@ impl CancelClosure { pub(crate) struct Session { /// The user-facing key identifying this session. key: CancelKeyData, - redis_key: String, cancellation_handler: Arc, } @@ -521,60 +392,66 @@ impl Session { &self.key } - // Send the store key op to the cancellation handler and set TTL for the key - pub(crate) fn write_cancel_key( + /// Ensure the cancel key is continously refreshed, + /// but stop when the channel is dropped. + pub(crate) async fn maintain_cancel_key( &self, - cancel_closure: CancelClosure, - ) -> Result<(), CancelError> { - let Some(tx) = &self.cancellation_handler.tx else { - tracing::warn!("cancellation handler is not available"); - return Err(CancelError::InternalError); - }; + session_id: uuid::Uuid, + cancel: tokio::sync::oneshot::Receiver, + cancel_closure: &CancelClosure, + compute_config: &ComputeConfig, + ) { + futures::future::select( + std::pin::pin!(self.maintain_redis_cancel_key(cancel_closure)), + cancel, + ) + .await; - let closure_json = serde_json::to_string(&cancel_closure).map_err(|e| { - tracing::warn!("failed to serialize cancel closure: {e}"); - CancelError::InternalError - })?; - - let op = CancelKeyOp::StoreCancelKey { - key: self.redis_key.clone(), - field: "data".to_string(), - value: closure_json, - resp_tx: None, - _guard: Metrics::get() - .proxy - .cancel_channel_size - .guard(RedisMsgKind::HSet), - expire: CANCEL_KEY_TTL, - }; - - let _ = tx.try_send(op).map_err(|e| { - let key = self.key; - tracing::warn!("failed to send StoreCancelKey for {key}: {e}"); - }); - Ok(()) + if let Err(err) = cancel_closure + .try_cancel_query(compute_config) + .boxed() + .await + { + tracing::warn!( + ?session_id, + ?err, + "could not cancel the query in the database" + ); + } } - pub(crate) fn remove_cancel_key(&self) -> Result<(), CancelError> { - let Some(tx) = &self.cancellation_handler.tx else { + // Ensure the cancel key is continously refreshed. + async fn maintain_redis_cancel_key(&self, cancel_closure: &CancelClosure) -> ! { + let Some(tx) = self.cancellation_handler.tx.get() else { tracing::warn!("cancellation handler is not available"); - return Err(CancelError::InternalError); + // don't exit, as we only want to exit if cancelled externally. 
+ std::future::pending().await }; - let op = CancelKeyOp::RemoveCancelKey { - key: self.redis_key.clone(), - field: "data".to_string(), - resp_tx: None, - _guard: Metrics::get() + let closure_json = serde_json::to_string(&cancel_closure) + .expect("serialising to json string should not fail") + .into_boxed_str(); + + loop { + let guard = Metrics::get() .proxy .cancel_channel_size - .guard(RedisMsgKind::HDel), - }; + .guard(RedisMsgKind::HSet); + let op = CancelKeyOp::StoreCancelKey { + key: self.key, + value: closure_json.clone(), + expire: CANCEL_KEY_TTL, + }; - let _ = tx.try_send(op).map_err(|e| { - let key = self.key; - tracing::warn!("failed to send RemoveCancelKey for {key}: {e}"); - }); - Ok(()) + tracing::debug!( + src=%self.key, + dest=?cancel_closure.cancel_token, + "registering cancellation key" + ); + + if tx.call((guard, op)).await.is_ok() { + tokio::time::sleep(CANCEL_KEY_REFRESH).await; + } + } } } diff --git a/proxy/src/compute/mod.rs b/proxy/src/compute/mod.rs index aae1fea07d..5dd264b35e 100644 --- a/proxy/src/compute/mod.rs +++ b/proxy/src/compute/mod.rs @@ -9,7 +9,7 @@ use itertools::Itertools; use postgres_client::config::{AuthKeys, SslMode}; use postgres_client::maybe_tls_stream::MaybeTlsStream; use postgres_client::tls::MakeTlsConnect; -use postgres_client::{CancelToken, NoTls, RawConnection}; +use postgres_client::{NoTls, RawCancelToken, RawConnection}; use postgres_protocol::message::backend::NoticeResponseBody; use thiserror::Error; use tokio::net::{TcpStream, lookup_host}; @@ -265,7 +265,8 @@ impl ConnectInfo { } } -type RustlsStream = >::Stream; +pub type RustlsStream = >::Stream; +pub type MaybeRustlsStream = MaybeTlsStream; pub(crate) struct PostgresConnection { /// Socket connected to a compute node. @@ -279,7 +280,7 @@ pub(crate) struct PostgresConnection { /// Notices received from compute after authenticating pub(crate) delayed_notice: Vec, - _guage: NumDbConnectionsGuard<'static>, + pub(crate) guage: NumDbConnectionsGuard<'static>, } impl ConnectInfo { @@ -327,8 +328,7 @@ impl ConnectInfo { // Yet another reason to rework the connection establishing code. 
let cancel_closure = CancelClosure::new( socket_addr, - CancelToken { - socket_config: None, + RawCancelToken { ssl_mode: self.ssl_mode, process_id, secret_key, @@ -343,7 +343,7 @@ impl ConnectInfo { delayed_notice, cancel_closure, aux, - _guage: Metrics::get().proxy.db_connections.guard(ctx.protocol()), + guage: Metrics::get().proxy.db_connections.guard(ctx.protocol()), }; Ok(connection) diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 5331ea41fd..89adfc9049 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -120,7 +120,7 @@ pub async fn task_main( Ok(Some(p)) => { ctx.set_success(); let _disconnect = ctx.log_connect(); - match p.proxy_pass(&config.connect_to_compute).await { + match p.proxy_pass().await { Ok(()) => {} Err(ErrorSource::Client(e)) => { error!( @@ -232,22 +232,35 @@ pub(crate) async fn handle_client( .or_else(|e| async { Err(stream.throw_error(e, Some(ctx)).await) }) .await?; - let cancellation_handler_clone = Arc::clone(&cancellation_handler); - let session = cancellation_handler_clone.get_key(); - - session.write_cancel_key(node.cancel_closure.clone())?; + let session = cancellation_handler.get_key(); prepare_client_connection(&node, *session.key(), &mut stream); let stream = stream.flush_and_into_inner().await?; + let session_id = ctx.session_id(); + let (cancel_on_shutdown, cancel) = tokio::sync::oneshot::channel(); + tokio::spawn(async move { + session + .maintain_cancel_key( + session_id, + cancel, + &node.cancel_closure, + &config.connect_to_compute, + ) + .await; + }); + Ok(Some(ProxyPassthrough { client: stream, - aux: node.aux.clone(), + compute: node.stream, + + aux: node.aux, private_link_id: None, - compute: node, - session_id: ctx.session_id(), - cancel: session, + + _cancel_on_shutdown: cancel_on_shutdown, + _req: request_gauge, _conn: conn_gauge, + _db_conn: node.guage, })) } diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index 026c6aeba9..d96f582fad 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -75,6 +75,7 @@ pub mod binary; mod auth; +mod batch; mod cache; mod cancellation; mod compute; diff --git a/proxy/src/pglb/passthrough.rs b/proxy/src/pglb/passthrough.rs index 6f651d383d..d4c029f6d9 100644 --- a/proxy/src/pglb/passthrough.rs +++ b/proxy/src/pglb/passthrough.rs @@ -1,15 +1,17 @@ -use futures::FutureExt; +use std::convert::Infallible; + use smol_str::SmolStr; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::debug; use utils::measured_stream::MeasuredStream; use super::copy_bidirectional::ErrorSource; -use crate::cancellation; -use crate::compute::PostgresConnection; -use crate::config::ComputeConfig; +use crate::compute::MaybeRustlsStream; use crate::control_plane::messages::MetricsAuxInfo; -use crate::metrics::{Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard}; +use crate::metrics::{ + Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard, + NumDbConnectionsGuard, +}; use crate::stream::Stream; use crate::usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS}; @@ -64,40 +66,20 @@ pub(crate) async fn proxy_pass( pub(crate) struct ProxyPassthrough { pub(crate) client: Stream, - pub(crate) compute: PostgresConnection, + pub(crate) compute: MaybeRustlsStream, + pub(crate) aux: MetricsAuxInfo, - pub(crate) session_id: uuid::Uuid, pub(crate) private_link_id: Option, - pub(crate) cancel: cancellation::Session, + + pub(crate) _cancel_on_shutdown: tokio::sync::oneshot::Sender, pub(crate) _req: 
NumConnectionRequestsGuard<'static>, pub(crate) _conn: NumClientConnectionsGuard<'static>, + pub(crate) _db_conn: NumDbConnectionsGuard<'static>, } impl ProxyPassthrough { - pub(crate) async fn proxy_pass( - self, - compute_config: &ComputeConfig, - ) -> Result<(), ErrorSource> { - let res = proxy_pass( - self.client, - self.compute.stream, - self.aux, - self.private_link_id, - ) - .await; - if let Err(err) = self - .compute - .cancel_closure - .try_cancel_query(compute_config) - .boxed() - .await - { - tracing::warn!(session_id = ?self.session_id, ?err, "could not cancel the query in the database"); - } - - drop(self.cancel.remove_cancel_key()); // we don't need a result. If the queue is full, we just log the error - - res + pub(crate) async fn proxy_pass(self) -> Result<(), ErrorSource> { + proxy_pass(self.client, self.compute, self.aux, self.private_link_id).await } } diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 4211406f6c..7da1b8d8fa 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -155,7 +155,7 @@ pub async fn task_main( Ok(Some(p)) => { ctx.set_success(); let _disconnect = ctx.log_connect(); - match p.proxy_pass(&config.connect_to_compute).await { + match p.proxy_pass().await { Ok(()) => {} Err(ErrorSource::Client(e)) => { warn!( @@ -372,13 +372,24 @@ pub(crate) async fn handle_client( Err(e) => Err(stream.throw_error(e, Some(ctx)).await)?, }; - let cancellation_handler_clone = Arc::clone(&cancellation_handler); - let session = cancellation_handler_clone.get_key(); + let session = cancellation_handler.get_key(); - session.write_cancel_key(node.cancel_closure.clone())?; prepare_client_connection(&node, *session.key(), &mut stream); let stream = stream.flush_and_into_inner().await?; + let session_id = ctx.session_id(); + let (cancel_on_shutdown, cancel) = tokio::sync::oneshot::channel(); + tokio::spawn(async move { + session + .maintain_cancel_key( + session_id, + cancel, + &node.cancel_closure, + &config.connect_to_compute, + ) + .await; + }); + let private_link_id = match ctx.extra() { Some(ConnectionInfoExtra::Aws { vpce_id }) => Some(vpce_id.clone()), Some(ConnectionInfoExtra::Azure { link_id }) => Some(link_id.to_smolstr()), @@ -387,13 +398,16 @@ pub(crate) async fn handle_client( Ok(Some(ProxyPassthrough { client: stream, - aux: node.aux.clone(), + compute: node.stream, + + aux: node.aux, private_link_id, - compute: node, - session_id: ctx.session_id(), - cancel: session, + + _cancel_on_shutdown: cancel_on_shutdown, + _req: request_gauge, _conn: conn_gauge, + _db_conn: node.guage, })) } diff --git a/proxy/src/redis/keys.rs b/proxy/src/redis/keys.rs index 3113bad949..b453e6851c 100644 --- a/proxy/src/redis/keys.rs +++ b/proxy/src/redis/keys.rs @@ -1,8 +1,4 @@ -use std::io::ErrorKind; - -use anyhow::Ok; - -use crate::pqproto::{CancelKeyData, id_to_cancel_key}; +use crate::pqproto::CancelKeyData; pub mod keyspace { pub const CANCEL_PREFIX: &str = "cancel"; @@ -23,39 +19,12 @@ impl KeyPrefix { } } } - - #[allow(dead_code)] - pub(crate) fn as_str(&self) -> &'static str { - match self { - KeyPrefix::Cancel(_) => keyspace::CANCEL_PREFIX, - } - } -} - -#[allow(dead_code)] -pub(crate) fn parse_redis_key(key: &str) -> anyhow::Result { - let (prefix, key_str) = key.split_once(':').ok_or_else(|| { - anyhow::anyhow!(std::io::Error::new( - ErrorKind::InvalidData, - "missing prefix" - )) - })?; - - match prefix { - keyspace::CANCEL_PREFIX => { - let id = u64::from_str_radix(key_str, 16)?; - - Ok(KeyPrefix::Cancel(id_to_cancel_key(id))) - } - _ => 
Err(anyhow::anyhow!(std::io::Error::new( - ErrorKind::InvalidData, - "unknown prefix" - ))), - } } #[cfg(test)] mod tests { + use crate::pqproto::id_to_cancel_key; + use super::*; #[test] @@ -65,16 +34,4 @@ mod tests { let redis_key = cancel_key.build_redis_key(); assert_eq!(redis_key, "cancel:30390000d431"); } - - #[test] - fn test_parse_redis_key() { - let redis_key = "cancel:30390000d431"; - let key: KeyPrefix = parse_redis_key(redis_key).expect("Failed to parse key"); - - let ref_key = id_to_cancel_key(12345 << 32 | 54321); - - assert_eq!(key.as_str(), KeyPrefix::Cancel(ref_key).as_str()); - let KeyPrefix::Cancel(cancel_key) = key; - assert_eq!(ref_key, cancel_key); - } } diff --git a/proxy/src/redis/kv_ops.rs b/proxy/src/redis/kv_ops.rs index f71730c533..f8d3b5cc66 100644 --- a/proxy/src/redis/kv_ops.rs +++ b/proxy/src/redis/kv_ops.rs @@ -1,3 +1,6 @@ +use std::time::Duration; + +use futures::FutureExt; use redis::aio::ConnectionLike; use redis::{Cmd, FromRedisValue, Pipeline, RedisResult}; @@ -35,14 +38,11 @@ impl RedisKVClient { } pub async fn try_connect(&mut self) -> anyhow::Result<()> { - match self.client.connect().await { - Ok(()) => {} - Err(e) => { - tracing::error!("failed to connect to redis: {e}"); - return Err(e); - } - } - Ok(()) + self.client + .connect() + .boxed() + .await + .inspect_err(|e| tracing::error!("failed to connect to redis: {e}")) } pub(crate) async fn query( @@ -54,15 +54,25 @@ impl RedisKVClient { return Err(anyhow::anyhow!("Rate limit exceeded")); } - match q.query(&mut self.client).await { + let e = match q.query(&mut self.client).await { Ok(t) => return Ok(t), - Err(e) => { - tracing::error!("failed to run query: {e}"); + Err(e) => e, + }; + + tracing::error!("failed to run query: {e}"); + match e.retry_method() { + redis::RetryMethod::Reconnect => { + tracing::info!("Redis client is disconnected. Reconnecting..."); + self.try_connect().await?; } + redis::RetryMethod::RetryImmediately => {} + redis::RetryMethod::WaitAndRetry => { + // somewhat arbitrary. + tokio::time::sleep(Duration::from_millis(100)).await; + } + _ => Err(e)?, } - tracing::info!("Redis client is disconnected. Reconnecting..."); - self.try_connect().await?; Ok(q.query(&mut self.client).await?) } } diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 8648a94869..0d374e6df2 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -167,7 +167,7 @@ pub(crate) async fn serve_websocket( Ok(Some(p)) => { ctx.set_success(); ctx.log_connect(); - match p.proxy_pass(&config.connect_to_compute).await { + match p.proxy_pass().await { Ok(()) => Ok(()), Err(ErrorSource::Client(err)) => Err(err).context("client"), Err(ErrorSource::Compute(err)) => Err(err).context("compute"), From 6508f4e5c1505b2d5a18b60a036c831da0fea705 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 20 Jun 2025 14:57:30 +0300 Subject: [PATCH 087/124] pageserver: revise gc layer map lock handling (#12290) ## Problem Timeline GC is very aggressive with regards to layer map locking. We've seen timelines with loads of layers in production that hold the write lock for the layer map for 30 minutes at a time. This blocks reads and the write path to some extent. ## Summary of changes Determining the set of layers to GC is done under the read lock. Applying the updates is done under the write lock. Previously, everything was done under write lock. 
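For illustration, here is a minimal sketch of the two-phase pattern described above: candidates are collected while holding only the read lock, and the write lock is taken just long enough to apply the removals. This is not the actual `Timeline` code (which uses an async layer-map lock via `LayerManagerLockHolder` and also schedules remote deletions); the names and types below are illustrative only.

```rust
use std::sync::RwLock;

/// Stand-in for the layer map; the real type is far richer.
struct LayerMap {
    layers: Vec<String>,
}

fn gc(map: &RwLock<LayerMap>, keep: impl Fn(&str) -> bool) -> usize {
    // Phase 1: decide what to remove under the read lock, so the
    // (potentially long) scan does not block reads or the write path.
    let to_remove: Vec<String> = {
        let guard = map.read().unwrap();
        guard
            .layers
            .iter()
            .filter(|layer| !keep(layer.as_str()))
            .cloned()
            .collect()
    }; // read lock released here

    // Phase 2: take the write lock only to apply the precomputed updates.
    let mut guard = map.write().unwrap();
    guard.layers.retain(|layer| !to_remove.contains(layer));
    to_remove.len()
}

fn main() {
    let map = RwLock::new(LayerMap {
        layers: vec!["old-layer".into(), "live-layer".into()],
    });
    let removed = gc(&map, |name| name.starts_with("live"));
    assert_eq!(removed, 1);
    println!("removed {removed} layer(s)");
}
```

A layer that disappears between the two phases is simply skipped; the real change handles this by re-resolving each descriptor under the write lock (`try_get_from_key`) before scheduling its removal.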
--- pageserver/src/tenant/timeline.rs | 190 +++++++++++++++--------------- 1 file changed, 98 insertions(+), 92 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index a0e9d8f06a..c8a41f7875 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -6543,7 +6543,7 @@ impl Timeline { debug!("retain_lsns: {:?}", retain_lsns); - let mut layers_to_remove = Vec::new(); + let max_retain_lsn = retain_lsns.iter().max(); // Scan all layers in the timeline (remote or on-disk). // @@ -6553,108 +6553,110 @@ impl Timeline { // 3. it doesn't need to be retained for 'retain_lsns'; // 4. it does not need to be kept for LSNs holding valid leases. // 5. newer on-disk image layers cover the layer's whole key range - // - // TODO holding a write lock is too agressive and avoidable - let mut guard = self - .layers - .write(LayerManagerLockHolder::GarbageCollection) - .await; - let layers = guard.layer_map()?; - 'outer: for l in layers.iter_historic_layers() { - result.layers_total += 1; + let layers_to_remove = { + let mut layers_to_remove = Vec::new(); - // 1. Is it newer than GC horizon cutoff point? - if l.get_lsn_range().end > space_cutoff { - info!( - "keeping {} because it's newer than space_cutoff {}", - l.layer_name(), - space_cutoff, - ); - result.layers_needed_by_cutoff += 1; - continue 'outer; - } + let guard = self + .layers + .read(LayerManagerLockHolder::GarbageCollection) + .await; + let layers = guard.layer_map()?; + 'outer: for l in layers.iter_historic_layers() { + result.layers_total += 1; - // 2. It is newer than PiTR cutoff point? - if l.get_lsn_range().end > time_cutoff { - info!( - "keeping {} because it's newer than time_cutoff {}", - l.layer_name(), - time_cutoff, - ); - result.layers_needed_by_pitr += 1; - continue 'outer; - } - - // 3. Is it needed by a child branch? - // NOTE With that we would keep data that - // might be referenced by child branches forever. - // We can track this in child timeline GC and delete parent layers when - // they are no longer needed. This might be complicated with long inheritance chains. - // - // TODO Vec is not a great choice for `retain_lsns` - for retain_lsn in &retain_lsns { - // start_lsn is inclusive - if &l.get_lsn_range().start <= retain_lsn { - info!( - "keeping {} because it's still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}", + // 1. Is it newer than GC horizon cutoff point? + if l.get_lsn_range().end > space_cutoff { + debug!( + "keeping {} because it's newer than space_cutoff {}", l.layer_name(), - retain_lsn, - l.is_incremental(), + space_cutoff, ); - result.layers_needed_by_branches += 1; + result.layers_needed_by_cutoff += 1; continue 'outer; } - } - // 4. Is there a valid lease that requires us to keep this layer? - if let Some(lsn) = &max_lsn_with_valid_lease { - // keep if layer start <= any of the lease - if &l.get_lsn_range().start <= lsn { - info!( - "keeping {} because there is a valid lease preventing GC at {}", + // 2. It is newer than PiTR cutoff point? + if l.get_lsn_range().end > time_cutoff { + debug!( + "keeping {} because it's newer than time_cutoff {}", l.layer_name(), - lsn, + time_cutoff, ); - result.layers_needed_by_leases += 1; + result.layers_needed_by_pitr += 1; continue 'outer; } + + // 3. Is it needed by a child branch? + // NOTE With that we would keep data that + // might be referenced by child branches forever. 
+ // We can track this in child timeline GC and delete parent layers when + // they are no longer needed. This might be complicated with long inheritance chains. + if let Some(retain_lsn) = max_retain_lsn { + // start_lsn is inclusive + if &l.get_lsn_range().start <= retain_lsn { + debug!( + "keeping {} because it's still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}", + l.layer_name(), + retain_lsn, + l.is_incremental(), + ); + result.layers_needed_by_branches += 1; + continue 'outer; + } + } + + // 4. Is there a valid lease that requires us to keep this layer? + if let Some(lsn) = &max_lsn_with_valid_lease { + // keep if layer start <= any of the lease + if &l.get_lsn_range().start <= lsn { + debug!( + "keeping {} because there is a valid lease preventing GC at {}", + l.layer_name(), + lsn, + ); + result.layers_needed_by_leases += 1; + continue 'outer; + } + } + + // 5. Is there a later on-disk layer for this relation? + // + // The end-LSN is exclusive, while disk_consistent_lsn is + // inclusive. For example, if disk_consistent_lsn is 100, it is + // OK for a delta layer to have end LSN 101, but if the end LSN + // is 102, then it might not have been fully flushed to disk + // before crash. + // + // For example, imagine that the following layers exist: + // + // 1000 - image (A) + // 1000-2000 - delta (B) + // 2000 - image (C) + // 2000-3000 - delta (D) + // 3000 - image (E) + // + // If GC horizon is at 2500, we can remove layers A and B, but + // we cannot remove C, even though it's older than 2500, because + // the delta layer 2000-3000 depends on it. + if !layers + .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff)) + { + debug!("keeping {} because it is the latest layer", l.layer_name()); + result.layers_not_updated += 1; + continue 'outer; + } + + // We didn't find any reason to keep this file, so remove it. + info!( + "garbage collecting {} is_dropped: xx is_incremental: {}", + l.layer_name(), + l.is_incremental(), + ); + layers_to_remove.push(l); } - // 5. Is there a later on-disk layer for this relation? - // - // The end-LSN is exclusive, while disk_consistent_lsn is - // inclusive. For example, if disk_consistent_lsn is 100, it is - // OK for a delta layer to have end LSN 101, but if the end LSN - // is 102, then it might not have been fully flushed to disk - // before crash. - // - // For example, imagine that the following layers exist: - // - // 1000 - image (A) - // 1000-2000 - delta (B) - // 2000 - image (C) - // 2000-3000 - delta (D) - // 3000 - image (E) - // - // If GC horizon is at 2500, we can remove layers A and B, but - // we cannot remove C, even though it's older than 2500, because - // the delta layer 2000-3000 depends on it. - if !layers - .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff)) - { - info!("keeping {} because it is the latest layer", l.layer_name()); - result.layers_not_updated += 1; - continue 'outer; - } - - // We didn't find any reason to keep this file, so remove it. - info!( - "garbage collecting {} is_dropped: xx is_incremental: {}", - l.layer_name(), - l.is_incremental(), - ); - layers_to_remove.push(l); - } + layers_to_remove + }; if !layers_to_remove.is_empty() { // Persist the new GC cutoff value before we actually remove anything. 
@@ -6670,15 +6672,19 @@ impl Timeline { } })?; + let mut guard = self + .layers + .write(LayerManagerLockHolder::GarbageCollection) + .await; + let gc_layers = layers_to_remove .iter() - .map(|x| guard.get_from_desc(x)) + .flat_map(|desc| guard.try_get_from_key(&desc.key()).cloned()) .collect::>(); result.layers_removed = gc_layers.len() as u64; self.remote_client.schedule_gc_update(&gc_layers)?; - guard.open_mut()?.finish_gc_timeline(&gc_layers); #[cfg(feature = "testing")] From eaf1ab21c43e3a9a072649931c69a2c885d8f87c Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 20 Jun 2025 17:50:03 +0300 Subject: [PATCH 088/124] Store intermediate build files in `build/` rather than `pg_install/build/` (#12295) This way, `pg_install` contains only the final build artifacts, not intermediate files like *.o files. Seems cleaner. --- .github/workflows/_build-and-test-locally.yml | 8 +- .github/workflows/build-macos.yml | 14 +-- .gitignore | 1 + Dockerfile | 1 - Makefile | 89 ++++++++++--------- libs/walproposer/build.rs | 8 +- test_runner/regress/test_pg_regress.py | 19 +++- 7 files changed, 78 insertions(+), 62 deletions(-) diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 3f66f41ef2..f9b96271a4 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -313,10 +313,10 @@ jobs: # Use tar to copy files matching the pattern, preserving the paths in the destionation tar c \ pg_install/v* \ - pg_install/build/*/src/test/regress/*.so \ - pg_install/build/*/src/test/regress/pg_regress \ - pg_install/build/*/src/test/isolation/isolationtester \ - pg_install/build/*/src/test/isolation/pg_isolation_regress \ + build/*/src/test/regress/*.so \ + build/*/src/test/regress/pg_regress \ + build/*/src/test/isolation/isolationtester \ + build/*/src/test/isolation/pg_isolation_regress \ | tar x -C /tmp/neon - name: Upload Neon artifact diff --git a/.github/workflows/build-macos.yml b/.github/workflows/build-macos.yml index 0f7fa3e813..226369de52 100644 --- a/.github/workflows/build-macos.yml +++ b/.github/workflows/build-macos.yml @@ -144,7 +144,7 @@ jobs: id: cache_walproposer_lib uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 with: - path: pg_install/build/walproposer-lib + path: build/walproposer-lib key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-walproposer_lib-v17-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Checkout submodule vendor/postgres-v17 @@ -169,11 +169,11 @@ jobs: run: make walproposer-lib -j$(sysctl -n hw.ncpu) - - name: Upload "pg_install/build/walproposer-lib" artifact + - name: Upload "build/walproposer-lib" artifact uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: - name: pg_install--build--walproposer-lib - path: pg_install/build/walproposer-lib + name: build--walproposer-lib + path: build/walproposer-lib # The artifact is supposed to be used by the next job in the same workflow, # so there’s no need to store it for too long. 
retention-days: 1 @@ -226,11 +226,11 @@ jobs: name: pg_install--v17 path: pg_install/v17 - - name: Download "pg_install/build/walproposer-lib" artifact + - name: Download "build/walproposer-lib" artifact uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 with: - name: pg_install--build--walproposer-lib - path: pg_install/build/walproposer-lib + name: build--walproposer-lib + path: build/walproposer-lib # `actions/download-artifact` doesn't preserve permissions: # https://github.com/actions/download-artifact?tab=readme-ov-file#permission-loss diff --git a/.gitignore b/.gitignore index 45eb4dbf0e..70c7e96303 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ /artifact_cache +/build /pg_install /target /tmp_check diff --git a/Dockerfile b/Dockerfile index f72d7d9bbc..69657067de 100644 --- a/Dockerfile +++ b/Dockerfile @@ -45,7 +45,6 @@ COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh ENV BUILD_TYPE=release RUN set -e \ && mold -run make -j $(nproc) -s neon-pg-ext \ - && rm -rf pg_install/build \ && tar -C pg_install -czf /home/nonroot/postgres_install.tar.gz . # Prepare cargo-chef recipe diff --git a/Makefile b/Makefile index 5130e17e59..dee50a51c1 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,12 @@ ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) -# Where to install Postgres, default is ./pg_install, maybe useful for package managers +# Where to install Postgres, default is ./pg_install, maybe useful for package +# managers. POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/ +# All intermediate build artifacts are stored here. +BUILD_DIR := build + ICU_PREFIX_DIR := /usr/local/icu # @@ -104,21 +108,20 @@ cargo-target-dir: # Some rules are duplicated for Postgres v14 and 15. We may want to refactor # to avoid the duplication in the future, but it's tolerable for now. # -$(POSTGRES_INSTALL_DIR)/build/%/config.status: - - mkdir -p $(POSTGRES_INSTALL_DIR) - test -e $(POSTGRES_INSTALL_DIR)/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > $(POSTGRES_INSTALL_DIR)/CACHEDIR.TAG +$(BUILD_DIR)/%/config.status: + mkdir -p $(BUILD_DIR) + test -e $(BUILD_DIR)/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > $(BUILD_DIR)/CACHEDIR.TAG +@echo "Configuring Postgres $* build" @test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \ echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \ echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \ exit 1; } - mkdir -p $(POSTGRES_INSTALL_DIR)/build/$* + mkdir -p $(BUILD_DIR)/$* VERSION=$*; \ EXTRA_VERSION=$$(cd $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION && git rev-parse HEAD); \ - (cd $(POSTGRES_INSTALL_DIR)/build/$$VERSION && \ + (cd $(BUILD_DIR)/$$VERSION && \ env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION/configure \ CFLAGS='$(PG_CFLAGS)' LDFLAGS='$(PG_LDFLAGS)' \ $(PG_CONFIGURE_OPTS) --with-extra-version=" ($$EXTRA_VERSION)" \ @@ -130,73 +133,73 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status: # the "build-all-versions" entry points) where direct mention of PostgreSQL # versions is used. 
.PHONY: postgres-configure-v17 -postgres-configure-v17: $(POSTGRES_INSTALL_DIR)/build/v17/config.status +postgres-configure-v17: $(BUILD_DIR)/v17/config.status .PHONY: postgres-configure-v16 -postgres-configure-v16: $(POSTGRES_INSTALL_DIR)/build/v16/config.status +postgres-configure-v16: $(BUILD_DIR)/v16/config.status .PHONY: postgres-configure-v15 -postgres-configure-v15: $(POSTGRES_INSTALL_DIR)/build/v15/config.status +postgres-configure-v15: $(BUILD_DIR)/v15/config.status .PHONY: postgres-configure-v14 -postgres-configure-v14: $(POSTGRES_INSTALL_DIR)/build/v14/config.status +postgres-configure-v14: $(BUILD_DIR)/v14/config.status # Install the PostgreSQL header files into $(POSTGRES_INSTALL_DIR)//include .PHONY: postgres-headers-% postgres-headers-%: postgres-configure-% +@echo "Installing PostgreSQL $* headers" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/include MAKELEVEL=0 install + $(MAKE) -C $(BUILD_DIR)/$*/src/include MAKELEVEL=0 install # Compile and install PostgreSQL .PHONY: postgres-% postgres-%: postgres-configure-% \ postgres-headers-% # to prevent `make install` conflicts with neon's `postgres-headers` +@echo "Compiling PostgreSQL $*" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$* MAKELEVEL=0 install + $(MAKE) -C $(BUILD_DIR)/$* MAKELEVEL=0 install +@echo "Compiling libpq $*" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/interfaces/libpq install + $(MAKE) -C $(BUILD_DIR)/$*/src/interfaces/libpq install +@echo "Compiling pg_prewarm $*" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_prewarm install + $(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_prewarm install +@echo "Compiling pg_buffercache $*" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache install + $(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_buffercache install +@echo "Compiling pg_visibility $*" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_visibility install + $(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_visibility install +@echo "Compiling pageinspect $*" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install + $(MAKE) -C $(BUILD_DIR)/$*/contrib/pageinspect install +@echo "Compiling pg_trgm $*" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_trgm install + $(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_trgm install +@echo "Compiling amcheck $*" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install + $(MAKE) -C $(BUILD_DIR)/$*/contrib/amcheck install +@echo "Compiling test_decoding $*" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/test_decoding install + $(MAKE) -C $(BUILD_DIR)/$*/contrib/test_decoding install .PHONY: postgres-check-% postgres-check-%: postgres-% - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$* MAKELEVEL=0 check + $(MAKE) -C $(BUILD_DIR)/$* MAKELEVEL=0 check .PHONY: neon-pg-ext-% neon-pg-ext-%: postgres-% +@echo "Compiling neon $*" - mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-$* + mkdir -p $(BUILD_DIR)/neon-$* $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \ - -C $(POSTGRES_INSTALL_DIR)/build/neon-$* \ + -C $(BUILD_DIR)/neon-$* \ -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install +@echo "Compiling neon_walredo $*" - mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* + mkdir -p $(BUILD_DIR)/neon-walredo-$* $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \ - -C $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* \ + -C $(BUILD_DIR)/neon-walredo-$* \ -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install +@echo "Compiling neon_rmgr $*" - mkdir -p 
$(POSTGRES_INSTALL_DIR)/build/neon-rmgr-$* + mkdir -p $(BUILD_DIR)/neon-rmgr-$* $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \ - -C $(POSTGRES_INSTALL_DIR)/build/neon-rmgr-$* \ + -C $(BUILD_DIR)/neon-rmgr-$* \ -f $(ROOT_PROJECT_DIR)/pgxn/neon_rmgr/Makefile install +@echo "Compiling neon_test_utils $*" - mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* + mkdir -p $(BUILD_DIR)/neon-test-utils-$* $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \ - -C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \ + -C $(BUILD_DIR)/neon-test-utils-$* \ -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install +@echo "Compiling neon_utils $*" - mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* + mkdir -p $(BUILD_DIR)/neon-utils-$* $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \ - -C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \ + -C $(BUILD_DIR)/neon-utils-$* \ -f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install # Build walproposer as a static library. walproposer source code is located @@ -211,15 +214,15 @@ neon-pg-ext-%: postgres-% .PHONY: walproposer-lib walproposer-lib: neon-pg-ext-v17 +@echo "Compiling walproposer-lib" - mkdir -p $(POSTGRES_INSTALL_DIR)/build/walproposer-lib + mkdir -p $(BUILD_DIR)/walproposer-lib $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v17/bin/pg_config COPT='$(COPT)' \ - -C $(POSTGRES_INSTALL_DIR)/build/walproposer-lib \ + -C $(BUILD_DIR)/walproposer-lib \ -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile walproposer-lib - cp $(POSTGRES_INSTALL_DIR)/v17/lib/libpgport.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib - cp $(POSTGRES_INSTALL_DIR)/v17/lib/libpgcommon.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib - $(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgport.a \ + cp $(POSTGRES_INSTALL_DIR)/v17/lib/libpgport.a $(BUILD_DIR)/walproposer-lib + cp $(POSTGRES_INSTALL_DIR)/v17/lib/libpgcommon.a $(BUILD_DIR)/walproposer-lib + $(AR) d $(BUILD_DIR)/walproposer-lib/libpgport.a \ pg_strong_random.o - $(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgcommon.a \ + $(AR) d $(BUILD_DIR)/walproposer-lib/libpgcommon.a \ checksum_helper.o \ cryptohash_openssl.o \ hmac_openssl.o \ @@ -227,7 +230,7 @@ walproposer-lib: neon-pg-ext-v17 parse_manifest.o \ scram-common.o ifeq ($(UNAME_S),Linux) - $(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgcommon.a \ + $(AR) d $(BUILD_DIR)/walproposer-lib/libpgcommon.a \ pg_crc32c.o endif @@ -272,7 +275,7 @@ fmt: postgres-%-pg-bsd-indent: postgres-% +@echo "Compiling pg_bsd_indent" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/tools/pg_bsd_indent/ + $(MAKE) -C $(BUILD_DIR)/$*/src/tools/pg_bsd_indent/ # Create typedef list for the core. Note that generally it should be combined with # buildfarm one to cover platform specific stuff. @@ -291,7 +294,7 @@ postgres-%-pgindent: postgres-%-pg-bsd-indent postgres-%-typedefs.list cat $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/typedefs.list |\ cat - postgres-$*-typedefs.list | sort | uniq > postgres-$*-typedefs-full.list +@echo note: you might want to run it on selected files/dirs instead. 
- INDENT=$(POSTGRES_INSTALL_DIR)/build/$*/src/tools/pg_bsd_indent/pg_bsd_indent \ + INDENT=$(BUILD_DIR)/$*/src/tools/pg_bsd_indent/pg_bsd_indent \ $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/pgindent --typedefs postgres-$*-typedefs-full.list \ $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/ \ --excludes $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/exclude_file_patterns @@ -302,9 +305,9 @@ postgres-%-pgindent: postgres-%-pg-bsd-indent postgres-%-typedefs.list neon-pgindent: postgres-v17-pg-bsd-indent neon-pg-ext-v17 $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v17/bin/pg_config COPT='$(COPT)' \ FIND_TYPEDEF=$(ROOT_PROJECT_DIR)/vendor/postgres-v17/src/tools/find_typedef \ - INDENT=$(POSTGRES_INSTALL_DIR)/build/v17/src/tools/pg_bsd_indent/pg_bsd_indent \ + INDENT=$(BUILD_DIR)/v17/src/tools/pg_bsd_indent/pg_bsd_indent \ PGINDENT_SCRIPT=$(ROOT_PROJECT_DIR)/vendor/postgres-v17/src/tools/pgindent/pgindent \ - -C $(POSTGRES_INSTALL_DIR)/build/neon-v17 \ + -C $(BUILD_DIR)/neon-v17 \ -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile pgindent diff --git a/libs/walproposer/build.rs b/libs/walproposer/build.rs index 530ceb1327..b13c8b32b4 100644 --- a/libs/walproposer/build.rs +++ b/libs/walproposer/build.rs @@ -13,22 +13,24 @@ fn main() -> anyhow::Result<()> { // Tell cargo to invalidate the built crate whenever the wrapper changes println!("cargo:rerun-if-changed=bindgen_deps.h"); + let root_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../.."); + // Finding the location of built libraries and Postgres C headers: // - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `/pg_install` // - if there's a `bin/pg_config` file use it for getting include server, otherwise use `/pg_install/{PG_MAJORVERSION}/include/postgresql/server` let pg_install_dir = if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR") { postgres_install_dir.into() } else { - PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../pg_install") + root_path.join("pg_install") }; let pg_install_abs = std::fs::canonicalize(pg_install_dir)?; - let walproposer_lib_dir = pg_install_abs.join("build/walproposer-lib"); + let walproposer_lib_dir = root_path.join("build/walproposer-lib"); let walproposer_lib_search_str = walproposer_lib_dir .to_str() .ok_or(anyhow!("Bad non-UTF path"))?; - let pgxn_neon = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../pgxn/neon"); + let pgxn_neon = root_path.join("pgxn/neon"); let pgxn_neon = std::fs::canonicalize(pgxn_neon)?; let pgxn_neon = pgxn_neon.to_str().ok_or(anyhow!("Bad non-UTF path"))?; diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 3695ece66b..728241b465 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -173,7 +173,11 @@ def test_pg_regress( (runpath / "testtablespace").mkdir(parents=True) # Compute all the file locations that pg_regress will need. - build_path = pg_distrib_dir / f"build/{env.pg_version.v_prefixed}/src/test/regress" + # + # XXX: We assume that the `build` directory is a sibling of the + # pg_distrib_dir. That is the default when you check out the + # repository; `build` and `pg_install` are created side by side. 
+ build_path = pg_distrib_dir / f"../build/{env.pg_version.v_prefixed}/src/test/regress" src_path = base_dir / f"vendor/postgres-{env.pg_version.v_prefixed}/src/test/regress" bindir = pg_distrib_dir / f"v{env.pg_version}/bin" schedule = src_path / "parallel_schedule" @@ -250,7 +254,11 @@ def test_isolation( (runpath / "testtablespace").mkdir(parents=True) # Compute all the file locations that pg_isolation_regress will need. - build_path = pg_distrib_dir / f"build/{env.pg_version.v_prefixed}/src/test/isolation" + # + # XXX: We assume that the `build` directory is a sibling of the + # pg_distrib_dir. That is the default when you check out the + # repository; `build` and `pg_install` are created side by side. + build_path = pg_distrib_dir / f"../build/{env.pg_version.v_prefixed}/src/test/isolation" src_path = base_dir / f"vendor/postgres-{env.pg_version.v_prefixed}/src/test/isolation" bindir = pg_distrib_dir / f"v{env.pg_version}/bin" schedule = src_path / "isolation_schedule" @@ -314,8 +322,11 @@ def test_sql_regress( (runpath / "testtablespace").mkdir(parents=True) # Compute all the file locations that pg_regress will need. - # This test runs neon specific tests - build_path = pg_distrib_dir / f"build/v{env.pg_version}/src/test/regress" + # + # XXX: We assume that the `build` directory is a sibling of the + # pg_distrib_dir. That is the default when you check out the + # repository; `build` and `pg_install` are created side by side. + build_path = pg_distrib_dir / f"../build/{env.pg_version.v_prefixed}/src/test/regress" src_path = base_dir / "test_runner/sql_regress" bindir = pg_distrib_dir / f"v{env.pg_version}/bin" schedule = src_path / "parallel_schedule" From 79485e7c3a138c724efc2b8edc82962581a48b53 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Fri, 20 Jun 2025 11:35:11 -0400 Subject: [PATCH 089/124] feat(pageserver): enable gc-compaction by default everywhere (#12105) Enable it across tests and set it as default. Marks the first milestone of https://github.com/neondatabase/neon/issues/9114. We already enabled it in all AWS regions and planning to enable it in all Azure regions next week. will merge after we roll out in all regions. --------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/config.rs | 2 +- test_runner/regress/test_attach_tenant_config.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 2d7a06a72f..1ecc17e04b 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -816,7 +816,7 @@ pub mod tenant_conf_defaults { // By default ingest enough WAL for two new L0 layers before checking if new image // image layers should be created. 
pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2; - pub const DEFAULT_GC_COMPACTION_ENABLED: bool = false; + pub const DEFAULT_GC_COMPACTION_ENABLED: bool = true; pub const DEFAULT_GC_COMPACTION_VERIFICATION: bool = true; pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 5 * 1024 * 1024; // 5GB pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100; diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index dc44fc77db..7788faceb4 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -184,7 +184,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "timeline_offloading": False, "rel_size_v2_enabled": True, "relsize_snapshot_cache_capacity": 10000, - "gc_compaction_enabled": True, + "gc_compaction_enabled": False, "gc_compaction_verification": False, "gc_compaction_initial_threshold_kb": 1024000, "gc_compaction_ratio_percent": 200, From b2954d16ff12899e1e85d3c772988da4454450f0 Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Fri, 20 Jun 2025 20:03:17 +0400 Subject: [PATCH 090/124] storcon, neon_local: add timeline_safekeeper_count (#12303) ## Problem We need to specify the number of safekeepers for neon_local without `testing` feature. Also we need this option for testing different configurations of safekeeper migration code. We cannot set it in `neon_fixtures.py` and in the default config of `neon_local` yet, because it will fail compatibility tests. I'll make a separate PR with removing `cfg!("testing")` completely and specifying this option in the config when this option reaches the release branch. - Part of https://github.com/neondatabase/neon/issues/12298 ## Summary of changes - Add `timeline_safekeeper_count` config option to storcon and neon_local --- control_plane/src/local_env.rs | 3 +++ control_plane/src/storage_controller.rs | 4 ++++ storage_controller/src/main.rs | 12 ++++++++++ storage_controller/src/service.rs | 4 ++++ .../src/service/safekeeper_service.rs | 24 +++++++++---------- 5 files changed, 34 insertions(+), 13 deletions(-) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 1b231151ce..387fc297f0 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -209,6 +209,8 @@ pub struct NeonStorageControllerConf { pub use_https_safekeeper_api: bool, pub use_local_compute_notifications: bool, + + pub timeline_safekeeper_count: Option, } impl NeonStorageControllerConf { @@ -239,6 +241,7 @@ impl Default for NeonStorageControllerConf { timelines_onto_safekeepers: true, use_https_safekeeper_api: false, use_local_compute_notifications: true, + timeline_safekeeper_count: None, } } } diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 755d67a7ad..95f7533057 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -628,6 +628,10 @@ impl StorageController { args.push("--timelines-onto-safekeepers".to_string()); } + if let Some(sk_cnt) = self.config.timeline_safekeeper_count { + args.push(format!("--timeline-safekeeper-count={sk_cnt}")); + } + println!("Starting storage controller"); background_process::start_process( diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 2eea2f9d10..fc0ba9f28c 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -207,6 +207,12 @@ struct Cli { 
/// the compute notification directly (instead of via control plane). #[arg(long, default_value = "false")] use_local_compute_notifications: bool, + + /// Number of safekeepers to choose for a timeline when creating it. + /// Safekeepers will be choosen from different availability zones. + /// This option exists primarily for testing purposes. + #[arg(long, default_value = "3", value_parser = clap::value_parser!(i64).range(1..))] + timeline_safekeeper_count: i64, } enum StrictMode { @@ -371,6 +377,11 @@ async fn async_main() -> anyhow::Result<()> { StrictMode::Strict if args.use_local_compute_notifications => { anyhow::bail!("`--use-local-compute-notifications` is only permitted in `--dev` mode"); } + StrictMode::Strict if args.timeline_safekeeper_count < 3 => { + anyhow::bail!( + "Running with less than 3 safekeepers per timeline is only permitted in `--dev` mode" + ); + } StrictMode::Strict => { tracing::info!("Starting in strict mode: configuration is OK.") } @@ -433,6 +444,7 @@ async fn async_main() -> anyhow::Result<()> { ssl_ca_certs, timelines_onto_safekeepers: args.timelines_onto_safekeepers, use_local_compute_notifications: args.use_local_compute_notifications, + timeline_safekeeper_count: args.timeline_safekeeper_count, }; // Validate that we can connect to the database diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 14c81ccf59..6ec3963c48 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -466,6 +466,10 @@ pub struct Config { pub timelines_onto_safekeepers: bool, pub use_local_compute_notifications: bool, + + /// Number of safekeepers to choose for a timeline when creating it. + /// Safekeepers will be choosen from different availability zones. + pub timeline_safekeeper_count: i64, } impl From for ApiError { diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs index 61b9ec6b6d..193a1833a7 100644 --- a/storage_controller/src/service/safekeeper_service.rs +++ b/storage_controller/src/service/safekeeper_service.rs @@ -1,3 +1,4 @@ +use std::cmp::max; use std::collections::HashSet; use std::str::FromStr; use std::sync::Arc; @@ -608,7 +609,8 @@ impl Service { Ok(()) } - /// Choose safekeepers for the new timeline: 3 in different azs. + /// Choose safekeepers for the new timeline in different azs. + /// 3 are choosen by default, but may be configured via config (for testing). pub(crate) async fn safekeepers_for_new_timeline( &self, ) -> Result, ApiError> { @@ -651,18 +653,14 @@ impl Service { ) }); // Number of safekeepers in different AZs we are looking for - let wanted_count = match all_safekeepers.len() { - 0 => { - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "couldn't find any active safekeeper for new timeline", - ))); - } - // Have laxer requirements on testig mode as we don't want to - // spin up three safekeepers for every single test - #[cfg(feature = "testing")] - 1 | 2 => all_safekeepers.len(), - _ => 3, - }; + let mut wanted_count = self.config.timeline_safekeeper_count as usize; + // TODO(diko): remove this when `timeline_safekeeper_count` option is in the release + // branch and is specified in tests/neon_local config. 
+ if cfg!(feature = "testing") && all_safekeepers.len() < wanted_count { + // In testing mode, we can have less safekeepers than the config says + wanted_count = max(all_safekeepers.len(), 1); + } + let mut sks = Vec::new(); let mut azs = HashSet::new(); for (_sk_util, sk_info, az_id) in all_safekeepers.iter() { From c8b2ac93cf88a3d2c68970caf9290677b7d6cb92 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Fri, 20 Jun 2025 13:46:30 -0500 Subject: [PATCH 091/124] Allow the control plane to override any Postgres connection options (#12262) The previous behavior was for the compute to override control plane options if there was a conflict. We want to change the behavior so that the control plane has the absolute power on what is right. In the event that we need a new option passed to the compute as soon as possible, we can initially roll it out in the control plane, and then migrate the option to EXTRA_OPTIONS within the compute later, for instance. Signed-off-by: Tristan Partin --- compute_tools/src/compute.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 7a7f2dfedc..684d841897 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -408,7 +408,9 @@ impl ComputeNode { // N.B. keep it in sync with `ZENITH_OPTIONS` in `get_maintenance_client()`. const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0"; let options = match conn_conf.get_options() { - Some(options) => format!("{} {}", options, EXTRA_OPTIONS), + // Allow the control plane to override any options set by the + // compute + Some(options) => format!("{} {}", EXTRA_OPTIONS, options), None => EXTRA_OPTIONS.to_string(), }; conn_conf.options(&options); From 868c38f52257e555067be0ea4a588f358f130cea Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Fri, 20 Jun 2025 17:49:05 -0500 Subject: [PATCH 092/124] Rename the compute_ctl admin scope to compute_ctl:admin (#12263) Signed-off-by: Tristan Partin --- libs/compute_api/src/requests.rs | 23 ++++++++++++++++++++++- test_runner/fixtures/endpoint/http.py | 2 +- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/libs/compute_api/src/requests.rs b/libs/compute_api/src/requests.rs index bbab271474..745c44c05b 100644 --- a/libs/compute_api/src/requests.rs +++ b/libs/compute_api/src/requests.rs @@ -16,6 +16,7 @@ pub static COMPUTE_AUDIENCE: &str = "compute"; pub enum ComputeClaimsScope { /// An admin-scoped token allows access to all of `compute_ctl`'s authorized /// facilities. + #[serde(rename = "compute_ctl:admin")] Admin, } @@ -24,7 +25,7 @@ impl FromStr for ComputeClaimsScope { fn from_str(s: &str) -> Result { match s { - "admin" => Ok(ComputeClaimsScope::Admin), + "compute_ctl:admin" => Ok(ComputeClaimsScope::Admin), _ => Err(anyhow::anyhow!("invalid compute claims scope \"{s}\"")), } } @@ -80,3 +81,23 @@ pub struct SetRoleGrantsRequest { pub privileges: Vec, pub role: PgIdent, } + +#[cfg(test)] +mod test { + use std::str::FromStr; + + use crate::requests::ComputeClaimsScope; + + /// Confirm that whether we parse the scope by string or through serde, the + /// same values parse to the same enum variant. 
+ #[test] + fn compute_request_scopes() { + const ADMIN_SCOPE: &str = "compute_ctl:admin"; + + let from_serde: ComputeClaimsScope = + serde_json::from_str(&format!("\"{ADMIN_SCOPE}\"")).unwrap(); + let from_str = ComputeClaimsScope::from_str(ADMIN_SCOPE).unwrap(); + + assert_eq!(from_serde, from_str); + } +} diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index e2d405227b..f5be544439 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -24,7 +24,7 @@ The value to place in the `aud` claim. @final class ComputeClaimsScope(StrEnum): - ADMIN = "admin" + ADMIN = "compute_ctl:admin" @final From 47f7efee062b913506baa0ee080bfc335fbeba3d Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Sat, 21 Jun 2025 17:01:29 +0200 Subject: [PATCH 093/124] pageserver: require stripe size (#12257) ## Problem In #12217, we began passing the stripe size in reattach responses, and persisting it in the on-disk state. This is necessary to ensure the storage controller and Pageserver have a consistent view of the intended stripe size of unsharded tenants, which will be used for splits that do not specify a stripe size. However, for backwards compatibility, these stripe sizes were optional. ## Summary of changes Make the stripe sizes required for reattach responses and on-disk location configs. These will always be provided by the previous (current) release. --- libs/pageserver_api/src/upcall_api.rs | 10 ---------- pageserver/src/tenant/config.rs | 6 ++++-- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/libs/pageserver_api/src/upcall_api.rs b/libs/pageserver_api/src/upcall_api.rs index e2de02eea0..07cada2eb1 100644 --- a/libs/pageserver_api/src/upcall_api.rs +++ b/libs/pageserver_api/src/upcall_api.rs @@ -23,22 +23,12 @@ pub struct ReAttachRequest { pub register: Option, } -fn default_mode() -> LocationConfigMode { - LocationConfigMode::AttachedSingle -} - #[derive(Serialize, Deserialize, Debug)] pub struct ReAttachResponseTenant { pub id: TenantShardId, /// Mandatory if LocationConfigMode is None or set to an Attached* mode pub r#gen: Option, - - /// Default value only for backward compat: this field should be set - #[serde(default = "default_mode")] pub mode: LocationConfigMode, - - // Default value only for backward compat: this field should be set - #[serde(default = "ShardStripeSize::default")] pub stripe_size: ShardStripeSize, } #[derive(Serialize, Deserialize)] diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 030b43a020..c5087f7e0f 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -61,8 +61,10 @@ pub(crate) struct LocationConf { /// The detailed shard identity. This structure is already scoped within /// a TenantShardId, but we need the full ShardIdentity to enable calculating /// key->shard mappings. - // TODO(vlad): Remove this default once all configs have a shard identity on disk. - #[serde(default = "ShardIdentity::unsharded")] + /// + /// NB: we store this even for unsharded tenants, so that we agree with storcon on the intended + /// stripe size. Otherwise, a split request that does not specify a stripe size may use a + /// different default than storcon, which can lead to incorrect stripe sizes and corruption. 
pub(crate) shard: ShardIdentity, /// The pan-cluster tenant configuration, the same on all locations From af46b5286f55aa3b563695b3483599efacac1d5a Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sun, 22 Jun 2025 00:07:38 +0300 Subject: [PATCH 094/124] Avoid recompiling `postgres_ffi` when there has been no changes (#12292) Every time you run `make`, it runs `make install` on all the PostgreSQL sources, which copies the header files. That in turn triggers a rebuild of the `postgres_ffi` crate, and everything that depends on it. We had worked around this earlier (see #2458), by passing a custom INSTALL script to the Postgres makefiles, which refrains from updating the modification timestamp on headers when they have not been changed, but the v14 makefile didn't obey INSTALL for the header files. Backporting c0a1d7621b to v14 fixes that. This backports upstream PostgreSQL commit c0a1d7621b to v14. Corresponding PR in the 'postgres' repo: https://github.com/neondatabase/postgres/pull/660 --- .github/workflows/build-macos.yml | 2 +- vendor/postgres-v14 | 2 +- vendor/revisions.json | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-macos.yml b/.github/workflows/build-macos.yml index 226369de52..160c3d05bc 100644 --- a/.github/workflows/build-macos.yml +++ b/.github/workflows/build-macos.yml @@ -110,7 +110,7 @@ jobs: build-walproposer-lib: if: | - inputs.pg_versions != '[]' || inputs.rebuild_everything || + contains(inputs.pg_versions, 'v17') || inputs.rebuild_everything || contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || github.ref_name == 'main' diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 6770bc2513..9085654ee8 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 6770bc251301ef40c66f7ecb731741dc435b5051 +Subproject commit 9085654ee8022d5cc4ca719380a1dc53e5e3246f diff --git a/vendor/revisions.json b/vendor/revisions.json index 12d5499ddb..b260698c86 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -13,6 +13,6 @@ ], "v14": [ "14.18", - "6770bc251301ef40c66f7ecb731741dc435b5051" + "9085654ee8022d5cc4ca719380a1dc53e5e3246f" ] } From 3d822dbbde84e2693a7b2a3aed938397b55fe651 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sun, 22 Jun 2025 22:43:14 +0300 Subject: [PATCH 095/124] Refactor Makefile rules for building the extensions under pgxn/ (#12305) --- Makefile | 28 ++++------------------------ compute/compute-node.Dockerfile | 13 +------------ pgxn/Makefile | 28 ++++++++++++++++++++++++++++ 3 files changed, 33 insertions(+), 36 deletions(-) create mode 100644 pgxn/Makefile diff --git a/Makefile b/Makefile index dee50a51c1..71799b5be8 100644 --- a/Makefile +++ b/Makefile @@ -176,31 +176,11 @@ postgres-check-%: postgres-% .PHONY: neon-pg-ext-% neon-pg-ext-%: postgres-% - +@echo "Compiling neon $*" - mkdir -p $(BUILD_DIR)/neon-$* + +@echo "Compiling neon-specific Postgres extensions for $*" + mkdir -p $(BUILD_DIR)/pgxn-$* $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \ - -C $(BUILD_DIR)/neon-$* \ - -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install - +@echo "Compiling neon_walredo $*" - mkdir -p $(BUILD_DIR)/neon-walredo-$* - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \ - -C $(BUILD_DIR)/neon-walredo-$* \ - -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install - +@echo "Compiling neon_rmgr $*" - mkdir -p 
$(BUILD_DIR)/neon-rmgr-$* - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \ - -C $(BUILD_DIR)/neon-rmgr-$* \ - -f $(ROOT_PROJECT_DIR)/pgxn/neon_rmgr/Makefile install - +@echo "Compiling neon_test_utils $*" - mkdir -p $(BUILD_DIR)/neon-test-utils-$* - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \ - -C $(BUILD_DIR)/neon-test-utils-$* \ - -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install - +@echo "Compiling neon_utils $*" - mkdir -p $(BUILD_DIR)/neon-utils-$* - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \ - -C $(BUILD_DIR)/neon-utils-$* \ - -f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install + -C $(BUILD_DIR)/pgxn-$*\ + -f $(ROOT_PROJECT_DIR)/pgxn/Makefile install # Build walproposer as a static library. walproposer source code is located # in the pgxn/neon directory. diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 685ac564b7..13972269ae 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1634,18 +1634,7 @@ FROM pg-build AS neon-ext-build ARG PG_VERSION COPY pgxn/ pgxn/ -RUN make -j $(getconf _NPROCESSORS_ONLN) \ - -C pgxn/neon \ - -s install && \ - make -j $(getconf _NPROCESSORS_ONLN) \ - -C pgxn/neon_utils \ - -s install && \ - make -j $(getconf _NPROCESSORS_ONLN) \ - -C pgxn/neon_test_utils \ - -s install && \ - make -j $(getconf _NPROCESSORS_ONLN) \ - -C pgxn/neon_rmgr \ - -s install +RUN make -j $(getconf _NPROCESSORS_ONLN) -C pgxn -s install-compute ######################################################################################### # diff --git a/pgxn/Makefile b/pgxn/Makefile new file mode 100644 index 0000000000..8f190668ea --- /dev/null +++ b/pgxn/Makefile @@ -0,0 +1,28 @@ +# This makefile assumes that 'pg_config' is in the path, or is passed in the +# PG_CONFIG variable. +# +# This is used in two different ways: +# +# 1. The main makefile calls this, when you invoke the `make neon-pg-ext-%` +# target. It passes PG_CONFIG pointing to pg_install/%/bin/pg_config. +# This is a VPATH build; the current directory is build/pgxn-%, and +# the path to the Makefile is passed with the -f argument. +# +# 2. compute-node.Dockerfile invokes this to build the compute extensions +# for the specific Postgres version. It relies on pg_config already +# being in $(PATH). + +srcdir = $(dir $(firstword $(MAKEFILE_LIST))) + +PG_CONFIG = pg_config + +subdirs = neon neon_rmgr neon_walredo neon_utils neon_test_utils + +.PHONY: install install-compute install-storage $(subdirs) +install: $(subdirs) +install-compute: neon neon_utils neon_test_utils neon_rmgr +install-storage: neon_rmgr neon_walredo + +$(subdirs): %: + mkdir -p $* + $(MAKE) PG_CONFIG=$(PG_CONFIG) -C $* -f $(abspath $(srcdir)/$@/Makefile) install From 52ab8f3e6513faed5db46312eeefd035ac642f87 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 23 Jun 2025 12:10:32 +0300 Subject: [PATCH 096/124] Use `make all` in the "Build and Test locally" CI workflow (#12311) To avoid duplicating the build logic. `make all` covers the separate `postgres-*` and `neon-pg-ext` steps, and also does `cargo build`. That's how you would typically do a full local build anyway. 
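As a rough illustration (the values here are assumptions for the release flavour; the real ones are computed per build type in the workflow diff below), the consolidated CI step boils down to the same invocation you would use locally:

```sh
# The Makefile reads BUILD_TYPE/CARGO_PROFILE from the environment, and
# CARGO_BUILD_FLAGS is forwarded to the `cargo build` it runs.
# (Sanitizer-related ${make_vars} are omitted from this sketch.)
export BUILD_TYPE=release
CARGO_FLAGS="--locked --features testing"
mold -run make all -j"$(nproc)" CARGO_BUILD_FLAGS="$CARGO_FLAGS"
```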
--- .github/workflows/_build-and-test-locally.yml | 58 +++++++------------ Makefile | 12 +++- 2 files changed, 30 insertions(+), 40 deletions(-) diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index f9b96271a4..ff370ddb21 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -104,11 +104,10 @@ jobs: # Set some environment variables used by all the steps. # - # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc. - # It also includes --features, if any + # CARGO_FLAGS is extra options to pass to all "cargo" subcommands. # - # CARGO_FEATURES is passed to "cargo metadata". It is separate from CARGO_FLAGS, - # because "cargo metadata" doesn't accept --release or --debug options + # CARGO_PROFILE is passed to "cargo build", "cargo test" etc, but not to + # "cargo metadata", because it doesn't accept --release or --debug options. # # We run tests with addtional features, that are turned off by default (e.g. in release builds), see # corresponding Cargo.toml files for their descriptions. @@ -117,16 +116,16 @@ jobs: ARCH: ${{ inputs.arch }} SANITIZERS: ${{ inputs.sanitizers }} run: | - CARGO_FEATURES="--features testing" + CARGO_FLAGS="--locked --features testing" if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run" - CARGO_FLAGS="--locked" + CARGO_PROFILE="" elif [[ $BUILD_TYPE == "debug" ]]; then cov_prefix="" - CARGO_FLAGS="--locked" + CARGO_PROFILE="" elif [[ $BUILD_TYPE == "release" ]]; then cov_prefix="" - CARGO_FLAGS="--locked --release" + CARGO_PROFILE="--release" fi if [[ $SANITIZERS == 'enabled' ]]; then make_vars="WITH_SANITIZERS=yes" @@ -136,8 +135,8 @@ jobs: { echo "cov_prefix=${cov_prefix}" echo "make_vars=${make_vars}" - echo "CARGO_FEATURES=${CARGO_FEATURES}" echo "CARGO_FLAGS=${CARGO_FLAGS}" + echo "CARGO_PROFILE=${CARGO_PROFILE}" echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" } >> $GITHUB_ENV @@ -189,34 +188,19 @@ jobs: path: pg_install/v17 key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }} - - name: Build postgres v14 + - name: Build all if: steps.cache_pg_14.outputs.cache-hit != 'true' - run: mold -run make ${make_vars} postgres-v14 -j$(nproc) - - - name: Build postgres v15 - if: steps.cache_pg_15.outputs.cache-hit != 'true' - run: mold -run make ${make_vars} postgres-v15 -j$(nproc) - - - name: Build postgres v16 - if: steps.cache_pg_16.outputs.cache-hit != 'true' - run: mold -run make ${make_vars} postgres-v16 -j$(nproc) - - - name: Build postgres v17 - if: steps.cache_pg_17.outputs.cache-hit != 'true' - run: mold -run make ${make_vars} postgres-v17 -j$(nproc) - - - name: Build neon extensions - run: mold -run make ${make_vars} neon-pg-ext -j$(nproc) + # Note: the Makefile picks up BUILD_TYPE and CARGO_PROFILE from the env variables + run: mold -run make ${make_vars} all -j$(nproc) CARGO_BUILD_FLAGS="$CARGO_FLAGS" - name: Build walproposer-lib run: mold -run make ${make_vars} walproposer-lib -j$(nproc) - - name: Run cargo build - env: - WITH_TESTS: ${{ inputs.sanitizers != 'enabled' && '--tests' || '' }} + - name: Build unit tests + if: inputs.sanitizers != 'enabled' run: | export ASAN_OPTIONS=detect_leaks=0 - ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins ${WITH_TESTS} + ${cov_prefix} mold -run cargo build 
$CARGO_FLAGS $CARGO_PROFILE --tests # Do install *before* running rust tests because they might recompile the # binaries with different features/flags. @@ -228,7 +212,7 @@ jobs: # Install target binaries mkdir -p /tmp/neon/bin/ binaries=$( - ${cov_prefix} cargo metadata $CARGO_FEATURES --format-version=1 --no-deps | + ${cov_prefix} cargo metadata $CARGO_FLAGS --format-version=1 --no-deps | jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name' ) for bin in $binaries; do @@ -245,7 +229,7 @@ jobs: mkdir -p /tmp/neon/test_bin/ test_exe_paths=$( - ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES --message-format=json --no-run | + ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_PROFILE --message-format=json --no-run | jq -r '.executable | select(. != null)' ) for bin in $test_exe_paths; do @@ -279,10 +263,10 @@ jobs: export LD_LIBRARY_PATH #nextest does not yet support running doctests - ${cov_prefix} cargo test --doc $CARGO_FLAGS $CARGO_FEATURES + ${cov_prefix} cargo test --doc $CARGO_FLAGS $CARGO_PROFILE # run all non-pageserver tests - ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E '!package(pageserver)' + ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_PROFILE -E '!package(pageserver)' # run pageserver tests # (When developing new pageserver features gated by config fields, we commonly make the rust @@ -291,13 +275,13 @@ jobs: # pageserver tests from non-pageserver tests cuts down the time it takes for this CI step.) NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=tokio-epoll-uring \ ${cov_prefix} \ - cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)' + cargo nextest run $CARGO_FLAGS $CARGO_PROFILE -E 'package(pageserver)' # Run separate tests for real S3 export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests export REMOTE_STORAGE_S3_REGION=eu-central-1 - ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_s3)' + ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_PROFILE -E 'package(remote_storage)' -E 'test(test_real_s3)' # Run separate tests for real Azure Blob Storage # XXX: replace region with `eu-central-1`-like region @@ -306,7 +290,7 @@ jobs: export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}" export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" - ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)' + ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_PROFILE -E 'package(remote_storage)' -E 'test(test_real_azure)' - name: Install postgres binaries run: | diff --git a/Makefile b/Makefile index 71799b5be8..9824a47255 100644 --- a/Makefile +++ b/Makefile @@ -4,6 +4,12 @@ ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # managers. POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/ +# CARGO_BUILD_FLAGS: Extra flags to pass to `cargo build`. `--locked` +# and `--features testing` are popular examples. +# +# CARGO_PROFILE: You can also set to override the cargo profile to +# use. By default, it is derived from BUILD_TYPE. + # All intermediate build artifacts are stored here. 
BUILD_DIR := build @@ -20,12 +26,12 @@ ifeq ($(BUILD_TYPE),release) PG_CONFIGURE_OPTS = --enable-debug --with-openssl PG_CFLAGS += -O2 -g3 $(CFLAGS) PG_LDFLAGS = $(LDFLAGS) - # Unfortunately, `--profile=...` is a nightly feature - CARGO_BUILD_FLAGS += --release + CARGO_PROFILE ?= --profile=release else ifeq ($(BUILD_TYPE),debug) PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend PG_CFLAGS += -O0 -g3 $(CFLAGS) PG_LDFLAGS = $(LDFLAGS) + CARGO_PROFILE ?= --profile=dev else $(error Bad build type '$(BUILD_TYPE)', see Makefile for options) endif @@ -97,7 +103,7 @@ all: neon postgres neon-pg-ext .PHONY: neon neon: postgres-headers walproposer-lib cargo-target-dir +@echo "Compiling Neon" - $(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS) + $(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS) $(CARGO_PROFILE) .PHONY: cargo-target-dir cargo-target-dir: # https://github.com/rust-lang/cargo/issues/14281 From 7916aa26e07d7eaeb116f54a57acf138626bc4c3 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 23 Jun 2025 12:11:05 +0300 Subject: [PATCH 097/124] Stop using build-tools image in compute image build (#12306) The build-tools image contains various build tools and dependencies, mostly Rust-related. The compute image build used it to build compute_ctl and a few other little rust binaries that are included in the compute image. However, for extensions built in Rust (pgrx), the build used a different layer which installed the rust toolchain using rustup. Switch to using the same rust toolchain for both pgrx-based extensions and compute_ctl et al. Since we don't need anything else from the build-tools image, I switched to using the toolchain installed with rustup, and eliminated the dependency to build-tools altogether. The compute image build no longer depends on build-tools. Note: We no longer use 'mold' for linking compute_ctl et al, since mold is not included in the build-deps-with-cargo layer. We could add it there, but it doesn't seem worth it. I proposed stopping using mold altogether in https://github.com/neondatabase/neon/pull/10735, but that was rejected because 'mold' is faster for incremental builds. That doesn't matter much for docker builds however, since they're not incremental, and the compute binaries are not as large as the storage server binaries anyway. 
--- .github/workflows/build_and_test.yml | 4 +--- compute/compute-node.Dockerfile | 11 +++++------ 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 7faaed49c1..94f768719f 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -670,7 +670,7 @@ jobs: ghcr.io/neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm-arm64 compute-node-image-arch: - needs: [ check-permissions, build-build-tools-image, meta ] + needs: [ check-permissions, meta ] if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} permissions: id-token: write # aws-actions/configure-aws-credentials @@ -743,7 +743,6 @@ jobs: GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} PG_VERSION=${{ matrix.version.pg }} BUILD_TAG=${{ needs.meta.outputs.release-tag || needs.meta.outputs.build-tag }} - TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }} DEBIAN_VERSION=${{ matrix.version.debian }} provenance: false push: true @@ -763,7 +762,6 @@ jobs: GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} PG_VERSION=${{ matrix.version.pg }} BUILD_TAG=${{ needs.meta.outputs.release-tag || needs.meta.outputs.build-tag }} - TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }} DEBIAN_VERSION=${{ matrix.version.debian }} provenance: false push: true diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 13972269ae..7cd152f614 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -77,9 +77,6 @@ # build_and_test.yml github workflow for how that's done. ARG PG_VERSION -ARG REPOSITORY=ghcr.io/neondatabase -ARG IMAGE=build-tools -ARG TAG=pinned ARG BUILD_TAG ARG DEBIAN_VERSION=bookworm ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim @@ -150,6 +147,7 @@ RUN case $DEBIAN_VERSION in \ zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget ca-certificates pkg-config libssl-dev \ libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd curl unzip g++ \ libclang-dev \ + jsonnet \ $VERSION_INSTALLS \ && apt clean && rm -rf /var/lib/apt/lists/* && \ useradd -ms /bin/bash nonroot -b /home @@ -1724,7 +1722,7 @@ FROM extensions-${EXTENSIONS} AS neon-pg-ext-build # Compile the Neon-specific `compute_ctl`, `fast_import`, and `local_proxy` binaries # ######################################################################################### -FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools +FROM build-deps-with-cargo AS compute-tools ARG BUILD_TAG ENV BUILD_TAG=$BUILD_TAG @@ -1734,7 +1732,7 @@ COPY --chown=nonroot . . 
RUN --mount=type=cache,uid=1000,target=/home/nonroot/.cargo/registry \ --mount=type=cache,uid=1000,target=/home/nonroot/.cargo/git \ --mount=type=cache,uid=1000,target=/home/nonroot/target \ - mold -run cargo build --locked --profile release-line-debug-size-lto --bin compute_ctl --bin fast_import --bin local_proxy && \ + cargo build --locked --profile release-line-debug-size-lto --bin compute_ctl --bin fast_import --bin local_proxy && \ mkdir target-bin && \ cp target/release-line-debug-size-lto/compute_ctl \ target/release-line-debug-size-lto/fast_import \ @@ -1828,10 +1826,11 @@ RUN rm /usr/local/pgsql/lib/lib*.a # Preprocess the sql_exporter configuration files # ######################################################################################### -FROM $REPOSITORY/$IMAGE:$TAG AS sql_exporter_preprocessor +FROM build-deps AS sql_exporter_preprocessor ARG PG_VERSION USER nonroot +WORKDIR /home/nonroot COPY --chown=nonroot compute compute From 7e41ef1bec4e7fc7a3b3e38dbdb90162a2e8d598 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 23 Jun 2025 14:41:11 +0200 Subject: [PATCH 098/124] pageserver: set gRPC basebackup chunk size to 256 KB (#12314) gRPC base backups send a stream of fixed-size 64KB chunks. pagebench basebackup with compression enabled shows this to reduce throughput: * 64 KB: 55 RPS * 128 KB: 69 RPS * 256 KB: 73 RPS * 1024 KB: 73 RPS This patch sets the base backup chunk size to 256 KB. --- pageserver/src/page_service.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 642b447e5f..032db34983 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -3544,8 +3544,9 @@ impl proto::PageService for GrpcPageServiceHandler { &self, req: tonic::Request, ) -> Result, tonic::Status> { - // Send 64 KB chunks to avoid large memory allocations. - const CHUNK_SIZE: usize = 64 * 1024; + // Send chunks of 256 KB to avoid large memory allocations. pagebench basebackup shows this + // to be the sweet spot where throughput is saturated. + const CHUNK_SIZE: usize = 256 * 1024; let timeline = self.get_request_timeline(&req).await?; let ctx = self.ctx.with_scope_timeline(&timeline); From 0e490f3be773a1871b6763351843190723b4ba3d Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 23 Jun 2025 16:17:30 +0300 Subject: [PATCH 099/124] pageserver: allow concurrent rw IO on in-mem layer (#12151) ## Problem Previously, we couldn't read from an in-memory layer while a batch was being written to it. Vice-versa, we couldn't write to it while there was an on-going read. ## Summary of Changes The goal of this change is to improve concurrency. Writes happened through a &mut self method so the enforcement was at the type system level. We attempt to improve by: 1. Adding interior mutability to EphemeralLayer. This involves wrapping the buffered writer in a read-write lock. 2. Minimise the time that the read lock is held for. Only hold the read lock while reading from the buffers (recently flushed or pending flush). If we need to read from the file, drop the lock and allow IO to be concurrent. The new benchmark variants with concurrent reads improve between 70 to 200 percent (against main). Benchmark results are in this [commit](https://github.com/neondatabase/neon/pull/12151/commits/891f094ce6fe6b9fdde7abd8183d59f8698804e5). ## Future Changes We can push the interior mutability into the buffered writer. 
The mutable tail goes under a read lock, the flushed part goes into an ArcSwap and then we can read from anything that is flushed _without_ any locking. --- pageserver/Cargo.toml | 4 + pageserver/benches/bench_ingest.rs | 156 +++++++++++++-- pageserver/src/metrics.rs | 4 +- pageserver/src/tenant/ephemeral_file.rs | 182 +++++++++++------- pageserver/src/tenant/storage_layer.rs | 84 ++++++-- .../tenant/storage_layer/inmemory_layer.rs | 127 ++++-------- pageserver/src/tenant/timeline.rs | 17 +- 7 files changed, 366 insertions(+), 208 deletions(-) diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 606ba9ad8c..8a2e2ed3be 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -12,6 +12,9 @@ testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing", " fuzz-read-path = ["testing"] +# Enables benchmarking only APIs +benchmarking = [] + [dependencies] anyhow.workspace = true arc-swap.workspace = true @@ -127,6 +130,7 @@ harness = false [[bench]] name = "bench_ingest" harness = false +required-features = ["benchmarking"] [[bench]] name = "upload_queue" diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index 681d135e09..438c6e235e 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -1,22 +1,29 @@ use std::env; use std::num::NonZeroUsize; +use std::sync::Arc; use bytes::Bytes; use camino::Utf8PathBuf; use criterion::{Criterion, criterion_group, criterion_main}; +use futures::stream::FuturesUnordered; use pageserver::config::PageServerConf; use pageserver::context::{DownloadBehavior, RequestContext}; +use pageserver::keyspace::KeySpace; use pageserver::l0_flush::{L0FlushConfig, L0FlushGlobalState}; use pageserver::task_mgr::TaskKind; -use pageserver::tenant::storage_layer::InMemoryLayer; +use pageserver::tenant::storage_layer::IoConcurrency; +use pageserver::tenant::storage_layer::{InMemoryLayer, ValuesReconstructState}; use pageserver::{page_cache, virtual_file}; +use pageserver_api::config::GetVectoredConcurrentIo; use pageserver_api::key::Key; use pageserver_api::models::virtual_file::IoMode; use pageserver_api::shard::TenantShardId; -use strum::IntoEnumIterator; +use tokio_stream::StreamExt; use tokio_util::sync::CancellationToken; use utils::bin_ser::BeSer; use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; +use utils::sync::gate::Gate; use wal_decoder::models::value::Value; use wal_decoder::serialized_batch::SerializedValueBatch; @@ -30,7 +37,7 @@ fn murmurhash32(mut h: u32) -> u32 { h } -#[derive(serde::Serialize, Clone, Copy, Debug)] +#[derive(serde::Serialize, Clone, Copy, Debug, PartialEq)] enum KeyLayout { /// Sequential unique keys Sequential, @@ -40,19 +47,30 @@ enum KeyLayout { RandomReuse(u32), } -#[derive(serde::Serialize, Clone, Copy, Debug)] +#[derive(serde::Serialize, Clone, Copy, Debug, PartialEq)] enum WriteDelta { Yes, No, } +#[derive(serde::Serialize, Clone, Copy, Debug, PartialEq)] +enum ConcurrentReads { + Yes, + No, +} + async fn ingest( conf: &'static PageServerConf, put_size: usize, put_count: usize, key_layout: KeyLayout, write_delta: WriteDelta, + concurrent_reads: ConcurrentReads, ) -> anyhow::Result<()> { + if concurrent_reads == ConcurrentReads::Yes { + assert_eq!(key_layout, KeyLayout::Sequential); + } + let mut lsn = utils::lsn::Lsn(1000); let mut key = Key::from_i128(0x0); @@ -68,16 +86,18 @@ async fn ingest( let gate = utils::sync::gate::Gate::default(); let cancel = CancellationToken::new(); - let layer = InMemoryLayer::create( - conf, - 
timeline_id, - tenant_shard_id, - lsn, - &gate, - &cancel, - &ctx, - ) - .await?; + let layer = Arc::new( + InMemoryLayer::create( + conf, + timeline_id, + tenant_shard_id, + lsn, + &gate, + &cancel, + &ctx, + ) + .await?, + ); let data = Value::Image(Bytes::from(vec![0u8; put_size])); let data_ser_size = data.serialized_size().unwrap() as usize; @@ -86,6 +106,61 @@ async fn ingest( pageserver::context::DownloadBehavior::Download, ); + const READ_BATCH_SIZE: u32 = 32; + let (tx, mut rx) = tokio::sync::watch::channel::>(None); + let reader_cancel = CancellationToken::new(); + let reader_handle = if concurrent_reads == ConcurrentReads::Yes { + Some(tokio::task::spawn({ + let cancel = reader_cancel.clone(); + let layer = layer.clone(); + let ctx = ctx.attached_child(); + async move { + let gate = Gate::default(); + let gate_guard = gate.enter().unwrap(); + let io_concurrency = IoConcurrency::spawn_from_conf( + GetVectoredConcurrentIo::SidecarTask, + gate_guard, + ); + + rx.wait_for(|key| key.is_some()).await.unwrap(); + + while !cancel.is_cancelled() { + let key = match *rx.borrow() { + Some(some) => some, + None => unreachable!(), + }; + + let mut start_key = key; + start_key.field6 = key.field6.saturating_sub(READ_BATCH_SIZE); + let key_range = start_key..key.next(); + + let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone()); + + layer + .get_values_reconstruct_data( + KeySpace::single(key_range), + Lsn(1)..Lsn(u64::MAX), + &mut reconstruct_state, + &ctx, + ) + .await + .unwrap(); + + let mut collect_futs = std::mem::take(&mut reconstruct_state.keys) + .into_values() + .map(|state| state.sink_pending_ios()) + .collect::>(); + while collect_futs.next().await.is_some() {} + } + + drop(io_concurrency); + gate.close().await; + } + })) + } else { + None + }; + const BATCH_SIZE: usize = 16; let mut batch = Vec::new(); @@ -113,19 +188,27 @@ async fn ingest( batch.push((key.to_compact(), lsn, data_ser_size, data.clone())); if batch.len() >= BATCH_SIZE { + let last_key = Key::from_compact(batch.last().unwrap().0); + let this_batch = std::mem::take(&mut batch); let serialized = SerializedValueBatch::from_values(this_batch); layer.put_batch(serialized, &ctx).await?; + + tx.send(Some(last_key)).unwrap(); } } if !batch.is_empty() { + let last_key = Key::from_compact(batch.last().unwrap().0); + let this_batch = std::mem::take(&mut batch); let serialized = SerializedValueBatch::from_values(this_batch); layer.put_batch(serialized, &ctx).await?; + + tx.send(Some(last_key)).unwrap(); } layer.freeze(lsn + 1).await; - if matches!(write_delta, WriteDelta::Yes) { + if write_delta == WriteDelta::Yes { let l0_flush_state = L0FlushGlobalState::new(L0FlushConfig::Direct { max_concurrency: NonZeroUsize::new(1).unwrap(), }); @@ -136,6 +219,11 @@ async fn ingest( tokio::fs::remove_file(path).await?; } + reader_cancel.cancel(); + if let Some(handle) = reader_handle { + handle.await.unwrap(); + } + Ok(()) } @@ -147,6 +235,7 @@ fn ingest_main( put_count: usize, key_layout: KeyLayout, write_delta: WriteDelta, + concurrent_reads: ConcurrentReads, ) { pageserver::virtual_file::set_io_mode(io_mode); @@ -156,7 +245,15 @@ fn ingest_main( .unwrap(); runtime.block_on(async move { - let r = ingest(conf, put_size, put_count, key_layout, write_delta).await; + let r = ingest( + conf, + put_size, + put_count, + key_layout, + write_delta, + concurrent_reads, + ) + .await; if let Err(e) = r { panic!("{e:?}"); } @@ -195,6 +292,7 @@ fn criterion_benchmark(c: &mut Criterion) { key_size: usize, key_layout: 
KeyLayout, write_delta: WriteDelta, + concurrent_reads: ConcurrentReads, } #[derive(Clone)] struct HandPickedParameters { @@ -245,7 +343,7 @@ fn criterion_benchmark(c: &mut Criterion) { ]; let exploded_parameters = { let mut out = Vec::new(); - for io_mode in IoMode::iter() { + for concurrent_reads in [ConcurrentReads::Yes, ConcurrentReads::No] { for param in expect.clone() { let HandPickedParameters { volume_mib, @@ -253,12 +351,18 @@ fn criterion_benchmark(c: &mut Criterion) { key_layout, write_delta, } = param; + + if key_layout != KeyLayout::Sequential && concurrent_reads == ConcurrentReads::Yes { + continue; + } + out.push(ExplodedParameters { - io_mode, + io_mode: IoMode::DirectRw, volume_mib, key_size, key_layout, write_delta, + concurrent_reads, }); } } @@ -272,9 +376,10 @@ fn criterion_benchmark(c: &mut Criterion) { key_size, key_layout, write_delta, + concurrent_reads, } = self; format!( - "io_mode={io_mode:?} volume_mib={volume_mib:?} key_size_bytes={key_size:?} key_layout={key_layout:?} write_delta={write_delta:?}" + "io_mode={io_mode:?} volume_mib={volume_mib:?} key_size_bytes={key_size:?} key_layout={key_layout:?} write_delta={write_delta:?} concurrent_reads={concurrent_reads:?}" ) } } @@ -287,12 +392,23 @@ fn criterion_benchmark(c: &mut Criterion) { key_size, key_layout, write_delta, + concurrent_reads, } = params; let put_count = volume_mib * 1024 * 1024 / key_size; group.throughput(criterion::Throughput::Bytes((key_size * put_count) as u64)); group.sample_size(10); group.bench_function(id, |b| { - b.iter(|| ingest_main(conf, io_mode, key_size, put_count, key_layout, write_delta)) + b.iter(|| { + ingest_main( + conf, + io_mode, + key_size, + put_count, + key_layout, + write_delta, + concurrent_reads, + ) + }) }); } } diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index bf54614baa..8d6d342cf9 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -3426,7 +3426,7 @@ impl TimelineMetrics { pub fn dec_frozen_layer(&self, layer: &InMemoryLayer) { assert!(matches!(layer.info(), InMemoryLayerInfo::Frozen { .. })); let labels = self.make_frozen_layer_labels(layer); - let size = layer.try_len().expect("frozen layer should have no writer"); + let size = layer.len(); TIMELINE_LAYER_COUNT .get_metric_with_label_values(&labels) .unwrap() @@ -3441,7 +3441,7 @@ impl TimelineMetrics { pub fn inc_frozen_layer(&self, layer: &InMemoryLayer) { assert!(matches!(layer.info(), InMemoryLayerInfo::Frozen { .. 
})); let labels = self.make_frozen_layer_labels(layer); - let size = layer.try_len().expect("frozen layer should have no writer"); + let size = layer.len(); TIMELINE_LAYER_COUNT .get_metric_with_label_values(&labels) .unwrap() diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 2edf22e9fd..203b5bf592 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -3,7 +3,7 @@ use std::io; use std::sync::Arc; -use std::sync::atomic::AtomicU64; +use std::sync::atomic::{AtomicU64, Ordering}; use camino::Utf8PathBuf; use num_traits::Num; @@ -18,6 +18,7 @@ use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64}; use crate::config::PageServerConf; use crate::context::RequestContext; use crate::page_cache; +use crate::tenant::storage_layer::inmemory_layer::GlobalResourceUnits; use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File; use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAlignedMut; use crate::virtual_file::owned_buffers_io::slice::SliceMutExt; @@ -30,9 +31,13 @@ pub struct EphemeralFile { _tenant_shard_id: TenantShardId, _timeline_id: TimelineId, page_cache_file_id: page_cache::FileId, - bytes_written: u64, file: TempVirtualFileCoOwnedByEphemeralFileAndBufferedWriter, - buffered_writer: BufferedWriter, + + buffered_writer: tokio::sync::RwLock, + + bytes_written: AtomicU64, + + resource_units: std::sync::Mutex, } type BufferedWriter = owned_buffers_io::write::BufferedWriter< @@ -94,9 +99,8 @@ impl EphemeralFile { _tenant_shard_id: tenant_shard_id, _timeline_id: timeline_id, page_cache_file_id, - bytes_written: 0, file: file.clone(), - buffered_writer: BufferedWriter::new( + buffered_writer: tokio::sync::RwLock::new(BufferedWriter::new( file, 0, || IoBufferMut::with_capacity(TAIL_SZ), @@ -104,7 +108,9 @@ impl EphemeralFile { cancel.child_token(), ctx, info_span!(parent: None, "ephemeral_file_buffered_writer", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), timeline_id=%timeline_id, path = %filename), - ), + )), + bytes_written: AtomicU64::new(0), + resource_units: std::sync::Mutex::new(GlobalResourceUnits::new()), }) } } @@ -151,15 +157,17 @@ impl std::ops::Deref for TempVirtualFileCoOwnedByEphemeralFileAndBufferedWriter #[derive(Debug, thiserror::Error)] pub(crate) enum EphemeralFileWriteError { - #[error("{0}")] - TooLong(String), #[error("cancelled")] Cancelled, } impl EphemeralFile { pub(crate) fn len(&self) -> u64 { - self.bytes_written + // TODO(vlad): The value returned here is not always correct if + // we have more than one concurrent writer. Writes are always + // sequenced, but we could grab the buffered writer lock if we wanted + // to. + self.bytes_written.load(Ordering::Acquire) } pub(crate) fn page_cache_file_id(&self) -> page_cache::FileId { @@ -186,7 +194,7 @@ impl EphemeralFile { /// Panics if the write is short because there's no way we can recover from that. /// TODO: make upstack handle this as an error. 
pub(crate) async fn write_raw( - &mut self, + &self, srcbuf: &[u8], ctx: &RequestContext, ) -> Result { @@ -198,22 +206,13 @@ impl EphemeralFile { } async fn write_raw_controlled( - &mut self, + &self, srcbuf: &[u8], ctx: &RequestContext, ) -> Result<(u64, Option), EphemeralFileWriteError> { - let pos = self.bytes_written; + let mut writer = self.buffered_writer.write().await; - let new_bytes_written = pos.checked_add(srcbuf.len().into_u64()).ok_or_else(|| { - EphemeralFileWriteError::TooLong(format!( - "write would grow EphemeralFile beyond u64::MAX: len={pos} writen={srcbuf_len}", - srcbuf_len = srcbuf.len(), - )) - })?; - - // Write the payload - let (nwritten, control) = self - .buffered_writer + let (nwritten, control) = writer .write_buffered_borrowed_controlled(srcbuf, ctx) .await .map_err(|e| match e { @@ -225,43 +224,69 @@ impl EphemeralFile { "buffered writer has no short writes" ); - self.bytes_written = new_bytes_written; + // There's no realistic risk of overflow here. We won't have exabytes sized files on disk. + let pos = self + .bytes_written + .fetch_add(srcbuf.len().into_u64(), Ordering::AcqRel); + + let mut resource_units = self.resource_units.lock().unwrap(); + resource_units.maybe_publish_size(self.bytes_written.load(Ordering::Relaxed)); Ok((pos, control)) } + + pub(crate) fn tick(&self) -> Option { + let mut resource_units = self.resource_units.lock().unwrap(); + let len = self.bytes_written.load(Ordering::Relaxed); + resource_units.publish_size(len) + } } impl super::storage_layer::inmemory_layer::vectored_dio_read::File for EphemeralFile { async fn read_exact_at_eof_ok( &self, start: u64, - dst: tokio_epoll_uring::Slice, + mut dst: tokio_epoll_uring::Slice, ctx: &RequestContext, ) -> std::io::Result<(tokio_epoll_uring::Slice, usize)> { - let submitted_offset = self.buffered_writer.bytes_submitted(); + // We will fill the slice in back to front. Hence, we need + // the slice to be fully initialized. + // TODO(vlad): Is there a nicer way of doing this? + dst.as_mut_rust_slice_full_zeroed(); - let mutable = match self.buffered_writer.inspect_mutable() { - Some(mutable) => &mutable[0..mutable.pending()], - None => { - // Timeline::cancel and hence buffered writer flush was cancelled. - // Remain read-available while timeline is shutting down. - &[] - } - }; + let writer = self.buffered_writer.read().await; - let maybe_flushed = self.buffered_writer.inspect_maybe_flushed(); + // Read bytes written while under lock. This is a hack to deal with concurrent + // writes updating the number of bytes written. `bytes_written` is not DIO alligned + // but we may end the read there. + // + // TODO(vlad): Feels like there's a nicer path where we align the end if it + // shoots over the end of the file. + let bytes_written = self.bytes_written.load(Ordering::Acquire); let dst_cap = dst.bytes_total().into_u64(); let end = { // saturating_add is correct here because the max file size is u64::MAX, so, // if start + dst.len() > u64::MAX, then we know it will be a short read let mut end: u64 = start.saturating_add(dst_cap); - if end > self.bytes_written { - end = self.bytes_written; + if end > bytes_written { + end = bytes_written; } end }; + let submitted_offset = writer.bytes_submitted(); + let maybe_flushed = writer.inspect_maybe_flushed(); + + let mutable = match writer.inspect_mutable() { + Some(mutable) => &mutable[0..mutable.pending()], + None => { + // Timeline::cancel and hence buffered writer flush was cancelled. + // Remain read-available while timeline is shutting down. 
+ &[] + } + }; + // inclusive, exclusive #[derive(Debug)] struct Range(N, N); @@ -306,13 +331,33 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral let mutable_range = Range(std::cmp::max(start, submitted_offset), end); - let dst = if written_range.len() > 0 { + // There are three sources from which we might have to read data: + // 1. The file itself + // 2. The buffer which contains changes currently being flushed + // 3. The buffer which contains chnages yet to be flushed + // + // For better concurrency, we do them in reverse order: perform the in-memory + // reads while holding the writer lock, drop the writer lock and read from the + // file if required. + + let dst = if mutable_range.len() > 0 { + let offset_in_buffer = mutable_range + .0 + .checked_sub(submitted_offset) + .unwrap() + .into_usize(); + let to_copy = + &mutable[offset_in_buffer..(offset_in_buffer + mutable_range.len().into_usize())]; let bounds = dst.bounds(); - let slice = self - .file - .read_exact_at(dst.slice(0..written_range.len().into_usize()), start, ctx) - .await?; - Slice::from_buf_bounds(Slice::into_inner(slice), bounds) + let mut view = dst.slice({ + let start = + written_range.len().into_usize() + maybe_flushed_range.len().into_usize(); + let end = start.checked_add(mutable_range.len().into_usize()).unwrap(); + start..end + }); + view.as_mut_rust_slice_full_zeroed() + .copy_from_slice(to_copy); + Slice::from_buf_bounds(Slice::into_inner(view), bounds) } else { dst }; @@ -342,24 +387,15 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral dst }; - let dst = if mutable_range.len() > 0 { - let offset_in_buffer = mutable_range - .0 - .checked_sub(submitted_offset) - .unwrap() - .into_usize(); - let to_copy = - &mutable[offset_in_buffer..(offset_in_buffer + mutable_range.len().into_usize())]; + drop(writer); + + let dst = if written_range.len() > 0 { let bounds = dst.bounds(); - let mut view = dst.slice({ - let start = - written_range.len().into_usize() + maybe_flushed_range.len().into_usize(); - let end = start.checked_add(mutable_range.len().into_usize()).unwrap(); - start..end - }); - view.as_mut_rust_slice_full_zeroed() - .copy_from_slice(to_copy); - Slice::from_buf_bounds(Slice::into_inner(view), bounds) + let slice = self + .file + .read_exact_at(dst.slice(0..written_range.len().into_usize()), start, ctx) + .await?; + Slice::from_buf_bounds(Slice::into_inner(slice), bounds) } else { dst }; @@ -460,13 +496,15 @@ mod tests { let gate = utils::sync::gate::Gate::default(); let cancel = CancellationToken::new(); - let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx) + let file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx) .await .unwrap(); - let mutable = file.buffered_writer.mutable(); + let writer = file.buffered_writer.read().await; + let mutable = writer.mutable(); let cap = mutable.capacity(); let align = mutable.align(); + drop(writer); let write_nbytes = cap * 2 + cap / 2; @@ -504,10 +542,11 @@ mod tests { let file_contents = std::fs::read(file.file.path()).unwrap(); assert!(file_contents == content[0..cap * 2]); - let maybe_flushed_buffer_contents = file.buffered_writer.inspect_maybe_flushed().unwrap(); + let writer = file.buffered_writer.read().await; + let maybe_flushed_buffer_contents = writer.inspect_maybe_flushed().unwrap(); assert_eq!(&maybe_flushed_buffer_contents[..], &content[cap..cap * 2]); - let mutable_buffer_contents = file.buffered_writer.mutable(); + let 
mutable_buffer_contents = writer.mutable(); assert_eq!(mutable_buffer_contents, &content[cap * 2..write_nbytes]); } @@ -517,12 +556,14 @@ mod tests { let gate = utils::sync::gate::Gate::default(); let cancel = CancellationToken::new(); - let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx) + let file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx) .await .unwrap(); // mutable buffer and maybe_flushed buffer each has `cap` bytes. - let cap = file.buffered_writer.mutable().capacity(); + let writer = file.buffered_writer.read().await; + let cap = writer.mutable().capacity(); + drop(writer); let content: Vec = rand::thread_rng() .sample_iter(rand::distributions::Standard) @@ -540,12 +581,13 @@ mod tests { 2 * cap.into_u64(), "buffered writer requires one write to be flushed if we write 2.5x buffer capacity" ); + let writer = file.buffered_writer.read().await; assert_eq!( - &file.buffered_writer.inspect_maybe_flushed().unwrap()[0..cap], + &writer.inspect_maybe_flushed().unwrap()[0..cap], &content[cap..cap * 2] ); assert_eq!( - &file.buffered_writer.mutable()[0..cap / 2], + &writer.mutable()[0..cap / 2], &content[cap * 2..cap * 2 + cap / 2] ); } @@ -563,13 +605,15 @@ mod tests { let gate = utils::sync::gate::Gate::default(); let cancel = CancellationToken::new(); - let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx) + let file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx) .await .unwrap(); - let mutable = file.buffered_writer.mutable(); + let writer = file.buffered_writer.read().await; + let mutable = writer.mutable(); let cap = mutable.capacity(); let align = mutable.align(); + drop(writer); let content: Vec = rand::thread_rng() .sample_iter(rand::distributions::Standard) .take(cap * 2 + cap / 2) diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index e65d444f76..9fbb9d2438 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -109,7 +109,7 @@ pub(crate) enum OnDiskValue { /// Reconstruct data accumulated for a single key during a vectored get #[derive(Debug, Default)] -pub(crate) struct VectoredValueReconstructState { +pub struct VectoredValueReconstructState { pub(crate) on_disk_values: Vec<(Lsn, OnDiskValueIoWaiter)>, pub(crate) situation: ValueReconstructSituation, @@ -244,13 +244,60 @@ impl VectoredValueReconstructState { res } + + /// Benchmarking utility to await for the completion of all pending ios + /// + /// # Cancel-Safety + /// + /// Technically fine to stop polling this future, but, the IOs will still + /// be executed to completion by the sidecar task and hold on to / consume resources. + /// Better not do it to make reasonsing about the system easier. + #[cfg(feature = "benchmarking")] + pub async fn sink_pending_ios(self) -> Result<(), std::io::Error> { + let mut res = Ok(()); + + // We should try hard not to bail early, so that by the time we return from this + // function, all IO for this value is done. It's not required -- we could totally + // stop polling the IO futures in the sidecar task, they need to support that, + // but just stopping to poll doesn't reduce the IO load on the disk. It's easier + // to reason about the system if we just wait for all IO to complete, even if + // we're no longer interested in the result. 
+ // + // Revisit this when IO futures are replaced with a more sophisticated IO system + // and an IO scheduler, where we know which IOs were submitted and which ones + // just queued. Cf the comment on IoConcurrency::spawn_io. + for (_lsn, waiter) in self.on_disk_values { + let value_recv_res = waiter + .wait_completion() + // we rely on the caller to poll us to completion, so this is not a bail point + .await; + + match (&mut res, value_recv_res) { + (Err(_), _) => { + // We've already failed, no need to process more. + } + (Ok(_), Err(_wait_err)) => { + // This shouldn't happen - likely the sidecar task panicked. + unreachable!(); + } + (Ok(_), Ok(Err(err))) => { + let err: std::io::Error = err; + res = Err(err); + } + (Ok(_ok), Ok(Ok(OnDiskValue::RawImage(_img)))) => {} + (Ok(_ok), Ok(Ok(OnDiskValue::WalRecordOrImage(_buf)))) => {} + } + } + + res + } } /// Bag of data accumulated during a vectored get.. -pub(crate) struct ValuesReconstructState { +pub struct ValuesReconstructState { /// The keys will be removed after `get_vectored` completes. The caller outside `Timeline` /// should not expect to get anything from this hashmap. - pub(crate) keys: HashMap, + pub keys: HashMap, /// The keys which are already retrieved keys_done: KeySpaceRandomAccum, @@ -272,7 +319,7 @@ pub(crate) struct ValuesReconstructState { /// The desired end state is that we always do parallel IO. /// This struct and the dispatching in the impl will be removed once /// we've built enough confidence. -pub(crate) enum IoConcurrency { +pub enum IoConcurrency { Sequential, SidecarTask { task_id: usize, @@ -317,10 +364,7 @@ impl IoConcurrency { Self::spawn(SelectedIoConcurrency::Sequential) } - pub(crate) fn spawn_from_conf( - conf: GetVectoredConcurrentIo, - gate_guard: GateGuard, - ) -> IoConcurrency { + pub fn spawn_from_conf(conf: GetVectoredConcurrentIo, gate_guard: GateGuard) -> IoConcurrency { let selected = match conf { GetVectoredConcurrentIo::Sequential => SelectedIoConcurrency::Sequential, GetVectoredConcurrentIo::SidecarTask => SelectedIoConcurrency::SidecarTask(gate_guard), @@ -425,16 +469,6 @@ impl IoConcurrency { } } - pub(crate) fn clone(&self) -> Self { - match self { - IoConcurrency::Sequential => IoConcurrency::Sequential, - IoConcurrency::SidecarTask { task_id, ios_tx } => IoConcurrency::SidecarTask { - task_id: *task_id, - ios_tx: ios_tx.clone(), - }, - } - } - /// Submit an IO to be executed in the background. DEADLOCK RISK, read the full doc string. /// /// The IO is represented as an opaque future. @@ -573,6 +607,18 @@ impl IoConcurrency { } } +impl Clone for IoConcurrency { + fn clone(&self) -> Self { + match self { + IoConcurrency::Sequential => IoConcurrency::Sequential, + IoConcurrency::SidecarTask { task_id, ios_tx } => IoConcurrency::SidecarTask { + task_id: *task_id, + ios_tx: ios_tx.clone(), + }, + } + } +} + /// Make noise in case the [`ValuesReconstructState`] gets dropped while /// there are still IOs in flight. /// Refer to `collect_pending_ios` for why we prefer not to do that. 
@@ -603,7 +649,7 @@ impl Drop for ValuesReconstructState { } impl ValuesReconstructState { - pub(crate) fn new(io_concurrency: IoConcurrency) -> Self { + pub fn new(io_concurrency: IoConcurrency) -> Self { Self { keys: HashMap::new(), keys_done: KeySpaceRandomAccum::new(), diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 200beba115..8e5b0ba648 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -70,23 +70,15 @@ pub struct InMemoryLayer { /// We use a separate lock for the index to reduce the critical section /// during which reads cannot be planned. /// - /// If you need access to both the index and the underlying file at the same time, - /// respect the following locking order to avoid deadlocks: - /// 1. [`InMemoryLayer::inner`] - /// 2. [`InMemoryLayer::index`] - /// - /// Note that the file backing [`InMemoryLayer::inner`] is append-only, - /// so it is not necessary to hold simultaneous locks on index. - /// This avoids holding index locks across IO, and is crucial for avoiding read tail latency. + /// Note that the file backing [`InMemoryLayer::file`] is append-only, + /// so it is not necessary to hold a lock on the index while reading or writing from the file. /// In particular: - /// 1. It is safe to read and release [`InMemoryLayer::index`] before locking and reading from [`InMemoryLayer::inner`]. - /// 2. It is safe to write and release [`InMemoryLayer::inner`] before locking and updating [`InMemoryLayer::index`]. + /// 1. It is safe to read and release [`InMemoryLayer::index`] before reading from [`InMemoryLayer::file`]. + /// 2. It is safe to write to [`InMemoryLayer::file`] before locking and updating [`InMemoryLayer::index`]. index: RwLock>>, - /// The above fields never change, except for `end_lsn`, which is only set once, - /// and `index` (see rationale there). - /// All other changing parts are in `inner`, and protected by a mutex. - inner: RwLock, + /// Wrapper for the actual on-disk file. Uses interior mutability for concurrent reads/writes. + file: EphemeralFile, estimated_in_mem_size: AtomicU64, } @@ -96,20 +88,10 @@ impl std::fmt::Debug for InMemoryLayer { f.debug_struct("InMemoryLayer") .field("start_lsn", &self.start_lsn) .field("end_lsn", &self.end_lsn) - .field("inner", &self.inner) .finish() } } -pub struct InMemoryLayerInner { - /// The values are stored in a serialized format in this file. - /// Each serialized Value is preceded by a 'u32' length field. - /// PerSeg::page_versions map stores offsets into this file. - file: EphemeralFile, - - resource_units: GlobalResourceUnits, -} - /// Support the same max blob length as blob_io, because ultimately /// all the InMemoryLayer contents end up being written into a delta layer, /// using the [`crate::tenant::blob_io`]. @@ -258,12 +240,6 @@ struct IndexEntryUnpacked { pos: u64, } -impl std::fmt::Debug for InMemoryLayerInner { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("InMemoryLayerInner").finish() - } -} - /// State shared by all in-memory (ephemeral) layers. Updated infrequently during background ticks in Timeline, /// to minimize contention. 
/// @@ -280,7 +256,7 @@ pub(crate) struct GlobalResources { } // Per-timeline RAII struct for its contribution to [`GlobalResources`] -struct GlobalResourceUnits { +pub(crate) struct GlobalResourceUnits { // How many dirty bytes have I added to the global dirty_bytes: this guard object is responsible // for decrementing the global counter by this many bytes when dropped. dirty_bytes: u64, @@ -292,7 +268,7 @@ impl GlobalResourceUnits { // updated when the Timeline "ticks" in the background. const MAX_SIZE_DRIFT: u64 = 10 * 1024 * 1024; - fn new() -> Self { + pub(crate) fn new() -> Self { GLOBAL_RESOURCES .dirty_layers .fetch_add(1, AtomicOrdering::Relaxed); @@ -304,7 +280,7 @@ impl GlobalResourceUnits { /// /// Returns the effective layer size limit that should be applied, if any, to keep /// the total number of dirty bytes below the configured maximum. - fn publish_size(&mut self, size: u64) -> Option { + pub(crate) fn publish_size(&mut self, size: u64) -> Option { let new_global_dirty_bytes = match size.cmp(&self.dirty_bytes) { Ordering::Equal => GLOBAL_RESOURCES.dirty_bytes.load(AtomicOrdering::Relaxed), Ordering::Greater => { @@ -349,7 +325,7 @@ impl GlobalResourceUnits { // Call publish_size if the input size differs from last published size by more than // the drift limit - fn maybe_publish_size(&mut self, size: u64) { + pub(crate) fn maybe_publish_size(&mut self, size: u64) { let publish = match size.cmp(&self.dirty_bytes) { Ordering::Equal => false, Ordering::Greater => size - self.dirty_bytes > Self::MAX_SIZE_DRIFT, @@ -398,8 +374,8 @@ impl InMemoryLayer { } } - pub(crate) fn try_len(&self) -> Option { - self.inner.try_read().map(|i| i.file.len()).ok() + pub(crate) fn len(&self) -> u64 { + self.file.len() } pub(crate) fn assert_writable(&self) { @@ -430,7 +406,7 @@ impl InMemoryLayer { // Look up the keys in the provided keyspace and update // the reconstruct state with whatever is found. - pub(crate) async fn get_values_reconstruct_data( + pub async fn get_values_reconstruct_data( self: &Arc, keyspace: KeySpace, lsn_range: Range, @@ -479,14 +455,13 @@ impl InMemoryLayer { } } } - drop(index); // release the lock before we spawn the IO; if it's serial-mode IO we will deadlock on the read().await below + drop(index); // release the lock before we spawn the IO let read_from = Arc::clone(self); let read_ctx = ctx.attached_child(); reconstruct_state .spawn_io(async move { - let inner = read_from.inner.read().await; let f = vectored_dio_read::execute( - &inner.file, + &read_from.file, reads .iter() .flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)), @@ -518,7 +493,6 @@ impl InMemoryLayer { // This is kinda forced for InMemoryLayer because we need to inner.read() anyway, // but it's less obvious for DeltaLayer and ImageLayer. So, keep this explicit // drop for consistency among all three layer types. - drop(inner); drop(read_from); }) .await; @@ -549,12 +523,6 @@ impl std::fmt::Display for InMemoryLayer { } impl InMemoryLayer { - /// Get layer size. 
- pub async fn size(&self) -> Result { - let inner = self.inner.read().await; - Ok(inner.file.len()) - } - pub fn estimated_in_mem_size(&self) -> u64 { self.estimated_in_mem_size.load(AtomicOrdering::Relaxed) } @@ -587,10 +555,7 @@ impl InMemoryLayer { end_lsn: OnceLock::new(), opened_at: Instant::now(), index: RwLock::new(BTreeMap::new()), - inner: RwLock::new(InMemoryLayerInner { - file, - resource_units: GlobalResourceUnits::new(), - }), + file, estimated_in_mem_size: AtomicU64::new(0), }) } @@ -599,41 +564,37 @@ impl InMemoryLayer { /// /// Errors are not retryable, the [`InMemoryLayer`] must be discarded, and not be read from. /// The reason why it's not retryable is that the [`EphemeralFile`] writes are not retryable. + /// + /// This method shall not be called concurrently. We enforce this property via [`crate::tenant::Timeline::write_lock`]. + /// /// TODO: it can be made retryable if we aborted the process on EphemeralFile write errors. pub async fn put_batch( &self, serialized_batch: SerializedValueBatch, ctx: &RequestContext, ) -> anyhow::Result<()> { - let (base_offset, metadata) = { - let mut inner = self.inner.write().await; - self.assert_writable(); + self.assert_writable(); - let base_offset = inner.file.len(); + let base_offset = self.file.len(); - let SerializedValueBatch { - raw, - metadata, - max_lsn: _, - len: _, - } = serialized_batch; + let SerializedValueBatch { + raw, + metadata, + max_lsn: _, + len: _, + } = serialized_batch; - // Write the batch to the file - inner.file.write_raw(&raw, ctx).await?; - let new_size = inner.file.len(); + // Write the batch to the file + self.file.write_raw(&raw, ctx).await?; + let new_size = self.file.len(); - let expected_new_len = base_offset - .checked_add(raw.len().into_u64()) - // write_raw would error if we were to overflow u64. - // also IndexEntry and higher levels in - //the code don't allow the file to grow that large - .unwrap(); - assert_eq!(new_size, expected_new_len); - - inner.resource_units.maybe_publish_size(new_size); - - (base_offset, metadata) - }; + let expected_new_len = base_offset + .checked_add(raw.len().into_u64()) + // write_raw would error if we were to overflow u64. + // also IndexEntry and higher levels in + //the code don't allow the file to grow that large + .unwrap(); + assert_eq!(new_size, expected_new_len); // Update the index with the new entries let mut index = self.index.write().await; @@ -686,10 +647,8 @@ impl InMemoryLayer { self.opened_at } - pub(crate) async fn tick(&self) -> Option { - let mut inner = self.inner.write().await; - let size = inner.file.len(); - inner.resource_units.publish_size(size) + pub(crate) fn tick(&self) -> Option { + self.file.tick() } pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range, Lsn)]) -> Result<()> { @@ -753,12 +712,6 @@ impl InMemoryLayer { gate: &utils::sync::gate::Gate, cancel: CancellationToken, ) -> Result> { - // Grab the lock in read-mode. We hold it over the I/O, but because this - // layer is not writeable anymore, no one should be trying to acquire the - // write lock on it, so we shouldn't block anyone. See the comment on - // [`InMemoryLayer::freeze`] to understand how locking between the append path - // and layer flushing works. - let inner = self.inner.read().await; let index = self.index.read().await; use l0_flush::Inner; @@ -793,7 +746,7 @@ impl InMemoryLayer { match l0_flush_global_state { l0_flush::Inner::Direct { .. 
} => { - let file_contents = inner.file.load_to_io_buf(ctx).await?; + let file_contents = self.file.load_to_io_buf(ctx).await?; let file_contents = file_contents.freeze(); for (key, vec_map) in index.iter() { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c8a41f7875..1bb17af146 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -816,7 +816,7 @@ impl From for FlushLayerError { } #[derive(thiserror::Error, Debug)] -pub(crate) enum GetVectoredError { +pub enum GetVectoredError { #[error("timeline shutting down")] Cancelled, @@ -849,7 +849,7 @@ impl From for GetVectoredError { } #[derive(thiserror::Error, Debug)] -pub(crate) enum GetReadyAncestorError { +pub enum GetReadyAncestorError { #[error("ancestor LSN wait error")] AncestorLsnTimeout(#[from] WaitLsnError), @@ -939,7 +939,7 @@ impl std::fmt::Debug for Timeline { } #[derive(thiserror::Error, Debug, Clone)] -pub(crate) enum WaitLsnError { +pub enum WaitLsnError { // Called on a timeline which is shutting down #[error("Shutdown")] Shutdown, @@ -1902,16 +1902,11 @@ impl Timeline { return; }; - let Some(current_size) = open_layer.try_len() else { - // Unexpected: since we hold the write guard, nobody else should be writing to this layer, so - // read lock to get size should always succeed. - tracing::warn!("Lock conflict while reading size of open layer"); - return; - }; + let current_size = open_layer.len(); let current_lsn = self.get_last_record_lsn(); - let checkpoint_distance_override = open_layer.tick().await; + let checkpoint_distance_override = open_layer.tick(); if let Some(size_override) = checkpoint_distance_override { if current_size > size_override { @@ -7372,7 +7367,7 @@ impl TimelineWriter<'_> { .tl .get_layer_for_write(at, &self.write_guard, ctx) .await?; - let initial_size = layer.size().await?; + let initial_size = layer.len(); let last_freeze_at = self.last_freeze_at.load(); self.write_guard.replace(TimelineWriterState::new( From 8d711229c1381b09208e1feff780c4e90f23f784 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 23 Jun 2025 16:23:33 +0300 Subject: [PATCH 100/124] ci: Fix bogus skipping of 'make all' step in CI (#12318) The 'make all' step must run always. PR #12311 accidentally left the condition in there to skip it if there were no changes in postgres v14 sources. That condition belonged to a whole different step that was removed altogether in PR#12311, and the condition should've been removed too. Per CI failure: https://github.com/neondatabase/neon/actions/runs/15820148967/job/44587394469 --- .github/workflows/_build-and-test-locally.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index ff370ddb21..e2203a38ec 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -189,7 +189,6 @@ jobs: key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }} - name: Build all - if: steps.cache_pg_14.outputs.cache-hit != 'true' # Note: the Makefile picks up BUILD_TYPE and CARGO_PROFILE from the env variables run: mold -run make ${make_vars} all -j$(nproc) CARGO_BUILD_FLAGS="$CARGO_FLAGS" From 5e2c444525f36d23d8c4534ba973a8bce7b5172f Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." 
<4198311+skyzh@users.noreply.github.com> Date: Mon, 23 Jun 2025 09:51:21 -0400 Subject: [PATCH 101/124] fix(pageserver): reduce default feature flag refresh interval (#12246) ## Problem Part of #11813 ## Summary of changes The current interval is 30s and it costs a lot of $$$. This patch reduced it to 600s refresh interval (which means that it takes 10min for feature flags to propagate from UI to the pageserver). In the future we can let storcon retrieve the feature flags and push it to pageservers. We can consider creating a new release or we can postpone this to the week after the next week. --------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/config.rs | 4 ++++ libs/posthog_client_lite/src/background_loop.rs | 5 ++++- pageserver/src/feature_resolver.rs | 13 +++++++++---- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 1ecc17e04b..cfb1190a27 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -76,6 +76,10 @@ pub struct PostHogConfig { pub private_api_url: String, /// Public API URL pub public_api_url: String, + /// Refresh interval for the feature flag spec + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(with = "humantime_serde")] + pub refresh_interval: Option, } /// `pageserver.toml` diff --git a/libs/posthog_client_lite/src/background_loop.rs b/libs/posthog_client_lite/src/background_loop.rs index 693d62efc4..dc813ccb4a 100644 --- a/libs/posthog_client_lite/src/background_loop.rs +++ b/libs/posthog_client_lite/src/background_loop.rs @@ -36,7 +36,10 @@ impl FeatureResolverBackgroundLoop { // Main loop of updating the feature flags. handle.spawn( async move { - tracing::info!("Starting PostHog feature resolver"); + tracing::info!( + "Starting PostHog feature resolver with refresh period: {:?}", + refresh_period + ); let mut ticker = tokio::time::interval(refresh_period); ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); loop { diff --git a/pageserver/src/feature_resolver.rs b/pageserver/src/feature_resolver.rs index 84edd68011..b4e6f78bf2 100644 --- a/pageserver/src/feature_resolver.rs +++ b/pageserver/src/feature_resolver.rs @@ -12,6 +12,8 @@ use utils::id::TenantId; use crate::{config::PageServerConf, metrics::FEATURE_FLAG_EVALUATION}; +const DEFAULT_POSTHOG_REFRESH_INTERVAL: Duration = Duration::from_secs(600); + #[derive(Clone)] pub struct FeatureResolver { inner: Option>, @@ -139,10 +141,13 @@ impl FeatureResolver { } tenants }; - // TODO: make refresh period configurable - inner - .clone() - .spawn(handle, Duration::from_secs(60), fake_tenants); + inner.clone().spawn( + handle, + posthog_config + .refresh_interval + .unwrap_or(DEFAULT_POSTHOG_REFRESH_INTERVAL), + fake_tenants, + ); Ok(FeatureResolver { inner: Some(inner), internal_properties: Some(internal_properties), From 68a175d5455e0dbd01a014a89c5f289ab9777f0d Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 23 Jun 2025 17:33:45 +0200 Subject: [PATCH 102/124] test_runner: fix `test_basebackup_with_high_slru_count` gzip param (#12319) The `--gzip-probability` parameter was removed in #12250. However, `test_basebackup_with_high_slru_count` still uses it, and keeps failing. This patch removes the use of the parameter (gzip is enabled by default). 
--- .../pageserver/pagebench/test_large_slru_basebackup.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py index 8af52dcbd0..25dfd5277c 100644 --- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py +++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py @@ -146,8 +146,6 @@ def run_benchmark(env: NeonEnv, pg_bin: PgBin, record, duration_secs: int): ps_http.base_url, "--page-service-connstring", env.pageserver.connstr(password=None), - "--gzip-probability", - "1", "--runtime", f"{duration_secs}s", # don't specify the targets explicitly, let pagebench auto-discover them From 6c3aba7c44e070a25064b113651b934cb7460e67 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 23 Jun 2025 08:50:31 -0700 Subject: [PATCH 103/124] storcon: adjust AZ selection for heterogenous AZs (#12296) ## Problem The scheduler uses total shards per AZ to select the AZ for newly created or attached tenants. This makes bad decisions when we have different node counts per AZ -- we might have 2 very busy pageservers in one AZ, and 4 more lightly loaded pageservers in other AZs, and the scheduler picks the busy pageservers because the total shard count in their AZ is lower. ## Summary of changes - Divide the shard count by the number of nodes in the AZ when scoring in `get_az_for_new_tenant` --------- Co-authored-by: John Spray --- storage_controller/src/scheduler.rs | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 3fa25443da..b3656c33d4 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -825,6 +825,7 @@ impl Scheduler { struct AzScore { home_shard_count: usize, scheduleable: bool, + node_count: usize, } let mut azs: HashMap<&AvailabilityZone, AzScore> = HashMap::new(); @@ -832,6 +833,7 @@ impl Scheduler { let az = azs.entry(&node.az).or_default(); az.home_shard_count += node.home_shard_count; az.scheduleable |= matches!(node.may_schedule, MaySchedule::Yes(_)); + az.node_count += 1; } // If any AZs are schedulable, then filter out the non-schedulable ones (i.e. AZs where @@ -840,10 +842,20 @@ impl Scheduler { azs.retain(|_, i| i.scheduleable); } + // We will multiply up shard counts by the max node count for scoring, before dividing + // by per-node max node count, to get a normalized score that doesn't collapse to zero + // when the absolute shard count is less than the node count. + let max_node_count = azs.values().map(|i| i.node_count).max().unwrap_or(0); + // Find the AZ with the lowest number of shards currently allocated Some( azs.into_iter() - .min_by_key(|i| (i.1.home_shard_count, i.0)) + .min_by_key(|i| { + ( + (i.1.home_shard_count * max_node_count) / i.1.node_count, + i.0, + ) + }) .unwrap() .0 .clone(), From 85164422d0c70ff6cb9ff3219db035b426d0d78b Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 23 Jun 2025 13:31:53 -0400 Subject: [PATCH 104/124] feat(pageserver): support force overriding feature flags (#12233) ## Problem Part of #11813 ## Summary of changes Add a test API to make it easier to manipulate the feature flags within tests. 
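For illustration only (not part of this patch): a rough sketch of how the new override hook is expected to behave when exercised from inside the pageserver crate. The flag name `test-feature-flag`, the `resolver` handle, and `tenant_id` are placeholders assumed for the example.

```rust
// Force the flag on: boolean evaluation now succeeds regardless of PostHog state.
resolver.force_override_for_testing("test-feature-flag", Some("true"));
assert!(resolver.evaluate_boolean("test-feature-flag", tenant_id).is_ok());

// Force it off: boolean evaluation reports that no condition group matched.
resolver.force_override_for_testing("test-feature-flag", Some("false"));
assert!(resolver.evaluate_boolean("test-feature-flag", tenant_id).is_err());

// For multivariate flags, the overridden string is returned verbatim.
resolver.force_override_for_testing("test-feature-flag", Some("variant-a"));
assert_eq!(
    resolver
        .evaluate_multivariate("test-feature-flag", tenant_id)
        .unwrap(),
    "variant-a"
);

// Passing `None` removes the override and falls back to normal evaluation.
resolver.force_override_for_testing("test-feature-flag", None);
```

The regression test presumably drives the same behaviour over the pageserver management API through the new `force_override_feature_flag_for_testing_put`/`_delete` handlers rather than by calling the resolver directly.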
--------- Signed-off-by: Alex Chi Z --- pageserver/src/bin/pageserver.rs | 3 +- pageserver/src/feature_resolver.rs | 35 +++++++++++++++- pageserver/src/http/routes.rs | 49 ++++++++++++++++++++--- test_runner/fixtures/pageserver/http.py | 28 +++++++++++++ test_runner/regress/test_feature_flag.py | 51 ++++++++++++++++++++++++ 5 files changed, 159 insertions(+), 7 deletions(-) create mode 100644 test_runner/regress/test_feature_flag.py diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 417503089a..d137d651eb 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -583,7 +583,7 @@ fn start_pageserver( deletion_queue_client, l0_flush_global_state, basebackup_prepare_sender, - feature_resolver, + feature_resolver: feature_resolver.clone(), }, shutdown_pageserver.clone(), ); @@ -715,6 +715,7 @@ fn start_pageserver( disk_usage_eviction_state, deletion_queue.new_client(), secondary_controller, + feature_resolver, ) .context("Failed to initialize router state")?, ); diff --git a/pageserver/src/feature_resolver.rs b/pageserver/src/feature_resolver.rs index b4e6f78bf2..b0a68dfc4d 100644 --- a/pageserver/src/feature_resolver.rs +++ b/pageserver/src/feature_resolver.rs @@ -1,5 +1,6 @@ use std::{collections::HashMap, sync::Arc, time::Duration}; +use arc_swap::ArcSwap; use pageserver_api::config::NodeMetadata; use posthog_client_lite::{ CaptureEvent, FeatureResolverBackgroundLoop, PostHogClientConfig, PostHogEvaluationError, @@ -18,6 +19,7 @@ const DEFAULT_POSTHOG_REFRESH_INTERVAL: Duration = Duration::from_secs(600); pub struct FeatureResolver { inner: Option>, internal_properties: Option>>, + force_overrides_for_testing: Arc>>, } impl FeatureResolver { @@ -25,6 +27,7 @@ impl FeatureResolver { Self { inner: None, internal_properties: None, + force_overrides_for_testing: Arc::new(ArcSwap::new(Arc::new(HashMap::new()))), } } @@ -151,11 +154,13 @@ impl FeatureResolver { Ok(FeatureResolver { inner: Some(inner), internal_properties: Some(internal_properties), + force_overrides_for_testing: Arc::new(ArcSwap::new(Arc::new(HashMap::new()))), }) } else { Ok(FeatureResolver { inner: None, internal_properties: None, + force_overrides_for_testing: Arc::new(ArcSwap::new(Arc::new(HashMap::new()))), }) } } @@ -195,6 +200,11 @@ impl FeatureResolver { flag_key: &str, tenant_id: TenantId, ) -> Result { + let force_overrides = self.force_overrides_for_testing.load(); + if let Some(value) = force_overrides.get(flag_key) { + return Ok(value.clone()); + } + if let Some(inner) = &self.inner { let res = inner.feature_store().evaluate_multivariate( flag_key, @@ -233,6 +243,15 @@ impl FeatureResolver { flag_key: &str, tenant_id: TenantId, ) -> Result<(), PostHogEvaluationError> { + let force_overrides = self.force_overrides_for_testing.load(); + if let Some(value) = force_overrides.get(flag_key) { + return if value == "true" { + Ok(()) + } else { + Err(PostHogEvaluationError::NoConditionGroupMatched) + }; + } + if let Some(inner) = &self.inner { let res = inner.feature_store().evaluate_boolean( flag_key, @@ -264,8 +283,22 @@ impl FeatureResolver { inner.feature_store().is_feature_flag_boolean(flag_key) } else { Err(PostHogEvaluationError::NotAvailable( - "PostHog integration is not enabled".to_string(), + "PostHog integration is not enabled, cannot auto-determine the flag type" + .to_string(), )) } } + + /// Force override a feature flag for testing. This is only for testing purposes. Assume the caller only call it + /// from a single thread so it won't race. 
+ pub fn force_override_for_testing(&self, flag_key: &str, value: Option<&str>) { + let mut force_overrides = self.force_overrides_for_testing.load().as_ref().clone(); + if let Some(value) = value { + force_overrides.insert(flag_key.to_string(), value.to_string()); + } else { + force_overrides.remove(flag_key); + } + self.force_overrides_for_testing + .store(Arc::new(force_overrides)); + } } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 626986f580..2e9cd3ad70 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -59,6 +59,7 @@ use crate::config::PageServerConf; use crate::context; use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder}; use crate::deletion_queue::DeletionQueueClient; +use crate::feature_resolver::FeatureResolver; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::LocationConf; @@ -107,6 +108,7 @@ pub struct State { deletion_queue_client: DeletionQueueClient, secondary_controller: SecondaryController, latest_utilization: tokio::sync::Mutex>, + feature_resolver: FeatureResolver, } impl State { @@ -120,6 +122,7 @@ impl State { disk_usage_eviction_state: Arc, deletion_queue_client: DeletionQueueClient, secondary_controller: SecondaryController, + feature_resolver: FeatureResolver, ) -> anyhow::Result { let allowlist_routes = &[ "/v1/status", @@ -140,6 +143,7 @@ impl State { deletion_queue_client, secondary_controller, latest_utilization: Default::default(), + feature_resolver, }) } } @@ -3675,8 +3679,8 @@ async fn tenant_evaluate_feature_flag( let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; - let flag: String = must_parse_query_param(&request, "flag")?; - let as_type: String = must_parse_query_param(&request, "as")?; + let flag: String = parse_request_param(&request, "flag_key")?; + let as_type: Option = parse_query_param(&request, "as")?; let state = get_state(&request); @@ -3685,11 +3689,11 @@ async fn tenant_evaluate_feature_flag( .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; let properties = tenant.feature_resolver.collect_properties(tenant_shard_id.tenant_id); - if as_type == "boolean" { + if as_type.as_deref() == Some("boolean") { let result = tenant.feature_resolver.evaluate_boolean(&flag, tenant_shard_id.tenant_id); let result = result.map(|_| true).map_err(|e| e.to_string()); json_response(StatusCode::OK, json!({ "result": result, "properties": properties })) - } else if as_type == "multivariate" { + } else if as_type.as_deref() == Some("multivariate") { let result = tenant.feature_resolver.evaluate_multivariate(&flag, tenant_shard_id.tenant_id).map_err(|e| e.to_string()); json_response(StatusCode::OK, json!({ "result": result, "properties": properties })) } else { @@ -3709,6 +3713,35 @@ async fn tenant_evaluate_feature_flag( .await } +async fn force_override_feature_flag_for_testing_put( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + check_permission(&request, None)?; + + let flag: String = parse_request_param(&request, "flag_key")?; + let value: String = must_parse_query_param(&request, "value")?; + let state = get_state(&request); + state + .feature_resolver + .force_override_for_testing(&flag, Some(&value)); + json_response(StatusCode::OK, ()) +} + +async fn force_override_feature_flag_for_testing_delete( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + 
check_permission(&request, None)?; + + let flag: String = parse_request_param(&request, "flag_key")?; + let state = get_state(&request); + state + .feature_resolver + .force_override_for_testing(&flag, None); + json_response(StatusCode::OK, ()) +} + /// Common functionality of all the HTTP API handlers. /// /// - Adds a tracing span to each request (by `request_span`) @@ -4085,8 +4118,14 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/activate_post_import", |r| api_handler(r, activate_post_import_handler), ) - .get("/v1/tenant/:tenant_shard_id/feature_flag", |r| { + .get("/v1/tenant/:tenant_shard_id/feature_flag/:flag_key", |r| { api_handler(r, tenant_evaluate_feature_flag) }) + .put("/v1/feature_flag/:flag_key", |r| { + testing_api_handler("force override feature flag - put", r, force_override_feature_flag_for_testing_put) + }) + .delete("/v1/feature_flag/:flag_key", |r| { + testing_api_handler("force override feature flag - delete", r, force_override_feature_flag_for_testing_delete) + }) .any(handler_404)) } diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index c29192c25c..d9037f2d08 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -1219,3 +1219,31 @@ class PageserverHttpClient(requests.Session, MetricsGetter): ) self.verbose_error(res) return res.json() + + def force_override_feature_flag(self, flag: str, value: str | None = None): + if value is None: + res = self.delete( + f"http://localhost:{self.port}/v1/feature_flag/{flag}", + ) + else: + res = self.put( + f"http://localhost:{self.port}/v1/feature_flag/{flag}", + params={"value": value}, + ) + self.verbose_error(res) + + def evaluate_feature_flag_boolean(self, tenant_id: TenantId, flag: str) -> Any: + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/feature_flag/{flag}", + params={"as": "boolean"}, + ) + self.verbose_error(res) + return res.json() + + def evaluate_feature_flag_multivariate(self, tenant_id: TenantId, flag: str) -> Any: + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/feature_flag/{flag}", + params={"as": "multivariate"}, + ) + self.verbose_error(res) + return res.json() diff --git a/test_runner/regress/test_feature_flag.py b/test_runner/regress/test_feature_flag.py new file mode 100644 index 0000000000..2712d13dcc --- /dev/null +++ b/test_runner/regress/test_feature_flag.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from fixtures.utils import run_only_on_default_postgres + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnvBuilder + + +@run_only_on_default_postgres("Pageserver-only test only needs to run on one version") +def test_feature_flag(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + env.pageserver.http_client().force_override_feature_flag("test-feature-flag", "true") + assert env.pageserver.http_client().evaluate_feature_flag_boolean( + env.initial_tenant, "test-feature-flag" + )["result"]["Ok"] + assert ( + env.pageserver.http_client().evaluate_feature_flag_multivariate( + env.initial_tenant, "test-feature-flag" + )["result"]["Ok"] + == "true" + ) + + env.pageserver.http_client().force_override_feature_flag("test-feature-flag", "false") + assert ( + env.pageserver.http_client().evaluate_feature_flag_boolean( + env.initial_tenant, "test-feature-flag" + )["result"]["Err"] + == "No condition group is matched" + ) + assert ( + 
env.pageserver.http_client().evaluate_feature_flag_multivariate( + env.initial_tenant, "test-feature-flag" + )["result"]["Ok"] + == "false" + ) + + env.pageserver.http_client().force_override_feature_flag("test-feature-flag", None) + assert ( + "Err" + in env.pageserver.http_client().evaluate_feature_flag_boolean( + env.initial_tenant, "test-feature-flag" + )["result"] + ) + assert ( + "Err" + in env.pageserver.http_client().evaluate_feature_flag_multivariate( + env.initial_tenant, "test-feature-flag" + )["result"] + ) From 5eecde461d407e0fb63e911d5569d1cd1ffc0a9b Mon Sep 17 00:00:00 2001 From: Aleksandr Sarantsev <99037063+ephemeralsad@users.noreply.github.com> Date: Mon, 23 Jun 2025 22:55:26 +0400 Subject: [PATCH 105/124] storcon: Fix migration for Attached(0) tenants (#12256) ## Problem `Attached(0)` tenant migrations can get stuck if the heatmap file has not been uploaded. ## Summary of Changes - Added a test to reproduce the issue. - Introduced a `kick_secondary_downloads` config flag: - Enabled in testing environments. - Disabled in production (and in the new test). - Updated `Attached(0)` locations to consider the number of secondaries in their intent when deciding whether to download the heatmap. --- control_plane/src/local_env.rs | 3 ++ control_plane/src/storage_controller.rs | 4 ++ storage_controller/src/main.rs | 12 +++++ storage_controller/src/reconciler.rs | 7 +-- storage_controller/src/service.rs | 21 ++++++++- storage_controller/src/tenant_shard.rs | 9 +++- test_runner/fixtures/neon_fixtures.py | 11 +++++ .../regress/test_storage_controller.py | 47 +++++++++++++++++++ 8 files changed, 108 insertions(+), 6 deletions(-) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 387fc297f0..e8abde4901 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -211,6 +211,8 @@ pub struct NeonStorageControllerConf { pub use_local_compute_notifications: bool, pub timeline_safekeeper_count: Option, + + pub kick_secondary_downloads: Option, } impl NeonStorageControllerConf { @@ -242,6 +244,7 @@ impl Default for NeonStorageControllerConf { use_https_safekeeper_api: false, use_local_compute_notifications: true, timeline_safekeeper_count: None, + kick_secondary_downloads: None, } } } diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 95f7533057..334949924c 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -557,6 +557,10 @@ impl StorageController { args.push("--use-local-compute-notifications".to_string()); } + if let Some(value) = self.config.kick_secondary_downloads { + args.push(format!("--kick-secondary-downloads={value}")); + } + if let Some(ssl_ca_file) = self.env.ssl_ca_cert_path() { args.push(format!("--ssl-ca-file={}", ssl_ca_file.to_str().unwrap())); } diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index fc0ba9f28c..ff134a4ebc 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -5,6 +5,9 @@ use std::time::Duration; use anyhow::{Context, anyhow}; use camino::Utf8PathBuf; + +#[cfg(feature = "testing")] +use clap::ArgAction; use clap::Parser; use futures::future::OptionFuture; use http_utils::tls_certs::ReloadingCertificateResolver; @@ -213,6 +216,13 @@ struct Cli { /// This option exists primarily for testing purposes. 
#[arg(long, default_value = "3", value_parser = clap::value_parser!(i64).range(1..))] timeline_safekeeper_count: i64, + + /// When set, actively checks and initiates heatmap downloads/uploads during reconciliation. + /// This speed up migrations by avoiding the default wait for the heatmap download interval. + /// Primarily useful for testing to reduce test execution time. + #[cfg(feature = "testing")] + #[arg(long, default_value = "true", action=ArgAction::Set)] + kick_secondary_downloads: bool, } enum StrictMode { @@ -445,6 +455,8 @@ async fn async_main() -> anyhow::Result<()> { timelines_onto_safekeepers: args.timelines_onto_safekeepers, use_local_compute_notifications: args.use_local_compute_notifications, timeline_safekeeper_count: args.timeline_safekeeper_count, + #[cfg(feature = "testing")] + kick_secondary_downloads: args.kick_secondary_downloads, }; // Validate that we can connect to the database diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index b03a6dae04..92844c9c7b 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -856,6 +856,7 @@ impl Reconciler { &self.shard, &self.config, &self.placement_policy, + self.intent.secondary.len(), ); match self.observed.locations.get(&node.get_id()) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => { @@ -1235,11 +1236,11 @@ pub(crate) fn attached_location_conf( shard: &ShardIdentity, config: &TenantConfig, policy: &PlacementPolicy, + secondary_count: usize, ) -> LocationConfig { let has_secondaries = match policy { - PlacementPolicy::Attached(0) | PlacementPolicy::Detached | PlacementPolicy::Secondary => { - false - } + PlacementPolicy::Detached | PlacementPolicy::Secondary => false, + PlacementPolicy::Attached(0) => secondary_count > 0, PlacementPolicy::Attached(_) => true, }; diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 6ec3963c48..0eb87ffbe3 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -470,6 +470,9 @@ pub struct Config { /// Number of safekeepers to choose for a timeline when creating it. /// Safekeepers will be choosen from different availability zones. pub timeline_safekeeper_count: i64, + + #[cfg(feature = "testing")] + pub kick_secondary_downloads: bool, } impl From for ApiError { @@ -2064,6 +2067,7 @@ impl Service { &tenant_shard.shard, &tenant_shard.config, &PlacementPolicy::Attached(0), + tenant_shard.intent.get_secondary().len(), )), }, )]); @@ -5605,7 +5609,15 @@ impl Service { for parent_id in parent_ids { let child_ids = parent_id.split(new_shard_count); - let (pageserver, generation, policy, parent_ident, config, preferred_az) = { + let ( + pageserver, + generation, + policy, + parent_ident, + config, + preferred_az, + secondary_count, + ) = { let mut old_state = tenants .remove(&parent_id) .expect("It was present, we just split it"); @@ -5625,6 +5637,7 @@ impl Service { old_state.shard, old_state.config.clone(), old_state.preferred_az().cloned(), + old_state.intent.get_secondary().len(), ) }; @@ -5646,6 +5659,7 @@ impl Service { &child_shard, &config, &policy, + secondary_count, )), }, ); @@ -8373,6 +8387,11 @@ impl Service { /// we have this helper to move things along faster. 
#[cfg(feature = "testing")] async fn kick_secondary_download(&self, tenant_shard_id: TenantShardId) { + if !self.config.kick_secondary_downloads { + // No-op if kick_secondary_downloads functionaliuty is not configured + return; + } + let (attached_node, secondaries) = { let locked = self.inner.read().unwrap(); let Some(shard) = locked.tenants.get(&tenant_shard_id) else { diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index acd18734cf..789327bfaf 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -1381,8 +1381,13 @@ impl TenantShard { .generation .expect("Attempted to enter attached state without a generation"); - let wanted_conf = - attached_location_conf(generation, &self.shard, &self.config, &self.policy); + let wanted_conf = attached_location_conf( + generation, + &self.shard, + &self.config, + &self.policy, + self.intent.get_secondary().len(), + ); match self.observed.locations.get(&node_id) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {} Some(_) | None => { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 050d61055e..4eb85119ca 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -453,6 +453,7 @@ class NeonEnvBuilder: pageserver_get_vectored_concurrent_io: str | None = None, pageserver_tracing_config: PageserverTracingConfig | None = None, pageserver_import_config: PageserverImportConfig | None = None, + storcon_kick_secondary_downloads: bool | None = None, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -514,6 +515,8 @@ class NeonEnvBuilder: self.pageserver_tracing_config = pageserver_tracing_config self.pageserver_import_config = pageserver_import_config + self.storcon_kick_secondary_downloads = storcon_kick_secondary_downloads + self.pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None = ( pageserver_default_tenant_config_compaction_algorithm ) @@ -1221,6 +1224,14 @@ class NeonEnv: else: cfg["storage_controller"] = {"use_local_compute_notifications": False} + if config.storcon_kick_secondary_downloads is not None: + # Configure whether storage controller should actively kick off secondary downloads + if "storage_controller" not in cfg: + cfg["storage_controller"] = {} + cfg["storage_controller"]["kick_secondary_downloads"] = ( + config.storcon_kick_secondary_downloads + ) + # Create config for pageserver http_auth_type = "NeonJWT" if config.auth_enabled else "Trust" pg_auth_type = "NeonJWT" if config.auth_enabled else "Trust" diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 74ba74645e..be7f0c8a3e 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -4434,6 +4434,53 @@ def test_storage_controller_graceful_migration(neon_env_builder: NeonEnvBuilder, assert initial_ps.http_client().tenant_list_locations()["tenant_shards"] == [] +def test_attached_0_graceful_migration(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_pageservers = 4 + neon_env_builder.num_azs = 2 + + neon_env_builder.storcon_kick_secondary_downloads = False + + env = neon_env_builder.init_start() + + # It is default, but we want to ensure that there are no secondary locations requested + env.storage_controller.tenant_policy_update(env.initial_tenant, {"placement": {"Attached": 0}}) + env.storage_controller.reconcile_until_idle() + + 
desc = env.storage_controller.tenant_describe(env.initial_tenant)["shards"][0] + src_ps_id = desc["node_attached"] + src_ps = env.get_pageserver(src_ps_id) + src_az = desc["preferred_az_id"] + + # There must be no secondary locations with Attached(0) placement policy + assert len(desc["node_secondary"]) == 0 + + # Migrate tenant shard to the same AZ node + dst_ps = [ps for ps in env.pageservers if ps.id != src_ps_id and ps.az_id == src_az][0] + + env.storage_controller.tenant_shard_migrate( + TenantShardId(env.initial_tenant, 0, 0), + dst_ps.id, + config=StorageControllerMigrationConfig(prewarm=True), + ) + + def tenant_shard_migrated(): + src_locations = src_ps.http_client().tenant_list_locations()["tenant_shards"] + assert len(src_locations) == 0 + log.info(f"Tenant shard migrated from {src_ps.id}") + dst_locations = dst_ps.http_client().tenant_list_locations()["tenant_shards"] + assert len(dst_locations) == 1 + assert dst_locations[0][1]["mode"] == "AttachedSingle" + log.info(f"Tenant shard migrated to {dst_ps.id}") + + # After all we expect that tenant shard exists only on dst node. + # We wait so long because [`DEFAULT_HEATMAP_PERIOD`] and [`DEFAULT_DOWNLOAD_INTERVAL`] + # are set to 60 seconds by default. + # + # TODO: we should consider making these configurable, so the test can run faster. + wait_until(tenant_shard_migrated, timeout=180, interval=5, status_interval=10) + log.info("Tenant shard migrated successfully") + + @run_only_on_default_postgres("this is like a 'unit test' against storcon db") def test_storage_controller_migrate_with_pageserver_restart( neon_env_builder: NeonEnvBuilder, make_httpserver From 0efff1db26d14000278e409586f580493e31289d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 24 Jun 2025 00:26:38 +0200 Subject: [PATCH 106/124] Allow cancellation errors in tests that allow timeline deletion errors (#12315) After merging of PR https://github.com/neondatabase/neon/pull/11712 we saw some tests be flaky, with errors showing up about the timeline having been cancelled instead of having been deleted. This is an outcome that is inherently racy with the "has been deleted" error. In some instances, https://github.com/neondatabase/neon/pull/11712 has already added the error about the timeline having been cancelled. This PR adds them to the remaining instances of https://github.com/neondatabase/neon/pull/11712, fixing the flakiness. 
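Returning briefly to PATCH 105/124 (#12256): the core of that fix is the `has_secondaries` decision in `attached_location_conf`. The sketch below is a cut-down illustration, not the real storage controller types; the `Policy` enum here stands in for `PlacementPolicy` and only models the variants relevant to the heatmap decision.

```rust
// Simplified stand-in for the storage controller's PlacementPolicy.
enum Policy {
    Detached,
    Secondary,
    Attached(usize), // number of secondary locations the policy asks for
}

/// Should the attached location generate heatmaps for secondaries to download?
fn has_secondaries(policy: &Policy, secondary_count: usize) -> bool {
    match policy {
        Policy::Detached | Policy::Secondary => false,
        // Attached(0) normally has no secondaries, but during a graceful
        // migration the intent temporarily contains one that needs a heatmap
        // to prewarm; ignoring it is what previously got migrations stuck.
        Policy::Attached(0) => secondary_count > 0,
        Policy::Attached(_) => true,
    }
}

fn main() {
    assert!(!has_secondaries(&Policy::Attached(0), 0)); // steady state
    assert!(has_secondaries(&Policy::Attached(0), 1)); // mid-migration
    assert!(has_secondaries(&Policy::Attached(2), 0));
}
```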
--- test_runner/regress/test_storage_controller.py | 4 +++- test_runner/regress/test_timeline_detach_ancestor.py | 8 ++++++-- test_runner/regress/test_timeline_gc_blocking.py | 4 +++- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index be7f0c8a3e..70772766d7 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -3642,7 +3642,9 @@ def test_timeline_delete_mid_live_migration(neon_env_builder: NeonEnvBuilder, mi env.start() for ps in env.pageservers: - ps.allowed_errors.append(".*Timeline.* has been deleted.*") + ps.allowed_errors.extend( + [".*Timeline.* has been deleted.*", ".*Timeline.*was cancelled and cannot be used"] + ) tenant_id = TenantId.generate() timeline_id = TimelineId.generate() diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index c58f78aeb1..b5cc431afe 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -1099,7 +1099,9 @@ def test_timeline_detach_ancestor_interrupted_by_deletion( for ps in env.pageservers: ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) - ps.allowed_errors.append(".*Timeline.* has been deleted.*") + ps.allowed_errors.extend( + [".*Timeline.* has been deleted.*", ".*Timeline.*was cancelled and cannot be used"] + ) pageservers = dict((int(p.id), p) for p in env.pageservers) @@ -1221,7 +1223,9 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv for ps in env.pageservers: ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) - ps.allowed_errors.append(".*Timeline.* has been deleted.*") + ps.allowed_errors.extend( + [".*Timeline.* has been deleted.*", ".*Timeline.*was cancelled and cannot be used"] + ) pageservers = dict((int(p.id), p) for p in env.pageservers) diff --git a/test_runner/regress/test_timeline_gc_blocking.py b/test_runner/regress/test_timeline_gc_blocking.py index 8ef64a0742..daba8019b6 100644 --- a/test_runner/regress/test_timeline_gc_blocking.py +++ b/test_runner/regress/test_timeline_gc_blocking.py @@ -25,7 +25,9 @@ def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder, sharded: bool initial_tenant_shard_count=2 if sharded else None, ) for ps in env.pageservers: - ps.allowed_errors.append(".*Timeline.* has been deleted.*") + ps.allowed_errors.extend( + [".*Timeline.* has been deleted.*", ".*Timeline.*was cancelled and cannot be used"] + ) if sharded: http = env.storage_controller.pageserver_api() From a29772bf6eb4b43ad6d7a568a1356d4357e82bd8 Mon Sep 17 00:00:00 2001 From: Ivan Efremov Date: Tue, 24 Jun 2025 11:54:43 +0200 Subject: [PATCH 107/124] Create proxy-bench periodic run in CI (#12242) Currently run for test only via pushing to the test-proxy-bench branch. 
Relates to the #22681 --- .github/workflows/proxy-benchmark.yml | 83 ++++++++++++ scripts/ingest_perf_test_result.py | 2 +- scripts/proxy_bench_results_ingest.py | 187 ++++++++++++++++++++++++++ 3 files changed, 271 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/proxy-benchmark.yml create mode 100644 scripts/proxy_bench_results_ingest.py diff --git a/.github/workflows/proxy-benchmark.yml b/.github/workflows/proxy-benchmark.yml new file mode 100644 index 0000000000..75ecacaced --- /dev/null +++ b/.github/workflows/proxy-benchmark.yml @@ -0,0 +1,83 @@ +name: Periodic proxy performance test on unit-perf hetzner runner + +on: + push: # TODO: remove after testing + branches: + - test-proxy-bench # Runs on pushes to branches starting with test-proxy-bench + # schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + # - cron: '0 5 * * *' # Runs at 5 UTC once a day + workflow_dispatch: # adds an ability to run this manually + +defaults: + run: + shell: bash -euo pipefail {0} + +concurrency: + group: ${{ github.workflow }} + cancel-in-progress: false + +permissions: + contents: read + +jobs: + run_periodic_proxybench_test: + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: write + pull-requests: write + runs-on: [self-hosted, unit-perf] + timeout-minutes: 60 # 1h timeout + container: + image: ghcr.io/neondatabase/build-tools:pinned-bookworm + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + options: --init + steps: + - name: Checkout proxy-bench Repo + uses: actions/checkout@v4 + with: + repository: neondatabase/proxy-bench + path: proxy-bench + + - name: Set up the environment which depends on $RUNNER_TEMP on nvme drive + id: set-env + shell: bash -euxo pipefail {0} + run: | + PROXY_BENCH_PATH=$(realpath ./proxy-bench) + { + echo "PROXY_BENCH_PATH=$PROXY_BENCH_PATH" + echo "NEON_DIR=${RUNNER_TEMP}/neon" + echo "TEST_OUTPUT=${PROXY_BENCH_PATH}/test_output" + echo "" + } >> "$GITHUB_ENV" + + - name: Run proxy-bench + run: ./${PROXY_BENCH_PATH}/run.sh + + - name: Ingest Bench Results # neon repo script + if: success() + run: | + mkdir -p $TEST_OUTPUT + python $NEON_DIR/scripts/proxy_bench_results_ingest.py --out $TEST_OUTPUT + + - name: Push Metrics to Proxy perf database + if: success() + env: + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PROXY_TEST_RESULT_CONNSTR }}" + REPORT_FROM: $TEST_OUTPUT + run: $NEON_DIR/scripts/generate_and_push_perf_report.sh + + - name: Docker cleanup + run: docker compose down + + - name: Notify Failure + if: failure() + run: echo "Proxy bench job failed" && exit 1 \ No newline at end of file diff --git a/scripts/ingest_perf_test_result.py b/scripts/ingest_perf_test_result.py index 804f8a3cde..898e1ee954 100644 --- a/scripts/ingest_perf_test_result.py +++ b/scripts/ingest_perf_test_result.py @@ -26,7 +26,7 @@ CREATE TABLE IF NOT EXISTS perf_test_results ( metric_unit VARCHAR(10), metric_report_type TEXT, recorded_at_timestamp TIMESTAMP WITH TIME ZONE DEFAULT NOW(), - labels JSONB with default '{}' + labels JSONB DEFAULT '{}'::jsonb ) """ diff --git a/scripts/proxy_bench_results_ingest.py b/scripts/proxy_bench_results_ingest.py new file mode 100644 index 0000000000..475d053ed2 --- /dev/null +++ 
b/scripts/proxy_bench_results_ingest.py @@ -0,0 +1,187 @@ +#!/usr/bin/env python3 + +import argparse +import json +import time +from typing import Any, TypedDict, cast + +import requests + +PROMETHEUS_URL = "http://localhost:9090" +SAMPLE_INTERVAL = 1 # seconds + +DEFAULT_REVISION = "unknown" +DEFAULT_PLATFORM = "unknown" +DEFAULT_SUIT = "proxy_bench" + + +class MetricConfig(TypedDict, total=False): + name: str + promql: str + unit: str + report: str + labels: dict[str, str] + is_vector: bool + label_field: str + + +METRICS: list[MetricConfig] = [ + { + "name": "latency_p99", + "promql": 'histogram_quantile(0.99, sum(rate(proxy_compute_connection_latency_seconds_bucket{outcome="success", excluded="client_and_cplane"}[5m])) by (le))', + "unit": "s", + "report": "LOWER_IS_BETTER", + "labels": {}, + }, + { + "name": "error_rate", + "promql": 'sum(rate(proxy_errors_total{type!~"user|clientdisconnect|quota"}[5m])) / sum(rate(proxy_accepted_connections_total[5m]))', + "unit": "", + "report": "LOWER_IS_BETTER", + "labels": {}, + }, + { + "name": "max_memory_kb", + "promql": "max(libmetrics_maxrss_kb)", + "unit": "kB", + "report": "LOWER_IS_BETTER", + "labels": {}, + }, + { + "name": "jemalloc_active_bytes", + "promql": "sum(jemalloc_active_bytes)", + "unit": "bytes", + "report": "LOWER_IS_BETTER", + "labels": {}, + }, + { + "name": "open_connections", + "promql": "sum by (protocol) (proxy_opened_client_connections_total - proxy_closed_client_connections_total)", + "unit": "", + "report": "HIGHER_IS_BETTER", + "labels": {}, + "is_vector": True, + "label_field": "protocol", + }, +] + + +class PrometheusMetric(TypedDict): + metric: dict[str, str] + value: list[str | float] + + +class PrometheusResult(TypedDict): + result: list[PrometheusMetric] + + +class PrometheusResponse(TypedDict): + data: PrometheusResult + + +def query_prometheus(promql: str) -> PrometheusResponse: + resp = requests.get(f"{PROMETHEUS_URL}/api/v1/query", params={"query": promql}) + resp.raise_for_status() + return cast("PrometheusResponse", resp.json()) + + +def extract_scalar_metric(result_json: PrometheusResponse) -> float | None: + try: + return float(result_json["data"]["result"][0]["value"][1]) + except (IndexError, KeyError, ValueError, TypeError): + return None + + +def extract_vector_metric( + result_json: PrometheusResponse, label_field: str +) -> list[tuple[str | None, float, dict[str, str]]]: + out: list[tuple[str | None, float, dict[str, str]]] = [] + for entry in result_json["data"]["result"]: + try: + value_str = entry["value"][1] + if not isinstance(value_str, (str | float)): + continue + value = float(value_str) + except (IndexError, KeyError, ValueError, TypeError): + continue + labels = entry.get("metric", {}) + label_val = labels.get(label_field, None) + out.append((label_val, value, labels)) + return out + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Collect Prometheus metrics and output in benchmark fixture format" + ) + parser.add_argument("--revision", default=DEFAULT_REVISION) + parser.add_argument("--platform", default=DEFAULT_PLATFORM) + parser.add_argument("--suit", default=DEFAULT_SUIT) + parser.add_argument("--out", default="metrics_benchmarks.json", help="Output JSON file") + parser.add_argument( + "--interval", default=SAMPLE_INTERVAL, type=int, help="Sampling interval (s)" + ) + args = parser.parse_args() + + start_time = int(time.time()) + samples: list[dict[str, Any]] = [] + + print("Collecting metrics (Ctrl+C to stop)...") + try: + while True: + ts = 
int(time.time()) + for metric in METRICS: + if metric.get("is_vector", False): + # Vector (per-label, e.g. per-protocol) + for label_val, value, labels in extract_vector_metric( + query_prometheus(metric["promql"]), metric["label_field"] + ): + entry = { + "name": f"{metric['name']}.{label_val}" + if label_val + else metric["name"], + "value": value, + "unit": metric["unit"], + "report": metric["report"], + "labels": {**metric.get("labels", {}), **labels}, + "timestamp": ts, + } + samples.append(entry) + else: + result = extract_scalar_metric(query_prometheus(metric["promql"])) + if result is not None: + entry = { + "name": metric["name"], + "value": result, + "unit": metric["unit"], + "report": metric["report"], + "labels": metric.get("labels", {}), + "timestamp": ts, + } + samples.append(entry) + time.sleep(args.interval) + except KeyboardInterrupt: + print("Collection stopped.") + + total_duration = int(time.time()) - start_time + + # Compose output + out = { + "revision": args.revision, + "platform": args.platform, + "result": [ + { + "suit": args.suit, + "total_duration": total_duration, + "data": samples, + } + ], + } + + with open(args.out, "w") as f: + json.dump(out, f, indent=2) + print(f"Wrote metrics in fixture format to {args.out}") + + +if __name__ == "__main__": + main() From 552249607da32a4c1c25a65339ce3b4533f6a728 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 24 Jun 2025 12:12:42 +0200 Subject: [PATCH 108/124] apply clippy fixes for 1.88.0 beta (#12331) The 1.88.0 stable release is near (this Thursday). We'd like to fix most warnings beforehand so that the compiler upgrade doesn't require approval from too many teams. This is therefore a preparation PR (like similar PRs before it). There is a lot of changes for this release, mostly because the `uninlined_format_args` lint has been added to the `style` lint group. One can read more about the lint [here](https://rust-lang.github.io/rust-clippy/master/#/uninlined_format_args). The PR is the result of `cargo +beta clippy --fix` and `cargo fmt`. One remaining warning is left for the proxy team. 
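For readers unfamiliar with the lint, a small before/after sketch of what `uninlined_format_args` asks for; the code is illustrative only and not taken from the diff below. Plain identifiers are moved directly into the format string, while expressions such as field accesses or method calls still need an explicit argument:

```rust
use std::io;

// Before: positional arguments repeated after the format string.
fn describe_before(port: u16, err: &io::Error) -> String {
    format!("connection error on port {}: {}", port, err)
}

// After `cargo clippy --fix`: simple variables are inlined into the string.
fn describe_after(port: u16, err: &io::Error) -> String {
    format!("connection error on port {port}: {err}")
}

fn main() {
    let err = io::Error::new(io::ErrorKind::Other, "boom");
    assert_eq!(describe_before(5432, &err), describe_after(5432, &err));
}
```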
--------- Co-authored-by: Conrad Ludgate --- compute_tools/src/bin/fast_import.rs | 6 +- compute_tools/src/bin/fast_import/s3_uri.rs | 2 +- compute_tools/src/catalog.rs | 4 +- compute_tools/src/compute.rs | 21 ++-- compute_tools/src/config.rs | 18 ++-- compute_tools/src/extension_server.rs | 12 +-- compute_tools/src/http/routes/configure.rs | 2 +- compute_tools/src/installed_extensions.rs | 4 +- compute_tools/src/lsn_lease.rs | 2 +- compute_tools/src/pg_helpers.rs | 10 +- compute_tools/src/spec.rs | 8 +- compute_tools/src/spec_apply.rs | 95 +++++++++---------- compute_tools/src/sync_sk.rs | 2 +- control_plane/src/bin/neon_local.rs | 20 ++-- control_plane/src/endpoint.rs | 11 +-- control_plane/src/local_env.rs | 6 +- control_plane/src/pageserver.rs | 2 +- control_plane/src/safekeeper.rs | 2 +- control_plane/src/storage_controller.rs | 16 ++-- control_plane/storcon_cli/src/main.rs | 28 +++--- endpoint_storage/src/app.rs | 2 +- libs/desim/src/executor.rs | 2 +- libs/desim/src/proto.rs | 4 +- libs/http-utils/src/endpoint.rs | 8 +- libs/neon-shmem/src/lib.rs | 4 +- libs/pageserver_api/src/controller_api.rs | 3 +- libs/pageserver_api/src/keyspace.rs | 5 +- libs/pageserver_api/src/models.rs | 5 +- libs/postgres_backend/src/lib.rs | 2 +- libs/postgres_backend/tests/simple_select.rs | 4 +- libs/postgres_connection/src/lib.rs | 6 +- libs/postgres_ffi/src/waldecoder_handler.rs | 8 +- libs/postgres_ffi/src/walrecord.rs | 8 +- .../wal_craft/src/xlog_utils_test.rs | 2 +- libs/postgres_initdb/src/lib.rs | 6 +- libs/posthog_client_lite/src/lib.rs | 45 +++------ libs/pq_proto/src/lib.rs | 8 +- .../src/authentication/sasl.rs | 10 +- .../postgres-protocol2/src/message/backend.rs | 4 +- libs/proxy/postgres-types2/src/lib.rs | 2 +- libs/proxy/tokio-postgres2/src/error/mod.rs | 12 +-- libs/proxy/tokio-postgres2/src/row.rs | 4 +- libs/remote_storage/src/local_fs.rs | 14 +-- libs/safekeeper_api/src/membership.rs | 4 +- libs/utils/src/error.rs | 2 +- libs/utils/src/generation.rs | 2 +- libs/utils/src/id.rs | 2 +- libs/utils/src/postgres_client.rs | 2 +- libs/utils/src/shard.rs | 4 +- libs/utils/src/signals.rs | 2 +- libs/vm_monitor/src/dispatcher.rs | 3 +- libs/vm_monitor/src/filecache.rs | 2 +- .../benches/bench_interpret_wal.rs | 4 +- libs/wal_decoder/build.rs | 2 +- libs/wal_decoder/src/models/record.rs | 2 +- libs/walproposer/src/api_bindings.rs | 4 +- libs/walproposer/src/walproposer.rs | 13 +-- pageserver/client/src/mgmt_api.rs | 4 +- pageserver/compaction/src/simulator/draw.rs | 6 +- pageserver/ctl/src/draw_timeline_dir.rs | 10 +- pageserver/ctl/src/key.rs | 2 +- pageserver/ctl/src/layer_map_analyzer.rs | 3 +- pageserver/page_api/src/client.rs | 2 +- pageserver/pagebench/src/cmd/aux_files.rs | 6 +- pageserver/src/basebackup.rs | 12 +-- pageserver/src/http/routes.rs | 10 +- pageserver/src/metrics.rs | 7 +- pageserver/src/page_service.rs | 9 +- pageserver/src/pgdatadir_mapping.rs | 5 +- pageserver/src/tenant.rs | 55 +++++------ pageserver/src/tenant/checks.rs | 3 +- pageserver/src/tenant/metadata.rs | 3 +- pageserver/src/tenant/secondary/downloader.rs | 2 +- .../src/tenant/storage_layer/delta_layer.rs | 4 +- .../src/tenant/storage_layer/image_layer.rs | 7 +- .../tenant/storage_layer/inmemory_layer.rs | 2 +- .../inmemory_layer/vectored_dio_read.rs | 2 +- pageserver/src/tenant/storage_layer/layer.rs | 2 +- pageserver/src/tenant/timeline.rs | 18 ++-- pageserver/src/tenant/timeline/compaction.rs | 10 +- .../walreceiver/walreceiver_connection.rs | 3 +- .../owned_buffers_io/aligned_buffer/buffer.rs | 11 +-- 
.../aligned_buffer/buffer_mut.rs | 5 +- pageserver/src/walingest.rs | 12 +-- pageserver/src/walredo/apply_neon.rs | 50 +++------- proxy/src/lib.rs | 4 + safekeeper/src/control_file.rs | 10 +- safekeeper/src/handler.rs | 16 ++-- safekeeper/src/safekeeper.rs | 26 +++-- safekeeper/src/timeline_eviction.rs | 7 +- safekeeper/src/timeline_manager.rs | 2 +- safekeeper/src/timelines_global_map.rs | 4 +- safekeeper/src/wal_backup_partial.rs | 3 +- safekeeper/src/wal_storage.rs | 2 +- safekeeper/tests/walproposer_sim/log.rs | 2 +- .../tests/walproposer_sim/safekeeper.rs | 2 +- .../tests/walproposer_sim/simulation.rs | 2 +- .../tests/walproposer_sim/walproposer_api.rs | 7 +- storage_broker/benches/rps.rs | 2 +- storage_broker/build.rs | 2 +- storage_broker/src/lib.rs | 10 +- storage_controller/src/compute_hook.rs | 2 +- storage_controller/src/drain_utils.rs | 6 +- storage_controller/src/http.rs | 6 +- storage_controller/src/persistence.rs | 23 ++--- storage_controller/src/scheduler.rs | 6 +- storage_controller/src/service.rs | 31 +++--- storage_controller/src/tenant_shard.rs | 9 +- storage_controller/src/timeline_import.rs | 2 +- storage_scrubber/src/checks.rs | 2 +- storage_scrubber/src/lib.rs | 2 +- .../src/scan_safekeeper_metadata.rs | 4 +- 112 files changed, 404 insertions(+), 542 deletions(-) diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index e65c210b23..682525f6df 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -486,10 +486,8 @@ async fn cmd_pgdata( }; let superuser = "cloud_admin"; - let destination_connstring = format!( - "host=localhost port={} user={} dbname=neondb", - pg_port, superuser - ); + let destination_connstring = + format!("host=localhost port={pg_port} user={superuser} dbname=neondb"); let pgdata_dir = workdir.join("pgdata"); let mut proc = PostgresProcess::new(pgdata_dir.clone(), pg_bin_dir.clone(), pg_lib_dir.clone()); diff --git a/compute_tools/src/bin/fast_import/s3_uri.rs b/compute_tools/src/bin/fast_import/s3_uri.rs index cf4dab7c02..e1a85c73e7 100644 --- a/compute_tools/src/bin/fast_import/s3_uri.rs +++ b/compute_tools/src/bin/fast_import/s3_uri.rs @@ -69,7 +69,7 @@ impl clap::builder::TypedValueParser for S3Uri { S3Uri::from_str(value_str).map_err(|e| { clap::Error::raw( clap::error::ErrorKind::InvalidValue, - format!("Failed to parse S3 URI: {}", e), + format!("Failed to parse S3 URI: {e}"), ) }) } diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs index 082ba62b8e..bc9f64075a 100644 --- a/compute_tools/src/catalog.rs +++ b/compute_tools/src/catalog.rs @@ -22,7 +22,7 @@ pub async fn get_dbs_and_roles(compute: &Arc) -> anyhow::Result { let mut lines = stderr_reader.lines(); if let Some(line) = lines.next_line().await? { - if line.contains(&format!("FATAL: database \"{}\" does not exist", dbname)) { + if line.contains(&format!("FATAL: database \"{dbname}\" does not exist")) { return Err(SchemaDumpError::DatabaseDoesNotExist); } warn!("pg_dump stderr: {}", line) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 684d841897..f84a5f0841 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -250,8 +250,7 @@ impl ParsedSpec { // duplicate entry? 
if current == previous { return Err(format!( - "duplicate entry in safekeeper_connstrings: {}!", - current, + "duplicate entry in safekeeper_connstrings: {current}!", )); } @@ -410,7 +409,7 @@ impl ComputeNode { let options = match conn_conf.get_options() { // Allow the control plane to override any options set by the // compute - Some(options) => format!("{} {}", EXTRA_OPTIONS, options), + Some(options) => format!("{EXTRA_OPTIONS} {options}"), None => EXTRA_OPTIONS.to_string(), }; conn_conf.options(&options); @@ -1127,7 +1126,7 @@ impl ComputeNode { let sk_configs = sk_connstrs.into_iter().map(|connstr| { // Format connstr let id = connstr.clone(); - let connstr = format!("postgresql://no_user@{}", connstr); + let connstr = format!("postgresql://no_user@{connstr}"); let options = format!( "-c timeline_id={} tenant_id={}", pspec.timeline_id, pspec.tenant_id @@ -1490,7 +1489,7 @@ impl ComputeNode { let (mut client, connection) = conf.connect(NoTls).await?; tokio::spawn(async move { if let Err(e) = connection.await { - eprintln!("connection error: {}", e); + eprintln!("connection error: {e}"); } }); @@ -1633,7 +1632,7 @@ impl ComputeNode { Ok((mut client, connection)) => { tokio::spawn(async move { if let Err(e) = connection.await { - eprintln!("connection error: {}", e); + eprintln!("connection error: {e}"); } }); if let Err(e) = handle_migrations(&mut client).await { @@ -1937,7 +1936,7 @@ impl ComputeNode { let (client, connection) = connect_result.unwrap(); tokio::spawn(async move { if let Err(e) = connection.await { - eprintln!("connection error: {}", e); + eprintln!("connection error: {e}"); } }); let result = client @@ -2106,7 +2105,7 @@ LIMIT 100", db_client .simple_query(&query) .await - .with_context(|| format!("Failed to execute query: {}", query))?; + .with_context(|| format!("Failed to execute query: {query}"))?; } Ok(()) @@ -2133,7 +2132,7 @@ LIMIT 100", let version: Option = db_client .query_opt(version_query, &[&ext_name]) .await - .with_context(|| format!("Failed to execute query: {}", version_query))? + .with_context(|| format!("Failed to execute query: {version_query}"))? .map(|row| row.get(0)); // sanitize the inputs as postgres idents. @@ -2148,14 +2147,14 @@ LIMIT 100", db_client .simple_query(&query) .await - .with_context(|| format!("Failed to execute query: {}", query))?; + .with_context(|| format!("Failed to execute query: {query}"))?; } else { let query = format!("CREATE EXTENSION IF NOT EXISTS {ext_name} WITH VERSION {quoted_version}"); db_client .simple_query(&query) .await - .with_context(|| format!("Failed to execute query: {}", query))?; + .with_context(|| format!("Failed to execute query: {query}"))?; } Ok(ext_version) diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 933b30134f..169de5c963 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -51,7 +51,7 @@ pub fn write_postgres_conf( // Write the postgresql.conf content from the spec file as is. 
if let Some(conf) = &spec.cluster.postgresql_conf { - writeln!(file, "{}", conf)?; + writeln!(file, "{conf}")?; } // Add options for connecting to storage @@ -70,7 +70,7 @@ pub fn write_postgres_conf( ); // If generation is given, prepend sk list with g#number: if let Some(generation) = spec.safekeepers_generation { - write!(neon_safekeepers_value, "g#{}:", generation)?; + write!(neon_safekeepers_value, "g#{generation}:")?; } neon_safekeepers_value.push_str(&spec.safekeeper_connstrings.join(",")); writeln!( @@ -109,8 +109,8 @@ pub fn write_postgres_conf( tls::update_key_path_blocking(pgdata_path, tls_config); // these are the default, but good to be explicit. - writeln!(file, "ssl_cert_file = '{}'", SERVER_CRT)?; - writeln!(file, "ssl_key_file = '{}'", SERVER_KEY)?; + writeln!(file, "ssl_cert_file = '{SERVER_CRT}'")?; + writeln!(file, "ssl_key_file = '{SERVER_KEY}'")?; } // Locales @@ -191,8 +191,7 @@ pub fn write_postgres_conf( } writeln!( file, - "shared_preload_libraries='{}{}'", - libs, extra_shared_preload_libraries + "shared_preload_libraries='{libs}{extra_shared_preload_libraries}'" )?; } else { // Typically, this should be unreacheable, @@ -244,8 +243,7 @@ pub fn write_postgres_conf( } writeln!( file, - "shared_preload_libraries='{}{}'", - libs, extra_shared_preload_libraries + "shared_preload_libraries='{libs}{extra_shared_preload_libraries}'" )?; } else { // Typically, this should be unreacheable, @@ -263,7 +261,7 @@ pub fn write_postgres_conf( } } - writeln!(file, "neon.extension_server_port={}", extension_server_port)?; + writeln!(file, "neon.extension_server_port={extension_server_port}")?; if spec.drop_subscriptions_before_start { writeln!(file, "neon.disable_logical_replication_subscribers=true")?; @@ -291,7 +289,7 @@ where { let path = pgdata_path.join("compute_ctl_temp_override.conf"); let mut file = File::create(path)?; - write!(file, "{}", options)?; + write!(file, "{options}")?; let res = exec(); diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index 3764bc1525..d8d5de34a5 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -310,10 +310,7 @@ async fn download_extension_tar(remote_ext_base_url: &Url, ext_path: &str) -> Re async fn do_extension_server_request(uri: Url) -> Result { let resp = reqwest::get(uri).await.map_err(|e| { ( - format!( - "could not perform remote extensions server request: {:?}", - e - ), + format!("could not perform remote extensions server request: {e:?}"), UNKNOWN_HTTP_STATUS.to_string(), ) })?; @@ -323,7 +320,7 @@ async fn do_extension_server_request(uri: Url) -> Result match resp.bytes().await { Ok(resp) => Ok(resp), Err(e) => Err(( - format!("could not read remote extensions server response: {:?}", e), + format!("could not read remote extensions server response: {e:?}"), // It's fine to return and report error with status as 200 OK, // because we still failed to read the response. 
status.to_string(), @@ -334,10 +331,7 @@ async fn do_extension_server_request(uri: Url) -> Result Err(( - format!( - "unexpected remote extensions server response status code: {}", - status - ), + format!("unexpected remote extensions server response status code: {status}"), status.to_string(), )), } diff --git a/compute_tools/src/http/routes/configure.rs b/compute_tools/src/http/routes/configure.rs index c29e3a97da..b7325d283f 100644 --- a/compute_tools/src/http/routes/configure.rs +++ b/compute_tools/src/http/routes/configure.rs @@ -65,7 +65,7 @@ pub(in crate::http) async fn configure( if state.status == ComputeStatus::Failed { let err = state.error.as_ref().map_or("unknown error", |x| x); - let msg = format!("compute configuration failed: {:?}", err); + let msg = format!("compute configuration failed: {err:?}"); return Err(msg); } } diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index d95c168a99..411e03b7ec 100644 --- a/compute_tools/src/installed_extensions.rs +++ b/compute_tools/src/installed_extensions.rs @@ -43,7 +43,7 @@ pub async fn get_installed_extensions(mut conf: Config) -> Result Result Result> { let mut client = config.connect(NoTls)?; - let cmd = format!("lease lsn {} {} {} ", tenant_shard_id, timeline_id, lsn); + let cmd = format!("lease lsn {tenant_shard_id} {timeline_id} {lsn} "); let res = client.simple_query(&cmd)?; let msg = match res.first() { Some(msg) => msg, diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 94467a0d2f..0a3ceed2fa 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -36,9 +36,9 @@ pub fn escape_literal(s: &str) -> String { let res = s.replace('\'', "''").replace('\\', "\\\\"); if res.contains('\\') { - format!("E'{}'", res) + format!("E'{res}'") } else { - format!("'{}'", res) + format!("'{res}'") } } @@ -46,7 +46,7 @@ pub fn escape_literal(s: &str) -> String { /// with `'{}'` is not required, as it returns a ready-to-use config string. 
pub fn escape_conf_value(s: &str) -> String { let res = s.replace('\'', "''").replace('\\', "\\\\"); - format!("'{}'", res) + format!("'{res}'") } pub trait GenericOptionExt { @@ -446,7 +446,7 @@ pub async fn tune_pgbouncer( let mut pgbouncer_connstr = "host=localhost port=6432 dbname=pgbouncer user=postgres sslmode=disable".to_string(); if let Ok(pass) = std::env::var("PGBOUNCER_PASSWORD") { - pgbouncer_connstr.push_str(format!(" password={}", pass).as_str()); + pgbouncer_connstr.push_str(format!(" password={pass}").as_str()); } pgbouncer_connstr }; @@ -464,7 +464,7 @@ pub async fn tune_pgbouncer( Ok((client, connection)) => { tokio::spawn(async move { if let Err(e) = connection.await { - eprintln!("connection error: {}", e); + eprintln!("connection error: {e}"); } }); break client; diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 4b38e6e29c..43cfbb48f7 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -23,12 +23,12 @@ fn do_control_plane_request( ) -> Result { let resp = reqwest::blocking::Client::new() .get(uri) - .header("Authorization", format!("Bearer {}", jwt)) + .header("Authorization", format!("Bearer {jwt}")) .send() .map_err(|e| { ( true, - format!("could not perform request to control plane: {:?}", e), + format!("could not perform request to control plane: {e:?}"), UNKNOWN_HTTP_STATUS.to_string(), ) })?; @@ -39,7 +39,7 @@ fn do_control_plane_request( Ok(spec_resp) => Ok(spec_resp), Err(e) => Err(( true, - format!("could not deserialize control plane response: {:?}", e), + format!("could not deserialize control plane response: {e:?}"), status.to_string(), )), }, @@ -62,7 +62,7 @@ fn do_control_plane_request( // or some internal failure happened. Doesn't make much sense to retry in this case. 
_ => Err(( false, - format!("unexpected control plane response status code: {}", status), + format!("unexpected control plane response status code: {status}"), status.to_string(), )), } diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs index 0d1389dbad..fcd072263a 100644 --- a/compute_tools/src/spec_apply.rs +++ b/compute_tools/src/spec_apply.rs @@ -933,56 +933,53 @@ async fn get_operations<'a>( PerDatabasePhase::DeleteDBRoleReferences => { let ctx = ctx.read().await; - let operations = - spec.delta_operations - .iter() - .flatten() - .filter(|op| op.action == "delete_role") - .filter_map(move |op| { - if db.is_owned_by(&op.name) { - return None; - } - if !ctx.roles.contains_key(&op.name) { - return None; - } - let quoted = op.name.pg_quote(); - let new_owner = match &db { - DB::SystemDB => PgIdent::from("cloud_admin").pg_quote(), - DB::UserDB(db) => db.owner.pg_quote(), - }; - let (escaped_role, outer_tag) = op.name.pg_quote_dollar(); + let operations = spec + .delta_operations + .iter() + .flatten() + .filter(|op| op.action == "delete_role") + .filter_map(move |op| { + if db.is_owned_by(&op.name) { + return None; + } + if !ctx.roles.contains_key(&op.name) { + return None; + } + let quoted = op.name.pg_quote(); + let new_owner = match &db { + DB::SystemDB => PgIdent::from("cloud_admin").pg_quote(), + DB::UserDB(db) => db.owner.pg_quote(), + }; + let (escaped_role, outer_tag) = op.name.pg_quote_dollar(); - Some(vec![ - // This will reassign all dependent objects to the db owner - Operation { - query: format!( - "REASSIGN OWNED BY {} TO {}", - quoted, new_owner, - ), - comment: None, - }, - // Revoke some potentially blocking privileges (Neon-specific currently) - Operation { - query: format!( - include_str!("sql/pre_drop_role_revoke_privileges.sql"), - // N.B. this has to be properly dollar-escaped with `pg_quote_dollar()` - role_name = escaped_role, - outer_tag = outer_tag, - ), - comment: None, - }, - // This now will only drop privileges of the role - // TODO: this is obviously not 100% true because of the above case, - // there could be still some privileges that are not revoked. Maybe this - // only drops privileges that were granted *by this* role, not *to this* role, - // but this has to be checked. - Operation { - query: format!("DROP OWNED BY {}", quoted), - comment: None, - }, - ]) - }) - .flatten(); + Some(vec![ + // This will reassign all dependent objects to the db owner + Operation { + query: format!("REASSIGN OWNED BY {quoted} TO {new_owner}",), + comment: None, + }, + // Revoke some potentially blocking privileges (Neon-specific currently) + Operation { + query: format!( + include_str!("sql/pre_drop_role_revoke_privileges.sql"), + // N.B. this has to be properly dollar-escaped with `pg_quote_dollar()` + role_name = escaped_role, + outer_tag = outer_tag, + ), + comment: None, + }, + // This now will only drop privileges of the role + // TODO: this is obviously not 100% true because of the above case, + // there could be still some privileges that are not revoked. Maybe this + // only drops privileges that were granted *by this* role, not *to this* role, + // but this has to be checked. 
+ Operation { + query: format!("DROP OWNED BY {quoted}"), + comment: None, + }, + ]) + }) + .flatten(); Ok(Box::new(operations)) } diff --git a/compute_tools/src/sync_sk.rs b/compute_tools/src/sync_sk.rs index 22b7027b93..6c348644b2 100644 --- a/compute_tools/src/sync_sk.rs +++ b/compute_tools/src/sync_sk.rs @@ -27,7 +27,7 @@ pub async fn ping_safekeeper( let (client, conn) = config.connect(tokio_postgres::NoTls).await?; tokio::spawn(async move { if let Err(e) = conn.await { - eprintln!("connection error: {}", e); + eprintln!("connection error: {e}"); } }); diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 21f55336aa..b2dd1a7077 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -919,7 +919,7 @@ fn print_timeline( br_sym = "┗━"; } - print!("{} @{}: ", br_sym, ancestor_lsn); + print!("{br_sym} @{ancestor_lsn}: "); } // Finally print a timeline id and name with new line @@ -1742,7 +1742,7 @@ async fn handle_pageserver(subcmd: &PageserverCmd, env: &local_env::LocalEnv) -> StopMode::Immediate => true, }; if let Err(e) = get_pageserver(env, args.pageserver_id)?.stop(immediate) { - eprintln!("pageserver stop failed: {}", e); + eprintln!("pageserver stop failed: {e}"); exit(1); } } @@ -1751,7 +1751,7 @@ async fn handle_pageserver(subcmd: &PageserverCmd, env: &local_env::LocalEnv) -> let pageserver = get_pageserver(env, args.pageserver_id)?; //TODO what shutdown strategy should we use here? if let Err(e) = pageserver.stop(false) { - eprintln!("pageserver stop failed: {}", e); + eprintln!("pageserver stop failed: {e}"); exit(1); } @@ -1768,7 +1768,7 @@ async fn handle_pageserver(subcmd: &PageserverCmd, env: &local_env::LocalEnv) -> { Ok(_) => println!("Page server is up and running"), Err(err) => { - eprintln!("Page server is not available: {}", err); + eprintln!("Page server is not available: {err}"); exit(1); } } @@ -1805,7 +1805,7 @@ async fn handle_storage_controller( }, }; if let Err(e) = svc.stop(stop_args).await { - eprintln!("stop failed: {}", e); + eprintln!("stop failed: {e}"); exit(1); } } @@ -1827,7 +1827,7 @@ async fn handle_safekeeper(subcmd: &SafekeeperCmd, env: &local_env::LocalEnv) -> let safekeeper = get_safekeeper(env, args.id)?; if let Err(e) = safekeeper.start(&args.extra_opt, &args.start_timeout).await { - eprintln!("safekeeper start failed: {}", e); + eprintln!("safekeeper start failed: {e}"); exit(1); } } @@ -1839,7 +1839,7 @@ async fn handle_safekeeper(subcmd: &SafekeeperCmd, env: &local_env::LocalEnv) -> StopMode::Immediate => true, }; if let Err(e) = safekeeper.stop(immediate) { - eprintln!("safekeeper stop failed: {}", e); + eprintln!("safekeeper stop failed: {e}"); exit(1); } } @@ -1852,12 +1852,12 @@ async fn handle_safekeeper(subcmd: &SafekeeperCmd, env: &local_env::LocalEnv) -> }; if let Err(e) = safekeeper.stop(immediate) { - eprintln!("safekeeper stop failed: {}", e); + eprintln!("safekeeper stop failed: {e}"); exit(1); } if let Err(e) = safekeeper.start(&args.extra_opt, &args.start_timeout).await { - eprintln!("safekeeper start failed: {}", e); + eprintln!("safekeeper start failed: {e}"); exit(1); } } @@ -2113,7 +2113,7 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { let storage = EndpointStorage::from_env(env); if let Err(e) = storage.stop(immediate) { - eprintln!("endpoint_storage stop failed: {:#}", e); + eprintln!("endpoint_storage stop failed: {e:#}"); } for ps_conf in &env.pageservers { diff --git a/control_plane/src/endpoint.rs 
b/control_plane/src/endpoint.rs index ae81e7abbe..dab53b0f27 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -846,10 +846,10 @@ impl Endpoint { // Launch compute_ctl let conn_str = self.connstr("cloud_admin", "postgres"); - println!("Starting postgres node at '{}'", conn_str); + println!("Starting postgres node at '{conn_str}'"); if create_test_user { let conn_str = self.connstr("test", "neondb"); - println!("Also at '{}'", conn_str); + println!("Also at '{conn_str}'"); } let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl")); cmd.args([ @@ -948,8 +948,7 @@ impl Endpoint { Err(e) => { if Instant::now().duration_since(start_at) > start_timeout { return Err(e).context(format!( - "timed out {:?} waiting to connect to compute_ctl HTTP", - start_timeout, + "timed out {start_timeout:?} waiting to connect to compute_ctl HTTP", )); } } @@ -988,7 +987,7 @@ impl Endpoint { // reqwest does not export its error construction utility functions, so let's craft the message ourselves let url = response.url().to_owned(); let msg = match response.text().await { - Ok(err_body) => format!("Error: {}", err_body), + Ok(err_body) => format!("Error: {err_body}"), Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url), }; Err(anyhow::anyhow!(msg)) @@ -1054,7 +1053,7 @@ impl Endpoint { } else { let url = response.url().to_owned(); let msg = match response.text().await { - Ok(err_body) => format!("Error: {}", err_body), + Ok(err_body) => format!("Error: {err_body}"), Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url), }; Err(anyhow::anyhow!(msg)) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index e8abde4901..34465b4d5d 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -260,7 +260,7 @@ impl Default for EndpointStorageConf { impl NeonBroker { pub fn client_url(&self) -> Url { let url = if let Some(addr) = self.listen_https_addr { - format!("https://{}", addr) + format!("https://{addr}") } else { format!( "http://{}", @@ -733,7 +733,7 @@ impl LocalEnv { let config_toml_path = dentry.path().join("pageserver.toml"); let config_toml: PageserverConfigTomlSubset = toml_edit::de::from_str( &std::fs::read_to_string(&config_toml_path) - .with_context(|| format!("read {:?}", config_toml_path))?, + .with_context(|| format!("read {config_toml_path:?}"))?, ) .context("parse pageserver.toml")?; let identity_toml_path = dentry.path().join("identity.toml"); @@ -743,7 +743,7 @@ impl LocalEnv { } let identity_toml: IdentityTomlSubset = toml_edit::de::from_str( &std::fs::read_to_string(&identity_toml_path) - .with_context(|| format!("read {:?}", identity_toml_path))?, + .with_context(|| format!("read {identity_toml_path:?}"))?, ) .context("parse identity.toml")?; let PageserverConfigTomlSubset { diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index a683d2daec..7fa00a6730 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -121,7 +121,7 @@ impl PageServerNode { .env .generate_auth_token(&Claims::new(None, Scope::GenerationsApi)) .unwrap(); - overrides.push(format!("control_plane_api_token='{}'", jwt_token)); + overrides.push(format!("control_plane_api_token='{jwt_token}'")); } if !conf.other.contains_key("remote_storage") { diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 28d369a315..da9dafd8e9 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -143,7 +143,7 @@ 
impl SafekeeperNode { let id_string = id.to_string(); // TODO: add availability_zone to the config. // Right now we just specify any value here and use it to check metrics in tests. - let availability_zone = format!("sk-{}", id_string); + let availability_zone = format!("sk-{id_string}"); let mut args = vec![ "-D".to_owned(), diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 334949924c..f2ac5bb2dd 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -167,7 +167,7 @@ impl StorageController { fn storage_controller_instance_dir(&self, instance_id: u8) -> PathBuf { self.env .base_data_dir - .join(format!("storage_controller_{}", instance_id)) + .join(format!("storage_controller_{instance_id}")) } fn pid_file(&self, instance_id: u8) -> Utf8PathBuf { @@ -220,7 +220,7 @@ impl StorageController { "-d", DB_NAME, "-p", - &format!("{}", postgres_port), + &format!("{postgres_port}"), ]; let pg_lib_dir = self.get_pg_lib_dir().await.unwrap(); let envs = [ @@ -263,7 +263,7 @@ impl StorageController { "-h", "localhost", "-p", - &format!("{}", postgres_port), + &format!("{postgres_port}"), "-U", &username(), "-O", @@ -425,7 +425,7 @@ impl StorageController { // from `LocalEnv`'s config file (`.neon/config`). tokio::fs::write( &pg_data_path.join("postgresql.conf"), - format!("port = {}\nfsync=off\n", postgres_port), + format!("port = {postgres_port}\nfsync=off\n"), ) .await?; @@ -477,7 +477,7 @@ impl StorageController { self.setup_database(postgres_port).await?; } - let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port); + let database_url = format!("postgresql://localhost:{postgres_port}/{DB_NAME}"); // We support running a startup SQL script to fiddle with the database before we launch storcon. // This is used by the test suite. @@ -508,7 +508,7 @@ impl StorageController { drop(client); conn.await??; - let addr = format!("{}:{}", host, listen_port); + let addr = format!("{host}:{listen_port}"); let address_for_peers = Uri::builder() .scheme(scheme) .authority(addr.clone()) @@ -810,9 +810,9 @@ impl StorageController { builder = builder.json(&body) } if let Some(private_key) = &self.private_key { - println!("Getting claims for path {}", path); + println!("Getting claims for path {path}"); if let Some(required_claims) = Self::get_claims_for_path(&path)? 
{
-                println!("Got claims {:?} for path {}", required_claims, path);
+                println!("Got claims {required_claims:?} for path {path}");
                 let jwt_token = encode_from_key_file(&required_claims, private_key)?;
                 builder = builder.header(
                     reqwest::header::AUTHORIZATION,
diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs
index 507190b1e0..0036b7d0f6 100644
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -649,7 +649,7 @@ async fn main() -> anyhow::Result<()> {
                 response
                     .new_shards
                     .iter()
-                    .map(|s| format!("{:?}", s))
+                    .map(|s| format!("{s:?}"))
                     .collect::<Vec<_>>()
                     .join(",")
             );
@@ -771,8 +771,8 @@ async fn main() -> anyhow::Result<()> {
 
             println!("Tenant {tenant_id}");
             let mut table = comfy_table::Table::new();
-            table.add_row(["Policy", &format!("{:?}", policy)]);
-            table.add_row(["Stripe size", &format!("{:?}", stripe_size)]);
+            table.add_row(["Policy", &format!("{policy:?}")]);
+            table.add_row(["Stripe size", &format!("{stripe_size:?}")]);
             table.add_row(["Config", &serde_json::to_string_pretty(&config).unwrap()]);
             println!("{table}");
             println!("Shards:");
@@ -789,7 +789,7 @@ async fn main() -> anyhow::Result<()> {
                 let secondary = shard
                     .node_secondary
                     .iter()
-                    .map(|n| format!("{}", n))
+                    .map(|n| format!("{n}"))
                     .collect::<Vec<_>>()
                     .join(",");
 
@@ -863,7 +863,7 @@ async fn main() -> anyhow::Result<()> {
                 }
             } else {
                 // Make it obvious to the user that since they've omitted an AZ, we're clearing it
-                eprintln!("Clearing preferred AZ for tenant {}", tenant_id);
+                eprintln!("Clearing preferred AZ for tenant {tenant_id}");
             }
 
             // Construct a request that modifies all the tenant's shards
@@ -1134,8 +1134,7 @@ async fn main() -> anyhow::Result<()> {
                     Err((tenant_shard_id, from, to, error)) => {
                         failure += 1;
                         println!(
-                            "Failed to migrate {} from node {} to node {}: {}",
-                            tenant_shard_id, from, to, error
+                            "Failed to migrate {tenant_shard_id} from node {from} to node {to}: {error}"
                         );
                     }
                 }
@@ -1277,8 +1276,7 @@ async fn main() -> anyhow::Result<()> {
             concurrency,
         } => {
             let mut path = format!(
-                "/v1/tenant/{}/timeline/{}/download_heatmap_layers",
-                tenant_shard_id, timeline_id,
+                "/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers",
             );
 
             if let Some(c) = concurrency {
@@ -1303,8 +1301,7 @@ async fn watch_tenant_shard(
 ) -> anyhow::Result<()> {
     if let Some(until_migrated_to) = until_migrated_to {
         println!(
-            "Waiting for tenant shard {} to be migrated to node {}",
-            tenant_shard_id, until_migrated_to
+            "Waiting for tenant shard {tenant_shard_id} to be migrated to node {until_migrated_to}"
         );
     }
 
@@ -1327,7 +1324,7 @@ async fn watch_tenant_shard(
                 "attached: {} secondary: {} {}",
                 shard
                     .node_attached
-                    .map(|n| format!("{}", n))
+                    .map(|n| format!("{n}"))
                     .unwrap_or("none".to_string()),
                 shard
                     .node_secondary
@@ -1341,15 +1338,12 @@ async fn watch_tenant_shard(
                     "(reconciler idle)"
                 }
             );
-            println!("{}", summary);
+            println!("{summary}");
 
             // Maybe drop out if we finished migration
             if let Some(until_migrated_to) = until_migrated_to {
                 if shard.node_attached == Some(until_migrated_to) && !shard.is_reconciling {
-                    println!(
-                        "Tenant shard {} is now on node {}",
-                        tenant_shard_id, until_migrated_to
-                    );
+                    println!("Tenant shard {tenant_shard_id} is now on node {until_migrated_to}");
                     break;
                 }
             }
diff --git a/endpoint_storage/src/app.rs b/endpoint_storage/src/app.rs
index f44efe6d7a..42431c0066 100644
--- a/endpoint_storage/src/app.rs
+++ b/endpoint_storage/src/app.rs
@@ -374,7 +374,7 @@
MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH let request = Request::builder() .uri(format!("/{tenant}/{timeline}/{endpoint}/sub/path/key")) .method(method) - .header("Authorization", format!("Bearer {}", token)) + .header("Authorization", format!("Bearer {token}")) .body(Body::empty()) .unwrap(); let status = ServiceExt::ready(&mut app) diff --git a/libs/desim/src/executor.rs b/libs/desim/src/executor.rs index 51b11ff97e..bdb9c6cd4b 100644 --- a/libs/desim/src/executor.rs +++ b/libs/desim/src/executor.rs @@ -71,7 +71,7 @@ impl Runtime { debug!("thread panicked: {:?}", e); let mut result = ctx.result.lock(); if result.0 == -1 { - *result = (256, format!("thread panicked: {:?}", e)); + *result = (256, format!("thread panicked: {e:?}")); } }); } diff --git a/libs/desim/src/proto.rs b/libs/desim/src/proto.rs index 31bc29e6a6..7c3de4ff4b 100644 --- a/libs/desim/src/proto.rs +++ b/libs/desim/src/proto.rs @@ -47,8 +47,8 @@ impl Debug for AnyMessage { match self { AnyMessage::None => write!(f, "None"), AnyMessage::InternalConnect => write!(f, "InternalConnect"), - AnyMessage::Just32(v) => write!(f, "Just32({})", v), - AnyMessage::ReplCell(v) => write!(f, "ReplCell({:?})", v), + AnyMessage::Just32(v) => write!(f, "Just32({v})"), + AnyMessage::ReplCell(v) => write!(f, "ReplCell({v:?})"), AnyMessage::Bytes(v) => write!(f, "Bytes({})", hex::encode(v)), AnyMessage::LSN(v) => write!(f, "LSN({})", Lsn(*v)), } diff --git a/libs/http-utils/src/endpoint.rs b/libs/http-utils/src/endpoint.rs index 64147f2dd0..f32ced1180 100644 --- a/libs/http-utils/src/endpoint.rs +++ b/libs/http-utils/src/endpoint.rs @@ -582,14 +582,14 @@ pub fn attach_openapi_ui( deepLinking: true, showExtensions: true, showCommonExtensions: true, - url: "{}", + url: "{spec_mount_path}", }}) window.ui = ui; }}; - "#, spec_mount_path))).unwrap()) + "#))).unwrap()) }) ) } @@ -696,7 +696,7 @@ mod tests { let remote_addr = SocketAddr::new(IpAddr::from_str("127.0.0.1").unwrap(), 80); let mut service = builder.build(remote_addr); if let Err(e) = poll_fn(|ctx| service.poll_ready(ctx)).await { - panic!("request service is not ready: {:?}", e); + panic!("request service is not ready: {e:?}"); } let mut req: Request = Request::default(); @@ -716,7 +716,7 @@ mod tests { let remote_addr = SocketAddr::new(IpAddr::from_str("127.0.0.1").unwrap(), 80); let mut service = builder.build(remote_addr); if let Err(e) = poll_fn(|ctx| service.poll_ready(ctx)).await { - panic!("request service is not ready: {:?}", e); + panic!("request service is not ready: {e:?}"); } let req: Request = Request::default(); diff --git a/libs/neon-shmem/src/lib.rs b/libs/neon-shmem/src/lib.rs index e1b14b1371..c689959b68 100644 --- a/libs/neon-shmem/src/lib.rs +++ b/libs/neon-shmem/src/lib.rs @@ -86,7 +86,7 @@ impl ShmemHandle { // somewhat smaller than that, because with anything close to that, you'll run out of // memory anyway. 
if max_size >= 1 << 48 { - panic!("max size {} too large", max_size); + panic!("max size {max_size} too large"); } if initial_size > max_size { panic!("initial size {initial_size} larger than max size {max_size}"); @@ -279,7 +279,7 @@ mod tests { fn assert_range(ptr: *const u8, expected: u8, range: Range) { for i in range { let b = unsafe { *(ptr.add(i)) }; - assert_eq!(expected, b, "unexpected byte at offset {}", i); + assert_eq!(expected, b, "unexpected byte at offset {i}"); } } diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 55495dd68e..ff18d40bfe 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -577,8 +577,7 @@ mod test { let err = serde_json::from_value::(create_request).unwrap_err(); assert!( err.to_string().contains("unknown field `unknown_field`"), - "expect unknown field `unknown_field` error, got: {}", - err + "expect unknown field `unknown_field` error, got: {err}" ); } diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index 1b48d3c462..10a242e13b 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -334,8 +334,7 @@ impl KeySpace { std::cmp::max(range.start, prev.start) < std::cmp::min(range.end, prev.end); assert!( !overlap, - "Attempt to merge ovelapping keyspaces: {:?} overlaps {:?}", - prev, range + "Attempt to merge ovelapping keyspaces: {prev:?} overlaps {range:?}" ); } @@ -1104,7 +1103,7 @@ mod tests { // total range contains at least one shard-local page let all_nonzero = fragments.iter().all(|f| f.0 > 0); if !all_nonzero { - eprintln!("Found a zero-length fragment: {:?}", fragments); + eprintln!("Found a zero-length fragment: {fragments:?}"); } assert!(all_nonzero); } else { diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 2fc32c8f49..ee6725efbe 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1182,7 +1182,7 @@ impl Display for ImageCompressionAlgorithm { ImageCompressionAlgorithm::Disabled => write!(f, "disabled"), ImageCompressionAlgorithm::Zstd { level } => { if let Some(level) = level { - write!(f, "zstd({})", level) + write!(f, "zstd({level})") } else { write!(f, "zstd") } @@ -2011,8 +2011,7 @@ mod tests { let err = serde_json::from_value::(config_request).unwrap_err(); assert!( err.to_string().contains("unknown field `unknown_field`"), - "expect unknown field `unknown_field` error, got: {}", - err + "expect unknown field `unknown_field` error, got: {err}" ); } diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 714d8ac403..091299f842 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -939,7 +939,7 @@ impl PostgresBackendReader { FeMessage::CopyFail => Err(CopyStreamHandlerEnd::CopyFail), FeMessage::Terminate => Err(CopyStreamHandlerEnd::Terminate), _ => Err(CopyStreamHandlerEnd::from(ConnectionError::Protocol( - ProtocolError::Protocol(format!("unexpected message in COPY stream {:?}", msg)), + ProtocolError::Protocol(format!("unexpected message in COPY stream {msg:?}")), ))), }, None => Err(CopyStreamHandlerEnd::EOF), diff --git a/libs/postgres_backend/tests/simple_select.rs b/libs/postgres_backend/tests/simple_select.rs index 75ca123014..23e17799bd 100644 --- a/libs/postgres_backend/tests/simple_select.rs +++ b/libs/postgres_backend/tests/simple_select.rs @@ -61,7 +61,7 @@ async fn simple_select() { // so spawn it off to 
run on its own. tokio::spawn(async move { if let Err(e) = connection.await { - eprintln!("connection error: {}", e); + eprintln!("connection error: {e}"); } }); @@ -137,7 +137,7 @@ async fn simple_select_ssl() { // so spawn it off to run on its own. tokio::spawn(async move { if let Err(e) = connection.await { - eprintln!("connection error: {}", e); + eprintln!("connection error: {e}"); } }); diff --git a/libs/postgres_connection/src/lib.rs b/libs/postgres_connection/src/lib.rs index cd981b3729..2388303329 100644 --- a/libs/postgres_connection/src/lib.rs +++ b/libs/postgres_connection/src/lib.rs @@ -223,7 +223,7 @@ mod tests_pg_connection_config { assert_eq!(cfg.port(), 123); assert_eq!(cfg.raw_address(), "stub.host.example:123"); assert_eq!( - format!("{:?}", cfg), + format!("{cfg:?}"), "PgConnectionConfig { host: Domain(\"stub.host.example\"), port: 123, password: None }" ); } @@ -239,7 +239,7 @@ mod tests_pg_connection_config { assert_eq!(cfg.port(), 123); assert_eq!(cfg.raw_address(), "[::1]:123"); assert_eq!( - format!("{:?}", cfg), + format!("{cfg:?}"), "PgConnectionConfig { host: Ipv6(::1), port: 123, password: None }" ); } @@ -252,7 +252,7 @@ mod tests_pg_connection_config { assert_eq!(cfg.port(), 123); assert_eq!(cfg.raw_address(), "stub.host.example:123"); assert_eq!( - format!("{:?}", cfg), + format!("{cfg:?}"), "PgConnectionConfig { host: Domain(\"stub.host.example\"), port: 123, password: Some(REDACTED-STRING) }" ); } diff --git a/libs/postgres_ffi/src/waldecoder_handler.rs b/libs/postgres_ffi/src/waldecoder_handler.rs index b4d50375bd..9cd40645ec 100644 --- a/libs/postgres_ffi/src/waldecoder_handler.rs +++ b/libs/postgres_ffi/src/waldecoder_handler.rs @@ -114,7 +114,7 @@ impl WalStreamDecoderHandler for WalStreamDecoder { let hdr = XLogLongPageHeaderData::from_bytes(&mut self.inputbuf).map_err( |e| WalDecodeError { - msg: format!("long header deserialization failed {}", e), + msg: format!("long header deserialization failed {e}"), lsn: self.lsn, }, )?; @@ -130,7 +130,7 @@ impl WalStreamDecoderHandler for WalStreamDecoder { let hdr = XLogPageHeaderData::from_bytes(&mut self.inputbuf).map_err(|e| { WalDecodeError { - msg: format!("header deserialization failed {}", e), + msg: format!("header deserialization failed {e}"), lsn: self.lsn, } })?; @@ -155,7 +155,7 @@ impl WalStreamDecoderHandler for WalStreamDecoder { let xl_tot_len = (&self.inputbuf[0..4]).get_u32_le(); if (xl_tot_len as usize) < XLOG_SIZE_OF_XLOG_RECORD { return Err(WalDecodeError { - msg: format!("invalid xl_tot_len {}", xl_tot_len), + msg: format!("invalid xl_tot_len {xl_tot_len}"), lsn: self.lsn, }); } @@ -218,7 +218,7 @@ impl WalStreamDecoderHandler for WalStreamDecoder { let xlogrec = XLogRecord::from_slice(&recordbuf[0..XLOG_SIZE_OF_XLOG_RECORD]).map_err(|e| { WalDecodeError { - msg: format!("xlog record deserialization failed {}", e), + msg: format!("xlog record deserialization failed {e}"), lsn: self.lsn, } })?; diff --git a/libs/postgres_ffi/src/walrecord.rs b/libs/postgres_ffi/src/walrecord.rs index 1ccf4590a9..c0ae88363e 100644 --- a/libs/postgres_ffi/src/walrecord.rs +++ b/libs/postgres_ffi/src/walrecord.rs @@ -1199,7 +1199,7 @@ pub fn describe_postgres_wal_record(record: &Bytes) -> Result "HEAP2 MULTI_INSERT", pg_constants::XLOG_HEAP2_VISIBLE => "HEAP2 VISIBLE", _ => { - unknown_str = format!("HEAP2 UNKNOWN_0x{:02x}", info); + unknown_str = format!("HEAP2 UNKNOWN_0x{info:02x}"); &unknown_str } } @@ -1212,7 +1212,7 @@ pub fn describe_postgres_wal_record(record: &Bytes) -> Result "HEAP UPDATE", 
pg_constants::XLOG_HEAP_HOT_UPDATE => "HEAP HOT_UPDATE", _ => { - unknown_str = format!("HEAP2 UNKNOWN_0x{:02x}", info); + unknown_str = format!("HEAP2 UNKNOWN_0x{info:02x}"); &unknown_str } } @@ -1223,7 +1223,7 @@ pub fn describe_postgres_wal_record(record: &Bytes) -> Result "XLOG FPI", pg_constants::XLOG_FPI_FOR_HINT => "XLOG FPI_FOR_HINT", _ => { - unknown_str = format!("XLOG UNKNOWN_0x{:02x}", info); + unknown_str = format!("XLOG UNKNOWN_0x{info:02x}"); &unknown_str } } @@ -1231,7 +1231,7 @@ pub fn describe_postgres_wal_record(record: &Bytes) -> Result { let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK; - unknown_str = format!("UNKNOWN_RM_{} INFO_0x{:02x}", rmid, info); + unknown_str = format!("UNKNOWN_RM_{rmid} INFO_0x{info:02x}"); &unknown_str } }; diff --git a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs index 4a33dbe25b..94371a35b5 100644 --- a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs +++ b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs @@ -34,7 +34,7 @@ fn test_end_of_wal(test_name: &str) { let cfg = Conf { pg_version, pg_distrib_dir: top_path.join("pg_install"), - datadir: top_path.join(format!("test_output/{}-{PG_MAJORVERSION}", test_name)), + datadir: top_path.join(format!("test_output/{test_name}-{PG_MAJORVERSION}")), }; if cfg.datadir.exists() { fs::remove_dir_all(&cfg.datadir).unwrap(); diff --git a/libs/postgres_initdb/src/lib.rs b/libs/postgres_initdb/src/lib.rs index ed54696861..4b4a597f73 100644 --- a/libs/postgres_initdb/src/lib.rs +++ b/libs/postgres_initdb/src/lib.rs @@ -31,15 +31,15 @@ pub enum Error { impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { - Error::Spawn(e) => write!(f, "Error spawning command: {:?}", e), + Error::Spawn(e) => write!(f, "Error spawning command: {e:?}"), Error::Failed { status, stderr } => write!( f, "Command failed with status {:?}: {}", status, String::from_utf8_lossy(stderr) ), - Error::WaitOutput(e) => write!(f, "Error waiting for command output: {:?}", e), - Error::Other(e) => write!(f, "Error: {:?}", e), + Error::WaitOutput(e) => write!(f, "Error waiting for command output: {e:?}"), + Error::Other(e) => write!(f, "Error: {e:?}"), } } } diff --git a/libs/posthog_client_lite/src/lib.rs b/libs/posthog_client_lite/src/lib.rs index 730878fb58..f21047bcfc 100644 --- a/libs/posthog_client_lite/src/lib.rs +++ b/libs/posthog_client_lite/src/lib.rs @@ -168,15 +168,13 @@ impl FeatureStore { let PostHogFlagFilterPropertyValue::String(provided) = provided else { // Left should be a string return Err(PostHogEvaluationError::Internal(format!( - "The left side of the condition is not a string: {:?}", - provided + "The left side of the condition is not a string: {provided:?}" ))); }; let PostHogFlagFilterPropertyValue::List(requested) = requested else { // Right should be a list of string return Err(PostHogEvaluationError::Internal(format!( - "The right side of the condition is not a list: {:?}", - requested + "The right side of the condition is not a list: {requested:?}" ))); }; Ok(requested.contains(provided)) @@ -185,14 +183,12 @@ impl FeatureStore { let PostHogFlagFilterPropertyValue::String(requested) = requested else { // Right should be a string return Err(PostHogEvaluationError::Internal(format!( - "The right side of the condition is not a string: {:?}", - requested + "The right side of the condition is not a string: {requested:?}" ))); }; let Ok(requested) = requested.parse::() else { return 
Err(PostHogEvaluationError::Internal(format!( - "Can not parse the right side of the condition as a number: {:?}", - requested + "Can not parse the right side of the condition as a number: {requested:?}" ))); }; // Left can either be a number or a string @@ -201,16 +197,14 @@ impl FeatureStore { PostHogFlagFilterPropertyValue::String(provided) => { let Ok(provided) = provided.parse::() else { return Err(PostHogEvaluationError::Internal(format!( - "Can not parse the left side of the condition as a number: {:?}", - provided + "Can not parse the left side of the condition as a number: {provided:?}" ))); }; provided } _ => { return Err(PostHogEvaluationError::Internal(format!( - "The left side of the condition is not a number or a string: {:?}", - provided + "The left side of the condition is not a number or a string: {provided:?}" ))); } }; @@ -218,14 +212,12 @@ impl FeatureStore { "lt" => Ok(provided < requested), "gt" => Ok(provided > requested), op => Err(PostHogEvaluationError::Internal(format!( - "Unsupported operator: {}", - op + "Unsupported operator: {op}" ))), } } _ => Err(PostHogEvaluationError::Internal(format!( - "Unsupported operator: {}", - operator + "Unsupported operator: {operator}" ))), } } @@ -373,8 +365,7 @@ impl FeatureStore { if let Some(flag_config) = self.flags.get(flag_key) { if !flag_config.active { return Err(PostHogEvaluationError::NotAvailable(format!( - "The feature flag is not active: {}", - flag_key + "The feature flag is not active: {flag_key}" ))); } let Some(ref multivariate) = flag_config.filters.multivariate else { @@ -401,8 +392,7 @@ impl FeatureStore { // This should not happen because the rollout percentage always adds up to 100, but just in case that PostHog // returned invalid spec, we return an error. return Err(PostHogEvaluationError::Internal(format!( - "Rollout percentage does not add up to 100: {}", - flag_key + "Rollout percentage does not add up to 100: {flag_key}" ))); } GroupEvaluationResult::Unmatched => continue, @@ -413,8 +403,7 @@ impl FeatureStore { } else { // The feature flag is not available yet Err(PostHogEvaluationError::NotAvailable(format!( - "Not found in the local evaluation spec: {}", - flag_key + "Not found in the local evaluation spec: {flag_key}" ))) } } @@ -440,8 +429,7 @@ impl FeatureStore { if let Some(flag_config) = self.flags.get(flag_key) { if !flag_config.active { return Err(PostHogEvaluationError::NotAvailable(format!( - "The feature flag is not active: {}", - flag_key + "The feature flag is not active: {flag_key}" ))); } if flag_config.filters.multivariate.is_some() { @@ -456,8 +444,7 @@ impl FeatureStore { match self.evaluate_group(group, hash_on_global_rollout_percentage, properties)? 
{ GroupEvaluationResult::MatchedAndOverride(_) => { return Err(PostHogEvaluationError::Internal(format!( - "Boolean flag cannot have overrides: {}", - flag_key + "Boolean flag cannot have overrides: {flag_key}" ))); } GroupEvaluationResult::MatchedAndEvaluate => { @@ -471,8 +458,7 @@ impl FeatureStore { } else { // The feature flag is not available yet Err(PostHogEvaluationError::NotAvailable(format!( - "Not found in the local evaluation spec: {}", - flag_key + "Not found in the local evaluation spec: {flag_key}" ))) } } @@ -483,8 +469,7 @@ impl FeatureStore { Ok(flag_config.filters.multivariate.is_none()) } else { Err(PostHogEvaluationError::NotAvailable(format!( - "Not found in the local evaluation spec: {}", - flag_key + "Not found in the local evaluation spec: {flag_key}" ))) } } diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index e7afc64564..482dd9a298 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -198,7 +198,7 @@ impl fmt::Display for CancelKeyData { // This format is more compact and might work better for logs. f.debug_tuple("CancelKeyData") - .field(&format_args!("{:x}", id)) + .field(&format_args!("{id:x}")) .finish() } } @@ -291,8 +291,7 @@ impl FeMessage { let len = (&buf[1..5]).read_u32::().unwrap(); if len < 4 { return Err(ProtocolError::Protocol(format!( - "invalid message length {}", - len + "invalid message length {len}" ))); } @@ -367,8 +366,7 @@ impl FeStartupPacket { #[allow(clippy::manual_range_contains)] if len < 8 || len > MAX_STARTUP_PACKET_LENGTH { return Err(ProtocolError::Protocol(format!( - "invalid startup packet message length {}", - len + "invalid startup packet message length {len}" ))); } diff --git a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs index a7bf3da20a..b8304f9d8d 100644 --- a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs +++ b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs @@ -308,7 +308,7 @@ impl ScramSha256 { let verifier = match parsed { ServerFinalMessage::Error(e) => { - return Err(io::Error::other(format!("SCRAM error: {}", e))); + return Err(io::Error::other(format!("SCRAM error: {e}"))); } ServerFinalMessage::Verifier(verifier) => verifier, }; @@ -343,10 +343,8 @@ impl<'a> Parser<'a> { match self.it.next() { Some((_, c)) if c == target => Ok(()), Some((i, c)) => { - let m = format!( - "unexpected character at byte {}: expected `{}` but got `{}", - i, target, c - ); + let m = + format!("unexpected character at byte {i}: expected `{target}` but got `{c}"); Err(io::Error::new(io::ErrorKind::InvalidInput, m)) } None => Err(io::Error::new( @@ -412,7 +410,7 @@ impl<'a> Parser<'a> { match self.it.peek() { Some(&(i, _)) => Err(io::Error::new( io::ErrorKind::InvalidInput, - format!("unexpected trailing data at byte {}", i), + format!("unexpected trailing data at byte {i}"), )), None => Ok(()), } diff --git a/libs/proxy/postgres-protocol2/src/message/backend.rs b/libs/proxy/postgres-protocol2/src/message/backend.rs index d7eaef9509..3fc9a9335c 100644 --- a/libs/proxy/postgres-protocol2/src/message/backend.rs +++ b/libs/proxy/postgres-protocol2/src/message/backend.rs @@ -211,7 +211,7 @@ impl Message { tag => { return Err(io::Error::new( io::ErrorKind::InvalidInput, - format!("unknown authentication tag `{}`", tag), + format!("unknown authentication tag `{tag}`"), )); } }, @@ -238,7 +238,7 @@ impl Message { tag => { return Err(io::Error::new( io::ErrorKind::InvalidInput, - format!("unknown message tag `{}`", tag), + 
format!("unknown message tag `{tag}`"), )); } }; diff --git a/libs/proxy/postgres-types2/src/lib.rs b/libs/proxy/postgres-types2/src/lib.rs index 7c9874bda3..c98c45636b 100644 --- a/libs/proxy/postgres-types2/src/lib.rs +++ b/libs/proxy/postgres-types2/src/lib.rs @@ -46,7 +46,7 @@ impl fmt::Display for Type { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { match self.schema() { "public" | "pg_catalog" => {} - schema => write!(fmt, "{}.", schema)?, + schema => write!(fmt, "{schema}.")?, } fmt.write_str(self.name()) } diff --git a/libs/proxy/tokio-postgres2/src/error/mod.rs b/libs/proxy/tokio-postgres2/src/error/mod.rs index 8149bceeb9..5309bce17e 100644 --- a/libs/proxy/tokio-postgres2/src/error/mod.rs +++ b/libs/proxy/tokio-postgres2/src/error/mod.rs @@ -332,10 +332,10 @@ impl fmt::Display for DbError { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { write!(fmt, "{}: {}", self.severity, self.message)?; if let Some(detail) = &self.detail { - write!(fmt, "\nDETAIL: {}", detail)?; + write!(fmt, "\nDETAIL: {detail}")?; } if let Some(hint) = &self.hint { - write!(fmt, "\nHINT: {}", hint)?; + write!(fmt, "\nHINT: {hint}")?; } Ok(()) } @@ -398,9 +398,9 @@ impl fmt::Display for Error { Kind::Io => fmt.write_str("error communicating with the server")?, Kind::UnexpectedMessage => fmt.write_str("unexpected message from server")?, Kind::Tls => fmt.write_str("error performing TLS handshake")?, - Kind::ToSql(idx) => write!(fmt, "error serializing parameter {}", idx)?, - Kind::FromSql(idx) => write!(fmt, "error deserializing column {}", idx)?, - Kind::Column(column) => write!(fmt, "invalid column `{}`", column)?, + Kind::ToSql(idx) => write!(fmt, "error serializing parameter {idx}")?, + Kind::FromSql(idx) => write!(fmt, "error deserializing column {idx}")?, + Kind::Column(column) => write!(fmt, "invalid column `{column}`")?, Kind::Closed => fmt.write_str("connection closed")?, Kind::Db => fmt.write_str("db error")?, Kind::Parse => fmt.write_str("error parsing response from server")?, @@ -411,7 +411,7 @@ impl fmt::Display for Error { Kind::Timeout => fmt.write_str("timeout waiting for server")?, }; if let Some(ref cause) = self.0.cause { - write!(fmt, ": {}", cause)?; + write!(fmt, ": {cause}")?; } Ok(()) } diff --git a/libs/proxy/tokio-postgres2/src/row.rs b/libs/proxy/tokio-postgres2/src/row.rs index 5fc955eef4..36d578558f 100644 --- a/libs/proxy/tokio-postgres2/src/row.rs +++ b/libs/proxy/tokio-postgres2/src/row.rs @@ -156,7 +156,7 @@ impl Row { { match self.get_inner(&idx) { Ok(ok) => ok, - Err(err) => panic!("error retrieving column {}: {}", idx, err), + Err(err) => panic!("error retrieving column {idx}: {err}"), } } @@ -274,7 +274,7 @@ impl SimpleQueryRow { { match self.get_inner(&idx) { Ok(ok) => ok, - Err(err) => panic!("error retrieving column {}: {}", idx, err), + Err(err) => panic!("error retrieving column {idx}: {err}"), } } diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 8320d7afdc..30690b1bdb 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -400,7 +400,7 @@ impl RemoteStorage for LocalFs { key }; - let relative_key = format!("{}", relative_key); + let relative_key = format!("{relative_key}"); if relative_key.contains(REMOTE_STORAGE_PREFIX_SEPARATOR) { let first_part = relative_key .split(REMOTE_STORAGE_PREFIX_SEPARATOR) @@ -594,13 +594,9 @@ impl RemoteStorage for LocalFs { let from_path = from.with_base(&self.storage_root); let to_path = to.with_base(&self.storage_root); 
create_target_directory(&to_path).await?; - fs::copy(&from_path, &to_path).await.with_context(|| { - format!( - "Failed to copy file from '{from_path}' to '{to_path}'", - from_path = from_path, - to_path = to_path - ) - })?; + fs::copy(&from_path, &to_path) + .await + .with_context(|| format!("Failed to copy file from '{from_path}' to '{to_path}'"))?; Ok(()) } @@ -1183,7 +1179,7 @@ mod fs_tests { .write(true) .create_new(true) .open(path)?; - write!(file_for_writing, "{}", contents)?; + write!(file_for_writing, "{contents}")?; drop(file_for_writing); let file_size = path.metadata()?.len() as usize; Ok(( diff --git a/libs/safekeeper_api/src/membership.rs b/libs/safekeeper_api/src/membership.rs index 3d4d17096e..1751c54f6a 100644 --- a/libs/safekeeper_api/src/membership.rs +++ b/libs/safekeeper_api/src/membership.rs @@ -193,10 +193,10 @@ mod tests { }) .unwrap(); - println!("members: {}", members); + println!("members: {members}"); let j = serde_json::to_string(&members).expect("failed to serialize"); - println!("members json: {}", j); + println!("members json: {j}"); assert_eq!( j, r#"[{"id":42,"host":"lala.org","pg_port":5432},{"id":43,"host":"bubu.org","pg_port":5432}]"# diff --git a/libs/utils/src/error.rs b/libs/utils/src/error.rs index 7ce203e918..6fa86916c1 100644 --- a/libs/utils/src/error.rs +++ b/libs/utils/src/error.rs @@ -41,7 +41,7 @@ pub fn report_compact_sources(e: &E) -> impl std::fmt::Dis // why is E a generic parameter here? hope that rustc will see through a default // Error::source implementation and leave the following out if there cannot be any // sources: - Sources(self.0.source()).try_for_each(|src| write!(f, ": {}", src)) + Sources(self.0.source()).try_for_each(|src| write!(f, ": {src}")) } } diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs index b5e4a4644a..8a3bef914a 100644 --- a/libs/utils/src/generation.rs +++ b/libs/utils/src/generation.rs @@ -135,7 +135,7 @@ impl Debug for Generation { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::Valid(v) => { - write!(f, "{:08x}", v) + write!(f, "{v:08x}") } Self::None => { write!(f, "") diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs index 68cb1f0209..e3037aec21 100644 --- a/libs/utils/src/id.rs +++ b/libs/utils/src/id.rs @@ -280,7 +280,7 @@ impl TryFrom> for TimelineId { value .unwrap_or_default() .parse::() - .with_context(|| format!("Could not parse timeline id from {:?}", value)) + .with_context(|| format!("Could not parse timeline id from {value:?}")) } } diff --git a/libs/utils/src/postgres_client.rs b/libs/utils/src/postgres_client.rs index 4167839e28..7596fefe38 100644 --- a/libs/utils/src/postgres_client.rs +++ b/libs/utils/src/postgres_client.rs @@ -89,7 +89,7 @@ pub fn wal_stream_connection_config( .set_password(args.auth_token.map(|s| s.to_owned())); if let Some(availability_zone) = args.availability_zone { - connstr = connstr.extend_options([format!("availability_zone={}", availability_zone)]); + connstr = connstr.extend_options([format!("availability_zone={availability_zone}")]); } Ok(connstr) diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs index c8c410a725..f2b81373e2 100644 --- a/libs/utils/src/shard.rs +++ b/libs/utils/src/shard.rs @@ -196,7 +196,7 @@ impl std::fmt::Display for TenantShardId { impl std::fmt::Debug for TenantShardId { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { // Debug is the same as Display: the compact hex representation - write!(f, "{}", self) + write!(f, "{self}") } } @@ 
-284,7 +284,7 @@ impl std::fmt::Display for ShardIndex { impl std::fmt::Debug for ShardIndex { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { // Debug is the same as Display: the compact hex representation - write!(f, "{}", self) + write!(f, "{self}") } } diff --git a/libs/utils/src/signals.rs b/libs/utils/src/signals.rs index 426bb65916..bdaa3cb665 100644 --- a/libs/utils/src/signals.rs +++ b/libs/utils/src/signals.rs @@ -29,7 +29,7 @@ impl ShutdownSignals { SIGINT => Signal::Interrupt, SIGTERM => Signal::Terminate, SIGQUIT => Signal::Quit, - other => panic!("unknown signal: {}", other), + other => panic!("unknown signal: {other}"), }; handler(signal)?; diff --git a/libs/vm_monitor/src/dispatcher.rs b/libs/vm_monitor/src/dispatcher.rs index 7b7201ab77..7bd6adc2f8 100644 --- a/libs/vm_monitor/src/dispatcher.rs +++ b/libs/vm_monitor/src/dispatcher.rs @@ -90,8 +90,7 @@ impl Dispatcher { Err(e) => { sink.send(Message::Text(Utf8Bytes::from( serde_json::to_string(&ProtocolResponse::Error(format!( - "Received protocol version range {} which does not overlap with {}", - agent_range, monitor_range + "Received protocol version range {agent_range} which does not overlap with {monitor_range}" ))) .unwrap(), ))) diff --git a/libs/vm_monitor/src/filecache.rs b/libs/vm_monitor/src/filecache.rs index bc42347e5a..55bbdea169 100644 --- a/libs/vm_monitor/src/filecache.rs +++ b/libs/vm_monitor/src/filecache.rs @@ -285,7 +285,7 @@ impl FileCacheState { // why we're constructing the query here. self.client .query( - &format!("ALTER SYSTEM SET neon.file_cache_size_limit = {};", num_mb), + &format!("ALTER SYSTEM SET neon.file_cache_size_limit = {num_mb};"), &[], ) .await diff --git a/libs/wal_decoder/benches/bench_interpret_wal.rs b/libs/wal_decoder/benches/bench_interpret_wal.rs index ed6ba4d267..ff860a92e2 100644 --- a/libs/wal_decoder/benches/bench_interpret_wal.rs +++ b/libs/wal_decoder/benches/bench_interpret_wal.rs @@ -64,7 +64,7 @@ async fn download_bench_data( let temp_dir_parent: Utf8PathBuf = env::current_dir().unwrap().try_into()?; let temp_dir = camino_tempfile::tempdir_in(temp_dir_parent)?; - eprintln!("Downloading benchmark data to {:?}", temp_dir); + eprintln!("Downloading benchmark data to {temp_dir:?}"); let listing = client .list(None, ListingMode::NoDelimiter, None, cancel) @@ -120,7 +120,7 @@ struct BenchmarkMetadata { } async fn load_bench_data(path: &Utf8Path, input_size: usize) -> anyhow::Result { - eprintln!("Loading benchmark data from {:?}", path); + eprintln!("Loading benchmark data from {path:?}"); let mut entries = tokio::fs::read_dir(path).await?; let mut ordered_segment_paths = Vec::new(); diff --git a/libs/wal_decoder/build.rs b/libs/wal_decoder/build.rs index d5b7ad02ad..e8acb52256 100644 --- a/libs/wal_decoder/build.rs +++ b/libs/wal_decoder/build.rs @@ -6,6 +6,6 @@ fn main() -> Result<(), Box> { // the build then. Anyway, per cargo docs build script shouldn't output to // anywhere but $OUT_DIR. 
tonic_build::compile_protos("proto/interpreted_wal.proto") - .unwrap_or_else(|e| panic!("failed to compile protos {:?}", e)); + .unwrap_or_else(|e| panic!("failed to compile protos {e:?}")); Ok(()) } diff --git a/libs/wal_decoder/src/models/record.rs b/libs/wal_decoder/src/models/record.rs index 73516c5220..51659ed904 100644 --- a/libs/wal_decoder/src/models/record.rs +++ b/libs/wal_decoder/src/models/record.rs @@ -128,6 +128,6 @@ pub fn describe_wal_record(rec: &NeonWalRecord) -> Result Ok(format!("{:?}", rec)), + _ => Ok(format!("{rec:?}")), } } diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index b89f1877fd..7c6abf252e 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -376,7 +376,7 @@ impl Level { FATAL => Level::Fatal, PANIC => Level::Panic, WPEVENT => Level::WPEvent, - _ => panic!("unknown log level {}", elevel), + _ => panic!("unknown log level {elevel}"), } } } @@ -446,7 +446,7 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState { impl std::fmt::Display for Level { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!(f, "{:?}", self) + write!(f, "{self:?}") } } diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index c853658ddf..93bb0d5eb0 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -380,7 +380,7 @@ mod tests { } fn conn_send_query(&self, _: &mut crate::bindings::Safekeeper, query: &str) -> bool { - println!("conn_send_query: {}", query); + println!("conn_send_query: {query}"); true } @@ -399,13 +399,13 @@ mod tests { ) -> crate::bindings::PGAsyncReadResult { println!("conn_async_read"); let reply = self.next_safekeeper_reply(); - println!("conn_async_read result: {:?}", reply); + println!("conn_async_read result: {reply:?}"); vec.extend_from_slice(reply); crate::bindings::PGAsyncReadResult_PG_ASYNC_READ_SUCCESS } fn conn_blocking_write(&self, _: &mut crate::bindings::Safekeeper, buf: &[u8]) -> bool { - println!("conn_blocking_write: {:?}", buf); + println!("conn_blocking_write: {buf:?}"); self.check_walproposer_msg(buf); true } @@ -456,10 +456,7 @@ mod tests { timeout_millis: i64, ) -> super::WaitResult { let data = self.wait_events.get(); - println!( - "wait_event_set, timeout_millis={}, res={:?}", - timeout_millis, data - ); + println!("wait_event_set, timeout_millis={timeout_millis}, res={data:?}"); super::WaitResult::Network(data.sk, data.event_mask) } @@ -475,7 +472,7 @@ mod tests { } fn log_internal(&self, _wp: &mut crate::bindings::WalProposer, level: Level, msg: &str) { - println!("wp_log[{}] {}", level, msg); + println!("wp_log[{level}] {msg}"); } fn after_election(&self, _wp: &mut crate::bindings::WalProposer) { diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 219e63c9d4..8b091684eb 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -508,11 +508,11 @@ impl Client { .expect("Cannot build URL"); path.query_pairs_mut() - .append_pair("recurse", &format!("{}", recurse)); + .append_pair("recurse", &format!("{recurse}")); if let Some(concurrency) = concurrency { path.query_pairs_mut() - .append_pair("concurrency", &format!("{}", concurrency)); + .append_pair("concurrency", &format!("{concurrency}")); } self.request(Method::POST, path, ()).await.map(|_| ()) diff --git a/pageserver/compaction/src/simulator/draw.rs b/pageserver/compaction/src/simulator/draw.rs index 3d35d1b91e..8322fe7d6d 100644 
--- a/pageserver/compaction/src/simulator/draw.rs +++ b/pageserver/compaction/src/simulator/draw.rs @@ -152,7 +152,7 @@ pub fn draw_history(history: &[LayerTraceEvent], mut output: let key_diff = key_end - key_start; if key_start >= key_end { - panic!("Invalid key range {}-{}", key_start, key_end); + panic!("Invalid key range {key_start}-{key_end}"); } let lsn_start = lsn_map.map(f.lsn_range.start); @@ -212,12 +212,12 @@ pub fn draw_history(history: &[LayerTraceEvent], mut output: )?; writeln!(svg, "")?; } - Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end), + Ordering::Greater => panic!("Invalid lsn range {lsn_start}-{lsn_end}"), } files_seen.insert(f); } - writeln!(svg, "{}", EndSvg)?; + writeln!(svg, "{EndSvg}")?; let mut layer_events_str = String::new(); let mut first = true; diff --git a/pageserver/ctl/src/draw_timeline_dir.rs b/pageserver/ctl/src/draw_timeline_dir.rs index 881ebd49a7..2135d302c1 100644 --- a/pageserver/ctl/src/draw_timeline_dir.rs +++ b/pageserver/ctl/src/draw_timeline_dir.rs @@ -228,7 +228,7 @@ pub fn main() -> Result<()> { let lsn_max = lsn_map.len(); if key_start >= key_end { - panic!("Invalid key range {}-{}", key_start, key_end); + panic!("Invalid key range {key_start}-{key_end}"); } let lsn_start = *lsn_map.get(&lsnr.start).unwrap(); @@ -250,7 +250,7 @@ pub fn main() -> Result<()> { ymargin = 0.05; fill = Fill::Color(rgb(0, 0, 0)); } - Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end), + Ordering::Greater => panic!("Invalid lsn range {lsn_start}-{lsn_end}"), } println!( @@ -287,10 +287,10 @@ pub fn main() -> Result<()> { ); } - println!("{}", EndSvg); + println!("{EndSvg}"); - eprintln!("num_images: {}", num_images); - eprintln!("num_deltas: {}", num_deltas); + eprintln!("num_images: {num_images}"); + eprintln!("num_deltas: {num_deltas}"); Ok(()) } diff --git a/pageserver/ctl/src/key.rs b/pageserver/ctl/src/key.rs index 600f7c412e..c4daafdfd0 100644 --- a/pageserver/ctl/src/key.rs +++ b/pageserver/ctl/src/key.rs @@ -372,7 +372,7 @@ impl std::fmt::Debug for RelTagish { f.write_char('/')?; } first = false; - write!(f, "{}", x) + write!(f, "{x}") }) } } diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index c49c8b58df..ef844fbd0f 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -224,8 +224,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> { } } println!( - "Total delta layers {} image layers {} excess layers {}", - total_delta_layers, total_image_layers, total_excess_layers + "Total delta layers {total_delta_layers} image layers {total_image_layers} excess layers {total_excess_layers}" ); Ok(()) } diff --git a/pageserver/page_api/src/client.rs b/pageserver/page_api/src/client.rs index aa4774c056..3977ce7c23 100644 --- a/pageserver/page_api/src/client.rs +++ b/pageserver/page_api/src/client.rs @@ -131,7 +131,7 @@ impl Client { let domain_stream = response_stream.map(|chunk_res| { chunk_res.and_then(|proto_chunk| { proto_chunk.try_into().map_err(|e| { - tonic::Status::internal(format!("Failed to convert response chunk: {}", e)) + tonic::Status::internal(format!("Failed to convert response chunk: {e}")) }) }) }); diff --git a/pageserver/pagebench/src/cmd/aux_files.rs b/pageserver/pagebench/src/cmd/aux_files.rs index 6441c047c2..43d7a73399 100644 --- a/pageserver/pagebench/src/cmd/aux_files.rs +++ b/pageserver/pagebench/src/cmd/aux_files.rs @@ -62,7 +62,7 @@ async fn main_impl(args: Args) -> 
anyhow::Result<()> { let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id); let timeline_id = timeline.timeline_id; - println!("operating on timeline {}", timeline); + println!("operating on timeline {timeline}"); mgmt_api_client .set_tenant_config(&TenantConfigRequest { @@ -75,8 +75,8 @@ async fn main_impl(args: Args) -> anyhow::Result<()> { let items = (0..100) .map(|id| { ( - format!("pg_logical/mappings/{:03}.{:03}", batch, id), - format!("{:08}", id), + format!("pg_logical/mappings/{batch:03}.{id:03}"), + format!("{id:08}"), ) }) .collect::>(); diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index fe136b8bbd..c4aaff58a1 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -669,7 +669,7 @@ where } // Append dir path for each database - let path = format!("base/{}", dbnode); + let path = format!("base/{dbnode}"); let header = new_tar_header_dir(&path)?; self.ar .append(&header, io::empty()) @@ -677,7 +677,7 @@ where .map_err(|e| BasebackupError::Client(e, "add_dbdir,base"))?; if let Some(img) = relmap_img { - let dst_path = format!("base/{}/PG_VERSION", dbnode); + let dst_path = format!("base/{dbnode}/PG_VERSION"); let pg_version_str = match self.timeline.pg_version { 14 | 15 => self.timeline.pg_version.to_string(), @@ -689,7 +689,7 @@ where .await .map_err(|e| BasebackupError::Client(e, "add_dbdir,base/PG_VERSION"))?; - let relmap_path = format!("base/{}/pg_filenode.map", dbnode); + let relmap_path = format!("base/{dbnode}/pg_filenode.map"); let header = new_tar_header(&relmap_path, img.len() as u64)?; self.ar .append(&header, &img[..]) @@ -714,9 +714,9 @@ where let crc = crc32c::crc32c(&img[..]); buf.put_u32_le(crc); let path = if self.timeline.pg_version < 17 { - format!("pg_twophase/{:>08X}", xid) + format!("pg_twophase/{xid:>08X}") } else { - format!("pg_twophase/{:>016X}", xid) + format!("pg_twophase/{xid:>016X}") }; let header = new_tar_header(&path, buf.len() as u64)?; self.ar @@ -768,7 +768,7 @@ where //send wal segment let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE); let wal_file_name = XLogFileName(PG_TLI, segno, WAL_SEGMENT_SIZE); - let wal_file_path = format!("pg_wal/{}", wal_file_name); + let wal_file_path = format!("pg_wal/{wal_file_name}"); let header = new_tar_header(&wal_file_path, WAL_SEGMENT_SIZE as u64)?; let wal_seg = postgres_ffi::generate_wal_segment( diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 2e9cd3ad70..349bc6dba6 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -287,11 +287,11 @@ impl From for ApiError { GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => { ApiError::ShuttingDown } - GetActiveTenantError::WillNotBecomeActive(_) => ApiError::Conflict(format!("{}", e)), + GetActiveTenantError::WillNotBecomeActive(_) => ApiError::Conflict(format!("{e}")), GetActiveTenantError::Cancelled => ApiError::ShuttingDown, GetActiveTenantError::NotFound(gte) => gte.into(), GetActiveTenantError::WaitForActiveTimeout { .. 
} => { - ApiError::ResourceUnavailable(format!("{}", e).into()) + ApiError::ResourceUnavailable(format!("{e}").into()) } GetActiveTenantError::SwitchedTenant => { // in our HTTP handlers, this error doesn't happen @@ -1015,7 +1015,7 @@ async fn get_lsn_by_timestamp_handler( let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let timestamp_raw = must_get_query_param(&request, "timestamp")?; let timestamp = humantime::parse_rfc3339(×tamp_raw) - .with_context(|| format!("Invalid time: {:?}", timestamp_raw)) + .with_context(|| format!("Invalid time: {timestamp_raw:?}")) .map_err(ApiError::BadRequest)?; let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp); @@ -1110,7 +1110,7 @@ async fn get_timestamp_of_lsn_handler( json_response(StatusCode::OK, time) } None => Err(ApiError::PreconditionFailed( - format!("Timestamp for lsn {} not found", lsn).into(), + format!("Timestamp for lsn {lsn} not found").into(), )), } } @@ -2421,7 +2421,7 @@ async fn timeline_offload_handler( } if let (false, reason) = timeline.can_offload() { return Err(ApiError::PreconditionFailed( - format!("Timeline::can_offload() check failed: {}", reason) .into(), + format!("Timeline::can_offload() check failed: {reason}") .into(), )); } offload_timeline(&tenant, &timeline) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 8d6d342cf9..7929b094b4 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1727,12 +1727,7 @@ impl Drop for SmgrOpTimer { impl SmgrOpFlushInProgress { /// The caller must guarantee that `socket_fd`` outlives this function. - pub(crate) async fn measure( - self, - started_at: Instant, - mut fut: Fut, - socket_fd: RawFd, - ) -> O + pub(crate) async fn measure(self, started_at: Instant, fut: Fut, socket_fd: RawFd) -> O where Fut: std::future::Future, { diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 032db34983..d3a1ca681e 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -392,16 +392,14 @@ async fn page_service_conn_main( } else { let tenant_id = conn_handler.timeline_handles.as_ref().unwrap().tenant_id(); Err(io_error).context(format!( - "Postgres connection error for tenant_id={:?} client at peer_addr={}", - tenant_id, peer_addr + "Postgres connection error for tenant_id={tenant_id:?} client at peer_addr={peer_addr}" )) } } other => { let tenant_id = conn_handler.timeline_handles.as_ref().unwrap().tenant_id(); other.context(format!( - "Postgres query error for tenant_id={:?} client peer_addr={}", - tenant_id, peer_addr + "Postgres query error for tenant_id={tenant_id:?} client peer_addr={peer_addr}" )) } } @@ -2140,8 +2138,7 @@ impl PageServerHandler { if request_lsn < not_modified_since { return Err(PageStreamError::BadRequest( format!( - "invalid request with request LSN {} and not_modified_since {}", - request_lsn, not_modified_since, + "invalid request with request LSN {request_lsn} and not_modified_since {not_modified_since}", ) .into(), )); diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 58af2548ee..180a5b76e8 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1185,7 +1185,7 @@ impl Timeline { } let origin_id = k.field6 as RepOriginId; let origin_lsn = Lsn::des(&v) - .with_context(|| format!("decode replorigin value for {}: {v:?}", origin_id))?; + .with_context(|| format!("decode replorigin value for {origin_id}: {v:?}"))?; if origin_lsn != Lsn::INVALID { 
result.insert(origin_id, origin_lsn); } @@ -2440,8 +2440,7 @@ impl DatadirModification<'_> { if path == p { assert!( modifying_file.is_none(), - "duplicated entries found for {}", - path + "duplicated entries found for {path}" ); modifying_file = Some(content); } else { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d2c2fdef93..a48bf15246 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3449,7 +3449,7 @@ impl TenantShard { use pageserver_api::models::ActivatingFrom; match &*current_state { TenantState::Activating(_) | TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => { - panic!("caller is responsible for calling activate() only on Loading / Attaching tenants, got {state:?}", state = current_state); + panic!("caller is responsible for calling activate() only on Loading / Attaching tenants, got {current_state:?}"); } TenantState::Attaching => { *current_state = TenantState::Activating(ActivatingFrom::Attaching); @@ -6616,7 +6616,7 @@ mod tests { .put( *TEST_KEY, lsn, - &Value::Image(test_img(&format!("foo at {}", lsn))), + &Value::Image(test_img(&format!("foo at {lsn}"))), ctx, ) .await?; @@ -6626,7 +6626,7 @@ mod tests { .put( *TEST_KEY, lsn, - &Value::Image(test_img(&format!("foo at {}", lsn))), + &Value::Image(test_img(&format!("foo at {lsn}"))), ctx, ) .await?; @@ -6640,7 +6640,7 @@ mod tests { .put( *TEST_KEY, lsn, - &Value::Image(test_img(&format!("foo at {}", lsn))), + &Value::Image(test_img(&format!("foo at {lsn}"))), ctx, ) .await?; @@ -6650,7 +6650,7 @@ mod tests { .put( *TEST_KEY, lsn, - &Value::Image(test_img(&format!("foo at {}", lsn))), + &Value::Image(test_img(&format!("foo at {lsn}"))), ctx, ) .await?; @@ -7149,7 +7149,7 @@ mod tests { .put( test_key, lsn, - &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &Value::Image(test_img(&format!("{blknum} at {lsn}"))), ctx, ) .await?; @@ -7437,7 +7437,7 @@ mod tests { .put( gap_at_key, current_lsn, - &Value::Image(test_img(&format!("{} at {}", gap_at_key, current_lsn))), + &Value::Image(test_img(&format!("{gap_at_key} at {current_lsn}"))), &ctx, ) .await?; @@ -7476,7 +7476,7 @@ mod tests { .put( current_key, current_lsn, - &Value::Image(test_img(&format!("{} at {}", current_key, current_lsn))), + &Value::Image(test_img(&format!("{current_key} at {current_lsn}"))), &ctx, ) .await?; @@ -7584,7 +7584,7 @@ mod tests { while key < end_key { current_lsn += 0x10; - let image_value = format!("{} at {}", child_gap_at_key, current_lsn); + let image_value = format!("{child_gap_at_key} at {current_lsn}"); let mut writer = parent_timeline.writer().await; writer @@ -7627,7 +7627,7 @@ mod tests { .put( key, current_lsn, - &Value::Image(test_img(&format!("{} at {}", key, current_lsn))), + &Value::Image(test_img(&format!("{key} at {current_lsn}"))), &ctx, ) .await?; @@ -7748,7 +7748,7 @@ mod tests { .put( test_key, lsn, - &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &Value::Image(test_img(&format!("{blknum} at {lsn}"))), &ctx, ) .await?; @@ -7769,7 +7769,7 @@ mod tests { .put( test_key, lsn, - &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &Value::Image(test_img(&format!("{blknum} at {lsn}"))), &ctx, ) .await?; @@ -7783,7 +7783,7 @@ mod tests { test_key.field6 = blknum as u32; assert_eq!( tline.get(test_key, lsn, &ctx).await?, - test_img(&format!("{} at {}", blknum, last_lsn)) + test_img(&format!("{blknum} at {last_lsn}")) ); } @@ -7829,7 +7829,7 @@ mod tests { .put( test_key, lsn, - &Value::Image(test_img(&format!("{} at {}", 
blknum, lsn))), + &Value::Image(test_img(&format!("{blknum} at {lsn}"))), &ctx, ) .await?; @@ -7858,11 +7858,11 @@ mod tests { .put( test_key, lsn, - &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &Value::Image(test_img(&format!("{blknum} at {lsn}"))), &ctx, ) .await?; - println!("updating {} at {}", blknum, lsn); + println!("updating {blknum} at {lsn}"); writer.finish_write(lsn); drop(writer); updated[blknum] = lsn; @@ -7873,7 +7873,7 @@ mod tests { test_key.field6 = blknum as u32; assert_eq!( tline.get(test_key, lsn, &ctx).await?, - test_img(&format!("{} at {}", blknum, last_lsn)) + test_img(&format!("{blknum} at {last_lsn}")) ); } @@ -7926,11 +7926,11 @@ mod tests { .put( test_key, lsn, - &Value::Image(test_img(&format!("{} {} at {}", idx, blknum, lsn))), + &Value::Image(test_img(&format!("{idx} {blknum} at {lsn}"))), &ctx, ) .await?; - println!("updating [{}][{}] at {}", idx, blknum, lsn); + println!("updating [{idx}][{blknum}] at {lsn}"); writer.finish_write(lsn); drop(writer); updated[idx][blknum] = lsn; @@ -8136,7 +8136,7 @@ mod tests { .put( test_key, lsn, - &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &Value::Image(test_img(&format!("{blknum} at {lsn}"))), &ctx, ) .await?; @@ -8153,7 +8153,7 @@ mod tests { test_key.field6 = (blknum * STEP) as u32; assert_eq!( tline.get(test_key, lsn, &ctx).await?, - test_img(&format!("{} at {}", blknum, last_lsn)) + test_img(&format!("{blknum} at {last_lsn}")) ); } @@ -8190,7 +8190,7 @@ mod tests { .put( test_key, lsn, - &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &Value::Image(test_img(&format!("{blknum} at {lsn}"))), &ctx, ) .await?; @@ -8443,7 +8443,7 @@ mod tests { .put( test_key, lsn, - &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &Value::Image(test_img(&format!("{blknum} at {lsn}"))), &ctx, ) .await?; @@ -8463,7 +8463,7 @@ mod tests { .put( test_key, lsn, - &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &Value::Image(test_img(&format!("{blknum} at {lsn}"))), &ctx, ) .await?; @@ -9384,12 +9384,7 @@ mod tests { let end_lsn = Lsn(0x100); let image_layers = (0x20..=0x90) .step_by(0x10) - .map(|n| { - ( - Lsn(n), - vec![(key, test_img(&format!("data key at {:x}", n)))], - ) - }) + .map(|n| (Lsn(n), vec![(key, test_img(&format!("data key at {n:x}")))])) .collect(); let timeline = tenant diff --git a/pageserver/src/tenant/checks.rs b/pageserver/src/tenant/checks.rs index d5b979ab2a..83d54f09de 100644 --- a/pageserver/src/tenant/checks.rs +++ b/pageserver/src/tenant/checks.rs @@ -63,8 +63,7 @@ pub fn check_valid_layermap(metadata: &[LayerName]) -> Option { && overlaps_with(&layer.key_range, &other_layer.key_range) { let err = format!( - "layer violates the layer map LSN split assumption: layer {} intersects with layer {}", - layer, other_layer + "layer violates the layer map LSN split assumption: layer {layer} intersects with layer {other_layer}" ); return Some(err); } diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index bea3128265..5081d7f5a4 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -550,8 +550,7 @@ mod tests { assert_eq!( deserialized_metadata.body, expected_metadata.body, - "Metadata of the old version {} should be upgraded to the latest version {}", - METADATA_OLD_FORMAT_VERSION, METADATA_FORMAT_VERSION + "Metadata of the old version {METADATA_OLD_FORMAT_VERSION} should be upgraded to the latest version {METADATA_FORMAT_VERSION}" ); } diff --git 
a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index dd49c843f3..6b315dc4bc 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -1427,7 +1427,7 @@ async fn init_timeline_state( let local_meta = dentry .metadata() .await - .fatal_err(&format!("Read metadata on {}", file_path)); + .fatal_err(&format!("Read metadata on {file_path}")); let file_name = file_path.file_name().expect("created it from the dentry"); if crate::is_temporary(&file_path) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index ba763d4c3f..0f31318f0c 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -783,7 +783,7 @@ impl DeltaLayer { ctx, ) .await - .with_context(|| format!("Failed to open file '{}'", path))?; + .with_context(|| format!("Failed to open file '{path}'"))?; let file_id = page_cache::next_file_id(); let block_reader = FileBlockReader::new(&file, file_id); let summary_blk = block_reader.read_blk(0, ctx).await?; @@ -1401,7 +1401,7 @@ impl DeltaLayerInner { match val { Value::Image(img) => { let checkpoint = CheckPoint::decode(&img)?; - println!(" CHECKPOINT: {:?}", checkpoint); + println!(" CHECKPOINT: {checkpoint:?}"); } Value::WalRecord(_rec) => { println!(" unexpected walrecord value for checkpoint key"); diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index d6f5f48a6e..9f76f697d3 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -272,8 +272,7 @@ impl ImageLayer { conf.timeline_path(&tenant_shard_id, &timeline_id) .join(format!( - "{fname}.{:x}.{TEMP_FILE_SUFFIX}", - filename_disambiguator + "{fname}.{filename_disambiguator:x}.{TEMP_FILE_SUFFIX}" )) } @@ -370,7 +369,7 @@ impl ImageLayer { ctx, ) .await - .with_context(|| format!("Failed to open file '{}'", path))?; + .with_context(|| format!("Failed to open file '{path}'"))?; let file_id = page_cache::next_file_id(); let block_reader = FileBlockReader::new(&file, file_id); let summary_blk = block_reader.read_blk(0, ctx).await?; @@ -1475,7 +1474,7 @@ mod test { assert_eq!(l1, expect_lsn); assert_eq!(&i1, i2); } - (o1, o2) => panic!("iterators length mismatch: {:?}, {:?}", o1, o2), + (o1, o2) => panic!("iterators length mismatch: {o1:?}, {o2:?}"), } } } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 8e5b0ba648..c4d53c6405 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -511,7 +511,7 @@ fn inmem_layer_log_display( start_lsn: Lsn, end_lsn: Lsn, ) -> std::fmt::Result { - write!(f, "timeline {} in-memory ", timeline)?; + write!(f, "timeline {timeline} in-memory ")?; inmem_layer_display(f, start_lsn, end_lsn) } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs index ea354fc716..27fbc6f5fb 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs @@ -380,7 +380,7 @@ impl std::fmt::Debug for LogicalReadState { write!(f, "Ongoing({:?})", BufferDebug::from(b as &dyn Buffer)) } LogicalReadState::Ok(b) => 
write!(f, "Ok({:?})", BufferDebug::from(b as &dyn Buffer)), - LogicalReadState::Error(e) => write!(f, "Error({:?})", e), + LogicalReadState::Error(e) => write!(f, "Error({e:?})"), LogicalReadState::Undefined => write!(f, "Undefined"), } } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 3d55972017..0be13e67a8 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -105,7 +105,7 @@ impl std::fmt::Display for Layer { impl std::fmt::Debug for Layer { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self) + write!(f, "{self}") } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 1bb17af146..81f2646e5a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -178,7 +178,7 @@ pub enum LastImageLayerCreationStatus { impl std::fmt::Display for ImageLayerCreationMode { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{:?}", self) + write!(f, "{self:?}") } } @@ -632,7 +632,7 @@ pub enum ReadPathLayerId { impl std::fmt::Display for ReadPathLayerId { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - ReadPathLayerId::PersistentLayer(key) => write!(f, "{}", key), + ReadPathLayerId::PersistentLayer(key) => write!(f, "{key}"), ReadPathLayerId::InMemoryLayer(range) => { write!(f, "in-mem {}..{}", range.start, range.end) } @@ -708,7 +708,7 @@ impl MissingKeyError { impl std::fmt::Debug for MissingKeyError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self) + write!(f, "{self}") } } @@ -721,19 +721,19 @@ impl std::fmt::Display for MissingKeyError { )?; if let Some(ref ancestor_lsn) = self.ancestor_lsn { - write!(f, ", ancestor {}", ancestor_lsn)?; + write!(f, ", ancestor {ancestor_lsn}")?; } if let Some(ref query) = self.query { - write!(f, ", query {}", query)?; + write!(f, ", query {query}")?; } if let Some(ref read_path) = self.read_path { - write!(f, "\n{}", read_path)?; + write!(f, "\n{read_path}")?; } if let Some(ref backtrace) = self.backtrace { - write!(f, "\n{}", backtrace)?; + write!(f, "\n{backtrace}")?; } Ok(()) @@ -7179,9 +7179,7 @@ impl Timeline { if let Some(end) = layer_end_lsn { assert!( end <= last_record_lsn, - "advance last record lsn before inserting a layer, end_lsn={}, last_record_lsn={}", - end, - last_record_lsn, + "advance last record lsn before inserting a layer, end_lsn={end}, last_record_lsn={last_record_lsn}", ); } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 6039c002f7..1b8e5f4b9c 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -977,7 +977,7 @@ impl KeyHistoryRetention { tline .reconstruct_value(key, lsn, data, RedoAttemptType::GcCompaction) .await - .with_context(|| format!("verification failed for key {} at lsn {}", key, lsn))?; + .with_context(|| format!("verification failed for key {key} at lsn {lsn}"))?; Ok(()) } @@ -2647,15 +2647,15 @@ impl Timeline { use std::fmt::Write; let mut output = String::new(); if let Some((key, _, _)) = replay_history.first() { - write!(output, "key={} ", key).unwrap(); + write!(output, "key={key} ").unwrap(); let mut cnt = 0; for (_, lsn, val) in replay_history { if val.is_image() { - write!(output, "i@{} ", lsn).unwrap(); + write!(output, "i@{lsn} ").unwrap(); } else if 
val.will_init() { - write!(output, "di@{} ", lsn).unwrap(); + write!(output, "di@{lsn} ").unwrap(); } else { - write!(output, "d@{} ", lsn).unwrap(); + write!(output, "d@{lsn} ").unwrap(); } cnt += 1; if cnt >= 128 { diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 343e04f5f0..e91bd5d43a 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -360,8 +360,7 @@ pub(super) async fn handle_walreceiver_connection( match raw_wal_start_lsn.cmp(&expected_wal_start) { std::cmp::Ordering::Greater => { let msg = format!( - "Gap in streamed WAL: [{}, {})", - expected_wal_start, raw_wal_start_lsn + "Gap in streamed WAL: [{expected_wal_start}, {raw_wal_start_lsn})" ); critical!("{msg}"); return Err(WalReceiverError::Other(anyhow!(msg))); diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs index 090d2ece85..85ea5c4d80 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs @@ -68,16 +68,9 @@ impl AlignedBuffer { assert!( begin <= end, - "range start must not be greater than end: {:?} <= {:?}", - begin, - end, - ); - assert!( - end <= len, - "range end out of bounds: {:?} <= {:?}", - end, - len, + "range start must not be greater than end: {begin:?} <= {end:?}", ); + assert!(end <= len, "range end out of bounds: {end:?} <= {len:?}",); let begin = self.range.start + begin; let end = self.range.start + end; diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs index 07f949b89e..93116ea85e 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs @@ -242,10 +242,7 @@ unsafe impl bytes::BufMut for AlignedBufferMut { /// Panic with a nice error message. #[cold] fn panic_advance(idx: usize, len: usize) -> ! 
{ - panic!( - "advance out of bounds: the len is {} but advancing by {}", - len, idx - ); + panic!("advance out of bounds: the len is {len} but advancing by {idx}"); } /// Safety: [`AlignedBufferMut`] has exclusive ownership of the io buffer, diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index ebffaf70e2..c452f48e40 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -2108,7 +2108,7 @@ mod tests { // Check relation content for blkno in 0..relsize { let lsn = Lsn(0x20); - let data = format!("foo blk {} at {}", blkno, lsn); + let data = format!("foo blk {blkno} at {lsn}"); assert_eq!( tline .get_rel_page_at_lsn( @@ -2142,7 +2142,7 @@ mod tests { for blkno in 0..1 { let lsn = Lsn(0x20); - let data = format!("foo blk {} at {}", blkno, lsn); + let data = format!("foo blk {blkno} at {lsn}"); assert_eq!( tline .get_rel_page_at_lsn( @@ -2167,7 +2167,7 @@ mod tests { ); for blkno in 0..relsize { let lsn = Lsn(0x20); - let data = format!("foo blk {} at {}", blkno, lsn); + let data = format!("foo blk {blkno} at {lsn}"); assert_eq!( tline .get_rel_page_at_lsn( @@ -2188,7 +2188,7 @@ mod tests { let lsn = Lsn(0x80); let mut m = tline.begin_modification(lsn); for blkno in 0..relsize { - let data = format!("foo blk {} at {}", blkno, lsn); + let data = format!("foo blk {blkno} at {lsn}"); walingest .put_rel_page_image(&mut m, TESTREL_A, blkno, test_img(&data), &ctx) .await?; @@ -2210,7 +2210,7 @@ mod tests { // Check relation content for blkno in 0..relsize { let lsn = Lsn(0x80); - let data = format!("foo blk {} at {}", blkno, lsn); + let data = format!("foo blk {blkno} at {lsn}"); assert_eq!( tline .get_rel_page_at_lsn( @@ -2414,6 +2414,6 @@ mod tests { } let duration = started_at.elapsed(); - println!("done in {:?}", duration); + println!("done in {duration:?}"); } } diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs index 0783c77622..a525579082 100644 --- a/pageserver/src/walredo/apply_neon.rs +++ b/pageserver/src/walredo/apply_neon.rs @@ -52,8 +52,7 @@ pub(crate) fn apply_in_neon( let (rel, _) = key.to_rel_block().context("invalid record")?; assert!( rel.forknum == VISIBILITYMAP_FORKNUM, - "TruncateVisibilityMap record on unexpected rel {}", - rel + "TruncateVisibilityMap record on unexpected rel {rel}" ); let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..]; map[*trunc_byte + 1..].fill(0u8); @@ -78,8 +77,7 @@ pub(crate) fn apply_in_neon( let (rel, blknum) = key.to_rel_block().context("invalid record")?; assert!( rel.forknum == VISIBILITYMAP_FORKNUM, - "ClearVisibilityMapFlags record on unexpected rel {}", - rel + "ClearVisibilityMapFlags record on unexpected rel {rel}" ); if let Some(heap_blkno) = *new_heap_blkno { // Calculate the VM block and offset that corresponds to the heap block. @@ -124,8 +122,7 @@ pub(crate) fn apply_in_neon( assert_eq!( slru_kind, SlruKind::Clog, - "ClogSetCommitted record with unexpected key {}", - key + "ClogSetCommitted record with unexpected key {key}" ); for &xid in xids { let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE; @@ -135,15 +132,11 @@ pub(crate) fn apply_in_neon( // Check that we're modifying the correct CLOG block. 
assert!( segno == expected_segno, - "ClogSetCommitted record for XID {} with unexpected key {}", - xid, - key + "ClogSetCommitted record for XID {xid} with unexpected key {key}" ); assert!( blknum == expected_blknum, - "ClogSetCommitted record for XID {} with unexpected key {}", - xid, - key + "ClogSetCommitted record for XID {xid} with unexpected key {key}" ); transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_COMMITTED, page); @@ -169,8 +162,7 @@ pub(crate) fn apply_in_neon( assert_eq!( slru_kind, SlruKind::Clog, - "ClogSetAborted record with unexpected key {}", - key + "ClogSetAborted record with unexpected key {key}" ); for &xid in xids { let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE; @@ -180,15 +172,11 @@ pub(crate) fn apply_in_neon( // Check that we're modifying the correct CLOG block. assert!( segno == expected_segno, - "ClogSetAborted record for XID {} with unexpected key {}", - xid, - key + "ClogSetAborted record for XID {xid} with unexpected key {key}" ); assert!( blknum == expected_blknum, - "ClogSetAborted record for XID {} with unexpected key {}", - xid, - key + "ClogSetAborted record for XID {xid} with unexpected key {key}" ); transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_ABORTED, page); @@ -199,8 +187,7 @@ pub(crate) fn apply_in_neon( assert_eq!( slru_kind, SlruKind::MultiXactOffsets, - "MultixactOffsetCreate record with unexpected key {}", - key + "MultixactOffsetCreate record with unexpected key {key}" ); // Compute the block and offset to modify. // See RecordNewMultiXact in PostgreSQL sources. @@ -213,15 +200,11 @@ pub(crate) fn apply_in_neon( let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; assert!( segno == expected_segno, - "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}", - mid, - key + "MultiXactOffsetsCreate record for multi-xid {mid} with unexpected key {key}" ); assert!( blknum == expected_blknum, - "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}", - mid, - key + "MultiXactOffsetsCreate record for multi-xid {mid} with unexpected key {key}" ); LittleEndian::write_u32(&mut page[offset..offset + 4], *moff); @@ -231,8 +214,7 @@ pub(crate) fn apply_in_neon( assert_eq!( slru_kind, SlruKind::MultiXactMembers, - "MultixactMembersCreate record with unexpected key {}", - key + "MultixactMembersCreate record with unexpected key {key}" ); for (i, member) in members.iter().enumerate() { let offset = moff + i as u32; @@ -249,15 +231,11 @@ pub(crate) fn apply_in_neon( let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; assert!( segno == expected_segno, - "MultiXactMembersCreate record for offset {} with unexpected key {}", - moff, - key + "MultiXactMembersCreate record for offset {moff} with unexpected key {key}" ); assert!( blknum == expected_blknum, - "MultiXactMembersCreate record for offset {} with unexpected key {}", - moff, - key + "MultiXactMembersCreate record for offset {moff} with unexpected key {key}" ); let mut flagsval = LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]); diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index d96f582fad..263d784e78 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -61,6 +61,10 @@ clippy::too_many_lines, clippy::unused_self )] +#![allow( + clippy::unsafe_derive_deserialize, + reason = "false positive: https://github.com/rust-lang/rust-clippy/issues/15120" +)] #![cfg_attr( any(test, feature = "testing"), allow( diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index 
1bf3e4cac1..4fc62fb229 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -206,16 +206,10 @@ impl Storage for FileStorage { let buf: Vec = s.write_to_buf()?; control_partial.write_all(&buf).await.with_context(|| { - format!( - "failed to write safekeeper state into control file at: {}", - control_partial_path - ) + format!("failed to write safekeeper state into control file at: {control_partial_path}") })?; control_partial.flush().await.with_context(|| { - format!( - "failed to flush safekeeper state into control file at: {}", - control_partial_path - ) + format!("failed to flush safekeeper state into control file at: {control_partial_path}") })?; let control_path = self.timeline_dir.join(CONTROL_FILE_NAME); diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index b54bee8bfb..5e7f1d8758 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -73,7 +73,7 @@ fn parse_cmd(cmd: &str) -> anyhow::Result { let re = Regex::new(r"START_WAL_PUSH(\s+?\((.*)\))?").unwrap(); let caps = re .captures(cmd) - .context(format!("failed to parse START_WAL_PUSH command {}", cmd))?; + .context(format!("failed to parse START_WAL_PUSH command {cmd}"))?; // capture () content let options = caps.get(2).map(|m| m.as_str()).unwrap_or(""); // default values @@ -85,24 +85,20 @@ fn parse_cmd(cmd: &str) -> anyhow::Result { } let mut kvit = kvstr.split_whitespace(); let key = kvit.next().context(format!( - "failed to parse key in kv {} in command {}", - kvstr, cmd + "failed to parse key in kv {kvstr} in command {cmd}" ))?; let value = kvit.next().context(format!( - "failed to parse value in kv {} in command {}", - kvstr, cmd + "failed to parse value in kv {kvstr} in command {cmd}" ))?; let value_trimmed = value.trim_matches('\''); if key == "proto_version" { proto_version = value_trimmed.parse::().context(format!( - "failed to parse proto_version value {} in command {}", - value, cmd + "failed to parse proto_version value {value} in command {cmd}" ))?; } if key == "allow_timeline_creation" { allow_timeline_creation = value_trimmed.parse::().context(format!( - "failed to parse allow_timeline_creation value {} in command {}", - value, cmd + "failed to parse allow_timeline_creation value {value} in command {cmd}" ))?; } } @@ -118,7 +114,7 @@ fn parse_cmd(cmd: &str) -> anyhow::Result { .unwrap(); let caps = re .captures(cmd) - .context(format!("failed to parse START_REPLICATION command {}", cmd))?; + .context(format!("failed to parse START_REPLICATION command {cmd}"))?; let start_lsn = Lsn::from_str(&caps[1]).context("parse start LSN from START_REPLICATION command")?; let term = if let Some(m) = caps.get(2) { diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 886cac869d..aa19b6d283 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -64,10 +64,10 @@ impl TermHistory { for i in 0..n_entries { let term = bytes .get_u64_f() - .with_context(|| format!("TermHistory pos {} misses term", i))?; + .with_context(|| format!("TermHistory pos {i} misses term"))?; let lsn = bytes .get_u64_f() - .with_context(|| format!("TermHistory pos {} misses lsn", i))? + .with_context(|| format!("TermHistory pos {i} misses lsn"))? 
.into(); res.push(TermLsn { term, lsn }) } @@ -121,9 +121,7 @@ impl TermHistory { if let Some(sk_th_last) = sk_th.last() { assert!( sk_th_last.lsn <= sk_wal_end, - "safekeeper term history end {:?} LSN is higher than WAL end {:?}", - sk_th_last, - sk_wal_end + "safekeeper term history end {sk_th_last:?} LSN is higher than WAL end {sk_wal_end:?}" ); } @@ -438,11 +436,11 @@ impl ProposerAcceptorMessage { for i in 0..members_len { let id = buf .get_u64_f() - .with_context(|| format!("reading member {} node_id", i))?; - let host = Self::get_cstr(buf).with_context(|| format!("reading member {} host", i))?; + .with_context(|| format!("reading member {i} node_id"))?; + let host = Self::get_cstr(buf).with_context(|| format!("reading member {i} host"))?; let pg_port = buf .get_u16_f() - .with_context(|| format!("reading member {} port", i))?; + .with_context(|| format!("reading member {i} port"))?; let sk = SafekeeperId { id: NodeId(id), host, @@ -463,12 +461,12 @@ impl ProposerAcceptorMessage { for i in 0..new_members_len { let id = buf .get_u64_f() - .with_context(|| format!("reading new member {} node_id", i))?; - let host = Self::get_cstr(buf) - .with_context(|| format!("reading new member {} host", i))?; + .with_context(|| format!("reading new member {i} node_id"))?; + let host = + Self::get_cstr(buf).with_context(|| format!("reading new member {i} host"))?; let pg_port = buf .get_u16_f() - .with_context(|| format!("reading new member {} port", i))?; + .with_context(|| format!("reading new member {i} port"))?; let sk = SafekeeperId { id: NodeId(id), host, @@ -1508,7 +1506,7 @@ mod tests { let mut vote_resp = sk.process_msg(&vote_request).await; match vote_resp.unwrap() { Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given), - r => panic!("unexpected response: {:?}", r), + r => panic!("unexpected response: {r:?}"), } // reboot... 
@@ -1523,7 +1521,7 @@ mod tests { vote_resp = sk.process_msg(&vote_request).await; match vote_resp.unwrap() { Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(!resp.vote_given), - r => panic!("unexpected response: {:?}", r), + r => panic!("unexpected response: {r:?}"), } } diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index e817dbf6f9..47b65a579a 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -342,7 +342,7 @@ where let bytes_read1 = reader1 .read(&mut buffer1[..bytes_to_read]) .await - .with_context(|| format!("failed to read from reader1 at offset {}", offset))?; + .with_context(|| format!("failed to read from reader1 at offset {offset}"))?; if bytes_read1 == 0 { anyhow::bail!("unexpected EOF from reader1 at offset {}", offset); } @@ -351,10 +351,7 @@ where .read_exact(&mut buffer2[..bytes_read1]) .await .with_context(|| { - format!( - "failed to read {} bytes from reader2 at offset {}", - bytes_read1, offset - ) + format!("failed to read {bytes_read1} bytes from reader2 at offset {offset}") })?; assert!(bytes_read2 == bytes_read1); diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index 48eda92fed..a68752bfdd 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -108,7 +108,7 @@ impl std::fmt::Debug for ManagerCtlMessage { match self { ManagerCtlMessage::GuardRequest(_) => write!(f, "GuardRequest"), ManagerCtlMessage::TryGuardRequest(_) => write!(f, "TryGuardRequest"), - ManagerCtlMessage::GuardDrop(id) => write!(f, "GuardDrop({:?})", id), + ManagerCtlMessage::GuardDrop(id) => write!(f, "GuardDrop({id:?})"), ManagerCtlMessage::BackupPartialReset(_) => write!(f, "BackupPartialReset"), } } diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 6e41ada1b3..a81a7298a9 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -147,7 +147,7 @@ impl GlobalTimelines { }; let mut tenant_count = 0; for tenants_dir_entry in std::fs::read_dir(&tenants_dir) - .with_context(|| format!("failed to list tenants dir {}", tenants_dir))? + .with_context(|| format!("failed to list tenants dir {tenants_dir}"))? { match &tenants_dir_entry { Ok(tenants_dir_entry) => { @@ -188,7 +188,7 @@ impl GlobalTimelines { let timelines_dir = get_tenant_dir(&conf, &tenant_id); for timelines_dir_entry in std::fs::read_dir(&timelines_dir) - .with_context(|| format!("failed to list timelines dir {}", timelines_dir))? + .with_context(|| format!("failed to list timelines dir {timelines_dir}"))? { match &timelines_dir_entry { Ok(timeline_dir_entry) => { diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index fe0f1b3607..cdf68262dd 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -364,8 +364,7 @@ impl PartialBackup { // there should always be zero or one uploaded segment assert!( new_segments.is_empty(), - "too many uploaded segments: {:?}", - new_segments + "too many uploaded segments: {new_segments:?}" ); } diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 8ba3e7cc47..e68c9f3a99 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -841,7 +841,7 @@ pub(crate) async fn open_wal_file( // If that failed, try it without the .partial extension. 
let pf = tokio::fs::File::open(&wal_file_path) .await - .with_context(|| format!("failed to open WAL file {:#}", wal_file_path)) + .with_context(|| format!("failed to open WAL file {wal_file_path:#}")) .map_err(|e| { warn!("{}", e); e diff --git a/safekeeper/tests/walproposer_sim/log.rs b/safekeeper/tests/walproposer_sim/log.rs index e2ba3282ca..cecbc859e6 100644 --- a/safekeeper/tests/walproposer_sim/log.rs +++ b/safekeeper/tests/walproposer_sim/log.rs @@ -33,7 +33,7 @@ impl FormatTime for SimClock { if let Some(clock) = clock.as_ref() { let now = clock.now(); - write!(w, "[{}]", now) + write!(w, "[{now}]") } else { write!(w, "[?]") } diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index 5fb29683f2..1fdf8e4949 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -257,7 +257,7 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { let estr = e.to_string(); if !estr.contains("finished processing START_REPLICATION") { warn!("conn {:?} error: {:?}", connection_id, e); - panic!("unexpected error at safekeeper: {:#}", e); + panic!("unexpected error at safekeeper: {e:#}"); } conns.remove(&connection_id); break; diff --git a/safekeeper/tests/walproposer_sim/simulation.rs b/safekeeper/tests/walproposer_sim/simulation.rs index 70fecfbe22..edd3bf2d9e 100644 --- a/safekeeper/tests/walproposer_sim/simulation.rs +++ b/safekeeper/tests/walproposer_sim/simulation.rs @@ -217,7 +217,7 @@ impl TestConfig { ]; let server_ids = [servers[0].id, servers[1].id, servers[2].id]; - let safekeepers_addrs = server_ids.map(|id| format!("node:{}", id)).to_vec(); + let safekeepers_addrs = server_ids.map(|id| format!("node:{id}")).to_vec(); let ttid = TenantTimelineId::generate(); diff --git a/safekeeper/tests/walproposer_sim/walproposer_api.rs b/safekeeper/tests/walproposer_sim/walproposer_api.rs index c2604c4bdc..29b361db7e 100644 --- a/safekeeper/tests/walproposer_sim/walproposer_api.rs +++ b/safekeeper/tests/walproposer_sim/walproposer_api.rs @@ -523,7 +523,7 @@ impl ApiImpl for SimulationApi { // Voting bug when safekeeper disconnects after voting executor::exit(1, msg.to_owned()); } - panic!("unknown FATAL error from walproposer: {}", msg); + panic!("unknown FATAL error from walproposer: {msg}"); } } @@ -544,10 +544,7 @@ impl ApiImpl for SimulationApi { } } - let msg = format!( - "prop_elected;{};{};{};{}", - prop_lsn, prop_term, prev_lsn, prev_term - ); + let msg = format!("prop_elected;{prop_lsn};{prop_term};{prev_lsn};{prev_term}"); debug!(msg); self.os.log_event(msg); diff --git a/storage_broker/benches/rps.rs b/storage_broker/benches/rps.rs index 9953ccfa91..5f3e594687 100644 --- a/storage_broker/benches/rps.rs +++ b/storage_broker/benches/rps.rs @@ -161,7 +161,7 @@ async fn publish(client: Option, n_keys: u64) { } }; let response = client.publish_safekeeper_info(Request::new(outbound)).await; - println!("pub response is {:?}", response); + println!("pub response is {response:?}"); } #[tokio::main] diff --git a/storage_broker/build.rs b/storage_broker/build.rs index 08dadeacd5..77c441dddd 100644 --- a/storage_broker/build.rs +++ b/storage_broker/build.rs @@ -6,6 +6,6 @@ fn main() -> Result<(), Box> { // the build then. Anyway, per cargo docs build script shouldn't output to // anywhere but $OUT_DIR. 
tonic_build::compile_protos("proto/broker.proto") - .unwrap_or_else(|e| panic!("failed to compile protos {:?}", e)); + .unwrap_or_else(|e| panic!("failed to compile protos {e:?}")); Ok(()) } diff --git a/storage_broker/src/lib.rs b/storage_broker/src/lib.rs index 149656a191..7d8b57380f 100644 --- a/storage_broker/src/lib.rs +++ b/storage_broker/src/lib.rs @@ -86,13 +86,9 @@ impl BrokerClientChannel { #[allow(clippy::result_large_err, reason = "TODO")] pub fn parse_proto_ttid(proto_ttid: &ProtoTenantTimelineId) -> Result { let tenant_id = TenantId::from_slice(&proto_ttid.tenant_id) - .map_err(|e| Status::new(Code::InvalidArgument, format!("malformed tenant_id: {}", e)))?; - let timeline_id = TimelineId::from_slice(&proto_ttid.timeline_id).map_err(|e| { - Status::new( - Code::InvalidArgument, - format!("malformed timeline_id: {}", e), - ) - })?; + .map_err(|e| Status::new(Code::InvalidArgument, format!("malformed tenant_id: {e}")))?; + let timeline_id = TimelineId::from_slice(&proto_ttid.timeline_id) + .map_err(|e| Status::new(Code::InvalidArgument, format!("malformed timeline_id: {e}")))?; Ok(TenantTimelineId { tenant_id, timeline_id, diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index a4482a4dac..0b5569b3d6 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -370,7 +370,7 @@ impl ComputeHook { let authorization_header = config .control_plane_jwt_token .clone() - .map(|jwt| format!("Bearer {}", jwt)); + .map(|jwt| format!("Bearer {jwt}")); let mut client = reqwest::ClientBuilder::new().timeout(NOTIFY_REQUEST_TIMEOUT); for cert in &config.ssl_ca_certs { diff --git a/storage_controller/src/drain_utils.rs b/storage_controller/src/drain_utils.rs index bd4b8ba38f..0dae7b8147 100644 --- a/storage_controller/src/drain_utils.rs +++ b/storage_controller/src/drain_utils.rs @@ -62,7 +62,7 @@ pub(crate) fn validate_node_state( nodes: Arc>, ) -> Result<(), OperationError> { let node = nodes.get(node_id).ok_or(OperationError::NodeStateChanged( - format!("node {} was removed", node_id).into(), + format!("node {node_id} was removed").into(), ))?; let current_policy = node.get_scheduling(); @@ -70,7 +70,7 @@ pub(crate) fn validate_node_state( // TODO(vlad): maybe cancel pending reconciles before erroring out. need to think // about it return Err(OperationError::NodeStateChanged( - format!("node {} changed state to {:?}", node_id, current_policy).into(), + format!("node {node_id} changed state to {current_policy:?}").into(), )); } @@ -145,7 +145,7 @@ impl TenantShardDrain { if !nodes.contains_key(&destination) { return Err(OperationError::NodeStateChanged( - format!("node {} was removed", destination).into(), + format!("node {destination} was removed").into(), )); } diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 346595aa11..a7e86b5224 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -721,9 +721,9 @@ async fn handle_tenant_timeline_passthrough( // Callers will always pass an unsharded tenant ID. Before proxying, we must // rewrite this to a shard-aware shard zero ID. 
- let path = format!("{}", path); + let path = format!("{path}"); let tenant_str = tenant_or_shard_id.tenant_id.to_string(); - let tenant_shard_str = format!("{}", tenant_shard_id); + let tenant_shard_str = format!("{tenant_shard_id}"); let path = path.replace(&tenant_str, &tenant_shard_str); let latency = &METRICS_REGISTRY @@ -1539,7 +1539,7 @@ async fn handle_ready(req: Request) -> Result, ApiError> { impl From for ApiError { fn from(value: ReconcileError) -> Self { - ApiError::Conflict(format!("Reconciliation error: {}", value)) + ApiError::Conflict(format!("Reconciliation error: {value}")) } } diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 4300dd32a9..2948e9019f 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -500,15 +500,13 @@ impl Persistence { if let Some(np) = node_to_delete { let lc = NodeLifecycle::from_str(&np.lifecycle).map_err(|e| { DatabaseError::Logical(format!( - "Node {} has invalid lifecycle: {}", - del_node_id, e + "Node {del_node_id} has invalid lifecycle: {e}" )) })?; if lc != NodeLifecycle::Deleted { return Err(DatabaseError::Logical(format!( - "Node {} was not soft deleted before, cannot hard delete it", - del_node_id + "Node {del_node_id} was not soft deleted before, cannot hard delete it" ))); } @@ -642,8 +640,7 @@ impl Persistence { .await?; if deleted_node > 0 { return Err(DatabaseError::Logical(format!( - "Node {} is marked as deleted, re-attach is not allowed", - input_node_id + "Node {input_node_id} is marked as deleted, re-attach is not allowed" ))); } @@ -1003,7 +1000,7 @@ impl Persistence { .execute(conn).await?; if u8::try_from(updated) .map_err(|_| DatabaseError::Logical( - format!("Overflow existing shard count {} while splitting", updated)) + format!("Overflow existing shard count {updated} while splitting")) )? != old_shard_count.count() { // Perhaps a deletion or another split raced with this attempt to split, mutating // the parent shards that we intend to split. In this case the split request should fail. @@ -1343,8 +1340,7 @@ impl Persistence { if inserted_updated != 1 { return Err(DatabaseError::Logical(format!( - "unexpected number of rows ({})", - inserted_updated + "unexpected number of rows ({inserted_updated})" ))); } @@ -1406,8 +1402,7 @@ impl Persistence { 0 => Ok(false), 1 => Ok(true), _ => Err(DatabaseError::Logical(format!( - "unexpected number of rows ({})", - inserted_updated + "unexpected number of rows ({inserted_updated})" ))), } }) @@ -1476,8 +1471,7 @@ impl Persistence { 0 => Ok(()), 1 => Ok(()), _ => Err(DatabaseError::Logical(format!( - "unexpected number of rows ({})", - updated + "unexpected number of rows ({updated})" ))), } }) @@ -1570,8 +1564,7 @@ impl Persistence { 0 => Ok(false), 1 => Ok(true), _ => Err(DatabaseError::Logical(format!( - "unexpected number of rows ({})", - inserted_updated + "unexpected number of rows ({inserted_updated})" ))), } }) diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index b3656c33d4..b86b4dfab1 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -23,7 +23,7 @@ pub enum ScheduleError { impl From for ApiError { fn from(value: ScheduleError) -> Self { - ApiError::Conflict(format!("Scheduling error: {}", value)) + ApiError::Conflict(format!("Scheduling error: {value}")) } } @@ -903,7 +903,7 @@ impl Scheduler { /// rigorously updating them on every change. 
pub(crate) fn update_metrics(&self) { for (node_id, node) in &self.nodes { - let node_id_str = format!("{}", node_id); + let node_id_str = format!("{node_id}"); let label_group = NodeLabelGroup { az: &node.az.0, node_id: &node_id_str, @@ -1326,7 +1326,7 @@ mod tests { .map(|(node_id, node)| (node_id, node.home_shard_count)) .collect::>(); node_home_counts.sort_by_key(|i| i.0); - eprintln!("Selected {}, vs nodes {:?}", preferred_az, node_home_counts); + eprintln!("Selected {preferred_az}, vs nodes {node_home_counts:?}"); let tenant_shard_id = TenantShardId { tenant_id: TenantId::generate(), diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 0eb87ffbe3..8424c27cf8 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -260,7 +260,7 @@ fn passthrough_api_error(node: &Node, e: mgmt_api::Error) -> ApiError { // Presume errors receiving body are connectivity/availability issues except for decoding errors let src_str = err.source().map(|e| e.to_string()).unwrap_or_default(); ApiError::ResourceUnavailable( - format!("{node} error receiving error body: {err} {}", src_str).into(), + format!("{node} error receiving error body: {err} {src_str}").into(), ) } mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, msg) => { @@ -671,7 +671,7 @@ impl std::fmt::Display for StopReconciliationsReason { Self::ShuttingDown => "Shutting down", Self::SteppingDown => "Stepping down", }; - write!(writer, "{}", s) + write!(writer, "{s}") } } @@ -5278,7 +5278,7 @@ impl Service { shard_params, result .iter() - .map(|s| format!("{:?}", s)) + .map(|s| format!("{s:?}")) .collect::>() .join(",") ); @@ -6201,7 +6201,7 @@ impl Service { }, ) .await - .map_err(|e| ApiError::Conflict(format!("Failed to split {}: {}", parent_id, e)))?; + .map_err(|e| ApiError::Conflict(format!("Failed to split {parent_id}: {e}")))?; fail::fail_point!("shard-split-post-remote", |_| Err(ApiError::Conflict( "failpoint".to_string() @@ -6218,7 +6218,7 @@ impl Service { response .new_shards .iter() - .map(|s| format!("{:?}", s)) + .map(|s| format!("{s:?}")) .collect::>() .join(",") ); @@ -7117,8 +7117,7 @@ impl Service { Ok(()) } else { Err(ApiError::Conflict(format!( - "Node {} is in use, consider using tombstone API first", - node_id + "Node {node_id} is in use, consider using tombstone API first" ))) } } @@ -7668,7 +7667,7 @@ impl Service { if let Some(ongoing) = ongoing_op { return Err(ApiError::PreconditionFailed( - format!("Background operation already ongoing for node: {}", ongoing).into(), + format!("Background operation already ongoing for node: {ongoing}").into(), )); } @@ -7799,7 +7798,7 @@ impl Service { if let Some(ongoing) = ongoing_op { return Err(ApiError::PreconditionFailed( - format!("Background operation already ongoing for node: {}", ongoing).into(), + format!("Background operation already ongoing for node: {ongoing}").into(), )); } @@ -8870,7 +8869,7 @@ impl Service { let nodes = self.inner.read().unwrap().nodes.clone(); let node = nodes.get(secondary).ok_or(mgmt_api::Error::ApiError( StatusCode::NOT_FOUND, - format!("Node with id {} not found", secondary), + format!("Node with id {secondary} not found"), ))?; match node @@ -8949,8 +8948,7 @@ impl Service { Err(err) => { return Err(OperationError::FinalizeError( format!( - "Failed to finalise drain cancel of {} by setting scheduling policy to Active: {}", - node_id, err + "Failed to finalise drain cancel of {node_id} by setting scheduling policy to Active: {err}" ) .into(), )); @@ -9054,8 +9052,7 @@ impl Service 
{ Err(err) => { return Err(OperationError::FinalizeError( format!( - "Failed to finalise drain cancel of {} by setting scheduling policy to Active: {}", - node_id, err + "Failed to finalise drain cancel of {node_id} by setting scheduling policy to Active: {err}" ) .into(), )); @@ -9265,8 +9262,7 @@ impl Service { Err(err) => { return Err(OperationError::FinalizeError( format!( - "Failed to finalise drain cancel of {} by setting scheduling policy to Active: {}", - node_id, err + "Failed to finalise drain cancel of {node_id} by setting scheduling policy to Active: {err}" ) .into(), )); @@ -9348,8 +9344,7 @@ impl Service { Err(err) => { return Err(OperationError::FinalizeError( format!( - "Failed to finalise drain cancel of {} by setting scheduling policy to Active: {}", - node_id, err + "Failed to finalise drain cancel of {node_id} by setting scheduling policy to Active: {err}" ) .into(), )); diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 789327bfaf..359921ecbf 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -3008,21 +3008,18 @@ pub(crate) mod tests { if attachments_in_wrong_az > 0 { violations.push(format!( - "{} attachments scheduled to the incorrect AZ", - attachments_in_wrong_az + "{attachments_in_wrong_az} attachments scheduled to the incorrect AZ" )); } if secondaries_in_wrong_az > 0 { violations.push(format!( - "{} secondaries scheduled to the incorrect AZ", - secondaries_in_wrong_az + "{secondaries_in_wrong_az} secondaries scheduled to the incorrect AZ" )); } eprintln!( - "attachments_in_wrong_az={} secondaries_in_wrong_az={}", - attachments_in_wrong_az, secondaries_in_wrong_az + "attachments_in_wrong_az={attachments_in_wrong_az} secondaries_in_wrong_az={secondaries_in_wrong_az}" ); for (node_id, stats) in &node_stats { diff --git a/storage_controller/src/timeline_import.rs b/storage_controller/src/timeline_import.rs index eb50819d02..e88bce4c82 100644 --- a/storage_controller/src/timeline_import.rs +++ b/storage_controller/src/timeline_import.rs @@ -195,7 +195,7 @@ impl UpcallClient { let authorization_header = config .control_plane_jwt_token .clone() - .map(|jwt| format!("Bearer {}", jwt)); + .map(|jwt| format!("Bearer {jwt}")); let client = reqwest::ClientBuilder::new() .timeout(IMPORT_COMPLETE_REQUEST_TIMEOUT) diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index 865f0908f9..774418f237 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -146,7 +146,7 @@ pub(crate) async fn branch_cleanup_and_check_errors( for (layer, metadata) in index_part.layer_metadata { if metadata.file_size == 0 { result.errors.push(format!( - "index_part.json contains a layer {} that has 0 size in its layer metadata", layer, + "index_part.json contains a layer {layer} that has 0 size in its layer metadata", )) } diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index 25a157f108..d3ed5a8357 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -123,7 +123,7 @@ impl S3Target { pub fn with_sub_segment(&self, new_segment: &str) -> Self { let mut new_self = self.clone(); if new_self.prefix_in_bucket.is_empty() { - new_self.prefix_in_bucket = format!("/{}/", new_segment); + new_self.prefix_in_bucket = format!("/{new_segment}/"); } else { if new_self.prefix_in_bucket.ends_with('/') { new_self.prefix_in_bucket.pop(); diff --git a/storage_scrubber/src/scan_safekeeper_metadata.rs 
b/storage_scrubber/src/scan_safekeeper_metadata.rs index f10d758097..cf0a3d19e9 100644 --- a/storage_scrubber/src/scan_safekeeper_metadata.rs +++ b/storage_scrubber/src/scan_safekeeper_metadata.rs @@ -265,7 +265,7 @@ async fn load_timelines_from_db( // so spawn it off to run on its own. tokio::spawn(async move { if let Err(e) = connection.await { - eprintln!("connection error: {}", e); + eprintln!("connection error: {e}"); } }); @@ -274,7 +274,7 @@ async fn load_timelines_from_db( "and tenant_id in ({})", tenant_ids .iter() - .map(|t| format!("'{}'", t)) + .map(|t| format!("'{t}'")) .collect::>() .join(", ") ) From 4dd9ca7b04d75155a14f6ad51948353f85a1f5fc Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 24 Jun 2025 15:15:36 +0100 Subject: [PATCH 109/124] [proxy]: authenticate to compute after connect_to_compute (#12335) ## Problem PGLB will do the connect_to_compute logic, neonkeeper will do the session establishment logic. We should split it. ## Summary of changes Moves postgres authentication to compute to a separate routine that happens after connect_to_compute. --- proxy/src/compute/mod.rs | 181 +++++++++++++++++----------- proxy/src/console_redirect_proxy.rs | 13 +- proxy/src/control_plane/mod.rs | 8 +- proxy/src/proxy/connect_compute.rs | 15 +-- proxy/src/proxy/mod.rs | 22 ++-- proxy/src/proxy/retry.rs | 2 - 6 files changed, 140 insertions(+), 101 deletions(-) diff --git a/proxy/src/compute/mod.rs b/proxy/src/compute/mod.rs index 5dd264b35e..f6c58c7459 100644 --- a/proxy/src/compute/mod.rs +++ b/proxy/src/compute/mod.rs @@ -33,12 +33,51 @@ use crate::types::Host; pub const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node"; #[derive(Debug, Error)] -pub(crate) enum ConnectionError { +pub(crate) enum PostgresError { /// This error doesn't seem to reveal any secrets; for instance, /// `postgres_client::error::Kind` doesn't contain ip addresses and such. #[error("{COULD_NOT_CONNECT}: {0}")] Postgres(#[from] postgres_client::Error), +} +impl UserFacingError for PostgresError { + fn to_string_client(&self) -> String { + match self { + // This helps us drop irrelevant library-specific prefixes. + // TODO: propagate severity level and other parameters. + PostgresError::Postgres(err) => match err.as_db_error() { + Some(err) => { + let msg = err.message(); + + if msg.starts_with("unsupported startup parameter: ") + || msg.starts_with("unsupported startup parameter in options: ") + { + format!( + "{msg}. Please use unpooled connection or remove this parameter from the startup package. More details: https://neon.tech/docs/connect/connection-errors#unsupported-startup-parameter" + ) + } else { + msg.to_owned() + } + } + None => err.to_string(), + }, + } + } +} + +impl ReportableError for PostgresError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + PostgresError::Postgres(e) if e.as_db_error().is_some() => { + crate::error::ErrorKind::Postgres + } + PostgresError::Postgres(_) => crate::error::ErrorKind::Compute, + } + } +} + +#[derive(Debug, Error)] +pub(crate) enum ConnectionError { #[error("{COULD_NOT_CONNECT}: {0}")] TlsError(#[from] TlsError), @@ -52,22 +91,6 @@ pub(crate) enum ConnectionError { impl UserFacingError for ConnectionError { fn to_string_client(&self) -> String { match self { - // This helps us drop irrelevant library-specific prefixes. - // TODO: propagate severity level and other parameters. 
- ConnectionError::Postgres(err) => match err.as_db_error() { - Some(err) => { - let msg = err.message(); - - if msg.starts_with("unsupported startup parameter: ") - || msg.starts_with("unsupported startup parameter in options: ") - { - format!("{msg}. Please use unpooled connection or remove this parameter from the startup package. More details: https://neon.tech/docs/connect/connection-errors#unsupported-startup-parameter") - } else { - msg.to_owned() - } - } - None => err.to_string(), - }, ConnectionError::WakeComputeError(err) => err.to_string_client(), ConnectionError::TooManyConnectionAttempts(_) => { "Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned() @@ -80,10 +103,6 @@ impl UserFacingError for ConnectionError { impl ReportableError for ConnectionError { fn get_error_kind(&self) -> crate::error::ErrorKind { match self { - ConnectionError::Postgres(e) if e.as_db_error().is_some() => { - crate::error::ErrorKind::Postgres - } - ConnectionError::Postgres(_) => crate::error::ErrorKind::Compute, ConnectionError::TlsError(_) => crate::error::ErrorKind::Compute, ConnectionError::WakeComputeError(e) => e.get_error_kind(), ConnectionError::TooManyConnectionAttempts(e) => e.get_error_kind(), @@ -206,6 +225,54 @@ impl AuthInfo { } } } + + pub async fn authenticate( + &self, + ctx: &RequestContext, + compute: &mut ComputeConnection, + user_info: ComputeUserInfo, + ) -> Result { + // client config with stubbed connect info. + // TODO(conrad): should we rewrite this to bypass tokio-postgres2 entirely, + // utilising pqproto.rs. + let mut tmp_config = postgres_client::Config::new(String::new(), 0); + // We have already established SSL if necessary. + tmp_config.ssl_mode(SslMode::Disable); + let tmp_config = self.enrich(tmp_config); + + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); + let connection = tmp_config.connect_raw(&mut compute.stream, NoTls).await?; + drop(pause); + + let RawConnection { + stream: _, + parameters, + delayed_notice, + process_id, + secret_key, + } = connection; + + tracing::Span::current().record("pid", tracing::field::display(process_id)); + + // NB: CancelToken is supposed to hold socket_addr, but we use connect_raw. + // Yet another reason to rework the connection establishing code. + let cancel_closure = CancelClosure::new( + compute.socket_addr, + RawCancelToken { + ssl_mode: compute.ssl_mode, + process_id, + secret_key, + }, + compute.hostname.to_string(), + user_info, + ); + + Ok(PostgresSettings { + params: parameters, + cancel_closure, + delayed_notice, + }) + } } impl ConnectInfo { @@ -268,51 +335,42 @@ impl ConnectInfo { pub type RustlsStream = >::Stream; pub type MaybeRustlsStream = MaybeTlsStream; -pub(crate) struct PostgresConnection { - /// Socket connected to a compute node. - pub(crate) stream: MaybeTlsStream, +// TODO(conrad): we don't need to parse these. +// These are just immediately forwarded back to the client. +// We could instead stream them out instead of reading them into memory. +pub struct PostgresSettings { /// PostgreSQL connection parameters. - pub(crate) params: std::collections::HashMap, + pub params: std::collections::HashMap, /// Query cancellation token. - pub(crate) cancel_closure: CancelClosure, - /// Labels for proxy's metrics. 
- pub(crate) aux: MetricsAuxInfo, + pub cancel_closure: CancelClosure, /// Notices received from compute after authenticating - pub(crate) delayed_notice: Vec, + pub delayed_notice: Vec, +} - pub(crate) guage: NumDbConnectionsGuard<'static>, +pub struct ComputeConnection { + /// Socket connected to a compute node. + pub stream: MaybeTlsStream, + /// Labels for proxy's metrics. + pub aux: MetricsAuxInfo, + pub hostname: Host, + pub ssl_mode: SslMode, + pub socket_addr: SocketAddr, + pub guage: NumDbConnectionsGuard<'static>, } impl ConnectInfo { /// Connect to a corresponding compute node. - pub(crate) async fn connect( + pub async fn connect( &self, ctx: &RequestContext, - aux: MetricsAuxInfo, - auth: &AuthInfo, + aux: &MetricsAuxInfo, config: &ComputeConfig, - user_info: ComputeUserInfo, - ) -> Result { - let mut tmp_config = auth.enrich(self.to_postgres_client_config()); - // we setup SSL early in `ConnectInfo::connect_raw`. - tmp_config.ssl_mode(SslMode::Disable); - + ) -> Result { let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let (socket_addr, stream) = self.connect_raw(config).await?; - let connection = tmp_config.connect_raw(stream, NoTls).await?; drop(pause); - let RawConnection { - stream, - parameters, - delayed_notice, - process_id, - secret_key, - } = connection; - - tracing::Span::current().record("pid", tracing::field::display(process_id)); tracing::Span::current().record("compute_id", tracing::field::display(&aux.compute_id)); - let MaybeTlsStream::Raw(stream) = stream.into_inner(); // TODO: lots of useful info but maybe we can move it elsewhere (eg traces?) info!( @@ -324,25 +382,12 @@ impl ConnectInfo { ctx.get_testodrome_id().unwrap_or_default(), ); - // NB: CancelToken is supposed to hold socket_addr, but we use connect_raw. - // Yet another reason to rework the connection establishing code. 
- let cancel_closure = CancelClosure::new( - socket_addr, - RawCancelToken { - ssl_mode: self.ssl_mode, - process_id, - secret_key, - }, - self.host.to_string(), - user_info, - ); - - let connection = PostgresConnection { + let connection = ComputeConnection { stream, - params: parameters, - delayed_notice, - cancel_closure, - aux, + socket_addr, + hostname: self.host.clone(), + ssl_mode: self.ssl_mode, + aux: aux.clone(), guage: Metrics::get().proxy.db_connections.guard(ctx.protocol()), }; diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 89adfc9049..113a11beab 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -218,11 +218,9 @@ pub(crate) async fn handle_client( }; auth_info.set_startup_params(¶ms, true); - let node = connect_to_compute( + let mut node = connect_to_compute( ctx, &TcpMechanism { - user_info, - auth: auth_info, locks: &config.connect_compute_locks, }, &node_info, @@ -232,9 +230,14 @@ pub(crate) async fn handle_client( .or_else(|e| async { Err(stream.throw_error(e, Some(ctx)).await) }) .await?; + let pg_settings = auth_info + .authenticate(ctx, &mut node, user_info) + .or_else(|e| async { Err(stream.throw_error(e, Some(ctx)).await) }) + .await?; + let session = cancellation_handler.get_key(); - prepare_client_connection(&node, *session.key(), &mut stream); + prepare_client_connection(&pg_settings, *session.key(), &mut stream); let stream = stream.flush_and_into_inner().await?; let session_id = ctx.session_id(); @@ -244,7 +247,7 @@ pub(crate) async fn handle_client( .maintain_cancel_key( session_id, cancel, - &node.cancel_closure, + &pg_settings.cancel_closure, &config.connect_to_compute, ) .await; diff --git a/proxy/src/control_plane/mod.rs b/proxy/src/control_plane/mod.rs index ed83e98bfe..a8c59dad0c 100644 --- a/proxy/src/control_plane/mod.rs +++ b/proxy/src/control_plane/mod.rs @@ -76,13 +76,9 @@ impl NodeInfo { pub(crate) async fn connect( &self, ctx: &RequestContext, - auth: &compute::AuthInfo, config: &ComputeConfig, - user_info: ComputeUserInfo, - ) -> Result { - self.conn_info - .connect(ctx, self.aux.clone(), auth, config, user_info) - .await + ) -> Result { + self.conn_info.connect(ctx, &self.aux, config).await } } diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 92ed84f50f..aa675a439e 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -2,8 +2,7 @@ use async_trait::async_trait; use tokio::time; use tracing::{debug, info, warn}; -use crate::auth::backend::ComputeUserInfo; -use crate::compute::{self, AuthInfo, COULD_NOT_CONNECT, PostgresConnection}; +use crate::compute::{self, COULD_NOT_CONNECT, ComputeConnection}; use crate::config::{ComputeConfig, RetryConfig}; use crate::context::RequestContext; use crate::control_plane::errors::WakeComputeError; @@ -50,15 +49,13 @@ pub(crate) trait ConnectMechanism { } pub(crate) struct TcpMechanism { - pub(crate) auth: AuthInfo, /// connect_to_compute concurrency lock pub(crate) locks: &'static ApiLocks, - pub(crate) user_info: ComputeUserInfo, } #[async_trait] impl ConnectMechanism for TcpMechanism { - type Connection = PostgresConnection; + type Connection = ComputeConnection; type ConnectError = compute::ConnectionError; type Error = compute::ConnectionError; @@ -71,13 +68,9 @@ impl ConnectMechanism for TcpMechanism { ctx: &RequestContext, node_info: &control_plane::CachedNodeInfo, config: &ComputeConfig, - ) -> Result { + ) -> Result { let permit = 
self.locks.get_permit(&node_info.conn_info.host).await?; - permit.release_result( - node_info - .connect(ctx, &self.auth, config, self.user_info.clone()) - .await, - ) + permit.release_result(node_info.connect(ctx, config).await) } } diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 7da1b8d8fa..6947e07488 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -357,24 +357,28 @@ pub(crate) async fn handle_client( let res = connect_to_compute( ctx, &TcpMechanism { - user_info: creds.info.clone(), - auth: auth_info, locks: &config.connect_compute_locks, }, - &auth::Backend::ControlPlane(cplane, creds.info), + &auth::Backend::ControlPlane(cplane, creds.info.clone()), config.wake_compute_retry_config, &config.connect_to_compute, ) .await; - let node = match res { + let mut node = match res { Ok(node) => node, Err(e) => Err(stream.throw_error(e, Some(ctx)).await)?, }; + let pg_settings = auth_info.authenticate(ctx, &mut node, creds.info).await; + let pg_settings = match pg_settings { + Ok(pg_settings) => pg_settings, + Err(e) => Err(stream.throw_error(e, Some(ctx)).await)?, + }; + let session = cancellation_handler.get_key(); - prepare_client_connection(&node, *session.key(), &mut stream); + prepare_client_connection(&pg_settings, *session.key(), &mut stream); let stream = stream.flush_and_into_inner().await?; let session_id = ctx.session_id(); @@ -384,7 +388,7 @@ pub(crate) async fn handle_client( .maintain_cancel_key( session_id, cancel, - &node.cancel_closure, + &pg_settings.cancel_closure, &config.connect_to_compute, ) .await; @@ -413,19 +417,19 @@ pub(crate) async fn handle_client( /// Finish client connection initialization: confirm auth success, send params, etc. pub(crate) fn prepare_client_connection( - node: &compute::PostgresConnection, + settings: &compute::PostgresSettings, cancel_key_data: CancelKeyData, stream: &mut PqStream, ) { // Forward all deferred notices to the client. - for notice in &node.delayed_notice { + for notice in &settings.delayed_notice { stream.write_raw(notice.as_bytes().len(), b'N', |buf| { buf.extend_from_slice(notice.as_bytes()); }); } // Forward all postgres connection params to the client. - for (name, value) in &node.params { + for (name, value) in &settings.params { stream.write_message(BeMessage::ParameterStatus { name: name.as_bytes(), value: value.as_bytes(), diff --git a/proxy/src/proxy/retry.rs b/proxy/src/proxy/retry.rs index 0f19944afa..e9eca95724 100644 --- a/proxy/src/proxy/retry.rs +++ b/proxy/src/proxy/retry.rs @@ -99,7 +99,6 @@ impl ShouldRetryWakeCompute for postgres_client::Error { impl CouldRetry for compute::ConnectionError { fn could_retry(&self) -> bool { match self { - compute::ConnectionError::Postgres(err) => err.could_retry(), compute::ConnectionError::TlsError(err) => err.could_retry(), compute::ConnectionError::WakeComputeError(err) => err.could_retry(), compute::ConnectionError::TooManyConnectionAttempts(_) => false, @@ -109,7 +108,6 @@ impl CouldRetry for compute::ConnectionError { impl ShouldRetryWakeCompute for compute::ConnectionError { fn should_retry_wake_compute(&self) -> bool { match self { - compute::ConnectionError::Postgres(err) => err.should_retry_wake_compute(), // the cache entry was not checked for validity compute::ConnectionError::TooManyConnectionAttempts(_) => false, _ => true, From 158d84ea30c65bcffdd6f9ae73aefedc9be912b3 Mon Sep 17 00:00:00 2001 From: Dmitry Savelev Date: Tue, 24 Jun 2025 17:36:36 +0200 Subject: [PATCH 110/124] Switch the billing metrics storage format to ndjson. 
(#12338) ## Problem The billing team wants to change the billing events pipeline and use a common events format in S3 buckets across different event producers. ## Summary of changes Change the events storage format for billing events from JSON to NDJSON. Resolves: https://github.com/neondatabase/cloud/issues/29994 --- proxy/src/usage_metrics.rs | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index 115b958c54..c82c4865a7 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -399,7 +399,7 @@ async fn collect_metrics_iteration( fn create_remote_path_prefix(now: DateTime) -> String { format!( - "year={year:04}/month={month:02}/day={day:02}/{hour:02}:{minute:02}:{second:02}Z", + "year={year:04}/month={month:02}/day={day:02}/hour={hour:02}/{hour:02}:{minute:02}:{second:02}Z", year = now.year(), month = now.month(), day = now.day(), @@ -461,7 +461,7 @@ async fn upload_backup_events( real_now.second().into(), real_now.nanosecond(), )); - let path = format!("{path_prefix}_{id}.json.gz"); + let path = format!("{path_prefix}_{id}.ndjson.gz"); let remote_path = match RemotePath::from_string(&path) { Ok(remote_path) => remote_path, Err(e) => { @@ -471,9 +471,12 @@ async fn upload_backup_events( // TODO: This is async compression from Vec to Vec. Rewrite as byte stream. // Use sync compression in blocking threadpool. - let data = serde_json::to_vec(chunk).context("serialize metrics")?; let mut encoder = GzipEncoder::new(Vec::new()); - encoder.write_all(&data).await.context("compress metrics")?; + for event in chunk.events.iter() { + let data = serde_json::to_vec(event).context("serialize metrics")?; + encoder.write_all(&data).await.context("compress metrics")?; + encoder.write_all(b"\n").await.context("compress metrics")?; + } encoder.shutdown().await.context("compress metrics")?; let compressed_data: Bytes = encoder.get_ref().clone().into(); backoff::retry( @@ -499,7 +502,7 @@ async fn upload_backup_events( #[cfg(test)] mod tests { use std::fs; - use std::io::BufReader; + use std::io::{BufRead, BufReader}; use std::sync::{Arc, Mutex}; use anyhow::Error; @@ -673,11 +676,22 @@ mod tests { { let path = local_fs_path.join(&path_prefix).to_string(); if entry.path().to_str().unwrap().starts_with(&path) { - let chunk = serde_json::from_reader(flate2::bufread::GzDecoder::new( - BufReader::new(fs::File::open(entry.into_path()).unwrap()), - )) - .unwrap(); - stored_chunks.push(chunk); + let file = fs::File::open(entry.into_path()).unwrap(); + let decoder = flate2::bufread::GzDecoder::new(BufReader::new(file)); + let reader = BufReader::new(decoder); + + let mut events: Vec> = Vec::new(); + for line in reader.lines() { + let line = line.unwrap(); + let event: Event = serde_json::from_str(&line).unwrap(); + events.push(event); + } + + let report = Report { + events: Cow::Owned(events), + }; + + stored_chunks.push(report); } } storage_test_dir.close().ok(); From 6c6de6382ad22655e237847cee36b32ee0029063 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Tue, 24 Jun 2025 19:25:31 +0200 Subject: [PATCH 111/124] Use enum-typed PG versions (#12317) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This makes it possible for the compiler to validate that a match block matched all PostgreSQL versions we support. 
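As a minimal illustration of that point (a hypothetical sketch, not the `postgres_versioninfo` implementation added by this patch; the variant names `V14`..`V17` and the helper `as_dir_name` are assumptions for the example):

```rust
/// Sketch: supported PostgreSQL major versions as an enum rather than a bare u32.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[repr(u32)]
enum PgMajorVersion {
    V14 = 14,
    V15 = 15,
    V16 = 16,
    V17 = 17,
}

impl PgMajorVersion {
    /// Exhaustive match: adding a new variant without handling it here
    /// becomes a compile error, unlike a match on a plain u32.
    fn as_dir_name(self) -> &'static str {
        match self {
            PgMajorVersion::V14 => "v14",
            PgMajorVersion::V15 => "v15",
            PgMajorVersion::V16 => "v16",
            PgMajorVersion::V17 => "v17",
        }
    }
}

fn main() {
    // A full version id in PostgreSQL's MMmmbb form (e.g. 17.4 -> 170004).
    let full_version_id: u32 = 170004;
    let major = match full_version_id / 10_000 {
        14 => PgMajorVersion::V14,
        15 => PgMajorVersion::V15,
        16 => PgMajorVersion::V16,
        17 => PgMajorVersion::V17,
        other => panic!("unsupported PostgreSQL major version {other}"),
    };
    println!("{major:?} -> {}", major.as_dir_name());
}
```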
## Problem We did not have a complete picture about which places we had to test against PG versions, and what format these versions were: The full PG version ID format (Major/minor/bugfix `MMmmbb`) as transfered in protocol messages, or only the Major release version (`MM`). This meant type confusion was rampant. With this change, it becomes easier to develop new version-dependent features, by making type and niche confusion impossible. ## Summary of changes Every use of `pg_version` is now typed as either `PgVersionId` (u32, valued in decimal `MMmmbb`) or PgMajorVersion (an enum, with a value for every major version we support, serialized and stored like a u32 with the value of that major version) --------- Co-authored-by: Arpad Müller --- Cargo.lock | 29 +++ Cargo.toml | 3 + compute_tools/Cargo.toml | 1 + compute_tools/src/bin/fast_import.rs | 10 +- compute_tools/src/extension_server.rs | 56 ++--- control_plane/src/bin/neon_local.rs | 14 +- control_plane/src/endpoint.rs | 9 +- control_plane/src/local_env.rs | 15 +- control_plane/src/pageserver.rs | 3 +- control_plane/src/storage_controller.rs | 18 +- libs/pageserver_api/Cargo.toml | 1 + libs/pageserver_api/src/models.rs | 7 +- libs/postgres_ffi/Cargo.toml | 1 + libs/postgres_ffi/benches/waldecoder.rs | 3 +- libs/postgres_ffi/src/lib.rs | 41 ++-- libs/postgres_ffi/src/pg_constants_v14.rs | 4 + libs/postgres_ffi/src/pg_constants_v15.rs | 4 + libs/postgres_ffi/src/pg_constants_v16.rs | 4 + libs/postgres_ffi/src/pg_constants_v17.rs | 4 + libs/postgres_ffi/src/walrecord.rs | 23 +- libs/postgres_ffi/src/xlog_utils.rs | 4 +- .../wal_craft/src/bin/wal_craft.rs | 3 +- libs/postgres_ffi/wal_craft/src/lib.rs | 12 +- .../wal_craft/src/xlog_utils_test.rs | 2 +- libs/postgres_initdb/Cargo.toml | 1 + libs/postgres_initdb/src/lib.rs | 11 +- libs/postgres_versioninfo/Cargo.toml | 12 + libs/postgres_versioninfo/src/lib.rs | 175 ++++++++++++++ libs/safekeeper_api/Cargo.toml | 1 + libs/safekeeper_api/src/lib.rs | 4 +- libs/safekeeper_api/src/models.rs | 4 +- .../benches/bench_interpret_wal.rs | 4 +- libs/wal_decoder/src/decoder.rs | 214 +++++++++--------- libs/wal_decoder/src/serialized_batch.rs | 12 +- pageserver/benches/bench_walredo.rs | 11 +- pageserver/client/Cargo.toml | 1 + pageserver/client/src/mgmt_api.rs | 5 +- pageserver/src/basebackup.rs | 15 +- pageserver/src/config.rs | 15 +- pageserver/src/http/routes.rs | 3 +- pageserver/src/lib.rs | 3 +- pageserver/src/pgdatadir_mapping.rs | 10 +- pageserver/src/tenant.rs | 25 +- pageserver/src/tenant/metadata.rs | 15 +- .../tenant/remote_timeline_client/index.rs | 16 +- .../src/tenant/storage_layer/delta_layer.rs | 12 +- .../src/tenant/storage_layer/layer/tests.rs | 45 +++- pageserver/src/tenant/timeline.rs | 18 +- .../import_pgdata/importbucket_client.rs | 15 +- pageserver/src/walingest.rs | 22 +- pageserver/src/walredo.rs | 18 +- pageserver/src/walredo/process.rs | 6 +- safekeeper/Cargo.toml | 1 + safekeeper/src/control_file_upgrade.rs | 28 +-- safekeeper/src/safekeeper.rs | 14 +- safekeeper/src/send_interpreted_wal.rs | 18 +- safekeeper/src/send_wal.rs | 6 +- safekeeper/src/state.rs | 5 +- safekeeper/src/wal_storage.rs | 16 +- .../tests/walproposer_sim/safekeeper_disk.rs | 10 +- .../src/service/safekeeper_service.rs | 5 +- test_runner/regress/test_branching.py | 2 +- 62 files changed, 683 insertions(+), 386 deletions(-) create mode 100644 libs/postgres_versioninfo/Cargo.toml create mode 100644 libs/postgres_versioninfo/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 8cc51350ef..51724da061 100644 
--- a/Cargo.lock +++ b/Cargo.lock @@ -1318,6 +1318,7 @@ dependencies = [ "p256 0.13.2", "postgres", "postgres_initdb", + "postgres_versioninfo", "regex", "remote_storage", "reqwest", @@ -4406,6 +4407,7 @@ dependencies = [ "once_cell", "postgres_backend", "postgres_ffi_types", + "postgres_versioninfo", "rand 0.8.5", "remote_storage", "reqwest", @@ -4429,6 +4431,7 @@ dependencies = [ "futures", "http-utils", "pageserver_api", + "postgres_versioninfo", "reqwest", "serde", "thiserror 1.0.69", @@ -4897,6 +4900,7 @@ dependencies = [ "once_cell", "postgres", "postgres_ffi_types", + "postgres_versioninfo", "pprof", "regex", "serde", @@ -4919,11 +4923,23 @@ version = "0.1.0" dependencies = [ "anyhow", "camino", + "postgres_versioninfo", "thiserror 1.0.69", "tokio", "workspace_hack", ] +[[package]] +name = "postgres_versioninfo" +version = "0.1.0" +dependencies = [ + "anyhow", + "serde", + "serde_repr", + "thiserror 1.0.69", + "workspace_hack", +] + [[package]] name = "posthog_client_lite" version = "0.1.0" @@ -6115,6 +6131,7 @@ dependencies = [ "postgres-protocol", "postgres_backend", "postgres_ffi", + "postgres_versioninfo", "pprof", "pq_proto", "rand 0.8.5", @@ -6159,6 +6176,7 @@ dependencies = [ "const_format", "pageserver_api", "postgres_ffi", + "postgres_versioninfo", "pq_proto", "serde", "serde_json", @@ -6481,6 +6499,17 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "serde_repr" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.100", +] + [[package]] name = "serde_spanned" version = "0.6.6" diff --git a/Cargo.toml b/Cargo.toml index 2a6acc132e..857bc5d5d9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,6 +23,7 @@ members = [ "libs/pageserver_api", "libs/postgres_ffi", "libs/postgres_ffi_types", + "libs/postgres_versioninfo", "libs/safekeeper_api", "libs/desim", "libs/neon-shmem", @@ -174,6 +175,7 @@ serde_json = "1" serde_path_to_error = "0.1" serde_with = { version = "3", features = [ "base64" ] } serde_assert = "0.5.0" +serde_repr = "0.1.20" sha2 = "0.10.2" signal-hook = "0.3" smallvec = "1.11" @@ -261,6 +263,7 @@ postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" } postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" } postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" } postgres_ffi_types = { version = "0.1", path = "./libs/postgres_ffi_types/" } +postgres_versioninfo = { version = "0.1", path = "./libs/postgres_versioninfo/" } postgres_initdb = { path = "./libs/postgres_initdb" } posthog_client_lite = { version = "0.1", path = "./libs/posthog_client_lite" } pq_proto = { version = "0.1", path = "./libs/pq_proto/" } diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index f9da3ba700..a5879c4b7c 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -64,6 +64,7 @@ uuid.workspace = true walkdir.workspace = true x509-cert.workspace = true +postgres_versioninfo.workspace = true postgres_initdb.workspace = true compute_api.workspace = true utils.workspace = true diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index 682525f6df..0eca9aba53 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -29,7 +29,7 @@ use anyhow::{Context, bail}; use aws_config::BehaviorVersion; use camino::{Utf8Path, Utf8PathBuf}; use clap::{Parser, Subcommand}; -use 
compute_tools::extension_server::{PostgresMajorVersion, get_pg_version}; +use compute_tools::extension_server::get_pg_version; use nix::unistd::Pid; use std::ops::Not; use tracing::{Instrument, error, info, info_span, warn}; @@ -179,12 +179,8 @@ impl PostgresProcess { .await .context("create pgdata directory")?; - let pg_version = match get_pg_version(self.pgbin.as_ref()) { - PostgresMajorVersion::V14 => 14, - PostgresMajorVersion::V15 => 15, - PostgresMajorVersion::V16 => 16, - PostgresMajorVersion::V17 => 17, - }; + let pg_version = get_pg_version(self.pgbin.as_ref()); + postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs { superuser: initdb_user, locale: DEFAULT_LOCALE, // XXX: this shouldn't be hard-coded, diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index d8d5de34a5..47931d5f72 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -74,9 +74,11 @@ More specifically, here is an example ext_index.json use std::path::Path; use std::str; +use crate::metrics::{REMOTE_EXT_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS}; use anyhow::{Context, Result, bail}; use bytes::Bytes; use compute_api::spec::RemoteExtSpec; +use postgres_versioninfo::PgMajorVersion; use regex::Regex; use remote_storage::*; use reqwest::StatusCode; @@ -86,8 +88,6 @@ use tracing::log::warn; use url::Url; use zstd::stream::read::Decoder; -use crate::metrics::{REMOTE_EXT_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS}; - fn get_pg_config(argument: &str, pgbin: &str) -> String { // gives the result of `pg_config [argument]` // where argument is a flag like `--version` or `--sharedir` @@ -106,7 +106,7 @@ fn get_pg_config(argument: &str, pgbin: &str) -> String { .to_string() } -pub fn get_pg_version(pgbin: &str) -> PostgresMajorVersion { +pub fn get_pg_version(pgbin: &str) -> PgMajorVersion { // pg_config --version returns a (platform specific) human readable string // such as "PostgreSQL 15.4". We parse this to v14/v15/v16 etc. let human_version = get_pg_config("--version", pgbin); @@ -114,25 +114,11 @@ pub fn get_pg_version(pgbin: &str) -> PostgresMajorVersion { } pub fn get_pg_version_string(pgbin: &str) -> String { - match get_pg_version(pgbin) { - PostgresMajorVersion::V14 => "v14", - PostgresMajorVersion::V15 => "v15", - PostgresMajorVersion::V16 => "v16", - PostgresMajorVersion::V17 => "v17", - } - .to_owned() + get_pg_version(pgbin).v_str() } -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -pub enum PostgresMajorVersion { - V14, - V15, - V16, - V17, -} - -fn parse_pg_version(human_version: &str) -> PostgresMajorVersion { - use PostgresMajorVersion::*; +fn parse_pg_version(human_version: &str) -> PgMajorVersion { + use PgMajorVersion::*; // Normal releases have version strings like "PostgreSQL 15.4". But there // are also pre-release versions like "PostgreSQL 17devel" or "PostgreSQL // 16beta2" or "PostgreSQL 17rc1". 
And with the --with-extra-version @@ -143,10 +129,10 @@ fn parse_pg_version(human_version: &str) -> PostgresMajorVersion { .captures(human_version) { Some(captures) if captures.len() == 2 => match &captures["major"] { - "14" => return V14, - "15" => return V15, - "16" => return V16, - "17" => return V17, + "14" => return PG14, + "15" => return PG15, + "16" => return PG16, + "17" => return PG17, _ => {} }, _ => {} @@ -343,25 +329,25 @@ mod tests { #[test] fn test_parse_pg_version() { - use super::PostgresMajorVersion::*; - assert_eq!(parse_pg_version("PostgreSQL 15.4"), V15); - assert_eq!(parse_pg_version("PostgreSQL 15.14"), V15); + use postgres_versioninfo::PgMajorVersion::*; + assert_eq!(parse_pg_version("PostgreSQL 15.4"), PG15); + assert_eq!(parse_pg_version("PostgreSQL 15.14"), PG15); assert_eq!( parse_pg_version("PostgreSQL 15.4 (Ubuntu 15.4-0ubuntu0.23.04.1)"), - V15 + PG15 ); - assert_eq!(parse_pg_version("PostgreSQL 14.15"), V14); - assert_eq!(parse_pg_version("PostgreSQL 14.0"), V14); + assert_eq!(parse_pg_version("PostgreSQL 14.15"), PG14); + assert_eq!(parse_pg_version("PostgreSQL 14.0"), PG14); assert_eq!( parse_pg_version("PostgreSQL 14.9 (Debian 14.9-1.pgdg120+1"), - V14 + PG14 ); - assert_eq!(parse_pg_version("PostgreSQL 16devel"), V16); - assert_eq!(parse_pg_version("PostgreSQL 16beta1"), V16); - assert_eq!(parse_pg_version("PostgreSQL 16rc2"), V16); - assert_eq!(parse_pg_version("PostgreSQL 16extra"), V16); + assert_eq!(parse_pg_version("PostgreSQL 16devel"), PG16); + assert_eq!(parse_pg_version("PostgreSQL 16beta1"), PG16); + assert_eq!(parse_pg_version("PostgreSQL 16rc2"), PG16); + assert_eq!(parse_pg_version("PostgreSQL 16extra"), PG16); } #[test] diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index b2dd1a7077..c818d07fef 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -48,7 +48,7 @@ use postgres_connection::parse_host_port; use safekeeper_api::membership::{SafekeeperGeneration, SafekeeperId}; use safekeeper_api::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT, - DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT, + DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT, PgMajorVersion, PgVersionId, }; use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR; use tokio::task::JoinSet; @@ -64,7 +64,7 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1); const DEFAULT_BRANCH_NAME: &str = "main"; project_git_version!(GIT_VERSION); -const DEFAULT_PG_VERSION: u32 = 17; +const DEFAULT_PG_VERSION: PgMajorVersion = PgMajorVersion::PG17; const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/"; @@ -169,7 +169,7 @@ struct TenantCreateCmdArgs { #[arg(default_value_t = DEFAULT_PG_VERSION)] #[clap(long, help = "Postgres version to use for the initial timeline")] - pg_version: u32, + pg_version: PgMajorVersion, #[clap( long, @@ -292,7 +292,7 @@ struct TimelineCreateCmdArgs { #[arg(default_value_t = DEFAULT_PG_VERSION)] #[clap(long, help = "Postgres version")] - pg_version: u32, + pg_version: PgMajorVersion, } #[derive(clap::Args)] @@ -324,7 +324,7 @@ struct TimelineImportCmdArgs { #[arg(default_value_t = DEFAULT_PG_VERSION)] #[clap(long, help = "Postgres version of the backup being imported")] - pg_version: u32, + pg_version: PgMajorVersion, } #[derive(clap::Subcommand)] @@ -603,7 +603,7 @@ struct EndpointCreateCmdArgs { #[arg(default_value_t = DEFAULT_PG_VERSION)] #[clap(long, help = "Postgres version")] - pg_version: u32, + pg_version: PgMajorVersion, 
/// Use gRPC to communicate with Pageservers, by generating grpc:// connstrings. /// @@ -1295,7 +1295,7 @@ async fn handle_timeline(cmd: &TimelineCmd, env: &mut local_env::LocalEnv) -> Re }, new_members: None, }; - let pg_version = args.pg_version * 10000; + let pg_version = PgVersionId::from(args.pg_version); let req = safekeeper_api::models::TimelineCreateRequest { tenant_id, timeline_id, diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index dab53b0f27..e3faa082db 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -67,6 +67,7 @@ use nix::sys::signal::{Signal, kill}; use pageserver_api::shard::ShardStripeSize; use pem::Pem; use reqwest::header::CONTENT_TYPE; +use safekeeper_api::PgMajorVersion; use safekeeper_api::membership::SafekeeperGeneration; use serde::{Deserialize, Serialize}; use sha2::{Digest, Sha256}; @@ -89,7 +90,7 @@ pub struct EndpointConf { pg_port: u16, external_http_port: u16, internal_http_port: u16, - pg_version: u32, + pg_version: PgMajorVersion, grpc: bool, skip_pg_catalog_updates: bool, reconfigure_concurrency: usize, @@ -192,7 +193,7 @@ impl ComputeControlPlane { pg_port: Option, external_http_port: Option, internal_http_port: Option, - pg_version: u32, + pg_version: PgMajorVersion, mode: ComputeMode, grpc: bool, skip_pg_catalog_updates: bool, @@ -312,7 +313,7 @@ pub struct Endpoint { pub internal_http_address: SocketAddr, // postgres major version in the format: 14, 15, etc. - pg_version: u32, + pg_version: PgMajorVersion, // These are not part of the endpoint as such, but the environment // the endpoint runs in. @@ -557,7 +558,7 @@ impl Endpoint { conf.append("hot_standby", "on"); // prefetching of blocks referenced in WAL doesn't make sense for us // Neon hot standby ignores pages that are not in the shared_buffers - if self.pg_version >= 15 { + if self.pg_version >= PgMajorVersion::PG15 { conf.append("recovery_prefetch", "off"); } } diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 34465b4d5d..370921a85c 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -15,6 +15,7 @@ use clap::ValueEnum; use pem::Pem; use postgres_backend::AuthType; use reqwest::{Certificate, Url}; +use safekeeper_api::PgMajorVersion; use serde::{Deserialize, Serialize}; use utils::auth::encode_from_key_file; use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; @@ -424,25 +425,21 @@ impl LocalEnv { self.pg_distrib_dir.clone() } - pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result { + pub fn pg_distrib_dir(&self, pg_version: PgMajorVersion) -> anyhow::Result { let path = self.pg_distrib_dir.clone(); - #[allow(clippy::manual_range_patterns)] - match pg_version { - 14 | 15 | 16 | 17 => Ok(path.join(format!("v{pg_version}"))), - _ => bail!("Unsupported postgres version: {}", pg_version), - } + Ok(path.join(pg_version.v_str())) } - pub fn pg_dir(&self, pg_version: u32, dir_name: &str) -> anyhow::Result { + pub fn pg_dir(&self, pg_version: PgMajorVersion, dir_name: &str) -> anyhow::Result { Ok(self.pg_distrib_dir(pg_version)?.join(dir_name)) } - pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result { + pub fn pg_bin_dir(&self, pg_version: PgMajorVersion) -> anyhow::Result { self.pg_dir(pg_version, "bin") } - pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result { + pub fn pg_lib_dir(&self, pg_version: PgMajorVersion) -> anyhow::Result { self.pg_dir(pg_version, "lib") } diff --git a/control_plane/src/pageserver.rs 
b/control_plane/src/pageserver.rs index 7fa00a6730..3f66960edd 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -22,6 +22,7 @@ use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; use postgres_backend::AuthType; use postgres_connection::{PgConnectionConfig, parse_host_port}; +use safekeeper_api::PgMajorVersion; use utils::auth::{Claims, Scope}; use utils::id::{NodeId, TenantId, TimelineId}; use utils::lsn::Lsn; @@ -607,7 +608,7 @@ impl PageServerNode { timeline_id: TimelineId, base: (Lsn, PathBuf), pg_wal: Option<(Lsn, PathBuf)>, - pg_version: u32, + pg_version: PgMajorVersion, ) -> anyhow::Result<()> { // Init base reader let (start_lsn, base_tarfile_path) = base; diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index f2ac5bb2dd..218b17d88d 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -6,6 +6,8 @@ use std::str::FromStr; use std::sync::OnceLock; use std::time::{Duration, Instant}; +use crate::background_process; +use crate::local_env::{LocalEnv, NeonStorageControllerConf}; use camino::{Utf8Path, Utf8PathBuf}; use hyper0::Uri; use nix::unistd::Pid; @@ -22,6 +24,7 @@ use pageserver_client::mgmt_api::ResponseErrorMessageExt; use pem::Pem; use postgres_backend::AuthType; use reqwest::{Method, Response}; +use safekeeper_api::PgMajorVersion; use serde::de::DeserializeOwned; use serde::{Deserialize, Serialize}; use tokio::process::Command; @@ -31,9 +34,6 @@ use utils::auth::{Claims, Scope, encode_from_key_file}; use utils::id::{NodeId, TenantId}; use whoami::username; -use crate::background_process; -use crate::local_env::{LocalEnv, NeonStorageControllerConf}; - pub struct StorageController { env: LocalEnv, private_key: Option, @@ -48,7 +48,7 @@ pub struct StorageController { const COMMAND: &str = "storage_controller"; -const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16; +const STORAGE_CONTROLLER_POSTGRES_VERSION: PgMajorVersion = PgMajorVersion::PG16; const DB_NAME: &str = "storage_controller"; @@ -184,9 +184,15 @@ impl StorageController { /// to other versions if that one isn't found. Some automated tests create circumstances /// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`. async fn get_pg_dir(&self, dir_name: &str) -> anyhow::Result { - let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 16, 15, 14]; + const PREFER_VERSIONS: [PgMajorVersion; 5] = [ + STORAGE_CONTROLLER_POSTGRES_VERSION, + PgMajorVersion::PG16, + PgMajorVersion::PG15, + PgMajorVersion::PG14, + PgMajorVersion::PG17, + ]; - for v in prefer_versions { + for v in PREFER_VERSIONS { let path = Utf8PathBuf::from_path_buf(self.env.pg_dir(v, dir_name)?).unwrap(); if tokio::fs::try_exists(&path).await? 
{ return Ok(path); diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 5a9a74b93d..a34e065788 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -18,6 +18,7 @@ bytes.workspace = true byteorder.workspace = true utils.workspace = true postgres_ffi_types.workspace = true +postgres_versioninfo.workspace = true enum-map.workspace = true strum.workspace = true strum_macros.workspace = true diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index ee6725efbe..82a3ac0eb4 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -11,6 +11,7 @@ use std::time::{Duration, SystemTime}; #[cfg(feature = "testing")] use camino::Utf8PathBuf; +use postgres_versioninfo::PgMajorVersion; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use serde_with::serde_as; pub use utilization::PageserverUtilization; @@ -398,7 +399,7 @@ pub enum TimelineCreateRequestMode { // inherits the ancestor's pg_version. Earlier code wasn't // using a flattened enum, so, it was an accepted field, and // we continue to accept it by having it here. - pg_version: Option, + pg_version: Option, #[serde(default, skip_serializing_if = "std::ops::Not::not")] read_only: bool, }, @@ -410,7 +411,7 @@ pub enum TimelineCreateRequestMode { Bootstrap { #[serde(default)] existing_initdb_timeline_id: Option, - pg_version: Option, + pg_version: Option, }, } @@ -1573,7 +1574,7 @@ pub struct TimelineInfo { pub last_received_msg_lsn: Option, /// the timestamp (in microseconds) of the last received message pub last_received_msg_ts: Option, - pub pg_version: u32, + pub pg_version: PgMajorVersion, pub state: TimelineState, diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml index 67adfdd3c3..d4fec6cbe9 100644 --- a/libs/postgres_ffi/Cargo.toml +++ b/libs/postgres_ffi/Cargo.toml @@ -19,6 +19,7 @@ serde.workspace = true postgres_ffi_types.workspace = true utils.workspace = true tracing.workspace = true +postgres_versioninfo.workspace = true [dev-dependencies] env_logger.workspace = true diff --git a/libs/postgres_ffi/benches/waldecoder.rs b/libs/postgres_ffi/benches/waldecoder.rs index 2e1d62e452..b2a884c7db 100644 --- a/libs/postgres_ffi/benches/waldecoder.rs +++ b/libs/postgres_ffi/benches/waldecoder.rs @@ -4,6 +4,7 @@ use criterion::{Bencher, Criterion, criterion_group, criterion_main}; use postgres_ffi::v17::wal_generator::LogicalMessageGenerator; use postgres_ffi::v17::waldecoder_handler::WalStreamDecoderHandler; use postgres_ffi::waldecoder::WalStreamDecoder; +use postgres_versioninfo::PgMajorVersion; use pprof::criterion::{Output, PProfProfiler}; use utils::lsn::Lsn; @@ -32,7 +33,7 @@ fn bench_complete_record(c: &mut Criterion) { let value_size = LogicalMessageGenerator::make_value_size(size, PREFIX); let value = vec![1; value_size]; - let mut decoder = WalStreamDecoder::new(Lsn(0), 170000); + let mut decoder = WalStreamDecoder::new(Lsn(0), PgMajorVersion::PG17); let msg = LogicalMessageGenerator::new(PREFIX, &value) .next() .unwrap() diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 05d8de4c7a..9297ac46c9 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -14,6 +14,8 @@ use bytes::Bytes; use utils::bin_ser::SerializeError; use utils::lsn::Lsn; +pub use postgres_versioninfo::PgMajorVersion; + macro_rules! postgres_ffi { ($version:ident) => { #[path = "."] @@ -91,21 +93,22 @@ macro_rules! 
dispatch_pgversion { $version => $code, default = $invalid_pgver_handling, pgversions = [ - 14 : v14, - 15 : v15, - 16 : v16, - 17 : v17, + $crate::PgMajorVersion::PG14 => v14, + $crate::PgMajorVersion::PG15 => v15, + $crate::PgMajorVersion::PG16 => v16, + $crate::PgMajorVersion::PG17 => v17, ] ) }; ($pgversion:expr => $code:expr, default = $default:expr, - pgversions = [$($sv:literal : $vsv:ident),+ $(,)?]) => { - match ($pgversion) { + pgversions = [$($sv:pat => $vsv:ident),+ $(,)?]) => { + match ($pgversion.clone().into()) { $($sv => { use $crate::$vsv as pgv; $code },)+ + #[allow(unreachable_patterns)] _ => { $default } @@ -179,9 +182,9 @@ macro_rules! enum_pgversion { $($variant ( $crate::$md::$t )),+ } impl self::$name { - pub fn pg_version(&self) -> u32 { + pub fn pg_version(&self) -> PgMajorVersion { enum_pgversion_dispatch!(self, $name, _ign, { - pgv::bindings::PG_MAJORVERSION_NUM + pgv::bindings::MY_PGVERSION }) } } @@ -195,15 +198,15 @@ macro_rules! enum_pgversion { }; {name = $name:ident, path = $p:ident, - typ = $t:ident, + $(typ = $t:ident,)? pgversions = [$($variant:ident : $md:ident),+ $(,)?]} => { pub enum $name { - $($variant ($crate::$md::$p::$t)),+ + $($variant $(($crate::$md::$p::$t))?),+ } impl $name { - pub fn pg_version(&self) -> u32 { + pub fn pg_version(&self) -> PgMajorVersion { enum_pgversion_dispatch!(self, $name, _ign, { - pgv::bindings::PG_MAJORVERSION_NUM + pgv::bindings::MY_PGVERSION }) } } @@ -249,22 +252,21 @@ pub use v14::xlog_utils::{ try_from_pg_timestamp, }; -pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> bool { +pub fn bkpimage_is_compressed(bimg_info: u8, version: PgMajorVersion) -> bool { dispatch_pgversion!(version, pgv::bindings::bkpimg_is_compressed(bimg_info)) } pub fn generate_wal_segment( segno: u64, system_id: u64, - pg_version: u32, + pg_version: PgMajorVersion, lsn: Lsn, ) -> Result { assert_eq!(segno, lsn.segment_number(WAL_SEGMENT_SIZE)); dispatch_pgversion!( pg_version, - pgv::xlog_utils::generate_wal_segment(segno, system_id, lsn), - Err(SerializeError::BadInput) + pgv::xlog_utils::generate_wal_segment(segno, system_id, lsn) ) } @@ -272,7 +274,7 @@ pub fn generate_pg_control( pg_control_bytes: &[u8], checkpoint_bytes: &[u8], lsn: Lsn, - pg_version: u32, + pg_version: PgMajorVersion, ) -> anyhow::Result<(Bytes, u64, bool)> { dispatch_pgversion!( pg_version, @@ -352,6 +354,7 @@ pub fn fsm_logical_to_physical(addr: BlockNumber) -> BlockNumber { pub mod waldecoder { use std::num::NonZeroU32; + use crate::PgMajorVersion; use bytes::{Buf, Bytes, BytesMut}; use thiserror::Error; use utils::lsn::Lsn; @@ -369,7 +372,7 @@ pub mod waldecoder { pub struct WalStreamDecoder { pub lsn: Lsn, - pub pg_version: u32, + pub pg_version: PgMajorVersion, pub inputbuf: BytesMut, pub state: State, } @@ -382,7 +385,7 @@ pub mod waldecoder { } impl WalStreamDecoder { - pub fn new(lsn: Lsn, pg_version: u32) -> WalStreamDecoder { + pub fn new(lsn: Lsn, pg_version: PgMajorVersion) -> WalStreamDecoder { WalStreamDecoder { lsn, pg_version, diff --git a/libs/postgres_ffi/src/pg_constants_v14.rs b/libs/postgres_ffi/src/pg_constants_v14.rs index fe01a5df7c..fd393995db 100644 --- a/libs/postgres_ffi/src/pg_constants_v14.rs +++ b/libs/postgres_ffi/src/pg_constants_v14.rs @@ -1,3 +1,7 @@ +use crate::PgMajorVersion; + +pub const MY_PGVERSION: PgMajorVersion = PgMajorVersion::PG14; + pub const XLOG_DBASE_CREATE: u8 = 0x00; pub const XLOG_DBASE_DROP: u8 = 0x10; diff --git a/libs/postgres_ffi/src/pg_constants_v15.rs b/libs/postgres_ffi/src/pg_constants_v15.rs 
index 3cd1b7aec5..6c1e2c13de 100644 --- a/libs/postgres_ffi/src/pg_constants_v15.rs +++ b/libs/postgres_ffi/src/pg_constants_v15.rs @@ -1,3 +1,7 @@ +use crate::PgMajorVersion; + +pub const MY_PGVERSION: PgMajorVersion = PgMajorVersion::PG15; + pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8; pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00; diff --git a/libs/postgres_ffi/src/pg_constants_v16.rs b/libs/postgres_ffi/src/pg_constants_v16.rs index 31bd5b68fd..d84db502f3 100644 --- a/libs/postgres_ffi/src/pg_constants_v16.rs +++ b/libs/postgres_ffi/src/pg_constants_v16.rs @@ -1,3 +1,7 @@ +use crate::PgMajorVersion; + +pub const MY_PGVERSION: PgMajorVersion = PgMajorVersion::PG16; + pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8; pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00; diff --git a/libs/postgres_ffi/src/pg_constants_v17.rs b/libs/postgres_ffi/src/pg_constants_v17.rs index 2132938680..14d4b3d42f 100644 --- a/libs/postgres_ffi/src/pg_constants_v17.rs +++ b/libs/postgres_ffi/src/pg_constants_v17.rs @@ -1,3 +1,7 @@ +use crate::PgMajorVersion; + +pub const MY_PGVERSION: PgMajorVersion = PgMajorVersion::PG17; + pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8; pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00; diff --git a/libs/postgres_ffi/src/walrecord.rs b/libs/postgres_ffi/src/walrecord.rs index c0ae88363e..d593123dc0 100644 --- a/libs/postgres_ffi/src/walrecord.rs +++ b/libs/postgres_ffi/src/walrecord.rs @@ -9,8 +9,8 @@ use utils::bin_ser::DeserializeError; use utils::lsn::Lsn; use crate::{ - BLCKSZ, BlockNumber, MultiXactId, MultiXactOffset, MultiXactStatus, Oid, RepOriginId, - TimestampTz, TransactionId, XLOG_SIZE_OF_XLOG_RECORD, XLogRecord, pg_constants, + BLCKSZ, BlockNumber, MultiXactId, MultiXactOffset, MultiXactStatus, Oid, PgMajorVersion, + RepOriginId, TimestampTz, TransactionId, XLOG_SIZE_OF_XLOG_RECORD, XLogRecord, pg_constants, }; #[repr(C)] @@ -199,20 +199,17 @@ impl DecodedWALRecord { /// Check if this WAL record represents a legacy "copy" database creation, which populates new relations /// by reading other existing relations' data blocks. This is more complex to apply than new-style database /// creations which simply include all the desired blocks in the WAL, so we need a helper function to detect this case. 
- pub fn is_dbase_create_copy(&self, pg_version: u32) -> bool { + pub fn is_dbase_create_copy(&self, pg_version: PgMajorVersion) -> bool { if self.xl_rmid == pg_constants::RM_DBASE_ID { let info = self.xl_info & pg_constants::XLR_RMGR_INFO_MASK; match pg_version { - 14 => { + PgMajorVersion::PG14 => { // Postgres 14 database creations are always the legacy kind info == crate::v14::bindings::XLOG_DBASE_CREATE } - 15 => info == crate::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY, - 16 => info == crate::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY, - 17 => info == crate::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY, - _ => { - panic!("Unsupported postgres version {pg_version}") - } + PgMajorVersion::PG15 => info == crate::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY, + PgMajorVersion::PG16 => info == crate::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY, + PgMajorVersion::PG17 => info == crate::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY, } } else { false @@ -248,7 +245,7 @@ impl DecodedWALRecord { pub fn decode_wal_record( record: Bytes, decoded: &mut DecodedWALRecord, - pg_version: u32, + pg_version: PgMajorVersion, ) -> anyhow::Result<()> { let mut rnode_spcnode: u32 = 0; let mut rnode_dbnode: u32 = 0; @@ -1106,9 +1103,9 @@ pub struct XlClogTruncate { } impl XlClogTruncate { - pub fn decode(buf: &mut Bytes, pg_version: u32) -> XlClogTruncate { + pub fn decode(buf: &mut Bytes, pg_version: PgMajorVersion) -> XlClogTruncate { XlClogTruncate { - pageno: if pg_version < 17 { + pageno: if pg_version < PgMajorVersion::PG17 { buf.get_u32_le() } else { buf.get_u64_le() as u32 diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 14fb1f2a1f..f7b6296053 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -11,9 +11,9 @@ use super::super::waldecoder::WalStreamDecoder; use super::bindings::{ CheckPoint, ControlFileData, DBState_DB_SHUTDOWNED, FullTransactionId, TimeLineID, TimestampTz, XLogLongPageHeaderData, XLogPageHeaderData, XLogRecPtr, XLogRecord, XLogSegNo, XLOG_PAGE_MAGIC, + MY_PGVERSION }; use super::wal_generator::LogicalMessageGenerator; -use super::PG_MAJORVERSION; use crate::pg_constants; use crate::PG_TLI; use crate::{uint32, uint64, Oid}; @@ -233,7 +233,7 @@ pub fn find_end_of_wal( let mut result = start_lsn; let mut curr_lsn = start_lsn; let mut buf = [0u8; XLOG_BLCKSZ]; - let pg_version = PG_MAJORVERSION[1..3].parse::().unwrap(); + let pg_version = MY_PGVERSION; debug!("find_end_of_wal PG_VERSION: {}", pg_version); let mut decoder = WalStreamDecoder::new(start_lsn, pg_version); diff --git a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs index 6151ce34ac..44bc4dfa95 100644 --- a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs +++ b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs @@ -4,6 +4,7 @@ use std::str::FromStr; use anyhow::*; use clap::{Arg, ArgMatches, Command, value_parser}; use postgres::Client; +use postgres_ffi::PgMajorVersion; use wal_craft::*; fn main() -> Result<()> { @@ -48,7 +49,7 @@ fn main() -> Result<()> { Some(("with-initdb", arg_matches)) => { let cfg = Conf { pg_version: *arg_matches - .get_one::("pg-version") + .get_one::("pg-version") .context("'pg-version' is required")?, pg_distrib_dir: arg_matches .get_one::("pg-distrib-dir") diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index ca9530faef..ef9e854297 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs 
@@ -9,8 +9,8 @@ use log::*; use postgres::Client; use postgres::types::PgLsn; use postgres_ffi::{ - WAL_SEGMENT_SIZE, XLOG_BLCKSZ, XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, - XLOG_SIZE_OF_XLOG_SHORT_PHD, + PgMajorVersion, WAL_SEGMENT_SIZE, XLOG_BLCKSZ, XLOG_SIZE_OF_XLOG_LONG_PHD, + XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, }; macro_rules! xlog_utils_test { @@ -29,7 +29,7 @@ macro_rules! xlog_utils_test { postgres_ffi::for_all_postgres_versions! { xlog_utils_test } pub struct Conf { - pub pg_version: u32, + pub pg_version: PgMajorVersion, pub pg_distrib_dir: PathBuf, pub datadir: PathBuf, } @@ -52,11 +52,7 @@ impl Conf { pub fn pg_distrib_dir(&self) -> anyhow::Result { let path = self.pg_distrib_dir.clone(); - #[allow(clippy::manual_range_patterns)] - match self.pg_version { - 14 | 15 | 16 | 17 => Ok(path.join(format!("v{}", self.pg_version))), - _ => bail!("Unsupported postgres version: {}", self.pg_version), - } + Ok(path.join(self.pg_version.v_str())) } fn pg_bin_dir(&self) -> anyhow::Result { diff --git a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs index 94371a35b5..366aa7dbef 100644 --- a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs +++ b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs @@ -24,7 +24,7 @@ fn init_logging() { fn test_end_of_wal(test_name: &str) { use crate::*; - let pg_version = PG_MAJORVERSION[1..3].parse::().unwrap(); + let pg_version = MY_PGVERSION; // Craft some WAL let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) diff --git a/libs/postgres_initdb/Cargo.toml b/libs/postgres_initdb/Cargo.toml index 1605279bce..5b3b0cd936 100644 --- a/libs/postgres_initdb/Cargo.toml +++ b/libs/postgres_initdb/Cargo.toml @@ -9,4 +9,5 @@ anyhow.workspace = true tokio.workspace = true camino.workspace = true thiserror.workspace = true +postgres_versioninfo.workspace = true workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/postgres_initdb/src/lib.rs b/libs/postgres_initdb/src/lib.rs index 4b4a597f73..a0c6ebef81 100644 --- a/libs/postgres_initdb/src/lib.rs +++ b/libs/postgres_initdb/src/lib.rs @@ -7,12 +7,13 @@ use std::fmt; use camino::Utf8Path; +use postgres_versioninfo::PgMajorVersion; pub struct RunInitdbArgs<'a> { pub superuser: &'a str, pub locale: &'a str, pub initdb_bin: &'a Utf8Path, - pub pg_version: u32, + pub pg_version: PgMajorVersion, pub library_search_path: &'a Utf8Path, pub pgdata: &'a Utf8Path, } @@ -79,12 +80,16 @@ pub async fn do_run_initdb(args: RunInitdbArgs<'_>) -> Result<(), Error> { .stderr(std::process::Stdio::piped()); // Before version 14, only the libc provide was available. - if pg_version > 14 { + if pg_version > PgMajorVersion::PG14 { // Version 17 brought with it a builtin locale provider which only provides // C and C.UTF-8. While being safer for collation purposes since it is // guaranteed to be consistent throughout a major release, it is also more // performant. 
- let locale_provider = if pg_version >= 17 { "builtin" } else { "libc" }; + let locale_provider = if pg_version >= PgMajorVersion::PG17 { + "builtin" + } else { + "libc" + }; initdb_command.args(["--locale-provider", locale_provider]); } diff --git a/libs/postgres_versioninfo/Cargo.toml b/libs/postgres_versioninfo/Cargo.toml new file mode 100644 index 0000000000..cc59f9698d --- /dev/null +++ b/libs/postgres_versioninfo/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "postgres_versioninfo" +version = "0.1.0" +edition = "2024" +license.workspace = true + +[dependencies] +anyhow.workspace = true +thiserror.workspace = true +serde.workspace = true +serde_repr.workspace = true +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/postgres_versioninfo/src/lib.rs b/libs/postgres_versioninfo/src/lib.rs new file mode 100644 index 0000000000..286507b654 --- /dev/null +++ b/libs/postgres_versioninfo/src/lib.rs @@ -0,0 +1,175 @@ +use serde::{Deserialize, Deserializer, Serialize, Serializer}; +use serde_repr::{Deserialize_repr, Serialize_repr}; +use std::fmt::{Display, Formatter}; +use std::str::FromStr; + +/// An enum with one variant for each major version of PostgreSQL that we support. +/// +#[derive(Debug, Clone, Copy, Ord, PartialOrd, Eq, PartialEq, Deserialize_repr, Serialize_repr)] +#[repr(u32)] +pub enum PgMajorVersion { + PG14 = 14, + PG15 = 15, + PG16 = 16, + PG17 = 17, + // !!! When you add a new PgMajorVersion, don't forget to update PgMajorVersion::ALL +} + +/// A full PostgreSQL version ID, in MMmmbb numerical format (Major/minor/bugfix) +#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] +#[repr(transparent)] +pub struct PgVersionId(u32); + +impl PgVersionId { + pub const UNKNOWN: PgVersionId = PgVersionId(0); + + pub fn from_full_pg_version(version: u32) -> PgVersionId { + match version { + 0 => PgVersionId(version), // unknown version + 140000..180000 => PgVersionId(version), + _ => panic!("Invalid full PostgreSQL version ID {version}"), + } + } +} + +impl Display for PgVersionId { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + u32::fmt(&self.0, f) + } +} + +impl Serialize for PgVersionId { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + u32::serialize(&self.0, serializer) + } +} + +impl<'de> Deserialize<'de> for PgVersionId { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + u32::deserialize(deserializer).map(PgVersionId) + } + + fn deserialize_in_place(deserializer: D, place: &mut Self) -> Result<(), D::Error> + where + D: Deserializer<'de>, + { + u32::deserialize_in_place(deserializer, &mut place.0) + } +} + +impl PgMajorVersion { + /// Get the numerical representation of the represented Major Version + pub const fn major_version_num(&self) -> u32 { + match self { + PgMajorVersion::PG14 => 14, + PgMajorVersion::PG15 => 15, + PgMajorVersion::PG16 => 16, + PgMajorVersion::PG17 => 17, + } + } + + /// Get the contents of this version's PG_VERSION file. + /// + /// The PG_VERSION file is used to determine the PostgreSQL version that currently + /// owns the data in a PostgreSQL data directory. + pub fn versionfile_string(&self) -> &'static str { + match self { + PgMajorVersion::PG14 => "14", + PgMajorVersion::PG15 => "15", + PgMajorVersion::PG16 => "16\x0A", + PgMajorVersion::PG17 => "17\x0A", + } + } + + /// Get the v{version} string of this major PostgreSQL version. 
+ /// + /// Because this was hand-coded in various places, this was moved into a shared + /// implementation. + pub fn v_str(&self) -> String { + match self { + PgMajorVersion::PG14 => "v14", + PgMajorVersion::PG15 => "v15", + PgMajorVersion::PG16 => "v16", + PgMajorVersion::PG17 => "v17", + } + .to_string() + } + + /// All currently supported major versions of PostgreSQL. + pub const ALL: &'static [PgMajorVersion] = &[ + PgMajorVersion::PG14, + PgMajorVersion::PG15, + PgMajorVersion::PG16, + PgMajorVersion::PG17, + ]; +} + +impl Display for PgMajorVersion { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.write_str(match self { + PgMajorVersion::PG14 => "PgMajorVersion::PG14", + PgMajorVersion::PG15 => "PgMajorVersion::PG15", + PgMajorVersion::PG16 => "PgMajorVersion::PG16", + PgMajorVersion::PG17 => "PgMajorVersion::PG17", + }) + } +} + +#[derive(Debug, thiserror::Error)] +#[allow(dead_code)] +pub struct InvalidPgVersion(u32); + +impl Display for InvalidPgVersion { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "InvalidPgVersion({})", self.0) + } +} + +impl TryFrom for PgMajorVersion { + type Error = InvalidPgVersion; + + fn try_from(value: PgVersionId) -> Result { + Ok(match value.0 / 10000 { + 14 => PgMajorVersion::PG14, + 15 => PgMajorVersion::PG15, + 16 => PgMajorVersion::PG16, + 17 => PgMajorVersion::PG17, + _ => return Err(InvalidPgVersion(value.0)), + }) + } +} + +impl From for PgVersionId { + fn from(value: PgMajorVersion) -> Self { + PgVersionId((value as u32) * 10000) + } +} + +#[derive(Debug, PartialEq, Eq, thiserror::Error)] +pub struct PgMajorVersionParseError(String); + +impl Display for PgMajorVersionParseError { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "PgMajorVersionParseError({})", self.0) + } +} + +impl FromStr for PgMajorVersion { + type Err = PgMajorVersionParseError; + + fn from_str(s: &str) -> Result { + Ok(match s { + "14" => PgMajorVersion::PG14, + "15" => PgMajorVersion::PG15, + "16" => PgMajorVersion::PG16, + "17" => PgMajorVersion::PG17, + _ => return Err(PgMajorVersionParseError(s.to_string())), + }) + } +} diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml index d9d080e8fe..928e583b0b 100644 --- a/libs/safekeeper_api/Cargo.toml +++ b/libs/safekeeper_api/Cargo.toml @@ -10,6 +10,7 @@ const_format.workspace = true serde.workspace = true serde_json.workspace = true postgres_ffi.workspace = true +postgres_versioninfo.workspace = true pq_proto.workspace = true tokio.workspace = true utils.workspace = true diff --git a/libs/safekeeper_api/src/lib.rs b/libs/safekeeper_api/src/lib.rs index fa86523ad7..ba0bfee971 100644 --- a/libs/safekeeper_api/src/lib.rs +++ b/libs/safekeeper_api/src/lib.rs @@ -8,6 +8,8 @@ pub mod membership; /// Public API types pub mod models; +pub use postgres_versioninfo::{PgMajorVersion, PgVersionId}; + /// Consensus logical timestamp. Note: it is a part of sk control file. pub type Term = u64; /// With this term timeline is created initially. 
It @@ -20,7 +22,7 @@ pub const INITIAL_TERM: Term = 0; #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct ServerInfo { /// Postgres server version - pub pg_version: u32, + pub pg_version: PgVersionId, pub system_id: SystemId, pub wal_seg_size: u32, } diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index fd05f6fda3..5c1ee41f7b 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -4,6 +4,7 @@ use std::net::SocketAddr; use pageserver_api::shard::ShardIdentity; use postgres_ffi::TimestampTz; +use postgres_versioninfo::PgVersionId; use serde::{Deserialize, Serialize}; use tokio::time::Instant; use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; @@ -23,8 +24,7 @@ pub struct TimelineCreateRequest { pub tenant_id: TenantId, pub timeline_id: TimelineId, pub mconf: Configuration, - /// In the PG_VERSION_NUM macro format, like 140017. - pub pg_version: u32, + pub pg_version: PgVersionId, pub system_id: Option, // By default WAL_SEGMENT_SIZE pub wal_seg_size: Option, diff --git a/libs/wal_decoder/benches/bench_interpret_wal.rs b/libs/wal_decoder/benches/bench_interpret_wal.rs index ff860a92e2..e3956eca05 100644 --- a/libs/wal_decoder/benches/bench_interpret_wal.rs +++ b/libs/wal_decoder/benches/bench_interpret_wal.rs @@ -10,7 +10,7 @@ use futures::StreamExt; use futures::stream::FuturesUnordered; use pageserver_api::shard::{ShardIdentity, ShardStripeSize}; use postgres_ffi::waldecoder::WalStreamDecoder; -use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE}; +use postgres_ffi::{MAX_SEND_SIZE, PgMajorVersion, WAL_SEGMENT_SIZE}; use pprof::criterion::{Output, PProfProfiler}; use remote_storage::{ DownloadOpts, GenericRemoteStorage, ListingMode, RemoteStorageConfig, RemoteStorageKind, @@ -115,7 +115,7 @@ struct BenchmarkData { #[derive(Deserialize)] struct BenchmarkMetadata { - pg_version: u32, + pg_version: PgMajorVersion, start_lsn: Lsn, } diff --git a/libs/wal_decoder/src/decoder.rs b/libs/wal_decoder/src/decoder.rs index 9980a1f369..0843eb35bf 100644 --- a/libs/wal_decoder/src/decoder.rs +++ b/libs/wal_decoder/src/decoder.rs @@ -7,8 +7,8 @@ use bytes::{Buf, Bytes}; use pageserver_api::key::rel_block_to_key; use pageserver_api::reltag::{RelTag, SlruKind}; use pageserver_api::shard::ShardIdentity; -use postgres_ffi::pg_constants; use postgres_ffi::walrecord::*; +use postgres_ffi::{PgMajorVersion, pg_constants}; use postgres_ffi_types::forknum::VISIBILITYMAP_FORKNUM; use utils::lsn::Lsn; @@ -24,7 +24,7 @@ impl InterpretedWalRecord { buf: Bytes, shards: &[ShardIdentity], next_record_lsn: Lsn, - pg_version: u32, + pg_version: PgMajorVersion, ) -> anyhow::Result> { let mut decoded = DecodedWALRecord::default(); decode_wal_record(buf, &mut decoded, pg_version)?; @@ -78,7 +78,7 @@ impl MetadataRecord { decoded: &DecodedWALRecord, shard_records: &mut HashMap, next_record_lsn: Lsn, - pg_version: u32, + pg_version: PgMajorVersion, ) -> anyhow::Result<()> { // Note: this doesn't actually copy the bytes since // the [`Bytes`] type implements it via a level of indirection. @@ -193,7 +193,7 @@ impl MetadataRecord { fn decode_heapam_record( buf: &mut Bytes, decoded: &DecodedWALRecord, - pg_version: u32, + pg_version: PgMajorVersion, ) -> anyhow::Result> { // Handle VM bit updates that are implicitly part of heap records. 
@@ -205,7 +205,7 @@ impl MetadataRecord { let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS; match pg_version { - 14 => { + PgMajorVersion::PG14 => { if decoded.xl_rmid == pg_constants::RM_HEAP_ID { let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; @@ -272,7 +272,7 @@ impl MetadataRecord { anyhow::bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); } } - 15 => { + PgMajorVersion::PG15 => { if decoded.xl_rmid == pg_constants::RM_HEAP_ID { let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; @@ -339,7 +339,7 @@ impl MetadataRecord { anyhow::bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); } } - 16 => { + PgMajorVersion::PG16 => { if decoded.xl_rmid == pg_constants::RM_HEAP_ID { let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; @@ -406,7 +406,7 @@ impl MetadataRecord { anyhow::bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); } } - 17 => { + PgMajorVersion::PG17 => { if decoded.xl_rmid == pg_constants::RM_HEAP_ID { let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; @@ -473,7 +473,6 @@ impl MetadataRecord { anyhow::bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); } } - _ => {} } if new_heap_blkno.is_some() || old_heap_blkno.is_some() { @@ -500,7 +499,7 @@ impl MetadataRecord { fn decode_neonmgr_record( buf: &mut Bytes, decoded: &DecodedWALRecord, - pg_version: u32, + pg_version: PgMajorVersion, ) -> anyhow::Result> { // Handle VM bit updates that are implicitly part of heap records. @@ -514,7 +513,7 @@ impl MetadataRecord { assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID); match pg_version { - 16 | 17 => { + PgMajorVersion::PG16 | PgMajorVersion::PG17 => { let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; match info { @@ -574,7 +573,7 @@ impl MetadataRecord { info => anyhow::bail!("Unknown WAL record type for Neon RMGR: {}", info), } } - _ => anyhow::bail!( + PgMajorVersion::PG15 | PgMajorVersion::PG14 => anyhow::bail!( "Neon RMGR has no known compatibility with PostgreSQL version {}", pg_version ), @@ -629,116 +628,121 @@ impl MetadataRecord { fn decode_dbase_record( buf: &mut Bytes, decoded: &DecodedWALRecord, - pg_version: u32, + pg_version: PgMajorVersion, ) -> anyhow::Result> { // TODO: Refactor this to avoid the duplication between postgres versions. 
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; tracing::debug!(%info, %pg_version, "handle RM_DBASE_ID"); - if pg_version == 14 { - if info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE { - let createdb = XlCreateDatabase::decode(buf); - tracing::debug!("XLOG_DBASE_CREATE v14"); + match pg_version { + PgMajorVersion::PG14 => { + if info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE { + let createdb = XlCreateDatabase::decode(buf); + tracing::debug!("XLOG_DBASE_CREATE v14"); - let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate { - db_id: createdb.db_id, - tablespace_id: createdb.tablespace_id, - src_db_id: createdb.src_db_id, - src_tablespace_id: createdb.src_tablespace_id, - })); + let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate { + db_id: createdb.db_id, + tablespace_id: createdb.tablespace_id, + src_db_id: createdb.src_db_id, + src_tablespace_id: createdb.src_tablespace_id, + })); - return Ok(Some(record)); - } else if info == postgres_ffi::v14::bindings::XLOG_DBASE_DROP { - let dropdb = XlDropDatabase::decode(buf); + return Ok(Some(record)); + } else if info == postgres_ffi::v14::bindings::XLOG_DBASE_DROP { + let dropdb = XlDropDatabase::decode(buf); - let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop { - db_id: dropdb.db_id, - tablespace_ids: dropdb.tablespace_ids, - })); + let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop { + db_id: dropdb.db_id, + tablespace_ids: dropdb.tablespace_ids, + })); - return Ok(Some(record)); + return Ok(Some(record)); + } } - } else if pg_version == 15 { - if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG { - tracing::debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); - } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY { - // The XLOG record was renamed between v14 and v15, - // but the record format is the same. - // So we can reuse XlCreateDatabase here. - tracing::debug!("XLOG_DBASE_CREATE_FILE_COPY"); + PgMajorVersion::PG15 => { + if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG { + tracing::debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); + } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY { + // The XLOG record was renamed between v14 and v15, + // but the record format is the same. + // So we can reuse XlCreateDatabase here. 
+ tracing::debug!("XLOG_DBASE_CREATE_FILE_COPY"); - let createdb = XlCreateDatabase::decode(buf); - let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate { - db_id: createdb.db_id, - tablespace_id: createdb.tablespace_id, - src_db_id: createdb.src_db_id, - src_tablespace_id: createdb.src_tablespace_id, - })); + let createdb = XlCreateDatabase::decode(buf); + let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate { + db_id: createdb.db_id, + tablespace_id: createdb.tablespace_id, + src_db_id: createdb.src_db_id, + src_tablespace_id: createdb.src_tablespace_id, + })); - return Ok(Some(record)); - } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_DROP { - let dropdb = XlDropDatabase::decode(buf); - let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop { - db_id: dropdb.db_id, - tablespace_ids: dropdb.tablespace_ids, - })); + return Ok(Some(record)); + } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_DROP { + let dropdb = XlDropDatabase::decode(buf); + let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop { + db_id: dropdb.db_id, + tablespace_ids: dropdb.tablespace_ids, + })); - return Ok(Some(record)); + return Ok(Some(record)); + } } - } else if pg_version == 16 { - if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG { - tracing::debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); - } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY { - // The XLOG record was renamed between v14 and v15, - // but the record format is the same. - // So we can reuse XlCreateDatabase here. - tracing::debug!("XLOG_DBASE_CREATE_FILE_COPY"); + PgMajorVersion::PG16 => { + if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG { + tracing::debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); + } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY { + // The XLOG record was renamed between v14 and v15, + // but the record format is the same. + // So we can reuse XlCreateDatabase here. + tracing::debug!("XLOG_DBASE_CREATE_FILE_COPY"); - let createdb = XlCreateDatabase::decode(buf); - let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate { - db_id: createdb.db_id, - tablespace_id: createdb.tablespace_id, - src_db_id: createdb.src_db_id, - src_tablespace_id: createdb.src_tablespace_id, - })); + let createdb = XlCreateDatabase::decode(buf); + let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate { + db_id: createdb.db_id, + tablespace_id: createdb.tablespace_id, + src_db_id: createdb.src_db_id, + src_tablespace_id: createdb.src_tablespace_id, + })); - return Ok(Some(record)); - } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_DROP { - let dropdb = XlDropDatabase::decode(buf); - let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop { - db_id: dropdb.db_id, - tablespace_ids: dropdb.tablespace_ids, - })); + return Ok(Some(record)); + } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_DROP { + let dropdb = XlDropDatabase::decode(buf); + let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop { + db_id: dropdb.db_id, + tablespace_ids: dropdb.tablespace_ids, + })); - return Ok(Some(record)); + return Ok(Some(record)); + } } - } else if pg_version == 17 { - if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_WAL_LOG { - tracing::debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); - } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY { - // The XLOG record was renamed between v14 and v15, - // but the record format is the same. 
- // So we can reuse XlCreateDatabase here. - tracing::debug!("XLOG_DBASE_CREATE_FILE_COPY"); + PgMajorVersion::PG17 => { + if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_WAL_LOG { + tracing::debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); + } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY { + // The XLOG record was renamed between v14 and v15, + // but the record format is the same. + // So we can reuse XlCreateDatabase here. + tracing::debug!("XLOG_DBASE_CREATE_FILE_COPY"); - let createdb = XlCreateDatabase::decode(buf); - let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate { - db_id: createdb.db_id, - tablespace_id: createdb.tablespace_id, - src_db_id: createdb.src_db_id, - src_tablespace_id: createdb.src_tablespace_id, - })); + let createdb = XlCreateDatabase::decode(buf); + let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate { + db_id: createdb.db_id, + tablespace_id: createdb.tablespace_id, + src_db_id: createdb.src_db_id, + src_tablespace_id: createdb.src_tablespace_id, + })); - return Ok(Some(record)); - } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_DROP { - let dropdb = XlDropDatabase::decode(buf); - let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop { - db_id: dropdb.db_id, - tablespace_ids: dropdb.tablespace_ids, - })); + return Ok(Some(record)); + } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_DROP { + let dropdb = XlDropDatabase::decode(buf); + let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop { + db_id: dropdb.db_id, + tablespace_ids: dropdb.tablespace_ids, + })); - return Ok(Some(record)); + return Ok(Some(record)); + } } } @@ -748,12 +752,12 @@ impl MetadataRecord { fn decode_clog_record( buf: &mut Bytes, decoded: &DecodedWALRecord, - pg_version: u32, + pg_version: PgMajorVersion, ) -> anyhow::Result> { let info = decoded.xl_info & !pg_constants::XLR_INFO_MASK; if info == pg_constants::CLOG_ZEROPAGE { - let pageno = if pg_version < 17 { + let pageno = if pg_version < PgMajorVersion::PG17 { buf.get_u32_le() } else { buf.get_u64_le() as u32 @@ -765,7 +769,7 @@ impl MetadataRecord { ClogZeroPage { segno, rpageno }, )))) } else { - assert!(info == pg_constants::CLOG_TRUNCATE); + assert_eq!(info, pg_constants::CLOG_TRUNCATE); let xlrec = XlClogTruncate::decode(buf, pg_version); Ok(Some(MetadataRecord::Clog(ClogRecord::Truncate( @@ -838,14 +842,14 @@ impl MetadataRecord { fn decode_multixact_record( buf: &mut Bytes, decoded: &DecodedWALRecord, - pg_version: u32, + pg_version: PgMajorVersion, ) -> anyhow::Result> { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE || info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE { - let pageno = if pg_version < 17 { + let pageno = if pg_version < PgMajorVersion::PG17 { buf.get_u32_le() } else { buf.get_u64_le() as u32 diff --git a/libs/wal_decoder/src/serialized_batch.rs b/libs/wal_decoder/src/serialized_batch.rs index 4123f7d0ac..ab38ff3d73 100644 --- a/libs/wal_decoder/src/serialized_batch.rs +++ b/libs/wal_decoder/src/serialized_batch.rs @@ -13,7 +13,7 @@ use pageserver_api::keyspace::KeySpace; use pageserver_api::reltag::RelTag; use pageserver_api::shard::ShardIdentity; use postgres_ffi::walrecord::{DecodedBkpBlock, DecodedWALRecord}; -use postgres_ffi::{BLCKSZ, page_is_new, page_set_lsn, pg_constants}; +use postgres_ffi::{BLCKSZ, PgMajorVersion, page_is_new, page_set_lsn, pg_constants}; use serde::{Deserialize, Serialize}; use utils::bin_ser::BeSer; use 
utils::lsn::Lsn; @@ -139,7 +139,7 @@ impl SerializedValueBatch { decoded: DecodedWALRecord, shard_records: &mut HashMap, next_record_lsn: Lsn, - pg_version: u32, + pg_version: PgMajorVersion, ) -> anyhow::Result<()> { // First determine how big the buffers need to be and allocate it up-front. // This duplicates some of the work below, but it's empirically much faster. @@ -267,7 +267,7 @@ impl SerializedValueBatch { fn estimate_buffer_size( decoded: &DecodedWALRecord, shard: &ShardIdentity, - pg_version: u32, + pg_version: PgMajorVersion, ) -> usize { let mut estimate: usize = 0; @@ -303,7 +303,11 @@ impl SerializedValueBatch { estimate } - fn block_is_image(decoded: &DecodedWALRecord, blk: &DecodedBkpBlock, pg_version: u32) -> bool { + fn block_is_image( + decoded: &DecodedWALRecord, + blk: &DecodedBkpBlock, + pg_version: PgMajorVersion, + ) -> bool { blk.apply_image && blk.has_image && decoded.xl_rmid == pg_constants::RM_XLOG_ID diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index 36d0d9c974..efb970f705 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -68,6 +68,7 @@ use pageserver::config::PageServerConf; use pageserver::walredo::{PostgresRedoManager, RedoAttemptType}; use pageserver_api::key::Key; use pageserver_api::shard::TenantShardId; +use postgres_ffi::{BLCKSZ, PgMajorVersion}; use tokio::sync::Barrier; use tokio::task::JoinSet; use utils::id::TenantId; @@ -94,7 +95,7 @@ fn bench(c: &mut Criterion) { // // benchmark the protocol implementation // - let pg_version = 14; + let pg_version = PgMajorVersion::PG14; bench_group!( "ping", Arc::new(move |mgr: Arc| async move { @@ -107,7 +108,7 @@ fn bench(c: &mut Criterion) { let make_redo_work = |req: &'static Request| { Arc::new(move |mgr: Arc| async move { let page = req.execute(&mgr).await.unwrap(); - assert_eq!(page.remaining(), 8192); + assert_eq!(page.remaining(), BLCKSZ as usize); }) }; bench_group!("short", { @@ -208,7 +209,7 @@ struct Request { lsn: Lsn, base_img: Option<(Lsn, Bytes)>, records: Vec<(Lsn, NeonWalRecord)>, - pg_version: u32, + pg_version: PgMajorVersion, } impl Request { @@ -267,7 +268,7 @@ impl Request { pg_record(false, b"\xbc\0\0\0\0\0\0\0h?m\x01\0\0\0\0p\n\0\09\x08\xa3\xea\0 \x8c\0\x7f\x06\0\0\xd22\0\0\xeb\x04\0\0\0\0\0\0\xff\x02\0@\0\0another_table\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x98\x08\0\0\x02@\0\0\0\0\0\0\n\0\0\0\x02\0\0\0\0@\0\0\0\0\0\0\x05\0\0\0\0@zD\x05\0\0\0\0\0\0\0\0\0pr\x01\0\0\0\0\0\0\0\0\x01d\0\0\0\0\0\0\x04\0\0\x01\0\0\0\x02\0"), ), ], - pg_version: 14, + pg_version: PgMajorVersion::PG14, } } @@ -516,7 +517,7 @@ impl Request { (lsn!("0/16B8000"), pg_record(false, b"C\0\0\0\0\x04\0\0p\x7fk\x01\0\0\0\0\0\n\0\0\\\xc4:?\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xe1\0\0\0\0\0\0\0\xe2\0\0")), (lsn!("0/16CBD68"), pg_record(false, b"@ \0\0\0\0\0\0\xc0|l\x01\0\0\0\0@\t\0\0\xdf\xb0\x1a`\0\x12\0\0\0 \0\0\x04\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\x01\x80\0\0\0\0\0\0\xff\x05\0\0\0\0\0\0\0\0\0\0\0\0\x18\0\0 \0 \x04 
\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0
\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0
\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x04\0\0\x01")), ], - pg_version: 14, + pg_version: PgMajorVersion::PG14, } } } diff --git a/pageserver/client/Cargo.toml b/pageserver/client/Cargo.toml index 970a437a42..47e2a6ddae 100644 --- a/pageserver/client/Cargo.toml +++ b/pageserver/client/Cargo.toml @@ -18,6 +18,7 @@ workspace_hack = { version = "0.1", path = "../../workspace_hack" } tokio-postgres.workspace = true tokio-stream.workspace = true tokio.workspace = true +postgres_versioninfo.workspace = true futures.workspace = true tokio-util.workspace = true anyhow.workspace = true diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 8b091684eb..3919a6e788 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -7,6 +7,7 @@ use detach_ancestor::AncestorDetached; use http_utils::error::HttpErrorBody; use pageserver_api::models::*; use pageserver_api::shard::TenantShardId; +use postgres_versioninfo::PgMajorVersion; pub use reqwest::Body as ReqwestBody; use reqwest::{IntoUrl, Method, StatusCode, Url}; use utils::id::{TenantId, TimelineId}; @@ -745,9 +746,11 @@ impl Client { timeline_id: TimelineId, base_lsn: Lsn, end_lsn: Lsn, - pg_version: u32, + pg_version: PgMajorVersion, basebackup_tarball: ReqwestBody, ) -> Result<()> { + let pg_version = pg_version.major_version_num(); + let uri = format!( 
"{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_basebackup?base_lsn={base_lsn}&end_lsn={end_lsn}&pg_version={pg_version}", self.mgmt_api_endpoint, diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index c4aaff58a1..115f0d9ebc 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -20,7 +20,8 @@ use pageserver_api::key::{Key, rel_block_to_key}; use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants::{PG_HBA, PGDATA_SPECIAL_FILES}; use postgres_ffi::{ - BLCKSZ, PG_TLI, RELSEG_SIZE, WAL_SEGMENT_SIZE, XLogFileName, dispatch_pgversion, pg_constants, + BLCKSZ, PG_TLI, PgMajorVersion, RELSEG_SIZE, WAL_SEGMENT_SIZE, XLogFileName, + dispatch_pgversion, pg_constants, }; use postgres_ffi_types::constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; use postgres_ffi_types::forknum::{INIT_FORKNUM, MAIN_FORKNUM}; @@ -619,10 +620,7 @@ where }; if spcnode == GLOBALTABLESPACE_OID { - let pg_version_str = match self.timeline.pg_version { - 14 | 15 => self.timeline.pg_version.to_string(), - ver => format!("{ver}\x0A"), - }; + let pg_version_str = self.timeline.pg_version.versionfile_string(); let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?; self.ar .append(&header, pg_version_str.as_bytes()) @@ -679,10 +677,7 @@ where if let Some(img) = relmap_img { let dst_path = format!("base/{dbnode}/PG_VERSION"); - let pg_version_str = match self.timeline.pg_version { - 14 | 15 => self.timeline.pg_version.to_string(), - ver => format!("{ver}\x0A"), - }; + let pg_version_str = self.timeline.pg_version.versionfile_string(); let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?; self.ar .append(&header, pg_version_str.as_bytes()) @@ -713,7 +708,7 @@ where buf.extend_from_slice(&img[..]); let crc = crc32c::crc32c(&img[..]); buf.put_u32_le(crc); - let path = if self.timeline.pg_version < 17 { + let path = if self.timeline.pg_version < PgMajorVersion::PG17 { format!("pg_twophase/{xid:>08X}") } else { format!("pg_twophase/{xid:>016X}") diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 3492a8d966..9952496061 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -11,7 +11,7 @@ use std::num::NonZeroUsize; use std::sync::Arc; use std::time::Duration; -use anyhow::{Context, bail, ensure}; +use anyhow::{Context, ensure}; use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; use pageserver_api::config::{ @@ -22,6 +22,7 @@ use pageserver_api::models::ImageCompressionAlgorithm; use pageserver_api::shard::TenantShardId; use pem::Pem; use postgres_backend::AuthType; +use postgres_ffi::PgMajorVersion; use remote_storage::{RemotePath, RemoteStorageConfig}; use reqwest::Url; use storage_broker::Uri; @@ -338,20 +339,16 @@ impl PageServerConf { // // Postgres distribution paths // - pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result { + pub fn pg_distrib_dir(&self, pg_version: PgMajorVersion) -> anyhow::Result { let path = self.pg_distrib_dir.clone(); - #[allow(clippy::manual_range_patterns)] - match pg_version { - 14 | 15 | 16 | 17 => Ok(path.join(format!("v{pg_version}"))), - _ => bail!("Unsupported postgres version: {}", pg_version), - } + Ok(path.join(pg_version.v_str())) } - pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result { + pub fn pg_bin_dir(&self, pg_version: PgMajorVersion) -> anyhow::Result { Ok(self.pg_distrib_dir(pg_version)?.join("bin")) } - pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result { + pub fn pg_lib_dir(&self, 
pg_version: PgMajorVersion) -> anyhow::Result { Ok(self.pg_distrib_dir(pg_version)?.join("lib")) } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 349bc6dba6..3755cbda6a 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -41,6 +41,7 @@ use pageserver_api::models::{ TopTenantShardItem, TopTenantShardsRequest, TopTenantShardsResponse, }; use pageserver_api::shard::{ShardCount, TenantShardId}; +use postgres_ffi::PgMajorVersion; use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError}; use scopeguard::defer; use serde_json::json; @@ -3385,7 +3386,7 @@ async fn put_tenant_timeline_import_basebackup( let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let base_lsn: Lsn = must_parse_query_param(&request, "base_lsn")?; let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?; - let pg_version: u32 = must_parse_query_param(&request, "pg_version")?; + let pg_version: PgMajorVersion = must_parse_query_param(&request, "pg_version")?; check_permission(&request, Some(tenant_id))?; diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index ae7cbf1d6b..0dd3c465e0 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -38,6 +38,7 @@ pub mod walredo; use camino::Utf8Path; use deletion_queue::DeletionQueue; +use postgres_ffi::PgMajorVersion; use tenant::mgr::{BackgroundPurges, TenantManager}; use tenant::secondary; use tracing::{info, info_span}; @@ -51,7 +52,7 @@ use tracing::{info, info_span}; /// backwards-compatible changes to the metadata format. pub const STORAGE_FORMAT_VERSION: u16 = 3; -pub const DEFAULT_PG_VERSION: u32 = 17; +pub const DEFAULT_PG_VERSION: PgMajorVersion = PgMajorVersion::PG17; // Magic constants used to identify different kinds of files pub const IMAGE_FILE_MAGIC: u16 = 0x5A60; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 180a5b76e8..09a7a8a651 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -25,7 +25,7 @@ use pageserver_api::keyspace::{KeySpaceRandomAccum, SparseKeySpace}; use pageserver_api::models::RelSizeMigration; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use pageserver_api::shard::ShardIdentity; -use postgres_ffi::{BLCKSZ, TimestampTz, TransactionId}; +use postgres_ffi::{BLCKSZ, PgMajorVersion, TimestampTz, TransactionId}; use postgres_ffi_types::forknum::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi_types::{Oid, RepOriginId}; use serde::{Deserialize, Serialize}; @@ -1081,7 +1081,7 @@ impl Timeline { // fetch directory entry let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?; - if self.pg_version >= 17 { + if self.pg_version >= PgMajorVersion::PG17 { Ok(TwoPhaseDirectoryV17::des(&buf)?.xids) } else { Ok(TwoPhaseDirectory::des(&buf)? 
@@ -1613,7 +1613,7 @@ impl DatadirModification<'_> { .push((DirectoryKind::Db, MetricsUpdate::Set(0))); self.put(DBDIR_KEY, Value::Image(buf.into())); - let buf = if self.tline.pg_version >= 17 { + let buf = if self.tline.pg_version >= PgMajorVersion::PG17 { TwoPhaseDirectoryV17::ser(&TwoPhaseDirectoryV17 { xids: HashSet::new(), }) @@ -1967,7 +1967,7 @@ impl DatadirModification<'_> { ) -> Result<(), WalIngestError> { // Add it to the directory entry let dirbuf = self.get(TWOPHASEDIR_KEY, ctx).await?; - let newdirbuf = if self.tline.pg_version >= 17 { + let newdirbuf = if self.tline.pg_version >= PgMajorVersion::PG17 { let mut dir = TwoPhaseDirectoryV17::des(&dirbuf)?; if !dir.xids.insert(xid) { Err(WalIngestErrorKind::FileAlreadyExists(xid))?; @@ -2383,7 +2383,7 @@ impl DatadirModification<'_> { ) -> Result<(), WalIngestError> { // Remove it from the directory entry let buf = self.get(TWOPHASEDIR_KEY, ctx).await?; - let newdirbuf = if self.tline.pg_version >= 17 { + let newdirbuf = if self.tline.pg_version >= PgMajorVersion::PG17 { let mut dir = TwoPhaseDirectoryV17::des(&buf)?; if !dir.xids.remove(&xid) { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index a48bf15246..c71655ce17 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -38,6 +38,7 @@ use pageserver_api::models::{ WalRedoManagerStatus, }; use pageserver_api::shard::{ShardIdentity, ShardStripeSize, TenantShardId}; +use postgres_ffi::PgMajorVersion; use remote_storage::{DownloadError, GenericRemoteStorage, TimeoutOrCancel}; use remote_timeline_client::index::GcCompactionState; use remote_timeline_client::manifest::{ @@ -497,7 +498,7 @@ impl WalRedoManager { lsn: Lsn, base_img: Option<(Lsn, bytes::Bytes)>, records: Vec<(Lsn, wal_decoder::models::record::NeonWalRecord)>, - pg_version: u32, + pg_version: PgMajorVersion, redo_attempt_type: RedoAttemptType, ) -> Result { match self { @@ -933,7 +934,7 @@ pub(crate) enum CreateTimelineParams { pub(crate) struct CreateTimelineParamsBootstrap { pub(crate) new_timeline_id: TimelineId, pub(crate) existing_initdb_timeline_id: Option, - pub(crate) pg_version: u32, + pub(crate) pg_version: PgMajorVersion, } /// NB: See comment on [`CreateTimelineIdempotency::Branch`] for why there's no `pg_version` here. @@ -971,7 +972,7 @@ pub(crate) enum CreateTimelineIdempotency { /// NB: special treatment, see comment in [`Self`]. FailWithConflict, Bootstrap { - pg_version: u32, + pg_version: PgMajorVersion, }, /// NB: branches always have the same `pg_version` as their ancestor. 
/// While [`pageserver_api::models::TimelineCreateRequestMode::Branch::pg_version`] @@ -2541,7 +2542,7 @@ impl TenantShard { self: &Arc, new_timeline_id: TimelineId, initdb_lsn: Lsn, - pg_version: u32, + pg_version: PgMajorVersion, ctx: &RequestContext, ) -> anyhow::Result<(UninitializedTimeline, RequestContext)> { anyhow::ensure!( @@ -2593,7 +2594,7 @@ impl TenantShard { self: &Arc, new_timeline_id: TimelineId, initdb_lsn: Lsn, - pg_version: u32, + pg_version: PgMajorVersion, ctx: &RequestContext, ) -> anyhow::Result> { let (uninit_tl, ctx) = self @@ -2632,7 +2633,7 @@ impl TenantShard { self: &Arc, new_timeline_id: TimelineId, initdb_lsn: Lsn, - pg_version: u32, + pg_version: PgMajorVersion, ctx: &RequestContext, in_memory_layer_desc: Vec, delta_layer_desc: Vec, @@ -2898,7 +2899,7 @@ impl TenantShard { Lsn(0), initdb_lsn, initdb_lsn, - 15, + PgMajorVersion::PG15, ); this.prepare_new_timeline( new_timeline_id, @@ -5090,7 +5091,7 @@ impl TenantShard { pub(crate) async fn bootstrap_timeline_test( self: &Arc, timeline_id: TimelineId, - pg_version: u32, + pg_version: PgMajorVersion, load_existing_initdb: Option, ctx: &RequestContext, ) -> anyhow::Result> { @@ -5232,7 +5233,7 @@ impl TenantShard { async fn bootstrap_timeline( self: &Arc, timeline_id: TimelineId, - pg_version: u32, + pg_version: PgMajorVersion, load_existing_initdb: Option, ctx: &RequestContext, ) -> Result { @@ -5770,7 +5771,7 @@ impl TenantShard { async fn run_initdb( conf: &'static PageServerConf, initdb_target_dir: &Utf8Path, - pg_version: u32, + pg_version: PgMajorVersion, cancel: &CancellationToken, ) -> Result<(), InitdbError> { let initdb_bin_path = conf @@ -6051,7 +6052,7 @@ pub(crate) mod harness { lsn: Lsn, base_img: Option<(Lsn, Bytes)>, records: Vec<(Lsn, NeonWalRecord)>, - _pg_version: u32, + _pg_version: PgMajorVersion, _redo_attempt_type: RedoAttemptType, ) -> Result { let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1)); @@ -6223,7 +6224,7 @@ mod tests { async fn randomize_timeline( tenant: &Arc, new_timeline_id: TimelineId, - pg_version: u32, + pg_version: PgMajorVersion, spec: TestTimelineSpecification, random: &mut rand::rngs::StdRng, ctx: &RequestContext, diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 5081d7f5a4..2f407de951 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -18,6 +18,7 @@ //! 
[`IndexPart`]: super::remote_timeline_client::index::IndexPart use anyhow::ensure; +use postgres_ffi::PgMajorVersion; use serde::{Deserialize, Serialize}; use utils::bin_ser::{BeSer, SerializeError}; use utils::id::TimelineId; @@ -136,7 +137,7 @@ struct TimelineMetadataBodyV2 { latest_gc_cutoff_lsn: Lsn, initdb_lsn: Lsn, - pg_version: u32, + pg_version: PgMajorVersion, } #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -167,7 +168,7 @@ impl TimelineMetadata { ancestor_lsn: Lsn, latest_gc_cutoff_lsn: Lsn, initdb_lsn: Lsn, - pg_version: u32, + pg_version: PgMajorVersion, ) -> Self { Self { hdr: TimelineMetadataHeader { @@ -215,7 +216,7 @@ impl TimelineMetadata { ancestor_lsn: body.ancestor_lsn, latest_gc_cutoff_lsn: body.latest_gc_cutoff_lsn, initdb_lsn: body.initdb_lsn, - pg_version: 14, // All timelines created before this version had pg_version 14 + pg_version: PgMajorVersion::PG14, // All timelines created before this version had pg_version 14 }; hdr.format_version = METADATA_FORMAT_VERSION; @@ -317,7 +318,7 @@ impl TimelineMetadata { self.body.initdb_lsn } - pub fn pg_version(&self) -> u32 { + pub fn pg_version(&self) -> PgMajorVersion { self.body.pg_version } @@ -331,7 +332,7 @@ impl TimelineMetadata { Lsn::from_hex("00000000").unwrap(), Lsn::from_hex("00000000").unwrap(), Lsn::from_hex("00000000").unwrap(), - 0, + PgMajorVersion::PG14, ); let bytes = instance.to_bytes().unwrap(); Self::from_bytes(&bytes).unwrap() @@ -545,7 +546,7 @@ mod tests { Lsn(0), Lsn(0), Lsn(0), - 14, // All timelines created before this version had pg_version 14 + PgMajorVersion::PG14, // All timelines created before this version had pg_version 14 ); assert_eq!( @@ -565,7 +566,7 @@ mod tests { Lsn(0), // Updating this version to 17 will cause the test to fail at the // next assert_eq!(). 
- 16, + PgMajorVersion::PG16, ); let expected_bytes = vec![ /* TimelineMetadataHeader */ diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index a5cd8989aa..6060c42cbb 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -427,8 +427,8 @@ impl GcBlocking { #[cfg(test)] mod tests { + use postgres_ffi::PgMajorVersion; use std::str::FromStr; - use utils::id::TimelineId; use super::*; @@ -831,7 +831,7 @@ mod tests { Lsn::INVALID, Lsn::from_str("0/1696070").unwrap(), Lsn::from_str("0/1696070").unwrap(), - 14, + PgMajorVersion::PG14, ).with_recalculated_checksum().unwrap(), deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), archived_at: None, @@ -893,7 +893,7 @@ mod tests { Lsn::INVALID, Lsn::from_str("0/1696070").unwrap(), Lsn::from_str("0/1696070").unwrap(), - 14, + PgMajorVersion::PG14, ).with_recalculated_checksum().unwrap(), deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), archived_at: Some(parse_naive_datetime("2023-04-29T09:00:00.123000000")), @@ -957,7 +957,7 @@ mod tests { Lsn::INVALID, Lsn::from_str("0/1696070").unwrap(), Lsn::from_str("0/1696070").unwrap(), - 14, + PgMajorVersion::PG14, ).with_recalculated_checksum().unwrap(), deleted_at: None, lineage: Default::default(), @@ -1033,7 +1033,7 @@ mod tests { Lsn::INVALID, Lsn::from_str("0/1696070").unwrap(), Lsn::from_str("0/1696070").unwrap(), - 14, + PgMajorVersion::PG14, ).with_recalculated_checksum().unwrap(), deleted_at: None, lineage: Default::default(), @@ -1114,7 +1114,7 @@ mod tests { Lsn::INVALID, Lsn::from_str("0/1696070").unwrap(), Lsn::from_str("0/1696070").unwrap(), - 14, + PgMajorVersion::PG14, ).with_recalculated_checksum().unwrap(), deleted_at: None, lineage: Default::default(), @@ -1199,7 +1199,7 @@ mod tests { Lsn::INVALID, Lsn::from_str("0/1696070").unwrap(), Lsn::from_str("0/1696070").unwrap(), - 14, + PgMajorVersion::PG14, ).with_recalculated_checksum().unwrap(), deleted_at: None, lineage: Default::default(), @@ -1287,7 +1287,7 @@ mod tests { Lsn::INVALID, Lsn::from_str("0/1696070").unwrap(), Lsn::from_str("0/1696070").unwrap(), - 14, + PgMajorVersion::PG14, ).with_recalculated_checksum().unwrap(), deleted_at: None, lineage: Default::default(), diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 0f31318f0c..c2f76c859c 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -1622,11 +1622,6 @@ impl DeltaLayerIterator<'_> { pub(crate) mod test { use std::collections::BTreeMap; - use bytes::Bytes; - use itertools::MinMaxResult; - use rand::prelude::{SeedableRng, SliceRandom, StdRng}; - use rand::{Rng, RngCore}; - use super::*; use crate::DEFAULT_PG_VERSION; use crate::context::DownloadBehavior; @@ -1636,6 +1631,11 @@ pub(crate) mod test { use crate::tenant::storage_layer::{Layer, ResidentLayer}; use crate::tenant::timeline::layer_manager::LayerManagerLockHolder; use crate::tenant::{TenantShard, Timeline}; + use bytes::Bytes; + use itertools::MinMaxResult; + use postgres_ffi::PgMajorVersion; + use rand::prelude::{SeedableRng, SliceRandom, StdRng}; + use rand::{Rng, RngCore}; /// Construct an index for a fictional delta layer and and then /// traverse in order to plan vectored reads for a query. 
Finally, @@ -1995,7 +1995,7 @@ pub(crate) mod test { let (tenant, ctx) = h.load().await; let ctx = &ctx; let timeline = tenant - .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, ctx) + .create_test_timeline(TimelineId::generate(), Lsn(0x10), PgMajorVersion::PG14, ctx) .await .unwrap(); let ctx = &ctx.with_scope_timeline(&timeline); diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index 313c133fa2..9bdce163c9 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -1,6 +1,7 @@ use std::time::UNIX_EPOCH; use pageserver_api::key::{CONTROLFILE_KEY, Key}; +use postgres_ffi::PgMajorVersion; use tokio::task::JoinSet; use utils::completion::{self, Completion}; use utils::id::TimelineId; @@ -45,7 +46,7 @@ async fn smoke_test() { .create_test_timeline_with_layers( TimelineId::generate(), Lsn(0x10), - 14, + PgMajorVersion::PG14, &ctx, Default::default(), // in-memory layers Default::default(), @@ -256,7 +257,12 @@ async fn evict_and_wait_on_wanted_deleted() { let (tenant, ctx) = h.load().await; let timeline = tenant - .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .create_test_timeline( + TimelineId::generate(), + Lsn(0x10), + PgMajorVersion::PG14, + &ctx, + ) .await .unwrap(); @@ -341,7 +347,12 @@ fn read_wins_pending_eviction() { let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); let timeline = tenant - .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .create_test_timeline( + TimelineId::generate(), + Lsn(0x10), + PgMajorVersion::PG14, + &ctx, + ) .await .unwrap(); let ctx = ctx.with_scope_timeline(&timeline); @@ -474,7 +485,12 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) { let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); let timeline = tenant - .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .create_test_timeline( + TimelineId::generate(), + Lsn(0x10), + PgMajorVersion::PG14, + &ctx, + ) .await .unwrap(); let ctx = ctx.with_scope_timeline(&timeline); @@ -644,7 +660,12 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() { let (tenant, ctx) = h.load().await; let timeline = tenant - .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .create_test_timeline( + TimelineId::generate(), + Lsn(0x10), + PgMajorVersion::PG14, + &ctx, + ) .await .unwrap(); let ctx = ctx.with_scope_timeline(&timeline); @@ -730,7 +751,12 @@ async fn evict_and_wait_does_not_wait_for_download() { let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); let timeline = tenant - .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .create_test_timeline( + TimelineId::generate(), + Lsn(0x10), + PgMajorVersion::PG14, + &ctx, + ) .await .unwrap(); let ctx = ctx.with_scope_timeline(&timeline); @@ -836,7 +862,12 @@ async fn eviction_cancellation_on_drop() { let (tenant, ctx) = h.load().await; let timeline = tenant - .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .create_test_timeline( + TimelineId::generate(), + Lsn(0x10), + PgMajorVersion::PG14, + &ctx, + ) .await .unwrap(); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 81f2646e5a..4ca005bfd4 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -58,7 +58,7 @@ use 
pageserver_api::reltag::{BlockNumber, RelTag}; use pageserver_api::shard::{ShardIdentity, ShardIndex, ShardNumber, TenantShardId}; use postgres_connection::PgConnectionConfig; use postgres_ffi::v14::xlog_utils; -use postgres_ffi::{WAL_SEGMENT_SIZE, to_pg_timestamp}; +use postgres_ffi::{PgMajorVersion, WAL_SEGMENT_SIZE, to_pg_timestamp}; use rand::Rng; use remote_storage::DownloadError; use serde_with::serde_as; @@ -225,7 +225,7 @@ pub struct Timeline { /// to shards, and is constant through the lifetime of this Timeline. shard_identity: ShardIdentity, - pub pg_version: u32, + pub pg_version: PgMajorVersion, /// The tuple has two elements. /// 1. `LayerFileManager` keeps track of the various physical representations of the layer files (inmem, local, remote). @@ -2913,7 +2913,7 @@ impl Timeline { shard_identity: ShardIdentity, walredo_mgr: Option>, resources: TimelineResources, - pg_version: u32, + pg_version: PgMajorVersion, state: TimelineState, attach_wal_lag_cooldown: Arc>, create_idempotency: crate::tenant::CreateTimelineIdempotency, @@ -7593,6 +7593,7 @@ mod tests { use std::sync::Arc; use pageserver_api::key::Key; + use postgres_ffi::PgMajorVersion; use std::iter::Iterator; use tracing::Instrument; use utils::id::TimelineId; @@ -7667,7 +7668,7 @@ mod tests { .create_test_timeline_with_layers( TimelineId::generate(), Lsn(0x10), - 14, + PgMajorVersion::PG14, &ctx, Vec::new(), // in-memory layers delta_layers, @@ -7803,7 +7804,7 @@ mod tests { .create_test_timeline_with_layers( TimelineId::generate(), Lsn(0x10), - 14, + PgMajorVersion::PG14, &ctx, Vec::new(), // in-memory layers delta_layers, @@ -7863,7 +7864,12 @@ mod tests { let (tenant, ctx) = harness.load().await; let timeline = tenant - .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .create_test_timeline( + TimelineId::generate(), + Lsn(0x10), + PgMajorVersion::PG14, + &ctx, + ) .await .unwrap(); diff --git a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs index bf2d9875c1..98c44313f1 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs @@ -3,7 +3,7 @@ use std::sync::Arc; use anyhow::Context; use bytes::Bytes; -use postgres_ffi::ControlFileData; +use postgres_ffi::{ControlFileData, PgMajorVersion}; use remote_storage::{ Download, DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, Listing, ListingObject, RemotePath, RemoteStorageConfig, @@ -264,7 +264,7 @@ impl ControlFile { pub(crate) fn base_lsn(&self) -> Lsn { Lsn(self.control_file_data.checkPoint).align() } - pub(crate) fn pg_version(&self) -> u32 { + pub(crate) fn pg_version(&self) -> PgMajorVersion { self.try_pg_version() .expect("prepare() checks that try_pg_version doesn't error") } @@ -274,13 +274,14 @@ impl ControlFile { pub(crate) fn control_file_buf(&self) -> &Bytes { &self.control_file_buf } - fn try_pg_version(&self) -> anyhow::Result { + + fn try_pg_version(&self) -> anyhow::Result { Ok(match self.control_file_data.catalog_version_no { // thesea are from catversion.h - 202107181 => 14, - 202209061 => 15, - 202307071 => 16, - 202406281 => 17, + 202107181 => PgMajorVersion::PG14, + 202209061 => PgMajorVersion::PG15, + 202307071 => PgMajorVersion::PG16, + 202406281 => PgMajorVersion::PG17, catversion => { anyhow::bail!("unrecognized catalog version {catversion}") } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 
c452f48e40..a597aedee3 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -32,8 +32,8 @@ use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use pageserver_api::shard::ShardIdentity; use postgres_ffi::walrecord::*; use postgres_ffi::{ - TimestampTz, TransactionId, dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, - fsm_logical_to_physical, pg_constants, + PgMajorVersion, TimestampTz, TransactionId, dispatch_pgversion, enum_pgversion, + enum_pgversion_dispatch, fsm_logical_to_physical, pg_constants, }; use postgres_ffi_types::forknum::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; use tracing::*; @@ -781,7 +781,7 @@ impl WalIngest { ) -> Result<(), WalIngestError> { let (xact_common, is_commit, is_prepared) = match record { XactRecord::Prepare(XactPrepare { xl_xid, data }) => { - let xid: u64 = if modification.tline.pg_version >= 17 { + let xid: u64 = if modification.tline.pg_version >= PgMajorVersion::PG17 { self.adjust_to_full_transaction_id(xl_xid)? } else { xl_xid as u64 @@ -886,7 +886,7 @@ impl WalIngest { xl_xid, parsed.xid, lsn, ); - let xid: u64 = if modification.tline.pg_version >= 17 { + let xid: u64 = if modification.tline.pg_version >= PgMajorVersion::PG17 { self.adjust_to_full_transaction_id(parsed.xid)? } else { parsed.xid as u64 @@ -1241,7 +1241,7 @@ impl WalIngest { if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID && info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN { - let oldest_active_xid = if pg_version >= 17 { + let oldest_active_xid = if pg_version >= PgMajorVersion::PG17 { let mut oldest_active_full_xid = cp.nextXid.value; for xid in modification.tline.list_twophase_files(lsn, ctx).await? { if xid < oldest_active_full_xid { @@ -1475,10 +1475,11 @@ impl WalIngest { const fn rate_limiter( &self, - pg_version: u32, + pg_version: PgMajorVersion, ) -> Option<&Lazy>> { - const MIN_PG_VERSION: u32 = 14; - const MAX_PG_VERSION: u32 = 17; + const MIN_PG_VERSION: u32 = PgMajorVersion::PG14.major_version_num(); + const MAX_PG_VERSION: u32 = PgMajorVersion::PG17.major_version_num(); + let pg_version = pg_version.major_version_num(); if pg_version < MIN_PG_VERSION || pg_version > MAX_PG_VERSION { return None; @@ -1603,6 +1604,7 @@ async fn get_relsize( #[cfg(test)] mod tests { use anyhow::Result; + use postgres_ffi::PgMajorVersion; use postgres_ffi::RELSEG_SIZE; use super::*; @@ -1625,7 +1627,7 @@ mod tests { #[tokio::test] async fn test_zeroed_checkpoint_decodes_correctly() -> Result<(), anyhow::Error> { - for i in 14..=16 { + for i in PgMajorVersion::ALL { dispatch_pgversion!(i, { pgv::CheckPoint::decode(&pgv::ZERO_CHECKPOINT)?; }); @@ -2335,7 +2337,7 @@ mod tests { // 5. Grep sk logs for "restart decoder" to get startpoint // 6. Run just the decoder from this test to get the endpoint. // It's the last LSN the decoder will output. 
- let pg_version = 15; // The test data was generated by pg15 + let pg_version = PgMajorVersion::PG15; // The test data was generated by pg15 let path = "test_data/sk_wal_segment_from_pgbench"; let wal_segment_path = format!("{path}/000000010000000000000001.zst"); let source_initdb_path = format!("{path}/{INITDB_PATH}"); diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 1498f3c83d..b17b5a15f9 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -33,6 +33,7 @@ use bytes::{Bytes, BytesMut}; use pageserver_api::key::Key; use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus}; use pageserver_api::shard::TenantShardId; +use postgres_ffi::PgMajorVersion; use tracing::*; use utils::lsn::Lsn; use utils::sync::gate::GateError; @@ -165,7 +166,7 @@ impl PostgresRedoManager { lsn: Lsn, base_img: Option<(Lsn, Bytes)>, records: Vec<(Lsn, NeonWalRecord)>, - pg_version: u32, + pg_version: PgMajorVersion, redo_attempt_type: RedoAttemptType, ) -> Result { if records.is_empty() { @@ -232,7 +233,7 @@ impl PostgresRedoManager { /// # Cancel-Safety /// /// This method is cancellation-safe. - pub async fn ping(&self, pg_version: u32) -> Result<(), Error> { + pub async fn ping(&self, pg_version: PgMajorVersion) -> Result<(), Error> { self.do_with_walredo_process(pg_version, |proc| async move { proc.ping(Duration::from_secs(1)) .await @@ -342,7 +343,7 @@ impl PostgresRedoManager { O, >( &self, - pg_version: u32, + pg_version: PgMajorVersion, closure: F, ) -> Result { let proc: Arc = match self.redo_process.get_or_init_detached().await { @@ -442,7 +443,7 @@ impl PostgresRedoManager { base_img_lsn: Lsn, records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, - pg_version: u32, + pg_version: PgMajorVersion, max_retry_attempts: u32, ) -> Result { *(self.last_redo_at.lock().unwrap()) = Some(Instant::now()); @@ -572,6 +573,7 @@ mod tests { use bytes::Bytes; use pageserver_api::key::Key; use pageserver_api::shard::TenantShardId; + use postgres_ffi::PgMajorVersion; use tracing::Instrument; use utils::id::TenantId; use utils::lsn::Lsn; @@ -586,7 +588,7 @@ mod tests { let h = RedoHarness::new().unwrap(); h.manager - .ping(14) + .ping(PgMajorVersion::PG14) .instrument(h.span()) .await .expect("ping should work"); @@ -612,7 +614,7 @@ mod tests { Lsn::from_str("0/16E2408").unwrap(), None, short_records(), - 14, + PgMajorVersion::PG14, RedoAttemptType::ReadPage, ) .instrument(h.span()) @@ -641,7 +643,7 @@ mod tests { Lsn::from_str("0/16E2408").unwrap(), None, short_records(), - 14, + PgMajorVersion::PG14, RedoAttemptType::ReadPage, ) .instrument(h.span()) @@ -663,7 +665,7 @@ mod tests { Lsn::INVALID, None, short_records(), - 16, /* 16 currently produces stderr output on startup, which adds a nice extra edge */ + PgMajorVersion::PG16, /* 16 currently produces stderr output on startup, which adds a nice extra edge */ RedoAttemptType::ReadPage, ) .instrument(h.span()) diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs index 3dec0593bf..c8b0846480 100644 --- a/pageserver/src/walredo/process.rs +++ b/pageserver/src/walredo/process.rs @@ -12,7 +12,7 @@ use anyhow::Context; use bytes::Bytes; use pageserver_api::reltag::RelTag; use pageserver_api::shard::TenantShardId; -use postgres_ffi::BLCKSZ; +use postgres_ffi::{BLCKSZ, PgMajorVersion}; use tokio::io::{AsyncReadExt, AsyncWriteExt}; use tracing::{Instrument, debug, error, instrument}; use utils::lsn::Lsn; @@ -54,11 +54,11 @@ impl WalRedoProcess { // // Start postgres binary in 
special WAL redo mode. // - #[instrument(skip_all,fields(pg_version=pg_version))] + #[instrument(skip_all,fields(pg_version=pg_version.major_version_num()))] pub(crate) fn launch( conf: &'static PageServerConf, tenant_shard_id: TenantShardId, - pg_version: u32, + pg_version: PgMajorVersion, ) -> anyhow::Result { crate::span::debug_assert_current_span_has_tenant_id(); diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 0a8cc415be..6955028c73 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -58,6 +58,7 @@ metrics.workspace = true pem.workspace = true postgres_backend.workspace = true postgres_ffi.workspace = true +postgres_versioninfo.workspace = true pq_proto.workspace = true remote_storage.workspace = true safekeeper_api.workspace = true diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index 1ad9e62f9b..555cbe457b 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -2,6 +2,7 @@ use std::vec; use anyhow::{Result, bail}; +use postgres_versioninfo::PgVersionId; use pq_proto::SystemId; use safekeeper_api::membership::{Configuration, INVALID_GENERATION}; use safekeeper_api::{ServerInfo, Term}; @@ -46,7 +47,7 @@ struct SafeKeeperStateV1 { #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct ServerInfoV2 { /// Postgres server version - pub pg_version: u32, + pub pg_version: PgVersionId, pub system_id: SystemId, pub tenant_id: TenantId, pub timeline_id: TimelineId, @@ -75,7 +76,7 @@ pub struct SafeKeeperStateV2 { #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct ServerInfoV3 { /// Postgres server version - pub pg_version: u32, + pub pg_version: PgVersionId, pub system_id: SystemId, #[serde(with = "hex")] pub tenant_id: TenantId, @@ -444,13 +445,13 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result TimelinePersisten mod tests { use std::str::FromStr; + use postgres_versioninfo::PgMajorVersion; use utils::Hex; use utils::id::NodeId; @@ -563,7 +565,7 @@ mod tests { epoch: 43, }, server: ServerInfoV2 { - pg_version: 14, + pg_version: PgVersionId::from(PgMajorVersion::PG14), system_id: 0x1234567887654321, tenant_id, timeline_id, @@ -586,8 +588,8 @@ mod tests { 0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // epoch 0x2b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - // pg_version - 0x0e, 0x00, 0x00, 0x00, + // pg_version = 140000 + 0xE0, 0x22, 0x02, 0x00, // system_id 0x21, 0x43, 0x65, 0x87, 0x78, 0x56, 0x34, 0x12, // tenant_id @@ -626,7 +628,7 @@ mod tests { }]), }, server: ServerInfoV2 { - pg_version: 14, + pg_version: PgVersionId::from(PgMajorVersion::PG14), system_id: 0x1234567887654321, tenant_id, timeline_id, @@ -646,7 +648,7 @@ mod tests { let expected = [ 0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x21, 0x43, 0x65, 0x87, 0x78, 0x56, + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x22, 0x02, 0x00, 0x21, 0x43, 0x65, 0x87, 0x78, 0x56, 0x34, 0x12, 0xcf, 0x04, 0x80, 0x92, 0x97, 0x07, 0xee, 0x75, 0x37, 0x23, 0x37, 0xef, 0xaa, 0x5e, 0xcf, 0x96, 0x11, 0x2d, 0xed, 0x66, 0x42, 0x2a, 0xa5, 0xe9, 0x53, 0xe5, 0x44, 0x0f, 0xa5, 0x42, 0x7a, 0xc4, 0x78, 0x56, 0x34, 0x12, 0xc4, 0x7a, 0x42, 0xa5, @@ -675,7 +677,7 @@ mod tests { }]), }, server: ServerInfoV3 { - pg_version: 14, + pg_version: PgVersionId::from(PgMajorVersion::PG14), system_id: 0x1234567887654321, tenant_id, 
timeline_id, @@ -695,7 +697,7 @@ mod tests { let expected = [ 0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x21, 0x43, 0x65, 0x87, 0x78, 0x56, + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x22, 0x02, 0x00, 0x21, 0x43, 0x65, 0x87, 0x78, 0x56, 0x34, 0x12, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x63, 0x66, 0x30, 0x34, 0x38, 0x30, 0x39, 0x32, 0x39, 0x37, 0x30, 0x37, 0x65, 0x65, 0x37, 0x35, 0x33, 0x37, 0x32, 0x33, 0x33, 0x37, 0x65, 0x66, 0x61, 0x61, 0x35, 0x65, 0x63, 0x66, 0x39, 0x36, @@ -731,7 +733,7 @@ mod tests { }]), }, server: ServerInfo { - pg_version: 14, + pg_version: PgVersionId::from(PgMajorVersion::PG14), system_id: 0x1234567887654321, wal_seg_size: 0x12345678, }, @@ -765,7 +767,7 @@ mod tests { 0x30, 0x66, 0x61, 0x35, 0x34, 0x32, 0x37, 0x61, 0x63, 0x34, 0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x0e, 0x00, 0x00, 0x00, 0x21, 0x43, 0x65, 0x87, 0x78, 0x56, 0x34, 0x12, 0x78, 0x56, + 0xE0, 0x22, 0x02, 0x00, 0x21, 0x43, 0x65, 0x87, 0x78, 0x56, 0x34, 0x12, 0x78, 0x56, 0x34, 0x12, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x63, 0x34, 0x37, 0x61, 0x34, 0x32, 0x61, 0x35, 0x30, 0x66, 0x34, 0x34, 0x65, 0x35, 0x35, 0x33, 0x65, 0x39, 0x61, 0x35, 0x32, 0x61, 0x34, 0x32, 0x36, 0x36, 0x65, 0x64, 0x32, 0x64, 0x31, 0x31, diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index aa19b6d283..4d15fc9de3 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -9,6 +9,7 @@ use anyhow::{Context, Result, bail}; use byteorder::{LittleEndian, ReadBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use postgres_ffi::{MAX_SEND_SIZE, TimeLineID}; +use postgres_versioninfo::{PgMajorVersion, PgVersionId}; use pq_proto::SystemId; use safekeeper_api::membership::{ INVALID_GENERATION, MemberSet, SafekeeperGeneration as Generation, SafekeeperId, @@ -29,7 +30,7 @@ use crate::{control_file, wal_storage}; pub const SK_PROTO_VERSION_2: u32 = 2; pub const SK_PROTO_VERSION_3: u32 = 3; -pub const UNKNOWN_SERVER_VERSION: u32 = 0; +pub const UNKNOWN_SERVER_VERSION: PgVersionId = PgVersionId::UNKNOWN; #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] pub struct TermLsn { @@ -218,7 +219,7 @@ pub struct ProposerGreeting { pub timeline_id: TimelineId, pub mconf: membership::Configuration, /// Postgres server version - pub pg_version: u32, + pub pg_version: PgVersionId, pub system_id: SystemId, pub wal_seg_size: u32, } @@ -229,7 +230,7 @@ pub struct ProposerGreetingV2 { /// proposer-acceptor protocol version pub protocol_version: u32, /// Postgres server version - pub pg_version: u32, + pub pg_version: PgVersionId, pub proposer_id: PgUuid, pub system_id: SystemId, pub timeline_id: TimelineId, @@ -511,7 +512,7 @@ impl ProposerAcceptorMessage { tenant_id, timeline_id, mconf, - pg_version, + pg_version: PgVersionId::from_full_pg_version(pg_version), system_id, wal_seg_size, }; @@ -961,7 +962,8 @@ where * because safekeepers parse WAL headers and the format * may change between versions. */ - if msg.pg_version / 10000 != self.state.server.pg_version / 10000 + if PgMajorVersion::try_from(msg.pg_version)? + != PgMajorVersion::try_from(self.state.server.pg_version)? 
&& self.state.server.pg_version != UNKNOWN_SERVER_VERSION { bail!( @@ -1748,7 +1750,7 @@ mod tests { }]), }, server: ServerInfo { - pg_version: 14, + pg_version: PgVersionId::from_full_pg_version(140000), system_id: 0x1234567887654321, wal_seg_size: 0x12345678, }, diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs index 2b1fd7b854..2192f5eab4 100644 --- a/safekeeper/src/send_interpreted_wal.rs +++ b/safekeeper/src/send_interpreted_wal.rs @@ -8,8 +8,8 @@ use futures::StreamExt; use futures::future::Either; use pageserver_api::shard::ShardIdentity; use postgres_backend::{CopyStreamHandlerEnd, PostgresBackend}; -use postgres_ffi::get_current_timestamp; use postgres_ffi::waldecoder::{WalDecodeError, WalStreamDecoder}; +use postgres_ffi::{PgMajorVersion, get_current_timestamp}; use pq_proto::{BeMessage, InterpretedWalRecordsBody, WalSndKeepAlive}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::sync::mpsc::error::SendError; @@ -78,7 +78,7 @@ pub(crate) struct InterpretedWalReader { shard_senders: HashMap>, shard_notification_rx: Option>, state: Arc>, - pg_version: u32, + pg_version: PgMajorVersion, } /// A handle for [`InterpretedWalReader`] which allows for interacting with it @@ -258,7 +258,7 @@ impl InterpretedWalReader { start_pos: Lsn, tx: tokio::sync::mpsc::Sender, shard: ShardIdentity, - pg_version: u32, + pg_version: PgMajorVersion, appname: &Option, ) -> InterpretedWalReaderHandle { let state = Arc::new(std::sync::RwLock::new(InterpretedWalReaderState::Running { @@ -322,7 +322,7 @@ impl InterpretedWalReader { start_pos: Lsn, tx: tokio::sync::mpsc::Sender, shard: ShardIdentity, - pg_version: u32, + pg_version: PgMajorVersion, shard_notification_rx: Option< tokio::sync::mpsc::UnboundedReceiver, >, @@ -718,7 +718,7 @@ mod tests { use std::time::Duration; use pageserver_api::shard::{ShardIdentity, ShardStripeSize}; - use postgres_ffi::MAX_SEND_SIZE; + use postgres_ffi::{MAX_SEND_SIZE, PgMajorVersion}; use tokio::sync::mpsc::error::TryRecvError; use utils::id::{NodeId, TenantTimelineId}; use utils::lsn::Lsn; @@ -734,7 +734,7 @@ mod tests { const SIZE: usize = 8 * 1024; const MSG_COUNT: usize = 200; - const PG_VERSION: u32 = 17; + const PG_VERSION: PgMajorVersion = PgMajorVersion::PG17; const SHARD_COUNT: u8 = 2; let start_lsn = Lsn::from_str("0/149FD18").unwrap(); @@ -876,7 +876,7 @@ mod tests { const SIZE: usize = 8 * 1024; const MSG_COUNT: usize = 200; - const PG_VERSION: u32 = 17; + const PG_VERSION: PgMajorVersion = PgMajorVersion::PG17; const SHARD_COUNT: u8 = 2; let start_lsn = Lsn::from_str("0/149FD18").unwrap(); @@ -1025,7 +1025,7 @@ mod tests { const SIZE: usize = 64 * 1024; const MSG_COUNT: usize = 10; - const PG_VERSION: u32 = 17; + const PG_VERSION: PgMajorVersion = PgMajorVersion::PG17; const SHARD_COUNT: u8 = 2; const WAL_READER_BATCH_SIZE: usize = 8192; @@ -1148,7 +1148,7 @@ mod tests { const SIZE: usize = 8 * 1024; const MSG_COUNT: usize = 10; - const PG_VERSION: u32 = 17; + const PG_VERSION: PgMajorVersion = PgMajorVersion::PG17; let start_lsn = Lsn::from_str("0/149FD18").unwrap(); let env = Env::new(true).unwrap(); diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 05f827494e..177e759db5 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -12,7 +12,7 @@ use futures::FutureExt; use itertools::Itertools; use parking_lot::Mutex; use postgres_backend::{CopyStreamHandlerEnd, PostgresBackend, PostgresBackendReader, QueryError}; -use postgres_ffi::{MAX_SEND_SIZE, TimestampTz, 
get_current_timestamp}; +use postgres_ffi::{MAX_SEND_SIZE, PgMajorVersion, TimestampTz, get_current_timestamp}; use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody}; use safekeeper_api::Term; use safekeeper_api::models::{ @@ -559,7 +559,9 @@ impl SafekeeperPostgresHandler { format, compression, } => { - let pg_version = tli.tli.get_state().await.1.server.pg_version / 10000; + let pg_version = + PgMajorVersion::try_from(tli.tli.get_state().await.1.server.pg_version) + .unwrap(); let end_watch_view = end_watch.view(); let wal_residence_guard = tli.wal_residence_guard().await?; let (tx, rx) = tokio::sync::mpsc::channel::(2); diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index 7533005c35..b6cf73be2e 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -7,6 +7,7 @@ use std::time::SystemTime; use anyhow::{Result, bail}; use postgres_ffi::WAL_SEGMENT_SIZE; +use postgres_versioninfo::{PgMajorVersion, PgVersionId}; use safekeeper_api::membership::Configuration; use safekeeper_api::models::{TimelineMembershipSwitchResponse, TimelineTermBumpResponse}; use safekeeper_api::{INITIAL_TERM, ServerInfo, Term}; @@ -149,8 +150,8 @@ impl TimelinePersistentState { &TenantTimelineId::empty(), Configuration::empty(), ServerInfo { - pg_version: 170000, /* Postgres server version (major * 10000) */ - system_id: 0, /* Postgres system identifier */ + pg_version: PgVersionId::from(PgMajorVersion::PG17), + system_id: 0, /* Postgres system identifier */ wal_seg_size: WAL_SEGMENT_SIZE as u32, }, Lsn::INVALID, diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index e68c9f3a99..da00df2dd7 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -19,6 +19,7 @@ use futures::future::BoxFuture; use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName, XLogFromFileName}; use postgres_ffi::waldecoder::WalStreamDecoder; use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo, dispatch_pgversion}; +use postgres_versioninfo::{PgMajorVersion, PgVersionId}; use pq_proto::SystemId; use remote_storage::RemotePath; use std::sync::Arc; @@ -92,7 +93,7 @@ pub struct PhysicalStorage { /// Size of WAL segment in bytes. wal_seg_size: usize, - pg_version: u32, + pg_version: PgVersionId, system_id: u64, /// Written to disk, but possibly still in the cache and not fully persisted. @@ -180,7 +181,7 @@ impl PhysicalStorage { let write_lsn = if state.commit_lsn == Lsn(0) { Lsn(0) } else { - let version = state.server.pg_version / 10000; + let version = PgMajorVersion::try_from(state.server.pg_version).unwrap(); dispatch_pgversion!( version, @@ -226,7 +227,10 @@ impl PhysicalStorage { write_record_lsn: write_lsn, flush_lsn, flush_record_lsn: flush_lsn, - decoder: WalStreamDecoder::new(write_lsn, state.server.pg_version / 10000), + decoder: WalStreamDecoder::new( + write_lsn, + PgMajorVersion::try_from(state.server.pg_version).unwrap(), + ), file: None, pending_wal_truncation: true, }) @@ -408,7 +412,7 @@ impl Storage for PhysicalStorage { let segno = init_lsn.segment_number(self.wal_seg_size); let (mut file, _) = self.open_or_create(segno).await?; - let major_pg_version = self.pg_version / 10000; + let major_pg_version = PgMajorVersion::try_from(self.pg_version).unwrap(); let wal_seg = postgres_ffi::generate_wal_segment(segno, self.system_id, major_pg_version, init_lsn)?; file.seek(SeekFrom::Start(0)).await?; @@ -654,7 +658,7 @@ pub struct WalReader { // pos is in the same segment as timeline_start_lsn. 
timeline_start_lsn: Lsn, // integer version number of PostgreSQL, e.g. 14; 15; 16 - pg_version: u32, + pg_version: PgMajorVersion, system_id: SystemId, timeline_start_segment: Option, } @@ -697,7 +701,7 @@ impl WalReader { wal_backup, local_start_lsn: state.local_start_lsn, timeline_start_lsn: state.timeline_start_lsn, - pg_version: state.server.pg_version / 10000, + pg_version: PgMajorVersion::try_from(state.server.pg_version).unwrap(), system_id: state.server.system_id, timeline_start_segment: None, }) diff --git a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs index 94a849b5f0..029f8fab0a 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs @@ -7,8 +7,8 @@ use anyhow::Result; use bytes::{Buf, BytesMut}; use futures::future::BoxFuture; use parking_lot::Mutex; -use postgres_ffi::XLogSegNo; use postgres_ffi::waldecoder::WalStreamDecoder; +use postgres_ffi::{PgMajorVersion, XLogSegNo}; use safekeeper::metrics::WalStorageMetrics; use safekeeper::state::TimelinePersistentState; use safekeeper::{control_file, wal_storage}; @@ -142,7 +142,7 @@ impl DiskWALStorage { write_lsn, write_record_lsn: flush_lsn, flush_record_lsn: flush_lsn, - decoder: WalStreamDecoder::new(flush_lsn, 16), + decoder: WalStreamDecoder::new(flush_lsn, PgMajorVersion::PG16), unflushed_bytes: BytesMut::new(), disk, }) @@ -151,7 +151,7 @@ impl DiskWALStorage { fn find_end_of_wal(disk: Arc, start_lsn: Lsn) -> Result { let mut buf = [0; 8192]; let mut pos = start_lsn.0; - let mut decoder = WalStreamDecoder::new(start_lsn, 16); + let mut decoder = WalStreamDecoder::new(start_lsn, PgMajorVersion::PG16); let mut result = start_lsn; loop { disk.wal.lock().read(pos, &mut buf); @@ -204,7 +204,7 @@ impl wal_storage::Storage for DiskWALStorage { self.decoder.available(), startpos, ); - self.decoder = WalStreamDecoder::new(startpos, 16); + self.decoder = WalStreamDecoder::new(startpos, PgMajorVersion::PG16); } self.decoder.feed_bytes(buf); loop { @@ -242,7 +242,7 @@ impl wal_storage::Storage for DiskWALStorage { self.write_record_lsn = end_pos; self.flush_record_lsn = end_pos; self.unflushed_bytes.clear(); - self.decoder = WalStreamDecoder::new(end_pos, 16); + self.decoder = WalStreamDecoder::new(end_pos, PgMajorVersion::PG16); Ok(()) } diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs index 193a1833a7..fec81fb661 100644 --- a/storage_controller/src/service/safekeeper_service.rs +++ b/storage_controller/src/service/safekeeper_service.rs @@ -18,6 +18,7 @@ use pageserver_api::controller_api::{ SafekeeperDescribeResponse, SkSchedulingPolicy, TimelineImportRequest, }; use pageserver_api::models::{SafekeeperInfo, SafekeepersInfo, TimelineInfo}; +use safekeeper_api::PgVersionId; use safekeeper_api::membership::{MemberSet, SafekeeperGeneration, SafekeeperId}; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; @@ -44,7 +45,7 @@ impl Service { &self, tenant_id: TenantId, timeline_id: TimelineId, - pg_version: u32, + pg_version: PgVersionId, timeline_persistence: &TimelinePersistence, ) -> Result, ApiError> { // If quorum is reached, return if we are outside of a specified timeout @@ -219,7 +220,7 @@ impl Service { read_only: bool, ) -> Result { let timeline_id = timeline_info.timeline_id; - let pg_version = timeline_info.pg_version * 10000; + let pg_version = PgVersionId::from(timeline_info.pg_version); // Initially start_lsn is 
determined by last_record_lsn in pageserver // response as it does initdb. However, later we persist it and in sk // creation calls replace with the value from the timeline row if it diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index fa5c9aa693..920c538069 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -172,7 +172,7 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE env.initial_tenant, env.initial_timeline, MembershipConfiguration(generation=1, members=[sk.safekeeper_id()], new_members=None), - int(env.pg_version), + int(env.pg_version) * 10000, Lsn(0), None, ) From aa7572201097d3b36e09cdeb74b35b7de13344c0 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 24 Jun 2025 12:42:23 -0500 Subject: [PATCH 112/124] Set pgaudit.log=none for monitoring connections (#12137) pgaudit can spam logs due to all the monitoring that we do. Logs from these connections are not necessary for HIPAA compliance, so we can stop logging from those connections. Part-of: https://github.com/neondatabase/cloud/issues/29574 Signed-off-by: Tristan Partin --- compute/Makefile | 4 ++-- compute/vm-image-spec-bookworm.yaml | 4 ++-- compute/vm-image-spec-bullseye.yaml | 4 ++-- compute_tools/src/compute.rs | 2 +- test_runner/regress/test_compute_metrics.py | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/compute/Makefile b/compute/Makefile index c53d040887..ef2e55f7b1 100644 --- a/compute/Makefile +++ b/compute/Makefile @@ -22,7 +22,7 @@ sql_exporter.yml: $(jsonnet_files) --output-file etc/$@ \ --tla-str collector_name=neon_collector \ --tla-str collector_file=neon_collector.yml \ - --tla-str 'connection_string=postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter' \ + --tla-str 'connection_string=postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter&pgaudit.log=none' \ etc/sql_exporter.jsonnet sql_exporter_autoscaling.yml: $(jsonnet_files) @@ -30,7 +30,7 @@ sql_exporter_autoscaling.yml: $(jsonnet_files) --output-file etc/$@ \ --tla-str collector_name=neon_collector_autoscaling \ --tla-str collector_file=neon_collector_autoscaling.yml \ - --tla-str 'connection_string=postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling' \ + --tla-str 'connection_string=postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling&pgaudit.log=none' \ etc/sql_exporter.jsonnet .PHONY: clean diff --git a/compute/vm-image-spec-bookworm.yaml b/compute/vm-image-spec-bookworm.yaml index 057099994a..267e4c83b5 100644 --- a/compute/vm-image-spec-bookworm.yaml +++ b/compute/vm-image-spec-bookworm.yaml @@ -26,7 +26,7 @@ commands: - name: postgres-exporter user: nobody sysvInitAction: respawn - shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml' + shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter pgaudit.log=none" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml' - name: pgbouncer-exporter user: postgres sysvInitAction: respawn @@ -59,7 +59,7 @@ files: # the rules use ALL as the hostname. Avoid the pointless lookups and the "unable to # resolve host" log messages that they generate.
Defaults !fqdn - + # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD), # regardless of hostname (ALL) diff --git a/compute/vm-image-spec-bullseye.yaml b/compute/vm-image-spec-bullseye.yaml index d048e20b2e..2b6e77b656 100644 --- a/compute/vm-image-spec-bullseye.yaml +++ b/compute/vm-image-spec-bullseye.yaml @@ -26,7 +26,7 @@ commands: - name: postgres-exporter user: nobody sysvInitAction: respawn - shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml' + shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter pgaudit.log=none" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml' - name: pgbouncer-exporter user: postgres sysvInitAction: respawn @@ -59,7 +59,7 @@ files: # the rules use ALL as the hostname. Avoid the pointless lookups and the "unable to # resolve host" log messages that they generate. Defaults !fqdn - + # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD), # regardless of hostname (ALL) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index f84a5f0841..70b2d28bf2 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -405,7 +405,7 @@ impl ComputeNode { // that can affect `compute_ctl` and prevent it from properly configuring the database schema. // Unset them via connection string options before connecting to the database. // N.B. keep it in sync with `ZENITH_OPTIONS` in `get_maintenance_client()`. - const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0"; + const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0 -c pgaudit.log=none"; let options = match conn_conf.get_options() { // Allow the control plane to override any options set by the // compute diff --git a/test_runner/regress/test_compute_metrics.py b/test_runner/regress/test_compute_metrics.py index c751a3e7cc..d1e61e597c 100644 --- a/test_runner/regress/test_compute_metrics.py +++ b/test_runner/regress/test_compute_metrics.py @@ -418,7 +418,7 @@ def test_sql_exporter_metrics_e2e( pg_user = conn_options["user"] pg_dbname = conn_options["dbname"] pg_application_name = f"sql_exporter{stem_suffix}" - connstr = f"postgresql://{pg_user}@{pg_host}:{pg_port}/{pg_dbname}?sslmode=disable&application_name={pg_application_name}" + connstr = f"postgresql://{pg_user}@{pg_host}:{pg_port}/{pg_dbname}?sslmode=disable&application_name={pg_application_name}&pgaudit.log=none" def escape_go_filepath_match_characters(s: str) -> str: """ From a2d623696c91c1ec832710f6a839eae2c6087d05 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 25 Jun 2025 04:03:02 -0500 Subject: [PATCH 113/124] Update pgaudit to latest versions (#12328) These updates contain some bug fixes and are completely backwards compatible with what we currently support in Neon. 
Link: https://github.com/pgaudit/pgaudit/compare/1.6.2...1.6.3 Link: https://github.com/pgaudit/pgaudit/compare/1.7.0...1.7.1 Link: https://github.com/pgaudit/pgaudit/compare/16.0...16.1 Link: https://github.com/pgaudit/pgaudit/compare/17.0...17.1 Signed-off-by: Tristan Partin Signed-off-by: Tristan Partin --- compute/compute-node.Dockerfile | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 7cd152f614..cf0598eec8 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1568,20 +1568,20 @@ ARG PG_VERSION WORKDIR /ext-src RUN case "${PG_VERSION}" in \ "v14") \ - export PGAUDIT_VERSION=1.6.2 \ - export PGAUDIT_CHECKSUM=1f350d70a0cbf488c0f2b485e3a5c9b11f78ad9e3cbb95ef6904afa1eb3187eb \ + export PGAUDIT_VERSION=1.6.3 \ + export PGAUDIT_CHECKSUM=37a8f5a7cc8d9188e536d15cf0fdc457fcdab2547caedb54442c37f124110919 \ ;; \ "v15") \ - export PGAUDIT_VERSION=1.7.0 \ - export PGAUDIT_CHECKSUM=8f4a73e451c88c567e516e6cba7dc1e23bc91686bb6f1f77f8f3126d428a8bd8 \ + export PGAUDIT_VERSION=1.7.1 \ + export PGAUDIT_CHECKSUM=e9c8e6e092d82b2f901d72555ce0fe7780552f35f8985573796cd7e64b09d4ec \ ;; \ "v16") \ - export PGAUDIT_VERSION=16.0 \ - export PGAUDIT_CHECKSUM=d53ef985f2d0b15ba25c512c4ce967dce07b94fd4422c95bd04c4c1a055fe738 \ + export PGAUDIT_VERSION=16.1 \ + export PGAUDIT_CHECKSUM=3bae908ab70ba0c6f51224009dbcfff1a97bd6104c6273297a64292e1b921fee \ ;; \ "v17") \ - export PGAUDIT_VERSION=17.0 \ - export PGAUDIT_CHECKSUM=7d0d08d030275d525f36cd48b38c6455f1023da863385badff0cec44965bfd8c \ + export PGAUDIT_VERSION=17.1 \ + export PGAUDIT_CHECKSUM=9c5f37504d393486cc75d2ced83f75f5899be64fa85f689d6babb833b4361e6c \ ;; \ *) \ echo "pgaudit is not supported on this PostgreSQL version" && exit 1;; \ From 7c4c36f5ac236e56f311d412d98b208342c57fa8 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 25 Jun 2025 13:47:56 +0300 Subject: [PATCH 114/124] Remove unnecessary separate installation of libpq (#12287) `make install` compiles and installs libpq. Remove redundant separate step to compile and install it. 
--- Makefile | 2 -- compute/compute-node.Dockerfile | 3 --- 2 files changed, 5 deletions(-) diff --git a/Makefile b/Makefile index 9824a47255..d39b9b68c8 100644 --- a/Makefile +++ b/Makefile @@ -159,8 +159,6 @@ postgres-%: postgres-configure-% \ postgres-headers-% # to prevent `make install` conflicts with neon's `postgres-headers` +@echo "Compiling PostgreSQL $*" $(MAKE) -C $(BUILD_DIR)/$* MAKELEVEL=0 install - +@echo "Compiling libpq $*" - $(MAKE) -C $(BUILD_DIR)/$*/src/interfaces/libpq install +@echo "Compiling pg_prewarm $*" $(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_prewarm install +@echo "Compiling pg_buffercache $*" diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index cf0598eec8..35ece73030 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -171,9 +171,6 @@ RUN cd postgres && \ eval $CONFIGURE_CMD && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \ - # Install headers - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \ - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \ # Enable some of contrib extensions echo 'trusted = true' >> /usr/local/pgsql/share/extension/autoinc.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/dblink.control && \ From 1dc01c9bed24846473f04d983d36f7da3f3c04c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 25 Jun 2025 15:40:38 +0200 Subject: [PATCH 115/124] Support cancellations of timelines with hanging ondemand downloads (#12330) In `test_layer_download_cancelled_by_config_location`, we simulate hung downloads via the `before-downloading-layer-stream-pausable` failpoint. Then, we cancel a timeline via the `location_config` endpoint. With the new default as of https://github.com/neondatabase/neon/pull/11712, we would be creating the timeline on safekeepers regardless of whether there have been writes or not, and it turns out the test relied on the timeline not existing on safekeepers, due to a cancellation bug: * as established before, the test makes the read path hang * the timeline cancellation function first cancels the walreceiver, and only then cancels the timeline's token * `WalIngest::new` is requesting a checkpoint, which hits the read path * at cancellation time, we'd be hanging inside the read, not seeing the cancellation of the walreceiver * the test would time out due to the hang This is probably also reproducible in the wild when there are S3 unavailabilities or bottlenecks. So we thought that it's worthwhile to fix the hang issue. The approach chosen in the end involves the `tokio::select` macro. In PR 11712, we originally punted on the test due to the hang and opted it out of the new default, but now we can use the new default.
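For illustration, the shape of the fix is the usual pattern of racing a potentially hanging future against a cancellation token with `tokio::select!`. A minimal, hypothetical sketch (illustrative names, not the actual pageserver types):

```rust
use tokio_util::sync::CancellationToken;

/// Minimal sketch: race a setup future that may hang on a stuck download
/// against a cancellation token instead of awaiting it unconditionally.
async fn init_or_bail<T>(
    setup: impl std::future::Future<Output = anyhow::Result<T>>,
    cancel: &CancellationToken,
) -> anyhow::Result<Option<T>> {
    tokio::select! {
        res = setup => res.map(Some),
        _ = cancel.cancelled() => {
            // The hung download keeps `setup` pending forever; cancellation wins the race.
            Ok(None)
        }
    }
}
```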
Part of https://github.com/neondatabase/neon/issues/12299 --- .../walreceiver/walreceiver_connection.rs | 20 +++++++++++++------ test_runner/regress/test_ondemand_download.py | 6 ------ 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index e91bd5d43a..6d52da1f00 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -275,12 +275,20 @@ pub(super) async fn handle_walreceiver_connection( let copy_stream = replication_client.copy_both_simple(&query).await?; let mut physical_stream = pin!(ReplicationStream::new(copy_stream)); - let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx) - .await - .map_err(|e| match e.kind { - crate::walingest::WalIngestErrorKind::Cancelled => WalReceiverError::Cancelled, - _ => WalReceiverError::Other(e.into()), - })?; + let walingest_future = WalIngest::new(timeline.as_ref(), startpoint, &ctx); + let walingest_res = select! { + walingest_res = walingest_future => walingest_res, + _ = cancellation.cancelled() => { + // We are doing reads in WalIngest::new, and those can hang as they come from the network. + // Timeline cancellation hits the walreceiver cancellation token before it hits the timeline global one. + debug!("Connection cancelled"); + return Err(WalReceiverError::Cancelled); + }, + }; + let mut walingest = walingest_res.map_err(|e| match e.kind { + crate::walingest::WalIngestErrorKind::Cancelled => WalReceiverError::Cancelled, + _ => WalReceiverError::Other(e.into()), + })?; let (format, compression) = match protocol { PostgresClientProtocol::Interpreted { diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 2b71662669..2590a3fe9d 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -671,12 +671,6 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu """ neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - # On the new mode, the test runs into a cancellation issue, i.e. the walproposer can't shut down - # as it is hang-waiting on the timeline_checkpoint call in WalIngest::new. - neon_env_builder.storage_controller_config = { - "timelines_onto_safekeepers": False, - } - # turn off background tasks so that they don't interfere with the downloads env = neon_env_builder.init_start( initial_tenant_conf={ From 27ca1e21bec2fc90311ec3ac1cad69bced69dd6f Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 25 Jun 2025 14:41:30 +0100 Subject: [PATCH 116/124] [console_redirect_proxy]: fix channel binding (#12238) ## Problem While working more on TLS to compute, I realised that Console Redirect -> pg-sni-router -> compute would break if channel binding was set to prefer. This is because the channel binding data would differ between Console Redirect -> pg-sni-router vs pg-sni-router -> compute. I also noticed that I actually disabled channel binding in #12145, since `connect_raw` would think that the connection didn't support TLS. ## Summary of changes Make sure we specify the channel binding. Make sure that `connect_raw` can see if we have TLS support. 
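As a rough illustration of the console-redirect hop (sketched with vanilla `tokio-postgres` rather than the proxy's `postgres_client` fork; names and values are illustrative): because pg-sni-router terminates TLS in the middle, the TLS channel the client sees differs from the one compute sees, so SCRAM channel binding has to be disabled on that path while direct connections can keep the default `Prefer`.

```rust
use tokio_postgres::config::{ChannelBinding, SslMode};

/// Sketch only: build a config for a connection that goes through a
/// TLS-terminating router, where channel binding would otherwise mismatch.
fn config_via_sni_router() -> tokio_postgres::Config {
    let mut config = tokio_postgres::Config::new();
    config
        .ssl_mode(SslMode::Require)
        // The router is effectively a man-in-the-middle, so binding must be off.
        .channel_binding(ChannelBinding::Disable);
    config
}
```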
--- libs/proxy/tokio-postgres2/src/config.rs | 20 ++++++++++++++++--- libs/proxy/tokio-postgres2/src/connect.rs | 4 +++- libs/proxy/tokio-postgres2/src/connect_raw.rs | 12 ++++------- proxy/src/compute/mod.rs | 12 +++++++++-- proxy/src/proxy/tests/mitm.rs | 4 ++-- proxy/src/proxy/tests/mod.rs | 12 +++++------ 6 files changed, 42 insertions(+), 22 deletions(-) diff --git a/libs/proxy/tokio-postgres2/src/config.rs b/libs/proxy/tokio-postgres2/src/config.rs index 243a5bc725..961cbc923e 100644 --- a/libs/proxy/tokio-postgres2/src/config.rs +++ b/libs/proxy/tokio-postgres2/src/config.rs @@ -12,7 +12,9 @@ use tokio::net::TcpStream; use crate::connect::connect; use crate::connect_raw::{RawConnection, connect_raw}; -use crate::tls::{MakeTlsConnect, TlsConnect}; +use crate::connect_tls::connect_tls; +use crate::maybe_tls_stream::MaybeTlsStream; +use crate::tls::{MakeTlsConnect, TlsConnect, TlsStream}; use crate::{Client, Connection, Error}; /// TLS configuration. @@ -238,7 +240,7 @@ impl Config { connect(tls, self).await } - pub async fn connect_raw( + pub async fn tls_and_authenticate( &self, stream: S, tls: T, @@ -247,7 +249,19 @@ impl Config { S: AsyncRead + AsyncWrite + Unpin, T: TlsConnect, { - connect_raw(stream, tls, self).await + let stream = connect_tls(stream, self.ssl_mode, tls).await?; + connect_raw(stream, self).await + } + + pub async fn authenticate( + &self, + stream: MaybeTlsStream, + ) -> Result, Error> + where + S: AsyncRead + AsyncWrite + Unpin, + T: TlsStream + Unpin, + { + connect_raw(stream, self).await } } diff --git a/libs/proxy/tokio-postgres2/src/connect.rs b/libs/proxy/tokio-postgres2/src/connect.rs index f7bc863337..4a07eccf9a 100644 --- a/libs/proxy/tokio-postgres2/src/connect.rs +++ b/libs/proxy/tokio-postgres2/src/connect.rs @@ -9,6 +9,7 @@ use crate::codec::BackendMessage; use crate::config::Host; use crate::connect_raw::connect_raw; use crate::connect_socket::connect_socket; +use crate::connect_tls::connect_tls; use crate::tls::{MakeTlsConnect, TlsConnect}; use crate::{Client, Config, Connection, Error, RawConnection}; @@ -44,13 +45,14 @@ where T: TlsConnect, { let socket = connect_socket(host_addr, host, port, config.connect_timeout).await?; + let stream = connect_tls(socket, config.ssl_mode, tls).await?; let RawConnection { stream, parameters, delayed_notice, process_id, secret_key, - } = connect_raw(socket, tls, config).await?; + } = connect_raw(stream, config).await?; let socket_config = SocketConfig { host_addr, diff --git a/libs/proxy/tokio-postgres2/src/connect_raw.rs b/libs/proxy/tokio-postgres2/src/connect_raw.rs index 20dc538cf2..b89a600a2e 100644 --- a/libs/proxy/tokio-postgres2/src/connect_raw.rs +++ b/libs/proxy/tokio-postgres2/src/connect_raw.rs @@ -16,9 +16,8 @@ use tokio_util::codec::Framed; use crate::Error; use crate::codec::{BackendMessage, BackendMessages, FrontendMessage, PostgresCodec}; use crate::config::{self, AuthKeys, Config}; -use crate::connect_tls::connect_tls; use crate::maybe_tls_stream::MaybeTlsStream; -use crate::tls::{TlsConnect, TlsStream}; +use crate::tls::TlsStream; pub struct StartupStream { inner: Framed, PostgresCodec>, @@ -87,16 +86,13 @@ pub struct RawConnection { } pub async fn connect_raw( - stream: S, - tls: T, + stream: MaybeTlsStream, config: &Config, -) -> Result, Error> +) -> Result, Error> where S: AsyncRead + AsyncWrite + Unpin, - T: TlsConnect, + T: TlsStream + Unpin, { - let stream = connect_tls(stream, config.ssl_mode, tls).await?; - let mut stream = StartupStream { inner: Framed::new(stream, PostgresCodec), 
buf: BackendMessages::empty(), diff --git a/proxy/src/compute/mod.rs b/proxy/src/compute/mod.rs index f6c58c7459..7fb88e6a45 100644 --- a/proxy/src/compute/mod.rs +++ b/proxy/src/compute/mod.rs @@ -6,7 +6,7 @@ use std::net::{IpAddr, SocketAddr}; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; -use postgres_client::config::{AuthKeys, SslMode}; +use postgres_client::config::{AuthKeys, ChannelBinding, SslMode}; use postgres_client::maybe_tls_stream::MaybeTlsStream; use postgres_client::tls::MakeTlsConnect; use postgres_client::{NoTls, RawCancelToken, RawConnection}; @@ -129,6 +129,8 @@ pub(crate) struct AuthInfo { auth: Option, server_params: StartupMessageParams, + channel_binding: ChannelBinding, + /// Console redirect sets user and database, we shouldn't re-use those from the params. skip_db_user: bool, } @@ -152,6 +154,8 @@ impl AuthInfo { auth: pw.map(|pw| Auth::Password(pw.as_bytes().to_owned())), server_params, skip_db_user: true, + // pg-sni-router is a mitm so this would fail. + channel_binding: ChannelBinding::Disable, } } @@ -165,6 +169,7 @@ impl AuthInfo { }, server_params: StartupMessageParams::default(), skip_db_user: false, + channel_binding: ChannelBinding::Prefer, } } } @@ -187,6 +192,7 @@ impl AuthInfo { Some(Auth::Password(pw)) => config.password(pw), None => &mut config, }; + config.channel_binding(self.channel_binding); for (k, v) in self.server_params.iter() { config.set_param(k, v); } @@ -241,7 +247,9 @@ impl AuthInfo { let tmp_config = self.enrich(tmp_config); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); - let connection = tmp_config.connect_raw(&mut compute.stream, NoTls).await?; + let connection = tmp_config + .tls_and_authenticate(&mut compute.stream, NoTls) + .await?; drop(pause); let RawConnection { diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index c92ee49b8d..67dd0ab522 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -169,7 +169,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> { .dbname("db") .password("password") .ssl_mode(SslMode::Require) - .connect_raw(server, client_config.make_tls_connect()?) + .tls_and_authenticate(server, client_config.make_tls_connect()?) .await?; proxy.await? @@ -252,7 +252,7 @@ async fn connect_failure( .dbname("db") .password("password") .ssl_mode(SslMode::Require) - .connect_raw(server, client_config.make_tls_connect()?) + .tls_and_authenticate(server, client_config.make_tls_connect()?) .await .err() .context("client shouldn't be able to connect")?; diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 12de5cbc09..29a269208a 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -199,7 +199,7 @@ async fn handshake_tls_is_enforced_by_proxy() -> anyhow::Result<()> { .user("john_doe") .dbname("earth") .ssl_mode(SslMode::Disable) - .connect_raw(server, NoTls) + .tls_and_authenticate(server, NoTls) .await .err() // -> Option .context("client shouldn't be able to connect")?; @@ -228,7 +228,7 @@ async fn handshake_tls() -> anyhow::Result<()> { .user("john_doe") .dbname("earth") .ssl_mode(SslMode::Require) - .connect_raw(server, client_config.make_tls_connect()?) + .tls_and_authenticate(server, client_config.make_tls_connect()?) .await?; proxy.await? 
@@ -245,7 +245,7 @@ async fn handshake_raw() -> anyhow::Result<()> { .dbname("earth") .set_param("options", "project=generic-project-name") .ssl_mode(SslMode::Prefer) - .connect_raw(server, NoTls) + .tls_and_authenticate(server, NoTls) .await?; proxy.await? @@ -293,7 +293,7 @@ async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> { .dbname("db") .password(password) .ssl_mode(SslMode::Require) - .connect_raw(server, client_config.make_tls_connect()?) + .tls_and_authenticate(server, client_config.make_tls_connect()?) .await?; proxy.await? @@ -317,7 +317,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> { .dbname("db") .password("password") .ssl_mode(SslMode::Require) - .connect_raw(server, client_config.make_tls_connect()?) + .tls_and_authenticate(server, client_config.make_tls_connect()?) .await?; proxy.await? @@ -344,7 +344,7 @@ async fn scram_auth_mock() -> anyhow::Result<()> { .dbname("db") .password(&password) // no password will match the mocked secret .ssl_mode(SslMode::Require) - .connect_raw(server, client_config.make_tls_connect()?) + .tls_and_authenticate(server, client_config.make_tls_connect()?) .await .err() // -> Option .context("client shouldn't be able to connect")?; From 517a3d0d86303ee3084ffd99ffac7042e2eca5c2 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 25 Jun 2025 15:19:20 +0100 Subject: [PATCH 117/124] [proxy]: BatchQueue::call is not cancel safe - make it directly cancellation aware (#12345) ## Problem https://github.com/neondatabase/cloud/issues/30539 If the current leader cancels the `call` function, then it has removed the jobs from the queue, but will never finish sending the responses. Because of this, it is not cancellation safe. ## Summary of changes Document these functions as not cancellation safe. Move cancellation of the queued jobs into the queue itself. ## Alternatives considered 1. We could spawn the task that runs the batch, since that won't get cancelled. * This requires `fn call(self: Arc)` or `fn call(&'static self)`. 2. We could add another scopeguard and return the requests back to the queue. * This requires that requests are always retry safe, and also requires requests to be `Clone`. --- proxy/src/batch.rs | 80 +++++++++++++++++++++++++----------- proxy/src/cancellation.rs | 85 +++++++++++++++++++++++---------------- proxy/src/redis/keys.rs | 3 +- 3 files changed, 109 insertions(+), 59 deletions(-) diff --git a/proxy/src/batch.rs b/proxy/src/batch.rs index 61bdf2b747..33e08797f2 100644 --- a/proxy/src/batch.rs +++ b/proxy/src/batch.rs @@ -6,7 +6,6 @@ use std::collections::BTreeMap; use std::pin::pin; use std::sync::Mutex; -use futures::future::Either; use scopeguard::ScopeGuard; use tokio::sync::oneshot::error::TryRecvError; @@ -49,37 +48,67 @@ impl BatchQueue

{ } } - pub async fn call(&self, req: P::Req) -> P::Res { + /// Perform a single request-response process, this may be batched internally. + /// + /// This function is not cancel safe. + pub async fn call( + &self, + req: P::Req, + cancelled: impl Future, + ) -> Result { let (id, mut rx) = self.inner.lock_propagate_poison().register_job(req); - let guard = scopeguard::guard(id, move |id| { - let mut inner = self.inner.lock_propagate_poison(); - if inner.queue.remove(&id).is_some() { - tracing::debug!("batched task cancelled before completion"); - } - }); + let mut cancelled = pin!(cancelled); let resp = loop { // try become the leader, or try wait for success. - let mut processor = match futures::future::select(rx, pin!(self.processor.lock())).await - { - // we got the resp. - Either::Left((resp, _)) => break resp.ok(), - // we are the leader. - Either::Right((p, rx_)) => { - rx = rx_; - p - } + let mut processor = tokio::select! { + // try become leader. + p = self.processor.lock() => p, + // wait for success. + resp = &mut rx => break resp.ok(), + // wait for cancellation. + cancel = cancelled.as_mut() => { + let mut inner = self.inner.lock_propagate_poison(); + if inner.queue.remove(&id).is_some() { + tracing::warn!("batched task cancelled before completion"); + } + return Err(cancel); + }, }; + tracing::debug!(id, "batch: became leader"); let (reqs, resps) = self.inner.lock_propagate_poison().get_batch(&processor); + // snitch incase the task gets cancelled. + let cancel_safety = scopeguard::guard((), |()| { + if !std::thread::panicking() { + tracing::error!( + id, + "batch: leader cancelled, despite not being cancellation safe" + ); + } + }); + // apply a batch. + // if this is cancelled, jobs will not be completed and will panic. let values = processor.apply(reqs).await; + // good: we didn't get cancelled. + ScopeGuard::into_inner(cancel_safety); + + if values.len() != resps.len() { + tracing::error!( + "batch: invalid response size, expected={}, got={}", + resps.len(), + values.len() + ); + } + // send response values. for (tx, value) in std::iter::zip(resps, values) { - // sender hung up but that's fine. - drop(tx.send(value)); + if tx.send(value).is_err() { + // receiver hung up but that's fine. + } } match rx.try_recv() { @@ -98,10 +127,9 @@ impl BatchQueue

{ } }; - // already removed. - ScopeGuard::into_inner(guard); + tracing::debug!(id, "batch: job completed"); - resp.expect("no response found. batch processer should not panic") + Ok(resp.expect("no response found. batch processer should not panic")) } } @@ -125,6 +153,8 @@ impl BatchQueueInner

{ self.queue.insert(id, BatchJob { req, res: tx }); + tracing::debug!(id, "batch: registered job in the queue"); + (id, rx) } @@ -132,15 +162,19 @@ impl BatchQueueInner

{ let batch_size = p.batch_size(self.queue.len()); let mut reqs = Vec::with_capacity(batch_size); let mut resps = Vec::with_capacity(batch_size); + let mut ids = Vec::with_capacity(batch_size); while reqs.len() < batch_size { - let Some((_, job)) = self.queue.pop_first() else { + let Some((id, job)) = self.queue.pop_first() else { break; }; reqs.push(job.req); resps.push(job.res); + ids.push(id); } + tracing::debug!(ids=?ids, "batch: acquired jobs"); + (reqs, resps) } } diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 036f36c7f6..ffc0cf43f1 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -1,5 +1,6 @@ use std::convert::Infallible; use std::net::{IpAddr, SocketAddr}; +use std::pin::pin; use std::sync::{Arc, OnceLock}; use std::time::Duration; @@ -98,7 +99,6 @@ impl Pipeline { impl CancelKeyOp { fn register(&self, pipe: &mut Pipeline) { - #[allow(clippy::used_underscore_binding)] match self { CancelKeyOp::StoreCancelKey { key, value, expire } => { let key = KeyPrefix::Cancel(*key).build_redis_key(); @@ -224,6 +224,7 @@ impl CancellationHandler { } } + /// This is not cancel safe async fn get_cancel_key( &self, key: CancelKeyData, @@ -240,16 +241,21 @@ impl CancellationHandler { }; const TIMEOUT: Duration = Duration::from_secs(5); - let result = timeout(TIMEOUT, tx.call((guard, op))) - .await - .map_err(|_| { - tracing::warn!("timed out waiting to receive GetCancelData response"); - CancelError::RateLimit - })? - .map_err(|e| { - tracing::warn!("failed to receive GetCancelData response: {e}"); - CancelError::InternalError - })?; + let result = timeout( + TIMEOUT, + tx.call((guard, op), std::future::pending::()), + ) + .await + .map_err(|_| { + tracing::warn!("timed out waiting to receive GetCancelData response"); + CancelError::RateLimit + })? + // cannot be cancelled + .unwrap_or_else(|x| match x {}) + .map_err(|e| { + tracing::warn!("failed to receive GetCancelData response: {e}"); + CancelError::InternalError + })?; let cancel_state_str = String::from_owned_redis_value(result).map_err(|e| { tracing::warn!("failed to receive GetCancelData response: {e}"); @@ -271,6 +277,8 @@ impl CancellationHandler { /// Will fetch IP allowlist internally. /// /// return Result primarily for tests + /// + /// This is not cancel safe pub(crate) async fn cancel_session( &self, key: CancelKeyData, @@ -394,6 +402,8 @@ impl Session { /// Ensure the cancel key is continously refreshed, /// but stop when the channel is dropped. + /// + /// This is not cancel safe pub(crate) async fn maintain_cancel_key( &self, session_id: uuid::Uuid, @@ -401,27 +411,6 @@ impl Session { cancel_closure: &CancelClosure, compute_config: &ComputeConfig, ) { - futures::future::select( - std::pin::pin!(self.maintain_redis_cancel_key(cancel_closure)), - cancel, - ) - .await; - - if let Err(err) = cancel_closure - .try_cancel_query(compute_config) - .boxed() - .await - { - tracing::warn!( - ?session_id, - ?err, - "could not cancel the query in the database" - ); - } - } - - // Ensure the cancel key is continously refreshed. - async fn maintain_redis_cancel_key(&self, cancel_closure: &CancelClosure) -> ! { let Some(tx) = self.cancellation_handler.tx.get() else { tracing::warn!("cancellation handler is not available"); // don't exit, as we only want to exit if cancelled externally. 
@@ -432,6 +421,8 @@ impl Session { .expect("serialising to json string should not fail") .into_boxed_str(); + let mut cancel = pin!(cancel); + loop { let guard = Metrics::get() .proxy @@ -449,9 +440,35 @@ impl Session { "registering cancellation key" ); - if tx.call((guard, op)).await.is_ok() { - tokio::time::sleep(CANCEL_KEY_REFRESH).await; + match tx.call((guard, op), cancel.as_mut()).await { + Ok(Ok(_)) => { + tracing::debug!( + src=%self.key, + dest=?cancel_closure.cancel_token, + "registered cancellation key" + ); + + // wait before continuing. + tokio::time::sleep(CANCEL_KEY_REFRESH).await; + } + // retry immediately. + Ok(Err(error)) => { + tracing::warn!(?error, "error registering cancellation key"); + } + Err(Err(_cancelled)) => break, } } + + if let Err(err) = cancel_closure + .try_cancel_query(compute_config) + .boxed() + .await + { + tracing::warn!( + ?session_id, + ?err, + "could not cancel the query in the database" + ); + } } } diff --git a/proxy/src/redis/keys.rs b/proxy/src/redis/keys.rs index b453e6851c..ffb7bc876b 100644 --- a/proxy/src/redis/keys.rs +++ b/proxy/src/redis/keys.rs @@ -23,9 +23,8 @@ impl KeyPrefix { #[cfg(test)] mod tests { - use crate::pqproto::id_to_cancel_key; - use super::*; + use super::*; + use crate::pqproto::id_to_cancel_key; #[test] fn test_build_redis_key() { From 6c77638ea15be150a128ce3d09823dfaafb966fc Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 25 Jun 2025 10:58:18 -0400 Subject: [PATCH 118/124] feat(storcon): retrieve feature flag and pass to pageservers (#12324) ## Problem part of https://github.com/neondatabase/neon/issues/11813 ## Summary of changes It costs $$$ for every pageserver to retrieve the feature flags directly from PostHog. Therefore, this patch adds new APIs so that the storcon retrieves the spec and pushes it to the pageservers. * Storcon retrieves the feature flag and sends it to the pageservers. * If the feature flag gets updated outside of the normal refresh loop of the pageserver, the pageserver won't fetch the flags on its own as long as the time since the last update is <= refresh_period.
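The second bullet boils down to a small staleness check; a hypothetical helper for illustration only (the actual logic lives in the `background_loop.rs` hunk below):

```rust
use std::time::{Duration, SystemTime};

/// Illustrative only: fall back to fetching the spec from PostHog directly
/// only when the spec last pushed by the storage controller has gone stale.
fn should_fetch_directly(last_update: SystemTime, refresh_period: Duration) -> bool {
    match last_update.elapsed() {
        Ok(elapsed) => elapsed >= refresh_period,
        // The clock went backwards; be conservative and refresh.
        Err(_) => true,
    }
}
```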
Signed-off-by: Alex Chi Z --- Cargo.lock | 2 + control_plane/src/local_env.rs | 4 + control_plane/src/storage_controller.rs | 17 ++- libs/pageserver_api/src/config.rs | 7 +- .../src/background_loop.rs | 61 ++++++--- libs/posthog_client_lite/src/lib.rs | 30 +++-- pageserver/client/src/mgmt_api.rs | 9 ++ pageserver/src/feature_resolver.rs | 7 ++ pageserver/src/http/routes.rs | 17 +++ storage_controller/Cargo.toml | 2 + storage_controller/src/main.rs | 40 ++++++ storage_controller/src/pageserver_client.rs | 9 ++ storage_controller/src/service.rs | 5 + .../src/service/feature_flag.rs | 117 ++++++++++++++++++ 14 files changed, 294 insertions(+), 33 deletions(-) create mode 100644 storage_controller/src/service/feature_flag.rs diff --git a/Cargo.lock b/Cargo.lock index 51724da061..1fee728d9c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6815,6 +6815,7 @@ dependencies = [ "hex", "http-utils", "humantime", + "humantime-serde", "hyper 0.14.30", "itertools 0.10.5", "json-structural-diff", @@ -6825,6 +6826,7 @@ dependencies = [ "pageserver_api", "pageserver_client", "postgres_connection", + "posthog_client_lite", "rand 0.8.5", "regex", "reqwest", diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 370921a85c..16cd2d8c08 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -12,6 +12,7 @@ use std::{env, fs}; use anyhow::{Context, bail}; use clap::ValueEnum; +use pageserver_api::config::PostHogConfig; use pem::Pem; use postgres_backend::AuthType; use reqwest::{Certificate, Url}; @@ -213,6 +214,8 @@ pub struct NeonStorageControllerConf { pub timeline_safekeeper_count: Option, + pub posthog_config: Option, + pub kick_secondary_downloads: Option, } @@ -245,6 +248,7 @@ impl Default for NeonStorageControllerConf { use_https_safekeeper_api: false, use_local_compute_notifications: true, timeline_safekeeper_count: None, + posthog_config: None, kick_secondary_downloads: None, } } diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 218b17d88d..dea7ae2ccf 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -642,6 +642,18 @@ impl StorageController { args.push(format!("--timeline-safekeeper-count={sk_cnt}")); } + let mut envs = vec![ + ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ]; + + if let Some(posthog_config) = &self.config.posthog_config { + envs.push(( + "POSTHOG_CONFIG".to_string(), + serde_json::to_string(posthog_config)?, + )); + } + println!("Starting storage controller"); background_process::start_process( @@ -649,10 +661,7 @@ impl StorageController { &instance_dir, &self.env.storage_controller_bin(), args, - vec![ - ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), - ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), - ], + envs, background_process::InitialPidFile::Create(self.pid_file(start_args.instance_id)), &start_args.start_timeout, || async { diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index cfb1190a27..76730c9ee6 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -63,7 +63,8 @@ impl Display for NodeMetadata { } } -/// PostHog integration config. +/// PostHog integration config. This is used in pageserver, storcon, and neon_local. +/// Ensure backward compatibility when adding new fields. 
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub struct PostHogConfig { /// PostHog project ID @@ -76,7 +77,9 @@ pub struct PostHogConfig { pub private_api_url: String, /// Public API URL pub public_api_url: String, - /// Refresh interval for the feature flag spec + /// Refresh interval for the feature flag spec. + /// The storcon will push the feature flag spec to the pageserver. If the pageserver does not receive + /// the spec for `refresh_interval`, it will fetch the spec from the PostHog API. #[serde(skip_serializing_if = "Option::is_none")] #[serde(with = "humantime_serde")] pub refresh_interval: Option, diff --git a/libs/posthog_client_lite/src/background_loop.rs b/libs/posthog_client_lite/src/background_loop.rs index dc813ccb4a..08cb0d2264 100644 --- a/libs/posthog_client_lite/src/background_loop.rs +++ b/libs/posthog_client_lite/src/background_loop.rs @@ -1,17 +1,22 @@ //! A background loop that fetches feature flags from PostHog and updates the feature store. -use std::{sync::Arc, time::Duration}; +use std::{ + sync::Arc, + time::{Duration, SystemTime}, +}; use arc_swap::ArcSwap; use tokio_util::sync::CancellationToken; use tracing::{Instrument, info_span}; -use crate::{CaptureEvent, FeatureStore, PostHogClient, PostHogClientConfig}; +use crate::{ + CaptureEvent, FeatureStore, LocalEvaluationResponse, PostHogClient, PostHogClientConfig, +}; /// A background loop that fetches feature flags from PostHog and updates the feature store. pub struct FeatureResolverBackgroundLoop { posthog_client: PostHogClient, - feature_store: ArcSwap, + feature_store: ArcSwap<(SystemTime, Arc)>, cancel: CancellationToken, } @@ -19,11 +24,35 @@ impl FeatureResolverBackgroundLoop { pub fn new(config: PostHogClientConfig, shutdown_pageserver: CancellationToken) -> Self { Self { posthog_client: PostHogClient::new(config), - feature_store: ArcSwap::new(Arc::new(FeatureStore::new())), + feature_store: ArcSwap::new(Arc::new(( + SystemTime::UNIX_EPOCH, + Arc::new(FeatureStore::new()), + ))), cancel: shutdown_pageserver, } } + /// Update the feature store with a new feature flag spec bypassing the normal refresh loop. 
+ pub fn update(&self, spec: String) -> anyhow::Result<()> { + let resp: LocalEvaluationResponse = serde_json::from_str(&spec)?; + self.update_feature_store_nofail(resp, "http_propagate"); + Ok(()) + } + + fn update_feature_store_nofail(&self, resp: LocalEvaluationResponse, source: &'static str) { + let project_id = self.posthog_client.config.project_id.parse::().ok(); + match FeatureStore::new_with_flags(resp.flags, project_id) { + Ok(feature_store) => { + self.feature_store + .store(Arc::new((SystemTime::now(), Arc::new(feature_store)))); + tracing::info!("Feature flag updated from {}", source); + } + Err(e) => { + tracing::warn!("Cannot process feature flag spec from {}: {}", source, e); + } + } + } + pub fn spawn( self: Arc, handle: &tokio::runtime::Handle, @@ -47,6 +76,17 @@ impl FeatureResolverBackgroundLoop { _ = ticker.tick() => {} _ = cancel.cancelled() => break } + { + let last_update = this.feature_store.load().0; + if let Ok(elapsed) = last_update.elapsed() { + if elapsed < refresh_period { + tracing::debug!( + "Skipping feature flag refresh because it's too soon" + ); + continue; + } + } + } let resp = match this .posthog_client .get_feature_flags_local_evaluation() @@ -58,16 +98,7 @@ impl FeatureResolverBackgroundLoop { continue; } }; - let project_id = this.posthog_client.config.project_id.parse::().ok(); - match FeatureStore::new_with_flags(resp.flags, project_id) { - Ok(feature_store) => { - this.feature_store.store(Arc::new(feature_store)); - tracing::info!("Feature flag updated"); - } - Err(e) => { - tracing::warn!("Cannot process feature flag spec: {}", e); - } - } + this.update_feature_store_nofail(resp, "refresh_loop"); } tracing::info!("PostHog feature resolver stopped"); } @@ -92,6 +123,6 @@ impl FeatureResolverBackgroundLoop { } pub fn feature_store(&self) -> Arc { - self.feature_store.load_full() + self.feature_store.load().1.clone() } } diff --git a/libs/posthog_client_lite/src/lib.rs b/libs/posthog_client_lite/src/lib.rs index f21047bcfc..d042ee2410 100644 --- a/libs/posthog_client_lite/src/lib.rs +++ b/libs/posthog_client_lite/src/lib.rs @@ -544,17 +544,8 @@ impl PostHogClient { self.config.server_api_key.starts_with("phs_") } - /// Fetch the feature flag specs from the server. - /// - /// This is unfortunately an undocumented API at: - /// - - /// - - /// - /// The handling logic in [`FeatureStore`] mostly follows the Python API implementation. - /// See `_compute_flag_locally` in - pub async fn get_feature_flags_local_evaluation( - &self, - ) -> anyhow::Result { + /// Get the raw JSON spec, same as `get_feature_flags_local_evaluation` but without parsing. + pub async fn get_feature_flags_local_evaluation_raw(&self) -> anyhow::Result { // BASE_URL/api/projects/:project_id/feature_flags/local_evaluation // with bearer token of self.server_api_key // OR @@ -588,7 +579,22 @@ impl PostHogClient { body )); } - Ok(serde_json::from_str(&body)?) + Ok(body) + } + + /// Fetch the feature flag specs from the server. + /// + /// This is unfortunately an undocumented API at: + /// - + /// - + /// + /// The handling logic in [`FeatureStore`] mostly follows the Python API implementation. + /// See `_compute_flag_locally` in + pub async fn get_feature_flags_local_evaluation( + &self, + ) -> Result { + let raw = self.get_feature_flags_local_evaluation_raw().await?; + Ok(serde_json::from_str(&raw)?) } /// Capture an event. 
This will only be used to report the feature flag usage back to PostHog, though diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 3919a6e788..af4be23b9b 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -844,4 +844,13 @@ impl Client { .await .map_err(Error::ReceiveBody) } + + pub async fn update_feature_flag_spec(&self, spec: String) -> Result<()> { + let uri = format!("{}/v1/feature_flag_spec", self.mgmt_api_endpoint); + self.request(Method::POST, uri, spec) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } } diff --git a/pageserver/src/feature_resolver.rs b/pageserver/src/feature_resolver.rs index b0a68dfc4d..92a9ef2880 100644 --- a/pageserver/src/feature_resolver.rs +++ b/pageserver/src/feature_resolver.rs @@ -31,6 +31,13 @@ impl FeatureResolver { } } + pub fn update(&self, spec: String) -> anyhow::Result<()> { + if let Some(inner) = &self.inner { + inner.update(spec)?; + } + Ok(()) + } + pub fn spawn( conf: &PageServerConf, shutdown_pageserver: CancellationToken, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 3755cbda6a..aa9bec657c 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -3743,6 +3743,20 @@ async fn force_override_feature_flag_for_testing_delete( json_response(StatusCode::OK, ()) } +async fn update_feature_flag_spec( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + check_permission(&request, None)?; + let body = json_request(&mut request).await?; + let state = get_state(&request); + state + .feature_resolver + .update(body) + .map_err(ApiError::InternalServerError)?; + json_response(StatusCode::OK, ()) +} + /// Common functionality of all the HTTP API handlers. 
/// /// - Adds a tracing span to each request (by `request_span`) @@ -4128,5 +4142,8 @@ pub fn make_router( .delete("/v1/feature_flag/:flag_key", |r| { testing_api_handler("force override feature flag - delete", r, force_override_feature_flag_for_testing_delete) }) + .post("/v1/feature_flag_spec", |r| { + api_handler(r, update_feature_flag_spec) + }) .any(handler_404)) } diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index c41e174d9d..3a0806b3b2 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -27,6 +27,7 @@ governor.workspace = true hex.workspace = true hyper0.workspace = true humantime.workspace = true +humantime-serde.workspace = true itertools.workspace = true json-structural-diff.workspace = true lasso.workspace = true @@ -34,6 +35,7 @@ once_cell.workspace = true pageserver_api.workspace = true pageserver_client.workspace = true postgres_connection.workspace = true +posthog_client_lite.workspace = true rand.workspace = true reqwest = { workspace = true, features = ["stream"] } routerify.workspace = true diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index ff134a4ebc..296a98e620 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -14,11 +14,13 @@ use http_utils::tls_certs::ReloadingCertificateResolver; use hyper0::Uri; use metrics::BuildInfo; use metrics::launch_timestamp::LaunchTimestamp; +use pageserver_api::config::PostHogConfig; use reqwest::Certificate; use storage_controller::http::make_router; use storage_controller::metrics::preinitialize_metrics; use storage_controller::persistence::Persistence; use storage_controller::service::chaos_injector::ChaosInjector; +use storage_controller::service::feature_flag::FeatureFlagService; use storage_controller::service::{ Config, HEARTBEAT_INTERVAL_DEFAULT, LONG_RECONCILE_THRESHOLD_DEFAULT, MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT, @@ -252,6 +254,8 @@ struct Secrets { peer_jwt_token: Option, } +const POSTHOG_CONFIG_ENV: &str = "POSTHOG_CONFIG"; + impl Secrets { const DATABASE_URL_ENV: &'static str = "DATABASE_URL"; const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN"; @@ -409,6 +413,18 @@ async fn async_main() -> anyhow::Result<()> { None => Vec::new(), }; + let posthog_config = if let Ok(json) = std::env::var(POSTHOG_CONFIG_ENV) { + let res: Result = serde_json::from_str(&json); + if let Ok(config) = res { + Some(config) + } else { + tracing::warn!("Invalid posthog config: {json}"); + None + } + } else { + None + }; + let config = Config { pageserver_jwt_token: secrets.pageserver_jwt_token, safekeeper_jwt_token: secrets.safekeeper_jwt_token, @@ -455,6 +471,7 @@ async fn async_main() -> anyhow::Result<()> { timelines_onto_safekeepers: args.timelines_onto_safekeepers, use_local_compute_notifications: args.use_local_compute_notifications, timeline_safekeeper_count: args.timeline_safekeeper_count, + posthog_config: posthog_config.clone(), #[cfg(feature = "testing")] kick_secondary_downloads: args.kick_secondary_downloads, }; @@ -537,6 +554,23 @@ async fn async_main() -> anyhow::Result<()> { ) }); + let feature_flag_task = if let Some(posthog_config) = posthog_config { + let service = service.clone(); + let cancel = CancellationToken::new(); + let cancel_bg = cancel.clone(); + let task = tokio::task::spawn( + async move { + let feature_flag_service = FeatureFlagService::new(service, posthog_config); + let feature_flag_service = Arc::new(feature_flag_service); + 
feature_flag_service.run(cancel_bg).await + } + .instrument(tracing::info_span!("feature_flag_service")), + ); + Some((task, cancel)) + } else { + None + }; + // Wait until we receive a signal let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?; let mut sigquit = tokio::signal::unix::signal(SignalKind::quit())?; @@ -584,6 +618,12 @@ async fn async_main() -> anyhow::Result<()> { chaos_jh.await.ok(); } + // If we were running the feature flag service, stop that so that we're not calling into Service while it shuts down + if let Some((feature_flag_task, feature_flag_cancel)) = feature_flag_task { + feature_flag_cancel.cancel(); + feature_flag_task.await.ok(); + } + service.shutdown().await; tracing::info!("Service shutdown complete"); diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index 817409e112..d6fe173eb3 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -376,4 +376,13 @@ impl PageserverClient { .await ) } + + pub(crate) async fn update_feature_flag_spec(&self, spec: String) -> Result<()> { + measured_request!( + "update_feature_flag_spec", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.update_feature_flag_spec(spec).await + ) + } } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 8424c27cf8..b4dfd01249 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1,5 +1,6 @@ pub mod chaos_injector; mod context_iterator; +pub mod feature_flag; pub(crate) mod safekeeper_reconciler; mod safekeeper_service; @@ -25,6 +26,7 @@ use futures::stream::FuturesUnordered; use http_utils::error::ApiError; use hyper::Uri; use itertools::Itertools; +use pageserver_api::config::PostHogConfig; use pageserver_api::controller_api::{ AvailabilityZone, MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, @@ -471,6 +473,9 @@ pub struct Config { /// Safekeepers will be choosen from different availability zones. 
pub timeline_safekeeper_count: i64, + /// PostHog integration config + pub posthog_config: Option, + #[cfg(feature = "testing")] pub kick_secondary_downloads: bool, } diff --git a/storage_controller/src/service/feature_flag.rs b/storage_controller/src/service/feature_flag.rs new file mode 100644 index 0000000000..645eb75237 --- /dev/null +++ b/storage_controller/src/service/feature_flag.rs @@ -0,0 +1,117 @@ +use std::{sync::Arc, time::Duration}; + +use futures::StreamExt; +use pageserver_api::config::PostHogConfig; +use pageserver_client::mgmt_api; +use posthog_client_lite::{PostHogClient, PostHogClientConfig}; +use reqwest::StatusCode; +use tokio::time::MissedTickBehavior; +use tokio_util::sync::CancellationToken; + +use crate::{pageserver_client::PageserverClient, service::Service}; + +pub struct FeatureFlagService { + service: Arc, + config: PostHogConfig, + client: PostHogClient, + http_client: reqwest::Client, +} + +const DEFAULT_POSTHOG_REFRESH_INTERVAL: Duration = Duration::from_secs(30); + +impl FeatureFlagService { + pub fn new(service: Arc, config: PostHogConfig) -> Self { + let client = PostHogClient::new(PostHogClientConfig { + project_id: config.project_id.clone(), + server_api_key: config.server_api_key.clone(), + client_api_key: config.client_api_key.clone(), + private_api_url: config.private_api_url.clone(), + public_api_url: config.public_api_url.clone(), + }); + Self { + service, + config, + client, + http_client: reqwest::Client::new(), + } + } + + async fn refresh(self: Arc, cancel: CancellationToken) -> Result<(), anyhow::Error> { + let nodes = { + let inner = self.service.inner.read().unwrap(); + inner.nodes.clone() + }; + + let feature_flag_spec = self.client.get_feature_flags_local_evaluation_raw().await?; + let stream = futures::stream::iter(nodes.values().cloned()).map(|node| { + let this = self.clone(); + let feature_flag_spec = feature_flag_spec.clone(); + async move { + let res = async { + let client = PageserverClient::new( + node.get_id(), + this.http_client.clone(), + node.base_url(), + // TODO: what if we rotate the token during storcon lifetime? + this.service.config.pageserver_jwt_token.as_deref(), + ); + + client.update_feature_flag_spec(feature_flag_spec).await?; + tracing::info!( + "Updated {}({}) with feature flag spec", + node.get_id(), + node.base_url() + ); + Ok::<_, mgmt_api::Error>(()) + }; + + if let Err(e) = res.await { + if let mgmt_api::Error::ApiError(status, _) = e { + if status == StatusCode::NOT_FOUND { + // This is expected during deployments where the API is not available, so we can ignore it + return; + } + } + tracing::warn!( + "Failed to update feature flag spec for {}: {e}", + node.get_id() + ); + } + } + }); + let mut stream = stream.buffer_unordered(8); + + while stream.next().await.is_some() { + if cancel.is_cancelled() { + return Ok(()); + } + } + + Ok(()) + } + + pub async fn run(self: Arc, cancel: CancellationToken) { + let refresh_interval = self + .config + .refresh_interval + .unwrap_or(DEFAULT_POSTHOG_REFRESH_INTERVAL); + let mut interval = tokio::time::interval(refresh_interval); + interval.set_missed_tick_behavior(MissedTickBehavior::Skip); + tracing::info!( + "Starting feature flag service with refresh interval: {:?}", + refresh_interval + ); + loop { + tokio::select! 
{ + _ = interval.tick() => {} + _ = cancel.cancelled() => { + break; + } + } + let res = self.clone().refresh(cancel.clone()).await; + if let Err(e) = res { + tracing::error!("Failed to refresh feature flags: {e:#?}"); + } + } + } +} From 1d49eefbbba840b176e1ea57285e6fb3cdf851e7 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Wed, 25 Jun 2025 18:25:57 +0200 Subject: [PATCH 119/124] RFC: Endpoint Persistent Unlogged Files Storage (#9661) ## Summary A design for a storage system that allows storage of files required to make Neon's Endpoints have a better experience at or after a reboot. ## Motivation Several systems inside PostgreSQL (and Neon) need some persistent storage for optimal workings across reboots and restarts, but still work without. Examples are the cumulative statistics file in `pg_stat/global.stat`, `pg_stat_statements`' `pg_stat/pg_stat_statements.stat`, and `pg_prewarm`'s `autoprewarm.blocks`. We need a storage system that can store and manage these files for each Endpoint. [GH rendered file](https://github.com/neondatabase/neon/blob/MMeent/rfc-unlogged-file/docs/rfcs/040-Endpoint-Persistent-Unlogged-Files-Storage.md) Part of https://github.com/neondatabase/cloud/issues/24225 --- ...point-Persistent-Unlogged-Files-Storage.md | 396 ++++++++++++++++++ 1 file changed, 396 insertions(+) create mode 100644 docs/rfcs/040-Endpoint-Persistent-Unlogged-Files-Storage.md diff --git a/docs/rfcs/040-Endpoint-Persistent-Unlogged-Files-Storage.md b/docs/rfcs/040-Endpoint-Persistent-Unlogged-Files-Storage.md new file mode 100644 index 0000000000..182b2682af --- /dev/null +++ b/docs/rfcs/040-Endpoint-Persistent-Unlogged-Files-Storage.md @@ -0,0 +1,396 @@ +# Memo: Endpoint Persistent Unlogged Files Storage +Created on 2024-11-05 +Implemented on N/A + +## Summary +A design for a storage system that allows storage of files required to make +Neon's Endpoints have a better experience at or after a reboot. + +## Motivation +Several systems inside PostgreSQL (and Neon) need some persistent storage for +optimal workings across reboots and restarts, but still work without. +Examples are the query-level statistics files of `pg_stat_statements` in +`pg_stat/pg_stat_statements.stat`, and `pg_prewarm`'s `autoprewarm.blocks`. +We need a storage system that can store and manage these files for each +Endpoint, without necessarily granting users access to an unlimited storage +device. + +## Goals +- Store known files for Endpoints with reasonable persistence. + _Data loss in this service, while annoying and bad for UX, won't lose any + customer's data._ + +## Non Goals (if relevant) +- This storage system does not need branching, file versioning, or other such + features. The files are as ephemeral to the timeline of the data as the + Endpoints that host the data. +- This storage system does not need to store _all_ user files, only 'known' + user files. +- This storage system does not need to be hosted fully inside Computes. + _Instead, this will be a separate component similar to Pageserver, + SafeKeeper, the S3 proxy used for dynamically loaded extensions, etc._ + +## Impacted components +- Compute needs new code to load and store these files in its lifetime. +- Control Plane needs to consider this new storage system when signalling + the deletion of an Endpoint, Timeline, or Tenant. +- Control Plane needs to consider this new storage system when it resets + or re-assigns an endpoint's timeline/branch state. + +A new service is created: the Endpoint Persistent Unlogged Files Storage +service. 
This could be integrated in e.g. Pageserver or Control Plane, or a +separately hosted service. + +## Proposed implementation +Endpoint-related data files are managed by a newly designed service (which +optionally is integrated in an existing service like Pageserver or Control +Plane), which stores data directly into S3 or any blob storage of choice. + +Upon deletion of the Endpoint, or reassignment of the endpoint to a different +branch, this ephemeral data is dropped: the data stored may not match the +state of the branch's data after reassignment, and on endpoint deletion the +data won't have any use to the user. + +Compute gets credentials (JWT token with Tenant, Timeline & Endpoint claims) +which it can use to authenticate to this new service and retrieve and store +data associated with this endpoint. This limited scope reduces leaks of data +across endpoints and timeline resets, and limits the ability of endpoints to +mess with other endpoints' data. + +The path of this endpoint data in S3 is initially as follows: + + s3:/// + tenants/ + / + tenants/ + / + endpoints/ + / + pgdata/ + + +For other blob storages an equivalent or similar path can be constructed. + +### Reliability, failure modes and corner cases (if relevant) +Reliability is important, but not critical to the workings of Neon. The data +stored in this service will, when lost, reduce performance, but won't be a +cause of permanent data loss - only operational metadata is stored. + +Most, if not all, blob storage services have sufficiently high persistence +guarantees to cater our need for persistence and uptime. The only concern with +blob storages is that the access latency is generally higher than local disk, +but for the object types stored (cache state, ...) I don't think this will be +much of an issue. + +### Interaction/Sequence diagram (if relevant) + +In these diagrams you can replace S3 with any persistent storage device of +choice, but S3 is chosen as representative name: The well-known and short name +of AWS' blob storage. Azure Blob Storage should work too, but it has a much +longer name making it less practical for the diagrams. + +Write data: + +```http +POST /tenants//timelines//endpoints//pgdata/ +Host: epufs.svc.neon.local + +<<< + +200 OK +{ + "version": "", # opaque file version token, changes when the file contents change + "size": , +} +``` + +```mermaid +sequenceDiagram + autonumber + participant co as Compute + participant ep as EPUFS + participant s3 as Blob Storage + + co-->ep: Connect with credentials + co->>+ep: Store Unlogged Persistent File + opt is authenticated + ep->>s3: Write UPF to S3 + end + ep->>-co: OK / Failure / Auth Failure + co-->ep: Cancel connection +``` + +Read data: (optional with cache-relevant request parameters, e.g. If-Modified-Since) +```http +GET /tenants//timelines//endpoints//pgdata/ +Host: epufs.svc.neon.local + +<<< + +200 OK + + +``` + +```mermaid +sequenceDiagram + autonumber + participant co as Compute + participant ep as EPUFS + participant s3 as Blob Storage + + co->>+ep: Read Unlogged Persistent File + opt is authenticated + ep->>+s3: Request UPF from storage + s3->>-ep: Receive UPF from storage + end + ep->>-co: OK(response) / Failure(storage, auth, ...) 
+``` + +Compute Startup: +```mermaid +sequenceDiagram + autonumber + participant co as Compute + participant ps as Pageserver + participant ep as EPUFS + participant es as Extension server + + note over co: Bind endpoint ep-xxx + par Get basebackup + co->>+ps: Request basebackup @ LSN + ps-)ps: Construct basebackup + ps->>-co: Receive basebackup TAR @ LSN + and Get startup-critical Unlogged Persistent Files + co->>+ep: Get all UPFs of endpoint ep-xxx + ep-)ep: Retrieve and gather all UPFs + ep->>-co: TAR of UPFs + and Get startup-critical extensions + loop For every startup-critical extension + co->>es: Get critical extension + es->>co: Receive critical extension + end + end + note over co: Start compute +``` + +CPlane ops: +```http +DELETE /tenants//timelines//endpoints/ +Host: epufs.svc.neon.local + +<<< + +200 OK +{ + "tenant": "", + "timeline": "", + "endpoint": "", + "deleted": { + "files": , + "bytes": , + }, +} +``` + +```http +DELETE /tenants//timelines/ +Host: epufs.svc.neon.local + +<<< + +200 OK +{ + "tenant": "", + "timeline": "", + "deleted": { + "files": , + "bytes": , + }, +} +``` + +```http +DELETE /tenants/ +Host: epufs.svc.neon.local + +<<< + +200 OK +{ + "tenant": "", + "deleted": { + "files": , + "bytes": , + }, +} +``` + +```mermaid +sequenceDiagram + autonumber + participant cp as Control Plane + participant ep as EPUFS + participant s3 as Blob Storage + + alt Tenant deleted + cp-)ep: Tenant deleted + loop For every object associated with removed tenant + ep->>s3: Remove data of deleted tenant from Storage + end + opt + ep-)cp: Tenant cleanup complete + end + alt Timeline deleted + cp-)ep: Timeline deleted + loop For every object associated with removed timeline + ep->>s3: Remove data of deleted timeline from Storage + end + opt + ep-)cp: Timeline cleanup complete + end + else Endpoint reassigned or removed + cp->>+ep: Endpoint reassigned + loop For every object associated with reassigned/removed endpoint + ep->>s3: Remove data from Storage + end + ep->>-cp: Cleanup complete + end +``` + +### Scalability (if relevant) + +Provisionally: As this service is going to be part of compute startup, this +service should be able to quickly respond to all requests. Therefore this +service is deployed to every AZ we host Computes in, and Computes communicate +(generally) only to the EPUFS endpoint of the AZ they're hosted in. + +Local caching of frequently restarted endpoints' data or metadata may be +needed for best performance. However, due to the regional nature of stored +data but zonal nature of the service deployment, we should be careful when we +implement any local caching, as it is possible that computes in AZ 1 will +update data originally written and thus cached by AZ 2. Cache version tests +and invalidation is therefore required if we want to roll out caching to this +service, which is too broad a scope for an MVC. This is why caching is left +out of scope for this RFC, and should be considered separately after this RFC +is implemented. + +### Security implications (if relevant) +This service must be able to authenticate users at least by Tenant ID, +Timeline ID and Endpoint ID. This will use the existing JWT infrastructure of +Compute, which will be upgraded to the extent needed to support Timeline- and +Endpoint-based claims. + +The service requires unlimited access to (a prefix of) a blob storage bucket, +and thus must be hosted outside the Compute VM sandbox. 
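
To make the token scoping described above concrete, here is a minimal sketch of the check the service could perform before touching storage. It is illustrative only: the claim field names and function shape are assumptions of this sketch, not something specified by this RFC.

```rust
use serde::Deserialize;

/// Hypothetical shape of the scoped JWT claims (field names are illustrative).
#[derive(Deserialize)]
struct EpufsClaims {
    tenant_id: String,
    timeline_id: String,
    endpoint_id: String,
}

/// Allow the operation only if the IDs baked into the compute's token match
/// the IDs addressed by the request path; this is what confines an endpoint
/// to its own prefix of the bucket.
fn is_authorized(claims: &EpufsClaims, tenant: &str, timeline: &str, endpoint: &str) -> bool {
    claims.tenant_id == tenant
        && claims.timeline_id == timeline
        && claims.endpoint_id == endpoint
}
```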
+ +A service that generates pre-signed request URLs for Compute to download the +data from that URL is likely problematic, too: Compute would be able to write +unlimited data to the bucket, or exfiltrate this signed URL to get read/write +access to specific objects in this bucket, which would still effectively give +users access to the S3 bucket (but with improved access logging). + +There may be a use case for transferring data associated with one endpoint to +another endpoint (e.g. to make one endpoint warm its caches with the state of +another endpoint), but that's not currently in scope, and specific needs may +be solved through out-of-line communication of data or pre-signed URLs. + +### Unresolved questions (if relevant) +Caching of files is not in the implementation scope of the document, but +should at some future point be considered to maximize performance. + +## Alternative implementation (if relevant) +Several ideas have come up to solve this issue: + +### Use AUXfile +One prevalent idea was to WAL-log the files using our AUXfile mechanism. + +Benefits: + ++ We already have this storage mechanism + +Demerits: + +- It isn't available on read replicas +- Additional WAL will be consumed during shutdown and after the shutdown + checkpoint, which needs PG modifications to work without panics. +- It increases the data we need to manage in our versioned storage, thus + causing higher storage costs with higher retention due to duplication at + the storage layer. + +### Sign URLs for read/write operations, instead of proxying them + +Benefits: + ++ The service can be implemented with a much reduced IO budget + +Demerits: + +- Users could get access to these signed credentials +- Not all blob storage services may implement URL signing + +### Give endpoints each their own directly accessed block volume + +Benefits: + ++ Easier to integrate for PostgreSQL + +Demerits: + +- Little control on data size and contents +- Potentially problematic as we'd need to store data all across the pgdata + directory. +- EBS is not a good candidate + - Attaches in 10s of seconds, if not more; i.e. too cold to start + - Shared EBS volumes are a no-go, as you'd have to schedule the endpoint + with users of the same EBS volumes, which can't work with VM migration + - EBS storage costs are very high (>80$/kilotenant when using a + volume/tenant) + - EBS volumes can't be mounted across AZ boundaries +- Bucket per endpoint is unfeasible + - S3 buckets are priced at $20/month per 1k, which we could better spend + on developers. + - Allocating service accounts takes time (100s of ms), and service accounts + are a limited resource, too; so they're not a good candidate to allocate + on a per-endpoint basis. + - Giving credentials limited to prefix has similar issues as the pre-signed + URL approach. + - Bucket DNS lookup will fill DNS caches and put pressure on DNS lookup + much more than our current systems would. +- Volumes bound by hypervisor are unlikely + - This requires significant investment and increased software on the + hypervisor. + - It is unclear if we can attach volumes after boot, i.e. for pooled + instances. + +### Put the files into a table + +Benefits: + + + Mostly already available in PostgreSQL + +Demerits: + + - Uses WAL + - Can't be used after shutdown checkpoint + - Needs a RW endpoint, and table & catalog access to write to this data + - Gets hit with DB size limitations + - Depending on user acces: + - Inaccessible: + The user doesn't have control over database size caused by + these systems. 
+ - Accessible: + The user can corrupt these files and cause the system to crash while + user-corrupted files are present, thus increasing on-call overhead. + +## Definition of Done (if relevant) + +This project is done if we have: + +- One S3 bucket equivalent per region, which stores this per-endpoint data. +- A new service endpoint in at least every AZ, which indirectly grants + endpoints access to the data stored for these endpoints in these buckets. +- Compute writes & reads temp-data at shutdown and startup, respectively, for + at least the pg_prewarm or lfc_prewarm state files. +- Cleanup of endpoint data is triggered when the endpoint is deleted or is + detached from its current timeline. From f755979102f4a5086253e5eaab5e48d8af7995cc Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 25 Jun 2025 20:16:23 +0200 Subject: [PATCH 120/124] pageserver: payload compression for gRPC base backups (#12346) ## Problem gRPC base backups use gRPC compression. However, this has two problems: * Base backup caching will cache compressed base backups (making gRPC compression pointless). * Tonic does not support varying the compression level, and zstd default level is 10% slower than gzip fastest level. Touches https://github.com/neondatabase/neon/issues/11728. Touches https://github.com/neondatabase/cloud/issues/29353. ## Summary of changes This patch adds a gRPC parameter `BaseBackupRequest::compression` specifying the compression algorithm. It also moves compression into `send_basebackup_tarball` to reduce code duplication. A follow-up PR will integrate the base backup cache with gRPC. --- pageserver/page_api/proto/page_service.proto | 13 ++++ pageserver/page_api/src/client.rs | 1 - pageserver/page_api/src/model.rs | 59 ++++++++++++++- pageserver/pagebench/src/cmd/basebackup.rs | 10 ++- pageserver/src/basebackup.rs | 77 +++++++++++++------- pageserver/src/basebackup_cache.rs | 14 +--- pageserver/src/page_service.rs | 44 +++++------ 7 files changed, 149 insertions(+), 69 deletions(-) diff --git a/pageserver/page_api/proto/page_service.proto b/pageserver/page_api/proto/page_service.proto index 81953a710f..d06b2cfca5 100644 --- a/pageserver/page_api/proto/page_service.proto +++ b/pageserver/page_api/proto/page_service.proto @@ -110,6 +110,19 @@ message GetBaseBackupRequest { bool replica = 2; // If true, include relation files in the base backup. Mainly for debugging and tests. bool full = 3; + // Compression algorithm to use. Base backups send a compressed payload instead of using gRPC + // compression, so that we can cache compressed backups on the server. + BaseBackupCompression compression = 4; +} + +// Base backup compression algorithms. +enum BaseBackupCompression { + // Unknown algorithm. Used when clients send an unsupported algorithm. + BASE_BACKUP_COMPRESSION_UNKNOWN = 0; + // No compression. + BASE_BACKUP_COMPRESSION_NONE = 1; + // GZIP compression. + BASE_BACKUP_COMPRESSION_GZIP = 2; } // Base backup response chunk, returned as an ordered stream. diff --git a/pageserver/page_api/src/client.rs b/pageserver/page_api/src/client.rs index 3977ce7c23..71d539ab91 100644 --- a/pageserver/page_api/src/client.rs +++ b/pageserver/page_api/src/client.rs @@ -95,7 +95,6 @@ impl Client { if let Some(compression) = compression { // TODO: benchmark this (including network latency). - // TODO: consider enabling compression by default. 
client = client .accept_compressed(compression) .send_compressed(compression); diff --git a/pageserver/page_api/src/model.rs b/pageserver/page_api/src/model.rs index 6efa742799..1ca89b4870 100644 --- a/pageserver/page_api/src/model.rs +++ b/pageserver/page_api/src/model.rs @@ -191,15 +191,21 @@ pub struct GetBaseBackupRequest { pub replica: bool, /// If true, include relation files in the base backup. Mainly for debugging and tests. pub full: bool, + /// Compression algorithm to use. Base backups send a compressed payload instead of using gRPC + /// compression, so that we can cache compressed backups on the server. + pub compression: BaseBackupCompression, } -impl From for GetBaseBackupRequest { - fn from(pb: proto::GetBaseBackupRequest) -> Self { - Self { +impl TryFrom for GetBaseBackupRequest { + type Error = ProtocolError; + + fn try_from(pb: proto::GetBaseBackupRequest) -> Result { + Ok(Self { lsn: (pb.lsn != 0).then_some(Lsn(pb.lsn)), replica: pb.replica, full: pb.full, - } + compression: pb.compression.try_into()?, + }) } } @@ -209,10 +215,55 @@ impl From for proto::GetBaseBackupRequest { lsn: request.lsn.unwrap_or_default().0, replica: request.replica, full: request.full, + compression: request.compression.into(), } } } +/// Base backup compression algorithm. +#[derive(Clone, Copy, Debug)] +pub enum BaseBackupCompression { + None, + Gzip, +} + +impl TryFrom for BaseBackupCompression { + type Error = ProtocolError; + + fn try_from(pb: proto::BaseBackupCompression) -> Result { + match pb { + proto::BaseBackupCompression::Unknown => Err(ProtocolError::invalid("compression", pb)), + proto::BaseBackupCompression::None => Ok(Self::None), + proto::BaseBackupCompression::Gzip => Ok(Self::Gzip), + } + } +} + +impl TryFrom for BaseBackupCompression { + type Error = ProtocolError; + + fn try_from(compression: i32) -> Result { + proto::BaseBackupCompression::try_from(compression) + .map_err(|_| ProtocolError::invalid("compression", compression)) + .and_then(Self::try_from) + } +} + +impl From for proto::BaseBackupCompression { + fn from(compression: BaseBackupCompression) -> Self { + match compression { + BaseBackupCompression::None => Self::None, + BaseBackupCompression::Gzip => Self::Gzip, + } + } +} + +impl From for i32 { + fn from(compression: BaseBackupCompression) -> Self { + proto::BaseBackupCompression::from(compression).into() + } +} + pub type GetBaseBackupResponseChunk = Bytes; impl TryFrom for GetBaseBackupResponseChunk { diff --git a/pageserver/pagebench/src/cmd/basebackup.rs b/pageserver/pagebench/src/cmd/basebackup.rs index e028174c1d..4111d09f92 100644 --- a/pageserver/pagebench/src/cmd/basebackup.rs +++ b/pageserver/pagebench/src/cmd/basebackup.rs @@ -317,6 +317,7 @@ impl Client for LibpqClient { /// A gRPC Pageserver client. 
struct GrpcClient { inner: page_api::Client, + compression: page_api::BaseBackupCompression, } impl GrpcClient { @@ -331,10 +332,14 @@ impl GrpcClient { ttid.timeline_id, ShardIndex::unsharded(), None, - compression.then_some(tonic::codec::CompressionEncoding::Zstd), + None, // NB: uses payload compression ) .await?; - Ok(Self { inner }) + let compression = match compression { + true => page_api::BaseBackupCompression::Gzip, + false => page_api::BaseBackupCompression::None, + }; + Ok(Self { inner, compression }) } } @@ -348,6 +353,7 @@ impl Client for GrpcClient { lsn, replica: false, full: false, + compression: self.compression, }; let stream = self.inner.get_base_backup(req).await?; Ok(Box::pin(StreamReader::new( diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 115f0d9ebc..36dada1e89 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -14,6 +14,7 @@ use std::fmt::Write as FmtWrite; use std::time::{Instant, SystemTime}; use anyhow::{Context, anyhow}; +use async_compression::tokio::write::GzipEncoder; use bytes::{BufMut, Bytes, BytesMut}; use fail::fail_point; use pageserver_api::key::{Key, rel_block_to_key}; @@ -25,8 +26,7 @@ use postgres_ffi::{ }; use postgres_ffi_types::constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; use postgres_ffi_types::forknum::{INIT_FORKNUM, MAIN_FORKNUM}; -use tokio::io; -use tokio::io::AsyncWrite; +use tokio::io::{self, AsyncWrite, AsyncWriteExt as _}; use tokio_tar::{Builder, EntryType, Header}; use tracing::*; use utils::lsn::Lsn; @@ -97,6 +97,7 @@ impl From for tonic::Status { /// * When working without safekeepers. In this situation it is important to match the lsn /// we are taking basebackup on with the lsn that is used in pageserver's walreceiver /// to start the replication. +#[allow(clippy::too_many_arguments)] pub async fn send_basebackup_tarball<'a, W>( write: &'a mut W, timeline: &'a Timeline, @@ -104,6 +105,7 @@ pub async fn send_basebackup_tarball<'a, W>( prev_lsn: Option, full_backup: bool, replica: bool, + gzip_level: Option, ctx: &'a RequestContext, ) -> Result<(), BasebackupError> where @@ -122,7 +124,7 @@ where // prev_lsn value; that happens if the timeline was just branched from // an old LSN and it doesn't have any WAL of its own yet. We will set // prev_lsn to Lsn(0) if we cannot provide the correct value. - let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn { + let (backup_prev, lsn) = if let Some(req_lsn) = req_lsn { // Backup was requested at a particular LSN. The caller should've // already checked that it's a valid LSN. 
@@ -143,7 +145,7 @@ where }; // Consolidate the derived and the provided prev_lsn values - let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn { + let prev_record_lsn = if let Some(provided_prev_lsn) = prev_lsn { if backup_prev != Lsn(0) && backup_prev != provided_prev_lsn { return Err(BasebackupError::Server(anyhow!( "backup_prev {backup_prev} != provided_prev_lsn {provided_prev_lsn}" @@ -155,30 +157,55 @@ where }; info!( - "taking basebackup lsn={}, prev_lsn={} (full_backup={}, replica={})", - backup_lsn, prev_lsn, full_backup, replica + "taking basebackup lsn={lsn}, prev_lsn={prev_record_lsn} \ + (full_backup={full_backup}, replica={replica}, gzip={gzip_level:?})", + ); + let span = info_span!("send_tarball", backup_lsn=%lsn); + + let io_concurrency = IoConcurrency::spawn_from_conf( + timeline.conf.get_vectored_concurrent_io, + timeline + .gate + .enter() + .map_err(|_| BasebackupError::Shutdown)?, ); - let basebackup = Basebackup { - ar: Builder::new_non_terminated(write), - timeline, - lsn: backup_lsn, - prev_record_lsn: prev_lsn, - full_backup, - replica, - ctx, - io_concurrency: IoConcurrency::spawn_from_conf( - timeline.conf.get_vectored_concurrent_io, - timeline - .gate - .enter() - .map_err(|_| BasebackupError::Shutdown)?, - ), - }; - basebackup + if let Some(gzip_level) = gzip_level { + let mut encoder = GzipEncoder::with_quality(write, gzip_level); + Basebackup { + ar: Builder::new_non_terminated(&mut encoder), + timeline, + lsn, + prev_record_lsn, + full_backup, + replica, + ctx, + io_concurrency, + } .send_tarball() - .instrument(info_span!("send_tarball", backup_lsn=%backup_lsn)) - .await + .instrument(span) + .await?; + encoder + .shutdown() + .await + .map_err(|err| BasebackupError::Client(err, "gzip"))?; + } else { + Basebackup { + ar: Builder::new_non_terminated(write), + timeline, + lsn, + prev_record_lsn, + full_backup, + replica, + ctx, + io_concurrency, + } + .send_tarball() + .instrument(span) + .await?; + } + + Ok(()) } /// This is short-living object only for the time of tarball creation, diff --git a/pageserver/src/basebackup_cache.rs b/pageserver/src/basebackup_cache.rs index 24f6413380..69438dae7f 100644 --- a/pageserver/src/basebackup_cache.rs +++ b/pageserver/src/basebackup_cache.rs @@ -1,7 +1,6 @@ use std::{collections::HashMap, sync::Arc}; use anyhow::Context; -use async_compression::tokio::write::GzipEncoder; use camino::{Utf8Path, Utf8PathBuf}; use metrics::core::{AtomicU64, GenericCounter}; use pageserver_api::{config::BasebackupCacheConfig, models::TenantState}; @@ -594,13 +593,6 @@ impl BackgroundTask { let file = tokio::fs::File::create(entry_tmp_path).await?; let mut writer = BufWriter::new(file); - let mut encoder = GzipEncoder::with_quality( - &mut writer, - // Level::Best because compression is not on the hot path of basebackup requests. - // The decompression is almost not affected by the compression level. - async_compression::Level::Best, - ); - // We may receive a request before the WAL record is applied to the timeline. // Wait for the requested LSN to be applied. timeline @@ -613,17 +605,19 @@ impl BackgroundTask { .await?; send_basebackup_tarball( - &mut encoder, + &mut writer, timeline, Some(req_lsn), None, false, false, + // Level::Best because compression is not on the hot path of basebackup requests. + // The decompression is almost not affected by the compression level. 
+ Some(async_compression::Level::Best), &ctx, ) .await?; - encoder.shutdown().await?; writer.flush().await?; writer.into_inner().sync_all().await?; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index d3a1ca681e..dd02947e5c 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -13,7 +13,6 @@ use std::time::{Duration, Instant, SystemTime}; use std::{io, str}; use anyhow::{Context as _, anyhow, bail}; -use async_compression::tokio::write::GzipEncoder; use bytes::{Buf as _, BufMut as _, BytesMut}; use futures::future::BoxFuture; use futures::{FutureExt, Stream}; @@ -2613,6 +2612,7 @@ impl PageServerHandler { prev_lsn, full_backup, replica, + None, &ctx, ) .await?; @@ -2641,31 +2641,6 @@ impl PageServerHandler { .map_err(|err| { BasebackupError::Client(err, "handle_basebackup_request,cached,copy") })?; - } else if gzip { - let mut encoder = GzipEncoder::with_quality( - &mut writer, - // NOTE using fast compression because it's on the critical path - // for compute startup. For an empty database, we get - // <100KB with this method. The Level::Best compression method - // gives us <20KB, but maybe we should add basebackup caching - // on compute shutdown first. - async_compression::Level::Fastest, - ); - basebackup::send_basebackup_tarball( - &mut encoder, - &timeline, - lsn, - prev_lsn, - full_backup, - replica, - &ctx, - ) - .await?; - // shutdown the encoder to ensure the gzip footer is written - encoder - .shutdown() - .await - .map_err(|e| QueryError::Disconnected(ConnectionError::Io(e)))?; } else { basebackup::send_basebackup_tarball( &mut writer, @@ -2674,6 +2649,11 @@ impl PageServerHandler { prev_lsn, full_backup, replica, + // NB: using fast compression because it's on the critical path for compute + // startup. For an empty database, we get <100KB with this method. The + // Level::Best compression method gives us <20KB, but maybe we should add + // basebackup caching on compute shutdown first. + gzip.then_some(async_compression::Level::Fastest), &ctx, ) .await?; @@ -3553,7 +3533,7 @@ impl proto::PageService for GrpcPageServiceHandler { if timeline.is_archived() == Some(true) { return Err(tonic::Status::failed_precondition("timeline is archived")); } - let req: page_api::GetBaseBackupRequest = req.into_inner().into(); + let req: page_api::GetBaseBackupRequest = req.into_inner().try_into()?; span_record!(lsn=?req.lsn); @@ -3579,6 +3559,15 @@ impl proto::PageService for GrpcPageServiceHandler { let span = Span::current(); let (mut simplex_read, mut simplex_write) = tokio::io::simplex(CHUNK_SIZE); let jh = tokio::spawn(async move { + let gzip_level = match req.compression { + page_api::BaseBackupCompression::None => None, + // NB: using fast compression because it's on the critical path for compute + // startup. For an empty database, we get <100KB with this method. The + // Level::Best compression method gives us <20KB, but maybe we should add + // basebackup caching on compute shutdown first. + page_api::BaseBackupCompression::Gzip => Some(async_compression::Level::Fastest), + }; + let result = basebackup::send_basebackup_tarball( &mut simplex_write, &timeline, @@ -3586,6 +3575,7 @@ impl proto::PageService for GrpcPageServiceHandler { None, req.full, req.replica, + gzip_level, &ctx, ) .instrument(span) // propagate request span From 6f70885e11f35017ba32fa2d044b271b8cd97fde Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." 
<4198311+skyzh@users.noreply.github.com> Date: Wed, 25 Jun 2025 18:15:03 -0400 Subject: [PATCH 121/124] fix(pageserver): allow refresh_interval to be empty (#12349) ## Problem Fix for https://github.com/neondatabase/neon/pull/12324 ## Summary of changes Need `serde(default)` to allow this field not present in the config, otherwise there will be a config deserialization error. --------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/config.rs | 1 + pageserver/src/config.rs | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 76730c9ee6..7926e839cf 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -80,6 +80,7 @@ pub struct PostHogConfig { /// Refresh interval for the feature flag spec. /// The storcon will push the feature flag spec to the pageserver. If the pageserver does not receive /// the spec for `refresh_interval`, it will fetch the spec from the PostHog API. + #[serde(default)] #[serde(skip_serializing_if = "Option::is_none")] #[serde(with = "humantime_serde")] pub refresh_interval: Option, diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 9952496061..5b51a9617b 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -762,4 +762,23 @@ mod tests { let result = PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir); assert_eq!(result.is_ok(), is_valid); } + + #[test] + fn test_config_posthog_config_is_valid() { + let input = r#" + control_plane_api = "http://localhost:6666" + + [posthog_config] + server_api_key = "phs_AAA" + client_api_key = "phc_BBB" + project_id = "000" + private_api_url = "https://us.posthog.com" + public_api_url = "https://us.i.posthog.com" + "#; + let config_toml = toml_edit::de::from_str::(input) + .expect("posthogconfig is valid"); + let workdir = Utf8PathBuf::from("/nonexistent"); + PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir) + .expect("parse_and_validate"); + } } From be23eae3b622aac0a76ad95b0a6421c9870c39a6 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 26 Jun 2025 10:06:27 +0300 Subject: [PATCH 122/124] Mark pages as avaiable in LFC only after generation check (#12350) ## Problem If LFC generation is changed then `lfc_readv_select` will return -1 but pages are still marked as available in bitmap. ## Summary of changes Update bitmap after generation check. 
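
In other words, the hit bitmap must only be published after the post-read generation check has passed. A rough sketch of that ordering (illustrative Rust only; the actual change is the C code in `pgxn/neon/file_cache.c` below, and these names are made up):

```rust
/// Sketch of the intended ordering: read first, validate the cache
/// generation, and only then mark blocks as served from the cache.
fn finish_chunk_read(
    generation_at_start: u64,
    current_generation: u64,
    chunk_hits: &[bool],
    result_mask: &mut [bool],
) -> bool {
    if current_generation != generation_at_start {
        // The LFC was reset or re-enabled mid-read: report failure without
        // publishing any of the blocks that were just read.
        return false;
    }
    // Safe to publish now: the data read above belongs to this generation.
    for (dst, hit) in result_mask.iter_mut().zip(chunk_hits) {
        if *hit {
            *dst = true;
        }
    }
    true
}
```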
Co-authored-by: Kosntantin Knizhnik --- pgxn/neon/file_cache.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 45a4695495..8cfa09bc87 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -1295,7 +1295,8 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (iteration_hits != 0) { - /* chunk offset (# of pages) into the LFC file */ + /* chunk offset (# + of pages) into the LFC file */ off_t first_read_offset = (off_t) entry_offset * lfc_blocks_per_chunk; int nwrite = iov_last_used - first_block_in_chunk_read; /* offset of first IOV */ @@ -1313,16 +1314,6 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, lfc_disable("read"); return -1; } - - /* - * We successfully read the pages we know were valid when we - * started reading; now mark those pages as read - */ - for (int i = first_block_in_chunk_read; i < iov_last_used; i++) - { - if (BITMAP_ISSET(chunk_mask, i)) - BITMAP_SET(mask, buf_offset + i); - } } /* Place entry to the head of LRU list */ @@ -1340,6 +1331,15 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, { lfc_ctl->time_read += io_time_us; inc_page_cache_read_wait(io_time_us); + /* + * We successfully read the pages we know were valid when we + * started reading; now mark those pages as read + */ + for (int i = first_block_in_chunk_read; i < iov_last_used; i++) + { + if (BITMAP_ISSET(chunk_mask, i)) + BITMAP_SET(mask, buf_offset + i); + } } CriticalAssert(entry->access_count > 0); From fd1e8ec257a55cb9e5e6b97ec1007adab354dcfe Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 26 Jun 2025 12:25:41 +0100 Subject: [PATCH 123/124] [proxy] review and cleanup CLI args (#12167) I was looking at how we could expose our proxy config as toml again, and as I was writing out the schema format, I noticed some cruft in our CLI args that no longer seem to be in use. The redis change is the most complex, but I am pretty sure it's sound. Since https://github.com/neondatabase/cloud/pull/15613 cplane longer publishes to the global redis instance. 
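
For reference, the Redis wiring ends up roughly like the following selection logic (a condensed sketch with simplified names; the actual code is `configure_redis` in the diff below):

```rust
// Simplified stand-ins for the real argument and connection types.
enum RedisAuth<'a> {
    Plain { url: &'a str },
    Irsa { host: &'a str, port: u16 },
}

fn select_redis<'a>(
    auth_type: &str,
    plain_url: Option<&'a str>,
    host: Option<&'a str>,
    port: Option<u16>,
) -> anyhow::Result<Option<RedisAuth<'a>>> {
    match auth_type {
        "plain" => match plain_url {
            Some(url) => Ok(Some(RedisAuth::Plain { url })),
            None => anyhow::bail!("plain auth requires --redis-plain to be set"),
        },
        "irsa" => match (host, port) {
            (Some(host), Some(port)) => Ok(Some(RedisAuth::Irsa { host, port })),
            // No host/port given: run without Redis rather than failing.
            (None, None) => Ok(None),
            _ => anyhow::bail!("redis-host and redis-port must be specified together"),
        },
        other => anyhow::bail!("unknown auth type {other:?} given"),
    }
}
```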
--- proxy/src/binary/local_proxy.rs | 1 - proxy/src/binary/pg_sni_router.rs | 1 - proxy/src/binary/proxy.rs | 117 +++++++++------------------- proxy/src/config.rs | 1 - proxy/src/console_redirect_proxy.rs | 7 +- proxy/src/context/mod.rs | 12 +-- proxy/src/context/parquet.rs | 26 +++++-- proxy/src/proxy/mod.rs | 7 +- proxy/src/rate_limiter/limiter.rs | 6 -- proxy/src/redis/kv_ops.rs | 14 +--- proxy/src/redis/notifications.rs | 23 ++---- proxy/src/serverless/mod.rs | 14 +--- 12 files changed, 70 insertions(+), 159 deletions(-) diff --git a/proxy/src/binary/local_proxy.rs b/proxy/src/binary/local_proxy.rs index e3be454713..423ecf821e 100644 --- a/proxy/src/binary/local_proxy.rs +++ b/proxy/src/binary/local_proxy.rs @@ -279,7 +279,6 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig }, proxy_protocol_v2: config::ProxyProtocolV2::Rejected, handshake_timeout: Duration::from_secs(10), - region: "local".into(), wake_compute_retry_config: RetryConfig::parse(RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)?, connect_compute_locks, connect_to_compute: compute_config, diff --git a/proxy/src/binary/pg_sni_router.rs b/proxy/src/binary/pg_sni_router.rs index 481bd8501c..070c73cdcf 100644 --- a/proxy/src/binary/pg_sni_router.rs +++ b/proxy/src/binary/pg_sni_router.rs @@ -236,7 +236,6 @@ pub(super) async fn task_main( extra: None, }, crate::metrics::Protocol::SniRouter, - "sni", ); handle_client(ctx, dest_suffix, tls_config, compute_tls_config, socket).await } diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index 9215dbf73f..9ead05d492 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -123,12 +123,6 @@ struct ProxyCliArgs { /// timeout for the TLS handshake #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] handshake_timeout: tokio::time::Duration, - /// http endpoint to receive periodic metric updates - #[clap(long)] - metric_collection_endpoint: Option, - /// how often metrics should be sent to a collection endpoint - #[clap(long)] - metric_collection_interval: Option, /// cache for `wake_compute` api method (use `size=0` to disable) #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] wake_compute_cache: String, @@ -155,40 +149,31 @@ struct ProxyCliArgs { /// Wake compute rate limiter max number of requests per second. #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] wake_compute_limit: Vec, - /// Redis rate limiter max number of requests per second. - #[clap(long, default_values_t = RateBucketInfo::DEFAULT_REDIS_SET)] - redis_rps_limit: Vec, /// Cancellation channel size (max queue size for redis kv client) #[clap(long, default_value_t = 1024)] cancellation_ch_size: usize, /// Cancellation ops batch size for redis #[clap(long, default_value_t = 8)] cancellation_batch_size: usize, - /// cache for `allowed_ips` (use `size=0` to disable) - #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] - allowed_ips_cache: String, - /// cache for `role_secret` (use `size=0` to disable) - #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] - role_secret_cache: String, - /// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections) - #[clap(long)] - redis_notifications: Option, - /// what from the available authentications type to use for the regional redis we have. Supported are "irsa" and "plain". 
+ /// redis url for plain authentication + #[clap(long, alias("redis-notifications"))] + redis_plain: Option, + /// what from the available authentications type to use for redis. Supported are "irsa" and "plain". #[clap(long, default_value = "irsa")] redis_auth_type: String, - /// redis host for streaming connections (might be different from the notifications host) + /// redis host for irsa authentication #[clap(long)] redis_host: Option, - /// redis port for streaming connections (might be different from the notifications host) + /// redis port for irsa authentication #[clap(long)] redis_port: Option, - /// redis cluster name, used in aws elasticache + /// redis cluster name for irsa authentication #[clap(long)] redis_cluster_name: Option, - /// redis user_id, used in aws elasticache + /// redis user_id for irsa authentication #[clap(long)] redis_user_id: Option, - /// aws region to retrieve credentials + /// aws region for irsa authentication #[clap(long, default_value_t = String::new())] aws_region: String, /// cache for `project_info` (use `size=0` to disable) @@ -200,6 +185,12 @@ struct ProxyCliArgs { #[clap(flatten)] parquet_upload: ParquetUploadArgs, + /// http endpoint to receive periodic metric updates + #[clap(long)] + metric_collection_endpoint: Option, + /// how often metrics should be sent to a collection endpoint + #[clap(long)] + metric_collection_interval: Option, /// interval for backup metric collection #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)] metric_backup_collection_interval: std::time::Duration, @@ -212,6 +203,7 @@ struct ProxyCliArgs { /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression. #[clap(long, default_value = "4194304")] metric_backup_collection_chunk_size: usize, + /// Whether to retry the connection to the compute node #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)] connect_to_compute_retry: String, @@ -331,7 +323,7 @@ pub async fn run() -> anyhow::Result<()> { Either::Right(auth_backend) => info!("Authentication backend: {auth_backend:?}"), } info!("Using region: {}", args.aws_region); - let (regional_redis_client, redis_notifications_client) = configure_redis(&args).await?; + let redis_client = configure_redis(&args).await?; // Check that we can bind to address before further initialization info!("Starting http on {}", args.http); @@ -386,13 +378,6 @@ pub async fn run() -> anyhow::Result<()> { let cancellation_token = CancellationToken::new(); - let redis_rps_limit = Vec::leak(args.redis_rps_limit.clone()); - RateBucketInfo::validate(redis_rps_limit)?; - - let redis_kv_client = regional_redis_client - .as_ref() - .map(|redis_publisher| RedisKVClient::new(redis_publisher.clone(), redis_rps_limit)); - let cancellation_handler = Arc::new(CancellationHandler::new(&config.connect_to_compute)); let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( @@ -472,6 +457,7 @@ pub async fn run() -> anyhow::Result<()> { client_tasks.spawn(crate::context::parquet::worker( cancellation_token.clone(), args.parquet_upload, + args.region, )); // maintenance tasks. 
these never return unless there's an error @@ -495,32 +481,17 @@ pub async fn run() -> anyhow::Result<()> { #[cfg_attr(not(any(test, feature = "testing")), expect(irrefutable_let_patterns))] if let Either::Left(auth::Backend::ControlPlane(api, ())) = &auth_backend { if let crate::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api { - match (redis_notifications_client, regional_redis_client.clone()) { - (None, None) => {} - (client1, client2) => { - let cache = api.caches.project_info.clone(); - if let Some(client) = client1 { - maintenance_tasks.spawn(notifications::task_main( - client, - cache.clone(), - args.region.clone(), - )); - } - if let Some(client) = client2 { - maintenance_tasks.spawn(notifications::task_main( - client, - cache.clone(), - args.region.clone(), - )); - } - maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); - } - } + if let Some(client) = redis_client { + // project info cache and invalidation of that cache. + let cache = api.caches.project_info.clone(); + maintenance_tasks.spawn(notifications::task_main(client.clone(), cache.clone())); + maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); - // Try to connect to Redis 3 times with 1 + (0..0.1) second interval. - // This prevents immediate exit and pod restart, - // which can cause hammering of the redis in case of connection issues. - if let Some(mut redis_kv_client) = redis_kv_client { + // Try to connect to Redis 3 times with 1 + (0..0.1) second interval. + // This prevents immediate exit and pod restart, + // which can cause hammering of the redis in case of connection issues. + // cancellation key management + let mut redis_kv_client = RedisKVClient::new(client.clone()); for attempt in (0..3).with_position() { match redis_kv_client.try_connect().await { Ok(()) => { @@ -545,14 +516,12 @@ pub async fn run() -> anyhow::Result<()> { } } } - } - if let Some(regional_redis_client) = regional_redis_client { + // listen for notifications of new projects/endpoints/branches let cache = api.caches.endpoints_cache.clone(); - let con = regional_redis_client; let span = tracing::info_span!("endpoints_cache"); maintenance_tasks.spawn( - async move { cache.do_read(con, cancellation_token.clone()).await } + async move { cache.do_read(client, cancellation_token.clone()).await } .instrument(span), ); } @@ -681,7 +650,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { authentication_config, proxy_protocol_v2: args.proxy_protocol_v2, handshake_timeout: args.handshake_timeout, - region: args.region.clone(), wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?, connect_compute_locks, connect_to_compute: compute_config, @@ -843,21 +811,18 @@ fn build_auth_backend( async fn configure_redis( args: &ProxyCliArgs, -) -> anyhow::Result<( - Option, - Option, -)> { +) -> anyhow::Result> { // TODO: untangle the config args - let regional_redis_client = match (args.redis_auth_type.as_str(), &args.redis_notifications) { - ("plain", redis_url) => match redis_url { + let redis_client = match &*args.redis_auth_type { + "plain" => match &args.redis_plain { None => { - bail!("plain auth requires redis_notifications to be set"); + bail!("plain auth requires redis_plain to be set"); } Some(url) => { Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url.clone())) } }, - ("irsa", _) => match (&args.redis_host, args.redis_port) { + "irsa" => match (&args.redis_host, args.redis_port) { (Some(host), Some(port)) => Some( 
ConnectionWithCredentialsProvider::new_with_credentials_provider( host.clone(), @@ -881,18 +846,12 @@ async fn configure_redis( bail!("redis-host and redis-port must be specified together"); } }, - _ => { - bail!("unknown auth type given"); + auth_type => { + bail!("unknown auth type {auth_type:?} given") } }; - let redis_notifications_client = if let Some(url) = &args.redis_notifications { - Some(ConnectionWithCredentialsProvider::new_with_static_credentials(&**url)) - } else { - regional_redis_client.clone() - }; - - Ok((regional_redis_client, redis_notifications_client)) + Ok(redis_client) } #[cfg(test)] diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 248584a19a..cee15ac7fa 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -22,7 +22,6 @@ pub struct ProxyConfig { pub http_config: HttpConfig, pub authentication_config: AuthenticationConfig, pub proxy_protocol_v2: ProxyProtocolV2, - pub region: String, pub handshake_timeout: Duration, pub wake_compute_retry_config: RetryConfig, pub connect_compute_locks: ApiLocks, diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 113a11beab..112465a89b 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -89,12 +89,7 @@ pub async fn task_main( } } - let ctx = RequestContext::new( - session_id, - conn_info, - crate::metrics::Protocol::Tcp, - &config.region, - ); + let ctx = RequestContext::new(session_id, conn_info, crate::metrics::Protocol::Tcp); let res = handle_client( config, diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index 24268997ba..df1c4e194a 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -46,7 +46,6 @@ struct RequestContextInner { pub(crate) session_id: Uuid, pub(crate) protocol: Protocol, first_packet: chrono::DateTime, - region: &'static str, pub(crate) span: Span, // filled in as they are discovered @@ -94,7 +93,6 @@ impl Clone for RequestContext { session_id: inner.session_id, protocol: inner.protocol, first_packet: inner.first_packet, - region: inner.region, span: info_span!("background_task"), project: inner.project, @@ -124,12 +122,7 @@ impl Clone for RequestContext { } impl RequestContext { - pub fn new( - session_id: Uuid, - conn_info: ConnectionInfo, - protocol: Protocol, - region: &'static str, - ) -> Self { + pub fn new(session_id: Uuid, conn_info: ConnectionInfo, protocol: Protocol) -> Self { // TODO: be careful with long lived spans let span = info_span!( "connect_request", @@ -145,7 +138,6 @@ impl RequestContext { session_id, protocol, first_packet: Utc::now(), - region, span, project: None, @@ -179,7 +171,7 @@ impl RequestContext { let ip = IpAddr::from([127, 0, 0, 1]); let addr = SocketAddr::new(ip, 5432); let conn_info = ConnectionInfo { addr, extra: None }; - RequestContext::new(Uuid::now_v7(), conn_info, Protocol::Tcp, "test") + RequestContext::new(Uuid::now_v7(), conn_info, Protocol::Tcp) } pub(crate) fn console_application_name(&self) -> String { diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index c9d3905abd..b55cc14532 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -74,7 +74,7 @@ pub(crate) const FAILED_UPLOAD_MAX_RETRIES: u32 = 10; #[derive(parquet_derive::ParquetRecordWriter)] pub(crate) struct RequestData { - region: &'static str, + region: String, protocol: &'static str, /// Must be UTC. 
The derive macro doesn't like the timezones timestamp: chrono::NaiveDateTime, @@ -147,7 +147,7 @@ impl From<&RequestContextInner> for RequestData { }), jwt_issuer: value.jwt_issuer.clone(), protocol: value.protocol.as_str(), - region: value.region, + region: String::new(), error: value.error_kind.as_ref().map(|e| e.to_metric_label()), success: value.success, cold_start_info: value.cold_start_info.as_str(), @@ -167,6 +167,7 @@ impl From<&RequestContextInner> for RequestData { pub async fn worker( cancellation_token: CancellationToken, config: ParquetUploadArgs, + region: String, ) -> anyhow::Result<()> { let Some(remote_storage_config) = config.parquet_upload_remote_storage else { tracing::warn!("parquet request upload: no s3 bucket configured"); @@ -232,12 +233,17 @@ pub async fn worker( .context("remote storage for disconnect events init")?; let parquet_config_disconnect = parquet_config.clone(); tokio::try_join!( - worker_inner(storage, rx, parquet_config), - worker_inner(storage_disconnect, rx_disconnect, parquet_config_disconnect) + worker_inner(storage, rx, parquet_config, ®ion), + worker_inner( + storage_disconnect, + rx_disconnect, + parquet_config_disconnect, + ®ion + ) ) .map(|_| ()) } else { - worker_inner(storage, rx, parquet_config).await + worker_inner(storage, rx, parquet_config, ®ion).await } } @@ -257,6 +263,7 @@ async fn worker_inner( storage: GenericRemoteStorage, rx: impl Stream, config: ParquetConfig, + region: &str, ) -> anyhow::Result<()> { #[cfg(any(test, feature = "testing"))] let storage = if config.test_remote_failures > 0 { @@ -277,7 +284,8 @@ async fn worker_inner( let mut last_upload = time::Instant::now(); let mut len = 0; - while let Some(row) = rx.next().await { + while let Some(mut row) = rx.next().await { + region.clone_into(&mut row.region); rows.push(row); let force = last_upload.elapsed() > config.max_duration; if rows.len() == config.rows_per_group || force { @@ -533,7 +541,7 @@ mod tests { auth_method: None, jwt_issuer: None, protocol: ["tcp", "ws", "http"][rng.gen_range(0..3)], - region: "us-east-1", + region: String::new(), error: None, success: rng.r#gen(), cold_start_info: "no", @@ -565,7 +573,9 @@ mod tests { .await .unwrap(); - worker_inner(storage, rx, config).await.unwrap(); + worker_inner(storage, rx, config, "us-east-1") + .await + .unwrap(); let mut files = WalkDir::new(tmpdir.as_std_path()) .into_iter() diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 6947e07488..6b84e47982 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -122,12 +122,7 @@ pub async fn task_main( } } - let ctx = RequestContext::new( - session_id, - conn_info, - crate::metrics::Protocol::Tcp, - &config.region, - ); + let ctx = RequestContext::new(session_id, conn_info, crate::metrics::Protocol::Tcp); let res = handle_client( config, diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index 0cd539188a..2e40f5bf60 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -139,12 +139,6 @@ impl RateBucketInfo { Self::new(200, Duration::from_secs(600)), ]; - // For all the sessions will be cancel key. So this limit is essentially global proxy limit. 
- pub const DEFAULT_REDIS_SET: [Self; 2] = [ - Self::new(100_000, Duration::from_secs(1)), - Self::new(50_000, Duration::from_secs(10)), - ]; - pub fn rps(&self) -> f64 { (self.max_rpi as f64) / self.interval.as_secs_f64() } diff --git a/proxy/src/redis/kv_ops.rs b/proxy/src/redis/kv_ops.rs index f8d3b5cc66..671fe09b0b 100644 --- a/proxy/src/redis/kv_ops.rs +++ b/proxy/src/redis/kv_ops.rs @@ -5,11 +5,9 @@ use redis::aio::ConnectionLike; use redis::{Cmd, FromRedisValue, Pipeline, RedisResult}; use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; -use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; pub struct RedisKVClient { client: ConnectionWithCredentialsProvider, - limiter: GlobalRateLimiter, } #[allow(async_fn_in_trait)] @@ -30,11 +28,8 @@ impl Queryable for Cmd { } impl RedisKVClient { - pub fn new(client: ConnectionWithCredentialsProvider, info: &'static [RateBucketInfo]) -> Self { - Self { - client, - limiter: GlobalRateLimiter::new(info.into()), - } + pub fn new(client: ConnectionWithCredentialsProvider) -> Self { + Self { client } } pub async fn try_connect(&mut self) -> anyhow::Result<()> { @@ -49,11 +44,6 @@ impl RedisKVClient { &mut self, q: &impl Queryable, ) -> anyhow::Result { - if !self.limiter.check() { - tracing::info!("Rate limit exceeded. Skipping query"); - return Err(anyhow::anyhow!("Rate limit exceeded")); - } - let e = match q.query(&mut self.client).await { Ok(t) => return Ok(t), Err(e) => e, diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 6c8260027f..973a4c5b02 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -141,29 +141,19 @@ where struct MessageHandler { cache: Arc, - region_id: String, } impl Clone for MessageHandler { fn clone(&self) -> Self { Self { cache: self.cache.clone(), - region_id: self.region_id.clone(), } } } impl MessageHandler { - pub(crate) fn new(cache: Arc, region_id: String) -> Self { - Self { cache, region_id } - } - - pub(crate) async fn increment_active_listeners(&self) { - self.cache.increment_active_listeners().await; - } - - pub(crate) async fn decrement_active_listeners(&self) { - self.cache.decrement_active_listeners().await; + pub(crate) fn new(cache: Arc) -> Self { + Self { cache } } #[tracing::instrument(skip(self, msg), fields(session_id = tracing::field::Empty))] @@ -276,7 +266,7 @@ async fn handle_messages( } let mut conn = match try_connect(&redis).await { Ok(conn) => { - handler.increment_active_listeners().await; + handler.cache.increment_active_listeners().await; conn } Err(e) => { @@ -297,11 +287,11 @@ async fn handle_messages( } } if cancellation_token.is_cancelled() { - handler.decrement_active_listeners().await; + handler.cache.decrement_active_listeners().await; return Ok(()); } } - handler.decrement_active_listeners().await; + handler.cache.decrement_active_listeners().await; } } @@ -310,12 +300,11 @@ async fn handle_messages( pub async fn task_main( redis: ConnectionWithCredentialsProvider, cache: Arc, - region_id: String, ) -> anyhow::Result where C: ProjectInfoCache + Send + Sync + 'static, { - let handler = MessageHandler::new(cache, region_id); + let handler = MessageHandler::new(cache); // 6h - 1m. // There will be 1 minute overlap between two tasks. But at least we can be sure that no message is lost. 
     let mut interval = tokio::time::interval(std::time::Duration::from_secs(6 * 60 * 60 - 60));
diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs
index ed33bf1246..d8942bb814 100644
--- a/proxy/src/serverless/mod.rs
+++ b/proxy/src/serverless/mod.rs
@@ -417,12 +417,7 @@ async fn request_handler(
     if config.http_config.accept_websockets
         && framed_websockets::upgrade::is_upgrade_request(&request)
     {
-        let ctx = RequestContext::new(
-            session_id,
-            conn_info,
-            crate::metrics::Protocol::Ws,
-            &config.region,
-        );
+        let ctx = RequestContext::new(session_id, conn_info, crate::metrics::Protocol::Ws);
 
         ctx.set_user_agent(
             request
@@ -462,12 +457,7 @@ async fn request_handler(
         // Return the response so the spawned future can continue.
         Ok(response.map(|b| b.map_err(|x| match x {}).boxed()))
     } else if request.uri().path() == "/sql" && *request.method() == Method::POST {
-        let ctx = RequestContext::new(
-            session_id,
-            conn_info,
-            crate::metrics::Protocol::Http,
-            &config.region,
-        );
+        let ctx = RequestContext::new(session_id, conn_info, crate::metrics::Protocol::Http);
 
         let span = ctx.span();
         let testodrome_id = request

From 605fb04f8912d5939d72c9c7b17b8c543f8fc078 Mon Sep 17 00:00:00 2001
From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com>
Date: Thu, 26 Jun 2025 17:26:24 +0400
Subject: [PATCH 124/124] pageserver: use bounded sender for basebackup cache (#12342)

## Problem

The basebackup cache currently uses an unbounded channel for prepare requests.
In theory, the queue can grow arbitrarily large if the cache worker hangs and
stops processing requests.

- Part of https://github.com/neondatabase/cloud/issues/29353

## Summary of changes

- Replace the unbounded channel with a bounded one; the channel size is
  configurable (a standalone sketch of this pattern is appended after the
  patch).
- Add `pageserver_basebackup_cache_prepare_queue_size` to observe the size of
  the queue.
- Refactor slightly to move all metrics logic into `basebackup_cache.rs`.
---
 libs/pageserver_api/src/config.rs  |   4 ++
 pageserver/src/basebackup_cache.rs | 109 ++++++++++++++++++++++++-----
 pageserver/src/bin/pageserver.rs   |  13 ++--
 pageserver/src/metrics.rs          |   8 +++
 pageserver/src/page_service.rs     |  15 +---
 pageserver/src/tenant.rs           |  20 +++---
 pageserver/src/tenant/mgr.rs       |  16 +++--
 pageserver/src/tenant/timeline.rs  |  30 ++++----
 8 files changed, 142 insertions(+), 73 deletions(-)

diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index 7926e839cf..0cfa1c8485 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -371,6 +371,9 @@ pub struct BasebackupCacheConfig {
     // TODO(diko): support max_entry_size_bytes.
     // pub max_entry_size_bytes: u64,
     pub max_size_entries: usize,
+    /// Size of the channel used to send prepare requests to the basebackup cache worker.
+    /// If exceeded, new prepare requests will be dropped.
+ pub prepare_channel_size: usize, } impl Default for BasebackupCacheConfig { @@ -380,6 +383,7 @@ impl Default for BasebackupCacheConfig { max_total_size_bytes: 1024 * 1024 * 1024, // 1 GiB // max_entry_size_bytes: 16 * 1024 * 1024, // 16 MiB max_size_entries: 1000, + prepare_channel_size: 100, } } } diff --git a/pageserver/src/basebackup_cache.rs b/pageserver/src/basebackup_cache.rs index 69438dae7f..4966fee2d7 100644 --- a/pageserver/src/basebackup_cache.rs +++ b/pageserver/src/basebackup_cache.rs @@ -6,7 +6,7 @@ use metrics::core::{AtomicU64, GenericCounter}; use pageserver_api::{config::BasebackupCacheConfig, models::TenantState}; use tokio::{ io::{AsyncWriteExt, BufWriter}, - sync::mpsc::{UnboundedReceiver, UnboundedSender}, + sync::mpsc::{Receiver, Sender, error::TrySendError}, }; use tokio_util::sync::CancellationToken; use utils::{ @@ -19,8 +19,8 @@ use crate::{ basebackup::send_basebackup_tarball, context::{DownloadBehavior, RequestContext}, metrics::{ - BASEBACKUP_CACHE_ENTRIES, BASEBACKUP_CACHE_PREPARE, BASEBACKUP_CACHE_READ, - BASEBACKUP_CACHE_SIZE, + BASEBACKUP_CACHE_ENTRIES, BASEBACKUP_CACHE_PREPARE, BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE, + BASEBACKUP_CACHE_READ, BASEBACKUP_CACHE_SIZE, }, task_mgr::TaskKind, tenant::{ @@ -35,8 +35,8 @@ pub struct BasebackupPrepareRequest { pub lsn: Lsn, } -pub type BasebackupPrepareSender = UnboundedSender; -pub type BasebackupPrepareReceiver = UnboundedReceiver; +pub type BasebackupPrepareSender = Sender; +pub type BasebackupPrepareReceiver = Receiver; #[derive(Clone)] struct CacheEntry { @@ -60,40 +60,65 @@ struct CacheEntry { /// and ~1 RPS for get requests. pub struct BasebackupCache { data_dir: Utf8PathBuf, + config: Option, entries: std::sync::Mutex>, + prepare_sender: BasebackupPrepareSender, + read_hit_count: GenericCounter, read_miss_count: GenericCounter, read_err_count: GenericCounter, + + prepare_skip_count: GenericCounter, } impl BasebackupCache { - /// Creates a BasebackupCache and spawns the background task. - /// The initialization of the cache is performed in the background and does not - /// block the caller. The cache will return `None` for any get requests until - /// initialization is complete. - pub fn spawn( - runtime_handle: &tokio::runtime::Handle, + /// Create a new BasebackupCache instance. + /// Also returns a BasebackupPrepareReceiver which is needed to start + /// the background task. + /// The cache is initialized from the data_dir in the background task. + /// The cache will return `None` for any get requests until the initialization is complete. + /// The background task is spawned separately using [`Self::spawn_background_task`] + /// to avoid a circular dependency between the cache and the tenant manager. 
+ pub fn new( data_dir: Utf8PathBuf, config: Option, - prepare_receiver: BasebackupPrepareReceiver, - tenant_manager: Arc, - cancel: CancellationToken, - ) -> Arc { + ) -> (Arc, BasebackupPrepareReceiver) { + let chan_size = config.as_ref().map(|c| c.max_size_entries).unwrap_or(1); + + let (prepare_sender, prepare_receiver) = tokio::sync::mpsc::channel(chan_size); + let cache = Arc::new(BasebackupCache { data_dir, - + config, entries: std::sync::Mutex::new(HashMap::new()), + prepare_sender, read_hit_count: BASEBACKUP_CACHE_READ.with_label_values(&["hit"]), read_miss_count: BASEBACKUP_CACHE_READ.with_label_values(&["miss"]), read_err_count: BASEBACKUP_CACHE_READ.with_label_values(&["error"]), + + prepare_skip_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["skip"]), }); - if let Some(config) = config { + (cache, prepare_receiver) + } + + /// Spawns the background task. + /// The background task initializes the cache from the disk, + /// processes prepare requests, and cleans up outdated cache entries. + /// Noop if the cache is disabled (config is None). + pub fn spawn_background_task( + self: Arc, + runtime_handle: &tokio::runtime::Handle, + prepare_receiver: BasebackupPrepareReceiver, + tenant_manager: Arc, + cancel: CancellationToken, + ) { + if let Some(config) = self.config.clone() { let background = BackgroundTask { - c: cache.clone(), + c: self, config, tenant_manager, @@ -108,8 +133,45 @@ impl BasebackupCache { }; runtime_handle.spawn(background.run(prepare_receiver)); } + } - cache + /// Send a basebackup prepare request to the background task. + /// The basebackup will be prepared asynchronously, it does not block the caller. + /// The request will be skipped if any cache limits are exceeded. + pub fn send_prepare(&self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, lsn: Lsn) { + let req = BasebackupPrepareRequest { + tenant_shard_id, + timeline_id, + lsn, + }; + + BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE.inc(); + let res = self.prepare_sender.try_send(req); + + if let Err(e) = res { + BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE.dec(); + self.prepare_skip_count.inc(); + match e { + TrySendError::Full(_) => { + // Basebackup prepares are pretty rare, normally we should not hit this. + tracing::info!( + tenant_id = %tenant_shard_id.tenant_id, + %timeline_id, + %lsn, + "Basebackup prepare channel is full, skipping the request" + ); + } + TrySendError::Closed(_) => { + // Normal during shutdown, not critical. + tracing::info!( + tenant_id = %tenant_shard_id.tenant_id, + %timeline_id, + %lsn, + "Basebackup prepare channel is closed, skipping the request" + ); + } + } + } } /// Gets a basebackup entry from the cache. @@ -122,6 +184,10 @@ impl BasebackupCache { timeline_id: TimelineId, lsn: Lsn, ) -> Option { + if !self.is_enabled() { + return None; + } + // Fast path. Check if the entry exists using the in-memory state. let tti = TenantTimelineId::new(tenant_id, timeline_id); if self.entries.lock().unwrap().get(&tti).map(|e| e.lsn) != Some(lsn) { @@ -149,6 +215,10 @@ impl BasebackupCache { } } + pub fn is_enabled(&self) -> bool { + self.config.is_some() + } + // Private methods. fn entry_filename(tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn) -> String { @@ -366,6 +436,7 @@ impl BackgroundTask { loop { tokio::select! 
{ Some(req) = prepare_receiver.recv() => { + BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE.dec(); if let Err(err) = self.prepare_basebackup( req.tenant_shard_id, req.timeline_id, diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index d137d651eb..327384fd82 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -569,8 +569,10 @@ fn start_pageserver( pageserver::l0_flush::L0FlushGlobalState::new(conf.l0_flush.clone()); // Scan the local 'tenants/' directory and start loading the tenants - let (basebackup_prepare_sender, basebackup_prepare_receiver) = - tokio::sync::mpsc::unbounded_channel(); + let (basebackup_cache, basebackup_prepare_receiver) = BasebackupCache::new( + conf.basebackup_cache_dir(), + conf.basebackup_cache_config.clone(), + ); let deletion_queue_client = deletion_queue.new_client(); let background_purges = mgr::BackgroundPurges::default(); @@ -582,7 +584,7 @@ fn start_pageserver( remote_storage: remote_storage.clone(), deletion_queue_client, l0_flush_global_state, - basebackup_prepare_sender, + basebackup_cache: Arc::clone(&basebackup_cache), feature_resolver: feature_resolver.clone(), }, shutdown_pageserver.clone(), @@ -590,10 +592,8 @@ fn start_pageserver( let tenant_manager = Arc::new(tenant_manager); BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(tenant_manager.clone(), order))?; - let basebackup_cache = BasebackupCache::spawn( + basebackup_cache.spawn_background_task( BACKGROUND_RUNTIME.handle(), - conf.basebackup_cache_dir(), - conf.basebackup_cache_config.clone(), basebackup_prepare_receiver, Arc::clone(&tenant_manager), shutdown_pageserver.child_token(), @@ -806,7 +806,6 @@ fn start_pageserver( } else { None }, - basebackup_cache, ); // Spawn a Pageserver gRPC server task. 
It will spawn separate tasks for diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 7929b094b4..21faceef49 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -4439,6 +4439,14 @@ pub(crate) static BASEBACKUP_CACHE_SIZE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +pub(crate) static BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge!( + "pageserver_basebackup_cache_prepare_queue_size", + "Number of requests in the basebackup prepare channel" + ) + .expect("failed to define a metric") +}); + static PAGESERVER_CONFIG_IGNORED_ITEMS: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_config_ignored_items", diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index dd02947e5c..0287a2bdb5 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -62,7 +62,6 @@ use utils::{failpoint_support, span_record}; use crate::auth::check_permission; use crate::basebackup::{self, BasebackupError}; -use crate::basebackup_cache::BasebackupCache; use crate::config::PageServerConf; use crate::context::{ DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder, @@ -137,7 +136,6 @@ pub fn spawn( perf_trace_dispatch: Option, tcp_listener: tokio::net::TcpListener, tls_config: Option>, - basebackup_cache: Arc, ) -> Listener { let cancel = CancellationToken::new(); let libpq_ctx = RequestContext::todo_child( @@ -159,7 +157,6 @@ pub fn spawn( conf.pg_auth_type, tls_config, conf.page_service_pipelining.clone(), - basebackup_cache, libpq_ctx, cancel.clone(), ) @@ -218,7 +215,6 @@ pub async fn libpq_listener_main( auth_type: AuthType, tls_config: Option>, pipelining_config: PageServicePipeliningConfig, - basebackup_cache: Arc, listener_ctx: RequestContext, listener_cancel: CancellationToken, ) -> Connections { @@ -262,7 +258,6 @@ pub async fn libpq_listener_main( auth_type, tls_config.clone(), pipelining_config.clone(), - Arc::clone(&basebackup_cache), connection_ctx, connections_cancel.child_token(), gate_guard, @@ -305,7 +300,6 @@ async fn page_service_conn_main( auth_type: AuthType, tls_config: Option>, pipelining_config: PageServicePipeliningConfig, - basebackup_cache: Arc, connection_ctx: RequestContext, cancel: CancellationToken, gate_guard: GateGuard, @@ -371,7 +365,6 @@ async fn page_service_conn_main( pipelining_config, conf.get_vectored_concurrent_io, perf_span_fields, - basebackup_cache, connection_ctx, cancel.clone(), gate_guard, @@ -425,8 +418,6 @@ struct PageServerHandler { pipelining_config: PageServicePipeliningConfig, get_vectored_concurrent_io: GetVectoredConcurrentIo, - basebackup_cache: Arc, - gate_guard: GateGuard, } @@ -912,7 +903,6 @@ impl PageServerHandler { pipelining_config: PageServicePipeliningConfig, get_vectored_concurrent_io: GetVectoredConcurrentIo, perf_span_fields: ConnectionPerfSpanFields, - basebackup_cache: Arc, connection_ctx: RequestContext, cancel: CancellationToken, gate_guard: GateGuard, @@ -926,7 +916,6 @@ impl PageServerHandler { cancel, pipelining_config, get_vectored_concurrent_io, - basebackup_cache, gate_guard, } } @@ -2626,9 +2615,7 @@ impl PageServerHandler { && lsn.is_some() && prev_lsn.is_none() { - self.basebackup_cache - .get(tenant_id, timeline_id, lsn.unwrap()) - .await + timeline.get_cached_basebackup(lsn.unwrap()).await } else { None } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index c71655ce17..2613528143 100644 --- a/pageserver/src/tenant.rs +++ 
b/pageserver/src/tenant.rs @@ -80,7 +80,7 @@ use self::timeline::uninit::{TimelineCreateGuard, TimelineExclusionError, Uninit use self::timeline::{ EvictionTaskTenantState, GcCutoffs, TimelineDeleteProgress, TimelineResources, WaitLsnError, }; -use crate::basebackup_cache::BasebackupPrepareSender; +use crate::basebackup_cache::BasebackupCache; use crate::config::PageServerConf; use crate::context; use crate::context::RequestContextBuilder; @@ -162,7 +162,7 @@ pub struct TenantSharedResources { pub remote_storage: GenericRemoteStorage, pub deletion_queue_client: DeletionQueueClient, pub l0_flush_global_state: L0FlushGlobalState, - pub basebackup_prepare_sender: BasebackupPrepareSender, + pub basebackup_cache: Arc, pub feature_resolver: FeatureResolver, } @@ -331,7 +331,7 @@ pub struct TenantShard { deletion_queue_client: DeletionQueueClient, /// A channel to send async requests to prepare a basebackup for the basebackup cache. - basebackup_prepare_sender: BasebackupPrepareSender, + basebackup_cache: Arc, /// Cached logical sizes updated updated on each [`TenantShard::gather_size_inputs`]. cached_logical_sizes: tokio::sync::Mutex>, @@ -1363,7 +1363,7 @@ impl TenantShard { remote_storage, deletion_queue_client, l0_flush_global_state, - basebackup_prepare_sender, + basebackup_cache, feature_resolver, } = resources; @@ -1380,7 +1380,7 @@ impl TenantShard { remote_storage.clone(), deletion_queue_client, l0_flush_global_state, - basebackup_prepare_sender, + basebackup_cache, feature_resolver, )); @@ -4380,7 +4380,7 @@ impl TenantShard { remote_storage: GenericRemoteStorage, deletion_queue_client: DeletionQueueClient, l0_flush_global_state: L0FlushGlobalState, - basebackup_prepare_sender: BasebackupPrepareSender, + basebackup_cache: Arc, feature_resolver: FeatureResolver, ) -> TenantShard { assert!(!attached_conf.location.generation.is_none()); @@ -4485,7 +4485,7 @@ impl TenantShard { ongoing_timeline_detach: std::sync::Mutex::default(), gc_block: Default::default(), l0_flush_global_state, - basebackup_prepare_sender, + basebackup_cache, feature_resolver, } } @@ -5414,7 +5414,7 @@ impl TenantShard { pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(), l0_compaction_trigger: self.l0_compaction_trigger.clone(), l0_flush_global_state: self.l0_flush_global_state.clone(), - basebackup_prepare_sender: self.basebackup_prepare_sender.clone(), + basebackup_cache: self.basebackup_cache.clone(), feature_resolver: self.feature_resolver.clone(), } } @@ -6000,7 +6000,7 @@ pub(crate) mod harness { ) -> anyhow::Result> { let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager)); - let (basebackup_requst_sender, _) = tokio::sync::mpsc::unbounded_channel(); + let (basebackup_cache, _) = BasebackupCache::new(Utf8PathBuf::new(), None); let tenant = Arc::new(TenantShard::new( TenantState::Attaching, @@ -6018,7 +6018,7 @@ pub(crate) mod harness { self.deletion_queue.new_client(), // TODO: ideally we should run all unit tests with both configs L0FlushGlobalState::new(L0FlushConfig::default()), - basebackup_requst_sender, + basebackup_cache, FeatureResolver::new_disabled(), )); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 76937dd959..0a494e7923 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -2891,14 +2891,18 @@ mod tests { use std::collections::BTreeMap; use std::sync::Arc; + use camino::Utf8PathBuf; use storage_broker::BrokerClientChannel; use tracing::Instrument; use super::super::harness::TenantHarness; use super::TenantsMap; 
- use crate::tenant::{ - TenantSharedResources, - mgr::{BackgroundPurges, TenantManager, TenantSlot}, + use crate::{ + basebackup_cache::BasebackupCache, + tenant::{ + TenantSharedResources, + mgr::{BackgroundPurges, TenantManager, TenantSlot}, + }, }; #[tokio::test(start_paused = true)] @@ -2924,9 +2928,7 @@ mod tests { // Invoke remove_tenant_from_memory with a cleanup hook that blocks until we manually // permit it to proceed: that will stick the tenant in InProgress - let (basebackup_prepare_sender, _) = tokio::sync::mpsc::unbounded_channel::< - crate::basebackup_cache::BasebackupPrepareRequest, - >(); + let (basebackup_cache, _) = BasebackupCache::new(Utf8PathBuf::new(), None); let tenant_manager = TenantManager { tenants: std::sync::RwLock::new(TenantsMap::Open(tenants)), @@ -2940,7 +2942,7 @@ mod tests { l0_flush_global_state: crate::l0_flush::L0FlushGlobalState::new( h.conf.l0_flush.clone(), ), - basebackup_prepare_sender, + basebackup_cache, feature_resolver: crate::feature_resolver::FeatureResolver::new_disabled(), }, cancel: tokio_util::sync::CancellationToken::new(), diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 4ca005bfd4..bec2f0ed52 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -95,12 +95,12 @@ use super::storage_layer::{LayerFringe, LayerVisibilityHint, ReadableLayer}; use super::tasks::log_compaction_error; use super::upload_queue::NotInitialized; use super::{ - AttachedTenantConf, BasebackupPrepareSender, GcError, HeatMapTimeline, MaybeOffloaded, + AttachedTenantConf, GcError, HeatMapTimeline, MaybeOffloaded, debug_assert_current_span_has_tenant_and_timeline_id, }; use crate::PERF_TRACE_TARGET; use crate::aux_file::AuxFileSizeEstimator; -use crate::basebackup_cache::BasebackupPrepareRequest; +use crate::basebackup_cache::BasebackupCache; use crate::config::PageServerConf; use crate::context::{ DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder, @@ -201,7 +201,7 @@ pub struct TimelineResources { pub pagestream_throttle_metrics: Arc, pub l0_compaction_trigger: Arc, pub l0_flush_global_state: l0_flush::L0FlushGlobalState, - pub basebackup_prepare_sender: BasebackupPrepareSender, + pub basebackup_cache: Arc, pub feature_resolver: FeatureResolver, } @@ -448,7 +448,7 @@ pub struct Timeline { wait_lsn_log_slow: tokio::sync::Semaphore, /// A channel to send async requests to prepare a basebackup for the basebackup cache. - basebackup_prepare_sender: BasebackupPrepareSender, + basebackup_cache: Arc, feature_resolver: FeatureResolver, } @@ -2500,6 +2500,13 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.basebackup_cache_enabled) } + /// Try to get a basebackup from the on-disk cache. + pub(crate) async fn get_cached_basebackup(&self, lsn: Lsn) -> Option { + self.basebackup_cache + .get(self.tenant_shard_id.tenant_id, self.timeline_id, lsn) + .await + } + /// Prepare basebackup for the given LSN and store it in the basebackup cache. /// The method is asynchronous and returns immediately. /// The actual basebackup preparation is performed in the background @@ -2521,17 +2528,8 @@ impl Timeline { return; } - let res = self - .basebackup_prepare_sender - .send(BasebackupPrepareRequest { - tenant_shard_id: self.tenant_shard_id, - timeline_id: self.timeline_id, - lsn, - }); - if let Err(e) = res { - // May happen during shutdown, it's not critical. 
- info!("Failed to send shutdown checkpoint: {e:#}"); - } + self.basebackup_cache + .send_prepare(self.tenant_shard_id, self.timeline_id, lsn); } } @@ -3088,7 +3086,7 @@ impl Timeline { wait_lsn_log_slow: tokio::sync::Semaphore::new(1), - basebackup_prepare_sender: resources.basebackup_prepare_sender, + basebackup_cache: resources.basebackup_cache, feature_resolver: resources.feature_resolver, };