From bae793ffcd90470b26380053fe931a91545798a5 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Tue, 10 Sep 2024 15:36:08 +0200 Subject: [PATCH 01/30] proxy: Handle all let underscore instances (#8898) * Most can be simply replaced * One instance renamed to _rtchk (return-type check) --- proxy/src/cache/endpoints.rs | 2 +- proxy/src/console/messages.rs | 16 ++++++++-------- proxy/src/context.rs | 10 +++++++--- proxy/src/context/parquet.rs | 2 +- proxy/src/lib.rs | 2 +- proxy/src/proxy/tests.rs | 2 +- .../connection_with_credentials_provider.rs | 7 +++++-- proxy/src/stream.rs | 15 +++++++++------ proxy/src/url.rs | 2 +- 9 files changed, 34 insertions(+), 24 deletions(-) diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index f4762232d8..27121ce89e 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -242,6 +242,6 @@ mod tests { #[test] fn test() { let s = "{\"branch_created\":null,\"endpoint_created\":{\"endpoint_id\":\"ep-rapid-thunder-w0qqw2q9\"},\"project_created\":null,\"type\":\"endpoint_created\"}"; - let _: ControlPlaneEventKey = serde_json::from_str(s).unwrap(); + serde_json::from_str::(s).unwrap(); } } diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index a48c7316f6..9b66333cd4 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -395,7 +395,7 @@ mod tests { } } }); - let _: KickSession<'_> = serde_json::from_str(&json.to_string())?; + serde_json::from_str::>(&json.to_string())?; Ok(()) } @@ -403,7 +403,7 @@ mod tests { #[test] fn parse_db_info() -> anyhow::Result<()> { // with password - let _: DatabaseInfo = serde_json::from_value(json!({ + serde_json::from_value::(json!({ "host": "localhost", "port": 5432, "dbname": "postgres", @@ -413,7 +413,7 @@ mod tests { }))?; // without password - let _: DatabaseInfo = serde_json::from_value(json!({ + serde_json::from_value::(json!({ "host": "localhost", "port": 5432, "dbname": "postgres", @@ -422,7 +422,7 @@ mod tests { }))?; // new field (forward compatibility) - let _: DatabaseInfo = serde_json::from_value(json!({ + serde_json::from_value::(json!({ "host": "localhost", "port": 5432, "dbname": "postgres", @@ -441,7 +441,7 @@ mod tests { "address": "0.0.0.0", "aux": dummy_aux(), }); - let _: WakeCompute = serde_json::from_str(&json.to_string())?; + serde_json::from_str::(&json.to_string())?; Ok(()) } @@ -451,18 +451,18 @@ mod tests { let json = json!({ "role_secret": "secret", }); - let _: GetRoleSecret = serde_json::from_str(&json.to_string())?; + serde_json::from_str::(&json.to_string())?; let json = json!({ "role_secret": "secret", "allowed_ips": ["8.8.8.8"], }); - let _: GetRoleSecret = serde_json::from_str(&json.to_string())?; + serde_json::from_str::(&json.to_string())?; let json = json!({ "role_secret": "secret", "allowed_ips": ["8.8.8.8"], "project_id": "project", }); - let _: GetRoleSecret = serde_json::from_str(&json.to_string())?; + serde_json::from_str::(&json.to_string())?; Ok(()) } diff --git a/proxy/src/context.rs b/proxy/src/context.rs index 72e1fa1cee..c013218ad9 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -6,7 +6,7 @@ use pq_proto::StartupMessageParams; use smol_str::SmolStr; use std::net::IpAddr; use tokio::sync::mpsc; -use tracing::{field::display, info, info_span, Span}; +use tracing::{debug, field::display, info, info_span, Span}; use try_lock::TryLock; use uuid::Uuid; @@ -362,7 +362,9 @@ impl RequestMonitoringInner { }); } if let Some(tx) = self.sender.take() { - let _: Result<(), _> = 
tx.send(RequestData::from(&*self)); + tx.send(RequestData::from(&*self)) + .inspect_err(|e| debug!("tx send failed: {e}")) + .ok(); } } @@ -371,7 +373,9 @@ impl RequestMonitoringInner { // Here we log the length of the session. self.disconnect_timestamp = Some(Utc::now()); if let Some(tx) = self.disconnect_sender.take() { - let _: Result<(), _> = tx.send(RequestData::from(&*self)); + tx.send(RequestData::from(&*self)) + .inspect_err(|e| debug!("tx send failed: {e}")) + .ok(); } } } diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index fafea2a08f..9f6f83022e 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -290,7 +290,7 @@ async fn worker_inner( } if !w.flushed_row_groups().is_empty() { - let _: Writer = upload_parquet(w, len, &storage).await?; + let _rtchk: Writer = upload_parquet(w, len, &storage).await?; } Ok(()) diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index 923d6ae288..0070839aa8 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -3,7 +3,7 @@ #![deny( deprecated, future_incompatible, - // TODO: consider let_underscore + let_underscore, nonstandard_style, rust_2024_compatibility )] diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 4264dbae0f..752d982726 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -268,7 +268,7 @@ async fn keepalive_is_inherited() -> anyhow::Result<()> { anyhow::Ok(keepalive) }); - let _ = TcpStream::connect(("127.0.0.1", port)).await?; + TcpStream::connect(("127.0.0.1", port)).await?; assert!(t.await??, "keepalive should be inherited"); Ok(()) diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs index 7d222e2dec..2de66b58b1 100644 --- a/proxy/src/redis/connection_with_credentials_provider.rs +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -6,7 +6,7 @@ use redis::{ ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisResult, }; use tokio::task::JoinHandle; -use tracing::{error, info}; +use tracing::{debug, error, info}; use super::elasticache::CredentialsProvider; @@ -109,7 +109,10 @@ impl ConnectionWithCredentialsProvider { let credentials_provider = credentials_provider.clone(); let con2 = con.clone(); let f = tokio::spawn(async move { - let _ = Self::keep_connection(con2, credentials_provider).await; + Self::keep_connection(con2, credentials_provider) + .await + .inspect_err(|e| debug!("keep_connection failed: {e}")) + .ok(); }); self.refresh_token_task = Some(f); } diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index c14dd18afe..e2fc73235e 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -12,6 +12,7 @@ use std::{io, task}; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; use tokio_rustls::server::TlsStream; +use tracing::debug; /// Stream wrapper which implements libpq's protocol. 
/// @@ -138,9 +139,10 @@ impl PqStream { ); // already error case, ignore client IO error - let _: Result<_, std::io::Error> = self - .write_message(&BeMessage::ErrorResponse(msg, None)) - .await; + self.write_message(&BeMessage::ErrorResponse(msg, None)) + .await + .inspect_err(|e| debug!("write_message failed: {e}")) + .ok(); Err(ReportedError { source: anyhow::anyhow!(msg), @@ -164,9 +166,10 @@ impl PqStream { ); // already error case, ignore client IO error - let _: Result<_, std::io::Error> = self - .write_message(&BeMessage::ErrorResponse(&msg, None)) - .await; + self.write_message(&BeMessage::ErrorResponse(&msg, None)) + .await + .inspect_err(|e| debug!("write_message failed: {e}")) + .ok(); Err(ReportedError { source: anyhow::anyhow!(error), diff --git a/proxy/src/url.rs b/proxy/src/url.rs index 28ac7efdfc..270cd7c24d 100644 --- a/proxy/src/url.rs +++ b/proxy/src/url.rs @@ -57,7 +57,7 @@ mod tests { fn bad_url() { let url = "test:foobar"; url.parse::().expect("unexpected parsing failure"); - let _ = url.parse::().expect_err("should not parse"); + url.parse::().expect_err("should not parse"); } #[test] From cb060548fb2115ca6a57a95c6c947c45fc2095a6 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 11 Sep 2024 18:45:34 +0100 Subject: [PATCH 02/30] libs: tweak PageserverUtilization::is_overloaded (#8946) ## Problem Having run in production for a while, we see that nodes are generally safely oversubscribed by about a factor of 2. ## Summary of changes Tweak the is_overloaded method to check for utililzation over 200% rather than over 100% --- libs/pageserver_api/src/models/utilization.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/libs/pageserver_api/src/models/utilization.rs b/libs/pageserver_api/src/models/utilization.rs index 844a0cda5d..641aa51989 100644 --- a/libs/pageserver_api/src/models/utilization.rs +++ b/libs/pageserver_api/src/models/utilization.rs @@ -89,8 +89,19 @@ impl PageserverUtilization { /// If a node is currently hosting more work than it can comfortably handle. This does not indicate that /// it will fail, but it is a strong signal that more work should not be added unless there is no alternative. + /// + /// When a node is overloaded, we may override soft affinity preferences and do things like scheduling + /// into a node in a less desirable AZ, if all the nodes in the preferred AZ are overloaded. pub fn is_overloaded(score: RawScore) -> bool { - score >= Self::UTILIZATION_FULL + // Why the factor of two? This is unscientific but reflects behavior of real systems: + // - In terms of shard counts, a node's preferred max count is a soft limit intended to keep + // startup and housekeeping jobs nice and responsive. We can go to double this limit if needed + // until some more nodes are deployed. + // - In terms of disk space, the node's utilization heuristic assumes every tenant needs to + // hold its biggest timeline fully on disk, which is tends to be an over estimate when + // some tenants are very idle and have dropped layers from disk. In practice going up to + // double is generally better than giving up and scheduling in a sub-optimal AZ. 
+ score >= 2 * Self::UTILIZATION_FULL } pub fn adjust_shard_count_max(&mut self, shard_count: u32) { From 43846b72fa488b96d37bbc40d691bbf4e4f8fdd3 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sun, 8 Sep 2024 21:40:30 +0300 Subject: [PATCH 03/30] Remove unused "neon_local init --pg-version" arg It has been unused since commit 8712e1899e, when it stopped creating the initial timeline. --- control_plane/src/bin/neon_local.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 1d66532d49..af6545f8d2 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -1570,7 +1570,6 @@ fn cli() -> Command { .value_parser(value_parser!(PathBuf)) .value_name("config") ) - .arg(pg_version_arg.clone()) .arg(force_arg) ) .subcommand( From aeca15008c15b211d74536439ff701e533a412ef Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 11 Sep 2024 10:55:41 +0300 Subject: [PATCH 04/30] Remove obsolete and misleading comment The tenant ID was not actually generated here but in NeonEnvBuilder. And the "neon_local init" command hasn't been able to generate the initial tenant since 8712e1899e anyway. --- test_runner/fixtures/neon_fixtures.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 60887b9aed..22472559f4 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1073,9 +1073,6 @@ class NeonEnv: self.pg_distrib_dir = config.pg_distrib_dir self.endpoint_counter = 0 self.storage_controller_config = config.storage_controller_config - - # generate initial tenant ID here instead of letting 'neon init' generate it, - # so that we don't need to dig it out of the config file afterwards. self.initial_tenant = config.initial_tenant self.initial_timeline = config.initial_timeline From 0a363c3dce2fbd76f6483f02c2273e3f7b205b3e Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 11 Sep 2024 12:38:31 +0300 Subject: [PATCH 05/30] Add --timeline-id option to "neon_local timeline branch" command Makes it consistent with the "timeline create" and "timeline import" commands, which allowed you to pass the timeline id as argument. This also makes it unnecessary to parse the timeline ID from the output in the python function that calls it. 
--- control_plane/src/bin/neon_local.rs | 4 ++- test_runner/fixtures/neon_fixtures.py | 43 ++++++++++----------------- 2 files changed, 18 insertions(+), 29 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index af6545f8d2..144cd647c9 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -640,6 +640,8 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local } Some(("branch", branch_match)) => { let tenant_id = get_tenant_id(branch_match, env)?; + let new_timeline_id = + parse_timeline_id(branch_match)?.unwrap_or(TimelineId::generate()); let new_branch_name = branch_match .get_one::("branch-name") .ok_or_else(|| anyhow!("No branch name provided"))?; @@ -658,7 +660,6 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local .map(|lsn_str| Lsn::from_str(lsn_str)) .transpose() .context("Failed to parse ancestor start Lsn from the request")?; - let new_timeline_id = TimelineId::generate(); let storage_controller = StorageController::from_env(env); let create_req = TimelineCreateRequest { new_timeline_id, @@ -1582,6 +1583,7 @@ fn cli() -> Command { .subcommand(Command::new("branch") .about("Create a new timeline, using another timeline as a base, copying its data") .arg(tenant_id_arg.clone()) + .arg(timeline_id_arg.clone()) .arg(branch_name_arg.clone()) .arg(Arg::new("ancestor-branch-name").long("ancestor-branch-name") .help("Use last Lsn of another timeline (and its data) as base when creating the new timeline. The timeline gets resolved by its branch name.").required(false)) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 22472559f4..1c33d14154 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1518,14 +1518,6 @@ class PageserverPort: http: int -CREATE_TIMELINE_ID_EXTRACTOR: re.Pattern = re.compile( # type: ignore[type-arg] - r"^Created timeline '(?P[^']+)'", re.MULTILINE -) -TIMELINE_DATA_EXTRACTOR: re.Pattern = re.compile( # type: ignore[type-arg] - r"\s?(?P[^\s]+)\s\[(?P[^\]]+)\]", re.MULTILINE -) - - class AbstractNeonCli(abc.ABC): """ A typed wrapper around an arbitrary Neon CLI tool. 
@@ -1754,6 +1746,9 @@ class NeonCli(AbstractNeonCli): tenant_id: Optional[TenantId] = None, timeline_id: Optional[TimelineId] = None, ) -> TimelineId: + if timeline_id is None: + timeline_id = TimelineId.generate() + cmd = [ "timeline", "create", @@ -1761,23 +1756,16 @@ class NeonCli(AbstractNeonCli): new_branch_name, "--tenant-id", str(tenant_id or self.env.initial_tenant), + "--timeline-id", + str(timeline_id), "--pg-version", self.env.pg_version, ] - if timeline_id is not None: - cmd.extend(["--timeline-id", str(timeline_id)]) - res = self.raw_cli(cmd) res.check_returncode() - matches = CREATE_TIMELINE_ID_EXTRACTOR.search(res.stdout) - - created_timeline_id = None - if matches is not None: - created_timeline_id = matches.group("timeline_id") - - return TimelineId(str(created_timeline_id)) + return timeline_id def create_branch( self, @@ -1785,12 +1773,17 @@ class NeonCli(AbstractNeonCli): ancestor_branch_name: Optional[str] = None, tenant_id: Optional[TenantId] = None, ancestor_start_lsn: Optional[Lsn] = None, + new_timeline_id: Optional[TimelineId] = None, ) -> TimelineId: + if new_timeline_id is None: + new_timeline_id = TimelineId.generate() cmd = [ "timeline", "branch", "--branch-name", new_branch_name, + "--timeline-id", + str(new_timeline_id), "--tenant-id", str(tenant_id or self.env.initial_tenant), ] @@ -1802,16 +1795,7 @@ class NeonCli(AbstractNeonCli): res = self.raw_cli(cmd) res.check_returncode() - matches = CREATE_TIMELINE_ID_EXTRACTOR.search(res.stdout) - - created_timeline_id = None - if matches is not None: - created_timeline_id = matches.group("timeline_id") - - if created_timeline_id is None: - raise Exception("could not find timeline id after `neon timeline create` invocation") - else: - return TimelineId(str(created_timeline_id)) + return TimelineId(str(new_timeline_id)) def list_timelines(self, tenant_id: Optional[TenantId] = None) -> List[Tuple[str, TimelineId]]: """ @@ -1820,6 +1804,9 @@ class NeonCli(AbstractNeonCli): # main [b49f7954224a0ad25cc0013ea107b54b] # ┣━ @0/16B5A50: test_cli_branch_list_main [20f98c79111b9015d84452258b7d5540] + TIMELINE_DATA_EXTRACTOR: re.Pattern = re.compile( # type: ignore[type-arg] + r"\s?(?P[^\s]+)\s\[(?P[^\]]+)\]", re.MULTILINE + ) res = self.raw_cli( ["timeline", "list", "--tenant-id", str(tenant_id or self.env.initial_tenant)] ) From 8dc069037b003e63c77683670be4e965384e794b Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 11 Sep 2024 15:01:34 +0300 Subject: [PATCH 06/30] Remove NeonEnvBuilder.start() function It feels wrong to me to start() from the builder object. Surely the thing you start is the environment itself, not its configuration. --- test_runner/fixtures/neon_fixtures.py | 6 +----- test_runner/performance/test_storage_controller_scale.py | 2 +- test_runner/regress/test_compatibility.py | 4 ++-- test_runner/regress/test_sharding.py | 6 +++--- 4 files changed, 7 insertions(+), 11 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 1c33d14154..ee62372871 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -553,10 +553,6 @@ class NeonEnvBuilder: self.env = NeonEnv(self) return self.env - def start(self): - assert self.env is not None, "environment is not already initialized, call init() first" - self.env.start() - def init_start( self, initial_tenant_conf: Optional[Dict[str, Any]] = None, @@ -572,7 +568,7 @@ class NeonEnvBuilder: Configuring pageserver with remote storage is now the default. 
There will be a warning if pageserver is created without one. """ env = self.init_configs(default_remote_storage_if_missing=default_remote_storage_if_missing) - self.start() + env.start() # Prepare the default branch to start the postgres on later. # Pageserver itself does not create tenants and timelines, until started first and asked via HTTP API. diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index 297aedfbed..a186bbaceb 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -84,7 +84,7 @@ def test_storage_controller_many_tenants( compute_reconfigure_listener.register_on_notify(lambda body: time.sleep(0.01)) env = neon_env_builder.init_configs() - neon_env_builder.start() + env.start() # We will intentionally stress reconciler concurrrency, which triggers a warning when lots # of shards are hitting the delayed path. diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 467e5b1734..b559be5f18 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -178,7 +178,7 @@ def test_backward_compatibility( neon_env_builder.num_safekeepers = 3 env = neon_env_builder.from_repo_dir(compatibility_snapshot_dir / "repo") env.pageserver.allowed_errors.append(ingest_lag_log_line) - neon_env_builder.start() + env.start() check_neon_works( env, @@ -265,7 +265,7 @@ def test_forward_compatibility( # does not include logs from previous runs assert not env.pageserver.log_contains("git-env:" + prev_pageserver_version) - neon_env_builder.start() + env.start() # ensure the specified pageserver is running assert env.pageserver.log_contains("git-env:" + prev_pageserver_version) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index bfd82242e9..4a84dca399 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -374,7 +374,7 @@ def test_sharding_split_smoke( non_default_tenant_config = {"gc_horizon": 77 * 1024 * 1024} env = neon_env_builder.init_configs(True) - neon_env_builder.start() + env.start() tenant_id = TenantId.generate() timeline_id = TimelineId.generate() env.neon_cli.create_tenant( @@ -1436,7 +1436,7 @@ def test_sharding_unlogged_relation(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_pageservers = 2 env = neon_env_builder.init_configs() - neon_env_builder.start() + env.start() tenant_id = TenantId.generate() timeline_id = TimelineId.generate() @@ -1475,7 +1475,7 @@ def test_top_tenants(neon_env_builder: NeonEnvBuilder): """ env = neon_env_builder.init_configs() - neon_env_builder.start() + env.start() tenants = [] n_tenants = 8 From 9e3ead3689b012d344afbac4fcbf000372bb9969 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 11 Sep 2024 18:43:42 +0100 Subject: [PATCH 07/30] Collect the last of on-demand WAL download in CreateReplicationSlot reverts Signed-off-by: Tristan Partin --- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 49d5e576a5..6f6d77fb59 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 49d5e576a56e4cc59cd6a6a0791b2324b9fa675e +Subproject commit 6f6d77fb5960602fcd3fd130aca9f99ecb1619c9 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 
6e9a4ff624..0baa7346df 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 6e9a4ff6249ac02b8175054b7b3f7dfb198be48b +Subproject commit 0baa7346dfd42d61912eeca554c9bb0a190f0a1e diff --git a/vendor/revisions.json b/vendor/revisions.json index e52576e61f..3a65a507f3 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,11 +1,11 @@ { "v16": [ "16.4", - "6e9a4ff6249ac02b8175054b7b3f7dfb198be48b" + "0baa7346dfd42d61912eeca554c9bb0a190f0a1e" ], "v15": [ "15.8", - "49d5e576a56e4cc59cd6a6a0791b2324b9fa675e" + "6f6d77fb5960602fcd3fd130aca9f99ecb1619c9" ], "v14": [ "14.13", From fcab61bdcd9e30f2e2f6ce5be59e34bb98068f2f Mon Sep 17 00:00:00 2001 From: Stefan Radig Date: Thu, 12 Sep 2024 15:55:12 +0100 Subject: [PATCH 08/30] Prototype implementation for private access poc (#8976) ## Problem For the Private Access POC we want users to be able to disable access from the public proxy. To limit the number of changes this can be done by configuring an IP allowlist [ "255.255.255.255" ]. For the Private Access proxy a new commandline flag allows to disable IP allowlist completely. See https://www.notion.so/neondatabase/Neon-Private-Access-POC-Proposal-8f707754e1ab4190ad5709da7832f020?d=887495c15e884aa4973f973a8a0a582a#7ac6ec249b524a74adbeddc4b84b8f5f for details about the POC., ## Summary of changes - Adding the commandline flag is_private_access_proxy=true will disable IP allowlist --- proxy/src/auth/backend.rs | 5 ++++- proxy/src/auth/credentials.rs | 13 +++++++++++++ proxy/src/bin/local_proxy.rs | 1 + proxy/src/bin/proxy.rs | 5 +++++ proxy/src/config.rs | 1 + proxy/src/serverless/backend.rs | 4 +++- 6 files changed, 27 insertions(+), 2 deletions(-) diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 1d28c6df31..5561c9c56d 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -311,7 +311,9 @@ async fn auth_quirks( let (allowed_ips, maybe_secret) = api.get_allowed_ips_and_secret(ctx, &info).await?; // check allowed list - if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) { + if config.ip_allowlist_check_enabled + && !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) + { return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr())); } @@ -603,6 +605,7 @@ mod tests { rate_limiter_enabled: true, rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET), rate_limit_ip_subnet: 64, + ip_allowlist_check_enabled: true, }); async fn read_message(r: &mut (impl AsyncRead + Unpin), b: &mut BytesMut) -> PgMessage { diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 0e91ae570a..cba8601d14 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -538,4 +538,17 @@ mod tests { )); Ok(()) } + + #[test] + fn test_connection_blocker() { + fn check(v: serde_json::Value) -> bool { + let peer_addr = IpAddr::from([127, 0, 0, 1]); + let ip_list: Vec = serde_json::from_value(v).unwrap(); + check_peer_addr_is_in_list(&peer_addr, &ip_list) + } + + assert!(check(json!([]))); + assert!(check(json!(["127.0.0.1"]))); + assert!(!check(json!(["255.255.255.255"]))); + } } diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index 08effeff99..6eba71df1b 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -224,6 +224,7 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig rate_limiter_enabled: false, rate_limiter: BucketRateLimiter::new(vec![]), rate_limit_ip_subnet: 64, + 
ip_allowlist_check_enabled: true, }, require_client_ip: false, handshake_timeout: Duration::from_secs(10), diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 7706a1f7cd..ca9aeb04d8 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -224,6 +224,10 @@ struct ProxyCliArgs { /// Whether to retry the wake_compute request #[clap(long, default_value = config::RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)] wake_compute_retry: String, + + /// Configure if this is a private access proxy for the POC: In that case the proxy will ignore the IP allowlist + #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + is_private_access_proxy: bool, } #[derive(clap::Args, Clone, Copy, Debug)] @@ -682,6 +686,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { rate_limiter_enabled: args.auth_rate_limit_enabled, rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()), rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet, + ip_allowlist_check_enabled: !args.is_private_access_proxy, }; let config = Box::leak(Box::new(ProxyConfig { diff --git a/proxy/src/config.rs b/proxy/src/config.rs index d7fc6eee22..1cda6d200c 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -64,6 +64,7 @@ pub struct AuthenticationConfig { pub rate_limiter_enabled: bool, pub rate_limiter: AuthRateLimiter, pub rate_limit_ip_subnet: u8, + pub ip_allowlist_check_enabled: bool, } impl TlsConfig { diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index f24e0478be..d163878528 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -50,7 +50,9 @@ impl PoolingBackend { .as_ref() .map(|()| user_info.clone()); let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?; - if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) { + if config.ip_allowlist_check_enabled + && !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) + { return Err(AuthError::ip_address_not_allowed(ctx.peer_addr())); } if !self From 78938d1b591b33d23495a0edb8b123cc5cac6a27 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Thu, 12 Sep 2024 23:18:41 +0100 Subject: [PATCH 09/30] [compute/postgres] feature: PostgreSQL 17 (#8573) This adds preliminary PG17 support to Neon, based on RC1 / 2024-09-04 https://github.com/postgres/postgres/commit/07b828e9d4aa916f1763774787440d914eea69c4 NOTICE: The data produced by the included version of the PostgreSQL fork may not be compatible with the future full release of PostgreSQL 17 due to expected or unexpected future changes in magic numbers and internals. DO NOT EXPECT DATA IN V17-TENANTS TO BE COMPATIBLE WITH THE 17.0 RELEASE! 
Co-authored-by: Anastasia Lubennikova Co-authored-by: Alexander Bayandin Co-authored-by: Konstantin Knizhnik Co-authored-by: Heikki Linnakangas --- .github/workflows/_build-and-test-locally.yml | 19 +- .github/workflows/build_and_test.yml | 27 +- .github/workflows/neon_extra_builds.yml | 15 + .gitmodules | 4 + Dockerfile | 12 +- Dockerfile.compute-node | 289 +++- Makefile | 56 +- compute_tools/src/compute.rs | 31 +- compute_tools/src/extension_server.rs | 1 + control_plane/src/local_env.rs | 2 +- control_plane/src/storage_controller.rs | 40 +- libs/pageserver_api/src/key.rs | 39 +- libs/postgres_ffi/build.rs | 2 +- libs/postgres_ffi/src/lib.rs | 5 + libs/postgres_ffi/src/pg_constants.rs | 39 +- libs/postgres_ffi/src/pg_constants_v14.rs | 27 + libs/postgres_ffi/src/pg_constants_v15.rs | 2 + libs/postgres_ffi/src/pg_constants_v16.rs | 2 + libs/postgres_ffi/src/pg_constants_v17.rs | 55 + libs/postgres_ffi/wal_craft/src/lib.rs | 2 +- libs/walproposer/build.rs | 9 +- pageserver/ctl/src/layer_map_analyzer.rs | 16 +- pageserver/src/basebackup.rs | 16 +- pageserver/src/config.rs | 2 +- pageserver/src/import_datadir.rs | 6 +- pageserver/src/pgdatadir_mapping.rs | 111 +- pageserver/src/walingest.rs | 191 ++- pageserver/src/walrecord.rs | 73 +- pgxn/neon/bitmap.h | 12 + pgxn/neon/file_cache.c | 504 ++++-- pgxn/neon/libpagestore.c | 4 + pgxn/neon/neon_pgversioncompat.h | 14 +- pgxn/neon/pagestore_client.h | 54 +- pgxn/neon/pagestore_smgr.c | 1360 ++++++++++++----- pgxn/neon/walproposer_pg.c | 39 +- pgxn/neon_rmgr/neon_rmgr_decode.c | 399 ++++- pgxn/neon_walredo/inmem_smgr.c | 79 +- pgxn/neon_walredo/inmem_smgr.h | 2 +- pgxn/neon_walredo/walredoproc.c | 14 +- test_runner/fixtures/common_types.py | 2 +- test_runner/fixtures/neon_fixtures.py | 9 +- test_runner/fixtures/pg_version.py | 1 + .../5670669815/v17/ext_index.json | 7 + test_runner/regress/test_compatibility.py | 8 +- .../regress/test_download_extensions.py | 2 + test_runner/regress/test_postgres_version.py | 17 +- .../regress/test_timeline_detach_ancestor.py | 3 + test_runner/regress/test_twophase.py | 70 +- vendor/postgres-v17 | 1 + 49 files changed, 2907 insertions(+), 787 deletions(-) create mode 100644 libs/postgres_ffi/src/pg_constants_v17.rs create mode 100644 pgxn/neon/bitmap.h create mode 100644 test_runner/regress/data/extension_test/5670669815/v17/ext_index.json create mode 160000 vendor/postgres-v17 diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index e18e6a1201..67152b6991 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -62,7 +62,7 @@ jobs: # git config --global --add safe.directory ${{ github.workspace }} git config --global --add safe.directory ${GITHUB_WORKSPACE} - for r in 14 15 16; do + for r in 14 15 16 17; do git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" done @@ -83,6 +83,10 @@ jobs: id: pg_v16_rev run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT + - name: Set pg 17 revision for caching + id: pg_v17_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) >> $GITHUB_OUTPUT + # Set some environment variables used by all the steps. # # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc. 
@@ -136,6 +140,13 @@ jobs: path: pg_install/v16 key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + - name: Cache postgres v17 build + id: cache_pg_17 + uses: actions/cache@v4 + with: + path: pg_install/v17 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + - name: Build postgres v14 if: steps.cache_pg_14.outputs.cache-hit != 'true' run: mold -run make postgres-v14 -j$(nproc) @@ -148,6 +159,10 @@ jobs: if: steps.cache_pg_16.outputs.cache-hit != 'true' run: mold -run make postgres-v16 -j$(nproc) + - name: Build postgres v17 + if: steps.cache_pg_17.outputs.cache-hit != 'true' + run: mold -run make postgres-v17 -j$(nproc) + - name: Build neon extensions run: mold -run make neon-pg-ext -j$(nproc) @@ -210,7 +225,7 @@ jobs: run: | PQ_LIB_DIR=$(pwd)/pg_install/v16/lib export PQ_LIB_DIR - LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib + LD_LIBRARY_PATH=$(pwd)/pg_install/v17/lib export LD_LIBRARY_PATH #nextest does not yet support running doctests diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 4bb9e5cb66..7c06fd9ab8 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -211,7 +211,7 @@ jobs: build-tag: ${{ needs.tag.outputs.build-tag }} build-type: ${{ matrix.build-type }} # Run tests on all Postgres versions in release builds and only on the latest version in debug builds - pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16"]' || '["v16"]' }} + pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16", "v17"]' || '["v17"]' }} secrets: inherit # Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking @@ -548,7 +548,7 @@ jobs: strategy: fail-fast: false matrix: - version: [ v14, v15, v16 ] + version: [ v14, v15, v16, v17 ] arch: [ x64, arm64 ] runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} @@ -627,7 +627,7 @@ jobs: - name: Build compute-tools image # compute-tools are Postgres independent, so build it only once - if: matrix.version == 'v16' + if: matrix.version == 'v17' uses: docker/build-push-action@v6 with: target: compute-tools-image @@ -649,7 +649,7 @@ jobs: strategy: matrix: - version: [ v14, v15, v16 ] + version: [ v14, v15, v16, v17 ] steps: - uses: docker/login-action@v3 @@ -671,7 +671,7 @@ jobs: neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64 - name: Create multi-arch compute-tools image - if: matrix.version == 'v16' + if: matrix.version == 'v17' run: | docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \ neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-x64 \ @@ -689,7 +689,7 @@ jobs: neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} - name: Push multi-arch compute-tools image to ECR - if: matrix.version == 'v16' + if: matrix.version == 'v17' run: | docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \ neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} @@ -700,7 +700,7 @@ jobs: strategy: fail-fast: false matrix: - version: [ v14, v15, v16 ] + version: [ v14, v15, v16, v17 ] env: VM_BUILDER_VERSION: v0.29.3 
@@ -798,7 +798,7 @@ jobs: runs-on: ubuntu-22.04 env: - VERSIONS: v14 v15 v16 + VERSIONS: v14 v15 v16 v17 steps: - uses: docker/login-action@v3 @@ -839,7 +839,7 @@ jobs: done done docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \ - neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }} + neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }} - name: Login to prod ECR uses: docker/login-action@v3 @@ -852,7 +852,7 @@ jobs: - name: Copy all images to prod ECR if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' run: | - for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do + for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16,v17}; do docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \ 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} done @@ -864,7 +864,7 @@ jobs: with: client_id: ${{ vars.AZURE_DEV_CLIENT_ID }} image_tag: ${{ needs.tag.outputs.build-tag }} - images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 compute-node-v14 compute-node-v15 compute-node-v16 + images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17 registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} tenant_id: ${{ vars.AZURE_TENANT_ID }} @@ -876,7 +876,7 @@ jobs: with: client_id: ${{ vars.AZURE_PROD_CLIENT_ID }} image_tag: ${{ needs.tag.outputs.build-tag }} - images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 compute-node-v14 compute-node-v15 compute-node-v16 + images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17 registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }} subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} tenant_id: ${{ vars.AZURE_TENANT_ID }} @@ -971,7 +971,7 @@ jobs: # git config --global --add safe.directory ${{ github.workspace }} git config --global --add safe.directory ${GITHUB_WORKSPACE} - for r in 14 15 16; do + for r in 14 15 16 17; do git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" done @@ -1117,6 +1117,7 @@ jobs: files_to_promote+=("s3://${BUCKET}/${s3_key}") + # TODO Add v17 for pg_version in v14 v15 v16; do # We run less tests for debug builds, so we don't need to promote them if [ "${build_type}" == "debug" ] && { [ "${arch}" == "ARM64" ] || [ "${pg_version}" != "v16" ] ; }; then diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 7fecdbde8c..41c9f5dee5 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -72,6 +72,10 @@ jobs: id: pg_v16_rev run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT + - name: Set pg 17 revision for caching + id: pg_v17_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) >> $GITHUB_OUTPUT + - name: Cache postgres v14 build id: cache_pg_14 uses: actions/cache@v4 @@ -93,6 +97,13 @@ jobs: path: pg_install/v16 key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ 
hashFiles('Makefile') }} + - name: Cache postgres v17 build + id: cache_pg_17 + uses: actions/cache@v4 + with: + path: pg_install/v17 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + - name: Set extra env for macOS run: | echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV @@ -120,6 +131,10 @@ jobs: if: steps.cache_pg_16.outputs.cache-hit != 'true' run: make postgres-v16 -j$(sysctl -n hw.ncpu) + - name: Build postgres v17 + if: steps.cache_pg_17.outputs.cache-hit != 'true' + run: make postgres-v17 -j$(sysctl -n hw.ncpu) + - name: Build neon extensions run: make neon-pg-ext -j$(sysctl -n hw.ncpu) diff --git a/.gitmodules b/.gitmodules index 1d925674a1..d1330bf28c 100644 --- a/.gitmodules +++ b/.gitmodules @@ -10,3 +10,7 @@ path = vendor/postgres-v16 url = https://github.com/neondatabase/postgres.git branch = REL_16_STABLE_neon +[submodule "vendor/postgres-v17"] + path = vendor/postgres-v17 + url = https://github.com/neondatabase/postgres.git + branch = REL_17_STABLE_neon diff --git a/Dockerfile b/Dockerfile index 1efedfa9bc..bdb76a4f4f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,6 +5,8 @@ ARG REPOSITORY=neondatabase ARG IMAGE=build-tools ARG TAG=pinned +ARG DEFAULT_PG_VERSION=17 +ARG STABLE_PG_VERSION=16 # Build Postgres FROM $REPOSITORY/$IMAGE:$TAG AS pg-build @@ -13,6 +15,7 @@ WORKDIR /home/nonroot COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14 COPY --chown=nonroot vendor/postgres-v15 vendor/postgres-v15 COPY --chown=nonroot vendor/postgres-v16 vendor/postgres-v16 +COPY --chown=nonroot vendor/postgres-v17 vendor/postgres-v17 COPY --chown=nonroot pgxn pgxn COPY --chown=nonroot Makefile Makefile COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh @@ -28,16 +31,19 @@ FROM $REPOSITORY/$IMAGE:$TAG AS build WORKDIR /home/nonroot ARG GIT_VERSION=local ARG BUILD_TAG +ARG STABLE_PG_VERSION COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server +COPY --from=pg-build /home/nonroot/pg_install/v17/include/postgresql/server pg_install/v17/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib +COPY --from=pg-build /home/nonroot/pg_install/v17/lib pg_install/v17/lib COPY --chown=nonroot . . 
ARG ADDITIONAL_RUSTFLAGS RUN set -e \ - && PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment ${ADDITIONAL_RUSTFLAGS}" cargo build \ + && PQ_LIB_DIR=$(pwd)/pg_install/v${STABLE_PG_VERSION}/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment ${ADDITIONAL_RUSTFLAGS}" cargo build \ --bin pg_sni_router \ --bin pageserver \ --bin pagectl \ @@ -52,6 +58,7 @@ RUN set -e \ # Build final image # FROM debian:bullseye-slim +ARG DEFAULT_PG_VERSION WORKDIR /data RUN set -e \ @@ -77,6 +84,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubbe COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/ COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/ COPY --from=pg-build /home/nonroot/pg_install/v16 /usr/local/v16/ +COPY --from=pg-build /home/nonroot/pg_install/v17 /usr/local/v17/ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/ # By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config. @@ -93,7 +101,7 @@ RUN mkdir -p /data/.neon/ && \ # When running a binary that links with libpq, default to using our most recent postgres version. Binaries # that want a particular postgres version will select it explicitly: this is just a default. -ENV LD_LIBRARY_PATH=/usr/local/v16/lib +ENV LD_LIBRARY_PATH=/usr/local/v${DEFAULT_PG_VERSION}/lib VOLUME ["/data"] diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index b6c89cd71f..fe902eb978 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -79,15 +79,23 @@ RUN cd postgres && \ # ######################################################################################### FROM build-deps AS postgis-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN apt update && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + apt update && \ apt install -y cmake gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \ libboost-system-dev libboost-iostreams-dev libboost-program-options-dev libboost-timer-dev \ libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \ protobuf-c-compiler xsltproc # SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's libcgal-dev is 5.2 -RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + mkdir -p /sfcgal && \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \ echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \ mkdir sfcgal-src && cd sfcgal-src && tar xzf ../SFCGAL.tar.gz --strip-components=1 -C . && \ cmake -DCMAKE_BUILD_TYPE=Release . && make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -96,7 +104,10 @@ RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar ENV PATH="/usr/local/pgsql/bin:$PATH" -RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \ echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \ mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . && \ find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ @@ -122,7 +133,10 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postg cp /usr/local/pgsql/share/extension/address_standardizer.control /extensions/postgis && \ cp /usr/local/pgsql/share/extension/address_standardizer_data_us.control /extensions/postgis -RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \ echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \ mkdir pgrouting-src && cd pgrouting-src && tar xzf ../pgrouting.tar.gz --strip-components=1 -C . && \ mkdir build && cd build && \ @@ -142,12 +156,19 @@ RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouti # ######################################################################################### FROM build-deps AS plv8-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN apt update && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + apt update && \ apt install -y ninja-build python3-dev libncurses5 binutils clang -RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \ echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \ mkdir plv8-src && cd plv8-src && tar xzf ../plv8.tar.gz --strip-components=1 -C . && \ # generate and copy upgrade scripts @@ -172,9 +193,13 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.t # ######################################################################################### FROM build-deps AS h3-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "$(uname -m)" in \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + case "$(uname -m)" in \ "x86_64") \ export CMAKE_CHECKSUM=739d372726cb23129d57a539ce1432453448816e345e1545f6127296926b6754 \ ;; \ @@ -192,7 +217,11 @@ RUN case "$(uname -m)" in \ && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \ && rm /tmp/cmake-install.sh -RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + mkdir -p /h3/usr/ && \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \ echo "ec99f1f5974846bde64f4513cf8d2ea1b8d172d2218ab41803bf6a63532272bc h3.tar.gz" | sha256sum --check && \ mkdir h3-src && cd h3-src && tar xzf ../h3.tar.gz --strip-components=1 -C . 
&& \ mkdir build && cd build && \ @@ -202,7 +231,10 @@ RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz cp -R /h3/usr / && \ rm -rf build -RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \ echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \ mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ @@ -218,9 +250,13 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3 # ######################################################################################### FROM build-deps AS unit-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \ echo "411d05beeb97e5a4abf17572bfcfbb5a68d98d1018918feff995f6ee3bb03e79 postgresql-unit.tar.gz" | sha256sum --check && \ mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -239,6 +275,7 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz - # ######################################################################################### FROM build-deps AS vector-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY patches/pgvector.patch /pgvector.patch @@ -246,7 +283,10 @@ COPY patches/pgvector.patch /pgvector.patch # By default, pgvector Makefile uses `-march=native`. We don't want that, # because we build the images on different machines than where we run them. # Pass OPTFLAGS="" to remove it. -RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O pgvector.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O pgvector.tar.gz && \ echo "617fba855c9bcb41a2a9bc78a78567fd2e147c72afd5bf9d37b31b9591632b30 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \ patch -p1 < /pgvector.patch && \ @@ -261,10 +301,14 @@ RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O # ######################################################################################### FROM build-deps AS pgjwt-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # 9742dab1b2f297ad3811120db7b21451bca2d3c9 made on 13/11/2021 -RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \ echo "cfdefb15007286f67d3d45510f04a6a7a495004be5b3aecb12cda667e774203f pgjwt.tar.gz" | sha256sum --check && \ mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -277,9 +321,13 @@ RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b214 # ######################################################################################### FROM build-deps AS hypopg-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \ echo "0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 hypopg.tar.gz" | sha256sum --check && \ mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -293,9 +341,13 @@ RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypo # ######################################################################################### FROM build-deps AS pg-hashids-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \ echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \ mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ @@ -309,11 +361,15 @@ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz # ######################################################################################### FROM build-deps AS rum-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY patches/rum.patch /rum.patch -RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \ echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \ mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . 
&& \ patch -p1 < /rum.patch && \ @@ -328,9 +384,13 @@ RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O r # ######################################################################################### FROM build-deps AS pgtap-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \ echo "9c7c3de67ea41638e14f06da5da57bac6f5bd03fea05c165a0ec862205a5c052 pgtap.tar.gz" | sha256sum --check && \ mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -344,9 +404,13 @@ RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgta # ######################################################################################### FROM build-deps AS ip4r-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \ echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \ mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -360,9 +424,13 @@ RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O i # ######################################################################################### FROM build-deps AS prefix-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \ echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \ mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -376,9 +444,13 @@ RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O p # ######################################################################################### FROM build-deps AS hll-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \ echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \ mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -392,9 +464,13 @@ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar # ######################################################################################### FROM build-deps AS plpgsql-check-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \ echo "6631ec3e7fb3769eaaf56e3dfedb829aa761abf163d13dba354b4c218508e1c0 plpgsql_check.tar.gz" | sha256sum --check && \ mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ @@ -413,7 +489,10 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION ENV PATH="/usr/local/pgsql/bin:$PATH" -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + case "${PG_VERSION}" in \ "v14" | "v15") \ export TIMESCALEDB_VERSION=2.10.1 \ export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \ @@ -446,7 +525,10 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION ENV PATH="/usr/local/pgsql/bin:$PATH" -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + case "${PG_VERSION}" in \ "v14") \ export PG_HINT_PLAN_VERSION=14_1_4_1 \ export PG_HINT_PLAN_CHECKSUM=c3501becf70ead27f70626bce80ea401ceac6a77e2083ee5f3ff1f1444ec1ad1 \ @@ -459,6 +541,9 @@ RUN case "${PG_VERSION}" in \ export PG_HINT_PLAN_VERSION=16_1_6_0 \ export PG_HINT_PLAN_CHECKSUM=fc85a9212e7d2819d4ae4ac75817481101833c3cfa9f0fe1f980984e12347d00 \ ;; \ + "v17") \ + echo "TODO: PG17 pg_hint_plan support" && exit 0 \ + ;; \ *) \ echo "Export the valid PG_HINT_PLAN_VERSION variable" && exit 1 \ ;; \ @@ -478,10 +563,14 @@ RUN case "${PG_VERSION}" in \ # ######################################################################################### FROM build-deps AS pg-cron-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \ echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \ mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -495,9 +584,13 @@ RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O # ######################################################################################### FROM build-deps AS rdkit-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN apt-get update && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + apt-get update && \ apt-get install -y \ cmake \ libboost-iostreams1.74-dev \ @@ -507,7 +600,10 @@ RUN apt-get update && \ libeigen3-dev ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" -RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \ echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \ mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \ cmake \ @@ -544,10 +640,14 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar. # ######################################################################################### FROM build-deps AS pg-uuidv7-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \ echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \ mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -561,10 +661,14 @@ RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz # ######################################################################################### FROM build-deps AS pg-roaringbitmap-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \ echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \ mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -578,10 +682,14 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4 # ######################################################################################### FROM build-deps AS pg-semver-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \ echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \ mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -599,7 +707,10 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + case "${PG_VERSION}" in \ "v14" | "v15") \ export PG_EMBEDDING_VERSION=0.3.5 \ export PG_EMBEDDING_CHECKSUM=0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 \ @@ -620,10 +731,14 @@ RUN case "${PG_VERSION}" in \ # ######################################################################################### FROM build-deps AS pg-anon-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \ mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \ find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ @@ -641,6 +756,7 @@ RUN wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tag # ######################################################################################### FROM build-deps AS rust-extensions-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt-get update && \ @@ -651,9 +767,11 @@ ENV HOME=/home/nonroot ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH" USER nonroot WORKDIR /home/nonroot -ARG PG_VERSION -RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ chmod +x rustup-init && \ ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ rm rustup-init && \ @@ -672,7 +790,10 @@ USER root FROM rust-extensions-build AS pg-jsonschema-pg-build ARG PG_VERSION -RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.1.tar.gz -O pg_jsonschema.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.1.tar.gz -O pg_jsonschema.tar.gz && \ echo "61df3db1ed83cf24f6aa39c826f8818bfa4f0bd33b587fd6b2b1747985642297 pg_jsonschema.tar.gz" | sha256sum --check && \ mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \ # see commit 252b3685a27a0f4c31a0f91e983c6314838e89e8 @@ -694,7 +815,10 @@ RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.1.tar. FROM rust-extensions-build AS pg-graphql-pg-build ARG PG_VERSION -RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.7.tar.gz -O pg_graphql.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.7.tar.gz -O pg_graphql.tar.gz && \ echo "2b3e567a5b31019cb97ae0e33263c1bcc28580be5a444ac4c8ece5c4be2aea41 pg_graphql.tar.gz" | sha256sum --check && \ mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \ sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ @@ -714,7 +838,10 @@ FROM rust-extensions-build AS pg-tiktoken-pg-build ARG PG_VERSION # 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023 -RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \ echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \ mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \ # TODO update pgrx version in the pg_tiktoken repo and remove this line @@ -733,7 +860,10 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6 FROM rust-extensions-build AS pg-pgx-ulid-build ARG PG_VERSION -RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \ echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \ mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . 
&& \ sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ @@ -748,10 +878,14 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz - ######################################################################################### FROM build-deps AS wal2json-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \ echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \ mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -764,10 +898,14 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar. # ######################################################################################### FROM build-deps AS pg-ivm-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \ echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \ mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -781,10 +919,14 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_iv # ######################################################################################### FROM build-deps AS pg-partman-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \ echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \ mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -835,7 +977,10 @@ COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ -RUN make -j $(getconf _NPROCESSORS_ONLN) \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + make -j $(getconf _NPROCESSORS_ONLN) \ PG_CONFIG=/usr/local/pgsql/bin/pg_config \ -C pgxn/neon \ -s install && \ @@ -854,8 +999,8 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ case "${PG_VERSION}" in \ "v14" | "v15") \ ;; \ - "v16") \ - echo "Skipping HNSW for PostgreSQL 16" && exit 0 \ + "v16" | "v17") \ + echo "Skipping HNSW for PostgreSQL ${PG_VERSION}" && exit 0 \ ;; \ *) \ echo "unexpected PostgreSQL version" && exit 1 \ @@ -878,7 +1023,10 @@ ENV BUILD_TAG=$BUILD_TAG USER nonroot # Copy entire project to get Cargo.* files with proper dependencies for the whole project COPY --chown=nonroot . . -RUN cd compute_tools && mold -run cargo build --locked --profile release-line-debug-size-lto +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + cd compute_tools && mold -run cargo build --locked --profile release-line-debug-size-lto ######################################################################################### # @@ -899,15 +1047,24 @@ FROM neon-pg-ext-build AS postgres-cleanup-layer COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql # Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise) -RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp # Remove headers that we won't need anymore - we've completed installation of all extensions -RUN rm -r /usr/local/pgsql/include +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + rm -r /usr/local/pgsql/include # Remove static postgresql libraries - all compilation is finished, so we # can now remove these files - they must be included in other binaries by now # if they were to be used by other libraries. -RUN rm /usr/local/pgsql/lib/lib*.a +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + rm /usr/local/pgsql/lib/lib*.a ######################################################################################### @@ -918,7 +1075,10 @@ RUN rm /usr/local/pgsql/lib/lib*.a FROM neon-pg-ext-build AS neon-pg-ext-test ARG PG_VERSION -RUN mkdir /ext-src +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + mkdir /ext-src #COPY --from=postgis-build /postgis.tar.gz /ext-src/ #COPY --from=postgis-build /sfcgal/* /usr @@ -956,18 +1116,39 @@ COPY --from=pg-anon-pg-build /pg_anon.tar.gz /ext-src COPY patches/pg_anon.patch /ext-src COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src -RUN cd /ext-src/ && for f in *.tar.gz; \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + cd /ext-src/ && for f in *.tar.gz; \ do echo $f; dname=$(echo $f | sed 's/\.tar.*//')-src; \ rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \ || exit 1; rm -f $f; done -RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch -RUN cd /ext-src/rum-src && patch -p1 <../rum.patch +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + cd /ext-src/rum-src && patch -p1 <../rum.patch +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch # cmake is required for the h3 test -RUN apt-get update && apt-get install -y cmake -RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan.patch +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + apt-get update && apt-get install -y cmake +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan.patch COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh -RUN patch -p1 &str { "14" => return "v14", "15" => return "v15", "16" => return "v16", + "17" => return "v17", _ => {} }, _ => {} diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 5dbc3bcbbc..d616154af6 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -342,7 +342,7 @@ impl LocalEnv { #[allow(clippy::manual_range_patterns)] match pg_version { - 14 | 15 | 16 => Ok(path.join(format!("v{pg_version}"))), + 14 | 15 | 16 | 17 => Ok(path.join(format!("v{pg_version}"))), _ => bail!("Unsupported postgres version: {}", pg_version), } } diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index c715d6b789..2b714fbfbf 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -28,6 +28,7 @@ use utils::{ auth::{encode_from_key_file, Claims, Scope}, id::{NodeId, TenantId}, }; +use whoami::username; pub struct StorageController { env: LocalEnv, @@ -183,7 +184,7 @@ impl StorageController { /// to other versions if that one isn't found. Some automated tests create circumstances /// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`. 
async fn get_pg_dir(&self, dir_name: &str) -> anyhow::Result { - let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 15, 14]; + let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 16, 15, 14]; for v in prefer_versions { let path = Utf8PathBuf::from_path_buf(self.env.pg_dir(v, dir_name)?).unwrap(); @@ -211,7 +212,16 @@ impl StorageController { /// Readiness check for our postgres process async fn pg_isready(&self, pg_bin_dir: &Utf8Path, postgres_port: u16) -> anyhow::Result { let bin_path = pg_bin_dir.join("pg_isready"); - let args = ["-h", "localhost", "-p", &format!("{}", postgres_port)]; + let args = [ + "-h", + "localhost", + "-U", + &username(), + "-d", + DB_NAME, + "-p", + &format!("{}", postgres_port), + ]; let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?; Ok(exitcode.success()) @@ -225,7 +235,11 @@ impl StorageController { /// /// Returns the database url pub async fn setup_database(&self, postgres_port: u16) -> anyhow::Result { - let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port); + let database_url = format!( + "postgresql://{}@localhost:{}/{DB_NAME}", + &username(), + postgres_port + ); let pg_bin_dir = self.get_pg_bin_dir().await?; let createdb_path = pg_bin_dir.join("createdb"); @@ -235,6 +249,10 @@ impl StorageController { "localhost", "-p", &format!("{}", postgres_port), + "-U", + &username(), + "-O", + &username(), DB_NAME, ]) .output() @@ -271,7 +289,7 @@ impl StorageController { // But tokio-postgres fork doesn't have this upstream commit: // https://github.com/sfackler/rust-postgres/commit/cb609be758f3fb5af537f04b584a2ee0cebd5e79 // => we should rebase our fork => TODO https://github.com/neondatabase/neon/issues/8399 - .user(&whoami::username()) + .user(&username()) .dbname(DB_NAME) .connect(tokio_postgres::NoTls) .await @@ -328,6 +346,12 @@ impl StorageController { let pg_log_path = pg_data_path.join("postgres.log"); if !tokio::fs::try_exists(&pg_data_path).await? 
{ + let initdb_args = ["-D", pg_data_path.as_ref(), "--username", &username()]; + tracing::info!( + "Initializing storage controller database with args: {:?}", + initdb_args + ); + // Initialize empty database let initdb_path = pg_bin_dir.join("initdb"); let mut child = Command::new(&initdb_path) @@ -335,7 +359,7 @@ impl StorageController { ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), ]) - .args(["-D", pg_data_path.as_ref()]) + .args(initdb_args) .spawn() .expect("Failed to spawn initdb"); let status = child.wait().await?; @@ -364,8 +388,14 @@ impl StorageController { pg_data_path.as_ref(), "-l", pg_log_path.as_ref(), + "-U", + &username(), "start", ]; + tracing::info!( + "Starting storage controller database with args: {:?}", + db_start_args + ); background_process::start_process( "storage_controller_db", diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 8929ccb41d..4a776709c9 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -1,8 +1,8 @@ use anyhow::{bail, Result}; use byteorder::{ByteOrder, BE}; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; +use postgres_ffi::Oid; use postgres_ffi::RepOriginId; -use postgres_ffi::{Oid, TransactionId}; use serde::{Deserialize, Serialize}; use std::{fmt, ops::Range}; @@ -350,7 +350,17 @@ impl Key { // 02 00000000 00000000 00000000 00 00000000 // // TwoPhaseFile: -// 02 00000000 00000000 00000000 00 XID +// +// 02 00000000 00000000 00XXXXXX XX XXXXXXXX +// +// \______XID_________/ +// +// The 64-bit XID is stored a little awkwardly in field6, field5 and +// field4. PostgreSQL v16 and below only stored a 32-bit XID, which +// fit completely in field6, but starting with PostgreSQL v17, a full +// 64-bit XID is used. Most pageserver code that accesses +// TwoPhaseFiles now deals with 64-bit XIDs even on v16, the high bits +// are just unused. 
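// As a purely illustrative check of the layout above (example value, not from
// the patch): the full XID 0x0000000A000002E4 splits into the key fields the
// same way twophase_file_key() computes them further down:
//
//     let xid: u64 = 0x0000_000A_0000_02E4;
//     let field4 = ((xid & 0xFFFF_FF00_0000_0000) >> 40) as u32; // high 24 bits  -> 0x000000
//     let field5 = ((xid & 0x0000_00FF_0000_0000) >> 32) as u8;  // middle 8 bits -> 0x0A
//     let field6 = (xid & 0x0000_0000_FFFF_FFFF) as u32;         // low 32 bits   -> 0x000002E4
//     assert_eq!((field4, field5, field6), (0x000000, 0x0A, 0x0000_02E4));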
// // ControlFile: // 03 00000000 00000000 00000000 00 00000000 @@ -582,35 +592,36 @@ pub const TWOPHASEDIR_KEY: Key = Key { }; #[inline(always)] -pub fn twophase_file_key(xid: TransactionId) -> Key { +pub fn twophase_file_key(xid: u64) -> Key { Key { field1: 0x02, field2: 0, field3: 0, - field4: 0, - field5: 0, - field6: xid, + field4: ((xid & 0xFFFFFF0000000000) >> 40) as u32, + field5: ((xid & 0x000000FF00000000) >> 32) as u8, + field6: (xid & 0x00000000FFFFFFFF) as u32, } } #[inline(always)] -pub fn twophase_key_range(xid: TransactionId) -> Range { +pub fn twophase_key_range(xid: u64) -> Range { + // 64-bit XIDs really should not overflow let (next_xid, overflowed) = xid.overflowing_add(1); Key { field1: 0x02, field2: 0, field3: 0, - field4: 0, - field5: 0, - field6: xid, + field4: ((xid & 0xFFFFFF0000000000) >> 40) as u32, + field5: ((xid & 0x000000FF00000000) >> 32) as u8, + field6: (xid & 0x00000000FFFFFFFF) as u32, }..Key { field1: 0x02, field2: 0, - field3: 0, - field4: 0, - field5: u8::from(overflowed), - field6: next_xid, + field3: u32::from(overflowed), + field4: ((next_xid & 0xFFFFFF0000000000) >> 40) as u32, + field5: ((next_xid & 0x000000FF00000000) >> 32) as u8, + field6: (next_xid & 0x00000000FFFFFFFF) as u32, } } diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index a346390f3d..d3a85f2683 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -56,7 +56,7 @@ fn main() -> anyhow::Result<()> { PathBuf::from("pg_install") }; - for pg_version in &["v14", "v15", "v16"] { + for pg_version in &["v14", "v15", "v16", "v17"] { let mut pg_install_dir_versioned = pg_install_dir.join(pg_version); if pg_install_dir_versioned.is_relative() { let cwd = env::current_dir().context("Failed to get current_dir")?; diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index f18e0c603b..0d46ed6aac 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -57,6 +57,7 @@ macro_rules! for_all_postgres_versions { $macro!(v14); $macro!(v15); $macro!(v16); + $macro!(v17); }; } @@ -91,6 +92,7 @@ macro_rules! dispatch_pgversion { 14 : v14, 15 : v15, 16 : v16, + 17 : v17, ] ) }; @@ -121,6 +123,7 @@ macro_rules! enum_pgversion_dispatch { V14 : v14, V15 : v15, V16 : v16, + V17 : v17, ] ) }; @@ -150,6 +153,7 @@ macro_rules! enum_pgversion { V14 : v14, V15 : v15, V16 : v16, + V17 : v17, ] } }; @@ -162,6 +166,7 @@ macro_rules! 
enum_pgversion { V14 : v14, V15 : v15, V16 : v16, + V17 : v17, ] } }; diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index 6ce855c78e..61b49a634d 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -152,6 +152,9 @@ pub const XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8; pub const XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED: u8 = (1 << 1) as u8; pub const XLH_DELETE_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8; +// From heapam_xlog.h +pub const XLOG_HEAP2_REWRITE: u8 = 0x00; + // From replication/message.h pub const XLOG_LOGICAL_MESSAGE: u8 = 0x00; @@ -219,15 +222,20 @@ pub const INVALID_TRANSACTION_ID: u32 = 0; pub const FIRST_BOOTSTRAP_OBJECT_ID: u32 = 12000; pub const FIRST_NORMAL_OBJECT_ID: u32 = 16384; +/* pg_control.h */ pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00; pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10; -pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001; -pub const XLP_LONG_HEADER: u16 = 0x0002; +pub const XLOG_PARAMETER_CHANGE: u8 = 0x60; +pub const XLOG_END_OF_RECOVERY: u8 = 0x90; /* From xlog.h */ pub const XLOG_REPLORIGIN_SET: u8 = 0x00; pub const XLOG_REPLORIGIN_DROP: u8 = 0x10; +/* xlog_internal.h */ +pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001; +pub const XLP_LONG_HEADER: u16 = 0x0002; + /* From replication/slot.h */ pub const REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN: usize = 4*4 /* offset of `slotdata` in ReplicationSlotOnDisk */ + 64 /* NameData */ + 4*4; @@ -245,33 +253,6 @@ pub const VM_HEAPBLOCKS_PER_PAGE: u32 = /* From origin.c */ pub const REPLICATION_STATE_MAGIC: u32 = 0x1257DADE; -// List of subdirectories inside pgdata. -// Copied from src/bin/initdb/initdb.c -pub const PGDATA_SUBDIRS: [&str; 22] = [ - "global", - "pg_wal/archive_status", - "pg_commit_ts", - "pg_dynshmem", - "pg_notify", - "pg_serial", - "pg_snapshots", - "pg_subtrans", - "pg_twophase", - "pg_multixact", - "pg_multixact/members", - "pg_multixact/offsets", - "base", - "base/1", - "pg_replslot", - "pg_tblspc", - "pg_stat", - "pg_stat_tmp", - "pg_xact", - "pg_logical", - "pg_logical/snapshots", - "pg_logical/mappings", -]; - // Don't include postgresql.conf as it is inconvenient on node start: // we need postgresql.conf before basebackup to synchronize safekeepers // so no point in overwriting it during backup restore. Rest of the files diff --git a/libs/postgres_ffi/src/pg_constants_v14.rs b/libs/postgres_ffi/src/pg_constants_v14.rs index 32f8f51114..fe01a5df7c 100644 --- a/libs/postgres_ffi/src/pg_constants_v14.rs +++ b/libs/postgres_ffi/src/pg_constants_v14.rs @@ -5,6 +5,33 @@ pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */ pub const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */ pub const SIZEOF_RELMAPFILE: usize = 512; /* sizeof(RelMapFile) in relmapper.c */ +// List of subdirectories inside pgdata. 
+// Copied from src/bin/initdb/initdb.c +pub const PGDATA_SUBDIRS: [&str; 22] = [ + "global", + "pg_wal/archive_status", + "pg_commit_ts", + "pg_dynshmem", + "pg_notify", + "pg_serial", + "pg_snapshots", + "pg_subtrans", + "pg_twophase", + "pg_multixact", + "pg_multixact/members", + "pg_multixact/offsets", + "base", + "base/1", + "pg_replslot", + "pg_tblspc", + "pg_stat", + "pg_stat_tmp", + "pg_xact", + "pg_logical", + "pg_logical/snapshots", + "pg_logical/mappings", +]; + pub fn bkpimg_is_compressed(bimg_info: u8) -> bool { (bimg_info & BKPIMAGE_IS_COMPRESSED) != 0 } diff --git a/libs/postgres_ffi/src/pg_constants_v15.rs b/libs/postgres_ffi/src/pg_constants_v15.rs index 626a23c7ea..3cd1b7aec5 100644 --- a/libs/postgres_ffi/src/pg_constants_v15.rs +++ b/libs/postgres_ffi/src/pg_constants_v15.rs @@ -11,6 +11,8 @@ pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */ pub const SIZEOF_RELMAPFILE: usize = 512; /* sizeof(RelMapFile) in relmapper.c */ +pub use super::super::v14::bindings::PGDATA_SUBDIRS; + pub fn bkpimg_is_compressed(bimg_info: u8) -> bool { const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD; diff --git a/libs/postgres_ffi/src/pg_constants_v16.rs b/libs/postgres_ffi/src/pg_constants_v16.rs index 587be71cb3..31bd5b68fd 100644 --- a/libs/postgres_ffi/src/pg_constants_v16.rs +++ b/libs/postgres_ffi/src/pg_constants_v16.rs @@ -11,6 +11,8 @@ pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */ pub const SIZEOF_RELMAPFILE: usize = 524; /* sizeof(RelMapFile) in relmapper.c */ +pub use super::super::v14::bindings::PGDATA_SUBDIRS; + pub fn bkpimg_is_compressed(bimg_info: u8) -> bool { const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD; diff --git a/libs/postgres_ffi/src/pg_constants_v17.rs b/libs/postgres_ffi/src/pg_constants_v17.rs new file mode 100644 index 0000000000..2132938680 --- /dev/null +++ b/libs/postgres_ffi/src/pg_constants_v17.rs @@ -0,0 +1,55 @@ +pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8; + +pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00; +pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x10; +pub const XLOG_DBASE_DROP: u8 = 0x20; + +pub const BKPIMAGE_APPLY: u8 = 0x02; /* page image should be restored during replay */ +pub const BKPIMAGE_COMPRESS_PGLZ: u8 = 0x04; /* page image is compressed */ +pub const BKPIMAGE_COMPRESS_LZ4: u8 = 0x08; /* page image is compressed */ +pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */ + +pub const SIZEOF_RELMAPFILE: usize = 524; /* sizeof(RelMapFile) in relmapper.c */ + +// List of subdirectories inside pgdata. 
+// Copied from src/bin/initdb/initdb.c +pub const PGDATA_SUBDIRS: [&str; 23] = [ + "global", + "pg_wal/archive_status", + "pg_wal/summaries", + "pg_commit_ts", + "pg_dynshmem", + "pg_notify", + "pg_serial", + "pg_snapshots", + "pg_subtrans", + "pg_twophase", + "pg_multixact", + "pg_multixact/members", + "pg_multixact/offsets", + "base", + "base/1", + "pg_replslot", + "pg_tblspc", + "pg_stat", + "pg_stat_tmp", + "pg_xact", + "pg_logical", + "pg_logical/snapshots", + "pg_logical/mappings", +]; + +pub fn bkpimg_is_compressed(bimg_info: u8) -> bool { + const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD; + + (bimg_info & ANY_COMPRESS_FLAG) != 0 +} + + +pub const XLOG_HEAP2_PRUNE_ON_ACCESS: u8 = 0x10; +pub const XLOG_HEAP2_PRUNE_VACUUM_SCAN: u8 = 0x20; +pub const XLOG_HEAP2_PRUNE_VACUUM_CLEANUP: u8 = 0x30; + + +pub const XLOG_OVERWRITE_CONTRECORD: u8 = 0xD0; +pub const XLOG_CHECKPOINT_REDO: u8 = 0xE0; diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 6052f04d11..949e3f4251 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -53,7 +53,7 @@ impl Conf { #[allow(clippy::manual_range_patterns)] match self.pg_version { - 14 | 15 | 16 => Ok(path.join(format!("v{}", self.pg_version))), + 14 | 15 | 16 | 17 => Ok(path.join(format!("v{}", self.pg_version))), _ => bail!("Unsupported postgres version: {}", self.pg_version), } } diff --git a/libs/walproposer/build.rs b/libs/walproposer/build.rs index 28547f52bf..3f549889b8 100644 --- a/libs/walproposer/build.rs +++ b/libs/walproposer/build.rs @@ -5,6 +5,8 @@ use std::{env, path::PathBuf, process::Command}; use anyhow::{anyhow, Context}; +const WALPROPOSER_PG_VERSION: &str = "v17"; + fn main() -> anyhow::Result<()> { // Tell cargo to invalidate the built crate whenever the wrapper changes println!("cargo:rerun-if-changed=bindgen_deps.h"); @@ -36,7 +38,10 @@ fn main() -> anyhow::Result<()> { // Rebuild crate when libwalproposer.a changes println!("cargo:rerun-if-changed={walproposer_lib_search_str}/libwalproposer.a"); - let pg_config_bin = pg_install_abs.join("v16").join("bin").join("pg_config"); + let pg_config_bin = pg_install_abs + .join(WALPROPOSER_PG_VERSION) + .join("bin") + .join("pg_config"); let inc_server_path: String = if pg_config_bin.exists() { let output = Command::new(pg_config_bin) .arg("--includedir-server") @@ -53,7 +58,7 @@ fn main() -> anyhow::Result<()> { .into() } else { let server_path = pg_install_abs - .join("v16") + .join(WALPROPOSER_PG_VERSION) .join("include") .join("postgresql") .join("server") diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index a07107753e..adc090823d 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -79,16 +79,24 @@ pub(crate) fn parse_filename(name: &str) -> Option { return None; } let keys: Vec<&str> = split[0].split('-').collect(); - let mut lsns: Vec<&str> = split[1].split('-').collect(); - let is_delta = if lsns.len() == 1 { - lsns.push(lsns[0]); + let lsn_and_opt_generation: Vec<&str> = split[1].split('v').collect(); + let lsns: Vec<&str> = lsn_and_opt_generation[0].split('-').collect(); + let the_lsns: [&str; 2]; + + /* + * Generations add a -vX-XXXXXX postfix, which causes issues when we try to + * parse 'vX' as an LSN. 
+ */ + let is_delta = if lsns.len() == 1 || lsns[1].is_empty() { + the_lsns = [lsns[0], lsns[0]]; false } else { + the_lsns = [lsns[0], lsns[1]]; true }; let key_range = Key::from_hex(keys[0]).unwrap()..Key::from_hex(keys[1]).unwrap(); - let lsn_range = Lsn::from_hex(lsns[0]).unwrap()..Lsn::from_hex(lsns[1]).unwrap(); + let lsn_range = Lsn::from_hex(the_lsns[0]).unwrap()..Lsn::from_hex(the_lsns[1]).unwrap(); let holes = Vec::new(); Some(LayerFile { key_range, diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 207f781e1b..a32d09f3b3 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -30,9 +30,8 @@ use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::dispatch_pgversion; use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; -use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA}; +use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PG_HBA}; use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM}; -use postgres_ffi::TransactionId; use postgres_ffi::XLogFileName; use postgres_ffi::PG_TLI; use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE}; @@ -255,8 +254,11 @@ where let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup; + let pgversion = self.timeline.pg_version; + let subdirs = dispatch_pgversion!(pgversion, &pgv::bindings::PGDATA_SUBDIRS[..]); + // Create pgdata subdirs structure - for dir in PGDATA_SUBDIRS.iter() { + for dir in subdirs.iter() { let header = new_tar_header_dir(dir)?; self.ar .append(&header, &mut io::empty()) @@ -606,7 +608,7 @@ where // // Extract twophase state files // - async fn add_twophase_file(&mut self, xid: TransactionId) -> Result<(), BasebackupError> { + async fn add_twophase_file(&mut self, xid: u64) -> Result<(), BasebackupError> { let img = self .timeline .get_twophase_file(xid, self.lsn, self.ctx) @@ -617,7 +619,11 @@ where buf.extend_from_slice(&img[..]); let crc = crc32c::crc32c(&img[..]); buf.put_u32_le(crc); - let path = format!("pg_twophase/{:>08X}", xid); + let path = if self.timeline.pg_version < 17 { + format!("pg_twophase/{:>08X}", xid) + } else { + format!("pg_twophase/{:>016X}", xid) + }; let header = new_tar_header(&path, buf.len() as u64)?; self.ar .append(&header, &buf[..]) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 29a98855d3..e9f197ec2d 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -281,7 +281,7 @@ impl PageServerConf { #[allow(clippy::manual_range_patterns)] match pg_version { - 14 | 15 | 16 => Ok(path.join(format!("v{pg_version}"))), + 14 | 15 | 16 | 17 => Ok(path.join(format!("v{pg_version}"))), _ => bail!("Unsupported postgres version: {}", pg_version), } } diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 5a0894cd1b..ca87f1d080 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -580,9 +580,11 @@ async fn import_file( import_slru(modification, slru, file_path, reader, len, ctx).await?; debug!("imported multixact members slru"); } else if file_path.starts_with("pg_twophase") { - let xid = u32::from_str_radix(file_name.as_ref(), 16)?; - let bytes = read_all_bytes(reader).await?; + + // In PostgreSQL v17, this is a 64-bit FullTransactionid. In previous versions, + // it's a 32-bit TransactionId, which fits in u64 anyway. 
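// For illustration only (example file names, not taken from the patch), the
// same u64 parse covers both naming schemes:
//     u64::from_str_radix("000002E5", 16)?          // v16 and below -> 0x2E5
//     u64::from_str_radix("0000000A000002E4", 16)?  // v17 and above -> 0x0000_000A_0000_02E4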
+ let xid = u64::from_str_radix(file_name.as_ref(), 16)?; modification .put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]), ctx) .await?; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 6dd8851b13..5f8766ca2c 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -633,7 +633,7 @@ impl Timeline { pub(crate) async fn get_twophase_file( &self, - xid: TransactionId, + xid: u64, lsn: Lsn, ctx: &RequestContext, ) -> Result { @@ -646,11 +646,19 @@ impl Timeline { &self, lsn: Lsn, ctx: &RequestContext, - ) -> Result, PageReconstructError> { + ) -> Result, PageReconstructError> { // fetch directory entry let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?; - Ok(TwoPhaseDirectory::des(&buf)?.xids) + if self.pg_version >= 17 { + Ok(TwoPhaseDirectoryV17::des(&buf)?.xids) + } else { + Ok(TwoPhaseDirectory::des(&buf)? + .xids + .iter() + .map(|x| u64::from(*x)) + .collect()) + } } pub(crate) async fn get_control_file( @@ -902,9 +910,13 @@ impl Timeline { // Then pg_twophase result.add_key(TWOPHASEDIR_KEY); - let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?; - let twophase_dir = TwoPhaseDirectory::des(&buf)?; - let mut xids: Vec = twophase_dir.xids.iter().cloned().collect(); + + let mut xids: Vec = self + .list_twophase_files(lsn, ctx) + .await? + .iter() + .cloned() + .collect(); xids.sort_unstable(); for xid in xids { result.add_key(twophase_file_key(xid)); @@ -1127,9 +1139,15 @@ impl<'a> DatadirModification<'a> { // Create AuxFilesDirectory self.init_aux_dir()?; - let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory { - xids: HashSet::new(), - })?; + let buf = if self.tline.pg_version >= 17 { + TwoPhaseDirectoryV17::ser(&TwoPhaseDirectoryV17 { + xids: HashSet::new(), + }) + } else { + TwoPhaseDirectory::ser(&TwoPhaseDirectory { + xids: HashSet::new(), + }) + }?; self.pending_directory_entries .push((DirectoryKind::TwoPhase, 0)); self.put(TWOPHASEDIR_KEY, Value::Image(buf.into())); @@ -1321,22 +1339,31 @@ impl<'a> DatadirModification<'a> { pub async fn put_twophase_file( &mut self, - xid: TransactionId, + xid: u64, img: Bytes, ctx: &RequestContext, ) -> anyhow::Result<()> { // Add it to the directory entry - let buf = self.get(TWOPHASEDIR_KEY, ctx).await?; - let mut dir = TwoPhaseDirectory::des(&buf)?; - if !dir.xids.insert(xid) { - anyhow::bail!("twophase file for xid {} already exists", xid); - } - self.pending_directory_entries - .push((DirectoryKind::TwoPhase, dir.xids.len())); - self.put( - TWOPHASEDIR_KEY, - Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)), - ); + let dirbuf = self.get(TWOPHASEDIR_KEY, ctx).await?; + let newdirbuf = if self.tline.pg_version >= 17 { + let mut dir = TwoPhaseDirectoryV17::des(&dirbuf)?; + if !dir.xids.insert(xid) { + anyhow::bail!("twophase file for xid {} already exists", xid); + } + self.pending_directory_entries + .push((DirectoryKind::TwoPhase, dir.xids.len())); + Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?) + } else { + let xid = xid as u32; + let mut dir = TwoPhaseDirectory::des(&dirbuf)?; + if !dir.xids.insert(xid) { + anyhow::bail!("twophase file for xid {} already exists", xid); + } + self.pending_directory_entries + .push((DirectoryKind::TwoPhase, dir.xids.len())); + Bytes::from(TwoPhaseDirectory::ser(&dir)?) 
+ }; + self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf)); self.put(twophase_file_key(xid), Value::Image(img)); Ok(()) @@ -1639,22 +1666,32 @@ impl<'a> DatadirModification<'a> { /// This method is used for marking truncated SLRU files pub async fn drop_twophase_file( &mut self, - xid: TransactionId, + xid: u64, ctx: &RequestContext, ) -> anyhow::Result<()> { // Remove it from the directory entry let buf = self.get(TWOPHASEDIR_KEY, ctx).await?; - let mut dir = TwoPhaseDirectory::des(&buf)?; + let newdirbuf = if self.tline.pg_version >= 17 { + let mut dir = TwoPhaseDirectoryV17::des(&buf)?; - if !dir.xids.remove(&xid) { - warn!("twophase file for xid {} does not exist", xid); - } - self.pending_directory_entries - .push((DirectoryKind::TwoPhase, dir.xids.len())); - self.put( - TWOPHASEDIR_KEY, - Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)), - ); + if !dir.xids.remove(&xid) { + warn!("twophase file for xid {} does not exist", xid); + } + self.pending_directory_entries + .push((DirectoryKind::TwoPhase, dir.xids.len())); + Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?) + } else { + let xid: u32 = u32::try_from(xid)?; + let mut dir = TwoPhaseDirectory::des(&buf)?; + + if !dir.xids.remove(&xid) { + warn!("twophase file for xid {} does not exist", xid); + } + self.pending_directory_entries + .push((DirectoryKind::TwoPhase, dir.xids.len())); + Bytes::from(TwoPhaseDirectory::ser(&dir)?) + }; + self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf)); // Delete it self.delete(twophase_key_range(xid)); @@ -2124,11 +2161,21 @@ struct DbDirectory { dbdirs: HashMap<(Oid, Oid), bool>, } +// The format of TwoPhaseDirectory changed in PostgreSQL v17, because the filenames of +// pg_twophase files was expanded from 32-bit XIDs to 64-bit XIDs. Previously, the files +// were named like "pg_twophase/000002E5", now they're like +// "pg_twophsae/0000000A000002E4". + #[derive(Debug, Serialize, Deserialize)] struct TwoPhaseDirectory { xids: HashSet, } +#[derive(Debug, Serialize, Deserialize)] +struct TwoPhaseDirectoryV17 { + xids: HashSet, +} + #[derive(Debug, Serialize, Deserialize, Default)] struct RelDirectory { // Set of relations that exist. (relfilenode, forknum) diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 6e15ad81c3..229c01a681 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -237,6 +237,26 @@ impl WalIngest { .await?; } } + } else if pg_version == 17 { + if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_WAL_LOG { + debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); + } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY { + // The XLOG record was renamed between v14 and v15, + // but the record format is the same. + // So we can reuse XlCreateDatabase here. 
+ debug!("XLOG_DBASE_CREATE_FILE_COPY"); + let createdb = XlCreateDatabase::decode(&mut buf); + self.ingest_xlog_dbase_create(modification, &createdb, ctx) + .await?; + } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_DROP { + let dropdb = XlDropDatabase::decode(&mut buf); + for tablespace_id in dropdb.tablespace_ids { + trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); + modification + .drop_dbdir(tablespace_id, dropdb.db_id, ctx) + .await?; + } + } } } pg_constants::RM_TBLSPC_ID => { @@ -246,7 +266,11 @@ impl WalIngest { let info = decoded.xl_info & !pg_constants::XLR_INFO_MASK; if info == pg_constants::CLOG_ZEROPAGE { - let pageno = buf.get_u32_le(); + let pageno = if pg_version < 17 { + buf.get_u32_le() + } else { + buf.get_u64_le() as u32 + }; let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; self.put_slru_page_image( @@ -260,7 +284,7 @@ impl WalIngest { .await?; } else { assert!(info == pg_constants::CLOG_TRUNCATE); - let xlrec = XlClogTruncate::decode(&mut buf); + let xlrec = XlClogTruncate::decode(&mut buf, pg_version); self.ingest_clog_truncate_record(modification, &xlrec, ctx) .await?; } @@ -299,12 +323,21 @@ impl WalIngest { parsed_xact.xid, lsn, ); - modification - .drop_twophase_file(parsed_xact.xid, ctx) - .await?; + + let xid: u64 = if pg_version >= 17 { + self.adjust_to_full_transaction_id(parsed_xact.xid)? + } else { + parsed_xact.xid as u64 + }; + modification.drop_twophase_file(xid, ctx).await?; } else if info == pg_constants::XLOG_XACT_PREPARE { + let xid: u64 = if pg_version >= 17 { + self.adjust_to_full_transaction_id(decoded.xl_xid)? + } else { + decoded.xl_xid as u64 + }; modification - .put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]), ctx) + .put_twophase_file(xid, Bytes::copy_from_slice(&buf[..]), ctx) .await?; } } @@ -312,7 +345,11 @@ impl WalIngest { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE { - let pageno = buf.get_u32_le(); + let pageno = if pg_version < 17 { + buf.get_u32_le() + } else { + buf.get_u64_le() as u32 + }; let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; self.put_slru_page_image( @@ -325,7 +362,11 @@ impl WalIngest { ) .await?; } else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE { - let pageno = buf.get_u32_le(); + let pageno = if pg_version < 17 { + buf.get_u32_le() + } else { + buf.get_u64_le() as u32 + }; let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; self.put_slru_page_image( @@ -354,6 +395,20 @@ impl WalIngest { pg_constants::RM_XLOG_ID => { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + if info == pg_constants::XLOG_PARAMETER_CHANGE { + if let CheckPoint::V17(cp) = &mut self.checkpoint { + let rec = v17::XlParameterChange::decode(&mut buf); + cp.wal_level = rec.wal_level; + self.checkpoint_modified = true; + } + } else if info == pg_constants::XLOG_END_OF_RECOVERY { + if let CheckPoint::V17(cp) = &mut self.checkpoint { + let rec = v17::XlEndOfRecovery::decode(&mut buf); + cp.wal_level = rec.wal_level; + self.checkpoint_modified = true; + } + } + enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, { if info == pg_constants::XLOG_NEXTOID { let next_oid = buf.get_u32_le(); @@ -397,12 +452,24 @@ impl WalIngest { if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID && 
info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN { - let mut oldest_active_xid = cp.nextXid.value as u32; - for xid in modification.tline.list_twophase_files(lsn, ctx).await? { - if (xid.wrapping_sub(oldest_active_xid) as i32) < 0 { - oldest_active_xid = xid; + let oldest_active_xid = if pg_version >= 17 { + let mut oldest_active_full_xid = cp.nextXid.value; + for xid in modification.tline.list_twophase_files(lsn, ctx).await? { + if xid < oldest_active_full_xid { + oldest_active_full_xid = xid; + } } - } + oldest_active_full_xid as u32 + } else { + let mut oldest_active_xid = cp.nextXid.value as u32; + for xid in modification.tline.list_twophase_files(lsn, ctx).await? { + let narrow_xid = xid as u32; + if (narrow_xid.wrapping_sub(oldest_active_xid) as i32) < 0 { + oldest_active_xid = narrow_xid; + } + } + oldest_active_xid + }; cp.oldestActiveXid = oldest_active_xid; } else { cp.oldestActiveXid = xlog_checkpoint.oldestActiveXid; @@ -515,6 +582,25 @@ impl WalIngest { Ok(modification.len() > prev_len) } + /// This is the same as AdjustToFullTransactionId(xid) in PostgreSQL + fn adjust_to_full_transaction_id(&self, xid: TransactionId) -> Result { + let next_full_xid = + enum_pgversion_dispatch!(&self.checkpoint, CheckPoint, cp, { cp.nextXid.value }); + + let next_xid = (next_full_xid) as u32; + let mut epoch = (next_full_xid >> 32) as u32; + + if xid > next_xid { + // Wraparound occurred, must be from a prev epoch. + if epoch == 0 { + bail!("apparent XID wraparound with prepared transaction XID {xid}, nextXid is {next_full_xid}"); + } + epoch -= 1; + } + + Ok((epoch as u64) << 32 | xid as u64) + } + /// Do not store this block, but observe it for the purposes of updating our relation size state. async fn observe_decoded_block( &mut self, @@ -815,6 +901,73 @@ impl WalIngest { bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); } } + 17 => { + if decoded.xl_rmid == pg_constants::RM_HEAP_ID { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + + if info == pg_constants::XLOG_HEAP_INSERT { + let xlrec = v17::XlHeapInsert::decode(buf); + assert_eq!(0, buf.remaining()); + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_DELETE { + let xlrec = v17::XlHeapDelete::decode(buf); + if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_UPDATE + || info == pg_constants::XLOG_HEAP_HOT_UPDATE + { + let xlrec = v17::XlHeapUpdate::decode(buf); + // the size of tuple data is inferred from the size of the record. + // we can't validate the remaining number of bytes without parsing + // the tuple data. + if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); + } + if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { + // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a + // non-HOT update where the new tuple goes to different page than + // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is + // set. 
+ new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_LOCK { + let xlrec = v17::XlHeapLock::decode(buf); + if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; + } + } + } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + if info == pg_constants::XLOG_HEAP2_MULTI_INSERT { + let xlrec = v17::XlHeapMultiInsert::decode(buf); + + let offset_array_len = + if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { + // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set + 0 + } else { + size_of::() * xlrec.ntuples as usize + }; + assert_eq!(offset_array_len, buf.remaining()); + + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED { + let xlrec = v17::XlHeapLockUpdated::decode(buf); + if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; + } + } + } else { + bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); + } + } _ => {} } @@ -923,26 +1076,26 @@ impl WalIngest { assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID); match pg_version { - 16 => { + 16 | 17 => { let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; match info { pg_constants::XLOG_NEON_HEAP_INSERT => { - let xlrec = v16::rm_neon::XlNeonHeapInsert::decode(buf); + let xlrec = v17::rm_neon::XlNeonHeapInsert::decode(buf); assert_eq!(0, buf.remaining()); if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { new_heap_blkno = Some(decoded.blocks[0].blkno); } } pg_constants::XLOG_NEON_HEAP_DELETE => { - let xlrec = v16::rm_neon::XlNeonHeapDelete::decode(buf); + let xlrec = v17::rm_neon::XlNeonHeapDelete::decode(buf); if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { new_heap_blkno = Some(decoded.blocks[0].blkno); } } pg_constants::XLOG_NEON_HEAP_UPDATE | pg_constants::XLOG_NEON_HEAP_HOT_UPDATE => { - let xlrec = v16::rm_neon::XlNeonHeapUpdate::decode(buf); + let xlrec = v17::rm_neon::XlNeonHeapUpdate::decode(buf); // the size of tuple data is inferred from the size of the record. // we can't validate the remaining number of bytes without parsing // the tuple data. 
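// Illustrative sketch (standalone, with made-up inputs, not part of the patch):
// the epoch arithmetic performed by adjust_to_full_transaction_id() earlier in
// this file when a 32-bit prepared-transaction XID is widened to a 64-bit
// FullTransactionId.
fn widen_xid_example(xid: u32, next_full_xid: u64) -> u64 {
    let next_xid = next_full_xid as u32;
    let mut epoch = (next_full_xid >> 32) as u32;
    if xid > next_xid {
        // The XID is "ahead" of nextXid, so it must belong to the previous epoch.
        epoch -= 1;
    }
    ((epoch as u64) << 32) | (xid as u64)
}
// With next_full_xid = 0x0000_0005_0000_1000:
//   widen_xid_example(0x0000_0800, ...) == 0x0000_0005_0000_0800  (same epoch)
//   widen_xid_example(0xFFFF_FFF0, ...) == 0x0000_0004_FFFF_FFF0  (previous epoch)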
@@ -958,7 +1111,7 @@ impl WalIngest { } } pg_constants::XLOG_NEON_HEAP_MULTI_INSERT => { - let xlrec = v16::rm_neon::XlNeonHeapMultiInsert::decode(buf); + let xlrec = v17::rm_neon::XlNeonHeapMultiInsert::decode(buf); let offset_array_len = if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { @@ -974,7 +1127,7 @@ impl WalIngest { } } pg_constants::XLOG_NEON_HEAP_LOCK => { - let xlrec = v16::rm_neon::XlNeonHeapLock::decode(buf); + let xlrec = v17::rm_neon::XlNeonHeapLock::decode(buf); if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { old_heap_blkno = Some(decoded.blocks[0].blkno); flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index 0c4d575de8..dd199e2c55 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -174,6 +174,7 @@ impl DecodedWALRecord { } 15 => info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY, 16 => info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY, + 17 => info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY, _ => { panic!("Unsupported postgres version {pg_version}") } @@ -341,16 +342,47 @@ pub mod v14 { } } } + + #[repr(C)] + #[derive(Debug)] + pub struct XlParameterChange { + pub max_connections: i32, + pub max_worker_processes: i32, + pub max_wal_senders: i32, + pub max_prepared_xacts: i32, + pub max_locks_per_xact: i32, + pub wal_level: i32, + pub wal_log_hints: bool, + pub track_commit_timestamp: bool, + pub _padding: [u8; 2], + } + + impl XlParameterChange { + pub fn decode(buf: &mut Bytes) -> XlParameterChange { + XlParameterChange { + max_connections: buf.get_i32_le(), + max_worker_processes: buf.get_i32_le(), + max_wal_senders: buf.get_i32_le(), + max_prepared_xacts: buf.get_i32_le(), + max_locks_per_xact: buf.get_i32_le(), + wal_level: buf.get_i32_le(), + wal_log_hints: buf.get_u8() != 0, + track_commit_timestamp: buf.get_u8() != 0, + _padding: [buf.get_u8(), buf.get_u8()], + } + } + } } pub mod v15 { pub use super::v14::{ XlHeapDelete, XlHeapInsert, XlHeapLock, XlHeapLockUpdated, XlHeapMultiInsert, XlHeapUpdate, + XlParameterChange, }; } pub mod v16 { - pub use super::v14::{XlHeapInsert, XlHeapLockUpdated, XlHeapMultiInsert}; + pub use super::v14::{XlHeapInsert, XlHeapLockUpdated, XlHeapMultiInsert, XlParameterChange}; use bytes::{Buf, Bytes}; use postgres_ffi::{OffsetNumber, TransactionId}; @@ -529,6 +561,37 @@ pub mod v16 { } } +pub mod v17 { + pub use super::v14::XlHeapLockUpdated; + use bytes::{Buf, Bytes}; + pub use postgres_ffi::{TimeLineID, TimestampTz}; + + pub use super::v16::rm_neon; + pub use super::v16::{ + XlHeapDelete, XlHeapInsert, XlHeapLock, XlHeapMultiInsert, XlHeapUpdate, XlParameterChange, + }; + + #[repr(C)] + #[derive(Debug)] + pub struct XlEndOfRecovery { + pub end_time: TimestampTz, + pub this_time_line_id: TimeLineID, + pub prev_time_line_id: TimeLineID, + pub wal_level: i32, + } + + impl XlEndOfRecovery { + pub fn decode(buf: &mut Bytes) -> XlEndOfRecovery { + XlEndOfRecovery { + end_time: buf.get_i64_le(), + this_time_line_id: buf.get_u32_le(), + prev_time_line_id: buf.get_u32_le(), + wal_level: buf.get_i32_le(), + } + } + } +} + #[repr(C)] #[derive(Debug)] pub struct XlSmgrCreate { @@ -746,9 +809,13 @@ pub struct XlClogTruncate { } impl XlClogTruncate { - pub fn decode(buf: &mut Bytes) -> XlClogTruncate { + pub fn decode(buf: &mut Bytes, pg_version: u32) -> XlClogTruncate { XlClogTruncate { - pageno: buf.get_u32_le(), + pageno: if pg_version < 17 { + buf.get_u32_le() + } else { 
+ buf.get_u64_le() as u32 + }, oldest_xid: buf.get_u32_le(), oldest_xid_db: buf.get_u32_le(), } diff --git a/pgxn/neon/bitmap.h b/pgxn/neon/bitmap.h new file mode 100644 index 0000000000..0a131816ef --- /dev/null +++ b/pgxn/neon/bitmap.h @@ -0,0 +1,12 @@ +#ifndef NEON_BITMAP_H +#define NEON_BITMAP_H + +/* + * Utilities for manipulating bits8* as bitmaps. + */ + +#define BITMAP_ISSET(bm, bit) ((bm)[(bit) >> 3] & (1 << ((bit) & 7))) +#define BITMAP_SET(bm, bit) (bm)[(bit) >> 3] |= (1 << ((bit) & 7)) +#define BITMAP_CLR(bm, bit) (bm)[(bit) >> 3] &= ~(1 << ((bit) & 7)) + +#endif //NEON_BITMAP_H diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 479209a537..ab6739465b 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -27,6 +27,7 @@ #include "pagestore_client.h" #include "common/hashfn.h" #include "pgstat.h" +#include "port/pg_iovec.h" #include "postmaster/bgworker.h" #include RELFILEINFO_HDR #include "storage/buf_internals.h" @@ -40,6 +41,7 @@ #include "utils/guc.h" #include "hll.h" +#include "bitmap.h" #define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "Assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0) @@ -469,6 +471,99 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) return found; } +/* + * Check if page is present in the cache. + * Returns true if page is found in local cache. + */ +int +lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + int nblocks, bits8 *bitmap) +{ + BufferTag tag; + FileCacheEntry *entry; + uint32 chunk_offs; + int found = 0; + uint32 hash; + int i = 0; + + if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ + return 0; + + CopyNRelFileInfoToBufTag(tag, rinfo); + tag.forkNum = forkNum; + + CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); + + tag.blockNum = (blkno + i) & ~(BLOCKS_PER_CHUNK - 1); + hash = get_hash_value(lfc_hash, &tag); + chunk_offs = (blkno + i) & (BLOCKS_PER_CHUNK - 1); + + LWLockAcquire(lfc_lock, LW_SHARED); + + while (true) + { + int this_chunk = Min(nblocks, BLOCKS_PER_CHUNK - chunk_offs); + if (LFC_ENABLED()) + { + entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); + + if (entry != NULL) + { + for (; chunk_offs < BLOCKS_PER_CHUNK && i < nblocks; chunk_offs++, i++) + { + if ((entry->bitmap[chunk_offs >> 5] & + (1 << (chunk_offs & 31))) != 0) + { + BITMAP_SET(bitmap, i); + found++; + } + } + } + else + { + i += this_chunk; + } + } + else + { + return found; + } + + /* + * Break out of the iteration before doing expensive stuff for + * a next iteration + */ + if (i + 1 >= nblocks) + break; + + /* + * Prepare for the next iteration. We don't unlock here, as that'd + * probably be more expensive than the gains it'd get us. + */ + tag.blockNum = (blkno + i) & ~(BLOCKS_PER_CHUNK - 1); + hash = get_hash_value(lfc_hash, &tag); + chunk_offs = (blkno + i) & (BLOCKS_PER_CHUNK - 1); + } + + LWLockRelease(lfc_lock); + +#if USE_ASSERT_CHECKING + do { + int count = 0; + + for (int j = 0; j < nblocks; j++) + { + if (BITMAP_ISSET(bitmap, j)) + count++; + } + + Assert(count == found); + } while (false); +#endif + + return found; +} + /* * Evict a page (if present) from the local file cache */ @@ -548,91 +643,171 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) } /* - * Try to read page from local cache. - * Returns true if page is found in local cache. - * In case of error local file cache is disabled (lfc->limit is set to zero). + * Try to read pages from local cache. 
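The bits8 bitmaps used by the new LFC entry points are manipulated with the BITMAP_* macros from the bitmap.h header added above. As a rough sketch of the intended calling convention for lfc_cache_containsv() (not part of this patch), a caller could size the bitmap from PG_IOV_MAX and inspect individual bits afterwards; report_cached_blocks() below is a hypothetical helper:

/* Hypothetical caller sketch, assuming the declarations from
 * pagestore_client.h and the BITMAP_* macros from bitmap.h. */
#include "postgres.h"
#include "port/pg_iovec.h"
#include "bitmap.h"
#include "pagestore_client.h"

static void
report_cached_blocks(NRelFileInfo rinfo, ForkNumber forkNum,
                     BlockNumber first, int nblocks)
{
    bits8   bitmap[(PG_IOV_MAX + 7) / 8] = {0}; /* one bit per block */
    int     ncached;

    Assert(nblocks <= PG_IOV_MAX);

    /* lfc_cache_containsv() sets one bit for every block found in the LFC. */
    ncached = lfc_cache_containsv(rinfo, forkNum, first, nblocks, bitmap);

    for (int i = 0; i < nblocks; i++)
    {
        if (BITMAP_ISSET(bitmap, i))
            elog(DEBUG1, "block %u is present in the local file cache", first + i);
    }

    elog(DEBUG1, "%d of %d blocks found in the local file cache", ncached, nblocks);
}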
+ * Returns the number of pages read from the local cache, and sets bits in + * 'read' for the pages which were read. This may scribble over buffers not + * marked in 'read', so be careful with operation ordering. + * + * In case of error local file cache is disabled (lfc->limit is set to zero), + * and -1 is returned. Note that 'read' and the buffers may be touched and in + * an otherwise invalid state. + * + * If the mask argument is supplied, bits will be set at the offsets of pages + * that were present and read from the LFC. */ -bool -lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - char *buffer) +int +lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + void **buffers, BlockNumber nblocks, bits8 *mask) { BufferTag tag; FileCacheEntry *entry; ssize_t rc; - int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); bool result = true; uint32 hash; uint64 generation; uint32 entry_offset; + int blocks_read = 0; + int buf_offset = 0; if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ - return false; + return 0; if (!lfc_ensure_opened()) - return false; + return 0; CopyNRelFileInfoToBufTag(tag, rinfo); tag.forkNum = forkNum; - tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); - hash = get_hash_value(lfc_hash, &tag); - LWLockAcquire(lfc_lock, LW_EXCLUSIVE); - - if (!LFC_ENABLED()) + /* + * For every chunk that has blocks we're interested in, we + * 1. get the chunk header + * 2. Check if the chunk actually has the blocks we're interested in + * 3. Read the blocks we're looking for (in one preadv), assuming they exist + * 4. Update the statistics for the read call. + * + * If there is an error, we do an early return. + */ + while (nblocks > 0) { + struct iovec iov[PG_IOV_MAX]; + int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); + int blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK)); + int iteration_hits = 0; + int iteration_misses = 0; + Assert(blocks_in_chunk > 0); + + for (int i = 0; i < blocks_in_chunk; i++) + { + iov[i].iov_base = buffers[buf_offset + i]; + iov[i].iov_len = BLCKSZ; + } + + tag.blockNum = blkno - chunk_offs; + hash = get_hash_value(lfc_hash, &tag); + + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + + /* We can return the blocks we've read before LFC got disabled; + * assuming we read any. */ + if (!LFC_ENABLED()) + { + LWLockRelease(lfc_lock); + return blocks_read; + } + + entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); + + /* Approximate working set for the blocks assumed in this entry */ + for (int i = 0; i < blocks_in_chunk; i++) + { + tag.blockNum = blkno + i; + addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); + } + + if (entry == NULL) + { + /* Pages are not cached */ + lfc_ctl->misses += blocks_in_chunk; + pgBufferUsage.file_cache.misses += blocks_in_chunk; + LWLockRelease(lfc_lock); + + buf_offset += blocks_in_chunk; + nblocks -= blocks_in_chunk; + blkno += blocks_in_chunk; + + continue; + } + + /* Unlink entry from LRU list to pin it for the duration of IO operation */ + if (entry->access_count++ == 0) + dlist_delete(&entry->list_node); + + generation = lfc_ctl->generation; + entry_offset = entry->offset; + LWLockRelease(lfc_lock); - return false; - } - entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); + for (int i = 0; i < blocks_in_chunk; i++) + { + /* + * If the page is valid, we consider it "read". 
+ * All other pages will be fetched separately by the next cache + */ + if (entry->bitmap[(chunk_offs + i) / 32] & (1 << ((chunk_offs + i) % 32))) + { + BITMAP_SET(mask, buf_offset + i); + iteration_hits++; + } + else + iteration_misses++; + } - /* Approximate working set */ - tag.blockNum = blkno; - addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); + Assert(iteration_hits + iteration_misses > 0); + + if (iteration_hits != 0) + { + rc = preadv(lfc_desc, iov, blocks_in_chunk, + ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); + + if (rc != (BLCKSZ * blocks_in_chunk)) + { + lfc_disable("read"); + return -1; + } + } + + /* Place entry to the head of LRU list */ + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + + if (lfc_ctl->generation == generation) + { + CriticalAssert(LFC_ENABLED()); + lfc_ctl->hits += iteration_hits; + lfc_ctl->misses += iteration_misses; + pgBufferUsage.file_cache.hits += iteration_hits; + pgBufferUsage.file_cache.misses += iteration_misses; + CriticalAssert(entry->access_count > 0); + if (--entry->access_count == 0) + dlist_push_tail(&lfc_ctl->lru, &entry->list_node); + } + else + { + /* generation mismatch, assume error condition */ + LWLockRelease(lfc_lock); + return -1; + } - if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0) - { - /* Page is not cached */ - lfc_ctl->misses += 1; - pgBufferUsage.file_cache.misses += 1; LWLockRelease(lfc_lock); - return false; - } - /* Unlink entry from LRU list to pin it for the duration of IO operation */ - if (entry->access_count++ == 0) - dlist_delete(&entry->list_node); - generation = lfc_ctl->generation; - entry_offset = entry->offset; - LWLockRelease(lfc_lock); - - rc = pread(lfc_desc, buffer, BLCKSZ, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); - if (rc != BLCKSZ) - { - lfc_disable("read"); - return false; + buf_offset += blocks_in_chunk; + nblocks -= blocks_in_chunk; + blkno += blocks_in_chunk; + blocks_read += iteration_hits; } - /* Place entry to the head of LRU list */ - LWLockAcquire(lfc_lock, LW_EXCLUSIVE); - - if (lfc_ctl->generation == generation) - { - CriticalAssert(LFC_ENABLED()); - lfc_ctl->hits += 1; - pgBufferUsage.file_cache.hits += 1; - CriticalAssert(entry->access_count > 0); - if (--entry->access_count == 0) - dlist_push_tail(&lfc_ctl->lru, &entry->list_node); - } - else - result = false; - - LWLockRelease(lfc_lock); - - return result; + return blocks_read; } /* @@ -640,20 +815,17 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, * If cache is full then evict some other page. 
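For callers of lfc_readv_select(), the returned mask records which buffers were filled from the cache and which still have to come from the pageserver. The sketch below only illustrates that contract; fetch_from_pageserver() is a made-up placeholder, not a function from this patch:

/* Sketch only: fetch_from_pageserver() is a hypothetical stand-in for the
 * remote read path; everything else follows the declarations added above. */
#include "postgres.h"
#include "port/pg_iovec.h"
#include "bitmap.h"
#include "pagestore_client.h"

extern void fetch_from_pageserver(NRelFileInfo rinfo, ForkNumber forkNum,
                                  BlockNumber blkno, void *buffer);

static void
read_blocks(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber first,
            void **buffers, BlockNumber nblocks)
{
    bits8   mask[(PG_IOV_MAX + 7) / 8] = {0};
    int     hits;

    hits = lfc_readv_select(rinfo, forkNum, first, buffers, nblocks, mask);
    if (hits < 0)
    {
        /* LFC got disabled mid-flight; mask and buffers are unreliable. */
        memset(mask, 0, sizeof(mask));
        hits = 0;
    }

    if (hits == (int) nblocks)
        return;                 /* everything came from the local file cache */

    for (BlockNumber i = 0; i < nblocks; i++)
    {
        if (!BITMAP_ISSET(mask, i))
            fetch_from_pageserver(rinfo, forkNum, first + i, buffers[i]);
    }
}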
 */
 void
-#if PG_MAJORVERSION_NUM < 16
-lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, char *buffer)
-#else
-lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void *buffer)
-#endif
+lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
+ const void *const *buffers, BlockNumber nblocks)
 {
 BufferTag tag;
 FileCacheEntry *entry;
 ssize_t rc;
 bool found;
- int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
 uint32 hash;
 uint64 generation;
 uint32 entry_offset;
+ int buf_offset = 0;
 
 if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
 return;
@@ -661,110 +833,142 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
 if (!lfc_ensure_opened())
 return;
 
- tag.forkNum = forkNum;
- tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
 CopyNRelFileInfoToBufTag(tag, rinfo);
+ tag.forkNum = forkNum;
 
 CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
- hash = get_hash_value(lfc_hash, &tag);
 
- LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
-
- if (!LFC_ENABLED())
+ /*
+ * For every chunk that has blocks we're writing, we
+ * 1. get (or create) the chunk header,
+ * 2. pin it so it cannot be evicted while the IO is in flight,
+ * 3. write the blocks that fall into this chunk (in one pwritev), and
+ * 4. update the statistics for the write.
+ *
+ * If the cache gets disabled (e.g. after an IO error), we stop early.
+ */
+ while (nblocks > 0)
 {
- LWLockRelease(lfc_lock);
- return;
- }
+ struct iovec iov[PG_IOV_MAX];
+ int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
+ int blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK));
+ Assert(blocks_in_chunk > 0);
 
- entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found);
-
- if (found)
- {
- /*
- * Unlink entry from LRU list to pin it for the duration of IO
- * operation
- */
- if (entry->access_count++ == 0)
- dlist_delete(&entry->list_node);
- }
- else
- {
- /*
- * We have two choices if all cache pages are pinned (i.e. used in IO
- * operations):
- *
- * 1) Wait until some of this operation is completed and pages is
- * unpinned.
- *
- * 2) Allocate one more chunk, so that specified cache size is more
- * recommendation than hard limit.
- *
- * As far as probability of such event (that all pages are pinned) is
- * considered to be very very small: there are should be very large
- * number of concurrent IO operations and them are limited by
- * max_connections, we prefer not to complicate code and use second
- * approach.
- */ - if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru)) + for (int i = 0; i < blocks_in_chunk; i++) { - /* Cache overflow: evict least recently used chunk */ - FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru)); - - CriticalAssert(victim->access_count == 0); - entry->offset = victim->offset; /* grab victim's chunk */ - hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL); - neon_log(DEBUG2, "Swap file cache page"); + iov[i].iov_base = unconstify(void *, buffers[buf_offset + i]); + iov[i].iov_len = BLCKSZ; } - else if (!dlist_is_empty(&lfc_ctl->holes)) + + tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); + hash = get_hash_value(lfc_hash, &tag); + + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + + if (!LFC_ENABLED()) { - /* We can reuse a hole that was left behind when the LFC was shrunk previously */ - FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->holes)); - uint32 offset = hole->offset; - bool found; + LWLockRelease(lfc_lock); + return; + } - hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &found); - CriticalAssert(found); + entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found); - lfc_ctl->used += 1; - entry->offset = offset; /* reuse the hole */ + if (found) + { + /* + * Unlink entry from LRU list to pin it for the duration of IO + * operation + */ + if (entry->access_count++ == 0) + dlist_delete(&entry->list_node); } else { - lfc_ctl->used += 1; - entry->offset = lfc_ctl->size++; /* allocate new chunk at end - * of file */ - } - entry->access_count = 1; - entry->hash = hash; - memset(entry->bitmap, 0, sizeof entry->bitmap); - } - - generation = lfc_ctl->generation; - entry_offset = entry->offset; - lfc_ctl->writes += 1; - LWLockRelease(lfc_lock); - - rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); - if (rc != BLCKSZ) - { - lfc_disable("write"); - } - else - { - LWLockAcquire(lfc_lock, LW_EXCLUSIVE); - - if (lfc_ctl->generation == generation) - { - CriticalAssert(LFC_ENABLED()); - /* Place entry to the head of LRU list */ - CriticalAssert(entry->access_count > 0); - if (--entry->access_count == 0) - dlist_push_tail(&lfc_ctl->lru, &entry->list_node); - - entry->bitmap[chunk_offs >> 5] |= (1 << (chunk_offs & 31)); + /* + * We have two choices if all cache pages are pinned (i.e. used in IO + * operations): + * + * 1) Wait until some of this operation is completed and pages is + * unpinned. + * + * 2) Allocate one more chunk, so that specified cache size is more + * recommendation than hard limit. + * + * As far as probability of such event (that all pages are pinned) is + * considered to be very very small: there are should be very large + * number of concurrent IO operations and them are limited by + * max_connections, we prefer not to complicate code and use second + * approach. 
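Both lfc_readv_select() and lfc_writev() split a multi-block request at chunk boundaries with blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK)). The stand-alone program below simply walks through that arithmetic for a request that straddles a boundary; BLOCKS_PER_CHUNK is assumed to be 128 here, the real value lives in file_cache.c:

/* Standalone illustration of the chunk-splitting loop used by lfc_writev()
 * and lfc_readv_select(); constants are assumptions for the example. */
#include <stdio.h>

#define BLOCKS_PER_CHUNK 128
#define Min(a, b) ((a) < (b) ? (a) : (b))

int
main(void)
{
    unsigned blkno = 126;       /* starts 2 blocks before a chunk boundary */
    int      nblocks = 5;

    while (nblocks > 0)
    {
        int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
        int blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK));

        /* First iteration: blocks 126..127 (2 blocks); second: 128..130 (3 blocks). */
        printf("chunk base %u, offset %d, %d block(s)\n",
               blkno & ~(BLOCKS_PER_CHUNK - 1u), chunk_offs, blocks_in_chunk);

        blkno += blocks_in_chunk;
        nblocks -= blocks_in_chunk;
    }
    return 0;
}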
+ */ + if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru)) + { + /* Cache overflow: evict least recently used chunk */ + FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru)); + + CriticalAssert(victim->access_count == 0); + entry->offset = victim->offset; /* grab victim's chunk */ + hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL); + neon_log(DEBUG2, "Swap file cache page"); + } + else if (!dlist_is_empty(&lfc_ctl->holes)) + { + /* We can reuse a hole that was left behind when the LFC was shrunk previously */ + FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->holes)); + uint32 offset = hole->offset; + bool found; + + hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &found); + CriticalAssert(found); + + lfc_ctl->used += 1; + entry->offset = offset; /* reuse the hole */ + } + else + { + lfc_ctl->used += 1; + entry->offset = lfc_ctl->size++; /* allocate new chunk at end + * of file */ + } + entry->access_count = 1; + entry->hash = hash; + memset(entry->bitmap, 0, sizeof entry->bitmap); } + generation = lfc_ctl->generation; + entry_offset = entry->offset; + lfc_ctl->writes += blocks_in_chunk; LWLockRelease(lfc_lock); + + rc = pwritev(lfc_desc, iov, blocks_in_chunk, + ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); + if (rc != BLCKSZ * blocks_in_chunk) + { + lfc_disable("write"); + } + else + { + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + + if (lfc_ctl->generation == generation) + { + CriticalAssert(LFC_ENABLED()); + /* Place entry to the head of LRU list */ + CriticalAssert(entry->access_count > 0); + if (--entry->access_count == 0) + dlist_push_tail(&lfc_ctl->lru, &entry->list_node); + + for (int i = 0; i < blocks_in_chunk; i++) + { + entry->bitmap[(chunk_offs + i) >> 5] |= + (1 << ((chunk_offs + i) & 31)); + } + } + + LWLockRelease(lfc_lock); + } + blkno += blocks_in_chunk; + buf_offset += blocks_in_chunk; + nblocks -= blocks_in_chunk; } } diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 5126c26c5d..df7000acc0 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -537,7 +537,11 @@ pageserver_connect(shardno_t shard_no, int elevel) /* No more polling needed; connection succeeded */ shard->last_connect_time = GetCurrentTimestamp(); +#if PG_MAJORVERSION_NUM >= 17 + shard->wes_read = CreateWaitEventSet(NULL, 3); +#else shard->wes_read = CreateWaitEventSet(TopMemoryContext, 3); +#endif AddWaitEventToSet(shard->wes_read, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL); AddWaitEventToSet(shard->wes_read, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, diff --git a/pgxn/neon/neon_pgversioncompat.h b/pgxn/neon/neon_pgversioncompat.h index addb6ccce6..59b97d64fe 100644 --- a/pgxn/neon/neon_pgversioncompat.h +++ b/pgxn/neon/neon_pgversioncompat.h @@ -6,7 +6,11 @@ #ifndef NEON_PGVERSIONCOMPAT_H #define NEON_PGVERSIONCOMPAT_H +#if PG_MAJORVERSION_NUM < 17 #define NRelFileInfoBackendIsTemp(rinfo) (rinfo.backend != InvalidBackendId) +#else +#define NRelFileInfoBackendIsTemp(rinfo) (rinfo.backend != INVALID_PROC_NUMBER) +#endif #define RelFileInfoEquals(a, b) ( \ NInfoGetSpcOid(a) == NInfoGetSpcOid(b) && \ @@ -50,7 +54,7 @@ #define CopyNRelFileInfoToBufTag(tag, rinfo) \ do { \ (tag).rnode = (rinfo); \ - } while (false); + } while (false) #define BufTagGetNRelFileInfo(tag) tag.rnode @@ -98,7 +102,7 @@ (tag).spcOid = (rinfo).spcOid; \ (tag).dbOid = (rinfo).dbOid; \ (tag).relNumber = 
(rinfo).relNumber; \ - } while (false); + } while (false) #define BufTagGetNRelFileInfo(tag) \ ((RelFileLocator) { \ @@ -113,4 +117,10 @@ #define DropRelationAllLocalBuffers DropRelationAllLocalBuffers #endif +#if PG_MAJORVERSION_NUM < 17 +#define ProcNumber BackendId +#define INVALID_PROC_NUMBER InvalidBackendId +#define AmAutoVacuumWorkerProcess() (IsAutoVacuumWorkerProcess()) +#endif + #endif /* NEON_PGVERSIONCOMPAT_H */ diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 1f196d016c..4c9e40a063 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -6,8 +6,6 @@ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * contrib/neon/pagestore_client.h - * *------------------------------------------------------------------------- */ #ifndef pageserver_h @@ -187,7 +185,7 @@ extern char *nm_to_string(NeonMessage *msg); * API */ -typedef unsigned shardno_t; +typedef uint16 shardno_t; typedef struct { @@ -211,7 +209,7 @@ extern int neon_protocol_version; extern shardno_t get_shard_number(BufferTag* tag); -extern const f_smgr *smgr_neon(BackendId backend, NRelFileInfo rinfo); +extern const f_smgr *smgr_neon(ProcNumber backend, NRelFileInfo rinfo); extern void smgr_init_neon(void); extern void readahead_buffer_resize(int newsize, void *extra); @@ -233,8 +231,13 @@ extern void neon_zeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nbuffers, bool skipFsync); #endif +#if PG_MAJORVERSION_NUM >=17 +extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks); +#else extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); +#endif /* * LSN values associated with each request to the pageserver @@ -269,19 +272,11 @@ typedef struct } neon_request_lsns; #if PG_MAJORVERSION_NUM < 16 -extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer); extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, neon_request_lsns request_lsns, char *buffer); -extern void neon_write(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); #else -extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - void *buffer); extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, neon_request_lsns request_lsns, void *buffer); -extern void neon_write(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, const void *buffer, bool skipFsync); #endif extern void neon_writeback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks); @@ -299,17 +294,34 @@ extern void update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockN extern void forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum); /* functions for local file cache */ -#if PG_MAJORVERSION_NUM < 16 -extern void lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - char *buffer); -#else -extern void lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - const void *buffer); -#endif -extern bool lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, char *buffer); -extern bool lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno); +extern void lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, + BlockNumber blkno, const void *const 
*buffers, + BlockNumber nblocks); +/* returns number of blocks read, with one bit set in *read for each */ +extern int lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, + BlockNumber blkno, void **buffers, + BlockNumber nblocks, bits8 *mask); + +extern bool lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, + BlockNumber blkno); +extern int lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, + BlockNumber blkno, int nblocks, bits8 *bitmap); extern void lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno); extern void lfc_init(void); +static inline bool +lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + void *buffer) +{ + bits8 rv = 0; + return lfc_readv_select(rinfo, forkNum, blkno, &buffer, 1, &rv) == 1; +} + +static inline void +lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + const void *buffer) +{ + return lfc_writev(rinfo, forkNum, blkno, &buffer, 1); +} #endif diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 7f39c7d026..36538ea5e2 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -58,6 +58,7 @@ #include "pgstat.h" #include "postmaster/autovacuum.h" #include "postmaster/interrupt.h" +#include "port/pg_iovec.h" #include "replication/walsender.h" #include "storage/bufmgr.h" #include "storage/buf_internals.h" @@ -66,6 +67,7 @@ #include "storage/smgr.h" #include "pagestore_client.h" +#include "bitmap.h" #if PG_VERSION_NUM >= 150000 #include "access/xlogrecovery.h" @@ -170,16 +172,28 @@ typedef enum PrefetchStatus * valid */ } PrefetchStatus; +/* must fit in uint8; bits 0x1 are used */ +typedef enum { + PRFSF_NONE = 0x0, + PRFSF_SEQ = 0x1, +} PrefetchRequestFlags; + typedef struct PrefetchRequest { BufferTag buftag; /* must be first entry in the struct */ + shardno_t shard_no; + uint8 status; /* see PrefetchStatus for valid values */ + uint8 flags; /* see PrefetchRequestFlags */ neon_request_lsns request_lsns; NeonResponse *response; /* may be null */ - PrefetchStatus status; - shardno_t shard_no; uint64 my_ring_index; } PrefetchRequest; +StaticAssertDecl(sizeof(PrefetchRequest) == 64, + "We prefer to have a power-of-2 size for this struct. 
Please" + " try to find an alternative solution before reaching to" + " increase the expected size here"); + /* prefetch buffer lookup hash table */ typedef struct PrfHashEntry @@ -251,17 +265,17 @@ typedef struct PrefetchState PrefetchRequest prf_buffer[]; /* prefetch buffers */ } PrefetchState; -#define BITMAP_ISSET(bm, bit) ((bm)[(bit) >> 3] & (1 << ((bit) & 7))) -#define BITMAP_SET(bm, bit) (bm)[(bit) >> 3] |= (1 << ((bit) & 7)) -#define BITMAP_CLR(bm, bit) (bm)[(bit) >> 3] &= ~(1 << ((bit) & 7)) - static PrefetchState *MyPState; +#define GetPrfSlotNoCheck(ring_index) ( \ + &MyPState->prf_buffer[((ring_index) % readahead_buffer_size)] \ +) + #define GetPrfSlot(ring_index) ( \ ( \ AssertMacro((ring_index) < MyPState->ring_unused && \ (ring_index) >= MyPState->ring_last), \ - &MyPState->prf_buffer[((ring_index) % readahead_buffer_size)] \ + GetPrfSlotNoCheck(ring_index) \ ) \ ) @@ -281,9 +295,17 @@ static void prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_ static bool prefetch_wait_for(uint64 ring_index); static void prefetch_cleanup_trailing_unused(void); static inline void prefetch_set_unused(uint64 ring_index); +#if PG_MAJORVERSION_NUM < 17 +static void +GetLastWrittenLSNv(NRelFileInfo relfilenode, ForkNumber forknum, + BlockNumber blkno, int nblocks, XLogRecPtr *lsns); +#endif -static neon_request_lsns neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno); -static bool neon_prefetch_response_usable(neon_request_lsns request_lsns, +static void +neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, + BlockNumber blkno, neon_request_lsns *output, + BlockNumber nblocks, const bits8 *mask); +static bool neon_prefetch_response_usable(neon_request_lsns *request_lsns, PrefetchRequest *slot); static bool @@ -729,9 +751,9 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns if (force_request_lsns) slot->request_lsns = *force_request_lsns; else - slot->request_lsns = neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag), - slot->buftag.forkNum, - slot->buftag.blockNum); + neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag), + slot->buftag.forkNum, slot->buftag.blockNum, + &slot->request_lsns, 1, NULL); request.req.lsn = slot->request_lsns.request_lsn; request.req.not_modified_since = slot->request_lsns.not_modified_since; @@ -771,141 +793,194 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns */ static uint64 -prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns) +prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, + BlockNumber nblocks, const bits8 *mask) { - uint64 ring_index; + uint64 min_ring_index; PrefetchRequest req; - PrefetchRequest *slot; - PrfHashEntry *entry; +#if USE_ASSERT_CHECKING + bool any_hits = false; +#endif + /* We will never read further ahead than our buffer can store. 
*/ + nblocks = Max(1, Min(nblocks, readahead_buffer_size)); /* use an intermediate PrefetchRequest struct to ensure correct alignment */ req.buftag = tag; + Retry: - entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &req); - - if (entry != NULL) + min_ring_index = UINT64_MAX; + for (int i = 0; i < nblocks; i++) { - slot = entry->slot; - ring_index = slot->my_ring_index; - Assert(slot == GetPrfSlot(ring_index)); + PrefetchRequest *slot = NULL; + PrfHashEntry *entry = NULL; + uint64 ring_index; + neon_request_lsns *lsns; + if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i)) + continue; - Assert(slot->status != PRFS_UNUSED); - Assert(MyPState->ring_last <= ring_index && - ring_index < MyPState->ring_unused); - Assert(BUFFERTAGS_EQUAL(slot->buftag, tag)); + if (frlsns) + lsns = &frlsns[i]; + else + lsns = NULL; - /* - * If the caller specified a request LSN to use, only accept prefetch - * responses that satisfy that request. - */ - if (force_request_lsns) - { - if (!neon_prefetch_response_usable(*force_request_lsns, slot)) - { - /* Wait for the old request to finish and discard it */ - if (!prefetch_wait_for(ring_index)) - goto Retry; - prefetch_set_unused(ring_index); - entry = NULL; - } - } +#if USE_ASSERT_CHECKING + any_hits = true; +#endif + + slot = NULL; + entry = NULL; + + req.buftag.blockNum = tag.blockNum + i; + entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &req); if (entry != NULL) { + slot = entry->slot; + ring_index = slot->my_ring_index; + Assert(slot == GetPrfSlot(ring_index)); + + Assert(slot->status != PRFS_UNUSED); + Assert(MyPState->ring_last <= ring_index && + ring_index < MyPState->ring_unused); + Assert(BUFFERTAGS_EQUAL(slot->buftag, req.buftag)); + /* - * We received a prefetch for a page that was recently read and - * removed from the buffers. Remove that request from the buffers. + * If the caller specified a request LSN to use, only accept + * prefetch responses that satisfy that request. */ - if (slot->status == PRFS_TAG_REMAINS) + if (lsns) { - prefetch_set_unused(ring_index); - entry = NULL; + if (!neon_prefetch_response_usable(lsns, slot)) + { + /* Wait for the old request to finish and discard it */ + if (!prefetch_wait_for(ring_index)) + goto Retry; + prefetch_set_unused(ring_index); + entry = NULL; + slot = NULL; + } + } + + if (entry != NULL) + { + /* + * We received a prefetch for a page that was recently read + * and removed from the buffers. Remove that request from the + * buffers. + */ + if (slot->status == PRFS_TAG_REMAINS) + { + prefetch_set_unused(ring_index); + entry = NULL; + slot = NULL; + } + else + { + min_ring_index = Min(min_ring_index, ring_index); + /* The buffered request is good enough, return that index */ + pgBufferUsage.prefetch.duplicates++; + continue; + } + } + } + + /* + * We can only leave the block above by finding that there's + * no entry that can satisfy this request, either because there + * was no entry, or because the entry was invalid or didn't satisfy + * the LSNs provided. + * + * The code should've made sure to clear up the data. + */ + Assert(entry == NULL); + Assert(slot == NULL); + + /* + * If the prefetch queue is full, we need to make room by clearing the + * oldest slot. If the oldest slot holds a buffer that was already + * received, we can just throw it away; we fetched the page + * unnecessarily in that case. If the oldest slot holds a request that + * we haven't received a response for yet, we have to wait for the + * response to that before we can continue. 
We might not have even + * flushed the request to the pageserver yet, it might be just sitting + * in the output buffer. In that case, we flush it and wait for the + * response. (We could decide not to send it, but it's hard to abort + * when the request is already in the output buffer, and 'not sending' + * a prefetch request kind of goes against the principles of + * prefetching) + */ + if (MyPState->ring_last + readahead_buffer_size - 1 == MyPState->ring_unused) + { + uint64 cleanup_index = MyPState->ring_last; + + slot = GetPrfSlot(cleanup_index); + + Assert(slot->status != PRFS_UNUSED); + + /* + * If there is good reason to run compaction on the prefetch buffers, + * try to do that. + */ + if (ReceiveBufferNeedsCompaction() && compact_prefetch_buffers()) + { + Assert(slot->status == PRFS_UNUSED); } else { - /* The buffered request is good enough, return that index */ - pgBufferUsage.prefetch.duplicates++; - return ring_index; + /* + * We have the slot for ring_last, so that must still be in + * progress + */ + switch (slot->status) + { + case PRFS_REQUESTED: + Assert(MyPState->ring_receive == cleanup_index); + if (!prefetch_wait_for(cleanup_index)) + goto Retry; + prefetch_set_unused(cleanup_index); + break; + case PRFS_RECEIVED: + case PRFS_TAG_REMAINS: + prefetch_set_unused(cleanup_index); + break; + default: + pg_unreachable(); + } } } - } - - /* - * If the prefetch queue is full, we need to make room by clearing the - * oldest slot. If the oldest slot holds a buffer that was already - * received, we can just throw it away; we fetched the page unnecessarily - * in that case. If the oldest slot holds a request that we haven't - * received a response for yet, we have to wait for the response to that - * before we can continue. We might not have even flushed the request to - * the pageserver yet, it might be just sitting in the output buffer. In - * that case, we flush it and wait for the response. (We could decide not - * to send it, but it's hard to abort when the request is already in the - * output buffer, and 'not sending' a prefetch request kind of goes - * against the principles of prefetching) - */ - if (MyPState->ring_last + readahead_buffer_size - 1 == MyPState->ring_unused) - { - uint64 cleanup_index = MyPState->ring_last; - - slot = GetPrfSlot(cleanup_index); - - Assert(slot->status != PRFS_UNUSED); /* - * If there is good reason to run compaction on the prefetch buffers, - * try to do that. + * The next buffer pointed to by `ring_unused` is now definitely empty, so + * we can insert the new request to it. */ - if (ReceiveBufferNeedsCompaction() && compact_prefetch_buffers()) - { - Assert(slot->status == PRFS_UNUSED); - } - else - { - /* - * We have the slot for ring_last, so that must still be in - * progress - */ - switch (slot->status) - { - case PRFS_REQUESTED: - Assert(MyPState->ring_receive == cleanup_index); - if (!prefetch_wait_for(cleanup_index)) - goto Retry; - prefetch_set_unused(cleanup_index); - break; - case PRFS_RECEIVED: - case PRFS_TAG_REMAINS: - prefetch_set_unused(cleanup_index); - break; - default: - pg_unreachable(); - } - } + ring_index = MyPState->ring_unused; + + Assert(MyPState->ring_last <= ring_index && + ring_index <= MyPState->ring_unused); + + slot = GetPrfSlotNoCheck(ring_index); + + Assert(slot->status == PRFS_UNUSED); + + /* + * We must update the slot data before insertion, because the hash + * function reads the buffer tag from the slot. 
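Prefetch slots are addressed by taking a monotonically growing 64-bit ring index modulo readahead_buffer_size, and the queue counts as full once ring_last + readahead_buffer_size - 1 == ring_unused. The toy model below only demonstrates that index arithmetic; READAHEAD_BUFFER_SIZE is an assumed constant standing in for the GUC:

/* Minimal stand-alone model of the prefetch ring indexing: monotonically
 * increasing 64-bit indexes mapped onto a fixed number of slots. A sketch,
 * not the extension's code. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define READAHEAD_BUFFER_SIZE 8 /* assumed; really a GUC in the extension */

int
main(void)
{
    uint64_t ring_last = 0;     /* oldest live request */
    uint64_t ring_unused = 0;   /* next free index */

    /* Fill until the "full" condition from prefetch_register_bufferv() trips. */
    while (ring_last + READAHEAD_BUFFER_SIZE - 1 != ring_unused)
    {
        unsigned slot = (unsigned) (ring_unused % READAHEAD_BUFFER_SIZE);

        printf("request %llu lands in slot %u\n",
               (unsigned long long) ring_unused, slot);
        ring_unused++;
    }

    /* The oldest entry must now be waited for or dropped before the next insert. */
    assert(ring_unused - ring_last == READAHEAD_BUFFER_SIZE - 1);
    return 0;
}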
+ */ + slot->buftag = req.buftag; + slot->shard_no = get_shard_number(&tag); + slot->my_ring_index = ring_index; + + min_ring_index = Min(min_ring_index, ring_index); + + prefetch_do_request(slot, lsns); } - /* - * The next buffer pointed to by `ring_unused` is now definitely empty, so - * we can insert the new request to it. - */ - ring_index = MyPState->ring_unused; - slot = &MyPState->prf_buffer[((ring_index) % readahead_buffer_size)]; + Assert(any_hits); - Assert(MyPState->ring_last <= ring_index); - - Assert(slot->status == PRFS_UNUSED); - - /* - * We must update the slot data before insertion, because the hash - * function reads the buffer tag from the slot. - */ - slot->buftag = tag; - slot->shard_no = get_shard_number(&tag); - slot->my_ring_index = ring_index; - - prefetch_do_request(slot, force_request_lsns); - Assert(slot->status == PRFS_REQUESTED); - Assert(MyPState->ring_last <= ring_index && - ring_index < MyPState->ring_unused); + Assert(GetPrfSlot(min_ring_index)->status == PRFS_REQUESTED || + GetPrfSlot(min_ring_index)->status == PRFS_RECEIVED); + Assert(MyPState->ring_last <= min_ring_index && + min_ring_index < MyPState->ring_unused); if (flush_every_n_requests > 0 && MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests) @@ -921,9 +996,17 @@ Retry: MyPState->ring_flush = MyPState->ring_unused; } - return ring_index; + return min_ring_index; } + +static uint64 +prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns) +{ + return prefetch_register_bufferv(tag, force_request_lsns, 1, NULL); +} + + /* * Note: this function can get canceled and use a long jump to the next catch * context. Take care. @@ -1348,6 +1431,50 @@ log_newpage_copy(NRelFileInfo * rinfo, ForkNumber forkNum, BlockNumber blkno, return log_newpage(rinfo, forkNum, blkno, copied_buffer.data, page_std); } +#if PG_MAJORVERSION_NUM >= 17 +/* + * Wrapper around log_newpages() that makes a temporary copy of the block and + * WAL-logs that. This makes it safe to use while holding only a shared lock + * on the page, see XLogSaveBufferForHint. We don't use XLogSaveBufferForHint + * directly because it skips the logging if the LSN is new enough. + */ +static XLogRecPtr +log_newpages_copy(NRelFileInfo * rinfo, ForkNumber forkNum, BlockNumber blkno, + BlockNumber nblocks, Page *pages, bool page_std) +{ + PGAlignedBlock copied_buffer[XLR_MAX_BLOCK_ID]; + BlockNumber blknos[XLR_MAX_BLOCK_ID]; + Page pageptrs[XLR_MAX_BLOCK_ID]; + int nregistered = 0; + XLogRecPtr result = 0; + + for (int i = 0; i < nblocks; i++) + { + Page page = copied_buffer[nregistered].data; + memcpy(page, pages[i], BLCKSZ); + pageptrs[nregistered] = page; + blknos[nregistered] = blkno + i; + + ++nregistered; + + if (nregistered >= XLR_MAX_BLOCK_ID) + { + log_newpages(rinfo, forkNum, nregistered, blknos, pageptrs, + page_std); + nregistered = 0; + } + } + + if (nregistered != 0) + { + log_newpages(rinfo, forkNum, nregistered, blknos, pageptrs, + page_std); + } + + return ProcLastRecPtr; +} +#endif /* PG_MAJORVERSION_NUM >= 17 */ + /* * Is 'buffer' identical to a freshly initialized empty heap page? 
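log_newpages_copy() flushes its accumulated pages to log_newpages() every time XLR_MAX_BLOCK_ID registrations are reached, plus one final flush for the remainder. The stand-alone snippet below just replays that batching arithmetic; the value 32 for XLR_MAX_BLOCK_ID is assumed from PostgreSQL's xlogrecord.h:

/* Stand-alone arithmetic check for the batching in log_newpages_copy(). */
#include <stdio.h>

#define XLR_MAX_BLOCK_ID 32     /* assumed to match xlogrecord.h */

int
main(void)
{
    int nblocks = 70;           /* arbitrary example */
    int nregistered = 0;
    int flushes = 0;

    for (int i = 0; i < nblocks; i++)
    {
        nregistered++;
        if (nregistered >= XLR_MAX_BLOCK_ID)
        {
            printf("flush %d: %d pages\n", ++flushes, nregistered);
            nregistered = 0;
        }
    }
    if (nregistered != 0)
        printf("final flush %d: %d pages\n", ++flushes, nregistered);

    /* Expected output: two flushes of 32 pages and a final flush of 6. */
    return 0;
}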
*/ @@ -1361,14 +1488,160 @@ PageIsEmptyHeapPage(char *buffer) return memcmp(buffer, empty_page.data, BLCKSZ) == 0; } +#if PG_MAJORVERSION_NUM >= 17 +static void +neon_wallog_pagev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + BlockNumber nblocks, const char **buffers, bool force) +{ +#define BLOCK_BATCH_SIZE 16 + bool log_pages; + BlockNumber batch_blockno = blocknum; + XLogRecPtr lsns[BLOCK_BATCH_SIZE]; + int batch_size = 0; + + /* + * Whenever a VM or FSM page is evicted, WAL-log it. FSM and (some) VM + * changes are not WAL-logged when the changes are made, so this is our + * last chance to log them, otherwise they're lost. That's OK for + * correctness, the non-logged updates are not critical. But we want to + * have a reasonably up-to-date VM and FSM in the page server. + */ + log_pages = false; + if (force) + { + Assert(XLogInsertAllowed()); + log_pages = true; + } + else if (XLogInsertAllowed() && + !ShutdownRequestPending && + (forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM)) + { + log_pages = true; + } + + if (log_pages) + { + XLogRecPtr recptr; + recptr = log_newpages_copy(&InfoFromSMgrRel(reln), forknum, blocknum, + nblocks, (Page *) buffers, false); + + for (int i = 0; i < nblocks; i++) + PageSetLSN(unconstify(char *, buffers[i]), recptr); + + ereport(SmgrTrace, + (errmsg(NEON_TAG "Page %u through %u of relation %u/%u/%u.%u " + "were force logged, lsn=%X/%X", + blocknum, blocknum + nblocks, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum, LSN_FORMAT_ARGS(recptr)))); + } + + for (int i = 0; i < nblocks; i++) + { + Page page = (Page) buffers[i]; + BlockNumber blkno = blocknum + i; + XLogRecPtr lsn = PageGetLSN(page); + + if (lsn == InvalidXLogRecPtr) + { + /* + * When PostgreSQL extends a relation, it calls smgrextend() with an + * all-zeros pages, and we can just ignore that in Neon. We do need to + * remember the new size, though, so that smgrnblocks() returns the + * right answer after the rel has been extended. We rely on the + * relsize cache for that. + * + * A completely empty heap page doesn't need to be WAL-logged, either. + * The heapam can leave such a page behind, if e.g. an insert errors + * out after initializing the page, but before it has inserted the + * tuple and WAL-logged the change. When we read the page from the + * page server, it will come back as all-zeros. That's OK, the heapam + * will initialize an all-zeros page on first use. + * + * In other scenarios, evicting a dirty page with no LSN is a bad + * sign: it implies that the page was not WAL-logged, and its contents + * will be lost when it's evicted. + */ + if (PageIsNew(page)) + { + ereport(SmgrTrace, + (errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is all-zeros", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum))); + } + else if (PageIsEmptyHeapPage(page)) + { + ereport(SmgrTrace, + (errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum))); + } + else if (forknum != FSM_FORKNUM && forknum != VISIBILITYMAP_FORKNUM) + { + /* + * Its a bad sign if there is a page with zero LSN in the buffer + * cache in a standby, too. However, PANICing seems like a cure + * worse than the disease, as the damage has likely already been + * done in the primary. So in a standby, make this an assertion, + * and in a release build just LOG the error and soldier on. We + * update the last-written LSN of the page with a conservative + * value in that case, which is the last replayed LSN. 
+ */ + ereport(RecoveryInProgress() ? LOG : PANIC, + (errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is evicted with zero LSN", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum))); + Assert(false); + + lsn = GetXLogReplayRecPtr(NULL); /* in standby mode, soldier on */ + } + } + else + { + ereport(SmgrTrace, + (errmsg(NEON_TAG "Evicting page %u of relation %u/%u/%u.%u with lsn=%X/%X", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum, LSN_FORMAT_ARGS(lsn)))); + } + + /* + * Remember the LSN on this page. When we read the page again, we must + * read the same or newer version of it. + */ + lsns[batch_size++] = lsn; + + if (batch_size >= BLOCK_BATCH_SIZE) + { + SetLastWrittenLSNForBlockv(lsns, InfoFromSMgrRel(reln), forknum, + batch_blockno, + batch_size); + batch_blockno += batch_size; + batch_size = 0; + } + } + + if (batch_size != 0) + { + SetLastWrittenLSNForBlockv(lsns, InfoFromSMgrRel(reln), forknum, + batch_blockno, + batch_size); + } +} +#endif + /* * A page is being evicted from the shared buffer cache. Update the * last-written LSN of the page, and WAL-log it if needed. */ -static void #if PG_MAJORVERSION_NUM < 16 +static void neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force) #else +static void neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const char *buffer, bool force) #endif { @@ -1548,18 +1821,39 @@ nm_adjust_lsn(XLogRecPtr lsn) return lsn; } + +/* + * Since PG17 we use vetorized version, + * so add compatibility function for older versions + */ +#if PG_MAJORVERSION_NUM < 17 +static void +GetLastWrittenLSNv(NRelFileInfo relfilenode, ForkNumber forknum, + BlockNumber blkno, int nblocks, XLogRecPtr *lsns) +{ + lsns[0] = GetLastWrittenLSN(relfilenode, forknum, blkno); +} +#endif + /* * Return LSN for requesting pages and number of blocks from page server */ -static neon_request_lsns -neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno) +static void +neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, + neon_request_lsns *output, BlockNumber nblocks, + const bits8 *mask) { - XLogRecPtr last_written_lsn; - neon_request_lsns result; + XLogRecPtr last_written_lsns[PG_IOV_MAX]; - last_written_lsn = GetLastWrittenLSN(rinfo, forknum, blkno); - last_written_lsn = nm_adjust_lsn(last_written_lsn); - Assert(last_written_lsn != InvalidXLogRecPtr); + Assert(nblocks <= PG_IOV_MAX); + + GetLastWrittenLSNv(rinfo, forknum, blkno, (int) nblocks, last_written_lsns); + + for (int i = 0; i < nblocks; i++) + { + last_written_lsns[i] = nm_adjust_lsn(last_written_lsns[i]); + Assert(last_written_lsns[i] != InvalidXLogRecPtr); + } if (RecoveryInProgress()) { @@ -1630,95 +1924,111 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno) /* Request the page at the end of the last fully replayed LSN. */ XLogRecPtr replay_lsn = GetXLogReplayRecPtr(NULL); - if (last_written_lsn > replay_lsn) + for (int i = 0; i < nblocks; i++) { - /* GetCurrentReplayRecPtr was introduced in v15 */ + neon_request_lsns *result = &output[i]; + XLogRecPtr last_written_lsn = last_written_lsns[i]; + + if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i)) + continue; + + if (last_written_lsn > replay_lsn) + { + /* GetCurrentReplayRecPtr was introduced in v15 */ #if PG_VERSION_NUM >= 150000 - Assert(last_written_lsn == GetCurrentReplayRecPtr(NULL)); + Assert(last_written_lsn == GetCurrentReplayRecPtr(NULL)); #endif - /* - * Cases 2 and 4. 
If this is a backend (case 4), the - * neon_read_at_lsn() call later will wait for the WAL record to be - * fully replayed. - */ - result.request_lsn = last_written_lsn; - } - else - { - /* cases 1 and 3 */ - result.request_lsn = replay_lsn; - } - result.not_modified_since = last_written_lsn; - result.effective_request_lsn = result.request_lsn; - Assert(last_written_lsn <= result.request_lsn); + /* + * Cases 2 and 4. If this is a backend (case 4), the + * neon_read_at_lsn() call later will wait for the WAL record to be + * fully replayed. + */ + result->request_lsn = last_written_lsn; + } + else + { + /* cases 1 and 3 */ + result->request_lsn = replay_lsn; + } - neon_log(DEBUG1, "neon_get_request_lsns request lsn %X/%X, not_modified_since %X/%X", - LSN_FORMAT_ARGS(result.request_lsn), LSN_FORMAT_ARGS(result.not_modified_since)); + result->not_modified_since = last_written_lsn; + result->effective_request_lsn = result->request_lsn; + Assert(last_written_lsn <= result->request_lsn); + + neon_log(DEBUG1, "neon_get_request_lsns request lsn %X/%X, not_modified_since %X/%X", + LSN_FORMAT_ARGS(result->request_lsn), LSN_FORMAT_ARGS(result->not_modified_since)); + } } else { XLogRecPtr flushlsn; - - /* - * Use the latest LSN that was evicted from the buffer cache as the - * 'not_modified_since' hint. Any pages modified by later WAL records - * must still in the buffer cache, so our request cannot concern - * those. - */ - neon_log(DEBUG1, "neon_get_request_lsns GetLastWrittenLSN lsn %X/%X", - LSN_FORMAT_ARGS(last_written_lsn)); - - /* - * Is it possible that the last-written LSN is ahead of last flush - * LSN? Generally not, we shouldn't evict a page from the buffer cache - * before all its modifications have been safely flushed. That's the - * "WAL before data" rule. However, such case does exist at index - * building, _bt_blwritepage logs the full page without flushing WAL - * before smgrextend (files are fsynced before build ends). - */ #if PG_VERSION_NUM >= 150000 flushlsn = GetFlushRecPtr(NULL); #else flushlsn = GetFlushRecPtr(); #endif - if (last_written_lsn > flushlsn) + + for (int i = 0; i < nblocks; i++) { - neon_log(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X", - LSN_FORMAT_ARGS(last_written_lsn), - LSN_FORMAT_ARGS(flushlsn)); - XLogFlush(last_written_lsn); - flushlsn = last_written_lsn; + neon_request_lsns *result = &output[i]; + XLogRecPtr last_written_lsn = last_written_lsns[i]; + + if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i)) + continue; + /* + * Use the latest LSN that was evicted from the buffer cache as the + * 'not_modified_since' hint. Any pages modified by later WAL records + * must still in the buffer cache, so our request cannot concern + * those. + */ + neon_log(DEBUG1, "neon_get_request_lsns GetLastWrittenLSN lsn %X/%X", + LSN_FORMAT_ARGS(last_written_lsn)); + + /* + * Is it possible that the last-written LSN is ahead of last flush + * LSN? Generally not, we shouldn't evict a page from the buffer cache + * before all its modifications have been safely flushed. That's the + * "WAL before data" rule. However, such case does exist at index + * building, _bt_blwritepage logs the full page without flushing WAL + * before smgrextend (files are fsynced before build ends). 
+ */ + if (last_written_lsn > flushlsn) + { + neon_log(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X", + LSN_FORMAT_ARGS(last_written_lsn), + LSN_FORMAT_ARGS(flushlsn)); + XLogFlush(last_written_lsn); + flushlsn = last_written_lsn; + } + + /* + * Request the very latest version of the page. In principle we + * want to read the page at the current insert LSN, and we could + * use that value in the request. However, there's a corner case + * with pageserver's garbage collection. If the GC horizon is + * set to a very small value, it's possible that by the time + * that the pageserver processes our request, the GC horizon has + * already moved past the LSN we calculate here. Standby servers + * always have that problem as the can always lag behind the + * primary, but for the primary we can avoid it by always + * requesting the latest page, by setting request LSN to + * UINT64_MAX. + * + * Remember the current LSN, however, so that we can later + * correctly determine if the response to the request is still + * valid. The most up-to-date LSN we could use for that purpose + * would be the current insert LSN, but to avoid the overhead of + * looking it up, use 'flushlsn' instead. This relies on the + * assumption that if the page was modified since the last WAL + * flush, it should still be in the buffer cache, and we + * wouldn't be requesting it. + */ + result->request_lsn = UINT64_MAX; + result->not_modified_since = last_written_lsn; + result->effective_request_lsn = flushlsn; } - - /* - * Request the very latest version of the page. In principle we - * want to read the page at the current insert LSN, and we could - * use that value in the request. However, there's a corner case - * with pageserver's garbage collection. If the GC horizon is - * set to a very small value, it's possible that by the time - * that the pageserver processes our request, the GC horizon has - * already moved past the LSN we calculate here. Standby servers - * always have that problem as the can always lag behind the - * primary, but for the primary we can avoid it by always - * requesting the latest page, by setting request LSN to - * UINT64_MAX. - * - * Remember the current LSN, however, so that we can later - * correctly determine if the response to the request is still - * valid. The most up-to-date LSN we could use for that purpose - * would be the current insert LSN, but to avoid the overhead of - * looking it up, use 'flushlsn' instead. This relies on the - * assumption that if the page was modified since the last WAL - * flush, it should still be in the buffer cache, and we - * wouldn't be requesting it. - */ - result.request_lsn = UINT64_MAX; - result.not_modified_since = last_written_lsn; - result.effective_request_lsn = flushlsn; } - - return result; } /* @@ -1728,13 +2038,13 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno) * satisfy a page read now. 
*/ static bool -neon_prefetch_response_usable(neon_request_lsns request_lsns, +neon_prefetch_response_usable(neon_request_lsns *request_lsns, PrefetchRequest *slot) { /* sanity check the LSN's on the old and the new request */ - Assert(request_lsns.request_lsn >= request_lsns.not_modified_since); - Assert(request_lsns.effective_request_lsn >= request_lsns.not_modified_since); - Assert(request_lsns.effective_request_lsn <= request_lsns.request_lsn); + Assert(request_lsns->request_lsn >= request_lsns->not_modified_since); + Assert(request_lsns->effective_request_lsn >= request_lsns->not_modified_since); + Assert(request_lsns->effective_request_lsn <= request_lsns->request_lsn); Assert(slot->request_lsns.request_lsn >= slot->request_lsns.not_modified_since); Assert(slot->request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since); Assert(slot->request_lsns.effective_request_lsn <= slot->request_lsns.request_lsn); @@ -1755,15 +2065,15 @@ neon_prefetch_response_usable(neon_request_lsns request_lsns, * calculate LSNs "out of order" with each other, but the prefetch queue * is backend-private at the moment.) */ - if (request_lsns.effective_request_lsn < slot->request_lsns.effective_request_lsn || - request_lsns.not_modified_since < slot->request_lsns.not_modified_since) + if (request_lsns->effective_request_lsn < slot->request_lsns.effective_request_lsn || + request_lsns->not_modified_since < slot->request_lsns.not_modified_since) { ereport(LOG, (errcode(ERRCODE_IO_ERROR), errmsg(NEON_TAG "request with unexpected LSN after prefetch"), errdetail("Request %X/%X not_modified_since %X/%X, prefetch %X/%X not_modified_since %X/%X)", - LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), - LSN_FORMAT_ARGS(request_lsns.not_modified_since), + LSN_FORMAT_ARGS(request_lsns->effective_request_lsn), + LSN_FORMAT_ARGS(request_lsns->not_modified_since), LSN_FORMAT_ARGS(slot->request_lsns.effective_request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since)))); return false; @@ -1817,9 +2127,9 @@ neon_prefetch_response_usable(neon_request_lsns request_lsns, */ /* this follows from the checks above */ - Assert(request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since); + Assert(request_lsns->effective_request_lsn >= slot->request_lsns.not_modified_since); - return request_lsns.not_modified_since <= slot->request_lsns.effective_request_lsn; + return request_lsns->not_modified_since <= slot->request_lsns.effective_request_lsn; } /* @@ -1886,7 +2196,8 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) return false; } - request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO); + neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, + REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1, NULL); { NeonExistsRequest request = { .req.tag = T_NeonExistsRequest, @@ -2068,7 +2379,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, */ if (max_cluster_size > 0 && reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT && - !IsAutoVacuumWorkerProcess()) + !AmAutoVacuumWorkerProcess()) { uint64 current_size = GetNeonCurrentClusterSize(); @@ -2149,7 +2460,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, if (max_cluster_size > 0 && reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT && - !IsAutoVacuumWorkerProcess()) + !AmAutoVacuumWorkerProcess()) { uint64 current_size = GetNeonCurrentClusterSize(); @@ -2247,6 +2558,73 @@ neon_close(SMgrRelation reln, ForkNumber forknum) } +#if 
PG_MAJORVERSION_NUM >= 17 +/* + * neon_prefetch() -- Initiate asynchronous read of the specified block of a relation + */ +bool +neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks) +{ + uint64 ring_index PG_USED_FOR_ASSERTS_ONLY; + BufferTag tag; + bool io_initiated = false; + + switch (reln->smgr_relpersistence) + { + case 0: /* probably shouldn't happen, but ignore it */ + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + return mdprefetch(reln, forknum, blocknum, nblocks); + + default: + neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + tag.spcOid = reln->smgr_rlocator.locator.spcOid; + tag.dbOid = reln->smgr_rlocator.locator.dbOid; + tag.relNumber = reln->smgr_rlocator.locator.relNumber; + tag.forkNum = forknum; + + while (nblocks > 0) + { + int iterblocks = Min(nblocks, PG_IOV_MAX); + int seqlen = 0; + bits8 lfc_present[PG_IOV_MAX / 8]; + memset(lfc_present, 0, sizeof(lfc_present)); + + if (lfc_cache_containsv(InfoFromSMgrRel(reln), forknum, blocknum, + iterblocks, lfc_present) == iterblocks) + { + nblocks -= iterblocks; + blocknum += iterblocks; + continue; + } + + io_initiated = true; + + tag.blockNum = blocknum; + + for (int i = 0; i < PG_IOV_MAX / 8; i++) + lfc_present[i] = ~(lfc_present[i]); + + ring_index = prefetch_register_bufferv(tag, NULL, iterblocks, + lfc_present); + nblocks -= iterblocks; + blocknum += iterblocks; + + Assert(ring_index < MyPState->ring_unused && + MyPState->ring_last <= ring_index); + } + + return false; +} + + +#else /* PG_MAJORVERSION_NUM >= 17 */ /* * neon_prefetch() -- Initiate asynchronous read of the specified block of a relation */ @@ -2285,6 +2663,8 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) return false; } +#endif /* PG_MAJORVERSION_NUM < 17 */ + /* * neon_writeback() -- Tell the kernel to write pages back to storage. @@ -2315,7 +2695,12 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - /* not implemented */ + /* + * TODO: WAL sync up to lwLsn for the indicated blocks + * Without that sync, writeback doesn't actually guarantee the data is + * persistently written, which does seem to be one of the assumed + * properties of this smgr API call. + */ neon_log(SmgrTrace, "writeback noop"); #ifdef DEBUG_COMPARE_LOCAL @@ -2324,30 +2709,27 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, #endif } -/* - * While function is defined in the neon extension it's used within neon_test_utils directly. - * To avoid breaking tests in the runtime please keep function signature in sync. 
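The PG17 neon_prefetch() above uses lfc_cache_containsv() to mark blocks already present in the LFC and then inverts the bitmap bytes, so that set bits mean "still needs a prefetch request"; bits beyond iterblocks are harmless because prefetch_register_bufferv() only inspects the first nblocks bits. The stand-alone snippet below mirrors that inversion with made-up values:

/* Stand-alone illustration of the mask handling in the PG17 neon_prefetch();
 * PG_IOV_MAX and the macro definitions are local assumptions for the demo. */
#include <stdio.h>

#define PG_IOV_MAX 32
#define BITMAP_ISSET(bm, bit) ((bm)[(bit) >> 3] & (1 << ((bit) & 7)))
#define BITMAP_SET(bm, bit)   ((bm)[(bit) >> 3] |= (1 << ((bit) & 7)))

int
main(void)
{
    unsigned char lfc_present[PG_IOV_MAX / 8] = {0};
    int iterblocks = 10;

    /* Pretend blocks 0, 3 and 7 were found in the local file cache. */
    BITMAP_SET(lfc_present, 0);
    BITMAP_SET(lfc_present, 3);
    BITMAP_SET(lfc_present, 7);

    /* Same byte-wise inversion as in neon_prefetch(): set bits now mean
     * "not cached"; bits past iterblocks are never consulted. */
    for (int i = 0; i < PG_IOV_MAX / 8; i++)
        lfc_present[i] = (unsigned char) ~lfc_present[i];

    for (int i = 0; i < iterblocks; i++)
        if (BITMAP_ISSET(lfc_present, i))
            printf("block offset %d needs a prefetch request\n", i);

    return 0;
}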
- */ -void +static void #if PG_MAJORVERSION_NUM < 16 -neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - neon_request_lsns request_lsns, char *buffer) +neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_blockno, neon_request_lsns *request_lsns, + char **buffers, BlockNumber nblocks, const bits8 *mask) #else -neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - neon_request_lsns request_lsns, void *buffer) +neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_blockno, neon_request_lsns *request_lsns, + void **buffers, BlockNumber nblocks, const bits8 *mask) #endif { NeonResponse *resp; uint64 ring_index; PrfHashEntry *entry; PrefetchRequest *slot; - BufferTag buftag = - { - .forkNum = forkNum, - .blockNum = blkno, - }; + BufferTag buftag = {0}; + + Assert(PointerIsValid(request_lsns)); + Assert(nblocks >= 1); CopyNRelFileInfoToBufTag(buftag, rinfo); + buftag.forkNum = forkNum; + buftag.blockNum = base_blockno; /* * The redo process does not lock pages that it needs to replay but are @@ -2365,115 +2747,147 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, * weren't for the behaviour of the LwLsn cache that uses the highest * value of the LwLsn cache when the entry is not found. */ - if (RecoveryInProgress() && !(MyBackendType == B_STARTUP)) - XLogWaitForReplayOf(request_lsns.request_lsn); + prefetch_register_bufferv(buftag, request_lsns, nblocks, mask); - /* - * Try to find prefetched page in the list of received pages. - */ + for (int i = 0; i < nblocks; i++) + { + void *buffer = buffers[i]; + BlockNumber blockno = base_blockno + i; + neon_request_lsns *reqlsns = &request_lsns[i]; + + if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i)) + continue; + + if (RecoveryInProgress() && MyBackendType != B_STARTUP) + XLogWaitForReplayOf(reqlsns[0].request_lsn); + + /* + * Try to find prefetched page in the list of received pages. + */ Retry: - entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag); + buftag.blockNum = blockno; + entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag); - if (entry != NULL) - { - slot = entry->slot; - if (neon_prefetch_response_usable(request_lsns, slot)) + if (entry != NULL) { - ring_index = slot->my_ring_index; - pgBufferUsage.prefetch.hits += 1; - } - else - { - /* - * Cannot use this prefetch, discard it - * - * We can't drop cache for not-yet-received requested items. It is - * unlikely this happens, but it can happen if prefetch distance - * is large enough and a backend didn't consume all prefetch - * requests. - */ - if (slot->status == PRFS_REQUESTED) + slot = entry->slot; + if (neon_prefetch_response_usable(reqlsns, slot)) { - if (!prefetch_wait_for(slot->my_ring_index)) - goto Retry; + ring_index = slot->my_ring_index; + pgBufferUsage.prefetch.hits += 1; + } + else + { + /* + * Cannot use this prefetch, discard it + * + * We can't drop cache for not-yet-received requested items. It is + * unlikely this happens, but it can happen if prefetch distance + * is large enough and a backend didn't consume all prefetch + * requests. 
+ */ + if (slot->status == PRFS_REQUESTED) + { + if (!prefetch_wait_for(slot->my_ring_index)) + goto Retry; + } + /* drop caches */ + prefetch_set_unused(slot->my_ring_index); + pgBufferUsage.prefetch.expired += 1; + /* make it look like a prefetch cache miss */ + entry = NULL; } - /* drop caches */ - prefetch_set_unused(slot->my_ring_index); - pgBufferUsage.prefetch.expired += 1; - /* make it look like a prefetch cache miss */ - entry = NULL; } - } - do - { - if (entry == NULL) + do { - pgBufferUsage.prefetch.misses += 1; + if (entry == NULL) + { + pgBufferUsage.prefetch.misses += 1; - ring_index = prefetch_register_buffer(buftag, &request_lsns); - slot = GetPrfSlot(ring_index); - } - else + ring_index = prefetch_register_bufferv(buftag, reqlsns, 1, NULL); + Assert(ring_index != UINT64_MAX); + slot = GetPrfSlot(ring_index); + } + else + { + /* + * Empty our reference to the prefetch buffer's hash entry. When + * we wait for prefetches, the entry reference is invalidated by + * potential updates to the hash, and when we reconnect to the + * pageserver the prefetch we're waiting for may be dropped, in + * which case we need to retry and take the branch above. + */ + entry = NULL; + } + + Assert(slot->my_ring_index == ring_index); + Assert(MyPState->ring_last <= ring_index && + MyPState->ring_unused > ring_index); + Assert(slot->status != PRFS_UNUSED); + Assert(GetPrfSlot(ring_index) == slot); + + } while (!prefetch_wait_for(ring_index)); + + Assert(slot->status == PRFS_RECEIVED); + Assert(memcmp(&buftag, &slot->buftag, sizeof(BufferTag)) == 0); + Assert(buftag.blockNum == base_blockno + i); + + resp = slot->response; + + switch (resp->tag) { - /* - * Empty our reference to the prefetch buffer's hash entry. When - * we wait for prefetches, the entry reference is invalidated by - * potential updates to the hash, and when we reconnect to the - * pageserver the prefetch we're waiting for may be dropped, in - * which case we need to retry and take the branch above. 
- */ - entry = NULL; + case T_NeonGetPageResponse: + memcpy(buffer, ((NeonGetPageResponse *) resp)->page, BLCKSZ); + lfc_write(rinfo, forkNum, blockno, buffer); + break; + + case T_NeonErrorResponse: + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "[shard %d] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", + slot->shard_no, blockno, RelFileInfoFmt(rinfo), + forkNum, LSN_FORMAT_ARGS(reqlsns->effective_request_lsn)), + errdetail("page server returned error: %s", + ((NeonErrorResponse *) resp)->message))); + break; + default: + NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC, + "Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x", + T_NeonGetPageResponse, T_NeonErrorResponse, resp->tag); } - Assert(slot->my_ring_index == ring_index); - Assert(MyPState->ring_last <= ring_index && - MyPState->ring_unused > ring_index); - Assert(slot->status != PRFS_UNUSED); - Assert(GetPrfSlot(ring_index) == slot); - - } while (!prefetch_wait_for(ring_index)); - - Assert(slot->status == PRFS_RECEIVED); - - resp = slot->response; - - switch (resp->tag) - { - case T_NeonGetPageResponse: - memcpy(buffer, ((NeonGetPageResponse *) resp)->page, BLCKSZ); - lfc_write(rinfo, forkNum, blkno, buffer); - break; - - case T_NeonErrorResponse: - ereport(ERROR, - (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "[shard %d] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", - slot->shard_no, blkno, - RelFileInfoFmt(rinfo), - forkNum, - LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), - errdetail("page server returned error: %s", - ((NeonErrorResponse *) resp)->message))); - break; - default: - NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC, - "Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x", - T_NeonGetPageResponse, T_NeonErrorResponse, resp->tag); + /* buffer was used, clean up for later reuse */ + prefetch_set_unused(ring_index); + prefetch_cleanup_trailing_unused(); } - - /* buffer was used, clean up for later reuse */ - prefetch_set_unused(ring_index); - prefetch_cleanup_trailing_unused(); } /* - * neon_read() -- Read the specified block from a relation. + * While function is defined in the neon extension it's used within neon_test_utils directly. + * To avoid breaking tests in the runtime please keep function signature in sync. */ void #if PG_MAJORVERSION_NUM < 16 +neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + neon_request_lsns request_lsns, char *buffer) +#else +neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + neon_request_lsns request_lsns, void *buffer) +#endif +{ + neon_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL); +} + +#if PG_MAJORVERSION_NUM < 17 +/* + * neon_read() -- Read the specified block from a relation. 
+ */ +#if PG_MAJORVERSION_NUM < 16 +void neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer) #else +void neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer) #endif { @@ -2502,7 +2916,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer return; } - request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno); + neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, NULL); neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer); #ifdef DEBUG_COMPARE_LOCAL @@ -2578,6 +2992,148 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer } #endif } +#endif /* PG_MAJORVERSION_NUM <= 16 */ + +#if PG_MAJORVERSION_NUM >= 17 +void +neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + void **buffers, BlockNumber nblocks) +{ + bits8 read[PG_IOV_MAX / 8]; + neon_request_lsns request_lsns[PG_IOV_MAX]; + int lfc_result; + + switch (reln->smgr_relpersistence) + { + case 0: + neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence"); + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdreadv(reln, forknum, blocknum, buffers, nblocks); + return; + + default: + neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + if (nblocks > PG_IOV_MAX) + neon_log(ERROR, "Read request too large: %d is larger than max %d", + nblocks, PG_IOV_MAX); + + memset(read, 0, sizeof(read)); + + /* Try to read from local file cache */ + lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers, + nblocks, read); + + /* Read all blocks from LFC, so we're done */ + if (lfc_result == nblocks) + return; + + if (lfc_result == -1) + { + /* can't use the LFC result, so read all blocks from PS */ + for (int i = 0; i < PG_IOV_MAX / 8; i++) + read[i] = 0xFF; + } + else + { + /* invert the result: exclude blocks read from lfc */ + for (int i = 0; i < PG_IOV_MAX / 8; i++) + read[i] = ~(read[i]); + } + + neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum, + request_lsns, nblocks, read); + + neon_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, + buffers, nblocks, read); + +#ifdef DEBUG_COMPARE_LOCAL + if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) + { + char pageserver_masked[BLCKSZ]; + char mdbuf[BLCKSZ]; + char mdbuf_masked[BLCKSZ]; + + for (int i = 0; i < nblocks; i++) + { +#if PG_MAJORVERSION_NUM >= 17 + mdreadv(reln, forkNum, blkno + i, &mdbuf, 1); +#else + mdread(reln, forkNum, blkno + i, mdbuf); +#endif + + memcpy(pageserver_masked, buffer, BLCKSZ); + memcpy(mdbuf_masked, mdbuf, BLCKSZ); + + if (PageIsNew((Page) mdbuf)) + { + if (!PageIsNew((Page) pageserver_masked)) + { + neon_log(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(buffer)); + } + } + else if (PageIsNew((Page) buffer)) + { + neon_log(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(mdbuf)); + } + else if (PageGetSpecialSize(mdbuf) == 0) + { + /* assume heap */ + RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked, blkno); + 
RmgrTable[RM_HEAP_ID].rm_mask(pageserver_masked, blkno); + + if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) + { + neon_log(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(mdbuf_masked), + hexdump_page(pageserver_masked)); + } + } + else if (PageGetSpecialSize(mdbuf) == MAXALIGN(sizeof(BTPageOpaqueData))) + { + if (((BTPageOpaqueData *) PageGetSpecialPointer(mdbuf))->btpo_cycleid < MAX_BT_CYCLE_ID) + { + /* assume btree */ + RmgrTable[RM_BTREE_ID].rm_mask(mdbuf_masked, blkno); + RmgrTable[RM_BTREE_ID].rm_mask(pageserver_masked, blkno); + + if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) + { + neon_log(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(mdbuf_masked), + hexdump_page(pageserver_masked)); + } + } + } + } + } +#endif +} +#endif #ifdef DEBUG_COMPARE_LOCAL static char * @@ -2623,7 +3179,72 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo if (mdexists(reln, forknum)) { /* It exists locally. Guess it's unlogged then. */ +#if PG_MAJORVERSION_NUM >= 17 + mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync); +#else mdwrite(reln, forknum, blocknum, buffer, skipFsync); +#endif + /* + * We could set relpersistence now that we have determined + * that it's local. But we don't dare to do it, because that + * would immediately allow reads as well, which shouldn't + * happen. We could cache it with a different 'relpersistence' + * value, but this isn't performance critical. + */ + return; + } + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + #if PG_MAJORVERSION_NUM >= 17 + mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync); + #else + mdwrite(reln, forknum, blocknum, buffer, skipFsync); + #endif + return; + default: + neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + neon_wallog_page(reln, forknum, blocknum, buffer, false); + + lsn = PageGetLSN((Page) buffer); + neon_log(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum, blocknum, + (uint32) (lsn >> 32), (uint32) lsn); + + lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer); + +#ifdef DEBUG_COMPARE_LOCAL + if (IS_LOCAL_REL(reln)) + #if PG_MAJORVERSION_NUM >= 17 + mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync); + #else + mdwrite(reln, forknum, blocknum, buffer, skipFsync); + #endif +#endif +} + + + +#if PG_MAJORVERSION_NUM >= 17 +void +neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, + const void **buffers, BlockNumber nblocks, bool skipFsync) +{ + switch (reln->smgr_relpersistence) + { + case 0: + /* This is a bit tricky. Check if the relation exists locally */ + if (mdexists(reln, forknum)) + { + /* It exists locally. Guess it's unlogged then. 
*/ + mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync); /* * We could set relpersistence now that we have determined @@ -2641,29 +3262,24 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: - mdwrite(reln, forknum, blocknum, buffer, skipFsync); + mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync); return; - default: neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - neon_wallog_page(reln, forknum, blocknum, buffer, false); + neon_wallog_pagev(reln, forknum, blkno, nblocks, (const char **) buffers, false); - lsn = PageGetLSN((Page) buffer); - neon_log(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, blocknum, - (uint32) (lsn >> 32), (uint32) lsn); - - lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer); + lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) - mdwrite(reln, forknum, blocknum, buffer, skipFsync); + mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync); #endif } +#endif + /* * neon_nblocks() -- Get the number of blocks stored in a relation. */ @@ -2699,7 +3315,9 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) return n_blocks; } - request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO); + neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, + REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1, NULL); + { NeonNblocksRequest request = { .req.tag = T_NeonNblocksRequest, @@ -2757,7 +3375,9 @@ neon_dbsize(Oid dbNode) neon_request_lsns request_lsns; NRelFileInfo dummy_node = {0}; - request_lsns = neon_get_request_lsns(dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO); + neon_get_request_lsns(dummy_node, MAIN_FORKNUM, + REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1, NULL); + { NeonDbSizeRequest request = { .req.tag = T_NeonDbSizeRequest, @@ -2898,6 +3518,38 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum) #endif } +#if PG_MAJORVERSION_NUM >= 17 +void +neon_regisersync(SMgrRelation reln, ForkNumber forknum) +{ + switch (reln->smgr_relpersistence) + { + case 0: + neon_log(ERROR, "cannot call smgrregistersync() on rel with unknown persistence"); + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdregistersync(reln, forknum); + return; + + default: + neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + neon_log(SmgrTrace, "[NEON_SMGR] registersync noop"); + +#ifdef DEBUG_COMPARE_LOCAL + if (IS_LOCAL_REL(reln)) + mdimmedsync(reln, forknum); +#endif +} +#endif + + /* * neon_start_unlogged_build() -- Starting build operation on a rel. 
* @@ -3047,8 +3699,11 @@ neon_end_unlogged_build(SMgrRelation reln) static int neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buffer) { - XLogRecPtr request_lsn, - not_modified_since; + XLogRecPtr request_lsn, + not_modified_since; + SlruKind kind; + int n_blocks; + shardno_t shard_no = 0; /* All SLRUs are at shard 0 */ /* * Compute a request LSN to use, similar to neon_get_request_lsns() but the @@ -3078,32 +3733,30 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf */ not_modified_since = nm_adjust_lsn(GetRedoStartLsn()); - SlruKind kind; - - if (STRPREFIX(path, "pg_xact")) - kind = SLRU_CLOG; - else if (STRPREFIX(path, "pg_multixact/members")) - kind = SLRU_MULTIXACT_MEMBERS; - else if (STRPREFIX(path, "pg_multixact/offsets")) - kind = SLRU_MULTIXACT_OFFSETS; - else - return -1; + if (STRPREFIX(path, "pg_xact")) + kind = SLRU_CLOG; + else if (STRPREFIX(path, "pg_multixact/members")) + kind = SLRU_MULTIXACT_MEMBERS; + else if (STRPREFIX(path, "pg_multixact/offsets")) + kind = SLRU_MULTIXACT_OFFSETS; + else + return -1; NeonResponse *resp; NeonGetSlruSegmentRequest request = { .req.tag = T_NeonGetSlruSegmentRequest, .req.lsn = request_lsn, .req.not_modified_since = not_modified_since, - .kind = kind, .segno = segno }; - int n_blocks; - shardno_t shard_no = 0; /* All SLRUs are at shard 0 */ + do { while (!page_server->send(shard_no, &request.req) || !page_server->flush(shard_no)); + consume_prefetch_responses(); + resp = page_server->receive(shard_no); } while (resp == NULL); @@ -3182,14 +3835,23 @@ static const struct f_smgr neon_smgr = #if PG_MAJORVERSION_NUM >= 16 .smgr_zeroextend = neon_zeroextend, #endif +#if PG_MAJORVERSION_NUM >= 17 + .smgr_prefetch = neon_prefetch, + .smgr_readv = neon_readv, + .smgr_writev = neon_writev, +#else .smgr_prefetch = neon_prefetch, .smgr_read = neon_read, .smgr_write = neon_write, +#endif + .smgr_writeback = neon_writeback, .smgr_nblocks = neon_nblocks, .smgr_truncate = neon_truncate, .smgr_immedsync = neon_immedsync, - +#if PG_MAJORVERSION_NUM >= 17 + .smgr_registersync = neon_regisersync, +#endif .smgr_start_unlogged_build = neon_start_unlogged_build, .smgr_finish_unlogged_build_phase_1 = neon_finish_unlogged_build_phase_1, .smgr_end_unlogged_build = neon_end_unlogged_build, @@ -3198,11 +3860,11 @@ static const struct f_smgr neon_smgr = }; const f_smgr * -smgr_neon(BackendId backend, NRelFileInfo rinfo) +smgr_neon(ProcNumber backend, NRelFileInfo rinfo) { /* Don't use page server for temp relations */ - if (backend != InvalidBackendId) + if (backend != INVALID_PROC_NUMBER) return smgr_standard(backend, rinfo); else return &neon_smgr; diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 65ef588ba5..4d0d06e6de 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -81,6 +81,7 @@ static void nwp_register_gucs(void); static void assign_neon_safekeepers(const char *newval, void *extra); static void nwp_prepare_shmem(void); static uint64 backpressure_lag_impl(void); +static uint64 startup_backpressure_wrap(void); static bool backpressure_throttling_impl(void); static void walprop_register_bgworker(void); @@ -90,7 +91,7 @@ static void walprop_pg_init_bgworker(void); static TimestampTz walprop_pg_get_current_timestamp(WalProposer *wp); static void walprop_pg_load_libpqwalreceiver(void); -static process_interrupts_callback_t PrevProcessInterruptsCallback; +static process_interrupts_callback_t PrevProcessInterruptsCallback = NULL; static 
shmem_startup_hook_type prev_shmem_startup_hook_type; #if PG_VERSION_NUM >= 150000 static shmem_request_hook_type prev_shmem_request_hook = NULL; @@ -178,7 +179,7 @@ pg_init_walproposer(void) nwp_prepare_shmem(); - delay_backend_us = &backpressure_lag_impl; + delay_backend_us = &startup_backpressure_wrap; PrevProcessInterruptsCallback = ProcessInterruptsCallback; ProcessInterruptsCallback = backpressure_throttling_impl; @@ -352,6 +353,22 @@ backpressure_lag_impl(void) return 0; } +/* + * We don't apply backpressure when we're the postmaster, or the startup + * process, because in postmaster we can't apply backpressure, and in + * the startup process we can't afford to slow down. + */ +static uint64 +startup_backpressure_wrap(void) +{ + if (AmStartupProcess() || !IsUnderPostmaster) + return 0; + + delay_backend_us = &backpressure_lag_impl; + + return backpressure_lag_impl(); +} + /* * WalproposerShmemSize --- report amount of shared memory space needed */ @@ -401,12 +418,13 @@ WalproposerShmemInit_SyncSafekeeper(void) static bool backpressure_throttling_impl(void) { - int64 lag; + uint64 lag; TimestampTz start, stop; - bool retry = PrevProcessInterruptsCallback - ? PrevProcessInterruptsCallback() - : false; + bool retry = false; + + if (PointerIsValid(PrevProcessInterruptsCallback)) + retry = PrevProcessInterruptsCallback(); /* * Don't throttle read only transactions or wal sender. Do throttle CREATE @@ -602,7 +620,12 @@ walprop_pg_init_walsender(void) /* Create replication slot for WAL proposer if not exists */ if (SearchNamedReplicationSlot(WAL_PROPOSER_SLOT_NAME, false) == NULL) { +#if PG_MAJORVERSION_NUM >= 17 + ReplicationSlotCreate(WAL_PROPOSER_SLOT_NAME, false, RS_PERSISTENT, + false, false, false); +#else ReplicationSlotCreate(WAL_PROPOSER_SLOT_NAME, false, RS_PERSISTENT, false); +#endif ReplicationSlotReserveWal(); /* Write this slot to disk */ ReplicationSlotMarkDirty(); @@ -1509,7 +1532,11 @@ walprop_pg_init_event_set(WalProposer *wp) wpg_log(FATAL, "double-initialization of event set"); /* for each sk, we have socket plus potentially socket for neon walreader */ +#if PG_MAJORVERSION_NUM >= 17 + waitEvents = CreateWaitEventSet(NULL, 2 + 2 * wp->n_safekeepers); +#else waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + 2 * wp->n_safekeepers); +#endif AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL); AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, diff --git a/pgxn/neon_rmgr/neon_rmgr_decode.c b/pgxn/neon_rmgr/neon_rmgr_decode.c index f327e132e9..66032c88f6 100644 --- a/pgxn/neon_rmgr/neon_rmgr_decode.c +++ b/pgxn/neon_rmgr/neon_rmgr_decode.c @@ -1,6 +1,7 @@ #include "postgres.h" #if PG_MAJORVERSION_NUM >= 16 + #include "access/heapam_xlog.h" #include "access/neon_xlog.h" #include "replication/decode.h" @@ -9,6 +10,10 @@ #include "neon_rmgr.h" +#endif /* PG >= 16 */ + +#if PG_MAJORVERSION_NUM == 16 + /* individual record(group)'s handlers */ static void DecodeNeonInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); static void DecodeNeonUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); @@ -399,6 +404,398 @@ DecodeXLogTuple(char *data, Size len, ReorderBufferTupleBuf *tuple) header->t_infomask2 = xlhdr.t_infomask2; header->t_hoff = xlhdr.t_hoff; } +#endif + +#if PG_MAJORVERSION_NUM == 17 + +/* individual record(group)'s handlers */ +static void DecodeNeonInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeNeonUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void 
DecodeNeonDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeNeonMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); + +/* common function to decode tuples */ +static void DecodeXLogTuple(char *data, Size len, HeapTuple tuple); -#endif \ No newline at end of file +void +neon_rm_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + uint8 info = XLogRecGetInfo(buf->record) & XLOG_NEON_OPMASK; + TransactionId xid = XLogRecGetXid(buf->record); + SnapBuild *builder = ctx->snapshot_builder; + + ReorderBufferProcessXid(ctx->reorder, xid, buf->origptr); + + /* + * If we don't have snapshot or we are just fast-forwarding, there is no + * point in decoding data changes. + */ + if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT || + ctx->fast_forward) + return; + + switch (info) + { + case XLOG_NEON_HEAP_INSERT: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeNeonInsert(ctx, buf); + break; + case XLOG_NEON_HEAP_DELETE: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeNeonDelete(ctx, buf); + break; + case XLOG_NEON_HEAP_UPDATE: + case XLOG_NEON_HEAP_HOT_UPDATE: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeNeonUpdate(ctx, buf); + break; + case XLOG_NEON_HEAP_LOCK: + break; + case XLOG_NEON_HEAP_MULTI_INSERT: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeNeonMultiInsert(ctx, buf); + break; + default: + elog(ERROR, "unexpected RM_HEAP_ID record type: %u", info); + break; + } +} + +static inline bool +FilterByOrigin(LogicalDecodingContext *ctx, RepOriginId origin_id) +{ + if (ctx->callbacks.filter_by_origin_cb == NULL) + return false; + + return filter_by_origin_cb_wrapper(ctx, origin_id); +} + +/* + * Parse XLOG_HEAP_INSERT (not MULTI_INSERT!) records into tuplebufs. + * + * Deletes can contain the new tuple. + */ +static void +DecodeNeonInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + Size datalen; + char *tupledata; + Size tuplelen; + XLogReaderState *r = buf->record; + xl_neon_heap_insert *xlrec; + ReorderBufferChange *change; + RelFileLocator target_locator; + + xlrec = (xl_neon_heap_insert *) XLogRecGetData(r); + + /* + * Ignore insert records without new tuples (this does happen when + * raw_heap_insert marks the TOAST record as HEAP_INSERT_NO_LOGICAL). + */ + if (!(xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE)) + return; + + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL); + if (target_locator.dbOid != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + change = ReorderBufferGetChange(ctx->reorder); + if (!(xlrec->flags & XLH_INSERT_IS_SPECULATIVE)) + change->action = REORDER_BUFFER_CHANGE_INSERT; + else + change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT; + change->origin_id = XLogRecGetOrigin(r); + + memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator)); + + tupledata = XLogRecGetBlockData(r, 0, &datalen); + tuplelen = datalen - SizeOfHeapHeader; + + change->data.tp.newtuple = + ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); + + DecodeXLogTuple(tupledata, datalen, change->data.tp.newtuple); + + change->data.tp.clear_toast_afterwards = true; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, + xlrec->flags & XLH_INSERT_ON_TOAST_RELATION); +} + +/* + * Parse XLOG_HEAP_DELETE from wal into proper tuplebufs. 
+ * + * Deletes can possibly contain the old primary key. + */ +static void +DecodeNeonDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + XLogReaderState *r = buf->record; + xl_neon_heap_delete *xlrec; + ReorderBufferChange *change; + RelFileLocator target_locator; + + xlrec = (xl_neon_heap_delete *) XLogRecGetData(r); + + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL); + if (target_locator.dbOid != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + change = ReorderBufferGetChange(ctx->reorder); + + if (xlrec->flags & XLH_DELETE_IS_SUPER) + change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT; + else + change->action = REORDER_BUFFER_CHANGE_DELETE; + + change->origin_id = XLogRecGetOrigin(r); + + memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator)); + + /* old primary key stored */ + if (xlrec->flags & XLH_DELETE_CONTAINS_OLD) + { + Size datalen = XLogRecGetDataLen(r) - SizeOfNeonHeapHeader; + Size tuplelen = datalen - SizeOfNeonHeapHeader; + + Assert(XLogRecGetDataLen(r) > (SizeOfNeonHeapDelete + SizeOfNeonHeapHeader)); + + change->data.tp.oldtuple = + ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); + + DecodeXLogTuple((char *) xlrec + SizeOfNeonHeapDelete, + datalen, change->data.tp.oldtuple); + } + + change->data.tp.clear_toast_afterwards = true; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, false); +} + +/* + * Parse XLOG_HEAP_UPDATE and XLOG_HEAP_HOT_UPDATE, which have the same layout + * in the record, from wal into proper tuplebufs. + * + * Updates can possibly contain a new tuple and the old primary key. 
+ */ +static void +DecodeNeonUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + XLogReaderState *r = buf->record; + xl_neon_heap_update *xlrec; + ReorderBufferChange *change; + char *data; + RelFileLocator target_locator; + + xlrec = (xl_neon_heap_update *) XLogRecGetData(r); + + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL); + if (target_locator.dbOid != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + change = ReorderBufferGetChange(ctx->reorder); + change->action = REORDER_BUFFER_CHANGE_UPDATE; + change->origin_id = XLogRecGetOrigin(r); + memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator)); + + if (xlrec->flags & XLH_UPDATE_CONTAINS_NEW_TUPLE) + { + Size datalen; + Size tuplelen; + + data = XLogRecGetBlockData(r, 0, &datalen); + + tuplelen = datalen - SizeOfNeonHeapHeader; + + change->data.tp.newtuple = + ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); + + DecodeXLogTuple(data, datalen, change->data.tp.newtuple); + } + + if (xlrec->flags & XLH_UPDATE_CONTAINS_OLD) + { + Size datalen; + Size tuplelen; + + /* caution, remaining data in record is not aligned */ + data = XLogRecGetData(r) + SizeOfNeonHeapUpdate; + datalen = XLogRecGetDataLen(r) - SizeOfNeonHeapUpdate; + tuplelen = datalen - SizeOfNeonHeapHeader; + + change->data.tp.oldtuple = + ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); + + DecodeXLogTuple(data, datalen, change->data.tp.oldtuple); + } + + change->data.tp.clear_toast_afterwards = true; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, false); +} + +/* + * Decode XLOG_HEAP2_MULTI_INSERT_insert record into multiple tuplebufs. + * + * Currently MULTI_INSERT will always contain the full tuples. + */ +static void +DecodeNeonMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + XLogReaderState *r = buf->record; + xl_neon_heap_multi_insert *xlrec; + int i; + char *data; + char *tupledata; + Size tuplelen; + RelFileLocator rlocator; + + xlrec = (xl_neon_heap_multi_insert *) XLogRecGetData(r); + + /* + * Ignore insert records without new tuples. This happens when a + * multi_insert is done on a catalog or on a non-persistent relation. + */ + if (!(xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE)) + return; + + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &rlocator, NULL, NULL); + if (rlocator.dbOid != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + /* + * We know that this multi_insert isn't for a catalog, so the block should + * always have data even if a full-page write of it is taken. 
+ */ + tupledata = XLogRecGetBlockData(r, 0, &tuplelen); + Assert(tupledata != NULL); + + data = tupledata; + for (i = 0; i < xlrec->ntuples; i++) + { + ReorderBufferChange *change; + xl_neon_multi_insert_tuple *xlhdr; + int datalen; + HeapTuple tuple; + HeapTupleHeader header; + + change = ReorderBufferGetChange(ctx->reorder); + change->action = REORDER_BUFFER_CHANGE_INSERT; + change->origin_id = XLogRecGetOrigin(r); + + memcpy(&change->data.tp.rlocator, &rlocator, sizeof(RelFileLocator)); + + xlhdr = (xl_neon_multi_insert_tuple *) SHORTALIGN(data); + data = ((char *) xlhdr) + SizeOfNeonMultiInsertTuple; + datalen = xlhdr->datalen; + + change->data.tp.newtuple = + ReorderBufferGetTupleBuf(ctx->reorder, datalen); + + tuple = change->data.tp.newtuple; + header = tuple->t_data; + + /* not a disk based tuple */ + ItemPointerSetInvalid(&tuple->t_self); + + /* + * We can only figure this out after reassembling the transactions. + */ + tuple->t_tableOid = InvalidOid; + + tuple->t_len = datalen + SizeofHeapTupleHeader; + + memset(header, 0, SizeofHeapTupleHeader); + + memcpy((char *) tuple->t_data + SizeofHeapTupleHeader, + (char *) data, + datalen); + header->t_infomask = xlhdr->t_infomask; + header->t_infomask2 = xlhdr->t_infomask2; + header->t_hoff = xlhdr->t_hoff; + + /* + * Reset toast reassembly state only after the last row in the last + * xl_multi_insert_tuple record emitted by one heap_multi_insert() + * call. + */ + if (xlrec->flags & XLH_INSERT_LAST_IN_MULTI && + (i + 1) == xlrec->ntuples) + change->data.tp.clear_toast_afterwards = true; + else + change->data.tp.clear_toast_afterwards = false; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), + buf->origptr, change, false); + + /* move to the next xl_neon_multi_insert_tuple entry */ + data += datalen; + } + Assert(data == tupledata + tuplelen); +} + +/* + * Read a HeapTuple as WAL logged by heap_insert, heap_update and heap_delete + * (but not by heap_multi_insert) into a tuplebuf. + * + * The size 'len' and the pointer 'data' in the record need to be + * computed outside as they are record specific. 
+ */ +static void +DecodeXLogTuple(char *data, Size len, HeapTuple tuple) +{ + xl_neon_heap_header xlhdr; + int datalen = len - SizeOfNeonHeapHeader; + HeapTupleHeader header; + + Assert(datalen >= 0); + + tuple->t_len = datalen + SizeofHeapTupleHeader; + header = tuple->t_data; + + /* not a disk based tuple */ + ItemPointerSetInvalid(&tuple->t_self); + + /* we can only figure this out after reassembling the transactions */ + tuple->t_tableOid = InvalidOid; + + /* data is not stored aligned, copy to aligned storage */ + memcpy((char *) &xlhdr, + data, + SizeOfNeonHeapHeader); + + memset(header, 0, SizeofHeapTupleHeader); + + memcpy(((char *) tuple->t_data) + SizeofHeapTupleHeader, + data + SizeOfNeonHeapHeader, + datalen); + + header->t_infomask = xlhdr.t_infomask; + header->t_infomask2 = xlhdr.t_infomask2; + header->t_hoff = xlhdr.t_hoff; +} +#endif diff --git a/pgxn/neon_walredo/inmem_smgr.c b/pgxn/neon_walredo/inmem_smgr.c index 4e604a710c..a45e8f5c4a 100644 --- a/pgxn/neon_walredo/inmem_smgr.c +++ b/pgxn/neon_walredo/inmem_smgr.c @@ -68,8 +68,13 @@ static void inmem_close(SMgrRelation reln, ForkNumber forknum); static void inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo); static bool inmem_exists(SMgrRelation reln, ForkNumber forknum); static void inmem_unlink(NRelFileInfoBackend rinfo, ForkNumber forknum, bool isRedo); +#if PG_MAJORVERSION_NUM >= 17 +static bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks); +#else static bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); +#endif #if PG_MAJORVERSION_NUM < 16 static void inmem_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); @@ -93,7 +98,9 @@ static BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum); static void inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); static void inmem_immedsync(SMgrRelation reln, ForkNumber forknum); - +#if PG_MAJORVERSION_NUM >= 17 +static void inmem_registersync(SMgrRelation reln, ForkNumber forknum); +#endif /* * inmem_init() -- Initialize private state @@ -190,6 +197,14 @@ inmem_close(SMgrRelation reln, ForkNumber forknum) { } +#if PG_MAJORVERSION_NUM >= 17 +static bool +inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks) +{ + return true; +} +#else /* * inmem_prefetch() -- Initiate asynchronous read of the specified block of a relation */ @@ -198,6 +213,7 @@ inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) { return true; } +#endif /* * inmem_writeback() -- Tell the kernel to write pages back to storage. @@ -211,11 +227,13 @@ inmem_writeback(SMgrRelation reln, ForkNumber forknum, /* * inmem_read() -- Read the specified block from a relation. 
*/ +#if PG_MAJORVERSION_NUM < 16 static void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, -#if PG_MAJORVERSION_NUM < 16 char *buffer) #else +static void +inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, void *buffer) #endif { @@ -228,6 +246,18 @@ inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, memcpy(buffer, page_body[pg], BLCKSZ); } +#if PG_MAJORVERSION_NUM >= 17 +static void +inmem_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, + void **buffers, BlockNumber nblocks) +{ + for (int i = 0; i < nblocks; i++) + { + inmem_read(reln, forknum, blkno, buffers[i]); + } +} +#endif + /* * inmem_write() -- Write the supplied block at the appropriate location. * @@ -280,6 +310,18 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, memcpy(page_body[pg], buffer, BLCKSZ); } +#if PG_MAJORVERSION_NUM >= 17 +static void +inmem_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, + const void **buffers, BlockNumber nblocks, bool skipFsync) +{ + for (int i = 0; i < nblocks; i++) + { + inmem_write(reln, forknum, blkno, buffers[i], skipFsync); + } +} +#endif + /* * inmem_nblocks() -- Get the number of blocks stored in a relation. */ @@ -315,6 +357,13 @@ inmem_immedsync(SMgrRelation reln, ForkNumber forknum) { } +#if PG_MAJORVERSION_NUM >= 17 +static void +inmem_registersync(SMgrRelation reln, ForkNumber forknum) +{ +} +#endif + static const struct f_smgr inmem_smgr = { .smgr_init = inmem_init, @@ -328,23 +377,39 @@ static const struct f_smgr inmem_smgr = #if PG_MAJORVERSION_NUM >= 16 .smgr_zeroextend = inmem_zeroextend, #endif +#if PG_MAJORVERSION_NUM >= 17 + .smgr_prefetch = inmem_prefetch, + .smgr_readv = inmem_readv, + .smgr_writev = inmem_writev, +#else .smgr_prefetch = inmem_prefetch, .smgr_read = inmem_read, .smgr_write = inmem_write, +#endif .smgr_writeback = inmem_writeback, .smgr_nblocks = inmem_nblocks, .smgr_truncate = inmem_truncate, .smgr_immedsync = inmem_immedsync, + +#if PG_MAJORVERSION_NUM >= 17 + .smgr_registersync = inmem_registersync, +#endif + + .smgr_start_unlogged_build = NULL, + .smgr_finish_unlogged_build_phase_1 = NULL, + .smgr_end_unlogged_build = NULL, + .smgr_read_slru_segment = NULL, }; const f_smgr * -smgr_inmem(BackendId backend, NRelFileInfo rinfo) +smgr_inmem(ProcNumber backend, NRelFileInfo rinfo) { Assert(InRecovery); - if (backend != InvalidBackendId) - return smgr_standard(backend, rinfo); - else - return &inmem_smgr; + // // What does this code do? 
+ // if (backend != INVALID_PROC_NUMBER) + // return smgr_standard(backend, rinfo); + // else + return &inmem_smgr; } void diff --git a/pgxn/neon_walredo/inmem_smgr.h b/pgxn/neon_walredo/inmem_smgr.h index 58b98b8e6a..91f1c80965 100644 --- a/pgxn/neon_walredo/inmem_smgr.h +++ b/pgxn/neon_walredo/inmem_smgr.h @@ -11,7 +11,7 @@ #ifndef INMEM_SMGR_H #define INMEM_SMGR_H -extern const f_smgr *smgr_inmem(BackendId backend, NRelFileInfo rinfo); +extern const f_smgr *smgr_inmem(ProcNumber backend, NRelFileInfo rinfo); extern void smgr_init_inmem(void); #endif /* INMEM_SMGR_H */ diff --git a/pgxn/neon_walredo/walredoproc.c b/pgxn/neon_walredo/walredoproc.c index cc545393f5..219ca85207 100644 --- a/pgxn/neon_walredo/walredoproc.c +++ b/pgxn/neon_walredo/walredoproc.c @@ -100,6 +100,9 @@ #include "storage/buf_internals.h" #include "storage/bufmgr.h" #include "storage/dsm.h" +#if PG_MAJORVERSION_NUM >= 17 +#include "storage/dsm_registry.h" +#endif #include "storage/ipc.h" #include "storage/pg_shmem.h" #include "storage/pmsignal.h" @@ -137,7 +140,7 @@ static BufferTag target_redo_tag; static XLogReaderState *reader_state; -#define TRACE DEBUG5 +#define TRACE LOG #ifdef HAVE_LIBSECCOMP @@ -517,6 +520,10 @@ CreateFakeSharedMemoryAndSemaphores() /* * Set up xlog, clog, and buffers */ +#if PG_MAJORVERSION_NUM >= 17 + DSMRegistryShmemInit(); + VarsupShmemInit(); +#endif XLOGShmemInit(); CLOGShmemInit(); CommitTsShmemInit(); @@ -566,7 +573,10 @@ CreateFakeSharedMemoryAndSemaphores() /* * Set up other modules that need some shared memory space */ +#if PG_MAJORVERSION_NUM < 17 + /* "snapshot too old" was removed in PG17, and with it the SnapMgr */ SnapMgrInit(); +#endif BTreeShmemInit(); SyncScanShmemInit(); /* Skip due to the 'pg_notify' directory check */ @@ -742,7 +752,7 @@ BeginRedoForBlock(StringInfo input_message) target_redo_tag.forkNum, target_redo_tag.blockNum); - reln = smgropen(rinfo, InvalidBackendId, RELPERSISTENCE_PERMANENT); + reln = smgropen(rinfo, INVALID_PROC_NUMBER, RELPERSISTENCE_PERMANENT); if (reln->smgr_cached_nblocks[forknum] == InvalidBlockNumber || reln->smgr_cached_nblocks[forknum] < blknum + 1) { diff --git a/test_runner/fixtures/common_types.py b/test_runner/fixtures/common_types.py index 064a678c96..d8390138c9 100644 --- a/test_runner/fixtures/common_types.py +++ b/test_runner/fixtures/common_types.py @@ -13,7 +13,7 @@ DEFAULT_WAL_SEG_SIZE = 16 * 1024 * 1024 class Lsn: """ Datatype for an LSN. Internally it is a 64-bit integer, but the string - representation is like "1/123abcd". See also pg_lsn datatype in Postgres + representation is like "1/0123abcd". 
See also pg_lsn datatype in Postgres """ def __init__(self, x: Union[int, str]): diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index ee62372871..50284a3f5a 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -933,8 +933,11 @@ class NeonEnvBuilder: for directory_to_clean in reversed(directories_to_clean): if not os.listdir(directory_to_clean): - log.debug(f"Removing empty directory {directory_to_clean}") - directory_to_clean.rmdir() + log.info(f"Removing empty directory {directory_to_clean}") + try: + directory_to_clean.rmdir() + except Exception as e: + log.error(f"Error removing empty directory {directory_to_clean}: {e}") def cleanup_remote_storage(self): for x in [self.pageserver_remote_storage, self.safekeepers_remote_storage]: @@ -3423,6 +3426,7 @@ class VanillaPostgres(PgProtocol): assert not self.running with open(os.path.join(self.pgdatadir, "postgresql.conf"), "a") as conf_file: conf_file.write("\n".join(options)) + conf_file.write("\n") def edit_hba(self, hba: List[str]): """Prepend hba lines into pg_hba.conf file.""" @@ -3476,6 +3480,7 @@ def vanilla_pg( pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version) port = port_distributor.get_port() with VanillaPostgres(pgdatadir, pg_bin, port) as vanilla_pg: + vanilla_pg.configure(["shared_preload_libraries='neon_rmgr'"]) yield vanilla_pg diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py index e12c8e5f4a..258935959b 100644 --- a/test_runner/fixtures/pg_version.py +++ b/test_runner/fixtures/pg_version.py @@ -16,6 +16,7 @@ class PgVersion(str, enum.Enum): V14 = "14" V15 = "15" V16 = "16" + V17 = "17" # Instead of making version an optional parameter in methods, we can use this fake entry # to explicitly rely on the default server version (could be different from pg_version fixture value) NOT_SET = "<-POSTRGRES VERSION IS NOT SET->" diff --git a/test_runner/regress/data/extension_test/5670669815/v17/ext_index.json b/test_runner/regress/data/extension_test/5670669815/v17/ext_index.json new file mode 100644 index 0000000000..7990b2c3a2 --- /dev/null +++ b/test_runner/regress/data/extension_test/5670669815/v17/ext_index.json @@ -0,0 +1,7 @@ +{ + "public_extensions": [], + "library_index": { + "TODO": "We still need PG17 extensions" + }, + "extension_data": {} +} \ No newline at end of file diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index b559be5f18..fb5c1d3115 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -21,7 +21,7 @@ from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( timeline_delete_wait_completed, ) -from fixtures.pg_version import PgVersion +from fixtures.pg_version import PgVersion, skip_on_postgres from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage from fixtures.workload import Workload @@ -156,6 +156,9 @@ ingest_lag_log_line = ".*ingesting record with timestamp lagging more than wait_ @check_ondisk_data_compatibility_if_enabled @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") +@skip_on_postgres( + PgVersion.V17, "There are no snapshots yet" +) # TODO: revert this once we have snapshots def test_backward_compatibility( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, @@ -203,6 +206,9 @@ def test_backward_compatibility( @check_ondisk_data_compatibility_if_enabled 
@pytest.mark.xdist_group("compatibility")
 @pytest.mark.order(after="test_create_snapshot")
+@skip_on_postgres(
+    PgVersion.V17, "There are no snapshots yet"
+)  # TODO: revert this once we have snapshots
 def test_forward_compatibility(
     neon_env_builder: NeonEnvBuilder,
     test_output_dir: Path,
diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py
index 27eb05ac09..7370eb1456 100644
--- a/test_runner/regress/test_download_extensions.py
+++ b/test_runner/regress/test_download_extensions.py
@@ -44,6 +44,8 @@ def test_remote_extensions(
 ):
     if pg_version == PgVersion.V16:
         pytest.skip("TODO: PG16 extension building")
+    if pg_version == PgVersion.V17:
+        pytest.skip("TODO: PG17 extension building")
 
     # setup mock http server
     # that expects request for anon.tar.zst
diff --git a/test_runner/regress/test_postgres_version.py b/test_runner/regress/test_postgres_version.py
index 03e8c7c0df..4145a303c6 100644
--- a/test_runner/regress/test_postgres_version.py
+++ b/test_runner/regress/test_postgres_version.py
@@ -20,16 +20,19 @@ def test_postgres_version(base_dir: Path, pg_bin: PgBin, pg_version: PgVersion):
         output = f.read().strip()
 
     # `postgres --version` prints something like "postgres (PostgreSQL) 15.6 (85d809c124a898847a97d66a211f7d5ef4f8e0cb)".
-    pattern = r"postgres \(PostgreSQL\) (?P<version>\d+\.\d+) \((?P<commit>[0-9a-f]{40})\)"
+    # beta- and release candidate releases would use '17beta1' and '18rc2' instead of .-separated numbers.
+    pattern = (
+        r"postgres \(PostgreSQL\) (?P<version>\d+(?:beta|rc|\.)\d+) \((?P<commit>[0-9a-f]{40})\)"
+    )
     match = re.search(pattern, output, re.IGNORECASE)
     assert match is not None, f"Can't parse {output} with {pattern}"
 
     version = match.group("version")
     commit = match.group("commit")
 
-    assert (
-        pg_version.v_prefixed in expected_revisions
-    ), f"Version `{pg_version.v_prefixed}` doesn't exist in `vendor/revisions.json`, please update it if these changes are intentional"
-
-    msg = f"Unexpected Postgres {pg_version} version: `{output}`, please update `vendor/revisions.json` if these changes are intentional"
-    assert [version, commit] == expected_revisions[pg_version.v_prefixed], msg
+    if "." 
in version: + assert ( + pg_version.v_prefixed in expected_revisions + ), f"Released PostgreSQL version `{pg_version.v_prefixed}` doesn't exist in `vendor/revisions.json`, please update it if these changes are intentional" + msg = f"Unexpected Postgres {pg_version} version: `{output}`, please update `vendor/revisions.json` if these changes are intentional" + assert [version, commit] == expected_revisions[pg_version.v_prefixed], msg diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index d152d0f41f..f98b53d966 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -118,6 +118,9 @@ def test_ancestor_detach_branched_from( truncated_layers = 0 elif branchpoint == Branchpoint.AFTER_L0: branch_at = Lsn(last_lsn + 8) + # make sure the branch point is not on a page header + if 0 < (branch_at.lsn_int % 8192) < 40: + branch_at += 40 rows = 8192 # as there is no 8 byte walrecord, nothing should get copied from the straddling layer truncated_layers = 0 diff --git a/test_runner/regress/test_twophase.py b/test_runner/regress/test_twophase.py index ea900b07b8..ebe65e7c29 100644 --- a/test_runner/regress/test_twophase.py +++ b/test_runner/regress/test_twophase.py @@ -1,19 +1,32 @@ import os +from pathlib import Path +from fixtures.common_types import TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn +from fixtures.neon_fixtures import ( + NeonEnv, + PgBin, + fork_at_current_lsn, + import_timeline_from_vanilla_postgres, +) # # Test branching, when a transaction is in prepared state # -def test_twophase(neon_simple_env: NeonEnv): - env = neon_simple_env - endpoint = env.endpoints.create_start("main", config_lines=["max_prepared_transactions=5"]) +def twophase_test_on_timeline(env: NeonEnv): + endpoint = env.endpoints.create_start( + "test_twophase", config_lines=["max_prepared_transactions=5"] + ) conn = endpoint.connect() cur = conn.cursor() + # FIXME: Switch to the next WAL segment, to work around the bug fixed in + # https://github.com/neondatabase/neon/pull/8914. When that is merged, this can be + # removed. + cur.execute("select pg_switch_wal()") + cur.execute("CREATE TABLE foo (t text)") # Prepare a transaction that will insert a row @@ -53,7 +66,7 @@ def test_twophase(neon_simple_env: NeonEnv): assert len(twophase_files) == 2 # Create a branch with the transaction in prepared state - fork_at_current_lsn(env, endpoint, "test_twophase_prepared", "main") + fork_at_current_lsn(env, endpoint, "test_twophase_prepared", "test_twophase") # Start compute on the new branch endpoint2 = env.endpoints.create_start( @@ -80,3 +93,50 @@ def test_twophase(neon_simple_env: NeonEnv): # Only one committed insert is visible on the original branch cur.execute("SELECT * FROM foo") assert cur.fetchall() == [("three",)] + + +def test_twophase(neon_simple_env: NeonEnv): + """ + Test branching, when a transaction is in prepared state + """ + env = neon_simple_env + env.neon_cli.create_branch("test_twophase") + + twophase_test_on_timeline(env) + + +def test_twophase_nonzero_epoch( + neon_simple_env: NeonEnv, + test_output_dir: Path, + pg_bin: PgBin, + vanilla_pg, +): + """ + Same as 'test_twophase' test, but with a non-zero XID epoch, i.e. after 4 billion XIDs + have been consumed. (This is to ensure that we correctly use the full 64-bit XIDs in + pg_twophase filenames with PostgreSQL v17.) 
+ """ + env = neon_simple_env + + # Reset the vanilla Postgres instance with a higher XID epoch + pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal") + cmd = [pg_resetwal_path, "--epoch=1000000000", "-D", str(vanilla_pg.pgdatadir)] + pg_bin.run_capture(cmd) + + timeline_id = TimelineId.generate() + + # Import the cluster to Neon + vanilla_pg.start() + vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser") + import_timeline_from_vanilla_postgres( + test_output_dir, + env, + pg_bin, + env.initial_tenant, + timeline_id, + "test_twophase", + vanilla_pg.connstr(), + ) + vanilla_pg.stop() # don't need the original server anymore + + twophase_test_on_timeline(env) diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 new file mode 160000 index 0000000000..9156d63ce2 --- /dev/null +++ b/vendor/postgres-v17 @@ -0,0 +1 @@ +Subproject commit 9156d63ce253bed9d1f76355ceec610e444eaffa From 0a8c5e1214fcd3f59767a6ca4adeb68612977e51 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Fri, 13 Sep 2024 15:10:52 +0100 Subject: [PATCH 10/30] Fix broken image for PG17 (#8998) Most extensions are not required to run Neon-based PostgreSQL, but the Neon extension is _quite_ critical, so let's make sure we include it. ## Problem Staging doesn't have working compute images for PG17 ## Summary of changes Disable some PG17 filters so that we get the critical components into the PG17 image --- Dockerfile.compute-node | 63 ++++++++++++++--------------------------- 1 file changed, 21 insertions(+), 42 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index fe902eb978..6e2510fe60 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -81,10 +81,7 @@ RUN cd postgres && \ FROM build-deps AS postgis-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - apt update && \ +RUN apt update && \ apt install -y cmake gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \ libboost-system-dev libboost-iostreams-dev libboost-program-options-dev libboost-timer-dev \ libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \ @@ -92,8 +89,8 @@ RUN case "${PG_VERSION}" in "v17") \ # SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's libcgal-dev is 5.2 RUN case "${PG_VERSION}" in "v17") \ - mkdir -p /sfcgal && \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + mkdir -p /sfcgal && \ + echo "Postgis doensn't yet support PG17 (needs 3.4.3, if not higher)" && exit 0;; \ esac && \ wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \ echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \ @@ -105,7 +102,7 @@ RUN case "${PG_VERSION}" in "v17") \ ENV PATH="/usr/local/pgsql/bin:$PATH" RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + echo "Postgis doensn't yet support PG17 (needs 3.4.3, if not higher)" && exit 0;; \ esac && \ wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \ echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \ @@ -666,7 +663,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + echo "v17 extensions is not supported yet by pg_roaringbitmap. Quit" && exit 0;; \ esac && \ wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \ echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \ @@ -687,7 +684,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + echo "v17 is not supported yet by pg_semver. Quit" && exit 0;; \ esac && \ wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \ echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \ @@ -707,10 +704,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - case "${PG_VERSION}" in \ +RUN case "${PG_VERSION}" in \ "v14" | "v15") \ export PG_EMBEDDING_VERSION=0.3.5 \ export PG_EMBEDDING_CHECKSUM=0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 \ @@ -736,7 +730,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \ esac && \ wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \ @@ -769,7 +763,7 @@ USER nonroot WORKDIR /home/nonroot RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + echo "v17 is not supported yet by pgrx. Quit" && exit 0;; \ esac && \ curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ chmod +x rustup-init && \ @@ -791,7 +785,7 @@ FROM rust-extensions-build AS pg-jsonschema-pg-build ARG PG_VERSION RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + echo "pg_jsonschema does not yet have a release that supports pg17" && exit 0;; \ esac && \ wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.1.tar.gz -O pg_jsonschema.tar.gz && \ echo "61df3db1ed83cf24f6aa39c826f8818bfa4f0bd33b587fd6b2b1747985642297 pg_jsonschema.tar.gz" | sha256sum --check && \ @@ -816,7 +810,7 @@ FROM rust-extensions-build AS pg-graphql-pg-build ARG PG_VERSION RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + echo "pg_graphql does not yet have a release that supports pg17 as of now" && exit 0;; \ esac && \ wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.7.tar.gz -O pg_graphql.tar.gz && \ echo "2b3e567a5b31019cb97ae0e33263c1bcc28580be5a444ac4c8ece5c4be2aea41 pg_graphql.tar.gz" | sha256sum --check && \ @@ -839,7 +833,7 @@ ARG PG_VERSION # 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023 RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + echo "pg_tiktoken does not have versions, nor support for pg17" && exit 0;; \ esac && \ wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \ echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \ @@ -861,7 +855,7 @@ FROM rust-extensions-build AS pg-pgx-ulid-build ARG PG_VERSION RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + echo "pgx_ulid does not support pg17 as of the latest version (0.1.5)" && exit 0;; \ esac && \ wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \ echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \ @@ -883,7 +877,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + echo "We'll need to update wal2json to 2.6+ for pg17 support" && exit 0;; \ esac && \ wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \ echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \ @@ -903,7 +897,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + echo "We'll need to update pg_ivm to 1.9+ for pg17 support" && exit 0;; \ esac && \ wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \ echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \ @@ -924,7 +918,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + echo "pg_partman doesn't support PG17 yet" && exit 0;; \ esac && \ wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \ echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \ @@ -977,10 +971,7 @@ COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - make -j $(getconf _NPROCESSORS_ONLN) \ +RUN make -j $(getconf _NPROCESSORS_ONLN) \ PG_CONFIG=/usr/local/pgsql/bin/pg_config \ -C pgxn/neon \ -s install && \ @@ -1023,10 +1014,7 @@ ENV BUILD_TAG=$BUILD_TAG USER nonroot # Copy entire project to get Cargo.* files with proper dependencies for the whole project COPY --chown=nonroot . . 
-RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - cd compute_tools && mold -run cargo build --locked --profile release-line-debug-size-lto +RUN cd compute_tools && mold -run cargo build --locked --profile release-line-debug-size-lto ######################################################################################### # @@ -1047,24 +1035,15 @@ FROM neon-pg-ext-build AS postgres-cleanup-layer COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql # Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise) -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp +RUN cd /usr/local/pgsql/bin && rm -f ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp # Remove headers that we won't need anymore - we've completed installation of all extensions -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - rm -r /usr/local/pgsql/include +RUN rm -r /usr/local/pgsql/include # Remove static postgresql libraries - all compilation is finished, so we # can now remove these files - they must be included in other binaries by now # if they were to be used by other libraries. -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - rm /usr/local/pgsql/lib/lib*.a +RUN rm /usr/local/pgsql/lib/lib*.a ######################################################################################### From b2c83db54d58d46e8ca11d5f1b4a38471322f713 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 16 Sep 2024 12:44:26 +0100 Subject: [PATCH 11/30] CI(gather-rust-build-stats): set PQ_LIB_DIR to Postgres 17 (#9001) ## Problem `gather-rust-build-stats` extra CI job fails with ``` "PQ_LIB_DIR" doesn't exist in the configured path: "/__w/neon/neon/pg_install/v16/lib" ``` ## Summary of changes - Use the path to Postgres 17 for the `gather-rust-build-stats` job. 
The job uses Postgres built by `make walproposer-lib` --- .github/workflows/neon_extra_builds.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 41c9f5dee5..140aac032a 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -181,7 +181,7 @@ jobs: run: make walproposer-lib -j$(nproc) - name: Produce the build stats - run: PQ_LIB_DIR=$(pwd)/pg_install/v16/lib cargo build --all --release --timings -j$(nproc) + run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release --timings -j$(nproc) - name: Upload the build stats id: upload-stats From 5876c441abc973acca60882192ad46333c075abd Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 10 Sep 2024 16:46:32 +0100 Subject: [PATCH 12/30] Grant access to pg_show_replication_origin_status for neon_superuser Signed-off-by: Tristan Partin --- ...ant_pg_show_replication_origin_status_to_neon_superuser.sql | 1 + compute_tools/src/spec.rs | 3 +++ test_runner/regress/test_migrations.py | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) create mode 100644 compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql diff --git a/compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql b/compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql new file mode 100644 index 0000000000..425ed8cd3d --- /dev/null +++ b/compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql @@ -0,0 +1 @@ +GRANT EXECUTE ON FUNCTION pg_show_replication_origin_status TO neon_superuser; diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 6a87263821..aa9405d28d 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -793,6 +793,9 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> { include_str!( "./migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql" ), + include_str!( + "./migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql" + ), ]; MigrationRunner::new(client, &migrations).run_migrations()?; diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py index e88e56d030..7211619a99 100644 --- a/test_runner/regress/test_migrations.py +++ b/test_runner/regress/test_migrations.py @@ -14,7 +14,7 @@ def test_migrations(neon_simple_env: NeonEnv): endpoint.respec(skip_pg_catalog_updates=False) endpoint.start() - num_migrations = 10 + num_migrations = 11 endpoint.wait_for_migrations(num_migrations=num_migrations) with endpoint.cursor() as cur: From c8bedca5821694633d140a1da2baa4fa9c7dea0c Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Mon, 16 Sep 2024 18:06:31 +0200 Subject: [PATCH 13/30] Fix PG17's extension modifications (#9010) This also reduces the GRANT statements to one per created _reset function --- Dockerfile.compute-node | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 6e2510fe60..6bf6fb650f 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -55,22 +55,27 @@ RUN cd postgres && \ # We could add the additional grant statements to the postgres repository but it would be hard to maintain, # whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork, # so we do it 
here. - old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \ - # the first loop is for pg_stat_statement extension version <= 1.6 for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \ filename=$(basename "$file"); \ - if echo "$old_list" | grep -q -F "$filename"; then \ + # Note that there are no downgrade scripts for pg_stat_statements, so we \ + # don't have to modify any downgrade paths or (much) older versions: we only \ + # have to make sure every creation of the pg_stat_statements_reset function \ + # also adds execute permissions to the neon_superuser. + case $filename in \ + pg_stat_statements--1.4.sql) \ + # pg_stat_statements_reset is first created with 1.4 echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \ - fi; \ - done; \ - # the second loop is for pg_stat_statement extension versions >= 1.7, - # where pg_stat_statement_reset() got 3 additional arguments - for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \ - filename=$(basename "$file"); \ - if ! echo "$old_list" | grep -q -F "$filename"; then \ + ;; \ + pg_stat_statements--1.6--1.7.sql) \ + # Then with the 1.6-1.7 migration it is re-created with a new signature, thus add the permissions back echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \ - fi; \ - done + ;; \ + pg_stat_statements--1.10--1.11.sql) \ + # Then with the 1.10-1.11 migration it is re-created with a new signature again, thus add the permissions back + echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint, boolean) TO neon_superuser;' >> $file; \ + ;; \ + esac; \ + done; ######################################################################################### # From 2bbb4d3e1c4d70d5bcdc972eb2b9863d1073338a Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 16 Sep 2024 21:45:19 +0300 Subject: [PATCH 14/30] Remove misc unused code (#9014) --- libs/postgres_ffi/src/xlog_utils.rs | 9 +------- libs/utils/src/accum.rs | 33 ----------------------------- libs/utils/src/id.rs | 10 --------- libs/utils/src/lib.rs | 7 ------ libs/utils/src/lsn.rs | 9 -------- libs/utils/src/nonblock.rs | 17 --------------- libs/utils/src/shutdown.rs | 7 ------ 7 files changed, 1 insertion(+), 91 deletions(-) delete mode 100644 libs/utils/src/accum.rs delete mode 100644 libs/utils/src/nonblock.rs delete mode 100644 libs/utils/src/shutdown.rs diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 0cfd56962e..1873734753 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -30,7 +30,7 @@ use std::fs::File; use std::io::prelude::*; use std::io::ErrorKind; use std::io::SeekFrom; -use std::path::{Path, PathBuf}; +use std::path::Path; use std::time::SystemTime; use utils::bin_ser::DeserializeError; use utils::bin_ser::SerializeError; @@ -260,13 +260,6 @@ fn open_wal_segment(seg_file_path: &Path) -> anyhow::Result> { } } -pub fn main() { - let mut data_dir = PathBuf::new(); - data_dir.push("."); - let wal_end = find_end_of_wal(&data_dir, WAL_SEGMENT_SIZE, Lsn(0)).unwrap(); - println!("wal_end={:?}", wal_end); -} - impl XLogRecord { pub fn from_slice(buf: &[u8]) -> Result { use utils::bin_ser::LeSer; diff --git a/libs/utils/src/accum.rs b/libs/utils/src/accum.rs 
deleted file mode 100644 index 0fb0190a92..0000000000 --- a/libs/utils/src/accum.rs +++ /dev/null @@ -1,33 +0,0 @@ -/// A helper to "accumulate" a value similar to `Iterator::reduce`, but lets you -/// feed the accumulated values by calling the 'accum' function, instead of having an -/// iterator. -/// -/// For example, to calculate the smallest value among some integers: -/// -/// ``` -/// use utils::accum::Accum; -/// -/// let values = [1, 2, 3]; -/// -/// let mut min_value: Accum = Accum(None); -/// for new_value in &values { -/// min_value.accum(std::cmp::min, *new_value); -/// } -/// -/// assert_eq!(min_value.0.unwrap(), 1); -/// ``` -pub struct Accum(pub Option); -impl Accum { - pub fn accum(&mut self, func: F, new_value: T) - where - F: FnOnce(T, T) -> T, - { - // If there is no previous value, just store the new value. - // Otherwise call the function to decide which one to keep. - self.0 = Some(if let Some(accum) = self.0 { - func(accum, new_value) - } else { - new_value - }); - } -} diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs index 2cda899b15..eb91839504 100644 --- a/libs/utils/src/id.rs +++ b/libs/utils/src/id.rs @@ -88,12 +88,6 @@ impl<'de> Deserialize<'de> for Id { } impl Id { - pub fn get_from_buf(buf: &mut impl bytes::Buf) -> Id { - let mut arr = [0u8; 16]; - buf.copy_to_slice(&mut arr); - Id::from(arr) - } - pub fn from_slice(src: &[u8]) -> Result { if src.len() != 16 { return Err(IdError::SliceParseError(src.len())); @@ -179,10 +173,6 @@ impl fmt::Debug for Id { macro_rules! id_newtype { ($t:ident) => { impl $t { - pub fn get_from_buf(buf: &mut impl bytes::Buf) -> $t { - $t(Id::get_from_buf(buf)) - } - pub fn from_slice(src: &[u8]) -> Result<$t, IdError> { Ok($t(Id::from_slice(src)?)) } diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 218dd468b1..03fb36caf8 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -43,16 +43,9 @@ pub mod logging; pub mod lock_file; pub mod pid_file; -// Misc -pub mod accum; -pub mod shutdown; - // Utility for binding TcpListeners with proper socket options. pub mod tcp_listener; -// Utility for putting a raw file descriptor into non-blocking mode -pub mod nonblock; - // Default signal handling pub mod sentry_init; pub mod signals; diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index 1aebe91428..06d5c27ebf 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -1,6 +1,5 @@ #![warn(missing_docs)] -use camino::Utf8Path; use serde::{de::Visitor, Deserialize, Serialize}; use std::fmt; use std::ops::{Add, AddAssign}; @@ -145,14 +144,6 @@ impl Lsn { i128::from(self.0) - i128::from(other) } - /// Parse an LSN from a filename in the form `0000000000000000` - pub fn from_filename(filename: F) -> Result - where - F: AsRef, - { - Lsn::from_hex(filename.as_ref().as_str()) - } - /// Parse an LSN from a string in the form `0000000000000000` pub fn from_hex(s: S) -> Result where diff --git a/libs/utils/src/nonblock.rs b/libs/utils/src/nonblock.rs deleted file mode 100644 index 05e2e3af4c..0000000000 --- a/libs/utils/src/nonblock.rs +++ /dev/null @@ -1,17 +0,0 @@ -use nix::fcntl::{fcntl, OFlag, F_GETFL, F_SETFL}; -use std::os::unix::io::RawFd; - -/// Put a file descriptor into non-blocking mode -pub fn set_nonblock(fd: RawFd) -> Result<(), std::io::Error> { - let bits = fcntl(fd, F_GETFL)?; - - // If F_GETFL returns some unknown bits, they should be valid - // for passing back to F_SETFL, too. If we left them out, the F_SETFL - // would effectively clear them, which is not what we want. 
- let mut flags = OFlag::from_bits_retain(bits); - flags |= OFlag::O_NONBLOCK; - - fcntl(fd, F_SETFL(flags))?; - - Ok(()) -} diff --git a/libs/utils/src/shutdown.rs b/libs/utils/src/shutdown.rs deleted file mode 100644 index cb5a44d664..0000000000 --- a/libs/utils/src/shutdown.rs +++ /dev/null @@ -1,7 +0,0 @@ -/// Immediately terminate the calling process without calling -/// atexit callbacks, C runtime destructors etc. We mainly use -/// this to protect coverage data from concurrent writes. -pub fn exit_now(code: u8) -> ! { - // SAFETY: exiting is safe, the ffi is not safe - unsafe { nix::libc::_exit(code as _) }; -} From 5e16c7bb0b484cc760f4ddb3c0a866157760825e Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 10 Sep 2024 15:37:08 +0100 Subject: [PATCH 15/30] Generate pgbench data on the server for most tests This should generally be faster when running tests, especially those that run with higher scales. Ignoring test_lfc_resize since it seems like we are hitting a query timeout for some reason that I have yet to investigate. A little bit of improvemnt is better than none. Signed-off-by: Tristan Partin --- test_runner/performance/test_branch_creation.py | 2 +- test_runner/performance/test_branching.py | 2 +- .../performance/test_logical_replication.py | 14 +++++++------- .../performance/test_physical_replication.py | 4 ++-- test_runner/regress/test_branching.py | 2 +- test_runner/regress/test_disk_usage_eviction.py | 2 +- test_runner/regress/test_hot_standby.py | 2 +- test_runner/regress/test_pageserver_reconnect.py | 2 +- .../test_pageserver_restarts_under_workload.py | 2 +- .../regress/test_threshold_based_eviction.py | 2 +- 10 files changed, 17 insertions(+), 17 deletions(-) diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index b3866f1813..f1ab7876f9 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -107,7 +107,7 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: env.neon_cli.create_branch("b0") endpoint = env.endpoints.create_start("b0") - neon_compare.pg_bin.run_capture(["pgbench", "-i", "-s10", endpoint.connstr()]) + neon_compare.pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10", endpoint.connstr()]) branch_creation_durations = [] diff --git a/test_runner/performance/test_branching.py b/test_runner/performance/test_branching.py index 667d1a4c4a..f8d39487f2 100644 --- a/test_runner/performance/test_branching.py +++ b/test_runner/performance/test_branching.py @@ -43,7 +43,7 @@ def test_compare_child_and_root_pgbench_perf(neon_compare: NeonCompare): env.neon_cli.create_branch("root") endpoint_root = env.endpoints.create_start("root") - pg_bin.run_capture(["pgbench", "-i", endpoint_root.connstr(), "-s10"]) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", endpoint_root.connstr(), "-s10"]) fork_at_current_lsn(env, endpoint_root, "child", "root") diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index 29a0380524..dbf94a2cf5 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -24,13 +24,13 @@ def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg endpoint = env.endpoints.create_start("main") - pg_bin.run_capture(["pgbench", "-i", "-s10", endpoint.connstr()]) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10", endpoint.connstr()]) 
endpoint.safe_psql("create publication pub1 for table pgbench_accounts, pgbench_history") # now start subscriber vanilla_pg.start() - pg_bin.run_capture(["pgbench", "-i", "-s10", vanilla_pg.connstr()]) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10", vanilla_pg.connstr()]) vanilla_pg.safe_psql("truncate table pgbench_accounts") vanilla_pg.safe_psql("truncate table pgbench_history") @@ -99,9 +99,9 @@ def test_subscriber_lag( sub_connstr = benchmark_project_sub.connstr if benchmark_project_pub.is_new: - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=pub_env) if benchmark_project_sub.is_new: - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=sub_env) pub_conn = psycopg2.connect(pub_connstr) sub_conn = psycopg2.connect(sub_connstr) @@ -193,8 +193,8 @@ def test_publisher_restart( pub_connstr = benchmark_project_pub.connstr sub_connstr = benchmark_project_sub.connstr - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=pub_env) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=sub_env) pub_conn = psycopg2.connect(pub_connstr) sub_conn = psycopg2.connect(sub_connstr) @@ -288,7 +288,7 @@ def test_snap_files( is_super = cur.fetchall()[0][0] assert is_super, "This benchmark won't work if we don't have superuser" - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=env) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=env) conn = psycopg2.connect(connstr) conn.autocommit = True diff --git a/test_runner/performance/test_physical_replication.py b/test_runner/performance/test_physical_replication.py index 7e16197211..49b1176d34 100644 --- a/test_runner/performance/test_physical_replication.py +++ b/test_runner/performance/test_physical_replication.py @@ -85,7 +85,7 @@ def test_ro_replica_lag( endpoint_id=replica["endpoint"]["id"], )["uri"] - pg_bin.run_capture(["pgbench", "-i", "-s100"], env=master_env) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=master_env) master_workload = pg_bin.run_nonblocking( ["pgbench", "-c10", pgbench_duration, "-Mprepared"], @@ -212,7 +212,7 @@ def test_replication_start_stop( for i in range(num_replicas): replica_env[i]["PGHOST"] = replicas[i]["endpoint"]["host"] - pg_bin.run_capture(["pgbench", "-i", "-s10"], env=master_env) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10"], env=master_env) # Sync replicas with psycopg2.connect(master_connstr) as conn_master: diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index fc74707639..1729e2fc98 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -52,7 +52,7 @@ def test_branching_with_pgbench( def run_pgbench(connstr: str): log.info(f"Start a pgbench workload on pg {connstr}") - pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr]) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", f"-s{scale}", connstr]) pg_bin.run_capture(["pgbench", "-T15", connstr]) env.neon_cli.create_branch("b0", tenant_id=tenant) diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 85616c3fe2..1fec8b3f18 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ 
-291,7 +291,7 @@ def pgbench_init_tenant( ) with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: - pg_bin.run(["pgbench", "-i", f"-s{scale}", endpoint.connstr()]) + pg_bin.run(["pgbench", "-i", "-I", "dtGvp", f"-s{scale}", endpoint.connstr()]) wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) return (tenant_id, timeline_id) diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index ae63136abb..d94704012f 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -199,7 +199,7 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool): def run_pgbench(connstr: str, pg_bin: PgBin): log.info(f"Start a pgbench workload on pg {connstr}") # s10 is about 150MB of data. In debug mode init takes about 15s on SSD. - pg_bin.run_capture(["pgbench", "-i", "-s10", connstr]) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10", connstr]) log.info("pgbench init done") pg_bin.run_capture(["pgbench", "-T60", connstr]) diff --git a/test_runner/regress/test_pageserver_reconnect.py b/test_runner/regress/test_pageserver_reconnect.py index 37ff923632..ada6da98ff 100644 --- a/test_runner/regress/test_pageserver_reconnect.py +++ b/test_runner/regress/test_pageserver_reconnect.py @@ -22,7 +22,7 @@ def test_pageserver_reconnect(neon_simple_env: NeonEnv, pg_bin: PgBin): def run_pgbench(connstr: str): log.info(f"Start a pgbench workload on pg {connstr}") - pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr]) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", f"-s{scale}", connstr]) pg_bin.run_capture(["pgbench", f"-T{int(n_reconnects*timeout)}", connstr]) thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True) diff --git a/test_runner/regress/test_pageserver_restarts_under_workload.py b/test_runner/regress/test_pageserver_restarts_under_workload.py index 65569f3bac..9bb9b373ad 100644 --- a/test_runner/regress/test_pageserver_restarts_under_workload.py +++ b/test_runner/regress/test_pageserver_restarts_under_workload.py @@ -19,7 +19,7 @@ def test_pageserver_restarts_under_worload(neon_simple_env: NeonEnv, pg_bin: PgB def run_pgbench(connstr: str): log.info(f"Start a pgbench workload on pg {connstr}") - pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr]) + pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", f"-s{scale}", connstr]) pg_bin.run_capture(["pgbench", f"-T{n_restarts}", connstr]) thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True) diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py index 840c7159ad..094dd20529 100644 --- a/test_runner/regress/test_threshold_based_eviction.py +++ b/test_runner/regress/test_threshold_based_eviction.py @@ -106,7 +106,7 @@ def test_threshold_based_eviction( # create a bunch of layers with env.endpoints.create_start("main", tenant_id=tenant_id) as pg: - pg_bin.run(["pgbench", "-i", "-s", "3", pg.connstr()]) + pg_bin.run(["pgbench", "-i", "-I", "dtGvp", "-s", "3", pg.connstr()]) last_flush_lsn_upload(env, pg, tenant_id, timeline_id) # wrap up and shutdown safekeepers so that no more layers will be created after the final checkpoint for sk in env.safekeepers: From 3a52e356c18440cb3b685d2cf1848fd3a3c86c67 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 17 Sep 2024 01:46:58 +0300 Subject: [PATCH 16/30] Remove unused function (#9018) --- test_runner/fixtures/neon_fixtures.py | 
19 ------------------- 1 file changed, 19 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 50284a3f5a..5f7a32782c 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -182,25 +182,6 @@ def top_output_dir(base_dir: Path) -> Iterator[Path]: yield output_dir -@pytest.fixture(scope="function") -def versioned_pg_distrib_dir(pg_distrib_dir: Path, pg_version: PgVersion) -> Iterator[Path]: - versioned_dir = pg_distrib_dir / pg_version.v_prefixed - - psql_bin_path = versioned_dir / "bin/psql" - postgres_bin_path = versioned_dir / "bin/postgres" - - if os.getenv("REMOTE_ENV"): - # When testing against a remote server, we only need the client binary. - if not psql_bin_path.exists(): - raise Exception(f"psql not found at '{psql_bin_path}'") - else: - if not postgres_bin_path.exists(): - raise Exception(f"postgres not found at '{postgres_bin_path}'") - - log.info(f"versioned_pg_distrib_dir is {versioned_dir}") - yield versioned_dir - - @pytest.fixture(scope="session") def neon_api_key() -> str: api_key = os.getenv("NEON_API_KEY") From fec9321fc04434f84f86d166a6d89c4421110c77 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 17 Sep 2024 02:23:49 +0300 Subject: [PATCH 17/30] Use Path type in a few more places in neon_fixtures.py (#9018) This is in preparation of replacing neon_fixtures.get_dir_size with neon_fixtures.utils.get_dir_size() in next commit. --- test_runner/fixtures/compare_fixtures.py | 5 ++-- test_runner/fixtures/neon_fixtures.py | 32 ++++++++++++------------ 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 7c4a8db36f..770b32b11e 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -4,6 +4,7 @@ from abc import ABC, abstractmethod from contextlib import _GeneratorContextManager, contextmanager # Type-related stuff +from pathlib import Path from typing import Dict, Iterator, List import pytest @@ -229,11 +230,11 @@ class VanillaCompare(PgCompare): pass # TODO find something def report_size(self): - data_size = self.pg.get_subdir_size("base") + data_size = self.pg.get_subdir_size(Path("base")) self.zenbenchmark.record( "data_size", data_size / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER ) - wal_size = self.pg.get_subdir_size("pg_wal") + wal_size = self.pg.get_subdir_size(Path("pg_wal")) self.zenbenchmark.record( "wal_size", wal_size / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER ) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 5f7a32782c..90a351cdb3 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -224,7 +224,7 @@ def worker_base_port(worker_seq_no: int, worker_port_num: int) -> int: return BASE_PORT + worker_seq_no * worker_port_num -def get_dir_size(path: str) -> int: +def get_dir_size(path: Path) -> int: """Return size in bytes.""" totalbytes = 0 for root, _dirs, files in os.walk(path): @@ -3319,12 +3319,12 @@ class PgBin: ) return base_path - def get_pg_controldata_checkpoint_lsn(self, pgdata: str) -> Lsn: + def get_pg_controldata_checkpoint_lsn(self, pgdata: Path) -> Lsn: """ Run pg_controldata on given datadir and extract checkpoint lsn. 
""" - pg_controldata_path = os.path.join(self.pg_bin_path, "pg_controldata") + pg_controldata_path = self.pg_bin_path / "pg_controldata" cmd = f"{pg_controldata_path} -D {pgdata}" result = subprocess.run(cmd, capture_output=True, text=True, shell=True) checkpoint_lsn = re.findall( @@ -3433,9 +3433,9 @@ class VanillaPostgres(PgProtocol): self.running = False self.pg_bin.run_capture(["pg_ctl", "-w", "-D", str(self.pgdatadir), "stop"]) - def get_subdir_size(self, subdir) -> int: + def get_subdir_size(self, subdir: Path) -> int: """Return size of pgdatadir subdirectory in bytes.""" - return get_dir_size(os.path.join(self.pgdatadir, subdir)) + return get_dir_size(self.pgdatadir / subdir) def __enter__(self) -> "VanillaPostgres": return self @@ -3962,7 +3962,7 @@ class Endpoint(PgProtocol, LogUtils): self.env = env self.branch_name: Optional[str] = None # dubious self.endpoint_id: Optional[str] = None # dubious, see asserts below - self.pgdata_dir: Optional[str] = None # Path to computenode PGDATA + self.pgdata_dir: Optional[Path] = None # Path to computenode PGDATA self.tenant_id = tenant_id self.pg_port = pg_port self.http_port = http_port @@ -4019,7 +4019,7 @@ class Endpoint(PgProtocol, LogUtils): allow_multiple=allow_multiple, ) path = Path("endpoints") / self.endpoint_id / "pgdata" - self.pgdata_dir = os.path.join(self.env.repo_dir, path) + self.pgdata_dir = self.env.repo_dir / path self.logfile = self.endpoint_path() / "compute.log" config_lines = config_lines or [] @@ -4072,21 +4072,21 @@ class Endpoint(PgProtocol, LogUtils): path = Path("endpoints") / self.endpoint_id return self.env.repo_dir / path - def pg_data_dir_path(self) -> str: + def pg_data_dir_path(self) -> Path: """Path to Postgres data directory""" - return os.path.join(self.endpoint_path(), "pgdata") + return self.endpoint_path() / "pgdata" - def pg_xact_dir_path(self) -> str: + def pg_xact_dir_path(self) -> Path: """Path to pg_xact dir""" - return os.path.join(self.pg_data_dir_path(), "pg_xact") + return self.pg_data_dir_path() / "pg_xact" - def pg_twophase_dir_path(self) -> str: + def pg_twophase_dir_path(self) -> Path: """Path to pg_twophase dir""" - return os.path.join(self.pg_data_dir_path(), "pg_twophase") + return self.pg_data_dir_path() / "pg_twophase" - def config_file_path(self) -> str: + def config_file_path(self) -> Path: """Path to the postgresql.conf in the endpoint directory (not the one in pgdata)""" - return os.path.join(self.endpoint_path(), "postgresql.conf") + return self.endpoint_path() / "postgresql.conf" def config(self, lines: List[str]) -> "Endpoint": """ @@ -4251,7 +4251,7 @@ class Endpoint(PgProtocol, LogUtils): log.info(f'checkpointing at LSN {self.safe_psql("select pg_current_wal_lsn()")[0][0]}') self.safe_psql("checkpoint") assert self.pgdata_dir is not None # please mypy - return get_dir_size(os.path.join(self.pgdata_dir, "pg_wal")) / 1024 / 1024 + return get_dir_size(self.pgdata_dir / "pg_wal") / 1024 / 1024 def clear_shared_buffers(self, cursor: Optional[Any] = None): """ From c6f56b8462e16284b77a3176801c9ed7364df04b Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 17 Sep 2024 01:47:15 +0300 Subject: [PATCH 18/30] Remove redundant get_dir_size() function (#9018) There was another copy of it in utils.py. The only difference is that the version in utils.py tolerates files that are concurrently removed. That seems fine for the few callers in neon_fixtures.py too. 
--- test_runner/fixtures/neon_fixtures.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 90a351cdb3..92dcd1e3cd 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -93,6 +93,7 @@ from fixtures.utils import ( allure_add_grafana_links, allure_attach_from_dir, assert_no_errors, + get_dir_size, get_self_dir, print_gc_result, subprocess_capture, @@ -224,16 +225,6 @@ def worker_base_port(worker_seq_no: int, worker_port_num: int) -> int: return BASE_PORT + worker_seq_no * worker_port_num -def get_dir_size(path: Path) -> int: - """Return size in bytes.""" - totalbytes = 0 - for root, _dirs, files in os.walk(path): - for name in files: - totalbytes += os.path.getsize(os.path.join(root, name)) - - return totalbytes - - @pytest.fixture(scope="session") def port_distributor(worker_base_port: int, worker_port_num: int) -> PortDistributor: return PortDistributor(base_port=worker_base_port, port_number=worker_port_num) From 4295ff0f071789c22d0a838f5512e5caba57d54a Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 17 Sep 2024 01:47:21 +0300 Subject: [PATCH 19/30] Mark a couple of test fixtures as session-scoped (#9018) pg_distrib_dir doesn't include the Postgres version and only depends on env variables which cannot change during a test run, so it can be marked as session-scoped. Similarly, the platform cannot change during a test run. --- test_runner/fixtures/neon_fixtures.py | 2 +- test_runner/fixtures/parametrize.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 92dcd1e3cd..d2caee5992 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -159,7 +159,7 @@ def neon_binpath(base_dir: Path, build_type: str) -> Iterator[Path]: yield binpath -@pytest.fixture(scope="function") +@pytest.fixture(scope="session") def pg_distrib_dir(base_dir: Path) -> Iterator[Path]: if env_postgres_bin := os.environ.get("POSTGRES_DISTRIB_DIR"): distrib_dir = Path(env_postgres_bin).resolve() diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index e2dd51802c..2c8e71526c 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -24,7 +24,7 @@ def build_type() -> Optional[str]: return None -@pytest.fixture(scope="function", autouse=True) +@pytest.fixture(scope="session", autouse=True) def platform() -> Optional[str]: return None From 2db840d8b8736530a9653719ed2560e3647539a0 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 17 Sep 2024 01:47:26 +0300 Subject: [PATCH 20/30] Move a few test functions related to auth tokens to separate file (#9018) For readability. neon_fixtures.py is huge. 
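For callers, only the import path changes. A hypothetical call site after the move (the PEM key argument is illustrative) would look like:

```python
# Hypothetical call site: AuthKeys and TokenScope now live in fixtures/auth_tokens.py
# instead of fixtures/neon_fixtures.py.
from fixtures.auth_tokens import AuthKeys, TokenScope


def make_pageserver_token(private_key_pem: str) -> str:
    keys = AuthKeys(priv=private_key_pem)  # same dataclass as before, new module
    # Equivalent to keys.generate_pageserver_token(); written out to show the scope value.
    return keys.generate_token(scope=TokenScope.PAGE_SERVER_API)
```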
--- test_runner/fixtures/auth_tokens.py | 47 +++++++++++++++++++ test_runner/fixtures/neon_fixtures.py | 40 +--------------- .../regress/test_storage_controller.py | 2 +- 3 files changed, 49 insertions(+), 40 deletions(-) create mode 100644 test_runner/fixtures/auth_tokens.py diff --git a/test_runner/fixtures/auth_tokens.py b/test_runner/fixtures/auth_tokens.py new file mode 100644 index 0000000000..8ebaf61e5e --- /dev/null +++ b/test_runner/fixtures/auth_tokens.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +from dataclasses import dataclass +from enum import Enum +from typing import Any + +import jwt + +from fixtures.common_types import TenantId + + +@dataclass +class AuthKeys: + priv: str + + def generate_token(self, *, scope: TokenScope, **token_data: Any) -> str: + token_data = {key: str(val) for key, val in token_data.items()} + token = jwt.encode({"scope": scope, **token_data}, self.priv, algorithm="EdDSA") + # cast(Any, self.priv) + + # jwt.encode can return 'bytes' or 'str', depending on Python version or type + # hinting or something (not sure what). If it returned 'bytes', convert it to 'str' + # explicitly. + if isinstance(token, bytes): + token = token.decode() + + return token + + def generate_pageserver_token(self) -> str: + return self.generate_token(scope=TokenScope.PAGE_SERVER_API) + + def generate_safekeeper_token(self) -> str: + return self.generate_token(scope=TokenScope.SAFEKEEPER_DATA) + + # generate token giving access to only one tenant + def generate_tenant_token(self, tenant_id: TenantId) -> str: + return self.generate_token(scope=TokenScope.TENANT, tenant_id=str(tenant_id)) + + +# TODO: Replace with `StrEnum` when we upgrade to python 3.11 +class TokenScope(str, Enum): + ADMIN = "admin" + PAGE_SERVER_API = "pageserverapi" + GENERATIONS_API = "generations_api" + SAFEKEEPER_DATA = "safekeeperdata" + TENANT = "tenant" + SCRUBBER = "scrubber" diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index d2caee5992..93b93ff019 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -43,7 +43,6 @@ from urllib.parse import quote, urlparse import asyncpg import backoff import httpx -import jwt import psycopg2 import psycopg2.sql import pytest @@ -60,6 +59,7 @@ from psycopg2.extensions import make_dsn, parse_dsn from urllib3.util.retry import Retry from fixtures import overlayfs +from fixtures.auth_tokens import AuthKeys, TokenScope from fixtures.broker import NeonBroker from fixtures.common_types import Lsn, NodeId, TenantId, TenantShardId, TimelineId from fixtures.endpoint.http import EndpointHttpClient @@ -373,44 +373,6 @@ class PgProtocol: return self.safe_psql(query, log_query=log_query)[0][0] -@dataclass -class AuthKeys: - priv: str - - def generate_token(self, *, scope: TokenScope, **token_data: Any) -> str: - token_data = {key: str(val) for key, val in token_data.items()} - token = jwt.encode({"scope": scope, **token_data}, self.priv, algorithm="EdDSA") - # cast(Any, self.priv) - - # jwt.encode can return 'bytes' or 'str', depending on Python version or type - # hinting or something (not sure what). If it returned 'bytes', convert it to 'str' - # explicitly. 
- if isinstance(token, bytes): - token = token.decode() - - return token - - def generate_pageserver_token(self) -> str: - return self.generate_token(scope=TokenScope.PAGE_SERVER_API) - - def generate_safekeeper_token(self) -> str: - return self.generate_token(scope=TokenScope.SAFEKEEPER_DATA) - - # generate token giving access to only one tenant - def generate_tenant_token(self, tenant_id: TenantId) -> str: - return self.generate_token(scope=TokenScope.TENANT, tenant_id=str(tenant_id)) - - -# TODO: Replace with `StrEnum` when we upgrade to python 3.11 -class TokenScope(str, Enum): - ADMIN = "admin" - PAGE_SERVER_API = "pageserverapi" - GENERATIONS_API = "generations_api" - SAFEKEEPER_DATA = "safekeeperdata" - TENANT = "tenant" - SCRUBBER = "scrubber" - - class NeonEnvBuilder: """ Builder object to create a Neon runtime environment diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index eea05d7548..2e21f8fb46 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -7,6 +7,7 @@ from datetime import datetime, timezone from typing import Any, Dict, List, Optional, Set, Tuple, Union import pytest +from fixtures.auth_tokens import TokenScope from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log @@ -18,7 +19,6 @@ from fixtures.neon_fixtures import ( PgBin, StorageControllerApiException, StorageControllerLeadershipStatus, - TokenScope, last_flush_lsn_upload, ) from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient From b719d58863f90beb46218d1167e4236b05910924 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 17 Sep 2024 09:25:42 +0100 Subject: [PATCH 21/30] storcon: forward requests from stepped down instance to the current leader (#8954) ## Problem It turns out that we can't rely on external orchestration to promptly route traffic to the new leader. This is downtime inducing. Forwarding provides a safe way out. ## Safety We forward when: 1. Request is not one of ["/control/v1/step_down", "/status", "/ready", "/metrics"] 2. Current instance is in [`LeadershipStatus::SteppedDown`] state 3. There is a leader in the database to forward to 4. Leader from step (3) is not the current instance If a storcon instance is persisted in the database, then we know that it is the current leader. There's one exception: time between handling step-down request and the new leader updating the database. Let's treat the happy case first. The stepped down node does not produce any side effects, since all request handling happens on the leader. As for the edge case, we are guaranteed to always have a maximum of two running instances. Hence, if we are in the edge case scenario the leader persisted in the database is the stepped down instance that received the request. Condition (4) above covers this scenario. ## Summary of changes * Conversion utilities for reqwest <-> hyper. I'm not happy with these, but I don't see a better way. Open to suggestions. * Add request forwarding logic * Update each request handler. Again, not happy with this. If anyone knows a nice way to wrap the handlers, lmk.
Me and Joonas tried :/ * Update each handler to maybe forward * Tweak tests to showcase new behaviour --- storage_controller/src/http.rs | 620 +++++++++++++++++- .../regress/test_storage_controller.py | 22 +- 2 files changed, 607 insertions(+), 35 deletions(-) diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index a6638f5191..1745bf5575 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -1,10 +1,11 @@ +use crate::http; use crate::metrics::{ HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup, METRICS_REGISTRY, }; use crate::persistence::SafekeeperPersistence; use crate::reconciler::ReconcileError; -use crate::service::{LeadershipStatus, Service, STARTUP_RECONCILE_TIMEOUT}; +use crate::service::{LeadershipStatus, Service, RECONCILE_TIMEOUT, STARTUP_RECONCILE_TIMEOUT}; use anyhow::Context; use futures::Future; use hyper::header::CONTENT_TYPE; @@ -22,6 +23,7 @@ use pageserver_api::models::{ }; use pageserver_api::shard::TenantShardId; use pageserver_client::{mgmt_api, BlockUnblock}; +use std::str::FromStr; use std::sync::Arc; use std::time::{Duration, Instant}; use tokio_util::sync::CancellationToken; @@ -87,9 +89,16 @@ fn get_state(request: &Request) -> &HttpState { } /// Pageserver calls into this on startup, to learn which tenants it should attach -async fn handle_re_attach(mut req: Request) -> Result, ApiError> { +async fn handle_re_attach(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::GenerationsApi)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let reattach_req = json_request::(&mut req).await?; let state = get_state(&req); json_response(StatusCode::OK, state.service.re_attach(reattach_req).await?) @@ -97,9 +106,16 @@ async fn handle_re_attach(mut req: Request) -> Result, ApiE /// Pageserver calls into this before doing deletions, to confirm that it still /// holds the latest generation for the tenants with deletions enqueued -async fn handle_validate(mut req: Request) -> Result, ApiError> { +async fn handle_validate(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::GenerationsApi)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let validate_req = json_request::(&mut req).await?; let state = get_state(&req); json_response(StatusCode::OK, state.service.validate(validate_req).await?) @@ -108,9 +124,16 @@ async fn handle_validate(mut req: Request) -> Result, ApiEr /// Call into this before attaching a tenant to a pageserver, to acquire a generation number /// (in the real control plane this is unnecessary, because the same program is managing /// generation numbers and doing attachments). 
-async fn handle_attach_hook(mut req: Request) -> Result, ApiError> { +async fn handle_attach_hook(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let attach_req = json_request::(&mut req).await?; let state = get_state(&req); @@ -124,9 +147,16 @@ async fn handle_attach_hook(mut req: Request) -> Result, Ap ) } -async fn handle_inspect(mut req: Request) -> Result, ApiError> { +async fn handle_inspect(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let inspect_req = json_request::(&mut req).await?; let state = get_state(&req); @@ -136,10 +166,17 @@ async fn handle_inspect(mut req: Request) -> Result, ApiErr async fn handle_tenant_create( service: Arc, - mut req: Request, + req: Request, ) -> Result, ApiError> { check_permissions(&req, Scope::PageServerApi)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let create_req = json_request::(&mut req).await?; json_response( @@ -150,11 +187,18 @@ async fn handle_tenant_create( async fn handle_tenant_location_config( service: Arc, - mut req: Request, + req: Request, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; check_permissions(&req, Scope::PageServerApi)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let config_req = json_request::(&mut req).await?; json_response( StatusCode::OK, @@ -166,10 +210,17 @@ async fn handle_tenant_location_config( async fn handle_tenant_config_set( service: Arc, - mut req: Request, + req: Request, ) -> Result, ApiError> { check_permissions(&req, Scope::PageServerApi)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let config_req = json_request::(&mut req).await?; json_response(StatusCode::OK, service.tenant_config_set(config_req).await?) @@ -182,16 +233,30 @@ async fn handle_tenant_config_get( let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; + json_response(StatusCode::OK, service.tenant_config_get(tenant_id)?) 
} async fn handle_tenant_time_travel_remote_storage( service: Arc, - mut req: Request, + req: Request, ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let time_travel_req = json_request::(&mut req).await?; let timestamp_raw = must_get_query_param(&req, "travel_to")?; @@ -232,6 +297,13 @@ async fn handle_tenant_secondary_download( let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; let wait = parse_query_param(&req, "wait_ms")?.map(Duration::from_millis); + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; + let (status, progress) = service.tenant_secondary_download(tenant_id, wait).await?; json_response(map_reqwest_hyper_status(status)?, progress) } @@ -243,6 +315,13 @@ async fn handle_tenant_delete( let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; + let status_code = service .tenant_delete(tenant_id) .await @@ -258,11 +337,18 @@ async fn handle_tenant_delete( async fn handle_tenant_timeline_create( service: Arc, - mut req: Request, + req: Request, ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let create_req = json_request::(&mut req).await?; json_response( StatusCode::CREATED, @@ -277,9 +363,16 @@ async fn handle_tenant_timeline_delete( req: Request, ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + check_permissions(&req, Scope::PageServerApi)?; - let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; // For timeline deletions, which both implement an "initially return 202, then 404 once // we're done" semantic, we wrap with a retry loop to expose a simpler API upstream. 
@@ -337,12 +430,19 @@ async fn handle_tenant_timeline_delete( async fn handle_tenant_timeline_archival_config( service: Arc, - mut req: Request, + req: Request, ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + check_permissions(&req, Scope::PageServerApi)?; - let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; let create_req = json_request::(&mut req).await?; @@ -358,9 +458,16 @@ async fn handle_tenant_timeline_detach_ancestor( req: Request, ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + check_permissions(&req, Scope::PageServerApi)?; - let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; let res = service .tenant_timeline_detach_ancestor(tenant_id, timeline_id) @@ -393,6 +500,13 @@ async fn handle_tenant_timeline_passthrough( let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let Some(path) = req.uri().path_and_query() else { // This should never happen, our request router only calls us if there is a path return Err(ApiError::BadRequest(anyhow::anyhow!("Missing path"))); @@ -460,9 +574,17 @@ async fn handle_tenant_locate( service: Arc, req: Request, ) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::Admin)?; - let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; + json_response(StatusCode::OK, service.tenant_locate(tenant_id)?) } @@ -473,6 +595,14 @@ async fn handle_tenant_describe( check_permissions(&req, Scope::Scrubber)?; let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; + json_response(StatusCode::OK, service.tenant_describe(tenant_id)?) 
} @@ -482,12 +612,26 @@ async fn handle_tenant_list( ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; + json_response(StatusCode::OK, service.tenant_list()) } -async fn handle_node_register(mut req: Request) -> Result, ApiError> { +async fn handle_node_register(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let register_req = json_request::(&mut req).await?; let state = get_state(&req); state.service.node_register(register_req).await?; @@ -497,6 +641,13 @@ async fn handle_node_register(mut req: Request) -> Result, async fn handle_node_list(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let nodes = state.service.node_list().await?; let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::>(); @@ -507,6 +658,13 @@ async fn handle_node_list(req: Request) -> Result, ApiError async fn handle_node_drop(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let node_id: NodeId = parse_request_param(&req, "node_id")?; json_response(StatusCode::OK, state.service.node_drop(node_id).await?) @@ -515,14 +673,28 @@ async fn handle_node_drop(req: Request) -> Result, ApiError async fn handle_node_delete(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let node_id: NodeId = parse_request_param(&req, "node_id")?; json_response(StatusCode::OK, state.service.node_delete(node_id).await?) 
} -async fn handle_node_configure(mut req: Request) -> Result, ApiError> { +async fn handle_node_configure(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let node_id: NodeId = parse_request_param(&req, "node_id")?; let config_req = json_request::(&mut req).await?; if node_id != config_req.node_id { @@ -548,6 +720,13 @@ async fn handle_node_configure(mut req: Request) -> Result, async fn handle_node_status(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let node_id: NodeId = parse_request_param(&req, "node_id")?; @@ -570,6 +749,13 @@ async fn handle_node_shards(req: Request) -> Result, ApiErr async fn handle_get_leader(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let leader = state.service.get_leader().await.map_err(|err| { ApiError::InternalServerError(anyhow::anyhow!( @@ -583,6 +769,13 @@ async fn handle_get_leader(req: Request) -> Result, ApiErro async fn handle_node_drain(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let node_id: NodeId = parse_request_param(&req, "node_id")?; @@ -594,6 +787,13 @@ async fn handle_node_drain(req: Request) -> Result, ApiErro async fn handle_cancel_node_drain(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let node_id: NodeId = parse_request_param(&req, "node_id")?; @@ -605,6 +805,13 @@ async fn handle_cancel_node_drain(req: Request) -> Result, async fn handle_node_fill(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let node_id: NodeId = parse_request_param(&req, "node_id")?; @@ -616,6 +823,13 @@ async fn handle_node_fill(req: Request) -> Result, ApiError async fn handle_cancel_node_fill(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let node_id: NodeId = parse_request_param(&req, "node_id")?; @@ -624,9 +838,16 @@ async fn handle_cancel_node_fill(req: Request) -> Result, A json_response(StatusCode::ACCEPTED, ()) } -async fn handle_metadata_health_update(mut req: Request) -> Result, ApiError> { +async fn handle_metadata_health_update(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Scrubber)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + 
ForwardOutcome::NotForwarded(req) => req, + }; + let update_req = json_request::(&mut req).await?; let state = get_state(&req); @@ -640,6 +861,13 @@ async fn handle_metadata_health_list_unhealthy( ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let unhealthy_tenant_shards = state.service.metadata_health_list_unhealthy().await?; @@ -652,10 +880,17 @@ async fn handle_metadata_health_list_unhealthy( } async fn handle_metadata_health_list_outdated( - mut req: Request, + req: Request, ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let list_outdated_req = json_request::(&mut req).await?; let state = get_state(&req); let health_records = state @@ -671,10 +906,17 @@ async fn handle_metadata_health_list_outdated( async fn handle_tenant_shard_split( service: Arc, - mut req: Request, + req: Request, ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; let split_req = json_request::(&mut req).await?; @@ -686,10 +928,17 @@ async fn handle_tenant_shard_split( async fn handle_tenant_shard_migrate( service: Arc, - mut req: Request, + req: Request, ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; let migrate_req = json_request::(&mut req).await?; json_response( @@ -700,9 +949,16 @@ async fn handle_tenant_shard_migrate( ) } -async fn handle_tenant_update_policy(mut req: Request) -> Result, ApiError> { +async fn handle_tenant_update_policy(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; let update_req = json_request::(&mut req).await?; let state = get_state(&req); @@ -716,9 +972,16 @@ async fn handle_tenant_update_policy(mut req: Request) -> Result) -> Result, ApiError> { +async fn handle_update_preferred_azs(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let azs_req = json_request::(&mut req).await?; let state = get_state(&req); @@ -731,23 +994,46 @@ async fn handle_update_preferred_azs(mut req: Request) -> Result) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); json_response(StatusCode::OK, state.service.step_down().await) } async fn handle_tenant_drop(req: Request) -> Result, ApiError> { - let tenant_id: TenantId = 
parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?) } async fn handle_tenant_import(req: Request) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); json_response( @@ -759,6 +1045,13 @@ async fn handle_tenant_import(req: Request) -> Result, ApiE async fn handle_tenants_dump(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); state.service.tenants_dump() } @@ -766,6 +1059,13 @@ async fn handle_tenants_dump(req: Request) -> Result, ApiEr async fn handle_scheduler_dump(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); state.service.scheduler_dump() } @@ -773,6 +1073,13 @@ async fn handle_scheduler_dump(req: Request) -> Result, Api async fn handle_consistency_check(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); json_response(StatusCode::OK, state.service.consistency_check().await?) @@ -781,19 +1088,40 @@ async fn handle_consistency_check(req: Request) -> Result, async fn handle_reconcile_all(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); json_response(StatusCode::OK, state.service.reconcile_all_now().await?) } /// Status endpoint is just used for checking that our HTTP listener is up -async fn handle_status(_req: Request) -> Result, ApiError> { +async fn handle_status(req: Request) -> Result, ApiError> { + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; + json_response(StatusCode::OK, ()) } /// Readiness endpoint indicates when we're done doing startup I/O (e.g. reconciling /// with remote pageserver nodes). This is intended for use as a kubernetes readiness probe. 
async fn handle_ready(req: Request) -> Result, ApiError> { + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); if state.service.startup_complete.is_ready() { json_response(StatusCode::OK, ()) @@ -816,6 +1144,13 @@ async fn handle_get_safekeeper(req: Request) -> Result, Api let id = parse_request_param::(&req, "id")?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let res = state.service.get_safekeeper(id).await; @@ -847,6 +1182,13 @@ async fn handle_upsert_safekeeper(mut req: Request) -> Result { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); state.service.upsert_safekeeper(body).await?; @@ -925,10 +1267,7 @@ pub fn prologue_leadership_status_check_middleware< let allowed_routes = match leadership_status { LeadershipStatus::Leader => AllowedRoutes::All, - LeadershipStatus::SteppedDown => { - // TODO: does it make sense to allow /status here? - AllowedRoutes::Some(["/control/v1/step_down", "/status", "/metrics"].to_vec()) - } + LeadershipStatus::SteppedDown => AllowedRoutes::All, LeadershipStatus::Candidate => { AllowedRoutes::Some(["/ready", "/status", "/metrics"].to_vec()) } @@ -1005,6 +1344,13 @@ fn epilogue_metrics_middleware pub async fn measured_metrics_handler(req: Request) -> Result, ApiError> { pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4"; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let payload = crate::metrics::METRICS_REGISTRY.encode(&state.neon_metrics); let response = Response::builder() @@ -1032,6 +1378,220 @@ where request_span(request, handler).await } +enum ForwardOutcome { + Forwarded(Result, ApiError>), + NotForwarded(Request), +} + +/// Potentially forward the request to the current storage controler leader. +/// More specifically we forward when: +/// 1. Request is not one of ["/control/v1/step_down", "/status", "/ready", "/metrics"] +/// 2. Current instance is in [`LeadershipStatus::SteppedDown`] state +/// 3. There is a leader in the database to forward to +/// 4. Leader from step (3) is not the current instance +/// +/// Why forward? +/// It turns out that we can't rely on external orchestration to promptly route trafic to the +/// new leader. This is downtime inducing. Forwarding provides a safe way out. +/// +/// Why is it safe? +/// If a storcon instance is persisted in the database, then we know that it is the current leader. +/// There's one exception: time between handling step-down request and the new leader updating the +/// database. +/// +/// Let's treat the happy case first. The stepped down node does not produce any side effects, +/// since all request handling happens on the leader. +/// +/// As for the edge case, we are guaranteed to always have a maximum of two running instances. +/// Hence, if we are in the edge case scenario the leader persisted in the database is the +/// stepped down instance that received the request. Condition (4) above covers this scenario. 
+async fn maybe_forward(req: Request) -> ForwardOutcome { + const NOT_FOR_FORWARD: [&str; 4] = ["/control/v1/step_down", "/status", "/ready", "/metrics"]; + + let uri = req.uri().to_string(); + let uri_for_forward = !NOT_FOR_FORWARD.contains(&uri.as_str()); + + let state = get_state(&req); + let leadership_status = state.service.get_leadership_status(); + + if leadership_status != LeadershipStatus::SteppedDown || !uri_for_forward { + return ForwardOutcome::NotForwarded(req); + } + + let leader = state.service.get_leader().await; + let leader = { + match leader { + Ok(Some(leader)) => leader, + Ok(None) => { + return ForwardOutcome::Forwarded(Err(ApiError::ResourceUnavailable( + "No leader to forward to while in stepped down state".into(), + ))); + } + Err(err) => { + return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError( + anyhow::anyhow!( + "Failed to get leader for forwarding while in stepped down state: {err}" + ), + ))); + } + } + }; + + let cfg = state.service.get_config(); + if let Some(ref self_addr) = cfg.address_for_peers { + let leader_addr = match Uri::from_str(leader.address.as_str()) { + Ok(uri) => uri, + Err(err) => { + return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError( + anyhow::anyhow!( + "Failed to parse leader uri for forwarding while in stepped down state: {err}" + ), + ))); + } + }; + + if *self_addr == leader_addr { + return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!( + "Leader is stepped down instance" + )))); + } + } + + tracing::info!("Forwarding {} to leader at {}", uri, leader.address); + + // Use [`RECONCILE_TIMEOUT`] as the max amount of time a request should block for and + // include some leeway to get the timeout for proxied requests. + const PROXIED_REQUEST_TIMEOUT: Duration = Duration::from_secs(RECONCILE_TIMEOUT.as_secs() + 10); + let client = reqwest::ClientBuilder::new() + .timeout(PROXIED_REQUEST_TIMEOUT) + .build(); + let client = match client { + Ok(client) => client, + Err(err) => { + return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!( + "Failed to build leader client for forwarding while in stepped down state: {err}" + )))); + } + }; + + let request: reqwest::Request = match convert_request(req, &client, leader.address).await { + Ok(r) => r, + Err(err) => { + return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!( + "Failed to convert request for forwarding while in stepped down state: {err}" + )))); + } + }; + + let response = match client.execute(request).await { + Ok(r) => r, + Err(err) => { + return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!( + "Failed to forward while in stepped down state: {err}" + )))); + } + }; + + ForwardOutcome::Forwarded(convert_response(response).await) +} + +/// Convert a [`reqwest::Response`] to a [hyper::Response`] by passing through +/// a stable representation (string, bytes or integer) +/// +/// Ideally, we would not have to do this since both types use the http crate +/// under the hood. However, they use different versions of the crate and keeping +/// second order dependencies in sync is difficult. 
+async fn convert_response(resp: reqwest::Response) -> Result, ApiError> { + use std::str::FromStr; + + let mut builder = hyper::Response::builder().status(resp.status().as_u16()); + for (key, value) in resp.headers().into_iter() { + let key = hyper::header::HeaderName::from_str(key.as_str()).map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!("Response conversion failed: {err}")) + })?; + + let value = hyper::header::HeaderValue::from_bytes(value.as_bytes()).map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!("Response conversion failed: {err}")) + })?; + + builder = builder.header(key, value); + } + + let body = http::Body::wrap_stream(resp.bytes_stream()); + + builder.body(body).map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!("Response conversion failed: {err}")) + }) +} + +/// Convert a [`reqwest::Request`] to a [hyper::Request`] by passing through +/// a stable representation (string, bytes or integer) +/// +/// See [`convert_response`] for why we are doing it this way. +async fn convert_request( + req: hyper::Request, + client: &reqwest::Client, + to_address: String, +) -> Result { + use std::str::FromStr; + + let (parts, body) = req.into_parts(); + let method = reqwest::Method::from_str(parts.method.as_str()).map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}")) + })?; + + let path_and_query = parts.uri.path_and_query().ok_or_else(|| { + ApiError::InternalServerError(anyhow::anyhow!( + "Request conversion failed: no path and query" + )) + })?; + + let uri = reqwest::Url::from_str( + format!( + "{}{}", + to_address.trim_end_matches("/"), + path_and_query.as_str() + ) + .as_str(), + ) + .map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}")) + })?; + + let mut headers = reqwest::header::HeaderMap::new(); + for (key, value) in parts.headers.into_iter() { + let key = match key { + Some(k) => k, + None => { + continue; + } + }; + + let key = reqwest::header::HeaderName::from_str(key.as_str()).map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}")) + })?; + + let value = reqwest::header::HeaderValue::from_bytes(value.as_bytes()).map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}")) + })?; + + headers.insert(key, value); + } + + let body = hyper::body::to_bytes(body).await.map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}")) + })?; + + client + .request(method, uri) + .headers(headers) + .body(body) + .build() + .map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}")) + }) +} + pub fn make_router( service: Arc, auth: Option>, diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 2e21f8fb46..2d72dbb2df 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -2048,8 +2048,11 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder): # Make a change to the tenant config to trigger a slow reconcile virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) virtual_ps_http.patch_tenant_config_client_side(tid, {"compaction_threshold": 5}, None) - env.storage_controller.allowed_errors.append( - ".*Accepted configuration update but reconciliation failed.*" + env.storage_controller.allowed_errors.extend( + [ + ".*Accepted 
configuration update but reconciliation failed.*", + ".*Leader is stepped down instance", + ] ) observed_state = env.storage_controller.step_down() @@ -2072,9 +2075,9 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder): assert "compaction_threshold" in ps_tenant_conf.effective_config assert ps_tenant_conf.effective_config["compaction_threshold"] == 5 - # Validate that the storcon is not replying to the usual requests - # once it has stepped down. - with pytest.raises(StorageControllerApiException, match="stepped_down"): + # Validate that the storcon attempts to forward the request, but stops. + # when it realises it is still the current leader. + with pytest.raises(StorageControllerApiException, match="Leader is stepped down instance"): env.storage_controller.tenant_list() # Validate that we can step down multiple times and the observed state @@ -2221,6 +2224,15 @@ def test_storage_controller_leadership_transfer( env.storage_controller.wait_until_ready() env.storage_controller.consistency_check() + if not step_down_times_out: + # Check that the stepped down instance forwards requests + # to the new leader while it's still running. + storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}") + env.storage_controller.tenant_list() + env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"}) + status = env.storage_controller.node_status(env.pageservers[0].id) + assert status["scheduling"] == "Pause" + if step_down_times_out: env.storage_controller.allowed_errors.extend( [ From cd4276fd656b9e917ee315240d957d344fc2cfe6 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 17 Sep 2024 10:17:48 +0100 Subject: [PATCH 22/30] CI: fix release pipeline (#9017) ## Problem We've got 2 non-blocking failures on the release pipeline: - `promote-compatibility-data` job got skipped _presumably_ because one of the dependencies of `deploy` job (`push-to-acr-dev`) got skipped (https://github.com/neondatabase/neon/pull/8940) - `coverage-report` job fails because we don't build debug artifacts in the release branch (https://github.com/neondatabase/neon/pull/8561) ## Summary of changes - Always run `push-to-acr-dev` / `push-to-acr-prod` jobs, but add `skip_if` parameter to the reusable workflow, which can skip the job internally, without skipping externally - Do not run `coverage-report` on release branches --- .github/workflows/_push-to-acr.yml | 8 +++++++- .github/workflows/build_and_test.yml | 7 ++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/.github/workflows/_push-to-acr.yml b/.github/workflows/_push-to-acr.yml index 415b3d9cc6..7b6eba2c06 100644 --- a/.github/workflows/_push-to-acr.yml +++ b/.github/workflows/_push-to-acr.yml @@ -26,9 +26,15 @@ on: description: Azure tenant ID required: true type: string + skip_if: + description: Skip the job if this expression is true + required: true + type: boolean jobs: push-to-acr: + if: ${{ !inputs.skip_if }} + runs-on: ubuntu-22.04 permissions: contents: read # This is required for actions/checkout @@ -52,5 +58,5 @@ jobs: for image in ${images}; do docker buildx imagetools create \ -t ${{ inputs.registry_name }}.azurecr.io/neondatabase/${image}:${{ inputs.image_tag }} \ - neondatabase/${image}:${{ inputs.image_tag }} + neondatabase/${image}:${{ inputs.image_tag }} done diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 7c06fd9ab8..7ddd624dd5 100644 --- a/.github/workflows/build_and_test.yml +++ 
b/.github/workflows/build_and_test.yml @@ -357,6 +357,7 @@ jobs: }) coverage-report: + if: ${{ !startsWith(github.ref_name, 'release') }} needs: [ check-permissions, build-build-tools-image, build-and-test-locally ] runs-on: [ self-hosted, small ] container: @@ -858,7 +859,6 @@ jobs: done push-to-acr-dev: - if: github.ref_name == 'main' needs: [ tag, promote-images ] uses: ./.github/workflows/_push-to-acr.yml with: @@ -868,9 +868,9 @@ jobs: registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} tenant_id: ${{ vars.AZURE_TENANT_ID }} + skip_if: ${{ github.ref_name != 'main' }} push-to-acr-prod: - if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' needs: [ tag, promote-images ] uses: ./.github/workflows/_push-to-acr.yml with: @@ -880,6 +880,7 @@ jobs: registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }} subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} tenant_id: ${{ vars.AZURE_TENANT_ID }} + skip_if: ${{ !startsWith(github.ref_name, 'release') }} trigger-custom-extensions-build-and-wait: needs: [ check-permissions, tag ] @@ -957,7 +958,7 @@ jobs: deploy: needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ] - if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy') && !failure() && !cancelled() + if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' runs-on: [ self-hosted, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest From d211f00f054b80515684ff0887b76ecec60cc796 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 17 Sep 2024 17:55:45 +0300 Subject: [PATCH 23/30] Remove unnecessary dependencies (#9000) Found by "cargo machete" --- Cargo.lock | 144 ++----------------------- compute_tools/Cargo.toml | 3 - control_plane/Cargo.toml | 5 - control_plane/storcon_cli/Cargo.toml | 3 - libs/compute_api/Cargo.toml | 1 - libs/consumption_metrics/Cargo.toml | 3 - libs/desim/Cargo.toml | 1 - libs/postgres_backend/Cargo.toml | 2 - libs/postgres_ffi/Cargo.toml | 3 - libs/postgres_ffi/wal_craft/Cargo.toml | 1 - libs/pq_proto/Cargo.toml | 2 - libs/remote_storage/Cargo.toml | 3 - libs/safekeeper_api/Cargo.toml | 1 - libs/tracing-utils/Cargo.toml | 5 +- libs/utils/Cargo.toml | 1 - libs/vm_monitor/Cargo.toml | 2 - pageserver/Cargo.toml | 9 -- pageserver/compaction/Cargo.toml | 22 ---- pageserver/ctl/Cargo.toml | 2 - proxy/Cargo.toml | 7 -- safekeeper/Cargo.toml | 5 - storage_broker/Cargo.toml | 2 - storage_controller/Cargo.toml | 2 - storage_controller/client/Cargo.toml | 11 -- storage_scrubber/Cargo.toml | 8 -- workspace_hack/Cargo.toml | 3 + 26 files changed, 15 insertions(+), 236 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3ca6acbc3e..136f07956f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1209,7 +1209,6 @@ dependencies = [ "remote_storage", "serde", "serde_json", - "serde_with", "utils", ] @@ -1218,7 +1217,6 @@ name = "compute_tools" version = "0.1.0" dependencies = [ "anyhow", - "async-compression", "bytes", "cfg-if", "chrono", @@ -1237,7 +1235,6 @@ dependencies = [ "reqwest 0.12.4", "rlimit", "rust-ini", - "serde", "serde_json", "signal-hook", "tar", @@ -1246,7 +1243,6 @@ dependencies = [ "tokio-postgres", "tokio-stream", "tokio-util", - "toml_edit", "tracing", "tracing-opentelemetry", "tracing-subscriber", @@ -1317,12 +1313,9 @@ dependencies = [ name = "consumption_metrics" version = 
"0.1.0" dependencies = [ - "anyhow", "chrono", "rand 0.8.5", "serde", - "serde_with", - "utils", ] [[package]] @@ -1334,9 +1327,7 @@ dependencies = [ "clap", "comfy-table", "compute_api", - "futures", "git-version", - "hex", "humantime", "humantime-serde", "hyper 0.14.26", @@ -1344,7 +1335,6 @@ dependencies = [ "once_cell", "pageserver_api", "pageserver_client", - "postgres", "postgres_backend", "postgres_connection", "regex", @@ -1353,9 +1343,7 @@ dependencies = [ "scopeguard", "serde", "serde_json", - "serde_with", "storage_broker", - "tar", "thiserror", "tokio", "tokio-postgres", @@ -1663,7 +1651,6 @@ dependencies = [ "hex", "parking_lot 0.12.1", "rand 0.8.5", - "scopeguard", "smallvec", "tracing", "utils", @@ -2233,24 +2220,22 @@ checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" [[package]] name = "git-version" -version = "0.3.5" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6b0decc02f4636b9ccad390dcbe77b722a77efedfa393caf8379a51d5c61899" +checksum = "1ad568aa3db0fcbc81f2f116137f263d7304f512a1209b35b85150d3ef88ad19" dependencies = [ "git-version-macro", - "proc-macro-hack", ] [[package]] name = "git-version-macro" -version = "0.3.5" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe69f1cbdb6e28af2bac214e943b99ce8a0a06b447d15d3e61161b0423139f3f" +checksum = "53010ccb100b96a67bc32c0175f0ed1426b31b655d562898e57325f81c023ac0" dependencies = [ - "proc-macro-hack", "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.52", ] [[package]] @@ -2744,19 +2729,6 @@ dependencies = [ "libc", ] -[[package]] -name = "inotify" -version = "0.10.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fdd168d97690d0b8c412d6b6c10360277f4d7ee495c5d0d5d5fe0854923255cc" -dependencies = [ - "bitflags 1.3.2", - "futures-core", - "inotify-sys", - "libc", - "tokio", -] - [[package]] name = "inotify-sys" version = "0.1.5" @@ -3251,7 +3223,7 @@ dependencies = [ "crossbeam-channel", "filetime", "fsevent-sys", - "inotify 0.9.6", + "inotify", "kqueue", "libc", "log", @@ -3642,7 +3614,6 @@ name = "pagectl" version = "0.1.0" dependencies = [ "anyhow", - "bytes", "camino", "clap", "git-version", @@ -3651,7 +3622,6 @@ dependencies = [ "pageserver_api", "postgres_ffi", "remote_storage", - "serde", "serde_json", "svg_fmt", "thiserror", @@ -3670,7 +3640,6 @@ dependencies = [ "arc-swap", "async-compression", "async-stream", - "async-trait", "bit_field", "byteorder", "bytes", @@ -3678,16 +3647,13 @@ dependencies = [ "camino-tempfile", "chrono", "clap", - "const_format", "consumption_metrics", "crc32c", "criterion", - "crossbeam-utils", "either", "enum-map", "enumset", "fail", - "flate2", "futures", "git-version", "hex", @@ -3726,13 +3692,9 @@ dependencies = [ "serde_json", "serde_path_to_error", "serde_with", - "signal-hook", - "smallvec", "storage_broker", "strum", "strum_macros", - "svg_fmt", - "sync_wrapper", "sysinfo", "tenant_size_model", "thiserror", @@ -3746,7 +3708,6 @@ dependencies = [ "tokio-util", "toml_edit", "tracing", - "twox-hash", "url", "utils", "walkdir", @@ -3810,44 +3771,22 @@ name = "pageserver_compaction" version = "0.1.0" dependencies = [ "anyhow", - "async-compression", "async-stream", - "byteorder", - "bytes", - "chrono", "clap", - "const_format", - "consumption_metrics", "criterion", - "crossbeam-utils", - "either", - "fail", - "flate2", "futures", "git-version", - "hex", "hex-literal", - "humantime", - "humantime-serde", "itertools 0.10.5", - 
"metrics", "once_cell", "pageserver_api", "pin-project-lite", "rand 0.8.5", - "smallvec", "svg_fmt", - "sync_wrapper", - "thiserror", "tokio", - "tokio-io-timeout", - "tokio-util", "tracing", - "tracing-error", "tracing-subscriber", - "url", "utils", - "walkdir", "workspace_hack", ] @@ -4164,9 +4103,7 @@ name = "postgres_backend" version = "0.1.0" dependencies = [ "anyhow", - "async-trait", "bytes", - "futures", "once_cell", "pq_proto", "rustls 0.22.4", @@ -4199,16 +4136,13 @@ version = "0.1.0" dependencies = [ "anyhow", "bindgen", - "byteorder", "bytes", "crc32c", "env_logger", - "hex", "log", "memoffset 0.8.0", "once_cell", "postgres", - "rand 0.8.5", "regex", "serde", "thiserror", @@ -4243,13 +4177,11 @@ dependencies = [ "byteorder", "bytes", "itertools 0.10.5", - "pin-project-lite", "postgres-protocol", "rand 0.8.5", "serde", "thiserror", "tokio", - "tracing", ] [[package]] @@ -4281,12 +4213,6 @@ dependencies = [ "elliptic-curve 0.13.8", ] -[[package]] -name = "proc-macro-hack" -version = "0.5.20+deprecated" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" - [[package]] name = "proc-macro2" version = "1.0.78" @@ -4405,7 +4331,6 @@ dependencies = [ "aws-config", "aws-sdk-iam", "aws-sigv4", - "aws-types", "base64 0.13.1", "bstr", "bytes", @@ -4414,7 +4339,6 @@ dependencies = [ "chrono", "clap", "consumption_metrics", - "crossbeam-deque", "dashmap", "ecdsa 0.16.9", "env_logger", @@ -4440,11 +4364,9 @@ dependencies = [ "jose-jwa", "jose-jwk", "lasso", - "md5", "measured", "metrics", "once_cell", - "opentelemetry", "p256 0.13.2", "parking_lot 0.12.1", "parquet", @@ -4465,7 +4387,6 @@ dependencies = [ "reqwest-middleware", "reqwest-retry", "reqwest-tracing", - "routerify", "rsa", "rstest", "rustc-hash", @@ -4481,7 +4402,6 @@ dependencies = [ "smol_str", "socket2 0.5.5", "subtle", - "task-local-extensions", "thiserror", "tikv-jemalloc-ctl", "tikv-jemallocator", @@ -4491,7 +4411,6 @@ dependencies = [ "tokio-rustls 0.25.0", "tokio-tungstenite", "tokio-util", - "tower-service", "tracing", "tracing-opentelemetry", "tracing-subscriber", @@ -4781,7 +4700,6 @@ dependencies = [ "async-stream", "async-trait", "aws-config", - "aws-credential-types", "aws-sdk-s3", "aws-smithy-async", "aws-smithy-types", @@ -4795,7 +4713,6 @@ dependencies = [ "futures", "futures-util", "http-types", - "humantime", "humantime-serde", "hyper 0.14.26", "itertools 0.10.5", @@ -5275,14 +5192,12 @@ version = "0.1.0" dependencies = [ "anyhow", "async-stream", - "async-trait", "byteorder", "bytes", "camino", "camino-tempfile", "chrono", "clap", - "const_format", "crc32c", "desim", "fail", @@ -5308,9 +5223,7 @@ dependencies = [ "sd-notify", "serde", "serde_json", - "serde_with", "sha2", - "signal-hook", "storage_broker", "strum", "strum_macros", @@ -5321,7 +5234,6 @@ dependencies = [ "tokio-stream", "tokio-tar", "tokio-util", - "toml_edit", "tracing", "tracing-subscriber", "url", @@ -5336,7 +5248,6 @@ version = "0.1.0" dependencies = [ "const_format", "serde", - "serde_with", "utils", ] @@ -5865,7 +5776,6 @@ version = "0.1.0" dependencies = [ "anyhow", "async-stream", - "bytes", "clap", "const_format", "futures", @@ -5879,7 +5789,6 @@ dependencies = [ "parking_lot 0.12.1", "prost", "tokio", - "tokio-stream", "tonic", "tonic-build", "tracing", @@ -5892,9 +5801,7 @@ name = "storage_controller" version = "0.1.0" dependencies = [ "anyhow", - "aws-config", "bytes", - "camino", "chrono", "clap", "control_plane", @@ -5935,20 +5842,9 @@ 
dependencies = [ name = "storage_controller_client" version = "0.1.0" dependencies = [ - "anyhow", - "bytes", - "futures", - "pageserver_api", "pageserver_client", - "postgres", "reqwest 0.12.4", "serde", - "thiserror", - "tokio", - "tokio-postgres", - "tokio-stream", - "tokio-util", - "utils", "workspace_hack", ] @@ -5960,13 +5856,9 @@ dependencies = [ "async-stream", "aws-config", "aws-sdk-s3", - "aws-smithy-async", - "bincode", - "bytes", "camino", "chrono", "clap", - "crc32c", "either", "futures", "futures-util", @@ -5978,20 +5870,16 @@ dependencies = [ "pageserver", "pageserver_api", "postgres_ffi", - "rand 0.8.5", "remote_storage", "reqwest 0.12.4", "rustls 0.22.4", "rustls-native-certs 0.7.0", "serde", "serde_json", - "serde_with", "storage_controller_client", - "thiserror", "tokio", "tokio-postgres", "tokio-postgres-rustls", - "tokio-rustls 0.25.0", "tokio-stream", "tokio-util", "tracing", @@ -6010,14 +5898,11 @@ dependencies = [ "comfy-table", "futures", "humantime", - "hyper 0.14.26", "pageserver_api", "pageserver_client", "reqwest 0.12.4", - "serde", "serde_json", "storage_controller_client", - "thiserror", "tokio", "tracing", "utils", @@ -6140,15 +6025,6 @@ dependencies = [ "xattr", ] -[[package]] -name = "task-local-extensions" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba323866e5d033818e3240feeb9f7db2c4296674e4d9e16b97b7bf8f490434e8" -dependencies = [ - "pin-utils", -] - [[package]] name = "tempfile" version = "3.9.0" @@ -6739,7 +6615,6 @@ dependencies = [ "opentelemetry", "opentelemetry-otlp", "opentelemetry-semantic-conventions", - "reqwest 0.12.4", "tokio", "tracing", "tracing-opentelemetry", @@ -6943,7 +6818,6 @@ dependencies = [ "serde_assert", "serde_json", "serde_path_to_error", - "serde_with", "signal-hook", "strum", "strum_macros", @@ -6999,13 +6873,11 @@ dependencies = [ "cgroups-rs", "clap", "futures", - "inotify 0.10.2", "serde", "serde_json", "sysinfo", "tokio", "tokio-postgres", - "tokio-stream", "tokio-util", "tracing", "tracing-subscriber", @@ -7032,7 +6904,6 @@ dependencies = [ "clap", "env_logger", "log", - "once_cell", "postgres", "postgres_ffi", "regex", @@ -7555,6 +7426,7 @@ dependencies = [ "digest", "either", "fail", + "futures", "futures-channel", "futures-executor", "futures-io", @@ -7610,6 +7482,8 @@ dependencies = [ "tower", "tracing", "tracing-core", + "tracing-log", + "tracing-subscriber", "url", "uuid", "zeroize", diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 8af0ed43ce..00a82e4be6 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -11,7 +11,6 @@ testing = [] [dependencies] anyhow.workspace = true -async-compression.workspace = true chrono.workspace = true cfg-if.workspace = true clap.workspace = true @@ -24,7 +23,6 @@ num_cpus.workspace = true opentelemetry.workspace = true postgres.workspace = true regex.workspace = true -serde.workspace = true serde_json.workspace = true signal-hook.workspace = true tar.workspace = true @@ -43,7 +41,6 @@ url.workspace = true compute_api.workspace = true utils.workspace = true workspace_hack.workspace = true -toml_edit.workspace = true remote_storage = { version = "0.1", path = "../libs/remote_storage/" } vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" } zstd = "0.13" diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 6fca59b368..c185d20484 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -9,13 +9,10 @@ anyhow.workspace = true camino.workspace = true 
clap.workspace = true comfy-table.workspace = true -futures.workspace = true git-version.workspace = true humantime.workspace = true nix.workspace = true once_cell.workspace = true -postgres.workspace = true -hex.workspace = true humantime-serde.workspace = true hyper.workspace = true regex.workspace = true @@ -23,8 +20,6 @@ reqwest = { workspace = true, features = ["blocking", "json"] } scopeguard.workspace = true serde.workspace = true serde_json.workspace = true -serde_with.workspace = true -tar.workspace = true thiserror.workspace = true toml.workspace = true toml_edit.workspace = true diff --git a/control_plane/storcon_cli/Cargo.toml b/control_plane/storcon_cli/Cargo.toml index be69208d0d..ce89116691 100644 --- a/control_plane/storcon_cli/Cargo.toml +++ b/control_plane/storcon_cli/Cargo.toml @@ -11,14 +11,11 @@ clap.workspace = true comfy-table.workspace = true futures.workspace = true humantime.workspace = true -hyper.workspace = true pageserver_api.workspace = true pageserver_client.workspace = true reqwest.workspace = true -serde.workspace = true serde_json = { workspace = true, features = ["raw_value"] } storage_controller_client.workspace = true -thiserror.workspace = true tokio.workspace = true tracing.workspace = true utils.workspace = true diff --git a/libs/compute_api/Cargo.toml b/libs/compute_api/Cargo.toml index 8aaa481f8c..c0ec40a6c2 100644 --- a/libs/compute_api/Cargo.toml +++ b/libs/compute_api/Cargo.toml @@ -8,7 +8,6 @@ license.workspace = true anyhow.workspace = true chrono.workspace = true serde.workspace = true -serde_with.workspace = true serde_json.workspace = true regex.workspace = true diff --git a/libs/consumption_metrics/Cargo.toml b/libs/consumption_metrics/Cargo.toml index a40b74b952..0e517e3856 100644 --- a/libs/consumption_metrics/Cargo.toml +++ b/libs/consumption_metrics/Cargo.toml @@ -5,9 +5,6 @@ edition = "2021" license = "Apache-2.0" [dependencies] -anyhow.workspace = true chrono = { workspace = true, features = ["serde"] } rand.workspace = true serde.workspace = true -serde_with.workspace = true -utils.workspace = true diff --git a/libs/desim/Cargo.toml b/libs/desim/Cargo.toml index 0c4be90267..473f3a2a13 100644 --- a/libs/desim/Cargo.toml +++ b/libs/desim/Cargo.toml @@ -12,5 +12,4 @@ bytes.workspace = true utils.workspace = true parking_lot.workspace = true hex.workspace = true -scopeguard.workspace = true smallvec = { workspace = true, features = ["write"] } diff --git a/libs/postgres_backend/Cargo.toml b/libs/postgres_backend/Cargo.toml index f6854328fc..a0c87263ed 100644 --- a/libs/postgres_backend/Cargo.toml +++ b/libs/postgres_backend/Cargo.toml @@ -5,10 +5,8 @@ edition.workspace = true license.workspace = true [dependencies] -async-trait.workspace = true anyhow.workspace = true bytes.workspace = true -futures.workspace = true rustls.workspace = true serde.workspace = true thiserror.workspace = true diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml index ee69878f69..ef17833a48 100644 --- a/libs/postgres_ffi/Cargo.toml +++ b/libs/postgres_ffi/Cargo.toml @@ -5,13 +5,10 @@ edition.workspace = true license.workspace = true [dependencies] -rand.workspace = true regex.workspace = true bytes.workspace = true -byteorder.workspace = true anyhow.workspace = true crc32c.workspace = true -hex.workspace = true once_cell.workspace = true log.workspace = true memoffset.workspace = true diff --git a/libs/postgres_ffi/wal_craft/Cargo.toml b/libs/postgres_ffi/wal_craft/Cargo.toml index 29dd01a936..14c7d2e340 100644 --- 
a/libs/postgres_ffi/wal_craft/Cargo.toml +++ b/libs/postgres_ffi/wal_craft/Cargo.toml @@ -9,7 +9,6 @@ anyhow.workspace = true clap.workspace = true env_logger.workspace = true log.workspace = true -once_cell.workspace = true postgres.workspace = true postgres_ffi.workspace = true camino-tempfile.workspace = true diff --git a/libs/pq_proto/Cargo.toml b/libs/pq_proto/Cargo.toml index 66bbe03ebc..9524a1490d 100644 --- a/libs/pq_proto/Cargo.toml +++ b/libs/pq_proto/Cargo.toml @@ -8,10 +8,8 @@ license.workspace = true bytes.workspace = true byteorder.workspace = true itertools.workspace = true -pin-project-lite.workspace = true postgres-protocol.workspace = true rand.workspace = true tokio = { workspace = true, features = ["io-util"] } -tracing.workspace = true thiserror.workspace = true serde.workspace = true diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 02adee058f..f48f1801a4 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -13,14 +13,11 @@ aws-smithy-async.workspace = true aws-smithy-types.workspace = true aws-config.workspace = true aws-sdk-s3.workspace = true -aws-credential-types.workspace = true bytes.workspace = true camino = { workspace = true, features = ["serde1"] } -humantime.workspace = true humantime-serde.workspace = true hyper = { workspace = true, features = ["stream"] } futures.workspace = true -rand.workspace = true serde.workspace = true serde_json.workspace = true tokio = { workspace = true, features = ["sync", "fs", "io-util"] } diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml index e1f4bcca46..14811232d3 100644 --- a/libs/safekeeper_api/Cargo.toml +++ b/libs/safekeeper_api/Cargo.toml @@ -6,6 +6,5 @@ license.workspace = true [dependencies] serde.workspace = true -serde_with.workspace = true const_format.workspace = true utils.workspace = true diff --git a/libs/tracing-utils/Cargo.toml b/libs/tracing-utils/Cargo.toml index 5ea8db6b42..05eb538d42 100644 --- a/libs/tracing-utils/Cargo.toml +++ b/libs/tracing-utils/Cargo.toml @@ -9,8 +9,9 @@ hyper.workspace = true opentelemetry = { workspace = true, features=["rt-tokio"] } opentelemetry-otlp = { workspace = true, default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } opentelemetry-semantic-conventions.workspace = true -reqwest = { workspace = true, default-features = false, features = ["rustls-tls"] } tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tracing.workspace = true tracing-opentelemetry.workspace = true -tracing-subscriber.workspace = true + +[dev-dependencies] +tracing-subscriber.workspace = true # For examples in docs diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 19deaab63f..f199b15554 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -42,7 +42,6 @@ tracing.workspace = true tracing-error.workspace = true tracing-subscriber = { workspace = true, features = ["json", "registry"] } rand.workspace = true -serde_with.workspace = true strum.workspace = true strum_macros.workspace = true url.workspace = true diff --git a/libs/vm_monitor/Cargo.toml b/libs/vm_monitor/Cargo.toml index 46e9f880a1..ba73902d38 100644 --- a/libs/vm_monitor/Cargo.toml +++ b/libs/vm_monitor/Cargo.toml @@ -15,13 +15,11 @@ anyhow.workspace = true axum.workspace = true clap.workspace = true futures.workspace = true -inotify.workspace = true serde.workspace = true serde_json.workspace = true sysinfo.workspace = true tokio = { workspace = true, features = 
["rt-multi-thread"] } tokio-postgres.workspace = true -tokio-stream.workspace = true tokio-util.workspace = true tracing.workspace = true tracing-subscriber.workspace = true diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 24373afca3..0eb48d6823 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -15,7 +15,6 @@ anyhow.workspace = true arc-swap.workspace = true async-compression.workspace = true async-stream.workspace = true -async-trait.workspace = true bit_field.workspace = true byteorder.workspace = true bytes.workspace = true @@ -23,12 +22,9 @@ camino.workspace = true camino-tempfile.workspace = true chrono = { workspace = true, features = ["serde"] } clap = { workspace = true, features = ["string"] } -const_format.workspace = true consumption_metrics.workspace = true crc32c.workspace = true -crossbeam-utils.workspace = true either.workspace = true -flate2.workspace = true fail.workspace = true futures.workspace = true git-version.workspace = true @@ -57,10 +53,6 @@ serde.workspace = true serde_json = { workspace = true, features = ["raw_value"] } serde_path_to_error.workspace = true serde_with.workspace = true -signal-hook.workspace = true -smallvec = { workspace = true, features = ["write"] } -svg_fmt.workspace = true -sync_wrapper.workspace = true sysinfo.workspace = true tokio-tar.workspace = true thiserror.workspace = true @@ -73,7 +65,6 @@ tokio-stream.workspace = true tokio-util.workspace = true toml_edit = { workspace = true, features = [ "serde" ] } tracing.workspace = true -twox-hash.workspace = true url.workspace = true walkdir.workspace = true metrics.workspace = true diff --git a/pageserver/compaction/Cargo.toml b/pageserver/compaction/Cargo.toml index 0fd1d81845..52b58fc298 100644 --- a/pageserver/compaction/Cargo.toml +++ b/pageserver/compaction/Cargo.toml @@ -9,41 +9,19 @@ default = [] [dependencies] anyhow.workspace = true -async-compression.workspace = true async-stream.workspace = true -byteorder.workspace = true -bytes.workspace = true -chrono = { workspace = true, features = ["serde"] } clap = { workspace = true, features = ["string"] } -const_format.workspace = true -consumption_metrics.workspace = true -crossbeam-utils.workspace = true -either.workspace = true -flate2.workspace = true -fail.workspace = true futures.workspace = true git-version.workspace = true -hex.workspace = true -humantime.workspace = true -humantime-serde.workspace = true itertools.workspace = true once_cell.workspace = true pageserver_api.workspace = true pin-project-lite.workspace = true rand.workspace = true -smallvec = { workspace = true, features = ["write"] } svg_fmt.workspace = true -sync_wrapper.workspace = true -thiserror.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] } -tokio-io-timeout.workspace = true -tokio-util.workspace = true tracing.workspace = true -tracing-error.workspace = true tracing-subscriber.workspace = true -url.workspace = true -walkdir.workspace = true -metrics.workspace = true utils.workspace = true workspace_hack.workspace = true diff --git a/pageserver/ctl/Cargo.toml b/pageserver/ctl/Cargo.toml index be5626040b..9592002de1 100644 --- a/pageserver/ctl/Cargo.toml +++ b/pageserver/ctl/Cargo.toml @@ -8,7 +8,6 @@ license.workspace = true [dependencies] anyhow.workspace = true -bytes.workspace = true camino.workspace = true clap = { workspace = true, features = ["string"] } git-version.workspace = true @@ -24,5 +23,4 @@ toml_edit.workspace = true utils.workspace = true 
svg_fmt.workspace = true workspace_hack.workspace = true -serde.workspace = true serde_json.workspace = true diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 21d92abb20..6703eb06eb 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -18,7 +18,6 @@ atomic-take.workspace = true aws-config.workspace = true aws-sdk-iam.workspace = true aws-sigv4.workspace = true -aws-types.workspace = true base64.workspace = true bstr.workspace = true bytes = { workspace = true, features = ["serde"] } @@ -26,7 +25,6 @@ camino.workspace = true chrono.workspace = true clap.workspace = true consumption_metrics.workspace = true -crossbeam-deque.workspace = true dashmap.workspace = true env_logger.workspace = true framed-websockets.workspace = true @@ -48,11 +46,9 @@ indexmap.workspace = true ipnet.workspace = true itertools.workspace = true lasso = { workspace = true, features = ["multi-threaded"] } -md5.workspace = true measured = { workspace = true, features = ["lasso"] } metrics.workspace = true once_cell.workspace = true -opentelemetry.workspace = true parking_lot.workspace = true parquet.workspace = true parquet_derive.workspace = true @@ -67,7 +63,6 @@ reqwest.workspace = true reqwest-middleware = { workspace = true, features = ["json"] } reqwest-retry.workspace = true reqwest-tracing.workspace = true -routerify.workspace = true rustc-hash.workspace = true rustls-pemfile.workspace = true rustls.workspace = true @@ -79,7 +74,6 @@ smol_str.workspace = true smallvec.workspace = true socket2.workspace = true subtle.workspace = true -task-local-extensions.workspace = true thiserror.workspace = true tikv-jemallocator.workspace = true tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] } @@ -88,7 +82,6 @@ tokio-postgres-rustls.workspace = true tokio-rustls.workspace = true tokio-util.workspace = true tokio = { workspace = true, features = ["signal"] } -tower-service.workspace = true tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 0fdb3147bf..daf21c70b0 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -13,14 +13,12 @@ testing = ["fail/failpoints"] [dependencies] async-stream.workspace = true anyhow.workspace = true -async-trait.workspace = true byteorder.workspace = true bytes.workspace = true camino.workspace = true camino-tempfile.workspace = true chrono.workspace = true clap = { workspace = true, features = ["derive"] } -const_format.workspace = true crc32c.workspace = true fail.workspace = true git-version.workspace = true @@ -38,8 +36,6 @@ scopeguard.workspace = true reqwest = { workspace = true, features = ["json"] } serde.workspace = true serde_json.workspace = true -serde_with.workspace = true -signal-hook.workspace = true strum.workspace = true strum_macros.workspace = true thiserror.workspace = true @@ -48,7 +44,6 @@ tokio-util = { workspace = true } tokio-io-timeout.workspace = true tokio-postgres.workspace = true tokio-tar.workspace = true -toml_edit.workspace = true tracing.workspace = true url.workspace = true metrics.workspace = true diff --git a/storage_broker/Cargo.toml b/storage_broker/Cargo.toml index ac4b00669e..82ec0aa272 100644 --- a/storage_broker/Cargo.toml +++ b/storage_broker/Cargo.toml @@ -10,7 +10,6 @@ bench = [] [dependencies] anyhow.workspace = true async-stream.workspace = true -bytes.workspace = true clap = { workspace = true, features = ["derive"] } const_format.workspace = true futures.workspace = true @@ -24,7 +23,6 @@ 
parking_lot.workspace = true prost.workspace = true tonic.workspace = true tokio = { workspace = true, features = ["rt-multi-thread"] } -tokio-stream.workspace = true tracing.workspace = true metrics.workspace = true utils.workspace = true diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index ecaac04915..a96d64e096 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -15,9 +15,7 @@ testing = [] [dependencies] anyhow.workspace = true -aws-config.workspace = true bytes.workspace = true -camino.workspace = true chrono.workspace = true clap.workspace = true fail.workspace = true diff --git a/storage_controller/client/Cargo.toml b/storage_controller/client/Cargo.toml index e7a4264fd0..9fa89176af 100644 --- a/storage_controller/client/Cargo.toml +++ b/storage_controller/client/Cargo.toml @@ -5,18 +5,7 @@ edition.workspace = true license.workspace = true [dependencies] -pageserver_api.workspace = true pageserver_client.workspace = true -thiserror.workspace = true reqwest.workspace = true -utils.workspace = true serde.workspace = true workspace_hack = { version = "0.1", path = "../../workspace_hack" } -tokio-postgres.workspace = true -tokio-stream.workspace = true -tokio.workspace = true -futures.workspace = true -tokio-util.workspace = true -anyhow.workspace = true -postgres.workspace = true -bytes.workspace = true diff --git a/storage_scrubber/Cargo.toml b/storage_scrubber/Cargo.toml index d19119990b..f9987662b9 100644 --- a/storage_scrubber/Cargo.toml +++ b/storage_scrubber/Cargo.toml @@ -6,21 +6,13 @@ license.workspace = true [dependencies] aws-sdk-s3.workspace = true -aws-smithy-async.workspace = true either.workspace = true -tokio-rustls.workspace = true anyhow.workspace = true git-version.workspace = true hex.workspace = true humantime.workspace = true -thiserror.workspace = true -rand.workspace = true -bytes.workspace = true -bincode.workspace = true -crc32c.workspace = true serde.workspace = true serde_json.workspace = true -serde_with.workspace = true workspace_hack.workspace = true utils.workspace = true async-stream.workspace = true diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 140c43639e..662916d42c 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -38,6 +38,7 @@ deranged = { version = "0.3", default-features = false, features = ["powerfmt", digest = { version = "0.10", features = ["mac", "oid", "std"] } either = { version = "1" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } +futures = { version = "0.3" } futures-channel = { version = "0.3", features = ["sink"] } futures-executor = { version = "0.3" } futures-io = { version = "0.3" } @@ -88,6 +89,8 @@ tonic = { version = "0.9", features = ["tls-roots"] } tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "timeout", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } +tracing-log = { version = "0.1", default-features = false, features = ["log-tracer", "std"] } +tracing-subscriber = { version = "0.3", default-features = false, features = ["env-filter", "fmt", "json", "smallvec", "tracing-log"] } url = { version = "2", features = ["serde"] } uuid = { version = "1", features = ["serde", "v4", "v7"] } zeroize = { version = "1", features = ["derive", "serde"] } From 6138eb50e9fc5fb6bc7af57cb95cbad8aaedc186 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 16 Sep 2024 17:53:57 -0500 Subject: [PATCH 
24/30] Fix test code related to migrations We added another migration in 5876c441abc973acca60882192ad46333c075abd, but didn't bump this value. This had no effect, but best to fix it anyway. Signed-off-by: Tristan Partin --- test_runner/fixtures/neon_fixtures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 93b93ff019..69284bfdfc 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4094,7 +4094,7 @@ class Endpoint(PgProtocol, LogUtils): json.dump(dict(data_dict, **kwargs), file, indent=4) # Please note: Migrations only run if pg_skip_catalog_updates is false - def wait_for_migrations(self, num_migrations: int = 10): + def wait_for_migrations(self, num_migrations: int = 11): with self.cursor() as cur: def check_migrations_done(): From a1b71b73fe8f1193ef841604af962053c0bf1061 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 17 Sep 2024 19:15:01 +0200 Subject: [PATCH 25/30] Rename some S3 usages to "remote storage" in exposed messages (#8999) In exposed messages like log messages we mentioned "S3", which is not entirely accurate as we support Azure blob storage now as well. --- pageserver/src/consumption_metrics.rs | 2 +- pageserver/src/metrics.rs | 2 +- test_runner/fixtures/pageserver/many_tenants.py | 2 +- test_runner/regress/test_pageserver_metric_collection.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 64a267e0e4..0c7630edca 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -178,7 +178,7 @@ async fn collect_metrics( ) .await; if let Err(e) = res { - tracing::error!("failed to upload to S3: {e:#}"); + tracing::error!("failed to upload to remote storage: {e:#}"); } } }; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 9197505876..72229d80be 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1777,7 +1777,7 @@ pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| { .expect("failed to define a metric"), upload_heatmap_duration: register_histogram!( "pageserver_secondary_upload_heatmap_duration", - "Time to build and upload a heatmap, including any waiting inside the S3 client" + "Time to build and upload a heatmap, including any waiting inside the remote storage client" ) .expect("failed to define a metric"), download_heatmap: register_int_counter!( diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py index 3e0ffabf74..97e63ed4ba 100644 --- a/test_runner/fixtures/pageserver/many_tenants.py +++ b/test_runner/fixtures/pageserver/many_tenants.py @@ -39,7 +39,7 @@ def single_timeline( log.info("detach template tenant form pageserver") env.pageserver.tenant_detach(template_tenant) - log.info(f"duplicating template tenant {ncopies} times in S3") + log.info(f"duplicating template tenant {ncopies} times in remote storage") tenants = fixtures.pageserver.remote_storage.duplicate_tenant(env, template_tenant, ncopies) # In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done. 
diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py index 24a37b04ec..37ab51f9fb 100644 --- a/test_runner/regress/test_pageserver_metric_collection.py +++ b/test_runner/regress/test_pageserver_metric_collection.py @@ -74,7 +74,7 @@ def test_metric_collection( env.pageserver.allowed_errors.extend( [ ".*metrics endpoint refused the sent metrics*", - ".*metrics_collection: failed to upload to S3: Failed to upload data of length .* to storage path.*", + ".*metrics_collection: failed to upload to remote storage: Failed to upload data of length .* to storage path.*", ] ) From d78f5ce6da2866cb982203d1bab33c49a990cd5f Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 17 Sep 2024 18:40:05 +0100 Subject: [PATCH 26/30] CI: don't fetch the whole git history if it's not required (#9021) ## Problem We do use `actions/checkout` with `fetch-depth: 0` when it's not required ## Summary of changes - Remove unneeded `fetch-depth: 0` - Add a comment if `fetch-depth: 0` is required --- .github/workflows/build_and_test.yml | 31 +++++++------------------ .github/workflows/trigger-e2e-tests.yml | 4 ++-- 2 files changed, 11 insertions(+), 24 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 7ddd624dd5..d46b8dc1f5 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -54,8 +54,8 @@ jobs: build-tag: ${{steps.build-tag.outputs.tag}} steps: - - name: Checkout - uses: actions/checkout@v4 + # Need `fetch-depth: 0` to count the number of commits in the branch + - uses: actions/checkout@v4 with: fetch-depth: 0 @@ -374,8 +374,8 @@ jobs: coverage-html: ${{ steps.upload-coverage-report-new.outputs.report-url }} coverage-json: ${{ steps.upload-coverage-report-new.outputs.summary-json }} steps: - - name: Checkout - uses: actions/checkout@v4 + # Need `fetch-depth: 0` for differential coverage (to get diff between two commits) + - uses: actions/checkout@v4 with: submodules: true fetch-depth: 0 @@ -476,11 +476,9 @@ jobs: runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} steps: - - name: Checkout - uses: actions/checkout@v4 + - uses: actions/checkout@v4 with: submodules: true - fetch-depth: 0 - uses: ./.github/actions/set-docker-config-dir - uses: docker/setup-buildx-action@v3 @@ -555,11 +553,9 @@ jobs: runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} steps: - - name: Checkout - uses: actions/checkout@v4 + - uses: actions/checkout@v4 with: submodules: true - fetch-depth: 0 - uses: ./.github/actions/set-docker-config-dir - uses: docker/setup-buildx-action@v3 @@ -706,10 +702,7 @@ jobs: VM_BUILDER_VERSION: v0.29.3 steps: - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 + - uses: actions/checkout@v4 - name: Downloading vm-builder run: | @@ -749,10 +742,7 @@ jobs: runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} steps: - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 + - uses: actions/checkout@v4 - uses: ./.github/actions/set-docker-config-dir - uses: docker/login-action@v3 @@ -977,10 +967,7 @@ jobs: git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" done - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 + - uses: actions/checkout@v4 - name: Trigger deploy workflow env: 
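For reference, the two checkout patterns this change converges on look roughly like the sketch below; it is an illustrative example rather than part of the diff, and the comments paraphrase the ones added above.

      # Shallow clone (the default) is enough for jobs that only build and test the tree
      - uses: actions/checkout@v4
        with:
          submodules: true

      # `fetch-depth: 0` is kept only where full history is required,
      # e.g. counting commits in the branch for the build tag, or
      # computing differential coverage between two commits
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
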
diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index 6fbe785c56..b299cf9b99 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -34,8 +34,8 @@ jobs: build-tag: ${{ steps.build-tag.outputs.tag }} steps: - - name: Checkout - uses: actions/checkout@v4 + # Need `fetch-depth: 0` to count the number of commits in the branch + - uses: actions/checkout@v4 with: fetch-depth: 0 From 3cd2a3f9317f54e75c00a813680060dcaa612a36 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 17 Sep 2024 20:16:33 +0100 Subject: [PATCH 27/30] refactor(walredo): process launch & kill-on-error machinery (#8951) Immediate benefit: easier to spot what's going on. Later benefit: use the extracted method in PR - https://github.com/neondatabase/neon/pull/8952 which adds a `ping` command to walredo. Found this useful during investigation https://github.com/neondatabase/cloud/issues/16886. --- pageserver/src/walredo.rs | 255 +++++++++++++++++++++----------------- 1 file changed, 140 insertions(+), 115 deletions(-) diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index a36955fa21..0fe7def8b0 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -35,6 +35,7 @@ use anyhow::Context; use bytes::{Bytes, BytesMut}; use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus}; use pageserver_api::shard::TenantShardId; +use std::future::Future; use std::sync::Arc; use std::time::Duration; use std::time::Instant; @@ -296,6 +297,97 @@ impl PostgresRedoManager { } } + async fn do_with_walredo_process< + F: FnOnce(Arc<Process>) -> Fut, + Fut: Future<Output = Result<O, Error>>, + O, + >( + &self, + pg_version: u32, + closure: F, + ) -> Result<O, Error> { + let proc: Arc<Process> = match self.redo_process.get_or_init_detached().await { + Ok(guard) => match &*guard { + ProcessOnceCell::Spawned(proc) => Arc::clone(proc), + ProcessOnceCell::ManagerShutDown => { + return Err(Error::Cancelled); + } + }, + Err(permit) => { + let start = Instant::now(); + // acquire guard before spawning process, so that we don't spawn new processes + // if the gate is already closed. + let _launched_processes_guard = match self.launched_processes.enter() { + Ok(guard) => guard, + Err(GateError::GateClosed) => unreachable!( + "shutdown sets the once cell to `ManagerShutDown` state before closing the gate" + ), + }; + let proc = Arc::new(Process { + process: process::WalRedoProcess::launch( + self.conf, + self.tenant_shard_id, + pg_version, + ) + .context("launch walredo process")?, + _launched_processes_guard, + }); + let duration = start.elapsed(); + WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); + info!( + elapsed_ms = duration.as_millis(), + pid = proc.id(), + "launched walredo process" + ); + self.redo_process + .set(ProcessOnceCell::Spawned(Arc::clone(&proc)), permit); + proc + } + }; + + // async closures are unstable, would support &Process + let result = closure(proc.clone()).await; + + if result.is_err() { + // Avoid concurrent callers hitting the same issue by taking `proc` out of the rotation. + // Note that there may be other tasks concurrent with us that also hold `proc`. + // We have to deal with that here. + // Also read the doc comment on field `self.redo_process`. + // + // NB: there may still be other concurrent threads using `proc`. + // The last one will send SIGKILL when the underlying Arc reaches refcount 0.
+ // + // NB: the drop impl blocks the dropping thread with a wait() system call for + // the child process. In some ways the blocking is actually good: if we + // deferred the waiting into the background / to tokio if we used `tokio::process`, + // it could happen that if walredo always fails immediately, we spawn processes faster + // than we can SIGKILL & `wait` for them to exit. By doing it the way we do here, + // we limit this risk of run-away to at most $num_runtimes * $num_executor_threads. + // This probably needs revisiting at some later point. + match self.redo_process.get() { + None => (), + Some(guard) => { + match &*guard { + ProcessOnceCell::ManagerShutDown => {} + ProcessOnceCell::Spawned(guard_proc) => { + if Arc::ptr_eq(&proc, guard_proc) { + // We're the first to observe an error from `proc`, it's our job to take it out of rotation. + guard.take_and_deinit(); + } else { + // Another task already spawned another redo process (further up in this method) + // and put it into `redo_process`. Do nothing, our view of the world is behind. + } + } + } + } + } + // The last task that does this `drop()` of `proc` will do a blocking `wait()` syscall. + drop(proc); + } + + result + } + /// /// Process one request for WAL redo using wal-redo postgres /// @@ -319,130 +411,63 @@ impl PostgresRedoManager { const MAX_RETRY_ATTEMPTS: u32 = 1; let mut n_attempts = 0u32; loop { - let proc: Arc = match self.redo_process.get_or_init_detached().await { - Ok(guard) => match &*guard { - ProcessOnceCell::Spawned(proc) => Arc::clone(proc), - ProcessOnceCell::ManagerShutDown => { - return Err(Error::Cancelled); - } - }, - Err(permit) => { - let start = Instant::now(); - // acquire guard before spawning process, so that we don't spawn new processes - // if the gate is already closed. - let _launched_processes_guard = match self.launched_processes.enter() { - Ok(guard) => guard, - Err(GateError::GateClosed) => unreachable!( - "shutdown sets the once cell to `ManagerShutDown` state before closing the gate" - ), - }; - let proc = Arc::new(Process { - process: process::WalRedoProcess::launch( - self.conf, - self.tenant_shard_id, - pg_version, - ) - .context("launch walredo process")?, - _launched_processes_guard, - }); - let duration = start.elapsed(); - WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); - info!( - duration_ms = duration.as_millis(), - pid = proc.id(), - "launched walredo process" - ); - self.redo_process - .set(ProcessOnceCell::Spawned(Arc::clone(&proc)), permit); - proc - } - }; + let base_img = &base_img; + let closure = |proc: Arc| async move { + let started_at = std::time::Instant::now(); - let started_at = std::time::Instant::now(); + // Relational WAL records are applied using wal-redo-postgres + let result = proc + .apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout) + .await + .context("apply_wal_records"); - // Relational WAL records are applied using wal-redo-postgres - let result = proc - .apply_wal_records(rel, blknum, &base_img, records, wal_redo_timeout) - .await - .context("apply_wal_records"); + let duration = started_at.elapsed(); - let duration = started_at.elapsed(); - - let len = records.len(); - let nbytes = records.iter().fold(0, |acumulator, record| { - acumulator - + match &record.1 { - NeonWalRecord::Postgres { rec, .. 
} => rec.len(), - _ => unreachable!("Only PostgreSQL records are accepted in this batch"), - } - }); - - WAL_REDO_TIME.observe(duration.as_secs_f64()); - WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64); - WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64); - - debug!( - "postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}", - len, - nbytes, - duration.as_micros(), - lsn - ); - - // If something went wrong, don't try to reuse the process. Kill it, and - // next request will launch a new one. - if let Err(e) = result.as_ref() { - error!( - "error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}", - records.len(), - records.first().map(|p| p.0).unwrap_or(Lsn(0)), - records.last().map(|p| p.0).unwrap_or(Lsn(0)), - nbytes, - base_img_lsn, - lsn, - n_attempts, - e, - ); - // Avoid concurrent callers hitting the same issue by taking `proc` out of the rotation. - // Note that there may be other tasks concurrent with us that also hold `proc`. - // We have to deal with that here. - // Also read the doc comment on field `self.redo_process`. - // - // NB: there may still be other concurrent threads using `proc`. - // The last one will send SIGKILL when the underlying Arc reaches refcount 0. - // - // NB: the drop impl blocks the dropping thread with a wait() system call for - // the child process. In some ways the blocking is actually good: if we - // deferred the waiting into the background / to tokio if we used `tokio::process`, - // it could happen that if walredo always fails immediately, we spawn processes faster - // than we can SIGKILL & `wait` for them to exit. By doing it the way we do here, - // we limit this risk of run-away to at most $num_runtimes * $num_executor_threads. - // This probably needs revisiting at some later point. - match self.redo_process.get() { - None => (), - Some(guard) => { - match &*guard { - ProcessOnceCell::ManagerShutDown => {} - ProcessOnceCell::Spawned(guard_proc) => { - if Arc::ptr_eq(&proc, guard_proc) { - // We're the first to observe an error from `proc`, it's our job to take it out of rotation. - guard.take_and_deinit(); - } else { - // Another task already spawned another redo process (further up in this method) - // and put it into `redo_process`. Do nothing, our view of the world is behind. - } - } + let len = records.len(); + let nbytes = records.iter().fold(0, |acumulator, record| { + acumulator + + match &record.1 { + NeonWalRecord::Postgres { rec, .. } => rec.len(), + _ => unreachable!("Only PostgreSQL records are accepted in this batch"), } - } + }); + + WAL_REDO_TIME.observe(duration.as_secs_f64()); + WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64); + WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64); + + debug!( + "postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}", + len, + nbytes, + duration.as_micros(), + lsn + ); + + if let Err(e) = result.as_ref() { + error!( + "error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}", + records.len(), + records.first().map(|p| p.0).unwrap_or(Lsn(0)), + records.last().map(|p| p.0).unwrap_or(Lsn(0)), + nbytes, + base_img_lsn, + lsn, + n_attempts, + e, + ); } - // The last task that does this `drop()` of `proc` will do a blocking `wait()` syscall. 
- drop(proc); - } else if n_attempts != 0 { + + result.map_err(Error::Other) + }; + let result = self.do_with_walredo_process(pg_version, closure).await; + + if result.is_ok() && n_attempts != 0 { info!(n_attempts, "retried walredo succeeded"); } n_attempts += 1; if n_attempts > MAX_RETRY_ATTEMPTS || result.is_ok() { - return result.map_err(Error::Other); + return result; } } } From 135e7e4306978d79a133086abb81707d8d9fb8a9 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 18 Sep 2024 09:10:27 +0200 Subject: [PATCH 28/30] add `neon_local` subcommand for the broker & use that from regression tests (#8948) There's currently no way to just start/stop broker from `neon_local`. This PR * adds a sub-command * uses that sub-command from the test suite instead of the pre-existing Python `subprocess` based approach. Found this useful during investigation https://github.com/neondatabase/cloud/issues/16886. --- control_plane/src/bin/neon_local.rs | 40 ++++++++++ test_runner/fixtures/broker.py | 63 --------------- test_runner/fixtures/neon_fixtures.py | 76 +++++++++++++------ test_runner/regress/test_neon_cli.py | 2 + .../regress/test_pageserver_generations.py | 2 +- .../regress/test_storage_controller.py | 6 +- test_runner/regress/test_wal_acceptor.py | 9 +-- 7 files changed, 99 insertions(+), 99 deletions(-) delete mode 100644 test_runner/fixtures/broker.py diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 144cd647c9..1c94c42865 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -106,6 +106,7 @@ fn main() -> Result<()> { "stop" => rt.block_on(handle_stop_all(sub_args, &env)), "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)), "storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)), + "storage_broker" => rt.block_on(handle_storage_broker(sub_args, &env)), "safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)), "endpoint" => rt.block_on(handle_endpoint(sub_args, &env)), "mappings" => handle_mappings(sub_args, &mut env), @@ -1245,6 +1246,32 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Ok(()) } +async fn handle_storage_broker(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { + let (sub_name, sub_args) = match sub_match.subcommand() { + Some(broker_command_data) => broker_command_data, + None => bail!("no broker subcommand provided"), + }; + + match sub_name { + "start" => { + if let Err(e) = broker::start_broker_process(env, get_start_timeout(sub_args)).await { + eprintln!("broker start failed: {e}"); + exit(1); + } + } + + "stop" => { + if let Err(e) = broker::stop_broker_process(env) { + eprintln!("broker stop failed: {e}"); + exit(1); + } + } + + _ => bail!("Unexpected broker subcommand '{}'", sub_name), + } + Ok(()) +} + async fn handle_start_all( env: &local_env::LocalEnv, retry_timeout: &Duration, @@ -1672,6 +1699,19 @@ fn cli() -> Command { .arg(stop_mode_arg.clone()) .arg(instance_id)) ) + .subcommand( + Command::new("storage_broker") + .arg_required_else_help(true) + .about("Manage broker") + .subcommand(Command::new("start") + .about("Start broker") + .arg(timeout_arg.clone()) + ) + .subcommand(Command::new("stop") + .about("Stop broker") + .arg(stop_mode_arg.clone()) + ) + ) .subcommand( Command::new("safekeeper") .arg_required_else_help(true) diff --git a/test_runner/fixtures/broker.py b/test_runner/fixtures/broker.py deleted file mode 100644 index 8aca90a097..0000000000 --- 
a/test_runner/fixtures/broker.py +++ /dev/null @@ -1,63 +0,0 @@ -import subprocess -import time -from dataclasses import dataclass -from pathlib import Path -from typing import Any, Optional - -from fixtures.log_helper import log - - -@dataclass -class NeonBroker: - """An object managing storage_broker instance""" - - logfile: Path - port: int - neon_binpath: Path - handle: Optional[subprocess.Popen[Any]] = None # handle of running daemon - - def listen_addr(self): - return f"127.0.0.1:{self.port}" - - def client_url(self): - return f"http://{self.listen_addr()}" - - def check_status(self): - return True # TODO - - def try_start(self): - if self.handle is not None: - log.debug(f"storage_broker is already running on port {self.port}") - return - - listen_addr = self.listen_addr() - log.info(f'starting storage_broker to listen incoming connections at "{listen_addr}"') - with open(self.logfile, "wb") as logfile: - args = [ - str(self.neon_binpath / "storage_broker"), - f"--listen-addr={listen_addr}", - ] - self.handle = subprocess.Popen(args, stdout=logfile, stderr=logfile) - - # wait for start - started_at = time.time() - while True: - try: - self.check_status() - except Exception as e: - elapsed = time.time() - started_at - if elapsed > 5: - raise RuntimeError( - f"timed out waiting {elapsed:.0f}s for storage_broker start: {e}" - ) from e - time.sleep(0.5) - else: - break # success - - def stop(self, immediate: bool = False): - if self.handle is not None: - if immediate: - self.handle.kill() - else: - self.handle.terminate() - self.handle.wait() diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 69284bfdfc..cbbb162cc6 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -60,7 +60,6 @@ from urllib3.util.retry import Retry from fixtures import overlayfs from fixtures.auth_tokens import AuthKeys, TokenScope -from fixtures.broker import NeonBroker from fixtures.common_types import Lsn, NodeId, TenantId, TenantShardId, TimelineId from fixtures.endpoint.http import EndpointHttpClient from fixtures.log_helper import log @@ -230,21 +229,6 @@ def port_distributor(worker_base_port: int, worker_port_num: int) -> PortDistrib return PortDistributor(base_port=worker_base_port, port_number=worker_port_num) -@pytest.fixture(scope="function") -def default_broker( - port_distributor: PortDistributor, - test_output_dir: Path, - neon_binpath: Path, -) -> Iterator[NeonBroker]: - # multiple pytest sessions could get launched in parallel, get them different ports/datadirs - client_port = port_distributor.get_port() - broker_logfile = test_output_dir / "repo" / "storage_broker.log" - - broker = NeonBroker(logfile=broker_logfile, port=client_port, neon_binpath=neon_binpath) - yield broker - broker.stop() - - @pytest.fixture(scope="session") def run_id() -> Iterator[uuid.UUID]: yield uuid.uuid4() @@ -387,7 +371,6 @@ class NeonEnvBuilder: self, repo_dir: Path, port_distributor: PortDistributor, - broker: NeonBroker, run_id: uuid.UUID, mock_s3_server: MockS3Server, neon_binpath: Path, @@ -428,7 +411,6 @@ class NeonEnvBuilder: # Safekeepers remote storage self.safekeepers_remote_storage: Optional[RemoteStorage] = None - self.broker = broker self.run_id = run_id self.mock_s3_server: MockS3Server = mock_s3_server self.pageserver_config_override = pageserver_config_override @@ -940,6 +922,8 @@ class NeonEnvBuilder: self.env.storage_controller.assert_no_errors() + self.env.broker.assert_no_errors() + try: 
self.overlay_cleanup_teardown() except Exception as e: @@ -993,7 +977,7 @@ class NeonEnv: self.endpoints = EndpointFactory(self) self.safekeepers: List[Safekeeper] = [] self.pageservers: List[NeonPageserver] = [] - self.broker = config.broker + self.broker = NeonBroker(self) self.pageserver_remote_storage = config.pageserver_remote_storage self.safekeepers_remote_storage = config.safekeepers_remote_storage self.pg_version = config.pg_version @@ -1168,7 +1152,7 @@ class NeonEnv: max_workers=2 + len(self.pageservers) + len(self.safekeepers) ) as executor: futs.append( - executor.submit(lambda: self.broker.try_start() or None) + executor.submit(lambda: self.broker.start() or None) ) # The `or None` is for the linter for pageserver in self.pageservers: @@ -1225,7 +1209,7 @@ class NeonEnv: pageserver.stop(immediate=immediate) except RuntimeError: stop_later.append(pageserver) - self.broker.stop(immediate=immediate) + self.broker.stop() # TODO: for nice logging we need python 3.11 ExceptionGroup for ps in stop_later: @@ -1339,7 +1323,6 @@ def neon_simple_env( pytestconfig: Config, port_distributor: PortDistributor, mock_s3_server: MockS3Server, - default_broker: NeonBroker, run_id: uuid.UUID, top_output_dir: Path, test_output_dir: Path, @@ -1364,7 +1347,6 @@ def neon_simple_env( top_output_dir=top_output_dir, repo_dir=repo_dir, port_distributor=port_distributor, - broker=default_broker, mock_s3_server=mock_s3_server, neon_binpath=neon_binpath, pg_distrib_dir=pg_distrib_dir, @@ -1392,7 +1374,6 @@ def neon_env_builder( neon_binpath: Path, pg_distrib_dir: Path, pg_version: PgVersion, - default_broker: NeonBroker, run_id: uuid.UUID, request: FixtureRequest, test_overlay_dir: Path, @@ -1428,7 +1409,6 @@ def neon_env_builder( neon_binpath=neon_binpath, pg_distrib_dir=pg_distrib_dir, pg_version=pg_version, - broker=default_broker, run_id=run_id, preserve_database_files=cast(bool, pytestconfig.getoption("--preserve-database-files")), pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, @@ -1850,6 +1830,18 @@ class NeonCli(AbstractNeonCli): args.extend(["-m", "immediate"]) return self.raw_cli(args) + def broker_start( + self, timeout_in_seconds: Optional[int] = None + ) -> "subprocess.CompletedProcess[str]": + cmd = ["storage_broker", "start"] + if timeout_in_seconds is not None: + cmd.append(f"--start-timeout={timeout_in_seconds}s") + return self.raw_cli(cmd) + + def broker_stop(self) -> "subprocess.CompletedProcess[str]": + cmd = ["storage_broker", "stop"] + return self.raw_cli(cmd) + def endpoint_create( self, branch_name: str, @@ -4573,6 +4565,40 @@ class Safekeeper(LogUtils): wait_until(20, 0.5, paused) +class NeonBroker(LogUtils): + """An object managing storage_broker instance""" + + def __init__(self, env: NeonEnv): + super().__init__(logfile=env.repo_dir / "storage_broker.log") + self.env = env + self.port: int = self.env.port_distributor.get_port() + self.running = False + + def start( + self, + timeout_in_seconds: Optional[int] = None, + ): + assert not self.running + self.env.neon_cli.broker_start(timeout_in_seconds) + self.running = True + return self + + def stop(self): + if self.running: + self.env.neon_cli.broker_stop() + self.running = False + return self + + def listen_addr(self): + return f"127.0.0.1:{self.port}" + + def client_url(self): + return f"http://{self.listen_addr()}" + + def assert_no_errors(self): + assert_no_errors(self.logfile, "storage_controller", []) + + # TODO: Replace with `StrEnum` when we upgrade to python 3.11 class NodeKind(str, Enum): PAGESERVER 
= "pageserver" diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py index ba170cfb4c..b65430ff49 100644 --- a/test_runner/regress/test_neon_cli.py +++ b/test_runner/regress/test_neon_cli.py @@ -134,6 +134,7 @@ def test_cli_start_stop(neon_env_builder: NeonEnvBuilder): env.neon_cli.pageserver_stop(env.pageserver.id) env.neon_cli.safekeeper_stop() env.neon_cli.storage_controller_stop(False) + env.neon_cli.broker_stop() # Keep NeonEnv state up to date, it usually owns starting/stopping services env.pageserver.running = False @@ -176,6 +177,7 @@ def test_cli_start_stop_multi(neon_env_builder: NeonEnvBuilder): # Stop this to get out of the way of the following `start` env.neon_cli.storage_controller_stop(False) + env.neon_cli.broker_stop() # Default start res = env.neon_cli.raw_cli(["start"]) diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index ebf58d2bd1..c923713432 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -134,7 +134,7 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): ) env = neon_env_builder.init_configs() - env.broker.try_start() + env.broker.start() for sk in env.safekeepers: sk.start() env.storage_controller.start() diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 2d72dbb2df..dc90a6e9a0 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -69,7 +69,7 @@ def test_storage_controller_smoke( env = neon_env_builder.init_configs() # Start services by hand so that we can skip a pageserver (this will start + register later) - env.broker.try_start() + env.broker.start() env.storage_controller.start() env.pageservers[0].start() env.pageservers[1].start() @@ -292,7 +292,7 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up # Start services by hand so that we can skip registration on one of the pageservers env = neon_env_builder.init_configs() - env.broker.try_start() + env.broker.start() env.storage_controller.start() # This is the pageserver where we'll initially create the tenant. 
Run it in emergency @@ -2126,7 +2126,7 @@ def start_env(env: NeonEnv, storage_controller_port: int): max_workers=2 + len(env.pageservers) + len(env.safekeepers) ) as executor: futs.append( - executor.submit(lambda: env.broker.try_start() or None) + executor.submit(lambda: env.broker.start() or None) ) # The `or None` is for the linter for pageserver in env.pageservers: diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 50fac441c0..4bf8cfe88f 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -19,7 +19,6 @@ import psycopg2.errors import psycopg2.extras import pytest import requests -from fixtures.broker import NeonBroker from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.metrics import parse_metrics @@ -1439,11 +1438,7 @@ class SafekeeperEnv: ): self.repo_dir = repo_dir self.port_distributor = port_distributor - self.broker = NeonBroker( - logfile=Path(self.repo_dir) / "storage_broker.log", - port=self.port_distributor.get_port(), - neon_binpath=neon_binpath, - ) + self.fake_broker_endpoint = f"http://127.0.0.1:{port_distributor.get_port()}" self.pg_bin = pg_bin self.num_safekeepers = num_safekeepers self.bin_safekeeper = str(neon_binpath / "safekeeper") @@ -1492,7 +1487,7 @@ class SafekeeperEnv: "--id", str(i), "--broker-endpoint", - self.broker.client_url(), + self.fake_broker_endpoint, ] log.info(f'Running command "{" ".join(cmd)}"') From 3454ef7507000f05707418324dad392b366f8de3 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 18 Sep 2024 13:16:51 +0300 Subject: [PATCH 29/30] Refactor ImageLayerWriter to avoid passing a Timeline to finish() (#9028) Commit ca5390a89d made a similar change to DeltaLayerWriter. We bumped into this with Stas during our hackathon project, to create a standalone program that creates image layers directly from a Postgres data directory. It needs to create image layers without having a Timeline and other pageserver machinery. This downgrades the "created image layer {}" message from INFO to TRACE level. TRACE is used for the corresponding message on delta layer creation too. The path logged in the message is now the temporary path, before the file is renamed to its final name. Again commit ca5390a89d made the same change for the message on delta layer creation.
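In caller terms the change boils down to the following sketch. The wrapper function and its exact parameter types are illustrative assumptions; the finish() / Layer::finish_creating() sequence is the one used by the call sites in the diff below, with the pageserver types assumed to be in scope:

    // Hedged sketch of the new calling convention, not actual pageserver code.
    async fn finish_image_layer(
        conf: &'static PageServerConf,
        timeline: &Arc<Timeline>,
        writer: ImageLayerWriter,
        ctx: &RequestContext,
    ) -> anyhow::Result<ResidentLayer> {
        // finish() no longer needs a Timeline: it returns the layer descriptor
        // plus the temporary path of the file it wrote.
        let (desc, path) = writer.finish(ctx).await?;
        // The caller performs the rename/registration step separately.
        Ok(Layer::finish_creating(conf, timeline, desc, &path)?)
    }

The call sites changed in timeline.rs, compaction.rs and the image layer tests below all follow this two-step pattern.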
--- .../src/tenant/storage_layer/image_layer.rs | 43 ++++++++----------- .../src/tenant/storage_layer/split_writer.rs | 16 +++---- pageserver/src/tenant/timeline.rs | 9 ++-- pageserver/src/tenant/timeline/compaction.rs | 6 ++- 4 files changed, 35 insertions(+), 39 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 875e223c9c..5de2582ab7 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -38,7 +38,7 @@ use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner, }; -use crate::tenant::{PageReconstructError, Timeline}; +use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; use crate::virtual_file::{self, VirtualFile}; use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; @@ -58,7 +58,6 @@ use std::io::SeekFrom; use std::ops::Range; use std::os::unix::prelude::FileExt; use std::str::FromStr; -use std::sync::Arc; use tokio::sync::OnceCell; use tokio_stream::StreamExt; use tracing::*; @@ -70,9 +69,7 @@ use utils::{ }; use super::layer_name::ImageLayerName; -use super::{ - AsLayerDesc, Layer, LayerName, PersistentLayerDesc, ResidentLayer, ValuesReconstructState, -}; +use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState}; /// /// Header stored in the beginning of the file @@ -800,10 +797,9 @@ impl ImageLayerWriterInner { /// async fn finish( self, - timeline: &Arc<Timeline>, ctx: &RequestContext, end_key: Option<Key>, - ) -> anyhow::Result<ResidentLayer> { + ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { let index_start_blk = ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; @@ -879,12 +875,9 @@ impl ImageLayerWriterInner { // fsync the file file.sync_all().await?; - // FIXME: why not carry the virtualfile here, it supports renaming? - let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?; + trace!("created image layer {}", self.path); - info!("created image layer {}", layer.local_path()); - - Ok(layer) + Ok((desc, self.path)) } } @@ -963,24 +956,18 @@ impl ImageLayerWriter { /// pub(crate) async fn finish( mut self, - timeline: &Arc<Timeline>, ctx: &RequestContext, - ) -> anyhow::Result<ResidentLayer> { - self.inner.take().unwrap().finish(timeline, ctx, None).await + ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { + self.inner.take().unwrap().finish(ctx, None).await } /// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive.
pub(super) async fn finish_with_end_key( mut self, - timeline: &Arc, end_key: Key, ctx: &RequestContext, - ) -> anyhow::Result { - self.inner - .take() - .unwrap() - .finish(timeline, ctx, Some(end_key)) - .await + ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { + self.inner.take().unwrap().finish(ctx, Some(end_key)).await } } @@ -1084,7 +1071,7 @@ mod test { tenant::{ config::TenantConf, harness::{TenantHarness, TIMELINE_ID}, - storage_layer::ResidentLayer, + storage_layer::{Layer, ResidentLayer}, vectored_blob_io::StreamingVectoredReadPlanner, Tenant, Timeline, }, @@ -1155,7 +1142,8 @@ mod test { key = key.next(); } - writer.finish(&timeline, &ctx).await.unwrap() + let (desc, path) = writer.finish(&ctx).await.unwrap(); + Layer::finish_creating(tenant.conf, &timeline, desc, &path).unwrap() }; let original_size = resident.metadata().file_size; @@ -1217,7 +1205,9 @@ mod test { .await .unwrap(); let replacement = if wrote_keys > 0 { - Some(filtered_writer.finish(&timeline, &ctx).await.unwrap()) + let (desc, path) = filtered_writer.finish(&ctx).await.unwrap(); + let resident = Layer::finish_creating(tenant.conf, &timeline, desc, &path).unwrap(); + Some(resident) } else { None }; @@ -1290,7 +1280,8 @@ mod test { for (key, img) in images { writer.put_image(key, img, ctx).await?; } - let img_layer = writer.finish(tline, ctx).await?; + let (desc, path) = writer.finish(ctx).await?; + let img_layer = Layer::finish_creating(tenant.conf, tline, desc, &path)?; Ok::<_, anyhow::Error>(img_layer) } diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs index 40a6a77a50..b499a0eef4 100644 --- a/pageserver/src/tenant/storage_layer/split_writer.rs +++ b/pageserver/src/tenant/storage_layer/split_writer.rs @@ -121,11 +121,11 @@ impl SplitImageLayerWriter { self.generated_layers .push(SplitWriterResult::Discarded(layer_key)); } else { - self.generated_layers.push(SplitWriterResult::Produced( - prev_image_writer - .finish_with_end_key(tline, key, ctx) - .await?, - )); + let (desc, path) = prev_image_writer.finish_with_end_key(key, ctx).await?; + + let layer = Layer::finish_creating(self.conf, tline, desc, &path)?; + self.generated_layers + .push(SplitWriterResult::Produced(layer)); } } self.inner.put_image(key, img, ctx).await @@ -170,9 +170,9 @@ impl SplitImageLayerWriter { if discard(&layer_key).await { generated_layers.push(SplitWriterResult::Discarded(layer_key)); } else { - generated_layers.push(SplitWriterResult::Produced( - inner.finish_with_end_key(tline, end_key, ctx).await?, - )); + let (desc, path) = inner.finish_with_end_key(end_key, ctx).await?; + let layer = Layer::finish_creating(self.conf, tline, desc, &path)?; + generated_layers.push(SplitWriterResult::Produced(layer)); } Ok(generated_layers) } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 262dccac7d..f66491d962 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4013,7 +4013,8 @@ impl Timeline { if wrote_keys { // Normal path: we have written some data into the new image layer for this // partition, so flush it to disk. 
- let image_layer = image_layer_writer.finish(self, ctx).await?; + let (desc, path) = image_layer_writer.finish(ctx).await?; + let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?; Ok(ImageLayerCreationOutcome { image: Some(image_layer), next_start_key: img_range.end, @@ -4101,7 +4102,8 @@ impl Timeline { if wrote_any_image { // Normal path: we have written some data into the new image layer for this // partition, so flush it to disk. - let image_layer = image_layer_writer.finish(self, ctx).await?; + let (desc, path) = image_layer_writer.finish(ctx).await?; + let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?; Ok(ImageLayerCreationOutcome { image: Some(image_layer), next_start_key: img_range.end, @@ -5403,7 +5405,8 @@ impl Timeline { for (key, img) in images { image_layer_writer.put_image(key, img, ctx).await?; } - let image_layer = image_layer_writer.finish(self, ctx).await?; + let (desc, path) = image_layer_writer.finish(ctx).await?; + let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?; { let mut guard = self.layers.write().await; diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 0b5c520ba7..d1f06e3480 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -563,10 +563,12 @@ impl Timeline { .await?; if keys_written > 0 { - let new_layer = image_layer_writer - .finish(self, ctx) + let (desc, path) = image_layer_writer + .finish(ctx) .await .map_err(CompactionError::Other)?; + let new_layer = Layer::finish_creating(self.conf, self, desc, &path) + .map_err(CompactionError::Other)?; tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes", layer.metadata().file_size, new_layer.metadata().file_size); From c5cd8577ff6d96e8153dd22af17373c4351e52e4 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Wed, 18 Sep 2024 13:58:51 +0200 Subject: [PATCH 30/30] proxy: make sql-over-http max request/response sizes configurable (#9029) --- proxy/src/bin/local_proxy.rs | 8 ++ proxy/src/bin/proxy.rs | 8 ++ proxy/src/config.rs | 2 + proxy/src/serverless/conn_pool.rs | 2 + proxy/src/serverless/sql_over_http.rs | 114 +++++++++++++++----------- 5 files changed, 85 insertions(+), 49 deletions(-) diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index 6eba71df1b..94365ddf05 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -92,6 +92,12 @@ struct SqlOverHttpArgs { #[clap(long, default_value_t = 16)] sql_over_http_cancel_set_shards: usize, + + #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB + sql_over_http_max_request_size_bytes: u64, + + #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB + sql_over_http_max_response_size_bytes: usize, } #[tokio::main] @@ -208,6 +214,8 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig }, cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards), client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold, + max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes, + max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes, }; Ok(Box::leak(Box::new(ProxyConfig { diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index ca9aeb04d8..e5c5b47795 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -268,6 +268,12 @@ struct SqlOverHttpArgs { #[clap(long, default_value_t = 64)] 
sql_over_http_cancel_set_shards: usize, + + #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB + sql_over_http_max_request_size_bytes: u64, + + #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB + sql_over_http_max_response_size_bytes: usize, } #[tokio::main] @@ -679,6 +685,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { }, cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards), client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold, + max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes, + max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes, }; let authentication_config = AuthenticationConfig { thread_pool, diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 1cda6d200c..373e4cf650 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -56,6 +56,8 @@ pub struct HttpConfig { pub pool_options: GlobalConnPoolOptions, pub cancel_set: CancelSet, pub client_conn_threshold: u64, + pub max_request_size_bytes: u64, + pub max_response_size_bytes: usize, } pub struct AuthenticationConfig { diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index bea599e9b9..6c32d5df0e 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -776,6 +776,8 @@ mod tests { }, cancel_set: CancelSet::new(0), client_conn_threshold: u64::MAX, + max_request_size_bytes: u64::MAX, + max_response_size_bytes: usize::MAX, })); let pool = GlobalConnPool::new(config); let conn_info = ConnInfo { diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 2188edc8c5..06e540d149 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -87,9 +87,6 @@ enum Payload { Batch(BatchQueryData), } -const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MiB -const MAX_REQUEST_SIZE: u64 = 10 * 1024 * 1024; // 10 MiB - static CONN_STRING: HeaderName = HeaderName::from_static("neon-connection-string"); static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output"); static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode"); @@ -366,10 +363,10 @@ pub(crate) enum SqlOverHttpError { ConnectCompute(#[from] HttpConnError), #[error("{0}")] ConnInfo(#[from] ConnInfoError), - #[error("request is too large (max is {MAX_REQUEST_SIZE} bytes)")] - RequestTooLarge, - #[error("response is too large (max is {MAX_RESPONSE_SIZE} bytes)")] - ResponseTooLarge, + #[error("request is too large (max is {0} bytes)")] + RequestTooLarge(u64), + #[error("response is too large (max is {0} bytes)")] + ResponseTooLarge(usize), #[error("invalid isolation level")] InvalidIsolationLevel, #[error("{0}")] @@ -386,8 +383,8 @@ impl ReportableError for SqlOverHttpError { SqlOverHttpError::ReadPayload(e) => e.get_error_kind(), SqlOverHttpError::ConnectCompute(e) => e.get_error_kind(), SqlOverHttpError::ConnInfo(e) => e.get_error_kind(), - SqlOverHttpError::RequestTooLarge => ErrorKind::User, - SqlOverHttpError::ResponseTooLarge => ErrorKind::User, + SqlOverHttpError::RequestTooLarge(_) => ErrorKind::User, + SqlOverHttpError::ResponseTooLarge(_) => ErrorKind::User, SqlOverHttpError::InvalidIsolationLevel => ErrorKind::User, SqlOverHttpError::Postgres(p) => p.get_error_kind(), SqlOverHttpError::JsonConversion(_) => ErrorKind::Postgres, @@ -402,8 +399,8 @@ impl UserFacingError for SqlOverHttpError { SqlOverHttpError::ReadPayload(p) => 
p.to_string(), SqlOverHttpError::ConnectCompute(c) => c.to_string_client(), SqlOverHttpError::ConnInfo(c) => c.to_string_client(), - SqlOverHttpError::RequestTooLarge => self.to_string(), - SqlOverHttpError::ResponseTooLarge => self.to_string(), + SqlOverHttpError::RequestTooLarge(_) => self.to_string(), + SqlOverHttpError::ResponseTooLarge(_) => self.to_string(), SqlOverHttpError::InvalidIsolationLevel => self.to_string(), SqlOverHttpError::Postgres(p) => p.to_string(), SqlOverHttpError::JsonConversion(_) => "could not parse postgres response".to_string(), @@ -537,7 +534,7 @@ async fn handle_inner( let request_content_length = match request.body().size_hint().upper() { Some(v) => v, - None => MAX_REQUEST_SIZE + 1, + None => config.http_config.max_request_size_bytes + 1, }; info!(request_content_length, "request size in bytes"); Metrics::get() @@ -547,8 +544,10 @@ async fn handle_inner( // we don't have a streaming request support yet so this is to prevent OOM // from a malicious user sending an extremely large request body - if request_content_length > MAX_REQUEST_SIZE { - return Err(SqlOverHttpError::RequestTooLarge); + if request_content_length > config.http_config.max_request_size_bytes { + return Err(SqlOverHttpError::RequestTooLarge( + config.http_config.max_request_size_bytes, + )); } let fetch_and_process_request = Box::pin( @@ -612,7 +611,10 @@ async fn handle_inner( // Now execute the query and return the result. let json_output = match payload { - Payload::Single(stmt) => stmt.process(cancel, &mut client, parsed_headers).await?, + Payload::Single(stmt) => { + stmt.process(config, cancel, &mut client, parsed_headers) + .await? + } Payload::Batch(statements) => { if parsed_headers.txn_read_only { response = response.header(TXN_READ_ONLY.clone(), &HEADER_VALUE_TRUE); @@ -628,7 +630,7 @@ async fn handle_inner( } statements - .process(cancel, &mut client, parsed_headers) + .process(config, cancel, &mut client, parsed_headers) .await? } }; @@ -656,6 +658,7 @@ async fn handle_inner( impl QueryData { async fn process( self, + config: &'static ProxyConfig, cancel: CancellationToken, client: &mut Client, parsed_headers: HttpHeaders, @@ -664,7 +667,7 @@ impl QueryData { let cancel_token = inner.cancel_token(); let res = match select( - pin!(query_to_json(&*inner, self, &mut 0, parsed_headers)), + pin!(query_to_json(config, &*inner, self, &mut 0, parsed_headers)), pin!(cancel.cancelled()), ) .await @@ -727,6 +730,7 @@ impl QueryData { impl BatchQueryData { async fn process( self, + config: &'static ProxyConfig, cancel: CancellationToken, client: &mut Client, parsed_headers: HttpHeaders, @@ -751,44 +755,52 @@ impl BatchQueryData { discard.discard(); })?; - let json_output = - match query_batch(cancel.child_token(), &transaction, self, parsed_headers).await { - Ok(json_output) => { - info!("commit"); - let status = transaction.commit().await.inspect_err(|_| { - // if we cannot commit - for now don't return connection to pool - // TODO: get a query status from the error - discard.discard(); - })?; - discard.check_idle(status); - json_output - } - Err(SqlOverHttpError::Cancelled(_)) => { - if let Err(err) = cancel_token.cancel_query(NoTls).await { - tracing::error!(?err, "could not cancel query"); - } - // TODO: after cancelling, wait to see if we can get a status. maybe the connection is still safe. 
+ let json_output = match query_batch( + config, + cancel.child_token(), + &transaction, + self, + parsed_headers, + ) + .await + { + Ok(json_output) => { + info!("commit"); + let status = transaction.commit().await.inspect_err(|_| { + // if we cannot commit - for now don't return connection to pool + // TODO: get a query status from the error discard.discard(); + })?; + discard.check_idle(status); + json_output + } + Err(SqlOverHttpError::Cancelled(_)) => { + if let Err(err) = cancel_token.cancel_query(NoTls).await { + tracing::error!(?err, "could not cancel query"); + } + // TODO: after cancelling, wait to see if we can get a status. maybe the connection is still safe. + discard.discard(); - return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)); - } - Err(err) => { - info!("rollback"); - let status = transaction.rollback().await.inspect_err(|_| { - // if we cannot rollback - for now don't return connection to pool - // TODO: get a query status from the error - discard.discard(); - })?; - discard.check_idle(status); - return Err(err); - } - }; + return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)); + } + Err(err) => { + info!("rollback"); + let status = transaction.rollback().await.inspect_err(|_| { + // if we cannot rollback - for now don't return connection to pool + // TODO: get a query status from the error + discard.discard(); + })?; + discard.check_idle(status); + return Err(err); + } + }; Ok(json_output) } } async fn query_batch( + config: &'static ProxyConfig, cancel: CancellationToken, transaction: &Transaction<'_>, queries: BatchQueryData, @@ -798,6 +810,7 @@ async fn query_batch( let mut current_size = 0; for stmt in queries.queries { let query = pin!(query_to_json( + config, transaction, stmt, &mut current_size, @@ -826,6 +839,7 @@ async fn query_batch( } async fn query_to_json( + config: &'static ProxyConfig, client: &T, data: QueryData, current_size: &mut usize, @@ -846,8 +860,10 @@ async fn query_to_json( rows.push(row); // we don't have a streaming response support yet so this is to prevent OOM // from a malicious query (eg a cross join) - if *current_size > MAX_RESPONSE_SIZE { - return Err(SqlOverHttpError::ResponseTooLarge); + if *current_size > config.http_config.max_response_size_bytes { + return Err(SqlOverHttpError::ResponseTooLarge( + config.http_config.max_response_size_bytes, + )); } }