diff --git a/Cargo.lock b/Cargo.lock index 9fc233e5ec..588a63b6a3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4237,6 +4237,7 @@ version = "0.1.0" dependencies = [ "anyhow", "async-trait", + "bytes", "camino", "clap", "futures", @@ -4469,7 +4470,6 @@ dependencies = [ "pageserver_api", "postgres_ffi", "prost 0.13.5", - "smallvec", "thiserror 1.0.69", "tonic 0.13.1", "tonic-build", diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 2afdde0cfa..248f52088b 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -297,6 +297,7 @@ RUN ./autogen.sh && \ ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ + make staged-install && \ cd extensions/postgis && \ make clean && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -602,7 +603,7 @@ RUN case "${PG_VERSION:?}" in \ ;; \ esac && \ wget https://github.com/knizhnik/online_advisor/archive/refs/tags/1.0.tar.gz -O online_advisor.tar.gz && \ - echo "059b7d9e5a90013a58bdd22e9505b88406ce05790675eb2d8434e5b215652d54 online_advisor.tar.gz" | sha256sum --check && \ + echo "37dcadf8f7cc8d6cc1f8831276ee245b44f1b0274f09e511e47a67738ba9ed0f online_advisor.tar.gz" | sha256sum --check && \ mkdir online_advisor-src && cd online_advisor-src && tar xzf ../online_advisor.tar.gz --strip-components=1 -C . FROM pg-build AS online_advisor-build @@ -1842,10 +1843,25 @@ RUN make PG_VERSION="${PG_VERSION:?}" -C compute FROM pg-build AS extension-tests ARG PG_VERSION +# This is required for the PostGIS test +RUN apt-get update && case $DEBIAN_VERSION in \ + bullseye) \ + apt-get install -y libproj19 libgdal28 time; \ + ;; \ + bookworm) \ + apt-get install -y libgdal32 libproj25 time; \ + ;; \ + *) \ + echo "Unknown Debian version ${DEBIAN_VERSION}" && exit 1 \ + ;; \ + esac + COPY docker-compose/ext-src/ /ext-src/ COPY --from=pg-build /postgres /postgres -#COPY --from=postgis-src /ext-src/ /ext-src/ +COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=postgis-build /ext-src/postgis-src /ext-src/postgis-src +COPY --from=postgis-build /sfcgal/* /usr COPY --from=plv8-src /ext-src/ /ext-src/ COPY --from=h3-pg-src /ext-src/h3-pg-src /ext-src/h3-pg-src COPY --from=postgresql-unit-src /ext-src/ /ext-src/ @@ -1886,6 +1902,7 @@ COPY compute/patches/pg_repack.patch /ext-src RUN cd /ext-src/pg_repack-src && patch -p1 /etc/ld.so.conf.d/00-neon.conf && /sbin/ldconfig RUN apt-get update && apt-get install -y libtap-parser-sourcehandler-pgtap-perl jq \ && apt clean && rm -rf /ext-src/*.tar.gz /ext-src/*.patch /var/lib/apt/lists/* ENV PATH=/usr/local/pgsql/bin:$PATH diff --git a/compute/manifest.yaml b/compute/manifest.yaml new file mode 100644 index 0000000000..f1cd20c497 --- /dev/null +++ b/compute/manifest.yaml @@ -0,0 +1,121 @@ +pg_settings: + # Common settings for primaries and replicas of all versions. + common: + # Check for client disconnection every 1 minute. By default, Postgres will detect the + # loss of the connection only at the next interaction with the socket, when it waits + # for, receives or sends data, so it will likely waste resources till the end of the + # query execution. There should be no drawbacks in setting this for everyone, so enable + # it by default. If anyone will complain, we can allow editing it. + # https://www.postgresql.org/docs/16/runtime-config-connection.html#GUC-CLIENT-CONNECTION-CHECK-INTERVAL + client_connection_check_interval: "60000" # 1 minute + # ---- IO ---- + effective_io_concurrency: "20" + maintenance_io_concurrency: "100" + fsync: "off" + hot_standby: "off" + # We allow users to change this if needed, but by default we + # just don't want to see long-lasting idle transactions, as they + # prevent activity monitor from suspending projects. + idle_in_transaction_session_timeout: "300000" # 5 minutes + listen_addresses: "*" + # --- LOGGING ---- helps investigations + log_connections: "on" + log_disconnections: "on" + # 1GB, unit is KB + log_temp_files: "1048576" + # Disable dumping customer data to logs, both to increase data privacy + # and to reduce the amount the logs. + log_error_verbosity: "terse" + log_min_error_statement: "panic" + max_connections: "100" + # --- WAL --- + # - flush lag is the max amount of WAL that has been generated but not yet stored + # to disk in the page server. A smaller value means less delay after a pageserver + # restart, but if you set it too small you might again need to slow down writes if the + # pageserver cannot flush incoming WAL to disk fast enough. This must be larger + # than the pageserver's checkpoint interval, currently 1 GB! Otherwise you get a + # a deadlock where the compute node refuses to generate more WAL before the + # old WAL has been uploaded to S3, but the pageserver is waiting for more WAL + # to be generated before it is uploaded to S3. + max_replication_flush_lag: "10GB" + max_replication_slots: "10" + # Backpressure configuration: + # - write lag is the max amount of WAL that has been generated by Postgres but not yet + # processed by the page server. Making this smaller reduces the worst case latency + # of a GetPage request, if you request a page that was recently modified. On the other + # hand, if this is too small, the compute node might need to wait on a write if there is a + # hiccup in the network or page server so that the page server has temporarily fallen + # behind. + # + # Previously it was set to 500 MB, but it caused compute being unresponsive under load + # https://github.com/neondatabase/neon/issues/2028 + max_replication_write_lag: "500MB" + max_wal_senders: "10" + # A Postgres checkpoint is cheap in storage, as doesn't involve any significant amount + # of real I/O. Only the SLRU buffers and some other small files are flushed to disk. + # However, as long as we have full_page_writes=on, page updates after a checkpoint + # include full-page images which bloats the WAL. So may want to bump max_wal_size to + # reduce the WAL bloating, but at the same it will increase pg_wal directory size on + # compute and can lead to out of disk error on k8s nodes. + max_wal_size: "1024" + wal_keep_size: "0" + wal_level: "replica" + # Reduce amount of WAL generated by default. + wal_log_hints: "off" + # - without wal_sender_timeout set we don't get feedback messages, + # required for backpressure. + wal_sender_timeout: "10000" + # We have some experimental extensions, which we don't want users to install unconsciously. + # To install them, users would need to set the `neon.allow_unstable_extensions` setting. + # There are two of them currently: + # - `pgrag` - https://github.com/neondatabase-labs/pgrag - extension is actually called just `rag`, + # and two dependencies: + # - `rag_bge_small_en_v15` + # - `rag_jina_reranker_v1_tiny_en` + # - `pg_mooncake` - https://github.com/Mooncake-Labs/pg_mooncake/ + neon.unstable_extensions: "rag,rag_bge_small_en_v15,rag_jina_reranker_v1_tiny_en,pg_mooncake,anon" + neon.protocol_version: "3" + password_encryption: "scram-sha-256" + # This is important to prevent Postgres from trying to perform + # a local WAL redo after backend crash. It should exit and let + # the systemd or k8s to do a fresh startup with compute_ctl. + restart_after_crash: "off" + # By default 3. We have the following persistent connections in the VM: + # * compute_activity_monitor (from compute_ctl) + # * postgres-exporter (metrics collector; it has 2 connections) + # * sql_exporter (metrics collector; we have 2 instances [1 for us & users; 1 for autoscaling]) + # * vm-monitor (to query & change file cache size) + # i.e. total of 6. Let's reserve 7, so there's still at least one left over. + superuser_reserved_connections: "7" + synchronous_standby_names: "walproposer" + + replica: + hot_standby: "on" + + per_version: + 17: + common: + # PostgreSQL 17 has a new IO system called "read stream", which can combine IOs up to some + # size. It still has some issues with readahead, though, so we default to disabled/ + # "no combining of IOs" to make sure we get the maximum prefetch depth. + # See also: https://github.com/neondatabase/neon/pull/9860 + io_combine_limit: "1" + replica: + # prefetching of blocks referenced in WAL doesn't make sense for us + # Neon hot standby ignores pages that are not in the shared_buffers + recovery_prefetch: "off" + 16: + common: + replica: + # prefetching of blocks referenced in WAL doesn't make sense for us + # Neon hot standby ignores pages that are not in the shared_buffers + recovery_prefetch: "off" + 15: + common: + replica: + # prefetching of blocks referenced in WAL doesn't make sense for us + # Neon hot standby ignores pages that are not in the shared_buffers + recovery_prefetch: "off" + 14: + common: + replica: diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index db6835da61..8b502a058e 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -40,7 +40,7 @@ use std::sync::mpsc; use std::thread; use std::time::Duration; -use anyhow::{Context, Result}; +use anyhow::{Context, Result, bail}; use clap::Parser; use compute_api::responses::ComputeConfig; use compute_tools::compute::{ @@ -57,14 +57,14 @@ use tracing::{error, info}; use url::Url; use utils::failpoint_support; -#[derive(Parser)] +#[derive(Debug, Parser)] #[command(rename_all = "kebab-case")] struct Cli { #[arg(short = 'b', long, default_value = "postgres", env = "POSTGRES_PATH")] pub pgbin: String, /// The base URL for the remote extension storage proxy gateway. - #[arg(short = 'r', long)] + #[arg(short = 'r', long, value_parser = Self::parse_remote_ext_base_url)] pub remote_ext_base_url: Option, /// The port to bind the external listening HTTP server to. Clients running @@ -126,6 +126,25 @@ struct Cli { pub installed_extensions_collection_interval: u64, } +impl Cli { + /// Parse a URL from an argument. By default, this isn't necessary, but we + /// want to do some sanity checking. + fn parse_remote_ext_base_url(value: &str) -> Result { + // Remove extra trailing slashes, and add one. We use Url::join() later + // when downloading remote extensions. If the base URL is something like + // http://example.com/pg-ext-s3-gateway, and join() is called with + // something like "xyz", the resulting URL is http://example.com/xyz. + let value = value.trim_end_matches('/').to_owned() + "/"; + let url = Url::parse(&value)?; + + if url.query_pairs().count() != 0 { + bail!("parameters detected in remote extensions base URL") + } + + Ok(url) + } +} + fn main() -> Result<()> { let cli = Cli::parse(); @@ -252,7 +271,8 @@ fn handle_exit_signal(sig: i32) { #[cfg(test)] mod test { - use clap::CommandFactory; + use clap::{CommandFactory, Parser}; + use url::Url; use super::Cli; @@ -260,4 +280,43 @@ mod test { fn verify_cli() { Cli::command().debug_assert() } + + #[test] + fn verify_remote_ext_base_url() { + let cli = Cli::parse_from([ + "compute_ctl", + "--pgdata=test", + "--connstr=test", + "--compute-id=test", + "--remote-ext-base-url", + "https://example.com/subpath", + ]); + assert_eq!( + cli.remote_ext_base_url.unwrap(), + Url::parse("https://example.com/subpath/").unwrap() + ); + + let cli = Cli::parse_from([ + "compute_ctl", + "--pgdata=test", + "--connstr=test", + "--compute-id=test", + "--remote-ext-base-url", + "https://example.com//", + ]); + assert_eq!( + cli.remote_ext_base_url.unwrap(), + Url::parse("https://example.com").unwrap() + ); + + Cli::try_parse_from([ + "compute_ctl", + "--pgdata=test", + "--connstr=test", + "--compute-id=test", + "--remote-ext-base-url", + "https://example.com?hello=world", + ]) + .expect_err("URL parameters are not allowed"); + } } diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index d678b7d670..bd6ed910be 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -3,7 +3,7 @@ use chrono::{DateTime, Utc}; use compute_api::privilege::Privilege; use compute_api::responses::{ ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus, LfcOffloadState, - LfcPrewarmState, + LfcPrewarmState, TlsConfig, }; use compute_api::spec::{ ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent, @@ -396,7 +396,7 @@ impl ComputeNode { // because QEMU will already have its memory allocated from the host, and // the necessary binaries will already be cached. if cli_spec.is_none() { - this.prewarm_postgres()?; + this.prewarm_postgres_vm_memory()?; } // Set the up metric with Empty status before starting the HTTP server. @@ -603,6 +603,8 @@ impl ComputeNode { }); } + let tls_config = self.tls_config(&pspec.spec); + // If there are any remote extensions in shared_preload_libraries, start downloading them if pspec.spec.remote_extensions.is_some() { let (this, spec) = (self.clone(), pspec.spec.clone()); @@ -659,7 +661,7 @@ impl ComputeNode { info!("tuning pgbouncer"); let pgbouncer_settings = pgbouncer_settings.clone(); - let tls_config = self.compute_ctl_config.tls.clone(); + let tls_config = tls_config.clone(); // Spawn a background task to do the tuning, // so that we don't block the main thread that starts Postgres. @@ -678,7 +680,10 @@ impl ComputeNode { // Spawn a background task to do the configuration, // so that we don't block the main thread that starts Postgres. - let local_proxy = local_proxy.clone(); + + let mut local_proxy = local_proxy.clone(); + local_proxy.tls = tls_config.clone(); + let _handle = tokio::spawn(async move { if let Err(err) = local_proxy::configure(&local_proxy) { error!("error while configuring local_proxy: {err:?}"); @@ -779,7 +784,7 @@ impl ComputeNode { // Spawn the extension stats background task self.spawn_extension_stats_task(); - if pspec.spec.prewarm_lfc_on_startup { + if pspec.spec.autoprewarm { self.prewarm_lfc(); } Ok(()) @@ -1205,13 +1210,15 @@ impl ComputeNode { let spec = &pspec.spec; let pgdata_path = Path::new(&self.params.pgdata); + let tls_config = self.tls_config(&pspec.spec); + // Remove/create an empty pgdata directory and put configuration there. self.create_pgdata()?; config::write_postgres_conf( pgdata_path, &pspec.spec, self.params.internal_http_port, - &self.compute_ctl_config.tls, + tls_config, )?; // Syncing safekeepers is only safe with primary nodes: if a primary @@ -1307,8 +1314,8 @@ impl ComputeNode { } /// Start and stop a postgres process to warm up the VM for startup. - pub fn prewarm_postgres(&self) -> Result<()> { - info!("prewarming"); + pub fn prewarm_postgres_vm_memory(&self) -> Result<()> { + info!("prewarming VM memory"); // Create pgdata let pgdata = &format!("{}.warmup", self.params.pgdata); @@ -1350,7 +1357,7 @@ impl ComputeNode { kill(pm_pid, Signal::SIGQUIT)?; info!("sent SIGQUIT signal"); pg.wait()?; - info!("done prewarming"); + info!("done prewarming vm memory"); // clean up let _ok = fs::remove_dir_all(pgdata); @@ -1536,14 +1543,22 @@ impl ComputeNode { .clone(), ); + let mut tls_config = None::; + if spec.features.contains(&ComputeFeature::TlsExperimental) { + tls_config = self.compute_ctl_config.tls.clone(); + } + let max_concurrent_connections = self.max_service_connections(compute_state, &spec); // Merge-apply spec & changes to PostgreSQL state. self.apply_spec_sql(spec.clone(), conf.clone(), max_concurrent_connections)?; if let Some(local_proxy) = &spec.clone().local_proxy_config { + let mut local_proxy = local_proxy.clone(); + local_proxy.tls = tls_config.clone(); + info!("configuring local_proxy"); - local_proxy::configure(local_proxy).context("apply_config local_proxy")?; + local_proxy::configure(&local_proxy).context("apply_config local_proxy")?; } // Run migrations separately to not hold up cold starts @@ -1595,11 +1610,13 @@ impl ComputeNode { pub fn reconfigure(&self) -> Result<()> { let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec; + let tls_config = self.tls_config(&spec); + if let Some(ref pgbouncer_settings) = spec.pgbouncer_settings { info!("tuning pgbouncer"); let pgbouncer_settings = pgbouncer_settings.clone(); - let tls_config = self.compute_ctl_config.tls.clone(); + let tls_config = tls_config.clone(); // Spawn a background task to do the tuning, // so that we don't block the main thread that starts Postgres. @@ -1617,7 +1634,7 @@ impl ComputeNode { // Spawn a background task to do the configuration, // so that we don't block the main thread that starts Postgres. let mut local_proxy = local_proxy.clone(); - local_proxy.tls = self.compute_ctl_config.tls.clone(); + local_proxy.tls = tls_config.clone(); tokio::spawn(async move { if let Err(err) = local_proxy::configure(&local_proxy) { error!("error while configuring local_proxy: {err:?}"); @@ -1635,7 +1652,7 @@ impl ComputeNode { pgdata_path, &spec, self.params.internal_http_port, - &self.compute_ctl_config.tls, + tls_config, )?; if !spec.skip_pg_catalog_updates { @@ -1755,6 +1772,14 @@ impl ComputeNode { } } + pub fn tls_config(&self, spec: &ComputeSpec) -> &Option { + if spec.features.contains(&ComputeFeature::TlsExperimental) { + &self.compute_ctl_config.tls + } else { + &None:: + } + } + /// Update the `last_active` in the shared state, but ensure that it's a more recent one. pub fn update_last_active(&self, last_active: Option>) { let mut state = self.state.lock().unwrap(); diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index 1857afa08c..3764bc1525 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -166,7 +166,7 @@ pub async fn download_extension( // TODO add retry logic let download_buffer = - match download_extension_tar(remote_ext_base_url.as_str(), &ext_path.to_string()).await { + match download_extension_tar(remote_ext_base_url, &ext_path.to_string()).await { Ok(buffer) => buffer, Err(error_message) => { return Err(anyhow::anyhow!( @@ -271,10 +271,14 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) { } // Do request to extension storage proxy, e.g., -// curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst +// curl http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local/latest/v15/extensions/anon.tar.zst // using HTTP GET and return the response body as bytes. -async fn download_extension_tar(remote_ext_base_url: &str, ext_path: &str) -> Result { - let uri = format!("{}/{}", remote_ext_base_url, ext_path); +async fn download_extension_tar(remote_ext_base_url: &Url, ext_path: &str) -> Result { + let uri = remote_ext_base_url.join(ext_path).with_context(|| { + format!( + "failed to create the remote extension URI for {ext_path} using {remote_ext_base_url}" + ) + })?; let filename = Path::new(ext_path) .file_name() .unwrap_or_else(|| std::ffi::OsStr::new("unknown")) @@ -284,7 +288,7 @@ async fn download_extension_tar(remote_ext_base_url: &str, ext_path: &str) -> Re info!("Downloading extension file '{}' from uri {}", filename, uri); - match do_extension_server_request(&uri).await { + match do_extension_server_request(uri).await { Ok(resp) => { info!("Successfully downloaded remote extension data {}", ext_path); REMOTE_EXT_REQUESTS_TOTAL @@ -303,7 +307,7 @@ async fn download_extension_tar(remote_ext_base_url: &str, ext_path: &str) -> Re // Do a single remote extensions server request. // Return result or (error message + stringified status code) in case of any failures. -async fn do_extension_server_request(uri: &str) -> Result { +async fn do_extension_server_request(uri: Url) -> Result { let resp = reqwest::get(uri).await.map_err(|e| { ( format!( diff --git a/compute_tools/src/http/mod.rs b/compute_tools/src/http/mod.rs index 9ecc1b0093..9b01def966 100644 --- a/compute_tools/src/http/mod.rs +++ b/compute_tools/src/http/mod.rs @@ -48,11 +48,9 @@ impl JsonResponse { /// Create an error response related to the compute being in an invalid state pub(self) fn invalid_status(status: ComputeStatus) -> Response { - Self::create_response( + Self::error( StatusCode::PRECONDITION_FAILED, - &GenericAPIError { - error: format!("invalid compute status: {status}"), - }, + format!("invalid compute status: {status}"), ) } } diff --git a/compute_tools/src/http/routes/configure.rs b/compute_tools/src/http/routes/configure.rs index f7a19da611..c29e3a97da 100644 --- a/compute_tools/src/http/routes/configure.rs +++ b/compute_tools/src/http/routes/configure.rs @@ -22,7 +22,7 @@ pub(in crate::http) async fn configure( State(compute): State>, request: Json, ) -> Response { - let pspec = match ParsedSpec::try_from(request.spec.clone()) { + let pspec = match ParsedSpec::try_from(request.0.spec) { Ok(p) => p, Err(e) => return JsonResponse::error(StatusCode::BAD_REQUEST, e), }; diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index 04b6ed2256..2b865c75d0 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -30,7 +30,7 @@ mod pg_helpers_tests { r#"fsync = off wal_level = logical hot_standby = on -prewarm_lfc_on_startup = off +autoprewarm = off neon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501' wal_log_hints = on log_connections = on diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 708745446d..774a0053f8 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -747,7 +747,7 @@ impl Endpoint { logs_export_host: None::, endpoint_storage_addr: Some(endpoint_storage_addr), endpoint_storage_token: Some(endpoint_storage_token), - prewarm_lfc_on_startup: false, + autoprewarm: false, }; // this strange code is needed to support respec() in tests diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 5ac3f85acd..db14d98afd 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -515,11 +515,6 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'timeline_offloading' as bool")?, - wal_receiver_protocol_override: settings - .remove("wal_receiver_protocol_override") - .map(serde_json::from_str) - .transpose() - .context("parse `wal_receiver_protocol_override` from json")?, rel_size_v2_enabled: settings .remove("rel_size_v2_enabled") .map(|x| x.parse::()) diff --git a/docker-compose/compute_wrapper/Dockerfile b/docker-compose/compute_wrapper/Dockerfile index 9ef831a9cd..b89e69c650 100644 --- a/docker-compose/compute_wrapper/Dockerfile +++ b/docker-compose/compute_wrapper/Dockerfile @@ -13,6 +13,6 @@ RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ jq \ netcat-openbsd #This is required for the pg_hintplan test -RUN mkdir -p /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw && chown postgres /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw +RUN mkdir -p /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw /ext-src/postgis-src/ && chown postgres /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw /ext-src/postgis-src USER postgres diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh index ab8d74d355..c8ca812bf9 100755 --- a/docker-compose/compute_wrapper/shell/compute.sh +++ b/docker-compose/compute_wrapper/shell/compute.sh @@ -1,18 +1,18 @@ -#!/bin/bash +#!/usr/bin/env bash set -eux # Generate a random tenant or timeline ID # # Takes a variable name as argument. The result is stored in that variable. generate_id() { - local -n resvar=$1 - printf -v resvar '%08x%08x%08x%08x' $SRANDOM $SRANDOM $SRANDOM $SRANDOM + local -n resvar=${1} + printf -v resvar '%08x%08x%08x%08x' ${SRANDOM} ${SRANDOM} ${SRANDOM} ${SRANDOM} } PG_VERSION=${PG_VERSION:-14} -CONFIG_FILE_ORG=/var/db/postgres/configs/config.json -CONFIG_FILE=/tmp/config.json +readonly CONFIG_FILE_ORG=/var/db/postgres/configs/config.json +readonly CONFIG_FILE=/tmp/config.json # Test that the first library path that the dynamic loader looks in is the path # that we use for custom compiled software @@ -20,17 +20,17 @@ first_path="$(ldconfig --verbose 2>/dev/null \ | grep --invert-match ^$'\t' \ | cut --delimiter=: --fields=1 \ | head --lines=1)" -test "$first_path" == '/usr/local/lib' +test "${first_path}" = '/usr/local/lib' echo "Waiting pageserver become ready." while ! nc -z pageserver 6400; do - sleep 1; + sleep 1 done echo "Page server is ready." -cp ${CONFIG_FILE_ORG} ${CONFIG_FILE} +cp "${CONFIG_FILE_ORG}" "${CONFIG_FILE}" - if [ -n "${TENANT_ID:-}" ] && [ -n "${TIMELINE_ID:-}" ]; then + if [[ -n "${TENANT_ID:-}" && -n "${TIMELINE_ID:-}" ]]; then tenant_id=${TENANT_ID} timeline_id=${TIMELINE_ID} else @@ -41,7 +41,7 @@ else "http://pageserver:9898/v1/tenant" ) tenant_id=$(curl "${PARAMS[@]}" | jq -r .[0].id) - if [ -z "${tenant_id}" ] || [ "${tenant_id}" = null ]; then + if [[ -z "${tenant_id}" || "${tenant_id}" = null ]]; then echo "Create a tenant" generate_id tenant_id PARAMS=( @@ -51,7 +51,7 @@ else "http://pageserver:9898/v1/tenant/${tenant_id}/location_config" ) result=$(curl "${PARAMS[@]}") - echo $result | jq . + printf '%s\n' "${result}" | jq . fi echo "Check if a timeline present" @@ -61,7 +61,7 @@ else "http://pageserver:9898/v1/tenant/${tenant_id}/timeline" ) timeline_id=$(curl "${PARAMS[@]}" | jq -r .[0].timeline_id) - if [ -z "${timeline_id}" ] || [ "${timeline_id}" = null ]; then + if [[ -z "${timeline_id}" || "${timeline_id}" = null ]]; then generate_id timeline_id PARAMS=( -sbf @@ -71,7 +71,7 @@ else "http://pageserver:9898/v1/tenant/${tenant_id}/timeline/" ) result=$(curl "${PARAMS[@]}") - echo $result | jq . + printf '%s\n' "${result}" | jq . fi fi @@ -82,10 +82,10 @@ else fi echo "Adding pgx_ulid" shared_libraries=$(jq -r '.spec.cluster.settings[] | select(.name=="shared_preload_libraries").value' ${CONFIG_FILE}) -sed -i "s/${shared_libraries}/${shared_libraries},${ulid_extension}/" ${CONFIG_FILE} +sed -i "s|${shared_libraries}|${shared_libraries},${ulid_extension}|" ${CONFIG_FILE} echo "Overwrite tenant id and timeline id in spec file" -sed -i "s/TENANT_ID/${tenant_id}/" ${CONFIG_FILE} -sed -i "s/TIMELINE_ID/${timeline_id}/" ${CONFIG_FILE} +sed -i "s|TENANT_ID|${tenant_id}|" ${CONFIG_FILE} +sed -i "s|TIMELINE_ID|${timeline_id}|" ${CONFIG_FILE} cat ${CONFIG_FILE} @@ -93,5 +93,5 @@ echo "Start compute node" /usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \ -C "postgresql://cloud_admin@localhost:55433/postgres" \ -b /usr/local/bin/postgres \ - --compute-id "compute-$RANDOM" \ - --config "$CONFIG_FILE" + --compute-id "compute-${RANDOM}" \ + --config "${CONFIG_FILE}" diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index a64909b134..3c0173b9c6 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -189,13 +189,14 @@ services: neon-test-extensions: profiles: ["test-extensions"] - image: ${REPOSITORY:-ghcr.io/neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TEST_EXTENSIONS_TAG:-${TAG:-latest}} + image: ${REPOSITORY:-ghcr.io/neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-${PG_VERSION:-16}}:${TEST_EXTENSIONS_TAG:-${TAG:-latest}} environment: - - PGPASSWORD=cloud_admin + - PGUSER=${PGUSER:-cloud_admin} + - PGPASSWORD=${PGPASSWORD:-cloud_admin} entrypoint: - "/bin/bash" - "-c" command: - - sleep 1800 + - sleep 3600 depends_on: - compute diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index 2645a49883..6edf90ca8d 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -54,6 +54,15 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do # It cannot be moved to Dockerfile now because the database directory is created after the start of the container echo Adding dummy config docker compose exec compute touch /var/db/postgres/compute/compute_ctl_temp_override.conf + # Prepare for the PostGIS test + docker compose exec compute mkdir -p /tmp/pgis_reg/pgis_reg_tmp + TMPDIR=$(mktemp -d) + docker compose cp neon-test-extensions:/ext-src/postgis-src/raster/test "${TMPDIR}" + docker compose cp neon-test-extensions:/ext-src/postgis-src/regress/00-regress-install "${TMPDIR}" + docker compose exec compute mkdir -p /ext-src/postgis-src/raster /ext-src/postgis-src/regress /ext-src/postgis-src/regress/00-regress-install + docker compose cp "${TMPDIR}/test" compute:/ext-src/postgis-src/raster/test + docker compose cp "${TMPDIR}/00-regress-install" compute:/ext-src/postgis-src/regress + rm -rf "${TMPDIR}" # The following block copies the files for the pg_hintplan test to the compute node for the extension test in an isolated docker-compose environment TMPDIR=$(mktemp -d) docker compose cp neon-test-extensions:/ext-src/pg_hint_plan-src/data "${TMPDIR}/data" @@ -68,7 +77,7 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do docker compose exec -T neon-test-extensions bash -c "(cd /postgres && patch -p1)" <"../compute/patches/contrib_pg${pg_version}.patch" # We are running tests now rm -f testout.txt testout_contrib.txt - docker compose exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,postgis-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src,rag_jina_reranker_v1_tiny_en-src,rag_bge_small_en_v15-src \ + docker compose exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src,rag_jina_reranker_v1_tiny_en-src,rag_bge_small_en_v15-src \ neon-test-extensions /run-tests.sh /ext-src | tee testout.txt && EXT_SUCCESS=1 || EXT_SUCCESS=0 docker compose exec -e SKIP=start-scripts,postgres_fdw,ltree_plpython,jsonb_plpython,jsonb_plperl,hstore_plpython,hstore_plperl,dblink,bool_plperl \ neon-test-extensions /run-tests.sh /postgres/contrib | tee testout_contrib.txt && CONTRIB_SUCCESS=1 || CONTRIB_SUCCESS=0 diff --git a/docker-compose/ext-src/postgis-src/README-Neon.md b/docker-compose/ext-src/postgis-src/README-Neon.md new file mode 100644 index 0000000000..5937fc782b --- /dev/null +++ b/docker-compose/ext-src/postgis-src/README-Neon.md @@ -0,0 +1,70 @@ +# PostGIS Testing in Neon + +This directory contains configuration files and patches for running PostGIS tests in the Neon database environment. + +## Overview + +PostGIS is a spatial database extension for PostgreSQL that adds support for geographic objects. Testing PostGIS compatibility ensures that Neon's modifications to PostgreSQL don't break compatibility with this critical extension. + +## PostGIS Versions + +- PostgreSQL v17: PostGIS 3.5.0 +- PostgreSQL v14/v15/v16: PostGIS 3.3.3 + +## Test Configuration + +The test setup includes: + +- `postgis-no-upgrade-test.patch`: Disables upgrade tests by removing the upgrade test section from regress/runtest.mk +- `postgis-regular-v16.patch`: Version-specific patch for PostgreSQL v16 +- `postgis-regular-v17.patch`: Version-specific patch for PostgreSQL v17 +- `regular-test.sh`: Script to run PostGIS tests as a regular user +- `neon-test.sh`: Script to handle version-specific test configurations +- `raster_outdb_template.sql`: Template for raster tests with explicit file paths + +## Excluded Tests + +**Important Note:** The test exclusions listed below are specifically for regular-user tests against staging instances. These exclusions are necessary because staging instances run with limited privileges and cannot perform operations requiring superuser access. Docker-compose based tests are not affected by these exclusions. + +### Tests Requiring Superuser Permissions + +These tests cannot be run as a regular user: +- `estimatedextent` +- `regress/core/legacy` +- `regress/core/typmod` +- `regress/loader/TestSkipANALYZE` +- `regress/loader/TestANALYZE` + +### Tests Requiring Filesystem Access + +These tests need direct filesystem access that is only possible for superusers: +- `loader/load_outdb` + +### Tests with Flaky Results + +These tests have assumptions that don't always hold true: +- `regress/core/computed_columns` - Assumes computed columns always outperform alternatives, which is not consistently true + +### Tests Requiring Tunable Parameter Modifications + +These tests attempt to modify the `postgis.gdal_enabled_drivers` parameter, which is only accessible to superusers: +- `raster/test/regress/rt_wkb` +- `raster/test/regress/rt_addband` +- `raster/test/regress/rt_setbandpath` +- `raster/test/regress/rt_fromgdalraster` +- `raster/test/regress/rt_asgdalraster` +- `raster/test/regress/rt_astiff` +- `raster/test/regress/rt_asjpeg` +- `raster/test/regress/rt_aspng` +- `raster/test/regress/permitted_gdal_drivers` +- Loader tests: `BasicOutDB`, `Tiled10x10`, `Tiled10x10Copy`, `Tiled8x8`, `TiledAuto`, `TiledAutoSkipNoData`, `TiledAutoCopyn` + +### Topology Tests (v17 only) +- `populate_topology_layer` +- `renametopogeometrycolumn` + +## Other Modifications + +- Binary.sql tests are modified to use explicit file paths +- Server-side SQL COPY commands (which require superuser privileges) are converted to client-side `\copy` commands +- Upgrade tests are disabled diff --git a/docker-compose/ext-src/postgis-src/neon-test.sh b/docker-compose/ext-src/postgis-src/neon-test.sh new file mode 100755 index 0000000000..2866649a1b --- /dev/null +++ b/docker-compose/ext-src/postgis-src/neon-test.sh @@ -0,0 +1,9 @@ +#!/bin/bash +set -ex +cd "$(dirname "$0")" +if [[ ${PG_VERSION} = v17 ]]; then + sed -i '/computed_columns/d' regress/core/tests.mk +fi +patch -p1 =" 120),1) +- TESTS += \ +- $(top_srcdir)/regress/core/computed_columns +-endif +- + ifeq ($(shell expr "$(POSTGIS_GEOS_VERSION)" ">=" 30700),1) + # GEOS-3.7 adds: + # ST_FrechetDistance +diff --git a/regress/loader/tests.mk b/regress/loader/tests.mk +index 1fc77ac..c3cb9de 100644 +--- a/regress/loader/tests.mk ++++ b/regress/loader/tests.mk +@@ -38,7 +38,5 @@ TESTS += \ + $(top_srcdir)/regress/loader/Latin1 \ + $(top_srcdir)/regress/loader/Latin1-implicit \ + $(top_srcdir)/regress/loader/mfile \ +- $(top_srcdir)/regress/loader/TestSkipANALYZE \ +- $(top_srcdir)/regress/loader/TestANALYZE \ + $(top_srcdir)/regress/loader/CharNoWidth + +diff --git a/regress/run_test.pl b/regress/run_test.pl +index 0ec5b2d..1c331f4 100755 +--- a/regress/run_test.pl ++++ b/regress/run_test.pl +@@ -147,7 +147,6 @@ $ENV{"LANG"} = "C"; + # Add locale info to the psql options + # Add pg12 precision suppression + my $PGOPTIONS = $ENV{"PGOPTIONS"}; +-$PGOPTIONS .= " -c lc_messages=C"; + $PGOPTIONS .= " -c client_min_messages=NOTICE"; + $PGOPTIONS .= " -c extra_float_digits=0"; + $ENV{"PGOPTIONS"} = $PGOPTIONS; diff --git a/docker-compose/ext-src/postgis-src/postgis-regular-v17.patch b/docker-compose/ext-src/postgis-src/postgis-regular-v17.patch new file mode 100644 index 0000000000..f4a9d83478 --- /dev/null +++ b/docker-compose/ext-src/postgis-src/postgis-regular-v17.patch @@ -0,0 +1,218 @@ +diff --git a/raster/test/regress/tests.mk b/raster/test/regress/tests.mk +index 00918e1..7e2b6cd 100644 +--- a/raster/test/regress/tests.mk ++++ b/raster/test/regress/tests.mk +@@ -17,9 +17,7 @@ override RUNTESTFLAGS_INTERNAL := \ + $(RUNTESTFLAGS_INTERNAL) \ + --after-upgrade-script $(top_srcdir)/raster/test/regress/hooks/hook-after-upgrade-raster.sql + +-RASTER_TEST_FIRST = \ +- $(top_srcdir)/raster/test/regress/check_gdal \ +- $(top_srcdir)/raster/test/regress/loader/load_outdb ++RASTER_TEST_FIRST = + + RASTER_TEST_LAST = \ + $(top_srcdir)/raster/test/regress/clean +@@ -33,9 +31,7 @@ RASTER_TEST_IO = \ + + RASTER_TEST_BASIC_FUNC = \ + $(top_srcdir)/raster/test/regress/rt_bytea \ +- $(top_srcdir)/raster/test/regress/rt_wkb \ + $(top_srcdir)/raster/test/regress/box3d \ +- $(top_srcdir)/raster/test/regress/rt_addband \ + $(top_srcdir)/raster/test/regress/rt_band \ + $(top_srcdir)/raster/test/regress/rt_tile + +@@ -73,16 +69,10 @@ RASTER_TEST_BANDPROPS = \ + $(top_srcdir)/raster/test/regress/rt_neighborhood \ + $(top_srcdir)/raster/test/regress/rt_nearestvalue \ + $(top_srcdir)/raster/test/regress/rt_pixelofvalue \ +- $(top_srcdir)/raster/test/regress/rt_polygon \ +- $(top_srcdir)/raster/test/regress/rt_setbandpath ++ $(top_srcdir)/raster/test/regress/rt_polygon + + RASTER_TEST_UTILITY = \ + $(top_srcdir)/raster/test/regress/rt_utility \ +- $(top_srcdir)/raster/test/regress/rt_fromgdalraster \ +- $(top_srcdir)/raster/test/regress/rt_asgdalraster \ +- $(top_srcdir)/raster/test/regress/rt_astiff \ +- $(top_srcdir)/raster/test/regress/rt_asjpeg \ +- $(top_srcdir)/raster/test/regress/rt_aspng \ + $(top_srcdir)/raster/test/regress/rt_reclass \ + $(top_srcdir)/raster/test/regress/rt_gdalwarp \ + $(top_srcdir)/raster/test/regress/rt_gdalcontour \ +@@ -120,21 +110,13 @@ RASTER_TEST_SREL = \ + + RASTER_TEST_BUGS = \ + $(top_srcdir)/raster/test/regress/bug_test_car5 \ +- $(top_srcdir)/raster/test/regress/permitted_gdal_drivers \ + $(top_srcdir)/raster/test/regress/tickets + + RASTER_TEST_LOADER = \ + $(top_srcdir)/raster/test/regress/loader/Basic \ + $(top_srcdir)/raster/test/regress/loader/Projected \ + $(top_srcdir)/raster/test/regress/loader/BasicCopy \ +- $(top_srcdir)/raster/test/regress/loader/BasicFilename \ +- $(top_srcdir)/raster/test/regress/loader/BasicOutDB \ +- $(top_srcdir)/raster/test/regress/loader/Tiled10x10 \ +- $(top_srcdir)/raster/test/regress/loader/Tiled10x10Copy \ +- $(top_srcdir)/raster/test/regress/loader/Tiled8x8 \ +- $(top_srcdir)/raster/test/regress/loader/TiledAuto \ +- $(top_srcdir)/raster/test/regress/loader/TiledAutoSkipNoData \ +- $(top_srcdir)/raster/test/regress/loader/TiledAutoCopyn ++ $(top_srcdir)/raster/test/regress/loader/BasicFilename + + RASTER_TESTS := $(RASTER_TEST_FIRST) \ + $(RASTER_TEST_METADATA) $(RASTER_TEST_IO) $(RASTER_TEST_BASIC_FUNC) \ +diff --git a/regress/core/binary.sql b/regress/core/binary.sql +index 7a36b65..ad78fc7 100644 +--- a/regress/core/binary.sql ++++ b/regress/core/binary.sql +@@ -1,4 +1,5 @@ + SET client_min_messages TO warning; ++ + CREATE SCHEMA tm; + + CREATE TABLE tm.geoms (id serial, g geometry); +@@ -31,24 +32,39 @@ SELECT st_force4d(g) FROM tm.geoms WHERE id < 15 ORDER BY id; + INSERT INTO tm.geoms(g) + SELECT st_setsrid(g,4326) FROM tm.geoms ORDER BY id; + +-COPY tm.geoms TO :tmpfile WITH BINARY; ++-- define temp file path ++\set tmpfile '/tmp/postgis_binary_test.dat' ++ ++-- export ++\set command '\\copy tm.geoms TO ':tmpfile' WITH (FORMAT BINARY)' ++:command ++ ++-- import + CREATE TABLE tm.geoms_in AS SELECT * FROM tm.geoms LIMIT 0; +-COPY tm.geoms_in FROM :tmpfile WITH BINARY; +-SELECT 'geometry', count(*) FROM tm.geoms_in i, tm.geoms o WHERE i.id = o.id +- AND ST_OrderingEquals(i.g, o.g); ++\set command '\\copy tm.geoms_in FROM ':tmpfile' WITH (FORMAT BINARY)' ++:command ++ ++SELECT 'geometry', count(*) FROM tm.geoms_in i, tm.geoms o ++WHERE i.id = o.id AND ST_OrderingEquals(i.g, o.g); + + CREATE TABLE tm.geogs AS SELECT id,g::geography FROM tm.geoms + WHERE geometrytype(g) NOT LIKE '%CURVE%' + AND geometrytype(g) NOT LIKE '%CIRCULAR%' + AND geometrytype(g) NOT LIKE '%SURFACE%' + AND geometrytype(g) NOT LIKE 'TRIANGLE%' +- AND geometrytype(g) NOT LIKE 'TIN%' +-; ++ AND geometrytype(g) NOT LIKE 'TIN%'; + +-COPY tm.geogs TO :tmpfile WITH BINARY; ++-- export ++\set command '\\copy tm.geogs TO ':tmpfile' WITH (FORMAT BINARY)' ++:command ++ ++-- import + CREATE TABLE tm.geogs_in AS SELECT * FROM tm.geogs LIMIT 0; +-COPY tm.geogs_in FROM :tmpfile WITH BINARY; +-SELECT 'geometry', count(*) FROM tm.geogs_in i, tm.geogs o WHERE i.id = o.id +- AND ST_OrderingEquals(i.g::geometry, o.g::geometry); ++\set command '\\copy tm.geogs_in FROM ':tmpfile' WITH (FORMAT BINARY)' ++:command ++ ++SELECT 'geometry', count(*) FROM tm.geogs_in i, tm.geogs o ++WHERE i.id = o.id AND ST_OrderingEquals(i.g::geometry, o.g::geometry); + + DROP SCHEMA tm CASCADE; ++ +diff --git a/regress/core/tests.mk b/regress/core/tests.mk +index 9e05244..a63a3e1 100644 +--- a/regress/core/tests.mk ++++ b/regress/core/tests.mk +@@ -16,14 +16,13 @@ POSTGIS_PGSQL_VERSION=170 + POSTGIS_GEOS_VERSION=31101 + HAVE_JSON=yes + HAVE_SPGIST=yes +-INTERRUPTTESTS=yes ++INTERRUPTTESTS=no + + current_dir := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) + + RUNTESTFLAGS_INTERNAL += \ + --before-upgrade-script $(top_srcdir)/regress/hooks/hook-before-upgrade.sql \ + --after-upgrade-script $(top_srcdir)/regress/hooks/hook-after-upgrade.sql \ +- --after-create-script $(top_srcdir)/regress/hooks/hook-after-create.sql \ + --before-uninstall-script $(top_srcdir)/regress/hooks/hook-before-uninstall.sql + + TESTS += \ +@@ -40,7 +39,6 @@ TESTS += \ + $(top_srcdir)/regress/core/dumppoints \ + $(top_srcdir)/regress/core/dumpsegments \ + $(top_srcdir)/regress/core/empty \ +- $(top_srcdir)/regress/core/estimatedextent \ + $(top_srcdir)/regress/core/forcecurve \ + $(top_srcdir)/regress/core/flatgeobuf \ + $(top_srcdir)/regress/core/frechet \ +@@ -60,7 +58,6 @@ TESTS += \ + $(top_srcdir)/regress/core/out_marc21 \ + $(top_srcdir)/regress/core/in_encodedpolyline \ + $(top_srcdir)/regress/core/iscollection \ +- $(top_srcdir)/regress/core/legacy \ + $(top_srcdir)/regress/core/letters \ + $(top_srcdir)/regress/core/lwgeom_regress \ + $(top_srcdir)/regress/core/measures \ +@@ -119,7 +116,6 @@ TESTS += \ + $(top_srcdir)/regress/core/temporal_knn \ + $(top_srcdir)/regress/core/tickets \ + $(top_srcdir)/regress/core/twkb \ +- $(top_srcdir)/regress/core/typmod \ + $(top_srcdir)/regress/core/wkb \ + $(top_srcdir)/regress/core/wkt \ + $(top_srcdir)/regress/core/wmsservers \ +@@ -143,8 +139,7 @@ TESTS += \ + $(top_srcdir)/regress/core/oriented_envelope \ + $(top_srcdir)/regress/core/point_coordinates \ + $(top_srcdir)/regress/core/out_geojson \ +- $(top_srcdir)/regress/core/wrapx \ +- $(top_srcdir)/regress/core/computed_columns ++ $(top_srcdir)/regress/core/wrapx + + # Slow slow tests + TESTS_SLOW = \ +diff --git a/regress/loader/tests.mk b/regress/loader/tests.mk +index ac4f8ad..4bad4fc 100644 +--- a/regress/loader/tests.mk ++++ b/regress/loader/tests.mk +@@ -38,7 +38,5 @@ TESTS += \ + $(top_srcdir)/regress/loader/Latin1 \ + $(top_srcdir)/regress/loader/Latin1-implicit \ + $(top_srcdir)/regress/loader/mfile \ +- $(top_srcdir)/regress/loader/TestSkipANALYZE \ +- $(top_srcdir)/regress/loader/TestANALYZE \ + $(top_srcdir)/regress/loader/CharNoWidth \ + +diff --git a/regress/run_test.pl b/regress/run_test.pl +index cac4b2e..4c7c82b 100755 +--- a/regress/run_test.pl ++++ b/regress/run_test.pl +@@ -238,7 +238,6 @@ $ENV{"LANG"} = "C"; + # Add locale info to the psql options + # Add pg12 precision suppression + my $PGOPTIONS = $ENV{"PGOPTIONS"}; +-$PGOPTIONS .= " -c lc_messages=C"; + $PGOPTIONS .= " -c client_min_messages=NOTICE"; + $PGOPTIONS .= " -c extra_float_digits=0"; + $ENV{"PGOPTIONS"} = $PGOPTIONS; +diff --git a/topology/test/tests.mk b/topology/test/tests.mk +index cbe2633..2c7c18f 100644 +--- a/topology/test/tests.mk ++++ b/topology/test/tests.mk +@@ -46,9 +46,7 @@ TESTS += \ + $(top_srcdir)/topology/test/regress/legacy_query.sql \ + $(top_srcdir)/topology/test/regress/legacy_validate.sql \ + $(top_srcdir)/topology/test/regress/polygonize.sql \ +- $(top_srcdir)/topology/test/regress/populate_topology_layer.sql \ + $(top_srcdir)/topology/test/regress/removeunusedprimitives.sql \ +- $(top_srcdir)/topology/test/regress/renametopogeometrycolumn.sql \ + $(top_srcdir)/topology/test/regress/renametopology.sql \ + $(top_srcdir)/topology/test/regress/share_sequences.sql \ + $(top_srcdir)/topology/test/regress/sqlmm.sql \ diff --git a/docker-compose/ext-src/postgis-src/raster_outdb_template.sql b/docker-compose/ext-src/postgis-src/raster_outdb_template.sql new file mode 100644 index 0000000000..16232f28dd --- /dev/null +++ b/docker-compose/ext-src/postgis-src/raster_outdb_template.sql @@ -0,0 +1,46 @@ +-- +-- PostgreSQL database dump +-- + +-- Dumped from database version 17.4 +-- Dumped by pg_dump version 17.4 + +SET statement_timeout = 0; +SET lock_timeout = 0; +SET idle_in_transaction_session_timeout = 0; +SET transaction_timeout = 0; +SET client_encoding = 'UTF8'; +SET standard_conforming_strings = on; +SELECT pg_catalog.set_config('search_path', '', false); +SET check_function_bodies = false; +SET xmloption = content; +SET client_min_messages = warning; + +-- +-- Name: raster_outdb_template; Type: TABLE; Schema: public; Owner: cloud_admin +-- + +CREATE TABLE public.raster_outdb_template ( + rid integer, + rast public.raster +); + + +ALTER TABLE public.raster_outdb_template OWNER TO cloud_admin; + +-- +-- Data for Name: raster_outdb_template; Type: TABLE DATA; Schema: public; Owner: cloud_admin +-- + +COPY public.raster_outdb_template (rid, rast) FROM stdin; +1 0100000300000000000000F03F000000000000F0BF0000000000000000000000000000000000000000000000000000000000000000000000005A0032008400002F6578742D7372632F706F73746769732D7372632F726567726573732F2E2E2F7261737465722F746573742F726567726573732F6C6F616465722F746573747261737465722E746966008400012F6578742D7372632F706F73746769732D7372632F726567726573732F2E2E2F7261737465722F746573742F726567726573732F6C6F616465722F746573747261737465722E746966008400022F6578742D7372632F706F73746769732D7372632F726567726573732F2E2E2F7261737465722F746573742F726567726573732F6C6F616465722F746573747261737465722E74696600 +2 0100000300000000000000F03F000000000000F0BF0000000000000000000000000000000000000000000000000000000000000000000000005A0032008400002F6578742D7372632F706F73746769732D7372632F726567726573732F2E2E2F7261737465722F746573742F726567726573732F6C6F616465722F746573747261737465722E746966008400012F6578742D7372632F706F73746769732D7372632F726567726573732F2E2E2F7261737465722F746573742F726567726573732F6C6F616465722F746573747261737465722E746966008400022F6578742D7372632F706F73746769732D7372632F726567726573732F2E2E2F7261737465722F746573742F726567726573732F6C6F616465722F746573747261737465722E74696600 +3 0100000200000000000000F03F000000000000F0BF0000000000000000000000000000000000000000000000000000000000000000000000005A00320044000101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101018400012F6578742D7372632F706F73746769732D7372632F726567726573732F2E2E2F7261737465722F746573742F726567726573732F6C6F616465722F746573747261737465722E74696600 +4 0100000200000000000000F03F000000000000F0BF0000000000000000000000000000000000000000000000000000000000000000000000005A003200C4FF012F6578742D7372632F706F73746769732D7372632F726567726573732F2E2E2F7261737465722F746573742F726567726573732F6C6F616465722F746573747261737465722E746966004400010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101 +\. + + +-- +-- PostgreSQL database dump complete +-- + diff --git a/docker-compose/ext-src/postgis-src/regular-test.sh b/docker-compose/ext-src/postgis-src/regular-test.sh new file mode 100755 index 0000000000..4b0b929946 --- /dev/null +++ b/docker-compose/ext-src/postgis-src/regular-test.sh @@ -0,0 +1,17 @@ +#!/bin/bash +set -ex +cd "$(dirname "${0}")" +dropdb --if-exist contrib_regression +createdb contrib_regression +psql -d contrib_regression -c "ALTER DATABASE contrib_regression SET TimeZone='UTC'" \ + -c "ALTER DATABASE contrib_regression SET DateStyle='ISO, MDY'" \ + -c "CREATE EXTENSION postgis SCHEMA public" \ + -c "CREATE EXTENSION postgis_topology" \ + -c "CREATE EXTENSION postgis_tiger_geocoder CASCADE" \ + -c "CREATE EXTENSION postgis_raster SCHEMA public" \ + -c "CREATE EXTENSION postgis_sfcgal SCHEMA public" +patch -p1 , - /// If true, download LFC state from endpoint_storage and pass it to Postgres on startup + /// Download LFC state from endpoint_storage and pass it to Postgres on startup #[serde(default)] - pub prewarm_lfc_on_startup: bool, + pub autoprewarm: bool, } /// Feature flag to signal `compute_ctl` to enable certain experimental functionality. @@ -192,6 +192,9 @@ pub enum ComputeFeature { /// track short-lived connections as user activity. ActivityMonitorExperimental, + /// Enable TLS functionality. + TlsExperimental, + /// This is a special feature flag that is used to represent unknown feature flags. /// Basically all unknown to enum flags are represented as this one. See unit test /// `parse_unknown_features()` for more details. @@ -250,34 +253,44 @@ impl RemoteExtSpec { } match self.extension_data.get(real_ext_name) { - Some(_ext_data) => { - // We have decided to use the Go naming convention due to Kubernetes. - - let arch = match std::env::consts::ARCH { - "x86_64" => "amd64", - "aarch64" => "arm64", - arch => arch, - }; - - // Construct the path to the extension archive - // BUILD_TAG/PG_MAJOR_VERSION/extensions/EXTENSION_NAME.tar.zst - // - // Keep it in sync with path generation in - // https://github.com/neondatabase/build-custom-extensions/tree/main - let archive_path_str = format!( - "{build_tag}/{arch}/{pg_major_version}/extensions/{real_ext_name}.tar.zst" - ); - Ok(( - real_ext_name.to_string(), - RemotePath::from_string(&archive_path_str)?, - )) - } + Some(_ext_data) => Ok(( + real_ext_name.to_string(), + Self::build_remote_path(build_tag, pg_major_version, real_ext_name)?, + )), None => Err(anyhow::anyhow!( "real_ext_name {} is not found", real_ext_name )), } } + + /// Get the architecture-specific portion of the remote extension path. We + /// use the Go naming convention due to Kubernetes. + fn get_arch() -> &'static str { + match std::env::consts::ARCH { + "x86_64" => "amd64", + "aarch64" => "arm64", + arch => arch, + } + } + + /// Build a [`RemotePath`] for an extension. + fn build_remote_path( + build_tag: &str, + pg_major_version: &str, + ext_name: &str, + ) -> anyhow::Result { + let arch = Self::get_arch(); + + // Construct the path to the extension archive + // BUILD_TAG/PG_MAJOR_VERSION/extensions/EXTENSION_NAME.tar.zst + // + // Keep it in sync with path generation in + // https://github.com/neondatabase/build-custom-extensions/tree/main + RemotePath::from_string(&format!( + "{build_tag}/{arch}/{pg_major_version}/extensions/{ext_name}.tar.zst" + )) + } } #[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Deserialize, Serialize)] @@ -518,6 +531,37 @@ mod tests { .expect("Library should be found"); } + #[test] + fn remote_extension_path() { + let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({ + "public_extensions": ["ext"], + "custom_extensions": [], + "library_index": { + "extlib": "ext", + }, + "extension_data": { + "ext": { + "control_data": { + "ext.control": "" + }, + "archive_path": "" + } + }, + })) + .unwrap(); + + let (_ext_name, ext_path) = rspec + .get_ext("ext", false, "latest", "v17") + .expect("Extension should be found"); + // Starting with a forward slash would have consequences for the + // Url::join() that occurs when downloading a remote extension. + assert!(!ext_path.to_string().starts_with("/")); + assert_eq!( + ext_path, + RemoteExtSpec::build_remote_path("latest", "v17", "ext").unwrap() + ); + } + #[test] fn parse_spec_file() { let file = File::open("tests/cluster_spec.json").unwrap(); diff --git a/libs/compute_api/tests/cluster_spec.json b/libs/compute_api/tests/cluster_spec.json index 30e788a601..2dd2aae015 100644 --- a/libs/compute_api/tests/cluster_spec.json +++ b/libs/compute_api/tests/cluster_spec.json @@ -85,7 +85,7 @@ "vartype": "bool" }, { - "name": "prewarm_lfc_on_startup", + "name": "autoprewarm", "value": "off", "vartype": "bool" }, diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 444983bd18..30b0612082 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -20,7 +20,6 @@ use postgres_backend::AuthType; use remote_storage::RemoteStorageConfig; use serde_with::serde_as; use utils::logging::LogFormat; -use utils::postgres_client::PostgresClientProtocol; use crate::models::{ImageCompressionAlgorithm, LsnLease}; @@ -181,6 +180,7 @@ pub struct ConfigToml { pub virtual_file_io_engine: Option, pub ingest_batch_size: u64, pub max_vectored_read_bytes: MaxVectoredReadBytes, + pub max_get_vectored_keys: MaxGetVectoredKeys, pub image_compression: ImageCompressionAlgorithm, pub timeline_offloading: bool, pub ephemeral_bytes_per_memory_kb: usize, @@ -188,7 +188,6 @@ pub struct ConfigToml { pub virtual_file_io_mode: Option, #[serde(skip_serializing_if = "Option::is_none")] pub no_sync: Option, - pub wal_receiver_protocol: PostgresClientProtocol, pub page_service_pipelining: PageServicePipeliningConfig, pub get_vectored_concurrent_io: GetVectoredConcurrentIo, pub enable_read_path_debugging: Option, @@ -229,7 +228,7 @@ pub enum PageServicePipeliningConfig { } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub struct PageServicePipeliningConfigPipelined { - /// Causes runtime errors if larger than max get_vectored batch size. + /// Failed config parsing and validation if larger than `max_get_vectored_keys`. pub max_batch_size: NonZeroUsize, pub execution: PageServiceProtocolPipelinedExecutionStrategy, // The default below is such that new versions of the software can start @@ -329,6 +328,8 @@ pub struct TimelineImportConfig { pub import_job_concurrency: NonZeroUsize, pub import_job_soft_size_limit: NonZeroUsize, pub import_job_checkpoint_threshold: NonZeroUsize, + /// Max size of the remote storage partial read done by any job + pub import_job_max_byte_range_size: NonZeroUsize, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -403,6 +404,16 @@ impl Default for EvictionOrder { #[serde(transparent)] pub struct MaxVectoredReadBytes(pub NonZeroUsize); +#[derive(Copy, Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(transparent)] +pub struct MaxGetVectoredKeys(NonZeroUsize); + +impl MaxGetVectoredKeys { + pub fn get(&self) -> usize { + self.0.get() + } +} + /// Tenant-level configuration values, used for various purposes. #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] #[serde(default)] @@ -514,8 +525,6 @@ pub struct TenantConfigToml { /// (either this flag or the pageserver-global one need to be set) pub timeline_offloading: bool, - pub wal_receiver_protocol_override: Option, - /// Enable rel_size_v2 for this tenant. Once enabled, the tenant will persist this information into /// `index_part.json`, and it cannot be reversed. pub rel_size_v2_enabled: bool, @@ -587,6 +596,8 @@ pub mod defaults { /// That is, slightly above 128 kB. pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 130 * 1024; // 130 KiB + pub const DEFAULT_MAX_GET_VECTORED_KEYS: usize = 32; + pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm = ImageCompressionAlgorithm::Zstd { level: Some(1) }; @@ -594,9 +605,6 @@ pub mod defaults { pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512; - pub const DEFAULT_WAL_RECEIVER_PROTOCOL: utils::postgres_client::PostgresClientProtocol = - utils::postgres_client::PostgresClientProtocol::Vanilla; - pub const DEFAULT_SSL_KEY_FILE: &str = "server.key"; pub const DEFAULT_SSL_CERT_FILE: &str = "server.crt"; } @@ -685,6 +693,9 @@ impl Default for ConfigToml { max_vectored_read_bytes: (MaxVectoredReadBytes( NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), )), + max_get_vectored_keys: (MaxGetVectoredKeys( + NonZeroUsize::new(DEFAULT_MAX_GET_VECTORED_KEYS).unwrap(), + )), image_compression: (DEFAULT_IMAGE_COMPRESSION), timeline_offloading: true, ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), @@ -692,7 +703,6 @@ impl Default for ConfigToml { virtual_file_io_mode: None, tenant_config: TenantConfigToml::default(), no_sync: None, - wal_receiver_protocol: DEFAULT_WAL_RECEIVER_PROTOCOL, page_service_pipelining: PageServicePipeliningConfig::Pipelined( PageServicePipeliningConfigPipelined { max_batch_size: NonZeroUsize::new(32).unwrap(), @@ -716,6 +726,7 @@ impl Default for ConfigToml { import_job_concurrency: NonZeroUsize::new(32).unwrap(), import_job_soft_size_limit: NonZeroUsize::new(256 * 1024 * 1024).unwrap(), import_job_checkpoint_threshold: NonZeroUsize::new(32).unwrap(), + import_job_max_byte_range_size: NonZeroUsize::new(4 * 1024 * 1024).unwrap(), }, basebackup_cache_config: None, posthog_config: None, @@ -836,7 +847,6 @@ impl Default for TenantConfigToml { lsn_lease_length: LsnLease::DEFAULT_LENGTH, lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS, timeline_offloading: true, - wal_receiver_protocol_override: None, rel_size_v2_enabled: false, gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED, gc_compaction_verification: DEFAULT_GC_COMPACTION_VERIFICATION, diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 28ced4a368..881f24b86c 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -20,7 +20,6 @@ use serde_with::serde_as; pub use utilization::PageserverUtilization; use utils::id::{NodeId, TenantId, TimelineId}; use utils::lsn::Lsn; -use utils::postgres_client::PostgresClientProtocol; use utils::{completion, serde_system_time}; use crate::config::Ratio; @@ -622,8 +621,6 @@ pub struct TenantConfigPatch { #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub timeline_offloading: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] - pub wal_receiver_protocol_override: FieldPatch, - #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub rel_size_v2_enabled: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub gc_compaction_enabled: FieldPatch, @@ -748,9 +745,6 @@ pub struct TenantConfig { #[serde(skip_serializing_if = "Option::is_none")] pub timeline_offloading: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub wal_receiver_protocol_override: Option, - #[serde(skip_serializing_if = "Option::is_none")] pub rel_size_v2_enabled: Option, @@ -812,7 +806,6 @@ impl TenantConfig { mut lsn_lease_length, mut lsn_lease_length_for_ts, mut timeline_offloading, - mut wal_receiver_protocol_override, mut rel_size_v2_enabled, mut gc_compaction_enabled, mut gc_compaction_verification, @@ -905,9 +898,6 @@ impl TenantConfig { .map(|v| humantime::parse_duration(&v))? .apply(&mut lsn_lease_length_for_ts); patch.timeline_offloading.apply(&mut timeline_offloading); - patch - .wal_receiver_protocol_override - .apply(&mut wal_receiver_protocol_override); patch.rel_size_v2_enabled.apply(&mut rel_size_v2_enabled); patch .gc_compaction_enabled @@ -960,7 +950,6 @@ impl TenantConfig { lsn_lease_length, lsn_lease_length_for_ts, timeline_offloading, - wal_receiver_protocol_override, rel_size_v2_enabled, gc_compaction_enabled, gc_compaction_verification, @@ -1058,9 +1047,6 @@ impl TenantConfig { timeline_offloading: self .timeline_offloading .unwrap_or(global_conf.timeline_offloading), - wal_receiver_protocol_override: self - .wal_receiver_protocol_override - .or(global_conf.wal_receiver_protocol_override), rel_size_v2_enabled: self .rel_size_v2_enabled .unwrap_or(global_conf.rel_size_v2_enabled), diff --git a/libs/posthog_client_lite/src/background_loop.rs b/libs/posthog_client_lite/src/background_loop.rs index a05f6096b1..a404c76da9 100644 --- a/libs/posthog_client_lite/src/background_loop.rs +++ b/libs/posthog_client_lite/src/background_loop.rs @@ -6,7 +6,7 @@ use arc_swap::ArcSwap; use tokio_util::sync::CancellationToken; use tracing::{Instrument, info_span}; -use crate::{FeatureStore, PostHogClient, PostHogClientConfig}; +use crate::{CaptureEvent, FeatureStore, PostHogClient, PostHogClientConfig}; /// A background loop that fetches feature flags from PostHog and updates the feature store. pub struct FeatureResolverBackgroundLoop { @@ -24,9 +24,16 @@ impl FeatureResolverBackgroundLoop { } } - pub fn spawn(self: Arc, handle: &tokio::runtime::Handle, refresh_period: Duration) { + pub fn spawn( + self: Arc, + handle: &tokio::runtime::Handle, + refresh_period: Duration, + fake_tenants: Vec, + ) { let this = self.clone(); let cancel = self.cancel.clone(); + + // Main loop of updating the feature flags. handle.spawn( async move { tracing::info!("Starting PostHog feature resolver"); @@ -56,6 +63,22 @@ impl FeatureResolverBackgroundLoop { } .instrument(info_span!("posthog_feature_resolver")), ); + + // Report fake tenants to PostHog so that we have the combination of all the properties in the UI. + // Do one report per pageserver restart. + let this = self.clone(); + handle.spawn( + async move { + tracing::info!("Starting PostHog feature reporter"); + for tenant in &fake_tenants { + tracing::info!("Reporting fake tenant: {:?}", tenant); + } + if let Err(e) = this.posthog_client.capture_event_batch(&fake_tenants).await { + tracing::warn!("Cannot report fake tenants: {}", e); + } + } + .instrument(info_span!("posthog_feature_reporter")), + ); } pub fn feature_store(&self) -> Arc { diff --git a/libs/posthog_client_lite/src/lib.rs b/libs/posthog_client_lite/src/lib.rs index ff12051196..f607b1be0a 100644 --- a/libs/posthog_client_lite/src/lib.rs +++ b/libs/posthog_client_lite/src/lib.rs @@ -22,6 +22,16 @@ pub enum PostHogEvaluationError { Internal(String), } +impl PostHogEvaluationError { + pub fn as_variant_str(&self) -> &'static str { + match self { + PostHogEvaluationError::NotAvailable(_) => "not_available", + PostHogEvaluationError::NoConditionGroupMatched => "no_condition_group_matched", + PostHogEvaluationError::Internal(_) => "internal", + } + } +} + #[derive(Deserialize)] pub struct LocalEvaluationResponse { pub flags: Vec, @@ -54,7 +64,7 @@ pub struct LocalEvaluationFlagFilterProperty { operator: String, } -#[derive(Debug, Serialize, Deserialize)] +#[derive(Debug, Serialize, Deserialize, Clone)] #[serde(untagged)] pub enum PostHogFlagFilterPropertyValue { String(String), @@ -497,6 +507,13 @@ pub struct PostHogClient { client: reqwest::Client, } +#[derive(Serialize, Debug)] +pub struct CaptureEvent { + pub event: String, + pub distinct_id: String, + pub properties: serde_json::Value, +} + impl PostHogClient { pub fn new(config: PostHogClientConfig) -> Self { let client = reqwest::Client::new(); @@ -560,12 +577,12 @@ impl PostHogClient { &self, event: &str, distinct_id: &str, - properties: &HashMap, + properties: &serde_json::Value, ) -> anyhow::Result<()> { // PUBLIC_URL/capture/ - // with bearer token of self.client_api_key let url = format!("{}/capture/", self.config.public_api_url); - self.client + let response = self + .client .post(url) .body(serde_json::to_string(&json!({ "api_key": self.config.client_api_key, @@ -575,6 +592,39 @@ impl PostHogClient { }))?) .send() .await?; + let status = response.status(); + let body = response.text().await?; + if !status.is_success() { + return Err(anyhow::anyhow!( + "Failed to capture events: {}, {}", + status, + body + )); + } + Ok(()) + } + + pub async fn capture_event_batch(&self, events: &[CaptureEvent]) -> anyhow::Result<()> { + // PUBLIC_URL/batch/ + let url = format!("{}/batch/", self.config.public_api_url); + let response = self + .client + .post(url) + .body(serde_json::to_string(&json!({ + "api_key": self.config.client_api_key, + "batch": events, + }))?) + .send() + .await?; + let status = response.status(); + let body = response.text().await?; + if !status.is_success() { + return Err(anyhow::anyhow!( + "Failed to capture events: {}, {}", + status, + body + )); + } Ok(()) } } diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index d660602149..4d6cbae9a9 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -439,6 +439,7 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState { currentClusterSize: crate::bindings::pg_atomic_uint64 { value: 0 }, shard_ps_feedback: [empty_feedback; 128], num_shards: 0, + replica_promote: false, min_ps_feedback: empty_feedback, } } diff --git a/pageserver/page_api/Cargo.toml b/pageserver/page_api/Cargo.toml index 4f62c77eb2..e643b5749b 100644 --- a/pageserver/page_api/Cargo.toml +++ b/pageserver/page_api/Cargo.toml @@ -9,7 +9,6 @@ bytes.workspace = true pageserver_api.workspace = true postgres_ffi.workspace = true prost.workspace = true -smallvec.workspace = true thiserror.workspace = true tonic.workspace = true utils.workspace = true diff --git a/pageserver/page_api/src/model.rs b/pageserver/page_api/src/model.rs index 0268ab920b..1a08d04cc1 100644 --- a/pageserver/page_api/src/model.rs +++ b/pageserver/page_api/src/model.rs @@ -9,12 +9,16 @@ //! - Use more precise datatypes, e.g. Lsn and uints shorter than 32 bits. //! //! - Validate protocol invariants, via try_from() and try_into(). +//! +//! Validation only happens on the receiver side, i.e. when converting from Protobuf to domain +//! types. This is where it matters -- the Protobuf types are less strict than the domain types, and +//! receivers should expect all sorts of junk from senders. This also allows the sender to use e.g. +//! stream combinators without dealing with errors, and avoids validating the same message twice. use std::fmt::Display; use bytes::Bytes; use postgres_ffi::Oid; -use smallvec::SmallVec; // TODO: split out Lsn, RelTag, SlruKind, Oid and other basic types to a separate crate, to avoid // pulling in all of their other crate dependencies when building the client. use utils::lsn::Lsn; @@ -72,47 +76,35 @@ impl Display for ReadLsn { } } -impl ReadLsn { - /// Validates the ReadLsn. - pub fn validate(&self) -> Result<(), ProtocolError> { - if self.request_lsn == Lsn::INVALID { - return Err(ProtocolError::invalid("request_lsn", self.request_lsn)); - } - if self.not_modified_since_lsn > Some(self.request_lsn) { - return Err(ProtocolError::invalid( - "not_modified_since_lsn", - self.not_modified_since_lsn, - )); - } - Ok(()) - } -} - impl TryFrom for ReadLsn { type Error = ProtocolError; fn try_from(pb: proto::ReadLsn) -> Result { - let read_lsn = Self { + if pb.request_lsn == 0 { + return Err(ProtocolError::invalid("request_lsn", pb.request_lsn)); + } + if pb.not_modified_since_lsn > pb.request_lsn { + return Err(ProtocolError::invalid( + "not_modified_since_lsn", + pb.not_modified_since_lsn, + )); + } + Ok(Self { request_lsn: Lsn(pb.request_lsn), not_modified_since_lsn: match pb.not_modified_since_lsn { 0 => None, lsn => Some(Lsn(lsn)), }, - }; - read_lsn.validate()?; - Ok(read_lsn) + }) } } -impl TryFrom for proto::ReadLsn { - type Error = ProtocolError; - - fn try_from(read_lsn: ReadLsn) -> Result { - read_lsn.validate()?; - Ok(Self { +impl From for proto::ReadLsn { + fn from(read_lsn: ReadLsn) -> Self { + Self { request_lsn: read_lsn.request_lsn.0, not_modified_since_lsn: read_lsn.not_modified_since_lsn.unwrap_or_default().0, - }) + } } } @@ -167,6 +159,15 @@ impl TryFrom for CheckRelExistsRequest { } } +impl From for proto::CheckRelExistsRequest { + fn from(request: CheckRelExistsRequest) -> Self { + Self { + read_lsn: Some(request.read_lsn.into()), + rel: Some(request.rel.into()), + } + } +} + pub type CheckRelExistsResponse = bool; impl From for CheckRelExistsResponse { @@ -204,14 +205,12 @@ impl TryFrom for GetBaseBackupRequest { } } -impl TryFrom for proto::GetBaseBackupRequest { - type Error = ProtocolError; - - fn try_from(request: GetBaseBackupRequest) -> Result { - Ok(Self { - read_lsn: Some(request.read_lsn.try_into()?), +impl From for proto::GetBaseBackupRequest { + fn from(request: GetBaseBackupRequest) -> Self { + Self { + read_lsn: Some(request.read_lsn.into()), replica: request.replica, - }) + } } } @@ -228,14 +227,9 @@ impl TryFrom for GetBaseBackupResponseChunk { } } -impl TryFrom for proto::GetBaseBackupResponseChunk { - type Error = ProtocolError; - - fn try_from(chunk: GetBaseBackupResponseChunk) -> Result { - if chunk.is_empty() { - return Err(ProtocolError::Missing("chunk")); - } - Ok(Self { chunk }) +impl From for proto::GetBaseBackupResponseChunk { + fn from(chunk: GetBaseBackupResponseChunk) -> Self { + Self { chunk } } } @@ -260,14 +254,12 @@ impl TryFrom for GetDbSizeRequest { } } -impl TryFrom for proto::GetDbSizeRequest { - type Error = ProtocolError; - - fn try_from(request: GetDbSizeRequest) -> Result { - Ok(Self { - read_lsn: Some(request.read_lsn.try_into()?), +impl From for proto::GetDbSizeRequest { + fn from(request: GetDbSizeRequest) -> Self { + Self { + read_lsn: Some(request.read_lsn.into()), db_oid: request.db_oid, - }) + } } } @@ -302,7 +294,7 @@ pub struct GetPageRequest { /// Multiple pages will be executed as a single batch by the Pageserver, amortizing layer access /// costs and parallelizing them. This may increase the latency of any individual request, but /// improves the overall latency and throughput of the batch as a whole. - pub block_numbers: SmallVec<[u32; 1]>, + pub block_numbers: Vec, } impl TryFrom for GetPageRequest { @@ -320,25 +312,20 @@ impl TryFrom for GetPageRequest { .ok_or(ProtocolError::Missing("read_lsn"))? .try_into()?, rel: pb.rel.ok_or(ProtocolError::Missing("rel"))?.try_into()?, - block_numbers: pb.block_number.into(), + block_numbers: pb.block_number, }) } } -impl TryFrom for proto::GetPageRequest { - type Error = ProtocolError; - - fn try_from(request: GetPageRequest) -> Result { - if request.block_numbers.is_empty() { - return Err(ProtocolError::Missing("block_number")); - } - Ok(Self { +impl From for proto::GetPageRequest { + fn from(request: GetPageRequest) -> Self { + Self { request_id: request.request_id, request_class: request.request_class.into(), - read_lsn: Some(request.read_lsn.try_into()?), + read_lsn: Some(request.read_lsn.into()), rel: Some(request.rel.into()), - block_number: request.block_numbers.into_vec(), - }) + block_number: request.block_numbers, + } } } @@ -410,7 +397,7 @@ pub struct GetPageResponse { /// A string describing the status, if any. pub reason: Option, /// The 8KB page images, in the same order as the request. Empty if status != OK. - pub page_images: SmallVec<[Bytes; 1]>, + pub page_images: Vec, } impl From for GetPageResponse { @@ -419,7 +406,7 @@ impl From for GetPageResponse { request_id: pb.request_id, status_code: pb.status_code.into(), reason: Some(pb.reason).filter(|r| !r.is_empty()), - page_images: pb.page_image.into(), + page_images: pb.page_image, } } } @@ -430,7 +417,7 @@ impl From for proto::GetPageResponse { request_id: response.request_id, status_code: response.status_code.into(), reason: response.reason.unwrap_or_default(), - page_image: response.page_images.into_vec(), + page_image: response.page_images, } } } @@ -519,14 +506,12 @@ impl TryFrom for GetRelSizeRequest { } } -impl TryFrom for proto::GetRelSizeRequest { - type Error = ProtocolError; - - fn try_from(request: GetRelSizeRequest) -> Result { - Ok(Self { - read_lsn: Some(request.read_lsn.try_into()?), +impl From for proto::GetRelSizeRequest { + fn from(request: GetRelSizeRequest) -> Self { + Self { + read_lsn: Some(request.read_lsn.into()), rel: Some(request.rel.into()), - }) + } } } @@ -569,15 +554,13 @@ impl TryFrom for GetSlruSegmentRequest { } } -impl TryFrom for proto::GetSlruSegmentRequest { - type Error = ProtocolError; - - fn try_from(request: GetSlruSegmentRequest) -> Result { - Ok(Self { - read_lsn: Some(request.read_lsn.try_into()?), +impl From for proto::GetSlruSegmentRequest { + fn from(request: GetSlruSegmentRequest) -> Self { + Self { + read_lsn: Some(request.read_lsn.into()), kind: request.kind as u32, segno: request.segno, - }) + } } } @@ -594,15 +577,9 @@ impl TryFrom for GetSlruSegmentResponse { } } -impl TryFrom for proto::GetSlruSegmentResponse { - type Error = ProtocolError; - - fn try_from(segment: GetSlruSegmentResponse) -> Result { - // TODO: can a segment legitimately be empty? - if segment.is_empty() { - return Err(ProtocolError::Missing("segment")); - } - Ok(Self { segment }) +impl From for proto::GetSlruSegmentResponse { + fn from(segment: GetSlruSegmentResponse) -> Self { + Self { segment } } } diff --git a/pageserver/pagebench/Cargo.toml b/pageserver/pagebench/Cargo.toml index ceb1278eab..5e4af88e69 100644 --- a/pageserver/pagebench/Cargo.toml +++ b/pageserver/pagebench/Cargo.toml @@ -9,6 +9,7 @@ license.workspace = true [dependencies] anyhow.workspace = true async-trait.workspace = true +bytes.workspace = true camino.workspace = true clap.workspace = true futures.workspace = true diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index 395e9cac41..3f3b6e396e 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -1,4 +1,4 @@ -use std::collections::{HashSet, VecDeque}; +use std::collections::{HashMap, HashSet, VecDeque}; use std::future::Future; use std::num::NonZeroUsize; use std::pin::Pin; @@ -8,12 +8,12 @@ use std::time::{Duration, Instant}; use anyhow::Context; use async_trait::async_trait; +use bytes::Bytes; use camino::Utf8PathBuf; use pageserver_api::key::Key; use pageserver_api::keyspace::KeySpaceAccum; -use pageserver_api::models::{ - PagestreamGetPageRequest, PagestreamGetPageResponse, PagestreamRequest, -}; +use pageserver_api::models::{PagestreamGetPageRequest, PagestreamRequest}; +use pageserver_api::reltag::RelTag; use pageserver_api::shard::TenantShardId; use pageserver_page_api::proto; use rand::prelude::*; @@ -77,6 +77,16 @@ pub(crate) struct Args { #[clap(long, default_value = "1")] queue_depth: NonZeroUsize, + /// Batch size of contiguous pages generated by each client. This is equivalent to how Postgres + /// will request page batches (e.g. prefetches or vectored reads). A batch counts as 1 RPS and + /// 1 queue depth. + /// + /// The libpq protocol does not support client-side batching, and will submit batches as many + /// individual requests, in the hope that the server will batch them. Each batch still counts as + /// 1 RPS and 1 queue depth. + #[clap(long, default_value = "1")] + batch_size: NonZeroUsize, + #[clap(long)] only_relnode: Option, @@ -392,7 +402,16 @@ async fn run_worker( shared_state.start_work_barrier.wait().await; let client_start = Instant::now(); let mut ticks_processed = 0; - let mut inflight = VecDeque::new(); + let mut req_id = 0; + let batch_size: usize = args.batch_size.into(); + + // Track inflight requests by request ID and start time. This times the request duration, and + // ensures responses match requests. We don't expect responses back in any particular order. + // + // NB: this does not check that all requests received a response, because we don't wait for the + // inflight requests to complete when the duration elapses. + let mut inflight: HashMap = HashMap::new(); + while !cancel.is_cancelled() { // Detect if a request took longer than the RPS rate if let Some(period) = &rps_period { @@ -408,36 +427,72 @@ async fn run_worker( } while inflight.len() < args.queue_depth.get() { + req_id += 1; let start = Instant::now(); - let req = { + let (req_lsn, mod_lsn, rel, blks) = { + /// Converts a compact i128 key to a relation tag and block number. + fn key_to_block(key: i128) -> (RelTag, u32) { + let key = Key::from_i128(key); + assert!(key.is_rel_block_key()); + key.to_rel_block() + .expect("we filter non-rel-block keys out above") + } + + // Pick a random page from a random relation. let mut rng = rand::thread_rng(); let r = &ranges[weights.sample(&mut rng)]; let key: i128 = rng.gen_range(r.start..r.end); - let key = Key::from_i128(key); - assert!(key.is_rel_block_key()); - let (rel_tag, block_no) = key - .to_rel_block() - .expect("we filter non-rel-block keys out above"); - PagestreamGetPageRequest { - hdr: PagestreamRequest { - reqid: 0, - request_lsn: if rng.gen_bool(args.req_latest_probability) { - Lsn::MAX - } else { - r.timeline_lsn - }, - not_modified_since: r.timeline_lsn, - }, - rel: rel_tag, - blkno: block_no, + let (rel_tag, block_no) = key_to_block(key); + + let mut blks = VecDeque::with_capacity(batch_size); + blks.push_back(block_no); + + // If requested, populate a batch of sequential pages. This is how Postgres will + // request page batches (e.g. prefetches). If we hit the end of the relation, we + // grow the batch towards the start too. + for i in 1..batch_size { + let (r, b) = key_to_block(key + i as i128); + if r != rel_tag { + break; // went outside relation + } + blks.push_back(b) } + + if blks.len() < batch_size { + // Grow batch backwards if needed. + for i in 1..batch_size { + let (r, b) = key_to_block(key - i as i128); + if r != rel_tag { + break; // went outside relation + } + blks.push_front(b) + } + } + + // We assume that the entire batch can fit within the relation. + assert_eq!(blks.len(), batch_size, "incomplete batch"); + + let req_lsn = if rng.gen_bool(args.req_latest_probability) { + Lsn::MAX + } else { + r.timeline_lsn + }; + (req_lsn, r.timeline_lsn, rel_tag, blks.into()) }; - client.send_get_page(req).await.unwrap(); - inflight.push_back(start); + client + .send_get_page(req_id, req_lsn, mod_lsn, rel, blks) + .await + .unwrap(); + let old = inflight.insert(req_id, start); + assert!(old.is_none(), "duplicate request ID {req_id}"); } - let start = inflight.pop_front().unwrap(); - client.recv_get_page().await.unwrap(); + let (req_id, pages) = client.recv_get_page().await.unwrap(); + assert_eq!(pages.len(), batch_size, "unexpected page count"); + assert!(pages.iter().all(|p| !p.is_empty()), "empty page"); + let start = inflight + .remove(&req_id) + .expect("response for unknown request ID"); let end = Instant::now(); shared_state.live_stats.request_done(); ticks_processed += 1; @@ -467,15 +522,24 @@ async fn run_worker( #[async_trait] trait Client: Send { /// Sends an asynchronous GetPage request to the pageserver. - async fn send_get_page(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()>; + async fn send_get_page( + &mut self, + req_id: u64, + req_lsn: Lsn, + mod_lsn: Lsn, + rel: RelTag, + blks: Vec, + ) -> anyhow::Result<()>; /// Receives the next GetPage response from the pageserver. - async fn recv_get_page(&mut self) -> anyhow::Result; + async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec)>; } /// A libpq-based Pageserver client. struct LibpqClient { inner: pageserver_client::page_service::PagestreamClient, + // Track sent batches, so we know how many responses to expect. + batch_sizes: VecDeque, } impl LibpqClient { @@ -484,18 +548,55 @@ impl LibpqClient { .await? .pagestream(ttid.tenant_id, ttid.timeline_id) .await?; - Ok(Self { inner }) + Ok(Self { + inner, + batch_sizes: VecDeque::new(), + }) } } #[async_trait] impl Client for LibpqClient { - async fn send_get_page(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> { - self.inner.getpage_send(req).await + async fn send_get_page( + &mut self, + req_id: u64, + req_lsn: Lsn, + mod_lsn: Lsn, + rel: RelTag, + blks: Vec, + ) -> anyhow::Result<()> { + // libpq doesn't support client-side batches, so we send a bunch of individual requests + // instead in the hope that the server will batch them for us. We use the same request ID + // for all, because we'll return a single batch response. + self.batch_sizes.push_back(blks.len()); + for blkno in blks { + let req = PagestreamGetPageRequest { + hdr: PagestreamRequest { + reqid: req_id, + request_lsn: req_lsn, + not_modified_since: mod_lsn, + }, + rel, + blkno, + }; + self.inner.getpage_send(req).await?; + } + Ok(()) } - async fn recv_get_page(&mut self) -> anyhow::Result { - self.inner.getpage_recv().await + async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec)> { + let batch_size = self.batch_sizes.pop_front().unwrap(); + let mut batch = Vec::with_capacity(batch_size); + let mut req_id = None; + for _ in 0..batch_size { + let resp = self.inner.getpage_recv().await?; + if req_id.is_none() { + req_id = Some(resp.req.hdr.reqid); + } + assert_eq!(req_id, Some(resp.req.hdr.reqid), "request ID mismatch"); + batch.push(resp.page); + } + Ok((req_id.unwrap(), batch)) } } @@ -532,31 +633,35 @@ impl GrpcClient { #[async_trait] impl Client for GrpcClient { - async fn send_get_page(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> { + async fn send_get_page( + &mut self, + req_id: u64, + req_lsn: Lsn, + mod_lsn: Lsn, + rel: RelTag, + blks: Vec, + ) -> anyhow::Result<()> { let req = proto::GetPageRequest { - request_id: 0, + request_id: req_id, request_class: proto::GetPageClass::Normal as i32, read_lsn: Some(proto::ReadLsn { - request_lsn: req.hdr.request_lsn.0, - not_modified_since_lsn: req.hdr.not_modified_since.0, + request_lsn: req_lsn.0, + not_modified_since_lsn: mod_lsn.0, }), - rel: Some(req.rel.into()), - block_number: vec![req.blkno], + rel: Some(rel.into()), + block_number: blks, }; self.req_tx.send(req).await?; Ok(()) } - async fn recv_get_page(&mut self) -> anyhow::Result { + async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec)> { let resp = self.resp_rx.message().await?.unwrap(); anyhow::ensure!( resp.status_code == proto::GetPageStatusCode::Ok as i32, "unexpected status code: {}", resp.status_code ); - Ok(PagestreamGetPageResponse { - page: resp.page_image[0].clone(), - req: PagestreamGetPageRequest::default(), // dummy - }) + Ok((resp.request_id, resp.page_image)) } } diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 4dba9d267c..2a0548b811 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -371,7 +371,7 @@ where .await? .partition( self.timeline.get_shard_identity(), - Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64, + self.timeline.conf.max_get_vectored_keys.get() as u64 * BLCKSZ as u64, ); let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar); diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 154cc86eb5..45b90a5aa3 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -171,7 +171,6 @@ fn main() -> anyhow::Result<()> { // (maybe we should automate this with a visitor?). info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode"); - info!(?conf.wal_receiver_protocol, "starting with WAL receiver protocol"); info!(?conf.validate_wal_contiguity, "starting with WAL contiguity validation"); info!(?conf.page_service_pipelining, "starting with page service pipelining config"); info!(?conf.get_vectored_concurrent_io, "starting with get_vectored IO concurrency config"); @@ -832,6 +831,7 @@ fn start_pageserver( tenant_manager.clone(), grpc_auth, otel_guard.as_ref().map(|g| g.dispatch.clone()), + conf.get_vectored_concurrent_io, grpc_listener, )?); } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 89f7539722..3492a8d966 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -14,7 +14,10 @@ use std::time::Duration; use anyhow::{Context, bail, ensure}; use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; -use pageserver_api::config::{DiskUsageEvictionTaskConfig, MaxVectoredReadBytes, PostHogConfig}; +use pageserver_api::config::{ + DiskUsageEvictionTaskConfig, MaxGetVectoredKeys, MaxVectoredReadBytes, + PageServicePipeliningConfig, PageServicePipeliningConfigPipelined, PostHogConfig, +}; use pageserver_api::models::ImageCompressionAlgorithm; use pageserver_api::shard::TenantShardId; use pem::Pem; @@ -24,7 +27,6 @@ use reqwest::Url; use storage_broker::Uri; use utils::id::{NodeId, TimelineId}; use utils::logging::{LogFormat, SecretString}; -use utils::postgres_client::PostgresClientProtocol; use crate::tenant::storage_layer::inmemory_layer::IndexEntry; use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; @@ -185,6 +187,9 @@ pub struct PageServerConf { pub max_vectored_read_bytes: MaxVectoredReadBytes, + /// Maximum number of keys to be read in a single get_vectored call. + pub max_get_vectored_keys: MaxGetVectoredKeys, + pub image_compression: ImageCompressionAlgorithm, /// Whether to offload archived timelines automatically @@ -205,8 +210,6 @@ pub struct PageServerConf { /// Optionally disable disk syncs (unsafe!) pub no_sync: bool, - pub wal_receiver_protocol: PostgresClientProtocol, - pub page_service_pipelining: pageserver_api::config::PageServicePipeliningConfig, pub get_vectored_concurrent_io: pageserver_api::config::GetVectoredConcurrentIo, @@ -404,6 +407,7 @@ impl PageServerConf { secondary_download_concurrency, ingest_batch_size, max_vectored_read_bytes, + max_get_vectored_keys, image_compression, timeline_offloading, ephemeral_bytes_per_memory_kb, @@ -414,7 +418,6 @@ impl PageServerConf { virtual_file_io_engine, tenant_config, no_sync, - wal_receiver_protocol, page_service_pipelining, get_vectored_concurrent_io, enable_read_path_debugging, @@ -470,13 +473,13 @@ impl PageServerConf { secondary_download_concurrency, ingest_batch_size, max_vectored_read_bytes, + max_get_vectored_keys, image_compression, timeline_offloading, ephemeral_bytes_per_memory_kb, import_pgdata_upcall_api, import_pgdata_upcall_api_token: import_pgdata_upcall_api_token.map(SecretString::from), import_pgdata_aws_endpoint_url, - wal_receiver_protocol, page_service_pipelining, get_vectored_concurrent_io, tracing, @@ -598,6 +601,19 @@ impl PageServerConf { ) })?; + if let PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined { + max_batch_size, + .. + }) = conf.page_service_pipelining + { + if max_batch_size.get() > conf.max_get_vectored_keys.get() { + return Err(anyhow::anyhow!( + "`max_batch_size` ({max_batch_size}) must be less than or equal to `max_get_vectored_keys` ({})", + conf.max_get_vectored_keys.get() + )); + } + }; + Ok(conf) } @@ -685,6 +701,7 @@ impl ConfigurableSemaphore { mod tests { use camino::Utf8PathBuf; + use rstest::rstest; use utils::id::NodeId; use super::PageServerConf; @@ -724,4 +741,28 @@ mod tests { PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir) .expect_err("parse_and_validate should fail for endpoint without scheme"); } + + #[rstest] + #[case(32, 32, true)] + #[case(64, 32, false)] + #[case(64, 64, true)] + #[case(128, 128, true)] + fn test_config_max_batch_size_is_valid( + #[case] max_batch_size: usize, + #[case] max_get_vectored_keys: usize, + #[case] is_valid: bool, + ) { + let input = format!( + r#" + control_plane_api = "http://localhost:6666" + max_get_vectored_keys = {max_get_vectored_keys} + page_service_pipelining = {{ mode="pipelined", execution="concurrent-futures", max_batch_size={max_batch_size}, batching="uniform-lsn" }} + "#, + ); + let config_toml = toml_edit::de::from_str::(&input) + .expect("config has valid fields"); + let workdir = Utf8PathBuf::from("/nonexistent"); + let result = PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir); + assert_eq!(result.is_ok(), is_valid); + } } diff --git a/pageserver/src/feature_resolver.rs b/pageserver/src/feature_resolver.rs index 7e31b930d0..50de3b691c 100644 --- a/pageserver/src/feature_resolver.rs +++ b/pageserver/src/feature_resolver.rs @@ -1,21 +1,28 @@ use std::{collections::HashMap, sync::Arc, time::Duration}; use posthog_client_lite::{ - FeatureResolverBackgroundLoop, PostHogClientConfig, PostHogEvaluationError, + CaptureEvent, FeatureResolverBackgroundLoop, PostHogClientConfig, PostHogEvaluationError, + PostHogFlagFilterPropertyValue, }; +use remote_storage::RemoteStorageKind; +use serde_json::json; use tokio_util::sync::CancellationToken; use utils::id::TenantId; -use crate::config::PageServerConf; +use crate::{config::PageServerConf, metrics::FEATURE_FLAG_EVALUATION}; #[derive(Clone)] pub struct FeatureResolver { inner: Option>, + internal_properties: Option>>, } impl FeatureResolver { pub fn new_disabled() -> Self { - Self { inner: None } + Self { + inner: None, + internal_properties: None, + } } pub fn spawn( @@ -36,14 +43,114 @@ impl FeatureResolver { shutdown_pageserver, ); let inner = Arc::new(inner); - // TODO: make this configurable - inner.clone().spawn(handle, Duration::from_secs(60)); - Ok(FeatureResolver { inner: Some(inner) }) + + // The properties shared by all tenants on this pageserver. + let internal_properties = { + let mut properties = HashMap::new(); + properties.insert( + "pageserver_id".to_string(), + PostHogFlagFilterPropertyValue::String(conf.id.to_string()), + ); + if let Some(availability_zone) = &conf.availability_zone { + properties.insert( + "availability_zone".to_string(), + PostHogFlagFilterPropertyValue::String(availability_zone.clone()), + ); + } + // Infer region based on the remote storage config. + if let Some(remote_storage) = &conf.remote_storage_config { + match &remote_storage.storage { + RemoteStorageKind::AwsS3(config) => { + properties.insert( + "region".to_string(), + PostHogFlagFilterPropertyValue::String(format!( + "aws-{}", + config.bucket_region + )), + ); + } + RemoteStorageKind::AzureContainer(config) => { + properties.insert( + "region".to_string(), + PostHogFlagFilterPropertyValue::String(format!( + "azure-{}", + config.container_region + )), + ); + } + RemoteStorageKind::LocalFs { .. } => { + properties.insert( + "region".to_string(), + PostHogFlagFilterPropertyValue::String("local".to_string()), + ); + } + } + } + // TODO: add pageserver URL. + Arc::new(properties) + }; + let fake_tenants = { + let mut tenants = Vec::new(); + for i in 0..10 { + let distinct_id = format!( + "fake_tenant_{}_{}_{}", + conf.availability_zone.as_deref().unwrap_or_default(), + conf.id, + i + ); + let properties = Self::collect_properties_inner( + distinct_id.clone(), + Some(&internal_properties), + ); + tenants.push(CaptureEvent { + event: "initial_tenant_report".to_string(), + distinct_id, + properties: json!({ "$set": properties }), // use `$set` to set the person properties instead of the event properties + }); + } + tenants + }; + // TODO: make refresh period configurable + inner + .clone() + .spawn(handle, Duration::from_secs(60), fake_tenants); + Ok(FeatureResolver { + inner: Some(inner), + internal_properties: Some(internal_properties), + }) } else { - Ok(FeatureResolver { inner: None }) + Ok(FeatureResolver { + inner: None, + internal_properties: None, + }) } } + fn collect_properties_inner( + tenant_id: String, + internal_properties: Option<&HashMap>, + ) -> HashMap { + let mut properties = HashMap::new(); + if let Some(internal_properties) = internal_properties { + for (key, value) in internal_properties.iter() { + properties.insert(key.clone(), value.clone()); + } + } + properties.insert( + "tenant_id".to_string(), + PostHogFlagFilterPropertyValue::String(tenant_id), + ); + properties + } + + /// Collect all properties availble for the feature flag evaluation. + pub(crate) fn collect_properties( + &self, + tenant_id: TenantId, + ) -> HashMap { + Self::collect_properties_inner(tenant_id.to_string(), self.internal_properties.as_deref()) + } + /// Evaluate a multivariate feature flag. Currently, we do not support any properties. /// /// Error handling: the caller should inspect the error and decide the behavior when a feature flag @@ -55,11 +162,24 @@ impl FeatureResolver { tenant_id: TenantId, ) -> Result { if let Some(inner) = &self.inner { - inner.feature_store().evaluate_multivariate( + let res = inner.feature_store().evaluate_multivariate( flag_key, &tenant_id.to_string(), - &HashMap::new(), - ) + &self.collect_properties(tenant_id), + ); + match &res { + Ok(value) => { + FEATURE_FLAG_EVALUATION + .with_label_values(&[flag_key, "ok", value]) + .inc(); + } + Err(e) => { + FEATURE_FLAG_EVALUATION + .with_label_values(&[flag_key, "error", e.as_variant_str()]) + .inc(); + } + } + res } else { Err(PostHogEvaluationError::NotAvailable( "PostHog integration is not enabled".to_string(), @@ -80,11 +200,24 @@ impl FeatureResolver { tenant_id: TenantId, ) -> Result<(), PostHogEvaluationError> { if let Some(inner) = &self.inner { - inner.feature_store().evaluate_boolean( + let res = inner.feature_store().evaluate_boolean( flag_key, &tenant_id.to_string(), - &HashMap::new(), - ) + &self.collect_properties(tenant_id), + ); + match &res { + Ok(()) => { + FEATURE_FLAG_EVALUATION + .with_label_values(&[flag_key, "ok", "true"]) + .inc(); + } + Err(e) => { + FEATURE_FLAG_EVALUATION + .with_label_values(&[flag_key, "error", e.as_variant_str()]) + .inc(); + } + } + res } else { Err(PostHogEvaluationError::NotAvailable( "PostHog integration is not enabled".to_string(), diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 1effa10404..c8a2a0209f 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -43,6 +43,7 @@ use pageserver_api::models::{ use pageserver_api::shard::{ShardCount, TenantShardId}; use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError}; use scopeguard::defer; +use serde_json::json; use tenant_size_model::svg::SvgBranchKind; use tenant_size_model::{SizeResult, StorageModel}; use tokio::time::Instant; @@ -3679,23 +3680,24 @@ async fn tenant_evaluate_feature_flag( let tenant = state .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; + let properties = tenant.feature_resolver.collect_properties(tenant_shard_id.tenant_id); if as_type == "boolean" { let result = tenant.feature_resolver.evaluate_boolean(&flag, tenant_shard_id.tenant_id); let result = result.map(|_| true).map_err(|e| e.to_string()); - json_response(StatusCode::OK, result) + json_response(StatusCode::OK, json!({ "result": result, "properties": properties })) } else if as_type == "multivariate" { let result = tenant.feature_resolver.evaluate_multivariate(&flag, tenant_shard_id.tenant_id).map_err(|e| e.to_string()); - json_response(StatusCode::OK, result) + json_response(StatusCode::OK, json!({ "result": result, "properties": properties })) } else { // Auto infer the type of the feature flag. let is_boolean = tenant.feature_resolver.is_feature_flag_boolean(&flag).map_err(|e| ApiError::InternalServerError(anyhow::anyhow!("{e}")))?; if is_boolean { let result = tenant.feature_resolver.evaluate_boolean(&flag, tenant_shard_id.tenant_id); let result = result.map(|_| true).map_err(|e| e.to_string()); - json_response(StatusCode::OK, result) + json_response(StatusCode::OK, json!({ "result": result, "properties": properties })) } else { let result = tenant.feature_resolver.evaluate_multivariate(&flag, tenant_shard_id.tenant_id).map_err(|e| e.to_string()); - json_response(StatusCode::OK, result) + json_response(StatusCode::OK, json!({ "result": result, "properties": properties })) } } } diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index a9b2f1b7e0..3eb70ffac2 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -15,6 +15,7 @@ use metrics::{ register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec, }; use once_cell::sync::Lazy; +use pageserver_api::config::defaults::DEFAULT_MAX_GET_VECTORED_KEYS; use pageserver_api::config::{ PageServicePipeliningConfig, PageServicePipeliningConfigPipelined, PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy, @@ -32,7 +33,6 @@ use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext}; use crate::pgdatadir_mapping::DatadirModificationStats; use crate::task_mgr::TaskKind; -use crate::tenant::Timeline; use crate::tenant::layer_map::LayerMap; use crate::tenant::mgr::TenantSlot; use crate::tenant::storage_layer::{InMemoryLayer, PersistentLayerDesc}; @@ -446,6 +446,15 @@ static PAGE_CACHE_ERRORS: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +pub(crate) static FEATURE_FLAG_EVALUATION: Lazy = Lazy::new(|| { + register_counter_vec!( + "pageserver_feature_flag_evaluation", + "Number of times a feature flag is evaluated", + &["flag_key", "status", "value"], + ) + .unwrap() +}); + #[derive(IntoStaticStr)] #[strum(serialize_all = "kebab_case")] pub(crate) enum PageCacheErrorKind { @@ -1939,7 +1948,7 @@ static SMGR_QUERY_TIME_GLOBAL: Lazy = Lazy::new(|| { }); static PAGE_SERVICE_BATCH_SIZE_BUCKETS_GLOBAL: Lazy> = Lazy::new(|| { - (1..=u32::try_from(Timeline::MAX_GET_VECTORED_KEYS).unwrap()) + (1..=u32::try_from(DEFAULT_MAX_GET_VECTORED_KEYS).unwrap()) .map(|v| v.into()) .collect() }); @@ -1957,7 +1966,7 @@ static PAGE_SERVICE_BATCH_SIZE_BUCKETS_PER_TIMELINE: Lazy> = Lazy::new( let mut buckets = Vec::new(); for i in 0.. { let bucket = 1 << i; - if bucket > u32::try_from(Timeline::MAX_GET_VECTORED_KEYS).unwrap() { + if bucket > u32::try_from(DEFAULT_MAX_GET_VECTORED_KEYS).unwrap() { break; } buckets.push(bucket.into()); @@ -2846,7 +2855,6 @@ pub(crate) struct WalIngestMetrics { pub(crate) records_received: IntCounter, pub(crate) records_observed: IntCounter, pub(crate) records_committed: IntCounter, - pub(crate) records_filtered: IntCounter, pub(crate) values_committed_metadata_images: IntCounter, pub(crate) values_committed_metadata_deltas: IntCounter, pub(crate) values_committed_data_images: IntCounter, @@ -2902,11 +2910,6 @@ pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| { "Number of WAL records which resulted in writes to pageserver storage" ) .expect("failed to define a metric"), - records_filtered: register_int_counter!( - "pageserver_wal_ingest_records_filtered", - "Number of WAL records filtered out due to sharding" - ) - .expect("failed to define a metric"), values_committed_metadata_images: values_committed.with_label_values(&["metadata", "image"]), values_committed_metadata_deltas: values_committed.with_label_values(&["metadata", "delta"]), values_committed_data_images: values_committed.with_label_values(&["data", "image"]), diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index b9ba4a3555..4a1ddf09b5 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -178,6 +178,7 @@ pub fn spawn_grpc( tenant_manager: Arc, auth: Option>, perf_trace_dispatch: Option, + get_vectored_concurrent_io: GetVectoredConcurrentIo, listener: std::net::TcpListener, ) -> anyhow::Result { let cancel = CancellationToken::new(); @@ -214,6 +215,8 @@ pub fn spawn_grpc( let page_service_handler = GrpcPageServiceHandler { tenant_manager, ctx, + gate_guard: gate.enter().expect("gate was just created"), + get_vectored_concurrent_io, }; let observability_layer = ObservabilityLayer; @@ -497,10 +500,6 @@ async fn page_service_conn_main( } /// Page service connection handler. -/// -/// TODO: for gRPC, this will be shared by all requests from all connections. -/// Decompose it into global state and per-connection/request state, and make -/// libpq-specific options (e.g. pipelining) separate. struct PageServerHandler { auth: Option>, claims: Option, @@ -765,7 +764,7 @@ impl PageStreamError { request_id, status_code, reason: Some(status.message().to_string()), - page_images: SmallVec::new(), + page_images: Vec::new(), } .into()) } @@ -3362,6 +3361,8 @@ where pub struct GrpcPageServiceHandler { tenant_manager: Arc, ctx: RequestContext, + gate_guard: GateGuard, + get_vectored_concurrent_io: GetVectoredConcurrentIo, } impl GrpcPageServiceHandler { @@ -3521,7 +3522,7 @@ impl GrpcPageServiceHandler { request_id: req.request_id, status_code: page_api::GetPageStatusCode::Ok, reason: None, - page_images: SmallVec::with_capacity(results.len()), + page_images: Vec::with_capacity(results.len()), }; for result in results { @@ -3660,7 +3661,7 @@ impl proto::PageService for GrpcPageServiceHandler { if chunk.is_empty() { break; } - yield proto::GetBaseBackupResponseChunk::try_from(chunk.clone().freeze())?; + yield proto::GetBaseBackupResponseChunk::from(chunk.clone().freeze()); chunk.clear(); } } @@ -3721,6 +3722,14 @@ impl proto::PageService for GrpcPageServiceHandler { .get(ttid.tenant_id, ttid.timeline_id, shard_selector) .await?; + // Spawn an IoConcurrency sidecar, if enabled. + let Ok(gate_guard) = self.gate_guard.try_clone() else { + return Err(tonic::Status::unavailable("shutting down")); + }; + let io_concurrency = + IoConcurrency::spawn_from_conf(self.get_vectored_concurrent_io, gate_guard); + + // Spawn a task to handle the GetPageRequest stream. let span = Span::current(); let ctx = self.ctx.attached_child(); let mut reqs = req.into_inner(); @@ -3731,8 +3740,7 @@ impl proto::PageService for GrpcPageServiceHandler { .await? .downgrade(); while let Some(req) = reqs.message().await? { - // TODO: implement IoConcurrency sidecar. - yield Self::get_page(&ctx, &timeline, req, IoConcurrency::Sequential) + yield Self::get_page(&ctx, &timeline, req, io_concurrency.clone()) .instrument(span.clone()) // propagate request span .await? } @@ -3806,7 +3814,7 @@ impl proto::PageService for GrpcPageServiceHandler { let resp = PageServerHandler::handle_get_slru_segment_request(&timeline, &req, &ctx).await?; let resp: page_api::GetSlruSegmentResponse = resp.segment; - Ok(tonic::Response::new(resp.try_into()?)) + Ok(tonic::Response::new(resp.into())) } } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index b6f11b744b..633d62210d 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -431,10 +431,10 @@ impl Timeline { GetVectoredError::InvalidLsn(e) => { Err(anyhow::anyhow!("invalid LSN: {e:?}").into()) } - // NB: this should never happen in practice because we limit MAX_GET_VECTORED_KEYS + // NB: this should never happen in practice because we limit batch size to be smaller than max_get_vectored_keys // TODO: we can prevent this error class by moving this check into the type system - GetVectoredError::Oversized(err) => { - Err(anyhow::anyhow!("batching oversized: {err:?}").into()) + GetVectoredError::Oversized(err, max) => { + Err(anyhow::anyhow!("batching oversized: {err} > {max}").into()) } }; @@ -719,7 +719,7 @@ impl Timeline { let batches = keyspace.partition( self.get_shard_identity(), - Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64, + self.conf.max_get_vectored_keys.get() as u64 * BLCKSZ as u64, ); let io_concurrency = IoConcurrency::spawn_from_conf( @@ -959,7 +959,7 @@ impl Timeline { let batches = keyspace.partition( self.get_shard_identity(), - Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64, + self.conf.max_get_vectored_keys.get() as u64 * BLCKSZ as u64, ); let io_concurrency = IoConcurrency::spawn_from_conf( diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 308ada3fa1..f9fdc143b4 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -7197,7 +7197,7 @@ mod tests { let end = desc .key_range .start - .add(Timeline::MAX_GET_VECTORED_KEYS.try_into().unwrap()); + .add(tenant.conf.max_get_vectored_keys.get() as u32); reads.push(KeySpace { ranges: vec![start..end], }); @@ -11260,11 +11260,11 @@ mod tests { let mut keyspaces_at_lsn: HashMap = HashMap::default(); let mut used_keys: HashSet = HashSet::default(); - while used_keys.len() < Timeline::MAX_GET_VECTORED_KEYS as usize { + while used_keys.len() < tenant.conf.max_get_vectored_keys.get() { let selected_lsn = interesting_lsns.choose(&mut random).expect("not empty"); let mut selected_key = start_key.add(random.gen_range(0..KEY_DIMENSION_SIZE)); - while used_keys.len() < Timeline::MAX_GET_VECTORED_KEYS as usize { + while used_keys.len() < tenant.conf.max_get_vectored_keys.get() { if used_keys.contains(&selected_key) || selected_key >= start_key.add(KEY_DIMENSION_SIZE) { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 9ddbe404d2..6798606141 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -817,8 +817,8 @@ pub(crate) enum GetVectoredError { #[error("timeline shutting down")] Cancelled, - #[error("requested too many keys: {0} > {}", Timeline::MAX_GET_VECTORED_KEYS)] - Oversized(u64), + #[error("requested too many keys: {0} > {1}")] + Oversized(u64, u64), #[error("requested at invalid LSN: {0}")] InvalidLsn(Lsn), @@ -1019,7 +1019,7 @@ impl From for PageReconstructError { match e { GetVectoredError::Cancelled => PageReconstructError::Cancelled, GetVectoredError::InvalidLsn(_) => PageReconstructError::Other(anyhow!("Invalid LSN")), - err @ GetVectoredError::Oversized(_) => PageReconstructError::Other(err.into()), + err @ GetVectoredError::Oversized(_, _) => PageReconstructError::Other(err.into()), GetVectoredError::MissingKey(err) => PageReconstructError::MissingKey(err), GetVectoredError::GetReadyAncestorError(err) => PageReconstructError::from(err), GetVectoredError::Other(err) => PageReconstructError::Other(err), @@ -1199,7 +1199,6 @@ impl Timeline { } } - pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32; pub(crate) const LAYERS_VISITED_WARN_THRESHOLD: u32 = 100; /// Look up multiple page versions at a given LSN @@ -1214,9 +1213,12 @@ impl Timeline { ) -> Result>, GetVectoredError> { let total_keyspace = query.total_keyspace(); - let key_count = total_keyspace.total_raw_size().try_into().unwrap(); - if key_count > Timeline::MAX_GET_VECTORED_KEYS { - return Err(GetVectoredError::Oversized(key_count)); + let key_count = total_keyspace.total_raw_size(); + if key_count > self.conf.max_get_vectored_keys.get() { + return Err(GetVectoredError::Oversized( + key_count as u64, + self.conf.max_get_vectored_keys.get() as u64, + )); } for range in &total_keyspace.ranges { @@ -2843,21 +2845,6 @@ impl Timeline { ) } - /// Resolve the effective WAL receiver protocol to use for this tenant. - /// - /// Priority order is: - /// 1. Tenant config override - /// 2. Default value for tenant config override - /// 3. Pageserver config override - /// 4. Pageserver config default - pub fn resolve_wal_receiver_protocol(&self) -> PostgresClientProtocol { - let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); - tenant_conf - .wal_receiver_protocol_override - .or(self.conf.default_tenant_conf.wal_receiver_protocol_override) - .unwrap_or(self.conf.wal_receiver_protocol) - } - pub(super) fn tenant_conf_updated(&self, new_conf: &AttachedTenantConf) { // NB: Most tenant conf options are read by background loops, so, // changes will automatically be picked up. @@ -3213,10 +3200,16 @@ impl Timeline { guard.is_none(), "multiple launches / re-launches of WAL receiver are not supported" ); + + let protocol = PostgresClientProtocol::Interpreted { + format: utils::postgres_client::InterpretedFormat::Protobuf, + compression: Some(utils::postgres_client::Compression::Zstd { level: 1 }), + }; + *guard = Some(WalReceiver::start( Arc::clone(self), WalReceiverConf { - protocol: self.resolve_wal_receiver_protocol(), + protocol, wal_connect_timeout, lagging_wal_timeout, max_lsn_wal_lag, @@ -5270,7 +5263,7 @@ impl Timeline { key = key.next(); // Maybe flush `key_rest_accum` - if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS + if key_request_accum.raw_size() >= self.conf.max_get_vectored_keys.get() as u64 || (last_key_in_range && key_request_accum.raw_size() > 0) { let query = diff --git a/pageserver/src/tenant/timeline/import_pgdata.rs b/pageserver/src/tenant/timeline/import_pgdata.rs index 3f760d858b..606ad09ef1 100644 --- a/pageserver/src/tenant/timeline/import_pgdata.rs +++ b/pageserver/src/tenant/timeline/import_pgdata.rs @@ -201,8 +201,8 @@ async fn prepare_import( .await; match res { Ok(_) => break, - Err(err) => { - info!(?err, "indefinitely waiting for pgdata to finish"); + Err(_err) => { + info!("indefinitely waiting for pgdata to finish"); if tokio::time::timeout(std::time::Duration::from_secs(10), cancel.cancelled()) .await .is_ok() diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs index 760e82dd57..e003bb6810 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs @@ -100,6 +100,7 @@ async fn run_v1( .unwrap(), import_job_concurrency: base.import_job_concurrency, import_job_checkpoint_threshold: base.import_job_checkpoint_threshold, + import_job_max_byte_range_size: base.import_job_max_byte_range_size, } } None => timeline.conf.timeline_import_config.clone(), @@ -441,6 +442,7 @@ impl Plan { let mut last_completed_job_idx = start_after_job_idx.unwrap_or(0); let checkpoint_every: usize = import_config.import_job_checkpoint_threshold.into(); + let max_byte_range_size: usize = import_config.import_job_max_byte_range_size.into(); // Run import jobs concurrently up to the limit specified by the pageserver configuration. // Note that we process completed futures in the oreder of insertion. This will be the @@ -456,7 +458,7 @@ impl Plan { work.push_back(tokio::task::spawn(async move { let _permit = permit; - let res = job.run(job_timeline, &ctx).await; + let res = job.run(job_timeline, max_byte_range_size, &ctx).await; (job_idx, res) })); }, @@ -471,6 +473,8 @@ impl Plan { last_completed_job_idx = job_idx; if last_completed_job_idx % checkpoint_every == 0 { + tracing::info!(last_completed_job_idx, jobs=%jobs_in_plan, "Checkpointing import status"); + let progress = ShardImportProgressV1 { jobs: jobs_in_plan, completed: last_completed_job_idx, @@ -492,8 +496,6 @@ impl Plan { anyhow::anyhow!("Shut down while putting timeline import status") })?; } - - tracing::info!(last_completed_job_idx, jobs=%jobs_in_plan, "Checkpointing import status"); }, Some(Err(_)) => { anyhow::bail!( @@ -679,6 +681,7 @@ trait ImportTask { async fn doit( self, layer_writer: &mut ImageLayerWriter, + max_byte_range_size: usize, ctx: &RequestContext, ) -> anyhow::Result; } @@ -715,6 +718,7 @@ impl ImportTask for ImportSingleKeyTask { async fn doit( self, layer_writer: &mut ImageLayerWriter, + _max_byte_range_size: usize, ctx: &RequestContext, ) -> anyhow::Result { layer_writer.put_image(self.key, self.buf, ctx).await?; @@ -768,10 +772,9 @@ impl ImportTask for ImportRelBlocksTask { async fn doit( self, layer_writer: &mut ImageLayerWriter, + max_byte_range_size: usize, ctx: &RequestContext, ) -> anyhow::Result { - const MAX_BYTE_RANGE_SIZE: usize = 4 * 1024 * 1024; - debug!("Importing relation file"); let (rel_tag, start_blk) = self.key_range.start.to_rel_block()?; @@ -796,7 +799,7 @@ impl ImportTask for ImportRelBlocksTask { assert_eq!(key.len(), 1); assert!(!acc.is_empty()); assert!(acc_end > acc_start); - if acc_end == start && end - acc_start <= MAX_BYTE_RANGE_SIZE { + if acc_end == start && end - acc_start <= max_byte_range_size { acc.push(key.pop().unwrap()); Ok((acc, acc_start, end)) } else { @@ -860,6 +863,7 @@ impl ImportTask for ImportSlruBlocksTask { async fn doit( self, layer_writer: &mut ImageLayerWriter, + _max_byte_range_size: usize, ctx: &RequestContext, ) -> anyhow::Result { debug!("Importing SLRU segment file {}", self.path); @@ -906,12 +910,13 @@ impl ImportTask for AnyImportTask { async fn doit( self, layer_writer: &mut ImageLayerWriter, + max_byte_range_size: usize, ctx: &RequestContext, ) -> anyhow::Result { match self { - Self::SingleKey(t) => t.doit(layer_writer, ctx).await, - Self::RelBlocks(t) => t.doit(layer_writer, ctx).await, - Self::SlruBlocks(t) => t.doit(layer_writer, ctx).await, + Self::SingleKey(t) => t.doit(layer_writer, max_byte_range_size, ctx).await, + Self::RelBlocks(t) => t.doit(layer_writer, max_byte_range_size, ctx).await, + Self::SlruBlocks(t) => t.doit(layer_writer, max_byte_range_size, ctx).await, } } } @@ -952,7 +957,12 @@ impl ChunkProcessingJob { } } - async fn run(self, timeline: Arc, ctx: &RequestContext) -> anyhow::Result<()> { + async fn run( + self, + timeline: Arc, + max_byte_range_size: usize, + ctx: &RequestContext, + ) -> anyhow::Result<()> { let mut writer = ImageLayerWriter::new( timeline.conf, timeline.timeline_id, @@ -967,7 +977,7 @@ impl ChunkProcessingJob { let mut nimages = 0; for task in self.tasks { - nimages += task.doit(&mut writer, ctx).await?; + nimages += task.doit(&mut writer, max_byte_range_size, ctx).await?; } let resident_layer = if nimages > 0 { diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 3c3608d1bd..7e0b0e9b25 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -32,9 +32,7 @@ use utils::backoff::{ }; use utils::id::{NodeId, TenantTimelineId}; use utils::lsn::Lsn; -use utils::postgres_client::{ - ConnectionConfigArgs, PostgresClientProtocol, wal_stream_connection_config, -}; +use utils::postgres_client::{ConnectionConfigArgs, wal_stream_connection_config}; use super::walreceiver_connection::{WalConnectionStatus, WalReceiverError}; use super::{TaskEvent, TaskHandle, TaskStateUpdate, WalReceiverConf}; @@ -991,19 +989,12 @@ impl ConnectionManagerState { return None; // no connection string, ignore sk } - let (shard_number, shard_count, shard_stripe_size) = match self.conf.protocol { - PostgresClientProtocol::Vanilla => { - (None, None, None) - }, - PostgresClientProtocol::Interpreted { .. } => { - let shard_identity = self.timeline.get_shard_identity(); - ( - Some(shard_identity.number.0), - Some(shard_identity.count.0), - Some(shard_identity.stripe_size.0), - ) - } - }; + let shard_identity = self.timeline.get_shard_identity(); + let (shard_number, shard_count, shard_stripe_size) = ( + Some(shard_identity.number.0), + Some(shard_identity.count.0), + Some(shard_identity.stripe_size.0), + ); let connection_conf_args = ConnectionConfigArgs { protocol: self.conf.protocol, @@ -1120,8 +1111,8 @@ impl ReconnectReason { #[cfg(test)] mod tests { - use pageserver_api::config::defaults::DEFAULT_WAL_RECEIVER_PROTOCOL; use url::Host; + use utils::postgres_client::PostgresClientProtocol; use super::*; use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; @@ -1552,6 +1543,11 @@ mod tests { .await .expect("Failed to create an empty timeline for dummy wal connection manager"); + let protocol = PostgresClientProtocol::Interpreted { + format: utils::postgres_client::InterpretedFormat::Protobuf, + compression: Some(utils::postgres_client::Compression::Zstd { level: 1 }), + }; + ConnectionManagerState { id: TenantTimelineId { tenant_id: harness.tenant_shard_id.tenant_id, @@ -1560,7 +1556,7 @@ mod tests { timeline, cancel: CancellationToken::new(), conf: WalReceiverConf { - protocol: DEFAULT_WAL_RECEIVER_PROTOCOL, + protocol, wal_connect_timeout: Duration::from_secs(1), lagging_wal_timeout: Duration::from_secs(1), max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(), diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 249849ac4b..343e04f5f0 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -15,7 +15,7 @@ use postgres_backend::is_expected_io_error; use postgres_connection::PgConnectionConfig; use postgres_ffi::WAL_SEGMENT_SIZE; use postgres_ffi::v14::xlog_utils::normalize_lsn; -use postgres_ffi::waldecoder::{WalDecodeError, WalStreamDecoder}; +use postgres_ffi::waldecoder::WalDecodeError; use postgres_protocol::message::backend::ReplicationMessage; use postgres_types::PgLsn; use tokio::sync::watch; @@ -31,7 +31,7 @@ use utils::lsn::Lsn; use utils::pageserver_feedback::PageserverFeedback; use utils::postgres_client::PostgresClientProtocol; use utils::sync::gate::GateError; -use wal_decoder::models::{FlushUncommittedRecords, InterpretedWalRecord, InterpretedWalRecords}; +use wal_decoder::models::{FlushUncommittedRecords, InterpretedWalRecords}; use wal_decoder::wire_format::FromWireFormat; use super::TaskStateUpdate; @@ -275,8 +275,6 @@ pub(super) async fn handle_walreceiver_connection( let copy_stream = replication_client.copy_both_simple(&query).await?; let mut physical_stream = pin!(ReplicationStream::new(copy_stream)); - let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version); - let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx) .await .map_err(|e| match e.kind { @@ -284,14 +282,16 @@ pub(super) async fn handle_walreceiver_connection( _ => WalReceiverError::Other(e.into()), })?; - let shard = vec![*timeline.get_shard_identity()]; - - let interpreted_proto_config = match protocol { - PostgresClientProtocol::Vanilla => None, + let (format, compression) = match protocol { PostgresClientProtocol::Interpreted { format, compression, - } => Some((format, compression)), + } => (format, compression), + PostgresClientProtocol::Vanilla => { + return Err(WalReceiverError::Other(anyhow!( + "Vanilla WAL receiver protocol is no longer supported for ingest" + ))); + } }; let mut expected_wal_start = startpoint; @@ -313,16 +313,6 @@ pub(super) async fn handle_walreceiver_connection( // Update the connection status before processing the message. If the message processing // fails (e.g. in walingest), we still want to know latests LSNs from the safekeeper. match &replication_message { - ReplicationMessage::XLogData(xlog_data) => { - connection_status.latest_connection_update = now; - connection_status.commit_lsn = Some(Lsn::from(xlog_data.wal_end())); - connection_status.streaming_lsn = Some(Lsn::from( - xlog_data.wal_start() + xlog_data.data().len() as u64, - )); - if !xlog_data.data().is_empty() { - connection_status.latest_wal_update = now; - } - } ReplicationMessage::PrimaryKeepAlive(keepalive) => { connection_status.latest_connection_update = now; connection_status.commit_lsn = Some(Lsn::from(keepalive.wal_end())); @@ -353,7 +343,6 @@ pub(super) async fn handle_walreceiver_connection( // were interpreted. let streaming_lsn = Lsn::from(raw.streaming_lsn()); - let (format, compression) = interpreted_proto_config.unwrap(); let batch = InterpretedWalRecords::from_wire(raw.data(), format, compression) .await .with_context(|| { @@ -509,138 +498,6 @@ pub(super) async fn handle_walreceiver_connection( Some(streaming_lsn) } - ReplicationMessage::XLogData(xlog_data) => { - async fn commit( - modification: &mut DatadirModification<'_>, - uncommitted: &mut u64, - filtered: &mut u64, - ctx: &RequestContext, - ) -> anyhow::Result<()> { - let stats = modification.stats(); - modification.commit(ctx).await?; - WAL_INGEST - .records_committed - .inc_by(*uncommitted - *filtered); - WAL_INGEST.inc_values_committed(&stats); - *uncommitted = 0; - *filtered = 0; - Ok(()) - } - - // Pass the WAL data to the decoder, and see if we can decode - // more records as a result. - let data = xlog_data.data(); - let startlsn = Lsn::from(xlog_data.wal_start()); - let endlsn = startlsn + data.len() as u64; - - trace!("received XLogData between {startlsn} and {endlsn}"); - - WAL_INGEST.bytes_received.inc_by(data.len() as u64); - waldecoder.feed_bytes(data); - - { - let mut modification = timeline.begin_modification(startlsn); - let mut uncommitted_records = 0; - let mut filtered_records = 0; - - while let Some((next_record_lsn, recdata)) = waldecoder.poll_decode()? { - // It is important to deal with the aligned records as lsn in getPage@LSN is - // aligned and can be several bytes bigger. Without this alignment we are - // at risk of hitting a deadlock. - if !next_record_lsn.is_aligned() { - return Err(WalReceiverError::Other(anyhow!("LSN not aligned"))); - } - - // Deserialize and interpret WAL record - let interpreted = InterpretedWalRecord::from_bytes_filtered( - recdata, - &shard, - next_record_lsn, - modification.tline.pg_version, - )? - .remove(timeline.get_shard_identity()) - .unwrap(); - - if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes) - && uncommitted_records > 0 - { - // Special case: legacy PG database creations operate by reading pages from a 'template' database: - // these are the only kinds of WAL record that require reading data blocks while ingesting. Ensure - // all earlier writes of data blocks are visible by committing any modification in flight. - commit( - &mut modification, - &mut uncommitted_records, - &mut filtered_records, - &ctx, - ) - .await?; - } - - // Ingest the records without immediately committing them. - timeline.metrics.wal_records_received.inc(); - let ingested = walingest - .ingest_record(interpreted, &mut modification, &ctx) - .await - .with_context(|| { - format!("could not ingest record at {next_record_lsn}") - }) - .inspect_err(|err| { - // TODO: we can't differentiate cancellation errors with - // anyhow::Error, so just ignore it if we're cancelled. - if !cancellation.is_cancelled() && !timeline.is_stopping() { - critical!("{err:?}") - } - })?; - if !ingested { - tracing::debug!("ingest: filtered out record @ LSN {next_record_lsn}"); - WAL_INGEST.records_filtered.inc(); - filtered_records += 1; - } - - // FIXME: this cannot be made pausable_failpoint without fixing the - // failpoint library; in tests, the added amount of debugging will cause us - // to timeout the tests. - fail_point!("walreceiver-after-ingest"); - - last_rec_lsn = next_record_lsn; - - // Commit every ingest_batch_size records. Even if we filtered out - // all records, we still need to call commit to advance the LSN. - uncommitted_records += 1; - if uncommitted_records >= ingest_batch_size - || modification.approx_pending_bytes() - > DatadirModification::MAX_PENDING_BYTES - { - commit( - &mut modification, - &mut uncommitted_records, - &mut filtered_records, - &ctx, - ) - .await?; - } - } - - // Commit the remaining records. - if uncommitted_records > 0 { - commit( - &mut modification, - &mut uncommitted_records, - &mut filtered_records, - &ctx, - ) - .await?; - } - } - - if !caught_up && endlsn >= end_of_wal { - info!("caught up at LSN {endlsn}"); - caught_up = true; - } - - Some(endlsn) - } - ReplicationMessage::PrimaryKeepAlive(keepalive) => { let wal_end = keepalive.wal_end(); let timestamp = keepalive.timestamp(); diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index a6a7021756..5b4ced7cf0 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -16,6 +16,7 @@ #if PG_MAJORVERSION_NUM >= 15 #include "access/xlogrecovery.h" #endif +#include "executor/instrument.h" #include "replication/logical.h" #include "replication/logicallauncher.h" #include "replication/slot.h" @@ -33,6 +34,7 @@ #include "file_cache.h" #include "neon.h" #include "neon_lwlsncache.h" +#include "neon_perf_counters.h" #include "control_plane_connector.h" #include "logical_replication_monitor.h" #include "unstable_extensions.h" @@ -46,6 +48,13 @@ void _PG_init(void); static int running_xacts_overflow_policy; +static bool monitor_query_exec_time = false; + +static ExecutorStart_hook_type prev_ExecutorStart = NULL; +static ExecutorEnd_hook_type prev_ExecutorEnd = NULL; + +static void neon_ExecutorStart(QueryDesc *queryDesc, int eflags); +static void neon_ExecutorEnd(QueryDesc *queryDesc); #if PG_MAJORVERSION_NUM >= 16 static shmem_startup_hook_type prev_shmem_startup_hook; @@ -470,6 +479,16 @@ _PG_init(void) 0, NULL, NULL, NULL); + DefineCustomBoolVariable( + "neon.monitor_query_exec_time", + "Collect infortmation about query execution time", + NULL, + &monitor_query_exec_time, + false, + PGC_USERSET, + 0, + NULL, NULL, NULL); + DefineCustomBoolVariable( "neon.allow_replica_misconfig", "Allow replica startup when some critical GUCs have smaller value than on primary node", @@ -508,6 +527,11 @@ _PG_init(void) EmitWarningsOnPlaceholders("neon"); ReportSearchPath(); + + prev_ExecutorStart = ExecutorStart_hook; + ExecutorStart_hook = neon_ExecutorStart; + prev_ExecutorEnd = ExecutorEnd_hook; + ExecutorEnd_hook = neon_ExecutorEnd; } PG_FUNCTION_INFO_V1(pg_cluster_size); @@ -581,3 +605,55 @@ neon_shmem_startup_hook(void) #endif } #endif + +/* + * ExecutorStart hook: start up tracking if needed + */ +static void +neon_ExecutorStart(QueryDesc *queryDesc, int eflags) +{ + if (prev_ExecutorStart) + prev_ExecutorStart(queryDesc, eflags); + else + standard_ExecutorStart(queryDesc, eflags); + + if (monitor_query_exec_time) + { + /* + * Set up to track total elapsed time in ExecutorRun. Make sure the + * space is allocated in the per-query context so it will go away at + * ExecutorEnd. + */ + if (queryDesc->totaltime == NULL) + { + MemoryContext oldcxt; + + oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); + queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_TIMER, false); + MemoryContextSwitchTo(oldcxt); + } + } +} + +/* + * ExecutorEnd hook: store results if needed + */ +static void +neon_ExecutorEnd(QueryDesc *queryDesc) +{ + if (monitor_query_exec_time && queryDesc->totaltime) + { + /* + * Make sure stats accumulation is done. (Note: it's okay if several + * levels of hook all do this.) + */ + InstrEndLoop(queryDesc->totaltime); + + inc_query_time(queryDesc->totaltime->total*1000000); /* convert to usec */ + } + + if (prev_ExecutorEnd) + prev_ExecutorEnd(queryDesc); + else + standard_ExecutorEnd(queryDesc); +} diff --git a/pgxn/neon/neon_perf_counters.c b/pgxn/neon/neon_perf_counters.c index c77d99d636..d0a3d15108 100644 --- a/pgxn/neon/neon_perf_counters.c +++ b/pgxn/neon/neon_perf_counters.c @@ -71,6 +71,27 @@ inc_iohist(IOHistogram hist, uint64 latency_us) hist->wait_us_count++; } +static inline void +inc_qthist(QTHistogram hist, uint64 elapsed_us) +{ + int lo = 0; + int hi = NUM_QT_BUCKETS - 1; + + /* Find the right bucket with binary search */ + while (lo < hi) + { + int mid = (lo + hi) / 2; + + if (elapsed_us < qt_bucket_thresholds[mid]) + hi = mid; + else + lo = mid + 1; + } + hist->elapsed_us_bucket[lo]++; + hist->elapsed_us_sum += elapsed_us; + hist->elapsed_us_count++; +} + /* * Count a GetPage wait operation. */ @@ -98,6 +119,13 @@ inc_page_cache_write_wait(uint64 latency) inc_iohist(&MyNeonCounters->file_cache_write_hist, latency); } + +void +inc_query_time(uint64 elapsed) +{ + inc_qthist(&MyNeonCounters->query_time_hist, elapsed); +} + /* * Support functions for the views, neon_backend_perf_counters and * neon_perf_counters. @@ -112,11 +140,11 @@ typedef struct } metric_t; static int -histogram_to_metrics(IOHistogram histogram, - metric_t *metrics, - const char *count, - const char *sum, - const char *bucket) +io_histogram_to_metrics(IOHistogram histogram, + metric_t *metrics, + const char *count, + const char *sum, + const char *bucket) { int i = 0; uint64 bucket_accum = 0; @@ -145,10 +173,44 @@ histogram_to_metrics(IOHistogram histogram, return i; } +static int +qt_histogram_to_metrics(QTHistogram histogram, + metric_t *metrics, + const char *count, + const char *sum, + const char *bucket) +{ + int i = 0; + uint64 bucket_accum = 0; + + metrics[i].name = count; + metrics[i].is_bucket = false; + metrics[i].value = (double) histogram->elapsed_us_count; + i++; + metrics[i].name = sum; + metrics[i].is_bucket = false; + metrics[i].value = (double) histogram->elapsed_us_sum / 1000000.0; + i++; + for (int bucketno = 0; bucketno < NUM_QT_BUCKETS; bucketno++) + { + uint64 threshold = qt_bucket_thresholds[bucketno]; + + bucket_accum += histogram->elapsed_us_bucket[bucketno]; + + metrics[i].name = bucket; + metrics[i].is_bucket = true; + metrics[i].bucket_le = (threshold == UINT64_MAX) ? INFINITY : ((double) threshold) / 1000000.0; + metrics[i].value = (double) bucket_accum; + i++; + } + + return i; +} + static metric_t * neon_perf_counters_to_metrics(neon_per_backend_counters *counters) { -#define NUM_METRICS ((2 + NUM_IO_WAIT_BUCKETS) * 3 + 12) +#define NUM_METRICS ((2 + NUM_IO_WAIT_BUCKETS) * 3 + (2 + NUM_QT_BUCKETS) + 12) metric_t *metrics = palloc((NUM_METRICS + 1) * sizeof(metric_t)); int i = 0; @@ -159,10 +221,10 @@ neon_perf_counters_to_metrics(neon_per_backend_counters *counters) i++; \ } while (false) - i += histogram_to_metrics(&counters->getpage_hist, &metrics[i], - "getpage_wait_seconds_count", - "getpage_wait_seconds_sum", - "getpage_wait_seconds_bucket"); + i += io_histogram_to_metrics(&counters->getpage_hist, &metrics[i], + "getpage_wait_seconds_count", + "getpage_wait_seconds_sum", + "getpage_wait_seconds_bucket"); APPEND_METRIC(getpage_prefetch_requests_total); APPEND_METRIC(getpage_sync_requests_total); @@ -178,14 +240,19 @@ neon_perf_counters_to_metrics(neon_per_backend_counters *counters) APPEND_METRIC(file_cache_hits_total); - i += histogram_to_metrics(&counters->file_cache_read_hist, &metrics[i], - "file_cache_read_wait_seconds_count", - "file_cache_read_wait_seconds_sum", - "file_cache_read_wait_seconds_bucket"); - i += histogram_to_metrics(&counters->file_cache_write_hist, &metrics[i], - "file_cache_write_wait_seconds_count", - "file_cache_write_wait_seconds_sum", - "file_cache_write_wait_seconds_bucket"); + i += io_histogram_to_metrics(&counters->file_cache_read_hist, &metrics[i], + "file_cache_read_wait_seconds_count", + "file_cache_read_wait_seconds_sum", + "file_cache_read_wait_seconds_bucket"); + i += io_histogram_to_metrics(&counters->file_cache_write_hist, &metrics[i], + "file_cache_write_wait_seconds_count", + "file_cache_write_wait_seconds_sum", + "file_cache_write_wait_seconds_bucket"); + + i += qt_histogram_to_metrics(&counters->query_time_hist, &metrics[i], + "query_time_seconds_count", + "query_time_seconds_sum", + "query_time_seconds_bucket"); Assert(i == NUM_METRICS); @@ -257,7 +324,7 @@ neon_get_backend_perf_counters(PG_FUNCTION_ARGS) } static inline void -histogram_merge_into(IOHistogram into, IOHistogram from) +io_histogram_merge_into(IOHistogram into, IOHistogram from) { into->wait_us_count += from->wait_us_count; into->wait_us_sum += from->wait_us_sum; @@ -265,6 +332,15 @@ histogram_merge_into(IOHistogram into, IOHistogram from) into->wait_us_bucket[bucketno] += from->wait_us_bucket[bucketno]; } +static inline void +qt_histogram_merge_into(QTHistogram into, QTHistogram from) +{ + into->elapsed_us_count += from->elapsed_us_count; + into->elapsed_us_sum += from->elapsed_us_sum; + for (int bucketno = 0; bucketno < NUM_QT_BUCKETS; bucketno++) + into->elapsed_us_bucket[bucketno] += from->elapsed_us_bucket[bucketno]; +} + PG_FUNCTION_INFO_V1(neon_get_perf_counters); Datum neon_get_perf_counters(PG_FUNCTION_ARGS) @@ -283,7 +359,7 @@ neon_get_perf_counters(PG_FUNCTION_ARGS) { neon_per_backend_counters *counters = &neon_per_backend_counters_shared[procno]; - histogram_merge_into(&totals.getpage_hist, &counters->getpage_hist); + io_histogram_merge_into(&totals.getpage_hist, &counters->getpage_hist); totals.getpage_prefetch_requests_total += counters->getpage_prefetch_requests_total; totals.getpage_sync_requests_total += counters->getpage_sync_requests_total; totals.getpage_prefetch_misses_total += counters->getpage_prefetch_misses_total; @@ -294,13 +370,13 @@ neon_get_perf_counters(PG_FUNCTION_ARGS) totals.pageserver_open_requests += counters->pageserver_open_requests; totals.getpage_prefetches_buffered += counters->getpage_prefetches_buffered; totals.file_cache_hits_total += counters->file_cache_hits_total; - histogram_merge_into(&totals.file_cache_read_hist, &counters->file_cache_read_hist); - histogram_merge_into(&totals.file_cache_write_hist, &counters->file_cache_write_hist); - totals.compute_getpage_stuck_requests_total += counters->compute_getpage_stuck_requests_total; totals.compute_getpage_max_inflight_stuck_time_ms = Max( totals.compute_getpage_max_inflight_stuck_time_ms, counters->compute_getpage_max_inflight_stuck_time_ms); + io_histogram_merge_into(&totals.file_cache_read_hist, &counters->file_cache_read_hist); + io_histogram_merge_into(&totals.file_cache_write_hist, &counters->file_cache_write_hist); + qt_histogram_merge_into(&totals.query_time_hist, &counters->query_time_hist); } metrics = neon_perf_counters_to_metrics(&totals); diff --git a/pgxn/neon/neon_perf_counters.h b/pgxn/neon/neon_perf_counters.h index 10cf094d4a..4b611b0636 100644 --- a/pgxn/neon/neon_perf_counters.h +++ b/pgxn/neon/neon_perf_counters.h @@ -36,6 +36,28 @@ typedef struct IOHistogramData typedef IOHistogramData *IOHistogram; +static const uint64 qt_bucket_thresholds[] = { + 2, 3, 6, 10, /* 0 us - 10 us */ + 20, 30, 60, 100, /* 10 us - 100 us */ + 200, 300, 600, 1000, /* 100 us - 1 ms */ + 2000, 3000, 6000, 10000, /* 1 ms - 10 ms */ + 20000, 30000, 60000, 100000, /* 10 ms - 100 ms */ + 200000, 300000, 600000, 1000000, /* 100 ms - 1 s */ + 2000000, 3000000, 6000000, 10000000, /* 1 s - 10 s */ + 20000000, 30000000, 60000000, 100000000, /* 10 s - 100 s */ + UINT64_MAX, +}; +#define NUM_QT_BUCKETS (lengthof(qt_bucket_thresholds)) + +typedef struct QTHistogramData +{ + uint64 elapsed_us_count; + uint64 elapsed_us_sum; + uint64 elapsed_us_bucket[NUM_QT_BUCKETS]; +} QTHistogramData; + +typedef QTHistogramData *QTHistogram; + typedef struct { /* @@ -127,6 +149,11 @@ typedef struct /* LFC I/O time buckets */ IOHistogramData file_cache_read_hist; IOHistogramData file_cache_write_hist; + + /* + * Histogram of query execution time. + */ + QTHistogramData query_time_hist; } neon_per_backend_counters; /* Pointer to the shared memory array of neon_per_backend_counters structs */ @@ -149,6 +176,7 @@ extern neon_per_backend_counters *neon_per_backend_counters_shared; extern void inc_getpage_wait(uint64 latency); extern void inc_page_cache_read_wait(uint64 latency); extern void inc_page_cache_write_wait(uint64 latency); +extern void inc_query_time(uint64 elapsed); extern Size NeonPerfCountersShmemSize(void); extern void NeonPerfCountersShmemInit(void); diff --git a/pgxn/neon/neon_pgversioncompat.c b/pgxn/neon/neon_pgversioncompat.c index 7c404fb5a9..6f57b618da 100644 --- a/pgxn/neon/neon_pgversioncompat.c +++ b/pgxn/neon/neon_pgversioncompat.c @@ -5,6 +5,7 @@ #include "funcapi.h" #include "miscadmin.h" +#include "access/xlog.h" #include "utils/tuplestore.h" #include "neon_pgversioncompat.h" @@ -41,5 +42,12 @@ InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags) rsinfo->setDesc = stored_tupdesc; MemoryContextSwitchTo(old_context); } + +TimeLineID GetWALInsertionTimeLine(void) +{ + return ThisTimeLineID + 1; +} + + #endif diff --git a/pgxn/neon/neon_pgversioncompat.h b/pgxn/neon/neon_pgversioncompat.h index bf91a02b45..787bd552f8 100644 --- a/pgxn/neon/neon_pgversioncompat.h +++ b/pgxn/neon/neon_pgversioncompat.h @@ -162,6 +162,7 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode, #if PG_MAJORVERSION_NUM < 15 extern void InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags); +extern TimeLineID GetWALInsertionTimeLine(void); #endif #endif /* NEON_PGVERSIONCOMPAT_H */ diff --git a/pgxn/neon/neon_walreader.c b/pgxn/neon/neon_walreader.c index d5e3a38dbb..0a1f6d9c72 100644 --- a/pgxn/neon/neon_walreader.c +++ b/pgxn/neon/neon_walreader.c @@ -69,6 +69,7 @@ struct NeonWALReader WALSegmentContext segcxt; WALOpenSegment seg; int wre_errno; + TimeLineID local_active_tlid; /* Explains failure to read, static for simplicity. */ char err_msg[NEON_WALREADER_ERR_MSG_LEN]; @@ -106,7 +107,7 @@ struct NeonWALReader /* palloc and initialize NeonWALReader */ NeonWALReader * -NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix) +NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix, TimeLineID tlid) { NeonWALReader *reader; @@ -118,6 +119,7 @@ NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_ MemoryContextAllocZero(TopMemoryContext, sizeof(NeonWALReader)); reader->available_lsn = available_lsn; + reader->local_active_tlid = tlid; reader->seg.ws_file = -1; reader->seg.ws_segno = 0; reader->seg.ws_tli = 0; @@ -577,6 +579,17 @@ NeonWALReaderIsRemConnEstablished(NeonWALReader *state) return state->rem_state == RS_ESTABLISHED; } +/* + * Whether remote connection is established. Once this is done, until successful + * local read or error socket is stable and user can update socket events + * instead of readding it each time. + */ +TimeLineID +NeonWALReaderLocalActiveTimeLineID(NeonWALReader *state) +{ + return state->local_active_tlid; +} + /* * Returns events user should wait on connection socket or 0 if remote * connection is not active. diff --git a/pgxn/neon/neon_walreader.h b/pgxn/neon/neon_walreader.h index 3e41825069..722bc10537 100644 --- a/pgxn/neon/neon_walreader.h +++ b/pgxn/neon/neon_walreader.h @@ -19,9 +19,10 @@ typedef enum NEON_WALREAD_ERROR, } NeonWALReadResult; -extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix); +extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix, TimeLineID tlid); extern void NeonWALReaderFree(NeonWALReader *state); extern void NeonWALReaderResetRemote(NeonWALReader *state); +extern TimeLineID NeonWALReaderLocalActiveTimeLineID(NeonWALReader *state); extern NeonWALReadResult NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli); extern pgsocket NeonWALReaderSocket(NeonWALReader *state); extern uint32 NeonWALReaderEvents(NeonWALReader *state); diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index f42103c7cd..91d39345e2 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -98,6 +98,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) wp = palloc0(sizeof(WalProposer)); wp->config = config; wp->api = api; + wp->localTimeLineID = config->pgTimeline; wp->state = WPS_COLLECTING_TERMS; wp->mconf.generation = INVALID_GENERATION; wp->mconf.members.len = 0; @@ -119,6 +120,10 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) { wp_log(FATAL, "failed to parse neon.safekeepers generation number: %m"); } + if (*endptr != ':') + { + wp_log(FATAL, "failed to parse neon.safekeepers: no colon after generation"); + } /* Skip past : to the first hostname. */ host = endptr + 1; } @@ -1380,7 +1385,7 @@ ProcessPropStartPos(WalProposer *wp) * we must bail out, as clog and other non rel data is inconsistent. */ walprop_shared = wp->api.get_shmem_state(wp); - if (!wp->config->syncSafekeepers) + if (!wp->config->syncSafekeepers && !walprop_shared->replica_promote) { /* * Basebackup LSN always points to the beginning of the record (not diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index cca20e746b..08087e5a55 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -391,6 +391,7 @@ typedef struct WalproposerShmemState /* last feedback from each shard */ PageserverFeedback shard_ps_feedback[MAX_SHARDS]; int num_shards; + bool replica_promote; /* aggregated feedback with min LSNs across shards */ PageserverFeedback min_ps_feedback; @@ -806,6 +807,9 @@ typedef struct WalProposer /* Safekeepers walproposer is connecting to. */ Safekeeper safekeeper[MAX_SAFEKEEPERS]; + /* Current local TimeLineId in use */ + TimeLineID localTimeLineID; + /* WAL has been generated up to this point */ XLogRecPtr availableLsn; diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index d15bf91d24..3d6a92ad79 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -35,6 +35,7 @@ #include "storage/proc.h" #include "storage/ipc.h" #include "storage/lwlock.h" +#include "storage/pg_shmem.h" #include "storage/shmem.h" #include "storage/spin.h" #include "tcop/tcopprot.h" @@ -159,12 +160,19 @@ WalProposerMain(Datum main_arg) { WalProposer *wp; + if (*wal_acceptors_list == '\0') + { + wpg_log(WARNING, "Safekeepers list is empty"); + return; + } + init_walprop_config(false); walprop_pg_init_bgworker(); am_walproposer = true; walprop_pg_load_libpqwalreceiver(); wp = WalProposerCreate(&walprop_config, walprop_pg); + wp->localTimeLineID = GetWALInsertionTimeLine(); wp->last_reconnect_attempt = walprop_pg_get_current_timestamp(wp); walprop_pg_init_walsender(); @@ -272,6 +280,30 @@ split_safekeepers_list(char *safekeepers_list, char *safekeepers[]) return n_safekeepers; } +static char *split_off_safekeepers_generation(char *safekeepers_list, uint32 *generation) +{ + char *endptr; + + if (strncmp(safekeepers_list, "g#", 2) != 0) + { + return safekeepers_list; + } + else + { + errno = 0; + *generation = strtoul(safekeepers_list + 2, &endptr, 10); + if (errno != 0) + { + wp_log(FATAL, "failed to parse neon.safekeepers generation number: %m"); + } + if (*endptr != ':') + { + wp_log(FATAL, "failed to parse neon.safekeepers: no colon after generation"); + } + return endptr + 1; + } +} + /* * Accept two coma-separated strings with list of safekeeper host:port addresses. * Split them into arrays and return false if two sets do not match, ignoring the order. @@ -283,6 +315,16 @@ safekeepers_cmp(char *old, char *new) char *safekeepers_new[MAX_SAFEKEEPERS]; int len_old = 0; int len_new = 0; + uint32 gen_old = INVALID_GENERATION; + uint32 gen_new = INVALID_GENERATION; + + old = split_off_safekeepers_generation(old, &gen_old); + new = split_off_safekeepers_generation(new, &gen_new); + + if (gen_old != gen_new) + { + return false; + } len_old = split_safekeepers_list(old, safekeepers_old); len_new = split_safekeepers_list(new, safekeepers_new); @@ -316,6 +358,9 @@ assign_neon_safekeepers(const char *newval, void *extra) char *newval_copy; char *oldval; + if (newval && *newval != '\0' && UsedShmemSegAddr && walprop_shared && RecoveryInProgress()) + walprop_shared->replica_promote = true; + if (!am_walproposer) return; @@ -506,16 +551,15 @@ BackpressureThrottlingTime(void) /* * Register a background worker proposing WAL to wal acceptors. + * We start walproposer bgworker even for replicas in order to support possible replica promotion. + * When pg_promote() function is called, then walproposer bgworker registered with BgWorkerStart_RecoveryFinished + * is automatically launched when promotion is completed. */ static void walprop_register_bgworker(void) { BackgroundWorker bgw; - /* If no wal acceptors are specified, don't start the background worker. */ - if (*wal_acceptors_list == '\0') - return; - memset(&bgw, 0, sizeof(bgw)); bgw.bgw_flags = BGWORKER_SHMEM_ACCESS; bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; @@ -1292,9 +1336,7 @@ StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd) #if PG_VERSION_NUM < 150000 if (ThisTimeLineID == 0) - ereport(ERROR, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("IDENTIFY_SYSTEM has not been run before START_REPLICATION"))); + ThisTimeLineID = 1; #endif /* @@ -1508,7 +1550,7 @@ walprop_pg_wal_reader_allocate(Safekeeper *sk) snprintf(log_prefix, sizeof(log_prefix), WP_LOG_PREFIX "sk %s:%s nwr: ", sk->host, sk->port); Assert(!sk->xlogreader); - sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propTermStartLsn, log_prefix); + sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propTermStartLsn, log_prefix, sk->wp->localTimeLineID); if (sk->xlogreader == NULL) wpg_log(FATAL, "failed to allocate xlog reader"); } @@ -1522,7 +1564,7 @@ walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count, buf, startptr, count, - walprop_pg_get_timeline_id()); + sk->wp->localTimeLineID); if (res == NEON_WALREAD_SUCCESS) { diff --git a/pgxn/neon/walsender_hooks.c b/pgxn/neon/walsender_hooks.c index 81198d6c8d..534bf1c19b 100644 --- a/pgxn/neon/walsender_hooks.c +++ b/pgxn/neon/walsender_hooks.c @@ -111,7 +111,7 @@ NeonWALPageRead( readBuf, targetPagePtr, count, - walprop_pg_get_timeline_id()); + NeonWALReaderLocalActiveTimeLineID(wal_reader)); if (res == NEON_WALREAD_SUCCESS) { @@ -202,7 +202,7 @@ NeonOnDemandXLogReaderRoutines(XLogReaderRoutine *xlr) { elog(ERROR, "unable to start walsender when basebackupLsn is 0"); } - wal_reader = NeonWALReaderAllocate(wal_segment_size, basebackupLsn, "[walsender] "); + wal_reader = NeonWALReaderAllocate(wal_segment_size, basebackupLsn, "[walsender] ", 1); } xlr->page_read = NeonWALPageRead; xlr->segment_open = NeonWALReadSegmentOpen; diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index a50c30257f..c388848926 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -15,9 +15,9 @@ use crate::context::RequestContext; use crate::control_plane::client::cplane_proxy_v1; use crate::control_plane::{self, CachedNodeInfo, NodeInfo}; use crate::error::{ReportableError, UserFacingError}; +use crate::pglb::connect_compute::ComputeConnectBackend; use crate::pqproto::BeMessage; use crate::proxy::NeonOptions; -use crate::proxy::connect_compute::ComputeConnectBackend; use crate::stream::PqStream; use crate::types::RoleName; use crate::{auth, compute, waiters}; diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 735cb52f47..f978f655c4 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -25,9 +25,9 @@ use crate::control_plane::{ RoleAccessControl, }; use crate::intern::EndpointIdInt; +use crate::pglb::connect_compute::ComputeConnectBackend; use crate::pqproto::BeMessage; use crate::proxy::NeonOptions; -use crate::proxy::connect_compute::ComputeConnectBackend; use crate::rate_limiter::EndpointRateLimiter; use crate::stream::Stream; use crate::types::{EndpointCacheKey, EndpointId, RoleName}; diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index dcae263647..757c1e988b 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -221,8 +221,7 @@ struct ProxyCliArgs { is_private_access_proxy: bool, /// Configure whether all incoming requests have a Proxy Protocol V2 packet. - // TODO(conradludgate): switch default to rejected or required once we've updated all deployments - #[clap(value_enum, long, default_value_t = ProxyProtocolV2::Supported)] + #[clap(value_enum, long, default_value_t = ProxyProtocolV2::Rejected)] proxy_protocol_v2: ProxyProtocolV2, /// Time the proxy waits for the webauth session to be confirmed by the control plane. diff --git a/proxy/src/config.rs b/proxy/src/config.rs index a97339df9a..248584a19a 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -39,8 +39,6 @@ pub struct ComputeConfig { pub enum ProxyProtocolV2 { /// Connection will error if PROXY protocol v2 header is missing Required, - /// Connection will parse PROXY protocol v2 header, but accept the connection if it's missing. - Supported, /// Connection will error if PROXY protocol v2 header is provided Rejected, } diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 7fb84b5ee5..f2484b54b8 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -11,10 +11,10 @@ use crate::config::{ProxyConfig, ProxyProtocolV2}; use crate::context::RequestContext; use crate::error::ReportableError; use crate::metrics::{Metrics, NumClientConnectionsGuard}; +use crate::pglb::connect_compute::{TcpMechanism, connect_to_compute}; +use crate::pglb::handshake::{HandshakeData, handshake}; +use crate::pglb::passthrough::ProxyPassthrough; use crate::protocol2::{ConnectHeader, ConnectionInfo, read_proxy_protocol}; -use crate::proxy::connect_compute::{TcpMechanism, connect_to_compute}; -use crate::proxy::handshake::{HandshakeData, handshake}; -use crate::proxy::passthrough::ProxyPassthrough; use crate::proxy::{ ClientRequestError, ErrorSource, prepare_client_connection, run_until_cancelled, }; @@ -54,30 +54,24 @@ pub async fn task_main( debug!(protocol = "tcp", %session_id, "accepted new TCP connection"); connections.spawn(async move { - let (socket, peer_addr) = match read_proxy_protocol(socket).await { - Err(e) => { - error!("per-client task finished with an error: {e:#}"); - return; + let (socket, conn_info) = match config.proxy_protocol_v2 { + ProxyProtocolV2::Required => { + match read_proxy_protocol(socket).await { + Err(e) => { + error!("per-client task finished with an error: {e:#}"); + return; + } + // our load balancers will not send any more data. let's just exit immediately + Ok((_socket, ConnectHeader::Local)) => { + debug!("healthcheck received"); + return; + } + Ok((socket, ConnectHeader::Proxy(info))) => (socket, info), + } } - // our load balancers will not send any more data. let's just exit immediately - Ok((_socket, ConnectHeader::Local)) => { - debug!("healthcheck received"); - return; - } - Ok((_socket, ConnectHeader::Missing)) - if config.proxy_protocol_v2 == ProxyProtocolV2::Required => - { - error!("missing required proxy protocol header"); - return; - } - Ok((_socket, ConnectHeader::Proxy(_))) - if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => - { - error!("proxy protocol header not supported"); - return; - } - Ok((socket, ConnectHeader::Proxy(info))) => (socket, info), - Ok((socket, ConnectHeader::Missing)) => ( + // ignore the header - it cannot be confused for a postgres or http connection so will + // error later. + ProxyProtocolV2::Rejected => ( socket, ConnectionInfo { addr: peer_addr, @@ -86,7 +80,7 @@ pub async fn task_main( ), }; - match socket.inner.set_nodelay(true) { + match socket.set_nodelay(true) { Ok(()) => {} Err(e) => { error!( @@ -98,7 +92,7 @@ pub async fn task_main( let ctx = RequestContext::new( session_id, - peer_addr, + conn_info, crate::metrics::Protocol::Tcp, &config.region, ); diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/pglb/connect_compute.rs similarity index 98% rename from proxy/src/proxy/connect_compute.rs rename to proxy/src/pglb/connect_compute.rs index 57785c9ec5..1d6ca5fbb3 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/pglb/connect_compute.rs @@ -2,7 +2,6 @@ use async_trait::async_trait; use tokio::time; use tracing::{debug, info, warn}; -use super::retry::ShouldRetryWakeCompute; use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; use crate::compute::{self, COULD_NOT_CONNECT, PostgresConnection}; use crate::config::{ComputeConfig, RetryConfig}; @@ -15,7 +14,7 @@ use crate::metrics::{ ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType, }; use crate::pqproto::StartupMessageParams; -use crate::proxy::retry::{CouldRetry, retry_after, should_retry}; +use crate::proxy::retry::{CouldRetry, ShouldRetryWakeCompute, retry_after, should_retry}; use crate::proxy::wake_compute::wake_compute; use crate::types::Host; diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/pglb/copy_bidirectional.rs similarity index 100% rename from proxy/src/proxy/copy_bidirectional.rs rename to proxy/src/pglb/copy_bidirectional.rs diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/pglb/handshake.rs similarity index 100% rename from proxy/src/proxy/handshake.rs rename to proxy/src/pglb/handshake.rs diff --git a/proxy/src/pglb/mod.rs b/proxy/src/pglb/mod.rs index 1088859fb9..4b107142a7 100644 --- a/proxy/src/pglb/mod.rs +++ b/proxy/src/pglb/mod.rs @@ -1 +1,5 @@ +pub mod connect_compute; +pub mod copy_bidirectional; +pub mod handshake; pub mod inprocess; +pub mod passthrough; diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/pglb/passthrough.rs similarity index 97% rename from proxy/src/proxy/passthrough.rs rename to proxy/src/pglb/passthrough.rs index 55ab5f4dba..6f651d383d 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/pglb/passthrough.rs @@ -53,7 +53,7 @@ pub(crate) async fn proxy_pass( // Starting from here we only proxy the client's traffic. debug!("performing the proxy pass..."); - let _ = crate::proxy::copy_bidirectional::copy_bidirectional_client_compute( + let _ = crate::pglb::copy_bidirectional::copy_bidirectional_client_compute( &mut client, &mut compute, ) diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 0793998639..5bec6d6ca3 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -4,60 +4,13 @@ use core::fmt; use std::io; use std::net::{Ipv4Addr, Ipv6Addr, SocketAddr}; -use std::pin::Pin; -use std::task::{Context, Poll}; -use bytes::{Buf, Bytes, BytesMut}; -use pin_project_lite::pin_project; +use bytes::Buf; use smol_str::SmolStr; use strum_macros::FromRepr; -use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf}; +use tokio::io::{AsyncRead, AsyncReadExt}; use zerocopy::{FromBytes, Immutable, KnownLayout, Unaligned, network_endian}; -pin_project! { - /// A chained [`AsyncRead`] with [`AsyncWrite`] passthrough - pub(crate) struct ChainRW { - #[pin] - pub(crate) inner: T, - buf: BytesMut, - } -} - -impl AsyncWrite for ChainRW { - #[inline] - fn poll_write( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - buf: &[u8], - ) -> Poll> { - self.project().inner.poll_write(cx, buf) - } - - #[inline] - fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - self.project().inner.poll_flush(cx) - } - - #[inline] - fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - self.project().inner.poll_shutdown(cx) - } - - #[inline] - fn poll_write_vectored( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - bufs: &[io::IoSlice<'_>], - ) -> Poll> { - self.project().inner.poll_write_vectored(cx, bufs) - } - - #[inline] - fn is_write_vectored(&self) -> bool { - self.inner.is_write_vectored() - } -} - /// Proxy Protocol Version 2 Header const SIGNATURE: [u8; 12] = [ 0x0D, 0x0A, 0x0D, 0x0A, 0x00, 0x0D, 0x0A, 0x51, 0x55, 0x49, 0x54, 0x0A, @@ -79,7 +32,6 @@ pub struct ConnectionInfo { #[derive(PartialEq, Eq, Clone, Debug)] pub enum ConnectHeader { - Missing, Local, Proxy(ConnectionInfo), } @@ -106,47 +58,24 @@ pub enum ConnectionInfoExtra { pub(crate) async fn read_proxy_protocol( mut read: T, -) -> std::io::Result<(ChainRW, ConnectHeader)> { - let mut buf = BytesMut::with_capacity(128); - let header = loop { - let bytes_read = read.read_buf(&mut buf).await?; - - // exit for bad header signature - let len = usize::min(buf.len(), SIGNATURE.len()); - if buf[..len] != SIGNATURE[..len] { - return Ok((ChainRW { inner: read, buf }, ConnectHeader::Missing)); - } - - // if no more bytes available then exit - if bytes_read == 0 { - return Ok((ChainRW { inner: read, buf }, ConnectHeader::Missing)); - } - - // check if we have enough bytes to continue - if let Some(header) = buf.try_get::() { - break header; - } - }; - - let remaining_length = usize::from(header.len.get()); - - while buf.len() < remaining_length { - if read.read_buf(&mut buf).await? == 0 { - return Err(io::Error::new( - io::ErrorKind::UnexpectedEof, - "stream closed while waiting for proxy protocol addresses", - )); - } +) -> std::io::Result<(T, ConnectHeader)> { + let mut header = [0; size_of::()]; + read.read_exact(&mut header).await?; + let header: ProxyProtocolV2Header = zerocopy::transmute!(header); + if header.signature != SIGNATURE { + return Err(std::io::Error::other("invalid proxy protocol header")); } - let payload = buf.split_to(remaining_length); - let res = process_proxy_payload(header, payload)?; - Ok((ChainRW { inner: read, buf }, res)) + let mut payload = vec![0; usize::from(header.len.get())]; + read.read_exact(&mut payload).await?; + + let res = process_proxy_payload(header, &payload)?; + Ok((read, res)) } fn process_proxy_payload( header: ProxyProtocolV2Header, - mut payload: BytesMut, + mut payload: &[u8], ) -> std::io::Result { match header.version_and_command { // the connection was established on purpose by the proxy @@ -162,13 +91,12 @@ fn process_proxy_payload( PROXY_V2 => {} // other values are unassigned and must not be emitted by senders. Receivers // must drop connections presenting unexpected values here. - #[rustfmt::skip] // https://github.com/rust-lang/rustfmt/issues/6384 - _ => return Err(io::Error::other( - format!( + _ => { + return Err(io::Error::other(format!( "invalid proxy protocol command 0x{:02X}. expected local (0x20) or proxy (0x21)", header.version_and_command - ), - )), + ))); + } } let size_err = @@ -206,7 +134,7 @@ fn process_proxy_payload( } let subtype = tlv.value.get_u8(); match Pp2AwsType::from_repr(subtype) { - Some(Pp2AwsType::VpceId) => match std::str::from_utf8(&tlv.value) { + Some(Pp2AwsType::VpceId) => match std::str::from_utf8(tlv.value) { Ok(s) => { extra = Some(ConnectionInfoExtra::Aws { vpce_id: s.into() }); } @@ -282,65 +210,28 @@ enum Pp2AzureType { PrivateEndpointLinkId = 0x01, } -impl AsyncRead for ChainRW { - #[inline] - fn poll_read( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - buf: &mut ReadBuf<'_>, - ) -> Poll> { - if self.buf.is_empty() { - self.project().inner.poll_read(cx, buf) - } else { - self.read_from_buf(buf) - } - } -} - -impl ChainRW { - #[cold] - fn read_from_buf(self: Pin<&mut Self>, buf: &mut ReadBuf<'_>) -> Poll> { - debug_assert!(!self.buf.is_empty()); - let this = self.project(); - - let write = usize::min(this.buf.len(), buf.remaining()); - let slice = this.buf.split_to(write).freeze(); - buf.put_slice(&slice); - - // reset the allocation so it can be freed - if this.buf.is_empty() { - *this.buf = BytesMut::new(); - } - - Poll::Ready(Ok(())) - } -} - #[derive(Debug)] -struct Tlv { +struct Tlv<'a> { kind: u8, - value: Bytes, + value: &'a [u8], } -fn read_tlv(b: &mut BytesMut) -> Option { +fn read_tlv<'a>(b: &mut &'a [u8]) -> Option> { let tlv_header = b.try_get::()?; let len = usize::from(tlv_header.len.get()); - if b.len() < len { - return None; - } Some(Tlv { kind: tlv_header.kind, - value: b.split_to(len).freeze(), + value: b.split_off(..len)?, }) } trait BufExt: Sized { fn try_get(&mut self) -> Option; } -impl BufExt for BytesMut { +impl BufExt for &[u8] { fn try_get(&mut self) -> Option { - let (res, _) = T::read_from_prefix(self).ok()?; - self.advance(size_of::()); + let (res, rest) = T::read_from_prefix(self).ok()?; + *self = rest; Some(res) } } @@ -481,27 +372,19 @@ mod tests { } #[tokio::test] + #[should_panic = "invalid proxy protocol header"] async fn test_invalid() { let data = [0x55; 256]; - let (mut read, info) = read_proxy_protocol(data.as_slice()).await.unwrap(); - - let mut bytes = vec![]; - read.read_to_end(&mut bytes).await.unwrap(); - assert_eq!(bytes, data); - assert_eq!(info, ConnectHeader::Missing); + read_proxy_protocol(data.as_slice()).await.unwrap(); } #[tokio::test] + #[should_panic = "early eof"] async fn test_short() { let data = [0x55; 10]; - let (mut read, info) = read_proxy_protocol(data.as_slice()).await.unwrap(); - - let mut bytes = vec![]; - read.read_to_end(&mut bytes).await.unwrap(); - assert_eq!(bytes, data); - assert_eq!(info, ConnectHeader::Missing); + read_proxy_protocol(data.as_slice()).await.unwrap(); } #[tokio::test] diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 0ffc54aa88..0e138cc0c7 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -1,15 +1,10 @@ #[cfg(test)] mod tests; -pub(crate) mod connect_compute; -mod copy_bidirectional; -pub(crate) mod handshake; -pub(crate) mod passthrough; pub(crate) mod retry; pub(crate) mod wake_compute; use std::sync::Arc; -pub use copy_bidirectional::{ErrorSource, copy_bidirectional_client_compute}; use futures::FutureExt; use itertools::Itertools; use once_cell::sync::OnceCell; @@ -21,16 +16,17 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tokio_util::sync::CancellationToken; use tracing::{Instrument, debug, error, info, warn}; -use self::connect_compute::{TcpMechanism, connect_to_compute}; -use self::passthrough::ProxyPassthrough; use crate::cancellation::{self, CancellationHandler}; use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig}; use crate::context::RequestContext; use crate::error::{ReportableError, UserFacingError}; use crate::metrics::{Metrics, NumClientConnectionsGuard}; +use crate::pglb::connect_compute::{TcpMechanism, connect_to_compute}; +pub use crate::pglb::copy_bidirectional::{ErrorSource, copy_bidirectional_client_compute}; +use crate::pglb::handshake::{HandshakeData, HandshakeError, handshake}; +use crate::pglb::passthrough::ProxyPassthrough; use crate::pqproto::{BeMessage, CancelKeyData, StartupMessageParams}; use crate::protocol2::{ConnectHeader, ConnectionInfo, ConnectionInfoExtra, read_proxy_protocol}; -use crate::proxy::handshake::{HandshakeData, handshake}; use crate::rate_limiter::EndpointRateLimiter; use crate::stream::{PqStream, Stream}; use crate::types::EndpointCacheKey; @@ -102,30 +98,24 @@ pub async fn task_main( let endpoint_rate_limiter2 = endpoint_rate_limiter.clone(); connections.spawn(async move { - let (socket, conn_info) = match read_proxy_protocol(socket).await { - Err(e) => { - warn!("per-client task finished with an error: {e:#}"); - return; + let (socket, conn_info) = match config.proxy_protocol_v2 { + ProxyProtocolV2::Required => { + match read_proxy_protocol(socket).await { + Err(e) => { + warn!("per-client task finished with an error: {e:#}"); + return; + } + // our load balancers will not send any more data. let's just exit immediately + Ok((_socket, ConnectHeader::Local)) => { + debug!("healthcheck received"); + return; + } + Ok((socket, ConnectHeader::Proxy(info))) => (socket, info), + } } - // our load balancers will not send any more data. let's just exit immediately - Ok((_socket, ConnectHeader::Local)) => { - debug!("healthcheck received"); - return; - } - Ok((_socket, ConnectHeader::Missing)) - if config.proxy_protocol_v2 == ProxyProtocolV2::Required => - { - warn!("missing required proxy protocol header"); - return; - } - Ok((_socket, ConnectHeader::Proxy(_))) - if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => - { - warn!("proxy protocol header not supported"); - return; - } - Ok((socket, ConnectHeader::Proxy(info))) => (socket, info), - Ok((socket, ConnectHeader::Missing)) => ( + // ignore the header - it cannot be confused for a postgres or http connection so will + // error later. + ProxyProtocolV2::Rejected => ( socket, ConnectionInfo { addr: peer_addr, @@ -134,7 +124,7 @@ pub async fn task_main( ), }; - match socket.inner.set_nodelay(true) { + match socket.set_nodelay(true) { Ok(()) => {} Err(e) => { error!( @@ -248,7 +238,7 @@ pub(crate) enum ClientRequestError { #[error("{0}")] Cancellation(#[from] cancellation::CancelError), #[error("{0}")] - Handshake(#[from] handshake::HandshakeError), + Handshake(#[from] HandshakeError), #[error("{0}")] HandshakeTimeout(#[from] tokio::time::error::Elapsed), #[error("{0}")] diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 61e8ee4a10..e5db0013a7 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -17,7 +17,6 @@ use rustls::pki_types; use tokio::io::DuplexStream; use tracing_test::traced_test; -use super::connect_compute::ConnectMechanism; use super::retry::CouldRetry; use super::*; use crate::auth::backend::{ @@ -28,6 +27,7 @@ use crate::control_plane::client::{ControlPlaneClient, TestControlPlaneClient}; use crate::control_plane::messages::{ControlPlaneErrorMessage, Details, MetricsAuxInfo, Status}; use crate::control_plane::{self, CachedNodeInfo, NodeInfo, NodeInfoCache}; use crate::error::ErrorKind; +use crate::pglb::connect_compute::ConnectMechanism; use crate::tls::client_config::compute_client_config_with_certs; use crate::tls::postgres_rustls::MakeRustlsConnect; use crate::tls::server_config::CertResolver; @@ -173,7 +173,6 @@ async fn dummy_proxy( tls: Option, auth: impl TestAuth + Send, ) -> anyhow::Result<()> { - let (client, _) = read_proxy_protocol(client).await?; let mut stream = match handshake(&RequestContext::test(), client, tls.as_ref(), false).await? { HandshakeData::Startup(stream, _) => stream, HandshakeData::Cancel(_) => bail!("cancellation not supported"), diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index 9d8915e24a..06c2da58db 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -1,6 +1,5 @@ use tracing::{error, info}; -use super::connect_compute::ComputeConnectBackend; use crate::config::RetryConfig; use crate::context::RequestContext; use crate::control_plane::CachedNodeInfo; @@ -9,6 +8,7 @@ use crate::error::ReportableError; use crate::metrics::{ ConnectOutcome, ConnectionFailuresBreakdownGroup, Metrics, RetriesMetricGroup, RetryType, }; +use crate::pglb::connect_compute::ComputeConnectBackend; use crate::proxy::retry::{retry_after, should_retry}; // Use macro to retain original callsite. diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index bf640c05e9..748e0ce6f2 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -35,7 +35,7 @@ use crate::control_plane::errors::{GetAuthInfoError, WakeComputeError}; use crate::control_plane::locks::ApiLocks; use crate::error::{ErrorKind, ReportableError, UserFacingError}; use crate::intern::EndpointIdInt; -use crate::proxy::connect_compute::ConnectMechanism; +use crate::pglb::connect_compute::ConnectMechanism; use crate::proxy::retry::{CouldRetry, ShouldRetryWakeCompute}; use crate::rate_limiter::EndpointRateLimiter; use crate::types::{EndpointId, Host, LOCAL_PROXY_SUFFIX}; @@ -182,7 +182,7 @@ impl PoolingBackend { tracing::Span::current().record("conn_id", display(conn_id)); info!(%conn_id, "pool: opening a new connection '{conn_info}'"); let backend = self.auth_backend.as_ref().map(|()| keys); - crate::proxy::connect_compute::connect_to_compute( + crate::pglb::connect_compute::connect_to_compute( ctx, &TokioMechanism { conn_id, @@ -226,7 +226,7 @@ impl PoolingBackend { }, keys: crate::auth::backend::ComputeCredentialKeys::None, }); - crate::proxy::connect_compute::connect_to_compute( + crate::pglb::connect_compute::connect_to_compute( ctx, &HyperMechanism { conn_id, diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 2a7069b1c2..f6f681ac45 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -49,7 +49,7 @@ use crate::config::{ProxyConfig, ProxyProtocolV2}; use crate::context::RequestContext; use crate::ext::TaskExt; use crate::metrics::Metrics; -use crate::protocol2::{ChainRW, ConnectHeader, ConnectionInfo, read_proxy_protocol}; +use crate::protocol2::{ConnectHeader, ConnectionInfo, read_proxy_protocol}; use crate::proxy::run_until_cancelled; use crate::rate_limiter::EndpointRateLimiter; use crate::serverless::backend::PoolingBackend; @@ -207,12 +207,12 @@ pub(crate) type AsyncRW = Pin>; #[async_trait] trait MaybeTlsAcceptor: Send + Sync + 'static { - async fn accept(&self, conn: ChainRW) -> std::io::Result; + async fn accept(&self, conn: TcpStream) -> std::io::Result; } #[async_trait] impl MaybeTlsAcceptor for &'static ArcSwapOption { - async fn accept(&self, conn: ChainRW) -> std::io::Result { + async fn accept(&self, conn: TcpStream) -> std::io::Result { match &*self.load() { Some(config) => Ok(Box::pin( TlsAcceptor::from(config.http_config.clone()) @@ -235,33 +235,30 @@ async fn connection_startup( peer_addr: SocketAddr, ) -> Option<(AsyncRW, ConnectionInfo)> { // handle PROXY protocol - let (conn, peer) = match read_proxy_protocol(conn).await { - Ok(c) => c, - Err(e) => { - tracing::warn!(?session_id, %peer_addr, "failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}"); - return None; + let (conn, conn_info) = match config.proxy_protocol_v2 { + ProxyProtocolV2::Required => { + match read_proxy_protocol(conn).await { + Err(e) => { + warn!("per-client task finished with an error: {e:#}"); + return None; + } + // our load balancers will not send any more data. let's just exit immediately + Ok((_conn, ConnectHeader::Local)) => { + tracing::debug!("healthcheck received"); + return None; + } + Ok((conn, ConnectHeader::Proxy(info))) => (conn, info), + } } - }; - - let conn_info = match peer { - // our load balancers will not send any more data. let's just exit immediately - ConnectHeader::Local => { - tracing::debug!("healthcheck received"); - return None; - } - ConnectHeader::Missing if config.proxy_protocol_v2 == ProxyProtocolV2::Required => { - tracing::warn!("missing required proxy protocol header"); - return None; - } - ConnectHeader::Proxy(_) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => { - tracing::warn!("proxy protocol header not supported"); - return None; - } - ConnectHeader::Proxy(info) => info, - ConnectHeader::Missing => ConnectionInfo { - addr: peer_addr, - extra: None, - }, + // ignore the header - it cannot be confused for a postgres or http connection so will + // error later. + ProxyProtocolV2::Rejected => ( + conn, + ConnectionInfo { + addr: peer_addr, + extra: None, + }, + ), }; let has_private_peer_addr = match conn_info.addr.ip() { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index ab4885ce6b..db3f080261 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -357,31 +357,6 @@ class PgProtocol: return TimelineId(cast("str", self.safe_psql("show neon.timeline_id")[0][0])) -class PageserverWalReceiverProtocol(StrEnum): - VANILLA = "vanilla" - INTERPRETED = "interpreted" - - @staticmethod - def to_config_key_value(proto) -> tuple[str, dict[str, Any]]: - if proto == PageserverWalReceiverProtocol.VANILLA: - return ( - "wal_receiver_protocol", - { - "type": "vanilla", - }, - ) - elif proto == PageserverWalReceiverProtocol.INTERPRETED: - return ( - "wal_receiver_protocol", - { - "type": "interpreted", - "args": {"format": "protobuf", "compression": {"zstd": {"level": 1}}}, - }, - ) - else: - raise ValueError(f"Unknown protocol type: {proto}") - - @dataclass class PageserverTracingConfig: sampling_ratio: tuple[int, int] @@ -423,6 +398,7 @@ class PageserverImportConfig: "import_job_concurrency": self.import_job_concurrency, "import_job_soft_size_limit": self.import_job_soft_size_limit, "import_job_checkpoint_threshold": self.import_job_checkpoint_threshold, + "import_job_max_byte_range_size": 4 * 1024 * 1024, # Pageserver default } return ("timeline_import_config", value) @@ -474,7 +450,6 @@ class NeonEnvBuilder: safekeeper_extra_opts: list[str] | None = None, storage_controller_port_override: int | None = None, pageserver_virtual_file_io_mode: str | None = None, - pageserver_wal_receiver_protocol: PageserverWalReceiverProtocol | None = None, pageserver_get_vectored_concurrent_io: str | None = None, pageserver_tracing_config: PageserverTracingConfig | None = None, pageserver_import_config: PageserverImportConfig | None = None, @@ -551,11 +526,6 @@ class NeonEnvBuilder: self.pageserver_virtual_file_io_mode = pageserver_virtual_file_io_mode - if pageserver_wal_receiver_protocol is not None: - self.pageserver_wal_receiver_protocol = pageserver_wal_receiver_protocol - else: - self.pageserver_wal_receiver_protocol = PageserverWalReceiverProtocol.INTERPRETED - assert test_name.startswith("test_"), ( "Unexpectedly instantiated from outside a test function" ) @@ -1201,7 +1171,6 @@ class NeonEnv: self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine self.pageserver_virtual_file_io_mode = config.pageserver_virtual_file_io_mode - self.pageserver_wal_receiver_protocol = config.pageserver_wal_receiver_protocol self.pageserver_get_vectored_concurrent_io = config.pageserver_get_vectored_concurrent_io self.pageserver_tracing_config = config.pageserver_tracing_config if config.pageserver_import_config is None: @@ -1333,13 +1302,6 @@ class NeonEnv: for key, value in override.items(): ps_cfg[key] = value - if self.pageserver_wal_receiver_protocol is not None: - key, value = PageserverWalReceiverProtocol.to_config_key_value( - self.pageserver_wal_receiver_protocol - ) - if key not in ps_cfg: - ps_cfg[key] = value - if self.pageserver_tracing_config is not None: key, value = self.pageserver_tracing_config.to_config_key_value() @@ -4710,7 +4672,7 @@ class EndpointFactory: origin: Endpoint, endpoint_id: str | None = None, config_lines: list[str] | None = None, - ): + ) -> Endpoint: branch_name = origin.branch_name assert origin in self.endpoints assert branch_name is not None @@ -4729,7 +4691,7 @@ class EndpointFactory: origin: Endpoint, endpoint_id: str | None = None, config_lines: list[str] | None = None, - ): + ) -> Endpoint: branch_name = origin.branch_name assert origin in self.endpoints assert branch_name is not None diff --git a/test_runner/performance/test_sharded_ingest.py b/test_runner/performance/test_sharded_ingest.py index 293026d40a..364fcf3737 100644 --- a/test_runner/performance/test_sharded_ingest.py +++ b/test_runner/performance/test_sharded_ingest.py @@ -15,19 +15,10 @@ from fixtures.neon_fixtures import ( @pytest.mark.timeout(1200) @pytest.mark.parametrize("shard_count", [1, 8, 32]) -@pytest.mark.parametrize( - "wal_receiver_protocol", - [ - "vanilla", - "interpreted-bincode-compressed", - "interpreted-protobuf-compressed", - ], -) def test_sharded_ingest( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, shard_count: int, - wal_receiver_protocol: str, ): """ Benchmarks sharded ingestion throughput, by ingesting a large amount of WAL into a Safekeeper @@ -39,36 +30,6 @@ def test_sharded_ingest( neon_env_builder.num_pageservers = shard_count env = neon_env_builder.init_configs() - for ps in env.pageservers: - if wal_receiver_protocol == "vanilla": - ps.patch_config_toml_nonrecursive( - { - "wal_receiver_protocol": { - "type": "vanilla", - } - } - ) - elif wal_receiver_protocol == "interpreted-bincode-compressed": - ps.patch_config_toml_nonrecursive( - { - "wal_receiver_protocol": { - "type": "interpreted", - "args": {"format": "bincode", "compression": {"zstd": {"level": 1}}}, - } - } - ) - elif wal_receiver_protocol == "interpreted-protobuf-compressed": - ps.patch_config_toml_nonrecursive( - { - "wal_receiver_protocol": { - "type": "interpreted", - "args": {"format": "protobuf", "compression": {"zstd": {"level": 1}}}, - } - } - ) - else: - raise AssertionError("Test must use explicit wal receiver protocol config") - env.start() # Create a sharded tenant and timeline, and migrate it to the respective pageservers. Ensure diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 3eb6b7193c..dc44fc77db 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -182,10 +182,6 @@ def test_fully_custom_config(positive_env: NeonEnv): "lsn_lease_length": "1m", "lsn_lease_length_for_ts": "5s", "timeline_offloading": False, - "wal_receiver_protocol_override": { - "type": "interpreted", - "args": {"format": "bincode", "compression": {"zstd": {"level": 1}}}, - }, "rel_size_v2_enabled": True, "relsize_snapshot_cache_capacity": 10000, "gc_compaction_enabled": True, diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 370f57b19d..1570d40ae9 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -10,7 +10,6 @@ import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, - PageserverWalReceiverProtocol, generate_uploads_and_deletions, ) from fixtures.pageserver.http import PageserverApiException @@ -68,14 +67,9 @@ PREEMPT_GC_COMPACTION_TENANT_CONF = { @skip_in_debug_build("only run with release build") -@pytest.mark.parametrize( - "wal_receiver_protocol", - [PageserverWalReceiverProtocol.VANILLA, PageserverWalReceiverProtocol.INTERPRETED], -) @pytest.mark.timeout(900) def test_pageserver_compaction_smoke( neon_env_builder: NeonEnvBuilder, - wal_receiver_protocol: PageserverWalReceiverProtocol, ): """ This is a smoke test that compaction kicks in. The workload repeatedly churns @@ -85,8 +79,6 @@ def test_pageserver_compaction_smoke( observed bounds. """ - neon_env_builder.pageserver_wal_receiver_protocol = wal_receiver_protocol - # Effectively disable the page cache to rely only on image layers # to shorten reads. neon_env_builder.pageserver_config_override = """ diff --git a/test_runner/regress/test_compute_metrics.py b/test_runner/regress/test_compute_metrics.py index 2cb2ee7b58..c751a3e7cc 100644 --- a/test_runner/regress/test_compute_metrics.py +++ b/test_runner/regress/test_compute_metrics.py @@ -466,8 +466,13 @@ def test_perf_counters(neon_simple_env: NeonEnv): # # 1.5 is the minimum version to contain these views. cur.execute("CREATE EXTENSION neon VERSION '1.5'") + cur.execute("set neon.monitor_query_exec_time = on") cur.execute("SELECT * FROM neon_perf_counters") cur.execute("SELECT * FROM neon_backend_perf_counters") + cur.execute( + "select value from neon_backend_perf_counters where metric='query_time_seconds_count' and pid=pg_backend_pid()" + ) + assert cur.fetchall()[0][0] == 2 def collect_metric( diff --git a/test_runner/regress/test_crafted_wal_end.py b/test_runner/regress/test_crafted_wal_end.py index 6b9dcbba07..89ff873ca3 100644 --- a/test_runner/regress/test_crafted_wal_end.py +++ b/test_runner/regress/test_crafted_wal_end.py @@ -1,9 +1,13 @@ from __future__ import annotations +from typing import TYPE_CHECKING + import pytest from fixtures.log_helper import log from fixtures.neon_cli import WalCraft -from fixtures.neon_fixtures import NeonEnvBuilder, PageserverWalReceiverProtocol + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnvBuilder # Restart nodes with WAL end having specially crafted shape, like last record # crossing segment boundary, to test decoding issues. @@ -19,17 +23,10 @@ from fixtures.neon_fixtures import NeonEnvBuilder, PageserverWalReceiverProtocol "wal_record_crossing_segment_followed_by_small_one", ], ) -@pytest.mark.parametrize( - "wal_receiver_protocol", - [PageserverWalReceiverProtocol.VANILLA, PageserverWalReceiverProtocol.INTERPRETED], -) def test_crafted_wal_end( neon_env_builder: NeonEnvBuilder, wal_type: str, - wal_receiver_protocol: PageserverWalReceiverProtocol, ): - neon_env_builder.pageserver_wal_receiver_protocol = wal_receiver_protocol - env = neon_env_builder.init_start() env.create_branch("test_crafted_wal_end") env.pageserver.allowed_errors.extend( diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index 24ba0713d2..fe3b220c67 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -159,7 +159,8 @@ def test_remote_extensions( # Setup a mock nginx S3 gateway which will return our test extension. (host, port) = httpserver_listen_address - extensions_endpoint = f"http://{host}:{port}/pg-ext-s3-gateway" + remote_ext_base_url = f"http://{host}:{port}/pg-ext-s3-gateway" + log.info(f"remote extensions base URL: {remote_ext_base_url}") extension.build(pg_config, test_output_dir) tarball = extension.package(test_output_dir) @@ -221,7 +222,7 @@ def test_remote_extensions( endpoint.create_remote_extension_spec(spec) - endpoint.start(remote_ext_base_url=extensions_endpoint) + endpoint.start(remote_ext_base_url=remote_ext_base_url) with endpoint.connect() as conn: with conn.cursor() as cur: @@ -249,7 +250,7 @@ def test_remote_extensions( # Remove the extension files to force a redownload of the extension. extension.remove(test_output_dir, pg_version) - endpoint.start(remote_ext_base_url=extensions_endpoint) + endpoint.start(remote_ext_base_url=remote_ext_base_url) # Test that ALTER EXTENSION UPDATE statements also fetch remote extensions. with endpoint.connect() as conn: diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index 4044f25b37..1ff61ce8dc 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -74,8 +74,9 @@ def test_hot_standby(neon_simple_env: NeonEnv): for query in queries: with s_con.cursor() as secondary_cursor: secondary_cursor.execute(query) - response = secondary_cursor.fetchone() - assert response is not None + res = secondary_cursor.fetchone() + assert res is not None + response = res assert response == responses[query] # Check for corrupted WAL messages which might otherwise go unnoticed if @@ -164,7 +165,7 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool): s_cur.execute("SELECT COUNT(*) FROM test") res = s_cur.fetchone() - assert res[0] == 10000 + assert res == (10000,) # Clear the cache in the standby, so that when we # re-execute the query, it will make GetPage @@ -195,7 +196,7 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool): s_cur.execute("SELECT COUNT(*) FROM test") log_replica_lag(primary, secondary) res = s_cur.fetchone() - assert res[0] == 10000 + assert res == (10000,) def run_pgbench(connstr: str, pg_bin: PgBin): diff --git a/test_runner/regress/test_replica_promotes.py b/test_runner/regress/test_replica_promotes.py new file mode 100644 index 0000000000..e378d37635 --- /dev/null +++ b/test_runner/regress/test_replica_promotes.py @@ -0,0 +1,133 @@ +""" +File with secondary->primary promotion testing. + +This far, only contains a test that we don't break and that the data is persisted. +""" + +import psycopg2 +from fixtures.log_helper import log +from fixtures.neon_fixtures import Endpoint, NeonEnv, wait_replica_caughtup +from fixtures.pg_version import PgVersion +from pytest import raises + + +def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion): + """ + Test that a replica safely promotes, and can commit data updates which + show up when the primary boots up after the promoted secondary endpoint + shut down. + """ + + # Initialize the primary, a test table, and a helper function to create lots + # of subtransactions. + env: NeonEnv = neon_simple_env + primary: Endpoint = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + secondary: Endpoint = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") + + with primary.connect() as primary_conn: + primary_cur = primary_conn.cursor() + primary_cur.execute( + "create table t(pk bigint GENERATED ALWAYS AS IDENTITY, payload integer)" + ) + primary_cur.execute("INSERT INTO t(payload) SELECT generate_series(1, 100)") + primary_cur.execute( + """ + SELECT pg_current_wal_insert_lsn(), + pg_current_wal_lsn(), + pg_current_wal_flush_lsn() + """ + ) + log.info(f"Primary: Current LSN after workload is {primary_cur.fetchone()}") + primary_cur.execute("show neon.safekeepers") + safekeepers = primary_cur.fetchall()[0][0] + + wait_replica_caughtup(primary, secondary) + + with secondary.connect() as secondary_conn: + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select count(*) from t") + + assert secondary_cur.fetchone() == (100,) + + with raises(psycopg2.Error): + secondary_cur.execute("INSERT INTO t (payload) SELECT generate_series(101, 200)") + secondary_conn.commit() + + secondary_conn.rollback() + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (100,) + + primary.stop_and_destroy(mode="immediate") + + # Reconnect to the secondary to make sure we get a read-write connection + promo_conn = secondary.connect() + promo_cur = promo_conn.cursor() + promo_cur.execute(f"alter system set neon.safekeepers='{safekeepers}'") + promo_cur.execute("select pg_reload_conf()") + + promo_cur.execute("SELECT * FROM pg_promote()") + assert promo_cur.fetchone() == (True,) + promo_cur.execute( + """ + SELECT pg_current_wal_insert_lsn(), + pg_current_wal_lsn(), + pg_current_wal_flush_lsn() + """ + ) + log.info(f"Secondary: LSN after promotion is {promo_cur.fetchone()}") + + # Reconnect to the secondary to make sure we get a read-write connection + with secondary.connect() as new_primary_conn: + new_primary_cur = new_primary_conn.cursor() + new_primary_cur.execute("select count(*) from t") + assert new_primary_cur.fetchone() == (100,) + + new_primary_cur.execute( + "INSERT INTO t (payload) SELECT generate_series(101, 200) RETURNING payload" + ) + assert new_primary_cur.fetchall() == [(it,) for it in range(101, 201)] + + new_primary_cur = new_primary_conn.cursor() + new_primary_cur.execute("select payload from t") + assert new_primary_cur.fetchall() == [(it,) for it in range(1, 201)] + + new_primary_cur.execute("select count(*) from t") + assert new_primary_cur.fetchone() == (200,) + new_primary_cur.execute( + """ + SELECT pg_current_wal_insert_lsn(), + pg_current_wal_lsn(), + pg_current_wal_flush_lsn() + """ + ) + log.info(f"Secondary: LSN after workload is {new_primary_cur.fetchone()}") + + with secondary.connect() as second_viewpoint_conn: + new_primary_cur = second_viewpoint_conn.cursor() + new_primary_cur.execute("select payload from t") + assert new_primary_cur.fetchall() == [(it,) for it in range(1, 201)] + + # wait_for_last_flush_lsn(env, secondary, env.initial_tenant, env.initial_timeline) + + secondary.stop_and_destroy() + + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + + with primary.connect() as new_primary: + new_primary_cur = new_primary.cursor() + new_primary_cur.execute( + """ + SELECT pg_current_wal_insert_lsn(), + pg_current_wal_lsn(), + pg_current_wal_flush_lsn() + """ + ) + log.info(f"New primary: Boot LSN is {new_primary_cur.fetchone()}") + + new_primary_cur.execute("select count(*) from t") + assert new_primary_cur.fetchone() == (200,) + new_primary_cur.execute("INSERT INTO t (payload) SELECT generate_series(201, 300)") + new_primary_cur.execute("select count(*) from t") + assert new_primary_cur.fetchone() == (300,) + + primary.stop(mode="immediate") diff --git a/test_runner/regress/test_subxacts.py b/test_runner/regress/test_subxacts.py index b235da0bc7..92a21007c8 100644 --- a/test_runner/regress/test_subxacts.py +++ b/test_runner/regress/test_subxacts.py @@ -1,9 +1,7 @@ from __future__ import annotations -import pytest from fixtures.neon_fixtures import ( NeonEnvBuilder, - PageserverWalReceiverProtocol, check_restored_datadir_content, ) @@ -14,13 +12,7 @@ from fixtures.neon_fixtures import ( # maintained in the pageserver, so subtransactions are not very exciting for # Neon. They are included in the commit record though and updated in the # CLOG. -@pytest.mark.parametrize( - "wal_receiver_protocol", - [PageserverWalReceiverProtocol.VANILLA, PageserverWalReceiverProtocol.INTERPRETED], -) -def test_subxacts(neon_env_builder: NeonEnvBuilder, test_output_dir, wal_receiver_protocol): - neon_env_builder.pageserver_wal_receiver_protocol = wal_receiver_protocol - +def test_subxacts(neon_env_builder: NeonEnvBuilder, test_output_dir): env = neon_env_builder.init_start() endpoint = env.endpoints.create_start("main") diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index de6bdc0aec..d78b9d8817 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -348,7 +348,6 @@ def test_tenant_config_patch(neon_env_builder: NeonEnvBuilder, ps_managed_by: st def assert_tenant_conf_semantically_equal(lhs, rhs): """ - Storcon returns None for fields that are not set while the pageserver does not. Compare two tenant's config overrides semantically, by dropping the None values. """ lhs = {k: v for k, v in lhs.items() if v is not None} @@ -375,10 +374,7 @@ def test_tenant_config_patch(neon_env_builder: NeonEnvBuilder, ps_managed_by: st patch: dict[str, Any | None] = { "gc_period": "3h", - "wal_receiver_protocol_override": { - "type": "interpreted", - "args": {"format": "bincode", "compression": {"zstd": {"level": 1}}}, - }, + "gc_compaction_ratio_percent": 10, } api.patch_tenant_config(env.initial_tenant, patch) tenant_conf_after_patch = api.tenant_config(env.initial_tenant).tenant_specific_overrides @@ -391,7 +387,7 @@ def test_tenant_config_patch(neon_env_builder: NeonEnvBuilder, ps_managed_by: st assert_tenant_conf_semantically_equal(tenant_conf_after_patch, crnt_tenant_conf | patch) crnt_tenant_conf = tenant_conf_after_patch - patch = {"gc_period": "5h", "wal_receiver_protocol_override": None} + patch = {"gc_period": "5h", "gc_compaction_ratio_percent": None} api.patch_tenant_config(env.initial_tenant, patch) tenant_conf_after_patch = api.tenant_config(env.initial_tenant).tenant_specific_overrides if ps_managed_by == "storcon": diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index 4070f99568..d8a7dc2a2b 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -14,7 +14,6 @@ from fixtures.neon_fixtures import ( Endpoint, NeonEnv, NeonEnvBuilder, - PageserverWalReceiverProtocol, Safekeeper, ) from fixtures.remote_storage import RemoteStorageKind @@ -751,15 +750,8 @@ async def run_segment_init_failure(env: NeonEnv): # Test (injected) failure during WAL segment init. # https://github.com/neondatabase/neon/issues/6401 # https://github.com/neondatabase/neon/issues/6402 -@pytest.mark.parametrize( - "wal_receiver_protocol", - [PageserverWalReceiverProtocol.VANILLA, PageserverWalReceiverProtocol.INTERPRETED], -) -def test_segment_init_failure( - neon_env_builder: NeonEnvBuilder, wal_receiver_protocol: PageserverWalReceiverProtocol -): +def test_segment_init_failure(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 1 - neon_env_builder.pageserver_wal_receiver_protocol = wal_receiver_protocol env = neon_env_builder.init_start() asyncio.run(run_segment_init_failure(env)) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 55c0d45abe..6770bc2513 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 55c0d45abe6467c02084c2192bca117eda6ce1e7 +Subproject commit 6770bc251301ef40c66f7ecb731741dc435b5051 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index de7640f55d..8c3249f36c 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit de7640f55da07512834d5cc40c4b3fb376b5f04f +Subproject commit 8c3249f36c7df6ac0efb8ee9f1baf4aa1b83e5c9 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 0bf96bd6d7..7a4c0eacae 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 0bf96bd6d70301a0b43b0b3457bb3cf8fb43c198 +Subproject commit 7a4c0eacaeb9b97416542fa19103061c166460b1 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 8be779fd3a..db424d42d7 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 8be779fd3ab9e87206da96a7e4842ef1abf04f44 +Subproject commit db424d42d748f8ad91ac00e28db2c7f2efa42f7f diff --git a/vendor/revisions.json b/vendor/revisions.json index 3e999760f4..12d5499ddb 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.5", - "8be779fd3ab9e87206da96a7e4842ef1abf04f44" + "db424d42d748f8ad91ac00e28db2c7f2efa42f7f" ], "v16": [ "16.9", - "0bf96bd6d70301a0b43b0b3457bb3cf8fb43c198" + "7a4c0eacaeb9b97416542fa19103061c166460b1" ], "v15": [ "15.13", - "de7640f55da07512834d5cc40c4b3fb376b5f04f" + "8c3249f36c7df6ac0efb8ee9f1baf4aa1b83e5c9" ], "v14": [ "14.18", - "55c0d45abe6467c02084c2192bca117eda6ce1e7" + "6770bc251301ef40c66f7ecb731741dc435b5051" ] }