From 3b8be98b67acbb3da0852ca5adf33408e2313f89 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 3 Jun 2025 10:07:07 +0100 Subject: [PATCH 01/25] pageserver: remove backtrace in info level log (#12108) ## Problem We print a backtrace in an info level log every 10 seconds while waiting for the import data to land in the bucket. ## Summary of changes The backtrace is not useful. Remove it. --- pageserver/src/tenant/timeline/import_pgdata.rs | 4 ++-- pageserver/src/tenant/timeline/import_pgdata/flow.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pageserver/src/tenant/timeline/import_pgdata.rs b/pageserver/src/tenant/timeline/import_pgdata.rs index 3f760d858b..606ad09ef1 100644 --- a/pageserver/src/tenant/timeline/import_pgdata.rs +++ b/pageserver/src/tenant/timeline/import_pgdata.rs @@ -201,8 +201,8 @@ async fn prepare_import( .await; match res { Ok(_) => break, - Err(err) => { - info!(?err, "indefinitely waiting for pgdata to finish"); + Err(_err) => { + info!("indefinitely waiting for pgdata to finish"); if tokio::time::timeout(std::time::Duration::from_secs(10), cancel.cancelled()) .await .is_ok() diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs index 760e82dd57..2ec9d86720 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs @@ -471,6 +471,8 @@ impl Plan { last_completed_job_idx = job_idx; if last_completed_job_idx % checkpoint_every == 0 { + tracing::info!(last_completed_job_idx, jobs=%jobs_in_plan, "Checkpointing import status"); + let progress = ShardImportProgressV1 { jobs: jobs_in_plan, completed: last_completed_job_idx, @@ -492,8 +494,6 @@ impl Plan { anyhow::anyhow!("Shut down while putting timeline import status") })?; } - - tracing::info!(last_completed_job_idx, jobs=%jobs_in_plan, "Checkpointing import status"); }, Some(Err(_)) => { anyhow::bail!( From e00fd45bba98dcc22b14e3555684245a43bee160 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 3 Jun 2025 14:20:34 +0200 Subject: [PATCH 02/25] page_api: remove smallvec (#12095) ## Problem The gRPC `page_api` domain types used smallvecs to avoid heap allocations in the common case where a single page is requested. However, this is pointless: the Protobuf types use a normal vec, and converting a smallvec into a vec always causes a heap allocation anyway. ## Summary of changes Use a normal `Vec` instead of a `SmallVec` in `page_api` domain types. --- Cargo.lock | 1 - pageserver/page_api/Cargo.toml | 1 - pageserver/page_api/src/model.rs | 13 ++++++------- pageserver/src/page_service.rs | 4 ++-- 4 files changed, 8 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9fc233e5ec..542a4528d3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4469,7 +4469,6 @@ dependencies = [ "pageserver_api", "postgres_ffi", "prost 0.13.5", - "smallvec", "thiserror 1.0.69", "tonic 0.13.1", "tonic-build", diff --git a/pageserver/page_api/Cargo.toml b/pageserver/page_api/Cargo.toml index 4f62c77eb2..e643b5749b 100644 --- a/pageserver/page_api/Cargo.toml +++ b/pageserver/page_api/Cargo.toml @@ -9,7 +9,6 @@ bytes.workspace = true pageserver_api.workspace = true postgres_ffi.workspace = true prost.workspace = true -smallvec.workspace = true thiserror.workspace = true tonic.workspace = true utils.workspace = true diff --git a/pageserver/page_api/src/model.rs b/pageserver/page_api/src/model.rs index 0268ab920b..a6853895d9 100644 --- a/pageserver/page_api/src/model.rs +++ b/pageserver/page_api/src/model.rs @@ -14,7 +14,6 @@ use std::fmt::Display; use bytes::Bytes; use postgres_ffi::Oid; -use smallvec::SmallVec; // TODO: split out Lsn, RelTag, SlruKind, Oid and other basic types to a separate crate, to avoid // pulling in all of their other crate dependencies when building the client. use utils::lsn::Lsn; @@ -302,7 +301,7 @@ pub struct GetPageRequest { /// Multiple pages will be executed as a single batch by the Pageserver, amortizing layer access /// costs and parallelizing them. This may increase the latency of any individual request, but /// improves the overall latency and throughput of the batch as a whole. - pub block_numbers: SmallVec<[u32; 1]>, + pub block_numbers: Vec, } impl TryFrom for GetPageRequest { @@ -320,7 +319,7 @@ impl TryFrom for GetPageRequest { .ok_or(ProtocolError::Missing("read_lsn"))? .try_into()?, rel: pb.rel.ok_or(ProtocolError::Missing("rel"))?.try_into()?, - block_numbers: pb.block_number.into(), + block_numbers: pb.block_number, }) } } @@ -337,7 +336,7 @@ impl TryFrom for proto::GetPageRequest { request_class: request.request_class.into(), read_lsn: Some(request.read_lsn.try_into()?), rel: Some(request.rel.into()), - block_number: request.block_numbers.into_vec(), + block_number: request.block_numbers, }) } } @@ -410,7 +409,7 @@ pub struct GetPageResponse { /// A string describing the status, if any. pub reason: Option, /// The 8KB page images, in the same order as the request. Empty if status != OK. - pub page_images: SmallVec<[Bytes; 1]>, + pub page_images: Vec, } impl From for GetPageResponse { @@ -419,7 +418,7 @@ impl From for GetPageResponse { request_id: pb.request_id, status_code: pb.status_code.into(), reason: Some(pb.reason).filter(|r| !r.is_empty()), - page_images: pb.page_image.into(), + page_images: pb.page_image, } } } @@ -430,7 +429,7 @@ impl From for proto::GetPageResponse { request_id: response.request_id, status_code: response.status_code.into(), reason: response.reason.unwrap_or_default(), - page_image: response.page_images.into_vec(), + page_image: response.page_images, } } } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index b9ba4a3555..735c944970 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -765,7 +765,7 @@ impl PageStreamError { request_id, status_code, reason: Some(status.message().to_string()), - page_images: SmallVec::new(), + page_images: Vec::new(), } .into()) } @@ -3521,7 +3521,7 @@ impl GrpcPageServiceHandler { request_id: req.request_id, status_code: page_api::GetPageStatusCode::Ok, reason: None, - page_images: SmallVec::with_capacity(results.len()), + page_images: Vec::with_capacity(results.len()), }; for result in results { From 25fffd3a55b8c3dfba01631e94c25335cdcaa190 Mon Sep 17 00:00:00 2001 From: Trung Dinh Date: Tue, 3 Jun 2025 06:37:11 -0700 Subject: [PATCH 03/25] Validate max_batch_size against max_get_vectored_keys (#12052) ## Problem Setting `max_batch_size` to anything higher than `Timeline::MAX_GET_VECTORED_KEYS` will cause runtime error. We should rather fail fast at startup if this is the case. ## Summary of changes * Create `max_get_vectored_keys` as a new configuration (default to 32); * Validate `max_batch_size` against `max_get_vectored_keys` right at config parsing and validation. Closes https://github.com/neondatabase/neon/issues/11994 --- libs/pageserver_api/src/config.rs | 18 ++++++++++- pageserver/src/basebackup.rs | 2 +- pageserver/src/config.rs | 48 ++++++++++++++++++++++++++++- pageserver/src/metrics.rs | 6 ++-- pageserver/src/pgdatadir_mapping.rs | 10 +++--- pageserver/src/tenant.rs | 6 ++-- pageserver/src/tenant/timeline.rs | 18 ++++++----- 7 files changed, 86 insertions(+), 22 deletions(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 444983bd18..7ca623f8e5 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -181,6 +181,7 @@ pub struct ConfigToml { pub virtual_file_io_engine: Option, pub ingest_batch_size: u64, pub max_vectored_read_bytes: MaxVectoredReadBytes, + pub max_get_vectored_keys: MaxGetVectoredKeys, pub image_compression: ImageCompressionAlgorithm, pub timeline_offloading: bool, pub ephemeral_bytes_per_memory_kb: usize, @@ -229,7 +230,7 @@ pub enum PageServicePipeliningConfig { } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub struct PageServicePipeliningConfigPipelined { - /// Causes runtime errors if larger than max get_vectored batch size. + /// Failed config parsing and validation if larger than `max_get_vectored_keys`. pub max_batch_size: NonZeroUsize, pub execution: PageServiceProtocolPipelinedExecutionStrategy, // The default below is such that new versions of the software can start @@ -403,6 +404,16 @@ impl Default for EvictionOrder { #[serde(transparent)] pub struct MaxVectoredReadBytes(pub NonZeroUsize); +#[derive(Copy, Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(transparent)] +pub struct MaxGetVectoredKeys(NonZeroUsize); + +impl MaxGetVectoredKeys { + pub fn get(&self) -> usize { + self.0.get() + } +} + /// Tenant-level configuration values, used for various purposes. #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] #[serde(default)] @@ -587,6 +598,8 @@ pub mod defaults { /// That is, slightly above 128 kB. pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 130 * 1024; // 130 KiB + pub const DEFAULT_MAX_GET_VECTORED_KEYS: usize = 32; + pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm = ImageCompressionAlgorithm::Zstd { level: Some(1) }; @@ -685,6 +698,9 @@ impl Default for ConfigToml { max_vectored_read_bytes: (MaxVectoredReadBytes( NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), )), + max_get_vectored_keys: (MaxGetVectoredKeys( + NonZeroUsize::new(DEFAULT_MAX_GET_VECTORED_KEYS).unwrap(), + )), image_compression: (DEFAULT_IMAGE_COMPRESSION), timeline_offloading: true, ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 4dba9d267c..2a0548b811 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -371,7 +371,7 @@ where .await? .partition( self.timeline.get_shard_identity(), - Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64, + self.timeline.conf.max_get_vectored_keys.get() as u64 * BLCKSZ as u64, ); let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar); diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 89f7539722..628d4f6021 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -14,7 +14,10 @@ use std::time::Duration; use anyhow::{Context, bail, ensure}; use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; -use pageserver_api::config::{DiskUsageEvictionTaskConfig, MaxVectoredReadBytes, PostHogConfig}; +use pageserver_api::config::{ + DiskUsageEvictionTaskConfig, MaxGetVectoredKeys, MaxVectoredReadBytes, + PageServicePipeliningConfig, PageServicePipeliningConfigPipelined, PostHogConfig, +}; use pageserver_api::models::ImageCompressionAlgorithm; use pageserver_api::shard::TenantShardId; use pem::Pem; @@ -185,6 +188,9 @@ pub struct PageServerConf { pub max_vectored_read_bytes: MaxVectoredReadBytes, + /// Maximum number of keys to be read in a single get_vectored call. + pub max_get_vectored_keys: MaxGetVectoredKeys, + pub image_compression: ImageCompressionAlgorithm, /// Whether to offload archived timelines automatically @@ -404,6 +410,7 @@ impl PageServerConf { secondary_download_concurrency, ingest_batch_size, max_vectored_read_bytes, + max_get_vectored_keys, image_compression, timeline_offloading, ephemeral_bytes_per_memory_kb, @@ -470,6 +477,7 @@ impl PageServerConf { secondary_download_concurrency, ingest_batch_size, max_vectored_read_bytes, + max_get_vectored_keys, image_compression, timeline_offloading, ephemeral_bytes_per_memory_kb, @@ -598,6 +606,19 @@ impl PageServerConf { ) })?; + if let PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined { + max_batch_size, + .. + }) = conf.page_service_pipelining + { + if max_batch_size.get() > conf.max_get_vectored_keys.get() { + return Err(anyhow::anyhow!( + "`max_batch_size` ({max_batch_size}) must be less than or equal to `max_get_vectored_keys` ({})", + conf.max_get_vectored_keys.get() + )); + } + }; + Ok(conf) } @@ -685,6 +706,7 @@ impl ConfigurableSemaphore { mod tests { use camino::Utf8PathBuf; + use rstest::rstest; use utils::id::NodeId; use super::PageServerConf; @@ -724,4 +746,28 @@ mod tests { PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir) .expect_err("parse_and_validate should fail for endpoint without scheme"); } + + #[rstest] + #[case(32, 32, true)] + #[case(64, 32, false)] + #[case(64, 64, true)] + #[case(128, 128, true)] + fn test_config_max_batch_size_is_valid( + #[case] max_batch_size: usize, + #[case] max_get_vectored_keys: usize, + #[case] is_valid: bool, + ) { + let input = format!( + r#" + control_plane_api = "http://localhost:6666" + max_get_vectored_keys = {max_get_vectored_keys} + page_service_pipelining = {{ mode="pipelined", execution="concurrent-futures", max_batch_size={max_batch_size}, batching="uniform-lsn" }} + "#, + ); + let config_toml = toml_edit::de::from_str::(&input) + .expect("config has valid fields"); + let workdir = Utf8PathBuf::from("/nonexistent"); + let result = PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir); + assert_eq!(result.is_ok(), is_valid); + } } diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index a9b2f1b7e0..4bef9b44a7 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -15,6 +15,7 @@ use metrics::{ register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec, }; use once_cell::sync::Lazy; +use pageserver_api::config::defaults::DEFAULT_MAX_GET_VECTORED_KEYS; use pageserver_api::config::{ PageServicePipeliningConfig, PageServicePipeliningConfigPipelined, PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy, @@ -32,7 +33,6 @@ use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext}; use crate::pgdatadir_mapping::DatadirModificationStats; use crate::task_mgr::TaskKind; -use crate::tenant::Timeline; use crate::tenant::layer_map::LayerMap; use crate::tenant::mgr::TenantSlot; use crate::tenant::storage_layer::{InMemoryLayer, PersistentLayerDesc}; @@ -1939,7 +1939,7 @@ static SMGR_QUERY_TIME_GLOBAL: Lazy = Lazy::new(|| { }); static PAGE_SERVICE_BATCH_SIZE_BUCKETS_GLOBAL: Lazy> = Lazy::new(|| { - (1..=u32::try_from(Timeline::MAX_GET_VECTORED_KEYS).unwrap()) + (1..=u32::try_from(DEFAULT_MAX_GET_VECTORED_KEYS).unwrap()) .map(|v| v.into()) .collect() }); @@ -1957,7 +1957,7 @@ static PAGE_SERVICE_BATCH_SIZE_BUCKETS_PER_TIMELINE: Lazy> = Lazy::new( let mut buckets = Vec::new(); for i in 0.. { let bucket = 1 << i; - if bucket > u32::try_from(Timeline::MAX_GET_VECTORED_KEYS).unwrap() { + if bucket > u32::try_from(DEFAULT_MAX_GET_VECTORED_KEYS).unwrap() { break; } buckets.push(bucket.into()); diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index b6f11b744b..633d62210d 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -431,10 +431,10 @@ impl Timeline { GetVectoredError::InvalidLsn(e) => { Err(anyhow::anyhow!("invalid LSN: {e:?}").into()) } - // NB: this should never happen in practice because we limit MAX_GET_VECTORED_KEYS + // NB: this should never happen in practice because we limit batch size to be smaller than max_get_vectored_keys // TODO: we can prevent this error class by moving this check into the type system - GetVectoredError::Oversized(err) => { - Err(anyhow::anyhow!("batching oversized: {err:?}").into()) + GetVectoredError::Oversized(err, max) => { + Err(anyhow::anyhow!("batching oversized: {err} > {max}").into()) } }; @@ -719,7 +719,7 @@ impl Timeline { let batches = keyspace.partition( self.get_shard_identity(), - Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64, + self.conf.max_get_vectored_keys.get() as u64 * BLCKSZ as u64, ); let io_concurrency = IoConcurrency::spawn_from_conf( @@ -959,7 +959,7 @@ impl Timeline { let batches = keyspace.partition( self.get_shard_identity(), - Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64, + self.conf.max_get_vectored_keys.get() as u64 * BLCKSZ as u64, ); let io_concurrency = IoConcurrency::spawn_from_conf( diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 308ada3fa1..f9fdc143b4 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -7197,7 +7197,7 @@ mod tests { let end = desc .key_range .start - .add(Timeline::MAX_GET_VECTORED_KEYS.try_into().unwrap()); + .add(tenant.conf.max_get_vectored_keys.get() as u32); reads.push(KeySpace { ranges: vec![start..end], }); @@ -11260,11 +11260,11 @@ mod tests { let mut keyspaces_at_lsn: HashMap = HashMap::default(); let mut used_keys: HashSet = HashSet::default(); - while used_keys.len() < Timeline::MAX_GET_VECTORED_KEYS as usize { + while used_keys.len() < tenant.conf.max_get_vectored_keys.get() { let selected_lsn = interesting_lsns.choose(&mut random).expect("not empty"); let mut selected_key = start_key.add(random.gen_range(0..KEY_DIMENSION_SIZE)); - while used_keys.len() < Timeline::MAX_GET_VECTORED_KEYS as usize { + while used_keys.len() < tenant.conf.max_get_vectored_keys.get() { if used_keys.contains(&selected_key) || selected_key >= start_key.add(KEY_DIMENSION_SIZE) { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 9ddbe404d2..8fdf84b6d3 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -817,8 +817,8 @@ pub(crate) enum GetVectoredError { #[error("timeline shutting down")] Cancelled, - #[error("requested too many keys: {0} > {}", Timeline::MAX_GET_VECTORED_KEYS)] - Oversized(u64), + #[error("requested too many keys: {0} > {1}")] + Oversized(u64, u64), #[error("requested at invalid LSN: {0}")] InvalidLsn(Lsn), @@ -1019,7 +1019,7 @@ impl From for PageReconstructError { match e { GetVectoredError::Cancelled => PageReconstructError::Cancelled, GetVectoredError::InvalidLsn(_) => PageReconstructError::Other(anyhow!("Invalid LSN")), - err @ GetVectoredError::Oversized(_) => PageReconstructError::Other(err.into()), + err @ GetVectoredError::Oversized(_, _) => PageReconstructError::Other(err.into()), GetVectoredError::MissingKey(err) => PageReconstructError::MissingKey(err), GetVectoredError::GetReadyAncestorError(err) => PageReconstructError::from(err), GetVectoredError::Other(err) => PageReconstructError::Other(err), @@ -1199,7 +1199,6 @@ impl Timeline { } } - pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32; pub(crate) const LAYERS_VISITED_WARN_THRESHOLD: u32 = 100; /// Look up multiple page versions at a given LSN @@ -1214,9 +1213,12 @@ impl Timeline { ) -> Result>, GetVectoredError> { let total_keyspace = query.total_keyspace(); - let key_count = total_keyspace.total_raw_size().try_into().unwrap(); - if key_count > Timeline::MAX_GET_VECTORED_KEYS { - return Err(GetVectoredError::Oversized(key_count)); + let key_count = total_keyspace.total_raw_size(); + if key_count > self.conf.max_get_vectored_keys.get() { + return Err(GetVectoredError::Oversized( + key_count as u64, + self.conf.max_get_vectored_keys.get() as u64, + )); } for range in &total_keyspace.ranges { @@ -5270,7 +5272,7 @@ impl Timeline { key = key.next(); // Maybe flush `key_rest_accum` - if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS + if key_request_accum.raw_size() >= self.conf.max_get_vectored_keys.get() as u64 || (last_key_in_range && key_request_accum.raw_size() > 0) { let query = From 5bdba70f7dc1c7a489769029d762f1bbea9c727e Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 3 Jun 2025 15:50:41 +0200 Subject: [PATCH 04/25] =?UTF-8?q?page=5Fapi:=20only=20validate=20Protobuf?= =?UTF-8?q?=20=E2=86=92=20domain=20type=20conversion=20(#12115)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem Currently, `page_api` domain types validate message invariants both when converting Protobuf → domain and domain → Protobuf. This is annoying for clients, because they can't use stream combinators to convert streamed requests (needed for hot path performance), and also performs the validation twice in the common case. Blocks #12099. ## Summary of changes Only validate the Protobuf → domain type conversion, i.e. on the receiver side, and make domain → Protobuf infallible. This is where it matters -- the Protobuf types are less strict than the domain types, and receivers should expect all sorts of junk from senders (they're not required to validate anyway, and can just construct an invalid message manually). Also adds a missing `impl From for proto::CheckRelExistsRequest`. --- pageserver/page_api/src/model.rs | 142 +++++++++++++------------------ pageserver/src/page_service.rs | 4 +- 2 files changed, 62 insertions(+), 84 deletions(-) diff --git a/pageserver/page_api/src/model.rs b/pageserver/page_api/src/model.rs index a6853895d9..1a08d04cc1 100644 --- a/pageserver/page_api/src/model.rs +++ b/pageserver/page_api/src/model.rs @@ -9,6 +9,11 @@ //! - Use more precise datatypes, e.g. Lsn and uints shorter than 32 bits. //! //! - Validate protocol invariants, via try_from() and try_into(). +//! +//! Validation only happens on the receiver side, i.e. when converting from Protobuf to domain +//! types. This is where it matters -- the Protobuf types are less strict than the domain types, and +//! receivers should expect all sorts of junk from senders. This also allows the sender to use e.g. +//! stream combinators without dealing with errors, and avoids validating the same message twice. use std::fmt::Display; @@ -71,47 +76,35 @@ impl Display for ReadLsn { } } -impl ReadLsn { - /// Validates the ReadLsn. - pub fn validate(&self) -> Result<(), ProtocolError> { - if self.request_lsn == Lsn::INVALID { - return Err(ProtocolError::invalid("request_lsn", self.request_lsn)); - } - if self.not_modified_since_lsn > Some(self.request_lsn) { - return Err(ProtocolError::invalid( - "not_modified_since_lsn", - self.not_modified_since_lsn, - )); - } - Ok(()) - } -} - impl TryFrom for ReadLsn { type Error = ProtocolError; fn try_from(pb: proto::ReadLsn) -> Result { - let read_lsn = Self { + if pb.request_lsn == 0 { + return Err(ProtocolError::invalid("request_lsn", pb.request_lsn)); + } + if pb.not_modified_since_lsn > pb.request_lsn { + return Err(ProtocolError::invalid( + "not_modified_since_lsn", + pb.not_modified_since_lsn, + )); + } + Ok(Self { request_lsn: Lsn(pb.request_lsn), not_modified_since_lsn: match pb.not_modified_since_lsn { 0 => None, lsn => Some(Lsn(lsn)), }, - }; - read_lsn.validate()?; - Ok(read_lsn) + }) } } -impl TryFrom for proto::ReadLsn { - type Error = ProtocolError; - - fn try_from(read_lsn: ReadLsn) -> Result { - read_lsn.validate()?; - Ok(Self { +impl From for proto::ReadLsn { + fn from(read_lsn: ReadLsn) -> Self { + Self { request_lsn: read_lsn.request_lsn.0, not_modified_since_lsn: read_lsn.not_modified_since_lsn.unwrap_or_default().0, - }) + } } } @@ -166,6 +159,15 @@ impl TryFrom for CheckRelExistsRequest { } } +impl From for proto::CheckRelExistsRequest { + fn from(request: CheckRelExistsRequest) -> Self { + Self { + read_lsn: Some(request.read_lsn.into()), + rel: Some(request.rel.into()), + } + } +} + pub type CheckRelExistsResponse = bool; impl From for CheckRelExistsResponse { @@ -203,14 +205,12 @@ impl TryFrom for GetBaseBackupRequest { } } -impl TryFrom for proto::GetBaseBackupRequest { - type Error = ProtocolError; - - fn try_from(request: GetBaseBackupRequest) -> Result { - Ok(Self { - read_lsn: Some(request.read_lsn.try_into()?), +impl From for proto::GetBaseBackupRequest { + fn from(request: GetBaseBackupRequest) -> Self { + Self { + read_lsn: Some(request.read_lsn.into()), replica: request.replica, - }) + } } } @@ -227,14 +227,9 @@ impl TryFrom for GetBaseBackupResponseChunk { } } -impl TryFrom for proto::GetBaseBackupResponseChunk { - type Error = ProtocolError; - - fn try_from(chunk: GetBaseBackupResponseChunk) -> Result { - if chunk.is_empty() { - return Err(ProtocolError::Missing("chunk")); - } - Ok(Self { chunk }) +impl From for proto::GetBaseBackupResponseChunk { + fn from(chunk: GetBaseBackupResponseChunk) -> Self { + Self { chunk } } } @@ -259,14 +254,12 @@ impl TryFrom for GetDbSizeRequest { } } -impl TryFrom for proto::GetDbSizeRequest { - type Error = ProtocolError; - - fn try_from(request: GetDbSizeRequest) -> Result { - Ok(Self { - read_lsn: Some(request.read_lsn.try_into()?), +impl From for proto::GetDbSizeRequest { + fn from(request: GetDbSizeRequest) -> Self { + Self { + read_lsn: Some(request.read_lsn.into()), db_oid: request.db_oid, - }) + } } } @@ -324,20 +317,15 @@ impl TryFrom for GetPageRequest { } } -impl TryFrom for proto::GetPageRequest { - type Error = ProtocolError; - - fn try_from(request: GetPageRequest) -> Result { - if request.block_numbers.is_empty() { - return Err(ProtocolError::Missing("block_number")); - } - Ok(Self { +impl From for proto::GetPageRequest { + fn from(request: GetPageRequest) -> Self { + Self { request_id: request.request_id, request_class: request.request_class.into(), - read_lsn: Some(request.read_lsn.try_into()?), + read_lsn: Some(request.read_lsn.into()), rel: Some(request.rel.into()), block_number: request.block_numbers, - }) + } } } @@ -518,14 +506,12 @@ impl TryFrom for GetRelSizeRequest { } } -impl TryFrom for proto::GetRelSizeRequest { - type Error = ProtocolError; - - fn try_from(request: GetRelSizeRequest) -> Result { - Ok(Self { - read_lsn: Some(request.read_lsn.try_into()?), +impl From for proto::GetRelSizeRequest { + fn from(request: GetRelSizeRequest) -> Self { + Self { + read_lsn: Some(request.read_lsn.into()), rel: Some(request.rel.into()), - }) + } } } @@ -568,15 +554,13 @@ impl TryFrom for GetSlruSegmentRequest { } } -impl TryFrom for proto::GetSlruSegmentRequest { - type Error = ProtocolError; - - fn try_from(request: GetSlruSegmentRequest) -> Result { - Ok(Self { - read_lsn: Some(request.read_lsn.try_into()?), +impl From for proto::GetSlruSegmentRequest { + fn from(request: GetSlruSegmentRequest) -> Self { + Self { + read_lsn: Some(request.read_lsn.into()), kind: request.kind as u32, segno: request.segno, - }) + } } } @@ -593,15 +577,9 @@ impl TryFrom for GetSlruSegmentResponse { } } -impl TryFrom for proto::GetSlruSegmentResponse { - type Error = ProtocolError; - - fn try_from(segment: GetSlruSegmentResponse) -> Result { - // TODO: can a segment legitimately be empty? - if segment.is_empty() { - return Err(ProtocolError::Missing("segment")); - } - Ok(Self { segment }) +impl From for proto::GetSlruSegmentResponse { + fn from(segment: GetSlruSegmentResponse) -> Self { + Self { segment } } } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 735c944970..77e5f0a92b 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -3660,7 +3660,7 @@ impl proto::PageService for GrpcPageServiceHandler { if chunk.is_empty() { break; } - yield proto::GetBaseBackupResponseChunk::try_from(chunk.clone().freeze())?; + yield proto::GetBaseBackupResponseChunk::from(chunk.clone().freeze()); chunk.clear(); } } @@ -3806,7 +3806,7 @@ impl proto::PageService for GrpcPageServiceHandler { let resp = PageServerHandler::handle_get_slru_segment_request(&timeline, &req, &ctx).await?; let resp: page_api::GetSlruSegmentResponse = resp.segment; - Ok(tonic::Response::new(resp.try_into()?)) + Ok(tonic::Response::new(resp.into())) } } From a963aab14b80062acdfac54af29f876b0f044552 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 3 Jun 2025 15:57:36 +0100 Subject: [PATCH 05/25] pagserver: set default wal receiver proto to interpreted (#12100) ## Problem This is already the default in production and in our test suite. ## Summary of changes Set the default proto to interpreted to reduce friction when spinning up new regions or cells. --- libs/pageserver_api/src/config.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 7ca623f8e5..237833b9de 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -608,7 +608,10 @@ pub mod defaults { pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512; pub const DEFAULT_WAL_RECEIVER_PROTOCOL: utils::postgres_client::PostgresClientProtocol = - utils::postgres_client::PostgresClientProtocol::Vanilla; + utils::postgres_client::PostgresClientProtocol::Interpreted { + format: utils::postgres_client::InterpretedFormat::Protobuf, + compression: Some(utils::postgres_client::Compression::Zstd { level: 1 }), + }; pub const DEFAULT_SSL_KEY_FILE: &str = "server.key"; pub const DEFAULT_SSL_CERT_FILE: &str = "server.crt"; From 4a3f32bf4a2865db28aa8d99e80e4766169e0edf Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 3 Jun 2025 11:00:56 -0500 Subject: [PATCH 06/25] Clean up compute_tools::http::JsonResponse::invalid_status() (#12110) JsonResponse::error() properly logs an error message which can be read in the compute logs. invalid_status() was not going through that helper function, thus not logging anything. Signed-off-by: Tristan Partin --- compute_tools/src/http/mod.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/compute_tools/src/http/mod.rs b/compute_tools/src/http/mod.rs index 9ecc1b0093..9b01def966 100644 --- a/compute_tools/src/http/mod.rs +++ b/compute_tools/src/http/mod.rs @@ -48,11 +48,9 @@ impl JsonResponse { /// Create an error response related to the compute being in an invalid state pub(self) fn invalid_status(status: ComputeStatus) -> Response { - Self::create_response( + Self::error( StatusCode::PRECONDITION_FAILED, - &GenericAPIError { - error: format!("invalid compute status: {status}"), - }, + format!("invalid compute status: {status}"), ) } } From c698cee19ac8378fe536d65140dd24009e65ab35 Mon Sep 17 00:00:00 2001 From: Mikhail Date: Wed, 4 Jun 2025 06:38:03 +0100 Subject: [PATCH 07/25] ComputeSpec: prewarm_lfc_on_startup -> autoprewarm (#12120) https://github.com/neondatabase/cloud/pull/29472 https://github.com/neondatabase/cloud/issues/26346 --- compute_tools/src/compute.rs | 10 +++++----- compute_tools/tests/pg_helpers_tests.rs | 2 +- control_plane/src/endpoint.rs | 2 +- libs/compute_api/src/spec.rs | 4 ++-- libs/compute_api/tests/cluster_spec.json | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index d678b7d670..1ed71180d0 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -396,7 +396,7 @@ impl ComputeNode { // because QEMU will already have its memory allocated from the host, and // the necessary binaries will already be cached. if cli_spec.is_none() { - this.prewarm_postgres()?; + this.prewarm_postgres_vm_memory()?; } // Set the up metric with Empty status before starting the HTTP server. @@ -779,7 +779,7 @@ impl ComputeNode { // Spawn the extension stats background task self.spawn_extension_stats_task(); - if pspec.spec.prewarm_lfc_on_startup { + if pspec.spec.autoprewarm { self.prewarm_lfc(); } Ok(()) @@ -1307,8 +1307,8 @@ impl ComputeNode { } /// Start and stop a postgres process to warm up the VM for startup. - pub fn prewarm_postgres(&self) -> Result<()> { - info!("prewarming"); + pub fn prewarm_postgres_vm_memory(&self) -> Result<()> { + info!("prewarming VM memory"); // Create pgdata let pgdata = &format!("{}.warmup", self.params.pgdata); @@ -1350,7 +1350,7 @@ impl ComputeNode { kill(pm_pid, Signal::SIGQUIT)?; info!("sent SIGQUIT signal"); pg.wait()?; - info!("done prewarming"); + info!("done prewarming vm memory"); // clean up let _ok = fs::remove_dir_all(pgdata); diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index 04b6ed2256..2b865c75d0 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -30,7 +30,7 @@ mod pg_helpers_tests { r#"fsync = off wal_level = logical hot_standby = on -prewarm_lfc_on_startup = off +autoprewarm = off neon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501' wal_log_hints = on log_connections = on diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 708745446d..774a0053f8 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -747,7 +747,7 @@ impl Endpoint { logs_export_host: None::, endpoint_storage_addr: Some(endpoint_storage_addr), endpoint_storage_token: Some(endpoint_storage_token), - prewarm_lfc_on_startup: false, + autoprewarm: false, }; // this strange code is needed to support respec() in tests diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 09b550b96c..9b7cf43bb9 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -178,9 +178,9 @@ pub struct ComputeSpec { /// JWT for authorizing requests to endpoint storage service pub endpoint_storage_token: Option, - /// If true, download LFC state from endpoint_storage and pass it to Postgres on startup + /// Download LFC state from endpoint_storage and pass it to Postgres on startup #[serde(default)] - pub prewarm_lfc_on_startup: bool, + pub autoprewarm: bool, } /// Feature flag to signal `compute_ctl` to enable certain experimental functionality. diff --git a/libs/compute_api/tests/cluster_spec.json b/libs/compute_api/tests/cluster_spec.json index 30e788a601..2dd2aae015 100644 --- a/libs/compute_api/tests/cluster_spec.json +++ b/libs/compute_api/tests/cluster_spec.json @@ -85,7 +85,7 @@ "vartype": "bool" }, { - "name": "prewarm_lfc_on_startup", + "name": "autoprewarm", "value": "off", "vartype": "bool" }, From c567ed0de0837bc3923d39ca8706ac353a2ba50b Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 4 Jun 2025 14:41:42 +0800 Subject: [PATCH 08/25] feat(pageserver): feature flag counter metrics (#12112) ## Problem Part of https://github.com/neondatabase/neon/issues/11813 ## Summary of changes Add a counter on the feature evaluation outcome and we will set up alerts for too many failed evaluations in the future. Signed-off-by: Alex Chi Z --- libs/posthog_client_lite/src/lib.rs | 10 ++++++++ pageserver/src/feature_resolver.rs | 36 +++++++++++++++++++++++++---- pageserver/src/metrics.rs | 9 ++++++++ 3 files changed, 50 insertions(+), 5 deletions(-) diff --git a/libs/posthog_client_lite/src/lib.rs b/libs/posthog_client_lite/src/lib.rs index ff12051196..281a8f8011 100644 --- a/libs/posthog_client_lite/src/lib.rs +++ b/libs/posthog_client_lite/src/lib.rs @@ -22,6 +22,16 @@ pub enum PostHogEvaluationError { Internal(String), } +impl PostHogEvaluationError { + pub fn as_variant_str(&self) -> &'static str { + match self { + PostHogEvaluationError::NotAvailable(_) => "not_available", + PostHogEvaluationError::NoConditionGroupMatched => "no_condition_group_matched", + PostHogEvaluationError::Internal(_) => "internal", + } + } +} + #[derive(Deserialize)] pub struct LocalEvaluationResponse { pub flags: Vec, diff --git a/pageserver/src/feature_resolver.rs b/pageserver/src/feature_resolver.rs index 7e31b930d0..7463186c06 100644 --- a/pageserver/src/feature_resolver.rs +++ b/pageserver/src/feature_resolver.rs @@ -6,7 +6,7 @@ use posthog_client_lite::{ use tokio_util::sync::CancellationToken; use utils::id::TenantId; -use crate::config::PageServerConf; +use crate::{config::PageServerConf, metrics::FEATURE_FLAG_EVALUATION}; #[derive(Clone)] pub struct FeatureResolver { @@ -55,11 +55,24 @@ impl FeatureResolver { tenant_id: TenantId, ) -> Result { if let Some(inner) = &self.inner { - inner.feature_store().evaluate_multivariate( + let res = inner.feature_store().evaluate_multivariate( flag_key, &tenant_id.to_string(), &HashMap::new(), - ) + ); + match &res { + Ok(value) => { + FEATURE_FLAG_EVALUATION + .with_label_values(&[flag_key, "ok", value]) + .inc(); + } + Err(e) => { + FEATURE_FLAG_EVALUATION + .with_label_values(&[flag_key, "error", e.as_variant_str()]) + .inc(); + } + } + res } else { Err(PostHogEvaluationError::NotAvailable( "PostHog integration is not enabled".to_string(), @@ -80,11 +93,24 @@ impl FeatureResolver { tenant_id: TenantId, ) -> Result<(), PostHogEvaluationError> { if let Some(inner) = &self.inner { - inner.feature_store().evaluate_boolean( + let res = inner.feature_store().evaluate_boolean( flag_key, &tenant_id.to_string(), &HashMap::new(), - ) + ); + match &res { + Ok(()) => { + FEATURE_FLAG_EVALUATION + .with_label_values(&[flag_key, "ok", "true"]) + .inc(); + } + Err(e) => { + FEATURE_FLAG_EVALUATION + .with_label_values(&[flag_key, "error", e.as_variant_str()]) + .inc(); + } + } + res } else { Err(PostHogEvaluationError::NotAvailable( "PostHog integration is not enabled".to_string(), diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 4bef9b44a7..3173ab221f 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -446,6 +446,15 @@ static PAGE_CACHE_ERRORS: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +pub(crate) static FEATURE_FLAG_EVALUATION: Lazy = Lazy::new(|| { + register_counter_vec!( + "pageserver_feature_flag_evaluation", + "Number of times a feature flag is evaluated", + &["flag_key", "status", "value"], + ) + .unwrap() +}); + #[derive(IntoStaticStr)] #[strum(serialize_all = "kebab_case")] pub(crate) enum PageCacheErrorKind { From 208cbd52d44fb5dd322e6190cdd1ab25e8160d24 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Wed, 4 Jun 2025 11:57:31 +0200 Subject: [PATCH 09/25] Add postgis to the test image (#11672) ## Problem We don't currently run tests for PostGIS in our test environment. ## Summary of Changes - Added PostGIS test support for PostgreSQL v16 and v17 - Configured different PostGIS versions based on PostgreSQL version: - PostgreSQL v17: PostGIS 3.5.0 - PostgreSQL v14/v15/v16: PostGIS 3.3.3 - Added necessary test scripts and configurations This ensures our PostgreSQL implementation remains compatible with this widely-used extension. --------- Co-authored-by: Alexander Bayandin Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- compute/compute-node.Dockerfile | 18 +- docker-compose/compute_wrapper/Dockerfile | 2 +- docker-compose/docker-compose.yml | 7 +- docker-compose/docker_compose_test.sh | 11 +- .../ext-src/postgis-src/README-Neon.md | 70 ++++++ .../ext-src/postgis-src/neon-test.sh | 9 + .../postgis-src/postgis-no-upgrade-test.patch | 21 ++ .../postgis-src/postgis-regular-v16.patch | 198 ++++++++++++++++ .../postgis-src/postgis-regular-v17.patch | 218 ++++++++++++++++++ .../postgis-src/raster_outdb_template.sql | 46 ++++ .../ext-src/postgis-src/regular-test.sh | 17 ++ docker-compose/run-tests.sh | 4 + 12 files changed, 615 insertions(+), 6 deletions(-) create mode 100644 docker-compose/ext-src/postgis-src/README-Neon.md create mode 100755 docker-compose/ext-src/postgis-src/neon-test.sh create mode 100644 docker-compose/ext-src/postgis-src/postgis-no-upgrade-test.patch create mode 100644 docker-compose/ext-src/postgis-src/postgis-regular-v16.patch create mode 100644 docker-compose/ext-src/postgis-src/postgis-regular-v17.patch create mode 100644 docker-compose/ext-src/postgis-src/raster_outdb_template.sql create mode 100755 docker-compose/ext-src/postgis-src/regular-test.sh diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 2afdde0cfa..6ece9c6566 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -297,6 +297,7 @@ RUN ./autogen.sh && \ ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ + make staged-install && \ cd extensions/postgis && \ make clean && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -1842,10 +1843,25 @@ RUN make PG_VERSION="${PG_VERSION:?}" -C compute FROM pg-build AS extension-tests ARG PG_VERSION +# This is required for the PostGIS test +RUN apt-get update && case $DEBIAN_VERSION in \ + bullseye) \ + apt-get install -y libproj19 libgdal28 time; \ + ;; \ + bookworm) \ + apt-get install -y libgdal32 libproj25 time; \ + ;; \ + *) \ + echo "Unknown Debian version ${DEBIAN_VERSION}" && exit 1 \ + ;; \ + esac + COPY docker-compose/ext-src/ /ext-src/ COPY --from=pg-build /postgres /postgres -#COPY --from=postgis-src /ext-src/ /ext-src/ +COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=postgis-build /ext-src/postgis-src /ext-src/postgis-src +COPY --from=postgis-build /sfcgal/* /usr COPY --from=plv8-src /ext-src/ /ext-src/ COPY --from=h3-pg-src /ext-src/h3-pg-src /ext-src/h3-pg-src COPY --from=postgresql-unit-src /ext-src/ /ext-src/ diff --git a/docker-compose/compute_wrapper/Dockerfile b/docker-compose/compute_wrapper/Dockerfile index 9ef831a9cd..b89e69c650 100644 --- a/docker-compose/compute_wrapper/Dockerfile +++ b/docker-compose/compute_wrapper/Dockerfile @@ -13,6 +13,6 @@ RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ jq \ netcat-openbsd #This is required for the pg_hintplan test -RUN mkdir -p /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw && chown postgres /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw +RUN mkdir -p /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw /ext-src/postgis-src/ && chown postgres /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw /ext-src/postgis-src USER postgres diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index fd3ad1fffc..2519b75c7f 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -186,13 +186,14 @@ services: neon-test-extensions: profiles: ["test-extensions"] - image: ${REPOSITORY:-ghcr.io/neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TEST_EXTENSIONS_TAG:-${TAG:-latest}} + image: ${REPOSITORY:-ghcr.io/neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-${PG_VERSION:-16}}:${TEST_EXTENSIONS_TAG:-${TAG:-latest}} environment: - - PGPASSWORD=cloud_admin + - PGUSER=${PGUSER:-cloud_admin} + - PGPASSWORD=${PGPASSWORD:-cloud_admin} entrypoint: - "/bin/bash" - "-c" command: - - sleep 1800 + - sleep 3600 depends_on: - compute diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index 2645a49883..6edf90ca8d 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -54,6 +54,15 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do # It cannot be moved to Dockerfile now because the database directory is created after the start of the container echo Adding dummy config docker compose exec compute touch /var/db/postgres/compute/compute_ctl_temp_override.conf + # Prepare for the PostGIS test + docker compose exec compute mkdir -p /tmp/pgis_reg/pgis_reg_tmp + TMPDIR=$(mktemp -d) + docker compose cp neon-test-extensions:/ext-src/postgis-src/raster/test "${TMPDIR}" + docker compose cp neon-test-extensions:/ext-src/postgis-src/regress/00-regress-install "${TMPDIR}" + docker compose exec compute mkdir -p /ext-src/postgis-src/raster /ext-src/postgis-src/regress /ext-src/postgis-src/regress/00-regress-install + docker compose cp "${TMPDIR}/test" compute:/ext-src/postgis-src/raster/test + docker compose cp "${TMPDIR}/00-regress-install" compute:/ext-src/postgis-src/regress + rm -rf "${TMPDIR}" # The following block copies the files for the pg_hintplan test to the compute node for the extension test in an isolated docker-compose environment TMPDIR=$(mktemp -d) docker compose cp neon-test-extensions:/ext-src/pg_hint_plan-src/data "${TMPDIR}/data" @@ -68,7 +77,7 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do docker compose exec -T neon-test-extensions bash -c "(cd /postgres && patch -p1)" <"../compute/patches/contrib_pg${pg_version}.patch" # We are running tests now rm -f testout.txt testout_contrib.txt - docker compose exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,postgis-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src,rag_jina_reranker_v1_tiny_en-src,rag_bge_small_en_v15-src \ + docker compose exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src,rag_jina_reranker_v1_tiny_en-src,rag_bge_small_en_v15-src \ neon-test-extensions /run-tests.sh /ext-src | tee testout.txt && EXT_SUCCESS=1 || EXT_SUCCESS=0 docker compose exec -e SKIP=start-scripts,postgres_fdw,ltree_plpython,jsonb_plpython,jsonb_plperl,hstore_plpython,hstore_plperl,dblink,bool_plperl \ neon-test-extensions /run-tests.sh /postgres/contrib | tee testout_contrib.txt && CONTRIB_SUCCESS=1 || CONTRIB_SUCCESS=0 diff --git a/docker-compose/ext-src/postgis-src/README-Neon.md b/docker-compose/ext-src/postgis-src/README-Neon.md new file mode 100644 index 0000000000..5937fc782b --- /dev/null +++ b/docker-compose/ext-src/postgis-src/README-Neon.md @@ -0,0 +1,70 @@ +# PostGIS Testing in Neon + +This directory contains configuration files and patches for running PostGIS tests in the Neon database environment. + +## Overview + +PostGIS is a spatial database extension for PostgreSQL that adds support for geographic objects. Testing PostGIS compatibility ensures that Neon's modifications to PostgreSQL don't break compatibility with this critical extension. + +## PostGIS Versions + +- PostgreSQL v17: PostGIS 3.5.0 +- PostgreSQL v14/v15/v16: PostGIS 3.3.3 + +## Test Configuration + +The test setup includes: + +- `postgis-no-upgrade-test.patch`: Disables upgrade tests by removing the upgrade test section from regress/runtest.mk +- `postgis-regular-v16.patch`: Version-specific patch for PostgreSQL v16 +- `postgis-regular-v17.patch`: Version-specific patch for PostgreSQL v17 +- `regular-test.sh`: Script to run PostGIS tests as a regular user +- `neon-test.sh`: Script to handle version-specific test configurations +- `raster_outdb_template.sql`: Template for raster tests with explicit file paths + +## Excluded Tests + +**Important Note:** The test exclusions listed below are specifically for regular-user tests against staging instances. These exclusions are necessary because staging instances run with limited privileges and cannot perform operations requiring superuser access. Docker-compose based tests are not affected by these exclusions. + +### Tests Requiring Superuser Permissions + +These tests cannot be run as a regular user: +- `estimatedextent` +- `regress/core/legacy` +- `regress/core/typmod` +- `regress/loader/TestSkipANALYZE` +- `regress/loader/TestANALYZE` + +### Tests Requiring Filesystem Access + +These tests need direct filesystem access that is only possible for superusers: +- `loader/load_outdb` + +### Tests with Flaky Results + +These tests have assumptions that don't always hold true: +- `regress/core/computed_columns` - Assumes computed columns always outperform alternatives, which is not consistently true + +### Tests Requiring Tunable Parameter Modifications + +These tests attempt to modify the `postgis.gdal_enabled_drivers` parameter, which is only accessible to superusers: +- `raster/test/regress/rt_wkb` +- `raster/test/regress/rt_addband` +- `raster/test/regress/rt_setbandpath` +- `raster/test/regress/rt_fromgdalraster` +- `raster/test/regress/rt_asgdalraster` +- `raster/test/regress/rt_astiff` +- `raster/test/regress/rt_asjpeg` +- `raster/test/regress/rt_aspng` +- `raster/test/regress/permitted_gdal_drivers` +- Loader tests: `BasicOutDB`, `Tiled10x10`, `Tiled10x10Copy`, `Tiled8x8`, `TiledAuto`, `TiledAutoSkipNoData`, `TiledAutoCopyn` + +### Topology Tests (v17 only) +- `populate_topology_layer` +- `renametopogeometrycolumn` + +## Other Modifications + +- Binary.sql tests are modified to use explicit file paths +- Server-side SQL COPY commands (which require superuser privileges) are converted to client-side `\copy` commands +- Upgrade tests are disabled diff --git a/docker-compose/ext-src/postgis-src/neon-test.sh b/docker-compose/ext-src/postgis-src/neon-test.sh new file mode 100755 index 0000000000..2866649a1b --- /dev/null +++ b/docker-compose/ext-src/postgis-src/neon-test.sh @@ -0,0 +1,9 @@ +#!/bin/bash +set -ex +cd "$(dirname "$0")" +if [[ ${PG_VERSION} = v17 ]]; then + sed -i '/computed_columns/d' regress/core/tests.mk +fi +patch -p1 =" 120),1) +- TESTS += \ +- $(top_srcdir)/regress/core/computed_columns +-endif +- + ifeq ($(shell expr "$(POSTGIS_GEOS_VERSION)" ">=" 30700),1) + # GEOS-3.7 adds: + # ST_FrechetDistance +diff --git a/regress/loader/tests.mk b/regress/loader/tests.mk +index 1fc77ac..c3cb9de 100644 +--- a/regress/loader/tests.mk ++++ b/regress/loader/tests.mk +@@ -38,7 +38,5 @@ TESTS += \ + $(top_srcdir)/regress/loader/Latin1 \ + $(top_srcdir)/regress/loader/Latin1-implicit \ + $(top_srcdir)/regress/loader/mfile \ +- $(top_srcdir)/regress/loader/TestSkipANALYZE \ +- $(top_srcdir)/regress/loader/TestANALYZE \ + $(top_srcdir)/regress/loader/CharNoWidth + +diff --git a/regress/run_test.pl b/regress/run_test.pl +index 0ec5b2d..1c331f4 100755 +--- a/regress/run_test.pl ++++ b/regress/run_test.pl +@@ -147,7 +147,6 @@ $ENV{"LANG"} = "C"; + # Add locale info to the psql options + # Add pg12 precision suppression + my $PGOPTIONS = $ENV{"PGOPTIONS"}; +-$PGOPTIONS .= " -c lc_messages=C"; + $PGOPTIONS .= " -c client_min_messages=NOTICE"; + $PGOPTIONS .= " -c extra_float_digits=0"; + $ENV{"PGOPTIONS"} = $PGOPTIONS; diff --git a/docker-compose/ext-src/postgis-src/postgis-regular-v17.patch b/docker-compose/ext-src/postgis-src/postgis-regular-v17.patch new file mode 100644 index 0000000000..f4a9d83478 --- /dev/null +++ b/docker-compose/ext-src/postgis-src/postgis-regular-v17.patch @@ -0,0 +1,218 @@ +diff --git a/raster/test/regress/tests.mk b/raster/test/regress/tests.mk +index 00918e1..7e2b6cd 100644 +--- a/raster/test/regress/tests.mk ++++ b/raster/test/regress/tests.mk +@@ -17,9 +17,7 @@ override RUNTESTFLAGS_INTERNAL := \ + $(RUNTESTFLAGS_INTERNAL) \ + --after-upgrade-script $(top_srcdir)/raster/test/regress/hooks/hook-after-upgrade-raster.sql + +-RASTER_TEST_FIRST = \ +- $(top_srcdir)/raster/test/regress/check_gdal \ +- $(top_srcdir)/raster/test/regress/loader/load_outdb ++RASTER_TEST_FIRST = + + RASTER_TEST_LAST = \ + $(top_srcdir)/raster/test/regress/clean +@@ -33,9 +31,7 @@ RASTER_TEST_IO = \ + + RASTER_TEST_BASIC_FUNC = \ + $(top_srcdir)/raster/test/regress/rt_bytea \ +- $(top_srcdir)/raster/test/regress/rt_wkb \ + $(top_srcdir)/raster/test/regress/box3d \ +- $(top_srcdir)/raster/test/regress/rt_addband \ + $(top_srcdir)/raster/test/regress/rt_band \ + $(top_srcdir)/raster/test/regress/rt_tile + +@@ -73,16 +69,10 @@ RASTER_TEST_BANDPROPS = \ + $(top_srcdir)/raster/test/regress/rt_neighborhood \ + $(top_srcdir)/raster/test/regress/rt_nearestvalue \ + $(top_srcdir)/raster/test/regress/rt_pixelofvalue \ +- $(top_srcdir)/raster/test/regress/rt_polygon \ +- $(top_srcdir)/raster/test/regress/rt_setbandpath ++ $(top_srcdir)/raster/test/regress/rt_polygon + + RASTER_TEST_UTILITY = \ + $(top_srcdir)/raster/test/regress/rt_utility \ +- $(top_srcdir)/raster/test/regress/rt_fromgdalraster \ +- $(top_srcdir)/raster/test/regress/rt_asgdalraster \ +- $(top_srcdir)/raster/test/regress/rt_astiff \ +- $(top_srcdir)/raster/test/regress/rt_asjpeg \ +- $(top_srcdir)/raster/test/regress/rt_aspng \ + $(top_srcdir)/raster/test/regress/rt_reclass \ + $(top_srcdir)/raster/test/regress/rt_gdalwarp \ + $(top_srcdir)/raster/test/regress/rt_gdalcontour \ +@@ -120,21 +110,13 @@ RASTER_TEST_SREL = \ + + RASTER_TEST_BUGS = \ + $(top_srcdir)/raster/test/regress/bug_test_car5 \ +- $(top_srcdir)/raster/test/regress/permitted_gdal_drivers \ + $(top_srcdir)/raster/test/regress/tickets + + RASTER_TEST_LOADER = \ + $(top_srcdir)/raster/test/regress/loader/Basic \ + $(top_srcdir)/raster/test/regress/loader/Projected \ + $(top_srcdir)/raster/test/regress/loader/BasicCopy \ +- $(top_srcdir)/raster/test/regress/loader/BasicFilename \ +- $(top_srcdir)/raster/test/regress/loader/BasicOutDB \ +- $(top_srcdir)/raster/test/regress/loader/Tiled10x10 \ +- $(top_srcdir)/raster/test/regress/loader/Tiled10x10Copy \ +- $(top_srcdir)/raster/test/regress/loader/Tiled8x8 \ +- $(top_srcdir)/raster/test/regress/loader/TiledAuto \ +- $(top_srcdir)/raster/test/regress/loader/TiledAutoSkipNoData \ +- $(top_srcdir)/raster/test/regress/loader/TiledAutoCopyn ++ $(top_srcdir)/raster/test/regress/loader/BasicFilename + + RASTER_TESTS := $(RASTER_TEST_FIRST) \ + $(RASTER_TEST_METADATA) $(RASTER_TEST_IO) $(RASTER_TEST_BASIC_FUNC) \ +diff --git a/regress/core/binary.sql b/regress/core/binary.sql +index 7a36b65..ad78fc7 100644 +--- a/regress/core/binary.sql ++++ b/regress/core/binary.sql +@@ -1,4 +1,5 @@ + SET client_min_messages TO warning; ++ + CREATE SCHEMA tm; + + CREATE TABLE tm.geoms (id serial, g geometry); +@@ -31,24 +32,39 @@ SELECT st_force4d(g) FROM tm.geoms WHERE id < 15 ORDER BY id; + INSERT INTO tm.geoms(g) + SELECT st_setsrid(g,4326) FROM tm.geoms ORDER BY id; + +-COPY tm.geoms TO :tmpfile WITH BINARY; ++-- define temp file path ++\set tmpfile '/tmp/postgis_binary_test.dat' ++ ++-- export ++\set command '\\copy tm.geoms TO ':tmpfile' WITH (FORMAT BINARY)' ++:command ++ ++-- import + CREATE TABLE tm.geoms_in AS SELECT * FROM tm.geoms LIMIT 0; +-COPY tm.geoms_in FROM :tmpfile WITH BINARY; +-SELECT 'geometry', count(*) FROM tm.geoms_in i, tm.geoms o WHERE i.id = o.id +- AND ST_OrderingEquals(i.g, o.g); ++\set command '\\copy tm.geoms_in FROM ':tmpfile' WITH (FORMAT BINARY)' ++:command ++ ++SELECT 'geometry', count(*) FROM tm.geoms_in i, tm.geoms o ++WHERE i.id = o.id AND ST_OrderingEquals(i.g, o.g); + + CREATE TABLE tm.geogs AS SELECT id,g::geography FROM tm.geoms + WHERE geometrytype(g) NOT LIKE '%CURVE%' + AND geometrytype(g) NOT LIKE '%CIRCULAR%' + AND geometrytype(g) NOT LIKE '%SURFACE%' + AND geometrytype(g) NOT LIKE 'TRIANGLE%' +- AND geometrytype(g) NOT LIKE 'TIN%' +-; ++ AND geometrytype(g) NOT LIKE 'TIN%'; + +-COPY tm.geogs TO :tmpfile WITH BINARY; ++-- export ++\set command '\\copy tm.geogs TO ':tmpfile' WITH (FORMAT BINARY)' ++:command ++ ++-- import + CREATE TABLE tm.geogs_in AS SELECT * FROM tm.geogs LIMIT 0; +-COPY tm.geogs_in FROM :tmpfile WITH BINARY; +-SELECT 'geometry', count(*) FROM tm.geogs_in i, tm.geogs o WHERE i.id = o.id +- AND ST_OrderingEquals(i.g::geometry, o.g::geometry); ++\set command '\\copy tm.geogs_in FROM ':tmpfile' WITH (FORMAT BINARY)' ++:command ++ ++SELECT 'geometry', count(*) FROM tm.geogs_in i, tm.geogs o ++WHERE i.id = o.id AND ST_OrderingEquals(i.g::geometry, o.g::geometry); + + DROP SCHEMA tm CASCADE; ++ +diff --git a/regress/core/tests.mk b/regress/core/tests.mk +index 9e05244..a63a3e1 100644 +--- a/regress/core/tests.mk ++++ b/regress/core/tests.mk +@@ -16,14 +16,13 @@ POSTGIS_PGSQL_VERSION=170 + POSTGIS_GEOS_VERSION=31101 + HAVE_JSON=yes + HAVE_SPGIST=yes +-INTERRUPTTESTS=yes ++INTERRUPTTESTS=no + + current_dir := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) + + RUNTESTFLAGS_INTERNAL += \ + --before-upgrade-script $(top_srcdir)/regress/hooks/hook-before-upgrade.sql \ + --after-upgrade-script $(top_srcdir)/regress/hooks/hook-after-upgrade.sql \ +- --after-create-script $(top_srcdir)/regress/hooks/hook-after-create.sql \ + --before-uninstall-script $(top_srcdir)/regress/hooks/hook-before-uninstall.sql + + TESTS += \ +@@ -40,7 +39,6 @@ TESTS += \ + $(top_srcdir)/regress/core/dumppoints \ + $(top_srcdir)/regress/core/dumpsegments \ + $(top_srcdir)/regress/core/empty \ +- $(top_srcdir)/regress/core/estimatedextent \ + $(top_srcdir)/regress/core/forcecurve \ + $(top_srcdir)/regress/core/flatgeobuf \ + $(top_srcdir)/regress/core/frechet \ +@@ -60,7 +58,6 @@ TESTS += \ + $(top_srcdir)/regress/core/out_marc21 \ + $(top_srcdir)/regress/core/in_encodedpolyline \ + $(top_srcdir)/regress/core/iscollection \ +- $(top_srcdir)/regress/core/legacy \ + $(top_srcdir)/regress/core/letters \ + $(top_srcdir)/regress/core/lwgeom_regress \ + $(top_srcdir)/regress/core/measures \ +@@ -119,7 +116,6 @@ TESTS += \ + $(top_srcdir)/regress/core/temporal_knn \ + $(top_srcdir)/regress/core/tickets \ + $(top_srcdir)/regress/core/twkb \ +- $(top_srcdir)/regress/core/typmod \ + $(top_srcdir)/regress/core/wkb \ + $(top_srcdir)/regress/core/wkt \ + $(top_srcdir)/regress/core/wmsservers \ +@@ -143,8 +139,7 @@ TESTS += \ + $(top_srcdir)/regress/core/oriented_envelope \ + $(top_srcdir)/regress/core/point_coordinates \ + $(top_srcdir)/regress/core/out_geojson \ +- $(top_srcdir)/regress/core/wrapx \ +- $(top_srcdir)/regress/core/computed_columns ++ $(top_srcdir)/regress/core/wrapx + + # Slow slow tests + TESTS_SLOW = \ +diff --git a/regress/loader/tests.mk b/regress/loader/tests.mk +index ac4f8ad..4bad4fc 100644 +--- a/regress/loader/tests.mk ++++ b/regress/loader/tests.mk +@@ -38,7 +38,5 @@ TESTS += \ + $(top_srcdir)/regress/loader/Latin1 \ + $(top_srcdir)/regress/loader/Latin1-implicit \ + $(top_srcdir)/regress/loader/mfile \ +- $(top_srcdir)/regress/loader/TestSkipANALYZE \ +- $(top_srcdir)/regress/loader/TestANALYZE \ + $(top_srcdir)/regress/loader/CharNoWidth \ + +diff --git a/regress/run_test.pl b/regress/run_test.pl +index cac4b2e..4c7c82b 100755 +--- a/regress/run_test.pl ++++ b/regress/run_test.pl +@@ -238,7 +238,6 @@ $ENV{"LANG"} = "C"; + # Add locale info to the psql options + # Add pg12 precision suppression + my $PGOPTIONS = $ENV{"PGOPTIONS"}; +-$PGOPTIONS .= " -c lc_messages=C"; + $PGOPTIONS .= " -c client_min_messages=NOTICE"; + $PGOPTIONS .= " -c extra_float_digits=0"; + $ENV{"PGOPTIONS"} = $PGOPTIONS; +diff --git a/topology/test/tests.mk b/topology/test/tests.mk +index cbe2633..2c7c18f 100644 +--- a/topology/test/tests.mk ++++ b/topology/test/tests.mk +@@ -46,9 +46,7 @@ TESTS += \ + $(top_srcdir)/topology/test/regress/legacy_query.sql \ + $(top_srcdir)/topology/test/regress/legacy_validate.sql \ + $(top_srcdir)/topology/test/regress/polygonize.sql \ +- $(top_srcdir)/topology/test/regress/populate_topology_layer.sql \ + $(top_srcdir)/topology/test/regress/removeunusedprimitives.sql \ +- $(top_srcdir)/topology/test/regress/renametopogeometrycolumn.sql \ + $(top_srcdir)/topology/test/regress/renametopology.sql \ + $(top_srcdir)/topology/test/regress/share_sequences.sql \ + $(top_srcdir)/topology/test/regress/sqlmm.sql \ diff --git a/docker-compose/ext-src/postgis-src/raster_outdb_template.sql b/docker-compose/ext-src/postgis-src/raster_outdb_template.sql new file mode 100644 index 0000000000..16232f28dd --- /dev/null +++ b/docker-compose/ext-src/postgis-src/raster_outdb_template.sql @@ -0,0 +1,46 @@ +-- +-- PostgreSQL database dump +-- + +-- Dumped from database version 17.4 +-- Dumped by pg_dump version 17.4 + +SET statement_timeout = 0; +SET lock_timeout = 0; +SET idle_in_transaction_session_timeout = 0; +SET transaction_timeout = 0; +SET client_encoding = 'UTF8'; +SET standard_conforming_strings = on; +SELECT pg_catalog.set_config('search_path', '', false); +SET check_function_bodies = false; +SET xmloption = content; +SET client_min_messages = warning; + +-- +-- Name: raster_outdb_template; Type: TABLE; Schema: public; Owner: cloud_admin +-- + +CREATE TABLE public.raster_outdb_template ( + rid integer, + rast public.raster +); + + +ALTER TABLE public.raster_outdb_template OWNER TO cloud_admin; + +-- +-- Data for Name: raster_outdb_template; Type: TABLE DATA; Schema: public; Owner: cloud_admin +-- + +COPY public.raster_outdb_template (rid, rast) FROM stdin; +1 0100000300000000000000F03F000000000000F0BF0000000000000000000000000000000000000000000000000000000000000000000000005A0032008400002F6578742D7372632F706F73746769732D7372632F726567726573732F2E2E2F7261737465722F746573742F726567726573732F6C6F616465722F746573747261737465722E746966008400012F6578742D7372632F706F73746769732D7372632F726567726573732F2E2E2F7261737465722F746573742F726567726573732F6C6F616465722F746573747261737465722E746966008400022F6578742D7372632F706F73746769732D7372632F726567726573732F2E2E2F7261737465722F746573742F726567726573732F6C6F616465722F746573747261737465722E74696600 +2 0100000300000000000000F03F000000000000F0BF0000000000000000000000000000000000000000000000000000000000000000000000005A0032008400002F6578742D7372632F706F73746769732D7372632F726567726573732F2E2E2F7261737465722F746573742F726567726573732F6C6F616465722F746573747261737465722E746966008400012F6578742D7372632F706F73746769732D7372632F726567726573732F2E2E2F7261737465722F746573742F726567726573732F6C6F616465722F746573747261737465722E746966008400022F6578742D7372632F706F73746769732D7372632F726567726573732F2E2E2F7261737465722F746573742F726567726573732F6C6F616465722F746573747261737465722E74696600 +3 0100000200000000000000F03F000000000000F0BF0000000000000000000000000000000000000000000000000000000000000000000000005A00320044000101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101018400012F6578742D7372632F706F73746769732D7372632F726567726573732F2E2E2F7261737465722F746573742F726567726573732F6C6F616465722F746573747261737465722E74696600 +4 0100000200000000000000F03F000000000000F0BF0000000000000000000000000000000000000000000000000000000000000000000000005A003200C4FF012F6578742D7372632F706F73746769732D7372632F726567726573732F2E2E2F7261737465722F746573742F726567726573732F6C6F616465722F746573747261737465722E746966004400010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101 +\. + + +-- +-- PostgreSQL database dump complete +-- + diff --git a/docker-compose/ext-src/postgis-src/regular-test.sh b/docker-compose/ext-src/postgis-src/regular-test.sh new file mode 100755 index 0000000000..4b0b929946 --- /dev/null +++ b/docker-compose/ext-src/postgis-src/regular-test.sh @@ -0,0 +1,17 @@ +#!/bin/bash +set -ex +cd "$(dirname "${0}")" +dropdb --if-exist contrib_regression +createdb contrib_regression +psql -d contrib_regression -c "ALTER DATABASE contrib_regression SET TimeZone='UTC'" \ + -c "ALTER DATABASE contrib_regression SET DateStyle='ISO, MDY'" \ + -c "CREATE EXTENSION postgis SCHEMA public" \ + -c "CREATE EXTENSION postgis_topology" \ + -c "CREATE EXTENSION postgis_tiger_geocoder CASCADE" \ + -c "CREATE EXTENSION postgis_raster SCHEMA public" \ + -c "CREATE EXTENSION postgis_sfcgal SCHEMA public" +patch -p1 Date: Wed, 4 Jun 2025 11:44:23 +0100 Subject: [PATCH 10/25] pageserver: make import job max byte range size configurable (#12117) ## Problem We want to repro an OOM situation, but large partial reads are required. ## Summary of Changes Make the max partial read size configurable for import jobs. --- libs/pageserver_api/src/config.rs | 3 ++ .../src/tenant/timeline/import_pgdata/flow.rs | 28 +++++++++++++------ test_runner/fixtures/neon_fixtures.py | 1 + 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 237833b9de..46903965b1 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -330,6 +330,8 @@ pub struct TimelineImportConfig { pub import_job_concurrency: NonZeroUsize, pub import_job_soft_size_limit: NonZeroUsize, pub import_job_checkpoint_threshold: NonZeroUsize, + /// Max size of the remote storage partial read done by any job + pub import_job_max_byte_range_size: NonZeroUsize, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -735,6 +737,7 @@ impl Default for ConfigToml { import_job_concurrency: NonZeroUsize::new(32).unwrap(), import_job_soft_size_limit: NonZeroUsize::new(256 * 1024 * 1024).unwrap(), import_job_checkpoint_threshold: NonZeroUsize::new(32).unwrap(), + import_job_max_byte_range_size: NonZeroUsize::new(4 * 1024 * 1024).unwrap(), }, basebackup_cache_config: None, posthog_config: None, diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs index 2ec9d86720..e003bb6810 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs @@ -100,6 +100,7 @@ async fn run_v1( .unwrap(), import_job_concurrency: base.import_job_concurrency, import_job_checkpoint_threshold: base.import_job_checkpoint_threshold, + import_job_max_byte_range_size: base.import_job_max_byte_range_size, } } None => timeline.conf.timeline_import_config.clone(), @@ -441,6 +442,7 @@ impl Plan { let mut last_completed_job_idx = start_after_job_idx.unwrap_or(0); let checkpoint_every: usize = import_config.import_job_checkpoint_threshold.into(); + let max_byte_range_size: usize = import_config.import_job_max_byte_range_size.into(); // Run import jobs concurrently up to the limit specified by the pageserver configuration. // Note that we process completed futures in the oreder of insertion. This will be the @@ -456,7 +458,7 @@ impl Plan { work.push_back(tokio::task::spawn(async move { let _permit = permit; - let res = job.run(job_timeline, &ctx).await; + let res = job.run(job_timeline, max_byte_range_size, &ctx).await; (job_idx, res) })); }, @@ -679,6 +681,7 @@ trait ImportTask { async fn doit( self, layer_writer: &mut ImageLayerWriter, + max_byte_range_size: usize, ctx: &RequestContext, ) -> anyhow::Result; } @@ -715,6 +718,7 @@ impl ImportTask for ImportSingleKeyTask { async fn doit( self, layer_writer: &mut ImageLayerWriter, + _max_byte_range_size: usize, ctx: &RequestContext, ) -> anyhow::Result { layer_writer.put_image(self.key, self.buf, ctx).await?; @@ -768,10 +772,9 @@ impl ImportTask for ImportRelBlocksTask { async fn doit( self, layer_writer: &mut ImageLayerWriter, + max_byte_range_size: usize, ctx: &RequestContext, ) -> anyhow::Result { - const MAX_BYTE_RANGE_SIZE: usize = 4 * 1024 * 1024; - debug!("Importing relation file"); let (rel_tag, start_blk) = self.key_range.start.to_rel_block()?; @@ -796,7 +799,7 @@ impl ImportTask for ImportRelBlocksTask { assert_eq!(key.len(), 1); assert!(!acc.is_empty()); assert!(acc_end > acc_start); - if acc_end == start && end - acc_start <= MAX_BYTE_RANGE_SIZE { + if acc_end == start && end - acc_start <= max_byte_range_size { acc.push(key.pop().unwrap()); Ok((acc, acc_start, end)) } else { @@ -860,6 +863,7 @@ impl ImportTask for ImportSlruBlocksTask { async fn doit( self, layer_writer: &mut ImageLayerWriter, + _max_byte_range_size: usize, ctx: &RequestContext, ) -> anyhow::Result { debug!("Importing SLRU segment file {}", self.path); @@ -906,12 +910,13 @@ impl ImportTask for AnyImportTask { async fn doit( self, layer_writer: &mut ImageLayerWriter, + max_byte_range_size: usize, ctx: &RequestContext, ) -> anyhow::Result { match self { - Self::SingleKey(t) => t.doit(layer_writer, ctx).await, - Self::RelBlocks(t) => t.doit(layer_writer, ctx).await, - Self::SlruBlocks(t) => t.doit(layer_writer, ctx).await, + Self::SingleKey(t) => t.doit(layer_writer, max_byte_range_size, ctx).await, + Self::RelBlocks(t) => t.doit(layer_writer, max_byte_range_size, ctx).await, + Self::SlruBlocks(t) => t.doit(layer_writer, max_byte_range_size, ctx).await, } } } @@ -952,7 +957,12 @@ impl ChunkProcessingJob { } } - async fn run(self, timeline: Arc, ctx: &RequestContext) -> anyhow::Result<()> { + async fn run( + self, + timeline: Arc, + max_byte_range_size: usize, + ctx: &RequestContext, + ) -> anyhow::Result<()> { let mut writer = ImageLayerWriter::new( timeline.conf, timeline.timeline_id, @@ -967,7 +977,7 @@ impl ChunkProcessingJob { let mut nimages = 0; for task in self.tasks { - nimages += task.doit(&mut writer, ctx).await?; + nimages += task.doit(&mut writer, max_byte_range_size, ctx).await?; } let resident_layer = if nimages > 0 { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index ab4885ce6b..19d12da5e3 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -423,6 +423,7 @@ class PageserverImportConfig: "import_job_concurrency": self.import_job_concurrency, "import_job_soft_size_limit": self.import_job_soft_size_limit, "import_job_checkpoint_threshold": self.import_job_checkpoint_threshold, + "import_job_max_byte_range_size": 4 * 1024 * 1024, # Pageserver default } return ("timeline_import_config", value) From e4ca3ac7458ca22dd97e89da859adee966b11580 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Wed, 4 Jun 2025 17:07:48 +0200 Subject: [PATCH 11/25] Fix codestyle for compute.sh for docker-compose (#12128) ## Problem The script `compute.sh` had a non-consistent coding style and didn't follow best practices for modern bash scripts ## Summary of changes The coding style was fixed to follow best practices. --- .../compute_wrapper/shell/compute.sh | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh index ab8d74d355..c8ca812bf9 100755 --- a/docker-compose/compute_wrapper/shell/compute.sh +++ b/docker-compose/compute_wrapper/shell/compute.sh @@ -1,18 +1,18 @@ -#!/bin/bash +#!/usr/bin/env bash set -eux # Generate a random tenant or timeline ID # # Takes a variable name as argument. The result is stored in that variable. generate_id() { - local -n resvar=$1 - printf -v resvar '%08x%08x%08x%08x' $SRANDOM $SRANDOM $SRANDOM $SRANDOM + local -n resvar=${1} + printf -v resvar '%08x%08x%08x%08x' ${SRANDOM} ${SRANDOM} ${SRANDOM} ${SRANDOM} } PG_VERSION=${PG_VERSION:-14} -CONFIG_FILE_ORG=/var/db/postgres/configs/config.json -CONFIG_FILE=/tmp/config.json +readonly CONFIG_FILE_ORG=/var/db/postgres/configs/config.json +readonly CONFIG_FILE=/tmp/config.json # Test that the first library path that the dynamic loader looks in is the path # that we use for custom compiled software @@ -20,17 +20,17 @@ first_path="$(ldconfig --verbose 2>/dev/null \ | grep --invert-match ^$'\t' \ | cut --delimiter=: --fields=1 \ | head --lines=1)" -test "$first_path" == '/usr/local/lib' +test "${first_path}" = '/usr/local/lib' echo "Waiting pageserver become ready." while ! nc -z pageserver 6400; do - sleep 1; + sleep 1 done echo "Page server is ready." -cp ${CONFIG_FILE_ORG} ${CONFIG_FILE} +cp "${CONFIG_FILE_ORG}" "${CONFIG_FILE}" - if [ -n "${TENANT_ID:-}" ] && [ -n "${TIMELINE_ID:-}" ]; then + if [[ -n "${TENANT_ID:-}" && -n "${TIMELINE_ID:-}" ]]; then tenant_id=${TENANT_ID} timeline_id=${TIMELINE_ID} else @@ -41,7 +41,7 @@ else "http://pageserver:9898/v1/tenant" ) tenant_id=$(curl "${PARAMS[@]}" | jq -r .[0].id) - if [ -z "${tenant_id}" ] || [ "${tenant_id}" = null ]; then + if [[ -z "${tenant_id}" || "${tenant_id}" = null ]]; then echo "Create a tenant" generate_id tenant_id PARAMS=( @@ -51,7 +51,7 @@ else "http://pageserver:9898/v1/tenant/${tenant_id}/location_config" ) result=$(curl "${PARAMS[@]}") - echo $result | jq . + printf '%s\n' "${result}" | jq . fi echo "Check if a timeline present" @@ -61,7 +61,7 @@ else "http://pageserver:9898/v1/tenant/${tenant_id}/timeline" ) timeline_id=$(curl "${PARAMS[@]}" | jq -r .[0].timeline_id) - if [ -z "${timeline_id}" ] || [ "${timeline_id}" = null ]; then + if [[ -z "${timeline_id}" || "${timeline_id}" = null ]]; then generate_id timeline_id PARAMS=( -sbf @@ -71,7 +71,7 @@ else "http://pageserver:9898/v1/tenant/${tenant_id}/timeline/" ) result=$(curl "${PARAMS[@]}") - echo $result | jq . + printf '%s\n' "${result}" | jq . fi fi @@ -82,10 +82,10 @@ else fi echo "Adding pgx_ulid" shared_libraries=$(jq -r '.spec.cluster.settings[] | select(.name=="shared_preload_libraries").value' ${CONFIG_FILE}) -sed -i "s/${shared_libraries}/${shared_libraries},${ulid_extension}/" ${CONFIG_FILE} +sed -i "s|${shared_libraries}|${shared_libraries},${ulid_extension}|" ${CONFIG_FILE} echo "Overwrite tenant id and timeline id in spec file" -sed -i "s/TENANT_ID/${tenant_id}/" ${CONFIG_FILE} -sed -i "s/TIMELINE_ID/${timeline_id}/" ${CONFIG_FILE} +sed -i "s|TENANT_ID|${tenant_id}|" ${CONFIG_FILE} +sed -i "s|TIMELINE_ID|${timeline_id}|" ${CONFIG_FILE} cat ${CONFIG_FILE} @@ -93,5 +93,5 @@ echo "Start compute node" /usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \ -C "postgresql://cloud_admin@localhost:55433/postgres" \ -b /usr/local/bin/postgres \ - --compute-id "compute-$RANDOM" \ - --config "$CONFIG_FILE" + --compute-id "compute-${RANDOM}" \ + --config "${CONFIG_FILE}" From e7d6f525b3fa4f3fb849b9d515f39b0d2e09bf7e Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 4 Jun 2025 17:14:17 +0200 Subject: [PATCH 12/25] pageserver: support `get_vectored_concurrent_io` with gRPC (#12131) ## Problem The gRPC page service doesn't respect `get_vectored_concurrent_io` and always uses sequential IO. ## Summary of changes Spawn a sidecar task for concurrent IO when enabled. Cancellation will be addressed separately. --- pageserver/src/bin/pageserver.rs | 1 + pageserver/src/page_service.rs | 20 ++++++++++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 337aa135dc..be7e634d4c 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -819,6 +819,7 @@ fn start_pageserver( tenant_manager.clone(), grpc_auth, otel_guard.as_ref().map(|g| g.dispatch.clone()), + conf.get_vectored_concurrent_io, grpc_listener, )?); } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 77e5f0a92b..4a1ddf09b5 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -178,6 +178,7 @@ pub fn spawn_grpc( tenant_manager: Arc, auth: Option>, perf_trace_dispatch: Option, + get_vectored_concurrent_io: GetVectoredConcurrentIo, listener: std::net::TcpListener, ) -> anyhow::Result { let cancel = CancellationToken::new(); @@ -214,6 +215,8 @@ pub fn spawn_grpc( let page_service_handler = GrpcPageServiceHandler { tenant_manager, ctx, + gate_guard: gate.enter().expect("gate was just created"), + get_vectored_concurrent_io, }; let observability_layer = ObservabilityLayer; @@ -497,10 +500,6 @@ async fn page_service_conn_main( } /// Page service connection handler. -/// -/// TODO: for gRPC, this will be shared by all requests from all connections. -/// Decompose it into global state and per-connection/request state, and make -/// libpq-specific options (e.g. pipelining) separate. struct PageServerHandler { auth: Option>, claims: Option, @@ -3362,6 +3361,8 @@ where pub struct GrpcPageServiceHandler { tenant_manager: Arc, ctx: RequestContext, + gate_guard: GateGuard, + get_vectored_concurrent_io: GetVectoredConcurrentIo, } impl GrpcPageServiceHandler { @@ -3721,6 +3722,14 @@ impl proto::PageService for GrpcPageServiceHandler { .get(ttid.tenant_id, ttid.timeline_id, shard_selector) .await?; + // Spawn an IoConcurrency sidecar, if enabled. + let Ok(gate_guard) = self.gate_guard.try_clone() else { + return Err(tonic::Status::unavailable("shutting down")); + }; + let io_concurrency = + IoConcurrency::spawn_from_conf(self.get_vectored_concurrent_io, gate_guard); + + // Spawn a task to handle the GetPageRequest stream. let span = Span::current(); let ctx = self.ctx.attached_child(); let mut reqs = req.into_inner(); @@ -3731,8 +3740,7 @@ impl proto::PageService for GrpcPageServiceHandler { .await? .downgrade(); while let Some(req) = reqs.message().await? { - // TODO: implement IoConcurrency sidecar. - yield Self::get_page(&ctx, &timeline, req, IoConcurrency::Sequential) + yield Self::get_page(&ctx, &timeline, req, io_concurrency.clone()) .instrument(span.clone()) // propagate request span .await? } From 3fd5a94a853604d26ff69acaf88c35f765e51698 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 4 Jun 2025 10:56:12 -0500 Subject: [PATCH 13/25] Use Url::join() when creating the final remote extension URL (#12121) Url::to_string() adds a trailing slash on the base URL, so when we did the format!(), we were adding a double forward slash. Signed-off-by: Tristan Partin --- compute_tools/src/bin/compute_ctl.rs | 67 ++++++++++++++- compute_tools/src/extension_server.rs | 16 ++-- libs/compute_api/src/spec.rs | 85 ++++++++++++++----- .../regress/test_download_extensions.py | 7 +- 4 files changed, 140 insertions(+), 35 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index db6835da61..8b502a058e 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -40,7 +40,7 @@ use std::sync::mpsc; use std::thread; use std::time::Duration; -use anyhow::{Context, Result}; +use anyhow::{Context, Result, bail}; use clap::Parser; use compute_api::responses::ComputeConfig; use compute_tools::compute::{ @@ -57,14 +57,14 @@ use tracing::{error, info}; use url::Url; use utils::failpoint_support; -#[derive(Parser)] +#[derive(Debug, Parser)] #[command(rename_all = "kebab-case")] struct Cli { #[arg(short = 'b', long, default_value = "postgres", env = "POSTGRES_PATH")] pub pgbin: String, /// The base URL for the remote extension storage proxy gateway. - #[arg(short = 'r', long)] + #[arg(short = 'r', long, value_parser = Self::parse_remote_ext_base_url)] pub remote_ext_base_url: Option, /// The port to bind the external listening HTTP server to. Clients running @@ -126,6 +126,25 @@ struct Cli { pub installed_extensions_collection_interval: u64, } +impl Cli { + /// Parse a URL from an argument. By default, this isn't necessary, but we + /// want to do some sanity checking. + fn parse_remote_ext_base_url(value: &str) -> Result { + // Remove extra trailing slashes, and add one. We use Url::join() later + // when downloading remote extensions. If the base URL is something like + // http://example.com/pg-ext-s3-gateway, and join() is called with + // something like "xyz", the resulting URL is http://example.com/xyz. + let value = value.trim_end_matches('/').to_owned() + "/"; + let url = Url::parse(&value)?; + + if url.query_pairs().count() != 0 { + bail!("parameters detected in remote extensions base URL") + } + + Ok(url) + } +} + fn main() -> Result<()> { let cli = Cli::parse(); @@ -252,7 +271,8 @@ fn handle_exit_signal(sig: i32) { #[cfg(test)] mod test { - use clap::CommandFactory; + use clap::{CommandFactory, Parser}; + use url::Url; use super::Cli; @@ -260,4 +280,43 @@ mod test { fn verify_cli() { Cli::command().debug_assert() } + + #[test] + fn verify_remote_ext_base_url() { + let cli = Cli::parse_from([ + "compute_ctl", + "--pgdata=test", + "--connstr=test", + "--compute-id=test", + "--remote-ext-base-url", + "https://example.com/subpath", + ]); + assert_eq!( + cli.remote_ext_base_url.unwrap(), + Url::parse("https://example.com/subpath/").unwrap() + ); + + let cli = Cli::parse_from([ + "compute_ctl", + "--pgdata=test", + "--connstr=test", + "--compute-id=test", + "--remote-ext-base-url", + "https://example.com//", + ]); + assert_eq!( + cli.remote_ext_base_url.unwrap(), + Url::parse("https://example.com").unwrap() + ); + + Cli::try_parse_from([ + "compute_ctl", + "--pgdata=test", + "--connstr=test", + "--compute-id=test", + "--remote-ext-base-url", + "https://example.com?hello=world", + ]) + .expect_err("URL parameters are not allowed"); + } } diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index 1857afa08c..3764bc1525 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -166,7 +166,7 @@ pub async fn download_extension( // TODO add retry logic let download_buffer = - match download_extension_tar(remote_ext_base_url.as_str(), &ext_path.to_string()).await { + match download_extension_tar(remote_ext_base_url, &ext_path.to_string()).await { Ok(buffer) => buffer, Err(error_message) => { return Err(anyhow::anyhow!( @@ -271,10 +271,14 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) { } // Do request to extension storage proxy, e.g., -// curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst +// curl http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local/latest/v15/extensions/anon.tar.zst // using HTTP GET and return the response body as bytes. -async fn download_extension_tar(remote_ext_base_url: &str, ext_path: &str) -> Result { - let uri = format!("{}/{}", remote_ext_base_url, ext_path); +async fn download_extension_tar(remote_ext_base_url: &Url, ext_path: &str) -> Result { + let uri = remote_ext_base_url.join(ext_path).with_context(|| { + format!( + "failed to create the remote extension URI for {ext_path} using {remote_ext_base_url}" + ) + })?; let filename = Path::new(ext_path) .file_name() .unwrap_or_else(|| std::ffi::OsStr::new("unknown")) @@ -284,7 +288,7 @@ async fn download_extension_tar(remote_ext_base_url: &str, ext_path: &str) -> Re info!("Downloading extension file '{}' from uri {}", filename, uri); - match do_extension_server_request(&uri).await { + match do_extension_server_request(uri).await { Ok(resp) => { info!("Successfully downloaded remote extension data {}", ext_path); REMOTE_EXT_REQUESTS_TOTAL @@ -303,7 +307,7 @@ async fn download_extension_tar(remote_ext_base_url: &str, ext_path: &str) -> Re // Do a single remote extensions server request. // Return result or (error message + stringified status code) in case of any failures. -async fn do_extension_server_request(uri: &str) -> Result { +async fn do_extension_server_request(uri: Url) -> Result { let resp = reqwest::get(uri).await.map_err(|e| { ( format!( diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 9b7cf43bb9..343923d446 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -250,34 +250,44 @@ impl RemoteExtSpec { } match self.extension_data.get(real_ext_name) { - Some(_ext_data) => { - // We have decided to use the Go naming convention due to Kubernetes. - - let arch = match std::env::consts::ARCH { - "x86_64" => "amd64", - "aarch64" => "arm64", - arch => arch, - }; - - // Construct the path to the extension archive - // BUILD_TAG/PG_MAJOR_VERSION/extensions/EXTENSION_NAME.tar.zst - // - // Keep it in sync with path generation in - // https://github.com/neondatabase/build-custom-extensions/tree/main - let archive_path_str = format!( - "{build_tag}/{arch}/{pg_major_version}/extensions/{real_ext_name}.tar.zst" - ); - Ok(( - real_ext_name.to_string(), - RemotePath::from_string(&archive_path_str)?, - )) - } + Some(_ext_data) => Ok(( + real_ext_name.to_string(), + Self::build_remote_path(build_tag, pg_major_version, real_ext_name)?, + )), None => Err(anyhow::anyhow!( "real_ext_name {} is not found", real_ext_name )), } } + + /// Get the architecture-specific portion of the remote extension path. We + /// use the Go naming convention due to Kubernetes. + fn get_arch() -> &'static str { + match std::env::consts::ARCH { + "x86_64" => "amd64", + "aarch64" => "arm64", + arch => arch, + } + } + + /// Build a [`RemotePath`] for an extension. + fn build_remote_path( + build_tag: &str, + pg_major_version: &str, + ext_name: &str, + ) -> anyhow::Result { + let arch = Self::get_arch(); + + // Construct the path to the extension archive + // BUILD_TAG/PG_MAJOR_VERSION/extensions/EXTENSION_NAME.tar.zst + // + // Keep it in sync with path generation in + // https://github.com/neondatabase/build-custom-extensions/tree/main + RemotePath::from_string(&format!( + "{build_tag}/{arch}/{pg_major_version}/extensions/{ext_name}.tar.zst" + )) + } } #[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Deserialize, Serialize)] @@ -518,6 +528,37 @@ mod tests { .expect("Library should be found"); } + #[test] + fn remote_extension_path() { + let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({ + "public_extensions": ["ext"], + "custom_extensions": [], + "library_index": { + "extlib": "ext", + }, + "extension_data": { + "ext": { + "control_data": { + "ext.control": "" + }, + "archive_path": "" + } + }, + })) + .unwrap(); + + let (_ext_name, ext_path) = rspec + .get_ext("ext", false, "latest", "v17") + .expect("Extension should be found"); + // Starting with a forward slash would have consequences for the + // Url::join() that occurs when downloading a remote extension. + assert!(!ext_path.to_string().starts_with("/")); + assert_eq!( + ext_path, + RemoteExtSpec::build_remote_path("latest", "v17", "ext").unwrap() + ); + } + #[test] fn parse_spec_file() { let file = File::open("tests/cluster_spec.json").unwrap(); diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index 24ba0713d2..fe3b220c67 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -159,7 +159,8 @@ def test_remote_extensions( # Setup a mock nginx S3 gateway which will return our test extension. (host, port) = httpserver_listen_address - extensions_endpoint = f"http://{host}:{port}/pg-ext-s3-gateway" + remote_ext_base_url = f"http://{host}:{port}/pg-ext-s3-gateway" + log.info(f"remote extensions base URL: {remote_ext_base_url}") extension.build(pg_config, test_output_dir) tarball = extension.package(test_output_dir) @@ -221,7 +222,7 @@ def test_remote_extensions( endpoint.create_remote_extension_spec(spec) - endpoint.start(remote_ext_base_url=extensions_endpoint) + endpoint.start(remote_ext_base_url=remote_ext_base_url) with endpoint.connect() as conn: with conn.cursor() as cur: @@ -249,7 +250,7 @@ def test_remote_extensions( # Remove the extension files to force a redownload of the extension. extension.remove(test_output_dir, pg_version) - endpoint.start(remote_ext_base_url=extensions_endpoint) + endpoint.start(remote_ext_base_url=remote_ext_base_url) # Test that ALTER EXTENSION UPDATE statements also fetch remote extensions. with endpoint.connect() as conn: From 838622c5944d22dbb645a93fec915a465e9242f8 Mon Sep 17 00:00:00 2001 From: Suhas Thalanki <54014218+thesuhas@users.noreply.github.com> Date: Wed, 4 Jun 2025 14:03:59 -0400 Subject: [PATCH 14/25] compute: Add manifest.yml for default Postgres configuration settings (#11820) Adds a `manifest.yml` file that contains the default settings for compute. Currently, it comes from cplane code [here](https://github.com/neondatabase/cloud/blob/0cda3d4b014765c4e2e551a1d56efc095b004e77/goapp/controlplane/internal/pkg/compute/computespec/pg_settings.go#L110). Related RFC: https://github.com/neondatabase/neon/blob/main/docs/rfcs/038-independent-compute-release.md Related Issue: https://github.com/neondatabase/cloud/issues/11698 --- compute/manifest.yaml | 121 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 compute/manifest.yaml diff --git a/compute/manifest.yaml b/compute/manifest.yaml new file mode 100644 index 0000000000..f1cd20c497 --- /dev/null +++ b/compute/manifest.yaml @@ -0,0 +1,121 @@ +pg_settings: + # Common settings for primaries and replicas of all versions. + common: + # Check for client disconnection every 1 minute. By default, Postgres will detect the + # loss of the connection only at the next interaction with the socket, when it waits + # for, receives or sends data, so it will likely waste resources till the end of the + # query execution. There should be no drawbacks in setting this for everyone, so enable + # it by default. If anyone will complain, we can allow editing it. + # https://www.postgresql.org/docs/16/runtime-config-connection.html#GUC-CLIENT-CONNECTION-CHECK-INTERVAL + client_connection_check_interval: "60000" # 1 minute + # ---- IO ---- + effective_io_concurrency: "20" + maintenance_io_concurrency: "100" + fsync: "off" + hot_standby: "off" + # We allow users to change this if needed, but by default we + # just don't want to see long-lasting idle transactions, as they + # prevent activity monitor from suspending projects. + idle_in_transaction_session_timeout: "300000" # 5 minutes + listen_addresses: "*" + # --- LOGGING ---- helps investigations + log_connections: "on" + log_disconnections: "on" + # 1GB, unit is KB + log_temp_files: "1048576" + # Disable dumping customer data to logs, both to increase data privacy + # and to reduce the amount the logs. + log_error_verbosity: "terse" + log_min_error_statement: "panic" + max_connections: "100" + # --- WAL --- + # - flush lag is the max amount of WAL that has been generated but not yet stored + # to disk in the page server. A smaller value means less delay after a pageserver + # restart, but if you set it too small you might again need to slow down writes if the + # pageserver cannot flush incoming WAL to disk fast enough. This must be larger + # than the pageserver's checkpoint interval, currently 1 GB! Otherwise you get a + # a deadlock where the compute node refuses to generate more WAL before the + # old WAL has been uploaded to S3, but the pageserver is waiting for more WAL + # to be generated before it is uploaded to S3. + max_replication_flush_lag: "10GB" + max_replication_slots: "10" + # Backpressure configuration: + # - write lag is the max amount of WAL that has been generated by Postgres but not yet + # processed by the page server. Making this smaller reduces the worst case latency + # of a GetPage request, if you request a page that was recently modified. On the other + # hand, if this is too small, the compute node might need to wait on a write if there is a + # hiccup in the network or page server so that the page server has temporarily fallen + # behind. + # + # Previously it was set to 500 MB, but it caused compute being unresponsive under load + # https://github.com/neondatabase/neon/issues/2028 + max_replication_write_lag: "500MB" + max_wal_senders: "10" + # A Postgres checkpoint is cheap in storage, as doesn't involve any significant amount + # of real I/O. Only the SLRU buffers and some other small files are flushed to disk. + # However, as long as we have full_page_writes=on, page updates after a checkpoint + # include full-page images which bloats the WAL. So may want to bump max_wal_size to + # reduce the WAL bloating, but at the same it will increase pg_wal directory size on + # compute and can lead to out of disk error on k8s nodes. + max_wal_size: "1024" + wal_keep_size: "0" + wal_level: "replica" + # Reduce amount of WAL generated by default. + wal_log_hints: "off" + # - without wal_sender_timeout set we don't get feedback messages, + # required for backpressure. + wal_sender_timeout: "10000" + # We have some experimental extensions, which we don't want users to install unconsciously. + # To install them, users would need to set the `neon.allow_unstable_extensions` setting. + # There are two of them currently: + # - `pgrag` - https://github.com/neondatabase-labs/pgrag - extension is actually called just `rag`, + # and two dependencies: + # - `rag_bge_small_en_v15` + # - `rag_jina_reranker_v1_tiny_en` + # - `pg_mooncake` - https://github.com/Mooncake-Labs/pg_mooncake/ + neon.unstable_extensions: "rag,rag_bge_small_en_v15,rag_jina_reranker_v1_tiny_en,pg_mooncake,anon" + neon.protocol_version: "3" + password_encryption: "scram-sha-256" + # This is important to prevent Postgres from trying to perform + # a local WAL redo after backend crash. It should exit and let + # the systemd or k8s to do a fresh startup with compute_ctl. + restart_after_crash: "off" + # By default 3. We have the following persistent connections in the VM: + # * compute_activity_monitor (from compute_ctl) + # * postgres-exporter (metrics collector; it has 2 connections) + # * sql_exporter (metrics collector; we have 2 instances [1 for us & users; 1 for autoscaling]) + # * vm-monitor (to query & change file cache size) + # i.e. total of 6. Let's reserve 7, so there's still at least one left over. + superuser_reserved_connections: "7" + synchronous_standby_names: "walproposer" + + replica: + hot_standby: "on" + + per_version: + 17: + common: + # PostgreSQL 17 has a new IO system called "read stream", which can combine IOs up to some + # size. It still has some issues with readahead, though, so we default to disabled/ + # "no combining of IOs" to make sure we get the maximum prefetch depth. + # See also: https://github.com/neondatabase/neon/pull/9860 + io_combine_limit: "1" + replica: + # prefetching of blocks referenced in WAL doesn't make sense for us + # Neon hot standby ignores pages that are not in the shared_buffers + recovery_prefetch: "off" + 16: + common: + replica: + # prefetching of blocks referenced in WAL doesn't make sense for us + # Neon hot standby ignores pages that are not in the shared_buffers + recovery_prefetch: "off" + 15: + common: + replica: + # prefetching of blocks referenced in WAL doesn't make sense for us + # Neon hot standby ignores pages that are not in the shared_buffers + recovery_prefetch: "off" + 14: + common: + replica: From 1fb1315aedc8816aa038520b2275b1434a14418f Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 4 Jun 2025 21:07:47 +0100 Subject: [PATCH 15/25] compute-ctl: add spec for enable_tls, separate from compute-ctl config (#12109) ## Problem Inbetween adding the TLS config for compute-ctl, and adding the TLS config in controlplane, we switched from using a provision flag to a bind flag. This happened to work in all of my testing in preview regions as they have no VM pool, so each bind was also a provision. However, in staging I found that the TLS config is still only processed during provision, even though it's only sent on bind. ## Summary of changes * Add a new feature flag value, `tls_experimental`, which tells postgres/pgbouncer/local_proxy to use the TLS certificates on bind. * compute_ctl on provision will be told where the certificates are, instead of being told on bind. --- compute_tools/src/compute.rs | 41 +++++++++++++++++----- compute_tools/src/http/routes/configure.rs | 2 +- libs/compute_api/src/spec.rs | 3 ++ 3 files changed, 37 insertions(+), 9 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 1ed71180d0..bd6ed910be 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -3,7 +3,7 @@ use chrono::{DateTime, Utc}; use compute_api::privilege::Privilege; use compute_api::responses::{ ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus, LfcOffloadState, - LfcPrewarmState, + LfcPrewarmState, TlsConfig, }; use compute_api::spec::{ ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent, @@ -603,6 +603,8 @@ impl ComputeNode { }); } + let tls_config = self.tls_config(&pspec.spec); + // If there are any remote extensions in shared_preload_libraries, start downloading them if pspec.spec.remote_extensions.is_some() { let (this, spec) = (self.clone(), pspec.spec.clone()); @@ -659,7 +661,7 @@ impl ComputeNode { info!("tuning pgbouncer"); let pgbouncer_settings = pgbouncer_settings.clone(); - let tls_config = self.compute_ctl_config.tls.clone(); + let tls_config = tls_config.clone(); // Spawn a background task to do the tuning, // so that we don't block the main thread that starts Postgres. @@ -678,7 +680,10 @@ impl ComputeNode { // Spawn a background task to do the configuration, // so that we don't block the main thread that starts Postgres. - let local_proxy = local_proxy.clone(); + + let mut local_proxy = local_proxy.clone(); + local_proxy.tls = tls_config.clone(); + let _handle = tokio::spawn(async move { if let Err(err) = local_proxy::configure(&local_proxy) { error!("error while configuring local_proxy: {err:?}"); @@ -1205,13 +1210,15 @@ impl ComputeNode { let spec = &pspec.spec; let pgdata_path = Path::new(&self.params.pgdata); + let tls_config = self.tls_config(&pspec.spec); + // Remove/create an empty pgdata directory and put configuration there. self.create_pgdata()?; config::write_postgres_conf( pgdata_path, &pspec.spec, self.params.internal_http_port, - &self.compute_ctl_config.tls, + tls_config, )?; // Syncing safekeepers is only safe with primary nodes: if a primary @@ -1536,14 +1543,22 @@ impl ComputeNode { .clone(), ); + let mut tls_config = None::; + if spec.features.contains(&ComputeFeature::TlsExperimental) { + tls_config = self.compute_ctl_config.tls.clone(); + } + let max_concurrent_connections = self.max_service_connections(compute_state, &spec); // Merge-apply spec & changes to PostgreSQL state. self.apply_spec_sql(spec.clone(), conf.clone(), max_concurrent_connections)?; if let Some(local_proxy) = &spec.clone().local_proxy_config { + let mut local_proxy = local_proxy.clone(); + local_proxy.tls = tls_config.clone(); + info!("configuring local_proxy"); - local_proxy::configure(local_proxy).context("apply_config local_proxy")?; + local_proxy::configure(&local_proxy).context("apply_config local_proxy")?; } // Run migrations separately to not hold up cold starts @@ -1595,11 +1610,13 @@ impl ComputeNode { pub fn reconfigure(&self) -> Result<()> { let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec; + let tls_config = self.tls_config(&spec); + if let Some(ref pgbouncer_settings) = spec.pgbouncer_settings { info!("tuning pgbouncer"); let pgbouncer_settings = pgbouncer_settings.clone(); - let tls_config = self.compute_ctl_config.tls.clone(); + let tls_config = tls_config.clone(); // Spawn a background task to do the tuning, // so that we don't block the main thread that starts Postgres. @@ -1617,7 +1634,7 @@ impl ComputeNode { // Spawn a background task to do the configuration, // so that we don't block the main thread that starts Postgres. let mut local_proxy = local_proxy.clone(); - local_proxy.tls = self.compute_ctl_config.tls.clone(); + local_proxy.tls = tls_config.clone(); tokio::spawn(async move { if let Err(err) = local_proxy::configure(&local_proxy) { error!("error while configuring local_proxy: {err:?}"); @@ -1635,7 +1652,7 @@ impl ComputeNode { pgdata_path, &spec, self.params.internal_http_port, - &self.compute_ctl_config.tls, + tls_config, )?; if !spec.skip_pg_catalog_updates { @@ -1755,6 +1772,14 @@ impl ComputeNode { } } + pub fn tls_config(&self, spec: &ComputeSpec) -> &Option { + if spec.features.contains(&ComputeFeature::TlsExperimental) { + &self.compute_ctl_config.tls + } else { + &None:: + } + } + /// Update the `last_active` in the shared state, but ensure that it's a more recent one. pub fn update_last_active(&self, last_active: Option>) { let mut state = self.state.lock().unwrap(); diff --git a/compute_tools/src/http/routes/configure.rs b/compute_tools/src/http/routes/configure.rs index f7a19da611..c29e3a97da 100644 --- a/compute_tools/src/http/routes/configure.rs +++ b/compute_tools/src/http/routes/configure.rs @@ -22,7 +22,7 @@ pub(in crate::http) async fn configure( State(compute): State>, request: Json, ) -> Response { - let pspec = match ParsedSpec::try_from(request.spec.clone()) { + let pspec = match ParsedSpec::try_from(request.0.spec) { Ok(p) => p, Err(e) => return JsonResponse::error(StatusCode::BAD_REQUEST, e), }; diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 343923d446..0e23b70265 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -192,6 +192,9 @@ pub enum ComputeFeature { /// track short-lived connections as user activity. ActivityMonitorExperimental, + /// Enable TLS functionality. + TlsExperimental, + /// This is a special feature flag that is used to represent unknown feature flags. /// Basically all unknown to enum flags are represented as this one. See unit test /// `parse_unknown_features()` for more details. From dae203ef692415428977d49bdf116c3a5fed344e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 5 Jun 2025 01:02:31 +0200 Subject: [PATCH 16/25] pgxn: support generations in safekeepers_cmp (#12129) `safekeepers_cmp` was added by #8840 to make changes of the safekeeper set order independent: a `sk1,sk2,sk3` specifier changed to `sk3,sk1,sk2` should not cause a walproposer restart. However, this check didn't support generations, in the sense that it would see the `g#123:` as part of the first safekeeper in the list, and if the first safekeeper changes, it would also restart the walproposer. Therefore, parse the generation properly and make it not be a part of the generation. This PR doesn't add a specific test, but I have confirmed locally that `test_safekeepers_reconfigure_reorder` is fixed with the changes of PR #11712 applied thanks to this PR. Part of https://github.com/neondatabase/neon/issues/11670 --- pgxn/neon/walproposer.c | 4 ++++ pgxn/neon/walproposer_pg.c | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index f42103c7cd..c63edb1398 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -119,6 +119,10 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) { wp_log(FATAL, "failed to parse neon.safekeepers generation number: %m"); } + if (*endptr != ':') + { + wp_log(FATAL, "failed to parse neon.safekeepers: no colon after generation"); + } /* Skip past : to the first hostname. */ host = endptr + 1; } diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index d15bf91d24..c90702a282 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -272,6 +272,30 @@ split_safekeepers_list(char *safekeepers_list, char *safekeepers[]) return n_safekeepers; } +static char *split_off_safekeepers_generation(char *safekeepers_list, uint32 *generation) +{ + char *endptr; + + if (strncmp(safekeepers_list, "g#", 2) != 0) + { + return safekeepers_list; + } + else + { + errno = 0; + *generation = strtoul(safekeepers_list + 2, &endptr, 10); + if (errno != 0) + { + wp_log(FATAL, "failed to parse neon.safekeepers generation number: %m"); + } + if (*endptr != ':') + { + wp_log(FATAL, "failed to parse neon.safekeepers: no colon after generation"); + } + return endptr + 1; + } +} + /* * Accept two coma-separated strings with list of safekeeper host:port addresses. * Split them into arrays and return false if two sets do not match, ignoring the order. @@ -283,6 +307,16 @@ safekeepers_cmp(char *old, char *new) char *safekeepers_new[MAX_SAFEKEEPERS]; int len_old = 0; int len_new = 0; + uint32 gen_old = INVALID_GENERATION; + uint32 gen_new = INVALID_GENERATION; + + old = split_off_safekeepers_generation(old, &gen_old); + new = split_off_safekeepers_generation(new, &gen_new); + + if (gen_old != gen_new) + { + return false; + } len_old = split_safekeepers_list(old, safekeepers_old); len_new = split_safekeepers_list(new, safekeepers_new); From 56d505bce6088f72e4291afee8ae8ac4fe44dbfb Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 5 Jun 2025 08:48:25 +0300 Subject: [PATCH 17/25] Update online_advisor (#12045) ## Problem Investigate crash of online_advisor in image check ## Summary of changes --------- Co-authored-by: Konstantin Knizhnik --- compute/compute-node.Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 6ece9c6566..d3008c8085 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -603,7 +603,7 @@ RUN case "${PG_VERSION:?}" in \ ;; \ esac && \ wget https://github.com/knizhnik/online_advisor/archive/refs/tags/1.0.tar.gz -O online_advisor.tar.gz && \ - echo "059b7d9e5a90013a58bdd22e9505b88406ce05790675eb2d8434e5b215652d54 online_advisor.tar.gz" | sha256sum --check && \ + echo "37dcadf8f7cc8d6cc1f8831276ee245b44f1b0274f09e511e47a67738ba9ed0f online_advisor.tar.gz" | sha256sum --check && \ mkdir online_advisor-src && cd online_advisor-src && tar xzf ../online_advisor.tar.gz --strip-components=1 -C . FROM pg-build AS online_advisor-build From c8a96cf7223de04f208766a6d162205269a61030 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 5 Jun 2025 08:12:00 +0100 Subject: [PATCH 18/25] update proxy protocol parsing to not a rw wrapper (#12035) ## Problem I believe in all environments we now specify either required/rejected for proxy-protocol V2 as required. We no longer rely on the supported flow. This means we no longer need to keep around read bytes incase they're not in a header. While I designed ChainRW to be fast (the hot path with an empty buffer is very easy to branch predict), it's still unnecessary. ## Summary of changes * Remove the ChainRW wrapper * Refactor how we read the proxy-protocol header using read_exact. Slightly worse perf but it's hardly significant. * Don't try and parse the header if it's rejected. --- proxy/src/binary/proxy.rs | 3 +- proxy/src/config.rs | 2 - proxy/src/console_redirect_proxy.rs | 44 +++---- proxy/src/protocol2.rs | 177 +++++----------------------- proxy/src/proxy/mod.rs | 42 +++---- proxy/src/proxy/tests/mod.rs | 1 - proxy/src/serverless/mod.rs | 55 ++++----- 7 files changed, 94 insertions(+), 230 deletions(-) diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index dcae263647..757c1e988b 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -221,8 +221,7 @@ struct ProxyCliArgs { is_private_access_proxy: bool, /// Configure whether all incoming requests have a Proxy Protocol V2 packet. - // TODO(conradludgate): switch default to rejected or required once we've updated all deployments - #[clap(value_enum, long, default_value_t = ProxyProtocolV2::Supported)] + #[clap(value_enum, long, default_value_t = ProxyProtocolV2::Rejected)] proxy_protocol_v2: ProxyProtocolV2, /// Time the proxy waits for the webauth session to be confirmed by the control plane. diff --git a/proxy/src/config.rs b/proxy/src/config.rs index a97339df9a..248584a19a 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -39,8 +39,6 @@ pub struct ComputeConfig { pub enum ProxyProtocolV2 { /// Connection will error if PROXY protocol v2 header is missing Required, - /// Connection will parse PROXY protocol v2 header, but accept the connection if it's missing. - Supported, /// Connection will error if PROXY protocol v2 header is provided Rejected, } diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 7fb84b5ee5..6755499b45 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -54,30 +54,24 @@ pub async fn task_main( debug!(protocol = "tcp", %session_id, "accepted new TCP connection"); connections.spawn(async move { - let (socket, peer_addr) = match read_proxy_protocol(socket).await { - Err(e) => { - error!("per-client task finished with an error: {e:#}"); - return; + let (socket, conn_info) = match config.proxy_protocol_v2 { + ProxyProtocolV2::Required => { + match read_proxy_protocol(socket).await { + Err(e) => { + error!("per-client task finished with an error: {e:#}"); + return; + } + // our load balancers will not send any more data. let's just exit immediately + Ok((_socket, ConnectHeader::Local)) => { + debug!("healthcheck received"); + return; + } + Ok((socket, ConnectHeader::Proxy(info))) => (socket, info), + } } - // our load balancers will not send any more data. let's just exit immediately - Ok((_socket, ConnectHeader::Local)) => { - debug!("healthcheck received"); - return; - } - Ok((_socket, ConnectHeader::Missing)) - if config.proxy_protocol_v2 == ProxyProtocolV2::Required => - { - error!("missing required proxy protocol header"); - return; - } - Ok((_socket, ConnectHeader::Proxy(_))) - if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => - { - error!("proxy protocol header not supported"); - return; - } - Ok((socket, ConnectHeader::Proxy(info))) => (socket, info), - Ok((socket, ConnectHeader::Missing)) => ( + // ignore the header - it cannot be confused for a postgres or http connection so will + // error later. + ProxyProtocolV2::Rejected => ( socket, ConnectionInfo { addr: peer_addr, @@ -86,7 +80,7 @@ pub async fn task_main( ), }; - match socket.inner.set_nodelay(true) { + match socket.set_nodelay(true) { Ok(()) => {} Err(e) => { error!( @@ -98,7 +92,7 @@ pub async fn task_main( let ctx = RequestContext::new( session_id, - peer_addr, + conn_info, crate::metrics::Protocol::Tcp, &config.region, ); diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 0793998639..5bec6d6ca3 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -4,60 +4,13 @@ use core::fmt; use std::io; use std::net::{Ipv4Addr, Ipv6Addr, SocketAddr}; -use std::pin::Pin; -use std::task::{Context, Poll}; -use bytes::{Buf, Bytes, BytesMut}; -use pin_project_lite::pin_project; +use bytes::Buf; use smol_str::SmolStr; use strum_macros::FromRepr; -use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf}; +use tokio::io::{AsyncRead, AsyncReadExt}; use zerocopy::{FromBytes, Immutable, KnownLayout, Unaligned, network_endian}; -pin_project! { - /// A chained [`AsyncRead`] with [`AsyncWrite`] passthrough - pub(crate) struct ChainRW { - #[pin] - pub(crate) inner: T, - buf: BytesMut, - } -} - -impl AsyncWrite for ChainRW { - #[inline] - fn poll_write( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - buf: &[u8], - ) -> Poll> { - self.project().inner.poll_write(cx, buf) - } - - #[inline] - fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - self.project().inner.poll_flush(cx) - } - - #[inline] - fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - self.project().inner.poll_shutdown(cx) - } - - #[inline] - fn poll_write_vectored( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - bufs: &[io::IoSlice<'_>], - ) -> Poll> { - self.project().inner.poll_write_vectored(cx, bufs) - } - - #[inline] - fn is_write_vectored(&self) -> bool { - self.inner.is_write_vectored() - } -} - /// Proxy Protocol Version 2 Header const SIGNATURE: [u8; 12] = [ 0x0D, 0x0A, 0x0D, 0x0A, 0x00, 0x0D, 0x0A, 0x51, 0x55, 0x49, 0x54, 0x0A, @@ -79,7 +32,6 @@ pub struct ConnectionInfo { #[derive(PartialEq, Eq, Clone, Debug)] pub enum ConnectHeader { - Missing, Local, Proxy(ConnectionInfo), } @@ -106,47 +58,24 @@ pub enum ConnectionInfoExtra { pub(crate) async fn read_proxy_protocol( mut read: T, -) -> std::io::Result<(ChainRW, ConnectHeader)> { - let mut buf = BytesMut::with_capacity(128); - let header = loop { - let bytes_read = read.read_buf(&mut buf).await?; - - // exit for bad header signature - let len = usize::min(buf.len(), SIGNATURE.len()); - if buf[..len] != SIGNATURE[..len] { - return Ok((ChainRW { inner: read, buf }, ConnectHeader::Missing)); - } - - // if no more bytes available then exit - if bytes_read == 0 { - return Ok((ChainRW { inner: read, buf }, ConnectHeader::Missing)); - } - - // check if we have enough bytes to continue - if let Some(header) = buf.try_get::() { - break header; - } - }; - - let remaining_length = usize::from(header.len.get()); - - while buf.len() < remaining_length { - if read.read_buf(&mut buf).await? == 0 { - return Err(io::Error::new( - io::ErrorKind::UnexpectedEof, - "stream closed while waiting for proxy protocol addresses", - )); - } +) -> std::io::Result<(T, ConnectHeader)> { + let mut header = [0; size_of::()]; + read.read_exact(&mut header).await?; + let header: ProxyProtocolV2Header = zerocopy::transmute!(header); + if header.signature != SIGNATURE { + return Err(std::io::Error::other("invalid proxy protocol header")); } - let payload = buf.split_to(remaining_length); - let res = process_proxy_payload(header, payload)?; - Ok((ChainRW { inner: read, buf }, res)) + let mut payload = vec![0; usize::from(header.len.get())]; + read.read_exact(&mut payload).await?; + + let res = process_proxy_payload(header, &payload)?; + Ok((read, res)) } fn process_proxy_payload( header: ProxyProtocolV2Header, - mut payload: BytesMut, + mut payload: &[u8], ) -> std::io::Result { match header.version_and_command { // the connection was established on purpose by the proxy @@ -162,13 +91,12 @@ fn process_proxy_payload( PROXY_V2 => {} // other values are unassigned and must not be emitted by senders. Receivers // must drop connections presenting unexpected values here. - #[rustfmt::skip] // https://github.com/rust-lang/rustfmt/issues/6384 - _ => return Err(io::Error::other( - format!( + _ => { + return Err(io::Error::other(format!( "invalid proxy protocol command 0x{:02X}. expected local (0x20) or proxy (0x21)", header.version_and_command - ), - )), + ))); + } } let size_err = @@ -206,7 +134,7 @@ fn process_proxy_payload( } let subtype = tlv.value.get_u8(); match Pp2AwsType::from_repr(subtype) { - Some(Pp2AwsType::VpceId) => match std::str::from_utf8(&tlv.value) { + Some(Pp2AwsType::VpceId) => match std::str::from_utf8(tlv.value) { Ok(s) => { extra = Some(ConnectionInfoExtra::Aws { vpce_id: s.into() }); } @@ -282,65 +210,28 @@ enum Pp2AzureType { PrivateEndpointLinkId = 0x01, } -impl AsyncRead for ChainRW { - #[inline] - fn poll_read( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - buf: &mut ReadBuf<'_>, - ) -> Poll> { - if self.buf.is_empty() { - self.project().inner.poll_read(cx, buf) - } else { - self.read_from_buf(buf) - } - } -} - -impl ChainRW { - #[cold] - fn read_from_buf(self: Pin<&mut Self>, buf: &mut ReadBuf<'_>) -> Poll> { - debug_assert!(!self.buf.is_empty()); - let this = self.project(); - - let write = usize::min(this.buf.len(), buf.remaining()); - let slice = this.buf.split_to(write).freeze(); - buf.put_slice(&slice); - - // reset the allocation so it can be freed - if this.buf.is_empty() { - *this.buf = BytesMut::new(); - } - - Poll::Ready(Ok(())) - } -} - #[derive(Debug)] -struct Tlv { +struct Tlv<'a> { kind: u8, - value: Bytes, + value: &'a [u8], } -fn read_tlv(b: &mut BytesMut) -> Option { +fn read_tlv<'a>(b: &mut &'a [u8]) -> Option> { let tlv_header = b.try_get::()?; let len = usize::from(tlv_header.len.get()); - if b.len() < len { - return None; - } Some(Tlv { kind: tlv_header.kind, - value: b.split_to(len).freeze(), + value: b.split_off(..len)?, }) } trait BufExt: Sized { fn try_get(&mut self) -> Option; } -impl BufExt for BytesMut { +impl BufExt for &[u8] { fn try_get(&mut self) -> Option { - let (res, _) = T::read_from_prefix(self).ok()?; - self.advance(size_of::()); + let (res, rest) = T::read_from_prefix(self).ok()?; + *self = rest; Some(res) } } @@ -481,27 +372,19 @@ mod tests { } #[tokio::test] + #[should_panic = "invalid proxy protocol header"] async fn test_invalid() { let data = [0x55; 256]; - let (mut read, info) = read_proxy_protocol(data.as_slice()).await.unwrap(); - - let mut bytes = vec![]; - read.read_to_end(&mut bytes).await.unwrap(); - assert_eq!(bytes, data); - assert_eq!(info, ConnectHeader::Missing); + read_proxy_protocol(data.as_slice()).await.unwrap(); } #[tokio::test] + #[should_panic = "early eof"] async fn test_short() { let data = [0x55; 10]; - let (mut read, info) = read_proxy_protocol(data.as_slice()).await.unwrap(); - - let mut bytes = vec![]; - read.read_to_end(&mut bytes).await.unwrap(); - assert_eq!(bytes, data); - assert_eq!(info, ConnectHeader::Missing); + read_proxy_protocol(data.as_slice()).await.unwrap(); } #[tokio::test] diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 0ffc54aa88..477baff1c9 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -102,30 +102,24 @@ pub async fn task_main( let endpoint_rate_limiter2 = endpoint_rate_limiter.clone(); connections.spawn(async move { - let (socket, conn_info) = match read_proxy_protocol(socket).await { - Err(e) => { - warn!("per-client task finished with an error: {e:#}"); - return; + let (socket, conn_info) = match config.proxy_protocol_v2 { + ProxyProtocolV2::Required => { + match read_proxy_protocol(socket).await { + Err(e) => { + warn!("per-client task finished with an error: {e:#}"); + return; + } + // our load balancers will not send any more data. let's just exit immediately + Ok((_socket, ConnectHeader::Local)) => { + debug!("healthcheck received"); + return; + } + Ok((socket, ConnectHeader::Proxy(info))) => (socket, info), + } } - // our load balancers will not send any more data. let's just exit immediately - Ok((_socket, ConnectHeader::Local)) => { - debug!("healthcheck received"); - return; - } - Ok((_socket, ConnectHeader::Missing)) - if config.proxy_protocol_v2 == ProxyProtocolV2::Required => - { - warn!("missing required proxy protocol header"); - return; - } - Ok((_socket, ConnectHeader::Proxy(_))) - if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => - { - warn!("proxy protocol header not supported"); - return; - } - Ok((socket, ConnectHeader::Proxy(info))) => (socket, info), - Ok((socket, ConnectHeader::Missing)) => ( + // ignore the header - it cannot be confused for a postgres or http connection so will + // error later. + ProxyProtocolV2::Rejected => ( socket, ConnectionInfo { addr: peer_addr, @@ -134,7 +128,7 @@ pub async fn task_main( ), }; - match socket.inner.set_nodelay(true) { + match socket.set_nodelay(true) { Ok(()) => {} Err(e) => { error!( diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 61e8ee4a10..117c42e19c 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -173,7 +173,6 @@ async fn dummy_proxy( tls: Option, auth: impl TestAuth + Send, ) -> anyhow::Result<()> { - let (client, _) = read_proxy_protocol(client).await?; let mut stream = match handshake(&RequestContext::test(), client, tls.as_ref(), false).await? { HandshakeData::Startup(stream, _) => stream, HandshakeData::Cancel(_) => bail!("cancellation not supported"), diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 2a7069b1c2..f6f681ac45 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -49,7 +49,7 @@ use crate::config::{ProxyConfig, ProxyProtocolV2}; use crate::context::RequestContext; use crate::ext::TaskExt; use crate::metrics::Metrics; -use crate::protocol2::{ChainRW, ConnectHeader, ConnectionInfo, read_proxy_protocol}; +use crate::protocol2::{ConnectHeader, ConnectionInfo, read_proxy_protocol}; use crate::proxy::run_until_cancelled; use crate::rate_limiter::EndpointRateLimiter; use crate::serverless::backend::PoolingBackend; @@ -207,12 +207,12 @@ pub(crate) type AsyncRW = Pin>; #[async_trait] trait MaybeTlsAcceptor: Send + Sync + 'static { - async fn accept(&self, conn: ChainRW) -> std::io::Result; + async fn accept(&self, conn: TcpStream) -> std::io::Result; } #[async_trait] impl MaybeTlsAcceptor for &'static ArcSwapOption { - async fn accept(&self, conn: ChainRW) -> std::io::Result { + async fn accept(&self, conn: TcpStream) -> std::io::Result { match &*self.load() { Some(config) => Ok(Box::pin( TlsAcceptor::from(config.http_config.clone()) @@ -235,33 +235,30 @@ async fn connection_startup( peer_addr: SocketAddr, ) -> Option<(AsyncRW, ConnectionInfo)> { // handle PROXY protocol - let (conn, peer) = match read_proxy_protocol(conn).await { - Ok(c) => c, - Err(e) => { - tracing::warn!(?session_id, %peer_addr, "failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}"); - return None; + let (conn, conn_info) = match config.proxy_protocol_v2 { + ProxyProtocolV2::Required => { + match read_proxy_protocol(conn).await { + Err(e) => { + warn!("per-client task finished with an error: {e:#}"); + return None; + } + // our load balancers will not send any more data. let's just exit immediately + Ok((_conn, ConnectHeader::Local)) => { + tracing::debug!("healthcheck received"); + return None; + } + Ok((conn, ConnectHeader::Proxy(info))) => (conn, info), + } } - }; - - let conn_info = match peer { - // our load balancers will not send any more data. let's just exit immediately - ConnectHeader::Local => { - tracing::debug!("healthcheck received"); - return None; - } - ConnectHeader::Missing if config.proxy_protocol_v2 == ProxyProtocolV2::Required => { - tracing::warn!("missing required proxy protocol header"); - return None; - } - ConnectHeader::Proxy(_) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => { - tracing::warn!("proxy protocol header not supported"); - return None; - } - ConnectHeader::Proxy(info) => info, - ConnectHeader::Missing => ConnectionInfo { - addr: peer_addr, - extra: None, - }, + // ignore the header - it cannot be confused for a postgres or http connection so will + // error later. + ProxyProtocolV2::Rejected => ( + conn, + ConnectionInfo { + addr: peer_addr, + extra: None, + }, + ), }; let has_private_peer_addr = match conn_info.addr.ip() { From d8ebd1d7717a4871cca5dda1b60ae4167e4bf514 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 5 Jun 2025 15:48:36 +0800 Subject: [PATCH 19/25] feat(pageserver): report tenant properties to posthog (#12113) ## Problem Part of https://github.com/neondatabase/neon/issues/11813 In PostHog UI, we need to create the properties before using them as a filter. We report all variants automatically when we start the pageserver. In the future, we can report all real tenants instead of fake tenants (we do that now to save money + we don't need real tenants in the UI). ## Summary of changes * Collect `region`, `availability_zone`, `pageserver_id` properties and use them in the feature evaluation. * Report 10 fake tenants on each pageserver startup. --------- Signed-off-by: Alex Chi Z --- .../src/background_loop.rs | 27 +++- libs/posthog_client_lite/src/lib.rs | 48 ++++++- pageserver/src/feature_resolver.rs | 123 ++++++++++++++++-- pageserver/src/http/routes.rs | 10 +- 4 files changed, 190 insertions(+), 18 deletions(-) diff --git a/libs/posthog_client_lite/src/background_loop.rs b/libs/posthog_client_lite/src/background_loop.rs index a05f6096b1..a404c76da9 100644 --- a/libs/posthog_client_lite/src/background_loop.rs +++ b/libs/posthog_client_lite/src/background_loop.rs @@ -6,7 +6,7 @@ use arc_swap::ArcSwap; use tokio_util::sync::CancellationToken; use tracing::{Instrument, info_span}; -use crate::{FeatureStore, PostHogClient, PostHogClientConfig}; +use crate::{CaptureEvent, FeatureStore, PostHogClient, PostHogClientConfig}; /// A background loop that fetches feature flags from PostHog and updates the feature store. pub struct FeatureResolverBackgroundLoop { @@ -24,9 +24,16 @@ impl FeatureResolverBackgroundLoop { } } - pub fn spawn(self: Arc, handle: &tokio::runtime::Handle, refresh_period: Duration) { + pub fn spawn( + self: Arc, + handle: &tokio::runtime::Handle, + refresh_period: Duration, + fake_tenants: Vec, + ) { let this = self.clone(); let cancel = self.cancel.clone(); + + // Main loop of updating the feature flags. handle.spawn( async move { tracing::info!("Starting PostHog feature resolver"); @@ -56,6 +63,22 @@ impl FeatureResolverBackgroundLoop { } .instrument(info_span!("posthog_feature_resolver")), ); + + // Report fake tenants to PostHog so that we have the combination of all the properties in the UI. + // Do one report per pageserver restart. + let this = self.clone(); + handle.spawn( + async move { + tracing::info!("Starting PostHog feature reporter"); + for tenant in &fake_tenants { + tracing::info!("Reporting fake tenant: {:?}", tenant); + } + if let Err(e) = this.posthog_client.capture_event_batch(&fake_tenants).await { + tracing::warn!("Cannot report fake tenants: {}", e); + } + } + .instrument(info_span!("posthog_feature_reporter")), + ); } pub fn feature_store(&self) -> Arc { diff --git a/libs/posthog_client_lite/src/lib.rs b/libs/posthog_client_lite/src/lib.rs index 281a8f8011..f607b1be0a 100644 --- a/libs/posthog_client_lite/src/lib.rs +++ b/libs/posthog_client_lite/src/lib.rs @@ -64,7 +64,7 @@ pub struct LocalEvaluationFlagFilterProperty { operator: String, } -#[derive(Debug, Serialize, Deserialize)] +#[derive(Debug, Serialize, Deserialize, Clone)] #[serde(untagged)] pub enum PostHogFlagFilterPropertyValue { String(String), @@ -507,6 +507,13 @@ pub struct PostHogClient { client: reqwest::Client, } +#[derive(Serialize, Debug)] +pub struct CaptureEvent { + pub event: String, + pub distinct_id: String, + pub properties: serde_json::Value, +} + impl PostHogClient { pub fn new(config: PostHogClientConfig) -> Self { let client = reqwest::Client::new(); @@ -570,12 +577,12 @@ impl PostHogClient { &self, event: &str, distinct_id: &str, - properties: &HashMap, + properties: &serde_json::Value, ) -> anyhow::Result<()> { // PUBLIC_URL/capture/ - // with bearer token of self.client_api_key let url = format!("{}/capture/", self.config.public_api_url); - self.client + let response = self + .client .post(url) .body(serde_json::to_string(&json!({ "api_key": self.config.client_api_key, @@ -585,6 +592,39 @@ impl PostHogClient { }))?) .send() .await?; + let status = response.status(); + let body = response.text().await?; + if !status.is_success() { + return Err(anyhow::anyhow!( + "Failed to capture events: {}, {}", + status, + body + )); + } + Ok(()) + } + + pub async fn capture_event_batch(&self, events: &[CaptureEvent]) -> anyhow::Result<()> { + // PUBLIC_URL/batch/ + let url = format!("{}/batch/", self.config.public_api_url); + let response = self + .client + .post(url) + .body(serde_json::to_string(&json!({ + "api_key": self.config.client_api_key, + "batch": events, + }))?) + .send() + .await?; + let status = response.status(); + let body = response.text().await?; + if !status.is_success() { + return Err(anyhow::anyhow!( + "Failed to capture events: {}, {}", + status, + body + )); + } Ok(()) } } diff --git a/pageserver/src/feature_resolver.rs b/pageserver/src/feature_resolver.rs index 7463186c06..50de3b691c 100644 --- a/pageserver/src/feature_resolver.rs +++ b/pageserver/src/feature_resolver.rs @@ -1,8 +1,11 @@ use std::{collections::HashMap, sync::Arc, time::Duration}; use posthog_client_lite::{ - FeatureResolverBackgroundLoop, PostHogClientConfig, PostHogEvaluationError, + CaptureEvent, FeatureResolverBackgroundLoop, PostHogClientConfig, PostHogEvaluationError, + PostHogFlagFilterPropertyValue, }; +use remote_storage::RemoteStorageKind; +use serde_json::json; use tokio_util::sync::CancellationToken; use utils::id::TenantId; @@ -11,11 +14,15 @@ use crate::{config::PageServerConf, metrics::FEATURE_FLAG_EVALUATION}; #[derive(Clone)] pub struct FeatureResolver { inner: Option>, + internal_properties: Option>>, } impl FeatureResolver { pub fn new_disabled() -> Self { - Self { inner: None } + Self { + inner: None, + internal_properties: None, + } } pub fn spawn( @@ -36,14 +43,114 @@ impl FeatureResolver { shutdown_pageserver, ); let inner = Arc::new(inner); - // TODO: make this configurable - inner.clone().spawn(handle, Duration::from_secs(60)); - Ok(FeatureResolver { inner: Some(inner) }) + + // The properties shared by all tenants on this pageserver. + let internal_properties = { + let mut properties = HashMap::new(); + properties.insert( + "pageserver_id".to_string(), + PostHogFlagFilterPropertyValue::String(conf.id.to_string()), + ); + if let Some(availability_zone) = &conf.availability_zone { + properties.insert( + "availability_zone".to_string(), + PostHogFlagFilterPropertyValue::String(availability_zone.clone()), + ); + } + // Infer region based on the remote storage config. + if let Some(remote_storage) = &conf.remote_storage_config { + match &remote_storage.storage { + RemoteStorageKind::AwsS3(config) => { + properties.insert( + "region".to_string(), + PostHogFlagFilterPropertyValue::String(format!( + "aws-{}", + config.bucket_region + )), + ); + } + RemoteStorageKind::AzureContainer(config) => { + properties.insert( + "region".to_string(), + PostHogFlagFilterPropertyValue::String(format!( + "azure-{}", + config.container_region + )), + ); + } + RemoteStorageKind::LocalFs { .. } => { + properties.insert( + "region".to_string(), + PostHogFlagFilterPropertyValue::String("local".to_string()), + ); + } + } + } + // TODO: add pageserver URL. + Arc::new(properties) + }; + let fake_tenants = { + let mut tenants = Vec::new(); + for i in 0..10 { + let distinct_id = format!( + "fake_tenant_{}_{}_{}", + conf.availability_zone.as_deref().unwrap_or_default(), + conf.id, + i + ); + let properties = Self::collect_properties_inner( + distinct_id.clone(), + Some(&internal_properties), + ); + tenants.push(CaptureEvent { + event: "initial_tenant_report".to_string(), + distinct_id, + properties: json!({ "$set": properties }), // use `$set` to set the person properties instead of the event properties + }); + } + tenants + }; + // TODO: make refresh period configurable + inner + .clone() + .spawn(handle, Duration::from_secs(60), fake_tenants); + Ok(FeatureResolver { + inner: Some(inner), + internal_properties: Some(internal_properties), + }) } else { - Ok(FeatureResolver { inner: None }) + Ok(FeatureResolver { + inner: None, + internal_properties: None, + }) } } + fn collect_properties_inner( + tenant_id: String, + internal_properties: Option<&HashMap>, + ) -> HashMap { + let mut properties = HashMap::new(); + if let Some(internal_properties) = internal_properties { + for (key, value) in internal_properties.iter() { + properties.insert(key.clone(), value.clone()); + } + } + properties.insert( + "tenant_id".to_string(), + PostHogFlagFilterPropertyValue::String(tenant_id), + ); + properties + } + + /// Collect all properties availble for the feature flag evaluation. + pub(crate) fn collect_properties( + &self, + tenant_id: TenantId, + ) -> HashMap { + Self::collect_properties_inner(tenant_id.to_string(), self.internal_properties.as_deref()) + } + /// Evaluate a multivariate feature flag. Currently, we do not support any properties. /// /// Error handling: the caller should inspect the error and decide the behavior when a feature flag @@ -58,7 +165,7 @@ impl FeatureResolver { let res = inner.feature_store().evaluate_multivariate( flag_key, &tenant_id.to_string(), - &HashMap::new(), + &self.collect_properties(tenant_id), ); match &res { Ok(value) => { @@ -96,7 +203,7 @@ impl FeatureResolver { let res = inner.feature_store().evaluate_boolean( flag_key, &tenant_id.to_string(), - &HashMap::new(), + &self.collect_properties(tenant_id), ); match &res { Ok(()) => { diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 1effa10404..c8a2a0209f 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -43,6 +43,7 @@ use pageserver_api::models::{ use pageserver_api::shard::{ShardCount, TenantShardId}; use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError}; use scopeguard::defer; +use serde_json::json; use tenant_size_model::svg::SvgBranchKind; use tenant_size_model::{SizeResult, StorageModel}; use tokio::time::Instant; @@ -3679,23 +3680,24 @@ async fn tenant_evaluate_feature_flag( let tenant = state .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; + let properties = tenant.feature_resolver.collect_properties(tenant_shard_id.tenant_id); if as_type == "boolean" { let result = tenant.feature_resolver.evaluate_boolean(&flag, tenant_shard_id.tenant_id); let result = result.map(|_| true).map_err(|e| e.to_string()); - json_response(StatusCode::OK, result) + json_response(StatusCode::OK, json!({ "result": result, "properties": properties })) } else if as_type == "multivariate" { let result = tenant.feature_resolver.evaluate_multivariate(&flag, tenant_shard_id.tenant_id).map_err(|e| e.to_string()); - json_response(StatusCode::OK, result) + json_response(StatusCode::OK, json!({ "result": result, "properties": properties })) } else { // Auto infer the type of the feature flag. let is_boolean = tenant.feature_resolver.is_feature_flag_boolean(&flag).map_err(|e| ApiError::InternalServerError(anyhow::anyhow!("{e}")))?; if is_boolean { let result = tenant.feature_resolver.evaluate_boolean(&flag, tenant_shard_id.tenant_id); let result = result.map(|_| true).map_err(|e| e.to_string()); - json_response(StatusCode::OK, result) + json_response(StatusCode::OK, json!({ "result": result, "properties": properties })) } else { let result = tenant.feature_resolver.evaluate_multivariate(&flag, tenant_shard_id.tenant_id).map_err(|e| e.to_string()); - json_response(StatusCode::OK, result) + json_response(StatusCode::OK, json!({ "result": result, "properties": properties })) } } } From 1577665c20c98ab82c9aa5edcd2e4972887b5e5f Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Thu, 5 Jun 2025 11:00:23 +0000 Subject: [PATCH 20/25] proxy: Move PGLB-related modules into pglb root module. (#12144) Split the modules responsible for passing data and connecting to compute from auth and waking for PGLB. This PR just moves files. The waking is going to get removed from pglb after this. --- proxy/src/auth/backend/console_redirect.rs | 2 +- proxy/src/auth/backend/mod.rs | 2 +- proxy/src/console_redirect_proxy.rs | 6 +++--- proxy/src/{proxy => pglb}/connect_compute.rs | 3 +-- proxy/src/{proxy => pglb}/copy_bidirectional.rs | 0 proxy/src/{proxy => pglb}/handshake.rs | 0 proxy/src/pglb/mod.rs | 4 ++++ proxy/src/{proxy => pglb}/passthrough.rs | 2 +- proxy/src/proxy/mod.rs | 14 +++++--------- proxy/src/proxy/tests/mod.rs | 2 +- proxy/src/proxy/wake_compute.rs | 2 +- proxy/src/serverless/backend.rs | 6 +++--- 12 files changed, 21 insertions(+), 22 deletions(-) rename proxy/src/{proxy => pglb}/connect_compute.rs (98%) rename proxy/src/{proxy => pglb}/copy_bidirectional.rs (100%) rename proxy/src/{proxy => pglb}/handshake.rs (100%) rename proxy/src/{proxy => pglb}/passthrough.rs (97%) diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index a50c30257f..c388848926 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -15,9 +15,9 @@ use crate::context::RequestContext; use crate::control_plane::client::cplane_proxy_v1; use crate::control_plane::{self, CachedNodeInfo, NodeInfo}; use crate::error::{ReportableError, UserFacingError}; +use crate::pglb::connect_compute::ComputeConnectBackend; use crate::pqproto::BeMessage; use crate::proxy::NeonOptions; -use crate::proxy::connect_compute::ComputeConnectBackend; use crate::stream::PqStream; use crate::types::RoleName; use crate::{auth, compute, waiters}; diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 735cb52f47..f978f655c4 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -25,9 +25,9 @@ use crate::control_plane::{ RoleAccessControl, }; use crate::intern::EndpointIdInt; +use crate::pglb::connect_compute::ComputeConnectBackend; use crate::pqproto::BeMessage; use crate::proxy::NeonOptions; -use crate::proxy::connect_compute::ComputeConnectBackend; use crate::rate_limiter::EndpointRateLimiter; use crate::stream::Stream; use crate::types::{EndpointCacheKey, EndpointId, RoleName}; diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 6755499b45..f2484b54b8 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -11,10 +11,10 @@ use crate::config::{ProxyConfig, ProxyProtocolV2}; use crate::context::RequestContext; use crate::error::ReportableError; use crate::metrics::{Metrics, NumClientConnectionsGuard}; +use crate::pglb::connect_compute::{TcpMechanism, connect_to_compute}; +use crate::pglb::handshake::{HandshakeData, handshake}; +use crate::pglb::passthrough::ProxyPassthrough; use crate::protocol2::{ConnectHeader, ConnectionInfo, read_proxy_protocol}; -use crate::proxy::connect_compute::{TcpMechanism, connect_to_compute}; -use crate::proxy::handshake::{HandshakeData, handshake}; -use crate::proxy::passthrough::ProxyPassthrough; use crate::proxy::{ ClientRequestError, ErrorSource, prepare_client_connection, run_until_cancelled, }; diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/pglb/connect_compute.rs similarity index 98% rename from proxy/src/proxy/connect_compute.rs rename to proxy/src/pglb/connect_compute.rs index 57785c9ec5..1d6ca5fbb3 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/pglb/connect_compute.rs @@ -2,7 +2,6 @@ use async_trait::async_trait; use tokio::time; use tracing::{debug, info, warn}; -use super::retry::ShouldRetryWakeCompute; use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; use crate::compute::{self, COULD_NOT_CONNECT, PostgresConnection}; use crate::config::{ComputeConfig, RetryConfig}; @@ -15,7 +14,7 @@ use crate::metrics::{ ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType, }; use crate::pqproto::StartupMessageParams; -use crate::proxy::retry::{CouldRetry, retry_after, should_retry}; +use crate::proxy::retry::{CouldRetry, ShouldRetryWakeCompute, retry_after, should_retry}; use crate::proxy::wake_compute::wake_compute; use crate::types::Host; diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/pglb/copy_bidirectional.rs similarity index 100% rename from proxy/src/proxy/copy_bidirectional.rs rename to proxy/src/pglb/copy_bidirectional.rs diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/pglb/handshake.rs similarity index 100% rename from proxy/src/proxy/handshake.rs rename to proxy/src/pglb/handshake.rs diff --git a/proxy/src/pglb/mod.rs b/proxy/src/pglb/mod.rs index 1088859fb9..4b107142a7 100644 --- a/proxy/src/pglb/mod.rs +++ b/proxy/src/pglb/mod.rs @@ -1 +1,5 @@ +pub mod connect_compute; +pub mod copy_bidirectional; +pub mod handshake; pub mod inprocess; +pub mod passthrough; diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/pglb/passthrough.rs similarity index 97% rename from proxy/src/proxy/passthrough.rs rename to proxy/src/pglb/passthrough.rs index 55ab5f4dba..6f651d383d 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/pglb/passthrough.rs @@ -53,7 +53,7 @@ pub(crate) async fn proxy_pass( // Starting from here we only proxy the client's traffic. debug!("performing the proxy pass..."); - let _ = crate::proxy::copy_bidirectional::copy_bidirectional_client_compute( + let _ = crate::pglb::copy_bidirectional::copy_bidirectional_client_compute( &mut client, &mut compute, ) diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 477baff1c9..0e138cc0c7 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -1,15 +1,10 @@ #[cfg(test)] mod tests; -pub(crate) mod connect_compute; -mod copy_bidirectional; -pub(crate) mod handshake; -pub(crate) mod passthrough; pub(crate) mod retry; pub(crate) mod wake_compute; use std::sync::Arc; -pub use copy_bidirectional::{ErrorSource, copy_bidirectional_client_compute}; use futures::FutureExt; use itertools::Itertools; use once_cell::sync::OnceCell; @@ -21,16 +16,17 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tokio_util::sync::CancellationToken; use tracing::{Instrument, debug, error, info, warn}; -use self::connect_compute::{TcpMechanism, connect_to_compute}; -use self::passthrough::ProxyPassthrough; use crate::cancellation::{self, CancellationHandler}; use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig}; use crate::context::RequestContext; use crate::error::{ReportableError, UserFacingError}; use crate::metrics::{Metrics, NumClientConnectionsGuard}; +use crate::pglb::connect_compute::{TcpMechanism, connect_to_compute}; +pub use crate::pglb::copy_bidirectional::{ErrorSource, copy_bidirectional_client_compute}; +use crate::pglb::handshake::{HandshakeData, HandshakeError, handshake}; +use crate::pglb::passthrough::ProxyPassthrough; use crate::pqproto::{BeMessage, CancelKeyData, StartupMessageParams}; use crate::protocol2::{ConnectHeader, ConnectionInfo, ConnectionInfoExtra, read_proxy_protocol}; -use crate::proxy::handshake::{HandshakeData, handshake}; use crate::rate_limiter::EndpointRateLimiter; use crate::stream::{PqStream, Stream}; use crate::types::EndpointCacheKey; @@ -242,7 +238,7 @@ pub(crate) enum ClientRequestError { #[error("{0}")] Cancellation(#[from] cancellation::CancelError), #[error("{0}")] - Handshake(#[from] handshake::HandshakeError), + Handshake(#[from] HandshakeError), #[error("{0}")] HandshakeTimeout(#[from] tokio::time::error::Elapsed), #[error("{0}")] diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 117c42e19c..e5db0013a7 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -17,7 +17,6 @@ use rustls::pki_types; use tokio::io::DuplexStream; use tracing_test::traced_test; -use super::connect_compute::ConnectMechanism; use super::retry::CouldRetry; use super::*; use crate::auth::backend::{ @@ -28,6 +27,7 @@ use crate::control_plane::client::{ControlPlaneClient, TestControlPlaneClient}; use crate::control_plane::messages::{ControlPlaneErrorMessage, Details, MetricsAuxInfo, Status}; use crate::control_plane::{self, CachedNodeInfo, NodeInfo, NodeInfoCache}; use crate::error::ErrorKind; +use crate::pglb::connect_compute::ConnectMechanism; use crate::tls::client_config::compute_client_config_with_certs; use crate::tls::postgres_rustls::MakeRustlsConnect; use crate::tls::server_config::CertResolver; diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index 9d8915e24a..06c2da58db 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -1,6 +1,5 @@ use tracing::{error, info}; -use super::connect_compute::ComputeConnectBackend; use crate::config::RetryConfig; use crate::context::RequestContext; use crate::control_plane::CachedNodeInfo; @@ -9,6 +8,7 @@ use crate::error::ReportableError; use crate::metrics::{ ConnectOutcome, ConnectionFailuresBreakdownGroup, Metrics, RetriesMetricGroup, RetryType, }; +use crate::pglb::connect_compute::ComputeConnectBackend; use crate::proxy::retry::{retry_after, should_retry}; // Use macro to retain original callsite. diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index bf640c05e9..748e0ce6f2 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -35,7 +35,7 @@ use crate::control_plane::errors::{GetAuthInfoError, WakeComputeError}; use crate::control_plane::locks::ApiLocks; use crate::error::{ErrorKind, ReportableError, UserFacingError}; use crate::intern::EndpointIdInt; -use crate::proxy::connect_compute::ConnectMechanism; +use crate::pglb::connect_compute::ConnectMechanism; use crate::proxy::retry::{CouldRetry, ShouldRetryWakeCompute}; use crate::rate_limiter::EndpointRateLimiter; use crate::types::{EndpointId, Host, LOCAL_PROXY_SUFFIX}; @@ -182,7 +182,7 @@ impl PoolingBackend { tracing::Span::current().record("conn_id", display(conn_id)); info!(%conn_id, "pool: opening a new connection '{conn_info}'"); let backend = self.auth_backend.as_ref().map(|()| keys); - crate::proxy::connect_compute::connect_to_compute( + crate::pglb::connect_compute::connect_to_compute( ctx, &TokioMechanism { conn_id, @@ -226,7 +226,7 @@ impl PoolingBackend { }, keys: crate::auth::backend::ComputeCredentialKeys::None, }); - crate::proxy::connect_compute::connect_to_compute( + crate::pglb::connect_compute::connect_to_compute( ctx, &HyperMechanism { conn_id, From 6123fe2d5e0e860fc3da74422d094927ecd6191e Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 5 Jun 2025 14:23:39 +0300 Subject: [PATCH 21/25] Add query execution time histogram (#10050) ## Problem It will be useful to understand what kind of queries our clients are executed. And one of the most important characteristic of query is query execution time - at least it allows to distinguish OLAP and OLTP queries. Also monitoring query execution time can help to detect problem with performance (assuming that workload is more or less stable). ## Summary of changes Add query execution time histogram. --------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/neon.c | 76 ++++++++++++ pgxn/neon/neon_perf_counters.c | 122 ++++++++++++++++---- pgxn/neon/neon_perf_counters.h | 28 +++++ test_runner/regress/test_compute_metrics.py | 5 + 4 files changed, 208 insertions(+), 23 deletions(-) diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index a6a7021756..5b4ced7cf0 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -16,6 +16,7 @@ #if PG_MAJORVERSION_NUM >= 15 #include "access/xlogrecovery.h" #endif +#include "executor/instrument.h" #include "replication/logical.h" #include "replication/logicallauncher.h" #include "replication/slot.h" @@ -33,6 +34,7 @@ #include "file_cache.h" #include "neon.h" #include "neon_lwlsncache.h" +#include "neon_perf_counters.h" #include "control_plane_connector.h" #include "logical_replication_monitor.h" #include "unstable_extensions.h" @@ -46,6 +48,13 @@ void _PG_init(void); static int running_xacts_overflow_policy; +static bool monitor_query_exec_time = false; + +static ExecutorStart_hook_type prev_ExecutorStart = NULL; +static ExecutorEnd_hook_type prev_ExecutorEnd = NULL; + +static void neon_ExecutorStart(QueryDesc *queryDesc, int eflags); +static void neon_ExecutorEnd(QueryDesc *queryDesc); #if PG_MAJORVERSION_NUM >= 16 static shmem_startup_hook_type prev_shmem_startup_hook; @@ -470,6 +479,16 @@ _PG_init(void) 0, NULL, NULL, NULL); + DefineCustomBoolVariable( + "neon.monitor_query_exec_time", + "Collect infortmation about query execution time", + NULL, + &monitor_query_exec_time, + false, + PGC_USERSET, + 0, + NULL, NULL, NULL); + DefineCustomBoolVariable( "neon.allow_replica_misconfig", "Allow replica startup when some critical GUCs have smaller value than on primary node", @@ -508,6 +527,11 @@ _PG_init(void) EmitWarningsOnPlaceholders("neon"); ReportSearchPath(); + + prev_ExecutorStart = ExecutorStart_hook; + ExecutorStart_hook = neon_ExecutorStart; + prev_ExecutorEnd = ExecutorEnd_hook; + ExecutorEnd_hook = neon_ExecutorEnd; } PG_FUNCTION_INFO_V1(pg_cluster_size); @@ -581,3 +605,55 @@ neon_shmem_startup_hook(void) #endif } #endif + +/* + * ExecutorStart hook: start up tracking if needed + */ +static void +neon_ExecutorStart(QueryDesc *queryDesc, int eflags) +{ + if (prev_ExecutorStart) + prev_ExecutorStart(queryDesc, eflags); + else + standard_ExecutorStart(queryDesc, eflags); + + if (monitor_query_exec_time) + { + /* + * Set up to track total elapsed time in ExecutorRun. Make sure the + * space is allocated in the per-query context so it will go away at + * ExecutorEnd. + */ + if (queryDesc->totaltime == NULL) + { + MemoryContext oldcxt; + + oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); + queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_TIMER, false); + MemoryContextSwitchTo(oldcxt); + } + } +} + +/* + * ExecutorEnd hook: store results if needed + */ +static void +neon_ExecutorEnd(QueryDesc *queryDesc) +{ + if (monitor_query_exec_time && queryDesc->totaltime) + { + /* + * Make sure stats accumulation is done. (Note: it's okay if several + * levels of hook all do this.) + */ + InstrEndLoop(queryDesc->totaltime); + + inc_query_time(queryDesc->totaltime->total*1000000); /* convert to usec */ + } + + if (prev_ExecutorEnd) + prev_ExecutorEnd(queryDesc); + else + standard_ExecutorEnd(queryDesc); +} diff --git a/pgxn/neon/neon_perf_counters.c b/pgxn/neon/neon_perf_counters.c index c77d99d636..d0a3d15108 100644 --- a/pgxn/neon/neon_perf_counters.c +++ b/pgxn/neon/neon_perf_counters.c @@ -71,6 +71,27 @@ inc_iohist(IOHistogram hist, uint64 latency_us) hist->wait_us_count++; } +static inline void +inc_qthist(QTHistogram hist, uint64 elapsed_us) +{ + int lo = 0; + int hi = NUM_QT_BUCKETS - 1; + + /* Find the right bucket with binary search */ + while (lo < hi) + { + int mid = (lo + hi) / 2; + + if (elapsed_us < qt_bucket_thresholds[mid]) + hi = mid; + else + lo = mid + 1; + } + hist->elapsed_us_bucket[lo]++; + hist->elapsed_us_sum += elapsed_us; + hist->elapsed_us_count++; +} + /* * Count a GetPage wait operation. */ @@ -98,6 +119,13 @@ inc_page_cache_write_wait(uint64 latency) inc_iohist(&MyNeonCounters->file_cache_write_hist, latency); } + +void +inc_query_time(uint64 elapsed) +{ + inc_qthist(&MyNeonCounters->query_time_hist, elapsed); +} + /* * Support functions for the views, neon_backend_perf_counters and * neon_perf_counters. @@ -112,11 +140,11 @@ typedef struct } metric_t; static int -histogram_to_metrics(IOHistogram histogram, - metric_t *metrics, - const char *count, - const char *sum, - const char *bucket) +io_histogram_to_metrics(IOHistogram histogram, + metric_t *metrics, + const char *count, + const char *sum, + const char *bucket) { int i = 0; uint64 bucket_accum = 0; @@ -145,10 +173,44 @@ histogram_to_metrics(IOHistogram histogram, return i; } +static int +qt_histogram_to_metrics(QTHistogram histogram, + metric_t *metrics, + const char *count, + const char *sum, + const char *bucket) +{ + int i = 0; + uint64 bucket_accum = 0; + + metrics[i].name = count; + metrics[i].is_bucket = false; + metrics[i].value = (double) histogram->elapsed_us_count; + i++; + metrics[i].name = sum; + metrics[i].is_bucket = false; + metrics[i].value = (double) histogram->elapsed_us_sum / 1000000.0; + i++; + for (int bucketno = 0; bucketno < NUM_QT_BUCKETS; bucketno++) + { + uint64 threshold = qt_bucket_thresholds[bucketno]; + + bucket_accum += histogram->elapsed_us_bucket[bucketno]; + + metrics[i].name = bucket; + metrics[i].is_bucket = true; + metrics[i].bucket_le = (threshold == UINT64_MAX) ? INFINITY : ((double) threshold) / 1000000.0; + metrics[i].value = (double) bucket_accum; + i++; + } + + return i; +} + static metric_t * neon_perf_counters_to_metrics(neon_per_backend_counters *counters) { -#define NUM_METRICS ((2 + NUM_IO_WAIT_BUCKETS) * 3 + 12) +#define NUM_METRICS ((2 + NUM_IO_WAIT_BUCKETS) * 3 + (2 + NUM_QT_BUCKETS) + 12) metric_t *metrics = palloc((NUM_METRICS + 1) * sizeof(metric_t)); int i = 0; @@ -159,10 +221,10 @@ neon_perf_counters_to_metrics(neon_per_backend_counters *counters) i++; \ } while (false) - i += histogram_to_metrics(&counters->getpage_hist, &metrics[i], - "getpage_wait_seconds_count", - "getpage_wait_seconds_sum", - "getpage_wait_seconds_bucket"); + i += io_histogram_to_metrics(&counters->getpage_hist, &metrics[i], + "getpage_wait_seconds_count", + "getpage_wait_seconds_sum", + "getpage_wait_seconds_bucket"); APPEND_METRIC(getpage_prefetch_requests_total); APPEND_METRIC(getpage_sync_requests_total); @@ -178,14 +240,19 @@ neon_perf_counters_to_metrics(neon_per_backend_counters *counters) APPEND_METRIC(file_cache_hits_total); - i += histogram_to_metrics(&counters->file_cache_read_hist, &metrics[i], - "file_cache_read_wait_seconds_count", - "file_cache_read_wait_seconds_sum", - "file_cache_read_wait_seconds_bucket"); - i += histogram_to_metrics(&counters->file_cache_write_hist, &metrics[i], - "file_cache_write_wait_seconds_count", - "file_cache_write_wait_seconds_sum", - "file_cache_write_wait_seconds_bucket"); + i += io_histogram_to_metrics(&counters->file_cache_read_hist, &metrics[i], + "file_cache_read_wait_seconds_count", + "file_cache_read_wait_seconds_sum", + "file_cache_read_wait_seconds_bucket"); + i += io_histogram_to_metrics(&counters->file_cache_write_hist, &metrics[i], + "file_cache_write_wait_seconds_count", + "file_cache_write_wait_seconds_sum", + "file_cache_write_wait_seconds_bucket"); + + i += qt_histogram_to_metrics(&counters->query_time_hist, &metrics[i], + "query_time_seconds_count", + "query_time_seconds_sum", + "query_time_seconds_bucket"); Assert(i == NUM_METRICS); @@ -257,7 +324,7 @@ neon_get_backend_perf_counters(PG_FUNCTION_ARGS) } static inline void -histogram_merge_into(IOHistogram into, IOHistogram from) +io_histogram_merge_into(IOHistogram into, IOHistogram from) { into->wait_us_count += from->wait_us_count; into->wait_us_sum += from->wait_us_sum; @@ -265,6 +332,15 @@ histogram_merge_into(IOHistogram into, IOHistogram from) into->wait_us_bucket[bucketno] += from->wait_us_bucket[bucketno]; } +static inline void +qt_histogram_merge_into(QTHistogram into, QTHistogram from) +{ + into->elapsed_us_count += from->elapsed_us_count; + into->elapsed_us_sum += from->elapsed_us_sum; + for (int bucketno = 0; bucketno < NUM_QT_BUCKETS; bucketno++) + into->elapsed_us_bucket[bucketno] += from->elapsed_us_bucket[bucketno]; +} + PG_FUNCTION_INFO_V1(neon_get_perf_counters); Datum neon_get_perf_counters(PG_FUNCTION_ARGS) @@ -283,7 +359,7 @@ neon_get_perf_counters(PG_FUNCTION_ARGS) { neon_per_backend_counters *counters = &neon_per_backend_counters_shared[procno]; - histogram_merge_into(&totals.getpage_hist, &counters->getpage_hist); + io_histogram_merge_into(&totals.getpage_hist, &counters->getpage_hist); totals.getpage_prefetch_requests_total += counters->getpage_prefetch_requests_total; totals.getpage_sync_requests_total += counters->getpage_sync_requests_total; totals.getpage_prefetch_misses_total += counters->getpage_prefetch_misses_total; @@ -294,13 +370,13 @@ neon_get_perf_counters(PG_FUNCTION_ARGS) totals.pageserver_open_requests += counters->pageserver_open_requests; totals.getpage_prefetches_buffered += counters->getpage_prefetches_buffered; totals.file_cache_hits_total += counters->file_cache_hits_total; - histogram_merge_into(&totals.file_cache_read_hist, &counters->file_cache_read_hist); - histogram_merge_into(&totals.file_cache_write_hist, &counters->file_cache_write_hist); - totals.compute_getpage_stuck_requests_total += counters->compute_getpage_stuck_requests_total; totals.compute_getpage_max_inflight_stuck_time_ms = Max( totals.compute_getpage_max_inflight_stuck_time_ms, counters->compute_getpage_max_inflight_stuck_time_ms); + io_histogram_merge_into(&totals.file_cache_read_hist, &counters->file_cache_read_hist); + io_histogram_merge_into(&totals.file_cache_write_hist, &counters->file_cache_write_hist); + qt_histogram_merge_into(&totals.query_time_hist, &counters->query_time_hist); } metrics = neon_perf_counters_to_metrics(&totals); diff --git a/pgxn/neon/neon_perf_counters.h b/pgxn/neon/neon_perf_counters.h index 10cf094d4a..4b611b0636 100644 --- a/pgxn/neon/neon_perf_counters.h +++ b/pgxn/neon/neon_perf_counters.h @@ -36,6 +36,28 @@ typedef struct IOHistogramData typedef IOHistogramData *IOHistogram; +static const uint64 qt_bucket_thresholds[] = { + 2, 3, 6, 10, /* 0 us - 10 us */ + 20, 30, 60, 100, /* 10 us - 100 us */ + 200, 300, 600, 1000, /* 100 us - 1 ms */ + 2000, 3000, 6000, 10000, /* 1 ms - 10 ms */ + 20000, 30000, 60000, 100000, /* 10 ms - 100 ms */ + 200000, 300000, 600000, 1000000, /* 100 ms - 1 s */ + 2000000, 3000000, 6000000, 10000000, /* 1 s - 10 s */ + 20000000, 30000000, 60000000, 100000000, /* 10 s - 100 s */ + UINT64_MAX, +}; +#define NUM_QT_BUCKETS (lengthof(qt_bucket_thresholds)) + +typedef struct QTHistogramData +{ + uint64 elapsed_us_count; + uint64 elapsed_us_sum; + uint64 elapsed_us_bucket[NUM_QT_BUCKETS]; +} QTHistogramData; + +typedef QTHistogramData *QTHistogram; + typedef struct { /* @@ -127,6 +149,11 @@ typedef struct /* LFC I/O time buckets */ IOHistogramData file_cache_read_hist; IOHistogramData file_cache_write_hist; + + /* + * Histogram of query execution time. + */ + QTHistogramData query_time_hist; } neon_per_backend_counters; /* Pointer to the shared memory array of neon_per_backend_counters structs */ @@ -149,6 +176,7 @@ extern neon_per_backend_counters *neon_per_backend_counters_shared; extern void inc_getpage_wait(uint64 latency); extern void inc_page_cache_read_wait(uint64 latency); extern void inc_page_cache_write_wait(uint64 latency); +extern void inc_query_time(uint64 elapsed); extern Size NeonPerfCountersShmemSize(void); extern void NeonPerfCountersShmemInit(void); diff --git a/test_runner/regress/test_compute_metrics.py b/test_runner/regress/test_compute_metrics.py index 2cb2ee7b58..c751a3e7cc 100644 --- a/test_runner/regress/test_compute_metrics.py +++ b/test_runner/regress/test_compute_metrics.py @@ -466,8 +466,13 @@ def test_perf_counters(neon_simple_env: NeonEnv): # # 1.5 is the minimum version to contain these views. cur.execute("CREATE EXTENSION neon VERSION '1.5'") + cur.execute("set neon.monitor_query_exec_time = on") cur.execute("SELECT * FROM neon_perf_counters") cur.execute("SELECT * FROM neon_backend_perf_counters") + cur.execute( + "select value from neon_backend_perf_counters where metric='query_time_seconds_count' and pid=pg_backend_pid()" + ) + assert cur.fetchall()[0][0] == 2 def collect_metric( From 9c6c7802012cd4f5d1ed24aeaeb5e60f28075922 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 5 Jun 2025 14:27:14 +0300 Subject: [PATCH 22/25] Replica promote (#12090) ## Problem This PR is part of larger computes support activity: https://www.notion.so/neondatabase/Larger-computes-114f189e00478080ba01e8651ab7da90 Epic: https://github.com/neondatabase/cloud/issues/19010 In case of planned node restart, we are going to 1. create new read-only replica 2. capture LFC state at primary 3. use this state to prewarm replica 4. stop old primary 5. promote replica to primary Steps 1-3 are currently implemented and support from compute side. This PR provides compute level implementation of replica promotion. Support replica promotion ## Summary of changes Right now replica promotion is done in three steps: 1. Set safekeepers list (now it is empty for replica) 2. Call `pg_promote()` top promote replica 3. Update endpoint setting to that it ids not more treated as replica. May be all this three steps should be done by some function in compute_ctl. But right now this logic is only implement5ed in test. Postgres submodules PRs: https://github.com/neondatabase/postgres/pull/648 https://github.com/neondatabase/postgres/pull/649 https://github.com/neondatabase/postgres/pull/650 https://github.com/neondatabase/postgres/pull/651 --------- Co-authored-by: Matthias van de Meent Co-authored-by: Konstantin Knizhnik --- libs/walproposer/src/api_bindings.rs | 1 + pgxn/neon/neon_pgversioncompat.c | 8 ++ pgxn/neon/neon_pgversioncompat.h | 1 + pgxn/neon/neon_walreader.c | 15 ++- pgxn/neon/neon_walreader.h | 3 +- pgxn/neon/walproposer.c | 3 +- pgxn/neon/walproposer.h | 4 + pgxn/neon/walproposer_pg.c | 26 ++-- pgxn/neon/walsender_hooks.c | 4 +- test_runner/fixtures/neon_fixtures.py | 4 +- test_runner/regress/test_hot_standby.py | 9 +- test_runner/regress/test_replica_promotes.py | 133 +++++++++++++++++++ vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 8 +- 17 files changed, 199 insertions(+), 28 deletions(-) create mode 100644 test_runner/regress/test_replica_promotes.py diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index d660602149..4d6cbae9a9 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -439,6 +439,7 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState { currentClusterSize: crate::bindings::pg_atomic_uint64 { value: 0 }, shard_ps_feedback: [empty_feedback; 128], num_shards: 0, + replica_promote: false, min_ps_feedback: empty_feedback, } } diff --git a/pgxn/neon/neon_pgversioncompat.c b/pgxn/neon/neon_pgversioncompat.c index 7c404fb5a9..6f57b618da 100644 --- a/pgxn/neon/neon_pgversioncompat.c +++ b/pgxn/neon/neon_pgversioncompat.c @@ -5,6 +5,7 @@ #include "funcapi.h" #include "miscadmin.h" +#include "access/xlog.h" #include "utils/tuplestore.h" #include "neon_pgversioncompat.h" @@ -41,5 +42,12 @@ InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags) rsinfo->setDesc = stored_tupdesc; MemoryContextSwitchTo(old_context); } + +TimeLineID GetWALInsertionTimeLine(void) +{ + return ThisTimeLineID + 1; +} + + #endif diff --git a/pgxn/neon/neon_pgversioncompat.h b/pgxn/neon/neon_pgversioncompat.h index bf91a02b45..787bd552f8 100644 --- a/pgxn/neon/neon_pgversioncompat.h +++ b/pgxn/neon/neon_pgversioncompat.h @@ -162,6 +162,7 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode, #if PG_MAJORVERSION_NUM < 15 extern void InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags); +extern TimeLineID GetWALInsertionTimeLine(void); #endif #endif /* NEON_PGVERSIONCOMPAT_H */ diff --git a/pgxn/neon/neon_walreader.c b/pgxn/neon/neon_walreader.c index d5e3a38dbb..0a1f6d9c72 100644 --- a/pgxn/neon/neon_walreader.c +++ b/pgxn/neon/neon_walreader.c @@ -69,6 +69,7 @@ struct NeonWALReader WALSegmentContext segcxt; WALOpenSegment seg; int wre_errno; + TimeLineID local_active_tlid; /* Explains failure to read, static for simplicity. */ char err_msg[NEON_WALREADER_ERR_MSG_LEN]; @@ -106,7 +107,7 @@ struct NeonWALReader /* palloc and initialize NeonWALReader */ NeonWALReader * -NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix) +NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix, TimeLineID tlid) { NeonWALReader *reader; @@ -118,6 +119,7 @@ NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_ MemoryContextAllocZero(TopMemoryContext, sizeof(NeonWALReader)); reader->available_lsn = available_lsn; + reader->local_active_tlid = tlid; reader->seg.ws_file = -1; reader->seg.ws_segno = 0; reader->seg.ws_tli = 0; @@ -577,6 +579,17 @@ NeonWALReaderIsRemConnEstablished(NeonWALReader *state) return state->rem_state == RS_ESTABLISHED; } +/* + * Whether remote connection is established. Once this is done, until successful + * local read or error socket is stable and user can update socket events + * instead of readding it each time. + */ +TimeLineID +NeonWALReaderLocalActiveTimeLineID(NeonWALReader *state) +{ + return state->local_active_tlid; +} + /* * Returns events user should wait on connection socket or 0 if remote * connection is not active. diff --git a/pgxn/neon/neon_walreader.h b/pgxn/neon/neon_walreader.h index 3e41825069..722bc10537 100644 --- a/pgxn/neon/neon_walreader.h +++ b/pgxn/neon/neon_walreader.h @@ -19,9 +19,10 @@ typedef enum NEON_WALREAD_ERROR, } NeonWALReadResult; -extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix); +extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix, TimeLineID tlid); extern void NeonWALReaderFree(NeonWALReader *state); extern void NeonWALReaderResetRemote(NeonWALReader *state); +extern TimeLineID NeonWALReaderLocalActiveTimeLineID(NeonWALReader *state); extern NeonWALReadResult NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli); extern pgsocket NeonWALReaderSocket(NeonWALReader *state); extern uint32 NeonWALReaderEvents(NeonWALReader *state); diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index c63edb1398..91d39345e2 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -98,6 +98,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) wp = palloc0(sizeof(WalProposer)); wp->config = config; wp->api = api; + wp->localTimeLineID = config->pgTimeline; wp->state = WPS_COLLECTING_TERMS; wp->mconf.generation = INVALID_GENERATION; wp->mconf.members.len = 0; @@ -1384,7 +1385,7 @@ ProcessPropStartPos(WalProposer *wp) * we must bail out, as clog and other non rel data is inconsistent. */ walprop_shared = wp->api.get_shmem_state(wp); - if (!wp->config->syncSafekeepers) + if (!wp->config->syncSafekeepers && !walprop_shared->replica_promote) { /* * Basebackup LSN always points to the beginning of the record (not diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index cca20e746b..08087e5a55 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -391,6 +391,7 @@ typedef struct WalproposerShmemState /* last feedback from each shard */ PageserverFeedback shard_ps_feedback[MAX_SHARDS]; int num_shards; + bool replica_promote; /* aggregated feedback with min LSNs across shards */ PageserverFeedback min_ps_feedback; @@ -806,6 +807,9 @@ typedef struct WalProposer /* Safekeepers walproposer is connecting to. */ Safekeeper safekeeper[MAX_SAFEKEEPERS]; + /* Current local TimeLineId in use */ + TimeLineID localTimeLineID; + /* WAL has been generated up to this point */ XLogRecPtr availableLsn; diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index c90702a282..3d6a92ad79 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -35,6 +35,7 @@ #include "storage/proc.h" #include "storage/ipc.h" #include "storage/lwlock.h" +#include "storage/pg_shmem.h" #include "storage/shmem.h" #include "storage/spin.h" #include "tcop/tcopprot.h" @@ -159,12 +160,19 @@ WalProposerMain(Datum main_arg) { WalProposer *wp; + if (*wal_acceptors_list == '\0') + { + wpg_log(WARNING, "Safekeepers list is empty"); + return; + } + init_walprop_config(false); walprop_pg_init_bgworker(); am_walproposer = true; walprop_pg_load_libpqwalreceiver(); wp = WalProposerCreate(&walprop_config, walprop_pg); + wp->localTimeLineID = GetWALInsertionTimeLine(); wp->last_reconnect_attempt = walprop_pg_get_current_timestamp(wp); walprop_pg_init_walsender(); @@ -350,6 +358,9 @@ assign_neon_safekeepers(const char *newval, void *extra) char *newval_copy; char *oldval; + if (newval && *newval != '\0' && UsedShmemSegAddr && walprop_shared && RecoveryInProgress()) + walprop_shared->replica_promote = true; + if (!am_walproposer) return; @@ -540,16 +551,15 @@ BackpressureThrottlingTime(void) /* * Register a background worker proposing WAL to wal acceptors. + * We start walproposer bgworker even for replicas in order to support possible replica promotion. + * When pg_promote() function is called, then walproposer bgworker registered with BgWorkerStart_RecoveryFinished + * is automatically launched when promotion is completed. */ static void walprop_register_bgworker(void) { BackgroundWorker bgw; - /* If no wal acceptors are specified, don't start the background worker. */ - if (*wal_acceptors_list == '\0') - return; - memset(&bgw, 0, sizeof(bgw)); bgw.bgw_flags = BGWORKER_SHMEM_ACCESS; bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; @@ -1326,9 +1336,7 @@ StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd) #if PG_VERSION_NUM < 150000 if (ThisTimeLineID == 0) - ereport(ERROR, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("IDENTIFY_SYSTEM has not been run before START_REPLICATION"))); + ThisTimeLineID = 1; #endif /* @@ -1542,7 +1550,7 @@ walprop_pg_wal_reader_allocate(Safekeeper *sk) snprintf(log_prefix, sizeof(log_prefix), WP_LOG_PREFIX "sk %s:%s nwr: ", sk->host, sk->port); Assert(!sk->xlogreader); - sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propTermStartLsn, log_prefix); + sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propTermStartLsn, log_prefix, sk->wp->localTimeLineID); if (sk->xlogreader == NULL) wpg_log(FATAL, "failed to allocate xlog reader"); } @@ -1556,7 +1564,7 @@ walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count, buf, startptr, count, - walprop_pg_get_timeline_id()); + sk->wp->localTimeLineID); if (res == NEON_WALREAD_SUCCESS) { diff --git a/pgxn/neon/walsender_hooks.c b/pgxn/neon/walsender_hooks.c index 81198d6c8d..534bf1c19b 100644 --- a/pgxn/neon/walsender_hooks.c +++ b/pgxn/neon/walsender_hooks.c @@ -111,7 +111,7 @@ NeonWALPageRead( readBuf, targetPagePtr, count, - walprop_pg_get_timeline_id()); + NeonWALReaderLocalActiveTimeLineID(wal_reader)); if (res == NEON_WALREAD_SUCCESS) { @@ -202,7 +202,7 @@ NeonOnDemandXLogReaderRoutines(XLogReaderRoutine *xlr) { elog(ERROR, "unable to start walsender when basebackupLsn is 0"); } - wal_reader = NeonWALReaderAllocate(wal_segment_size, basebackupLsn, "[walsender] "); + wal_reader = NeonWALReaderAllocate(wal_segment_size, basebackupLsn, "[walsender] ", 1); } xlr->page_read = NeonWALPageRead; xlr->segment_open = NeonWALReadSegmentOpen; diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 19d12da5e3..f9337bed89 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4711,7 +4711,7 @@ class EndpointFactory: origin: Endpoint, endpoint_id: str | None = None, config_lines: list[str] | None = None, - ): + ) -> Endpoint: branch_name = origin.branch_name assert origin in self.endpoints assert branch_name is not None @@ -4730,7 +4730,7 @@ class EndpointFactory: origin: Endpoint, endpoint_id: str | None = None, config_lines: list[str] | None = None, - ): + ) -> Endpoint: branch_name = origin.branch_name assert origin in self.endpoints assert branch_name is not None diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index 4044f25b37..1ff61ce8dc 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -74,8 +74,9 @@ def test_hot_standby(neon_simple_env: NeonEnv): for query in queries: with s_con.cursor() as secondary_cursor: secondary_cursor.execute(query) - response = secondary_cursor.fetchone() - assert response is not None + res = secondary_cursor.fetchone() + assert res is not None + response = res assert response == responses[query] # Check for corrupted WAL messages which might otherwise go unnoticed if @@ -164,7 +165,7 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool): s_cur.execute("SELECT COUNT(*) FROM test") res = s_cur.fetchone() - assert res[0] == 10000 + assert res == (10000,) # Clear the cache in the standby, so that when we # re-execute the query, it will make GetPage @@ -195,7 +196,7 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool): s_cur.execute("SELECT COUNT(*) FROM test") log_replica_lag(primary, secondary) res = s_cur.fetchone() - assert res[0] == 10000 + assert res == (10000,) def run_pgbench(connstr: str, pg_bin: PgBin): diff --git a/test_runner/regress/test_replica_promotes.py b/test_runner/regress/test_replica_promotes.py new file mode 100644 index 0000000000..e378d37635 --- /dev/null +++ b/test_runner/regress/test_replica_promotes.py @@ -0,0 +1,133 @@ +""" +File with secondary->primary promotion testing. + +This far, only contains a test that we don't break and that the data is persisted. +""" + +import psycopg2 +from fixtures.log_helper import log +from fixtures.neon_fixtures import Endpoint, NeonEnv, wait_replica_caughtup +from fixtures.pg_version import PgVersion +from pytest import raises + + +def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion): + """ + Test that a replica safely promotes, and can commit data updates which + show up when the primary boots up after the promoted secondary endpoint + shut down. + """ + + # Initialize the primary, a test table, and a helper function to create lots + # of subtransactions. + env: NeonEnv = neon_simple_env + primary: Endpoint = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + secondary: Endpoint = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") + + with primary.connect() as primary_conn: + primary_cur = primary_conn.cursor() + primary_cur.execute( + "create table t(pk bigint GENERATED ALWAYS AS IDENTITY, payload integer)" + ) + primary_cur.execute("INSERT INTO t(payload) SELECT generate_series(1, 100)") + primary_cur.execute( + """ + SELECT pg_current_wal_insert_lsn(), + pg_current_wal_lsn(), + pg_current_wal_flush_lsn() + """ + ) + log.info(f"Primary: Current LSN after workload is {primary_cur.fetchone()}") + primary_cur.execute("show neon.safekeepers") + safekeepers = primary_cur.fetchall()[0][0] + + wait_replica_caughtup(primary, secondary) + + with secondary.connect() as secondary_conn: + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select count(*) from t") + + assert secondary_cur.fetchone() == (100,) + + with raises(psycopg2.Error): + secondary_cur.execute("INSERT INTO t (payload) SELECT generate_series(101, 200)") + secondary_conn.commit() + + secondary_conn.rollback() + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (100,) + + primary.stop_and_destroy(mode="immediate") + + # Reconnect to the secondary to make sure we get a read-write connection + promo_conn = secondary.connect() + promo_cur = promo_conn.cursor() + promo_cur.execute(f"alter system set neon.safekeepers='{safekeepers}'") + promo_cur.execute("select pg_reload_conf()") + + promo_cur.execute("SELECT * FROM pg_promote()") + assert promo_cur.fetchone() == (True,) + promo_cur.execute( + """ + SELECT pg_current_wal_insert_lsn(), + pg_current_wal_lsn(), + pg_current_wal_flush_lsn() + """ + ) + log.info(f"Secondary: LSN after promotion is {promo_cur.fetchone()}") + + # Reconnect to the secondary to make sure we get a read-write connection + with secondary.connect() as new_primary_conn: + new_primary_cur = new_primary_conn.cursor() + new_primary_cur.execute("select count(*) from t") + assert new_primary_cur.fetchone() == (100,) + + new_primary_cur.execute( + "INSERT INTO t (payload) SELECT generate_series(101, 200) RETURNING payload" + ) + assert new_primary_cur.fetchall() == [(it,) for it in range(101, 201)] + + new_primary_cur = new_primary_conn.cursor() + new_primary_cur.execute("select payload from t") + assert new_primary_cur.fetchall() == [(it,) for it in range(1, 201)] + + new_primary_cur.execute("select count(*) from t") + assert new_primary_cur.fetchone() == (200,) + new_primary_cur.execute( + """ + SELECT pg_current_wal_insert_lsn(), + pg_current_wal_lsn(), + pg_current_wal_flush_lsn() + """ + ) + log.info(f"Secondary: LSN after workload is {new_primary_cur.fetchone()}") + + with secondary.connect() as second_viewpoint_conn: + new_primary_cur = second_viewpoint_conn.cursor() + new_primary_cur.execute("select payload from t") + assert new_primary_cur.fetchall() == [(it,) for it in range(1, 201)] + + # wait_for_last_flush_lsn(env, secondary, env.initial_tenant, env.initial_timeline) + + secondary.stop_and_destroy() + + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + + with primary.connect() as new_primary: + new_primary_cur = new_primary.cursor() + new_primary_cur.execute( + """ + SELECT pg_current_wal_insert_lsn(), + pg_current_wal_lsn(), + pg_current_wal_flush_lsn() + """ + ) + log.info(f"New primary: Boot LSN is {new_primary_cur.fetchone()}") + + new_primary_cur.execute("select count(*) from t") + assert new_primary_cur.fetchone() == (200,) + new_primary_cur.execute("INSERT INTO t (payload) SELECT generate_series(201, 300)") + new_primary_cur.execute("select count(*) from t") + assert new_primary_cur.fetchone() == (300,) + + primary.stop(mode="immediate") diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 55c0d45abe..6770bc2513 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 55c0d45abe6467c02084c2192bca117eda6ce1e7 +Subproject commit 6770bc251301ef40c66f7ecb731741dc435b5051 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index de7640f55d..8c3249f36c 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit de7640f55da07512834d5cc40c4b3fb376b5f04f +Subproject commit 8c3249f36c7df6ac0efb8ee9f1baf4aa1b83e5c9 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 0bf96bd6d7..7a4c0eacae 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 0bf96bd6d70301a0b43b0b3457bb3cf8fb43c198 +Subproject commit 7a4c0eacaeb9b97416542fa19103061c166460b1 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 8be779fd3a..db424d42d7 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 8be779fd3ab9e87206da96a7e4842ef1abf04f44 +Subproject commit db424d42d748f8ad91ac00e28db2c7f2efa42f7f diff --git a/vendor/revisions.json b/vendor/revisions.json index 3e999760f4..12d5499ddb 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.5", - "8be779fd3ab9e87206da96a7e4842ef1abf04f44" + "db424d42d748f8ad91ac00e28db2c7f2efa42f7f" ], "v16": [ "16.9", - "0bf96bd6d70301a0b43b0b3457bb3cf8fb43c198" + "7a4c0eacaeb9b97416542fa19103061c166460b1" ], "v15": [ "15.13", - "de7640f55da07512834d5cc40c4b3fb376b5f04f" + "8c3249f36c7df6ac0efb8ee9f1baf4aa1b83e5c9" ], "v14": [ "14.18", - "55c0d45abe6467c02084c2192bca117eda6ce1e7" + "6770bc251301ef40c66f7ecb731741dc435b5051" ] } From 868f194a3b7cf2463c86e1fbd0e83360a40d31ef Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 5 Jun 2025 12:43:04 +0100 Subject: [PATCH 23/25] pageserver: remove handling of vanilla protocol (#12126) ## Problem We support two ingest protocols on the pageserver: vanilla and interpreted. Interpreted has been the only protocol in use for a long time. ## Summary of changes * Remove the ingest handling of the vanilla protocol * Remove tenant and pageserver configuration for it * Update all tests that tweaked the ingest protocol ## Compatibility Backward compatibility: * The new pageserver version can read the existing pageserver configuration and it will ignore the unknown field. * When the tenant config is read from the storcon db or from the pageserver disk, the extra field will be ignored. Forward compatiblity: * Both the pageserver config and the tenant config map missing fields to their default value. I'm not aware of any tenant level override that was made for this knob. --- control_plane/src/pageserver.rs | 5 - libs/pageserver_api/src/config.rs | 12 -- libs/pageserver_api/src/models.rs | 14 -- pageserver/src/bin/pageserver.rs | 1 - pageserver/src/config.rs | 5 - pageserver/src/metrics.rs | 6 - pageserver/src/tenant/timeline.rs | 23 +-- .../walreceiver/connection_manager.rs | 32 ++-- .../walreceiver/walreceiver_connection.rs | 161 +----------------- test_runner/fixtures/neon_fixtures.py | 39 ----- .../performance/test_sharded_ingest.py | 39 ----- .../regress/test_attach_tenant_config.py | 4 - test_runner/regress/test_compaction.py | 8 - test_runner/regress/test_crafted_wal_end.py | 13 +- test_runner/regress/test_subxacts.py | 10 +- test_runner/regress/test_tenant_conf.py | 8 +- .../regress/test_wal_acceptor_async.py | 10 +- 17 files changed, 39 insertions(+), 351 deletions(-) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 29314dab9e..0cf7ca184d 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -513,11 +513,6 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'timeline_offloading' as bool")?, - wal_receiver_protocol_override: settings - .remove("wal_receiver_protocol_override") - .map(serde_json::from_str) - .transpose() - .context("parse `wal_receiver_protocol_override` from json")?, rel_size_v2_enabled: settings .remove("rel_size_v2_enabled") .map(|x| x.parse::()) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 46903965b1..30b0612082 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -20,7 +20,6 @@ use postgres_backend::AuthType; use remote_storage::RemoteStorageConfig; use serde_with::serde_as; use utils::logging::LogFormat; -use utils::postgres_client::PostgresClientProtocol; use crate::models::{ImageCompressionAlgorithm, LsnLease}; @@ -189,7 +188,6 @@ pub struct ConfigToml { pub virtual_file_io_mode: Option, #[serde(skip_serializing_if = "Option::is_none")] pub no_sync: Option, - pub wal_receiver_protocol: PostgresClientProtocol, pub page_service_pipelining: PageServicePipeliningConfig, pub get_vectored_concurrent_io: GetVectoredConcurrentIo, pub enable_read_path_debugging: Option, @@ -527,8 +525,6 @@ pub struct TenantConfigToml { /// (either this flag or the pageserver-global one need to be set) pub timeline_offloading: bool, - pub wal_receiver_protocol_override: Option, - /// Enable rel_size_v2 for this tenant. Once enabled, the tenant will persist this information into /// `index_part.json`, and it cannot be reversed. pub rel_size_v2_enabled: bool, @@ -609,12 +605,6 @@ pub mod defaults { pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512; - pub const DEFAULT_WAL_RECEIVER_PROTOCOL: utils::postgres_client::PostgresClientProtocol = - utils::postgres_client::PostgresClientProtocol::Interpreted { - format: utils::postgres_client::InterpretedFormat::Protobuf, - compression: Some(utils::postgres_client::Compression::Zstd { level: 1 }), - }; - pub const DEFAULT_SSL_KEY_FILE: &str = "server.key"; pub const DEFAULT_SSL_CERT_FILE: &str = "server.crt"; } @@ -713,7 +703,6 @@ impl Default for ConfigToml { virtual_file_io_mode: None, tenant_config: TenantConfigToml::default(), no_sync: None, - wal_receiver_protocol: DEFAULT_WAL_RECEIVER_PROTOCOL, page_service_pipelining: PageServicePipeliningConfig::Pipelined( PageServicePipeliningConfigPipelined { max_batch_size: NonZeroUsize::new(32).unwrap(), @@ -858,7 +847,6 @@ impl Default for TenantConfigToml { lsn_lease_length: LsnLease::DEFAULT_LENGTH, lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS, timeline_offloading: true, - wal_receiver_protocol_override: None, rel_size_v2_enabled: false, gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED, gc_compaction_verification: DEFAULT_GC_COMPACTION_VERIFICATION, diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 28ced4a368..881f24b86c 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -20,7 +20,6 @@ use serde_with::serde_as; pub use utilization::PageserverUtilization; use utils::id::{NodeId, TenantId, TimelineId}; use utils::lsn::Lsn; -use utils::postgres_client::PostgresClientProtocol; use utils::{completion, serde_system_time}; use crate::config::Ratio; @@ -622,8 +621,6 @@ pub struct TenantConfigPatch { #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub timeline_offloading: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] - pub wal_receiver_protocol_override: FieldPatch, - #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub rel_size_v2_enabled: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub gc_compaction_enabled: FieldPatch, @@ -748,9 +745,6 @@ pub struct TenantConfig { #[serde(skip_serializing_if = "Option::is_none")] pub timeline_offloading: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub wal_receiver_protocol_override: Option, - #[serde(skip_serializing_if = "Option::is_none")] pub rel_size_v2_enabled: Option, @@ -812,7 +806,6 @@ impl TenantConfig { mut lsn_lease_length, mut lsn_lease_length_for_ts, mut timeline_offloading, - mut wal_receiver_protocol_override, mut rel_size_v2_enabled, mut gc_compaction_enabled, mut gc_compaction_verification, @@ -905,9 +898,6 @@ impl TenantConfig { .map(|v| humantime::parse_duration(&v))? .apply(&mut lsn_lease_length_for_ts); patch.timeline_offloading.apply(&mut timeline_offloading); - patch - .wal_receiver_protocol_override - .apply(&mut wal_receiver_protocol_override); patch.rel_size_v2_enabled.apply(&mut rel_size_v2_enabled); patch .gc_compaction_enabled @@ -960,7 +950,6 @@ impl TenantConfig { lsn_lease_length, lsn_lease_length_for_ts, timeline_offloading, - wal_receiver_protocol_override, rel_size_v2_enabled, gc_compaction_enabled, gc_compaction_verification, @@ -1058,9 +1047,6 @@ impl TenantConfig { timeline_offloading: self .timeline_offloading .unwrap_or(global_conf.timeline_offloading), - wal_receiver_protocol_override: self - .wal_receiver_protocol_override - .or(global_conf.wal_receiver_protocol_override), rel_size_v2_enabled: self .rel_size_v2_enabled .unwrap_or(global_conf.rel_size_v2_enabled), diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index be7e634d4c..a1a95ad2d1 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -158,7 +158,6 @@ fn main() -> anyhow::Result<()> { // (maybe we should automate this with a visitor?). info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode"); - info!(?conf.wal_receiver_protocol, "starting with WAL receiver protocol"); info!(?conf.validate_wal_contiguity, "starting with WAL contiguity validation"); info!(?conf.page_service_pipelining, "starting with page service pipelining config"); info!(?conf.get_vectored_concurrent_io, "starting with get_vectored IO concurrency config"); diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 628d4f6021..3492a8d966 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -27,7 +27,6 @@ use reqwest::Url; use storage_broker::Uri; use utils::id::{NodeId, TimelineId}; use utils::logging::{LogFormat, SecretString}; -use utils::postgres_client::PostgresClientProtocol; use crate::tenant::storage_layer::inmemory_layer::IndexEntry; use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; @@ -211,8 +210,6 @@ pub struct PageServerConf { /// Optionally disable disk syncs (unsafe!) pub no_sync: bool, - pub wal_receiver_protocol: PostgresClientProtocol, - pub page_service_pipelining: pageserver_api::config::PageServicePipeliningConfig, pub get_vectored_concurrent_io: pageserver_api::config::GetVectoredConcurrentIo, @@ -421,7 +418,6 @@ impl PageServerConf { virtual_file_io_engine, tenant_config, no_sync, - wal_receiver_protocol, page_service_pipelining, get_vectored_concurrent_io, enable_read_path_debugging, @@ -484,7 +480,6 @@ impl PageServerConf { import_pgdata_upcall_api, import_pgdata_upcall_api_token: import_pgdata_upcall_api_token.map(SecretString::from), import_pgdata_aws_endpoint_url, - wal_receiver_protocol, page_service_pipelining, get_vectored_concurrent_io, tracing, diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 3173ab221f..3eb70ffac2 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -2855,7 +2855,6 @@ pub(crate) struct WalIngestMetrics { pub(crate) records_received: IntCounter, pub(crate) records_observed: IntCounter, pub(crate) records_committed: IntCounter, - pub(crate) records_filtered: IntCounter, pub(crate) values_committed_metadata_images: IntCounter, pub(crate) values_committed_metadata_deltas: IntCounter, pub(crate) values_committed_data_images: IntCounter, @@ -2911,11 +2910,6 @@ pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| { "Number of WAL records which resulted in writes to pageserver storage" ) .expect("failed to define a metric"), - records_filtered: register_int_counter!( - "pageserver_wal_ingest_records_filtered", - "Number of WAL records filtered out due to sharding" - ) - .expect("failed to define a metric"), values_committed_metadata_images: values_committed.with_label_values(&["metadata", "image"]), values_committed_metadata_deltas: values_committed.with_label_values(&["metadata", "delta"]), values_committed_data_images: values_committed.with_label_values(&["data", "image"]), diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8fdf84b6d3..6798606141 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2845,21 +2845,6 @@ impl Timeline { ) } - /// Resolve the effective WAL receiver protocol to use for this tenant. - /// - /// Priority order is: - /// 1. Tenant config override - /// 2. Default value for tenant config override - /// 3. Pageserver config override - /// 4. Pageserver config default - pub fn resolve_wal_receiver_protocol(&self) -> PostgresClientProtocol { - let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); - tenant_conf - .wal_receiver_protocol_override - .or(self.conf.default_tenant_conf.wal_receiver_protocol_override) - .unwrap_or(self.conf.wal_receiver_protocol) - } - pub(super) fn tenant_conf_updated(&self, new_conf: &AttachedTenantConf) { // NB: Most tenant conf options are read by background loops, so, // changes will automatically be picked up. @@ -3215,10 +3200,16 @@ impl Timeline { guard.is_none(), "multiple launches / re-launches of WAL receiver are not supported" ); + + let protocol = PostgresClientProtocol::Interpreted { + format: utils::postgres_client::InterpretedFormat::Protobuf, + compression: Some(utils::postgres_client::Compression::Zstd { level: 1 }), + }; + *guard = Some(WalReceiver::start( Arc::clone(self), WalReceiverConf { - protocol: self.resolve_wal_receiver_protocol(), + protocol, wal_connect_timeout, lagging_wal_timeout, max_lsn_wal_lag, diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 3c3608d1bd..7e0b0e9b25 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -32,9 +32,7 @@ use utils::backoff::{ }; use utils::id::{NodeId, TenantTimelineId}; use utils::lsn::Lsn; -use utils::postgres_client::{ - ConnectionConfigArgs, PostgresClientProtocol, wal_stream_connection_config, -}; +use utils::postgres_client::{ConnectionConfigArgs, wal_stream_connection_config}; use super::walreceiver_connection::{WalConnectionStatus, WalReceiverError}; use super::{TaskEvent, TaskHandle, TaskStateUpdate, WalReceiverConf}; @@ -991,19 +989,12 @@ impl ConnectionManagerState { return None; // no connection string, ignore sk } - let (shard_number, shard_count, shard_stripe_size) = match self.conf.protocol { - PostgresClientProtocol::Vanilla => { - (None, None, None) - }, - PostgresClientProtocol::Interpreted { .. } => { - let shard_identity = self.timeline.get_shard_identity(); - ( - Some(shard_identity.number.0), - Some(shard_identity.count.0), - Some(shard_identity.stripe_size.0), - ) - } - }; + let shard_identity = self.timeline.get_shard_identity(); + let (shard_number, shard_count, shard_stripe_size) = ( + Some(shard_identity.number.0), + Some(shard_identity.count.0), + Some(shard_identity.stripe_size.0), + ); let connection_conf_args = ConnectionConfigArgs { protocol: self.conf.protocol, @@ -1120,8 +1111,8 @@ impl ReconnectReason { #[cfg(test)] mod tests { - use pageserver_api::config::defaults::DEFAULT_WAL_RECEIVER_PROTOCOL; use url::Host; + use utils::postgres_client::PostgresClientProtocol; use super::*; use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; @@ -1552,6 +1543,11 @@ mod tests { .await .expect("Failed to create an empty timeline for dummy wal connection manager"); + let protocol = PostgresClientProtocol::Interpreted { + format: utils::postgres_client::InterpretedFormat::Protobuf, + compression: Some(utils::postgres_client::Compression::Zstd { level: 1 }), + }; + ConnectionManagerState { id: TenantTimelineId { tenant_id: harness.tenant_shard_id.tenant_id, @@ -1560,7 +1556,7 @@ mod tests { timeline, cancel: CancellationToken::new(), conf: WalReceiverConf { - protocol: DEFAULT_WAL_RECEIVER_PROTOCOL, + protocol, wal_connect_timeout: Duration::from_secs(1), lagging_wal_timeout: Duration::from_secs(1), max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(), diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 249849ac4b..343e04f5f0 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -15,7 +15,7 @@ use postgres_backend::is_expected_io_error; use postgres_connection::PgConnectionConfig; use postgres_ffi::WAL_SEGMENT_SIZE; use postgres_ffi::v14::xlog_utils::normalize_lsn; -use postgres_ffi::waldecoder::{WalDecodeError, WalStreamDecoder}; +use postgres_ffi::waldecoder::WalDecodeError; use postgres_protocol::message::backend::ReplicationMessage; use postgres_types::PgLsn; use tokio::sync::watch; @@ -31,7 +31,7 @@ use utils::lsn::Lsn; use utils::pageserver_feedback::PageserverFeedback; use utils::postgres_client::PostgresClientProtocol; use utils::sync::gate::GateError; -use wal_decoder::models::{FlushUncommittedRecords, InterpretedWalRecord, InterpretedWalRecords}; +use wal_decoder::models::{FlushUncommittedRecords, InterpretedWalRecords}; use wal_decoder::wire_format::FromWireFormat; use super::TaskStateUpdate; @@ -275,8 +275,6 @@ pub(super) async fn handle_walreceiver_connection( let copy_stream = replication_client.copy_both_simple(&query).await?; let mut physical_stream = pin!(ReplicationStream::new(copy_stream)); - let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version); - let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx) .await .map_err(|e| match e.kind { @@ -284,14 +282,16 @@ pub(super) async fn handle_walreceiver_connection( _ => WalReceiverError::Other(e.into()), })?; - let shard = vec![*timeline.get_shard_identity()]; - - let interpreted_proto_config = match protocol { - PostgresClientProtocol::Vanilla => None, + let (format, compression) = match protocol { PostgresClientProtocol::Interpreted { format, compression, - } => Some((format, compression)), + } => (format, compression), + PostgresClientProtocol::Vanilla => { + return Err(WalReceiverError::Other(anyhow!( + "Vanilla WAL receiver protocol is no longer supported for ingest" + ))); + } }; let mut expected_wal_start = startpoint; @@ -313,16 +313,6 @@ pub(super) async fn handle_walreceiver_connection( // Update the connection status before processing the message. If the message processing // fails (e.g. in walingest), we still want to know latests LSNs from the safekeeper. match &replication_message { - ReplicationMessage::XLogData(xlog_data) => { - connection_status.latest_connection_update = now; - connection_status.commit_lsn = Some(Lsn::from(xlog_data.wal_end())); - connection_status.streaming_lsn = Some(Lsn::from( - xlog_data.wal_start() + xlog_data.data().len() as u64, - )); - if !xlog_data.data().is_empty() { - connection_status.latest_wal_update = now; - } - } ReplicationMessage::PrimaryKeepAlive(keepalive) => { connection_status.latest_connection_update = now; connection_status.commit_lsn = Some(Lsn::from(keepalive.wal_end())); @@ -353,7 +343,6 @@ pub(super) async fn handle_walreceiver_connection( // were interpreted. let streaming_lsn = Lsn::from(raw.streaming_lsn()); - let (format, compression) = interpreted_proto_config.unwrap(); let batch = InterpretedWalRecords::from_wire(raw.data(), format, compression) .await .with_context(|| { @@ -509,138 +498,6 @@ pub(super) async fn handle_walreceiver_connection( Some(streaming_lsn) } - ReplicationMessage::XLogData(xlog_data) => { - async fn commit( - modification: &mut DatadirModification<'_>, - uncommitted: &mut u64, - filtered: &mut u64, - ctx: &RequestContext, - ) -> anyhow::Result<()> { - let stats = modification.stats(); - modification.commit(ctx).await?; - WAL_INGEST - .records_committed - .inc_by(*uncommitted - *filtered); - WAL_INGEST.inc_values_committed(&stats); - *uncommitted = 0; - *filtered = 0; - Ok(()) - } - - // Pass the WAL data to the decoder, and see if we can decode - // more records as a result. - let data = xlog_data.data(); - let startlsn = Lsn::from(xlog_data.wal_start()); - let endlsn = startlsn + data.len() as u64; - - trace!("received XLogData between {startlsn} and {endlsn}"); - - WAL_INGEST.bytes_received.inc_by(data.len() as u64); - waldecoder.feed_bytes(data); - - { - let mut modification = timeline.begin_modification(startlsn); - let mut uncommitted_records = 0; - let mut filtered_records = 0; - - while let Some((next_record_lsn, recdata)) = waldecoder.poll_decode()? { - // It is important to deal with the aligned records as lsn in getPage@LSN is - // aligned and can be several bytes bigger. Without this alignment we are - // at risk of hitting a deadlock. - if !next_record_lsn.is_aligned() { - return Err(WalReceiverError::Other(anyhow!("LSN not aligned"))); - } - - // Deserialize and interpret WAL record - let interpreted = InterpretedWalRecord::from_bytes_filtered( - recdata, - &shard, - next_record_lsn, - modification.tline.pg_version, - )? - .remove(timeline.get_shard_identity()) - .unwrap(); - - if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes) - && uncommitted_records > 0 - { - // Special case: legacy PG database creations operate by reading pages from a 'template' database: - // these are the only kinds of WAL record that require reading data blocks while ingesting. Ensure - // all earlier writes of data blocks are visible by committing any modification in flight. - commit( - &mut modification, - &mut uncommitted_records, - &mut filtered_records, - &ctx, - ) - .await?; - } - - // Ingest the records without immediately committing them. - timeline.metrics.wal_records_received.inc(); - let ingested = walingest - .ingest_record(interpreted, &mut modification, &ctx) - .await - .with_context(|| { - format!("could not ingest record at {next_record_lsn}") - }) - .inspect_err(|err| { - // TODO: we can't differentiate cancellation errors with - // anyhow::Error, so just ignore it if we're cancelled. - if !cancellation.is_cancelled() && !timeline.is_stopping() { - critical!("{err:?}") - } - })?; - if !ingested { - tracing::debug!("ingest: filtered out record @ LSN {next_record_lsn}"); - WAL_INGEST.records_filtered.inc(); - filtered_records += 1; - } - - // FIXME: this cannot be made pausable_failpoint without fixing the - // failpoint library; in tests, the added amount of debugging will cause us - // to timeout the tests. - fail_point!("walreceiver-after-ingest"); - - last_rec_lsn = next_record_lsn; - - // Commit every ingest_batch_size records. Even if we filtered out - // all records, we still need to call commit to advance the LSN. - uncommitted_records += 1; - if uncommitted_records >= ingest_batch_size - || modification.approx_pending_bytes() - > DatadirModification::MAX_PENDING_BYTES - { - commit( - &mut modification, - &mut uncommitted_records, - &mut filtered_records, - &ctx, - ) - .await?; - } - } - - // Commit the remaining records. - if uncommitted_records > 0 { - commit( - &mut modification, - &mut uncommitted_records, - &mut filtered_records, - &ctx, - ) - .await?; - } - } - - if !caught_up && endlsn >= end_of_wal { - info!("caught up at LSN {endlsn}"); - caught_up = true; - } - - Some(endlsn) - } - ReplicationMessage::PrimaryKeepAlive(keepalive) => { let wal_end = keepalive.wal_end(); let timestamp = keepalive.timestamp(); diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index f9337bed89..db3f080261 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -357,31 +357,6 @@ class PgProtocol: return TimelineId(cast("str", self.safe_psql("show neon.timeline_id")[0][0])) -class PageserverWalReceiverProtocol(StrEnum): - VANILLA = "vanilla" - INTERPRETED = "interpreted" - - @staticmethod - def to_config_key_value(proto) -> tuple[str, dict[str, Any]]: - if proto == PageserverWalReceiverProtocol.VANILLA: - return ( - "wal_receiver_protocol", - { - "type": "vanilla", - }, - ) - elif proto == PageserverWalReceiverProtocol.INTERPRETED: - return ( - "wal_receiver_protocol", - { - "type": "interpreted", - "args": {"format": "protobuf", "compression": {"zstd": {"level": 1}}}, - }, - ) - else: - raise ValueError(f"Unknown protocol type: {proto}") - - @dataclass class PageserverTracingConfig: sampling_ratio: tuple[int, int] @@ -475,7 +450,6 @@ class NeonEnvBuilder: safekeeper_extra_opts: list[str] | None = None, storage_controller_port_override: int | None = None, pageserver_virtual_file_io_mode: str | None = None, - pageserver_wal_receiver_protocol: PageserverWalReceiverProtocol | None = None, pageserver_get_vectored_concurrent_io: str | None = None, pageserver_tracing_config: PageserverTracingConfig | None = None, pageserver_import_config: PageserverImportConfig | None = None, @@ -552,11 +526,6 @@ class NeonEnvBuilder: self.pageserver_virtual_file_io_mode = pageserver_virtual_file_io_mode - if pageserver_wal_receiver_protocol is not None: - self.pageserver_wal_receiver_protocol = pageserver_wal_receiver_protocol - else: - self.pageserver_wal_receiver_protocol = PageserverWalReceiverProtocol.INTERPRETED - assert test_name.startswith("test_"), ( "Unexpectedly instantiated from outside a test function" ) @@ -1202,7 +1171,6 @@ class NeonEnv: self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine self.pageserver_virtual_file_io_mode = config.pageserver_virtual_file_io_mode - self.pageserver_wal_receiver_protocol = config.pageserver_wal_receiver_protocol self.pageserver_get_vectored_concurrent_io = config.pageserver_get_vectored_concurrent_io self.pageserver_tracing_config = config.pageserver_tracing_config if config.pageserver_import_config is None: @@ -1334,13 +1302,6 @@ class NeonEnv: for key, value in override.items(): ps_cfg[key] = value - if self.pageserver_wal_receiver_protocol is not None: - key, value = PageserverWalReceiverProtocol.to_config_key_value( - self.pageserver_wal_receiver_protocol - ) - if key not in ps_cfg: - ps_cfg[key] = value - if self.pageserver_tracing_config is not None: key, value = self.pageserver_tracing_config.to_config_key_value() diff --git a/test_runner/performance/test_sharded_ingest.py b/test_runner/performance/test_sharded_ingest.py index 293026d40a..364fcf3737 100644 --- a/test_runner/performance/test_sharded_ingest.py +++ b/test_runner/performance/test_sharded_ingest.py @@ -15,19 +15,10 @@ from fixtures.neon_fixtures import ( @pytest.mark.timeout(1200) @pytest.mark.parametrize("shard_count", [1, 8, 32]) -@pytest.mark.parametrize( - "wal_receiver_protocol", - [ - "vanilla", - "interpreted-bincode-compressed", - "interpreted-protobuf-compressed", - ], -) def test_sharded_ingest( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, shard_count: int, - wal_receiver_protocol: str, ): """ Benchmarks sharded ingestion throughput, by ingesting a large amount of WAL into a Safekeeper @@ -39,36 +30,6 @@ def test_sharded_ingest( neon_env_builder.num_pageservers = shard_count env = neon_env_builder.init_configs() - for ps in env.pageservers: - if wal_receiver_protocol == "vanilla": - ps.patch_config_toml_nonrecursive( - { - "wal_receiver_protocol": { - "type": "vanilla", - } - } - ) - elif wal_receiver_protocol == "interpreted-bincode-compressed": - ps.patch_config_toml_nonrecursive( - { - "wal_receiver_protocol": { - "type": "interpreted", - "args": {"format": "bincode", "compression": {"zstd": {"level": 1}}}, - } - } - ) - elif wal_receiver_protocol == "interpreted-protobuf-compressed": - ps.patch_config_toml_nonrecursive( - { - "wal_receiver_protocol": { - "type": "interpreted", - "args": {"format": "protobuf", "compression": {"zstd": {"level": 1}}}, - } - } - ) - else: - raise AssertionError("Test must use explicit wal receiver protocol config") - env.start() # Create a sharded tenant and timeline, and migrate it to the respective pageservers. Ensure diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 3eb6b7193c..dc44fc77db 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -182,10 +182,6 @@ def test_fully_custom_config(positive_env: NeonEnv): "lsn_lease_length": "1m", "lsn_lease_length_for_ts": "5s", "timeline_offloading": False, - "wal_receiver_protocol_override": { - "type": "interpreted", - "args": {"format": "bincode", "compression": {"zstd": {"level": 1}}}, - }, "rel_size_v2_enabled": True, "relsize_snapshot_cache_capacity": 10000, "gc_compaction_enabled": True, diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 370f57b19d..1570d40ae9 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -10,7 +10,6 @@ import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, - PageserverWalReceiverProtocol, generate_uploads_and_deletions, ) from fixtures.pageserver.http import PageserverApiException @@ -68,14 +67,9 @@ PREEMPT_GC_COMPACTION_TENANT_CONF = { @skip_in_debug_build("only run with release build") -@pytest.mark.parametrize( - "wal_receiver_protocol", - [PageserverWalReceiverProtocol.VANILLA, PageserverWalReceiverProtocol.INTERPRETED], -) @pytest.mark.timeout(900) def test_pageserver_compaction_smoke( neon_env_builder: NeonEnvBuilder, - wal_receiver_protocol: PageserverWalReceiverProtocol, ): """ This is a smoke test that compaction kicks in. The workload repeatedly churns @@ -85,8 +79,6 @@ def test_pageserver_compaction_smoke( observed bounds. """ - neon_env_builder.pageserver_wal_receiver_protocol = wal_receiver_protocol - # Effectively disable the page cache to rely only on image layers # to shorten reads. neon_env_builder.pageserver_config_override = """ diff --git a/test_runner/regress/test_crafted_wal_end.py b/test_runner/regress/test_crafted_wal_end.py index 6b9dcbba07..89ff873ca3 100644 --- a/test_runner/regress/test_crafted_wal_end.py +++ b/test_runner/regress/test_crafted_wal_end.py @@ -1,9 +1,13 @@ from __future__ import annotations +from typing import TYPE_CHECKING + import pytest from fixtures.log_helper import log from fixtures.neon_cli import WalCraft -from fixtures.neon_fixtures import NeonEnvBuilder, PageserverWalReceiverProtocol + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnvBuilder # Restart nodes with WAL end having specially crafted shape, like last record # crossing segment boundary, to test decoding issues. @@ -19,17 +23,10 @@ from fixtures.neon_fixtures import NeonEnvBuilder, PageserverWalReceiverProtocol "wal_record_crossing_segment_followed_by_small_one", ], ) -@pytest.mark.parametrize( - "wal_receiver_protocol", - [PageserverWalReceiverProtocol.VANILLA, PageserverWalReceiverProtocol.INTERPRETED], -) def test_crafted_wal_end( neon_env_builder: NeonEnvBuilder, wal_type: str, - wal_receiver_protocol: PageserverWalReceiverProtocol, ): - neon_env_builder.pageserver_wal_receiver_protocol = wal_receiver_protocol - env = neon_env_builder.init_start() env.create_branch("test_crafted_wal_end") env.pageserver.allowed_errors.extend( diff --git a/test_runner/regress/test_subxacts.py b/test_runner/regress/test_subxacts.py index b235da0bc7..92a21007c8 100644 --- a/test_runner/regress/test_subxacts.py +++ b/test_runner/regress/test_subxacts.py @@ -1,9 +1,7 @@ from __future__ import annotations -import pytest from fixtures.neon_fixtures import ( NeonEnvBuilder, - PageserverWalReceiverProtocol, check_restored_datadir_content, ) @@ -14,13 +12,7 @@ from fixtures.neon_fixtures import ( # maintained in the pageserver, so subtransactions are not very exciting for # Neon. They are included in the commit record though and updated in the # CLOG. -@pytest.mark.parametrize( - "wal_receiver_protocol", - [PageserverWalReceiverProtocol.VANILLA, PageserverWalReceiverProtocol.INTERPRETED], -) -def test_subxacts(neon_env_builder: NeonEnvBuilder, test_output_dir, wal_receiver_protocol): - neon_env_builder.pageserver_wal_receiver_protocol = wal_receiver_protocol - +def test_subxacts(neon_env_builder: NeonEnvBuilder, test_output_dir): env = neon_env_builder.init_start() endpoint = env.endpoints.create_start("main") diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index de6bdc0aec..d78b9d8817 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -348,7 +348,6 @@ def test_tenant_config_patch(neon_env_builder: NeonEnvBuilder, ps_managed_by: st def assert_tenant_conf_semantically_equal(lhs, rhs): """ - Storcon returns None for fields that are not set while the pageserver does not. Compare two tenant's config overrides semantically, by dropping the None values. """ lhs = {k: v for k, v in lhs.items() if v is not None} @@ -375,10 +374,7 @@ def test_tenant_config_patch(neon_env_builder: NeonEnvBuilder, ps_managed_by: st patch: dict[str, Any | None] = { "gc_period": "3h", - "wal_receiver_protocol_override": { - "type": "interpreted", - "args": {"format": "bincode", "compression": {"zstd": {"level": 1}}}, - }, + "gc_compaction_ratio_percent": 10, } api.patch_tenant_config(env.initial_tenant, patch) tenant_conf_after_patch = api.tenant_config(env.initial_tenant).tenant_specific_overrides @@ -391,7 +387,7 @@ def test_tenant_config_patch(neon_env_builder: NeonEnvBuilder, ps_managed_by: st assert_tenant_conf_semantically_equal(tenant_conf_after_patch, crnt_tenant_conf | patch) crnt_tenant_conf = tenant_conf_after_patch - patch = {"gc_period": "5h", "wal_receiver_protocol_override": None} + patch = {"gc_period": "5h", "gc_compaction_ratio_percent": None} api.patch_tenant_config(env.initial_tenant, patch) tenant_conf_after_patch = api.tenant_config(env.initial_tenant).tenant_specific_overrides if ps_managed_by == "storcon": diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index 4070f99568..d8a7dc2a2b 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -14,7 +14,6 @@ from fixtures.neon_fixtures import ( Endpoint, NeonEnv, NeonEnvBuilder, - PageserverWalReceiverProtocol, Safekeeper, ) from fixtures.remote_storage import RemoteStorageKind @@ -751,15 +750,8 @@ async def run_segment_init_failure(env: NeonEnv): # Test (injected) failure during WAL segment init. # https://github.com/neondatabase/neon/issues/6401 # https://github.com/neondatabase/neon/issues/6402 -@pytest.mark.parametrize( - "wal_receiver_protocol", - [PageserverWalReceiverProtocol.VANILLA, PageserverWalReceiverProtocol.INTERPRETED], -) -def test_segment_init_failure( - neon_env_builder: NeonEnvBuilder, wal_receiver_protocol: PageserverWalReceiverProtocol -): +def test_segment_init_failure(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 1 - neon_env_builder.pageserver_wal_receiver_protocol = wal_receiver_protocol env = neon_env_builder.init_start() asyncio.run(run_segment_init_failure(env)) From 6a43f23ecaeb12385dc7b7bc5b1b4a5f52047b37 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 5 Jun 2025 13:52:52 +0200 Subject: [PATCH 24/25] pagebench: add batch support (#12133) ## Problem The new gRPC page service protocol supports client-side batches. The current libpq protocol only does best-effort server-side batching. To compare these approaches, Pagebench should support submitting contiguous page batches, similar to how Postgres will submit them (e.g. with prefetches or vectored reads). ## Summary of changes Add a `--batch-size` parameter specifying the size of contiguous page batches. One batch counts as 1 RPS and 1 queue depth. For the libpq protocol, a batch is submitted as individual requests and we rely on the server to batch them for us. This will give a realistic comparison of how these would be processed in the wild (e.g. when Postgres sends 100 prefetch requests). This patch also adds some basic validation of responses. --- Cargo.lock | 1 + pageserver/pagebench/Cargo.toml | 1 + .../pagebench/src/cmd/getpage_latest_lsn.rs | 195 ++++++++++++++---- 3 files changed, 152 insertions(+), 45 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 542a4528d3..588a63b6a3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4237,6 +4237,7 @@ version = "0.1.0" dependencies = [ "anyhow", "async-trait", + "bytes", "camino", "clap", "futures", diff --git a/pageserver/pagebench/Cargo.toml b/pageserver/pagebench/Cargo.toml index ceb1278eab..5e4af88e69 100644 --- a/pageserver/pagebench/Cargo.toml +++ b/pageserver/pagebench/Cargo.toml @@ -9,6 +9,7 @@ license.workspace = true [dependencies] anyhow.workspace = true async-trait.workspace = true +bytes.workspace = true camino.workspace = true clap.workspace = true futures.workspace = true diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index 395e9cac41..3f3b6e396e 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -1,4 +1,4 @@ -use std::collections::{HashSet, VecDeque}; +use std::collections::{HashMap, HashSet, VecDeque}; use std::future::Future; use std::num::NonZeroUsize; use std::pin::Pin; @@ -8,12 +8,12 @@ use std::time::{Duration, Instant}; use anyhow::Context; use async_trait::async_trait; +use bytes::Bytes; use camino::Utf8PathBuf; use pageserver_api::key::Key; use pageserver_api::keyspace::KeySpaceAccum; -use pageserver_api::models::{ - PagestreamGetPageRequest, PagestreamGetPageResponse, PagestreamRequest, -}; +use pageserver_api::models::{PagestreamGetPageRequest, PagestreamRequest}; +use pageserver_api::reltag::RelTag; use pageserver_api::shard::TenantShardId; use pageserver_page_api::proto; use rand::prelude::*; @@ -77,6 +77,16 @@ pub(crate) struct Args { #[clap(long, default_value = "1")] queue_depth: NonZeroUsize, + /// Batch size of contiguous pages generated by each client. This is equivalent to how Postgres + /// will request page batches (e.g. prefetches or vectored reads). A batch counts as 1 RPS and + /// 1 queue depth. + /// + /// The libpq protocol does not support client-side batching, and will submit batches as many + /// individual requests, in the hope that the server will batch them. Each batch still counts as + /// 1 RPS and 1 queue depth. + #[clap(long, default_value = "1")] + batch_size: NonZeroUsize, + #[clap(long)] only_relnode: Option, @@ -392,7 +402,16 @@ async fn run_worker( shared_state.start_work_barrier.wait().await; let client_start = Instant::now(); let mut ticks_processed = 0; - let mut inflight = VecDeque::new(); + let mut req_id = 0; + let batch_size: usize = args.batch_size.into(); + + // Track inflight requests by request ID and start time. This times the request duration, and + // ensures responses match requests. We don't expect responses back in any particular order. + // + // NB: this does not check that all requests received a response, because we don't wait for the + // inflight requests to complete when the duration elapses. + let mut inflight: HashMap = HashMap::new(); + while !cancel.is_cancelled() { // Detect if a request took longer than the RPS rate if let Some(period) = &rps_period { @@ -408,36 +427,72 @@ async fn run_worker( } while inflight.len() < args.queue_depth.get() { + req_id += 1; let start = Instant::now(); - let req = { + let (req_lsn, mod_lsn, rel, blks) = { + /// Converts a compact i128 key to a relation tag and block number. + fn key_to_block(key: i128) -> (RelTag, u32) { + let key = Key::from_i128(key); + assert!(key.is_rel_block_key()); + key.to_rel_block() + .expect("we filter non-rel-block keys out above") + } + + // Pick a random page from a random relation. let mut rng = rand::thread_rng(); let r = &ranges[weights.sample(&mut rng)]; let key: i128 = rng.gen_range(r.start..r.end); - let key = Key::from_i128(key); - assert!(key.is_rel_block_key()); - let (rel_tag, block_no) = key - .to_rel_block() - .expect("we filter non-rel-block keys out above"); - PagestreamGetPageRequest { - hdr: PagestreamRequest { - reqid: 0, - request_lsn: if rng.gen_bool(args.req_latest_probability) { - Lsn::MAX - } else { - r.timeline_lsn - }, - not_modified_since: r.timeline_lsn, - }, - rel: rel_tag, - blkno: block_no, + let (rel_tag, block_no) = key_to_block(key); + + let mut blks = VecDeque::with_capacity(batch_size); + blks.push_back(block_no); + + // If requested, populate a batch of sequential pages. This is how Postgres will + // request page batches (e.g. prefetches). If we hit the end of the relation, we + // grow the batch towards the start too. + for i in 1..batch_size { + let (r, b) = key_to_block(key + i as i128); + if r != rel_tag { + break; // went outside relation + } + blks.push_back(b) } + + if blks.len() < batch_size { + // Grow batch backwards if needed. + for i in 1..batch_size { + let (r, b) = key_to_block(key - i as i128); + if r != rel_tag { + break; // went outside relation + } + blks.push_front(b) + } + } + + // We assume that the entire batch can fit within the relation. + assert_eq!(blks.len(), batch_size, "incomplete batch"); + + let req_lsn = if rng.gen_bool(args.req_latest_probability) { + Lsn::MAX + } else { + r.timeline_lsn + }; + (req_lsn, r.timeline_lsn, rel_tag, blks.into()) }; - client.send_get_page(req).await.unwrap(); - inflight.push_back(start); + client + .send_get_page(req_id, req_lsn, mod_lsn, rel, blks) + .await + .unwrap(); + let old = inflight.insert(req_id, start); + assert!(old.is_none(), "duplicate request ID {req_id}"); } - let start = inflight.pop_front().unwrap(); - client.recv_get_page().await.unwrap(); + let (req_id, pages) = client.recv_get_page().await.unwrap(); + assert_eq!(pages.len(), batch_size, "unexpected page count"); + assert!(pages.iter().all(|p| !p.is_empty()), "empty page"); + let start = inflight + .remove(&req_id) + .expect("response for unknown request ID"); let end = Instant::now(); shared_state.live_stats.request_done(); ticks_processed += 1; @@ -467,15 +522,24 @@ async fn run_worker( #[async_trait] trait Client: Send { /// Sends an asynchronous GetPage request to the pageserver. - async fn send_get_page(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()>; + async fn send_get_page( + &mut self, + req_id: u64, + req_lsn: Lsn, + mod_lsn: Lsn, + rel: RelTag, + blks: Vec, + ) -> anyhow::Result<()>; /// Receives the next GetPage response from the pageserver. - async fn recv_get_page(&mut self) -> anyhow::Result; + async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec)>; } /// A libpq-based Pageserver client. struct LibpqClient { inner: pageserver_client::page_service::PagestreamClient, + // Track sent batches, so we know how many responses to expect. + batch_sizes: VecDeque, } impl LibpqClient { @@ -484,18 +548,55 @@ impl LibpqClient { .await? .pagestream(ttid.tenant_id, ttid.timeline_id) .await?; - Ok(Self { inner }) + Ok(Self { + inner, + batch_sizes: VecDeque::new(), + }) } } #[async_trait] impl Client for LibpqClient { - async fn send_get_page(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> { - self.inner.getpage_send(req).await + async fn send_get_page( + &mut self, + req_id: u64, + req_lsn: Lsn, + mod_lsn: Lsn, + rel: RelTag, + blks: Vec, + ) -> anyhow::Result<()> { + // libpq doesn't support client-side batches, so we send a bunch of individual requests + // instead in the hope that the server will batch them for us. We use the same request ID + // for all, because we'll return a single batch response. + self.batch_sizes.push_back(blks.len()); + for blkno in blks { + let req = PagestreamGetPageRequest { + hdr: PagestreamRequest { + reqid: req_id, + request_lsn: req_lsn, + not_modified_since: mod_lsn, + }, + rel, + blkno, + }; + self.inner.getpage_send(req).await?; + } + Ok(()) } - async fn recv_get_page(&mut self) -> anyhow::Result { - self.inner.getpage_recv().await + async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec)> { + let batch_size = self.batch_sizes.pop_front().unwrap(); + let mut batch = Vec::with_capacity(batch_size); + let mut req_id = None; + for _ in 0..batch_size { + let resp = self.inner.getpage_recv().await?; + if req_id.is_none() { + req_id = Some(resp.req.hdr.reqid); + } + assert_eq!(req_id, Some(resp.req.hdr.reqid), "request ID mismatch"); + batch.push(resp.page); + } + Ok((req_id.unwrap(), batch)) } } @@ -532,31 +633,35 @@ impl GrpcClient { #[async_trait] impl Client for GrpcClient { - async fn send_get_page(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> { + async fn send_get_page( + &mut self, + req_id: u64, + req_lsn: Lsn, + mod_lsn: Lsn, + rel: RelTag, + blks: Vec, + ) -> anyhow::Result<()> { let req = proto::GetPageRequest { - request_id: 0, + request_id: req_id, request_class: proto::GetPageClass::Normal as i32, read_lsn: Some(proto::ReadLsn { - request_lsn: req.hdr.request_lsn.0, - not_modified_since_lsn: req.hdr.not_modified_since.0, + request_lsn: req_lsn.0, + not_modified_since_lsn: mod_lsn.0, }), - rel: Some(req.rel.into()), - block_number: vec![req.blkno], + rel: Some(rel.into()), + block_number: blks, }; self.req_tx.send(req).await?; Ok(()) } - async fn recv_get_page(&mut self) -> anyhow::Result { + async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec)> { let resp = self.resp_rx.message().await?.unwrap(); anyhow::ensure!( resp.status_code == proto::GetPageStatusCode::Ok as i32, "unexpected status code: {}", resp.status_code ); - Ok(PagestreamGetPageResponse { - page: resp.page_image[0].clone(), - req: PagestreamGetPageRequest::default(), // dummy - }) + Ok((resp.request_id, resp.page_image)) } } From 038e967daf145af48290eb8560b8a3a43f3579fc Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Thu, 5 Jun 2025 14:03:51 +0200 Subject: [PATCH 25/25] Configure the dynamic loader for the extension-tests image (#12142) ## Problem The same problem, fixed in https://github.com/neondatabase/neon/issues/11857, but for the image `neon-extesions-test` ## Summary of changes The config file was added to use our library --- compute/compute-node.Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index d3008c8085..248f52088b 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1902,6 +1902,7 @@ COPY compute/patches/pg_repack.patch /ext-src RUN cd /ext-src/pg_repack-src && patch -p1 /etc/ld.so.conf.d/00-neon.conf && /sbin/ldconfig RUN apt-get update && apt-get install -y libtap-parser-sourcehandler-pgtap-perl jq \ && apt clean && rm -rf /ext-src/*.tar.gz /ext-src/*.patch /var/lib/apt/lists/* ENV PATH=/usr/local/pgsql/bin:$PATH