From 4d2e4b19c3dc8816668abc4204b110f1c9fd1b1e Mon Sep 17 00:00:00 2001 From: Shockingly Good Date: Wed, 7 May 2025 18:34:08 +0200 Subject: [PATCH 01/65] fix(compute) Correct the PGXN s3 gateway URL. (#11796) Corrects the postgres extension s3 gateway address to be not just a domain name but a full base URL. To make the code more readable, the option is renamed to "remote_ext_base_url", while keeping the old name also accessible by providing a clap argument alias. Also provides a very simple and, perhaps, even redundant unit test to confirm the logic behind parsing of the corresponding CLI argument. ## Problem As it is clearly stated in https://github.com/neondatabase/cloud/issues/26005, using the short version of the domain name might work for now, but in the future, we should get rid of using the `default` namespace and this is where it will, most likely, break down. ## Summary of changes The changes adjust the domain name of the extension s3 gateway to use the proper base URL format instead of just the domain name, which assumes the "default" namespace, and add a new CLI argument name to reflect the change and the expectation. 
--- compute_tools/src/bin/compute_ctl.rs | 34 +++++++++++++++---- compute_tools/src/compute.rs | 10 +++--- compute_tools/src/extension_server.rs | 8 ++--- .../src/http/routes/extension_server.rs | 2 +- control_plane/src/bin/neon_local.rs | 9 ++--- control_plane/src/endpoint.rs | 6 ++-- test_runner/fixtures/neon_cli.py | 6 ++-- test_runner/fixtures/neon_fixtures.py | 12 +++---- .../regress/test_download_extensions.py | 4 +-- 9 files changed, 56 insertions(+), 35 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index e337ee7b15..20b5e567a8 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -60,12 +60,16 @@ use utils::failpoint_support; // Compatibility hack: if the control plane specified any remote-ext-config // use the default value for extension storage proxy gateway. // Remove this once the control plane is updated to pass the gateway URL -fn parse_remote_ext_config(arg: &str) -> Result { - if arg.starts_with("http") { - Ok(arg.trim_end_matches('/').to_string()) +fn parse_remote_ext_base_url(arg: &str) -> Result { + const FALLBACK_PG_EXT_GATEWAY_BASE_URL: &str = + "http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local"; + + Ok(if arg.starts_with("http") { + arg } else { - Ok("http://pg-ext-s3-gateway".to_string()) + FALLBACK_PG_EXT_GATEWAY_BASE_URL } + .to_owned()) } #[derive(Parser)] @@ -74,8 +78,10 @@ struct Cli { #[arg(short = 'b', long, default_value = "postgres", env = "POSTGRES_PATH")] pub pgbin: String, - #[arg(short = 'r', long, value_parser = parse_remote_ext_config)] - pub remote_ext_config: Option, + /// The base URL for the remote extension storage proxy gateway. + /// Should be in the form of `http(s)://[:]`. + #[arg(short = 'r', long, value_parser = parse_remote_ext_base_url, alias = "remote-ext-config")] + pub remote_ext_base_url: Option, /// The port to bind the external listening HTTP server to. 
Clients running /// outside the compute will talk to the compute through this port. Keep @@ -164,7 +170,7 @@ fn main() -> Result<()> { pgversion: get_pg_version_string(&cli.pgbin), external_http_port: cli.external_http_port, internal_http_port: cli.internal_http_port, - ext_remote_storage: cli.remote_ext_config.clone(), + remote_ext_base_url: cli.remote_ext_base_url.clone(), resize_swap_on_bind: cli.resize_swap_on_bind, set_disk_quota_for_fs: cli.set_disk_quota_for_fs, #[cfg(target_os = "linux")] @@ -265,4 +271,18 @@ mod test { fn verify_cli() { Cli::command().debug_assert() } + + #[test] + fn parse_pg_ext_gateway_base_url() { + let arg = "http://pg-ext-s3-gateway2"; + let result = super::parse_remote_ext_base_url(arg).unwrap(); + assert_eq!(result, arg); + + let arg = "pg-ext-s3-gateway"; + let result = super::parse_remote_ext_base_url(arg).unwrap(); + assert_eq!( + result, + "http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local" + ); + } } diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 0cda36a6e2..25920675c1 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -95,7 +95,7 @@ pub struct ComputeNodeParams { pub internal_http_port: u16, /// the address of extension storage proxy gateway - pub ext_remote_storage: Option, + pub remote_ext_base_url: Option, } /// Compute node info shared across several `compute_ctl` threads. 
@@ -1896,9 +1896,9 @@ LIMIT 100", real_ext_name: String, ext_path: RemotePath, ) -> Result { - let ext_remote_storage = + let remote_ext_base_url = self.params - .ext_remote_storage + .remote_ext_base_url .as_ref() .ok_or(DownloadError::BadInput(anyhow::anyhow!( "Remote extensions storage is not configured", @@ -1960,7 +1960,7 @@ LIMIT 100", let download_size = extension_server::download_extension( &real_ext_name, &ext_path, - ext_remote_storage, + remote_ext_base_url, &self.params.pgbin, ) .await @@ -2069,7 +2069,7 @@ LIMIT 100", &self, spec: &ComputeSpec, ) -> Result { - if self.params.ext_remote_storage.is_none() { + if self.params.remote_ext_base_url.is_none() { return Ok(RemoteExtensionMetrics { num_ext_downloaded: 0, largest_ext_size: 0, diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index ee889e0c40..3439383699 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -158,14 +158,14 @@ fn parse_pg_version(human_version: &str) -> PostgresMajorVersion { pub async fn download_extension( ext_name: &str, ext_path: &RemotePath, - ext_remote_storage: &str, + remote_ext_base_url: &str, pgbin: &str, ) -> Result { info!("Download extension {:?} from {:?}", ext_name, ext_path); // TODO add retry logic let download_buffer = - match download_extension_tar(ext_remote_storage, &ext_path.to_string()).await { + match download_extension_tar(remote_ext_base_url, &ext_path.to_string()).await { Ok(buffer) => buffer, Err(error_message) => { return Err(anyhow::anyhow!( @@ -272,8 +272,8 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) { // Do request to extension storage proxy, e.g., // curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst // using HTTP GET and return the response body as bytes. 
-async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result { - let uri = format!("{}/{}", ext_remote_storage, ext_path); +async fn download_extension_tar(remote_ext_base_url: &str, ext_path: &str) -> Result { + let uri = format!("{}/{}", remote_ext_base_url, ext_path); let filename = Path::new(ext_path) .file_name() .unwrap_or_else(|| std::ffi::OsStr::new("unknown")) diff --git a/compute_tools/src/http/routes/extension_server.rs b/compute_tools/src/http/routes/extension_server.rs index 6508de6eee..e141a48b7f 100644 --- a/compute_tools/src/http/routes/extension_server.rs +++ b/compute_tools/src/http/routes/extension_server.rs @@ -22,7 +22,7 @@ pub(in crate::http) async fn download_extension( State(compute): State>, ) -> Response { // Don't even try to download extensions if no remote storage is configured - if compute.params.ext_remote_storage.is_none() { + if compute.params.remote_ext_base_url.is_none() { return JsonResponse::error( StatusCode::PRECONDITION_FAILED, "remote storage is not configured", diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index fd625e9ed6..610fa5f865 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -644,9 +644,10 @@ struct EndpointStartCmdArgs { #[clap( long, - help = "Configure the remote extensions storage proxy gateway to request for extensions." 
+ help = "Configure the remote extensions storage proxy gateway URL to request for extensions.", + alias = "remote-ext-config" )] - remote_ext_config: Option, + remote_ext_base_url: Option, #[clap( long, @@ -1414,7 +1415,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res EndpointCmd::Start(args) => { let endpoint_id = &args.endpoint_id; let pageserver_id = args.endpoint_pageserver_id; - let remote_ext_config = &args.remote_ext_config; + let remote_ext_base_url = &args.remote_ext_base_url; let safekeepers_generation = args.safekeepers_generation.map(SafekeeperGeneration::new); // If --safekeepers argument is given, use only the listed @@ -1510,7 +1511,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res safekeepers_generation, safekeepers, pageservers, - remote_ext_config.as_ref(), + remote_ext_base_url.as_ref(), stripe_size.0 as usize, args.create_test_user, args.start_timeout, diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index be73661a3c..708745446d 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -655,7 +655,7 @@ impl Endpoint { safekeepers_generation: Option, safekeepers: Vec, pageservers: Vec<(Host, u16)>, - remote_ext_config: Option<&String>, + remote_ext_base_url: Option<&String>, shard_stripe_size: usize, create_test_user: bool, start_timeout: Duration, @@ -825,8 +825,8 @@ impl Endpoint { .stderr(logfile.try_clone()?) 
.stdout(logfile); - if let Some(remote_ext_config) = remote_ext_config { - cmd.args(["--remote-ext-config", remote_ext_config]); + if let Some(remote_ext_base_url) = remote_ext_base_url { + cmd.args(["--remote-ext-base-url", remote_ext_base_url]); } let child = cmd.spawn()?; diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index 3be78719d7..4eaa4b7d99 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -557,7 +557,7 @@ class NeonLocalCli(AbstractNeonCli): endpoint_id: str, safekeepers_generation: int | None = None, safekeepers: list[int] | None = None, - remote_ext_config: str | None = None, + remote_ext_base_url: str | None = None, pageserver_id: int | None = None, allow_multiple: bool = False, create_test_user: bool = False, @@ -572,8 +572,8 @@ class NeonLocalCli(AbstractNeonCli): extra_env_vars = env or {} if basebackup_request_tries is not None: extra_env_vars["NEON_COMPUTE_TESTING_BASEBACKUP_TRIES"] = str(basebackup_request_tries) - if remote_ext_config is not None: - args.extend(["--remote-ext-config", remote_ext_config]) + if remote_ext_base_url is not None: + args.extend(["--remote-ext-base-url", remote_ext_base_url]) if safekeepers_generation is not None: args.extend(["--safekeepers-generation", str(safekeepers_generation)]) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index d4a750ad3b..85ad49bb4f 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4226,7 +4226,7 @@ class Endpoint(PgProtocol, LogUtils): def start( self, - remote_ext_config: str | None = None, + remote_ext_base_url: str | None = None, pageserver_id: int | None = None, safekeeper_generation: int | None = None, safekeepers: list[int] | None = None, @@ -4252,7 +4252,7 @@ class Endpoint(PgProtocol, LogUtils): self.endpoint_id, safekeepers_generation=safekeeper_generation, safekeepers=self.active_safekeepers, - 
remote_ext_config=remote_ext_config, + remote_ext_base_url=remote_ext_base_url, pageserver_id=pageserver_id, allow_multiple=allow_multiple, create_test_user=create_test_user, @@ -4467,7 +4467,7 @@ class Endpoint(PgProtocol, LogUtils): hot_standby: bool = False, lsn: Lsn | None = None, config_lines: list[str] | None = None, - remote_ext_config: str | None = None, + remote_ext_base_url: str | None = None, pageserver_id: int | None = None, allow_multiple: bool = False, basebackup_request_tries: int | None = None, @@ -4486,7 +4486,7 @@ class Endpoint(PgProtocol, LogUtils): pageserver_id=pageserver_id, allow_multiple=allow_multiple, ).start( - remote_ext_config=remote_ext_config, + remote_ext_base_url=remote_ext_base_url, pageserver_id=pageserver_id, allow_multiple=allow_multiple, basebackup_request_tries=basebackup_request_tries, @@ -4570,7 +4570,7 @@ class EndpointFactory: lsn: Lsn | None = None, hot_standby: bool = False, config_lines: list[str] | None = None, - remote_ext_config: str | None = None, + remote_ext_base_url: str | None = None, pageserver_id: int | None = None, basebackup_request_tries: int | None = None, ) -> Endpoint: @@ -4590,7 +4590,7 @@ class EndpointFactory: hot_standby=hot_standby, config_lines=config_lines, lsn=lsn, - remote_ext_config=remote_ext_config, + remote_ext_base_url=remote_ext_base_url, pageserver_id=pageserver_id, basebackup_request_tries=basebackup_request_tries, ) diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index d28240c722..24ba0713d2 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -221,7 +221,7 @@ def test_remote_extensions( endpoint.create_remote_extension_spec(spec) - endpoint.start(remote_ext_config=extensions_endpoint) + endpoint.start(remote_ext_base_url=extensions_endpoint) with endpoint.connect() as conn: with conn.cursor() as cur: @@ -249,7 +249,7 @@ def test_remote_extensions( # Remove 
the extension files to force a redownload of the extension. extension.remove(test_output_dir, pg_version) - endpoint.start(remote_ext_config=extensions_endpoint) + endpoint.start(remote_ext_base_url=extensions_endpoint) # Test that ALTER EXTENSION UPDATE statements also fetch remote extensions. with endpoint.connect() as conn: From 24d62c647fba00d1ac93f4118836ceeddf07b270 Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Wed, 7 May 2025 21:00:41 +0400 Subject: [PATCH 02/65] storcon: add missing switch_timeline_membership method to sk client (#11850) ## Problem `switch_timeline_membership` is implemented on safekeeper's server side, but it is missing in the client. - Part of https://github.com/neondatabase/neon/issues/11823 ## Summary of changes - Add `switch_timeline_membership` method to `SafekeeperClient` --- safekeeper/client/src/mgmt_api.rs | 14 ++++++++++++++ storage_controller/src/safekeeper_client.rs | 17 +++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs index 5849df0343..b364ac8e48 100644 --- a/safekeeper/client/src/mgmt_api.rs +++ b/safekeeper/client/src/mgmt_api.rs @@ -121,6 +121,20 @@ impl Client { resp.json().await.map_err(Error::ReceiveBody) } + pub async fn switch_timeline_membership( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + req: &models::TimelineMembershipSwitchRequest, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/membership", + self.mgmt_api_endpoint, tenant_id, timeline_id + ); + let resp = self.put(&uri, req).await?; + resp.json().await.map_err(Error::ReceiveBody) + } + pub async fn delete_tenant(&self, tenant_id: TenantId) -> Result { let uri = format!("{}/v1/tenant/{}", self.mgmt_api_endpoint, tenant_id); let resp = self diff --git a/storage_controller/src/safekeeper_client.rs b/storage_controller/src/safekeeper_client.rs index 988159af4a..1f3ea96d96 100644 --- 
a/storage_controller/src/safekeeper_client.rs +++ b/storage_controller/src/safekeeper_client.rs @@ -98,6 +98,23 @@ impl SafekeeperClient { ) } + #[allow(unused)] + pub(crate) async fn switch_timeline_membership( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + req: &models::TimelineMembershipSwitchRequest, + ) -> Result { + measured_request!( + "switch_timeline_membership", + crate::metrics::Method::Put, + &self.node_id_label, + self.inner + .switch_timeline_membership(tenant_id, timeline_id, req) + .await + ) + } + pub(crate) async fn delete_tenant( &self, tenant_id: TenantId, From 7eb85c56acb5f87c730b879c9488e217448ee28b Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 8 May 2025 08:33:29 +0200 Subject: [PATCH 03/65] tokio-epoll-uring: avoid warn! noise due to `ECANCELED` during shutdowns (#11819) # Problem Before this PR, `test_pageserver_catchup_while_compute_down` would occasionally fail due to a scary-looking WARN log line ``` WARN ephemeral_file_buffered_writer{...}:flush_attempt{attempt=1}: \ error flushing buffered writer buffer to disk, retrying after backoff err=Operation canceled (os error 125) ``` After lengthy investigation, the conclusion is that this is likely due to a kernel bug related to io_uring async workers (io-wq) and signals. The main indicator is that the error only ever happens in correlation with pageserver shutdown when SIGTERM is received. There is a fix that is merged in 6.14 kernels (`io-wq: backoff when retrying worker creation`). However, even when I revert that patch, the issue is not reproducible on 6.14, so, it remains speculation. It was ruled out that the ECANCELED is due to the executor thread exiting before the async worker starts processing the operation. # Solution The workaround in this issue is to retry the operation on ECANCELED once. Retries are safe because the low-level io_engine operations are idempotent. 
(We don't use O_APPEND and I can't think of another flag that would make the APIs covered by this patch not idempotent.) # Testing With this PR, the warn! log no longer happens on [my reproducer setup](https://github.com/neondatabase/neon/issues/11446#issuecomment-2843015111). And the new rate-limited `info!`-level log line informing about the internal retry shows up instead, as expected. # Refs - fixes https://github.com/neondatabase/neon/issues/11446 --- libs/utils/src/rate_limit.rs | 2 +- pageserver/src/virtual_file/io_engine.rs | 85 +++++++++++++++++++-- pageserver/src/virtual_file/open_options.rs | 18 +++-- 3 files changed, 91 insertions(+), 14 deletions(-) diff --git a/libs/utils/src/rate_limit.rs b/libs/utils/src/rate_limit.rs index 945f710b1d..700cd5792b 100644 --- a/libs/utils/src/rate_limit.rs +++ b/libs/utils/src/rate_limit.rs @@ -17,7 +17,7 @@ impl std::fmt::Display for RateLimitStats { } impl RateLimit { - pub fn new(interval: Duration) -> Self { + pub const fn new(interval: Duration) -> Self { Self { last: None, interval, diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index dd04fb561a..d8eb803335 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -13,7 +13,7 @@ pub(super) mod tokio_epoll_uring_ext; use tokio_epoll_uring::IoBuf; -use tracing::Instrument; +use tracing::{Instrument, info}; pub(crate) use super::api::IoEngineKind; #[derive(Clone, Copy)] @@ -111,13 +111,16 @@ pub(crate) fn get() -> IoEngine { use std::os::unix::prelude::FileExt; use std::sync::atomic::{AtomicU8, Ordering}; +use std::time::Duration; use super::owned_buffers_io::io_buf_ext::FullSlice; use super::owned_buffers_io::slice::SliceMutExt; use super::{FileGuard, Metadata}; #[cfg(target_os = "linux")] -fn epoll_uring_error_to_std(e: tokio_epoll_uring::Error) -> std::io::Error { +pub(super) fn epoll_uring_error_to_std( + e: tokio_epoll_uring::Error, +) -> std::io::Error { match e { 
tokio_epoll_uring::Error::Op(e) => e, tokio_epoll_uring::Error::System(system) => { @@ -149,7 +152,11 @@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { let system = tokio_epoll_uring_ext::thread_local_system().await; - let (resources, res) = system.read(file_guard, offset, slice).await; + let (resources, res) = + retry_ecanceled_once((file_guard, slice), |(file_guard, slice)| async { + system.read(file_guard, offset, slice).await + }) + .await; (resources, res.map_err(epoll_uring_error_to_std)) } } @@ -164,7 +171,10 @@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { let system = tokio_epoll_uring_ext::thread_local_system().await; - let (resources, res) = system.fsync(file_guard).await; + let (resources, res) = retry_ecanceled_once(file_guard, |file_guard| async { + system.fsync(file_guard).await + }) + .await; (resources, res.map_err(epoll_uring_error_to_std)) } } @@ -182,7 +192,10 @@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { let system = tokio_epoll_uring_ext::thread_local_system().await; - let (resources, res) = system.fdatasync(file_guard).await; + let (resources, res) = retry_ecanceled_once(file_guard, |file_guard| async { + system.fdatasync(file_guard).await + }) + .await; (resources, res.map_err(epoll_uring_error_to_std)) } } @@ -201,7 +214,10 @@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { let system = tokio_epoll_uring_ext::thread_local_system().await; - let (resources, res) = system.statx(file_guard).await; + let (resources, res) = retry_ecanceled_once(file_guard, |file_guard| async { + system.statx(file_guard).await + }) + .await; ( resources, res.map_err(epoll_uring_error_to_std).map(Metadata::from), @@ -224,6 +240,7 @@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { // TODO: ftruncate op for tokio-epoll-uring + // Don't forget to use retry_ecanceled_once let res = file_guard.with_std_file(|std_file| 
std_file.set_len(len)); (file_guard, res) } @@ -245,8 +262,11 @@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { let system = tokio_epoll_uring_ext::thread_local_system().await; - let ((file_guard, slice), res) = - system.write(file_guard, offset, buf.into_raw_slice()).await; + let ((file_guard, slice), res) = retry_ecanceled_once( + (file_guard, buf.into_raw_slice()), + async |(file_guard, buf)| system.write(file_guard, offset, buf).await, + ) + .await; ( (file_guard, FullSlice::must_new(slice)), res.map_err(epoll_uring_error_to_std), @@ -282,6 +302,55 @@ impl IoEngine { } } +/// We observe in tests that stop pageserver with SIGTERM immediately after it was ingesting data, +/// occasionally buffered writers fail (and get retried by BufferedWriter) with ECANCELED. +/// The problem is believed to be a race condition in how io_uring handles punted async work (io-wq) and signals. +/// Investigation ticket: +/// +/// This function retries the operation once if it fails with ECANCELED. +/// ONLY USE FOR IDEMPOTENT [`super::VirtualFile`] operations. 
+pub(super) async fn retry_ecanceled_once( + resources: T, + f: F, +) -> (T, Result>) +where + F: Fn(T) -> Fut, + Fut: std::future::Future>)>, + T: Send, + V: Send, +{ + let (resources, res) = f(resources).await; + let Err(e) = res else { + return (resources, res); + }; + let tokio_epoll_uring::Error::Op(err) = e else { + return (resources, Err(e)); + }; + if err.raw_os_error() != Some(nix::libc::ECANCELED) { + return (resources, Err(tokio_epoll_uring::Error::Op(err))); + } + { + static RATE_LIMIT: std::sync::Mutex = + std::sync::Mutex::new(utils::rate_limit::RateLimit::new(Duration::from_secs(1))); + let mut guard = RATE_LIMIT.lock().unwrap(); + guard.call2(|rate_limit_stats| { + info!( + %rate_limit_stats, "ECANCELED observed, assuming it is due to a signal being received by the submitting thread, retrying after a delay; this message is rate-limited" + ); + }); + drop(guard); + } + tokio::time::sleep(Duration::from_millis(100)).await; // something big enough to beat even heavily overcommitted CI runners + let (resources, res) = f(resources).await; + (resources, res) +} + +pub(super) fn panic_operation_must_be_idempotent() { + panic!( + "unsupported; io_engine may retry operations internally and thus needs them to be idempotent (retry_ecanceled_once)" + ) +} + pub enum FeatureTestResult { PlatformPreferred(IoEngineKind), Worse { diff --git a/pageserver/src/virtual_file/open_options.rs b/pageserver/src/virtual_file/open_options.rs index 2a7bb693f2..a40dfed4a4 100644 --- a/pageserver/src/virtual_file/open_options.rs +++ b/pageserver/src/virtual_file/open_options.rs @@ -110,18 +110,23 @@ impl OpenOptions { self } + /// Don't use, `O_APPEND` is not supported. 
+ pub fn append(&mut self, _append: bool) { + super::io_engine::panic_operation_must_be_idempotent(); + } + pub(in crate::virtual_file) async fn open(&self, path: &Path) -> std::io::Result { match &self.inner { Inner::StdFs(x) => x.open(path).map(|file| file.into()), #[cfg(target_os = "linux")] Inner::TokioEpollUring(x) => { let system = super::io_engine::tokio_epoll_uring_ext::thread_local_system().await; - system.open(path, x).await.map_err(|e| match e { - tokio_epoll_uring::Error::Op(e) => e, - tokio_epoll_uring::Error::System(system) => { - std::io::Error::new(std::io::ErrorKind::Other, system) - } + let (_, res) = super::io_engine::retry_ecanceled_once((), |()| async { + let res = system.open(path, x).await; + ((), res) }) + .await; + res.map_err(super::io_engine::epoll_uring_error_to_std) } } } @@ -140,6 +145,9 @@ impl OpenOptions { } pub fn custom_flags(mut self, flags: i32) -> Self { + if flags & nix::libc::O_APPEND != 0 { + super::io_engine::panic_operation_must_be_idempotent(); + } match &mut self.inner { Inner::StdFs(x) => { let _ = x.custom_flags(flags); From 1d1502bc167a2d0372756650581b4666597120c8 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 8 May 2025 08:57:53 +0200 Subject: [PATCH 04/65] fix(pageserver): `flush task cancelled` errors during timeline shutdown (#11853) # Refs - fixes https://github.com/neondatabase/neon/issues/11762 # Problem PR #10993 introduced internal retries for BufferedWriter flushes. PR #11052 added cancellation sensitivity to that retry loop. That cancellation sensitivity is an error path that didn't exist before. The result is that during timeline shutdown, after we `Timeline::cancel`, compaction can now fail with error `flush task cancelled`. The problem with that: 1. We mis-classify this as an `error!`-worthy event. 2. This causes tests to become flaky because the error is not in global `allowed_errors`. 
Technically we also trip the `compaction_circuit_breaker` because the resulting `CompactionError` is variant `::Other`. But since this is Timeline shutdown, it doesn't matter practically speaking. # Solution / Changes - Log the anyhow stack trace when classifying a compaction error as `error!`. This was helpful to identify sources of `flush task cancelled` errors. We only log at `error!` level in exceptional circumstances, so, it's ok to have slightly verbose logs. - Introduce typed errors along the `BufferedWriter::write_*` => `BlobWriter::write_blob` => `{Delta,Image}LayerWriter::put_*` => `Split{Delta,Image}LayerWriter::put_{value,image}` chain. - Proper mapping to `CompactionError`/`CreateImageLayersError` via new `From` impls. I am usually opposed to any magic `From` impls, but, it's how most of the compaction code works today. # Testing The symptoms are most prevalent in `test_runner/regress/test_branch_and_gc.py::test_branch_and_gc`. Before this PR, I was able to reproduce locally 1 or 2 times per 400 runs using `DEFAULT_PG_VERSION=15 BUILD_TYPE=release poetry run pytest --count 400 -n 8`. After this PR, it doesn't reproduce anymore after 2000 runs. # Future Work Technically the ingest path is also exposed to this new source of errors because `InMemoryLayer` is backed by `BufferedWriter`. But we haven't seen it occur in flaky tests yet. 
Details and a fix in - https://github.com/neondatabase/neon/pull/11851 --- pageserver/src/tenant/blob_io.rs | 27 ++++++++++++++----- pageserver/src/tenant/storage_layer.rs | 1 + .../storage_layer/batch_split_writer.rs | 18 ++++++++----- .../src/tenant/storage_layer/delta_layer.rs | 25 +++++++++++------ pageserver/src/tenant/storage_layer/errors.rs | 24 +++++++++++++++++ .../src/tenant/storage_layer/image_layer.rs | 20 ++++++++++---- pageserver/src/tenant/tasks.rs | 2 +- pageserver/src/tenant/timeline.rs | 20 ++++++++++++++ pageserver/src/tenant/timeline/compaction.rs | 3 +-- .../owned_buffers_io/write/flush.rs | 13 +++++++++ 10 files changed, 124 insertions(+), 29 deletions(-) create mode 100644 pageserver/src/tenant/storage_layer/errors.rs diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 8cf3c548c9..ed541c4f12 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -94,10 +94,23 @@ impl Header { pub enum WriteBlobError { #[error(transparent)] Flush(FlushTaskError), - #[error("blob too large ({len} bytes)")] - BlobTooLarge { len: usize }, #[error(transparent)] - WriteBlobRaw(anyhow::Error), + Other(anyhow::Error), +} + +impl WriteBlobError { + pub fn is_cancel(&self) -> bool { + match self { + WriteBlobError::Flush(e) => e.is_cancel(), + WriteBlobError::Other(_) => false, + } + } + pub fn into_anyhow(self) -> anyhow::Error { + match self { + WriteBlobError::Flush(e) => e.into_anyhow(), + WriteBlobError::Other(e) => e, + } + } } impl BlockCursor<'_> { @@ -327,7 +340,9 @@ where return ( ( io_buf.slice_len(), - Err(WriteBlobError::BlobTooLarge { len }), + Err(WriteBlobError::Other(anyhow::anyhow!( + "blob too large ({len} bytes)" + ))), ), srcbuf, ); @@ -391,7 +406,7 @@ where // Verify the header, to ensure we don't write invalid/corrupt data. 
let header = match Header::decode(&raw_with_header) .context("decoding blob header") - .map_err(WriteBlobError::WriteBlobRaw) + .map_err(WriteBlobError::Other) { Ok(header) => header, Err(err) => return (raw_with_header, Err(err)), @@ -401,7 +416,7 @@ where let raw_len = raw_with_header.len(); return ( raw_with_header, - Err(WriteBlobError::WriteBlobRaw(anyhow::anyhow!( + Err(WriteBlobError::Other(anyhow::anyhow!( "header length mismatch: {header_total_len} != {raw_len}" ))), ); diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 796ad01e54..5dfa961b71 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -2,6 +2,7 @@ pub mod batch_split_writer; pub mod delta_layer; +pub mod errors; pub mod filter_iterator; pub mod image_layer; pub mod inmemory_layer; diff --git a/pageserver/src/tenant/storage_layer/batch_split_writer.rs b/pageserver/src/tenant/storage_layer/batch_split_writer.rs index 39cd02d101..51f2e909a2 100644 --- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs +++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs @@ -10,6 +10,7 @@ use utils::id::TimelineId; use utils::lsn::Lsn; use utils::shard::TenantShardId; +use super::errors::PutError; use super::layer::S3_UPLOAD_LIMIT; use super::{ DeltaLayerWriter, ImageLayerWriter, PersistentLayerDesc, PersistentLayerKey, ResidentLayer, @@ -235,7 +236,7 @@ impl<'a> SplitImageLayerWriter<'a> { key: Key, img: Bytes, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), PutError> { // The current estimation is an upper bound of the space that the key/image could take // because we did not consider compression in this estimation. The resulting image layer // could be smaller than the target size. 
@@ -253,7 +254,8 @@ impl<'a> SplitImageLayerWriter<'a> { self.cancel.clone(), ctx, ) - .await?; + .await + .map_err(PutError::Other)?; let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer); self.batches.add_unfinished_image_writer( prev_image_writer, @@ -346,7 +348,7 @@ impl<'a> SplitDeltaLayerWriter<'a> { lsn: Lsn, val: Value, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), PutError> { // The current estimation is key size plus LSN size plus value size estimation. This is not an accurate // number, and therefore the final layer size could be a little bit larger or smaller than the target. // @@ -366,7 +368,8 @@ impl<'a> SplitDeltaLayerWriter<'a> { self.cancel.clone(), ctx, ) - .await?, + .await + .map_err(PutError::Other)?, )); } let (_, inner) = self.inner.as_mut().unwrap(); @@ -386,7 +389,8 @@ impl<'a> SplitDeltaLayerWriter<'a> { self.cancel.clone(), ctx, ) - .await?; + .await + .map_err(PutError::Other)?; let (start_key, prev_delta_writer) = self.inner.replace((key, next_delta_writer)).unwrap(); self.batches.add_unfinished_delta_writer( @@ -396,11 +400,11 @@ impl<'a> SplitDeltaLayerWriter<'a> { ); } else if inner.estimated_size() >= S3_UPLOAD_LIMIT { // We have to produce a very large file b/c a key is updated too often. 
- anyhow::bail!( + return Err(PutError::Other(anyhow::anyhow!( "a single key is updated too often: key={}, estimated_size={}, and the layer file cannot be produced", key, inner.estimated_size() - ); + ))); } } self.last_key_written = key; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 11875ac653..2c1b27c8d5 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -55,6 +55,7 @@ use utils::bin_ser::SerializeError; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; +use super::errors::PutError; use super::{ AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer, ValuesReconstructState, @@ -477,12 +478,15 @@ impl DeltaLayerWriterInner { lsn: Lsn, val: Value, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), PutError> { let (_, res) = self .put_value_bytes( key, lsn, - Value::ser(&val)?.slice_len(), + Value::ser(&val) + .map_err(anyhow::Error::new) + .map_err(PutError::Other)? 
+ .slice_len(), val.will_init(), ctx, ) @@ -497,7 +501,7 @@ impl DeltaLayerWriterInner { val: FullSlice, will_init: bool, ctx: &RequestContext, - ) -> (FullSlice, anyhow::Result<()>) + ) -> (FullSlice, Result<(), PutError>) where Buf: IoBuf + Send, { @@ -513,19 +517,24 @@ impl DeltaLayerWriterInner { .blob_writer .write_blob_maybe_compressed(val, ctx, compression) .await; + let res = res.map_err(PutError::WriteBlob); let off = match res { Ok((off, _)) => off, - Err(e) => return (val, Err(anyhow::anyhow!(e))), + Err(e) => return (val, Err(e)), }; let blob_ref = BlobRef::new(off, will_init); let delta_key = DeltaKey::from_key_lsn(&key, lsn); - let res = self.tree.append(&delta_key.0, blob_ref.0); + let res = self + .tree + .append(&delta_key.0, blob_ref.0) + .map_err(anyhow::Error::new) + .map_err(PutError::Other); self.num_keys += 1; - (val, res.map_err(|e| anyhow::anyhow!(e))) + (val, res) } fn size(&self) -> u64 { @@ -694,7 +703,7 @@ impl DeltaLayerWriter { lsn: Lsn, val: Value, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), PutError> { self.inner .as_mut() .unwrap() @@ -709,7 +718,7 @@ impl DeltaLayerWriter { val: FullSlice, will_init: bool, ctx: &RequestContext, - ) -> (FullSlice, anyhow::Result<()>) + ) -> (FullSlice, Result<(), PutError>) where Buf: IoBuf + Send, { diff --git a/pageserver/src/tenant/storage_layer/errors.rs b/pageserver/src/tenant/storage_layer/errors.rs new file mode 100644 index 0000000000..591e489faa --- /dev/null +++ b/pageserver/src/tenant/storage_layer/errors.rs @@ -0,0 +1,24 @@ +use crate::tenant::blob_io::WriteBlobError; + +#[derive(Debug, thiserror::Error)] +pub enum PutError { + #[error(transparent)] + WriteBlob(WriteBlobError), + #[error(transparent)] + Other(anyhow::Error), +} + +impl PutError { + pub fn is_cancel(&self) -> bool { + match self { + PutError::WriteBlob(e) => e.is_cancel(), + PutError::Other(_) => false, + } + } + pub fn into_anyhow(self) -> anyhow::Error { + match self { + PutError::WriteBlob(e) 
=> e.into_anyhow(), + PutError::Other(e) => e, + } + } +} diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index d684230572..740f53f928 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -53,6 +53,7 @@ use utils::bin_ser::SerializeError; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; +use super::errors::PutError; use super::layer_name::ImageLayerName; use super::{ AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer, @@ -842,8 +843,14 @@ impl ImageLayerWriterInner { key: Key, img: Bytes, ctx: &RequestContext, - ) -> anyhow::Result<()> { - ensure!(self.key_range.contains(&key)); + ) -> Result<(), PutError> { + if !self.key_range.contains(&key) { + return Err(PutError::Other(anyhow::anyhow!( + "key {:?} not in range {:?}", + key, + self.key_range + ))); + } let compression = self.conf.image_compression; let uncompressed_len = img.len() as u64; self.uncompressed_bytes += uncompressed_len; @@ -853,7 +860,7 @@ impl ImageLayerWriterInner { .write_blob_maybe_compressed(img.slice_len(), ctx, compression) .await; // TODO: re-use the buffer for `img` further upstack - let (off, compression_info) = res?; + let (off, compression_info) = res.map_err(PutError::WriteBlob)?; if compression_info.compressed_size.is_some() { // The image has been considered for compression at least self.uncompressed_bytes_eligible += uncompressed_len; @@ -865,7 +872,10 @@ impl ImageLayerWriterInner { let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; key.write_to_byte_slice(&mut keybuf); - self.tree.append(&keybuf, off)?; + self.tree + .append(&keybuf, off) + .map_err(anyhow::Error::new) + .map_err(PutError::Other)?; #[cfg(feature = "testing")] { @@ -1085,7 +1095,7 @@ impl ImageLayerWriter { key: Key, img: Bytes, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), PutError> { 
self.inner.as_mut().unwrap().put_image(key, img, ctx).await } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 1112a5330b..4709a6d616 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -340,7 +340,7 @@ pub(crate) fn log_compaction_error( } else { match level { Level::ERROR if degrade_to_warning => warn!("Compaction failed and discarded: {err:#}"), - Level::ERROR => error!("Compaction failed: {err:#}"), + Level::ERROR => error!("Compaction failed: {err:?}"), Level::INFO => info!("Compaction failed: {err:#}"), level => unimplemented!("unexpected level {level:?}"), } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index cfeab77598..c8d897d074 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -987,6 +987,16 @@ impl From for CreateImageLayersError { } } +impl From for CreateImageLayersError { + fn from(e: super::storage_layer::errors::PutError) -> Self { + if e.is_cancel() { + CreateImageLayersError::Cancelled + } else { + CreateImageLayersError::Other(e.into_anyhow()) + } + } +} + impl From for CreateImageLayersError { fn from(e: GetVectoredError) -> Self { match e { @@ -5923,6 +5933,16 @@ impl From for CompactionError { } } +impl From for CompactionError { + fn from(e: super::storage_layer::errors::PutError) -> Self { + if e.is_cancel() { + CompactionError::ShuttingDown + } else { + CompactionError::Other(e.into_anyhow()) + } + } +} + #[serde_as] #[derive(serde::Serialize)] struct RecordedDuration(#[serde_as(as = "serde_with::DurationMicroSeconds")] Duration); diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index d0c13d86ce..07cd274a41 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -2204,8 +2204,7 @@ impl Timeline { .as_mut() .unwrap() .put_value(key, lsn, value, ctx) - .await - 
.map_err(CompactionError::Other)?; + .await?; } else { let owner = self.shard_identity.get_shard_number(&key); diff --git a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs index b41a9f6cd2..ac9867e8b4 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs @@ -247,6 +247,19 @@ pub enum FlushTaskError { Cancelled, } +impl FlushTaskError { + pub fn is_cancel(&self) -> bool { + match self { + FlushTaskError::Cancelled => true, + } + } + pub fn into_anyhow(self) -> anyhow::Error { + match self { + FlushTaskError::Cancelled => anyhow::anyhow!(self), + } + } +} + impl FlushBackgroundTask where Buf: IoBufAligned + Send + Sync, From 40f32ea326ac9f8b691f179d0ced414470eb06ff Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 8 May 2025 10:19:14 +0100 Subject: [PATCH 05/65] pageserver: refactor import flow and add job concurrency limiting (#11816) ## Problem Import code is one big block. Separating planning and execution will help with reporting progress of import to storcon (building block for resuming import). ## Summary of changes Split up the import into planning and execution. A concurrency limit driven by PS config is also added. 
--- libs/pageserver_api/src/config.rs | 11 + pageserver/src/config.rs | 4 + .../src/tenant/timeline/import_pgdata.rs | 9 +- .../src/tenant/timeline/import_pgdata/flow.rs | 195 ++++++++++-------- 4 files changed, 129 insertions(+), 90 deletions(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index b64c42a808..5b0c13dd89 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -182,6 +182,7 @@ pub struct ConfigToml { pub tracing: Option, pub enable_tls_page_service_api: bool, pub dev_mode: bool, + pub timeline_import_config: TimelineImportConfig, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -300,6 +301,12 @@ impl From for tracing_utils::Protocol { } } +#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub struct TimelineImportConfig { + pub import_job_concurrency: NonZeroUsize, + pub import_job_soft_size_limit: NonZeroUsize, +} + pub mod statvfs { pub mod mock { #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -659,6 +666,10 @@ impl Default for ConfigToml { tracing: None, enable_tls_page_service_api: false, dev_mode: false, + timeline_import_config: TimelineImportConfig { + import_job_concurrency: NonZeroUsize::new(128).unwrap(), + import_job_soft_size_limit: NonZeroUsize::new(1024 * 1024 * 1024).unwrap(), + }, } } } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index ded2805602..7e773f56b3 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -230,6 +230,8 @@ pub struct PageServerConf { /// such as authentication requirements for HTTP and PostgreSQL APIs. /// This is insecure and should only be used in development environments. 
pub dev_mode: bool, + + pub timeline_import_config: pageserver_api::config::TimelineImportConfig, } /// Token for authentication to safekeepers @@ -404,6 +406,7 @@ impl PageServerConf { tracing, enable_tls_page_service_api, dev_mode, + timeline_import_config, } = config_toml; let mut conf = PageServerConf { @@ -457,6 +460,7 @@ impl PageServerConf { tracing, enable_tls_page_service_api, dev_mode, + timeline_import_config, // ------------------------------------------------------------ // fields that require additional validation or custom handling diff --git a/pageserver/src/tenant/timeline/import_pgdata.rs b/pageserver/src/tenant/timeline/import_pgdata.rs index 6ab6b90cb6..c4a8df39a3 100644 --- a/pageserver/src/tenant/timeline/import_pgdata.rs +++ b/pageserver/src/tenant/timeline/import_pgdata.rs @@ -149,14 +149,7 @@ pub async fn doit( } .await?; - flow::run( - timeline.clone(), - base_lsn, - control_file, - storage.clone(), - ctx, - ) - .await?; + flow::run(timeline.clone(), control_file, storage.clone(), ctx).await?; // // Communicate that shard is done. 
diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs index c6d2944769..34c073365d 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs @@ -34,7 +34,9 @@ use std::sync::Arc; use anyhow::{bail, ensure}; use bytes::Bytes; +use futures::stream::FuturesOrdered; use itertools::Itertools; +use pageserver_api::config::TimelineImportConfig; use pageserver_api::key::{ CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, Key, TWOPHASEDIR_KEY, rel_block_to_key, rel_dir_to_key, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key, @@ -46,8 +48,9 @@ use pageserver_api::shard::ShardIdentity; use postgres_ffi::relfile_utils::parse_relfilename; use postgres_ffi::{BLCKSZ, pg_constants}; use remote_storage::RemotePath; -use tokio::task::JoinSet; -use tracing::{Instrument, debug, info_span, instrument}; +use tokio::sync::Semaphore; +use tokio_stream::StreamExt; +use tracing::{debug, instrument}; use utils::bin_ser::BeSer; use utils::lsn::Lsn; @@ -63,37 +66,39 @@ use crate::tenant::storage_layer::{ImageLayerWriter, Layer}; pub async fn run( timeline: Arc, - pgdata_lsn: Lsn, control_file: ControlFile, storage: RemoteStorageWrapper, ctx: &RequestContext, ) -> anyhow::Result<()> { - Flow { - timeline, - pgdata_lsn, + let planner = Planner { control_file, - tasks: Vec::new(), - storage, - } - .run(ctx) - .await + storage: storage.clone(), + shard: timeline.shard_identity, + tasks: Vec::default(), + }; + + let import_config = &timeline.conf.timeline_import_config; + let plan = planner.plan(import_config).await?; + plan.execute(timeline, import_config, ctx).await } -struct Flow { - timeline: Arc, - pgdata_lsn: Lsn, +struct Planner { control_file: ControlFile, - tasks: Vec, storage: RemoteStorageWrapper, + shard: ShardIdentity, + tasks: Vec, } -impl Flow { - /// Perform the ingestion into [`Self::timeline`]. 
- /// Assumes the timeline is empty (= no layers). - pub async fn run(mut self, ctx: &RequestContext) -> anyhow::Result<()> { - let pgdata_lsn = Lsn(self.control_file.control_file_data().checkPoint).align(); +struct Plan { + jobs: Vec, +} - self.pgdata_lsn = pgdata_lsn; +impl Planner { + /// Creates an import plan + /// + /// This function is and must remain pure: given the same input, it will generate the same import plan. + async fn plan(mut self, import_config: &TimelineImportConfig) -> anyhow::Result { + let pgdata_lsn = Lsn(self.control_file.control_file_data().checkPoint).align(); let datadir = PgDataDir::new(&self.storage).await?; @@ -115,7 +120,7 @@ impl Flow { } // Import SLRUs - if self.timeline.tenant_shard_id.is_shard_zero() { + if self.shard.is_shard_zero() { // pg_xact (01:00 keyspace) self.import_slru(SlruKind::Clog, &self.storage.pgdata().join("pg_xact")) .await?; @@ -166,14 +171,16 @@ impl Flow { let mut last_end_key = Key::MIN; let mut current_chunk = Vec::new(); let mut current_chunk_size: usize = 0; - let mut parallel_jobs = Vec::new(); + let mut jobs = Vec::new(); for task in std::mem::take(&mut self.tasks).into_iter() { - if current_chunk_size + task.total_size() > 1024 * 1024 * 1024 { + if current_chunk_size + task.total_size() + > import_config.import_job_soft_size_limit.into() + { let key_range = last_end_key..task.key_range().start; - parallel_jobs.push(ChunkProcessingJob::new( + jobs.push(ChunkProcessingJob::new( key_range.clone(), std::mem::take(&mut current_chunk), - &self, + pgdata_lsn, )); last_end_key = key_range.end; current_chunk_size = 0; @@ -181,45 +188,13 @@ impl Flow { current_chunk_size += task.total_size(); current_chunk.push(task); } - parallel_jobs.push(ChunkProcessingJob::new( + jobs.push(ChunkProcessingJob::new( last_end_key..Key::MAX, current_chunk, - &self, + pgdata_lsn, )); - // Start all jobs simultaneosly - let mut work = JoinSet::new(); - // TODO: semaphore? 
- for job in parallel_jobs { - let ctx: RequestContext = - ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Error); - work.spawn(async move { job.run(&ctx).await }.instrument(info_span!("parallel_job"))); - } - let mut results = Vec::new(); - while let Some(result) = work.join_next().await { - match result { - Ok(res) => { - results.push(res); - } - Err(_joinset_err) => { - results.push(Err(anyhow::anyhow!( - "parallel job panicked or cancelled, check pageserver logs" - ))); - } - } - } - - if results.iter().all(|r| r.is_ok()) { - Ok(()) - } else { - let mut msg = String::new(); - for result in results { - if let Err(err) = result { - msg.push_str(&format!("{err:?}\n\n")); - } - } - bail!("Some parallel jobs failed:\n\n{msg}"); - } + Ok(Plan { jobs }) } #[instrument(level = tracing::Level::DEBUG, skip_all, fields(dboid=%db.dboid, tablespace=%db.spcnode, path=%db.path))] @@ -266,7 +241,7 @@ impl Flow { let end_key = rel_block_to_key(file.rel_tag, start_blk + (len / 8192) as u32); self.tasks .push(AnyImportTask::RelBlocks(ImportRelBlocksTask::new( - *self.timeline.get_shard_identity(), + self.shard, start_key..end_key, &file.path, self.storage.clone(), @@ -289,7 +264,7 @@ impl Flow { } async fn import_slru(&mut self, kind: SlruKind, path: &RemotePath) -> anyhow::Result<()> { - assert!(self.timeline.tenant_shard_id.is_shard_zero()); + assert!(self.shard.is_shard_zero()); let segments = self.storage.listfilesindir(path).await?; let segments: Vec<(String, u32, usize)> = segments @@ -344,6 +319,68 @@ impl Flow { } } +impl Plan { + async fn execute( + self, + timeline: Arc, + import_config: &TimelineImportConfig, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + let mut work = FuturesOrdered::new(); + let semaphore = Arc::new(Semaphore::new(import_config.import_job_concurrency.into())); + + let jobs_in_plan = self.jobs.len(); + + let mut jobs = self.jobs.into_iter().enumerate().peekable(); + let mut results = Vec::new(); + + // Run import jobs concurrently 
up to the limit specified by the pageserver configuration. + // Note that we process completed futures in the order of insertion. This will be the + // building block for resuming imports across pageserver restarts or tenant migrations. + while results.len() < jobs_in_plan { + tokio::select! { + permit = semaphore.clone().acquire_owned(), if jobs.peek().is_some() => { + let permit = permit.expect("never closed"); + let (job_idx, job) = jobs.next().expect("we peeked"); + let job_timeline = timeline.clone(); + let ctx = ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Error); + + work.push_back(tokio::task::spawn(async move { + let _permit = permit; + let res = job.run(job_timeline, &ctx).await; + (job_idx, res) + })); + }, + maybe_complete_job_idx = work.next() => { + match maybe_complete_job_idx { + Some(Ok((_job_idx, res))) => { + results.push(res); + }, + Some(Err(_)) => { + results.push(Err(anyhow::anyhow!( + "parallel job panicked or cancelled, check pageserver logs" + ))); + } + None => {} + } + } + } + } + + if results.iter().all(|r| r.is_ok()) { + Ok(()) + } else { + let mut msg = String::new(); + for result in results { + if let Err(err) = result { + msg.push_str(&format!("{err:?}\n\n")); + } + } + bail!("Some parallel jobs failed:\n\n{msg}"); + } + } +} + // // dbdir iteration tools // @@ -713,7 +750,6 @@ impl From for AnyImportTask { } struct ChunkProcessingJob { - timeline: Arc, range: Range, tasks: Vec, @@ -721,25 +757,24 @@ struct ChunkProcessingJob { } impl ChunkProcessingJob { - fn new(range: Range, tasks: Vec, env: &Flow) -> Self { - assert!(env.pgdata_lsn.is_valid()); + fn new(range: Range, tasks: Vec, pgdata_lsn: Lsn) -> Self { + assert!(pgdata_lsn.is_valid()); Self { - timeline: env.timeline.clone(), range, tasks, - pgdata_lsn: env.pgdata_lsn, + pgdata_lsn, } } - async fn run(self, ctx: &RequestContext) -> anyhow::Result<()> { + async fn run(self, timeline: Arc, ctx: &RequestContext) -> anyhow::Result<()> { let mut writer =
ImageLayerWriter::new( - self.timeline.conf, - self.timeline.timeline_id, - self.timeline.tenant_shard_id, + timeline.conf, + timeline.timeline_id, + timeline.tenant_shard_id, &self.range, self.pgdata_lsn, - &self.timeline.gate, - self.timeline.cancel.clone(), + &timeline.gate, + timeline.cancel.clone(), ctx, ) .await?; @@ -751,24 +786,20 @@ impl ChunkProcessingJob { let resident_layer = if nimages > 0 { let (desc, path) = writer.finish(ctx).await?; - Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)? + Layer::finish_creating(timeline.conf, &timeline, desc, &path)? } else { // dropping the writer cleans up return Ok(()); }; // this is sharing the same code as create_image_layers - let mut guard = self.timeline.layers.write().await; + let mut guard = timeline.layers.write().await; guard .open_mut()? - .track_new_image_layers(&[resident_layer.clone()], &self.timeline.metrics); + .track_new_image_layers(&[resident_layer.clone()], &timeline.metrics); crate::tenant::timeline::drop_wlock(guard); - // Schedule the layer for upload but don't add barriers such as - // wait for completion or index upload, so we don't inhibit upload parallelism. - // TODO: limit upload parallelism somehow (e.g. by limiting concurrency of jobs?) - // TODO: or regulate parallelism by upload queue depth? Prob should happen at a higher level. - self.timeline + timeline .remote_client .schedule_layer_file_upload(resident_layer)?; From 7e55497e131f2f26a16ae22bff80cac11951cdd4 Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Thu, 8 May 2025 14:00:45 +0400 Subject: [PATCH 06/65] tests: flush wal before waiting for last record lsn (#11726) ## Problem Compute may flush WAL on page boundaries, leaving some records partially flushed for a long time. It leads to `wait_for_last_flush_lsn` stuck waiting for this partial LSN. 
- Closes: https://github.com/neondatabase/cloud/issues/27876 ## Summary of changes - Flush WAL via CHECKPOINT after requesting current_wal_lsn to make sure that the record we point to is flushed in full - Use proper endpoint in `test_timeline_detach_with_aux_files_with_detach_v1` --- test_runner/fixtures/neon_fixtures.py | 7 +++++++ test_runner/regress/test_timeline_detach_ancestor.py | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 85ad49bb4f..370eca5130 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -5477,6 +5477,13 @@ def wait_for_last_flush_lsn( if last_flush_lsn is None: last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + # The last_flush_lsn may not correspond to a record boundary. + # For example, if the compute flushed WAL on a page boundary, + # the remaining part of the record might not be flushed for a long time. + # This would prevent the pageserver from reaching last_flush_lsn promptly. + # To ensure the rest of the record reaches the pageserver quickly, + # we forcibly flush the WAL by using CHECKPOINT. 
+ endpoint.safe_psql("CHECKPOINT") results = [] for tenant_shard_id, pageserver in shards: diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index a71652af8a..d42c5d403e 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -1822,7 +1822,7 @@ def test_timeline_detach_with_aux_files_with_detach_v1( endpoint2.safe_psql( "SELECT pg_create_logical_replication_slot('test_slot_restore', 'pgoutput')" ) - lsn3 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, branch_timeline_id) + lsn3 = wait_for_last_flush_lsn(env, endpoint2, env.initial_tenant, branch_timeline_id) assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn1).keys()) == set([]) assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn3).keys()) == set( ["pg_replslot/test_slot_restore/state"] @@ -1839,7 +1839,7 @@ def test_timeline_detach_with_aux_files_with_detach_v1( assert all_reparented == set([]) # We need to ensure all safekeeper data are ingested before checking aux files: the API does not wait for LSN. - wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, branch_timeline_id) + wait_for_last_flush_lsn(env, endpoint2, env.initial_tenant, branch_timeline_id) assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn2).keys()) == set( ["pg_replslot/test_slot_parent_1/state", "pg_replslot/test_slot_parent_2/state"] ), "main branch unaffected" From 6c70789cfdf145ae4ca73228884ca1359b80c302 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 8 May 2025 12:14:41 +0200 Subject: [PATCH 07/65] storcon: increase drain+fill secondary warmup timeout from 20 to 30 seconds (#11848) ## Problem During deployment drains/fills, we often see the storage controller giving up on warmups after 20 seconds, when the warmup is nearly complete (~90%). 
This can cause latency spikes for migrated tenants if they block on layer downloads. Touches https://github.com/neondatabase/cloud/issues/26193. ## Summary of changes Increase the drain and fill secondary warmup timeout from 20 to 30 seconds. --- storage_controller/src/service.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 21c693af97..fdb791c2cf 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -8485,7 +8485,7 @@ impl Service { // By default, live migrations are generous about the wait time for getting // the secondary location up to speed. When draining, give up earlier in order // to not stall the operation when a cold secondary is encountered. - const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20); + const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(30); const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5); let reconciler_config = ReconcilerConfigBuilder::new(ReconcilerPriority::Normal) .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT) @@ -8818,7 +8818,7 @@ impl Service { node_id: NodeId, cancel: CancellationToken, ) -> Result<(), OperationError> { - const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20); + const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(30); const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5); let reconciler_config = ReconcilerConfigBuilder::new(ReconcilerPriority::Normal) .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT) From d22377c754556c95d24970458cb08968828902b3 Mon Sep 17 00:00:00 2001 From: Mark Novikov Date: Thu, 8 May 2025 15:04:28 +0400 Subject: [PATCH 08/65] Skip event triggers in dump-restore (#11794) ## Problem Data import fails if the src db has any event triggers, because those can only be restored by a superuser. Specifically imports from Heroku and Supabase are guaranteed to fail. 
Closes https://github.com/neondatabase/cloud/issues/27353 ## Summary of changes Depends on `pg_dump` patches per each supported PostgreSQL version: - https://github.com/neondatabase/postgres/pull/630 - https://github.com/neondatabase/postgres/pull/629 - https://github.com/neondatabase/postgres/pull/627 - https://github.com/neondatabase/postgres/pull/628 --- compute_tools/src/bin/fast_import.rs | 1 + test_runner/regress/test_import_pgdata.py | 49 +++++++++++++++++++++++ vendor/postgres-v14 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 4 +- 5 files changed, 54 insertions(+), 4 deletions(-) diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index 537028cde1..78acd78585 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -348,6 +348,7 @@ async fn run_dump_restore( "--no-security-labels".to_string(), "--no-subscriptions".to_string(), "--no-tablespaces".to_string(), + "--no-event-triggers".to_string(), // format "--format".to_string(), "directory".to_string(), diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py index a26c3994a5..2fda1991f7 100644 --- a/test_runner/regress/test_import_pgdata.py +++ b/test_runner/regress/test_import_pgdata.py @@ -641,6 +641,55 @@ def test_fast_import_binary( assert res[0][0] == 10 +def test_fast_import_event_triggers( + test_output_dir, + vanilla_pg: VanillaPostgres, + port_distributor: PortDistributor, + fast_import: FastImport, +): + vanilla_pg.start() + vanilla_pg.safe_psql(""" + CREATE FUNCTION test_event_trigger_for_drops() + RETURNS event_trigger LANGUAGE plpgsql AS $$ + DECLARE + obj record; + BEGIN + FOR obj IN SELECT * FROM pg_event_trigger_dropped_objects() + LOOP + RAISE NOTICE '% dropped object: % %.% %', + tg_tag, + obj.object_type, + obj.schema_name, + obj.object_name, + obj.object_identity; + END LOOP; + END + $$; + + CREATE EVENT TRIGGER test_event_trigger_for_drops + ON sql_drop + 
EXECUTE PROCEDURE test_event_trigger_for_drops(); + """) + + pg_port = port_distributor.get_port() + p = fast_import.run_pgdata(pg_port=pg_port, source_connection_string=vanilla_pg.connstr()) + assert p.returncode == 0 + + vanilla_pg.stop() + + pgbin = PgBin(test_output_dir, fast_import.pg_distrib_dir, fast_import.pg_version) + with VanillaPostgres( + fast_import.workdir / "pgdata", pgbin, pg_port, False + ) as new_pgdata_vanilla_pg: + new_pgdata_vanilla_pg.start() + + # database name and user are hardcoded in fast_import binary, and they are different from normal vanilla postgres + conn = PgProtocol(dsn=f"postgresql://cloud_admin@localhost:{pg_port}/neondb") + res = conn.safe_psql("SELECT count(*) FROM pg_event_trigger;") + log.info(f"Result: {res}") + assert res[0][0] == 0, f"Neon does not support importing event triggers, got: {res[0][0]}" + + def test_fast_import_restore_to_connstring( test_output_dir, vanilla_pg: VanillaPostgres, diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index c8dab02bfc..108856a4ae 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit c8dab02bfc003ae7bd59096919042d7840f3c194 +Subproject commit 108856a4ae76be285b04497a0ed08fcbe60ddbe9 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index eab3a37834..b763ab54b9 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit eab3a37834cac6ec0719bf817ac918a201712d66 +Subproject commit b763ab54b98d232a0959371ab1d07f06ed77c49e diff --git a/vendor/revisions.json b/vendor/revisions.json index 74a6ff33d7..4307fd1c3f 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,7 +1,7 @@ { "v17": [ "17.4", - "eab3a37834cac6ec0719bf817ac918a201712d66" + "b763ab54b98d232a0959371ab1d07f06ed77c49e" ], "v16": [ "16.8", @@ -13,6 +13,6 @@ ], "v14": [ "14.17", - "c8dab02bfc003ae7bd59096919042d7840f3c194" + "108856a4ae76be285b04497a0ed08fcbe60ddbe9" ] } From 42d93031a13b31cee2fbb8c2e7f1b094b0f554a2 Mon Sep 17 00:00:00 2001 
From: Christian Schwarz Date: Thu, 8 May 2025 13:48:29 +0200 Subject: [PATCH 09/65] fixup(#11819): broken macOS build (#11861) refs - fixes https://github.com/neondatabase/neon/issues/11860 --- pageserver/src/virtual_file/io_engine.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index d8eb803335..7827682498 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -13,7 +13,7 @@ pub(super) mod tokio_epoll_uring_ext; use tokio_epoll_uring::IoBuf; -use tracing::{Instrument, info}; +use tracing::Instrument; pub(crate) use super::api::IoEngineKind; #[derive(Clone, Copy)] @@ -111,7 +111,8 @@ pub(crate) fn get() -> IoEngine { use std::os::unix::prelude::FileExt; use std::sync::atomic::{AtomicU8, Ordering}; -use std::time::Duration; +#[cfg(target_os = "linux")] +use {std::time::Duration, tracing::info}; use super::owned_buffers_io::io_buf_ext::FullSlice; use super::owned_buffers_io::slice::SliceMutExt; @@ -309,6 +310,7 @@ impl IoEngine { /// /// This function retries the operation once if it fails with ECANCELED. /// ONLY USE FOR IDEMPOTENT [`super::VirtualFile`] operations. +#[cfg(target_os = "linux")] pub(super) async fn retry_ecanceled_once( resources: T, f: F, From 659366060dcef08a46c42c0794a829afb4270b1c Mon Sep 17 00:00:00 2001 From: Santosh Pingale <3813695+santosh-d3vpl3x@users.noreply.github.com> Date: Thu, 8 May 2025 16:09:15 +0200 Subject: [PATCH 10/65] Reuse remote_client from the SnapshotDownloader instead of recreating in download function (#11812) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem At the moment, remote_client and target are recreated in download function. We could reuse it from SnapshotDownloader instance. 
This isn't a problem per se, just a quality-of-life improvement, but it caught my attention when we were trying out snapshot downloading in one of the older versions and ran into a curious case of s3 clients behaving in two different manners. One client used `force_path_style` and the other one didn't. **Logs from this run:** ``` 2025-05-02T12:56:22.384626Z DEBUG /data/snappie/2739e7da34e625e3934ef0b76fa12483/timelines/d44b831adb0a6ba96792dc3a5cc30910/000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014E8F20-00000000014E8F99-00000001 requires download... 2025-05-02T12:56:22.384689Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:apply_configuration: timeout settings for this operation: TimeoutConfig { connect_timeout: Set(3.1s), read_timeout: Disabled, operation_timeout: Disabled, operation_attempt_timeout: Disabled } 2025-05-02T12:56:22.384730Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op: entering 'serialization' phase 2025-05-02T12:56:22.384784Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op: entering 'before transmit' phase 2025-05-02T12:56:22.384813Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op: retry strategy has OKed initial request 2025-05-02T12:56:22.384841Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op: beginning attempt #1 2025-05-02T12:56:22.384870Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op:try_attempt: resolving endpoint endpoint_params=EndpointResolverParams(TypeErasedBox[!Clone]:Params { bucket: Some("bucket"), region: Some("eu-north-1"), use_fips: false, use_dual_stack: false, endpoint: Some("https://s3.self-hosted.company.com"), force_path_style: false, accelerate: false, use_global_endpoint: false, use_object_lambda_endpoint: None, key: None, prefix:
Some("/pageserver/tenants/2739e7da34e625e3934ef0b76fa12483/timelines/d44b831adb0a6ba96792dc3a5cc30910/000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014E8F20-00000000014E8F99-00000001"), copy_source: None, disable_access_points: None, disable_multi_region_access_points: false, use_arn_region: None, use_s3_express_control_endpoint: None, disable_s3_express_session_auth: None }) endpoint_prefix=None 2025-05-02T12:56:22.384979Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op:try_attempt: will use endpoint Endpoint { url: "https://neon.s3.self-hosted.company.com", headers: {}, properties: {"authSchemes": Array([Object({"signingRegion": String("eu-north-1"), "disableDoubleEncoding": Bool(true), "name": String("sigv4"), "signingName": String("s3")})])} } 2025-05-02T12:56:22.385042Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op:try_attempt:lazy_load_identity:provide_credentials{provider=default_chain}: loaded credentials provider=Environment 2025-05-02T12:56:22.385066Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op:try_attempt:lazy_load_identity: identity cache miss occurred; added new identity (took 35.958µs) new_expiration=2025-05-02T13:11:22.385028Z valid_for=899.999961437s partition=IdentityCachePartition(5) 2025-05-02T12:56:22.385090Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op:try_attempt: loaded identity 2025-05-02T12:56:22.385162Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op:try_attempt: entering 'transmit' phase 2025-05-02T12:56:22.385211Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op:try_attempt: new TCP connector created in 361ns 2025-05-02T12:56:22.385288Z DEBUG resolving host="neon.s3.self-hosted.company.com" 2025-05-02T12:56:22.390796Z DEBUG invoke{service=s3 
operation=ListObjectVersions sdk_invocation_id=7315885}:try_op:try_attempt: encountered orchestrator error; halting ``` --- storage_scrubber/src/tenant_snapshot.rs | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/storage_scrubber/src/tenant_snapshot.rs b/storage_scrubber/src/tenant_snapshot.rs index 24231e32fc..d0ca53f8ab 100644 --- a/storage_scrubber/src/tenant_snapshot.rs +++ b/storage_scrubber/src/tenant_snapshot.rs @@ -24,7 +24,6 @@ pub struct SnapshotDownloader { remote_client: GenericRemoteStorage, #[allow(dead_code)] target: RootTarget, - bucket_config: BucketConfig, tenant_id: TenantId, output_path: Utf8PathBuf, concurrency: usize, @@ -43,7 +42,6 @@ impl SnapshotDownloader { Ok(Self { remote_client, target, - bucket_config, tenant_id, output_path, concurrency, @@ -218,11 +216,9 @@ impl SnapshotDownloader { } pub async fn download(&self) -> anyhow::Result<()> { - let (remote_client, target) = - init_remote(self.bucket_config.clone(), NodeKind::Pageserver).await?; - // Generate a stream of TenantShardId - let shards = stream_tenant_shards(&remote_client, &target, self.tenant_id).await?; + let shards = + stream_tenant_shards(&self.remote_client, &self.target, self.tenant_id).await?; let shards: Vec = shards.try_collect().await?; // Only read from shards that have the highest count: avoids redundantly downloading @@ -240,7 +236,8 @@ impl SnapshotDownloader { for shard in shards.into_iter().filter(|s| s.shard_count == shard_count) { // Generate a stream of TenantTimelineId - let timelines = stream_tenant_timelines(&remote_client, &target, shard).await?; + let timelines = + stream_tenant_timelines(&self.remote_client, &self.target, shard).await?; // Generate a stream of S3TimelineBlobData async fn load_timeline_index( @@ -251,8 +248,8 @@ impl SnapshotDownloader { let data = list_timeline_blobs(remote_client, ttid, target).await?; Ok((ttid, data)) } - let timelines = - timelines.map_ok(|ttid| load_timeline_index(&remote_client, 
&target, ttid)); + let timelines = timelines + .map_ok(|ttid| load_timeline_index(&self.remote_client, &self.target, ttid)); let mut timelines = std::pin::pin!(timelines.try_buffered(8)); while let Some(i) = timelines.next().await { From 622b3b29936d0496808396e447e678177a58412d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 8 May 2025 17:13:11 +0200 Subject: [PATCH 11/65] Fixes for enabling --timelines-onto-safekeepers in tests (#11854) Second PR with fixes extracted from #11712, relating to `--timelines-onto-safekeepers`. Does the following: * Moves safekeeper registration to `neon_local` instead of the test fixtures * Pass safekeeper JWT token if `--timelines-onto-safekeepers` is enabled * Allow some warnings related to offline safekeepers (similarly to how we allow them for offline pageservers) * Enable generations on the compute's config if `--timelines-onto-safekeepers` is enabled * fix parallel `pull_timeline` race condition (the one that #11786 put for later) Fixes #11424 Part of #11670 --- control_plane/src/bin/neon_local.rs | 9 +- control_plane/src/storage_controller.rs | 100 ++++++++++++++++-- safekeeper/src/http/routes.rs | 3 +- safekeeper/src/pull_timeline.rs | 30 ++++-- test_runner/fixtures/neon_fixtures.py | 24 ----- .../fixtures/pageserver/allowed_errors.py | 4 + 6 files changed, 131 insertions(+), 39 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 610fa5f865..191a22f1de 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -1417,7 +1417,14 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res let pageserver_id = args.endpoint_pageserver_id; let remote_ext_base_url = &args.remote_ext_base_url; - let safekeepers_generation = args.safekeepers_generation.map(SafekeeperGeneration::new); + let default_generation = env + .storage_controller + .timelines_onto_safekeepers + .then_some(1); + let 
safekeepers_generation = args + .safekeepers_generation + .or(default_generation) + .map(SafekeeperGeneration::new); // If --safekeepers argument is given, use only the listed // safekeeper nodes; otherwise all from the env. let safekeepers = if let Some(safekeepers) = parse_safekeepers(&args.safekeepers)? { diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index a36815d27e..755d67a7ad 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -10,7 +10,8 @@ use camino::{Utf8Path, Utf8PathBuf}; use hyper0::Uri; use nix::unistd::Pid; use pageserver_api::controller_api::{ - NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest, + NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, + SafekeeperSchedulingPolicyRequest, SkSchedulingPolicy, TenantCreateRequest, TenantCreateResponse, TenantLocateResponse, }; use pageserver_api::models::{ @@ -20,7 +21,7 @@ use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api::ResponseErrorMessageExt; use pem::Pem; use postgres_backend::AuthType; -use reqwest::Method; +use reqwest::{Method, Response}; use serde::de::DeserializeOwned; use serde::{Deserialize, Serialize}; use tokio::process::Command; @@ -570,6 +571,11 @@ impl StorageController { let peer_jwt_token = encode_from_key_file(&peer_claims, private_key) .expect("failed to generate jwt token"); args.push(format!("--peer-jwt-token={peer_jwt_token}")); + + let claims = Claims::new(None, Scope::SafekeeperData); + let jwt_token = + encode_from_key_file(&claims, private_key).expect("failed to generate jwt token"); + args.push(format!("--safekeeper-jwt-token={jwt_token}")); } if let Some(public_key) = &self.public_key { @@ -614,6 +620,10 @@ impl StorageController { self.env.base_data_dir.display() )); + if self.env.safekeepers.iter().any(|sk| sk.auth_enabled) && self.private_key.is_none() { + anyhow::bail!("Safekeeper set up for auth but 
no private key specified"); + } + if self.config.timelines_onto_safekeepers { args.push("--timelines-onto-safekeepers".to_string()); } @@ -640,6 +650,10 @@ impl StorageController { ) .await?; + if self.config.timelines_onto_safekeepers { + self.register_safekeepers().await?; + } + Ok(()) } @@ -743,6 +757,23 @@ impl StorageController { where RQ: Serialize + Sized, RS: DeserializeOwned + Sized, + { + let response = self.dispatch_inner(method, path, body).await?; + Ok(response + .json() + .await + .map_err(pageserver_client::mgmt_api::Error::ReceiveBody)?) + } + + /// Simple HTTP request wrapper for calling into storage controller + async fn dispatch_inner( + &self, + method: reqwest::Method, + path: String, + body: Option, + ) -> anyhow::Result + where + RQ: Serialize + Sized, { // In the special case of the `storage_controller start` subcommand, we wish // to use the API endpoint of the newly started storage controller in order @@ -785,10 +816,31 @@ impl StorageController { let response = builder.send().await?; let response = response.error_from_body().await?; - Ok(response - .json() - .await - .map_err(pageserver_client::mgmt_api::Error::ReceiveBody)?) 
+ Ok(response) + } + + /// Register the safekeepers in the storage controller + #[instrument(skip(self))] + async fn register_safekeepers(&self) -> anyhow::Result<()> { + for sk in self.env.safekeepers.iter() { + let sk_id = sk.id; + let body = serde_json::json!({ + "id": sk_id, + "created_at": "2023-10-25T09:11:25Z", + "updated_at": "2024-08-28T11:32:43Z", + "region_id": "aws-us-east-2", + "host": "127.0.0.1", + "port": sk.pg_port, + "http_port": sk.http_port, + "https_port": sk.https_port, + "version": 5957, + "availability_zone_id": format!("us-east-2b-{sk_id}"), + }); + self.upsert_safekeeper(sk_id, body).await?; + self.safekeeper_scheduling_policy(sk_id, SkSchedulingPolicy::Active) + .await?; + } + Ok(()) } /// Call into the attach_hook API, for use before handing out attachments to pageservers @@ -816,6 +868,42 @@ impl StorageController { Ok(response.generation) } + #[instrument(skip(self))] + pub async fn upsert_safekeeper( + &self, + node_id: NodeId, + request: serde_json::Value, + ) -> anyhow::Result<()> { + let resp = self + .dispatch_inner::( + Method::POST, + format!("control/v1/safekeeper/{node_id}"), + Some(request), + ) + .await?; + if !resp.status().is_success() { + anyhow::bail!( + "setting scheduling policy unsuccessful for safekeeper {node_id}: {}", + resp.status() + ); + } + Ok(()) + } + + #[instrument(skip(self))] + pub async fn safekeeper_scheduling_policy( + &self, + node_id: NodeId, + scheduling_policy: SkSchedulingPolicy, + ) -> anyhow::Result<()> { + self.dispatch::( + Method::POST, + format!("control/v1/safekeeper/{node_id}/scheduling_policy"), + Some(SafekeeperSchedulingPolicyRequest { scheduling_policy }), + ) + .await + } + #[instrument(skip(self))] pub async fn inspect( &self, diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 2b2d721db2..1a25b07496 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -243,8 +243,7 @@ async fn timeline_pull_handler(mut request: Request) -> 
Result, ssl_ca_certs: Vec, global_timelines: Arc, -) -> Result { +) -> Result { let existing_tli = global_timelines.get(TenantTimelineId::new( request.tenant_id, request.timeline_id, @@ -411,7 +412,9 @@ pub async fn handle_request( for ssl_ca_cert in ssl_ca_certs { http_client = http_client.add_root_certificate(ssl_ca_cert); } - let http_client = http_client.build()?; + let http_client = http_client + .build() + .map_err(|e| ApiError::InternalServerError(e.into()))?; let http_hosts = request.http_hosts.clone(); @@ -443,10 +446,10 @@ pub async fn handle_request( // offline and C comes online. Then we want a pull on C with A and B as hosts to work. let min_required_successful = (http_hosts.len() - 1).max(1); if statuses.len() < min_required_successful { - bail!( + return Err(ApiError::InternalServerError(anyhow::anyhow!( "only got {} successful status responses. required: {min_required_successful}", statuses.len() - ) + ))); } // Find the most advanced safekeeper @@ -465,7 +468,7 @@ pub async fn handle_request( assert!(status.tenant_id == request.tenant_id); assert!(status.timeline_id == request.timeline_id); - pull_timeline( + match pull_timeline( status, safekeeper_host, sk_auth_token, @@ -473,6 +476,21 @@ pub async fn handle_request( global_timelines, ) .await + { + Ok(resp) => Ok(resp), + Err(e) => { + match e.downcast_ref::() { + Some(TimelineError::AlreadyExists(_)) => Ok(PullTimelineResponse { + safekeeper_host: None, + }), + Some(TimelineError::CreationInProgress(_)) => { + // We don't return success here because creation might still fail. 
+ Err(ApiError::Conflict("Creation in progress".to_owned())) + } + _ => Err(ApiError::InternalServerError(e)), + } + } + } } async fn pull_timeline( diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 370eca5130..547c640a40 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1409,30 +1409,6 @@ class NeonEnv: for f in futs: f.result() - # Last step: register safekeepers at the storage controller - if ( - self.storage_controller_config is not None - and self.storage_controller_config.get("timelines_onto_safekeepers") is True - ): - for sk_id, sk in enumerate(self.safekeepers): - # 0 is an invalid safekeeper id - sk_id = sk_id + 1 - body = { - "id": sk_id, - "created_at": "2023-10-25T09:11:25Z", - "updated_at": "2024-08-28T11:32:43Z", - "region_id": "aws-us-east-2", - "host": "127.0.0.1", - "port": sk.port.pg, - "http_port": sk.port.http, - "https_port": None, - "version": 5957, - "availability_zone_id": f"us-east-2b-{sk_id}", - } - - self.storage_controller.on_safekeeper_deploy(sk_id, body) - self.storage_controller.safekeeper_scheduling_policy(sk_id, "Active") - self.endpoint_storage.start(timeout_in_seconds=timeout_in_seconds) def stop(self, immediate=False, ps_assert_metric_no_errors=False, fail_on_endpoint_errors=True): diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 24c856e279..43bffd919c 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -122,6 +122,10 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [ ".*Call to node.*management API.*failed.*Timeout.*", ".*Failed to update node .+ after heartbeat round.*error sending request for url.*", ".*background_reconcile: failed to fetch top tenants:.*client error \\(Connect\\).*", + # Many tests will take safekeepers offline + ".*Call to safekeeper.*management API.*failed.*receive body.*", 
+ ".*Call to safekeeper.*management API.*failed.*ReceiveBody.*", + ".*Call to safekeeper.*management API.*failed.*Timeout.*", # Many tests will start up with a node offline ".*startup_reconcile: Could not scan node.*", # Tests run in dev mode From 8477d15f95ffb094c444e658bbcdb95301b1a750 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 8 May 2025 18:11:45 +0200 Subject: [PATCH 12/65] feat(direct IO): remove special case in test suite for compat tests (#11864) PR - https://github.com/neondatabase/neon/pull/11558 adds special treatment for compat snapshot binaries which don't understand the `direct-rw` mode. A new compat snapshot has been published since, so, we can remove the special case. refs: - fixes https://github.com/neondatabase/neon/issues/11598 --- test_runner/fixtures/neon_fixtures.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 547c640a40..aa468d9386 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1299,13 +1299,6 @@ class NeonEnv: for key, value in override.items(): ps_cfg[key] = value - if self.pageserver_virtual_file_io_mode is not None: - # TODO(christian): https://github.com/neondatabase/neon/issues/11598 - if not config.test_may_use_compatibility_snapshot_binaries: - ps_cfg["virtual_file_io_mode"] = self.pageserver_virtual_file_io_mode - else: - log.info("ignoring virtual_file_io_mode parametrization for compatibility test") - if self.pageserver_wal_receiver_protocol is not None: key, value = PageserverWalReceiverProtocol.to_config_key_value( self.pageserver_wal_receiver_protocol From bef5954fd7b8ea43cac6f43a111d437cd7a360ad Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 8 May 2025 17:46:57 +0100 Subject: [PATCH 13/65] feat(proxy): track SNI usage by protocol, including for http (#11863) ## Problem We want to see how many users of the legacy serverless driver are still using the old URL 
for SQL-over-HTTP traffic. ## Summary of changes Adds a protocol field to the connections_by_sni metric. Ensures it's incremented for sql-over-http. --- proxy/src/auth/credentials.rs | 29 ++++++++++++++------------- proxy/src/metrics.rs | 15 +++++++++++--- proxy/src/serverless/mod.rs | 1 + proxy/src/serverless/sql_over_http.rs | 28 +++++++++++++++++++++++++- test_runner/fixtures/neon_fixtures.py | 8 ++++---- 5 files changed, 59 insertions(+), 22 deletions(-) diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 183976374a..526d0df7f2 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -12,9 +12,9 @@ use tracing::{debug, warn}; use crate::auth::password_hack::parse_endpoint_param; use crate::context::RequestContext; use crate::error::{ReportableError, UserFacingError}; -use crate::metrics::{Metrics, SniKind}; +use crate::metrics::{Metrics, SniGroup, SniKind}; use crate::proxy::NeonOptions; -use crate::serverless::SERVERLESS_DRIVER_SNI; +use crate::serverless::{AUTH_BROKER_SNI, SERVERLESS_DRIVER_SNI}; use crate::types::{EndpointId, RoleName}; #[derive(Debug, Error, PartialEq, Eq, Clone)] @@ -65,7 +65,7 @@ pub(crate) fn endpoint_sni(sni: &str, common_names: &HashSet) -> Option< if !common_names.contains(common_name) { return None; } - if subdomain == SERVERLESS_DRIVER_SNI { + if subdomain == SERVERLESS_DRIVER_SNI || subdomain == AUTH_BROKER_SNI { return None; } Some(EndpointId::from(subdomain)) @@ -128,22 +128,23 @@ impl ComputeUserInfoMaybeEndpoint { let metrics = Metrics::get(); debug!(%user, "credentials"); - if sni.is_some() { + + let protocol = ctx.protocol(); + let kind = if sni.is_some() { debug!("Connection with sni"); - metrics.proxy.accepted_connections_by_sni.inc(SniKind::Sni); + SniKind::Sni } else if endpoint.is_some() { - metrics - .proxy - .accepted_connections_by_sni - .inc(SniKind::NoSni); debug!("Connection without sni"); + SniKind::NoSni } else { - metrics - .proxy - 
.accepted_connections_by_sni - .inc(SniKind::PasswordHack); debug!("Connection with password hack"); - } + SniKind::PasswordHack + }; + + metrics + .proxy + .accepted_connections_by_sni + .inc(SniGroup { protocol, kind }); let options = NeonOptions::parse_params(params); diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index e5fc0b724b..4b22c912eb 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -115,8 +115,8 @@ pub struct ProxyMetrics { #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))] pub allowed_vpc_endpoint_ids: Histogram<10>, - /// Number of connections (per sni). - pub accepted_connections_by_sni: CounterVec>, + /// Number of connections, by the method we used to determine the endpoint. + pub accepted_connections_by_sni: CounterVec, /// Number of connection failures (per kind). pub connection_failures_total: CounterVec>, @@ -342,11 +342,20 @@ pub enum LatencyExclusions { ClientCplaneComputeRetry, } +#[derive(LabelGroup)] +#[label(set = SniSet)] +pub struct SniGroup { + pub protocol: Protocol, + pub kind: SniKind, +} + #[derive(FixedCardinalityLabel, Copy, Clone)] -#[label(singleton = "kind")] pub enum SniKind { + /// Domain name based routing. SNI for libpq/websockets. Host for HTTP Sni, + /// Metadata based routing. `options` for libpq/websockets. Header for HTTP NoSni, + /// Metadata based routing, using the password field. 
PasswordHack, } diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 6f24ad3dec..2a7069b1c2 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -56,6 +56,7 @@ use crate::serverless::backend::PoolingBackend; use crate::serverless::http_util::{api_error_into_response, json_response}; pub(crate) const SERVERLESS_DRIVER_SNI: &str = "api"; +pub(crate) const AUTH_BROKER_SNI: &str = "apiauth"; pub async fn task_main( config: &'static ProxyConfig, diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index fee5942b7e..dfaeedaeae 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -38,7 +38,7 @@ use crate::config::{AuthenticationConfig, HttpConfig, ProxyConfig, TlsConfig}; use crate::context::RequestContext; use crate::error::{ErrorKind, ReportableError, UserFacingError}; use crate::http::{ReadBodyError, read_body_with_limit}; -use crate::metrics::{HttpDirection, Metrics}; +use crate::metrics::{HttpDirection, Metrics, SniGroup, SniKind}; use crate::proxy::{NeonOptions, run_until_cancelled}; use crate::serverless::backend::HttpConnError; use crate::types::{DbName, RoleName}; @@ -227,6 +227,32 @@ fn get_conn_info( } } + // check the URL that was used, for metrics + { + let host_endpoint = headers + // get the host header + .get("host") + // extract the domain + .and_then(|h| { + let (host, _port) = h.to_str().ok()?.split_once(':')?; + Some(host) + }) + // get the endpoint prefix + .map(|h| h.split_once('.').map_or(h, |(prefix, _)| prefix)); + + let kind = if host_endpoint == Some(&*endpoint) { + SniKind::Sni + } else { + SniKind::NoSni + }; + + let protocol = ctx.protocol(); + Metrics::get() + .proxy + .accepted_connections_by_sni + .inc(SniGroup { protocol, kind }); + } + ctx.set_user_agent( headers .get(hyper::header::USER_AGENT) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 
aa468d9386..1b4562c0b3 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3835,7 +3835,7 @@ class NeonAuthBroker: external_http_port: int, auth_backend: NeonAuthBroker.ProxyV1, ): - self.domain = "apiauth.local.neon.build" # resolves to 127.0.0.1 + self.domain = "local.neon.build" # resolves to 127.0.0.1 self.host = "127.0.0.1" self.http_port = http_port self.external_http_port = external_http_port @@ -3852,7 +3852,7 @@ class NeonAuthBroker: # generate key of it doesn't exist crt_path = self.test_output_dir / "proxy.crt" key_path = self.test_output_dir / "proxy.key" - generate_proxy_tls_certs("apiauth.local.neon.build", key_path, crt_path) + generate_proxy_tls_certs(f"apiauth.{self.domain}", key_path, crt_path) args = [ str(self.neon_binpath / "proxy"), @@ -3896,10 +3896,10 @@ class NeonAuthBroker: log.info(f"Executing http query: {query}") - connstr = f"postgresql://{user}@{self.domain}/postgres" + connstr = f"postgresql://{user}@ep-foo-bar-1234.{self.domain}/postgres" async with httpx.AsyncClient(verify=str(self.test_output_dir / "proxy.crt")) as client: response = await client.post( - f"https://{self.domain}:{self.external_http_port}/sql", + f"https://apiauth.{self.domain}:{self.external_http_port}/sql", json={"query": query, "params": args}, headers={ "Neon-Connection-String": connstr, From b37bb7d7edaab870d05bff7286e345066d49664e Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 8 May 2025 20:48:24 +0200 Subject: [PATCH 14/65] pageserver: timeline shutdown: fully quiesce ingest path before`freeze_and_flush` (#11851) # Problem Before this PR, timeline shutdown would - cancel the walreceiver cancellation token subtree (child token of Timeline::cancel) - call freeze_and_flush - Timeline::cancel.cancel() - ... bunch of waiting for things ... 
- Timeline::gate.close() As noted by the comment that is deleted by this PR, this left a window where, after freeze_and_flush, walreceiver could still be running and ingest data into a new InMemoryLayer. This presents a potential source of log noise during Timeline shutdown where the InMemoryLayer created after the freeze_and_flush observes that Timeline::cancel is cancelled, failing the ingest with some anyhow::Error wrapping (deeply) a `FlushTaskError::Cancelled` instance (`flush task cancelled` error message). # Solution It turns out that it is quite easy to shut down, not just cancel, walreceiver completely because the only subtask spawned by walreceiver connection manager is the `handle_walreceiver_connection` task, which is properly shut down and waited upon when the manager task observes cancellation and exits its retry loop. The alternative is to replace all the usage of `anyhow` on the ingest path with differentiated error types. A lot of busywork for little gain to fix a potential logging noise nuisance, so, not doing that for now. # Correctness / Risk We do not risk leaking walreceiver child tasks because existing discipline is to hold a gate guard. We will prolong `Timeline::shutdown` to the degree that we're no longer making progress with the rest of shutdown while the walreceiver task hasn't yet observed cancellation. In practice, this should be negligible. `Timeline::shutdown` could fail to complete if there is a hidden dependency of walreceiver shutdown on some subsystem. The code certainly suggests there isn't, and I'm not aware of any such dependency. Anyway, impact will be low because we only shut down Timeline instances that are obsolete, either because there is a newer attachment at a different location, or because the timeline got deleted by the user. We would learn about this through stuck cplane operations or stuck storcon reconciliations. 
We would be able to mitigate by cancelling such stuck operations/reconciliations and/or by rolling back pageserver. # Refs - identified this while investigating https://github.com/neondatabase/neon/issues/11762 - PR that _does_ fix a bunch _real_ `flush task cancelled` noise on the compaction path: https://github.com/neondatabase/neon/pull/11853 --- pageserver/src/tenant/timeline.rs | 12 ++---------- pageserver/src/tenant/timeline/walreceiver.rs | 16 ++++++++++++++-- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c8d897d074..d7f5958128 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2127,22 +2127,14 @@ impl Timeline { debug_assert_current_span_has_tenant_and_timeline_id(); // Regardless of whether we're going to try_freeze_and_flush - // or not, stop ingesting any more data. Walreceiver only provides - // cancellation but no "wait until gone", because it uses the Timeline::gate. - // So, only after the self.gate.close() below will we know for sure that - // no walreceiver tasks are left. - // For `try_freeze_and_flush=true`, this means that we might still be ingesting - // data during the call to `self.freeze_and_flush()` below. - // That's not ideal, but, we don't have the concept of a ChildGuard, - // which is what we'd need to properly model early shutdown of the walreceiver - // task sub-tree before the other Timeline task sub-trees. + // or not, stop ingesting any more data. let walreceiver = self.walreceiver.lock().unwrap().take(); tracing::debug!( is_some = walreceiver.is_some(), "Waiting for WalReceiverManager..." ); if let Some(walreceiver) = walreceiver { - walreceiver.cancel(); + walreceiver.shutdown().await; } // ... and inform any waiters for newer LSNs that there won't be any. 
self.last_record_lsn.shutdown(); diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index 4f80073cc3..0f73eb839b 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -63,6 +63,7 @@ pub struct WalReceiver { /// All task spawned by [`WalReceiver::start`] and its children are sensitive to this token. /// It's a child token of [`Timeline`] so that timeline shutdown can cancel WalReceiver tasks early for `freeze_and_flush=true`. cancel: CancellationToken, + task: tokio::task::JoinHandle<()>, } impl WalReceiver { @@ -79,7 +80,7 @@ impl WalReceiver { let loop_status = Arc::new(std::sync::RwLock::new(None)); let manager_status = Arc::clone(&loop_status); let cancel = timeline.cancel.child_token(); - WALRECEIVER_RUNTIME.spawn({ + let task = WALRECEIVER_RUNTIME.spawn({ let cancel = cancel.clone(); async move { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -120,14 +121,25 @@ impl WalReceiver { Self { manager_status, cancel, + task, } } #[instrument(skip_all, level = tracing::Level::DEBUG)] - pub fn cancel(&self) { + pub async fn shutdown(self) { debug_assert_current_span_has_tenant_and_timeline_id(); debug!("cancelling walreceiver tasks"); self.cancel.cancel(); + match self.task.await { + Ok(()) => debug!("Shutdown success"), + Err(je) if je.is_cancelled() => unreachable!("not used"), + Err(je) if je.is_panic() => { + // already logged by panic hook + } + Err(je) => { + error!("shutdown walreceiver task join error: {je}") + } + } } pub(crate) fn status(&self) -> Option { From 101e115b3885dd966a839ef50b450771988fa9aa Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 9 May 2025 09:54:40 +0300 Subject: [PATCH 15/65] Change prefetch logic in vacuum (#11650) ## Problem See https://neondb.slack.com/archives/C03QLRH7PPD/p1745003314183649 Vacuum doesn't use prefetch because this strange logic in `lazy_scan_heap`: ``` /* And only up to the next 
unskippable block */ if (next_prefetch_block + prefetch_budget > vacrel->next_unskippable_block) prefetch_budget = vacrel->next_unskippable_block - next_prefetch_block; ``` ## Summary of changes Disable prefetch only if vacuum jumps to next skippable block (there is SKIP_PAGES_THRESHOLD) which cancel seqscan and perform jump only if gap is large enough). Postgres PRs: https://github.com/neondatabase/postgres/pull/620 https://github.com/neondatabase/postgres/pull/621 https://github.com/neondatabase/postgres/pull/622 https://github.com/neondatabase/postgres/pull/623 --------- Co-authored-by: Konstantin Knizhnik --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 108856a4ae..06b405bc98 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 108856a4ae76be285b04497a0ed08fcbe60ddbe9 +Subproject commit 06b405bc982fd53522689aa4acbfd9c44b7993cf diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index b838c8969b..72f83df76c 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit b838c8969b7c63f3e637a769656f5f36793b797c +Subproject commit 72f83df76c61ce18d81bd371f0afd2a43d59c052 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index b763ab54b9..0d59c91c1a 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit b763ab54b98d232a0959371ab1d07f06ed77c49e +Subproject commit 0d59c91c1a23e667f1d1169d5f040b3fa0a0ab44 diff --git a/vendor/revisions.json b/vendor/revisions.json index 4307fd1c3f..10aad7e1a2 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,7 +1,7 @@ { "v17": [ "17.4", - "b763ab54b98d232a0959371ab1d07f06ed77c49e" + "0d59c91c1a23e667f1d1169d5f040b3fa0a0ab44" ], "v16": [ "16.8", @@ -9,10 +9,10 @@ ], "v15": [ "15.12", - "b838c8969b7c63f3e637a769656f5f36793b797c" + 
"72f83df76c61ce18d81bd371f0afd2a43d59c052" ], "v14": [ "14.17", - "108856a4ae76be285b04497a0ed08fcbe60ddbe9" + "06b405bc982fd53522689aa4acbfd9c44b7993cf" ] } From 5cd7f936f90978673a1f6a1dc64765e701035aa4 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 9 May 2025 08:48:30 +0100 Subject: [PATCH 16/65] fix(neon-rls): optimistically assume role grants are already assigned for replicas (#11811) ## Problem Read replicas cannot grant permissions for roles for Neon RLS. Usually the permission is already granted, so we can optimistically check. See INC-509 ## Summary of changes Perform a permission lookup prior to actually executing any grants. --- Cargo.lock | 1 + compute_tools/Cargo.toml | 1 + compute_tools/src/compute.rs | 52 +++++++++++++++++-------- test_runner/fixtures/neon_fixtures.py | 10 ++++- test_runner/regress/test_role_grants.py | 7 ++++ 5 files changed, 52 insertions(+), 19 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index fe4cc35029..7083baa092 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1303,6 +1303,7 @@ dependencies = [ "futures", "http 1.1.0", "indexmap 2.0.1", + "itertools 0.10.5", "jsonwebtoken", "metrics", "nix 0.27.1", diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 8ee5dd0665..f9da3ba700 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -28,6 +28,7 @@ flate2.workspace = true futures.workspace = true http.workspace = true indexmap.workspace = true +itertools.workspace = true jsonwebtoken.workspace = true metrics.workspace = true nix.workspace = true diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 25920675c1..f494e2444a 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -11,6 +11,7 @@ use compute_api::spec::{ use futures::StreamExt; use futures::future::join_all; use futures::stream::FuturesUnordered; +use itertools::Itertools; use nix::sys::signal::{Signal, kill}; use nix::unistd::Pid; use once_cell::sync::Lazy; @@ -18,7 +19,7 @@ use 
postgres; use postgres::NoTls; use postgres::error::SqlState; use remote_storage::{DownloadError, RemotePath}; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::net::SocketAddr; use std::os::unix::fs::{PermissionsExt, symlink}; use std::path::Path; @@ -1995,23 +1996,40 @@ LIMIT 100", tokio::spawn(conn); // TODO: support other types of grants apart from schemas? - let query = format!( - "GRANT {} ON SCHEMA {} TO {}", - privileges - .iter() - // should not be quoted as it's part of the command. - // is already sanitized so it's ok - .map(|p| p.as_str()) - .collect::>() - .join(", "), - // quote the schema and role name as identifiers to sanitize them. - schema_name.pg_quote(), - role_name.pg_quote(), - ); - db_client - .simple_query(&query) + + // check the role grants first - to gracefully handle read-replicas. + let select = "SELECT privilege_type + FROM pg_namespace + JOIN LATERAL (SELECT * FROM aclexplode(nspacl) AS x) acl ON true + JOIN pg_user users ON acl.grantee = users.usesysid + WHERE users.usename = $1 + AND nspname = $2"; + let rows = db_client + .query(select, &[role_name, schema_name]) .await - .with_context(|| format!("Failed to execute query: {}", query))?; + .with_context(|| format!("Failed to execute query: {select}"))?; + + let already_granted: HashSet = rows.into_iter().map(|row| row.get(0)).collect(); + + let grants = privileges + .iter() + .filter(|p| !already_granted.contains(p.as_str())) + // should not be quoted as it's part of the command. + // is already sanitized so it's ok + .map(|p| p.as_str()) + .join(", "); + + if !grants.is_empty() { + // quote the schema and role name as identifiers to sanitize them. 
+ let schema_name = schema_name.pg_quote(); + let role_name = role_name.pg_quote(); + + let query = format!("GRANT {grants} ON SCHEMA {schema_name} TO {role_name}",); + db_client + .simple_query(&query) + .await + .with_context(|| format!("Failed to execute query: {}", query))?; + } Ok(()) } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 1b4562c0b3..131820f23e 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4613,7 +4613,10 @@ class EndpointFactory: return self def new_replica( - self, origin: Endpoint, endpoint_id: str, config_lines: list[str] | None = None + self, + origin: Endpoint, + endpoint_id: str | None = None, + config_lines: list[str] | None = None, ): branch_name = origin.branch_name assert origin in self.endpoints @@ -4629,7 +4632,10 @@ class EndpointFactory: ) def new_replica_start( - self, origin: Endpoint, endpoint_id: str, config_lines: list[str] | None = None + self, + origin: Endpoint, + endpoint_id: str | None = None, + config_lines: list[str] | None = None, ): branch_name = origin.branch_name assert origin in self.endpoints diff --git a/test_runner/regress/test_role_grants.py b/test_runner/regress/test_role_grants.py index b2251875f0..5b13d461f0 100644 --- a/test_runner/regress/test_role_grants.py +++ b/test_runner/regress/test_role_grants.py @@ -39,3 +39,10 @@ def test_role_grants(neon_simple_env: NeonEnv): res = cur.fetchall() assert res == [(1,)], "select should not succeed" + + # confirm that replicas can also ensure the grants are correctly set. 
+ replica = env.endpoints.new_replica_start(endpoint) + replica_client = replica.http_client() + replica_client.set_role_grants( + "test_role_grants", "test_role", "test_schema", ["CREATE", "USAGE"] + ) From 03d635b916ed057826d80bbc709864acb1c108f1 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 9 May 2025 12:07:08 +0300 Subject: [PATCH 17/65] Add more guards for prefetch_pump_state (#11859) ## Problem See https://neondb.slack.com/archives/C08PJ07BZ44/p1746566292750689 Looks like there are more cases when `prefetch_pump_state` can be called in unexpected place and cause core dump. ## Summary of changes Add more guards. --------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/communicator.c | 36 +++++++++++++++++++++--------------- pgxn/neon/communicator.h | 2 +- pgxn/neon/pagestore_smgr.c | 20 ++++++++++---------- vendor/postgres-v16 | 2 +- vendor/revisions.json | 2 +- 5 files changed, 34 insertions(+), 28 deletions(-) diff --git a/pgxn/neon/communicator.c b/pgxn/neon/communicator.c index 818a149499..9609f186b9 100644 --- a/pgxn/neon/communicator.c +++ b/pgxn/neon/communicator.c @@ -425,15 +425,12 @@ compact_prefetch_buffers(void) * point inside and outside PostgreSQL. * * This still does throw errors when it receives malformed responses from PS. - * - * When we're not called from CHECK_FOR_INTERRUPTS (indicated by - * IsHandlingInterrupts) we also report we've ended prefetch receive work, - * just in case state tracking was lost due to an error in the sync getPage - * response code. 
*/ void -communicator_prefetch_pump_state(bool IsHandlingInterrupts) +communicator_prefetch_pump_state(void) { + START_PREFETCH_RECEIVE_WORK(); + while (MyPState->ring_receive != MyPState->ring_flush) { NeonResponse *response; @@ -482,9 +479,7 @@ communicator_prefetch_pump_state(bool IsHandlingInterrupts) } } - /* We never pump the prefetch state while handling other pages */ - if (!IsHandlingInterrupts) - END_PREFETCH_RECEIVE_WORK(); + END_PREFETCH_RECEIVE_WORK(); communicator_reconfigure_timeout_if_needed(); } @@ -672,9 +667,10 @@ prefetch_wait_for(uint64 ring_index) Assert(MyPState->ring_unused > ring_index); + START_PREFETCH_RECEIVE_WORK(); + while (MyPState->ring_receive <= ring_index) { - START_PREFETCH_RECEIVE_WORK(); entry = GetPrfSlot(MyPState->ring_receive); Assert(entry->status == PRFS_REQUESTED); @@ -683,17 +679,18 @@ prefetch_wait_for(uint64 ring_index) result = false; break; } - - END_PREFETCH_RECEIVE_WORK(); CHECK_FOR_INTERRUPTS(); } + if (result) { /* Check that slot is actually received (srver can be disconnected in prefetch_pump_state called from CHECK_FOR_INTERRUPTS */ PrefetchRequest *slot = GetPrfSlot(ring_index); - return slot->status == PRFS_RECEIVED; + result = slot->status == PRFS_RECEIVED; } - return false; + END_PREFETCH_RECEIVE_WORK(); + + return result; ; } @@ -720,6 +717,7 @@ prefetch_read(PrefetchRequest *slot) Assert(slot->status == PRFS_REQUESTED); Assert(slot->response == NULL); Assert(slot->my_ring_index == MyPState->ring_receive); + Assert(readpage_reentrant_guard); if (slot->status != PRFS_REQUESTED || slot->response != NULL || @@ -802,6 +800,7 @@ communicator_prefetch_receive(BufferTag tag) PrfHashEntry *entry; PrefetchRequest hashkey; + Assert(readpage_reentrant_guard); hashkey.buftag = tag; entry = prfh_lookup(MyPState->prf_hash, &hashkey); if (entry != NULL && prefetch_wait_for(entry->slot->my_ring_index)) @@ -821,8 +820,12 @@ communicator_prefetch_receive(BufferTag tag) void prefetch_on_ps_disconnect(void) { + bool 
save_readpage_reentrant_guard = readpage_reentrant_guard; MyPState->ring_flush = MyPState->ring_unused; + /* Prohibit callig of prefetch_pump_state */ + START_PREFETCH_RECEIVE_WORK(); + while (MyPState->ring_receive < MyPState->ring_unused) { PrefetchRequest *slot; @@ -851,6 +854,9 @@ prefetch_on_ps_disconnect(void) MyNeonCounters->getpage_prefetch_discards_total += 1; } + /* Restore guard */ + readpage_reentrant_guard = save_readpage_reentrant_guard; + /* * We can have gone into retry due to network error, so update stats with * the latest available @@ -2509,7 +2515,7 @@ communicator_processinterrupts(void) if (timeout_signaled) { if (!readpage_reentrant_guard && readahead_getpage_pull_timeout_ms > 0) - communicator_prefetch_pump_state(true); + communicator_prefetch_pump_state(); timeout_signaled = false; communicator_reconfigure_timeout_if_needed(); diff --git a/pgxn/neon/communicator.h b/pgxn/neon/communicator.h index f55c4b10f1..5376c9b839 100644 --- a/pgxn/neon/communicator.h +++ b/pgxn/neon/communicator.h @@ -44,7 +44,7 @@ extern int communicator_read_slru_segment(SlruKind kind, int64 segno, void *buffer); extern void communicator_reconfigure_timeout_if_needed(void); -extern void communicator_prefetch_pump_state(bool IsHandlingInterrupts); +extern void communicator_prefetch_pump_state(void); #endif diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 87eb420717..f574517b2a 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -1179,7 +1179,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, blocknum += iterblocks; } - communicator_prefetch_pump_state(false); + communicator_prefetch_pump_state(); return false; } @@ -1218,7 +1218,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) communicator_prefetch_register_bufferv(tag, NULL, 1, NULL); - communicator_prefetch_pump_state(false); + communicator_prefetch_pump_state(); return false; } @@ -1262,7 +1262,7 @@ 
neon_writeback(SMgrRelation reln, ForkNumber forknum, */ neon_log(SmgrTrace, "writeback noop"); - communicator_prefetch_pump_state(false); + communicator_prefetch_pump_state(); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -1315,7 +1315,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer } /* Try to read PS results if they are available */ - communicator_prefetch_pump_state(false); + communicator_prefetch_pump_state(); neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1); @@ -1339,7 +1339,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer /* * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. */ - communicator_prefetch_pump_state(false); + communicator_prefetch_pump_state(); #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) @@ -1449,7 +1449,7 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, nblocks, PG_IOV_MAX); /* Try to read PS results if they are available */ - communicator_prefetch_pump_state(false); + communicator_prefetch_pump_state(); neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, nblocks); @@ -1480,7 +1480,7 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, /* * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. 
*/ - communicator_prefetch_pump_state(false); + communicator_prefetch_pump_state(); #ifdef DEBUG_COMPARE_LOCAL if (forknum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) @@ -1665,7 +1665,7 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer); - communicator_prefetch_pump_state(false); + communicator_prefetch_pump_state(); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -1727,7 +1727,7 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks); - communicator_prefetch_pump_state(false); + communicator_prefetch_pump_state(); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -1902,7 +1902,7 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum) neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop"); - communicator_prefetch_pump_state(false); + communicator_prefetch_pump_state(); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 05ddf212e2..d72d76f2cd 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 05ddf212e2e07b788b5c8b88bdcf98630941f6ae +Subproject commit d72d76f2cdee4194dd052ce099e9784aca7c794a diff --git a/vendor/revisions.json b/vendor/revisions.json index 10aad7e1a2..e76510f969 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -5,7 +5,7 @@ ], "v16": [ "16.8", - "05ddf212e2e07b788b5c8b88bdcf98630941f6ae" + "d72d76f2cdee4194dd052ce099e9784aca7c794a" ], "v15": [ "15.12", From d0dc65da124d3f84e2f64ac5e3927b0a299c9eab Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Fri, 9 May 2025 18:12:49 +0800 Subject: [PATCH 18/65] fix(pageserver): give up gc-compaction if one key has too long history (#11869) ## Problem The limitation we imposed last week https://github.com/neondatabase/neon/pull/11709 is not enough to protect excessive memory usage. 
## Summary of changes If a single key has accumulated too much history, give up compaction. In the future, we can make the `generate_key_retention` function take a stream of keys instead of first accumulating them in memory, thus easily supporting such long key history cases. Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline/compaction.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 07cd274a41..6b155268d6 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -3606,6 +3606,13 @@ impl Timeline { last_key = Some(key); } accumulated_values.push((key, lsn, val)); + + if accumulated_values.len() >= 65536 { + // Assume all of them are images, that would be 512MB of data in memory for a single key. + return Err(CompactionError::Other(anyhow!( + "too many values for a single key, giving up gc-compaction" + ))); + } } else { let last_key: &mut Key = last_key.as_mut().unwrap(); stat.on_unique_key_visited(); // TODO: adjust statistics for partial compaction From d0aaec2abbf502a962351b5939f1fae974053cd5 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 9 May 2025 11:55:26 +0100 Subject: [PATCH 19/65] storage_controller: create imported timelines on safekeepers (#11801) ## Problem SK timeline creations were skipped for imported timelines since we didn't know the correct start LSN of the timeline at that point. ## Summary of changes Create imported timelines on the SK as part of the import finalize step. We use the last record LSN of shard 0 as the start LSN for the safekeeper timeline. 
Closes https://github.com/neondatabase/neon/issues/11569 --- storage_controller/src/service.rs | 51 ++++++++++++++----- .../src/service/safekeeper_service.rs | 36 +++++++++++++ test_runner/regress/test_import_pgdata.py | 33 ++++++------ 3 files changed, 90 insertions(+), 30 deletions(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index fdb791c2cf..193050460d 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -3886,10 +3886,10 @@ impl Service { None } else if safekeepers { - // Note that we do not support creating the timeline on the safekeepers - // for imported timelines. The `start_lsn` of the timeline is not known - // until the import finshes. - // https://github.com/neondatabase/neon/issues/11569 + // Note that for imported timelines, we do not create the timeline on the safekeepers + // straight away. Instead, we do it once the import finalized such that we know what + // start LSN to provide for the safekeepers. This is done in + // [`Self::finalize_timeline_import`]. let res = self .tenant_timeline_create_safekeepers(tenant_id, &timeline_info) .instrument(tracing::info_span!("timeline_create_safekeepers", %tenant_id, timeline_id=%timeline_info.timeline_id)) @@ -3966,11 +3966,22 @@ impl Service { let active = self.timeline_active_on_all_shards(&import).await?; match active { - true => { + Some(timeline_info) => { tracing::info!("Timeline became active on all shards"); + + if self.config.timelines_onto_safekeepers { + // Now that we know the start LSN of this timeline, create it on the + // safekeepers. + self.tenant_timeline_create_safekeepers_until_success( + import.tenant_id, + timeline_info, + ) + .await?; + } + break; } - false => { + None => { tracing::info!("Timeline not active on all shards yet"); tokio::select! 
{ @@ -4004,9 +4015,6 @@ impl Service { .range_mut(TenantShardId::tenant_range(import.tenant_id)) .for_each(|(_id, shard)| shard.importing = TimelineImportState::Idle); - // TODO(vlad): Timeline creations in import mode do not return a correct initdb lsn, - // so we can't create the timeline on the safekeepers. Fix by moving creation here. - // https://github.com/neondatabase/neon/issues/11569 tracing::info!(%import_failed, "Timeline import complete"); Ok(()) @@ -4021,10 +4029,16 @@ impl Service { .await; } + /// If the timeline is active on all shards, returns the [`TimelineInfo`] + /// collected from shard 0. + /// + /// An error is returned if the shard layout has changed during the import. + /// This is guarded against within the storage controller and the pageserver, + /// and, therefore, unexpected. async fn timeline_active_on_all_shards( self: &Arc, import: &TimelineImport, - ) -> anyhow::Result { + ) -> anyhow::Result> { let targets = { let locked = self.inner.read().unwrap(); let mut targets = Vec::new(); @@ -4048,13 +4062,17 @@ impl Service { .expect("Pageservers may not be deleted while referenced"); targets.push((*tenant_shard_id, node.clone())); } else { - return Ok(false); + return Ok(None); } } targets }; + if targets.is_empty() { + anyhow::bail!("No shards found to finalize import for"); + } + let results = self .tenant_for_shards_api( targets, @@ -4070,10 +4088,17 @@ impl Service { ) .await; - Ok(results.into_iter().all(|res| match res { + let all_active = results.iter().all(|res| match res { Ok(info) => info.state == TimelineState::Active, Err(_) => false, - })) + }); + + if all_active { + // Both unwraps are validated above + Ok(Some(results.into_iter().next().unwrap().unwrap())) + } else { + Ok(None) + } } pub(crate) async fn tenant_timeline_archival_config( diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs index 5eecf0d415..5c15660ba3 100644 --- 
a/storage_controller/src/service/safekeeper_service.rs +++ b/storage_controller/src/service/safekeeper_service.rs @@ -323,6 +323,42 @@ impl Service { }) } + pub(crate) async fn tenant_timeline_create_safekeepers_until_success( + self: &Arc, + tenant_id: TenantId, + timeline_info: TimelineInfo, + ) -> anyhow::Result<()> { + const BACKOFF: Duration = Duration::from_secs(5); + + loop { + if self.cancel.is_cancelled() { + anyhow::bail!("Shut down requested while finalizing import"); + } + + let res = self + .tenant_timeline_create_safekeepers(tenant_id, &timeline_info) + .await; + + match res { + Ok(_) => { + tracing::info!("Timeline created on safekeepers"); + break; + } + Err(err) => { + tracing::error!("Failed to create timeline on safekeepers: {err}"); + tokio::select! { + _ = self.cancel.cancelled() => { + anyhow::bail!("Shut down requested while finalizing import"); + }, + _ = tokio::time::sleep(BACKOFF) => {} + }; + } + } + } + + Ok(()) + } + /// Directly insert the timeline into the database without reconciling it with safekeepers. 
/// /// Useful if the timeline already exists on the specified safekeepers, diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py index 2fda1991f7..05e63ad955 100644 --- a/test_runner/regress/test_import_pgdata.py +++ b/test_runner/regress/test_import_pgdata.py @@ -24,6 +24,7 @@ from fixtures.utils import ( skip_in_debug_build, wait_until, ) +from fixtures.workload import Workload from mypy_boto3_kms import KMSClient from mypy_boto3_kms.type_defs import EncryptResponseTypeDef from mypy_boto3_s3 import S3Client @@ -97,6 +98,10 @@ def test_pgdata_import_smoke( f"http://{cplane_mgmt_api_server.host}:{cplane_mgmt_api_server.port}/storage/api/v1/" ) + if neon_env_builder.storage_controller_config is None: + neon_env_builder.storage_controller_config = {} + neon_env_builder.storage_controller_config["timelines_onto_safekeepers"] = True + env = neon_env_builder.init_start() # The test needs LocalFs support, which is only built in testing mode. @@ -286,34 +291,28 @@ def test_pgdata_import_smoke( # # validate that we can write # - rw_endpoint = env.endpoints.create_start( - branch_name=import_branch_name, - endpoint_id="rw", - tenant_id=tenant_id, - config_lines=ep_config, - ) - rw_endpoint.safe_psql("create table othertable(values text)") - rw_lsn = Lsn(rw_endpoint.safe_psql_scalar("select pg_current_wal_flush_lsn()")) + workload = Workload(env, tenant_id, timeline_id, branch_name=import_branch_name) + workload.init() + workload.write_rows(64) + workload.validate() - # TODO: consider using `class Workload` here - # to do compaction and whatnot? + rw_lsn = Lsn(workload.endpoint().safe_psql_scalar("select pg_current_wal_flush_lsn()")) # # validate that we can branch (important use case) # # ... 
at the tip - _ = env.create_branch( + child_timeline_id = env.create_branch( new_branch_name="br-tip", ancestor_branch_name=import_branch_name, tenant_id=tenant_id, ancestor_start_lsn=rw_lsn, ) - br_tip_endpoint = env.endpoints.create_start( - branch_name="br-tip", endpoint_id="br-tip-ro", tenant_id=tenant_id, config_lines=ep_config - ) - validate_vanilla_equivalence(br_tip_endpoint) - br_tip_endpoint.safe_psql("select * from othertable") + child_workload = workload.branch(timeline_id=child_timeline_id, branch_name="br-tip") + child_workload.validate() + + validate_vanilla_equivalence(child_workload.endpoint()) # ... at the initdb lsn _ = env.create_branch( @@ -330,7 +329,7 @@ def test_pgdata_import_smoke( ) validate_vanilla_equivalence(br_initdb_endpoint) with pytest.raises(psycopg2.errors.UndefinedTable): - br_initdb_endpoint.safe_psql("select * from othertable") + br_initdb_endpoint.safe_psql(f"select * from {workload.table}") @run_only_on_default_postgres(reason="PG version is irrelevant here") From 93b964f829f05b4c7e9bf6408f504bf6b70e033b Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Fri, 9 May 2025 20:07:52 +0800 Subject: [PATCH 20/65] fix(pageserver): do not do image compaction if it's below gc cutoff (#11872) ## Problem We observe image compaction errors after gc-compaction finishes compacting below the gc_cutoff. This is because `repartition` returns an LSN below the gc horizon as we (likely) determined that `distance <= self.repartition_threshold`. I think it's better to keep the current behavior of when to trigger compaction but we should skip image compaction if the returned LSN is below the gc horizon. ## Summary of changes If the repartition returns an invalid LSN, skip image compaction. 
Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline/compaction.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 6b155268d6..e7d39db70d 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1277,6 +1277,8 @@ impl Timeline { return Ok(CompactionOutcome::YieldForL0); } + let gc_cutoff = *self.applied_gc_cutoff_lsn.read(); + // 2. Repartition and create image layers if necessary match self .repartition( @@ -1287,7 +1289,7 @@ impl Timeline { ) .await { - Ok(((dense_partitioning, sparse_partitioning), lsn)) => { + Ok(((dense_partitioning, sparse_partitioning), lsn)) if lsn >= gc_cutoff => { // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them let image_ctx = RequestContextBuilder::from(ctx) .access_stats_behavior(AccessStatsBehavior::Skip) @@ -1341,6 +1343,10 @@ impl Timeline { } } + Ok(_) => { + info!("skipping repartitioning due to image compaction LSN being below GC cutoff"); + } + // Suppress errors when cancelled. Err(_) if self.cancel.is_cancelled() => {} Err(err) if err.is_cancel() => {} From 33abfc2b741de285846a8cfaef5c2e158d039342 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 9 May 2025 15:34:22 +0200 Subject: [PATCH 21/65] storcon: remove finished safekeeper reconciliations from in-memory hashmap (#11876) ## Problem Currently there is a memory leak, in that finished safekeeper reconciliations leave a cancellation token behind which is never cleaned up. ## Summary of changes The change adds cleanup after finishing of a reconciliation. In order to ensure we remove the correct cancellation token, and we haven't raced with another reconciliation, we introduce a `TokenId` counter to tell tokens apart. 
Part of https://github.com/neondatabase/neon/issues/11670 --- .../src/service/safekeeper_reconciler.rs | 133 ++++++++++++------ 1 file changed, 88 insertions(+), 45 deletions(-) diff --git a/storage_controller/src/service/safekeeper_reconciler.rs b/storage_controller/src/service/safekeeper_reconciler.rs index 71c73a0112..17bb132982 100644 --- a/storage_controller/src/service/safekeeper_reconciler.rs +++ b/storage_controller/src/service/safekeeper_reconciler.rs @@ -1,4 +1,9 @@ -use std::{collections::HashMap, str::FromStr, sync::Arc, time::Duration}; +use std::{ + collections::HashMap, + str::FromStr, + sync::{Arc, atomic::AtomicU64}, + time::Duration, +}; use clashmap::{ClashMap, Entry}; use safekeeper_api::models::PullTimelineRequest; @@ -169,10 +174,17 @@ pub(crate) struct ScheduleRequest { pub(crate) kind: SafekeeperTimelineOpKind, } +/// A way to keep ongoing/queued reconcile requests apart +#[derive(Copy, Clone, PartialEq, Eq)] +struct TokenId(u64); + +type OngoingTokens = ClashMap<(TenantId, Option), (CancellationToken, TokenId)>; + /// Handle to per safekeeper reconciler. 
struct ReconcilerHandle { - tx: UnboundedSender<(ScheduleRequest, CancellationToken)>, - ongoing_tokens: Arc), CancellationToken>>, + tx: UnboundedSender<(ScheduleRequest, CancellationToken, TokenId)>, + ongoing_tokens: Arc, + token_id_counter: AtomicU64, cancel: CancellationToken, } @@ -185,24 +197,28 @@ impl ReconcilerHandle { &self, tenant_id: TenantId, timeline_id: Option, - ) -> CancellationToken { + ) -> (CancellationToken, TokenId) { + let token_id = self + .token_id_counter + .fetch_add(1, std::sync::atomic::Ordering::Relaxed); + let token_id = TokenId(token_id); let entry = self.ongoing_tokens.entry((tenant_id, timeline_id)); if let Entry::Occupied(entry) = &entry { - let cancel: &CancellationToken = entry.get(); + let (cancel, _) = entry.get(); cancel.cancel(); } - entry.insert(self.cancel.child_token()).clone() + entry.insert((self.cancel.child_token(), token_id)).clone() } /// Cancel an ongoing reconciliation fn cancel_reconciliation(&self, tenant_id: TenantId, timeline_id: Option) { - if let Some((_, cancel)) = self.ongoing_tokens.remove(&(tenant_id, timeline_id)) { + if let Some((_, (cancel, _id))) = self.ongoing_tokens.remove(&(tenant_id, timeline_id)) { cancel.cancel(); } } fn schedule_reconcile(&self, req: ScheduleRequest) { - let cancel = self.new_token_slot(req.tenant_id, req.timeline_id); + let (cancel, token_id) = self.new_token_slot(req.tenant_id, req.timeline_id); let hostname = req.safekeeper.skp.host.clone(); - if let Err(err) = self.tx.send((req, cancel)) { + if let Err(err) = self.tx.send((req, cancel, token_id)) { tracing::info!("scheduling request onto {hostname} returned error: {err}"); } } @@ -211,13 +227,14 @@ impl ReconcilerHandle { pub(crate) struct SafekeeperReconciler { inner: SafekeeperReconcilerInner, concurrency_limiter: Arc, - rx: UnboundedReceiver<(ScheduleRequest, CancellationToken)>, + rx: UnboundedReceiver<(ScheduleRequest, CancellationToken, TokenId)>, cancel: CancellationToken, } /// Thin wrapper over `Service` to not 
clutter its inherent functions #[derive(Clone)] struct SafekeeperReconcilerInner { + ongoing_tokens: Arc, service: Arc, } @@ -226,15 +243,20 @@ impl SafekeeperReconciler { // We hold the ServiceInner lock so we don't want to make sending to the reconciler channel to be blocking. let (tx, rx) = mpsc::unbounded_channel(); let concurrency = service.config.safekeeper_reconciler_concurrency; + let ongoing_tokens = Arc::new(ClashMap::new()); let mut reconciler = SafekeeperReconciler { - inner: SafekeeperReconcilerInner { service }, + inner: SafekeeperReconcilerInner { + service, + ongoing_tokens: ongoing_tokens.clone(), + }, rx, concurrency_limiter: Arc::new(Semaphore::new(concurrency)), cancel: cancel.clone(), }; let handle = ReconcilerHandle { tx, - ongoing_tokens: Arc::new(ClashMap::new()), + ongoing_tokens, + token_id_counter: AtomicU64::new(0), cancel, }; tokio::spawn(async move { reconciler.run().await }); @@ -246,7 +268,9 @@ impl SafekeeperReconciler { req = self.rx.recv() => req, _ = self.cancel.cancelled() => break, }; - let Some((req, req_cancel)) = req else { break }; + let Some((req, req_cancel, req_token_id)) = req else { + break; + }; let permit_res = tokio::select! 
{ req = self.concurrency_limiter.clone().acquire_owned() => req, @@ -265,7 +289,7 @@ impl SafekeeperReconciler { let timeline_id = req.timeline_id; let node_id = req.safekeeper.skp.id; inner - .reconcile_one(req, req_cancel) + .reconcile_one(req, req_cancel, req_token_id) .instrument(tracing::info_span!( "reconcile_one", ?kind, @@ -280,8 +304,14 @@ impl SafekeeperReconciler { } impl SafekeeperReconcilerInner { - async fn reconcile_one(&self, req: ScheduleRequest, req_cancel: CancellationToken) { + async fn reconcile_one( + &self, + req: ScheduleRequest, + req_cancel: CancellationToken, + req_token_id: TokenId, + ) { let req_host = req.safekeeper.skp.host.clone(); + let success; match req.kind { SafekeeperTimelineOpKind::Pull => { let Some(timeline_id) = req.timeline_id else { @@ -302,19 +332,22 @@ impl SafekeeperReconcilerInner { tenant_id: req.tenant_id, timeline_id, }; - self.reconcile_inner( - req, - async |client| client.pull_timeline(&pull_req).await, - |resp| { - if let Some(host) = resp.safekeeper_host { - tracing::info!("pulled timeline from {host} onto {req_host}"); - } else { - tracing::info!("timeline already present on safekeeper on {req_host}"); - } - }, - req_cancel, - ) - .await; + success = self + .reconcile_inner( + &req, + async |client| client.pull_timeline(&pull_req).await, + |resp| { + if let Some(host) = resp.safekeeper_host { + tracing::info!("pulled timeline from {host} onto {req_host}"); + } else { + tracing::info!( + "timeline already present on safekeeper on {req_host}" + ); + } + }, + req_cancel, + ) + .await; } SafekeeperTimelineOpKind::Exclude => { // TODO actually exclude instead of delete here @@ -325,22 +358,23 @@ impl SafekeeperReconcilerInner { ); return; }; - self.reconcile_inner( - req, - async |client| client.delete_timeline(tenant_id, timeline_id).await, - |_resp| { - tracing::info!("deleted timeline from {req_host}"); - }, - req_cancel, - ) - .await; + success = self + .reconcile_inner( + &req, + async |client| 
client.delete_timeline(tenant_id, timeline_id).await, + |_resp| { + tracing::info!("deleted timeline from {req_host}"); + }, + req_cancel, + ) + .await; } SafekeeperTimelineOpKind::Delete => { let tenant_id = req.tenant_id; if let Some(timeline_id) = req.timeline_id { - let deleted = self + success = self .reconcile_inner( - req, + &req, async |client| client.delete_timeline(tenant_id, timeline_id).await, |_resp| { tracing::info!("deleted timeline from {req_host}"); @@ -348,13 +382,13 @@ impl SafekeeperReconcilerInner { req_cancel, ) .await; - if deleted { + if success { self.delete_timeline_from_db(tenant_id, timeline_id).await; } } else { - let deleted = self + success = self .reconcile_inner( - req, + &req, async |client| client.delete_tenant(tenant_id).await, |_resp| { tracing::info!(%tenant_id, "deleted tenant from {req_host}"); @@ -362,12 +396,21 @@ impl SafekeeperReconcilerInner { req_cancel, ) .await; - if deleted { + if success { self.delete_tenant_timelines_from_db(tenant_id).await; } } } } + if success { + self.ongoing_tokens.remove_if( + &(req.tenant_id, req.timeline_id), + |_ttid, (_cancel, token_id)| { + // Ensure that this request is indeed the request we just finished and not a new one + req_token_id == *token_id + }, + ); + } } async fn delete_timeline_from_db(&self, tenant_id: TenantId, timeline_id: TimelineId) { match self @@ -421,10 +464,10 @@ impl SafekeeperReconcilerInner { self.delete_timeline_from_db(tenant_id, timeline_id).await; } } - /// Returns whether the reconciliation happened successfully + /// Returns whether the reconciliation happened successfully (or we got cancelled) async fn reconcile_inner( &self, - req: ScheduleRequest, + req: &ScheduleRequest, closure: impl Fn(SafekeeperClient) -> F, log_success: impl FnOnce(T) -> U, req_cancel: CancellationToken, From 3b7cc4234c8675b777a3f85798734c0b41748d11 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Fri, 9 May 2025 19:02:24 +0200 Subject: [PATCH 22/65] Fix PS connect 
attempt timeouts when facing interrupts (#11880) With the 50ms timeouts of pumping state in connector.c, we need to correctly handle these timeouts that also wake up pg_usleep. This new approach makes the connection attempts re-start the wait whenever it gets woken up early; and CHECK_FOR_INTERRUPTS() is called to make sure we don't miss query cancellations. ## Problem https://neondb.slack.com/archives/C04DGM6SMTM/p1746794528680269 ## Summary of changes Make sure we start sleeping again if pg_usleep got woken up ahead of time. --- pgxn/neon/libpagestore.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index ee4e6ccc5b..3b6c4247c3 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -433,7 +433,6 @@ pageserver_connect(shardno_t shard_no, int elevel) now = GetCurrentTimestamp(); us_since_last_attempt = (int64) (now - shard->last_reconnect_time); - shard->last_reconnect_time = now; /* * Make sure we don't do exponential backoff with a constant multiplier @@ -447,14 +446,23 @@ pageserver_connect(shardno_t shard_no, int elevel) /* * If we did other tasks between reconnect attempts, then we won't * need to wait as long as a full delay. + * + * This is a loop to protect against interrupted sleeps. 
*/ - if (us_since_last_attempt < shard->delay_us) + while (us_since_last_attempt < shard->delay_us) { pg_usleep(shard->delay_us - us_since_last_attempt); + + /* At least we should handle cancellations here */ + CHECK_FOR_INTERRUPTS(); + + now = GetCurrentTimestamp(); + us_since_last_attempt = (int64) (now - shard->last_reconnect_time); } /* update the delay metric */ shard->delay_us = Min(shard->delay_us * 2, MAX_RECONNECT_INTERVAL_USEC); + shard->last_reconnect_time = now; /* * Connect using the connection string we got from the From f5070f6aa4dad26b669811bf72923665f0340147 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 9 May 2025 20:13:35 +0200 Subject: [PATCH 23/65] fixup(direct IO): PR #11864 broke test suite parametrization (#11887) PR - github.com/neondatabase/neon/pull/11864 committed yesterday rendered the `PAGESERVER_VIRTUAL_FILE_IO_MODE` env-var-based parametrization ineffective. As a consequence, the tests and benchmarks in `test_runner/` were using the binary built-in-default, i.e., `buffered`. 
--- test_runner/fixtures/neon_fixtures.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 131820f23e..8f56ee4392 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1274,6 +1274,8 @@ class NeonEnv: if self.pageserver_virtual_file_io_engine is not None: ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine + if self.pageserver_virtual_file_io_mode is not None: + ps_cfg["virtual_file_io_mode"] = self.pageserver_virtual_file_io_mode if config.pageserver_default_tenant_config_compaction_algorithm is not None: tenant_config = ps_cfg.setdefault("tenant_config", {}) tenant_config["compaction_algorithm"] = ( From 79ddc803af16e35c5d5a9b1c2c520c1fa88adcc4 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Sat, 10 May 2025 16:19:52 +0200 Subject: [PATCH 24/65] feat(direct IO): runtime alignment validation; support config flag on macOS; default to `DirectRw` (#11868) This PR adds a runtime validation mode to check adherence to alignment and size-multiple requirements at the VirtualFile level. This can help prevent alignment bugs from slipping into production because test systems may have more lax requirements than production. (This is not the case today, but it could change in the future). It also allows catching O_DIRECT bugs on systems that don't have O_DIRECT (macOS). Consequently, we can now accept `virtual_file_io_mode={direct,direct-rw}` on macOS now. This has the side benefit of removing some annoying conditional compilation around `IoMode`. A third benefit is that it helped weed out size-multiple requirement violation bugs in how the VirtualFile unit tests exercise read and write APIs. I seized the opportunity to trim these tests down to what actually matters, i.e., exercising of the `OpenFiles` file descriptor cache. 
Lastly, this PR flips the binary-built-in default to `DirectRw` so that when running Python regress tests and benchmarks without specifying `PAGESERVER_VIRTUAL_FILE_IO_MODE`, one gets the production behavior. Refs - fixes https://github.com/neondatabase/neon/issues/11676 --- .../pageserver_config/pageserver.toml | 1 + libs/pageserver_api/src/models.rs | 28 +- pageserver/benches/bench_ingest.rs | 9 +- pageserver/src/virtual_file.rs | 309 +++++++----------- pageserver/src/virtual_file/open_options.rs | 59 +++- .../fixtures/pageserver/allowed_errors.py | 7 + 6 files changed, 178 insertions(+), 235 deletions(-) diff --git a/docker-compose/pageserver_config/pageserver.toml b/docker-compose/pageserver_config/pageserver.toml index 7d603b6c65..81445ed412 100644 --- a/docker-compose/pageserver_config/pageserver.toml +++ b/docker-compose/pageserver_config/pageserver.toml @@ -5,3 +5,4 @@ listen_http_addr='0.0.0.0:9898' remote_storage={ endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/pageserver' } control_plane_api='http://0.0.0.0:6666' # No storage controller in docker compose, specify a junk address control_plane_emergency_mode=true +virtual_file_io_mode="buffered" # the CI runners where we run the docker compose tests have slow disks diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index ff911499ab..5fcdefba66 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1832,6 +1832,7 @@ pub mod virtual_file { Eq, Hash, strum_macros::EnumString, + strum_macros::EnumIter, strum_macros::Display, serde_with::DeserializeFromStr, serde_with::SerializeDisplay, @@ -1843,10 +1844,8 @@ pub mod virtual_file { /// Uses buffered IO. Buffered, /// Uses direct IO for reads only. - #[cfg(target_os = "linux")] Direct, /// Use direct IO for reads and writes. 
- #[cfg(target_os = "linux")] DirectRw, } @@ -1854,26 +1853,13 @@ pub mod virtual_file { pub fn preferred() -> Self { // The default behavior when running Rust unit tests without any further // flags is to use the newest behavior (DirectRw). - // The CI uses the following environment variable to unit tests for all - // different modes. + // The CI uses the environment variable to run unit tests for all the different modes. // NB: the Python regression & perf tests have their own defaults management // that writes pageserver.toml; they do not use this variable. - if cfg!(test) { - static CACHED: LazyLock = LazyLock::new(|| { - utils::env::var_serde_json_string( - "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IO_MODE", - ) - .unwrap_or( - #[cfg(target_os = "linux")] - IoMode::DirectRw, - #[cfg(not(target_os = "linux"))] - IoMode::Buffered, - ) - }); - *CACHED - } else { - IoMode::Buffered - } + static ENV_OVERRIDE: LazyLock> = LazyLock::new(|| { + utils::env::var_serde_json_string("NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IO_MODE") + }); + ENV_OVERRIDE.unwrap_or(IoMode::DirectRw) } } @@ -1883,9 +1869,7 @@ pub mod virtual_file { fn try_from(value: u8) -> Result { Ok(match value { v if v == (IoMode::Buffered as u8) => IoMode::Buffered, - #[cfg(target_os = "linux")] v if v == (IoMode::Direct as u8) => IoMode::Direct, - #[cfg(target_os = "linux")] v if v == (IoMode::DirectRw as u8) => IoMode::DirectRw, x => return Err(x), }) diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index 2836450a0e..eaadfe14ae 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -14,6 +14,7 @@ use pageserver_api::key::Key; use pageserver_api::models::virtual_file::IoMode; use pageserver_api::shard::TenantShardId; use pageserver_api::value::Value; +use strum::IntoEnumIterator; use tokio_util::sync::CancellationToken; use utils::bin_ser::BeSer; use utils::id::{TenantId, TimelineId}; @@ -244,13 +245,7 @@ fn criterion_benchmark(c: &mut 
Criterion) { ]; let exploded_parameters = { let mut out = Vec::new(); - for io_mode in [ - IoMode::Buffered, - #[cfg(target_os = "linux")] - IoMode::Direct, - #[cfg(target_os = "linux")] - IoMode::DirectRw, - ] { + for io_mode in IoMode::iter() { for param in expect.clone() { let HandPickedParameters { volume_mib, diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index f429e59ef3..c707d35114 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -74,6 +74,8 @@ pub struct VirtualFile { impl VirtualFile { /// Open a file in read-only mode. Like File::open. + /// + /// Insensitive to `virtual_file_io_mode` setting. pub async fn open>( path: P, ctx: &RequestContext, @@ -95,31 +97,20 @@ impl VirtualFile { Self::open_with_options_v2(path.as_ref(), OpenOptions::new().read(true), ctx).await } + /// `O_DIRECT` will be enabled based on `virtual_file_io_mode`. pub async fn open_with_options_v2>( path: P, - #[cfg_attr(not(target_os = "linux"), allow(unused_mut))] mut open_options: OpenOptions, + mut open_options: OpenOptions, ctx: &RequestContext, ) -> Result { let mode = get_io_mode(); - let set_o_direct = match (mode, open_options.is_write()) { + let direct = match (mode, open_options.is_write()) { (IoMode::Buffered, _) => false, - #[cfg(target_os = "linux")] (IoMode::Direct, false) => true, - #[cfg(target_os = "linux")] (IoMode::Direct, true) => false, - #[cfg(target_os = "linux")] (IoMode::DirectRw, _) => true, }; - if set_o_direct { - #[cfg(target_os = "linux")] - { - open_options = open_options.custom_flags(nix::libc::O_DIRECT); - } - #[cfg(not(target_os = "linux"))] - unreachable!( - "O_DIRECT is not supported on this platform, IoMode's that result in set_o_direct=true shouldn't even be defined" - ); - } + open_options = open_options.direct(direct); let inner = VirtualFileInner::open_with_options(path, open_options, ctx).await?; Ok(VirtualFile { inner, _mode: mode }) } @@ -791,6 +782,12 @@ impl VirtualFileInner { 
where Buf: tokio_epoll_uring::IoBufMut + Send, { + self.validate_direct_io( + Slice::stable_ptr(&buf).addr(), + Slice::bytes_total(&buf), + offset, + ); + let file_guard = match self .lock_file() .await @@ -816,6 +813,8 @@ impl VirtualFileInner { offset: u64, ctx: &RequestContext, ) -> (FullSlice, Result) { + self.validate_direct_io(buf.as_ptr().addr(), buf.len(), offset); + let file_guard = match self.lock_file().await { Ok(file_guard) => file_guard, Err(e) => return (buf, Err(e)), @@ -830,6 +829,64 @@ impl VirtualFileInner { (buf, result) }) } + + /// Validate all reads and writes to adhere to the O_DIRECT requirements of our production systems. + /// + /// Validating it in userspace sets a consistent bar, independent of what actual OS/filesystem/block device is in use. + fn validate_direct_io(&self, addr: usize, size: usize, offset: u64) { + // TODO: eventually enable validation in the builds we use in real environments like staging, preprod, and prod. + if !(cfg!(feature = "testing") || cfg!(test)) { + return; + } + if !self.open_options.is_direct() { + return; + } + + // Validate buffer memory alignment. + // + // What practically matters as of Linux 6.1 is bdev_dma_alignment() + // which is practically between 512 and 4096. + // On our production systems, the value is 512. + // The IoBuffer/IoBufferMut hard-code that value. + // + // Because the allocator might return _more_ aligned addresses than requested, + // there is a chance that testing would not catch violations of a runtime requirement stricter than 512. + { + let requirement = 512; + let remainder = addr % requirement; + assert!( + remainder == 0, + "Direct I/O buffer must be aligned: buffer_addr=0x{addr:x} % 0x{requirement:x} = 0x{remainder:x}" + ); + } + + // Validate offset alignment. + // + // We hard-code 512 throughout the code base. + // So enforce just that and not anything more restrictive. + // Even the shallowest testing will expose more restrictive requirements if those ever arise. 
+ { + let requirement = 512; + let remainder = offset % requirement; + assert!( + remainder == 0, + "Direct I/O offset must be aligned: offset=0x{offset:x} % 0x{requirement:x} = 0x{remainder:x}" + ); + } + + // Validate buffer size multiple requirement. + // + // The requirement in Linux 6.1 is bdev_logical_block_size(). + // On our production systems, that is 512. + { + let requirement = 512; + let remainder = size % requirement; + assert!( + remainder == 0, + "Direct I/O buffer size must be a multiple of {requirement}: size=0x{size:x} % 0x{requirement:x} = 0x{remainder:x}" + ); + } + } } // Adapted from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135 @@ -1218,7 +1275,6 @@ mod tests { use std::sync::Arc; use owned_buffers_io::io_buf_ext::IoBufExt; - use owned_buffers_io::slice::SliceMutExt; use rand::seq::SliceRandom; use rand::{Rng, thread_rng}; @@ -1226,162 +1282,38 @@ mod tests { use crate::context::DownloadBehavior; use crate::task_mgr::TaskKind; - enum MaybeVirtualFile { - VirtualFile(VirtualFile), - File(File), - } - - impl From for MaybeVirtualFile { - fn from(vf: VirtualFile) -> Self { - MaybeVirtualFile::VirtualFile(vf) - } - } - - impl MaybeVirtualFile { - async fn read_exact_at( - &self, - mut slice: tokio_epoll_uring::Slice, - offset: u64, - ctx: &RequestContext, - ) -> Result, Error> { - match self { - MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(slice, offset, ctx).await, - MaybeVirtualFile::File(file) => { - let rust_slice: &mut [u8] = slice.as_mut_rust_slice_full_zeroed(); - file.read_exact_at(rust_slice, offset).map(|()| slice) - } - } - } - async fn write_all_at( - &self, - buf: FullSlice, - offset: u64, - ctx: &RequestContext, - ) -> Result<(), Error> { - match self { - MaybeVirtualFile::VirtualFile(file) => { - let (_buf, res) = file.write_all_at(buf, offset, ctx).await; - res - } - MaybeVirtualFile::File(file) => file.write_all_at(&buf[..], offset), - } - } - - // Helper function to slurp a portion of a file 
into a string - async fn read_string_at( - &mut self, - pos: u64, - len: usize, - ctx: &RequestContext, - ) -> Result { - let slice = IoBufferMut::with_capacity(len).slice_full(); - assert_eq!(slice.bytes_total(), len); - let slice = self.read_exact_at(slice, pos, ctx).await?; - let buf = slice.into_inner(); - assert_eq!(buf.len(), len); - - Ok(String::from_utf8(buf.to_vec()).unwrap()) - } - } - #[tokio::test] async fn test_virtual_files() -> anyhow::Result<()> { - // The real work is done in the test_files() helper function. This - // allows us to run the same set of tests against a native File, and - // VirtualFile. We trust the native Files and wouldn't need to test them, - // but this allows us to verify that the operations return the same - // results with VirtualFiles as with native Files. (Except that with - // native files, you will run out of file descriptors if the ulimit - // is low enough.) - struct A; - - impl Adapter for A { - async fn open( - path: Utf8PathBuf, - opts: OpenOptions, - ctx: &RequestContext, - ) -> Result { - let vf = VirtualFile::open_with_options_v2(&path, opts, ctx).await?; - Ok(MaybeVirtualFile::VirtualFile(vf)) - } - } - test_files::("virtual_files").await - } - - #[tokio::test] - async fn test_physical_files() -> anyhow::Result<()> { - struct B; - - impl Adapter for B { - async fn open( - path: Utf8PathBuf, - opts: OpenOptions, - _ctx: &RequestContext, - ) -> Result { - Ok(MaybeVirtualFile::File({ - let owned_fd = opts.open(path.as_std_path()).await?; - File::from(owned_fd) - })) - } - } - - test_files::("physical_files").await - } - - /// This is essentially a closure which returns a MaybeVirtualFile, but because rust edition - /// 2024 is not yet out with new lifetime capture or outlives rules, this is a async function - /// in trait which benefits from the new lifetime capture rules already. 
- trait Adapter { - async fn open( - path: Utf8PathBuf, - opts: OpenOptions, - ctx: &RequestContext, - ) -> Result; - } - - async fn test_files(testname: &str) -> anyhow::Result<()> - where - A: Adapter, - { let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test(); - let testdir = crate::config::PageServerConf::test_repo_dir(testname); + let testdir = crate::config::PageServerConf::test_repo_dir("test_virtual_files"); std::fs::create_dir_all(&testdir)?; + let zeropad512 = |content: &[u8]| { + let mut buf = IoBufferMut::with_capacity_zeroed(512); + buf[..content.len()].copy_from_slice(content); + buf.freeze().slice_len() + }; + let path_a = testdir.join("file_a"); - let mut file_a = A::open( + let file_a = VirtualFile::open_with_options_v2( path_a.clone(), OpenOptions::new() + .read(true) .write(true) + // set create & truncate flags to ensure when we trigger a reopen later in this test, + // the reopen_options must have masked out those flags; if they don't, then + // after the reopen we will fail to read the `content_a` that we write here. .create(true) - .truncate(true) - .to_owned(), + .truncate(true), &ctx, ) .await?; + let (_, res) = file_a.write_all_at(zeropad512(b"content_a"), 0, &ctx).await; + res?; - file_a - .write_all_at(IoBuffer::from(b"foobar").slice_len(), 0, &ctx) - .await?; - - // cannot read from a file opened in write-only mode - let _ = file_a.read_string_at(0, 1, &ctx).await.unwrap_err(); - - // Close the file and re-open for reading - let mut file_a = A::open(path_a, OpenOptions::new().read(true), &ctx).await?; - - // cannot write to a file opened in read-only mode - let _ = file_a - .write_all_at(IoBuffer::from(b"bar").slice_len(), 0, &ctx) - .await - .unwrap_err(); - - // Try simple read - assert_eq!("foobar", file_a.read_string_at(0, 6, &ctx).await?); - - // Create another test file, and try FileExt functions on it. 
let path_b = testdir.join("file_b"); - let mut file_b = A::open( + let file_b = VirtualFile::open_with_options_v2( path_b.clone(), OpenOptions::new() .read(true) @@ -1391,37 +1323,44 @@ mod tests { &ctx, ) .await?; - file_b - .write_all_at(IoBuffer::from(b"BAR").slice_len(), 3, &ctx) - .await?; - file_b - .write_all_at(IoBuffer::from(b"FOO").slice_len(), 0, &ctx) - .await?; + let (_, res) = file_b.write_all_at(zeropad512(b"content_b"), 0, &ctx).await; + res?; - assert_eq!(file_b.read_string_at(2, 3, &ctx).await?, "OBA"); + let assert_first_512_eq = async |vfile: &VirtualFile, expect: &[u8]| { + let buf = vfile + .read_exact_at(IoBufferMut::with_capacity_zeroed(512).slice_full(), 0, &ctx) + .await + .unwrap(); + assert_eq!(&buf[..], &zeropad512(expect)[..]); + }; - // Open a lot of files, enough to cause some evictions. (Or to be precise, - // open the same file many times. The effect is the same.) + // Open a lot of file descriptors / VirtualFile instances. + // Enough to cause some evictions in the fd cache. - let mut vfiles = Vec::new(); + let mut file_b_dupes = Vec::new(); for _ in 0..100 { - let mut vfile = A::open(path_b.clone(), OpenOptions::new().read(true), &ctx).await?; - assert_eq!("FOOBAR", vfile.read_string_at(0, 6, &ctx).await?); - vfiles.push(vfile); + let vfile = VirtualFile::open_with_options_v2( + path_b.clone(), + OpenOptions::new().read(true), + &ctx, + ) + .await?; + assert_first_512_eq(&vfile, b"content_b").await; + file_b_dupes.push(vfile); } // make sure we opened enough files to definitely cause evictions. - assert!(vfiles.len() > TEST_MAX_FILE_DESCRIPTORS * 2); + assert!(file_b_dupes.len() > TEST_MAX_FILE_DESCRIPTORS * 2); // The underlying file descriptor for 'file_a' should be closed now. Try to read - // from it again. - assert_eq!("foobar", file_a.read_string_at(0, 6, &ctx).await?); + // from it again. The VirtualFile reopens the file internally. 
+ assert_first_512_eq(&file_a, b"content_a").await; // Check that all the other FDs still work too. Use them in random order for // good measure. - vfiles.as_mut_slice().shuffle(&mut thread_rng()); - for vfile in vfiles.iter_mut() { - assert_eq!("OOBAR", vfile.read_string_at(1, 5, &ctx).await?); + file_b_dupes.as_mut_slice().shuffle(&mut thread_rng()); + for vfile in file_b_dupes.iter_mut() { + assert_first_512_eq(vfile, b"content_b").await; } Ok(()) @@ -1452,7 +1391,7 @@ mod tests { // Open the file many times. let mut files = Vec::new(); for _ in 0..VIRTUAL_FILES { - let f = VirtualFileInner::open_with_options( + let f = VirtualFile::open_with_options_v2( &test_file_path, OpenOptions::new().read(true), &ctx, @@ -1497,8 +1436,6 @@ mod tests { #[tokio::test] async fn test_atomic_overwrite_basic() { - let ctx = - RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test(); let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_basic"); std::fs::create_dir_all(&testdir).unwrap(); @@ -1508,26 +1445,22 @@ mod tests { VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec()) .await .unwrap(); - let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap()); - let post = file.read_string_at(0, 3, &ctx).await.unwrap(); + + let post = std::fs::read_to_string(&path).unwrap(); assert_eq!(post, "foo"); assert!(!tmp_path.exists()); - drop(file); VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec()) .await .unwrap(); - let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap()); - let post = file.read_string_at(0, 3, &ctx).await.unwrap(); + + let post = std::fs::read_to_string(&path).unwrap(); assert_eq!(post, "bar"); assert!(!tmp_path.exists()); - drop(file); } #[tokio::test] async fn test_atomic_overwrite_preexisting_tmp() { - let ctx = - RequestContext::new(TaskKind::UnitTest, 
DownloadBehavior::Error).with_scope_unit_test(); let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_preexisting_tmp"); std::fs::create_dir_all(&testdir).unwrap(); @@ -1542,10 +1475,8 @@ mod tests { .await .unwrap(); - let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap()); - let post = file.read_string_at(0, 3, &ctx).await.unwrap(); + let post = std::fs::read_to_string(&path).unwrap(); assert_eq!(post, "foo"); assert!(!tmp_path.exists()); - drop(file); } } diff --git a/pageserver/src/virtual_file/open_options.rs b/pageserver/src/virtual_file/open_options.rs index a40dfed4a4..7d478f3600 100644 --- a/pageserver/src/virtual_file/open_options.rs +++ b/pageserver/src/virtual_file/open_options.rs @@ -8,7 +8,13 @@ use super::io_engine::IoEngine; #[derive(Debug, Clone)] pub struct OpenOptions { + /// We keep a copy of the write() flag we pass to the `inner` `OpenOptions` + /// to support [`Self::is_write`]. write: bool, + /// We don't expose + pass through a raw `custom_flags()` style API. + /// The only custom flag we support is `O_DIRECT`, which we track here + /// and map to `custom_flags()` in the [`Self::open`] method. 
+ direct: bool, inner: Inner, } #[derive(Debug, Clone)] @@ -30,6 +36,7 @@ impl Default for OpenOptions { }; Self { write: false, + direct: false, inner, } } @@ -44,6 +51,10 @@ impl OpenOptions { self.write } + pub(super) fn is_direct(&self) -> bool { + self.direct + } + pub fn read(mut self, read: bool) -> Self { match &mut self.inner { Inner::StdFs(x) => { @@ -116,13 +127,38 @@ impl OpenOptions { } pub(in crate::virtual_file) async fn open(&self, path: &Path) -> std::io::Result { - match &self.inner { - Inner::StdFs(x) => x.open(path).map(|file| file.into()), + #[cfg_attr(not(target_os = "linux"), allow(unused_mut))] + let mut custom_flags = 0; + if self.direct { #[cfg(target_os = "linux")] - Inner::TokioEpollUring(x) => { + { + custom_flags |= nix::libc::O_DIRECT; + } + #[cfg(not(target_os = "linux"))] + { + // Other platforms may be used for development but don't necessarily have a 1:1 equivalent to Linux's O_DIRECT (macOS!). + // Just don't set the flag; to catch alignment bugs typical for O_DIRECT, + // we have a runtime validation layer inside `VirtualFile::write_at` and `VirtualFile::read_at`. 
+ static WARNING: std::sync::Once = std::sync::Once::new(); + WARNING.call_once(|| { + let span = tracing::info_span!(parent: None, "open_options"); + let _enter = span.enter(); + tracing::warn!("your platform is not a supported production platform, ignoing request for O_DIRECT; this could hide alignment bugs; this warning is logged once per process"); + }); + } + } + + match self.inner.clone() { + Inner::StdFs(mut x) => x + .custom_flags(custom_flags) + .open(path) + .map(|file| file.into()), + #[cfg(target_os = "linux")] + Inner::TokioEpollUring(mut x) => { + x.custom_flags(custom_flags); let system = super::io_engine::tokio_epoll_uring_ext::thread_local_system().await; let (_, res) = super::io_engine::retry_ecanceled_once((), |()| async { - let res = system.open(path, x).await; + let res = system.open(path, &x).await; ((), res) }) .await; @@ -144,19 +180,8 @@ impl OpenOptions { self } - pub fn custom_flags(mut self, flags: i32) -> Self { - if flags & nix::libc::O_APPEND != 0 { - super::io_engine::panic_operation_must_be_idempotent(); - } - match &mut self.inner { - Inner::StdFs(x) => { - let _ = x.custom_flags(flags); - } - #[cfg(target_os = "linux")] - Inner::TokioEpollUring(x) => { - let _ = x.custom_flags(flags); - } - } + pub fn direct(mut self, direct: bool) -> Self { + self.direct = direct; self } } diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 43bffd919c..9b564f0a60 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -111,6 +111,13 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( ".*stalling layer flushes for compaction backpressure.*", ".*layer roll waiting for flush due to compaction backpressure.*", ".*BatchSpanProcessor.*", + *( + [ + r".*your platform is not a supported production platform, ignoing request for O_DIRECT; this could hide alignment bugs.*" + ] + if sys.platform != "linux" + else [] + ), ) From 
64353b48dbd5a73fc2cf9c9eb1bd3c9b442715cc Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Sat, 10 May 2025 17:06:06 +0200 Subject: [PATCH 25/65] direct+concurrent IO: retroactive RFC (#11788) refs - direct IO epic: https://github.com/neondatabase/neon/issues/8130 - concurrent IO epic https://github.com/neondatabase/neon/issues/9378 - obsoletes direct IO proposal RFC: https://github.com/neondatabase/neon/pull/8240 - discussion in https://neondb.slack.com/archives/C07BZ38E6SD/p1746028030574349 --- docs/rfcs/030-vectored-timeline-get.md | 2 + .../2025-04-30-direct-io-for-pageserver.md | 362 ++++++++++++++++++ ...0-pageserver-concurrent-io-on-read-path.md | 251 ++++++++++++ 3 files changed, 615 insertions(+) create mode 100644 docs/rfcs/2025-04-30-direct-io-for-pageserver.md create mode 100644 docs/rfcs/2025-04-30-pageserver-concurrent-io-on-read-path.md diff --git a/docs/rfcs/030-vectored-timeline-get.md b/docs/rfcs/030-vectored-timeline-get.md index 093a964f38..e933eac5fe 100644 --- a/docs/rfcs/030-vectored-timeline-get.md +++ b/docs/rfcs/030-vectored-timeline-get.md @@ -7,6 +7,8 @@ Author: Christian Schwarz A brief RFC / GitHub Epic describing a vectored version of the `Timeline::get` method that is at the heart of Pageserver. +**EDIT**: the implementation of this feature is described in [Vlad's (internal) tech talk](https://drive.google.com/file/d/1vfY24S869UP8lEUUDHRWKF1AJn8fpWoJ/view?usp=drive_link). + # Motivation During basebackup, we issue many `Timeline::get` calls for SLRU pages that are *adjacent* in key space. diff --git a/docs/rfcs/2025-04-30-direct-io-for-pageserver.md b/docs/rfcs/2025-04-30-direct-io-for-pageserver.md new file mode 100644 index 0000000000..847f5e4040 --- /dev/null +++ b/docs/rfcs/2025-04-30-direct-io-for-pageserver.md @@ -0,0 +1,362 @@ +# Direct IO For Pageserver + +Date: Apr 30, 2025 + +## Summary + +This document is a retroactive RFC. 
It +- provides some background on what direct IO is, +- motivates why Pageserver should be using it for its IO, and +- describes how we changed Pageserver to use it. + +The [initial proposal](https://github.com/neondatabase/neon/pull/8240) that kicked off the work can be found in this closed GitHub PR. + +People primarily involved in this project were: +- Yuchen Liang +- Vlad Lazar +- Christian Schwarz + +## Timeline + +For posterity, here is the rough timeline of the development work that got us to where we are today. + +- Jan 2024: [integrate `tokio-epoll-uring`](https://github.com/neondatabase/neon/pull/5824) along with owned buffers API +- March 2024: `tokio-epoll-uring` enabled in all regions in buffered IO mode +- Feb 2024 to June 2024: PS PageCache Bypass For Data Blocks + - Feb 2024: [Vectored Get Implementation](https://github.com/neondatabase/neon/pull/6576) bypasses delta & image layer blocks for page requests + - Apr to June 2024: [Epic: bypass PageCache for user data blocks](https://github.com/neondatabase/neon/issues/7386) addresses remaining users +- Aug to Nov 2024: direct IO: first code; preliminaries; read path coding; BufferedWriter; benchmarks show perf regressions too high, no-go. +- Nov 2024 to Jan 2025: address perf regressions by developing page_service pipelining (aka batching) and concurrent IO ([Epic](https://github.com/neondatabase/neon/issues/9376)) +- Feb to March 2025: rollout batching, then concurrent+direct IO => read path and InMemoryLayer is now direct IO +- Apr 2025: develop & roll out direct IO for the write path + +## Background: Terminology & Glossary + +**kernel page cache**: the Linux kernel's page cache is a write-back cache for filesystem contents. +The cached unit is memory-page-sized & aligned chunks of the files that are being cached (typically 4k). +The cache lives in kernel memory and is not directly accessible through userspace.
+ +**Buffered IO**: an application's read/write system calls go through the kernel page cache. +For example, a 10 byte sized read or write to offset 5000 in a file will load the file contents +at offset `[4096,8192)` into a free page in the kernel page cache. If necessary, it will evict +a page to make room (cf eviction). Then, the kernel performs a memory-to-memory copy of 10 bytes +from/to the offset `904` (`5000 = 4096 + 904`) within the cached page. If it's a write, the kernel keeps +track of the fact that the page is now "dirty" in some ancillary structure. + +**Writeback**: a buffered read/write syscall returns after the memory-to-memory copy. The modifications +made by e.g. write system calls are not even *issued* to disk, let alone durable. Instead, the kernel +asynchronously writes back dirtied pages based on a variety of conditions. For us, the most relevant +ones are a) explicit request by userspace (`fsync`) and b) memory pressure. + +**Memory pressure**: the kernel page cache is a best effort service and a user of spare memory capacity. +If there is no free memory, the kernel page allocator will take pages used by page cache to satisfy allocations. +Before reusing a page like that, the page has to be written back (writeback, see above). +The far-reaching consequence of this is that **any allocation of anonymous memory can do IO** if the only +way to get that memory is by eviction & re-using a dirty page cache page. +Notably, this includes a simple `malloc` in userspace, because eventually that boils down to `mmap(..., MAP_ANON, ...)`. +I refer to this effect as the "malloc latency backscatter" caused by buffered IO. + +**Direct IO** allows application's read/write system calls to bypass the kernel page cache. The filesystem +is still involved because it is ultimately in charge of mapping the concept of files & offsets within them +to sectors on block devices.
Typically, the filesystem poses size and alignment requirements for memory buffers +and file offsets (statx `Dio_mem_align` / `Dio_offset_align`), see [this gist](https://gist.github.com/problame/1c35cac41b7cd617779f8aae50f97155). +The IO operations will fail at runtime with EINVAL if the alignment requirements are not met. + +**"buffered" vs "direct"**: the central distinction between buffered and direct IO is about who allocates and +fills the IO buffers, and who controls when exactly the IOs are issued. In buffered IO, it's the syscall handlers, +kernel page cache, and memory management subsystems (cf "writeback"). In direct IO, all of it is done by +the application. +It takes more effort by the application to program with direct instead of buffered IO. +The return is precise control over and a clear distinction between consumption/modification of memory vs disk. + +**Pageserver PageCache**: Pageserver has an additional `PageCache` (referred to as PS PageCache from here on, as opposed to "kernel page cache"). +Its caching unit is 8KiB blocks of the layer files written by Pageserver. +A miss in PageCache is filled by reading from the filesystem, through the `VirtualFile` abstraction layer. +The default size is tiny (64MiB), very much like Postgres's `shared_buffers`. +We ran production at 128MiB for a long time but gradually moved it up to 2GiB over the past ~year. + +**VirtualFile** is Pageserver's abstraction for file IO, very similar to the facility in Postgres that bears the same name. +Its historical purpose appears to be working around open file descriptor limitations, which is practically irrelevant on Linux. +However, the facility in Pageserver is useful as an intermediary layer for metrics and abstracts over the different kinds of +IO engines that Pageserver supports (`std-fs` vs `tokio-epoll-uring`). + +## Background: History Of Caching In Pageserver + +For multiple years, Pageserver's `PageCache` was on the path of all read _and write_ IO. 
+It performed write-back to the kernel using buffered IO. + +We converted it into a read-only cache of immutable data in [PR 4994](https://github.com/neondatabase/neon/pull/4994). + +The introduction of `tokio-epoll-uring` required converting the code base to use owned IO buffers. +The `PageCache` pages are usable as owned IO buffers. + +We then started bypassing PageCache for user data blocks. +Data blocks are the 8k blocks of data in layer files that hold the multiple `Value`s, as opposed to the disk btree index blocks that tell us which values exist in a file at what offsets. +The disk btree embedded in delta & image layers remains `PageCache`'d. +Epics for that work were: +- Vectored `Timeline::get` (cf RFC 30) skipped delta and image layer data block `PageCache`ing outright. +- Epic https://github.com/neondatabase/neon/issues/7386 took care of the remaining users for data blocks: + - Materialized page cache (cached materialized pages; shown to be ~0% hit rate in practice) + - InMemoryLayer + - Compaction + +The outcome of the above: +1. All data blocks are always read through the `VirtualFile` APIs, hitting the kernel buffered read path (=> kernel page cache). +2. Indirect blocks (=disk btree blocks) would be cached in the PS `PageCache`. + +In production we size the PS `PageCache` to be 2GiB. +This drives hit rate up to ~99.95% and the eviction rate / replacement rates down to less than 200/second on a 1-minute average, on the busiest machines. +High baseline replacement rates are treated as a signal of resource exhaustion (page cache insufficient to host working set of the PS). +The response to this is to migrate tenants away, or increase PS `PageCache` size. +It is currently manual but could be automated, e.g., in Storage Controller. + +In the future, we may eliminate the `PageCache` even for indirect blocks. +For example with an LRU cache that has as unit the entire disk btree content +instead of individual blocks.
+ +## High-Level Design + +So, before work on this project started, all data block reads and the entire write path of Pageserver were using kernel-buffered IO, i.e., the kernel page cache. +We now want to get the kernel page cache out of the picture by using direct IO for all interaction with the filesystem. +This achieves the following system properties: + +**Predictable VirtualFile latencies** +* With buffered IO, reads are sometimes fast, sometimes slow, depending on kernel page cache hit/miss. +* With buffered IO, appends when writing out new layer files during ingest or compaction are sometimes fast, sometimes slow because of write-back backpressure. +* With buffered IO, the "malloc backscatter" phenomenon pointed out in the Glossary section is not something we actively observe. + But we do have occasional spikes in Dirty memory amount and Memory PSI graphs, so it may already be affecting us to some degree. +* By switching to direct IO, above operations will have the (predictable) device latency -- always. + Reads and appends always go to disk. + And malloc will not have to write back dirty data. + +**Explicitness & Tangibility of resource usage** +* In a multi-tenant system, it is generally desirable and valuable to be *explicit* about the main resources we use for each tenant. +* By using direct IO, we become explicit about the resources *disk IOPs* and *memory capacity* in a way that was previously being conflated through the kernel page cache, outside our immediate control. +* We will be able to build per-tenant observability of resource usage ("what tenant is causing the actual IOs that are sent to the disk?"). +* We will be able to build accounting & QoS by implementing an IO scheduler that is tenant aware. The kernel is not tenant-aware and can't do that. + +**CPU Efficiency** +* The involvement of the kernel page cache means one additional memory-to-memory copy on read and write path.
+* Direct IO will eliminate that memory-to-memory copy, if we can make the userspace buffers used for the IO calls satisfy direct IO alignment requirements. + +The **trade-off** is that we no longer get the theoretical benefits of the kernel page cache. These are: +- read latency improvements for repeat reads of the same data ("locality of reference") + - asterisk: only if that state is still cache-resident by time of next access +- write throughput by having kernel page cache batch small VFS writes into bigger disk writes + - asterisk: only if memory pressure is low enough that the kernel can afford to delay writeback + +We are **happy to make this trade-off**: +- Because of the advantages listed above. +- Because we empirically have enough DRAM on Pageservers to serve metadata (=index blocks) from PS PageCache. + (At just 2GiB PS PageCache size, we average a 99.95% hit rate). + So, the latency of going to disk is only for data block reads, not the index traversal. +- Because **the kernel page cache is ineffective** at high tenant density anyway (#tenants/pageserver instance). + And because dense packing of tenants will always be desirable to drive COGS down, we should design the system for it. + (See the appendix for a more detailed explanation why this is). +- So, we accept that some reads that used to be fast by circumstance will have higher but **predictable** latency than before. + +### Desired End State + +The desired end state of the project is as follows, and with some asterisks, we have achieved it. + +All IOs of the Pageserver data path use direct IO, thereby bypassing the kernel page cache. + +In particular, the "data path" includes +- the wal ingest path +- compaction +- anything on the `Timeline::get` / `Timeline::get_vectored` path. + +The production Pageserver config is tuned such that virtually all non-data blocks are cached in the PS PageCache. +Hit rate target is 99.95%. + +There are no regressions to ingest latency. 
+ +The total "wait-for-disk time" contribution to random getpage request latency is `O(1 read IOP latency)`. +We accomplish that by having a near 100% PS PageCache hit rate so that layer index traversal effectively never needs to wait for IO. +Thereby, it can issue all the data blocks as it traverses the index, and only wait at the end of it (concurrent IO). + +The amortized "wait-for-disk time" contribution of this direct IO proposal to a series of sequential getpage requests is `1/32 * read IOP latency` for each getpage request. +We accomplish this by server-side batching of up to 32 reads into a single `Timeline::get_vectored` call. +(This is an ideal world where our batches are full - that's not the case in prod today because of lack of queue depth). + +## Design & Implementation + +### Prerequisites + +A lot of prerequisite work had to happen to enable use of direct IO. + +To meet the "wait-for-disk time" requirements from the DoD, we implement for the read path: +- page_service level server-side batching (config field `page_service_pipelining`) +- concurrent IO (config field `get_vectored_concurrent_io`) +The work for both of these was tracked [in the epic](https://github.com/neondatabase/neon/issues/9376). +Server-side batching will likely be obsoleted by the [#proj-compute-communicator](https://github.com/neondatabase/neon/pull/10799). +The Concurrent IO work is described in retroactive RFC `2025-04-30-pageserver-concurrent-io-on-read-path.md`. +The implementation is relatively brittle and needs further investment, see the `Future Work` section in that RFC. + +For the write path, and especially WAL ingest, we need to hide write latency. +We accomplish this by implementing a (`BufferedWriter`) type that does double-buffering: flushes of the filled +buffer happen in a sidecar tokio task while new writes fill a new buffer. +We refactor InMemoryLayer as well as BlobWriter (=> delta and image layer writers) to use this new `BufferedWriter`.
+The most comprehensive write-up of this work is in [the PR description](https://github.com/neondatabase/neon/pull/11558). + +### Ensuring Adherence to Alignment Requirements + +Direct IO puts requirements on +- memory buffer alignment +- io size (=memory buffer size) +- file offset alignment + +The requirements are specific to a combination of filesystem/block-device/architecture(hardware page size!). + +In Neon production environments we currently use ext4 with Linux 6.1.X on AWS and Azure storage-optimized instances (locally attached NVMe). +Instead of dynamic discovery using `statx`, we statically hard-code 512 bytes as the buffer/offset alignment and size-multiple. +We made this decision because: +- a) it is compatible with all the environments we need to run in +- b) our primary workload can be small-random-read-heavy (we do merge adjacent reads if possible, but the worst case is that all `Value`s that need to be read are far apart) +- c) 512-byte tail latency on the production instance types is much better than 4k (p99.9: 3x lower, p99.99 5x lower). +- d) hard-coding at compile-time allows us to use the Rust type system to enforce the use of only aligned IO buffers, eliminating a source of runtime errors typically associated with direct IO. + +This was [discussed here](https://neondb.slack.com/archives/C07BZ38E6SD/p1725036790965549?thread_ts=1725026845.455259&cid=C07BZ38E6SD). + +The new `IoBufAligned` / `IoBufAlignedMut` marker traits indicate that a given buffer meets memory alignment requirements. +All `VirtualFile` APIs and several software layers built on top of them only accept buffers that implement those traits. +Implementors of the marker traits are: +- `IoBuffer` / `IoBufferMut`: used for most reads and writes +- `PageWriteGuardBuf`: for filling PS PageCache pages (index blocks!) + +The alignment requirement is infectious; it permeates bottom-up throughout the code base.
+We stop the infection at roughly the same layers in the code base where we stopped permeating the +use of owned-buffers-style API for tokio-epoll-uring. The way the stopping works is by introducing +a memory-to-memory copy from/to some unaligned memory location on the stack/current/heap. +The places where we currently stop permeating are sort of arbitrary. For example, it would probably +make sense to replace more usage of `Bytes` that we know holds 8k pages with 8k-sized `IoBuffer`s. + +The `IoBufAligned` / `IoBufAlignedMut` types do not protect us from the following types of runtime errors: +- non-adherence to file offset alignment requirements +- non-adherence to io size requirements + +The following higher-level constructs ensure we meet the requirements: +- read path: the `ChunkedVectoredReadBuilder` and `mod vectored_dio_read` ensure reads happen at aligned offsets and in appropriate size multiples. +- write path: `BufferedWriter` only writes in multiples of the capacity, at offsets that are `start_offset+N*capacity`; see its doc comment. + +Note that these types are used always, regardless of whether direct IO is enabled or not. +There are some cases where this adds unnecessary overhead to buffered IO (e.g. all memcpy's inflated to multiples of 512). +But we could not identify meaningful impact in practice when we shipped these changes while we were still using buffered IO. + +### Configuration / Feature Flagging + +In the previous section we described how all users of VirtualFile were changed to always adhere to direct IO alignment and size-multiple requirements. +To actually enable direct IO, all we need to do is set the `O_DIRECT` flag in `open` syscalls / io_uring operations. + +We set `O_DIRECT` based on: +- the VirtualFile API used to create/open the VirtualFile instance +- the `virtual_file_io_mode` configuration flag +- the OpenOptions `read` and/or `write` flags. 
+ +The VirtualFile APIs suffixed with `_v2` are the only ones that _may_ open with `O_DIRECT` depending on the other two factors in above list. +Other APIs never use `O_DIRECT`. +(The name is bad and should really be `_maybe_direct_io`.) + +The reason for having new APIs is because all code used VirtualFile but implementation and rollout happened in consecutive phases (read path, InMemoryLayer, write path). +At the VirtualFile level, context on whether an instance of VirtualFile is on read path, InMemoryLayer, or write path is not available. + +The `_v2` APIs then make the decision to set `O_DIRECT` based on the `virtual_file_io_mode` flag and the OpenOptions `read`/`write` flags. +The result is the following runtime behavior: + +|what|OpenOptions|`v_f_io_mode`
=`buffered`|`v_f_io_mode`
=`direct`|`v_f_io_mode`
=`direct-rw`| +|-|-|-|-|-| +|`DeltaLayerInner`|read|()|O_DIRECT|O_DIRECT| +|`ImageLayerInner`|read|()|O_DIRECT|O_DIRECT| +|`InMemoryLayer`|read + write|()|()*|O_DIRECT| +|`DeltaLayerWriter`| write | () | () | O_DIRECT | +|`ImageLayerWriter`| write | () | () | O_DIRECT | +|`download_layer_file`|write |()|()|O_DIRECT| + +The `InMemoryLayer` is marked with `*` because there was a period when it *did* use O_DIRECT under `=direct`. +That period was when we implemented and shipped the first version of `BufferedWriter`. +We used it in `InMemoryLayer` and `download_layer_file` but it was only sensitive to `v_f_io_mode` in `InMemoryLayer`. +The introduction of `=direct-rw`, and the switch of the remaining write path to `BufferedWriter`, happened later, +in https://github.com/neondatabase/neon/pull/11558. + +Note that this way of feature flagging inside VirtualFile makes it less and less a general purpose POSIX file access abstraction. +For example, with `=direct-rw` enabled, it is no longer possible to open a `VirtualFile` without `O_DIRECT`. It'll always be set. + +## Correctness Validation + +The correctness risks with this project were: +- Memory safety issues in the `IoBuffer` / `IoBufferMut` implementation. + These types expose an API that is largely identical to that of the `bytes` crate and/or Vec. +- Runtime errors (=> downtime / unavailability) because of non-adherence to alignment/size-multiple requirements, resulting in EINVAL on the read path. + +We sadly do not have infrastructure to run pageserver under `cargo miri`. +So for memory safety issues, we relied on careful peer review. + +We do assert the production-like alignment requirements in testing builds. +However, these asserts were added retroactively. +The actual validation before rollout happened in staging and pre-prod. +We eventually enabled `=direct`/`=direct-rw` for Rust unit tests and the regression test suite. 
+I cannot recall a single instance of staging/pre-prod/production errors caused by non-adherence to alignment/size-multiple requirements. +Evidently developer testing was good enough. + +## Performance Validation + +The read path went through a lot of iterations of benchmarking in staging and pre-prod. +The benchmarks in those environments demonstrated performance regressions early in the implementation. +It was actually this performance testing that made us implement batching and concurrent IO to avoid unacceptable regressions. + +The write path was much quicker to validate because `bench_ingest` covered all of the (less numerous) access patterns. + +## Future Work + +There is minor and major follow-up work that can be considered in the future. +Check the (soon-to-be-closed) Epic https://github.com/neondatabase/neon/issues/8130's "Follow-Ups" section for a current list. + +Read Path: +- PS PageCache hit rate is crucial to unlock concurrent IO and reasonable latency for random reads generally. + Instead of reactively sizing PS PageCache, we should estimate the required PS PageCache size + and potentially also use that to drive placement decisions of shards from StorageController + https://github.com/neondatabase/neon/issues/9288 +- ... unless we get rid of PS PageCache entirely and cache the index block in a more specialized cache. + But even then, an estimation of the working set would be helpful to figure out caching strategy. + +Write Path: +- BlobWriter and its users could switch back to a borrowed API https://github.com/neondatabase/neon/issues/10129 +- ... 
unless we want to implement bypass mode for large writes https://github.com/neondatabase/neon/issues/10101 +- The `TempVirtualFile` introduced as part of this project could internalize more of the common usage pattern: https://github.com/neondatabase/neon/issues/11692 +- Reduce conditional compilation around `virtual_file_io_mode`: https://github.com/neondatabase/neon/issues/11676 + +Both: +- A performance simulation mode that pads VirtualFile op latencies to typical NVMe latencies, even if the underlying storage is faster. + This would avoid misleadingly good performance on developer systems and in benchmarks on systems that are less busy than production hosts. + However, padding latencies at microsecond scale is non-trivial. + +Misc: +- We should finish trimming VirtualFile's scope to be truly limited to core data path read & write. + Abstractions for reading & writing pageserver config, location config, heatmaps, etc, should use + APIs in a different package (`VirtualFile::crashsafe_overwrite` and `VirtualFile::read_to_string` + are good entrypoints for cleanup.) https://github.com/neondatabase/neon/issues/11809 + +# Appendix + +## Why Kernel Page Cache Is Ineffective At Tenant High Density + +In the Motivation section, we stated: + +> - **The kernel page cache is ineffective** at high tenant density anyways (#tenants/pageserver instance). + +The reason is that the Pageserver workload sent from Computes is whatever is a Compute cache(s) miss. +That's either sequential scans or random reads. +A random read workload simply causes cache thrashing because a packed Pageserver NVMe drive (`im4gn.2xlarge`) has ~100x more capacity than DRAM available. +It is a complete waste to have the kernel page cache cache data blocks in this case. +Sequential read workloads *can* benefit iff those pages have been updated recently (=no image layer yet) and together in time/LSN space. +In such cases, the WAL records of those updates likely sit on the same delta layer block.
+When Compute does a sequential scan, it sends a series of single-page requests for these individual pages. +When Pageserver processes the second request in such a series, it goes to the same delta layer block and has a kernel page cache hit. +This dependence on kernel page cache for sequential scan performance is significant, but the solution is at a higher level than generic data block caching. +We can either add a small per-connection LRU cache for such delta layer blocks. +Or we can merge those sequential requests into a larger vectored get request, which is designed to never read a block twice. +This amortizes the read latency for our delta layer block across the vectored get batch size (which currently is up to 32). + +There are Pageserver-internal workloads that do sequential access (compaction, image layer generation), but these +1. are not latency-critical and can do batched access outside of the `page_service` protocol constraints (image layer generation) +2. don't actually need to reconstruct images and therefore can use totally different access methods (=> compaction can use k-way merge iterators with their own internal buffering / prefetching). diff --git a/docs/rfcs/2025-04-30-pageserver-concurrent-io-on-read-path.md b/docs/rfcs/2025-04-30-pageserver-concurrent-io-on-read-path.md new file mode 100644 index 0000000000..2dc937d298 --- /dev/null +++ b/docs/rfcs/2025-04-30-pageserver-concurrent-io-on-read-path.md @@ -0,0 +1,251 @@ +# Concurrent IO for Pageserver Read Path + +Date: May 6, 2025 + +## Summary + +This document is a retroactive RFC on the Pageserver Concurrent IO work that happened in late 2024 / early 2025. + +The gist of it is that Pageserver's `Timeline::get_vectored` now _issues_ the data block read operations against layer files +_as it traverses the layer map_ and only _waits_ once, for all of them, after traversal is complete.
+ +Assuming good PS PageCache hit rates on the index blocks during traversal, this drives the "wait-for-disk" time +contribution down from `random_read_io_latency * O(number_of_values)` to `random_read_io_latency * O(1 + traversal)`. + +The motivation for why this work had to happen when it happened was the switch of Pageserver to +- not cache user data blocks in PS PageCache and +- switch to use direct IO. +More context on this is given in the complementary RFC `./rfcs/2025-04-30-direct-io-for-pageserver.md`. + +### Refs + +- Epic: https://github.com/neondatabase/neon/issues/9378 +- Prototyping happened during the Lisbon 2024 Offsite hackathon: https://github.com/neondatabase/neon/pull/9002 +- Main implementation PR with good description: https://github.com/neondatabase/neon/issues/9378 + +Design and implementation by: +- Vlad Lazar +- Christian Schwarz + +## Background & Motivation + +The Pageserver read path (`Timeline::get_vectored`) consists of two high-level steps: +- Retrieve the delta and image `Value`s required to reconstruct the requested Page@LSN (`Timeline::get_values_reconstruct_data`). +- Pass these values to walredo to reconstruct the page images. + +The read path used to be single-key but has been made multi-key some time ago. +([Internal tech talk by Vlad](https://drive.google.com/file/d/1vfY24S869UP8lEUUDHRWKF1AJn8fpWoJ/view?usp=drive_link)) +However, for simplicity, most of this doc will explain things in terms of a single key being requested. + +The `Value` retrieval step above can be broken down into the following functions: +- **Traversal** of the layer map to figure out which `Value`s from which layer files are required for the page reconstruction. +- **Read IO Planning**: planning of the read IOs that need to be issued to the layer files / filesystem / disk. + The main job here is to coalesce the small value reads into larger filesystem-level read operations.
+ This layer also takes care of direct IO alignment and size-multiple requirements (cf the RFC for details.) + Check `struct VectoredReadPlanner` and `mod vectored_dio_read` for how it's done. +- **Perform the read IO** using `tokio-epoll-uring`. + +Before this project, above functions were sequentially interleaved, meaning: +1. we would advance traversal, ... +2. discover, that we need to read a value, ... +3. read it from disk using `tokio-epoll-uring`, ... +4. goto 1 unless we're done. + +This meant that if N `Value`s need to be read to reconstruct a page, +the time we spend waiting for disk will be `random_read_io_latency * O(number_of_values)`. + +## Design + +The **traversal** and **read IO Planning** jobs still happen sequentially, layer by layer, as before. +But instead of performing the read IOs inline, we submit the IOs to a concurrent tokio task for execution. +After the last read from the last layer is submitted, we wait for the IOs to complete. + +Assuming the filesystem / disk is able to actually process the submitted IOs without queuing, +we arrive at _time spent waiting for disk_ ~ `random_read_io_latency * O(1 + traversal)`. + +Note this whole RFC is concerned with the steady state where all layer files required for reconstruction are resident on local NVMe. +Traversal will stall on on-demand layer download if a layer is not yet resident. +It cannot proceed without the layer being resident because its next step depends on the contents of the layer index. + +### Avoiding Waiting For IO During Traversal + +The `traversal` component in above time-spent-waiting-for-disk estimation is dominant and needs to be minimized. + +Before this project, traversal needed to perform IOs for the following: +1. The time we are waiting on PS PageCache to page in the visited layers' disk btree index blocks. +2.
When visiting a delta layer, reading the data block that contains a `Value` for a requested key, + to determine whether the `Value::will_init` the page and therefore traversal can stop for this key. + +The solution for (1) is to raise the PS PageCache size such that the hit rate is practically 100%. +(Check out the `Background: History Of Caching In Pageserver` section in the RFC on Direct IO for more details.) + +The solution for (2) is to source `will_init` from the disk btree index keys, which fortunately +already encode this bit of information since the introduction of the current storage/layer format. + +### Concurrent IOs, Submission & Completion + +To separate IO submission from waiting for its completion, +we introduce the notion of an `IoConcurrency` struct through which IOs are issued. + +An IO is an opaque future that +- captures the `tx` side of a `oneshot` channel +- performs the read IO by calling `VirtualFile::read_exact_at().await` +- sending the result into the `tx` + +Issuing an IO means `Box`ing the future above and handing that `Box` over to the `IoConcurrency` struct. + +The traversal code that submits the IO stores the corresponding `oneshot::Receiver` +in the `VectoredValueReconstructState`, in the place where we previously stored +the sequentially read `img` and `records` fields. + +When we're done with traversal, we wait for all submitted IOs: +for each key, there is a future that awaits all the `oneshot::Receiver`s +for that key, and then calls into walredo to reconstruct the page image. +Walredo is now invoked concurrently for each value instead of sequentially. +Walredo itself remains unchanged. + +The spawned IO futures are driven to completion by a sidecar tokio task that +is separate from the task that performs all the layer visiting and spawning of IOs. +That task receives the IO futures via an unbounded mpsc channel and +drives them to completion inside a `FuturesUnordered`.
+ +### Error handling, Panics, Cancellation-Safety + +There are two error classes during reconstruct data retrieval: +* traversal errors: index lookup, move to next layer, and the like +* value read IO errors + +A traversal error fails the entire `get_vectored` request, as before this PR. +A value read error only fails reconstruction of that value. + +Panics and dropping of the `get_vectored` future before it completes +leave the sidecar task running and do not cancel submitted IOs +(see next section for details on sidecar task lifecycle). +All of this is safe, but, today's preference in the team is to close out +all resource usage explicitly if possible, rather than cancelling + forgetting +about it on drop. So, there is a warning if we drop a +`VectoredValueReconstructState`/`ValuesReconstructState` that still has uncompleted IOs. + +### Sidecar Task Lifecycle + +The sidecar tokio task is spawned as part of the `IoConcurrency::spawn_from_conf` struct. +The `IoConcurrency` object acts as a handle through which IO futures are submitted. + +The spawned tokio task holds the `Timeline::gate` open. +It is _not_ sensitive to `Timeline::cancel`, but instead to the `IoConcurrency` object being dropped. + +Once the `IoConcurrency` struct is dropped, no new IO futures can come in +but already submitted IO futures will be driven to completion regardless. +We _could_ safely stop polling these futures because `tokio-epoll-uring` op futures are cancel-safe. +But the underlying kernel and hardware resources are not magically freed up by that. +So, again, in the interest of closing out all outstanding resource usage, we make timeline shutdown wait for sidecar tasks and their IOs to complete. +Under normal conditions, this should be in the low hundreds of microseconds. + +It is advisable to make the `IoConcurrency` as long-lived as possible to minimize the amount of +tokio task churn (=> lower pressure on tokio). Generally this means creating it "high up" in the call stack.
+The pain with this is that the `IoConcurrency` reference needs to be propagated "down" to +the (short-lived) functions/scope where we issue the IOs. +We would like to use `RequestContext` for this propagation in the future (issue [here](https://github.com/neondatabase/neon/issues/10460)). +For now, we just add another argument to the relevant code paths. + +### Feature Gating + +The `IoConcurrency` is an `enum` with two variants: `Sequential` and `SidecarTask`. + +The behavior from before this project is available through `IoConcurrency::Sequential`, +which awaits the IO futures in place, without "spawning" or "submitting" them anywhere. + +The `get_vectored_concurrent_io` pageserver config variable determines the runtime value, +**except** for the places that use `IoConcurrency::sequential` to get an `IoConcurrency` object. + +### Alternatives Explored & Caveats Encountered + +A few words on the rationale behind having a sidecar *task* and what +alternatives were considered but abandoned. + +#### Why We Need A Sidecar *Task* / Why Just `FuturesUnordered` Doesn't Work + +We explored not having a sidecar task, and instead having a `FuturesUnordered` per +`Timeline::get_vectored`. We would queue all IO futures in it and poll it for the +first time after traversal is complete (i.e., at `collect_pending_ios`). + +The obvious disadvantage, but not showstopper, is that we wouldn't be submitting +IOs until traversal is complete. + +The showstopper, however, is that deadlocks happen if we don't drive the +IO futures to completion independently of the traversal task. +The reason is that both the IO futures and the traversal task may hold _some_, +_and_ try to acquire _more_, shared limited resources.
+For example, both the traversal task and IO future may try to acquire +* a `VirtualFile` file descriptor cache slot async mutex (observed during impl) +* a `tokio-epoll-uring` submission slot (observed during impl) +* a `PageCache` slot (currently this is not the case but we may move more code into the IO futures in the future) + +#### Why We Don't Do `tokio::task`-per-IO-future + +Another option is to spawn a short-lived `tokio::task` for each IO future. +We implemented and benchmarked it during development, but found little +throughput improvement and moderate mean & tail latency degradation. +Concerns about pressure on the tokio scheduler led us to abandon this variant. + +## Future Work + +In addition to what is listed here, also check the "Punted" list in the epic: +https://github.com/neondatabase/neon/issues/9378 + +### Enable `Timeline::get` + +The only major code path that still uses `IoConcurrency::sequential` is `Timeline::get`. +The impact is that roughly the following parts of pageserver do not benefit yet: +- parts of basebackup +- reads performed by the ingest path +- most internal operations that read metadata keys (e.g. `collect_keyspace`!) + +The solution is to propagate `IoConcurrency` via `RequestContext`: https://github.com/neondatabase/neon/issues/10460 + +The tricky part is to figure out at which level of the code the `IoConcurrency` is spawned (and added to the RequestContext). + +Also, propagation via `RequestContext` makes it harder to tell during development whether a given +piece of code uses concurrent vs sequential mode: one has to recursively walk up the call tree to find the +place that puts the `IoConcurrency` into the `RequestContext`. +We'd have to use `::Sequential` as the conservative default value in a fresh `RequestContext`, and add some +observability to weed out places that fail to enrich with a properly spawned `IoConcurrency::spawn_from_conf`.
+ +### Concurrent On-Demand Downloads enabled by Detached Indices + +As stated earlier, traversal stalls on on-demand download because its next step depends on the contents of the layer index. +Once we have separated indices from data blocks (=> https://github.com/neondatabase/neon/issues/11695) +we will only need to stall if the index is not resident. The download of the data blocks can happen concurrently or in the background. For example: +- Move the `Layer::get_or_maybe_download().await` inside the IO futures. + This goes in the opposite direction of the next "future work" item below, but it's easy to do. +- Serve the IO future directly from object storage and dispatch the layer download + to some other actor, e.g., an actor that is responsible for both downloads & eviction. + +### New `tokio-epoll-uring` API That Separates Submission & Wait-For-Completion + +Instead of an `$op().await`-style API, it would be useful to have a different `tokio-epoll-uring` API +that separates enqueuing (without necessarily `io_uring_enter`ing the kernel each time), submission, +and waiting for completion. + +The `$op().await` API is too opaque, so we _have_ to stuff it into a `FuturesUnordered`. + +A split API as sketched above would allow traversal to ensure an IO operation is enqueued to the kernel/disk (and get back-pressure iff the io_uring squeue is full), +while avoiding spending CPU cycles on processing completions while we're still traversing. + +The idea gets muddied by the fact that we may self-deadlock if we submit too much without completing. +So, the submission part of the split API needs to process completions if the squeue is full. + +In any case, this split API is a precondition for the bigger issue with the design presented here, +which we discuss in the next section. + +### Opaque Futures Are Brittle + +The use of opaque futures to represent submitted IOs is a clever hack to minimize changes & allow for near-perfect feature-gating.
+However, we take on **brittleness** because callers must guarantee that the submitted futures are independent. +In our experience, it is non-trivial to identify or rule out the interdependencies. +See the lengthy doc comment on the `IoConcurrency::spawn_io` method for more details. + +The better interface and proper subsystem boundary is a _descriptive_ struct of what needs to be done ("read this range from this VirtualFile into this buffer") +and get back a means to wait for completion. +The subsystem can thereby reason on its own about how operations may be related; +unlike today, where the submitted opaque future can do just about anything. From a537b2ffd05cb952a3198ca8b36e0dfdfd26e270 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 12 May 2025 09:25:54 +0200 Subject: [PATCH 26/65] pull_timeline: check tombstones by default (#11873) Make `pull_timeline` check tombstones by default. Otherwise, we'd be recreating timelines if the order between creation and deletion got mixed up, as seen in #11838. Fixes #11838.
--- libs/safekeeper_api/src/models.rs | 1 + safekeeper/src/pull_timeline.rs | 6 +++++- storage_controller/src/service/safekeeper_reconciler.rs | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index cc31b38fe7..8658dc4011 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -299,6 +299,7 @@ pub struct PullTimelineRequest { pub tenant_id: TenantId, pub timeline_id: TimelineId, pub http_hosts: Vec, + pub ignore_tombstone: Option, } #[derive(Debug, Serialize, Deserialize)] diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 66f2877cc5..c955e667bd 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -468,12 +468,15 @@ pub async fn handle_request( assert!(status.tenant_id == request.tenant_id); assert!(status.timeline_id == request.timeline_id); + let check_tombstone = !request.ignore_tombstone.unwrap_or_default(); + match pull_timeline( status, safekeeper_host, sk_auth_token, http_client, global_timelines, + check_tombstone, ) .await { @@ -499,6 +502,7 @@ async fn pull_timeline( sk_auth_token: Option, http_client: reqwest::Client, global_timelines: Arc, + check_tombstone: bool, ) -> Result { let ttid = TenantTimelineId::new(status.tenant_id, status.timeline_id); info!( @@ -570,7 +574,7 @@ async fn pull_timeline( // Finally, load the timeline. 
let _tli = global_timelines - .load_temp_timeline(ttid, &tli_dir_path, false) + .load_temp_timeline(ttid, &tli_dir_path, check_tombstone) .await?; Ok(PullTimelineResponse { diff --git a/storage_controller/src/service/safekeeper_reconciler.rs b/storage_controller/src/service/safekeeper_reconciler.rs index 17bb132982..f756d98c64 100644 --- a/storage_controller/src/service/safekeeper_reconciler.rs +++ b/storage_controller/src/service/safekeeper_reconciler.rs @@ -331,6 +331,7 @@ impl SafekeeperReconcilerInner { http_hosts, tenant_id: req.tenant_id, timeline_id, + ignore_tombstone: Some(false), }; success = self .reconcile_inner( From 307e1e64c8f9edf641ae92e920821af4eb013b09 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 12 May 2025 17:17:35 +0800 Subject: [PATCH 27/65] fix(scrubber): more logs wrt relic timelines (#11895) ## Problem Further investigation on https://github.com/neondatabase/neon/issues/11159 reveals that the list_tenant function can find all the shards of the tenant, but then the shard gets missing during the gc timeline list blob. One reason could be that in some ways the timeline gets recognized as a relic timeline. ## Summary of changes Add logging to help identify the issue. Signed-off-by: Alex Chi Z --- storage_scrubber/src/checks.rs | 3 ++- storage_scrubber/src/pageserver_physical_gc.rs | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index b151b612bf..40f3523a7e 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -355,6 +355,7 @@ pub(crate) async fn list_timeline_blobs( match res { ListTimelineBlobsResult::Ready(data) => Ok(data), ListTimelineBlobsResult::MissingIndexPart(_) => { + tracing::warn!("listing raced with removal of an index, retrying"); // Retry if listing raced with removal of an index let data = list_timeline_blobs_impl(remote_client, id, root_target) .await? 
@@ -441,7 +442,7 @@ async fn list_timeline_blobs_impl( } if index_part_keys.is_empty() && s3_layers.is_empty() { - tracing::debug!("Timeline is empty: expected post-deletion state."); + tracing::info!("Timeline is empty: expected post-deletion state."); if initdb_archive { tracing::info!("Timeline is post deletion but initdb archive is still present."); } diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index e1a4095a3c..49ab192285 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -593,6 +593,7 @@ async fn gc_timeline( index_part_snapshot_time: _, } => (index_part, *index_part_generation, data.unused_index_keys), BlobDataParseResult::Relic => { + tracing::info!("Skipping timeline {ttid}, it is a relic"); // Post-deletion tenant location: don't try and GC it. return Ok(summary); } From a618056770cf83e3a6ff44ccea92d0e15cc1c67a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Ko=C5=82odziejczak?= <31549762+mrl5@users.noreply.github.com> Date: Mon, 12 May 2025 13:24:33 +0200 Subject: [PATCH 28/65] chore(compute): skip audit logs for pg_session_jwt extension (#11883) references https://github.com/neondatabase/cloud/issues/28480#issuecomment-2866961124 related https://github.com/neondatabase/cloud/issues/28863 cc @MihaiBojin @conradludgate --- compute_tools/src/config.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 42d245f55a..933b30134f 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -224,7 +224,10 @@ pub fn write_postgres_conf( writeln!(file, "pgaudit.log_rotation_age=5")?; // Enable audit logs for pg_session_jwt extension - writeln!(file, "pg_session_jwt.audit_log=on")?; + // TODO: Consider a good approach for shipping pg_session_jwt logs to the same sink as + // pgAudit - additional context in 
https://github.com/neondatabase/cloud/issues/28863 + // + // writeln!(file, "pg_session_jwt.audit_log=on")?; // Add audit shared_preload_libraries, if they are not present. // From a77919f4b2668277795d731a343f0955bf144eb7 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 12 May 2025 16:48:48 +0100 Subject: [PATCH 29/65] merge pg-sni-router into proxy (#11882) ## Problem We realised that pg-sni-router doesn't need to be separate from proxy. just a separate port. ## Summary of changes Add pg-sni-router config to proxy and expose the service. --- proxy/src/binary/local_proxy.rs | 4 +- proxy/src/binary/pg_sni_router.rs | 106 +++++---- proxy/src/binary/proxy.rs | 212 ++++++++++++------ proxy/src/tls/server_config.rs | 33 +-- test_runner/fixtures/neon_fixtures.py | 25 +++ .../regress/test_proxy_metric_collection.py | 4 + test_runner/regress/test_sni_router.py | 26 ++- 7 files changed, 283 insertions(+), 127 deletions(-) diff --git a/proxy/src/binary/local_proxy.rs b/proxy/src/binary/local_proxy.rs index ee7f6ffcd7..a566383390 100644 --- a/proxy/src/binary/local_proxy.rs +++ b/proxy/src/binary/local_proxy.rs @@ -423,8 +423,8 @@ async fn refresh_config_inner( if let Some(tls_config) = data.tls { let tls_config = tokio::task::spawn_blocking(move || { crate::tls::server_config::configure_tls( - &tls_config.key_path, - &tls_config.cert_path, + tls_config.key_path.as_ref(), + tls_config.cert_path.as_ref(), None, false, ) diff --git a/proxy/src/binary/pg_sni_router.rs b/proxy/src/binary/pg_sni_router.rs index 19be058ac3..2239d064b2 100644 --- a/proxy/src/binary/pg_sni_router.rs +++ b/proxy/src/binary/pg_sni_router.rs @@ -1,8 +1,10 @@ -/// A stand-alone program that routes connections, e.g. from -/// `aaa--bbb--1234.external.domain` to `aaa.bbb.internal.domain:1234`. -/// -/// This allows connecting to pods/services running in the same Kubernetes cluster from -/// the outside. Similar to an ingress controller for HTTPS. +//! 
A stand-alone program that routes connections, e.g. from +//! `aaa--bbb--1234.external.domain` to `aaa.bbb.internal.domain:1234`. +//! +//! This allows connecting to pods/services running in the same Kubernetes cluster from +//! the outside. Similar to an ingress controller for HTTPS. + +use std::path::Path; use std::{net::SocketAddr, sync::Arc}; use anyhow::{Context, anyhow, bail, ensure}; @@ -86,46 +88,7 @@ pub async fn run() -> anyhow::Result<()> { args.get_one::("tls-key"), args.get_one::("tls-cert"), ) { - (Some(key_path), Some(cert_path)) => { - let key = { - let key_bytes = std::fs::read(key_path).context("TLS key file")?; - - let mut keys = - rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]).collect_vec(); - - ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); - PrivateKeyDer::Pkcs8( - keys.pop() - .expect("keys should not be empty") - .context(format!("Failed to read TLS keys at '{key_path}'"))?, - ) - }; - - let cert_chain_bytes = std::fs::read(cert_path) - .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; - - let cert_chain: Vec<_> = { - rustls_pemfile::certs(&mut &cert_chain_bytes[..]) - .try_collect() - .with_context(|| { - format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.") - })? - }; - - // needed for channel bindings - let first_cert = cert_chain.first().context("missing certificate")?; - let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; - - let tls_config = - rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider())) - .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) - .context("ring should support TLS1.2 and TLS1.3")? - .with_no_client_auth() - .with_single_cert(cert_chain, key)? 
- .into(); - - (tls_config, tls_server_end_point) - } + (Some(key_path), Some(cert_path)) => parse_tls(key_path.as_ref(), cert_path.as_ref())?, _ => bail!("tls-key and tls-cert must be specified"), }; @@ -188,7 +151,58 @@ pub async fn run() -> anyhow::Result<()> { match signal {} } -async fn task_main( +pub(super) fn parse_tls( + key_path: &Path, + cert_path: &Path, +) -> anyhow::Result<(Arc, TlsServerEndPoint)> { + let key = { + let key_bytes = std::fs::read(key_path).context("TLS key file")?; + + let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]).collect_vec(); + + ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); + PrivateKeyDer::Pkcs8( + keys.pop() + .expect("keys should not be empty") + .context(format!( + "Failed to read TLS keys at '{}'", + key_path.display() + ))?, + ) + }; + + let cert_chain_bytes = std::fs::read(cert_path).context(format!( + "Failed to read TLS cert file at '{}.'", + cert_path.display() + ))?; + + let cert_chain: Vec<_> = { + rustls_pemfile::certs(&mut &cert_chain_bytes[..]) + .try_collect() + .with_context(|| { + format!( + "Failed to read TLS certificate chain from bytes from file at '{}'.", + cert_path.display() + ) + })? + }; + + // needed for channel bindings + let first_cert = cert_chain.first().context("missing certificate")?; + let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; + + let tls_config = + rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider())) + .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) + .context("ring should support TLS1.2 and TLS1.3")? + .with_no_client_auth() + .with_single_cert(cert_chain, key)? 
+ .into(); + + Ok((tls_config, tls_server_end_point)) +} + +pub(super) async fn task_main( dest_suffix: Arc, tls_config: Arc, compute_tls_config: Option>, diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index e03f2f33d9..fe0d551f7f 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -1,9 +1,10 @@ use std::net::SocketAddr; +use std::path::PathBuf; use std::pin::pin; use std::sync::Arc; use std::time::Duration; -use anyhow::bail; +use anyhow::{bail, ensure}; use arc_swap::ArcSwapOption; use futures::future::Either; use remote_storage::RemoteStorageConfig; @@ -62,18 +63,18 @@ struct ProxyCliArgs { region: String, /// listen for incoming client connections on ip:port #[clap(short, long, default_value = "127.0.0.1:4432")] - proxy: String, + proxy: SocketAddr, #[clap(value_enum, long, default_value_t = AuthBackendType::ConsoleRedirect)] auth_backend: AuthBackendType, /// listen for management callback connection on ip:port #[clap(short, long, default_value = "127.0.0.1:7000")] - mgmt: String, + mgmt: SocketAddr, /// listen for incoming http connections (metrics, etc) on ip:port #[clap(long, default_value = "127.0.0.1:7001")] - http: String, + http: SocketAddr, /// listen for incoming wss connections on ip:port #[clap(long)] - wss: Option, + wss: Option, /// redirect unauthenticated users to the given uri in case of console redirect auth #[clap(short, long, default_value = "http://localhost:3000/psql_session/")] uri: String, @@ -99,18 +100,18 @@ struct ProxyCliArgs { /// /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir #[clap(short = 'k', long, alias = "ssl-key")] - tls_key: Option, + tls_key: Option, /// path to TLS cert for client postgres connections /// /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir #[clap(short = 'c', long, alias = "ssl-cert")] - tls_cert: Option, + tls_cert: Option, /// Allow writing TLS session keys to the given file pointed 
to by the environment variable `SSLKEYLOGFILE`. #[clap(long, alias = "allow-ssl-keylogfile")] allow_tls_keylogfile: bool, /// path to directory with TLS certificates for client postgres connections #[clap(long)] - certs_dir: Option, + certs_dir: Option, /// timeout for the TLS handshake #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] handshake_timeout: tokio::time::Duration, @@ -229,6 +230,9 @@ struct ProxyCliArgs { // TODO: rename to `console_redirect_confirmation_timeout`. #[clap(long, default_value = "2m", value_parser = humantime::parse_duration)] webauth_confirmation_timeout: std::time::Duration, + + #[clap(flatten)] + pg_sni_router: PgSniRouterArgs, } #[derive(clap::Args, Clone, Copy, Debug)] @@ -277,6 +281,25 @@ struct SqlOverHttpArgs { sql_over_http_max_response_size_bytes: usize, } +#[derive(clap::Args, Clone, Debug)] +struct PgSniRouterArgs { + /// listen for incoming client connections on ip:port + #[clap(id = "sni-router-listen", long, default_value = "127.0.0.1:4432")] + listen: SocketAddr, + /// listen for incoming client connections on ip:port, requiring TLS to compute + #[clap(id = "sni-router-listen-tls", long, default_value = "127.0.0.1:4433")] + listen_tls: SocketAddr, + /// path to TLS key for client postgres connections + #[clap(id = "sni-router-tls-key", long)] + tls_key: Option, + /// path to TLS cert for client postgres connections + #[clap(id = "sni-router-tls-cert", long)] + tls_cert: Option, + /// append this domain zone to the SNI hostname to get the destination address + #[clap(id = "sni-router-destination", long)] + dest: Option, +} + pub async fn run() -> anyhow::Result<()> { let _logging_guard = crate::logging::init().await?; let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); @@ -307,73 +330,51 @@ pub async fn run() -> anyhow::Result<()> { Either::Right(auth_backend) => info!("Authentication backend: {auth_backend:?}"), } info!("Using region: {}", args.aws_region); - - 
// TODO: untangle the config args - let regional_redis_client = match (args.redis_auth_type.as_str(), &args.redis_notifications) { - ("plain", redis_url) => match redis_url { - None => { - bail!("plain auth requires redis_notifications to be set"); - } - Some(url) => { - Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url.clone())) - } - }, - ("irsa", _) => match (&args.redis_host, args.redis_port) { - (Some(host), Some(port)) => Some( - ConnectionWithCredentialsProvider::new_with_credentials_provider( - host.to_string(), - port, - elasticache::CredentialsProvider::new( - args.aws_region, - args.redis_cluster_name, - args.redis_user_id, - ) - .await, - ), - ), - (None, None) => { - warn!( - "irsa auth requires redis-host and redis-port to be set, continuing without regional_redis_client" - ); - None - } - _ => { - bail!("redis-host and redis-port must be specified together"); - } - }, - _ => { - bail!("unknown auth type given"); - } - }; - - let redis_notifications_client = if let Some(url) = args.redis_notifications { - Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url)) - } else { - regional_redis_client.clone() - }; + let (regional_redis_client, redis_notifications_client) = configure_redis(&args).await?; // Check that we can bind to address before further initialization - let http_address: SocketAddr = args.http.parse()?; - info!("Starting http on {http_address}"); - let http_listener = TcpListener::bind(http_address).await?.into_std()?; + info!("Starting http on {}", args.http); + let http_listener = TcpListener::bind(args.http).await?.into_std()?; - let mgmt_address: SocketAddr = args.mgmt.parse()?; - info!("Starting mgmt on {mgmt_address}"); - let mgmt_listener = TcpListener::bind(mgmt_address).await?; + info!("Starting mgmt on {}", args.mgmt); + let mgmt_listener = TcpListener::bind(args.mgmt).await?; let proxy_listener = if args.is_auth_broker { None } else { - let proxy_address: SocketAddr = args.proxy.parse()?; - 
info!("Starting proxy on {proxy_address}"); + info!("Starting proxy on {}", args.proxy); + Some(TcpListener::bind(args.proxy).await?) + }; - Some(TcpListener::bind(proxy_address).await?) + let sni_router_listeners = { + let args = &args.pg_sni_router; + if args.dest.is_some() { + ensure!( + args.tls_key.is_some(), + "sni-router-tls-key must be provided" + ); + ensure!( + args.tls_cert.is_some(), + "sni-router-tls-cert must be provided" + ); + + info!( + "Starting pg-sni-router on {} and {}", + args.listen, args.listen_tls + ); + + Some(( + TcpListener::bind(args.listen).await?, + TcpListener::bind(args.listen_tls).await?, + )) + } else { + None + } }; // TODO: rename the argument to something like serverless. // It now covers more than just websockets, it also covers SQL over HTTP. let serverless_listener = if let Some(serverless_address) = args.wss { - let serverless_address: SocketAddr = serverless_address.parse()?; info!("Starting wss on {serverless_address}"); Some(TcpListener::bind(serverless_address).await?) } else if args.is_auth_broker { @@ -458,6 +459,37 @@ pub async fn run() -> anyhow::Result<()> { } } + // spawn pg-sni-router mode. 
+ if let Some((listen, listen_tls)) = sni_router_listeners { + let args = args.pg_sni_router; + let dest = args.dest.expect("already asserted it is set"); + let key_path = args.tls_key.expect("already asserted it is set"); + let cert_path = args.tls_cert.expect("already asserted it is set"); + + let (tls_config, tls_server_end_point) = + super::pg_sni_router::parse_tls(&key_path, &cert_path)?; + + let dest = Arc::new(dest); + + client_tasks.spawn(super::pg_sni_router::task_main( + dest.clone(), + tls_config.clone(), + None, + tls_server_end_point, + listen, + cancellation_token.clone(), + )); + + client_tasks.spawn(super::pg_sni_router::task_main( + dest, + tls_config, + Some(config.connect_to_compute.tls.clone()), + tls_server_end_point, + listen_tls, + cancellation_token.clone(), + )); + } + client_tasks.spawn(crate::context::parquet::worker( cancellation_token.clone(), args.parquet_upload, @@ -565,7 +597,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { (Some(key_path), Some(cert_path)) => Some(config::configure_tls( key_path, cert_path, - args.certs_dir.as_ref(), + args.certs_dir.as_deref(), args.allow_tls_keylogfile, )?), (None, None) => None, @@ -811,6 +843,60 @@ fn build_auth_backend( } } +async fn configure_redis( + args: &ProxyCliArgs, +) -> anyhow::Result<( + Option, + Option, +)> { + // TODO: untangle the config args + let regional_redis_client = match (args.redis_auth_type.as_str(), &args.redis_notifications) { + ("plain", redis_url) => match redis_url { + None => { + bail!("plain auth requires redis_notifications to be set"); + } + Some(url) => { + Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url.clone())) + } + }, + ("irsa", _) => match (&args.redis_host, args.redis_port) { + (Some(host), Some(port)) => Some( + ConnectionWithCredentialsProvider::new_with_credentials_provider( + host.to_string(), + port, + elasticache::CredentialsProvider::new( + args.aws_region.clone(), + 
args.redis_cluster_name.clone(), + args.redis_user_id.clone(), + ) + .await, + ), + ), + (None, None) => { + // todo: upgrade to error? + warn!( + "irsa auth requires redis-host and redis-port to be set, continuing without regional_redis_client" + ); + None + } + _ => { + bail!("redis-host and redis-port must be specified together"); + } + }, + _ => { + bail!("unknown auth type given"); + } + }; + + let redis_notifications_client = if let Some(url) = &args.redis_notifications { + Some(ConnectionWithCredentialsProvider::new_with_static_credentials(&**url)) + } else { + regional_redis_client.clone() + }; + + Ok((regional_redis_client, redis_notifications_client)) +} + #[cfg(test)] mod tests { use std::time::Duration; diff --git a/proxy/src/tls/server_config.rs b/proxy/src/tls/server_config.rs index 8f8917ef62..66c53b3aff 100644 --- a/proxy/src/tls/server_config.rs +++ b/proxy/src/tls/server_config.rs @@ -1,4 +1,5 @@ use std::collections::{HashMap, HashSet}; +use std::path::Path; use std::sync::Arc; use anyhow::{Context, bail}; @@ -21,9 +22,9 @@ pub struct TlsConfig { /// Configure TLS for the main endpoint. 
pub fn configure_tls( - key_path: &str, - cert_path: &str, - certs_dir: Option<&String>, + key_path: &Path, + cert_path: &Path, + certs_dir: Option<&Path>, allow_tls_keylogfile: bool, ) -> anyhow::Result { // add default certificate @@ -39,8 +40,7 @@ pub fn configure_tls( let key_path = path.join("tls.key"); let cert_path = path.join("tls.crt"); if key_path.exists() && cert_path.exists() { - cert_resolver - .add_cert_path(&key_path.to_string_lossy(), &cert_path.to_string_lossy())?; + cert_resolver.add_cert_path(&key_path, &cert_path)?; } } } @@ -86,7 +86,7 @@ pub struct CertResolver { } impl CertResolver { - fn parse_new(key_path: &str, cert_path: &str) -> anyhow::Result { + fn parse_new(key_path: &Path, cert_path: &Path) -> anyhow::Result { let (priv_key, cert_chain) = parse_key_cert(key_path, cert_path)?; Self::new(priv_key, cert_chain) } @@ -103,7 +103,7 @@ impl CertResolver { Ok(Self { certs, default }) } - fn add_cert_path(&mut self, key_path: &str, cert_path: &str) -> anyhow::Result<()> { + fn add_cert_path(&mut self, key_path: &Path, cert_path: &Path) -> anyhow::Result<()> { let (priv_key, cert_chain) = parse_key_cert(key_path, cert_path)?; self.add_cert(priv_key, cert_chain) } @@ -124,26 +124,29 @@ impl CertResolver { } fn parse_key_cert( - key_path: &str, - cert_path: &str, + key_path: &Path, + cert_path: &Path, ) -> anyhow::Result<(PrivateKeyDer<'static>, Vec>)> { let priv_key = { let key_bytes = std::fs::read(key_path) - .with_context(|| format!("Failed to read TLS keys at '{key_path}'"))?; + .with_context(|| format!("Failed to read TLS keys at '{}'", key_path.display()))?; rustls_pemfile::private_key(&mut &key_bytes[..]) - .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))? - .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))? + .with_context(|| format!("Failed to parse TLS keys at '{}'", key_path.display()))? + .with_context(|| format!("Failed to parse TLS keys at '{}'", key_path.display()))? 
}; - let cert_chain_bytes = std::fs::read(cert_path) - .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; + let cert_chain_bytes = std::fs::read(cert_path).context(format!( + "Failed to read TLS cert file at '{}.'", + cert_path.display() + ))?; let cert_chain = { rustls_pemfile::certs(&mut &cert_chain_bytes[..]) .try_collect() .with_context(|| { format!( - "Failed to read TLS certificate chain from bytes from file at '{cert_path}'." + "Failed to read TLS certificate chain from bytes from file at '{}'.", + cert_path.display() ) })? }; diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 8f56ee4392..2801a0e867 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3607,6 +3607,8 @@ class NeonProxy(PgProtocol): http_port: int, mgmt_port: int, external_http_port: int, + router_port: int, + router_tls_port: int, auth_backend: NeonProxy.AuthBackend, metric_collection_endpoint: str | None = None, metric_collection_interval: str | None = None, @@ -3623,6 +3625,8 @@ class NeonProxy(PgProtocol): self.test_output_dir = test_output_dir self.proxy_port = proxy_port self.mgmt_port = mgmt_port + self.router_port = router_port + self.router_tls_port = router_tls_port self.auth_backend = auth_backend self.metric_collection_endpoint = metric_collection_endpoint self.metric_collection_interval = metric_collection_interval @@ -3637,6 +3641,14 @@ class NeonProxy(PgProtocol): key_path = self.test_output_dir / "proxy.key" generate_proxy_tls_certs("*.local.neon.build", key_path, crt_path) + # generate key for pg-sni-router. 
+ # endpoint.namespace.local.neon.build resolves to 127.0.0.1 + generate_proxy_tls_certs( + "endpoint.namespace.local.neon.build", + self.test_output_dir / "router.key", + self.test_output_dir / "router.crt", + ) + args = [ str(self.neon_binpath / "proxy"), *["--http", f"{self.host}:{self.http_port}"], @@ -3646,6 +3658,11 @@ class NeonProxy(PgProtocol): *["--sql-over-http-timeout", f"{self.http_timeout_seconds}s"], *["-c", str(crt_path)], *["-k", str(key_path)], + *["--sni-router-listen", f"{self.host}:{self.router_port}"], + *["--sni-router-listen-tls", f"{self.host}:{self.router_tls_port}"], + *["--sni-router-tls-cert", str(self.test_output_dir / "router.crt")], + *["--sni-router-tls-key", str(self.test_output_dir / "router.key")], + *["--sni-router-destination", "local.neon.build"], *self.auth_backend.extra_args(), ] @@ -3945,6 +3962,8 @@ def link_proxy( proxy_port = port_distributor.get_port() mgmt_port = port_distributor.get_port() external_http_port = port_distributor.get_port() + router_port = port_distributor.get_port() + router_tls_port = port_distributor.get_port() with NeonProxy( neon_binpath=neon_binpath, @@ -3952,6 +3971,8 @@ def link_proxy( proxy_port=proxy_port, http_port=http_port, mgmt_port=mgmt_port, + router_port=router_port, + router_tls_port=router_tls_port, external_http_port=external_http_port, auth_backend=NeonProxy.Link(), ) as proxy: @@ -3985,6 +4006,8 @@ def static_proxy( mgmt_port = port_distributor.get_port() http_port = port_distributor.get_port() external_http_port = port_distributor.get_port() + router_port = port_distributor.get_port() + router_tls_port = port_distributor.get_port() with NeonProxy( neon_binpath=neon_binpath, @@ -3992,6 +4015,8 @@ def static_proxy( proxy_port=proxy_port, http_port=http_port, mgmt_port=mgmt_port, + router_port=router_port, + router_tls_port=router_tls_port, external_http_port=external_http_port, auth_backend=NeonProxy.Postgres(auth_endpoint), ) as proxy: diff --git 
a/test_runner/regress/test_proxy_metric_collection.py b/test_runner/regress/test_proxy_metric_collection.py index 85d8a6daaa..7442d50f68 100644 --- a/test_runner/regress/test_proxy_metric_collection.py +++ b/test_runner/regress/test_proxy_metric_collection.py @@ -52,6 +52,8 @@ def proxy_with_metric_collector( proxy_port = port_distributor.get_port() mgmt_port = port_distributor.get_port() external_http_port = port_distributor.get_port() + router_port = port_distributor.get_port() + router_tls_port = port_distributor.get_port() (host, port) = httpserver_listen_address metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events" @@ -63,6 +65,8 @@ def proxy_with_metric_collector( proxy_port=proxy_port, http_port=http_port, mgmt_port=mgmt_port, + router_port=router_port, + router_tls_port=router_tls_port, external_http_port=external_http_port, metric_collection_endpoint=metric_collection_endpoint, metric_collection_interval=metric_collection_interval, diff --git a/test_runner/regress/test_sni_router.py b/test_runner/regress/test_sni_router.py index 19952fc71b..61893f22ba 100644 --- a/test_runner/regress/test_sni_router.py +++ b/test_runner/regress/test_sni_router.py @@ -6,7 +6,7 @@ from typing import TYPE_CHECKING import backoff from fixtures.log_helper import log -from fixtures.neon_fixtures import PgProtocol, VanillaPostgres +from fixtures.neon_fixtures import NeonProxy, PgProtocol, VanillaPostgres if TYPE_CHECKING: from pathlib import Path @@ -41,6 +41,7 @@ class PgSniRouter(PgProtocol): self, neon_binpath: Path, port: int, + tls_port: int, destination: str, tls_cert: Path, tls_key: Path, @@ -53,6 +54,7 @@ class PgSniRouter(PgProtocol): self.host = host self.neon_binpath = neon_binpath self.port = port + self.tls_port = tls_port self.destination = destination self.tls_cert = tls_cert self.tls_key = tls_key @@ -64,6 +66,7 @@ class PgSniRouter(PgProtocol): args = [ str(self.neon_binpath / "pg_sni_router"), *["--listen", f"127.0.0.1:{self.port}"], + 
*["--listen-tls", f"127.0.0.1:{self.tls_port}"], *["--tls-cert", str(self.tls_cert)], *["--tls-key", str(self.tls_key)], *["--destination", self.destination], @@ -127,10 +130,12 @@ def test_pg_sni_router( pg_port = vanilla_pg.default_options["port"] router_port = port_distributor.get_port() + router_tls_port = port_distributor.get_port() with PgSniRouter( neon_binpath=neon_binpath, port=router_port, + tls_port=router_tls_port, destination="local.neon.build", tls_cert=test_output_dir / "router.crt", tls_key=test_output_dir / "router.key", @@ -146,3 +151,22 @@ def test_pg_sni_router( hostaddr="127.0.0.1", ) assert out[0][0] == 1 + + +def test_pg_sni_router_in_proxy( + static_proxy: NeonProxy, + vanilla_pg: VanillaPostgres, +): + # static_proxy starts this. + assert vanilla_pg.is_running() + pg_port = vanilla_pg.default_options["port"] + + out = static_proxy.safe_psql( + "select 1", + dbname="postgres", + sslmode="require", + host=f"endpoint--namespace--{pg_port}.local.neon.build", + hostaddr="127.0.0.1", + port=static_proxy.router_port, + ) + assert out[0][0] == 1 From 9971fba5848ca3928b54e123a338d454e6c65283 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 12 May 2025 12:36:07 -0500 Subject: [PATCH 30/65] Properly configure the dynamic loader to load our compiled libraries (#11858) The first line in /etc/ld.so.conf is: /etc/ld.so.conf.d/* We want to control library load order so that our compiled binaries are picked up before others from system packages. The previous solution allowed the system libraries to load before ours. 
Part-of: https://github.com/neondatabase/neon/issues/11857 Signed-off-by: Tristan Partin --- compute/compute-node.Dockerfile | 3 ++- compute/etc/ld.so.conf.d/00-neon.conf | 1 + docker-compose/compute_wrapper/shell/compute.sh | 8 ++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) create mode 100644 compute/etc/ld.so.conf.d/00-neon.conf diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 6233eaf709..e6e6053554 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1971,7 +1971,8 @@ COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml # Make the libraries we built available -RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig +COPY --chmod=0666 compute/etc/ld.so.conf.d/00-neon.conf /etc/ld.so.conf.d/00-neon.conf +RUN /sbin/ldconfig # rsyslog config permissions # directory for rsyslogd pid file diff --git a/compute/etc/ld.so.conf.d/00-neon.conf b/compute/etc/ld.so.conf.d/00-neon.conf new file mode 100644 index 0000000000..e8e4bdcd42 --- /dev/null +++ b/compute/etc/ld.so.conf.d/00-neon.conf @@ -0,0 +1 @@ +/usr/local/lib diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh index 723b2f8afb..20a1ffb7a0 100755 --- a/docker-compose/compute_wrapper/shell/compute.sh +++ b/docker-compose/compute_wrapper/shell/compute.sh @@ -14,6 +14,14 @@ PG_VERSION=${PG_VERSION:-14} CONFIG_FILE_ORG=/var/db/postgres/configs/config.json CONFIG_FILE=/tmp/config.json +# Test that the first library path that the dynamic loader looks in is the path +# that we use for custom compiled software +first_path="$(ldconfig --verbose 2>/dev/null \ + | grep --invert-match ^$'\t' \ + | cut --delimiter=: --fields=1 \ + | head --lines=1)" +test "$first_path" == '/usr/local/lib' || true # Remove the || 
true in a follow-up PR. Needed for backwards compat. + echo "Waiting pageserver become ready." while ! nc -z pageserver 6400; do sleep 1; From a113c48c43c9ff0130e404e47a55e4721bbb63a4 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 13 May 2025 09:33:53 +0100 Subject: [PATCH 31/65] proxy: fix redis batching support (#11905) ## Problem For `StoreCancelKey`, we were inserting 2 commands, but we were not inserting two replies. This mismatch leads to errors when decoding the response. ## Summary of changes Abstract the command + reply pipeline so that commands and replies are registered at the same time. --- proxy/src/cancellation.rs | 125 ++++++++++++++++++++++++-------------- proxy/src/redis/kv_ops.rs | 2 +- 2 files changed, 79 insertions(+), 48 deletions(-) diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index c5ba04eb8c..f34fb747ca 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -6,12 +6,12 @@ use ipnet::{IpNet, Ipv4Net, Ipv6Net}; use postgres_client::CancelToken; use postgres_client::tls::MakeTlsConnect; use pq_proto::CancelKeyData; -use redis::{FromRedisValue, Pipeline, Value, pipe}; +use redis::{Cmd, FromRedisValue, Value}; use serde::{Deserialize, Serialize}; use thiserror::Error; use tokio::net::TcpStream; use tokio::sync::{mpsc, oneshot}; -use tracing::{debug, info, warn}; +use tracing::{debug, error, info, warn}; use crate::auth::backend::ComputeUserInfo; use crate::auth::{AuthError, check_peer_addr_is_in_list}; @@ -56,8 +56,70 @@ pub enum CancelKeyOp { }, } +pub struct Pipeline { + inner: redis::Pipeline, + replies: Vec, +} + +impl Pipeline { + fn with_capacity(n: usize) -> Self { + Self { + inner: redis::Pipeline::with_capacity(n), + replies: Vec::with_capacity(n), + } + } + + async fn execute(&mut self, client: &mut RedisKVClient) { + let responses = self.replies.len(); + let batch_size = self.inner.len(); + + match client.query(&self.inner).await { + // for each reply, we expect that many values. 
+ Ok(Value::Array(values)) if values.len() == responses => { + debug!( + batch_size, + responses, "successfully completed cancellation jobs", + ); + for (value, reply) in std::iter::zip(values, self.replies.drain(..)) { + reply.send_value(value); + } + } + Ok(value) => { + error!(batch_size, ?value, "unexpected redis return value"); + for reply in self.replies.drain(..) { + reply.send_err(anyhow!("incorrect response type from redis")); + } + } + Err(err) => { + for reply in self.replies.drain(..) { + reply.send_err(anyhow!("could not send cmd to redis: {err}")); + } + } + } + + self.inner.clear(); + self.replies.clear(); + } + + fn add_command_with_reply(&mut self, cmd: Cmd, reply: CancelReplyOp) { + self.inner.add_command(cmd); + self.replies.push(reply); + } + + fn add_command_no_reply(&mut self, cmd: Cmd) { + self.inner.add_command(cmd).ignore(); + } + + fn add_command(&mut self, cmd: Cmd, reply: Option) { + match reply { + Some(reply) => self.add_command_with_reply(cmd, reply), + None => self.add_command_no_reply(cmd), + } + } +} + impl CancelKeyOp { - fn register(self, pipe: &mut Pipeline) -> Option { + fn register(self, pipe: &mut Pipeline) { #[allow(clippy::used_underscore_binding)] match self { CancelKeyOp::StoreCancelKey { @@ -68,18 +130,18 @@ impl CancelKeyOp { _guard, expire, } => { - pipe.hset(&key, field, value); - pipe.expire(key, expire); - let resp_tx = resp_tx?; - Some(CancelReplyOp::StoreCancelKey { resp_tx, _guard }) + let reply = + resp_tx.map(|resp_tx| CancelReplyOp::StoreCancelKey { resp_tx, _guard }); + pipe.add_command(Cmd::hset(&key, field, value), reply); + pipe.add_command_no_reply(Cmd::expire(key, expire)); } CancelKeyOp::GetCancelData { key, resp_tx, _guard, } => { - pipe.hgetall(key); - Some(CancelReplyOp::GetCancelData { resp_tx, _guard }) + let reply = CancelReplyOp::GetCancelData { resp_tx, _guard }; + pipe.add_command_with_reply(Cmd::hgetall(key), reply); } CancelKeyOp::RemoveCancelKey { key, @@ -87,9 +149,9 @@ impl CancelKeyOp { 
resp_tx, _guard, } => { - pipe.hdel(key, field); - let resp_tx = resp_tx?; - Some(CancelReplyOp::RemoveCancelKey { resp_tx, _guard }) + let reply = + resp_tx.map(|resp_tx| CancelReplyOp::RemoveCancelKey { resp_tx, _guard }); + pipe.add_command(Cmd::hdel(key, field), reply); } } } @@ -170,8 +232,8 @@ pub async fn handle_cancel_messages( client: &mut RedisKVClient, mut rx: mpsc::Receiver, ) -> anyhow::Result<()> { - let mut batch = Vec::new(); - let mut replies = vec![]; + let mut batch = Vec::with_capacity(BATCH_SIZE); + let mut pipeline = Pipeline::with_capacity(BATCH_SIZE); loop { if rx.recv_many(&mut batch, BATCH_SIZE).await == 0 { @@ -182,42 +244,11 @@ pub async fn handle_cancel_messages( let batch_size = batch.len(); debug!(batch_size, "running cancellation jobs"); - let mut pipe = pipe(); for msg in batch.drain(..) { - if let Some(reply) = msg.register(&mut pipe) { - replies.push(reply); - } else { - pipe.ignore(); - } + msg.register(&mut pipeline); } - let responses = replies.len(); - - match client.query(pipe).await { - // for each reply, we expect that many values. - Ok(Value::Array(values)) if values.len() == responses => { - debug!( - batch_size, - responses, "successfully completed cancellation jobs", - ); - for (value, reply) in std::iter::zip(values, replies.drain(..)) { - reply.send_value(value); - } - } - Ok(value) => { - debug!(?value, "unexpected redis return value"); - for reply in replies.drain(..) { - reply.send_err(anyhow!("incorrect response type from redis")); - } - } - Err(err) => { - for reply in replies.drain(..) 
{ - reply.send_err(anyhow!("could not send cmd to redis: {err}")); - } - } - } - - replies.clear(); + pipeline.execute(client).await; } } diff --git a/proxy/src/redis/kv_ops.rs b/proxy/src/redis/kv_ops.rs index aa627b29a6..f71730c533 100644 --- a/proxy/src/redis/kv_ops.rs +++ b/proxy/src/redis/kv_ops.rs @@ -47,7 +47,7 @@ impl RedisKVClient { pub(crate) async fn query( &mut self, - q: impl Queryable, + q: &impl Queryable, ) -> anyhow::Result { if !self.limiter.check() { tracing::info!("Rate limit exceeded. Skipping query"); From a9979620c508a089f3f3d6e020877349ff555b0f Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 13 May 2025 16:53:35 +0800 Subject: [PATCH 32/65] fix(remote_storage): continue on Azure+AWS retryable error (#11903) ## Problem We implemented the retry logic in AWS S3 but not in Azure. Therefore, if there is an error during Azure listing, we will return an Err to the caller, and the stream will end without fetching more tenants. Part of https://github.com/neondatabase/neon/issues/11159 Without this fix, listing tenants will stop once we hit an error (could be network errors -- that happens more frequently on Azure). If we happen to stop at a point where we only listed part of the shards, we will hit the "missed shards" error or even remove layers being used. This bug (for Azure listing) was introduced as part of https://github.com/neondatabase/neon/pull/9840 There is also a bug that stops the stream for AWS when there's a timeout -- this is fixed along with this patch. ## Summary of changes Retry the request on error. In the future, we should make such streams return something like `Result>` where the outer result is the error that ends the stream and the inner one is the error that should be retried by the caller. 
--------- Signed-off-by: Alex Chi Z --- libs/remote_storage/src/azure_blob.rs | 11 +++++++++-- libs/remote_storage/src/s3_bucket.rs | 9 ++++++++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index a5cddb840f..5363e935e3 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -330,11 +330,18 @@ impl AzureBlobStorage { if let Err(DownloadError::Timeout) = &next_item { timeout_try_cnt += 1; if timeout_try_cnt <= 5 { - continue; + continue 'outer; } } - let next_item = next_item?; + let next_item = match next_item { + Ok(next_item) => next_item, + Err(e) => { + // The error is potentially retryable, so we must rewind the loop after yielding. + yield Err(e); + continue 'outer; + }, + }; // Log a warning if we saw two timeouts in a row before a successful request if timeout_try_cnt > 2 { diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 918d9d5a6b..d98ff552ee 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -657,7 +657,14 @@ impl RemoteStorage for S3Bucket { res = request => Ok(res), _ = tokio::time::sleep(self.timeout) => Err(DownloadError::Timeout), _ = cancel.cancelled() => Err(DownloadError::Cancelled), - }?; + }; + + if let Err(DownloadError::Timeout) = &response { + yield Err(DownloadError::Timeout); + continue 'outer; + } + + let response = response?; // always yield cancellation errors and stop the stream let response = response .context("Failed to list S3 prefixes") From 34a42b00caf9e4c45fa3ce29ba95aa2ae7278d05 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." 
<4198311+skyzh@users.noreply.github.com> Date: Tue, 13 May 2025 17:49:14 +0800 Subject: [PATCH 33/65] feat(pageserver): add PostHog lite client (#11821) ## Problem part of https://github.com/neondatabase/neon/issues/11813 ## Summary of changes Add a lite PostHog client that only uses the local flag evaluation functionality. Added a test case that parses an example feature flag and gets the evaluation result. TODO: support boolean flag, remote config; implement all operators in PostHog. --------- Signed-off-by: Alex Chi Z --- Cargo.lock | 16 + Cargo.toml | 1 + libs/posthog_client_lite/Cargo.toml | 14 + libs/posthog_client_lite/src/lib.rs | 634 ++++++++++++++++++++++++++++ workspace_hack/Cargo.toml | 3 + 5 files changed, 668 insertions(+) create mode 100644 libs/posthog_client_lite/Cargo.toml create mode 100644 libs/posthog_client_lite/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 7083baa092..6df5d4a71e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4848,6 +4848,19 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "posthog_client_lite" +version = "0.1.0" +dependencies = [ + "anyhow", + "reqwest", + "serde", + "serde_json", + "sha2", + "thiserror 1.0.69", + "workspace_hack", +] + [[package]] name = "powerfmt" version = "0.2.0" @@ -8439,8 +8452,10 @@ dependencies = [ "fail", "form_urlencoded", "futures-channel", + "futures-core", "futures-executor", "futures-io", + "futures-task", "futures-util", "generic-array", "getrandom 0.2.11", @@ -8470,6 +8485,7 @@ dependencies = [ "once_cell", "p256 0.13.2", "parquet", + "percent-encoding", "prettyplease", "proc-macro2", "prost 0.13.3", diff --git a/Cargo.toml b/Cargo.toml index 8d4cc4a75a..6b87ce549d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,6 +26,7 @@ members = [ "libs/utils", "libs/consumption_metrics", "libs/postgres_backend", + "libs/posthog_client_lite", "libs/pq_proto", "libs/tenant_size_model", "libs/metrics", diff --git a/libs/posthog_client_lite/Cargo.toml b/libs/posthog_client_lite/Cargo.toml 
new file mode 100644 index 0000000000..7c19bf2ccb --- /dev/null +++ b/libs/posthog_client_lite/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "posthog_client_lite" +version = "0.1.0" +edition = "2024" +license.workspace = true + +[dependencies] +anyhow.workspace = true +reqwest.workspace = true +serde.workspace = true +serde_json.workspace = true +sha2.workspace = true +workspace_hack.workspace = true +thiserror.workspace = true diff --git a/libs/posthog_client_lite/src/lib.rs b/libs/posthog_client_lite/src/lib.rs new file mode 100644 index 0000000000..53deb26ab7 --- /dev/null +++ b/libs/posthog_client_lite/src/lib.rs @@ -0,0 +1,634 @@ +//! A lite version of the PostHog client that only supports local evaluation of feature flags. + +use std::collections::HashMap; + +use serde::{Deserialize, Serialize}; +use serde_json::json; +use sha2::Digest; + +#[derive(Debug, thiserror::Error)] +pub enum PostHogEvaluationError { + /// The feature flag is not available, for example, because the local evaluation data is not populated yet. + #[error("Feature flag not available: {0}")] + NotAvailable(String), + #[error("No condition group is matched")] + NoConditionGroupMatched, + /// Real errors, e.g., the rollout percentage does not add up to 100. 
+ #[error("Failed to evaluate feature flag: {0}")] + Internal(String), +} + +#[derive(Deserialize)] +pub struct LocalEvaluationResponse { + #[allow(dead_code)] + flags: Vec, +} + +#[derive(Deserialize)] +pub struct LocalEvaluationFlag { + key: String, + filters: LocalEvaluationFlagFilters, + active: bool, +} + +#[derive(Deserialize)] +pub struct LocalEvaluationFlagFilters { + groups: Vec, + multivariate: LocalEvaluationFlagMultivariate, +} + +#[derive(Deserialize)] +pub struct LocalEvaluationFlagFilterGroup { + variant: Option, + properties: Option>, + rollout_percentage: i64, +} + +#[derive(Deserialize)] +pub struct LocalEvaluationFlagFilterProperty { + key: String, + value: PostHogFlagFilterPropertyValue, + operator: String, +} + +#[derive(Debug, Serialize, Deserialize)] +#[serde(untagged)] +pub enum PostHogFlagFilterPropertyValue { + String(String), + Number(f64), + Boolean(bool), + List(Vec), +} + +#[derive(Deserialize)] +pub struct LocalEvaluationFlagMultivariate { + variants: Vec, +} + +#[derive(Deserialize)] +pub struct LocalEvaluationFlagMultivariateVariant { + key: String, + rollout_percentage: i64, +} + +pub struct FeatureStore { + flags: HashMap, +} + +impl Default for FeatureStore { + fn default() -> Self { + Self::new() + } +} + +enum GroupEvaluationResult { + MatchedAndOverride(String), + MatchedAndEvaluate, + Unmatched, +} + +impl FeatureStore { + pub fn new() -> Self { + Self { + flags: HashMap::new(), + } + } + + pub fn set_flags(&mut self, flags: Vec) { + self.flags.clear(); + for flag in flags { + self.flags.insert(flag.key.clone(), flag); + } + } + + /// Generate a consistent hash for a user ID (e.g., tenant ID). + /// + /// The implementation is different from PostHog SDK. In PostHog SDK, it is sha1 of `user_id.distinct_id.salt`. + /// However, as we do not upload all of our tenant IDs to PostHog, we do not have the PostHog distinct_id for a + /// tenant. Therefore, the way we compute it is sha256 of `user_id.feature_id.salt`. 
+ fn consistent_hash(user_id: &str, flag_key: &str, salt: &str) -> f64 { + let mut hasher = sha2::Sha256::new(); + hasher.update(user_id); + hasher.update("."); + hasher.update(flag_key); + hasher.update("."); + hasher.update(salt); + let hash = hasher.finalize(); + let hash_int = u64::from_le_bytes(hash[..8].try_into().unwrap()); + hash_int as f64 / u64::MAX as f64 + } + + /// Evaluate a condition. Returns an error if the condition cannot be evaluated due to parsing error or missing + /// property. + fn evaluate_condition( + &self, + operator: &str, + provided: &PostHogFlagFilterPropertyValue, + requested: &PostHogFlagFilterPropertyValue, + ) -> Result { + match operator { + "exact" => { + let PostHogFlagFilterPropertyValue::String(provided) = provided else { + // Left should be a string + return Err(PostHogEvaluationError::Internal(format!( + "The left side of the condition is not a string: {:?}", + provided + ))); + }; + let PostHogFlagFilterPropertyValue::List(requested) = requested else { + // Right should be a list of string + return Err(PostHogEvaluationError::Internal(format!( + "The right side of the condition is not a list: {:?}", + requested + ))); + }; + Ok(requested.contains(provided)) + } + "lt" | "gt" => { + let PostHogFlagFilterPropertyValue::String(requested) = requested else { + // Right should be a string + return Err(PostHogEvaluationError::Internal(format!( + "The right side of the condition is not a string: {:?}", + requested + ))); + }; + let Ok(requested) = requested.parse::() else { + return Err(PostHogEvaluationError::Internal(format!( + "Can not parse the right side of the condition as a number: {:?}", + requested + ))); + }; + // Left can either be a number or a string + let provided = match provided { + PostHogFlagFilterPropertyValue::Number(provided) => *provided, + PostHogFlagFilterPropertyValue::String(provided) => { + let Ok(provided) = provided.parse::() else { + return Err(PostHogEvaluationError::Internal(format!( + "Can not parse 
the left side of the condition as a number: {:?}", + provided + ))); + }; + provided + } + _ => { + return Err(PostHogEvaluationError::Internal(format!( + "The left side of the condition is not a number or a string: {:?}", + provided + ))); + } + }; + match operator { + "lt" => Ok(provided < requested), + "gt" => Ok(provided > requested), + op => Err(PostHogEvaluationError::Internal(format!( + "Unsupported operator: {}", + op + ))), + } + } + _ => Err(PostHogEvaluationError::Internal(format!( + "Unsupported operator: {}", + operator + ))), + } + } + + /// Evaluate a percentage. + fn evaluate_percentage(&self, mapped_user_id: f64, percentage: i64) -> bool { + mapped_user_id <= percentage as f64 / 100.0 + } + + /// Evaluate a filter group for a feature flag. Returns an error if there are errors during the evaluation. + /// + /// Return values: + /// Ok(GroupEvaluationResult::MatchedAndOverride(variant)): matched and evaluated to this value + /// Ok(GroupEvaluationResult::MatchedAndEvaluate): condition matched but no variant override, use the global rollout percentage + /// Ok(GroupEvaluationResult::Unmatched): condition unmatched + fn evaluate_group( + &self, + group: &LocalEvaluationFlagFilterGroup, + hash_on_group_rollout_percentage: f64, + provided_properties: &HashMap, + ) -> Result { + if let Some(ref properties) = group.properties { + for property in properties { + if let Some(value) = provided_properties.get(&property.key) { + // The user provided the property value + if !self.evaluate_condition( + property.operator.as_ref(), + value, + &property.value, + )? 
{ + return Ok(GroupEvaluationResult::Unmatched); + } + } else { + // We cannot evaluate, the property is not available + return Err(PostHogEvaluationError::NotAvailable(format!( + "The required property in the condition is not available: {}", + property.key + ))); + } + } + } + + // The group has no condition matchers or we matched the properties + if self.evaluate_percentage(hash_on_group_rollout_percentage, group.rollout_percentage) { + if let Some(ref variant_override) = group.variant { + Ok(GroupEvaluationResult::MatchedAndOverride( + variant_override.clone(), + )) + } else { + Ok(GroupEvaluationResult::MatchedAndEvaluate) + } + } else { + Ok(GroupEvaluationResult::Unmatched) + } + } + + /// Evaluate a multivariate feature flag. Returns `None` if the flag is not available or if there are errors + /// during the evaluation. + /// + /// The parsing logic is as follows: + /// + /// * Match each filter group. + /// - If a group is matched, it will first determine whether the user is in the range of the group's rollout + /// percentage. We will generate a consistent hash for the user ID on the group rollout percentage. This hash + /// is shared across all groups. + /// - If the hash falls within the group's rollout percentage, return the variant if it's overridden, or + /// - Evaluate the variant using the global config and the global rollout percentage. + /// * Otherwise, continue with the next group until all groups are evaluated and no group is within the + /// rollout percentage. + /// * If there are no matching groups, return an error. + /// + /// Example: we have a multivariate flag with 3 groups of the configured global rollout percentage: A (10%), B (20%), C (70%). + /// There is a single group with a condition that has a rollout percentage of 10% and it does not have a variant override. + /// Then, we will have 1% of the users evaluated to A, 2% to B, and 7% to C. 
+ pub fn evaluate_multivariate( + &self, + flag_key: &str, + user_id: &str, + ) -> Result { + let hash_on_global_rollout_percentage = + Self::consistent_hash(user_id, flag_key, "multivariate"); + let hash_on_group_rollout_percentage = + Self::consistent_hash(user_id, flag_key, "within_group"); + self.evaluate_multivariate_inner( + flag_key, + hash_on_global_rollout_percentage, + hash_on_group_rollout_percentage, + &HashMap::new(), + ) + } + + /// Evaluate a multivariate feature flag. Note that we directly take the mapped user ID + /// (a consistent hash ranging from 0 to 1) so that it is easier to use it in the tests + /// and avoid duplicate computations. + /// + /// Use a different consistent hash for evaluating the group rollout percentage. + /// The behavior: if the condition is set to rolling out to 10% of the users, and + /// we set the variant A to 20% in the global config, then 2% of the total users will + /// be evaluated to variant A. + /// + /// Note that the hash to determine group rollout percentage is shared across all groups. So if we have two + /// exactly-the-same conditions with 10% and 20% rollout percentage respectively, a total of 20% of the users + /// will be evaluated (versus 30% if group evaluation is done independently). + pub(crate) fn evaluate_multivariate_inner( + &self, + flag_key: &str, + hash_on_global_rollout_percentage: f64, + hash_on_group_rollout_percentage: f64, + properties: &HashMap, + ) -> Result { + if let Some(flag_config) = self.flags.get(flag_key) { + if !flag_config.active { + return Err(PostHogEvaluationError::NotAvailable(format!( + "The feature flag is not active: {}", + flag_key + ))); + } + // TODO: sort the groups so that variant overrides always get evaluated first and it follows the PostHog + // Python SDK behavior; for now we do not configure conditions without variant overrides in Neon so it + // does not matter. 
+ for group in &flag_config.filters.groups { + match self.evaluate_group(group, hash_on_group_rollout_percentage, properties)? { + GroupEvaluationResult::MatchedAndOverride(variant) => return Ok(variant), + GroupEvaluationResult::MatchedAndEvaluate => { + let mut percentage = 0; + for variant in &flag_config.filters.multivariate.variants { + percentage += variant.rollout_percentage; + if self + .evaluate_percentage(hash_on_global_rollout_percentage, percentage) + { + return Ok(variant.key.clone()); + } + } + // This should not happen because the rollout percentage always adds up to 100, but just in case that PostHog + // returned invalid spec, we return an error. + return Err(PostHogEvaluationError::Internal(format!( + "Rollout percentage does not add up to 100: {}", + flag_key + ))); + } + GroupEvaluationResult::Unmatched => continue, + } + } + // If no group is matched, the feature is not available, and up to the caller to decide what to do. + Err(PostHogEvaluationError::NoConditionGroupMatched) + } else { + // The feature flag is not available yet + Err(PostHogEvaluationError::NotAvailable(format!( + "Not found in the local evaluation spec: {}", + flag_key + ))) + } + } +} + +/// A lite PostHog client. +/// +/// At the point of writing this code, PostHog does not have a functional Rust client with feature flag support. +/// This is a lite version that only supports local evaluation of feature flags and only supports those JSON specs +/// that will be used within Neon. +/// +/// PostHog is designed as a browser-server system: the browser (client) side uses the client key and is exposed +/// to the end users; the server side uses a server key and is not exposed to the end users. The client and the +/// server has different API keys and provide a different set of APIs. In Neon, we only have the server (that is +/// pageserver), and it will use both the client API and the server API. So we need to store two API keys within +/// our PostHog client. 
+/// +/// The server API is used to fetch the feature flag specs. The client API is used to capture events in case we +/// want to report the feature flag usage back to PostHog. The current plan is to use PostHog only as a UI to +/// configure feature flags so it is very likely that the client API will not be used. +pub struct PostHogClient { + /// The server API key. + server_api_key: String, + /// The client API key. + client_api_key: String, + /// The project ID. + project_id: String, + /// The private API URL. + private_api_url: String, + /// The public API URL. + public_api_url: String, + /// The HTTP client. + client: reqwest::Client, +} + +impl PostHogClient { + pub fn new( + server_api_key: String, + client_api_key: String, + project_id: String, + private_api_url: String, + public_api_url: String, + ) -> Self { + let client = reqwest::Client::new(); + Self { + server_api_key, + client_api_key, + project_id, + private_api_url, + public_api_url, + client, + } + } + + pub fn new_with_us_region( + server_api_key: String, + client_api_key: String, + project_id: String, + ) -> Self { + Self::new( + server_api_key, + client_api_key, + project_id, + "https://us.posthog.com".to_string(), + "https://us.i.posthog.com".to_string(), + ) + } + + /// Fetch the feature flag specs from the server. + /// + /// This is unfortunately an undocumented API at: + /// - + /// - + /// + /// The handling logic in [`FeatureStore`] mostly follows the Python API implementation. 
+ /// See `_compute_flag_locally` in + pub async fn get_feature_flags_local_evaluation( + &self, + ) -> anyhow::Result { + // BASE_URL/api/projects/:project_id/feature_flags/local_evaluation + // with bearer token of self.server_api_key + let url = format!( + "{}/api/projects/{}/feature_flags/local_evaluation", + self.private_api_url, self.project_id + ); + let response = self + .client + .get(url) + .bearer_auth(&self.server_api_key) + .send() + .await?; + let body = response.text().await?; + Ok(serde_json::from_str(&body)?) + } + + /// Capture an event. This will only be used to report the feature flag usage back to PostHog, though + /// it also support a lot of other functionalities. + /// + /// + pub async fn capture_event( + &self, + event: &str, + distinct_id: &str, + properties: &HashMap, + ) -> anyhow::Result<()> { + // PUBLIC_URL/capture/ + // with bearer token of self.client_api_key + let url = format!("{}/capture/", self.public_api_url); + self.client + .post(url) + .body(serde_json::to_string(&json!({ + "api_key": self.client_api_key, + "distinct_id": distinct_id, + "event": event, + "properties": properties, + }))?) 
+ .send() + .await?; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn data() -> &'static str { + r#"{ + "flags": [ + { + "id": 132794, + "team_id": 152860, + "name": "", + "key": "gc-compaction", + "filters": { + "groups": [ + { + "variant": "enabled-stage-2", + "properties": [ + { + "key": "plan_type", + "type": "person", + "value": [ + "free" + ], + "operator": "exact" + }, + { + "key": "pageserver_remote_size", + "type": "person", + "value": "10000000", + "operator": "lt" + } + ], + "rollout_percentage": 50 + }, + { + "properties": [ + { + "key": "plan_type", + "type": "person", + "value": [ + "free" + ], + "operator": "exact" + }, + { + "key": "pageserver_remote_size", + "type": "person", + "value": "10000000", + "operator": "lt" + } + ], + "rollout_percentage": 80 + } + ], + "payloads": {}, + "multivariate": { + "variants": [ + { + "key": "disabled", + "name": "", + "rollout_percentage": 90 + }, + { + "key": "enabled-stage-1", + "name": "", + "rollout_percentage": 10 + }, + { + "key": "enabled-stage-2", + "name": "", + "rollout_percentage": 0 + }, + { + "key": "enabled-stage-3", + "name": "", + "rollout_percentage": 0 + }, + { + "key": "enabled", + "name": "", + "rollout_percentage": 0 + } + ] + } + }, + "deleted": false, + "active": true, + "ensure_experience_continuity": false, + "has_encrypted_payloads": false, + "version": 6 + } + ], + "group_type_mapping": {}, + "cohorts": {} + }"# + } + + #[test] + fn parse_local_evaluation() { + let data = data(); + let _: LocalEvaluationResponse = serde_json::from_str(data).unwrap(); + } + + #[test] + fn evaluate_multivariate() { + let mut store = FeatureStore::new(); + let response: LocalEvaluationResponse = serde_json::from_str(data()).unwrap(); + store.set_flags(response.flags); + + // This lacks the required properties and cannot be evaluated. 
+ let variant = + store.evaluate_multivariate_inner("gc-compaction", 1.00, 0.40, &HashMap::new()); + assert!(matches!( + variant, + Err(PostHogEvaluationError::NotAvailable(_)) + ),); + + let properties_unmatched = HashMap::from([ + ( + "plan_type".to_string(), + PostHogFlagFilterPropertyValue::String("paid".to_string()), + ), + ( + "pageserver_remote_size".to_string(), + PostHogFlagFilterPropertyValue::Number(1000.0), + ), + ]); + + // This does not match any group so there will be an error. + let variant = + store.evaluate_multivariate_inner("gc-compaction", 1.00, 0.40, &properties_unmatched); + assert!(matches!( + variant, + Err(PostHogEvaluationError::NoConditionGroupMatched) + ),); + let variant = + store.evaluate_multivariate_inner("gc-compaction", 0.80, 0.80, &properties_unmatched); + assert!(matches!( + variant, + Err(PostHogEvaluationError::NoConditionGroupMatched) + ),); + + let properties = HashMap::from([ + ( + "plan_type".to_string(), + PostHogFlagFilterPropertyValue::String("free".to_string()), + ), + ( + "pageserver_remote_size".to_string(), + PostHogFlagFilterPropertyValue::Number(1000.0), + ), + ]); + + // It matches the first group as 0.10 <= 0.50 and the properties are matched. Then it gets evaluated to the variant override. + let variant = store.evaluate_multivariate_inner("gc-compaction", 0.10, 0.10, &properties); + assert_eq!(variant.unwrap(), "enabled-stage-2".to_string()); + + // It matches the second group as 0.50 <= 0.60 <= 0.80 and the properties are matched. Then it gets evaluated using the global percentage. + let variant = store.evaluate_multivariate_inner("gc-compaction", 0.99, 0.60, &properties); + assert_eq!(variant.unwrap(), "enabled-stage-1".to_string()); + let variant = store.evaluate_multivariate_inner("gc-compaction", 0.80, 0.60, &properties); + assert_eq!(variant.unwrap(), "disabled".to_string()); + + // It matches the group conditions but not the group rollout percentage. 
+ let variant = store.evaluate_multivariate_inner("gc-compaction", 1.00, 0.90, &properties); + assert!(matches!( + variant, + Err(PostHogEvaluationError::NoConditionGroupMatched) + ),); + } +} diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index f3d8b951a8..fecf62f756 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -39,8 +39,10 @@ env_logger = { version = "0.11" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } form_urlencoded = { version = "1" } futures-channel = { version = "0.3", features = ["sink"] } +futures-core = { version = "0.3" } futures-executor = { version = "0.3" } futures-io = { version = "0.3" } +futures-task = { version = "0.3", default-features = false, features = ["std"] } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } generic-array = { version = "0.14", default-features = false, features = ["more_lengths", "zeroize"] } getrandom = { version = "0.2", default-features = false, features = ["std"] } @@ -70,6 +72,7 @@ num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } p256 = { version = "0.13", features = ["jwk"] } parquet = { version = "53", default-features = false, features = ["zstd"] } +percent-encoding = { version = "2" } prost = { version = "0.13", features = ["no-recursion-limit", "prost-derive"] } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } From cfbef4d586f96b9f5e0648d0a7ea04db54b86962 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 13 May 2025 14:02:25 +0100 Subject: [PATCH 34/65] safekeeper: downgrade stream from future WAL log (#11909) ## Problem 1. Safekeeper selection on the pageserver side isn't very dynamic. Once you connect to one safekeeper, you'll use that one for as long as the safekeeper keeps the connection alive. In principle, we could be more eager, since the wal receiver connection can be cancelled but we don't do that. 
We wait until the "session" is done and then we pick a new SK. 2. Picking a new SK is quite conservative. We will switch if: a. We haven't received anything from the SK within the last 10 seconds (wal_connect_timeout) or b. The candidate SK is 1GiB ahead or c. The candidate SK is in the same AZ as the PS or d. There's a candidate that is ahead and we've not had any WAL within the last 10 seconds (lagging_wal_timeout) Hence, we can end up with pageservers that are requesting WAL which their safekeeper hasn't seen yet. ## Summary of changes Downgrade warning log to info. --- safekeeper/src/send_wal.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 33e3d0485c..05f827494e 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -513,7 +513,7 @@ impl SafekeeperPostgresHandler { let end_pos = end_watch.get(); if end_pos < start_pos { - warn!( + info!( "requested start_pos {} is ahead of available WAL end_pos {}", start_pos, end_pos ); From 25ab16ee248e0873939569075b836f5d85d3d5f8 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Tue, 13 May 2025 14:30:09 +0100 Subject: [PATCH 35/65] chore(compute): Postgres 17.5, 16.9, 15.13 and 14.18 (#11886) Bump all minor versions. the only conflict was src/backend/storage/smgr/smgr.c in v17 where our smgr changes conflicted with https://github.com/postgres/postgres/commit/ee578921b60ef9a14eaea54b608549e4f8b14f26 but it was trivial to resolve. 
--- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 16 ++++++++-------- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 06b405bc98..ead1e76bdc 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 06b405bc982fd53522689aa4acbfd9c44b7993cf +Subproject commit ead1e76bdcb71ef87f52f0610bd7333247f75179 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 72f83df76c..052df87d33 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 72f83df76c61ce18d81bd371f0afd2a43d59c052 +Subproject commit 052df87d338dc30687d0c96f1a4d9b6cb4882b2e diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index d72d76f2cd..bb5eee65ac 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit d72d76f2cdee4194dd052ce099e9784aca7c794a +Subproject commit bb5eee65ac753b5a66d255ec5fb4c0e33180e8fd diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 0d59c91c1a..e5374b7299 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 0d59c91c1a23e667f1d1169d5f040b3fa0a0ab44 +Subproject commit e5374b72997b0afc8374137674e873f7a558120a diff --git a/vendor/revisions.json b/vendor/revisions.json index e76510f969..cf9f474e1a 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ - "17.4", - "0d59c91c1a23e667f1d1169d5f040b3fa0a0ab44" + "17.5", + "e5374b72997b0afc8374137674e873f7a558120a" ], "v16": [ - "16.8", - "d72d76f2cdee4194dd052ce099e9784aca7c794a" + "16.9", + "bb5eee65ac753b5a66d255ec5fb4c0e33180e8fd" ], "v15": [ - "15.12", - "72f83df76c61ce18d81bd371f0afd2a43d59c052" + "15.13", + "052df87d338dc30687d0c96f1a4d9b6cb4882b2e" ], "v14": [ - "14.17", - "06b405bc982fd53522689aa4acbfd9c44b7993cf" + "14.18", + "ead1e76bdcb71ef87f52f0610bd7333247f75179" ] } From 
290369061f22c18850e76355d2be885ee82d1302 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 13 May 2025 17:13:42 +0300 Subject: [PATCH 36/65] Check prefetch result in DEBUG_COMPARE_LOCAL mode (#11502) ## Problem Prefetched and LFC results are not checked in DEBUG_COMPARE_LOCAL mode ## Summary of changes Add check for this results as well. --------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/pagestore_smgr.c | 262 ++++++++++++++++--------------------- 1 file changed, 116 insertions(+), 146 deletions(-) diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index f574517b2a..31e47db7d7 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -1281,75 +1281,24 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, communicator_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL); } -#if PG_MAJORVERSION_NUM < 17 -/* - * neon_read() -- Read the specified block from a relation. - */ -#if PG_MAJORVERSION_NUM < 16 -static void -neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer) -#else -static void -neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer) -#endif -{ - neon_request_lsns request_lsns; - bits8 present; - void *bufferp; - - switch (reln->smgr_relpersistence) - { - case 0: - neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence"); - - case RELPERSISTENCE_PERMANENT: - break; - - case RELPERSISTENCE_TEMP: - case RELPERSISTENCE_UNLOGGED: - mdread(reln, forkNum, blkno, buffer); - return; - - default: - neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); - } - - /* Try to read PS results if they are available */ - communicator_prefetch_pump_state(); - - neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1); - - present = 0; - bufferp = buffer; - if (communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present)) - 
{ - /* Prefetch hit */ - return; - } - - /* Try to read from local file cache */ - if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer)) - { - MyNeonCounters->file_cache_hits_total++; - return; - } - - neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer); - - /* - * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. - */ - communicator_prefetch_pump_state(); - #ifdef DEBUG_COMPARE_LOCAL +static void +compare_with_local(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void* buffer, XLogRecPtr request_lsn) +{ if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) { char pageserver_masked[BLCKSZ]; PGIOAlignedBlock mdbuf; PGIOAlignedBlock mdbuf_masked; - XLogRecPtr request_lsn = request_lsns.request_lsn; +#if PG_MAJORVERSION_NUM >= 17 + { + void* mdbuffers[1] = { mdbuf.data }; + mdreadv(reln, forkNum, blkno, mdbuffers, 1); + } +#else mdread(reln, forkNum, blkno, mdbuf.data); +#endif memcpy(pageserver_masked, buffer, BLCKSZ); memcpy(mdbuf_masked.data, mdbuf.data, BLCKSZ); @@ -1413,11 +1362,105 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer } } } +} +#endif + + +#if PG_MAJORVERSION_NUM < 17 + +/* + * neon_read() -- Read the specified block from a relation. 
+ */ +#if PG_MAJORVERSION_NUM < 16 +static void +neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer) +#else +static void +neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer) +#endif +{ + neon_request_lsns request_lsns; + bits8 present; + void *bufferp; + + switch (reln->smgr_relpersistence) + { + case 0: + neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence"); + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdread(reln, forkNum, blkno, buffer); + return; + + default: + neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + /* Try to read PS results if they are available */ + communicator_prefetch_pump_state(); + + neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1); + + present = 0; + bufferp = buffer; + if (communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present)) + { + /* Prefetch hit */ +#ifdef DEBUG_COMPARE_LOCAL + compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn); +#else + return; +#endif + } + + /* Try to read from local file cache */ + if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer)) + { + MyNeonCounters->file_cache_hits_total++; +#ifdef DEBUG_COMPARE_LOCAL + compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn); +#else + return; +#endif + } + + neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer); + + /* + * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. 
+ */ + communicator_prefetch_pump_state(); + +#ifdef DEBUG_COMPARE_LOCAL + compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn); #endif } #endif /* PG_MAJORVERSION_NUM <= 16 */ #if PG_MAJORVERSION_NUM >= 17 + +#ifdef DEBUG_COMPARE_LOCAL +static void +compare_with_localv(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void** buffers, BlockNumber nblocks, neon_request_lsns* request_lsns, bits8* read_pages) +{ + if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) + { + for (BlockNumber i = 0; i < nblocks; i++) + { + if (BITMAP_ISSET(read_pages, i)) + { + compare_with_local(reln, forkNum, blkno + i, buffers[i], request_lsns[i].request_lsn); + } + } + } +} +#endif + + static void neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks) @@ -1460,8 +1503,13 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, blocknum, request_lsns, nblocks, buffers, read_pages); +#ifdef DEBUG_COMPARE_LOCAL + compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages); + memset(read_pages, 0, sizeof(read_pages)); +#else if (prefetch_result == nblocks) return; +#endif /* Try to read from local file cache */ lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers, @@ -1470,9 +1518,14 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, if (lfc_result > 0) MyNeonCounters->file_cache_hits_total += lfc_result; +#ifdef DEBUG_COMPARE_LOCAL + compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages); + memset(read_pages, 0, sizeof(read_pages)); +#else /* Read all blocks from LFC, so we're done */ if (prefetch_result + lfc_result == nblocks) return; +#endif communicator_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, buffers, nblocks, read_pages); @@ -1483,91 +1536,8 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, 
communicator_prefetch_pump_state(); #ifdef DEBUG_COMPARE_LOCAL - if (forknum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) - { - char pageserver_masked[BLCKSZ]; - PGIOAlignedBlock mdbuf; - PGIOAlignedBlock mdbuf_masked; - XLogRecPtr request_lsn = request_lsns->request_lsn; - - for (int i = 0; i < nblocks; i++) - { - BlockNumber blkno = blocknum + i; - if (!BITMAP_ISSET(read_pages, i)) - continue; - -#if PG_MAJORVERSION_NUM >= 17 - { - void* mdbuffers[1] = { mdbuf.data }; - mdreadv(reln, forknum, blkno, mdbuffers, 1); - } -#else - mdread(reln, forknum, blkno, mdbuf.data); -#endif - - memcpy(pageserver_masked, buffers[i], BLCKSZ); - memcpy(mdbuf_masked.data, mdbuf.data, BLCKSZ); - - if (PageIsNew((Page) mdbuf.data)) - { - if (!PageIsNew((Page) pageserver_masked)) - { - neon_log(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", - blkno, - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(buffers[i])); - } - } - else if (PageIsNew((Page) buffers[i])) - { - neon_log(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", - blkno, - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(mdbuf.data)); - } - else if (PageGetSpecialSize(mdbuf.data) == 0) - { - /* assume heap */ - RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked.data, blkno); - RmgrTable[RM_HEAP_ID].rm_mask(pageserver_masked, blkno); - - if (memcmp(mdbuf_masked.data, pageserver_masked, BLCKSZ) != 0) - { - neon_log(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", - blkno, - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(mdbuf_masked.data), - hexdump_page(pageserver_masked)); - } - } - else if 
(PageGetSpecialSize(mdbuf.data) == MAXALIGN(sizeof(BTPageOpaqueData))) - { - if (((BTPageOpaqueData *) PageGetSpecialPointer(mdbuf.data))->btpo_cycleid < MAX_BT_CYCLE_ID) - { - /* assume btree */ - RmgrTable[RM_BTREE_ID].rm_mask(mdbuf_masked.data, blkno); - RmgrTable[RM_BTREE_ID].rm_mask(pageserver_masked, blkno); - - if (memcmp(mdbuf_masked.data, pageserver_masked, BLCKSZ) != 0) - { - neon_log(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", - blkno, - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(mdbuf_masked.data), - hexdump_page(pageserver_masked)); - } - } - } - } - } + memset(read_pages, 0xFF, sizeof(read_pages)); + compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages); #endif } #endif From 234c882a0768876aa4616420af9a5fb132bb7b38 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Tue, 13 May 2025 14:58:37 +0000 Subject: [PATCH 37/65] proxy: Expose handlers for cpu and heap profiling (#11912) ## Problem It's difficult to understand where proxy spends most of cpu and memory. ## Summary of changes Expose cpu and heap profiling handlers for continuous profiling. 
neondatabase/cloud#22670 --- proxy/src/bin/proxy.rs | 4 ++++ proxy/src/http/health_server.rs | 8 +++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 7d4b44841d..d60d32eb3b 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -1,6 +1,10 @@ #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; +#[allow(non_upper_case_globals)] +#[unsafe(export_name = "malloc_conf")] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; + #[tokio::main] async fn main() -> anyhow::Result<()> { proxy::binary::proxy::run().await diff --git a/proxy/src/http/health_server.rs b/proxy/src/http/health_server.rs index 5278fe2a3e..b0b5a598d1 100644 --- a/proxy/src/http/health_server.rs +++ b/proxy/src/http/health_server.rs @@ -3,7 +3,7 @@ use std::net::TcpListener; use std::sync::{Arc, Mutex}; use anyhow::{anyhow, bail}; -use http_utils::endpoint::{self, request_span}; +use http_utils::endpoint::{self, profile_cpu_handler, profile_heap_handler, request_span}; use http_utils::error::ApiError; use http_utils::json::json_response; use http_utils::{RouterBuilder, RouterService}; @@ -33,6 +33,12 @@ fn make_router(metrics: AppMetrics) -> RouterBuilder { request_span(r, move |b| prometheus_metrics_handler(b, state)) }) .get("/v1/status", status_handler) + .get("/profile/cpu", move |r| { + request_span(r, profile_cpu_handler) + }) + .get("/profile/heap", move |r| { + request_span(r, profile_heap_handler) + }) } pub async fn task_main( From 045ae13e060c3717c921097444d5c6b09925e87c Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 13 May 2025 18:49:49 +0100 Subject: [PATCH 38/65] pageserver: make imports work with tenant shut downs (#11855) ## Problem Lifetime of imported timelines (and implicitly the import background task) has some shortcomings: 1. Timeline activation upon import completion is tricky. 
Previously, a timeline that finished importing after a tenant detach would not get activated and there's concerns about the safety of activating concurrently with shut-down. 2. Import jobs can prevent tenant shut down since they hold the tenant gate ## Summary of Changes Track the import tasks in memory and abort them explicitly on tenant shutdown. Integrate more closely with the storage controller: 1. When an import task has finished all of its jobs, it notifies the storage controller, but **does not** mark the import as done in the index_part. When all shards have finished importing, the storage controller will call the `/activate_post_import` idempotent endpoint for all of them. The handler, marks the import complete in index part, resets the tenant if required and checks if the timeline is active yet. 2. Not directly related, but the import job now gets the starting state from the storage controller instead of the import bucket. This paves the way for progress checkpointing. Related: https://github.com/neondatabase/neon/issues/11568 --- pageserver/client/src/mgmt_api.rs | 22 ++ pageserver/src/controller_upcall_client.rs | 40 +++ pageserver/src/deletion_queue.rs | 9 + pageserver/src/http/routes.rs | 105 ++++++ pageserver/src/tenant.rs | 222 ++++++------ .../src/tenant/remote_timeline_client.rs | 29 ++ .../src/tenant/timeline/import_pgdata.rs | 284 +++++++-------- .../src/tenant/timeline/import_pgdata/flow.rs | 4 + .../import_pgdata/importbucket_client.rs | 25 -- .../import_pgdata/importbucket_format.rs | 6 - .../import_pgdata/index_part_format.rs | 8 + storage_controller/src/http.rs | 30 ++ storage_controller/src/pageserver_client.rs | 19 + storage_controller/src/persistence.rs | 33 ++ storage_controller/src/service.rs | 328 +++++++++++------- .../src/service/safekeeper_service.rs | 7 +- storage_controller/src/timeline_import.rs | 22 +- test_runner/regress/test_import_pgdata.py | 91 ++++- 18 files changed, 859 insertions(+), 425 deletions(-) diff --git 
a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 4a87a91910..219e63c9d4 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -1,5 +1,6 @@ use std::collections::HashMap; use std::error::Error as _; +use std::time::Duration; use bytes::Bytes; use detach_ancestor::AncestorDetached; @@ -819,4 +820,25 @@ impl Client { .await .map(|resp| resp.status()) } + + pub async fn activate_post_import( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + activate_timeline_timeout: Duration, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/activate_post_import?timeline_activate_timeout_ms={}", + self.mgmt_api_endpoint, + tenant_shard_id, + timeline_id, + activate_timeline_timeout.as_millis() + ); + + self.request(Method::PUT, uri, ()) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } } diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs index 468e5463b0..6d186b091a 100644 --- a/pageserver/src/controller_upcall_client.rs +++ b/pageserver/src/controller_upcall_client.rs @@ -53,6 +53,11 @@ pub trait StorageControllerUpcallApi { timeline_id: TimelineId, status: ShardImportStatus, ) -> impl Future> + Send; + fn get_timeline_import_status( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + ) -> impl Future, RetryForeverError>> + Send; } impl StorageControllerUpcallClient { @@ -302,4 +307,39 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient { self.retry_http_forever(&url, request).await } + + #[tracing::instrument(skip_all)] // so that warning logs from retry_http_forever have context + async fn get_timeline_import_status( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + ) -> Result, RetryForeverError> { + let url = self + .base_url + .join(format!("timeline_import_status/{}/{}", tenant_shard_id, timeline_id).as_str()) + .expect("Failed to build path"); + + 
Ok(backoff::retry( + || async { + let response = self.http_client.get(url.clone()).send().await?; + + if let Err(err) = response.error_for_status_ref() { + if matches!(err.status(), Some(reqwest::StatusCode::NOT_FOUND)) { + return Ok(None); + } else { + return Err(err); + } + } + response.json::().await.map(Some) + }, + |_| false, + 3, + u32::MAX, + "storage controller upcall", + &self.cancel, + ) + .await + .ok_or(RetryForeverError::ShuttingDown)? + .expect("We retry forever, this should never be reached")) + } } diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 4d62bc4ab5..65b2de28cd 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -663,6 +663,7 @@ mod test { use camino::Utf8Path; use hex_literal::hex; use pageserver_api::key::Key; + use pageserver_api::models::ShardImportStatus; use pageserver_api::shard::ShardIndex; use pageserver_api::upcall_api::ReAttachResponseTenant; use remote_storage::{RemoteStorageConfig, RemoteStorageKind}; @@ -796,6 +797,14 @@ mod test { ) -> Result<(), RetryForeverError> { unimplemented!() } + + async fn get_timeline_import_status( + &self, + _tenant_shard_id: TenantShardId, + _timeline_id: TimelineId, + ) -> Result, RetryForeverError> { + unimplemented!() + } } async fn setup(test_name: &str) -> anyhow::Result { diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 8b6500b020..2edec9dda1 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -3500,6 +3500,107 @@ async fn put_tenant_timeline_import_wal( }.instrument(span).await } +/// Activate a timeline after its import has completed +/// +/// The endpoint is idempotent and callers are expected to retry all +/// errors until a successful response. 
+async fn activate_post_import_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + const DEFAULT_ACTIVATE_TIMEOUT: Duration = Duration::from_secs(1); + let activate_timeout = parse_query_param(&request, "timeline_activate_timeout_ms")? + .map(Duration::from_millis) + .unwrap_or(DEFAULT_ACTIVATE_TIMEOUT); + + let span = info_span!( + "activate_post_import_handler", + tenant_id=%tenant_shard_id.tenant_id, + timeline_id=%timeline_id, + shard_id=%tenant_shard_id.shard_slug() + ); + + async move { + let state = get_state(&request); + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + + tenant + .finalize_importing_timeline(timeline_id) + .await + .map_err(ApiError::InternalServerError)?; + + match tenant.get_timeline(timeline_id, false) { + Ok(_timeline) => { + // Timeline is already visible. Reset not required: fall through. + } + Err(GetTimelineError::NotFound { .. }) => { + // This is crude: we reset the whole tenant such that the new timeline is detected + // and activated. We can come up with something more granular in the future. + // + // Note that we only reset the tenant if required: when the timeline is + // not present in [`Tenant::timelines`]. + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + state + .tenant_manager + .reset_tenant(tenant_shard_id, false, &ctx) + .await + .map_err(ApiError::InternalServerError)?; + } + Err(GetTimelineError::ShuttingDown) => { + return Err(ApiError::ShuttingDown); + } + Err(GetTimelineError::NotActive { .. 
}) => { + unreachable!("Called get_timeline with active_only=false"); + } + } + + let timeline = tenant.get_timeline(timeline_id, false)?; + + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn) + .with_scope_timeline(&timeline); + + let result = + tokio::time::timeout(activate_timeout, timeline.wait_to_become_active(&ctx)).await; + match result { + Ok(Ok(())) => { + // fallthrough + } + // Timeline reached some other state that's not active + // TODO(vlad): if the tenant is broken, return a permananet error + Ok(Err(_timeline_state)) => { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Timeline activation failed" + ))); + } + // Activation timed out + Err(_) => { + return Err(ApiError::Timeout("Timeline activation timed out".into())); + } + } + + let timeline_info = build_timeline_info( + &timeline, false, // include_non_incremental_logical_size, + false, // force_await_initial_logical_size + &ctx, + ) + .await + .context("get local timeline info") + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, timeline_info) + } + .instrument(span) + .await +} + /// Read the end of a tar archive. /// /// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each. 
@@ -3924,5 +4025,9 @@ pub fn make_router( "/v1/tenant/:tenant_id/timeline/:timeline_id/import_wal", |r| api_handler(r, put_tenant_timeline_import_wal), ) + .put( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/activate_post_import", + |r| api_handler(r, activate_post_import_handler), + ) .any(handler_404)) } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index e59db74479..441049f47d 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -50,6 +50,7 @@ use remote_timeline_client::{ use secondary::heatmap::{HeatMapTenant, HeatMapTimeline}; use storage_broker::BrokerClientChannel; use timeline::compaction::{CompactionOutcome, GcCompactionQueue}; +use timeline::import_pgdata::ImportingTimeline; use timeline::offload::{OffloadError, offload_timeline}; use timeline::{ CompactFlags, CompactOptions, CompactionError, PreviousHeatmap, ShutdownMode, import_pgdata, @@ -284,6 +285,19 @@ pub struct TenantShard { /// **Lock order**: if acquiring all (or a subset), acquire them in order `timelines`, `timelines_offloaded`, `timelines_creating` timelines_offloaded: Mutex>>, + /// Tracks the timelines that are currently importing into this tenant shard. + /// + /// Note that importing timelines are also present in [`Self::timelines_creating`]. + /// Keep this in mind when ordering lock acquisition. + /// + /// Lifetime: + /// * An imported timeline is created while scanning the bucket on tenant attach + /// if the index part contains an `import_pgdata` entry and said field marks the import + /// as in progress. + /// * Imported timelines are removed when the storage controller calls the post timeline + /// import activation endpoint. + timelines_importing: std::sync::Mutex>, + /// The last tenant manifest known to be in remote storage. None if the manifest has not yet /// been either downloaded or uploaded. Always Some after tenant attach. 
/// @@ -923,19 +937,10 @@ enum StartCreatingTimelineResult { #[allow(clippy::large_enum_variant, reason = "TODO")] enum TimelineInitAndSyncResult { - ReadyToActivate(Arc), + ReadyToActivate, NeedsSpawnImportPgdata(TimelineInitAndSyncNeedsSpawnImportPgdata), } -impl TimelineInitAndSyncResult { - fn ready_to_activate(self) -> Option> { - match self { - Self::ReadyToActivate(timeline) => Some(timeline), - _ => None, - } - } -} - #[must_use] struct TimelineInitAndSyncNeedsSpawnImportPgdata { timeline: Arc, @@ -1012,10 +1017,6 @@ enum CreateTimelineCause { enum LoadTimelineCause { Attach, Unoffload, - ImportPgdata { - create_guard: TimelineCreateGuard, - activate: ActivateTimelineArgs, - }, } #[derive(thiserror::Error, Debug)] @@ -1097,7 +1098,7 @@ impl TenantShard { self: &Arc, timeline_id: TimelineId, resources: TimelineResources, - mut index_part: IndexPart, + index_part: IndexPart, metadata: TimelineMetadata, previous_heatmap: Option, ancestor: Option>, @@ -1106,7 +1107,7 @@ impl TenantShard { ) -> anyhow::Result { let tenant_id = self.tenant_shard_id; - let import_pgdata = index_part.import_pgdata.take(); + let import_pgdata = index_part.import_pgdata.clone(); let idempotency = match &import_pgdata { Some(import_pgdata) => { CreateTimelineIdempotency::ImportPgdata(CreatingTimelineIdempotencyImportPgdata { @@ -1127,7 +1128,7 @@ impl TenantShard { } }; - let (timeline, timeline_ctx) = self.create_timeline_struct( + let (timeline, _timeline_ctx) = self.create_timeline_struct( timeline_id, &metadata, previous_heatmap, @@ -1197,14 +1198,6 @@ impl TenantShard { match import_pgdata { Some(import_pgdata) if !import_pgdata.is_done() => { - match cause { - LoadTimelineCause::Attach | LoadTimelineCause::Unoffload => (), - LoadTimelineCause::ImportPgdata { .. 
} => { - unreachable!( - "ImportPgdata should not be reloading timeline import is done and persisted as such in s3" - ) - } - } let mut guard = self.timelines_creating.lock().unwrap(); if !guard.insert(timeline_id) { // We should never try and load the same timeline twice during startup @@ -1260,26 +1253,7 @@ impl TenantShard { "Timeline has no ancestor and no layer files" ); - match cause { - LoadTimelineCause::Attach | LoadTimelineCause::Unoffload => (), - LoadTimelineCause::ImportPgdata { - create_guard, - activate, - } => { - // TODO: see the comment in the task code above how I'm not so certain - // it is safe to activate here because of concurrent shutdowns. - match activate { - ActivateTimelineArgs::Yes { broker_client } => { - info!("activating timeline after reload from pgdata import task"); - timeline.activate(self.clone(), broker_client, None, &timeline_ctx); - } - ActivateTimelineArgs::No => (), - } - drop(create_guard); - } - } - - Ok(TimelineInitAndSyncResult::ReadyToActivate(timeline)) + Ok(TimelineInitAndSyncResult::ReadyToActivate) } } } @@ -1768,7 +1742,7 @@ impl TenantShard { })?; match effect { - TimelineInitAndSyncResult::ReadyToActivate(_) => { + TimelineInitAndSyncResult::ReadyToActivate => { // activation happens later, on Tenant::activate } TimelineInitAndSyncResult::NeedsSpawnImportPgdata( @@ -1778,13 +1752,24 @@ impl TenantShard { guard, }, ) => { - tokio::task::spawn(self.clone().create_timeline_import_pgdata_task( - timeline, - import_pgdata, - ActivateTimelineArgs::No, - guard, - ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Warn), - )); + let timeline_id = timeline.timeline_id; + let import_task_handle = + tokio::task::spawn(self.clone().create_timeline_import_pgdata_task( + timeline.clone(), + import_pgdata, + guard, + ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Warn), + )); + + let prev = self.timelines_importing.lock().unwrap().insert( + timeline_id, + ImportingTimeline { + timeline: 
timeline.clone(), + import_task_handle, + }, + ); + + assert!(prev.is_none()); } } } @@ -2678,14 +2663,7 @@ impl TenantShard { .await? } CreateTimelineParams::ImportPgdata(params) => { - self.create_timeline_import_pgdata( - params, - ActivateTimelineArgs::Yes { - broker_client: broker_client.clone(), - }, - ctx, - ) - .await? + self.create_timeline_import_pgdata(params, ctx).await? } }; @@ -2759,7 +2737,6 @@ impl TenantShard { async fn create_timeline_import_pgdata( self: &Arc, params: CreateTimelineParamsImportPgdata, - activate: ActivateTimelineArgs, ctx: &RequestContext, ) -> Result { let CreateTimelineParamsImportPgdata { @@ -2840,24 +2817,71 @@ impl TenantShard { let (timeline, timeline_create_guard) = uninit_timeline.finish_creation_myself(); - tokio::spawn(self.clone().create_timeline_import_pgdata_task( + let import_task_handle = tokio::spawn(self.clone().create_timeline_import_pgdata_task( timeline.clone(), index_part, - activate, timeline_create_guard, timeline_ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Warn), )); + let prev = self.timelines_importing.lock().unwrap().insert( + timeline.timeline_id, + ImportingTimeline { + timeline: timeline.clone(), + import_task_handle, + }, + ); + + // Idempotency is enforced higher up the stack + assert!(prev.is_none()); + // NB: the timeline doesn't exist in self.timelines at this point Ok(CreateTimelineResult::ImportSpawned(timeline)) } + /// Finalize the import of a timeline on this shard by marking it complete in + /// the index part. If the import task hasn't finished yet, returns an error. + /// + /// This method is idempotent. If the import was finalized once, the next call + /// will be a no-op. 
+ pub(crate) async fn finalize_importing_timeline( + &self, + timeline_id: TimelineId, + ) -> anyhow::Result<()> { + let timeline = { + let locked = self.timelines_importing.lock().unwrap(); + match locked.get(&timeline_id) { + Some(importing_timeline) => { + if !importing_timeline.import_task_handle.is_finished() { + return Err(anyhow::anyhow!("Import task not done yet")); + } + + importing_timeline.timeline.clone() + } + None => { + return Ok(()); + } + } + }; + + timeline + .remote_client + .schedule_index_upload_for_import_pgdata_finalize()?; + timeline.remote_client.wait_completion().await?; + + self.timelines_importing + .lock() + .unwrap() + .remove(&timeline_id); + + Ok(()) + } + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))] async fn create_timeline_import_pgdata_task( self: Arc, timeline: Arc, index_part: import_pgdata::index_part_format::Root, - activate: ActivateTimelineArgs, timeline_create_guard: TimelineCreateGuard, ctx: RequestContext, ) { @@ -2869,7 +2893,6 @@ impl TenantShard { .create_timeline_import_pgdata_task_impl( timeline, index_part, - activate, timeline_create_guard, ctx, ) @@ -2885,60 +2908,15 @@ impl TenantShard { self: Arc, timeline: Arc, index_part: import_pgdata::index_part_format::Root, - activate: ActivateTimelineArgs, - timeline_create_guard: TimelineCreateGuard, + _timeline_create_guard: TimelineCreateGuard, ctx: RequestContext, ) -> Result<(), anyhow::Error> { info!("importing pgdata"); + let ctx = ctx.with_scope_timeline(&timeline); import_pgdata::doit(&timeline, index_part, &ctx, self.cancel.clone()) .await .context("import")?; - info!("import done"); - - // - // Reload timeline from remote. - // This proves that the remote state is attachable, and it reuses the code. - // - // TODO: think about whether this is safe to do with concurrent TenantShard::shutdown. 
- // timeline_create_guard hols the tenant gate open, so, shutdown cannot _complete_ until we exit. - // But our activate() call might launch new background tasks after TenantShard::shutdown - // already went past shutting down the TenantShard::timelines, which this timeline here is no part of. - // I think the same problem exists with the bootstrap & branch mgmt API tasks (tenant shutting - // down while bootstrapping/branching + activating), but, the race condition is much more likely - // to manifest because of the long runtime of this import task. - - // in theory this shouldn't even .await anything except for coop yield - info!("shutting down timeline"); - timeline.shutdown(ShutdownMode::Hard).await; - info!("timeline shut down, reloading from remote"); - // TODO: we can't do the following check because create_timeline_import_pgdata must return an Arc - // let Some(timeline) = Arc::into_inner(timeline) else { - // anyhow::bail!("implementation error: timeline that we shut down was still referenced from somewhere"); - // }; - let timeline_id = timeline.timeline_id; - - // load from object storage like TenantShard::attach does - let resources = self.build_timeline_resources(timeline_id); - let index_part = resources - .remote_client - .download_index_file(&self.cancel) - .await?; - let index_part = match index_part { - MaybeDeletedIndexPart::Deleted(_) => { - // likely concurrent delete call, cplane should prevent this - anyhow::bail!( - "index part says deleted but we are not done creating yet, this should not happen but" - ) - } - MaybeDeletedIndexPart::IndexPart(p) => p, - }; - let metadata = index_part.metadata.clone(); - self - .load_remote_timeline(timeline_id, index_part, metadata, None, resources, LoadTimelineCause::ImportPgdata{ - create_guard: timeline_create_guard, activate, }, &ctx) - .await? 
- .ready_to_activate() - .context("implementation error: reloaded timeline still needs import after import reported success")?; + info!("import done - waiting for activation"); anyhow::Ok(()) } @@ -3475,6 +3453,14 @@ impl TenantShard { timeline.defuse_for_tenant_drop(); }); } + { + let mut timelines_importing = self.timelines_importing.lock().unwrap(); + timelines_importing + .drain() + .for_each(|(_timeline_id, importing_timeline)| { + importing_timeline.shutdown(); + }); + } // test_long_timeline_create_then_tenant_delete is leaning on this message tracing::info!("Waiting for timelines..."); while let Some(res) = js.join_next().await { @@ -3949,13 +3935,6 @@ where Ok(result) } -enum ActivateTimelineArgs { - Yes { - broker_client: storage_broker::BrokerClientChannel, - }, - No, -} - impl TenantShard { pub fn tenant_specific_overrides(&self) -> pageserver_api::models::TenantConfig { self.tenant_conf.load().tenant_conf.clone() @@ -4322,6 +4301,7 @@ impl TenantShard { timelines: Mutex::new(HashMap::new()), timelines_creating: Mutex::new(HashSet::new()), timelines_offloaded: Mutex::new(HashMap::new()), + timelines_importing: Mutex::new(HashMap::new()), remote_tenant_manifest: Default::default(), gc_cs: tokio::sync::Mutex::new(()), walredo_mgr, diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index ea29f51956..21d68495f7 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -949,6 +949,35 @@ impl RemoteTimelineClient { Ok(()) } + /// If the `import_pgdata` field marks the timeline as having an import in progress, + /// launch an index-file upload operation that transitions it to done in the background + pub(crate) fn schedule_index_upload_for_import_pgdata_finalize( + self: &Arc, + ) -> anyhow::Result<()> { + use import_pgdata::index_part_format; + + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = 
guard.initialized_mut()?; + let to_update = match &upload_queue.dirty.import_pgdata { + Some(import) if !import.is_done() => Some(import), + Some(_) | None => None, + }; + + if let Some(old) = to_update { + let new = + index_part_format::Root::V1(index_part_format::V1::Done(index_part_format::Done { + idempotency_key: old.idempotency_key().clone(), + started_at: *old.started_at(), + finished_at: chrono::Utc::now().naive_utc(), + })); + + upload_queue.dirty.import_pgdata = Some(new); + self.schedule_index_upload(upload_queue); + } + + Ok(()) + } + /// Launch an index-file upload operation in the background, setting `gc_compaction_state` field. pub(crate) fn schedule_index_upload_for_gc_compaction_state_update( self: &Arc, diff --git a/pageserver/src/tenant/timeline/import_pgdata.rs b/pageserver/src/tenant/timeline/import_pgdata.rs index c4a8df39a3..53e15e5395 100644 --- a/pageserver/src/tenant/timeline/import_pgdata.rs +++ b/pageserver/src/tenant/timeline/import_pgdata.rs @@ -3,6 +3,7 @@ use std::sync::Arc; use anyhow::{Context, bail}; use pageserver_api::models::ShardImportStatus; use remote_storage::RemotePath; +use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::info; use utils::lsn::Lsn; @@ -17,6 +18,17 @@ mod importbucket_client; mod importbucket_format; pub(crate) mod index_part_format; +pub(crate) struct ImportingTimeline { + pub import_task_handle: JoinHandle<()>, + pub timeline: Arc, +} + +impl ImportingTimeline { + pub(crate) fn shutdown(self) { + self.import_task_handle.abort(); + } +} + pub async fn doit( timeline: &Arc, index_part: index_part_format::Root, @@ -26,173 +38,161 @@ pub async fn doit( let index_part_format::Root::V1(v1) = index_part; let index_part_format::InProgress { location, - idempotency_key, - started_at, + idempotency_key: _, + started_at: _, } = match v1 { index_part_format::V1::Done(_) => return Ok(()), index_part_format::V1::InProgress(in_progress) => in_progress, }; - let storage = 
importbucket_client::new(timeline.conf, &location, cancel.clone()).await?; + let storcon_client = StorageControllerUpcallClient::new(timeline.conf, &cancel); - let status_prefix = RemotePath::from_string("status").unwrap(); + let shard_status = storcon_client + .get_timeline_import_status(timeline.tenant_shard_id, timeline.timeline_id) + .await + .map_err(|_err| anyhow::anyhow!("Shut down while getting timeline import status"))?; - // - // See if shard is done. - // TODO: incorporate generations into status key for split brain safety. Figure out together with checkpointing. - // - let shard_status_key = - status_prefix.join(format!("shard-{}", timeline.tenant_shard_id.shard_slug())); - let shard_status: Option = - storage.get_json(&shard_status_key).await?; info!(?shard_status, "peeking shard status"); - if shard_status.map(|st| st.done).unwrap_or(false) { - info!("shard status indicates that the shard is done, skipping import"); - } else { - // TODO: checkpoint the progress into the IndexPart instead of restarting - // from the beginning. + match shard_status { + None | Some(ShardImportStatus::InProgress) => { + // TODO: checkpoint the progress into the IndexPart instead of restarting + // from the beginning. - // - // Wipe the slate clean - the flow does not allow resuming. - // We can implement resuming in the future by checkpointing the progress into the IndexPart. - // - info!("wipe the slate clean"); - { - // TODO: do we need to hold GC lock for this? - let mut guard = timeline.layers.write().await; - assert!( - guard.layer_map()?.open_layer.is_none(), - "while importing, there should be no in-memory layer" // this just seems like a good place to assert it - ); - let all_layers_keys = guard.all_persistent_layers(); - let all_layers: Vec<_> = all_layers_keys - .iter() - .map(|key| guard.get_from_key(key)) - .collect(); - let open = guard.open_mut().context("open_mut")?; + // + // Wipe the slate clean - the flow does not allow resuming. 
+ // We can implement resuming in the future by checkpointing the progress into the IndexPart. + // + info!("wipe the slate clean"); + { + // TODO: do we need to hold GC lock for this? + let mut guard = timeline.layers.write().await; + assert!( + guard.layer_map()?.open_layer.is_none(), + "while importing, there should be no in-memory layer" // this just seems like a good place to assert it + ); + let all_layers_keys = guard.all_persistent_layers(); + let all_layers: Vec<_> = all_layers_keys + .iter() + .map(|key| guard.get_from_key(key)) + .collect(); + let open = guard.open_mut().context("open_mut")?; - timeline.remote_client.schedule_gc_update(&all_layers)?; - open.finish_gc_timeline(&all_layers); - } - - // - // Wait for pgdata to finish uploading - // - info!("wait for pgdata to reach status 'done'"); - let pgdata_status_key = status_prefix.join("pgdata"); - loop { - let res = async { - let pgdata_status: Option = storage - .get_json(&pgdata_status_key) - .await - .context("get pgdata status")?; - info!(?pgdata_status, "peeking pgdata status"); - if pgdata_status.map(|st| st.done).unwrap_or(false) { - Ok(()) - } else { - Err(anyhow::anyhow!("pgdata not done yet")) - } + timeline.remote_client.schedule_gc_update(&all_layers)?; + open.finish_gc_timeline(&all_layers); } - .await; - match res { - Ok(_) => break, - Err(err) => { - info!(?err, "indefinitely waiting for pgdata to finish"); - if tokio::time::timeout(std::time::Duration::from_secs(10), cancel.cancelled()) + + // + // Wait for pgdata to finish uploading + // + info!("wait for pgdata to reach status 'done'"); + let storage = + importbucket_client::new(timeline.conf, &location, cancel.clone()).await?; + let status_prefix = RemotePath::from_string("status").unwrap(); + let pgdata_status_key = status_prefix.join("pgdata"); + loop { + let res = async { + let pgdata_status: Option = storage + .get_json(&pgdata_status_key) + .await + .context("get pgdata status")?; + info!(?pgdata_status, "peeking pgdata 
status"); + if pgdata_status.map(|st| st.done).unwrap_or(false) { + Ok(()) + } else { + Err(anyhow::anyhow!("pgdata not done yet")) + } + } + .await; + match res { + Ok(_) => break, + Err(err) => { + info!(?err, "indefinitely waiting for pgdata to finish"); + if tokio::time::timeout( + std::time::Duration::from_secs(10), + cancel.cancelled(), + ) .await .is_ok() - { - bail!("cancelled while waiting for pgdata"); + { + bail!("cancelled while waiting for pgdata"); + } } } } - } - // - // Do the import - // - info!("do the import"); - let control_file = storage.get_control_file().await?; - let base_lsn = control_file.base_lsn(); + // + // Do the import + // + info!("do the import"); + let control_file = storage.get_control_file().await?; + let base_lsn = control_file.base_lsn(); - info!("update TimelineMetadata based on LSNs from control file"); - { - let pg_version = control_file.pg_version(); - let _ctx: &RequestContext = ctx; - async move { - // FIXME: The 'disk_consistent_lsn' should be the LSN at the *end* of the - // checkpoint record, and prev_record_lsn should point to its beginning. - // We should read the real end of the record from the WAL, but here we - // just fake it. - let disk_consistent_lsn = Lsn(base_lsn.0 + 8); - let prev_record_lsn = base_lsn; - let metadata = TimelineMetadata::new( - disk_consistent_lsn, - Some(prev_record_lsn), - None, // no ancestor - Lsn(0), // no ancestor lsn - base_lsn, // latest_gc_cutoff_lsn - base_lsn, // initdb_lsn - pg_version, - ); + info!("update TimelineMetadata based on LSNs from control file"); + { + let pg_version = control_file.pg_version(); + let _ctx: &RequestContext = ctx; + async move { + // FIXME: The 'disk_consistent_lsn' should be the LSN at the *end* of the + // checkpoint record, and prev_record_lsn should point to its beginning. + // We should read the real end of the record from the WAL, but here we + // just fake it. 
+ let disk_consistent_lsn = Lsn(base_lsn.0 + 8); + let prev_record_lsn = base_lsn; + let metadata = TimelineMetadata::new( + disk_consistent_lsn, + Some(prev_record_lsn), + None, // no ancestor + Lsn(0), // no ancestor lsn + base_lsn, // latest_gc_cutoff_lsn + base_lsn, // initdb_lsn + pg_version, + ); - let _start_lsn = disk_consistent_lsn + 1; + let _start_lsn = disk_consistent_lsn + 1; - timeline - .remote_client - .schedule_index_upload_for_full_metadata_update(&metadata)?; + timeline + .remote_client + .schedule_index_upload_for_full_metadata_update(&metadata)?; - timeline.remote_client.wait_completion().await?; + timeline.remote_client.wait_completion().await?; - anyhow::Ok(()) + anyhow::Ok(()) + } } + .await?; + + flow::run(timeline.clone(), control_file, storage.clone(), ctx).await?; + + // Communicate that shard is done. + // Ensure at-least-once delivery of the upcall to storage controller + // before we mark the task as done and never come here again. + // + // Note that we do not mark the import complete in the index part now. + // This happens in [`TenantShard::finalize_importing_timeline`] in response + // to the storage controller calling + // `/v1/tenant/:tenant_shard_id/timeline/:timeline_id/activate_post_import`. + storcon_client + .put_timeline_import_status( + timeline.tenant_shard_id, + timeline.timeline_id, + // TODO(vlad): What about import errors? + ShardImportStatus::Done, + ) + .await + .map_err(|_err| { + anyhow::anyhow!("Shut down while putting timeline import status") + })?; + } + Some(ShardImportStatus::Error(err)) => { + info!( + "shard status indicates that the shard is done (error), skipping import {}", + err + ); + } + Some(ShardImportStatus::Done) => { + info!("shard status indicates that the shard is done (success), skipping import"); } - .await?; - - flow::run(timeline.clone(), control_file, storage.clone(), ctx).await?; - - // - // Communicate that shard is done. 
- // Ensure at-least-once delivery of the upcall to storage controller - // before we mark the task as done and never come here again. - // - let storcon_client = StorageControllerUpcallClient::new(timeline.conf, &cancel); - storcon_client - .put_timeline_import_status( - timeline.tenant_shard_id, - timeline.timeline_id, - // TODO(vlad): What about import errors? - ShardImportStatus::Done, - ) - .await - .map_err(|_err| anyhow::anyhow!("Shut down while putting timeline import status"))?; - - storage - .put_json( - &shard_status_key, - &importbucket_format::ShardStatus { done: true }, - ) - .await - .context("put shard status")?; } - // - // Mark as done in index_part. - // This makes subsequent timeline loads enter the normal load code path - // instead of spawning the import task and calling this here function. - // - info!("mark import as complete in index part"); - timeline - .remote_client - .schedule_index_upload_for_import_pgdata_state_update(Some(index_part_format::Root::V1( - index_part_format::V1::Done(index_part_format::Done { - idempotency_key, - started_at, - finished_at: chrono::Utc::now().naive_utc(), - }), - )))?; - - timeline.remote_client.wait_completion().await?; - Ok(()) } diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs index 34c073365d..5b9c8ec5b5 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs @@ -53,6 +53,7 @@ use tokio_stream::StreamExt; use tracing::{debug, instrument}; use utils::bin_ser::BeSer; use utils::lsn::Lsn; +use utils::pausable_failpoint; use super::Timeline; use super::importbucket_client::{ControlFile, RemoteStorageWrapper}; @@ -79,6 +80,9 @@ pub async fn run( let import_config = &timeline.conf.timeline_import_config; let plan = planner.plan(import_config).await?; + + pausable_failpoint!("import-timeline-pre-execute-pausable"); + plan.execute(timeline, import_config, ctx).await } 
diff --git a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs index e7aa8f6038..34313748b7 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs @@ -190,31 +190,6 @@ impl RemoteStorageWrapper { Ok(Some(res)) } - #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))] - pub async fn put_json(&self, path: &RemotePath, value: &T) -> anyhow::Result<()> - where - T: serde::Serialize, - { - let buf = serde_json::to_vec(value)?; - let bytes = Bytes::from(buf); - utils::backoff::retry( - || async { - let size = bytes.len(); - let bytes = futures::stream::once(futures::future::ready(Ok(bytes.clone()))); - self.storage - .upload_storage_object(bytes, size, path, &self.cancel) - .await - }, - remote_storage::TimeoutOrCancel::caused_by_cancel, - 1, - u32::MAX, - &format!("put json {path}"), - &self.cancel, - ) - .await - .expect("practically infinite retries") - } - #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))] pub async fn get_range( &self, diff --git a/pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs b/pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs index 57c647cc7f..d9f4da4748 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs @@ -5,9 +5,3 @@ pub struct PgdataStatus { pub done: bool, // TODO: remaining fields } - -#[derive(Deserialize, Serialize, Debug, Clone, PartialEq, Eq)] -pub struct ShardStatus { - pub done: bool, - // TODO: remaining fields -} diff --git a/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs b/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs index ea7a41b25f..371fc857dc 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs +++ 
b/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs @@ -64,4 +64,12 @@ impl Root { }, } } + pub fn started_at(&self) -> &chrono::NaiveDateTime { + match self { + Root::V1(v1) => match v1 { + V1::InProgress(in_progress) => &in_progress.started_at, + V1::Done(done) => &done.started_at, + }, + } + } } diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 649113b8ce..8d459cab9c 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -157,6 +157,29 @@ async fn handle_validate(req: Request) -> Result, ApiError> json_response(StatusCode::OK, state.service.validate(validate_req).await?) } +async fn handle_get_timeline_import_status(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::GenerationsApi)?; + + let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + + let state = get_state(&req); + json_response( + StatusCode::OK, + state + .service + .handle_timeline_shard_import_progress(tenant_shard_id, timeline_id) + .await?, + ) +} + async fn handle_put_timeline_import_status(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::GenerationsApi)?; @@ -2008,6 +2031,13 @@ pub fn make_router( .post("/upcall/v1/validate", |r| { named_request_span(r, handle_validate, RequestName("upcall_v1_validate")) }) + .get("/upcall/v1/timeline_import_status", |r| { + named_request_span( + r, + handle_get_timeline_import_status, + RequestName("upcall_v1_timeline_import_status"), + ) + }) .post("/upcall/v1/timeline_import_status", |r| { named_request_span( r, diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index 554ca375f5..817409e112 100644 --- 
a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -1,3 +1,5 @@ +use std::time::Duration; + use pageserver_api::models::detach_ancestor::AncestorDetached; use pageserver_api::models::{ DetachBehavior, LocationConfig, LocationConfigListResponse, LsnLease, PageserverUtilization, @@ -212,6 +214,7 @@ impl PageserverClient { ) } + #[allow(unused)] pub(crate) async fn timeline_detail( &self, tenant_shard_id: TenantShardId, @@ -357,4 +360,20 @@ impl PageserverClient { self.inner.wait_lsn(tenant_shard_id, request).await ) } + + pub(crate) async fn activate_post_import( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + timeline_activate_timeout: Duration, + ) -> Result { + measured_request!( + "activate_post_import", + crate::metrics::Method::Put, + &self.node_id_label, + self.inner + .activate_post_import(tenant_shard_id, timeline_id, timeline_activate_timeout) + .await + ) + } } diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 9ffcf9b9e6..052c0f02eb 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -1666,6 +1666,39 @@ impl Persistence { } } + pub(crate) async fn get_timeline_import( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> DatabaseResult> { + use crate::schema::timeline_imports::dsl; + let persistent_import = self + .with_measured_conn(DatabaseOperation::ListTimelineImports, move |conn| { + Box::pin(async move { + let mut from_db: Vec = dsl::timeline_imports + .filter(dsl::tenant_id.eq(tenant_id.to_string())) + .filter(dsl::timeline_id.eq(timeline_id.to_string())) + .load(conn) + .await?; + + if from_db.len() > 1 { + return Err(DatabaseError::Logical(format!( + "unexpected number of rows ({})", + from_db.len() + ))); + } + + Ok(from_db.pop()) + }) + }) + .await?; + + persistent_import + .map(TimelineImport::from_persistent) + .transpose() + .map_err(|err| 
DatabaseError::Logical(format!("failed to deserialize import: {err}"))) + } + pub(crate) async fn delete_timeline_import( &self, tenant_id: TenantId, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 193050460d..05430733c2 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -35,12 +35,12 @@ use pageserver_api::controller_api::{ }; use pageserver_api::models::{ self, DetachBehavior, LocationConfig, LocationConfigListResponse, LocationConfigMode, LsnLease, - PageserverUtilization, SecondaryProgress, ShardParameters, TenantConfig, + PageserverUtilization, SecondaryProgress, ShardImportStatus, ShardParameters, TenantConfig, TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest, TenantLocationConfigResponse, TenantShardLocation, TenantShardSplitRequest, TenantShardSplitResponse, TenantSorting, TenantTimeTravelRequest, TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineCreateResponseStorcon, - TimelineInfo, TimelineState, TopTenantShardItem, TopTenantShardsRequest, + TimelineInfo, TopTenantShardItem, TopTenantShardsRequest, }; use pageserver_api::shard::{ DEFAULT_STRIPE_SIZE, ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId, @@ -61,6 +61,7 @@ use utils::completion::Barrier; use utils::generation::Generation; use utils::id::{NodeId, TenantId, TimelineId}; use utils::lsn::Lsn; +use utils::shard::ShardIndex; use utils::sync::gate::{Gate, GateGuard}; use utils::{failpoint_support, pausable_failpoint}; @@ -98,7 +99,8 @@ use crate::tenant_shard::{ ScheduleOptimization, ScheduleOptimizationAction, TenantShard, }; use crate::timeline_import::{ - ShardImportStatuses, TimelineImport, TimelineImportState, UpcallClient, + ImportResult, ShardImportStatuses, TimelineImport, TimelineImportFinalizeError, + TimelineImportState, UpcallClient, }; const WAITER_FILL_DRAIN_POLL_TIMEOUT: Duration = Duration::from_millis(500); @@ -3905,6 +3907,38 @@ impl Service { 
}) } + pub(crate) async fn handle_timeline_shard_import_progress( + self: &Arc, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + ) -> Result { + let maybe_import = self + .persistence + .get_timeline_import(tenant_shard_id.tenant_id, timeline_id) + .await?; + + let import = maybe_import.ok_or_else(|| { + ApiError::NotFound( + format!( + "import for {}/{} not found", + tenant_shard_id.tenant_id, timeline_id + ) + .into(), + ) + })?; + + import + .shard_statuses + .0 + .get(&tenant_shard_id.to_index()) + .cloned() + .ok_or_else(|| { + ApiError::NotFound( + format!("shard {} not found", tenant_shard_id.shard_slug()).into(), + ) + }) + } + pub(crate) async fn handle_timeline_shard_import_progress_upcall( self: &Arc, req: PutTimelineImportStatusRequest, @@ -3943,6 +3977,16 @@ impl Service { Ok(()) } + /// Finalize the import of a timeline + /// + /// This method should be called once all shards have reported that the import is complete. + /// Firstly, it polls the post import timeline activation endpoint exposed by the pageserver. + /// Once the timeline is active on all shards, the timeline also gets created on the + /// safekeepers. Finally, notify cplane of the import completion (whether failed or + /// successful), and remove the import from the database and in-memory. + /// + /// If this method gets pre-empted by shut down, it will be called again at start-up (on-going + /// imports are stored in the database). 
#[instrument(skip_all, fields( tenant_id=%import.tenant_id, shard_id=%import.timeline_id, @@ -3950,59 +3994,80 @@ impl Service { async fn finalize_timeline_import( self: &Arc, import: TimelineImport, - ) -> anyhow::Result<()> { + ) -> Result<(), TimelineImportFinalizeError> { tracing::info!("Finalizing timeline import"); pausable_failpoint!("timeline-import-pre-cplane-notification"); - let import_failed = import.completion_error().is_some(); + let tenant_id = import.tenant_id; + let timeline_id = import.timeline_id; - if !import_failed { - loop { - if self.cancel.is_cancelled() { - anyhow::bail!("Shut down requested while finalizing import"); - } - - let active = self.timeline_active_on_all_shards(&import).await?; - - match active { - Some(timeline_info) => { - tracing::info!("Timeline became active on all shards"); - - if self.config.timelines_onto_safekeepers { - // Now that we know the start LSN of this timeline, create it on the - // safekeepers. - self.tenant_timeline_create_safekeepers_until_success( - import.tenant_id, - timeline_info, - ) - .await?; - } - - break; - } - None => { - tracing::info!("Timeline not active on all shards yet"); - - tokio::select! { - _ = self.cancel.cancelled() => { - anyhow::bail!("Shut down requested while finalizing import"); - }, - _ = tokio::time::sleep(Duration::from_secs(5)) => {} - }; - } - } + let import_error = import.completion_error(); + match import_error { + Some(err) => { + self.notify_cplane_and_delete_import(tenant_id, timeline_id, Err(err)) + .await?; + tracing::warn!("Timeline import completed with shard errors"); + Ok(()) } - } + None => match self.activate_timeline_post_import(&import).await { + Ok(timeline_info) => { + tracing::info!("Post import timeline activation complete"); + if self.config.timelines_onto_safekeepers { + // Now that we know the start LSN of this timeline, create it on the + // safekeepers. 
+ self.tenant_timeline_create_safekeepers_until_success( + import.tenant_id, + timeline_info, + ) + .await?; + } + + self.notify_cplane_and_delete_import(tenant_id, timeline_id, Ok(())) + .await?; + + tracing::info!("Timeline import completed successfully"); + Ok(()) + } + Err(TimelineImportFinalizeError::ShuttingDown) => { + // We got pre-empted by shut down and will resume after the restart. + Err(TimelineImportFinalizeError::ShuttingDown) + } + Err(err) => { + // Any finalize error apart from shut down is permanent and requires us to notify + // cplane such that it can clean up. + tracing::error!("Import finalize failed with permanent error: {err}"); + self.notify_cplane_and_delete_import( + tenant_id, + timeline_id, + Err(err.to_string()), + ) + .await?; + Err(err) + } + }, + } + } + + async fn notify_cplane_and_delete_import( + self: &Arc, + tenant_id: TenantId, + timeline_id: TimelineId, + import_result: ImportResult, + ) -> Result<(), TimelineImportFinalizeError> { + let import_failed = import_result.is_err(); tracing::info!(%import_failed, "Notifying cplane of import completion"); let client = UpcallClient::new(self.get_config(), self.cancel.child_token()); - client.notify_import_complete(&import).await?; + client + .notify_import_complete(tenant_id, timeline_id, import_result) + .await + .map_err(|_err| TimelineImportFinalizeError::ShuttingDown)?; if let Err(err) = self .persistence - .delete_timeline_import(import.tenant_id, import.timeline_id) + .delete_timeline_import(tenant_id, timeline_id) .await { tracing::warn!("Failed to delete timeline import entry from database: {err}"); @@ -4012,14 +4077,113 @@ impl Service { .write() .unwrap() .tenants - .range_mut(TenantShardId::tenant_range(import.tenant_id)) + .range_mut(TenantShardId::tenant_range(tenant_id)) .for_each(|(_id, shard)| shard.importing = TimelineImportState::Idle); - tracing::info!(%import_failed, "Timeline import complete"); - Ok(()) } + /// Activate an imported timeline on all shards once 
the import is complete. + /// Returns the [`TimelineInfo`] reported by shard zero. + async fn activate_timeline_post_import( + self: &Arc, + import: &TimelineImport, + ) -> Result { + const TIMELINE_ACTIVATE_TIMEOUT: Duration = Duration::from_millis(128); + + let mut shards_to_activate: HashSet = + import.shard_statuses.0.keys().cloned().collect(); + let mut shard_zero_timeline_info = None; + + while !shards_to_activate.is_empty() { + if self.cancel.is_cancelled() { + return Err(TimelineImportFinalizeError::ShuttingDown); + } + + let targets = { + let locked = self.inner.read().unwrap(); + let mut targets = Vec::new(); + + for (tenant_shard_id, shard) in locked + .tenants + .range(TenantShardId::tenant_range(import.tenant_id)) + { + if !import + .shard_statuses + .0 + .contains_key(&tenant_shard_id.to_index()) + { + return Err(TimelineImportFinalizeError::MismatchedShards( + tenant_shard_id.to_index(), + )); + } + + if let Some(node_id) = shard.intent.get_attached() { + let node = locked + .nodes + .get(node_id) + .expect("Pageservers may not be deleted while referenced"); + targets.push((*tenant_shard_id, node.clone())); + } + } + + targets + }; + + let targeted_tenant_shards: Vec<_> = targets.iter().map(|(tid, _node)| *tid).collect(); + + let results = self + .tenant_for_shards_api( + targets, + |tenant_shard_id, client| async move { + client + .activate_post_import( + tenant_shard_id, + import.timeline_id, + TIMELINE_ACTIVATE_TIMEOUT, + ) + .await + }, + 1, + 1, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await; + + let mut failed = 0; + for (tid, result) in targeted_tenant_shards.iter().zip(results.into_iter()) { + match result { + Ok(ok) => { + if tid.is_shard_zero() { + shard_zero_timeline_info = Some(ok); + } + + shards_to_activate.remove(&tid.to_index()); + } + Err(_err) => { + failed += 1; + } + } + } + + if failed > 0 { + tracing::info!( + "Failed to activate timeline on {failed} shards post import. Will retry" + ); + } + + tokio::select! 
{ + _ = tokio::time::sleep(Duration::from_millis(250)) => {}, + _ = self.cancel.cancelled() => { + return Err(TimelineImportFinalizeError::ShuttingDown); + } + } + } + + Ok(shard_zero_timeline_info.expect("All shards replied")) + } + async fn finalize_timeline_imports(self: &Arc, imports: Vec) { futures::future::join_all( imports @@ -4029,78 +4193,6 @@ impl Service { .await; } - /// If the timeline is active on all shards, returns the [`TimelineInfo`] - /// collected from shard 0. - /// - /// An error is returned if the shard layout has changed during the import. - /// This is guarded against within the storage controller and the pageserver, - /// and, therefore, unexpected. - async fn timeline_active_on_all_shards( - self: &Arc, - import: &TimelineImport, - ) -> anyhow::Result> { - let targets = { - let locked = self.inner.read().unwrap(); - let mut targets = Vec::new(); - - for (tenant_shard_id, shard) in locked - .tenants - .range(TenantShardId::tenant_range(import.tenant_id)) - { - if !import - .shard_statuses - .0 - .contains_key(&tenant_shard_id.to_index()) - { - anyhow::bail!("Shard layout change detected on completion"); - } - - if let Some(node_id) = shard.intent.get_attached() { - let node = locked - .nodes - .get(node_id) - .expect("Pageservers may not be deleted while referenced"); - targets.push((*tenant_shard_id, node.clone())); - } else { - return Ok(None); - } - } - - targets - }; - - if targets.is_empty() { - anyhow::bail!("No shards found to finalize import for"); - } - - let results = self - .tenant_for_shards_api( - targets, - |tenant_shard_id, client| async move { - client - .timeline_detail(tenant_shard_id, import.timeline_id) - .await - }, - 1, - 1, - SHORT_RECONCILE_TIMEOUT, - &self.cancel, - ) - .await; - - let all_active = results.iter().all(|res| match res { - Ok(info) => info.state == TimelineState::Active, - Err(_) => false, - }); - - if all_active { - // Both unwraps are validated above - 
Ok(Some(results.into_iter().next().unwrap().unwrap())) - } else { - Ok(None) - } - } - pub(crate) async fn tenant_timeline_archival_config( &self, tenant_id: TenantId, diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs index 5c15660ba3..cd5ace449d 100644 --- a/storage_controller/src/service/safekeeper_service.rs +++ b/storage_controller/src/service/safekeeper_service.rs @@ -10,6 +10,7 @@ use crate::persistence::{ DatabaseError, SafekeeperTimelineOpKind, TimelinePendingOpPersistence, TimelinePersistence, }; use crate::safekeeper::Safekeeper; +use crate::timeline_import::TimelineImportFinalizeError; use anyhow::Context; use http_utils::error::ApiError; use pageserver_api::controller_api::{ @@ -327,12 +328,12 @@ impl Service { self: &Arc, tenant_id: TenantId, timeline_info: TimelineInfo, - ) -> anyhow::Result<()> { + ) -> Result<(), TimelineImportFinalizeError> { const BACKOFF: Duration = Duration::from_secs(5); loop { if self.cancel.is_cancelled() { - anyhow::bail!("Shut down requested while finalizing import"); + return Err(TimelineImportFinalizeError::ShuttingDown); } let res = self @@ -348,7 +349,7 @@ impl Service { tracing::error!("Failed to create timeline on safekeepers: {err}"); tokio::select! 
{ _ = self.cancel.cancelled() => { - anyhow::bail!("Shut down requested while finalizing import"); + return Err(TimelineImportFinalizeError::ShuttingDown); }, _ = tokio::time::sleep(BACKOFF) => {} }; diff --git a/storage_controller/src/timeline_import.rs b/storage_controller/src/timeline_import.rs index 6dcc538c4b..5d9d633932 100644 --- a/storage_controller/src/timeline_import.rs +++ b/storage_controller/src/timeline_import.rs @@ -46,6 +46,14 @@ pub(crate) enum TimelineImportUpdateFollowUp { None, } +#[derive(thiserror::Error, Debug)] +pub(crate) enum TimelineImportFinalizeError { + #[error("Shut down interrupted import finalize")] + ShuttingDown, + #[error("Mismatched shard detected during import finalize: {0}")] + MismatchedShards(ShardIndex), +} + pub(crate) enum TimelineImportUpdateError { ImportNotFound { tenant_id: TenantId, @@ -151,6 +159,8 @@ impl TimelineImport { } } +pub(crate) type ImportResult = Result<(), String>; + pub(crate) struct UpcallClient { authorization_header: Option, client: reqwest::Client, @@ -198,7 +208,9 @@ impl UpcallClient { /// eventual cplane availability. The cplane API is idempotent. 
pub(crate) async fn notify_import_complete( &self, - import: &TimelineImport, + tenant_id: TenantId, + timeline_id: TimelineId, + import_result: ImportResult, ) -> anyhow::Result<()> { let endpoint = if self.base_url.ends_with('/') { format!("{}import_complete", self.base_url) @@ -206,15 +218,13 @@ impl UpcallClient { format!("{}/import_complete", self.base_url) }; - tracing::info!("Endpoint is {endpoint}"); - let request = self .client .request(Method::PUT, endpoint) .json(&ImportCompleteRequest { - tenant_id: import.tenant_id, - timeline_id: import.timeline_id, - error: import.completion_error(), + tenant_id, + timeline_id, + error: import_result.err(), }) .timeout(IMPORT_COMPLETE_REQUEST_TIMEOUT); diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py index 05e63ad955..0472b92145 100644 --- a/test_runner/regress/test_import_pgdata.py +++ b/test_runner/regress/test_import_pgdata.py @@ -130,9 +130,8 @@ def test_pgdata_import_smoke( elif rel_block_size == RelBlockSize.TWO_STRPES_PER_SHARD: target_relblock_size = (shard_count or 1) * stripe_size * 8192 * 2 elif rel_block_size == RelBlockSize.MULTIPLE_RELATION_SEGMENTS: - # Postgres uses a 1GiB segment size, fixed at compile time, so we must use >2GB of data - # to exercise multiple segments. - target_relblock_size = int(((2.333 * 1024 * 1024 * 1024) // 8192) * 8192) + segment_size = 16 * 1024 * 1024 + target_relblock_size = segment_size * 8 else: raise ValueError @@ -413,6 +412,88 @@ def test_import_completion_on_restart( wait_until(cplane_notified) +@run_only_on_default_postgres(reason="PG version is irrelevant here") +def test_import_respects_tenant_shutdown( + neon_env_builder: NeonEnvBuilder, vanilla_pg: VanillaPostgres, make_httpserver: HTTPServer +): + """ + Validate that importing timelines respect the usual timeline life cycle: + 1. Shut down on tenant shut-down and resumes upon re-attach + 2. 
Deletion on timeline deletion (TODO) + """ + # Set up mock control plane HTTP server to listen for import completions + import_completion_signaled = Event() + + def handler(request: Request) -> Response: + log.info(f"control plane /import_complete request: {request.json}") + import_completion_signaled.set() + return Response(json.dumps({}), status=200) + + cplane_mgmt_api_server = make_httpserver + cplane_mgmt_api_server.expect_request( + "/storage/api/v1/import_complete", method="PUT" + ).respond_with_handler(handler) + + # Plug the cplane mock in + neon_env_builder.control_plane_hooks_api = ( + f"http://{cplane_mgmt_api_server.host}:{cplane_mgmt_api_server.port}/storage/api/v1/" + ) + + # The import will specifiy a local filesystem path mocking remote storage + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) + + vanilla_pg.start() + vanilla_pg.stop() + + env = neon_env_builder.init_configs() + env.start() + + importbucket_path = neon_env_builder.repo_dir / "test_import_completion_bucket" + mock_import_bucket(vanilla_pg, importbucket_path) + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + idempotency = ImportPgdataIdemptencyKey.random() + + # Pause before sending the notification + failpoint_name = "import-timeline-pre-execute-pausable" + env.pageserver.http_client().configure_failpoints((failpoint_name, "pause")) + + env.storage_controller.tenant_create(tenant_id) + env.storage_controller.timeline_create( + tenant_id, + { + "new_timeline_id": str(timeline_id), + "import_pgdata": { + "idempotency_key": str(idempotency), + "location": {"LocalFs": {"path": str(importbucket_path.absolute())}}, + }, + }, + ) + + def hit_failpoint(): + log.info("Checking log for pattern...") + try: + assert env.pageserver.log_contains(f".*at failpoint {failpoint_name}.*") + except Exception: + log.exception("Failed to find pattern in log") + raise + + wait_until(hit_failpoint) + assert not import_completion_signaled.is_set() + + # 
Restart the pageserver while an import job is in progress. + # This clears the failpoint and we expect that the import starts up afresh + # after the restart and eventually completes. + env.pageserver.stop() + env.pageserver.start() + + def cplane_notified(): + assert import_completion_signaled.is_set() + + wait_until(cplane_notified) + + def test_fast_import_with_pageserver_ingest( test_output_dir, vanilla_pg: VanillaPostgres, @@ -520,7 +601,9 @@ def test_fast_import_with_pageserver_ingest( env.neon_cli.mappings_map_branch(import_branch_name, tenant_id, timeline_id) # Run fast_import - fast_import.set_aws_creds(mock_s3_server, {"RUST_LOG": "aws_config=debug,aws_sdk_kms=debug"}) + fast_import.set_aws_creds( + mock_s3_server, {"RUST_LOG": "info,aws_config=debug,aws_sdk_kms=debug"} + ) pg_port = port_distributor.get_port() fast_import.run_pgdata(pg_port=pg_port, s3prefix=f"s3://{bucket}/{key_prefix}") From d47e88e35305da95b4674d6ef48f6422df7d9dab Mon Sep 17 00:00:00 2001 From: Elizabeth Murray <52375559+bizwark@users.noreply.github.com> Date: Wed, 14 May 2025 00:00:59 -0700 Subject: [PATCH 39/65] Update the pgrag version in the compute dockerfile. (#11867) ## Problem The extensions test are hanging because of pgrag. The new version of pgrag contains a fix for the hang. ## Summary of changes --- compute/compute-node.Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index e6e6053554..17e50697db 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1117,8 +1117,8 @@ RUN wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar. mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . 
&& \ echo "#nothing to test here" > neon-test.sh -RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.1.1.tar.gz -O pgrag.tar.gz && \ - echo "087b2ecd11ba307dc968042ef2e9e43dc04d9ba60e8306e882c407bbe1350a50 pgrag.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.1.2.tar.gz -O pgrag.tar.gz && \ + echo "7361654ea24f08cbb9db13c2ee1c0fe008f6114076401bb871619690dafc5225 pgrag.tar.gz" | sha256sum --check && \ mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C . FROM rust-extensions-build-pgrx14 AS pgrag-build From 81fd652151c9dce2d188ff2ba7c0ed2723640efb Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 14 May 2025 16:32:55 +0800 Subject: [PATCH 40/65] fix(pageserver): use better estimation for compaction memory usage (#11904) ## Problem Hopefully resolves `test_gc_feedback` flakiness. ## Summary of changes `accumulated_values` should not exceed 512MB to avoid OOM. Previously we only use number of items, which is not a good estimation. --------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/value.rs | 18 ++++++++++++++++++ pageserver/src/tenant/timeline/compaction.rs | 12 +++++++++--- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/libs/pageserver_api/src/value.rs b/libs/pageserver_api/src/value.rs index 883d903ff3..e9000939c3 100644 --- a/libs/pageserver_api/src/value.rs +++ b/libs/pageserver_api/src/value.rs @@ -36,6 +36,24 @@ impl Value { Value::WalRecord(rec) => rec.will_init(), } } + + #[inline(always)] + pub fn estimated_size(&self) -> usize { + match self { + Value::Image(image) => image.len(), + Value::WalRecord(NeonWalRecord::AuxFile { + content: Some(content), + .. + }) => content.len(), + Value::WalRecord(NeonWalRecord::Postgres { rec, .. 
}) => rec.len(), + Value::WalRecord(NeonWalRecord::ClogSetAborted { xids }) => xids.len() * 4, + Value::WalRecord(NeonWalRecord::ClogSetCommitted { xids, .. }) => xids.len() * 4, + Value::WalRecord(NeonWalRecord::MultixactMembersCreate { members, .. }) => { + members.len() * 8 + } + _ => 8192, /* use image size as the estimation */ + } + } } #[derive(Debug, PartialEq)] diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index e7d39db70d..37c1a8f60c 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -3435,6 +3435,7 @@ impl Timeline { // Step 2: Produce images+deltas. let mut accumulated_values = Vec::new(); + let mut accumulated_values_estimated_size = 0; let mut last_key: Option = None; // Only create image layers when there is no ancestor branches. TODO: create covering image layer @@ -3611,12 +3612,16 @@ impl Timeline { if last_key.is_none() { last_key = Some(key); } + accumulated_values_estimated_size += val.estimated_size(); accumulated_values.push((key, lsn, val)); - if accumulated_values.len() >= 65536 { - // Assume all of them are images, that would be 512MB of data in memory for a single key. + // Accumulated values should never exceed 512MB. + if accumulated_values_estimated_size >= 1024 * 1024 * 512 { return Err(CompactionError::Other(anyhow!( - "too many values for a single key, giving up gc-compaction" + "too many values for a single key: {} for key {}, {} items", + accumulated_values_estimated_size, + key, + accumulated_values.len() ))); } } else { @@ -3651,6 +3656,7 @@ impl Timeline { .map_err(CompactionError::Other)?; accumulated_values.clear(); *last_key = key; + accumulated_values_estimated_size = val.estimated_size(); accumulated_values.push((key, lsn, val)); } } From a8e652d47e3dec7e588b3bb3dddecc20302a0f98 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." 
<4198311+skyzh@users.noreply.github.com> Date: Wed, 14 May 2025 17:25:57 +0800 Subject: [PATCH 41/65] rfc: add bottommost garbage-collection compaction (#8425) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the RFC for bottommost garbage-collection compaction --------- Signed-off-by: Alex Chi Z Co-authored-by: Arpad Müller --- docs/rfcs/043-bottom-most-gc-compaction.md | 194 ++++++++++++++++++ .../01-basic-idea.svg | 135 ++++++++++++ .../03-retain-lsn.svg | 141 +++++++++++++ .../05-btmgc-parent.svg | 187 +++++++++++++++++ .../06-btmgc-child.svg | 184 +++++++++++++++++ .../07-btmgc-analysis-1.svg | 180 ++++++++++++++++ .../08-optimization.svg | 158 ++++++++++++++ .../09-btmgc-analysis-2.svg | 184 +++++++++++++++++ .../10-btmgc-analysis-3.svg | 81 ++++++++ .../11-btmgc-analysis-4.svg | 81 ++++++++ .../12-staircase-test-gc-feedback.png | Bin 0 -> 145516 bytes .../13-job-split.svg | 176 ++++++++++++++++ 12 files changed, 1701 insertions(+) create mode 100644 docs/rfcs/043-bottom-most-gc-compaction.md create mode 100644 docs/rfcs/images/036-bottom-most-gc-compaction/01-basic-idea.svg create mode 100644 docs/rfcs/images/036-bottom-most-gc-compaction/03-retain-lsn.svg create mode 100644 docs/rfcs/images/036-bottom-most-gc-compaction/05-btmgc-parent.svg create mode 100644 docs/rfcs/images/036-bottom-most-gc-compaction/06-btmgc-child.svg create mode 100644 docs/rfcs/images/036-bottom-most-gc-compaction/07-btmgc-analysis-1.svg create mode 100644 docs/rfcs/images/036-bottom-most-gc-compaction/08-optimization.svg create mode 100644 docs/rfcs/images/036-bottom-most-gc-compaction/09-btmgc-analysis-2.svg create mode 100644 docs/rfcs/images/036-bottom-most-gc-compaction/10-btmgc-analysis-3.svg create mode 100644 docs/rfcs/images/036-bottom-most-gc-compaction/11-btmgc-analysis-4.svg create mode 100644 docs/rfcs/images/036-bottom-most-gc-compaction/12-staircase-test-gc-feedback.png create mode 100644 
docs/rfcs/images/036-bottom-most-gc-compaction/13-job-split.svg diff --git a/docs/rfcs/043-bottom-most-gc-compaction.md b/docs/rfcs/043-bottom-most-gc-compaction.md new file mode 100644 index 0000000000..4bba758b31 --- /dev/null +++ b/docs/rfcs/043-bottom-most-gc-compaction.md @@ -0,0 +1,194 @@ +# Bottommost Garbage-Collection Compaction + +## Summary + +The goal of this doc is to propose a way to reliably collect garbage below the GC horizon. This process is called bottom-most garbage-collection compaction, and is part of the broader legacy-enhanced compaction that we plan to implement in the future. + +## Motivation + +The current GC algorithm waits until a key region is covered by image layers before collecting the garbage in that region. Relying on image layer generation to generate covering images is not reliable. There is prior art that feeds information back from the GC algorithm to the image generation process to accelerate garbage collection, but it slows down the system and creates write amplification. + +# Basic Idea + +![](images/036-bottom-most-gc-compaction/01-basic-idea.svg) + +The idea of bottom-most compaction is simple: we rewrite all layers that are below or intersect with the GC horizon to produce a flat level of image layers at the GC horizon and deltas above the GC horizon. In this process, + +- All images and deltas ≤ GC horizon LSN will be dropped. This process collects garbage. +- We produce images for all keys involved in the compaction process at the GC horizon. + +Therefore, it can precisely collect all garbage below the horizon, and reduce the space amplification, e.g., in the staircase pattern (test_gc_feedback). + +![The staircase pattern in test_gc_feedback in the original compaction algorithm. The goal is to collect garbage below the red horizontal line.](images/036-bottom-most-gc-compaction/12-staircase-test-gc-feedback.png) + +The staircase pattern in test_gc_feedback in the original compaction algorithm. 
The goal is to collect garbage below the red horizontal line. + +# Branches + +With branches, the bottom-most compaction should retain a snapshot of the keyspace at the `retain_lsn` so that the child branch can access data at the branch point. This requires some modifications to the basic bottom-most compaction algorithm that we sketched above. + +![](images/036-bottom-most-gc-compaction/03-retain-lsn.svg) + +## Single Timeline w/ Snapshots: handle `retain_lsn` + +First let’s look into the case where we create branches over the main branch but don’t write any data to them (aka “snapshots”). + +The bottom-most compaction algorithm collects all deltas and images of a key and can make decisions on what data to retain. Given that we have a single key’s history as below: + +``` +LSN 0x10 -> A +LSN 0x20 -> append B +retain_lsn: 0x20 +LSN 0x30 -> append C +LSN 0x40 -> append D +retain_lsn: 0x40 +LSN 0x50 -> append E +GC horizon: 0x50 +LSN 0x60 -> append F +``` + +The algorithm will produce: + +``` +LSN 0x20 -> AB +(drop all history below the earliest retain_lsn) +LSN 0x40 -> ABCD +(assume the cost of replaying 2 deltas is higher than storing the full image, we generate an image here) +LSN 0x50 -> append E +(replay one delta is cheap) +LSN 0x60 -> append F +(keep everything as-is above the GC horizon) +``` + +![](images/036-bottom-most-gc-compaction/05-btmgc-parent.svg) + +What happens is that we balance the space taken by each retain_lsn and the cost of replaying deltas during the bottom-most compaction process. This is controlled by a threshold. If `count(deltas) < $threshold`, the deltas will be retained. Otherwise, an image will be generated and the deltas will be dropped. + +In the example above, the `$threshold` is 2. + +## Child Branches with data: pull + partial images + +In the previous section we have shown how bottom-most compaction respects `retain_lsn` so that all data that was readable at branch creation remains readable. 
But branches can have data of their own, and that data can fall out of the branch’s PITR window. So, this section explains how we deal with that. + +We will run the same bottom-most compaction for these branches, to ensure the space amplification on the child branch is reasonable. + +``` +branch_lsn: 0x20 +LSN 0x30 -> append P +LSN 0x40 -> append Q +LSN 0x50 -> append R +GC horizon: 0x50 +LSN 0x60 -> append S +``` + +Note that bottom-most compaction happens on a per-timeline basis. When it processes this key, it only reads the history from LSN 0x30 without a base image. Therefore, on child branches, the bottom-most compaction process will make image creation decisions based on the same `count(deltas) < $threshold` criterion, and if it decides to create an image, the base image will be retrieved from the ancestor branch. + +``` +branch_lsn: 0x20 +LSN 0x50 -> ABPQR +(we pull the image at LSN 0x20 from the ancestor branch to get AB, and then apply appends P, Q and R to the page; we replace the record at 0x50 with an image and drop the deltas) +GC horizon: 0x50 +LSN 0x60 -> append S +``` + +![](images/036-bottom-most-gc-compaction/06-btmgc-child.svg) + +Note that for child branches, we do not create image layers for the images when bottom-most compaction runs. Instead, we drop the 0x30/0x40/0x50 delta records and directly place the image ABPQR@0x50 into the delta layer, which serves as a sparse image layer. For child branches, if we create image layers, we will need to put all keys in the range into the image layer. This causes space bloat and slow compactions. In this proposal, the compaction process will only compact and process keys modified inside the child branch. + +# Result + +Bottom-most compaction ensures all garbage under the GC horizon gets collected right away (compared with “eventually” in the current algorithm). Meanwhile, it generates images at each retain_lsn to ensure branch reads are fast. 
As we make per-key decisions on whether to generate an image or not, the theoretical lower bound of the storage space we need to retain for a branch is lower than before. + +Before: min(sum(logs for each key), sum(image for each key)), for each partition — we always generate image layers on a key range + +After: sum(min(logs for each key, image for each key)) + +# Compaction Trigger + +The bottom-most compaction can be automatically triggered. The goal of the trigger is that it should ensure a constant factor for write amplification. If the user writes 1GB of WAL into the system, we should write 1GB x C of data to S3. The legacy compaction algorithm does not have such a constant factor C. The data we write to S3 is quadratic in the logical size of the database (see [A Theoretical View of Neon Storage](https://www.notion.so/A-Theoretical-View-of-Neon-Storage-8d7ad7555b0c41b2a3597fa780911194?pvs=21)). + +We propose the following compaction trigger that generates a constant write amplification factor. Write amplification >= total writes to S3 / total user writes. We only analyze the write amplification caused by the bottom-most GC-compaction process, ignoring the legacy create image layers amplification. + +Suppose that we have ***X*** bytes of the delta layers above the GC horizon, ***A*** bytes of the delta layers intersecting with the GC horizon, ***B*** bytes of the delta layers below the GC horizon, and ***C*** bytes of the image layers below the GC horizon. + +The legacy GC + compaction loop will always keep ***A*** unchanged, and reduce ***B and C*** when there are image layers covering the key range. This yields 0 write amplification (only file deletions) and extra ***B*** bytes of space. + +![](images/036-bottom-most-gc-compaction/09-btmgc-analysis-2.svg) + +The bottom-most compaction proposed here will split ***A*** into deltas above the GC horizon and below the GC horizon. 
Everything below the GC horizon will be image layers after the compaction (not considering branches). Therefore, this yields ***A+C*** extra write traffic each iteration, plus 0 extra space. + +![](images/036-bottom-most-gc-compaction/07-btmgc-analysis-1.svg) + +Also consider read amplification (below the GC horizon). When a read request reaches the GC horizon, the read amplification will be (A+B+C)/C=1+(A+B)/C. Reducing ***A*** and ***B*** can help reduce the read amplification below the GC horizon. + +The metrics-based trigger will wait until a point where space amplification is not that large and write amplification is not that large before the compaction gets triggered. The trigger is defined as **(A+B)/C ≥ 1 (or some other ratio)**. + +To reason about this trigger, consider the two cases: + +**Data Ingestion** + +The user keeps ingesting data into the database, which indicates that WAL size roughly equals the database logical size. The compaction gets triggered only when the newly-written WAL roughly equals the current bottom-most image size (=X). Therefore, it’s triggered when the database size gets doubled. This is a reasonable amount of work. Write amplification is 2X/X=1 for the X amount of data written. + +![](images/036-bottom-most-gc-compaction/10-btmgc-analysis-3.svg) + +**Updates/Deletion** + +In this case, WAL size will be larger than the database logical size ***D***. The compaction gets triggered for every ***D*** bytes of WAL written. Therefore, for every ***D*** bytes of WAL, we rewrite the bottom-most layer, which produces an extra ***D*** bytes of write amplification. This incurs exactly 2x write amplification (by the write of D), 1.5x write amplification (if we count from the start of the process) and no space amplification. + +![](images/036-bottom-most-gc-compaction/11-btmgc-analysis-4.svg) + +Note that here I try to reason that write amplification is a constant (i.e., the data we write to S3 is proportional to the data the user writes). 
The main problem with the current legacy compaction algorithm is that write amplification is proportional to the database size. + +The next step is to optimize the write amplification above the GC horizon (i.e., change the image creation criteria, top-most compaction, or introduce tiered compaction), to ensure the write amplification of the whole system is a constant factor. + +20GB layers → +20GB layers → delete 20GB, need 40GB temporary space + +# Sub-Compactions + +The gc-compaction algorithm may take a long time and we need to split the job into multiple sub-compaction jobs. + +![](images/036-bottom-most-gc-compaction/13-job-split.svg) + +As in the figure, the auto-trigger schedules a compaction job covering the full keyspace below a specific LSN. In case we cannot finish compacting it in one run in a reasonable amount of time, the algorithm will vertically split it into multiple jobs (in this case, 5). + +Each gc-compaction job will create one level of delta layers and one flat level of image layers for each LSN. Those layers will be automatically split based on size, which means that if the sub-compaction job produces 1GB of deltas, it will produce 4 * 256MB delta layers. Layers that are not fully contained within the sub-compaction job rectangles will be rewritten to only contain the keys outside of the key range. + +# Implementation + +The main implementation of gc-compaction is in `compaction.rs`. + +* `compact_with_gc`: The main loop of gc-compaction. It takes a rectangle range of the layer map and compacts that specific range. It selects layers intersecting with the rectangle, downloads the layers, creates the k-merge iterator to read those layers in the key-lsn order, and decides which keys to keep or insert a reconstructed page. The process is the basic unit of a gc-compaction and is not interruptible. If the process gets preempted by L0 compaction, it has to be restarted from scratch. 
For layers that overlap with the rectangle but are not fully inside it, the main loop will also rewrite them so that the new layer (or two layers if both left and right ends are outside of the rectangle) has the same LSN range as the original one but only contains the keys outside of the compaction range. +* `gc_compaction_split_jobs`: Splits a big gc-compaction job into sub-compactions based on heuristics in the layer map. The function looks at the layer map and splits the compaction job based on the size of the layers so that each compaction job only pulls ~4GB of layer files. +* `generate_key_retention` and `KeyHistoryRetention`: Implement the algorithm described in the "basic idea" and "branch" chapters of this RFC. It takes a vector of history of a key (key-lsn-value) and decides which LSNs of the key to retain. If there are too many deltas between two retain_lsns, it will reconstruct the page and insert an image into the compaction result. Also, we implement `KeyHistoryRetention::verify` to ensure the generated result is not corrupted -- all retain_lsns and all LSNs above the gc-horizon should be accessible. +* `GcCompactionQueue`: the automatic trigger implementation for gc-compaction. `GcCompactionQueue::iteration` is called at the end of the tenant compaction loop. It will then call `trigger_auto_compaction` to decide whether to trigger a gc-compaction job for this tenant. If yes, the compaction job will be added to the compaction queue, and the queue will be slowly drained once there are no other compaction jobs running. gc-compaction has the lowest priority. If a sub-compaction job is not successful or gets preempted by L0 compaction (see limitations for reasons why a compaction job would fail), it will _not_ be retried. +* Changes to `index_part.json`: we added a `last_completed_lsn` field to the index part for the auto-trigger to decide when to trigger a compaction. +* Changes to the read path: when gc-compaction updates the layer map, all reads need to wait.
See `gc_compaction_layer_update_lock` and comments in the code path for more information. + +Gc-compaction can also be scheduled over the HTTP API. Example: + +``` +curl 'localhost:9898/v1/tenant/:tenant_id/timeline/:timeline_id/compact?enhanced_gc_bottom_most_compaction=true&dry_run=true' -X PUT -H "Content-Type: application/json" -d '{"scheduled": true, "compact_key_range": { "start": "000000067F0000A0000002A1CF0100000000", "end": "000000067F0000A0000002A1D70100000000" } }' +``` + +The `dry_run` mode can be specified in the query string so that the compaction will go through all layers to estimate how much space can be saved without writing the compaction result into the layer map. + +The auto-trigger is controlled by the tenant-level flag `gc_compaction_enabled`. If this is set to false, no gc-compaction will be automatically scheduled on this tenant (but manual trigger still works). + +# Next Steps + +There are still some limitations of gc-compaction itself that need to be resolved and tested: + +- gc-compaction is currently only automatically triggered on root branches. We have not tested gc-compaction on child branches in staging. +- gc-compaction will skip aux key regions because of the possible conflict with the assumption of aux file tombstones. +- gc-compaction does not consider keyspaces at retain_lsns and only looks at keys in the layers. This also causes us to give up some sub-compaction jobs because a key might have only part of its history available due to traditional GC removing part of the history. +- We limit gc-compaction to run over shards <= 150GB to avoid gc-compaction taking too much time blocking other compaction jobs. The sub-compaction split algorithm needs to be improved to be able to split vertically and horizontally. Also, we need to move the download layer process out of the compaction loop so that we don't block other compaction jobs for too long. +- The compaction trigger always schedules gc-compaction from the lowest LSN to the gc-horizon.
Currently we do not schedule compaction jobs that only select layers in the middle. Allowing this could potentially reduce the number of layers read/written throughout the process. +- gc-compaction will give up if there are too many layers to rewrite or if there is not enough disk space for the compaction. +- gc-compaction sometimes fails with "no key produced during compaction", which means that all existing keys within the compaction range can be collected; but we don't have a way to write this information back to the layer map -- we cannot generate an empty image layer. +- We limit the maximum size of deltas for a single key to 512MB. If above this size, gc-compaction will give up. This can be resolved by changing `generate_key_retention` to be a stream instead of requiring all of the key history to be collected up front. + +In the future, + +- Top-most compaction: ensure we always have image coverage for the latest data (or near the latest data), so that reads will be fast at the latest LSN. +- Tiered compaction on deltas: ensure reads from any LSN are fast. +- Per-timeline compaction → tenant-wide compaction?
diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/01-basic-idea.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/01-basic-idea.svg new file mode 100644 index 0000000000..7107198c0a --- /dev/null +++ b/docs/rfcs/images/036-bottom-most-gc-compaction/01-basic-idea.svg @@ -0,0 +1,135 @@ + + + + + + 01-basic-idea + + + Layer 1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + GC Horizon + + + + + Images + at earlier LSN + + + + + Deltas + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Deltas + + + + + Images + at GC LSN + + + + + + Deltas above GC Horizon + + + + + + Deltas below GC Horizon + + + + + + Deltas above GC Horizon + + + + + + Deltas and image below GC Horizon gets garbage-collected + + + + + WAL replay of deltas+image below GC Horizon + Reshuffle deltas + + + + + diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/03-retain-lsn.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/03-retain-lsn.svg new file mode 100644 index 0000000000..792db6d69e --- /dev/null +++ b/docs/rfcs/images/036-bottom-most-gc-compaction/03-retain-lsn.svg @@ -0,0 +1,141 @@ + + + + + + + + + + + + 03-retain-lsn + + + Layer 1 + + + + + + + + + + + + + + + + + + GC Horizon + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + retain_lsn 1 + + + + + + + + retain_lsn 2 + + + + + + + + retain_lsn 3 + + + + + + + + retain_lsn 4 + + + + + + + + + Dependent Branch + + + + + retain_lsn 3 + + + + + + + + Branch GC Horizon + + + + + + + + + Partial Image Coverage + + + + + + + + + diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/05-btmgc-parent.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/05-btmgc-parent.svg new file mode 100644 index 0000000000..9593ed969e --- /dev/null +++ b/docs/rfcs/images/036-bottom-most-gc-compaction/05-btmgc-parent.svg @@ -0,0 +1,187 @@ + + + + + + 05-btmgc-parent + + + Layer 1 + + + + + Append C@0x30 + + + + + + + + + + + + + + + + GC Horizon + + + 
+ + + + Append F@0x60 + + + + + + + + + + + + + + + Append E@0x50 + Append D@0x40 + + + + + + + + + + + + + + + A@0x10, Append B@0x20 + + + + + + + + + + + + + + + + 0x20 + + + + + 0x50 + + + + + + + + 0x40 + + + + + + + + GC Horizon + + + + + + + Append F@0x60 + + + + + + + Append E@0x50 + + + + + + + + 0x20 + + + + + 0x50 + + + + + + + + 0x40 + + + + + + + AB@0x20 + + + + + + + + + + + + + + + ABCD@0x40 + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/06-btmgc-child.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/06-btmgc-child.svg new file mode 100644 index 0000000000..b8a93d5b5f --- /dev/null +++ b/docs/rfcs/images/036-bottom-most-gc-compaction/06-btmgc-child.svg @@ -0,0 +1,184 @@ + + + + + + 06-btmgc-child + + + Layer 1 + + + + + + + + + Append P@0x30 + + + + + + + + + + + + + + + + GC Horizon + + + + + + + Append S@0x60 + + + + + + + + + + + + + + + Append R@0x50 + Append Q@0x40 + + + + + + + + + + + + + 0x50 + + + + + + + + 0x20 + + + + + + + AB@0x20 + + + + + + + + + + + + + Ancestor Branch + + + + + + + + + + + + GC Horizon + + + + + + + Append S@0x60 + + + + + 0x50 + + + + + + + + 0x20 + + + + + + + AB + @0x20 + + + + + + + + + + + + + Ancestor Branch + + + + + + + + + + + AB + PQR@0x50 + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/07-btmgc-analysis-1.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/07-btmgc-analysis-1.svg new file mode 100644 index 0000000000..65034226da --- /dev/null +++ b/docs/rfcs/images/036-bottom-most-gc-compaction/07-btmgc-analysis-1.svg @@ -0,0 +1,180 @@ + + + + + + 07-btmgc-analysis-1 + + + Layer 1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + GC Horizon + + + + + Images + at earlier LSN + + + + + Deltas + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Deltas + + + + + Images + at GC LSN + + + + + + Deltas above GC Horizon + + + + + + Deltas below GC 
Horizon + + + + + + Deltas above GC Horizon + + + + + + Deltas and image below GC Horizon gets garbage-collected + + + + + size=A + + + + + + + + + + + + + + + + + + + + + + + + + + + + + size=B + + + + + size=C + + + + + A + + + + + + B + + + + + + diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/08-optimization.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/08-optimization.svg new file mode 100644 index 0000000000..16a17ec56e --- /dev/null +++ b/docs/rfcs/images/036-bottom-most-gc-compaction/08-optimization.svg @@ -0,0 +1,158 @@ + + + + + + 08-optimization + + + Layer 1 + + + + + + + + + + + + + + GC Horizon + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 0x20 + + + + + + + + 0x50 + + + + + 0x60 + + + + + + + + 0x40 + + + + + + + + + + + + + + + + GC Horizon + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 0x20 + + + + + 0x70 + + + + + + + + 0x40 + + + + + + + + + + + + + + + + 0x50 + + + + + diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/09-btmgc-analysis-2.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/09-btmgc-analysis-2.svg new file mode 100644 index 0000000000..243f038c88 --- /dev/null +++ b/docs/rfcs/images/036-bottom-most-gc-compaction/09-btmgc-analysis-2.svg @@ -0,0 +1,184 @@ + + + + + + 09-btmgc-analysis-2 + + + Layer 1 + + + + + C + + + + + + + C + + + + + + + C + + + + + + + A + + + + + + + A + + + + + + + A + + + + + + + A + + + + + + + A + + + + + + + + GC Horizon + + + + + + + + + + + B + + + + + + + B + + + + + + + B + + + + + + + C + + + + + + + C + + + + + + + A + + + + + + + A + + + + + + + A + + + + + + + A + + + + + + + A + + + + + + + + GC Horizon + + + + + + + B + + + + + + + C + + + + + B + + + + + + C + + + + + + diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/10-btmgc-analysis-3.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/10-btmgc-analysis-3.svg new file mode 100644 index 0000000000..1e49ec017b --- /dev/null +++ 
b/docs/rfcs/images/036-bottom-most-gc-compaction/10-btmgc-analysis-3.svg @@ -0,0 +1,81 @@ + + + + + + 10-btmgc-analysis-3 + + + Layer 1 + + + + + 1/5 X + + + + + + + 1/5 X + + + + + + + 1/5 X + + + + + + + 1/5 X + + + + + + + + GC Horizon + + + + + + + X + + + + + + + + + + + + GC Horizon + + + + + + + 2X + + + + + + + 1/5 X + + + + + diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/11-btmgc-analysis-4.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/11-btmgc-analysis-4.svg new file mode 100644 index 0000000000..510d7a0c3e --- /dev/null +++ b/docs/rfcs/images/036-bottom-most-gc-compaction/11-btmgc-analysis-4.svg @@ -0,0 +1,81 @@ + + + + + + 11-btmgc-analysis-4 + + + Layer 1 + + + + + 1/5 D + + + + + + + 1/5 D + + + + + + + 1/5 D + + + + + + + 1/5 D + + + + + + + 1/5 D + + + + + + + + GC Horizon + + + + + + + D + + + + + + + + + + + + GC Horizon + + + + + + + D + + + + + diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/12-staircase-test-gc-feedback.png b/docs/rfcs/images/036-bottom-most-gc-compaction/12-staircase-test-gc-feedback.png new file mode 100644 index 0000000000000000000000000000000000000000..c106f3ee899d46d90c9e1c0b630d3e73075b1f67 GIT binary patch literal 145516 zcmeFZbySqy*D#ELA|NFtl7fgx2`C*RAc}N{h=73P&>bQjf+8S|N_R=ijC2i*gft8x z9m7b>(BB#FUp(vk{<`n)UF-elVJ(YD#1z*Gcg3@W`G#R?x!3 zBhJLbBPh6h3EVLza2>|OBPq3!m)CeAFVCjo0g#jgVflwo zA4q%_m6N!T6T z^#$eEB=DApbjCMdgoF^{D=cp!m*iPr=WQ|Xu01!RLLpgtDqn2vfbB_yt_qsGSHPv!t1qq(l)o4x?{MR{g zSeuE>n>Q(XNX?$ggm)=jJ0yEHJS;4BnZrFu@G8dxF9})cc>(YEvm{};)DIoX@xM2p zTM?oz$TbadX5x2PwSLwqHas*c8Y#+T&I3$*9afL6!+YwGx;{qZVwUM@jk#-6s_>c1BG$0=uyJAJaiMHV zU&1m%JvJh9(f+y452k3XHiBgfsoa3iVk(oeA5j~ErejKt7`H*wgFXx8F)L0b>=VpD zDk`hb8ool)@z{6KsrlQA1$!>~4KIZapPW&|z09;_p8e>mwp_dFcl0g+4iS3{c@-z} z%h$Z{_~(K(9uUm3T{j>gdO(!KCKR0}c^Zq4xFVPN?qfo_q6J%i%u|cs6Lh3Dqe-K+ zcq&;F6Z#7?!70_3xffUb-%#N((Q^!_?m}tF@V>GgzyFzxVYqSHYmM(@BW%nI&)ipK z<@&|XhA;k0MB}2-1Bz=Lb#Lxh5Dk21UK?R048BZBs)r}?_|n>g%P!w-D=yFkU9BM7 
zB-DK&TW~SFMbnAt>l@)Y_IeV>H!MyRQkMu`T>D0Nb?1W0L&}igZpw>Vp_W#Fz8%le!c!V>+|pr#Z`$BQt6OOp9NR>_815t;9gmCI@&FYv?PDc$BAp_(MFyWAG?t+*U zXKQn0OLEWC?)i{kZf?m^S>hMnGIupQm9#HgMg7D4V>0EZZ}Jo@ehj@z{MvUfGBzwW zE7oA;wOZXkPT70M9LFHX>TPQ2FDxbkn&r>R<<-BGm%fE@lh_n zCFJvav#;~F+Vt8qIcPcZG@+W=nlQnM@qV%4`c7;a@o0!+ylvH3P8W?xjj*ansYtWs zrL>7OH2ffSpTC{g?OC^}LG=z)yeHi((_!|N%&X7#RIfB&4BDc7-k*J`SDd1E=RdNt zu}-lvv_2eH9_&RHeJ`3>FRdxOKh-#SR7N(@*=3cXn%2)~ENoo1B{N0jLDR(Pk+vzZ z37wQ2C*4Zh>|)wuUSK{Do6gr>yRpZ&XBT@-iK5Fr)|yFHd_vMvoZ0hecw$=oXG4p( zTjikn_CjNaJbxoU??(cDg0g#KUSq5#MAL5529Ea~5!3tA;@dD&&^N%gkJA%aoGLL?<(UT*vx#ZJS$CIrp1@i1z%q(ywt@9_Pw@UcqPlKsBezQ($ak(-gLgp9Pz={?i;S-r6I&x!BT26^sM(-2=0 z$`JCjQVzM$8qEIq>$8z0Ma8ge{>nF|Iorn8DS9(D@bpuJm$#j|2@V9<%jzgJ4 z+0uvQ=H=@KrPV%d>n-b`^uXprnMjf>=MlfoLo*2C_)X>`ru-NN7 zc-Y^%A1aSL3>Y)GX?&Bn5!T~9e1;lC?m|(s_SN2)Wk^T*0wZS7N=1Zr%(=xHer$A(`b4`+rxLur^IKMiV3UKWjF?26@;9C97m zcxC*-dpO8yX*u3vE*U!>owXRaEy?2aZISR)Vfab#2VXT!HQKrdeBONVqrXOjYR7A5 zhtKjQCo`thPtDPNpM@o?ix77*_@tGj^t~L{{mu^2q;E*e`EQvpts6|Bv0CulrjPOh zo~Fe0E?Y&2pHDMl26o4um1Q(+yPfWl9lae*I}kN)+-y`kwZ&`{Iv#H)8I#vd`Y`!U z`qE=Eo3=cT7nsW>^-)5o*>Ci2<#hs8sFqVp3=xYHON@+)FLqh;AiJk)<5=*tL;Y8l zj?_;tqusO~NI$^=fi+R{rKZeKSzoDk7N5hWRg6GeD53|!^XRM6x>BhUBIYn=|C_;) zIW~80=~wX=S(>x)-AU|zkL=9(W_7*PRva6>?vA+MUX9Q6r)?G7|5bG$Eci9iB1; zUVCn}@P5pLYUBlB!ZT{bz>Kgt1ztRfLKzX;<-y6d_jHvm+jt>t*XPM!FftYn^uvU! zkG@j+THFD^*pzJK*{< z9^r-Sco)Id1@J3#f$=}rk1pKCBlvS49}h3w29NMxW7NPW?&lr&#m)KalOR3}j|hCb z3Vyvl;s1Lyapot2e_s<6fMzshn2nBsn}Mo|gayQb-|QvC+>+nR!3hTkPs&RITsl~~nX!2}*gLvPcuBMW z86g3#akmB7+5QZ1vy)~wP}N|Qhqzd>iSXa$zsoK|!p6oXLQCP%zovt4((JF? 
z+?*r?1Ux-G`8|dCAuiSeg5u)h0(XT3goOCO2tHSDM>jJsK1WxMzkvK3j)JADg^P`o zn+?Q~4F}iE9OCXK&CZTn=s!Py;c4k*^WT*mUH`Q$ut5Rb69GZ~y8{1#4W>%r?n-Fb zcv;#TD%dyxnt?TB#Dztq{*3?sJo)d6|6``Xe`gBa6A}8KQ~$@K|Myf~S4$Uphyz&D zP3FJ+`q$+D`S4#8r37$W{~x6Ii_U-U0z%7>ND2IhX)+`Q9ZIZ#k4!cS>N?;PNEz#)MeWA^;#?d+)5AOlq69qXPuM3+QMBjAoA9NAmduZD8)bkwFL^qd^wa;p7 zp0&)9CMj-4(b9H2*Hi3xvsx4}Q`6*88QolG;!98X8zCAu{juj4Ib=`z0mk#Bd-~=| zf5xF|ysOLaCa%&FY7Q#=hYv1azHoz$Fyu8Q<=^}ALxIpJ(rZ*9C+F5bnEkI6SuRi^ zMyUS&;{QhY6F;GXR52+(H|amy1P)Gw^*X^nCxZq(K6~cRiysq0|275xy@?0y!u$Um zeaZq-K*aSoPkaCQCfF8Ij{Z6Nmq$VY5#3ujEpPm1OMhFc&?^4_SQOa^*B?G;uWp=+ zull=E{9lA4yz*rJ`|WQZ3ilgq3x+06PYnO|&hh`vHw7te*1vfO{|CZBjg*w=68`!e zULe!|kPPgZg%p2#=l@1OfrVNsskVm z|Emrz|F1g0fB1h@9lQ{_ao1<>mFsefb*lbz?~aq-eF9eR73l<BBYF1*654IWG( zW1w07hB7%G7$JM*qx*Y?|3r(QaZz5A4fvK~TT}>N$qIC|%5WboLQQ{`C)yp+R!=!x z$$}NB(ms5l`~aAH?s@$m*8jv_VzVC3ZPg(~Od8z;k$$_X2fhN;aG~X|!)n4ajqZK5 z#afqD9D!e%HVu2dE$|Pl>5gpp_wM%`s)$T!GQzLvZm8Lr&B`v8=UiiD5;yKPP`$eE z7^&;9Eg!=uIuOGk+$maPH#VAkVD~|L*By59J9^AvIPa~^SXqY6&m!Hz?bX5T{_#qS z=I2wj4o~;*ppXOkbiCS*N2jF>(mwE)O|bi;F^8s5m^%`=U)ZyMJoUu+ywK#_^L**0 z)^wn)XP~elO4fb5__*=O?rGY_nT3hzX-A;r@;$Azq$g76D$?%M+p+`wfeuqCle?zU z7}iBuo?Xb+aUA4Ypra#X@^t0|wd`_!YCLvyrrmuy)rdlt(VdYYS*Kb(ZHM!cPA{Ab z&L-6?VIaMjtnSmUsW32g;Z&*VxGwW`;Kr?KSZPyW({&^)^El4kKLLeIIL_|6_G;)f zC{nxu86u5l6&QB@7yJeoN)$w zI+Zd?9xfzTK94D1Io>#HH9al2SE+$xEmSdmA%_r`7?{sCBK0@DEb2wE9YQvH=0te( zSbH_82aBfOMr~Hqo!=k*YSC3Y0@iolc$YVWoG@MSJVGIhMo)|p7|7Xi9LwV99(ka> zkMYV_5;1mhIGGp;(^I){GU%`fl|9qn0DCFKu&xB2gUQExuzS<2C&sEh7y$iRadg_LJEv((!$*sq2|NC`ScZ#tk9qQb`6d2Ner};soSCd8ui_ftl@L)VejZ(O ze6UU;SMFyqI~|W*uu+n5>n7 z;mZ598Mu|aO+%yYl2s#e2FpsmdQ4G5V34oIoA(QIFO^c;E;3n*X*ktDrO>Tq@%r2k zzu!Gvd9Eu1hBa|AN8Mi`$XWQ^wks0{J-TZaw(Wb9_uB8-b&T8^vmWthq}-`A8bfrx zWklY5Y``&QbesbjvYmEgu5Pc^(ft1tAmFp%qLiS2=-C1%_5VPW^2)vKC~dOPNZnFo z#$J1Of6O#3J5FXY-Tz?1@MWyj!sF_$Oy6YL^Fzck)E6bp>bDcL)@*^gGvhfMF!juP ziP9Bb*@dkgR+pS90cOA|lJjH1Q1g5kBYb2lPlORtRe1=kgzEX(Y5ym=;G(?(v6G#) zei>hsg+%{M&&h`Axe#mMN#)wlV*SKzV@R`Q8Fby&bx%;~xK>+T`lo$;%O$JI4*K$; 
z=Ch-X{;$I-b|RBr9J|0mBvY%2wWND5r|A0VFpZ+_RwK`jQ(~DVQ+>A^x=vPx)%(fM zF@d^`5a?lE!$GQDzoruF($uANq3`zJ(4AI;9dxBb%pNm$g-@>xr;&+H|L{ATSXut{ zL0yCZam^rCNf9Q0oI-HK^kO;Hj^7ny;+ck!IfmQA+71BBZ)af*j^pmj-5hay4*BVR zQ4?;%s{QQI{93t^ZmSx@0eJ^jr(~m|S*I9bU|8^6kG>BV?-i^bW1!1(^5i0}1LFA& zy91K_`rDXEkEX36H&frO`@0CnGaGW*1t+A(P$X9hU+s7+@5xwbv0IBx5_xs<{-`nH zWJUH|kU(8_Cf#S!2JHOg^c^e>hB<9Yv4|CiGuI)_pohzh)%~?oI%`mgbzRqCRne3- za#?s?hF5E(6|kuHNcp~LByQi=H?;u9NUwU^XCqZz_Hg981`(?{x%8ff z@G(<$+7zT|#t$>&X|>@{TQ6HstAB?g)FMy1uUyEbozX;*ljJp8X?nU zXg%W&=VA>sACTDKb?tqvjbr)h_h2}c5|f!#hJ&8#RB2rbYXG*zzGh!fw^0DLSJkjp zJEek@KR&8G>)IEcg20+n0)q~V-NNggl>?2MjwhG1DP)CKGCVWI*M5D#0jxj-Ef-|R z3Nc%4y?8V&)S7jF#-D`R?j%I;-IxC_mU zZd=Km%*ng9T%=E?R+p$$j*F5Z+7s+za*aU=)LnsfI9ct{N!Jw&`J+SW`%OG&EXfa# zZ$P&UFrv-As8`Dwkl8ykJp8a$gtbYN`+e4L=RpmtU!vi}T$woHogHsMVr9>~h69!l ztg!u}Y0isXY5k(eZ_1d%l}u60tZ%va>Rhnj)Swcx18{BVh1FNQi`MH_HBsCceAoERRGnDnGI`5ukNd4O#cAwxs|4~CbPRIyo@|QtD87~ed^EX z5e&-!N3z)M+!-wkXOHGJKdYTY!i^ykP0sGwGZBehO-LW4|H1wIrbC;?wS3{qnTY-R zfTOkiev3GnWrvGPZAbgAW5>Qta^gwRCV_4IrH$)F%l)G1GxfEDj_p2+oo{zxvbAQD zHu?Aen!DbU52sd_F*}&?G)2-(z16|Ak_y}do5(S=Ow@&Ml*6`>nC|TBn~zE)4?b2} z1!9;7q;{_e@826v-UxecZIy2Gs5+>+xKQe_{lju$ewNU2{n;C)NsNZ;yNzhqvo-a? 
zB36`1(zdytkvC`XO#S>3BM@*F&0NvW3^D|k)oqmRys68BLpR{0va7!uA#HasX(#X( zDkr-GqJ8{zC5Fx&wEZ}zwqJDS+4x%n&pkwTe-IJNs{x5)-oT`WIz~s9&hjUr$g@NG zOpJe~$mSzhKzQoJv{Y8F_e!RRNXx5%>< zPQ*Z#ojeMTB{te^&yEHIWo@7PtSMF>v-o0vw-7OJ`g3hQy0en*A8Y8HnAq>q$7fx^ zSU2laEI#5tsuOTBujJZ&BRts^AvyNdac0u1X>f4jZOxdPZa)Wwtofs6iC%>GJ)fp| zzt7~%5;|*{{-YCCKVhY|NhThKy9?0*sgkJr>I%ln-xSF6xebKpL6556r6$3;U4kZ$ z4Z7mPjc`_<@oxu5c2;91p8J+ZWUjsNYWTBU-_cLiC)2~0Dg3u;z-$cC9~Ic<+5n^8 z>@+?5O<%1yR%3oY@bpJ81%u!B49|&)j|`$+7)S+U;cvDTyRwF5Bdaz3r)l(=fwPB+ z=f7q2iZENE$Rp<#RNP&4&+X}nk{Z4JQ`Y+P3%dhNsN@Wf$({{!Q8kIRo3hXS755Mf z1PCXXnP~Mdt*0p?jtaKRY2`5VnY{sHy9dYU;?ddhwkQgA9#AO~vGu4Z8df(TL1q^` z;@IA38n|C`PR4Nl*k06Jx(wx7i;-XH+gqRM8%K|BR-XCAw&yF!C_t1192i#BD9gV)qpgj6N%HKCrCr3``SVO;A*#X3qYS|{0R87o=Fu+RNww`!3C zCqm2gaX=^v`8P6H()RBLn*1tXj^xVRsTIb}f}Nk1i*Jn#!)?fw2xTk#hMr0`Rl1Zc zm#4@Z^i_#m6P}!SD+3U|Z`o8hrMtGE?l4gec30m93~eS4IDBv;RE+SzJu`>bS@878 zna4{9Q!|ruQ(-kNt66~_9*dt^9djB>MC)dYz_iuuxK#S*UY~(PZF6PS*94^%w@%JY zyCUxMNTdbLJids`I*hMrQI|b^nS34$NWC^Z3ETze8@fbSC*pJ(ThoF;211Sww(Q=3l`Bb`efJOSg2P#S zx{n{U0MTfd;BTyOcDTC=m$Q0%fAsG7Tb(eZ)O0CX=V8qSJB1V23sJ2Zi&&Y?&9A#^ z0sMhGK}2Mg(Jw^JXD~=#FK& z=_za4UKqW!dkrEw>+I3ypQ$tJKUPXMZNynZR>3%vzGY&&maC=fy7*Q!bvcf^d2#fT zz8B+nGP{n12Xd9|&x} zmcb=4+NPoAX6v)BXGdf}q&mVge6G_Ey9j0pz_3H-R$yRbIIBl}7DUox{Z$2H%W;p4MY)IhnrV6OPe4s=D|)PHHEO$Sw@ zZ=%vy>MgHA+B8^gtzNy;N}6AEE0)_#^ipG`%eqaz-kK%|T$IPd+Q=p0rK(oN4B9?R zOZxB#Dvqd{7Bs)w(@lz|_r1FEVc44InC+SVE>>}%=0*XJOh$Fuxg%T9XR?yMPfGY{ zH-{Vs%7i|0#H=xUjJI_0Yt8VOTnW0yX9tuOi0x7S`8shfiYh%5WTPo3WraC*!B)}S z4K)`8AxCzl$y$vfY-%vz2KqL=4;b}w*g5k>i8nbvF$2PIdN76P`ZiSuL0RN|6)w~A z9ynA4epr@i@X#YBC($}6I2$e#mLF3Nm(viXcBA*k$$I^|(%bEVFEtHLZYcuqAhnF_ zmbVJGA>&94x1t@v=Ao{>iSGDYZ1iDGWc2%x8o57u%3oze6EN@eBHEW~yRHpDMJv*$ zbcX1e8HkpsULtoB%w5a7KSt3st0Fq%X4UpqBO|5$w=E9rK6MmC%< zkpf!nqcx>E5pnE`Hlwhb3m%K=fwtq(k7YMttcP`RaiQQ=aJ`5xRUu`zfjcd|wu`g$ z0lOnC8+aIvvW=(I_Of;jdZi|Sm2cIU0bPKGXAKzdNJyus*l?saS9Vm5L=DX9-k}xb zG&gcA-$V@{9$UI}B^YE20pNhtLx`jA1pIk>9*dKBO)b#tV5xNHFk?nH2Lr|&oN79c zMec064jlaa 
zQA)B3C~n8L0L*uDhdB)UGbW~{_fbgH+31!hr1~xQ0sAeW#KusbkR?Dq(|(j=bMes#z# z^Th~cw9~N@fOrysNgfQqtvKz+viYc}FThhkRBgpwrUEy@2&bFDq=(=D*!B2?_^(Hw zJ3AMuBQR|I8PTdgU9*;dl+tSdKA!p5aJn5|Bj-A8iW+nyj94rCwVe4(rC|+aelmRp z!&8UU{}$(ZRKk&K(C*-4AZmEDF)hpg!xjXeR>)r>a;2o#evOonYE9YXrR2SGWjs3o z+CiWf9qQUx$`N;Xr*6$6qcSRur_AKjMB`jYxoevI=*0^>lkzPW8B2^Ggnn^vzkWOf zY}gXE^*wwuR&sl__}Sb9KNP#*?AlQ?m0rhRfZ9l(ui?jP4ziFq4);sADBLzu8mV&B zZn)h=Kd@><|6oUVprJaJkWy6tnWXis#KSnSjPU6D7k9uidQ+qf}{|#pO_= zghkSo90Mg~v@UtZ!JDWZ?7#D(F~3>1O4?cm1#7Z;b7y46RZ)X-YRs({$}pWdHRu`E{3vB+|1;GaPc}_6U*+8Zb%Cxp1KbG@{o@zz_{*x zBFHN4ktwaB$MzVG)YMk(0*{8XqVj-`G~4+01AzD=JVSfEm}%|11ilKup?}6h zpim#0Ts*0pLWbq^JSLb?bMYLO==R|&#a#RD>Do|}aE>aYS zu8x&wxxZ94UFz2P*PMa()s8n7H)z?>bBG&Xb#7PGLJVdtXZd2Yfs(fBE% z9AsWLN=Wtezjj@O4ud?ce$ZNJrq8)#x~5AxZi96q;VpZ-grz5UB#a`I|J--cva@*et~A=O*eGl;_Xrn z7hSN#(N~TY>>Z_>wfq{CqV!{x%c4Z{@O*4`$E6*m=Z|keT$=|kERFij@tdt zL97F?r<@uDL>okAjirleg}KM(LmEL1p_Jq5@`xX^$GveuC;amCdbUs#+rnJG4^!BE z46C${o7zu_hhHqoA7P~rP5KBW*YOLXDFF`ydx%Bwjf3jvqF^wNo@Jfpt(HqSV&Ue@ za+1YLoFaVVgYKh-T@pi}e9OqNdHXc;3?bJ2#9oPcZM0D}3oyDZ778ryS*rJ&Lw2t5 zL0Ub77E%AR!I-kd$^08dEP2 zYV&CC%a~MTj}t37BGwl;7I@gR0*(sZ2`5Te60w=sPZ7r5#&s#n<*w&1l+dMAn>ieH zDtKe8`jN~_(92l_P0w$QYeFenF>R(e99u}wvsEgx=VIp&VpY#liX5hD zOV6b>=P>SsStVst-|v9^bWr71`{+0V9735pvRnz#S~G=tsl7v4R7Z8Bx2@e7=pBcL z>ja-q=RMbpQ{KG2NZvwkCAHj6m31mc?h%823rnZoyuC}%>KAlw0#;vU4)MMWli{-H zirvRp%g84qnHpsBZJE6rE3)AqtNTTa0T`ue0Wjz8uCv-3W=k2RWMs;U>l+iNoGZwX zW+#fGUpO|>GzHI*TEw|5Tz+*XSE%?{BHKqOy+@_=<_@@rlzzE@ zij!8_2@Mv=T>tHleSZ1ujc)~qE_d%7&F1ZZ$2%RQp~UnpvA-qRu%a#WS)a7N)suy^ z##U{6QWViTtX=d-ID12>ei`ajv=R~W-^D(jslXoX&`Wx5rx(#!b`mdMY@r~7dVRh- z-zrVBf?{1qLxKfZ)?0ZNQ=H$<2{frPC|y>>dZJ4Rv`NQq3Pe!AztTVsDS zFpJ*=D48^Q@NM2kN@Ev7TSG`k1>i6_10)Pi2tbitg zz+JRdvf9vj?l5E9`{MSeYl1(-tywLK*$gdlc8?OYEo? 
zI%>>~fUhOyW~jnLuWhc@`$SrhbMv;d=gouAVT?N?HVG?Fxx7%79M3UfprnE>suW;4 z6A9o{Tey4k$1!Ca;-871(HPP;XXa!hWT;Eif5cSE>q{&@1$YxAybH}XQ%>Yg^YVj4R2VkP{^YAr27ks0`|AEw8g2HZ)>9D7 zykU;^Hw*mmE9+h!jo-XNKZ42(a;rUIr^odws=poTq((2vDse%1G1$f zfCFBtG;}(%2U$$kniMJQ10EgqR9ebN)_&k=0Jhaqu+*obB{pfRKLj&x6Ktv~MH-#9 zV>M5!#Lk%++M)V`xEvPhMMoz(-`vU*tD`K%LpoGLh`ypE^ye`)X1<2=O^DbdZZt2i zW2=9g-BU`0!Q-5E;q=7xuY~BB*DwA|>5yw(X3wOe49%guIeQlzjhak0Gwn$0y_y6s zy++x@`;>y&HtYLgnBPwf%@^Ry`1g67XN5CSSokuok9 z@(10-SC(UwdGXTq;p;p09zUN-S!iZZzX<^t*(c{85fZmq={L7mSD0l&xfsp=;HhP@ z%`DQi4j_{4?nx70&12_f{$e(471+JgAeR(H-&F0C1p~195-N8ruT2+z-!-<7Q?bVO) zMRyw_tec1VjS>{4()`+m0|-SJZ_%T935mJDU;EmNl9O8P)m3c3>V}2g*5`hFdIY}Bmezm>&US9iAfR+s0KwYJfGs3zZqP4)l zs3C3xxQ8xI{;uj*%$rBsm5>_kl@!elXiF39W3H^qi6ddKtXu@-VHo@gkdM_3!Jss( zqoGQ4vQ|ZQzoJxMM?$OoHSSCM?Fp&MQf;w-A63a$(J>Vhw2NNj1w7ZYOxr`OWKm^m^_}jbNJy&%rAcNS|e8 zy|dZh4upkeDeo#`Az6#<59j?OxkH;%#_84eZ;K}4P`+p?=5Km=`UdxdP11aygvxB7 z>SeEF!P@LV zngv6zWTDM|725?vTP2`C!Fd%xmu#AKj>JBc;_*#J*HI!Kn!g`83kl~Y2x0`VSgcAu zdWjTTD02e|$fLPD4cJTiQ5udlj<5#$s6$pt#n4Jg`&8qsnF`Tsu1r~P$9eHGQosDe zr6420TpyXB2y5ye>@RLC7`AEOw;{TUE;$s=TzV5?sUAK`eUk=VBD84OJraMg_UxTT zdv%x!AGaQRCiBoH0|Ivv*gi*C_kX}`VLis$GMbSJaXmPMJ(GLU(3Ozbn0=AK!?<18 zLQ(2?qREM}Pf^MafI9lho_Tr%hRZ#qLMTYtt+&O1#*xkhaG4on&7K+QCBn^OMT53? 
zT30))iU4?Lo1b z!+Tk<0m9L}g8m&H)gmyu*DEi^I`B@|MJ1C&Y33r_jRROFnpY}AqCp6(Jc>lH&j-6i>^O_Ugn7plc&L-G$Wc6IWXIPx*e>3pgp+CE zqf5{okW)_HxCM2ba;Vz}FLs_*h-a$lc2;-hYo`cUr#aMWorog+Tm#z$NnW1M_@bJt z8~3F$l$SMP`{izV@V4fqIojTk;*bjYoGs!$l83s|BP(I6t>n;2=*V0n-sHbk<45Oj zDP-H{wF2O&*A;c=(irQDG}`T<5!Rft!8l<(1(o#Q%^Wi9yj%RZinng3u%#5~ z>6?6Q_mSlU-w`WV!0Cy)`;7b8#P&>6cvk7U`>gj$4Tn;*vY=Tag8rKcw+p;;8SG1BmN9`hH z>%blUS&rs$cYP{E_q{MDH@8x@8;nR?d4J5rq2}l2&V1Xc`w@b#Zra>NDFH_ZP&mpl z$+_L-v6u5ftqN2Ltq18$aXmy+R1se?hMvUv&g+F?b=j>cQ2Vm;oe!nAavv;QG3zvC)E?4I3$Dg?3FZ!F z^9a}E1U^vVPJa{(iH-Cl5-A|yR7AYG{P~X&x1x^yzPqcIE4)MyQl@1W8+$0 z$(=NZ;*zQ;$T6sTIe>iJb>AemHW?GExwovpEV z7AMm5+N;H@PD7L}rKWzw-yjDnihx3=Q<&n>?>He~E5On1*NGviItOEoFS4L75;|55 zu=N53Y*?Bck5kJ)X|rdV@hot(-tNM`@sI(Lgn8$O?x3e@RY}>bopJ;ViqP&IR0_j;AkX^Nj4qaj+rb=wYo|$+;g0tShZbn$EP%!1T z4!#r*N+)>2)twM9vV0?)X+gF~D08hql?ow3+*Z5pJ|MpGwH$g>R8Eu+9PK+U-`+)7 z_xTvKh8<$jIEIC|sLb68Uhgk}m$#jq<9E4K0X_@H+7H#m&T7-U?{8E zx$&SoKozBh%1T%HL~$`Z@ySl{BYG0OKqdtc)G5P6UP06WFF1bAV-|k5a_CDnLr$YF5RQ# zNznW1fBcX%63EB8sLp5_pOhc`OZdTQ@Bt0TjJ}{@>7})D zw;`~;kipSLcRNO*8k$6E{smr&m$o*x@9qwa#6v;bo2&!z?Mr|8V<0yrPa!>Wu)(#! 
z-~}9tK0fu0gh-(GY{exg=5IHDmJhf3-1-OzXna=~nJX0{um6Dhj)ml%g$Wyvk0u`6 z9iVMF@6Kicj#$!rKjbH%E+9WN`BrpxysjCfZaqM^Men+1B+Jod^?Vj5@Xi!V%&#oq ze{>QNcfDm?G5%vM+NsrV8j&foJf3SIBs8rDZI@(ZDS?&QI9H$Do z;MJ_44ed&6>_J?D@6pBVIi9F@6@BHI%+YE#Sc6-yl9?7KmcSMZZ?QwK%oE{@6SCzz zrN`04cLWoCNdF`=UzOsVb)4oj3^!i|M!=F9(TaY^@hGMO=eeVsa_L`^ zlL_Vmw`4*t_)_j2XgV%>1jEa!Uns?iEnG(4Trgaezb0PNo!;8U@c4xCQPJ^E8^sjp z&Yac~5I*iqF5Q zS&WmN73}~CI?q_}?__1*Jy-dN8(?jfRKH7_|5FO;FJ1?{Ex9cTv<7`P;5{#fo8K0qcwFvIJbSm3pB2!%zFmh5s0}ZzMzC~yfLiy$Im~l@(< zxC|=N8w%lbygbhU2gSt}>bO-iR(s>|HS2TL^SNjNprr~{*jy3Pb^gE?eOS4$zgdkq z*pR<-BayeNo1J#c`GRCrvjPb6vdr70IwR~({)!4u&BH)Y_Gj6pv;WANTJ(M?sGrn@)cBdR@AWJBh%2vS;e5ywjA!QS_k-_d4B& zC6MXq!9Ol5xrj~Z^t;gLXzd`5lip~}=w^~veK*ke2 zvfqmIE+|-D$N77yyn48B4slt*qI*QhV{^P@`?>Fyr8?-jLOu%A>%>L);uYZigi+SR ziz4iqr7t0{(}QQCm7=p=*_<3Ktziuj)^8b?xnD3nxdti_z+wYS?H}Tjx)c;s7AJFO z!H}GX3NcWFYaR^`+k}w{d~+hdWm?yH-EKS$0-Z4Jng|@0eNnC2Ud=#aV0)36h@9z_ z=d8DN`SEuG3Yrw7DA`#4?ki8j3|?mWD@aL#v%=waT>$B`!8i3;KARbYkCW}fRcCB< zGahLsz*sT{sgxxdm`A?(-VFLpCxr*ZI0j$chW_s5p;gjT2h9;la4zxrZd*$!$#!|_ z<5uyKhz}tg#f#y0wPvid{A(Aw*RkV=I{G;6!eBv31^n@2h=kg05NPuPzmmdl>^r?4 zE}rS@rxu-d3f>wmO@sJ=2%-eH^`#@?KaL4FixaCF{tWoQ4X_OSP;A|7S}RJu*Wa)r z68P@j=EGqXVb%KZ(tWpeL+cF7C(f}_z$_+>tXQ)Di2~H6_jQ`49GkqmC+?d1tmXF2 zVbs%iei64`(lvY+8%h||z(%-vd3xzrg0@;E==ThVGy?Z;v#O6j4R}Uhaq_^B@kd-h z-cP~|csQumSinlzlO1`I4+33=EVuxF6r3;ek_qdq87S!ChDO4@C}~~U7bZ+RLTUAO zK-=}<3awH|uGd!MMp=@I_T`lPRw^@~F8&zoZqX33-Kq}NZxgo`Cemmbms!58vI0-8 z=H;Y2nMDT1xrB*|W>5`{I{4JZ(@ycG{aN*Mb60N1xbi{y zEQ#23>s@I_im~aBF?N3B>tZtR=d@nMnS!1%x2Feq>)YRU)g)KaU9I+Cg@5XyYI?Ri zbEBjkMOy}l!xEuVkP*fjQ@>)6q z$@A>POcd&TISsUg4P;0jtEPhda25a3IQzO|^S&NyV8qm--Q{m!so1elLR^$+jXKAr z4vxcNoX30&T=L=pq2GHdRY%^dMrS`-Jiw+}%neAQq>uv=$?opcEM-cuOlw=OZtrF? 
zanSdoH;B2?=T61e+1&eu5mJWIlf+k@?{ST_UpSccMV0WUOig~Z(f;bbKGI{sh!lM$ z94j*8rd{6WJpR&;)M0HSil^+=jde{JFe+G?ck3F<-9J3GCTUUf@2<4!s9}I{Mk_TB zb%v2yaONv98{RWea%vt{lk{#?R>J_V3nN(iqca(Buc-?4lx_<|5Yu(_b54);cLQ#Y>6^GGn%Gx%RQY_B_q#6i zG6E9rHCzWlmhdZ;IwPD|Ma~LHm#~TX-wydlD)Sat05)VIFBF)QgG>PN4Ygl;-`+KT zXDE@dI0jx68}CvoXse|fjs{$p85*6jRj3Ez2ch!5Ud$41ugsLRO~qD2^p~M8lq^ie z00Tg##m-FW21_L?9U%I6tjiZ7>@WpDJPLFVXm*{EoZbPPskC`FTo~;?Zx9alV3^rJ z4!!Fu0Tj~a?FruoD=CMHmP-I*3sAv?2fW;tuFPg~S7zK*E7mIo;zTXqN;G#JSLfGLKkT*W+#e`y&rOybxawI-8O=+IcIFUP`?d$I!*Q$X0LCx^oE_#VP{KTNc=Z%8aGb6Ce;9iYa4g&Zf1D>|@0nS)$jV)IC?wg5lszh& ztgN!LLn1pNrOb@%oz<GVTNsAgSK~H zoEH1+zPvbE^b+#{PBhPU;CME<38Ob-MW}6b5`n*QzC~Ji8rkxiEsL)CxHNbB-|Aw* z2#ofyVAmUYWZrTKYr$)yibnfq+ROg33%;#1)^|te?q{iw<>ia!tX}%L1<@w?&z|e+ z0LRDU%~@AHgAhmL1b;_7EE3|J#6*FmiE(64uT0VdYPB*J!z9iVL5FJ|pi9X7MD{U~ z;CJ3C!OTC94&rbg)|C)czb0eUI^CbzevZ- zx(vw-GisZNxVVu88`FhnM}!S6-+f^UStnRCsb6i&YZ8xt>8en^ItJT+XUFkSHt0BE z^!Qiv&E@06=$i}2k=eQ1{^Ruzc1su9SMbX31~S*<4=o&bD^0R5HmA~euAN_L^wwsW ztA|A-pHBKb(RZX|5?Ozv(jk9^LjNb*)>y{G<->dXcPOx*{rgt)*xQfl^LMjv+a5~L z9Y0<{V{%OnfBebVp4Cd%n~)z9_O*X@khgJPhC%b$4#iQ@(6g=M+|ByO^~Z&^{zbw3 zxu5-;rFbUAim^j_hZfSx^2zw*uGCxam3gf2^jtj># zCit;ODf>qjCW1$4j0`uAj(*qrR}QT#V2g!m<@`Os`>6$mf zVW7_(HtH&1pq~F91BFPOBbUR(ck2bORwTRdl%YeBQyEM9Q;s+B6Ttm}>>*lE^<28| zV)5iIi%#@>ki`R4kLV{1?bHcG?Ez1un^qb`g{+!%@17(($CC!c(RaUe6mqA3WAIX3 z-CRU{ll?*ie+Ua4&ufeuSaWEgr*FM(vfSPoDp?^=aDamCkt{AcpXNq zzix5rHT8qf2(oiribTAj^B{L9gEG*Cz|MG=%TlyZO2YSsh}ODzig4u9knw1R^F@2W zj}cnarvd#|U7NgAmvHV=X33NnpQwiOO&e~|pfAfM>1{r6Qntch>OcLz|M83Q%^dup zYRK|A{V}wZQLtTK1k5u4_Qu~4GxeQRPoVgB*_QwR0FEq>iF0;QrCCvrYEz*#N~@Pj zZZPSkCu{<16L1K9%C#j3ta{AdAaxr-WO)^cJkcGDb=RVH6)>_#ZwS}+S7bOf5xIjx z>gysF$gAm;ZQnXnZ&=C%;#~%iA5tg_<$?bnbHGyiX6oi?413zT0+u*aaDDm1y$9TXVUtK)YN$q^nuAQd3{puC?Vh#k2>VbwS~yy6 z|Gqc;&i=dIup$HAyYY#eNDS8s-}SN?XL$dSlt#;J1ieOA(nJ%U7vR0i;@m>D9jipX zmV;mo?jJr4s@)1-490nV6nV#B&HP$@ciy*=lg1FUM+sdS=0R zdMZ{RyWg&yj_z87(|+1Lgvpw31oMSa;-Ma_5&`6g*JMVUm0X}AvdbA{?lWbJ>iY{h 
zoj~|dFpJJ>pSttk855l(LQ+tt!-Q0*LVXh^}R%TNE> zdF(Ie9cfi*eb~C8dc((GhRCVwy@0+T9C&tg!n$rM=x|VyA))+lleb+C0c}f0GXXTD&RCANZh%)4%1Q zz3%>>gZ2$iS3e*+-dV?1tthch7jmzlD`R|rZo0N?c;4_iPOR}?z1FwVvP6QKOJ)Qr)_5AZr?C=U8G#)AuL; zGImWyPO|RUvC1Eqdg@HaHNsmo&BsGO|&| zwz*la$=7!r!1{h3a;yxD?IJ{7pXRXWX%&b$dZ=#*YUg+Z&YJ>(*9b_bU$ogLr69!U8bDG(%3A<5-??5T z3JGTWvgZqZc`?Zl6Ly0#RX67VPAKzB)-Rx)rl|V>6wNlO#O~J-7w+Up-|yv^7Nbpu zI`Wtd{J4#E%lvNJ8v$7X{d;b=ke`sDFb$OowqsOcZKBHWA0fGU>6dd6j2?oKj;``= zork--O^)}61jYfwXs!X-=^W(u;!uIh^q+uut_}pUFK!zT`!%7&pcPr zU#L6A9&^asTGu23mUInky*>Q$J7^#8;6>?T%W zOAVQ|t9`ZJ9;HLZ-b^nCIw}QmZ+(WY8_~|p6LUy8+#aIz_g;(ndEGu6J-2fpd%y|A z1~VA{^5^^!gN=2nuirbhlO%(QYA>kUUuj;3t|pW&`43vNG} z#v$-4BL2JeuCQi3e1+4YZ?r;E>A??Z;n93qz14m@8FpKz2)RlmquH;RXq;E?_QWQJ z?BibD(l>0~TpZ{UgytG;R9D_VV$uXvYu)j^2al8oYXDF1t9jGe%^s=S_7ke^QPK|G zF#~!rV$(a2>aY6RLgu_W?VRDeov&kcD5m0~*&-A~>KbQ}bl`GGG3~MW>l@Y^aJ z+0zp99;zr=jG;hd!lyTAxa%?Q=4S`YjzK>bF_XM?2n85N_Iju)R5x95r$y-6q?b3V z&6-1!5v6fVGYoYjw2Rp0mj?m}7{V|2fVk~WPUo&$%hieb_84}@T`cr~`DDhDJH^5| zO>fi2x?wd&1ro<`0GvUWEI#o~<(|IUH0o_F>j<32_m7IRqt)$LhB1|#O_8p0zU&q( zx!~RXMghKQJGrBK0($PtH*26eZyZ|`>UwQ{Q#m>eV$E^wE?$9Wbt^5cuCBLbTy>H_{Spy28>&CXdQY>sibLH%o?O>z>DOJ?p4nw7)b5~>ql8aL zD&&vat3pX8NrAAIrYd{)0Qh-OceHI3vw{?n(6XCP^@_3&(|z>q zNGe6!XSm9-2uWL1A2G25M<7C{C}VsVKnRPx#K{Uwt5N69K{ z{7J&A!E3clq4{Ibae?i|5+9H)xM57)XA90J6>0#ISHImzFx!Gpd${t9wYA=I8BS90 znm3^CdsWozroc7Wo)!VdEabOFb}0W8nl|COI1)BSpP#xKlC3Z_v%}BR=Mm-q>;Z|N z^vQHv2?7~lJfMM@6{9|U{0n-Q@e8S=5!u*+zFguHyc+vUQ9LA6zBM0JQ-Vy{yb z*k$jY_E8?>IQ-ZxE_pURgv|10@OW#v7<*omOt(*`hXNx>BIE!)i zNnN!xekny>GG{*2*nL+pu6s5y@Pa#?*@a2jZPyd)c!CYAt-XjZFq)2Or!{M4yxaK`G)qi-_J;5)WwRy!<4!xHEsEp$^c?R7uIIOX2NfElP{%sdG@NCuW z@G>~(b6mvP*on4Ml9~TR{6tn4mF{o$oVOc`z@2uA8a*_L9PzfT$*wN%5)BpA;hK z89Zq<_J-(h8HN#5CK}x3*$(sVqpEX?6^%*n^nr*UziZLWD%qGTO0cklAzZ>K9@Q(d zIo!r*7tQXp)6bX-VED}Mz~?e|v#T!4?qsff3jy8A+1zmXL}K@Z;&vNSupUx7*Mrfm zD*^WqHdl;`9fB1Ov__k=??yJNkb+F0Yeli%(jG{C2nZ%;5$|k{_(>v=9{~+@2OFwz1vW* z@b@kyAE)3`2zTBA0m_P>cDw$yc6py`PVZO<#qT%eU)Z4bG>0FqDJ0Fi1Vp`_pwewn 
z{s2*upMAz6b)Sv#Y0kmSLx;%>nmH@;#~9H$x%~?pcfo$E$(uSp7fn(_uFOkgQf*<9l%Oa9OVo!sSCbgmE;*z69Hc0e}_>l+dW?8ON}LGZXF@57e|Be`goydZ=((+tFj{Koe&vm<~Z7irg=c2AHCl=~g57r@$@ zXEWaF{hfUkSEn&r>?O5p3&WgiLhFJ#Ag}8FjqU2(pAf%+tpDE|PmzclC(r9#Uk0b} zc-IvCW#f}AMTqo2ciXhk6mcUT#%CABXxrh{>t4vE8~ygIwagrWHy4g{rAJalb4*o0 z?m>(^&ptGu=j7y~+=ovR*v}+~x^(6E;59^jv?SKD6jvW!2`7rwE%cAx|fa@T>vrje2 zq5%V8*W%}EI?pBhC5MpfCJwmalKw{F`=pRC-l-r;8pw|8&bkmbJ=#cA<1$76c2MYt zFHNPiP5Zj^Obh)oj>k*AHsP`8Bv6p)23dIvP7u@fHday||&RXIhWV3!+2w0@|PKY;E zDYgkCiM}IdIgj^O0q3BQ0j1MlSuvHFpWQ6a`ln$*g_m*4EUo*oBnS)-PvQADKGy|Y z@2h<@t$RN{Ho{`QPu?j-md>5_xD8)XSHi>{ccCeIO1SD~5{L{j+6=oJ%NLUD?yZWy zhuX~%r86@Ly}U47t72gxWh~^GTZ)LJgh?o?TjO>A<$qQN@jFXCJCwHiANT%ZdtBlm zehy@$;{+evm<4$E&rF)BZMjE%?<)&n}6wxe$ zwZ*XPL3_SkDU~Z_(Ry0n*cezAzJ8vpEOdG5`1_v&XIJN*nyy3U7W)JvhA^_$a6}?! z=5e>TtWp!Wm7}}G7kYoYtp(rX_^?~O*+eDxVKjAZ06CpiJt4$!x|dKE$V8dt2kn3D z+4`8N2ms4y0hLT5y@=vHVu{sliU7JQT0OB{k34&`XGb8z~?xw98=iD;Qpo@5j0b=ec%+>m+~=V$ozPp0-wRI zLgoW2o(bmp8Kq)4I6ejyPp5vb2D2t-!@0TKl+E2IH>FBAXkXCY)B1@s9dp_B?daQ2 z{j(4>J4_F|#DHEjfNDw=XVCtP)h$DhCtj3(Bq{{Nt>L!^#T~K-hSo-<;(V@o-!)CU z%C0b$`_e7N8d$2p@F{s&9&x`X|+mSpYtI z(bu{z;(_>VbqoqFtCCBKHJIUP4#x)nKK2o~_X(A5}m+ z2gNUp_7uMAztyv6@_69wv>1$wvs)x$F%0C(Yv-NgPRe8;V5*WU&$hhf#H<2|lm!$A z-Iy~5?XQ5cttHx=lS+lBG`j3(VV{4m!X-l6#}!#D;dF&iqAHg@cMk^K5Ix1ou}d0V zMh)6v$cM`%i(`ai5Gfxb zu+hjyJou1(4a12;xc6_}Zy29Ca4wN+^9b zb>EhB#`_3!nuRiQ}Q@;g5q z`C=B(O1kc2w9T&)))jFQ&-T}_S{>6hGDPoq^vO3wmlQ^vOe=ZEte&xvq0M+c2tE6S ztB$A|J%545$6uv$wP%-RQkw@BUK4jdwdG%>9eBoiq4<-snns2*BBK?U7UYfWf;`e2 zLVzYu7{33jJr3SKXNBcsfko*&u%;Tnc0lI*Rwe9UvLX@0kpg|mdRxF}!D#PfPQTgb z`;5Ly3EewheaXu-aJIzdD!Q$WYnvPKb2#H?qfAVp7IY@yijQBbx0 zAMVRMcsR`=UcWNe&f7PNI%}pHq`pfnl4O_W!4q^IbWx0reZs4Vb5YR2<@1BsSR7jo z7X`)Cfz(Mxs?}>iV&!_m!V>T49A76)vVehK7f&EqFvtw$U}f&Od~DTZZRDhm$-F@F zy?B^jn&%BGSwzZa?Q{;_H1RItJ}DG zTr^C-bic-oEq}M`5s;@&(0y+KRvuVN(BH4fIE$($0RzTo<{UEP0l4z_&{p{tr6XGvaz<}=lI7z#uv@Mj$)wSbI0SHcX zULTGA0nGxDU#I8bp=RE?rK+dYBXi9>sNn29dI)o^e*gEngX@}kA!Hkm4o_J9{L{lF 
zeg;A6LROjsEqH&P$s&BT8DFVeEl9M42z#i+KJAA8Vn>aBnF(5@3<$g zo$o(iN7@XupMy#l)&S26p?9CCw=k3-a0B@X>O~2NtJs9!WTUz=G`MLv7QRx6Ee}f3 zpZ&_BwrvuYX{C6_9r&LctNl{+PdSsW@5HDt8Q@W&LDI#(_O{2VrgsLEOf99xepnED zors!%Gb_(z`QLBrmQ<8|mVBi29`)x&kC5hl`-P~QK(Pg4t%sb7hAQ?TMMDrXkJGH5 zn3P**yy({K)+e#He}IBV^zeUF1W&FG(h4XE9`4ltt4oV)1WlEEiKW_kFi{9ECy}$P z%W^UQnhc4G{*V+>aBZ`o`ht}P*7^%K{!wa#&w3_9V-~ZgktX$Gib8>+CKh;en-qXP zdy}j=cPEL~JtK1kR{1R3C;q8n^16&E5@Dc8KJd8E$zs^^tzR`m&ezV9Fk36hsgVkL zRV-tQvu8V2Qc2HT{Ib2sJ;j7k7mSc|PskpAUb{WgmF_w93@deX`k#uEt?=uwe$K>i z&*~RBU12EOiFgl$A*A65Qdw9mK%lCZI(8{|Qn41XwB9Yz$h?+7%>b0^}Wy+^j+aDKF+`{GeByM*_tw| z4X50JFBMs>(z>N~Ucz5}0YpJa%{HPwCd|ge=;YyqD=^TVNWxD3N}`4q;=aJ3zp5Kl zUvumSS~+j=lf!oM*8&Sur_h>6cJzpyPm=b@rWYCH*#S zHuunD)c#V;ZVO1uu~5wcAyvBfpM~M2s*FWo`vX!v~qkS_G^378K4hZ74Q zDek&3T4cudJIHnf-jmTLzK>Xu&6w3rYi6!A4$FrpM^tusbh}GA=pkIQs>r8blMZqT_fXcWGN)*%&{(UEH(ETgO<~e!Te80N#?nmP zZYd6+oDhh3(CVJxc|vOr_KUCkmtheF{kgOxJ5E&G`B*6XImSq9Frgc+f#QU{N%eqe zc6)jBWXp)}H+7ybv~#RpMqPM6)5h#r29*88LVqaeFTIK8A9rddD~BpB7HfH&Gm%I{ z!Ftd0XK5Hq`xJqa`Y7ayq)8t;O zMe0+bOkIz7Zux#l`VN@B^=R1x@qD@PKjKu-BK|zC>R&f+>!j`u1Ziof$%ZA>|DjSF zg^P+n&S;ofs-=RkS*sV{z1N3bhOquYDiLT=0$b1sW#FR7!SAjv^3iT=W+sd?x+pe- z67{5MiYKqjdNf{BZXu<7SXsp${5I`*6qob4=WrIjOXhtBW5OoPH8q@54(f^r4E&u) zTj5~Ur$uJQ+m8hB9Kx7XF(MQZ%qC1n&1;HwdS^jZ|3>F-c}p>&^IUJlh0kArf!umC zJL0k{pF(WOqpz0avxMA1*+kRoLf=OBx&YWmWW;ATXgBU8BWQkNCPkAB6AA)0=|7dt zaUG{lBth3%xlZ7fz@(@v{lHL{_ZD)tU;sE_>*JdTz~5oVFj*&MJ;~9Jjje0| z2Iq_$68hM+(%lx^zxlZFl+-3wP||lO2&cc>U5B<>54~1-34Paqj7kAHAsZKQioM-T zNtO|hT{$AJsq=yYa~Y^RmGxirlw+b#B*ieYK5`)Gl_bMb0o=>%8QVw2)@le+0`h=CFaGnT9j{(eXf}&E|Os(t6MHU=i~t`by{G z!`6hwy~u(cgron;-~7(hf?f$ptId<4$5Jm!8#;`GoE{OXge3r>?q8D4M&Mta2?PWA z;DI0z5)1W%R>Be)?|`cDyn*$gsp@H~_2_AKpGgppx&Q-Yw*OE!{zUg_(Od&e2t!$c zVLNH)vE}OxR&wVw(2qYH?M}z|_C0+6!8(+JJzcl7V$3BXhjs54C*l`uyZhtZ8DWb3 z)Ot#ugn^`~l4q;`QILwRx#~&CiFDhbtUMlihUmeQLua9iT=6Mg15w>T;8r~lLx&>7 z@O>fj3k|aoXXss}gyUbMoiH77x$D&0IV&W-{lmT{v1UEkrCMlQy9B+>r{<=lUS4CL 
zkmFTh_buc0=4={kyOX7Uvg9X}m_3MYJ_#kia8cypVf{1=efkLRwuHq(KLjdHWrJ-K zDs!zh?8m}LEHniwj4;*Ye@V1Lqmg}f0e7c^sQ4oyeYy1f>Iuul!VyZZkx6eoy-T|iX!ojoEGiAzyc&ul=efa;cZTTIxZ zc{>KC4Ro5rBJ$u`OFvKeAJ_8!h=SKTRFs-wp%1IIn{f4#*l0hJfBgH1X^4jXHKFpj zlnN3e%P}(G_p7&3nc>E-K5io7#efePW$3^{=sat92o%Y;mrEWa@_fTQCHZiZj`iLu z*ZJ3o*lc~cAR_!1_(#yCUe7k z{EZN@Y-?B{4-ggO$^PXp!M$=%|NYX=X;9~1-vP8xNrV<^7)svhnJAI)5q7E(&-5@N zp>&k2f`A!`SC#c)cO99+i$023+Jjl({UWVMR}`_(kjJR*4Co~XqO7Qx0;I$BNtgrZ z*`P5rtUzvOp&sU0{$6&gUw3*TtI`kAIs&Q5I6=}K3w}Zf8U}(6rk&)uLa@33h&aHx zNj==amzSA>?5CzhH0M#X%)oMAp62r^@vqpFMeRQj?aD`|+p-N_k>>@&U4#}>XEqG& zkW;6hN{y9FSZY=ExoV*CvZq`-TX`>c_iUuz4v<;OkY3q^8sHDzB&~PhnC%S61bxd^ ziP6j9Q!@1g)|-#y4-sMLNAxdCw$roiQG*iW^62?#Ln)OI$WuP4x0O;tpOGcQE~*8q z5|Ct!fZPx(?`^|&-(moJq^=!CTuYmyZlxNi?kE zI1wC^y(J)aMHoi4?J#YAxbsQvMoVbasAx`+1Z% zVQ!(ODYy5g2HKDBb1rd_?%2GB{uM)=uoW_uFd{bjJrCqAbu0~+2A|xI1rJ?LOdHd@ zR7jlua}Tfi4r8opk{LzoQV+(~%tLP562~c}Hn}O7bH_tFd2_(jBPR`0cdlPY{%;-_WIs~&9QPE#FRuix zSjwDe%FQRY-sxS}(&>eeWIGw*ok*4WANyTTlltWNYWtJJ_#NQny=@11Vr&Q*bI8}= z&gPHzsvohW@Mri?*PH$E!Z4W#&wGsW(|BPU@pe$abx7;3HVB<1y z>o)#9)ut;rKWvmjNS{NOgGI_X5z1Z)@wcKqudc07F z5mVEQtQxJMkN4c}FX{!@yf}Y*yXyME`drU>tqJ-&toivCp%HOReVM9=iR*8;lcN}g%`%EN@;YCUgIl_6iU#krxczJHUBCBlDhqmc9BZ+1MvrqVz`I?K_xl zs%XQ9@!>YyHxm2%0w#yqI%3ZTdOyRn=ekGADy28-POU7`DAj--uJ$P0<{&ynoFfOj zp_u2O=@N2J?X}3sMPqrzdlrfwf3l5{@effzhQ7^W`1IId6L8nOqpwKEb9mngX|c4lpdRLP!h z2W?`Q4tcOn*SlNq#~zHZNxHnbc)#~bOg(nrw&SI6h6kI}d3`QVltmlq76lzTJ+998 z;4zwxuZLQu`ob!iT9Exjo$US5(+E?s%zjkAdJdf-49uEBSr#`RXeL86DZ|+B5A><6 zx6XYHuCzrNqK886f5slb;v+mX=v-oakcqWLb!YRWIgSB)Y8@JRJ|s~SLYlWazIrEv z1PG0c33vPs*V;4f*%1QccgS#;6`vJ-&$%($tlfSsg8l{k_rPRht_QI#rEh6wd;%wj z*E21*fZOOMwO(vR4OE1k1Hw;7XE*PVlH+%2Pm z7}Ks)$x`nK!V#@Xr7eP_v>+<=BJ#}C(07y~TxM4c(KcFBN-u~o6Ld1-qB$Ro{+i#p zYR$};Ny8)?413opI4dd#$<>7n~rKw%T<r7yq88$j4fc9=}G(2z>*y=-N&WD&Cf6rzpvWqT!haJ0KN zz35-lOw4dWFoHq2JP)tkGkR*i37{L^DwkQ}wz0s3X#V2HxzK7_^%7wcD;Qv(C-8v`O|pG`m6ZxPDNJg2_DQ)#h|+Iq4RH|cdFUYyxd?l7$v&J_ 
z7mQkBOKa=GU8AGh(giQ$c{LY(58;JyBP3fUe75^bu;T^{$l3_gKTz>~3OR9XX7dCG z&9Ya(JfGNbDwznABubfsP}9Ih8$RKnC?X3?5eDr21LfLL0SY23c$u7mR-&-U$BdX4 zX-G2$q^k|fVS3qvK4RKaK7-jR6bmDt-jn;nTfqLwJlDNGj~3=jJ9R4asojMPU!b!F z=*YC;_%TY_^x6E#@RxhNjuYErmz3gsFiuWA!aTRl!ea7{<7{K;<)N(+p8tAUQo{6m z`^bK{`BpICh=tpeo}X$?4l}l{$VXM~^i$E}Rk`6q@wl{3os@}A=r%DWtJ(Xk2leEY zh__Gy>mbPud+fP|?>Qtg0V|i@-!3E{*~ZoLJ|pq)%{{G+#+_V|z%!JMrG|@aTu;tY zb=e>>#1*Sf18}?4-|v*&&(hUarXsR6Be=qzDF@an{-0oi3@VvXF2PSsthy(L)T3}W z<9Lr<;VuP14t>LxS;#!!RIo_(QjIjbv&sgG+fdV=yNM%jrode9#trDnL~xl^?hql( zau%=9;#_CMLt7t=ae)?v3^*;sGJ)sV1+$Xk$eff?PZ2)>lh**L)}n}7lagVtjg}U2 zOVW7)I<&sZ^2+BSl7ziT7rr==BDPJY1Z=E2>1=&OF@R*(;D;GeIV_7`Z~j4HSIh+V z`3$3KokwPWJ~a@{iP&REym0T%Tz4|f2Ip-ql~~HyP@&c{=cV0#^=7>@4)ULk`r() z+M)*0NbK#9S7!?zZ!JDebrtz}h^t36zNE)Fzw{Mx0QiOqB1T__(?U@wxmbHV&~?1R z5m{wIjpFb$M6*b~K0byu+{<0GX4d0112+E)AliD*XNh}nFa0BNgE623&Murfe6;pk zizcK;TD`pJaQ@{C2)lk`D>KtrhV$X!@P&@p{%lwn#6|fPeReJ~Q_!q>^tV2)2@KigY>-c!HM$7@Sh4UdQ@ z{aWX_Nxx(vBh-=6uJKY2YAh6a0YwVWA8Y2uQt8cu613sAYXY)LK6~qX8Z^tz)&$hX zro{n4f}c8cQcnJQoPBu*3_R?*#BNo@Qxaq@K)mKIcile*15lfrPzm|Jv($UmYDJ=EJRw(z#)vPlg0_ z%96A=ENb^dnG%$uI!`r7CCqIz(b{#sIG?A4v!%5t-EB9kWt87~i9K8zKB^{UW+(wx z7YV7mamH{yxz2SjNT+?GNO1wO_yy?|ze%8C6N{w4QJz)pUx(EpY#`e*5rFA!+~fUa zk>B$shCl1kW7r4%vqaqDueqevN}ILL6`0ifJzlE2GK-t@BkC;CDP}FX(pzkl!k>Y^ z@7q^KAwZF9DM8JV@3|c^bQ@!rJ;g+a?(TRFc3q3k_isqLHE_dftkPLDp2>CQ%h{Am zj)PV1&qQ3)l0Du^Gi&1{j>vFmJtP+AlTpyZg|I~+{cM$Z_fisdBYYX0|BFT@;_WZp zwa9QAvmM=vjLGER&$}vat_sdLDxDlBJsIy?r_wLaQsM-^GfGluC3vc50GkjWS3g5x z7I*P-TP#26EE)1O_XSfvFYOY`>k}83l{!duPo?IdAWqLg9cS#^F)TI@rsdPR(vIDo zrWh{@F8F}fn(I%v_9Q@Xwn_Cry#Q_%ln{EY&pel~9sCvZRE(as#3y7O946iOvwZdn z&ZM12a=#(O+FTme4PBx(tEO2z4c zSxYj5UUH=`l!(%Oh4f^q$30%%XKyTpM@a>%qD6%hLX7)lw5dCaKbg$cqfR)t%ynJ^ zOgtQ_AagYso8m=M+n<+yaCqi^buwP9WuB;y8oth$i>}#O9JsMs%ND{#`j#`4X$mil zp#f5J54rU0FnPw(OWij!yxLu9oyn5!o$HPqT?va%n*RlAp3p#xgj?mf4&Fj6 z>Qeq!{u|D+)>u9@w=sC%?7s5j z1GnO*pI#5Xzj;;zwHzri&sEd2$e~K_n|ZCbiG09LBAoi9FlLuDyIXUA+C 
zBwNqCqF>8I42yWoGe2R*p(JIVGx1pqmcBWP^NiyXg#{8oLHc>C5PyOM%<{V?sK-ez zK>q>9SEeAfTkLN%_)+6!1uv^r7?mr!#a4%XH95RoK_gR&rZ`~i%z6@Us`}*j=q11A zvGhO-uCK-Y)-#gfWE&tvs&@EZjyQjmQ6?9)^}l)0f(R7Q)H^J?B4aY+7)*t#ZuWW>Gpg59*FXaQAXA7AIfbZ>wat52C>ycu~`7K0y8Yw zA1(G@Ke4!y3ad-@xV_BQe1>GVJQ;CjAN1vV<^99;q*I3U-3C8ydU9f8tY(YPOp{Q zHu1{^5QUe`CTE?G2?A`+`knJk+*D9RETnpD^rYd^k{+HPB~CzVr^Ypo4xXv3N(;51 zstOd|TXpwE(1)F)6VP+Y!?VY$D|(J+i;0~~9u}csd4zouZgWhF*Kfwx6D;fjJ`RZi zs`&P-67G~Wd*P}RzoiYKHiMi*{APU(Ex*py>YL9*HfGz>kWQ6dpkyl;-+syL!8T%` zl|4g3+Nj6ZvPa5v_+=*4%sKj4K4Bh-Liq3+GEGl&SBnIOb6+6_*RS5h8;ldz@J=uPoNliA+RZ-U6+EVZ*TG_<1H^p7RvF0EW>h>|^( zIC9r+jAqMWr3X+i!&OrMB()a;Uh$8dGDLp&zm+#7mVo3acKSHb1i@t`2O z^Gn9x^t}v5P3VtIS=RcK@7G~juSpj${Pm^d*|+g%O_9GIA+ zD$nSDx}Ppj)XS@Z!LfZ=pq$Kr^T`ZATY^jytsxt&)rJrO!JGy_4q_9A^R>vYmGhHY zosPv{6z2=2hU_QAH89*Jg$u-6bzN5vv`Wq>NEkrMhR0EcCT$>#Jws3g_C+V-Jsvz1 z^b^RD)y{g-7XlB-kTCrP9hXmIERJ=8`CStQx>JjcZ4kOaipwQ*r>r;sNq9INr>FJd zT%FQvBVVXuGA=QhTwrJ52Nt!~QuTH48~b;2o?U$+Y^wE8)YfdmQXsv!Kld5`FtR6H zpcy$l$Gbzeqx#=?A-)lW*API|>CsFb4w<_36a*Zu7x^vJ7<7>xz|R1A1ikjRJR}k^ z&;JEl(&ndpIGtaWa0hLm1~*ts#Yz06WJB4YB@Oz!GFIJHv!9wEuu_x>Y!v1oud*8! 
z^Bpvg;ZTolK9au|j%qBu@pJag9&53=8p8&2kp}EujqaGcqVR^<2mNoKYczG(&ZPQg zsM*C*e$OwbwJliEQ!ROI$7a_N$Nf|%Vr7R0mKGrO^R*Wdn3YiEfK(#WnMO2G)S|7o z1P=+B3~k)##8w~^70ZL2{G>xYHg?i*nd5z%8*O?v5?eU%*_%CO&1w#1Ogrfvhz7X!LU?EstQ zHxM}*-~K5X-g4SPRj`F10ll=Y7{-tlq?rU>P4@(%y31a^U+7(1VBDjVqI^973CCze zYef*_c_h&&9z=T;49U+}VYOaiTqMJpyd5XLNtjNkPzyMA=I8l_L)zDYZkYU3J@D5Q{i)XqONK+Np*8* znQvRi28p`Psjuk0Jy4ouKfy~HOT&u7yc;>T#m$=VmO}!{spy^OsbmgTZppYFC%&{z znH-+oe_6FUacy-;G2=eP-_?bhgY}iwf~-gu$?x8IS0&XFaxNxHxjhu}vKiT~hK=*x zi!Y188nK)oOzMp_V-^SApEhwA+pt03WCVtg+mDdEuTD(>Rc_{-I;Z;0_I!`_BlH4*+jAJ2iqlS5yf&z=^Sc@vG(;=ti zl?6qe#a?LHnggMK-GhLRu-%n<1!xBp(oIl}o=ZV$M?At*kr`TFrviJ(3nya_UT#kb0`2-9bS7o?8 z2Rq%gdqZ5<;#|!x*E^-}MdqM5J0ef=SpOZJlsfO0Sq#JZT)f#{NUn{ekC*Ca>mghH zlJS2lZ3z}6O8%2i__hhWHqat+?RW5TM?mIzf9K8h_?fIwHWwtm@PniQDL^oNWylaR zD7vC^Mn-%#7=8r+SKs4)%k!}6rQa7F?KF@UgT@otrCu9z0n{&nJrr^sFo8|XPC#5T z;eKQ6(KA7r6PT}qTNfR%5wa( z8$-8JC~OTb(gYDAO zyHzgzmE%_ec&H`Jw2pHibl-3C&6*m%ty5qi>K0+-{$p#vP=B@MxcVGSt!6aCkA(E1J%>{l(`vJenp`KdEPk08!U^s_!-E3#fH3n&8 z_#=PY8gh`)na{rgtYvlk82?{G=wAfj+7Z;NE+}}ZSUkyu8lfGOTTY!OPtgA`b&D-F z$ZW7lJO1mX%=Ig)c26IlTC9h_V~eh_lq^o%=4*ox9{L=hJ^b?&ND;XJ+KW6!)j$UZ zoey^;c4&!4FbzPGISKr^A!!$0jq+U`xD6+U89er&&Pi-k=?_KR13&ZVd{6gaMs1ui zlWy$CwHU7qnPddfx${`+${|!V4qrSKYu$~GTImgQGjRa#>T(bfbnK1T^^kONVrG*zALT z40O8Zy5UXCXT?6Q;xi7GSj}|Z1oW)2Es7cc5|ss$xHYNJjov)Xj7);v0>e`CL~cHv zH(?xs%!BY9c=RW_IQfjtAypmkM1AL2t*E5jm)_pw`^52^r+cY{@ZVjsVJv`$u1kQo zsq6XzDnf<70^H%*ExJFom8YBAqFLXox&L@JFI7Ao;w^oG_(A@&P&HZCoi1yn_L2t* z$VlgofGw4z;n`ry0{d^TuVM6rwyZlzkOBY#mKS8{ zysKn4?2XUr$w*VDQfxN@^9@SN`)_Zw#jqFJ`i#nNi%!jiy=*Qjj#(i`%9{0xt4ngS z5?bd-8?^rV@uTw0Am=2j&V=h0T#WwE=4VEyJpmjOZY5Qz($ZHO{` zul1~8f)Z3rJ1$N{e8GNIjE$vTv%=3;>jNhZ!(#rTXaM9^oJP(pVS1}qMglA^Y3DgW z;BPfbCys0y3uxo)K%0pi;E-US%|F(aQdNb(0V1svhjOE4N5HOp;H|b&JHpzo0!?%6 zM(+d1@-4tYq&)sCrTtE15YT%YD|qj7zqALyS^sR28bIWPQ~A7y6zC49o4x?aXiKps z%KQ5%TzAVbyJt}0aF11y=>xQfXa&9mk`p0=`&)xAfR?>Kyzx22jU*E$4SDtu`f=T3 zXwQy6J7?-n9pWg4x=Al|=ZzAdw7jQCqj*NfDoam%=w$L5S-(Gmeu&Qp9$_KWBBXw2 
zhKFubO_Or7+qufhslfhhU!)*7Pd%Ah!r=_YAJq- zJzncWkR;YVI@~g^e*igAobZZ}7RGnSkJiL41zF-PY+HZ*&2zJ}v<%i*+=2$(QwDC!i_02Lk*V+dDYpVb&FeLd$S6ssldiwvlK}@9V9bX@s-z-+SNjxFMbo%G_9*jEgKS6)M#A)C= z300ylXgQbj;kJ^61oQ1Wal^Nj^K6p{>&wf)FQQ>UuYZ>2#NhU;we7rBdqlc_-h*XMQ|DN3C(3v~{qyJhvI*=a z5d$41hQ5J?G8Dm-*NO6b^|@BhXo@SH zfkIxo?Tz9paj<003mL>)bl7M0=L^>#EOCx@Q6SUSPR(@zttv{;^h^s5=Rw=TxZ-Zd zZ?}zKFcjXvZ*w->nlL&{N&Zwa7~#K0ZE;I%*xOS7L_GjuZrR)BQsyDX5Gp5o2q`B2 z19&!H+Ra;D{RDh3*F?{=Sw>JcZv$C9<7WX95N;rkh8DB*!Q-Yu(iH?<&bgVqR{S-S zV!uqmt2O$g17xr0T_0$B!WsuB^b63ZEsLCNL#q?HB4ON=#;-)~{j2XvyC2*G5wY>~ z=N;RM00{}oG`vAi{Ko#WUfNWBq;6t)J9#&@oM_sycS%k##uGPkt07XT0ClHvX@M#S zw(VXJaOT@?`H!5eZ6`maj{l*lEa1Kgg>(OsfK@h91Jle7PpsD;$ZZyS(ga#?Z1-D5 z`i8}~Pj!f}w%|1(AIAHzbVRhU@JUNxs!DoXfwl-1s%YL~!h($Vk9}eZiq4s_Qit|G zr!FYsb0MZ>UK7?3K5vbj*j!n#)cio2$ePt@>EQb-)pRm$%pYKEz!Bg6dLUHG`Z%{n zALq&J;F?aeUWr<{>*DSZK>`gxShesH{13^7H?X#Mz+a~5$YlCN(D&y)ciqPNE9>I` zyPJP#m612yQW-SP2&Ve|0Pp74q^X?l<4``iR!sZ&srXVqbtT1o!)!FqMN3xR93m`V z4%n)4y}dI{H-$VxVnXw9>Z%@MIa#nK>Tg|kW;_pa5GLw-qIk4jvha5*#V8mN?#@j} zJNj}hYYNSN2Lm4j5$kc+!>?Jb#3v(&2{4kn>nph*r&~`z>{B?OeQV+SxK(PN$%SU3 zKP%=PlrIz9qyzm(bF4hbbMm^3Mr|e%y!V7UOdN4nk>g zAU`_+ny;;Dq($RyRAimhjoWZHqte&!+E&-wWV{J=8zh*cWa#kbH{}Nl?R3n{t_yrP z*7uPO<^AwfCR%4ku8>D>ve2B|XO;8wL2c*QcH#sx=MBcHSl@fTt1gTN*s7ZtnV~6h z_52*R7J;l>l2M;)B0@cm+}shyvC)6dhyN79$$}WZ`~CXFc%NGgI+UdCv`x08H*T-? 
z$Mi%^TFX2k2tRh!SS`f$j#5$ea!sm;7UWwKb0pL_J0?y}S^7osPRgqN z1vam{UtLhT(hXFC`eb!*vl~5zD!t>-j>e*t^i$oGm6%X}8wAXaY@s`}ApFdg}rffOHLqSb!;nCoDh6;YJ$X5n2xT`aR zJ)%unJ~-EnLK3_Fyi#Rq9Xr=s^*@VGOIn>j6iqNHtQ7ajfwpsL@Lq|DFXM7aR?XAfPbVR1y3tQ+pk9Md^iCT4l5^9;e5crz4Q&;u z^LtN2%8`b}?otESC7`Jg*v6DxDQ&_ctRq|3VZAeb?nk{A;N=cLr=b(2tr&XE-|LEKUcLFCxN+!p9aQ3kK2J(DaO0z7Q=t9PI z>8+$)mV;H69Vp%YV?IIwt=E$lhCAo>j?CVKC#iag1UIi}X7%=tay3 zFaEkYM3hrxM{j;Mjbj{>v;PoA!gRX;+q(FDAS?#=ymbRy7Pzt-_j>WySl`Q|HG(u0 z6Z}8x7g0jxCvJxa1q904`y}S91Z6)6#@#dULV%2hr;xqKlL0C}qg@Q>8vM*)@EZiM zNw6u81VBhdvz(XtE&ePP&NDtCtUIkYuGGku2Z;MK!JR$}Nwl|LLqE9#(}_jM1`n-n zmc@(mLY9z<8S&P+()Ygnhb#{_5s5p`{xe>#FFe`ehHcOOPj}~1u(DVH17&BPblTHp zot5SeW9BdzwGZlSu|LxOywx6XPRH8Q6+aVZ0T}HLv?cg$l34o`B)6~5FBAUR+9@;T z`((eT#R6e*sK0cVH3p=>oFlE*wwx4kAL{7z{6<;_YgZ{5ZQM+xRp2XHf8o?>5DlWya0mz_gAh;KTm;C1NIwRGgqdo+_&vDp2vBpPgcu zvc>)0{gyQS9Ct= z_jPgUE8qYSg$Ni<`q$L2bwxw$oQ21*A>yAK@^z6?WU|^DkCz8{R zl=;L#!5^?=77*)WGrPh_E@E3#27sj>^bR=@(GyFa9wZrF@2t8a4s%sP%jOL9z1uW% z7sO7kYJ`dn7ByUxu&ig&0{dNL!mbl*H=*c=SjPXTdwkCv$Qvm3Y^9FEywPaSTc1tW zR&HS;Lq=Ur@aqZ73JpbOP<$pV8m6_jMQ6m{SJ5}*{=_JDP5#t|^F2@OCyop{nNy2l zfjr$YbHg_;6?gx9L~it^>)fDlmme~z-gZF1GpM^Qb{&8^^StIvZ&$N&2fVKSd<&yK zs_@`h!q7qX1R9;-*cfs$Kvb*W(wgz zAgN!w?ZD~*Sv=m(yqaa`7KilIZY)3fY5hIa_MX*u&NkgZ2;J35#5|c$J_nmVav{S5 z|C_gT$NI88idwz1BL!8>?V7XFtmvUz0a(PpztGMY&%*Eu%)f1?e-|+)2mf(cXwd)H zTD0)5LlUXd(U!wnBXu*W$}Oe^HoMjImxSbx!UI<)Ug`l zwf^2fDf7oB(9h*OFOS2k@Df|E3zNj|o$H<@MZDG_ccspgiYTsIW>e2RKKD6-zTuJk zaPCmC30Xi+_ihgN(%MyPlrH2kSsLhmYfn=6So8^(Z;fh9$z_>Cz#yKfPAaWv*vMR2 zQ0Ap;A;lVOkV53gLlyIW{DDpDV%w{Luh9{^*CSgv*#esipzGOA{$wY=Zf}Y5d()`w zJlTe~ELUI{hE5wjsJs4VBy|DO}tzYi5vM2qjM__#sGg+p1d zEuV>P+s~+7$;zhlDy=g9XryKScaX03-9|8W$^xr0TkOFOHVf+3H@Ne}B-#2vcHY6( z*Lppq?O5MA=03JvwRUeW8H{!LXw+=_!ZzbeRG0J(2fweoZ&69R&irw)uh$O#F38Hq zpgBEp1SiQ66n)>9Ox%j4wb!m-{bO7}HJ4s{CDi{B==aEgZ#L%hSpR{YLmef@2HOkU zF#<}3MMM;UhYz$pqI3UfO|xLd{2qw5&jFKJQqMNi4S#QJi#g=nDUV@B~yM{B~jh7>wol{9@k-q@CkX&ay9wm-p3PIQ%*wHyEr@;<&Q 
zTA03}dv%~Kb$#q~K32yQXlwplq)}%?l7{i{&Zp#pW5xRW=>7jy%lH8F4UbuZos%vp z69Hq}S}(8a#T>L}wwtcBdhtuM_a1)6l4aCe?NFDZdSs|<%Dn5?4PVg23T))SDNoo@ zCR%*~sz+Gk)%y02@~xmd#DbsFISdQw#3Z`Ej69RMXo>gmmpGk@KctRxZyJ8#8j30 zuj;iFzngu5%5-t(Cw@|F5|Wv8esQeNgkV`k2m3L{9L{Y}{X#~LPHL@$^6S9~;i%|| zHlT7hxWrz>YF-U!F!!A>R?C!PO?W7(8#}W5t%|(1f>ZbQVY4GqPEhxEmaHCwO=lf{ zqLO=VtRyeI1&NW7>wRcK3hjB}_JEZ1M8}TEV;D_V?;Y3#;d~eylnvX56zcTNUufT# zJNbSWxbx3RkNjr6YSDXS+~7F)zR07|lHbCz`M#CbV1lgG&egZ)npm2(=x^av zDC6p&a*-T-A`ep1nXhQ(rKDrki@BsS{bJk10|!+tD0HRq@=exjtd;vv=j>8_epwUH z;l>g&O1yy;m*(E;yHR==eW3n( z{}R*R!m~5}ac!gRe_=BHx0N|1jI^(Az+942Fwpl6lxhze+5{A_@af5TTx8Z#ZK0%dgb&lX@2V z^GO?z>B`s$T6BrtC&+bl887})nCGjKfEsXuUecfEc!hag}xCL+aTR>QV%1n}OXl`T{nSHir^69^*8cdR=34hyXcE$3!UQp$P!@ zB|ib)3fgU4!(Z?(pzS_HI2FtkTNmkreyYRVxPzCKa!G~~b@yP+srZwj<1u2pKL-X; zHqzKsaY#hySNQqTWYKDPa+qDR3>gy>|IJv6;O?&&u%`ADzGAU7q?&7n!_E+xThD<* zIWa9x3oYlS`*3Hc9>Wv)%7T-$UE6(6V&f&OSxWj}R*9`OPODkAvfu-~lU&_~Wn4Ws zotJQTKRfuVTP@HwH>Hu(6;|d6nbIBcZe)|me}Vnt-Fi57bTm@Z=+``1%_CR0jD=~F zQbIIAu5X?PR}Jj}`<=7Pln)7uFB7mKR<~W(HtU3k?*31o3S1h<*dJa^5Q%IVcrsTiP}!^$r05^R6% zEuBg&#-7Dc%pHTMsIvr~y=0?1`*%N=b7&V)Jj1R$zO~TwTw3ZK@fjo9aHKGoUbowD8bk;}3GaJ^-;!R9HxNK&R173GPM ziSk%xgCBMo|S2L9*cz`)k+pXov-Wp((X1-{!T1lH0%ty5R#H7WKUqwmmV@PpPd50{ql3 zI1u{kSvp3qzM}<8GQ=a(b};A~|6)yaVER?nTc5OERPWiT_?cd=_4_4F;vd+z5uojB zq8OjNBd3zmW~*nB`QB5G!yPz`}G&7fB;SRVMPBKeAFj!_THpS;03Im*Ou5v5#d%1+p@l zkbyoJXrPyY|Hyv6^*MdR8RpM5_>ycaN263UVs;Y-^c#qom4?dBy^*JG580AeKIAg- zq!z#>R7IXSEcMmmp`G5+l?8d$I|FUMWZ;xWF9C9zx1E2>wi1J_ne^Kmm9-$P&{X?= zI;Esx-vIB8nJW{_0J;hqI_bE^zWcJIK=`<{PrD7HRD26btjn zeWL?MAJ>$GQ6H$ighfMn4gVj$+yC3II8s-tuh5dvq^2G)_rfLwo>}!1Y-(FYIGzx@ zhBox{?}J@`ynmNQHg?C79xf=4?-=}bsuc4qlz8mRQ+d@kiST4x7P~OeRnaAH%#=L4 zH>&P@;qDyrb?F;G#pbj`mst*(o6tA(vOM8w#Y4*(f7;i&o#J5b4@m0|*|~7ZCVStr zPx#47Op_a{ro#r@w0~oG8^tAbm`(*sd8++YzPg-qguFSo7Su`jo&UR)lkd+e|CdmM zNPGilz3abye+Yg5=s8_4XT%4)p7<@Xz|j zw_+ZM(KDmbhsHk%KZMPiFz%pb-o9$e`j(hKK2#X~T8_vj>~BaUl&EZUFnX3s1p(N^ 
zdqQLUR0>kku{U3ZQJ|?$Y*6DwEmk_i|E2^KK#Yiuf8yU??DQ!WDoy`C9^|5R6s-9K zE-1*?rKApk=~&$VQ+faImvUVC&%4ljSp52r|J-dsa-XPO)8{0E1eDgm8XFD2phi8i z^!@p5Usv`Tux}hsQu4xmsJcL5_;Sp@4b=JHuSZO@|J53tO3v95wTYt@_^>kjkB_#%cuBX2mA#-X z_%_b7oTVYC)WdKT1y(vR(|#Fz;fmB-^{*CR&I_0`}!S%*~!;? zuSi^3rR<>}3ZgAPwIOu}lMLsgj3XE@l8*ef`neISUtaFyg4J)DSp8()f2SJ#dK%%- zvXg&5YcdamL^dJJpDwh`y(4vfV59sGk4WxPY<3qAv4tM~>rcm~IeTneb0QvBJw_xk zTAt-ba}uzAN1xQkwtDbXpvzw}pwuwXEsR4nFjxoE5<|q|lur>sWipk|4w#q*;xGs) zllavX4|3d!-_1!&;@ZLPW@7Z)9{)A`dB8$mf=Tl+WH`1AewkZV#3tT7&tv0g8brFx zts9cGK8~!%mZ2LRgS5hgN2r_S-xW?^%Xgo2llw=<$S&Hwlsv+i^OGtz?WiF2$VTo1 z!WIt9p1CCbo4gZd4BDSyd5S*)8sA~ED?%~EldRIyG_``+9c5@KN*Ahf5PT$=Nv|HK|S-WJKbiT>m<>PA66;_sp5-$=QD3HCoqtd&$H7EdK)sxH{Zi5MKk z@2N8u9@TeWkSd^!^}NhEZ7d4S(_$bq(J=-+Y>~svlMT|=_ksr%dwCl6DN9@G+kHpE z0RSnFq0hfhV)X%0VT(%#)OA_x;5jn4=*92YowpK=_X<#~6KpT(R%Y;=3>P^1@$5wC zeJpi3wp8m`#h;D&TyiOeU3yn^rxC8@jNpZ)BbW7eeaGRJ%X7Fb&p!n#vKQv@>QhX~ z$5=Y34xupb{04Kh6{;C;chXklk;CiiQpfU+)-2}tgSD8UtJ3H1ZhX}1#Gg5($^z0- zs?Z;I=iQ^z6VKc+B6-XT-I856uqJ#R)R_0V2&UloY$~vD7^-B~7v?N8y`TG#W|U;<@0>uJT*nkX#~pNJI|EO0nV z_r$s|{`3t0)FO}>3w@1hLp^$`C`sY;n#z18n}Q|BmI;93oO;gSVD`4S$nxEt1Ki_1 zV)0IMU>-21#n1Wq)Q+~{0g;lf_>;f%!UD|sDKXR6;Vv9&N>O;vx&?v>NMkVv>WCrsAV*592lrFQzC&1#;%JAvOYe1*ua@9-rR9AF4b zkP+A3uCs=)RMlU6LWcR<@}bEnuMAO)-Ron6BM}jORN}UanXmUR0>J)UZV?lgOilx6 zaM6I{mH~D&Eg`09>cR9*8*)y6c^2sLxB&_xVYrK!Lnt(GJps7CerB$_q%Zk0PMkma z1BZA<0pK7N3TnX~)Wdg^R+qMMi5Jo@uNU+of?2Q}q_@&HG!;KQpX#u76HejyK$|LG zXRs9;nJKY=@zZgVYeJeK;5yG)8i?qLRXw zq?)v415V|pe`2A(XeDw5e(Z4&jtU95CM&))f%?pWJsKx-rGt7UlkQGSH-z*MF}*9Y zXr`~d?oEtPj6cb$wph5&e!6ZZh02p|a*Xs^3t2b>OjL`RX6(Tmj+}if<(@b|(Cm5q zL!yHT3ttA8`v$oW_))h$4gnK9NxhBG;UghIyfEn4yPGaijeZx!Vw3DtwW39rGE~Vt zb>@V}Cu{y1hvkqD==@X((msd4t^YGigv_X^XUGk{E;5mrp0r6!3bGthB=GhS$1kQu zWHmlF_*PiKP5uMgzFZyM+9Xx#?32D~8V!^s=4X{82(+_jj-yzKY}in*w1U7pHO=y1 zcK8i%d^X*onbK{#1ADnu8kw2krn?-Em!u%hL$H)I!$2uUgxYOz5|al%ak?o{V+%`)m>NhqXGncHg$^iq;^ zN9^;B?jAHopoM{6^SOSrbigpu5fsr~1#OFh>xg%}#7`dF4!Lro*xZf(7~W{kOseBq 
zuYH`Z148kV-a&V$LFFmM_bods{QW;{s~m-7o#9wz0qc$LD!ZTXwG=(DRWa*4Mr zSi)S9*ZVs0Rd*$QgHQR5&uzLTaFWrrh#c7X1zzrRU6z8xP7C870KVO;f8%!>DHjZ1 znv~UZ*r~h!gAPy8r8?Oa4Y};(GdXR8wi?do-lp?jg7S{n!S=W_$0iS&?1)XT-x9Uj zqTH~=D((^RcmwZ0s`I{7zks2k(I{&_(=&q`MB)@^+xrJHZP^EsG)6Pj#k%n5srbZ` z&;7ui3A`=}*sCA2EQ8lKq=`3x(^7o&{OTpK4}C&nRzZ&sj`ekoGpDGk1+0gW2}xdX zfd5dw+HRfw8XZntatV*j_J%7sEnEv5|E=fc+%t-ckL~D?_5o3mqJxRXg-JI;-;8xr zmhx%^IzHK>K@y4~ugiL`oy20L&f7!`0Z@)(;pL!O{;2~m-f@8w)F(Ni?>O;r+omd< zU=7;>*Cb&X^!X8y!x?Gh!A69XgWZY>U7rWa8|OLD0>MGEMqHWzPkD{OrCmFIPlbsCVV<+Q4rpXVvgzOATk=4D=CrYxo7AIH>nFlcz49Q#^3{p08rO$9XITwx+&e!+c`_s<#KY-L(PWiCeg~m#R>u z|3vs9sgfCVQ#VB@dL{Eand;c6o4l|O z+aot^xajrJSb)lI!wGl+yg|t9DtOxm6nmjk@S@RXL_dtIySC+3u~S;SZ~zq8G9IHSl+M_`xo>f>fUIq_xzMUGjc+ zLQ)c3SZ1A(R%V7pFV7_fot14N23cjT`<8P}deS9^4+d4pU_aQGl@#=8rjqJBP|yOg zZ#Zgyet6@)jMG_?E6;rnFl9+p8DAQ}*MA`o@DLhQHBt4hQ#w^IIt~}k*mkrk++rVY zg7heEyV3cf@Z?(UtWc!(?F7xf90@~FsXzg4J-P|raSV9rhau*ReXkSO27JXHKqJFx znFxOg$lmJmC5?(*yicJ;R&~4{>slH#2E%5lj&|T z>LYiZ4@km#k;fNy=*J3_0gv4ha`7g=k*8XRFOarRf{&|(na|qZPfm~fl`{Qs-jx*o zFRRdphwnl&5{}Rnex|}OK!YdbHx2`0{fuOWR{O+l-bF=e%}4X?0>KM+tQLwr7orp?8D<@-kGrA zIC|jJAMG`sETGhGwRR78#Lv)V32k3N6Ef`l%e-$Bos&l8@{r*$;)7O&0F)eBSIF0; zG&;MfE_GG-Z<+0D`IX#$?%DD(ZD-FXprBF6&GP7Lf0<^&LJs)*s~$pmz<4f=>4@++ zsM+3UPm15MUmr?1z7B444>gEkGe5S<+%w9K!BQ)%wrq4we3_-dE(QN528oa*bVinP zxizdHZqujbT=rjIM0*`M-LUNKPJ`BZ3_;x(U^33l9(MMNpNidM12IO}ItPFyw{Kke z&6hCs_58Xt>$651TK>rb*b@7|g1IJ^LSI9YVvtY*VG~l=j7Bbo+umx1C_d-1aCE2z zSJbyfTtbq|l%rK8Of$ieaXl1!#fIRO&-6-G0TSHz`bA*YEg}yG7tOWt(3s6<+$ZpI{vZhjFVc}RCse4JKUY-nG4Xme#D7IUgD8e zUnytLBWG+MPzs0AQMh5*S}F3J`7qqN=i+@wtu=+dv%^q?OujGt#5^W7(NQQFz^`y| z6wh*4BJ8qg@1-oOtL;AedL2*K{Y^aE_EI(TboUb)HMd8yIl~9;9waaZ93|*sL}?wd z@*&?pTa-;R ziLf&pwQSF3NK{Z67u{qqS|LE|8tM_n`uvUcGqx z$Uq(~OPG!gRV0RSqxj+FRH&2AhpXVV%G2&n9cNrfHp~ezUjk*(WuWmTaoUZ0zFR-5 z(8zxMq%H2ZT28!y`pdMq-~NNrQw@+4jja{+xT@UEE9Y;?%)ktb-!kp{py>CnPVX-e z^!ud-c3~xJS~J^MQ(v1nZuRf0W7!vPiWKD{oYSW+TF-+iYB4M4xlIftlF@kKJefIF 
z98dD+hS+151(C4>uixGw9H=3YeBtdziqnfDOO!P{*+IaR|L)}@BHuuPo}CuC1`$f!J$g|i zG{PS5HN7SWzCY6b5TKmu~!`E0M;><{(qDk(YIIH*pfNpUlW(bg8>}Y^Gu7F7A#NpX)1Oq%7Rqoes$$i7h`sQQrHG})wr`ta z<1ZA-f0>E=5W2CpTZ|;?yi_PoDT&2e5ZCmqrx{nA1!7PKG&$_5o5mT%GN`Cz}W+c@q_^YgF$V=lRsS8VV_ghIRBiUlO8KDbI zv|pa3HWNY#!UHXI{SW6}Usp6Fy%?QQKfhfB@58`pFdn^pNo3~r(pOw?@3iZ?4T^GiCB)A-fZGX3vpWsVq8WgMUY`u9DA(ieQNpv z=uv7NauC_#5z^Z4e-uuwch<^T`!zVlaeJC>XtGo$clBUX+7MTu^`WmAW? z4gru$Q*NKuN8~N&R~Nj(?(e@qOlBnD(M^L9kK|pg_GBZL%GAw?4Xcw*Egu!1u*tXU z`28ZXnwB;sMqXTq+osiCwU3dgxBX(zTw*4*EkC# z)svMyjVkF>87pss^_up(i5I7G({gJ54QkuYVPB3U0FRBSYST4}_GccsM6b5YHptlg z>Xp2F#;Y~0kRm^b$bpAJZ$Hc(<-cECoI&!TD^q{P3zzXWXh&8<6Z1X|G-B9Yh>-mU z?mw@lbP)GggR&O#cq$9lAinemT5jzvTm{KFdum(<&9hOcWHos;#WO;AlhehYGf`=S zEDyPA@9bYkb*ZCM`DHv!uuM%3-nsRx_%wY(wxp3xg1T9dUpy0pYU#AqPQG6-QWh1Mp zpQTzAMzg)=HpV|Z@mB2Y?d|dJa!d``jm{yBQ7hSR`oyr&_^NGKb}m{;JwE=XYwswl z9d{lYTrC0}A^NJcIRupCfl@XZ5F)1j(j`TY9tt>EZDV=2oQGot+z5{F;%@J$q;%|6 zA(Hz9)lAJ1d)L@A$FFUU2Ds`PJf@D_qWR~HU~uNWidlS4^RQ0LaoY{Sk6D}u5yA^gLLSDEoG9}XW)aOhN@04))~gS<$P*w zJ5nW!t_JRSDusZifNs;}oMZ}k<(vkq;jYEkbi&U>|V=shh_1SFh%x``B6>grEu9Rjq zb!9EgNjVi8_D41PUWBdJ+_cc{%8YH3Tx`_+FgBzM6+ZRMHo1R8LMmMe0`c(I`i^qw zB2YE!qTE$IW?RLx&J@7i>*86zF5f9;jujTrJzfU<*Y>pX{+(4`5COh=e-{>AMpH() zCrh@JB7p5+`kPQn@#{r9F|`v0w&2?)k#d+&NOH&~bH(PI;HO7*!EP;}J$*g^SbDI6 zOno#cWb1hYBIZE4nvQ+!vq{2k(e0y+Z4EIv6u;#J+)I=WJO~X(P+%5Q91(!KmwM}R z;7jph&Uo;-tzv%ml-g&lcp*71c!MSrzV=_DMiC#Wo7XxsoA}Fl7AcdxQxX3_&;J)0 zyGepg{57KmiB0QtM)i}~4WzheQ5b~??j3_*Qvh$$3@*2CslQ`ll;(0lt_SHNX2LhpQ8rh~2ZWg2~h zIkVI@4{+E1$42V1esOOwQ^-2rlWh3EjBfXGJwk#ss{N~2MenFie@oB%q}L9$cFd!h zy|Ww9n1=q2#l+~g(3n^_u1U)enH}n(;p>emU0M*xz_Lih6S=1=j7DCQX_7vi@+_5; zQbS3Z`1VGFcq%kfe8$3zvh*uq-Q1q9QAv59Cq(9s(_R~jsm>MHD-L)%A&3&3FzE1* zY~W0Ql@Y<=0J{y)vhtsaW9pb3AuV!0Yim;=m7$BtV<6J}8o6F=;0Jvlp5T?UkKaB) zvAKs>SsEw0<`B!LkD*ONmdpEKkdBn=UAs;Y8M~D(!(z@TIJv%kr#qGm zTCnw&fzvKBqf=4%Vj+>(y7Mvb?v;W0FDK2Nn8_4YJigjWNRL_rh|DL1VlSq7tg_M1 zRG;7g*yq-FWJ5PE=VnWiA$6}jysOJ~{w|7aF6CRG&?hD&a}!Uk*sl20r}}T?F+urM 
z$55h{D=Xd2tvsl(YaI#ZJX0c$rNg1u& zY{Iu{n-3tQTpxo}sU$u+IOVvk8=iCkj_-hj<{*I*2a5{qkQUm7Jm7rv6@J7bAJhl9 z644}E6JEo}Ox3qzjh-@tBAxAHRwxor7xIW)$ACzFbEX$%vuu@_{~RF=G2G7YJC=*M zz+KKpf*v!3M|}c5c|s?*bgE_1AVf8bla=y_94J{!#hK_9sLjbYHwF}s4!V^;>C9tHzhc&kOj~xJ$eYoz@v#D9o z%+-HhtauvA76(n%_nV;M8BDj7?@eRnfBB~Jkqa{$$BmrBaCV;)T=s7wYOUT?$z_*~ zm;1DCg-aIE71}czp9QZXWM7|)_JoYM@Lvxm&A>5Mb`z7Rr7?8@FD#fSoz4y0dabx0 zTP=V=M;98VcApDa;W7GnJj0qsu( zmR>yE(+ELlEhbW)$PbMAd5e5#61Rb_{@iPcc(&?~mKXTsxvIbTBG8+v7za(Y+~?tF z8ziRUpgayAK4%MAf(OL8C7^WYg|(YQkn)&&4-JdrhypT=2%|%b-&;_%heYldZW|Ux zGE1vt0V#2dN`CH~AM87hLu76uA7hBqfiGa@E37`|jKQRcowsV8y`{{)KGQ{3HNLye z^HdM;Sn*}OTN^v=HG27tlB#m>@mFeBD(L2KmwIXI=q7{471Z)2gYe-4vI zJsc!QR%IKz0P<38737$Hrr$3msm-M0w_Jo?d)XaH^|?R4jmHF4xdWcPh!%c*@4VM@ z8@`Q-7ij)aD4#b00@!ai3avSxT$2SszvlKGK=-UPlet2$>$h*Yhc%_*}C}tJCaw-{Y9#N zcMcIudxVq9yNP8v5(2j!?szSSzY}(yiV2yhRbd=kOg>g|DDaCzP;>q0-1ER;eW?0H z!H8GFS{JEcFp~04ZKUmD3V6!CD=FONOP z5pisJ1{8o(cfj15*d+NCch^)X&V7S5*~r!LK?l}MGo#liK(~*o(Y&!m_kS0^7%IDW z+r*XFp0wGL0ZQV`(K@V!^^B{D@Mo|H8tKPho?p5%Pjd!&8Y`G4-47r6OPr*CyaS1y z^X&5XHh|a~%=~7F-%4y@WkU>ks$^iavzs)xwbLiDqpYG^-G(O_zvs10B0c0=vuOdf z{+qc;P_6jV^1UinozY3*(y}s_@mEEar#n@33!V(*tn4xq54}MRi45>k(fA$@j^;ai zfGUNKLh+q!Hr&h1Nozr2rr``Rg$o>QKx!*%##KZ@L6wF~c9IZ1kGjxVf>mufbU5N{ z2?)ro)%VoyU!4XCifENjMc1NYQ@e4R1UZLm4ZHXC1b3h)(7|ZKoi$YTmq8}yVMC?vP|leck#aP)5C_j;eI)* zEp_jF16&Tf*sN+A-WiTLa9~plk})dS9cXJQE9G6_fZ@WR$li3yWpTUyY4saiz}BLd zj6YH9W6Ntp9%B)v#}a){86XLk7l*?^Ll-16tl+R{ZDrN7*-^C~QybM*r*5+G3yIeO zD%lL3HZ|6ABFM=+Mwk=`SbT_~*Hry)u}+lKj|l`{VZ7o}(jn8v2Px@A)29$zbk_4C zF+cF`38Q6z#G>Uzio1dDv4Oi|m(Wa80pYT}7H?bUu^5b{SH64nJhXdIfV{-?@b=3SYB*JT)E~7S*D| z=2S5qZMa$$Hx-(MIIC9$AKlIM_(q^13r7j?4y9{LSZ2TWK>nz6aJT8z?TQcZG}AtL zZmWOeDE@r_U+pD-(V9)m4Z(V2>Gt)X<0P5NXoxuH;5-M{>AfZr$=g<)r+xhEa|A7W zmn~&!bR7!?Ckxc|WvI4jjSMvW9yahJ={FCv7bHjfMp++*`v3VNc$@u8sj&sraLgAn zUUeJT%0aed7lE3>$OM2>U|gOY$%bMnzkDQTC9chGM` zZhG%tURy}S{}j!x48G^BZ9>Aa?C7=|@O8ekd zCg(mm$6X_=qK=lJc*vPGo!c}Y_{ttzMrB#RHlMU4%hDO+Yy(Y-T@{hmT?)6Y(1xm* 
z?xDrgRWgAMA#8#hpyX&uBZD=*r?Vj7bW0Mo@Tmid3($`-LSFrGZS+V;wY-&f6-=s) z#lrow4I-P6V!XBAHsM*zk>$!7rU!Xq*rMuigy+65$yrmtrWi!iX<~$sv;paXC@cnd zyaoIe@-2s^Pr`6?HoSbF9cKGT#8G=(#Ni5eyp(NND+}O%@t6&&mqg}k#%7O2QA?m< zqd2N0s`x$%Jk6sK<`B?l`TnL370Gnr`N1sW9+E)fyb8RJVRSP;KpaD3-RIc)TmP;& zFw58?VW(PIYi4R#TD0dWmE(|6sqgb$kdZQWG}s3zhOGEPf9X{z*=Hc3n|QfBFh zS;I)*U{F0c|3;b+s&Z6py6Qj2w0yp;Z2kJ0vD-lFr=$q!<0REg7DJGH+v`323Qnn~ zuKjwsCLUM;>KZsm>ff4sUae{_b}TIXCE~ZxG1Qa`=yoMEZ;iJeS?ue)Wl+N3OEQLyT&~~^>NfX zQ~vdcn?U+q$dh0bmV5BNK&?v}RYKT0OJO`WV5lay9K9I9B_)CXHHpccXHH%spBo0{ zW03()L4E&ZZEePlQ*w3ZjvzR?yG<3PPj(w$C!XPQN-OP?8mPH{{*APRn%f&OI9)LW0ALfzV-vKD>yFJA)aRiOZSh4-21? zn|#P0wuo7%z^R;oWe#!k8rU z&`%wK$|M!Ee>EF17tR@Qh^KOu}M~qTIRq|Q>HMKnG)+L>V*ZP$uF9UPond3Xv)bFnH zYC9HJGj#j6@Ht!jQLcWPVR=^kBFewNUS>P#EnFQK0 zukgj5h&*KDH~z?L(RdmtF^%IcmR@{Eao;I1OUbj_Lxuuk8e)aAp;>*b)n`tN3@Mw^&BYu_nEVtd9P$>#g4qFT%X@;mili!93G)7L3}Xhv0qUuAb6MR9dyC#du`RL8Mw&Ls zzCg5Uz(nMOPt*tqmN=x=bS@2RHzlW+TxH@gcX^{pa&oo&`Fv#@R+2mWU4l){GPjTU zVsxzfnZ+U2#yG34jjj= zaok&H;QhhPs*fXf-1ef~($Y6mf(I2IFee}LgR>o)w0lT29l)6!?6%rP{<{5iB}&e^+Gb5?i8QpuBk zm-;t_2ZKXK^d=fM)QTzHD!VFQxGgE1|A2Y8@!Gn#oy8{oLt!=$Eqte_v*qAed%JYZavmIMrPkInJsnvYu;SqACsX_ z$gjyrYld^4-q`YO=kN2yAG2F4huv=hv+Dnt?Zv-sRn4X&Ttat<ec4i$2oz(NwLwFX$NNyl!$Lt?uj-03X6=ZLu`_k z&eN#Z$Pc=G;NH+4LJboE3(CPQXR6EHb)Z$&E!RT8Ql(O+LZ(5TN}=KiB!J*$7$d$d ztu62BM<D{rECFF3sX>Jq?YNJjt)5IL-)3IeF0|Gfui4%;pULq(`kl;eAo;&S zrss4E$$Jv|V__s-Ynv$>omSN_adnZh3ERoJRCW+MSZf0MVg6Jf_R8A zuj+zm)`$!HaqcBuF1HC{C$UDU47HZG#=JxLL$0LxnT8R#89YK)NLgIa0L91jy>y4$v{Gb?ubs(yEl z&s9JfNm8X9WvoL_%A|~m$Dr@g#%m)h^c>{kpgpNt5z0f56XRU7w$~AG)?2Mh_6*=_ zsFFtPqL8y7|H|&7M416b$46n^ZRB9CjvMlpCaP&jyhnwgYLuxAk4sAP%d}Tl@nC#S z*B5N=Or7S+kE9GW+P1p?RF0C!bP=go@#DDJmX-6)dm3xywEeslZB@5dd|~11xjt*W^(m1mW#{*Z)Fa?jL*7+~2kS0CRkYernaJt(HSGe@b)JlJ}8$ zXpMI3ZR7Q7fl}7mfF=8uoXJ?+n1qg56uWM` zesvHv2?K{lEdIF1OwdRm?-RpS3GL&0`_r50Y4e1XR+92rOL3gYj%;NXdOMghMy3@GC*m3hT{^1{vw4TBDLn zyNkyAk3Tf50aC#4Ff7>Cuy=*w)|eC^$Egp=yn%MLw7Iqjsp*<$>e27@J)h9wq?s!H 
zOd0F_IHanILKiG~hsc1T9ny1{bcwUU0EBmQb=scz4>Kz(3+SJe`Oo`J_0Wn6YGfJ< zxKt@`=8_TR++_iuMOb-9>wWSb?iM&owvim)=v(i$BB>Pw4tJ5}^;?d*A zD3u_0fzU;0*i(|EHRTws0|d*Z9^%TGk!|^C;4yzGPK&BdQgUy;kUtL3$Hn$(i|X?` zJcSOsnO_=xUTF6&JXT|L+aH`uG(`h%32}{;**nd~Z=d>@ zJ+{|Ue3=|#zRuakbV)ur3}MMA^Z3{3HEhZ_A3!f|L%o!E`oZ*_)6}|oMFOPc)G-qT z9?-WqzR0`S{u&|$w@|Yl&ZUpfi0cOZ-p!L%*_w)@4_+mm1W1t~2MXqQ}qiZT~-fDnwRMNm40FDwINqP>2#4Q6WVd$R3rE zjFwRu$xcFKW|O2M$%>F!X4#Z%&+9mKeZSA|d47N0*SN3Jd44{}@qWK%Y*6)Qd179dC{oik*YWzyP;<8M-q`hTeDvjrQRCCThBF-Nz11i!jpnq(s8Q(ub4;ExAGOM( zNTJLke^4wJfN;z9fZU5P2wr^Lze7tP&DX%{{c3hjK0Hb26Z|$ekKk5XSD}aI3$wkl zt@{Ng^{)?R7F+ezeH0UnsP_ptl~iJz?$go4N|U1_@S2rB!0wq&)K!Fw*s{>adp)v*JP0)@QdOS{EUSd13wv);KASh{FctWzPiP=@uY z#j204+*U?h6hkS@%nlbH&*XRh-IB#X93WB8oXT9Tk{5B!^p14lK%So9~d@KI7<3--C9YmRI2UZJ8tY76~DWCz4Jjyq`Y{y1~{#uzbPm zsO(zmc(Jn!@|AEO-BH#Cio+F@R`^Q&eXb;#smqhhg zSMKl4>8xh5e{jBJ*-?P>=greuOnTGC_ZgE`_$V_j;$5MNvgFY&!>J!IwVS`t_ZQxO z*QMm*#bv9qOI?7G2x{nVT3uC^zS{n}6@x5|<;>>*(l32bTcnag#ag_kkll4p8LI$i zp)aYoAClT97U+pR#!oqyBUFSE@Scmku`vv?=^*9Dr(KO6v5?=w2+5N@_PmbvcMA_6 zGS^QBtb2|_u`BA;SwhvQv`DfBc?cQ7LKiHbza6svY)0i@r zF{K~7d=H$VF1@RYGb-4qpi$2)TbPrNay`Q^V;@nBe)^54gD5v?F%4sSsr?G!my^H9!(97xYAEnR?M4N^XPjSC8NhW zXeVTvP{ussxAJv4Xh6weDU+05$Oa^*6q8*8J{|Z}TuJ zjdm{I9><{d(FklmM*ux5mskkRLvN#Tu5&b2&kxf?z?a8){x&zx3j@RG47zI7(C9lg4)T6Z@Ir-DCD-lQw-jSn&ptYucceK?9t3;>9kwEHH-%x z+=k_YY9!Z9Gtc-&kXLmwh`M4eYFja2=X8@9qDR%kGwAG_Xa0I#MgS)=dKE0JV{K-K zv_eI)H&Rhs0V~E^V~>wNZU$*VpYuQjGsv~FWA4VM0ZO&(CpGC(yZW}-qJeO^hDgBd zU`uX>RF<#|@=FJ<_pS^2RuM+JByeKIUeX0GZw#d~nShN?!HlKE7mqGh4Z}AYi0TqP)KedriG{kWXgz1ilQ`t#$&E0;; zrzL%53nT}QWyjPmg=0C{zv@>U3`mUADqvp|?l<^H@1ExT1bq9x@qk*X%es4w{JSoypG!uo$vPF8>GVxppu((bV?T`M=Lbg--3tXJV?_GAwzQnPS?+mV|hpVoj-1eM99P3(O7S;@2&J*_Fgp7 z>?$5eu%wpYIlYS57u(7W0RkGEm*j+s@jFhm*i`9p=1#y1 zz61B2Z7Bq~;r9{-+tIz*wUg39BK)Od@e_ZzjMy-Y$?EH*j2n%ACVoFSN;Jz$@#Ef~)7SgMFA=(6C;O&~HyB&b;$8d(iVHo>CoIA1gL` zl~V2b!tksACo}X<4pye)!P;pvw{qc!*6!FXQ%33+d_H}ijI>!|uWMrOzrtkjpz$@; 
zOes5<#cJsG!sc=Dq|ZW<3+bT)9s_u++PZRjUGXSsJjl7@sPJYTTR*Qtx(`)q))GZ5 z`GOJeht+mz-R@`t`r5L`8bD_5%;Ag2YAE7vCkbOg?lmg<7d?vcN01G;%N-1CDGU(@ zIQg3~!cQ-!AgJ)S@o61eMk<>~{wqsEuW1q=GPvtorUt8OFsGnyY7~+hcqJwfD7hk_ z*`3$m&`6SE_3^$WIN}gOojraW^RzMYIOPoG^eI+mS8rcc@Z4<-{I-puq;OaL1x3DQ zG@MzzuH}Dw#`~IHIuen^UcdDhpmPMHskBdl-TSubmSmQai~|v)oy=C&?>-q3FouJ? z)P~M~0z+Mk=BE?w9$i6}p~i|&q$H4`IB%V)09Q6v35Pr=FOjuvU#%@bbMHi|$jnYN z*{BAMxt(UE(c#FyyXn4(IaDNS=h{62E0-{P>S2I?4ccoALWLX)0ftVO6?a? zEGeA#&*vXx{N9e+x3)f!>GobkBV?K`i$`*kzyDJ>2@Im7R* z8a~wwY;&~VTNs)v3s1y-S@Ld;vu(xGej_(67}JfyEe|&%h3AjVfjnabPToXeSgP&U zYQ|gg7}7IwNxN6}antNZHNujPsTXTDXJ4l|2K9jNffMgV%Ws@>rkVe{o zXn~yx8n-$^Cj4X+rw&qGYSdx+Wh)1dLU%M;ZZ)KE1*8d16OkmY(NdqoZ`X$g{9v}) zjQby3W<*DmJg=ip+c74AR~`8)v~DZl#?q1F=X|VlnQ=;GX$vt@VBYlb;-`V&;`NG_ zg&#-3pyS5`5muU$@S~@O)9}XkO5ip67pmZbvBo)fhxNYK)vw9=p?o{f&C!{pqjxpO zLNJ15mB1UXfxk(PJqxlOypSJuA2hwuAk~I9b9939&ca_pFtCuzf?SoT-Y{0&czqzP zBYg8KG%+*{MWsyt9o|UC=*o826pp^=8P`4xU9$Lz+mZeAlVBUMn6+qz?xWbj82KvL zkK3`;XkMs1imQ7@_@Fz6&U_2<2%u(cNrxf{BE={%Kr5HZ02;^Lbo`d);o!q>GDJX{ zyls(W{)jf!x7as$OdpvmKhCJ@*@K%}W}V>lnvSo4iDZ|wGa*WOXsj~yRUjb#6>U1u94cnA4T4;l7bP&aV(pSmwo`Si3M0+b zGs;CD(fD|Ix#jezs!8Wvx`I}OfnFW-)epVKX{M^z@y2vts@?+b(31W%)WZQABPL?? 
zugp#F?$B1LFJF(~ab%*Jx_UwRw(z&}W#s?mD_rD=k}^9~?aH8A5QLc6-gbHHkouUA zu_Pv=ot-B_(e(U2BM~o}y}?i8(Dv{=#4bx!WaiuTZ@smQ&AwF9Xv4ZQ&y*j;?_3h_ zJ<5z@4>AE<^k_kQbgPGpuzgO1daImRM>-HNKbo)kNAih=9xZfVBRk>NPLVYvl-#uB zU5}~RDVQCl<+xRHkPG%w%G1;cyMi%PCbVq`&ciKiCN*}~2cV7*S-a{XG(Fq}tTY~FiE3%4MSZth4%E=p& zRr{Ha&;Cnot-B58tcd+aDC>Eoxn3bR&Yz}(ewOh%!allDqr!^+f|Apaq}S=Yo3&HH z>+y-HvBf`;VIKRfTjs@rM%mE-@j<@8=qrqMK4q^u-Nd69C%|cZq$e+MU~P* za2))MNUz!Fo@J;My_GFdJ=%SR^y!WpMuzC4*anO9$wR4WGd|*NmZ1&mOa3}$;1&gH z**Ul{rbK=HrkJQfF#icG(&?qg1=(BuU~j)hL4a#!POmW7R#Lt0oaWe(3=44`VfD|UFxzz9-{ zV7lr$8)u9T>_qsC7XcQ0cN@Hc)MFG2tI)~|r5MA6g}Z0~3B+!}gME_V5aipP^AOY8 zK#((L8yb^dZ)Ps{5(P8p*d7WSzK%Bx4)>E}+t6p++S!B2?-5^{F_q5n@-+DdgVvnc z4*w4a9kENqie;X-S9sQ>tM-zFV#O4yVL` zHO?jKu4tpD4K<71R|CSuPVSW$waeo*Vp{mW7_|SF1>huoobur|wMcoUHg{Q7LQbW; zS3i5hwh9@h=FyzCImx@dms;55CqQ=Q9c#*o1hWN%x_Ak`oGb4qzUSqQmG6kzQzPMb zhI5Ro@r_*pR7#zwxEm!EclZ6sH?qjBdM7QZQkSIvind6(BO(6L>$j-XI4+%BadM3j zx)5+y39vmwFm|5pW}Jvgo;wn{oRivVdZRsP2M7;tXaT)hqW7Uvfg?AIXa=Sh+Z}Mw zWtNw$K6&PQkLunBbvJdKqJA`4$R)H(D5SHzVKYPhmHW)8Fe9ZZ*tS|PwQMUqagWFE zkf1%K*bJ@_WG@puaT5GOh~^q< z8~C@81u?LbG5a{^{Ihyr;fb!Bsyn%r(RI4<5B7Zf*Yht)Ve9>6izvcB2A$yldZ#R7 zk!b+8wu6Hp^=hZ?74lC3roBB!VmJN>P#fb`+I+eo z8v%`Ce>9y}`$KW5se1eMIJ6_5W*zl65*8=uN)9XK2Jj5~^Bqkreoe%QS2yQ1U|GHn`quArkA*H3SmZBrLb^&Q3mhCI7Wf=FmOR zsjqdIemnt8{`t`Su_8cwM44dX;txf}MqBm;OD9FSw?I@6pGXqmUvV<%EUdUoHa9e1 ziVI%@8pTz zJ6oFhZ8*>c8=-wKA!F7WijBlB#RCsej)8^ahST+%uddVQtxSIBhp~V?* zC$xJhui|?r)COXDxIkfK!19&{HbBTcbq!qS{#^LV|3t4)>JI>pAb7BITvxF7FnsW2 zUM}RQ<7Ff*!)bbr2PgdMsvCF*!MXCM#2}r`kF1W1(0T}NyWZvHXx_cvlpBkQ`oW2N ze;#Q&34pJDhGAvNGsQ7R>a zuXBcROHtAkyuz5juI##awhSx})6&z|cu<+3cjEAl{${L!ij==7PulL5BJU-5bVT-C zqS8457C#?vtLNuctj_zAGPv{IKilW86Wk8OC!`TDOS^kqEg1EpB;P|c8m02qPj(~w z+5g#*eAtanXPMpkHhoZbE&k25`?6kj;(SF$rz)Ew{|+~jNT_UBZgTWDzkb=CCR!o; zxB`(4sr`JF{$^M#p%&o0k-)(9(EQ2m25d+3pvoQ~%rW1Bui^p2?F=TNoP$`nt%o4! 
z4ElCdRh7v=kW8`zr~g_mjtlAr41dqAeVrtB1>mBb(p)x&$o?qnMw4LuHT)I&-q&jS z`-WN(LjfX|eCGq3NdZcIlInW3P_bh$eRfKYv;<;)d3VIdW6xTF(dAzlZA2^frF|N( zboZdDeqa!)(-&2ShI$~Zdhp$!pCqCLqxOtt83C0hHw^F6_8)OMLzs?#Nhe8$WiRr* z{HMLzks?iTI!nRV=P52Kf2X3?D@vU{HXVCS5I6y<;^v?pCBuro8W$36?}i(Zzo+#3 z!QA|$hzAUxMd9on?d8yEnpSZ){DYr_9Z zC5R_%Bunh<8%l1cuT)De(YgbOztvgI$XI{)9wP8Kcby`C+==zi-|_-;Xm>%&O#WpM zj|~d59;{TYIkSOVi{^2LANJ)(&yS&oazAu2=$aWASxkul zB@8&P@VPCBS}djt`8e&=E^RDDC12c+BCAft?dnk!ul*&vr{$kF)F4ay%bahu#P6oh z_ho?PEMSN&3Q0$^5v==ujSG2KQuOo1E{XE1m`8W%l-Z^qgFxf_%Bvp~cdcdbu?odh zxa$$@^@>98wlbRuF6YwPfSa>Zjn~NcmaAVupxl4}LO!^{?aK?YwUt zxs=Heb)omr1%xD+WGVTzk(i&()8tFky*#e(U3*?zRQGHlnGb{pNann);|ZA zbVi$o1Ouv!Ub(K=@&L(N{_NX@0H5waZ^a@nOHOI-L?#DN6F5WFj>=nxZ~-|!{_va~$aEmQ$i;qYubhRR~a z5&wVs3@rPFR4>dYRDDoiuXFadTOm%+{=GP<*gJ-Daqqj#2adQP|2}$l1Sq2lJq;S@ zjVpMx-x)TkQoqt4aRi0gLuI`1r&-BY$NyGdvwBGFFkv%W6A<#UrAv_JK^0Y))J0e* zV<1S<-@?B=!;94$fib90vPO`N0F2GyumkwVJyg}Xf&-zizzGqiZuvw{ZG(eX;qTWUW`(_3b!n^(J!8;2y5r6%tFx2{L<09muQiLTBZkL=rfIE&klk^zoWQ zQjV!+xQLN=(p~Uf+=%E$zaC5WG3FNJdb9O-#LkmvtURSy3&ZTz5Slrw?HCL8R~x#& zq_HU9B2q-w((SG(hBF+(BSeOxN{lAjtIYx!s3uPJUM~WMZU_yIbn4-1V4*V-VDx6* zeaY;C!u@pJ1-d7?nYG8$UjT!8&lqzM52lPg?Rc()e6nM6DO^-M? 
zFKt2nNPhF})$7b?hP345SuFe0$D<>e+^z%18>%zUZrzF`aYhynvoM_8hPKIYwxqi_ zUwbP})O=9!U-(d>Ax+{J$In2Cz_&xieA+I6)KTxhW(#@V^Hbsh_}4n5f(NZ(xGZ z;gYnyc=2(wL_Ez9@tp>+$$Qm~svo%d6N-W~1YYxI+FsP1O_);Tc)R1&y56AAgH6ytNS*HOv3I8to@G2UG59*k#>E++2Usa2M8>YP`%zkIZFfM{X^phhv8`ykVJuxtG42NPzUXp~dgDt^iY4 zA|Lu%`b)W6^dNU$M6Nr62gzV1^aWT+c4YD$3s?}tEJD!H=t61G4hdd4=?;7ykjT)Af5b!j(KDpRfq#?!H{%Jw$UCaJ5knC)O@ z*jLQjC-Fs)O?@lkJ{+sH7vV0jB#Vuh1hAx-_o9p zRq+)fg8O-i1DAx`>@oDn7ws-VHL*P)bk(pP%WsF)R*VnEGZd;g!zW(&%*!U$YtGk07q^MZ(yn0w``E@ zO3mg6A`Ry)?|T@dCGSl3vB%A6L>N@suoBkM$dAP{m-amM+Ra>qxMs6~i#}fhg!cLrj&kj<4 zu@Y+_P$mFk-*J(>J(Hp#k=5TBG8qek< z$8qWJTJ@Q(ztePB-%IXxmMj(>L_)Z31Ap+DA5Y@jZk=eh`kG~anl!;mvCwG?Bz(|A zaiIzNF^BdB=+R7intgkfrdGkPklnRN3sBVxR<@xeT88d(cB5UbRZ- z`BJ(gy5PZ#QKAYV4V9XK9opf`5|v@3!k<*mK_KTqaA#U@hhBvtwZVyc?`FgHnz@GW zu{?d?GQQ^pB!_}M9T_gFC^=}^zGz@z`pohB(qjE`2x!A85IzsgK2@|PY>T{+z9XE` zbqfR9X%FBMG}KuJ^~MtofAys;jr=a-pT(OO)at%|%o5aoQ3mnO>hLei)*%`&ON|up zwm!0D-bXvo;~~cnIsf$5`+VExvcGF@h_a{IaT+Tyy6wY-2kwDG2K0cxk9B8}O+`@m z$0)g5N%zh)C&wV5CK$dj`cj#R#`Er79L$X4s7{mxmZPgga~bEiN#zA<{}fn@U&-4s z07_OcshS4`LepZA+vu{ zT^Z+$-c^CiV9lR&>&()YrM{WZ36WfiUF5qiU!toremPODdV)0@xk0$tTQ(7Cscz32r zY2&lIj_VL|Dsp4Pg_;qpT=%ux)3#~q;+tZmcp2ty>P{(o9?V($tLQ6oJom^x|E)w{ zbt9dHaY=w8V~Ef8*yZdO>=x}z?xuw;CSecm!Z}OOk8FU!e^HRk6w`A5$>6|gwt*U-1J_(NbK?A;59{YarU?YvU& zy=o?v&+wFx0iNJJs_!$<2sxl4z6%Wfj>&mWQzF=bVsgCl0EpsM(rlj0Dh`crKw))- zSNsH8v)@H}jV!bbz$=QO`F#Nit!_ZDR5t;HX1ID%fl@O=ctNzOr$cV=2jTdo`N*w&i zTWLw{Ypg)LGr5{$>8=Kz>b}|0ls)w6J>Jw?ujQS12kyEX1#2N(3J7#nJl$w~lvxdKgD7F0jVhV>YBjl|{UMsL+O?-Y9OIjKz)d>d~P zV-Z!`&-kak%;^{QNw`JH-sN%PUf!V=-T;XL0nn%$&m3Kr_R`;o=g(7$x~LcRnw>y* zRPGmY9WOv2`qZvYhUv87KM=sjKG+jN%M=c{<-$8u{=VgORQo%n|0?rX1YE)W9C$32 zFRSREiuPMX0*;Ryo#9JFR-j2>uixxn1aXdX5r0)IB9hf7& zQ8Dzr0+8Zazr+A{TJU7DnfIYVg3!xJx|@d6vIX+afR5+}Ip?R=jn#WZoG~QGJ$&C4 zvSMxlv7?=&t1&{i_PIlS4(lIx;o2v+y{FqStk&Ot$NM_!`gWazfeap{Tv119YmMCY ztvt)dwdpu*PvIq`KlHYzm2j`v&fFlT68Wx)j`K{1)=I00MNaopu__GGkUi$RrzPUA 
z@=n|f7ZDirKE!hrlbzMYrCXP2u__!f2>KQ_1VS7mF4UdpQ8=`zah5`)i4w1rEo>m%)UU^Bat#j zak}vs5Op;AAkJ*Jfk~7)1D*gpwC<)s(ppoS&9==4q$s*%5_?SdN}+%)T_eh(7&b*NhXKsgG@BBlB1R z-aI&dF0A4^bVM=W0y9_7Z4?s4?wVbQE?B)M+zjVawQjhX!jv0s#cB~wWzsy=x}Sj0 zTs<3cFoIoty3-x*v@vZ=ZV<6A+Nt-FY?KNd5?8cSVsya3zL&gfEsYpK`~I(=S*^_T zedV2v@Rk;*shj8gdXd<&KYjsi*n2T|oiudLbPQbfa{h{JhxP^>V!di``Ae-?*ywKB zAep^WKTiwBxpYmw%aB+eLUOGQ*->ZWqBc{Yq|m#}5vuhg$xQEGhEI+Rq-KQ%$e$K} zp$L9a^<&VkP>}7ArF>5Ws|T1-==J2e;^#H~CYg6-+>|jTo4g9q*x~&PDA@vMatu70 zwc)iUy@QQVG!!;3)9)ZB8jX?pJk3$1xEc@m0PiWa3%Dcdn4GN5UKUYDiY?$n>D^ zPiG+1wLX!c7H3#noN6St66N19dzp>`@|JjYZ`JnA0$Z3Bw(q0R^Iq@eAqVWvLgUO` zpge&kfeO<&Dc?L@mJJU?Lk$@iRBXJNDGq>zS`!ci!l>e08bY4~*&o%g`@>76GVBh* zho~@2&$5)&h1A$2y@V49#2)VdVyC;lk|++U)l~*|4ZF3TP$Io5l4I}$QJ?$h$i#-4 zif@aNGh`QL{>zPvA-mS9r}h8U8r&5P151@`xOO$K(g$0EFYD_Xad2g1_X zOU-0BS@k{O@WGGyS5l6l9&djZ(g@OxO6VP40DMCHPC&Itwj%Y8`Uo*8Yw?a~FS$v@v?fqDmgvZPn}FW}jO+2jnoB~V&hgA05QyN7 zX9HL_arN>eb70NB_1&H_lO0Q;B1)dWf}{r_VI*;(Q;G8nv=&I$z3p@9=fircb#rx< z5L4?l{m8b8c>KARrG|`#*;L-i`;%cwK9H0cr58_!@h__C`r}X5lse3SzL?m}9}s?= z$~AP5qQ%y|OB&r?F!5q!79ft*mqi5A$Hb*Mln#N8Gi*Y8;lVtQzL-xIxwF%d%V z4jJzBBah&VTJ7BBX6w>0-H)c2`-f<3UFvd`a|yDoa@!mq$|f* zFR?=_-XFyZ?jNlPz|JAvyentn9aoftfbknR@pggc^5qr;(o6!z^2Z@*3Z@-hpmA_tDQyLcU_V|4Zk8|IvNFfgP4R1)RC%oUcU=D0`OQ> z^SJ)=17dZ_fR|R|aPhrIQfn=s-Ic&?M$FcV;WO7U-ro6V6lQ&F1g>$136y7Oy z3)%~a+SpkhJW2J`T8R<#BPZ!hZY`jtG+uH%3{AE#%x6#H6hwO*cCBYbroMixh(71) ze;9HjZ@1^~t;{ntxFHJ4Fw}QeV?;;Gb5P^w0Epm@AUt9VCk|4L-r}V%$J}bQ8)=f~#l_WCuy~T6p_W?39 z1EMpD`rIGI%qqO(=7Pz9D$cb8Ffh(jk~8rR$Gq0wH^kM9AQJMALEbs@W{{W{7%JUO zKhaj1TX{9c`E757H*LwJpz-b44ZGe$uef}!?R*nNF1pudvgd@51~5mk*EXNCiH9yxdwxqc1GS&@3S339g1>fIUHt=xbka^h{i(bgNTQ`Bf-P1#J#N1eo-zFAJFVhx@Th_~| z=l8e*zS40V1WP(Tf?Mq6sZS&w&R%C{+)>YQ0`$eS^G3GRc1{7;}x z%{KG`I(mK127S9=DU&+Rh-78Yf`@h>QP14~_eh2ys)T)Y)hYNCV6xN~^7r2f11WVg zr2=RuRnPe(kFlel*4*8cx&u`1gO3}(zEfER3DyfsN6K=q1-QrgNMtP5C1{q~EuAEZ zO`=jtH}VnI=pMC>*@UQ+dJtu@fW1pd`S09ch=(7in|vwu`5-tlXMG1>|PVOkca|W>_GQVEUhx;5j 
zDsm`JMgM5pvmFi9UF+f(J|ti+f&J9%shbf8M4^tWo#T~HjGD!SLPaMOh<`70TfHDQ&n?cWafTr?ngw1{KCMKZElGK{zDmgMn zIylMS_m~)_udDjfSiN#jytl$ApeeW)M@_`;7MBA6)YW zsTj`#bl=Y|bz@Mdw8Xm&m=O5t2W``X3+X11Lc9Kexc%OE#b^VpPd!)3_Z{LYk~`HC zO2?+nI|CKhp+xCTM!w#e{y+m?*0HYxax%7ZB^*yPjp?+F#m}#}Ap+%3qK=c>2`2m2 zcOg2Yu9P}R63kG;ploZ!%am)s7gMddlJ7yLbSAlI{@V3Bhs+6Tkn|ktBhfJ*nS)Nz z#F&tbgX{>%p5H&8b)ua^q9|q@I-WI=BFER+rallg-K-psbI=UqKr}7G;e%@d!}Pe+ z;WIP18EoY8B3-6GLSz%Z1BvXc1d&Go-s>>9MnFMmro;#w7r$5?)uBuF9b`=l$gbT( z_A-3$hOm>K1l-Aoh#Djuk~Vck;G?_OK0xil{4ne8Rx$^S*{M4@QnN*_fV62b&#!ud2en+UKIL^DaO= zecF#XBKqzT!S_eHK6bv?-MHWKRWogsZ{Fx>t37JD@Qc}Nx~#9H?q-9-z>MzGV9 zrU)UU*Hm?P6ut!_C|w4+n+`N5LL61+$0HYVzU;&9jM6uFpJ=PLs+M^DL_(>3o8-X7 z0W~`1`W&nCe-k04oteR_5^HQiiP8UkbyWCz^161h(-U z8TLDXwUv$PY{mtqQ#&Uo?*FX#ugO|?E%)jo{D|mX>~H4pO`W2Io<($6p+RcMgS(~* zVW$xtm4pvzVEyPIAvx*zQPhNA4CNK2#N)o0qYJE>C^ z6aX!;q|1`mQEZ!S=n8$PhMU%HM^88GYKOQs7fNN~_bN^!#ACkS`Cvi7Y2VnQYz6EK z-};LP>m)tqVb4q{KvGMs4M~erE2cg zLk%sXK3Z)Z^{$;_qPUZ_y_k6s5V)8)lDV57mrHyfdUCa$0N7hKZ8!&Uz!A#c;yvFg z8ed(~Jff;9Wc9&{!mGOGOKNS^n_zUWY+PGuA^%^z^Y;ZY590^Fa)jg8(WYI$W=;c?R<}15n~#tU`oVAIrooB6dJA#Cih8Wq+e#cTF2>+aL(d1!P@ zv!1AunMpchSFb%}8Z{sd!MFG17Y8bPg0>NdF9@VZ1|g4RUHdu&gnaz*YAP8-Zhu9u@Pv7y50zp~|3LPK=E&ETY#?74*>kE* zHVeQ_SVT>gpt=RgFLb?Le+S5WZFoiddfc7U%A^0y+091D&hQpOYttMq9JQ4}yI2CJ ztGnrI_}B;}TxhUD^KIsmag|*VV6|iJCDIM%7a6JBA~rdCv46D+LPg);c zXtElQD;wE}oA1Nn|NHquJ!$Ze_M~Fj+Ycl7t-p-6zlhF8K^`r-mdOw?3n$lgv=xT1 zMIbEu!<-sMpM~3GUz&3%>ns8B6M21T|0Q&iaR+PYc_7035D90sA0R^P#q-lE^v}y| zq|}edxtm&i7Zc7Z@YlJ#Tj5bT6ISyctrbarHz2t%#ZVS#AqoGWi_4^gyXmkBIGk{) z8*dSy?)vA;mY=B7yF+A&iVP$sDP(1NG@dM*h7j$AV|kPF&DA8|tHD=63QCVLC9E8z zio>ixeKaXI-y>@N`d2;E+|Fv;8!RRW#vNK!=SpMf$(*Jo@ON|8w9-Cz)1t_toy8fC zi&ih5zxY%pqA*k2r(LcWlyfq?Dll_ztcW+iKN(K5?qI_pRpG+Dz|J9M6}1RnJNTlD#)V;)ngd%bjm_ZBeFTo^96VGBhW9x zd$m6JZ3A9O2T^F)=3+H>Q@QN*!QmLd=Xd_k!|;FHZ+QN^x%mpngqQ#0EPNfWUYRxR zk%+5y!jV)?+}=~4gz}FXT>BZ@p{p!|%ja`=bK&*uFvlNvkex74m+*)(dDCaZfm4*a z4P3`l9Ni~>oO*F5}hxUlIpz)VNQG`OwT 
zlt)3)R)*bPZz)s>Di@zHj0t&jQ)5nmJY00(G5LDoLpwL>Pu?Iz!i#Ze8E=WhrKCi*cZVr}lh5A)i657)un;1J1fa24@uihpP;WVJNik34~i!r{x7Wi)RKI z44)tb*U*t=#XPWs%2BOfs!45(lmyDLf?jiTwk_fekT&7TESYLSwnjPM@GdK_;bwtl3?+XpZ){|30!dEW-dPAT8qj-*HCg+jhYKD# zu9<+qa524y;3dx>(VRZHm1oiB9WjW)B=Qp2eEraPO{b6JUU(U~;mEM^ zD`b!&|GH`FncOm@0M7euGGL`z;@pmxjof!PT}=>bUMwqYFpfxymk>Ger{(W6AhM|a z{GJ+KYZf-X0JhuQ1fEv;fcDId2k+A_vYEv;s$u&s*C~&*EWI@? zT$~|soM#T{=}PUTA%D23iC+$z{(7$06_omZk(N>8D{J>)0sgq|_m>@7vQ%qH4j*y| z;5S-PJMR{M7HmcIxv`$mt9fYxAD04SRX;5E@EiB0R79^#U#JE3%lp8~<5bvpz~vyo zHZC4Cs!VNycIlr5GaK1pI3*nbC-vpWNfVN@56&J=(2!=7vPR!I?ISC^Y|1r>2{^joMrbEF0B6=CD%hsJTnvv0*1M^ z6+oT)@fQKt5B~9wHnt<+`gX9O!~}<%wzFE%DH;-wTumVUrB4gg!SbFFcU$NzB*Xnj6iQ zx`VrFC6pC^sRR1Fp043G=(Pcp`Z(&rZ$gxWVxpBO8OU7ejk!lK;_xE{9%#FvBszu4 zGk4Qp>~~mcEYB?kk!{Ee zv6@-_LyLj>SDFOeC8Fr|4$ncnxUWe0V9$Nc7`Il9jj? zb_m{$XM;uUrAh$J^W0;9C2*MOR_)OCxUh5JzLAna-FF|aKz8DjP0QGJz9BQ1E{kOf zRHeXNE_fd@A9zZ{gv=b9Va|bOCm4;dhsM1Yf1dB315Tml1bL$3c~0@b9w~ne39fte zm|tIY5S)Vc?r=JkUUVT8Go}*oXcG@)W!91GBYpxI&WCmAT__FkeMRUFd@8hbyC}y} z?~93INYMJ=1Igu$z||zItz!y9)cb%vCs+^2%rA3|aK*y?xQIF)kSF>7m+7o({?)R< z3qm(0MUucAw6CLySZO2qVL{kKx~IH{Z0hbn)6`- zR!%*vH&}#HuSXs{P3W-eC*~)zi->YObvtut$yt&q1rp~w*nM@@4JO}>vLJV9PS%%_ zed$F%(U(sn zUmR+D@S@Mu9#5Ix=I0(SHc~Eh4qPO9f1hCV_I=2`f#25sm$+3>J^4P~&zbl-6X)9S z_`)A?BB!996#v&xm%8EYA81TdP?MSJ>Tq5H8JDyFNDVwOCW<8FfDl|6`%b*NSku^l zJQ%IbP1HcT>XTVyE6C`k;&sdOTbssq9O#L`pywUF%wguFaj z`7z31_U;{6(X9?mu%}1Lf5!(eDITH zm(FNlgy_oRu8K~->39DWhKRYt6zgJ&J?@Z}P}Q_ZqnWcqr6N|QxbV{>(muVv zh!ck36+MhtB#JzQ{A3RqLcqz5TK+$_vDXI?N84(LZ;?pjpB+~! 
z6)3%xF5PyQQ&|5@fG^dy2Daxs4h=#i^WBe`R}s-6iUI=-0U0agNZ4+mo;%Z zb21Y^3yDNTn4giDoLAm@`D+q%z)ue`UT^mKM*JnDCqDUr<)-2cL~|W}U9-&4Wh&Mf ze)eC~0BkF`@_0}+3GL>V-h`#$X8koW-YuZxf0_NLegS354kUgxKYqOF*KKl&yel;l zE;>3=5gdLpSlQMQvEoo%bysdgj1;2yySSZGE%7Xkbu&g7soenX;qEbqi{tmIJJd<2 zWUKsRmo>gyZ%evaH`Y+9Qm2+Y7dMJG?mj1YOR8W5vgv(-36F7Adna@K1{iAw)D+}4 zjA)m^)ZRZ!5~p}3c&<<*_9Xsy528YH#Tm$zPk3bBxccs(TGv(CWs`hwqk=bal0T7A?9&?^P`VH_gc&)hnX z90>Mh$nE$q3*f2UNI`NO zW>%>0{m0aPj9^}P^hix|`!D6p5DhZRNkGq7P4v4u3N853x`=M&v?2T_k+r!troC(8 z6XdjaRSF?+6N*P)sa7m|$Vl|f?!So64il_S$ZPC>SPyyJLo|^f!{pYE{{UJ91x}89 zk2b<5M$JsN6}sA1kPeCuQ#kfLAzsI?rNk$-go2&!RF>KH z1XW!k}+TN3F)Ej33)QVLU6#s^Hn=Vt#G^g8el^xBnk;+HWuc`zoem+6iB zx8lKduRSC}%d}$e?F_`nY2$wDQri+jd9&4+xPrCF#s*m;bz_69i0z{opP*5Ul3x&d z@$x&X@2Qbk`;$;{ty{NMvgrc8UJPuZpdE42eH6o0Lxle%w%i_li>%^{k)Cz@Z6`{?*vB##E1IkyL?TDj+<2e069HFm!8+v`~?xlqX{5 zf}nkDT4}4Ol>ny@`37@9?;#pWN3puSOR+m0_>eb|BRkMfRJS`8VN4&s$_e7Io z1RU5DoV=ip!FRxf%-g9d*Oml~q2;KRQYbmI$bQ%WH+@9<5HaaOsl4ZMoq&S@Y&;vw zLPS@hYe0v(IQfTmd2$-~6JeJPu~mlIf4hX;+zk3IX&{F~1#rp9QuaGilKj2B$svKL z`m)a@(DU-#JIm@WDMEZLyb>PmerHz_Ws0tVk^D-IAPxUiuxSs2f_gp>*mMb9rN5pV z_b(>AOiV0xFIZ@+ScR420~J8Hy!@Y`MsDFscXh04FbOy(#AcuZ>BNCMO7?7^f*nl$ zI!jy7ocX|}lGpKqa~CLz%OAmR|7yAUP(Z+k9pM@<#r(3!7Ck%J@Xn18iT#y=ps^c8q)sSW=8Jq6+rPC3s*`0+LqUZ`ErqyA+Q zLOFGV28F$GmIJ&Ce72x5il0CAk1p^xq&mKs59)=f zI!NLDkHhnNlN~>wi3b$ZuBXrM4x&cv9O-o~2?86A_lm4HJ;Q9%JRh@YmeS`4fePsl zsCec2dv|fh+jN%$0NWUm_Fc7d{S~(v%QvDaKO(gWof{N^MsKViRrS0It6Bda_TD?J z>1^8@pHURiQD(q~3g}1?r6>qUjSMQ%6;!0Fh)4+tNRtu;M~0E2lmw*2LJ?_FLyrYP zYLreA$^aokC{jX6NO;#5&zyVSQ|{sSd!P5Y&wbwej}OTAyT5y{wfEYqewK?wO}Z?& zc%TBDkq4b#8sf}sM#}-H6{-$EmH1ocLHR`ZUsQ7lH&G!wdX*-t=t7W6Rq+XKw-nPLqBcYk@QG7qB$I-NcO%bhoQs~=qX+7S5cwp2oT*>(4Nx% zXWBWWD*9@784@Zu3-BLS1hbW2H#C}d}xET~$-1NCgZ%MZE33VkG)r)6x_!sDMJV@GJP4m^dZGb@w1+!l@!;FA#(MWcbYLg@d?)+7JTInNOc1pj7Qp`HQ z3ENct4w!R5Trc((xDpUc6rP00F=B;j>@GJHF)089ywz4Y8Ui~4t&!iLUkmq2qDx~U z?prs;ges#?Nb+(MF1r>%)5AV%tYN}UmHndpd8`*~KB4o9`z}l&$THd;-yxUCQ4}OU 
zCd)z2W}&GS8(c6{DFg$+=*g+CO>Q@=Aoz#?hh$|VrhOClbOFS}q!|I2RAK_`(Yn>? z{o`FU&t8YrT&(&9zF2$>Y^)~l=e6WM#pm13>JwK?I%`aQq;fc6#a^ksnIz%El`_f4>6>jhOD# zJt*)hUnfIFgKh;bru;sYbzm<_6UkaGF?hNt;s8J~+hZ?>9!NoOvabrt=`~+7of7Q; z(=3QvfcV@m{b|DgT}=7ldQ!lnTVUi$`DE-oeBmqtpcaq|nRKFBPy)D*x7upoiCb^B zc}_sR)WrY<^ZuqIz-}NMIn4iexo3dSngpE!WFVm47Tm?BYFPw;*cKK~X8E^gKf5km zAc|T3+*?On0lccO#ytfW5G&LD=BwX~N}xm2DL^hy8aROz{=rh?!06gg2VD+vxzfL* z`~Zos5tjZXQFrQ4Ves&J1KcKkNV>Q;BXekIo&FB7>qsnaHFK0V!(?tA;w8F(hGT7s z+M=_*T?=eSdw{La)dAKXn0z$Q^9}mm6%d?|WHPY6R}EzgnihmA?U2Zw!FnpVK5@Mj zd?`Jv=|A9^Lwc1UfM(DsuUGPu4szun(3cfW3lsa@Q7@=Vi_ZY1BO(pLTsh@p3Z{QU=S z{zuv5Uu;>zUu{{l1_&06fE&PC@!3keta&fwI@~KT{MhFQ@W1+K0rmIAt7XS}y>(#P zN`MjbJW&La0C98UJqjl-BB%_zQrY0&r}_muAV#P?1Ullh5!@ve=z4|}?8IrW7Gwqq zK0pH5n^F!Ooiv1QjhZXqo`5W3NTJ_lb?e>$TOJBGz8pINZjS;nL_3k(vcc~t z;`|XN2{~3?iY<@G&%3^ZpVASSsF%K%_Lm>uy#3Jry~?iPfN4}0 z+ZQlvXx3ti@2dMvsu{ecUI)_WknW6apWP3o7eQX-mr6}I5|EES&~yBr>g^;g+Eqxf zo1BBZe@U=|wr%^h0Z{n$RQjJD7l@UIG9S?N_}X6ZezU%BS^ucM_m8`-{!w2cBMsnQ zS-%em<%WXVd1}cT6S(M8b2Ar`IE0oVg>bXf`RDh2nh#7twsSO+z$!c#FZ~64gd2=j zL@~5u8%XCucB&a(z3X}VRy~&u@>8b!?Pl$}JgwsgwEg%v;yaG=KYap9DUXEY4{v_l z|LlZz{pQ%nyC7%`T+2t(2hN3mrpY|IfBL&a*B5&imEZhG)s0JiyTZ4kyxaRmpGTt% z7%(Su^Dku1$z`4}0}RQ_(R+`}uIoGcvBy~@5Uft57}vb~b|&NPuD!7mn|dI$nB&bhCv*gVwG`qXF+Od!!fk>VGSz__cJ z`4-?4WV;>t-{-!Ajs#7mcK!Sh~?6tD%_BE~d zE5}-%mVy?t$7he`LsQ;(HJ@Dn-@c-F)vBkJ`OLu`dj0Fb+sW<0YMDDPG;MAAX?HH= zMD*TUBI};?9o-dptA1h0r|Bw>-qm=@N>r8F_XTOMrg(!!-mZW9+acMz6}9`HUNYI^ zz6J9_^OV(_4Y$m!u9b;h3$Epfu*`mbj^{YmGV?d79YewX&ue!pJ~?&z+7E{g@82^N zyyfsAO5zHqB*l!*oJ&@Y@|lwM(X`&6kT+EpSdQw=U?=r2^{Ooi$_`=k~E)+D88j4!}W2vH#nQM88o-{(;CS&aTPH@Cnbu@J{P3#Am`HAnk zs^^R8&g<+bu560oYv!a6k&W#@SM(Ol5QxKVXCXf1@$Hkn5Fr^M&L&UY&O*Mnz+E6> zGBV#i&dCd6W>sKDL=yaIrS^z{vH}D4S0(woZIxv<(w+0Cyz>0f=J2}A5e!?OD`pZb zXx{2No4uO=Fy7A+ziVPR5{9aU2dt;=bOmtln^=>*ExS7KOM8}=9n+;935r)Ut8Fly zSDZPsdmIUV8~J6X0_bWLr~?;momZzk!g$#K^?~y36@!W@SeBJ7d5jJqkDoXAGxjS6 zMv+|!hflC{HVQM_&imz{37i6*eeN{@)n`=n$!LyX1jmDEoYQf|lRR<6X8QM_{Kv?GMWrYqB(glezG 
z{fZ2jd}BLas3|=FTSW1v4v8?+a)Q$PcqoOU<|3RgI&O{x%N_eT2kT;nUP07+Tz;i$ zDlm`hnT2M7Eo?$jqZq7e|7Z zH79}XH_0%c*=m?cTz$NWiKU;$283MuHfLrcqf{33hehlv%b^0RAt1%s9Cnng7qe@X zl8$bUKIX04L>J%icH&A{33OMoyR}Jf38g%j+Z6aLsG&5-^dcI*4c{`HR5PDxwnbOu z*v}|%BB^y_(QZlE-)Lpq#5tzVQDu{XSS*DF3e~4@*jTvE>*i>b;cP~vGn_z@=QCIv zo-^){j6t@EUtc?l+37|=_lOA<*|n7b)n@!zgK&DV<3gOG`Evb`T@(@`A~{RiW31G2kV|93d3fh+_JB0g{DL(YsTiglhM(b z2iSlLYc-{oMJ3S-doAQ;bM1zTC}L6(mIckMWVKX%6zGXV0yzu87;| zg+1H{B@eyT=p*U3+YHuhSL*iC};}XBsH)Pkr=oD{D+dNe0HN7EjyE`g>dXz>YeYG@B^k&@ndLT_zst1IR5 zZA)(p0LM0DC{F?nEiuB0NM2CySa^c-s+b8k6rFp@3->{7@aknKl$a948to;v>bSw< zC6R^-XGqc2D_J(oS0x{@{V;gVI;V2jHh)?;a^N>xx$Sl(+Po!%a4 zsO;89jgWGdF4ah1?n6cZkvc_@Yvoi#8WvVbFa2l*SKjX<0K0UYop{U(O)$)#>;htU zlMjUMYek+DHPveZnZn(BNfjB8u*1};F5QtAt{#j5C*P55hL96ppHSo$_s%z#*tNOj zp+PWkVMKG7w=9uxEN#!j+=&ufWcg2~UIV|>l$x2b^tYd)gKXhPuP5}TPtkM};lz>h zC`7)@Amd4q-aixf%tU@Xer({GIA?T^>8t!NWZ;w;Ol3q3_|Z_B(d_e)ke7j1EV>?9 z9ck#zFN}&6g{1y)s@Aa8elR~!#w$EAwfTLu(Ll$x51))OVm=vFW@vxj zcAvbysm4xiIz`pV<1{SMA-KS(va5cAXA4{3RjI6}tL2h}2KnAOOHKa7lESz@pL(9= zT{1T8ZJ6uWIFfjT3=aj?dc7>DnlJw06bw|+yqu!-OR{g?`o~YK+OUR_2bf)amA4PU zWYS`~tFhZoYt&6B&H8WT^BS!3KIG(KXsPt}al5VL*O?S|;aBP|L-+bY{eSC zvmVP!&apBcQoT^tb^S+E9QC36En$lpWDE_7g;G{W9eRl>NC{g=8d5Fm$2LwSEM^N% zE%|J$9+C*7BnGzlOJqKX<`FpQCfVm1Nl`U1wwAwVBH33c8$*F76wYSCv?djF6=TI7 zeRl^4?@8@E&uAgtHlgrU@<=ljJ7G+r)ONl~SoG8}bkA)QGGArVP)THoOK_jppaw(X z$IPzFz$(c@rjmUMKb}3A*=I*C&|O3$zwK7mjW=>i_9ZFFms#iKdnYbwXl26klv3_m zRMi#P$80A{{3OclEt9n9D-Mfo^9U#dGeZfUqVFRV*@eYx1LB_XL|RpRHAio~Q! 
zw?)s~+4%F2>Rf`sOWi*dr^~r2dES%+UuVfaJ8I`=tl`(nZP5jq>+kVIhGM-R#ojx) zu7U6DJdWopP-u78aBxtMfk=DULM2x0%M+_sIm)(Zr$H}6S*J#cyZzguuTe$pER_UL zcs+_`>Q?4<`64^8e~fTb7VDcVk2dj@*eQ5O)FOJjm1N)S9nFjQ@5Vnx2WAOQIoc(h z6S9zh-x49b=hf62BXG*>?3BsaE%tBYxNN|&O~@lVXA1~x2OF7^3)BhPqk|Dt}oHq zGC21$F7n?$7T3xM*Li>qw}nH}eoS^6vj;Aweu3S&jo09ZEx=O4%3RUzS5qX?lkMh` zeS){$hR&dS!fh#T+w_DL%bv^N`W0kz`}bXtB!o(gDzvotRzxZ*jDD((AWzKO>1Enc z#U?yH#F9&|R^){Rw)<*FNOdUnVF6~`j)ibhdOH4@`1FJM;+mW>l}ViOct-8szevU# zm7s-wpDf;WdGNfp{@~UL=cTHYp`azDboiL_Ubn`{n+8%bZ{bgdoOl!T5PC_>eq@p1 zD|al-J&Yb^TQZ$qqiGs+v))-Z_M-{RPt!M=ZZwuRRJG4|FX-aQZ1cTtdgFvz8H&e} zMqUSBWl>HIT7WVt2g9{Ez$1-QHw_%Wgh-D1q;fz2f4HXSmr|_zN11U~seJ{0Nn@d2 zwJ*va8pKk|WfG@d2hm>jDz1!|j9G>0NF^zCQs$kAV$g?E1idQp4B2{n{V*xoK7v;f z#d_WT(r#E;7&Klj3^F|NOFs~mC_5|VwR++mcJc=;5kn0JB%bLsR8 zG4GrQy&@6`lxANwz5p-XK_+mRg)=wNw0o9T6YDg{^r(#`r+$S+3JDSfr-Vqw?uBE~ z_%O+$X>1MJiCP4#rlfx5@=A~k!W@f5M?U&`TNz}Ey6H+^kvf(JN6qJW`h~pd`S=;O ziJhL1o19{LtgrrFQ$1?B%@Bw6*LS7t)x31o9jG{!xgnV_O1sZEye(!_U`FSX#QutW;k%kQxeg zpUy1~Xma-ISIAIsT_&{KH7NGB#6ef{VZ%><{UT5j63QxQ_lIQSCK5DUf*iYgt zIc)y+E%~L-{M;J@(TUg94mG5jt2x>%*xe+yz8#gkE%pY-apFBa`ZO(s!q1h6PYFG> zwK6sLwri+FQQP#GaZyG{n`nT8zyZNzuYqKAddQPdcTP!ecB8-x^*7D~;*8d5;XMJV zLmi^q)7mMR9_>ICE#I9M0R^<=2N zz0l>*&yEnQi?^GQJLbrH`j|dF5C2JE=rSSXdD6qA7_!hyN=uou8rK}}oR?gbXU}HG zIZJfirnH!pNXQk1JhysSCc)q_>%EcNJ8o7nyvI#)g5lhg=RJT^iYbXvYL^|ob%RLO zI#0evzMeeriq_^$sUOZJRY{_&h{GwLU|K1U{dFiUvnfNY>#Z)%=@wl1V=M?ATA%BpC^va}lQ(Pot3S|W?CQ<`C({h*# zN8YjwZg!^T$2LZ{T%H`AIOU%^%d?*Kc7=ydiNy5b`Et1Ty~!4;Ht?5qVWU@y7ZNH_ zf=Q#>VaZnX(JKofD}4JFcuyyMa^k(*b3@|gZ00@hbFp{5Rq+#n683FEEkVVC6J;%} zZzm;-ByRP2FVAwtq)XOkXE$DXPPF9{%a}D^A1;LxU*9)w#@-sLLgIVCloX8)CnHZ9 z++<1?wRI|d?%ejfz**yyWmO~^N9YUiRqK3otJ)O1Q7bTY*29coKtot`W1B(}LNkz~MCHi{0!qzWKX&=P=8)@&PWLi`6YO*s~ybN5Q?*=mwpW zaNL>K*Btlug2a6b+>_su@7Y&~?S2%7g$IY4uAfZ4l0#E)?>4&O5Exy=#LLDYiLKl1T_m zHuxm*w$$irx)nBi%DBj(HGAbi7}Z%Jp=;92ux3*7Hmy35)yAKxFL6O$2+l!EjTig- zh*;w?CC3Ad89o9xh-RL0hfFtU80k0fpI7uy_rt6RKF`cReNN$w8ozWP7TQj*=4d9y 
z>?BQ3gKl4vWv;k>9*u(>X>gX}&G8(8P>vu9lGE zj~SFxxu1GKnYHyPk7mA(=BxzY&wMuN zqNcdg41Aj=eb$1dt$V|1iaAP1U1`S$WU((267af{&dG(coRV>~z0M!K@If}H#RTq^ zd6N2^JU)O9@)lu$X&atZ!1Q;1*eE4h#1G@dSTBGn2zZnn}^ zDil>jSE0$f*2BoX>JIWPj%*>R#OY6wwXxCPwaY4|w}YrDOPqEb&N;9!M@)2czyOn)r~{mF$D=M^$-^E199r5FqF-rw5&s zVtC;7$I~Yq>;oeli_tlv6}n~8Va3JuJdUDaz0%_8v1}?_?wNLF*Yxrkb+H1vYiv!d1x{U5u)>jI-e52N z*!h((BQ+V(6O>9~<4wpeBp`M=@WFWQB;w`U2sV~fk zfYEZ9R5si(;e`Sfqjg2CUHaKs!p>spvE7CgQnUB$_?NoanQ(JG11VCrdIjRN=yUKe z7`3}jBv;Ru!iBue?-D^Dux@0AQgyLN{`jFWVs@$oVxK5(5prYrp#OGRz-Zw$uO@vpGyA3;mtksyrrbj{E*OVs$mFyL2 z8{(f&Fzls^+T%@XFSq?BCGOo8B_-}xvO4f^yhmPRP%))NI0nZWVqNRDIyRyHVtk z{B+O&1s=kV6#bNWENxyT1`Xq>VX)ojVL~S)?FA3Qrh4qP9SAA_PEN{L_UkOhk6zo=B&ZmMT1&y>>vhOvNaoTT>tmgMWU&IXx*on76AsPqeV zXg^!)RuR^*w4%f!e9?>Y)W9@ukU)mlm?HjU;t`7{4Y|z|_f3 z#;=90(iY{|5pTlI&O-K5W-Xb_npGdKi!I10Ey|Bnw^i^df^Uojg9_(_pH+!`2ES}- z!#H|Mnkcb5f=17-9w#@wl}up<^h!WwiMHusq&WrN+^4m!SOHT#%xkX}P4V=nC#c>c4vti$MU(0eqT6gn8Dfox}oadcT+D5 zyf*x#ENI5jGIHI-7n`;|Y2b{6S?fs(&piuA*WdhJ+CLZPJC$%&_tCShH_}~H?%<$KQA>DKAhciN<6~au;N6+1^|9Rj%J#EQUos_sjAg~_Z{shx{xy+TF9#L! zp1QxFeHI-{+;OBcoP4`;E(*ti)xu>IbYk!E1e)PCLcpoMx3XOU6_&eQTOwXkf zu-$~(jj3lj<~S{DOUybPXTQGcDWOO0tU~@J4F-8HrX(Sdq~yfu@gBP=C z+)jf#?vHYp1U=6%S!Lb=W57yYPO8{l!mRUs`Nv4oFcqedrQ)FC1LL*0M;`35%7N}~ z8%dlnir59(o`&Ki=fhKA9~03Kkv(-iU-ba%?%sq_HQd}6ulxB@o>((-TOOl2pn%If=|AiQn>~A)CAQQC^Yk^7J zZ$e1c#ZnVZFq0R{9B6$V`~|}zxbMw#Pz!IUTR`QQ8kShZ)<@kQps?6sM|wgHxYvJ? z?sCsT-J=9O5|6PGdX&cH8F%808Zgz$_+}*bn=_&;mnm-pG`Kvw+cIA<9gaRqt$;^` zgMXm5N@X!KLLPCx_;@I2Mc=)?@f_b|7mKYGIj+5q$kzuWJvYUys{>MOG!r`4#qy*k z%Oa%X1T66*IU#kis{><6??YHFA8&F}e|q z;cHLlilueJjMaKV#SGJ3#*952f-m*BqCP%;>s8X#iRTzt-jp;gXy#e*FzD$o;VNlK z&FP9#FUD3kf{(`#i@9?@4fbzT2r^~636v`whZ@7kXTSD;?YTvWu7tA<|NbU7mR#11mf64hG1B+LMN8PMI-N@@!%A z4;G%rSxkEMPb)llmVR#3?aj^{q1z)O{i4GTKP#gZv}pT0BT<4bJ#=pLTbIdE$GmCK z$_0I9s;Q#0*=ISr;8-ZDWAZK~41X-kgn{t%3#d^zIaf4EMI~JI6_yq@wt4m#sX@KY ziBnd~4m-%FdA;8R(`1TCX)AM>!-;(yPOxE+0u@LrO7W)sF`7E8!0NbH#! 
z$6&%dS(e4z6RAGB0|1m(o08#a#4l_Cgpoh zO_+genso^?qqju#&Pz&Dpu~g*6G+BBxlV^ulN?J@F~($d;yshR`!>#+%4*e7ao1u3 zTc$Ij;wBbwR0lSb9@LIZRAq!(2j_+cN2y0C9`8{mnn5&YkT|Lf&5PjkoP|I-yN9UG zJEQ(Za&kB@Q-SnY-55>^VL@=$jCzBo6qEi$m*eNU)sRY8v1E{z;1`j3;1{)NR)Le# zjt@GWJFQM9UkRF5APuSGJkjl}S*i%@s2)ow6UB+Nz`)gxE7I3I@rbLPGKA)gIu!A} z@-r4t77TdF%pg9}yu^MMRX#CSNM;pQ&FPS@*fgZ~$8t?sYNN&-Ls4cc?+=YerCH@R zQx!}wB|S!;x>$k%dCZK!bndt_@a%22 zh%>s-$@0u?m2<<*CAMK&{F+G?m`O`c7<;Y`0@bWG+ZM$RiB0WjhosM)_#n&Nj-Wv! zfx{kD#q=1~?09qF3uH(I5nZ*+W6AA!O(T^10eivg#vCCZ+ENOu?NY{1rem1hlFS9r z&@hM6W8Lj0*^23{;Mb|Y@1meJLY4Gf24%~C?3{r!{Mnyba!n7Iu~-Uv;!M)ZW69xW zUB7_7M5h|#_Lw$IOYUry_*je5l^c~DUZljEB<_FgVU8#d)I(kFDb8W^_IRUCrev;Q zlpZQGVl*{MVSAf}lYx}iuzGHKyve*csf1fI4Z7AKjw9~gVwH$CI1-o*8bORpS*K^q z>^}J9+yrPwMaiI6#b25{rhP02u3a*$OwxTQtu|JmO>b|?4Fjn`&inu_)DNK``<_r2Gv0#dp0`@NQ9D2k9BZJx$1|_gU+|0 z(DEHF77l+XRm>MHm8faNQdQx?Vr<10o+)W5+qe0=I+nH%6Ct(rLs9fOYdgKsBZfbN zW;SFIAw^pRg6N4Jp+6@MzZV7?YI;RTF$F_ss>915s>$KE6IZDK*Jk{s+EW~HyNqY? zt)=mupt=az$$%PUmrfm`@HFr1M^wn{@F}*wz?l34qyEaj3{n&Yt45{A-N%kw!pEu; z={4|Veh83?gm*ud1L|4y(fH}*<92%aR0z$oE{E|kaqtcc#i@B#DaRR_izCNhipM9Z_LgmxjD z=g`D~p(>^Hv(^E{33vyoBViR4OK))6- z+8*C|4pgxsM19(K6{ajbkX_gX%QV!seJu}aWlvD>Wfppr2zp%0 zX&|1R#VG#`l(RubZ4qc&Ox5@_JZuK5Hf`Il^7?gEJEOz{JE;C1)(+Y6{xv0BeUuD` zQllGsZAyEar@2b`XPL`q1i!Q2zFqD`{iOWnc zxX8&6dmZ5c`6pivy%>&^SA8vCu8aM=)%W%ImmAh+w5HLWD7}qRFY1HEJ5W#BWtQ?f zQJXJdKkxT(;izywr)v2uztGKj>+#}^mKkQ=t3BaMB<6Ug{AHy&k&t%LwtXv+@cb+6 zWqVs4Eu#FLH*x1BonLc5fA=&Cv;DWxWt|Xqz)hl zcB;&ay!k+<|f|l-|wc59J+P!Ow58c=LKJDE*BZqWh`pt zP&WmwP=~tn@&|0QVzOOB!DY^iH`YRionG7*Ew_a>%r(4>Q7mo#Eam|7K56Bkdgmqf zv$of37qsaMqg!U))afXJ4Hc}yycb5ku%4?og|^;pxcMCJ>wTtUqP@}&es^_j%Oqlj z51-JhJ@Vq-4H$-xb($e+DDT6J3U0QHbcEFx^~gWD%UC85xZU(U`z+3^kCq>cjkCby zj(%Q$>*Y-|OfK>BQ-{W4`3twUw@*f*xG$bM`%JcN48?xFo9(H`Io7P<)t)JGxO|wU zqctJF=kjfe@YZzwTn;q^9{KL|5!^zmVbsQ5_VBF>Md1?r)mF~`tYxNzw!bMP?u0!* z^AXExoA-3l*0;noq;fw$y@7mmStpFAcitm!x5YJchUnH`%KeQa(Xhai6?P3O`U&Uh zqOY=dsqIQ-}NK5W` 
z>5#Fd5vPoMhR>vTIw}hj+x|IDpEGUDDXR{}w&Mevmvnp7vmRin`V4PVOeChgUgR)2 zu2IR8)HEF>u|to%@JW7;^lf%|4r^9Aldj05_o(gpVJq4r3pkyLi9G>3iRob5rfKSZ z{3pfBvP}A-4qu4<_S;iqqFXoeHg?X&4v~~}1eHQ)1s<8qC<&vAA-J@r;yAl4#=M?5 zcrWkD!W_Rm1{-a1b+)z=6RJ#lsVS!A{ioJWTTW<$z4cfWW|AbOHrBmur*NRhOL(Ds zd`xshiXQooPtojCWZg)N2@E}{KgrO~LDhkcT*VEYS74ms`?0rrWGOyScta%b!$=7) zy%gt6R2Sz^>4lknyDestK2B$FT|AEqow$%~iGxiwDk^~k`f-d{3ijL{q-gEA%Zlb(SNAW7#Pc6)`i$;r1pw)M9$N zk{UmZ*`kl7p+6LP7=%fEfCc8!z&#;V=CDmhF})}!p8Vx~hP7o_2WS@w}?i#OTZ23aT5H@L#_kL z@8Fl|i0>4GOBpU;J3SShuuLRDq*`L+Nx{%Hr$P}^@+E_u2q|%FTs0kT=nl-wKE{Wrw> zlPq`Nc=MfAr^w3wx>mZM987Rt6Mv9Tl<1!v?D2M&GFa;y0wk;TYvRzQvjdW-C0&)# z)R+b+S~1Oq`f`s7N^h<@Gt*dxP&I)zyBT#s|BB-}PXHyFUvB;V@aeTo;tFlU$jC^; z6nAn8+Y9BupOcYpSPf86dl{O{5DVIj)MPB?c~Og6WdOz2*z6WGd%BVK!7X5DFYSSH z<<%Zfk_N%tJAa++^5Cw+TEwj?OFr$HM$nJM2oweqf={wsP_ zNR#)ty}e(~s0Az^DPCcu_1SV~aJ1(SWe-EmAVxPaVNTvB%d%H*uqX8*I_j^i``15Q zaasqc`(C(nvvV1!s-#=*_-LwKWi6My#Gf(DNp~(SQR$jy^H9Y*1IXn>_2obK<&cgD zW%Y)x1s8jitEVGqfRihSg1`=vw`>yqAVc#{EC6J-Gw?pavTJ7(;ju6PJkbo@?wFR% z7w1{e7~1OwH7bdIy~|s48n?_ouOr+c{-J6vO|ExLqbH+%7V;LSEOTt@F=A43=^4r1 zWbBafFSd~@P%UI#Z*A&hJWz<|xcKC@s4}u;wZ0BMl#`7L7(Q6x=@qh<`qYf|WTg#H zt{n01(_I5IU;nahNurRrlt`57Umj6z%+O?J$~sC{%)AigykvJ4?5h6yD@};xdj=Pa ztw|(uC`6eRR^Ez5^)gM)t!Ic(Nt2;LM~HC&4ha72s#+dJi zBQ~GK^TdB*Ot@kE{YDh<=9)|O#@LW;sy7?GaWjn<&8#~D-f-96gQkID(Xlimb|K%>cmXUc3+ z^r=MXk{b9nS4?F*(>Tz{Z%`25U4=zyEe`G?mJTP@k!kw2ER!-y?_{TVzV$+Rm(0?} z%uw~=12iNzIT$sSu&~mF;h6bBDm%sXS|U;M>LmZlSN;k&i@k@FPUQw9QUEdTfsIY4 z*UF6CiU}mQ>v+}S-F|h(?W!EuK!YY`cl^34`Z{vw*_-*)CmfBB0KVO!Jilh=o#Sus z@rra?PdO^YToy|){N=OKS|t6YQ~Uc%n7`?$KWDck_S}2FO)XoU^*L&i%b5B!o3c<@ z0QEnUmMyYn!_fuw6n|`hr#UL&@Ki-J+SRmgsl~^;{_0X8PL%a2w1ExP4_t{xWRK)> z-2xewTWl;{PIG^_CUCa}3NYVZ(>P}Sjb6km+7pB;7 zR>k{8;%f2&vbvzJO5-GH*ZdZ`vHv81YKUW0J^iu8vA&%W$F<{?@tc`mZ&G&d*0Q)*`X9ei;DN{_*Pk zacKEB!Tn(*h&|E8NoZa+Z{|k3$+Exbpe*`0lRh|@Rs|Go{6qslyzyNOZs3<^p;)^} zmGpgqbDvk`Ql?~>8YeBt)1q1z?dqDlGSiL8ORFiQYc7=81EK@XUe3QKcnCqHtEj11 
zynM-KzwRR>%UPcr?t(!;H5=iqVQ|XCrDhH;_cKvE#rmL<+s|vbyDx8DCqPQa?1UZJ z?`xuXRy{E`bz&`7ZBx|9R0wf7>N42$(O+f}U3s#2NV9#8HG?HU z_Q^jU8yV4QJXLc2;99uDgEa>QXYTEP3u3vr4fg10&ODw0L70&QpxdNu27AecG!uCA_c8AF$nygJQkfAC{` zRREuf_aU^T%tW-ViH09`UapN3AYp&<=go@tEeW~XXS9YL_gWKFbHf#Nef7lZ=~Smj zzDiQMlmlxxjq#447`LIS;7GcuXYPSP^jf{wa!uT@K)7a{2S;+R@vDtt4l&V1!fO%3 z2|HnSoVFuyBx_?{T{RaZRfBHRZ0uBr4*l2a*niOAjmKsLS(^4VWZU7DdAF5Ixn4QW zajREqnp~h`88k^IUQzCYvN{|oXugm}lf(ESCBY5&l_flmKk$q8mIiz1>$TK0Gr)@~ z-RJS~08KH^&rnx`bf>d$XDkO^Fz5EKuraiUw#(NN5)&=IvPyR#_fZ3K<d^@4Cfpo&?_ov}=$qED9OTdT110voijYy}o+Wtp@g&-{G3v&RY` zOOF)0e?Gk{?x9?@Z)m+oRW+BHCr2pn$l2+#s{i=(62F|5v-nD$&+dMGRl97d+0AAo z*E5*`28M_Vv|Ru9hTWyOH*!JK59J%~jn0oE_e69cfY)SfV8^@Z=&$Q>t(dr^1PFRa z(O8@_tO}@c+J*w<*SK3=P^a!=?+s{Y8=zA#T|b)S#M!br+}xenVTQ9^VRhLth=?W+ z`fO_l>V{xr$(=oQL>D#UyB#!j0MTuYNx5Kz)L4?8eMX@(pByrgM$r3uCj;D!XRcKw zv;1>EfJsm#rjkvxa{sWS*9*_nENnieh(E3$=AicTXw{lT#-<-HpBmVxT&mUMoL7a| zE`{P!(wKM0`;F!SRzKjePi`MhEG4JU(~h*3y}aqi_@C}Epr8_|?$Yv`FT`61Exs4U z?(JW!b!L5pkw)7b=xZT9Ozg-ysL=CO0hFLd2hG`pnWwxOFRcJV!P&lR*LX%v;Rdlw zYv3X>?JEWAmL3$=X-v8!DztPzC|3wvU=UOC&+ zi|X`7ejj3ZAbdislrFK4R46^|T~aKgdp}vzNj$qBlY6vY-P6;Pwu`uU zx(90I8WCLLHzU6v$vp(%q!$-h(7DDM_Dg|BGQ-M2t1P>8u2p~4AW{O{v><50>tl}` zV3|W|D3omwsCBsW+ggIVc3S3ZFy%^;X})OKV*A)td&WL0CNMmqFp~xbqvxuf)#1uv z+OBvlRS?+~ljLT_iWmKNvCh*zL3E!6KrkqICAm@x7G}l76x4Wbv7w~w=pos1#?0&| z4~j1RR#Q0f*c7{lg^?K(e1IXLp0*Fl`Rs|DdCS*PqU`Rv21L*G zfCfZ3K;9%)4vWi4A=X*2hxs&=_7hhH4OG#~B8qUu{bEUtQQa*K5?T`<4@OCga=(UF z3_shjp*DMQM44!AGhcvP9xoO#g&%qZOh2@hMuGCaQ>HL>m#3`Ji9hap6ejk% zXiY<(wnD30Z#U3-Ku896l{lP^CcC;nw2Un~-wAr)+9_k@M~mxKjuHP}y8kxze&O=c zJH&E(t0{5MX-?dbrZMGbtim2ITs7kXOYYqUTEQS#sxna5XQ0CR@}lBYal0opt(HBM z=vGJu%fG@cx`ALRNu-|J^SB{{~B(DiI(d9v+X5i)Mxxo%Yv;h!auFc>*7#LF(@e= z5x?O9Mq`=DV2_8>)MaY{l6MoSVl9_cXV|JCb@P7=see>9{wr)nXT@pcYCeJ}N{t_O z*{b^nZBza7bmsZ4s+UyIV?`0|QcvB|XhJwkf3PQV$xnGL)~o7Im$Skg01MQwtg|@x z;;O@@PQ+jOgeBsCSoP!;K*v-$Uv!z-S#?X-X=fWe_##~C$^S@}fBYZD*(*5`DYWex z1K;x>;uQ_JwbV#lZp>-vR_mLNoGHLnWq+%;=jd^?2YdAD-7sKdl=L^Ltwka0eD=n| 
zNSzIyS8D`YIuhFWPLf<*!TGDN#C}LzeCWX%&A>r_`uwcHaG=bNjH|k82c}(g4L1L@ z7V++LuQ$w2^=fAmjsh%VmK+8R?GH`c&LmQquC;Jn+ILR{1A&X|<;*Wx`2jwi^ z8myb|*V*StrsVu;dT823CMm6d{6$S+v1jhM)LIy}c{4~0&FcR{i|W6DVEsQe8vkdB zf3wN|D>Q&BPr@7!M4g6%BGbREs=VKHxYw1#(fSR5hHRc&%^HIqtPeKik=|_%b8x0h z!!cE(H>3#s&D8(ZMVkE-xuAyr+c`7-XJ7sebN?&$<^OM&RO#UFM;dY~CM^)~_yYE1 z1&%XM$wN?oX#iwI2_6a>+S`AbJ1x4BUrE=Pl7e`sw+eJJX(SLi6NpLkNPC-<@txL~ zhR-^p){vpJ=mEYIoLS}Ijx7z`VM&yf|3tA_PoV4>O#{5#@&CsqTNa4&LXuR4J< z%jVqtLeSJg10e0$Oy23Lz$#b%vDI3h1fh}14KyUgIP)n~v&hQ#k6*XgB*yJP0#6MN zD0a_Gq)(;Y+XI|B$fNRWvJ)`{ox@EUOjO#gv+ro&rCH83oO51jLvh+Xkho;!*I(tv ziMJ!TGSlOG^8EzYu&_5yKc0;?7x-wi)L`aaG27&zS!fl=?4T9r1x^I_RxDIaSx2hR ze4HuD(8%$h0JGZ9Og7AUVg~-6GfrILiuaFrz1Z9Pa(NcD;)z7=PApE`T-sR_g;^e= z;kl!DvSCyE)DpI|G-x)l-|ChX{7ozk9EhS8U!Uw~l>1s*r;$5ewa|ySXROXGSxHEKPe6E=5L93GfT1?mHlXzl}6Ck)Pt_pUuKCx{EE<2qJ(nB@J8d71>y>)5$)=k znCrrMP{bcFu%1?L9OOZWK!RD8C29|eh|hW7tvBCW2ls>1rd@at3D?OUn1XmVMrCmz z#9{h%7xfs;*s<@AZ9ZSBbEfj60>`l&5`hy4E(e*V2i3YVo-YiPFixH(Pu3O}@wylA zq6ZwJE3v#9U$lDh&6PdPt|b?fmmcJbMR6do?S_-~1^~@i1PN?4#_v-(BuqG*=r+#E zBC@gyrmET4)d1d~t_!T&xvXDjGt>*46$dK$q}o7B6RW!tEnb z7}uhpy;L}H<)vIFw{!iTh+39V@nNghj$XN^X~5fTbFM4SsxW zHu%^&UElHvw43EjhWswe=u_C0=tITelL(i&LyI%>u2(^n@P!;31aqzq%bAYCP~)op z*y^^L4v|m0!Tc5c!KG?k#NQ}rcyg2n-#7!jLKY(+4)wuLzZqOKQWYIpqmc0be z936rH)Qx2xHT5;=tUQ|x5O03!={03WAh))HEJVX0`bMFJL$aOx>DGdN;^kkf??_ll)}_~;a6f5U&JK>dw_`Kb6k z=tlOG%IYM@0&rMVBO=?SI<6J+be4dYRUF72*I0|J-Oe)!={DFHW6d%UAPWiaI?>T@ z$h*L&HcxtX@Ne$@e>tc4*KhtdL}u*UsHAg63F21GwSKp^{}T@V!)6!U4xuA0gTyDg z;D}N1vk>0CmYGLxg5b$d8dKK%a40(8n|p_^n!Cw48A z=zIx#SnC714VuArgeKoj%V*f-g2mqY` zQY5Mg9i=K(I)|p!MyA)z&d!Q=)ghU7IOv~{Muo(ueYU<2Y!mhhC)N}SX};YSv4*wY z{qA1<7Ap|{M^?3JA%~>%k*Z@a>&;#_5Wqnchu>eJ^Vik=XFt3d+tCd%jYcHWdB^9L zp1;Jji{5J`{r~Kpzs~Q?_?D*T=D6lWWfJ$^eB`;ciR9dw6!Oa=S4UBkkY+2hg#OD0f=JDKnPL) zf3^4BVKu*h<8_>9811Cuw9(K|s8fy+QmIH9($Jt%Mx{mGXr~eyTE{9>T8j2jR?;AC zZAC+il8m0$yU&Nm$0xeKzw7!v*K<7=f4#5V_qp%;b-(ug8Vz?3buMoZ&Pz9%y7Z}M zMcg9FpQuTdj#fNiB$bz){7LqMzqTjDg=K^#1$_s?uRqMPSjdBJl@h*--S_3d-m)5x 
z)_j%9q_9^`y@!4FFCWc+S{@QUI=8JJg_aoKGlV(0Je7TP{z`~yF2sOwkYs?%bjzX0 zNKzF?sB7)4VQqp6@Cp^yNf89*PMV6dHai3}0zbK4`1Mi0ql=jUkgl<>c(b5m1kmo= zj_r{svs7?Sw!o%AcEAs22|RnZ2Zc>U;ZpltV$C`&9blV8*#iM6YkgM?ssSR3D#pf0 z0_zISE8qBMHkFA1M{BsgbL00j|N0|MA0$r71$nErySBm(x0WU$iMbjahFOUZgHzaX zzW!=s{X7(K)Cj}QMq^k!GL&!@UGdH$6-80d$AQP(k>sw0Ujj#B={6Kknq(s2aoo^p z@$;L>9c*QWDY=Zvys%rlZoLLwl-+jW0&@5BUc)3Dvb|B6tv_+kY436eerF0sd~s~B zmi%EMf!PQQgZ2s+{1wl`t#`mt?c;WBMu|30!$p=0U8QlIH{hi6p^E|{C~J)Y6q^=H zS=C*ES);DC0`e%cF2h>fUBg0{(C||=6w0Jf#Yw}-b2{RQ?l9Gz^c6+(r?f5j+P2_{ zzj&MhynYZ*(P8ue##u}Ck#rkV463<$bcwCXQU3@5W(E$fC49F`88>e;!!IMS&DDR} zEBoiQLbM3K^9x-o2>V}_2!Tx#%C4I=&HM>Rz;E;Vb%cSwECTX?nYc^_qB<&6KlP7* z`z2?}SMpT8QW9cBsx=oUof48^!g9dVrrpsXB60vpo6=%=3GRrcAHetWB>mCiyo8XF z3J!z$xMj8Ge9gLFQ~8hAh4BJfD{c_4zBH|(@$VloTLZW$<(r{|LaH!EW&>Qe^&3ki zP9!)H86|xI54&{q=OCaVet*_m%m$!EW=#C0&O=N8d3XQbJ^mMYX}mA@;YZ3X>EyBW z5UCH~!L$CnPhc@tvrOhgRHf&pj)q?Hsxc(yOlgCD!_p9h@%jl~d3XdIoH_Xr#V4@T zii@65%{i2)n0Ujk31we9IPSi=9QXt;K|B%`k$;uks~d9BNJUZe1Z_RwD~nb#N*JSo zFab>DrYCj~&^EwVRr^@FNsAya2%J`I+m%xYk~di!`xd6zg$FM|FO1#sjKLRa%s9KH zciM@T$$GrVn9J3{Uktz8h7jdx@0{5cvQZB z9ATQgBplX9`_=~I8bCe$(Vp}pDBuso%_hx9cEki=RQB%K*(hm7GFEUiJ7#CQ5%vOE zoWasj!o|!+aohOI>&J^L65xOhgyN_&P=s@pZAuX$7~>gT0VQh@08gw9gNM-*pI!pl9GB$X&Xtpf^ zB*}Z|)PiCVrai|oePZr)uL69psWvT#5Q4j`aahca+31^w`zcaNtF04J)Ov7$Jawj+ zS>fmylxKu8MGO>kdxD?hy0%4(Pme)nQ&iFH!FSvU0HG&HNFRi#qYG=lcCM9|Cg~VC7;`if(l~dL-6>Lzx%aP-7*N=JlAwx zDn%I+ksW{#$*#Md!4i;d^RBje*m00PF-x7?v`3)}<*X1wtBw_nH74=9)SKfmrSYYm z(J$7iBWHJj%;X^8obx02Is3I8r4K8L#z~#*6h#6Tu{NFtA+5SC;N}b$4qk2XMc^}g zh(SjK`z1~k%7^&aV~qXKx}^)ktwzq+1jaZZ>q=ndkBS~F=rc*>cYAfV3%{KSrJ=Xv z#++w$#pRB#JvhaG)IEO1z0z!n5dJz@+0xu8a!1Xau7&jyNmg)vasBnZRR)o(iC*j) z;^y`*E*7t*i#;7Xua_J&t4(Yr>2PYRRN%UR%n!$}2c>;5>NDhgn2&iC_x9TE z%Tir?s`L~^77GapNz2G!uc)XfuBq9Xt|6yXFnNiN&hz6rtmbyZ~O>(|9&V`If7 zB^$;+yKh`TP-CdAtu5~Ewtm*x5NlUH>0!Al3q!$`FxJRziI^trKmAM~+^RM?he#l@VIp&3;@>3>SfI ziWtC!;DAI5_Irfb) 
zS{$royT(oLQF5S$9>F9p%`R`^Rxaowexce5%qdNNPMUck>@$&AIntElA1z37tqoFna}vQOqMm?t=vQuLJWnFI>Ju(Yps?Fs zHdb=dBa@x5wHBnT>}LQ0wLGOWn*q1EKN7OEr>B7*&-mC7uJrZW=z?u%JGn~TNk@96X1F`aObAAp2u`>Z;i>7T9-na~o)77{V z<;iMjnarXy>x)_!oGTS^QW603Mh~b$pY68XkEi~XQs2e$vbov2si|q`{d>KyT@E6Y zHe-N`<9;%Vaad)E)z#Id<>m27OG_8Gx0|0cuHnO85sgaP55~RJn~n06u|?lZG5X8+ z>CAg(tlPA;OS-zOGBOthQFbsG@$vDUw9#q~F8l#^mP%IHW--S*hfd+(RUw)9Pj#lt z^H$EminxY847CQheSe^%*c0BmDBB-6;C=4gxwN99CB~m7L_<%XNNv=#IOFpxi2MXK ziZfhejbRcBZNcqhB0wu)23G`s*z)gI(uoB!ajspCz$)=yBr(C|h}7<5oS6?z!ADQL z%E;6-e%G#TXqrJF)9t3WX$1uW!66|f_wL=Zh*+a_f+C33mX$$AbU`IwlwMwVNf zujco;XEr44>2mCt$!0!V_QyuFT?ecSj+Y>4ogG}R9v&>p%F6AZKatIO(6`i*T`6{3PDI`rI?NFWy71aC5rqjAba^ z{+|3G!nx?MeE@{ZCb18Nd05GTo>(J-*7iygI#IqGE8mj{I3GJlDLFtjV~cG zgHFMfaO5wt|Gn);bpQb8uA&~l1<=cJXn8p^0HEnOv69{3GTUWNIL_e%7g40bIE;O# zO412|EdwUGz8G1;ttwC=PkZR?oU7uemNcTkEQ> zzeSp`!8&0otH^A&Y6=^Xv#3q{%KbJb=0$8;I!wM=q-WO!1)1r)s3f9bz*0fWn{pq~Rl%@{SE<(rOsn@%SER7t_+X3lTYI5JYbS?8%Dbni@NY4p~D|rW55h z``LBKO;+s^gfE;r2Qz-;eKvg*>uPf zqWe7%b|(jWDwENZ5EZP-g+Q=IbxqqX2(Y6kq_Wkg@7+8Ofr^~RKbD#Ls0DHtbel{X zXA z$TN?B3gb1*2bTpF#{j9|-ZC}V@Q3Fr8r?@}Wh;vK#l*ylCr0~9pppsK9*DAheaCVG zpVp)?$?MY#)*jT{mq5fA3t4a7iX@^6*B3T&MkCc(@Oq>M*1iG(I~m2mI*= zDLn*XVc~YDh{A{$$I(cJIOR^uqEC_PPZM3kH){GI zy-!l8JXlv*gfC?Gt-?!h6!3RuP6E=rOZh>&{LK+4wFC&sb##Ro+=XD}r^|fFT3f(! 
z+sD_psbEPbN~{7sP$(o}PBLxSu;Eh8#jm`YUs>x#s~ql>jE}pnhW4!`qjjcB!ljI} z`?O3)MXKC~?#B>8T*qRdG(}9Ta-%0BQ(a#doet z3U+t2FdM;DVu=rR&JPbXpbSfP91Ploz$Rt#PINT%tG0;hL;$*)WKyBbz~Nmi zcvvHK0UVpSS0Knj#;c?>tFrJk`-Th|6AB+czoxmRJp1GIu+_kYQV2@*(j9FO2d#Ve z>qKA4sRgPAVkc6VR^qhGnvW!r+&RXff)zKOns>10T7(ZCetoCJ^CBJQ#&mlCqU*=< zRohXxCp}>vb3SD0w}TvJ-nGZgYBhwn7uOYi9i5$cNCA^$Fv9y+b5%Zh+EV+IL<80k z#d5ppYwwD8ekngjQehShvuDpPHun|a#vh-dc$+w9pV=XT7W4SdXszBn1b#CmnV6WD z#)&Sxcoqf2fG1a7g-bs-F@U0K;}Z2GFmom!YfA(fgLAu;%iwyh79!>N!fryTl)eG^DrTQ?@0uHsxarH<$jKtnJDED z41J*H(7pwXX(6WKh5r^O|NhH69vyOYQIt08b&##e%xX3$++w8=aB9Xp4W`qz5R}~1 zJ)M~Yp0KysMH6K!cJlqhRZ6pC$Ue}-tqNI8KJex%F?a6V{ILU*F_)%{)h8DuN_uOO zuOVpvayVNy+_p=#bxF^i4+eLDq^oHc%Z`)0ls z{H}YsTp^B^9nbY26y4Uk0o0%c>?=?p4Pq{U*4EZmVOkLNBtPfr4cBmuu(99yr@!BY zP9%*+qZkypXT`hLQU(HS=27!dFb~Z976u<5pJJ|EyV+Esa?*$6ksRHkRgI2CUmD*{ zoc2Z17B&ju)@N;saf#j>u`vH2-8;~;GoKy5db~dB5Y#eT7IU=4&Dpf)GcVC7ggoL0 zVGA;QP#z%sOwDqqLXc0lLiq^ug9l~4Vp{c?w99IN#W!~*6+eEwrP5{%)dQ>5>)SD$ z?6Ec;$!OCtv#=1-oXmajw&{*90}J=c)&5CL=S4vq*Kf&2P;@O}X!y>UaP=*~gD#~7 zpu%!vmN%f<&-S<%&OH1+Dl-oNBG=M zQUu_``MT0M1oll`0b`GuyD(uYz>LR8NSPXxWpBGs^7Onb7B)2?*m$LYjcaC1jORzB|A{d|E8a> zQi81Vrs1Hy>_``o$uf7|JZ}ZN+S8d8N)j=y9sZeX`%MT$Tef{FfMEePSyf8se?g2t zst(lo-waVa0(uBSoyB$^|JdccyHgY5sM&^+6m9MeqcA#3J^`wXo0j0E_zpr5KyT;1 z+6T(lOBpSff?zYhu8+W!&kG$anYZdRUWK6@d>qcpB%01qIN_pn@0+1KfnyFt(2{Py zON63t6Ix!3tZ(9hV^1bGBPjGFopjp$`x_T7TEqLNZC^GfEK&%6b4kleg82WPn0OKs7>)36=pvpFC;G9)^=W zIOna700d$;@M}eFnGlZap#3BZZF%$~ivCJ{e~-IGC%YX)WPllWfBMz1AAgEu2$I|K zio9zFoXdaCKF^tNl~M{rMH3xAh{5Gvd~p<|x)?t*e#bmwA6$aR&A*Z+jMG9zwW9FkOsVWFaLw>*kK{aK)P60?0 zHg0BpgPVCQE0>@!SwApTB!l@ib>m7$d*@Ty?f~xA`1SJ{SV4DOt#{T+rVf*G>g=Etr)!k4ihk4(-G`o&xwJ)v(*l0b@ zL(wO}C{rP}K!*xnyg*GmUY2L^pqt=g0D^0V3=;yB0j4_$f!tEeQ2k{|iJ&ikvG9MO z1k+*L84|pld*`is$+}JT3?`3-8+rUe;&U5}$MJyVGRe zSDRuEWopWjk(*;`+hg zwx~$XsY^^(aXD;ZYmMT6uLpkI&4*Fsfsd8n%|pqWr4j&}m{|qZQCZ+ay2|r30v(d( z!`Rh_Wz^hYp)M3kvsE3K zxhUH?K?J@}v<9OkTK$0?u@^VyTkGe|ybe7<0j#_lVxIxw2&`02|LOn=6kK6<9pw)# 
z-9T(~#iqXffm!_u7Y$*kl|X8j5Ljb@tNE{>=rm73l~K$KIm+|`cbP;aA3=eFEq?ge zdIq93$iL{b4+2r3unZ1=Ei{bx2p%vayHgoqcg%Pgj>tZ{P2ylWA`qTV3=O(3 zOa)~OK}JT>-DW9GDjqu^Hrk9}%95(i(X#6H@Hymkg&(X``S9K;cmX8xuZ17@=;nr! z??z(Bpb-3OyyHqcULAe)*YQr_s3!}FPgwUgySK2o+~ z0dx(zQ^LTrJG8{eg%o<8(g$2MwZm}xS>At@GS>;GB-JdpGjICY+&{wn%^n@?tMkRv zWh!2)fWbIQI9_8tny?M!N`NvTEGNLm@R8jb1R}!^ss~;&{el}DD0mMv7z$dvVePO{ zFanpwHRBqf$(@e98xeP7DOF{ZbJ$9`Z=TeTPshKyk(fw^i0cW zESDp0XWIuJV6W|D4N<8hKf`%CT0Q;kQ54FxdE)_5p3wQ}%K8(Ocm{Y8RXvr5a$M>n z4EL{034F-JoewG6whF#V-gHYhOFCVxg+fyR z=Go7TP+WyYH1K8!;QfYtA#UnbQSv;djbOHM>n_}cqC7{&|I-*fcznqi1ofD}Ejnwx zMv5WSg0Tv*Cq8L_8&*QW%>%(UkOSv#8EK=02#Ei1OM4#_RUs0X{JUcmDlZVf^~+`y zx3vx)t;FCLOsevR51xIhC^2@dzA%=5P2^=VEHol_M|S2*B=Nye`^&etSO6*dGg$9N zKz_^w#^yqJ^X{-@7@84WcbE;)w#NuxGr=5^=^S@_|%^%Ds4U|p9#?odO40w)}P`>fS@S&+@anH)%?avL5>)|qPVQQM}gU{Td-3nJ_z@#GGG(Ym45B>eI zWrrDfi+va$_A#Rs`*M-Qfi^nOP8ht?YoL5b~^hi`O{%* z0D-$jMrPhISZG0nqEF3(#j=n1IR+pS35xJ}k{+^^ZzLIGwRm?tm+U4(1YMs(Ql5H4 z!}+rJ>|YjzLcDF0khpKMsAcrlQE$8zbh2S) z_vD@r4D^IA9Pk+T$>yyX!WH7I#S)JlR0|6K?@Riv6rfDPtxx16d;Z`Eu42qmt z>UXaKG|6VRi8-|CZtu;cv5DkWkWadqmfD#7sOAO==Tcw+fB7V%UEQsDP!s-W30^7w z-GN|nirG=X^kxN56mLuDc^cl0`Cz$1-RU`tT}d}`=4p_eVg3t_m1}2~B2+WXCeBTa z|M+-&JG2>JYH+WYI04>gat5gA!oHaw>5)ho+|{QSvp^x%C?24eUdcCT+i zQTqJA1t!_JZ*F)FaEWeuV+2A7&4i&9J&jVB==?^lv<|Fg`skXv#mT~Z^Kp= zLI`Ax%?EC#bb4eV762d1XFKyFK;gd$IS+XL+gt5+C@WnB>JJMjOXCG=(TX!+=L{5r zOM+`K^g{UEV|#J+SCr`piu(H$qOR;)ELAB~b^r%rdR^6NfHp_(fXLtZq%#v9W(n{L zJ5$l-4s)@qT3o(Vw(V&>vZEtO!c@=ts3V>kr{_7HAE_gxuE>_GzSciBK z#@Z>-e~19HPv3TA48hA{NdjKJ&QE^}ND5qsR1#kh4x;g4+0vu4dYQ_0ap2RW7IT~98wLkAH511sN&NLPUost@5@86U>6IZ>btAHKx;FRi z;ek#(IYZEG(OWo#EO&2S5I^|b4ek9Sks9q%<`Jsbat%PWD9^jl*2i{r^Vx6ofPw=7 zT)2}Ie>>Lg|H_^EA4Jlz`sq&@piqj5Il{!n@V5LSNdcfGrwloiK3a6xEh1!ca^o35 zP}8|2fJ-7*sl=DDU%JW&5;8qL@F0mK3Q2V8!Xa9E`=T7jOo!G&6)=fa+o31r!Mo5H zy@YKu|Fb)JCg<-^`fen~Z6}v0?`=ZR+cUW8bDiyQPcoj!3)1GPOQi!3A1^P{904bt zJ4-&SA$V}@%S!y(KWgHL!f{yr48FaF_QdyRncX|-C$6vv%Ljfp@}ydEW-7-n!_a_$ 
zrVKV9b~R2PT?E!h?1q;j#kDK6Ij(O7t8%&e>uI*?;_G*Vj!xVcFLd!QCG9<)_BQxR zS|kl@&8MqSN_A5*K(dr>vKhJ$%#VFR)i}c0!eHpuZLI@ZV7n;X7(}W^7YFcg?|5ha z#~b~}8~uOH8;vK>9|X6VuNO31HXPi)=6fjy6slL^zB}?nq9pcc3-3VTr-9(Lo$Clh zcW_ttHCHjsOpTE2$p|jRA|_}=X^(d*E{1+t%{1aI5uICDB@*1M}3Bs z#zHW5q14ry3+TtoG0&0jN;m8N^MZP+JKp`bn+3a(`0FpBht} z$98$W+w=}}&H%2VqsJd?ay;~xDoN7Cu4zR0JbU)+AT&?+#@i72JDA%O_On3D?*7*{ z?g$zuuE~C9a=*1qzuWSZMqqINn|_|^4233de>$GTf^sR0PRnFguH@Xbd2x*_ zce-uYfeGn<36yCB(3b6O4L0%f@Q&U+w#~{rZDygj=T8ueDV?z%sq5Qoo`V(cJK6Zroi^OFQTnA$Z|*GGJ(FPgB8v6v_c} zv^`ZdsdfO2TRomA&TJ~~=1br>=yd}CcTLREab|+2%!Z-7r}ue_;HFtnc@+xSs$ovy z0>+gP+g}@$XN-aq%i#?>iS8L2F&94i2<4j*aG2NuV*`cf>^KiFu(kMK@qe2-^s#5WEgJ!;^AMBBIdvb}*)3(0$zPaCKDGd|rH z1Gl%aQ1>!5m~;N>Ta-r)9XLpdPRe=Q+GdyKMalLv69YVm(#zjOJq{leXPSTB$244r zq;n1y&`JvymBIb({FD|1XtVn5g59;)Ylee8A-?vby7C~`foURBsG695($4OV?7x5W zGAAVC+m$XoC5yXg8?$zIV#1_&2)ydlE1`r{o`C!Z@cEg!Lds9^h6O5|c#oW=)^Z#F zBUvcGq&cZ~7tdndeYjLqKxI&nkO|gIc|xbk%#;iKo0loZbELE#svwu`6HZ!*rxKjc z_dp>>2x*%#SI@fB?5j8&M*)Ptd#==97nLGOJfF#06gX3eyU2 zN+lr$B7(C$R3Ffzuo2=S_WHtO9+ADYl};Wn>$WvX8r@!s_YH-VmwD#p7ZnaIu+Oz5 z8hJteD_~CkKLvaL0;%&@^1;w;Pb-rh9RYW1nX)O8^9diSw5LoBN_}x3TD=J+ z!9NllHTEt`2HCHJa8H&te2oHy7kJ{9aykp+QwTm#6b|J12JOkzg1kC!DdrNod>@^t zO%E97`5Pt05_#GdaSjyE(@9ET_vh2u;)R(Nv$4LuzKryA-GRYD`TG~nf_K=*rd+Q_XlA^^io~I)t|`<=f6~*_)0&sh z$HB-cLuR>u6sg)+9YaNBW!b^ik~};-R>zKg>02=Zy@=wu-v}&O60-Ko`8gWoq2Na-nxe7=6PE?3zuK!2O;9{6tx!gh-q07jW_>*E>7?)LbS$R5zZW{SWy?mnGP|bcb zg2xxXcPayblH15>&dgUgbp|~^yd=N3OA&|PCYia`t`8iTW~0-EG++U6Q(Wmt;UDmm z7&mclrP01;0!SDP5pcS2Q$pES5QkTt1W>X_<<6Sfo}D_(a^{6`JMX(7JV{OsH)53` z+x8hj=0Kb=n{^dF|Q=P{K#X#|?Z+%z4jjvXWW&dEip3 zr^jHITS=y)kbBIu2=g9`A z1T@vo$jBJ^=qsUO?yAq!h~#sE&5Zj|idt8Qi;G*?+4Tu8S@1Ea-HrB!ry2JPGLd>gYwC)v7@5qc8~}%7C_=X8ZT|Ogbtllga%nw>a9`+P>^LU5qLc z4q=IjiE^t}?Q(H(dD7Cd=e1)~UO@ruz*CEWqGN$8GJ|F&vozJqLcVO5 + + + + + gc-compaction-split + + + Layer 1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + branch point + + + + + + + + last branch point + + + + + + Job 1 + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + branch point + + + + + + + + last branch point + + + + + + Job 2 + + + + + + Job 3 + + + + + + Job 4 + + + + + + Job 5 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Delta Layer + + + + + + + Image Layer + + + + + From 68120cfa31b10eda7f807c74b6049f60d7400a45 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Wed, 14 May 2025 15:19:53 +0200 Subject: [PATCH 42/65] Fix Cloud Extensions Regression (#11907) ## Problem The regression test on extensions relied on the admin API to set the default endpoint settings, which is not stable and requires admin privileges. Specifically: - The workflow was using `default_endpoint_settings` to configure necessary PostgreSQL settings like `DateStyle`, `TimeZone`, and `neon.allow_unstable_extensions` - This approach was failing because the API endpoint for setting `default_endpoint_settings` was changed (referenced in a comment as issue #27108) - The admin API requires special privileges. 
## Summary of changes We get rid of the admin API dependency and use ALTER DATABASE statements instead: **Removed the default_endpoint_settings mechanism:** - Removed the default_endpoint_settings input parameter from the neon-project-create action - Removed the API call that was attempting to set these settings at the project level - Completely removed the default_endpoint_settings configuration from the cloud-extensions workflow **Added database-level settings:** - Created a new `alter_db.sh` script that applies the same settings directly to each test database - Modified all extension test scripts to call this script after database creation --- .../actions/neon-project-create/action.yml | 20 ------------------- .github/workflows/cloud-extensions.yml | 15 +------------- docker-compose/ext-src/alter_db.sh | 8 ++++++++ .../ext-src/pg_graphql-src/regular-test.sh | 1 + .../ext-src/pgrag-src/regular-test.sh | 1 + docker-compose/ext-src/pgx_ulid-src/Makefile | 1 + .../ext-src/plv8-src/regular-test.sh | 1 + .../ext-src/rag_bge_small_en_v15-src/Makefile | 1 + .../rag_jina_reranker_v1_tiny_en-src/Makefile | 1 + .../ext-src/rum-src/regular-test.sh | 1 + 10 files changed, 16 insertions(+), 34 deletions(-) create mode 100755 docker-compose/ext-src/alter_db.sh diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index a5b4104908..d7ff05be1a 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -49,10 +49,6 @@ inputs: description: 'A JSON object with project settings' required: false default: '{}' - default_endpoint_settings: - description: 'A JSON object with the default endpoint settings' - required: false - default: '{}' outputs: dsn: @@ -139,21 +135,6 @@ runs: -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \ -d "{\"scheduling\": \"Essential\"}" fi - # XXX - # This is a workaround for the default 
endpoint settings, which currently do not allow some settings in the public API. - # https://github.com/neondatabase/cloud/issues/27108 - if [[ -n ${DEFAULT_ENDPOINT_SETTINGS} && ${DEFAULT_ENDPOINT_SETTINGS} != "{}" ]] ; then - PROJECT_DATA=$(curl -X GET \ - "https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/projects/${project_id}" \ - -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \ - -d "{\"scheduling\": \"Essential\"}" - ) - NEW_DEFAULT_ENDPOINT_SETTINGS=$(echo ${PROJECT_DATA} | jq -rc ".project.default_endpoint_settings + ${DEFAULT_ENDPOINT_SETTINGS}") - curl -X POST --fail \ - "https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/projects/${project_id}/default_endpoint_settings" \ - -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \ - --data "${NEW_DEFAULT_ENDPOINT_SETTINGS}" - fi env: @@ -171,4 +152,3 @@ runs: PSQL: ${{ inputs.psql_path }} LD_LIBRARY_PATH: ${{ inputs.libpq_lib_path }} PROJECT_SETTINGS: ${{ inputs.project_settings }} - DEFAULT_ENDPOINT_SETTINGS: ${{ inputs.default_endpoint_settings }} diff --git a/.github/workflows/cloud-extensions.yml b/.github/workflows/cloud-extensions.yml index 4114f0f9b4..25fe0877d9 100644 --- a/.github/workflows/cloud-extensions.yml +++ b/.github/workflows/cloud-extensions.yml @@ -35,7 +35,7 @@ jobs: matrix: pg-version: [16, 17] - runs-on: [ self-hosted, small ] + runs-on: us-east-2 container: # We use the neon-test-extensions image here as it contains the source code for the extensions. image: ghcr.io/neondatabase/neon-test-extensions-v${{ matrix.pg-version }}:latest @@ -71,20 +71,7 @@ jobs: region_id: ${{ inputs.region_id || 'aws-us-east-2' }} postgres_version: ${{ matrix.pg-version }} project_settings: ${{ steps.project-settings.outputs.settings }} - # We need these settings to get the expected output results. - # We cannot use the environment variables e.g. 
PGTZ due to - # https://github.com/neondatabase/neon/issues/1287 - default_endpoint_settings: > - { - "pg_settings": { - "DateStyle": "Postgres,MDY", - "TimeZone": "America/Los_Angeles", - "compute_query_id": "off", - "neon.allow_unstable_extensions": "on" - } - } api_key: ${{ secrets.NEON_STAGING_API_KEY }} - admin_api_key: ${{ secrets.NEON_STAGING_ADMIN_API_KEY }} - name: Run the regression tests run: /run-tests.sh -r /ext-src diff --git a/docker-compose/ext-src/alter_db.sh b/docker-compose/ext-src/alter_db.sh new file mode 100755 index 0000000000..6df37e1c9b --- /dev/null +++ b/docker-compose/ext-src/alter_db.sh @@ -0,0 +1,8 @@ +#!/bin/bash +# We need these settings to get the expected output results. +# We cannot use the environment variables e.g. PGTZ due to +# https://github.com/neondatabase/neon/issues/1287 +export DATABASE=${1:-contrib_regression} +psql -c "ALTER DATABASE ${DATABASE} SET neon.allow_unstable_extensions='on'" \ + -c "ALTER DATABASE ${DATABASE} SET DateStyle='Postgres,MDY'" \ + -c "ALTER DATABASE ${DATABASE} SET TimeZone='America/Los_Angeles'" \ diff --git a/docker-compose/ext-src/pg_graphql-src/regular-test.sh b/docker-compose/ext-src/pg_graphql-src/regular-test.sh index 85e1ae057a..9e7d63b612 100755 --- a/docker-compose/ext-src/pg_graphql-src/regular-test.sh +++ b/docker-compose/ext-src/pg_graphql-src/regular-test.sh @@ -18,6 +18,7 @@ TESTS=${TESTS/row_level_security/} TESTS=${TESTS/sqli_connection/} dropdb --if-exist contrib_regression createdb contrib_regression +. 
../alter_db.sh psql -v ON_ERROR_STOP=1 -f test/fixtures.sql -d contrib_regression ${REGRESS} --use-existing --dbname=contrib_regression --inputdir=${TESTDIR} ${TESTS} diff --git a/docker-compose/ext-src/pgrag-src/regular-test.sh b/docker-compose/ext-src/pgrag-src/regular-test.sh index 6cb1b049a4..22eb7498fd 100755 --- a/docker-compose/ext-src/pgrag-src/regular-test.sh +++ b/docker-compose/ext-src/pgrag-src/regular-test.sh @@ -3,6 +3,7 @@ set -ex cd "$(dirname "${0}")" dropdb --if-exist contrib_regression createdb contrib_regression +. ../alter_db.sh psql -d contrib_regression -c "CREATE EXTENSION vector" -c "CREATE EXTENSION rag" PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress ${PG_REGRESS} --inputdir=./ --bindir='/usr/local/pgsql/bin' --use-existing --load-extension=vector --load-extension=rag --dbname=contrib_regression basic_functions text_processing api_keys chunking_functions document_processing embedding_api_functions voyageai_functions diff --git a/docker-compose/ext-src/pgx_ulid-src/Makefile b/docker-compose/ext-src/pgx_ulid-src/Makefile index 6480c48441..00975e8c48 100644 --- a/docker-compose/ext-src/pgx_ulid-src/Makefile +++ b/docker-compose/ext-src/pgx_ulid-src/Makefile @@ -20,5 +20,6 @@ installcheck: regression-test regression-test: dropdb --if-exists contrib_regression createdb contrib_regression + ../alter_db.sh psql -d contrib_regression -c "CREATE EXTENSION $(EXTNAME)" $(PG_REGRESS) --inputdir=. --outputdir=. --use-existing --dbname=contrib_regression $(REGRESS) diff --git a/docker-compose/ext-src/plv8-src/regular-test.sh b/docker-compose/ext-src/plv8-src/regular-test.sh index b10cc65e8a..d5224e341c 100755 --- a/docker-compose/ext-src/plv8-src/regular-test.sh +++ b/docker-compose/ext-src/plv8-src/regular-test.sh @@ -3,6 +3,7 @@ set -ex cd "$(dirname ${0})" dropdb --if-exist contrib_regression createdb contrib_regression +. 
../alter_db.sh PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress REGRESS="$(make -n installcheck | awk '{print substr($0,index($0,"init-extension"));}')" REGRESS="${REGRESS/startup_perms/}" diff --git a/docker-compose/ext-src/rag_bge_small_en_v15-src/Makefile b/docker-compose/ext-src/rag_bge_small_en_v15-src/Makefile index ac87cc511b..de6bdd06c0 100644 --- a/docker-compose/ext-src/rag_bge_small_en_v15-src/Makefile +++ b/docker-compose/ext-src/rag_bge_small_en_v15-src/Makefile @@ -11,5 +11,6 @@ PG_REGRESS := $(dir $(PGXS))../../src/test/regress/pg_regress installcheck: dropdb --if-exists contrib_regression createdb contrib_regression + ../alter_db.sh psql -d contrib_regression -c "CREATE EXTENSION vector" -c "CREATE EXTENSION rag_bge_small_en_v15" $(PG_REGRESS) --use-existing --dbname=contrib_regression $(REGRESS) \ No newline at end of file diff --git a/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/Makefile b/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/Makefile index e81f94ef47..7adcad32f7 100644 --- a/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/Makefile +++ b/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/Makefile @@ -11,5 +11,6 @@ PG_REGRESS := $(dir $(PGXS))../../src/test/regress/pg_regress installcheck: dropdb --if-exists contrib_regression createdb contrib_regression + ../alter_db.sh psql -d contrib_regression -c "CREATE EXTENSION vector" -c "CREATE EXTENSION rag_jina_reranker_v1_tiny_en" $(PG_REGRESS) --use-existing --dbname=contrib_regression $(REGRESS) diff --git a/docker-compose/ext-src/rum-src/regular-test.sh b/docker-compose/ext-src/rum-src/regular-test.sh index d1d45a36ef..815c1adb53 100755 --- a/docker-compose/ext-src/rum-src/regular-test.sh +++ b/docker-compose/ext-src/rum-src/regular-test.sh @@ -3,5 +3,6 @@ set -ex cd "$(dirname ${0})" dropdb --if-exist contrib_regression createdb contrib_regression +. 
../alter_db.sh PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress ${PG_REGRESS} --inputdir=./ --bindir='/usr/local/pgsql/bin' --use-existing --dbname=contrib_regression rum rum_hash ruminv timestamp orderby orderby_hash altorder altorder_hash limits int2 int4 int8 float4 float8 money oid time timetz date interval macaddr inet cidr text varchar char bytea bit varbit numeric rum_weight expr array \ No newline at end of file From 32a12783fde3aeb246457ae79b18dc00f85f8896 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 14 May 2025 18:30:21 +0200 Subject: [PATCH 43/65] pageserver: batching & concurrent IO: update binary-built-in defaults; reduce CI matrix (#11923) Use the current production config for batching & concurrent IO. Remove the permutation testing for unit tests from CI. (The pageserver unit test matrix takes ~10min for debug builds). Drive-by-fix use of `if cfg!(test)` inside crate `pageserver_api`. It is ineffective for early-enabling new defaults for pageserver unit tests only. The reason is that the `test` cfg is only set for the crate under test but not its dependencies. So, `cargo test -p pageserver` will build `pageserver_api` with `cfg!(test) == false`. Resort to checking for feature flag `testing` instead, since all our unit tests are run with `--feature testing`. 
refs - `scattered-lsn` batching has been implemented and rolled out in all envs, cf https://github.com/neondatabase/neon/issues/10765 - preliminary for https://github.com/neondatabase/neon/pull/10466 - epic https://github.com/neondatabase/neon/issues/9377 - epic https://github.com/neondatabase/neon/issues/9378 - drive-by fix https://neondb.slack.com/archives/C0277TKAJCA/p1746821515504219 --- .github/workflows/_build-and-test-locally.yml | 22 +++++++------------ .github/workflows/build_and_test.yml | 2 -- libs/pageserver_api/src/config.rs | 20 +++++------------ libs/pageserver_api/src/models.rs | 11 +--------- libs/utils/src/tracing_span_assert.rs | 4 ++-- 5 files changed, 17 insertions(+), 42 deletions(-) diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 7cede309f3..663afa2c8b 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -279,18 +279,14 @@ jobs: # run all non-pageserver tests ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E '!package(pageserver)' - # run pageserver tests with different settings - for get_vectored_concurrent_io in sequential sidecar-task; do - for io_engine in std-fs tokio-epoll-uring ; do - for io_mode in buffered direct direct-rw ; do - NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO=$get_vectored_concurrent_io \ - NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine \ - NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IO_MODE=$io_mode \ - ${cov_prefix} \ - cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)' - done - done - done + # run pageserver tests + # (When developing new pageserver features gated by config fields, we commonly make the rust + # unit tests sensitive to an environment variable NEON_PAGESERVER_UNIT_TEST_FEATURENAME. + # Then run the nextest invocation below for all relevant combinations. 
Singling out the + # pageserver tests from non-pageserver tests cuts down the time it takes for this CI step.) + NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=tokio-epoll-uring \ + ${cov_prefix} \ + cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)' # Run separate tests for real S3 export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty @@ -405,8 +401,6 @@ jobs: CHECK_ONDISK_DATA_COMPATIBILITY: nonempty BUILD_TAG: ${{ inputs.build-tag }} PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring - PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task - PAGESERVER_VIRTUAL_FILE_IO_MODE: direct-rw USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }} # Temporary disable this step until we figure out why it's so flaky diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e0995218f9..6b19f6ef01 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -323,8 +323,6 @@ jobs: PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}" PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring - PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task - PAGESERVER_VIRTUAL_FILE_IO_MODE: direct-rw SYNC_BETWEEN_TESTS: true # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 5b0c13dd89..7e0bb7dc57 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -639,23 +639,15 @@ impl Default for ConfigToml { tenant_config: TenantConfigToml::default(), no_sync: None, wal_receiver_protocol: DEFAULT_WAL_RECEIVER_PROTOCOL, - page_service_pipelining: if !cfg!(test) { - PageServicePipeliningConfig::Serial - } else { - // Do not turn this into the default until scattered reads have been - // validated and rolled-out 
fully. - PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined { + page_service_pipelining: PageServicePipeliningConfig::Pipelined( + PageServicePipeliningConfigPipelined { max_batch_size: NonZeroUsize::new(32).unwrap(), execution: PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures, batching: PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn, - }) - }, - get_vectored_concurrent_io: if !cfg!(test) { - GetVectoredConcurrentIo::Sequential - } else { - GetVectoredConcurrentIo::SidecarTask - }, - enable_read_path_debugging: if cfg!(test) || cfg!(feature = "testing") { + }, + ), + get_vectored_concurrent_io: GetVectoredConcurrentIo::SidecarTask, + enable_read_path_debugging: if cfg!(feature = "testing") { Some(true) } else { None diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 5fcdefba66..89d531d671 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1803,7 +1803,6 @@ pub struct TopTenantShardsResponse { } pub mod virtual_file { - use std::sync::LazyLock; #[derive( Copy, @@ -1851,15 +1850,7 @@ pub mod virtual_file { impl IoMode { pub fn preferred() -> Self { - // The default behavior when running Rust unit tests without any further - // flags is to use the newest behavior (DirectRw). - // The CI uses the environment variable to unit tests for all different modes. - // NB: the Python regression & perf tests have their own defaults management - // that writes pageserver.toml; they do not use this variable. 
- static ENV_OVERRIDE: LazyLock> = LazyLock::new(|| { - utils::env::var_serde_json_string("NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IO_MODE") - }); - ENV_OVERRIDE.unwrap_or(IoMode::DirectRw) + IoMode::DirectRw } } diff --git a/libs/utils/src/tracing_span_assert.rs b/libs/utils/src/tracing_span_assert.rs index 3d15e08400..857d98b644 100644 --- a/libs/utils/src/tracing_span_assert.rs +++ b/libs/utils/src/tracing_span_assert.rs @@ -127,12 +127,12 @@ macro_rules! __check_fields_present { match check_fields_present0($extractors) { Ok(FoundEverything) => Ok(()), - Ok(Unconfigured) if cfg!(test) => { + Ok(Unconfigured) if cfg!(feature = "testing") => { // allow unconfigured in tests Ok(()) }, Ok(Unconfigured) => { - panic!("utils::tracing_span_assert: outside of #[cfg(test)] expected tracing to be configured with tracing_error::ErrorLayer") + panic!(r#"utils::tracing_span_assert: outside of #[cfg(feature = "testing")] expected tracing to be configured with tracing_error::ErrorLayer"#) }, Err(missing) => Err(missing) } From 48b870bc078bd2c450eb7b468e743b9c118549bf Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 15 May 2025 07:45:22 +0300 Subject: [PATCH 44/65] Use unlogged build in GIST for storing root page (#11892) ## Problem See https://github.com/neondatabase/neon/issues/11891 Newly added assert is fired when the root page of GIST index is written to the disk as part of sorted build. ## Summary of changes Wrap writing of root page in unlogged build.
https://github.com/neondatabase/postgres/pull/632 https://github.com/neondatabase/postgres/pull/633 https://github.com/neondatabase/postgres/pull/634 --------- Co-authored-by: Konstantin Knizhnik --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index ead1e76bdc..4cca6f8083 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit ead1e76bdcb71ef87f52f0610bd7333247f75179 +Subproject commit 4cca6f8083483dda9e12eae292cf788d45bd561f diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 052df87d33..daa81cffcf 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 052df87d338dc30687d0c96f1a4d9b6cb4882b2e +Subproject commit daa81cffcf063c54b29a9aabdb6604625f675ad0 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index bb5eee65ac..15710a76b7 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit bb5eee65ac753b5a66d255ec5fb4c0e33180e8fd +Subproject commit 15710a76b7d07912110fcbbaf0c8ad6d7e5a9fbc diff --git a/vendor/revisions.json b/vendor/revisions.json index cf9f474e1a..0fc2d3996d 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -5,14 +5,14 @@ ], "v16": [ "16.9", - "bb5eee65ac753b5a66d255ec5fb4c0e33180e8fd" + "15710a76b7d07912110fcbbaf0c8ad6d7e5a9fbc" ], "v15": [ "15.13", - "052df87d338dc30687d0c96f1a4d9b6cb4882b2e" + "daa81cffcf063c54b29a9aabdb6604625f675ad0" ], "v14": [ "14.18", - "ead1e76bdcb71ef87f52f0610bd7333247f75179" + "4cca6f8083483dda9e12eae292cf788d45bd561f" ] } From 9e5a41a3423782b1ab5f097e04583f38b78d9ba9 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." 
<4198311+skyzh@users.noreply.github.com> Date: Thu, 15 May 2025 15:02:16 +0800 Subject: [PATCH 45/65] fix(scrubber): `remote_storage` error causes layers to be deleted as orphans (#11924) ## Problem close https://github.com/neondatabase/neon/issues/11159 ; we get occasional wrong deletions of layer files being used and errors in staging. This patch fixed it. Example errors: ``` Timeline metadata errors: ["index_part.json contains a layer .... (shard 0000) that is not present in remote storage (layer_is_l0: false) with error: Failed to download a remote file: s3 head object\n\nCaused by:\n 0: dispatch failure\n 1: timeout\n 2: error trying to connect: HTTP connect timeout occurred after 3.1s\n ``` This error should not be fired because the file could exist, but we cannot know if it exists due to head request failure. ## Summary of changes Only generate cannot find layer errors when the head_object return type is `NotFound`. Signed-off-by: Alex Chi Z --- storage_scrubber/src/checks.rs | 43 +++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index 40f3523a7e..865f0908f9 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -13,7 +13,7 @@ use pageserver::tenant::remote_timeline_client::{ }; use pageserver::tenant::storage_layer::LayerName; use pageserver_api::shard::ShardIndex; -use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath}; +use remote_storage::{DownloadError, GenericRemoteStorage, ListingObject, RemotePath}; use tokio_util::sync::CancellationToken; use tracing::{info, warn}; use utils::generation::Generation; @@ -165,23 +165,34 @@ pub(crate) async fn branch_cleanup_and_check_errors( .head_object(&path, &CancellationToken::new()) .await; - if let Err(e) = response { - // Object is not present. 
- let is_l0 = LayerMap::is_l0(layer.key_range(), layer.is_delta()); + match response { + Ok(_) => {} + Err(DownloadError::NotFound) => { + // Object is not present. + let is_l0 = + LayerMap::is_l0(layer.key_range(), layer.is_delta()); - let msg = format!( - "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {}) with error: {}", - layer, - metadata.generation.get_suffix(), - metadata.shard, - is_l0, - e, - ); + let msg = format!( + "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {})", + layer, + metadata.generation.get_suffix(), + metadata.shard, + is_l0, + ); - if is_l0 || ignore_error { - result.warnings.push(msg); - } else { - result.errors.push(msg); + if is_l0 || ignore_error { + result.warnings.push(msg); + } else { + result.errors.push(msg); + } + } + Err(e) => { + tracing::warn!( + "cannot check if the layer {}{} is present in remote storage (error: {})", + layer, + metadata.generation.get_suffix(), + e, + ); } } } From 42e4cf18c97dad427f882c04a70bd33a54503e26 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 15 May 2025 10:53:59 +0100 Subject: [PATCH 46/65] CI(neon_extra_builds): fix workflow syntax (#11932) ## Problem ``` Error when evaluating 'strategy' for job 'build-pgxn'. 
neondatabase/neon/.github/workflows/build-macos.yml@7907a9e2bf898f3d22b98d9d4d2c6ffc4d480fc3 (Line: 45, Col: 27): Matrix vector 'postgres-version' does not contain any values ``` See https://github.com/neondatabase/neon/actions/runs/15039594216/job/42268015127?pr=11929 ## Summary of changes - Fix typo: `.chnages` -> `.changes` - Ensure JSON is JSON by moving step output to env variable --- .github/workflows/neon_extra_builds.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 9c504eb5bf..3427a0eb49 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -63,8 +63,10 @@ jobs: - name: Filter out only v-string for build matrix id: postgres_changes + env: + CHANGES: ${{ steps.files_changed.outputs.changes }} run: | - v_strings_only_as_json_array=$(echo ${{ steps.files_changed.outputs.chnages }} | jq '.[]|select(test("v\\d+"))' | jq --slurp -c) + v_strings_only_as_json_array=$(echo ${CHANGES} | jq '.[]|select(test("v\\d+"))' | jq --slurp -c) echo "changes=${v_strings_only_as_json_array}" | tee -a "${GITHUB_OUTPUT}" check-macos-build: From a703cd342b1f7f8faf5920cec8ef09902f94eaa8 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 15 May 2025 11:02:11 +0100 Subject: [PATCH 47/65] storage_controller: enforce generations in import upcalls (#11900) ## Problem Import up-calls did not enforce the usage of the latest generation. The import might have finished in one previous generation, but not in the latest one. Hence, the controller might try to activate a timeline before it is ready. In theory, that would be fine, but it's tricky to reason about. ## Summary of Changes Pageserver provides the current generation in the upcall to the storage controller and the latter validates the generation. If the generation is stale, we return an error which stops progress of the import job.
Note that the import job will retry the upcall until the stale location is detached. I'll add some proper tests for this as part of the [checkpointing PR](https://github.com/neondatabase/neon/pull/11862). Closes https://github.com/neondatabase/neon/issues/11884 --- libs/pageserver_api/src/upcall_api.rs | 9 ++ pageserver/src/controller_upcall_client.rs | 22 +++- pageserver/src/deletion_queue.rs | 2 + .../src/tenant/timeline/import_pgdata.rs | 7 +- storage_controller/src/http.rs | 12 +- storage_controller/src/service.rs | 107 ++++++++++++++++-- 6 files changed, 142 insertions(+), 17 deletions(-) diff --git a/libs/pageserver_api/src/upcall_api.rs b/libs/pageserver_api/src/upcall_api.rs index 7ee63f9036..4dce5f7817 100644 --- a/libs/pageserver_api/src/upcall_api.rs +++ b/libs/pageserver_api/src/upcall_api.rs @@ -4,6 +4,7 @@ //! See docs/rfcs/025-generation-numbers.md use serde::{Deserialize, Serialize}; +use utils::generation::Generation; use utils::id::{NodeId, TimelineId}; use crate::controller_api::NodeRegisterRequest; @@ -63,9 +64,17 @@ pub struct ValidateResponseTenant { pub valid: bool, } +#[derive(Serialize, Deserialize)] +pub struct TimelineImportStatusRequest { + pub tenant_shard_id: TenantShardId, + pub timeline_id: TimelineId, + pub generation: Generation, +} + #[derive(Serialize, Deserialize)] pub struct PutTimelineImportStatusRequest { pub tenant_shard_id: TenantShardId, pub timeline_id: TimelineId, pub status: ShardImportStatus, + pub generation: Generation, } diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs index 6d186b091a..779ef3e37d 100644 --- a/pageserver/src/controller_upcall_client.rs +++ b/pageserver/src/controller_upcall_client.rs @@ -7,7 +7,7 @@ use pageserver_api::models::ShardImportStatus; use pageserver_api::shard::TenantShardId; use pageserver_api::upcall_api::{ PutTimelineImportStatusRequest, ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, - ValidateRequest, 
ValidateRequestTenant, ValidateResponse, + TimelineImportStatusRequest, ValidateRequest, ValidateRequestTenant, ValidateResponse, }; use reqwest::Certificate; use serde::Serialize; @@ -51,12 +51,14 @@ pub trait StorageControllerUpcallApi { &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, + generation: Generation, status: ShardImportStatus, ) -> impl Future> + Send; fn get_timeline_import_status( &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, + generation: Generation, ) -> impl Future, RetryForeverError>> + Send; } @@ -292,6 +294,7 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient { &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, + generation: Generation, status: ShardImportStatus, ) -> Result<(), RetryForeverError> { let url = self @@ -302,6 +305,7 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient { let request = PutTimelineImportStatusRequest { tenant_shard_id, timeline_id, + generation, status, }; @@ -313,15 +317,27 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient { &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, + generation: Generation, ) -> Result, RetryForeverError> { let url = self .base_url - .join(format!("timeline_import_status/{}/{}", tenant_shard_id, timeline_id).as_str()) + .join("timeline_import_status") .expect("Failed to build path"); + let request = TimelineImportStatusRequest { + tenant_shard_id, + timeline_id, + generation, + }; + Ok(backoff::retry( || async { - let response = self.http_client.get(url.clone()).send().await?; + let response = self + .http_client + .get(url.clone()) + .json(&request) + .send() + .await?; if let Err(err) = response.error_for_status_ref() { if matches!(err.status(), Some(reqwest::StatusCode::NOT_FOUND)) { diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 65b2de28cd..0bbad87c09 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs 
@@ -793,6 +793,7 @@ mod test { &self, _tenant_shard_id: TenantShardId, _timeline_id: TimelineId, + _generation: Generation, _status: pageserver_api::models::ShardImportStatus, ) -> Result<(), RetryForeverError> { unimplemented!() @@ -802,6 +803,7 @@ mod test { &self, _tenant_shard_id: TenantShardId, _timeline_id: TimelineId, + _generation: Generation, ) -> Result, RetryForeverError> { unimplemented!() } diff --git a/pageserver/src/tenant/timeline/import_pgdata.rs b/pageserver/src/tenant/timeline/import_pgdata.rs index 53e15e5395..5fac9e0ce7 100644 --- a/pageserver/src/tenant/timeline/import_pgdata.rs +++ b/pageserver/src/tenant/timeline/import_pgdata.rs @@ -48,7 +48,11 @@ pub async fn doit( let storcon_client = StorageControllerUpcallClient::new(timeline.conf, &cancel); let shard_status = storcon_client - .get_timeline_import_status(timeline.tenant_shard_id, timeline.timeline_id) + .get_timeline_import_status( + timeline.tenant_shard_id, + timeline.timeline_id, + timeline.generation, + ) .await .map_err(|_err| anyhow::anyhow!("Shut down while getting timeline import status"))?; @@ -175,6 +179,7 @@ pub async fn doit( .put_timeline_import_status( timeline.tenant_shard_id, timeline.timeline_id, + timeline.generation, // TODO(vlad): What about import errors? 
ShardImportStatus::Done, ) diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 8d459cab9c..02c02c0e7f 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -31,7 +31,7 @@ use pageserver_api::models::{ }; use pageserver_api::shard::TenantShardId; use pageserver_api::upcall_api::{ - PutTimelineImportStatusRequest, ReAttachRequest, ValidateRequest, + PutTimelineImportStatusRequest, ReAttachRequest, TimelineImportStatusRequest, ValidateRequest, }; use pageserver_client::{BlockUnblock, mgmt_api}; use routerify::Middleware; @@ -160,22 +160,22 @@ async fn handle_validate(req: Request) -> Result, ApiError> async fn handle_get_timeline_import_status(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::GenerationsApi)?; - let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; - let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; - - let req = match maybe_forward(req).await { + let mut req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { return res; } ForwardOutcome::NotForwarded(req) => req, }; + let get_req = json_request::(&mut req).await?; + let state = get_state(&req); + json_response( StatusCode::OK, state .service - .handle_timeline_shard_import_progress(tenant_shard_id, timeline_id) + .handle_timeline_shard_import_progress(get_req) .await?, ) } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 05430733c2..852005639a 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -47,7 +47,7 @@ use pageserver_api::shard::{ }; use pageserver_api::upcall_api::{ PutTimelineImportStatusRequest, ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, - ValidateRequest, ValidateResponse, ValidateResponseTenant, + TimelineImportStatusRequest, ValidateRequest, ValidateResponse, ValidateResponseTenant, }; use pageserver_client::{BlockUnblock, mgmt_api}; use 
reqwest::{Certificate, StatusCode}; @@ -194,6 +194,14 @@ pub(crate) enum LeadershipStatus { Candidate, } +enum ShardGenerationValidity { + Valid, + Mismatched { + claimed: Generation, + actual: Option, + }, +} + pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128; pub const PRIORITY_RECONCILER_CONCURRENCY_DEFAULT: usize = 256; pub const SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT: usize = 32; @@ -3909,19 +3917,36 @@ impl Service { pub(crate) async fn handle_timeline_shard_import_progress( self: &Arc, - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, + req: TimelineImportStatusRequest, ) -> Result { + let validity = self + .validate_shard_generation(req.tenant_shard_id, req.generation) + .await?; + match validity { + ShardGenerationValidity::Valid => { + // fallthrough + } + ShardGenerationValidity::Mismatched { claimed, actual } => { + tracing::info!( + claimed=?claimed.into(), + actual=?actual.and_then(|g| g.into()), + "Rejecting import progress fetch from stale generation" + ); + + return Err(ApiError::BadRequest(anyhow::anyhow!("Invalid generation"))); + } + } + let maybe_import = self .persistence - .get_timeline_import(tenant_shard_id.tenant_id, timeline_id) + .get_timeline_import(req.tenant_shard_id.tenant_id, req.timeline_id) .await?; let import = maybe_import.ok_or_else(|| { ApiError::NotFound( format!( "import for {}/{} not found", - tenant_shard_id.tenant_id, timeline_id + req.tenant_shard_id.tenant_id, req.timeline_id ) .into(), ) @@ -3930,11 +3955,11 @@ impl Service { import .shard_statuses .0 - .get(&tenant_shard_id.to_index()) + .get(&req.tenant_shard_id.to_index()) .cloned() .ok_or_else(|| { ApiError::NotFound( - format!("shard {} not found", tenant_shard_id.shard_slug()).into(), + format!("shard {} not found", req.tenant_shard_id.shard_slug()).into(), ) }) } @@ -3943,6 +3968,24 @@ impl Service { self: &Arc, req: PutTimelineImportStatusRequest, ) -> Result<(), ApiError> { + let validity = self + 
.validate_shard_generation(req.tenant_shard_id, req.generation) + .await?; + match validity { + ShardGenerationValidity::Valid => { + // fallthrough + } + ShardGenerationValidity::Mismatched { claimed, actual } => { + tracing::info!( + claimed=?claimed.into(), + actual=?actual.and_then(|g| g.into()), + "Rejecting import progress update from stale generation" + ); + + return Err(ApiError::PreconditionFailed("Invalid generation".into())); + } + } + let res = self .persistence .update_timeline_import(req.tenant_shard_id, req.timeline_id, req.status) @@ -3977,6 +4020,56 @@ impl Service { Ok(()) } + /// Check that a provided generation for some tenant shard is the most recent one. + /// + /// Validate with the in-mem state first, and, if that passes, validate with the + /// database state which is authoritative. + async fn validate_shard_generation( + self: &Arc, + tenant_shard_id: TenantShardId, + generation: Generation, + ) -> Result { + { + let locked = self.inner.read().unwrap(); + let tenant_shard = + locked + .tenants + .get(&tenant_shard_id) + .ok_or(ApiError::InternalServerError(anyhow::anyhow!( + "{} shard not found", + tenant_shard_id + )))?; + + if tenant_shard.generation != Some(generation) { + return Ok(ShardGenerationValidity::Mismatched { + claimed: generation, + actual: tenant_shard.generation, + }); + } + } + + let mut db_generations = self + .persistence + .shard_generations(std::iter::once(&tenant_shard_id)) + .await?; + let (_tid, db_generation) = + db_generations + .pop() + .ok_or(ApiError::InternalServerError(anyhow::anyhow!( + "{} shard not found", + tenant_shard_id + )))?; + + if db_generation != Some(generation) { + return Ok(ShardGenerationValidity::Mismatched { + claimed: generation, + actual: db_generation, + }); + } + + Ok(ShardGenerationValidity::Valid) + } + /// Finalize the import of a timeline /// /// This method should be called once all shards have reported that the import is complete. 
From 2621ce2daf2a49408f54a687e9e691b87f3477d0 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 15 May 2025 14:18:22 +0100 Subject: [PATCH 48/65] pageserver: checkpoint import progress in the storage controller (#11862) ## Problem Timeline imports do not have progress checkpointing. Any time that the tenant is shut down, all progress is lost and the import restarts from the beginning when the tenant is re-attached. ## Summary of changes This PR adds progress checkpointing. ### Preliminaries The **unit of work** is a `ChunkProcessingJob`. Each `ChunkProcessingJob` deals with the import for a set of key ranges. The job split is done by using an estimation of how many pages each job will produce. The planning stage must be **pure**: given a fixed set of contents in the import bucket, it will always yield the same plan. This property is enforced by checking that the hash of the plan is identical when resuming from a checkpoint. The storage controller tracks the progress of each shard in the import in the database in the form of the **latest job** that has completed. ### Flow This is the high level flow for the happy path: 1. On the first run of the import task, the import task queries storcon for the progress and sees that none is recorded. 2. Execute the preparatory stage of the import 3. Import jobs start running concurrently in a `FuturesOrdered`. Every time the checkpointing threshold of jobs has been reached, notify the storage controller. 4. Tenant is detached and re-attached 5. Import task starts up again and gets the latest progress checkpoint from the storage controller in the form of a job index. 6. The plan is computed again and we check that the hash matches with the original plan. 7. Jobs are spawned from where the previous import task left off. Note that we will not report progress after the completion of each job, so some jobs might run twice.
Closes https://github.com/neondatabase/neon/issues/11568 Closes https://github.com/neondatabase/neon/issues/11664 --- Cargo.lock | 1 + libs/pageserver_api/src/config.rs | 2 + libs/pageserver_api/src/models.rs | 15 +- pageserver/Cargo.toml | 1 + .../src/tenant/timeline/import_pgdata.rs | 270 +++++++++++------- .../src/tenant/timeline/import_pgdata/flow.rs | 188 ++++++++++-- storage_controller/src/service.rs | 2 +- storage_controller/src/timeline_import.rs | 9 +- test_runner/fixtures/neon_fixtures.py | 6 + 9 files changed, 357 insertions(+), 137 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6df5d4a71e..f075b45e49 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4331,6 +4331,7 @@ dependencies = [ "toml_edit", "tracing", "tracing-utils", + "twox-hash", "url", "utils", "uuid", diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 7e0bb7dc57..f2ba50a86f 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -305,6 +305,7 @@ impl From for tracing_utils::Protocol { pub struct TimelineImportConfig { pub import_job_concurrency: NonZeroUsize, pub import_job_soft_size_limit: NonZeroUsize, + pub import_job_checkpoint_threshold: NonZeroUsize, } pub mod statvfs { @@ -661,6 +662,7 @@ impl Default for ConfigToml { timeline_import_config: TimelineImportConfig { import_job_concurrency: NonZeroUsize::new(128).unwrap(), import_job_soft_size_limit: NonZeroUsize::new(1024 * 1024 * 1024).unwrap(), + import_job_checkpoint_threshold: NonZeroUsize::new(128).unwrap(), }, } } diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 89d531d671..58b8d80c0a 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -336,14 +336,25 @@ impl TimelineCreateRequest { #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] pub enum ShardImportStatus { - InProgress, + InProgress(Option), Done, Error(String), } + +#[derive(Serialize, Deserialize, Debug, Clone, 
PartialEq, Eq)] +pub struct ShardImportProgress { + /// Total number of jobs in the import plan + pub jobs: usize, + /// Number of jobs completed + pub completed: usize, + /// Hash of the plan + pub import_plan_hash: u64, +} + impl ShardImportStatus { pub fn is_terminal(&self) -> bool { match self { - ShardImportStatus::InProgress => false, + ShardImportStatus::InProgress(_) => false, ShardImportStatus::Done | ShardImportStatus::Error(_) => true, } } diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 8abd504922..b7b3e0eaf1 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -96,6 +96,7 @@ strum.workspace = true strum_macros.workspace = true wal_decoder.workspace = true smallvec.workspace = true +twox-hash.workspace = true [target.'cfg(target_os = "linux")'.dependencies] procfs.workspace = true diff --git a/pageserver/src/tenant/timeline/import_pgdata.rs b/pageserver/src/tenant/timeline/import_pgdata.rs index 5fac9e0ce7..602b20df97 100644 --- a/pageserver/src/tenant/timeline/import_pgdata.rs +++ b/pageserver/src/tenant/timeline/import_pgdata.rs @@ -1,6 +1,7 @@ use std::sync::Arc; use anyhow::{Context, bail}; +use importbucket_client::{ControlFile, RemoteStorageWrapper}; use pageserver_api::models::ShardImportStatus; use remote_storage::RemotePath; use tokio::task::JoinHandle; @@ -57,115 +58,40 @@ pub async fn doit( .map_err(|_err| anyhow::anyhow!("Shut down while getting timeline import status"))?; info!(?shard_status, "peeking shard status"); - match shard_status { - None | Some(ShardImportStatus::InProgress) => { - // TODO: checkpoint the progress into the IndexPart instead of restarting - // from the beginning. - - // - // Wipe the slate clean - the flow does not allow resuming. - // We can implement resuming in the future by checkpointing the progress into the IndexPart. - // - info!("wipe the slate clean"); - { - // TODO: do we need to hold GC lock for this? 
- let mut guard = timeline.layers.write().await; - assert!( - guard.layer_map()?.open_layer.is_none(), - "while importing, there should be no in-memory layer" // this just seems like a good place to assert it - ); - let all_layers_keys = guard.all_persistent_layers(); - let all_layers: Vec<_> = all_layers_keys - .iter() - .map(|key| guard.get_from_key(key)) - .collect(); - let open = guard.open_mut().context("open_mut")?; - - timeline.remote_client.schedule_gc_update(&all_layers)?; - open.finish_gc_timeline(&all_layers); - } - - // - // Wait for pgdata to finish uploading - // - info!("wait for pgdata to reach status 'done'"); + match shard_status.unwrap_or(ShardImportStatus::InProgress(None)) { + ShardImportStatus::InProgress(maybe_progress) => { let storage = importbucket_client::new(timeline.conf, &location, cancel.clone()).await?; - let status_prefix = RemotePath::from_string("status").unwrap(); - let pgdata_status_key = status_prefix.join("pgdata"); - loop { - let res = async { - let pgdata_status: Option = storage - .get_json(&pgdata_status_key) - .await - .context("get pgdata status")?; - info!(?pgdata_status, "peeking pgdata status"); - if pgdata_status.map(|st| st.done).unwrap_or(false) { - Ok(()) - } else { - Err(anyhow::anyhow!("pgdata not done yet")) - } - } - .await; - match res { - Ok(_) => break, - Err(err) => { - info!(?err, "indefinitely waiting for pgdata to finish"); - if tokio::time::timeout( - std::time::Duration::from_secs(10), - cancel.cancelled(), - ) - .await - .is_ok() - { - bail!("cancelled while waiting for pgdata"); - } - } - } - } - // - // Do the import - // - info!("do the import"); - let control_file = storage.get_control_file().await?; - let base_lsn = control_file.base_lsn(); + let control_file_res = if maybe_progress.is_none() { + // Only prepare the import once when there's no progress. 
+ prepare_import(timeline, storage.clone(), &cancel).await + } else { + storage.get_control_file().await + }; - info!("update TimelineMetadata based on LSNs from control file"); - { - let pg_version = control_file.pg_version(); - let _ctx: &RequestContext = ctx; - async move { - // FIXME: The 'disk_consistent_lsn' should be the LSN at the *end* of the - // checkpoint record, and prev_record_lsn should point to its beginning. - // We should read the real end of the record from the WAL, but here we - // just fake it. - let disk_consistent_lsn = Lsn(base_lsn.0 + 8); - let prev_record_lsn = base_lsn; - let metadata = TimelineMetadata::new( - disk_consistent_lsn, - Some(prev_record_lsn), - None, // no ancestor - Lsn(0), // no ancestor lsn - base_lsn, // latest_gc_cutoff_lsn - base_lsn, // initdb_lsn - pg_version, + let control_file = match control_file_res { + Ok(cf) => cf, + Err(err) => { + return Err( + terminate_flow_with_error(timeline, err, &storcon_client, &cancel).await, ); - - let _start_lsn = disk_consistent_lsn + 1; - - timeline - .remote_client - .schedule_index_upload_for_full_metadata_update(&metadata)?; - - timeline.remote_client.wait_completion().await?; - - anyhow::Ok(()) } - } - .await?; + }; - flow::run(timeline.clone(), control_file, storage.clone(), ctx).await?; + let res = flow::run( + timeline.clone(), + control_file, + storage.clone(), + maybe_progress, + ctx, + ) + .await; + if let Err(err) = res { + return Err( + terminate_flow_with_error(timeline, err, &storcon_client, &cancel).await, + ); + } // Communicate that shard is done. // Ensure at-least-once delivery of the upcall to storage controller @@ -180,7 +106,6 @@ pub async fn doit( timeline.tenant_shard_id, timeline.timeline_id, timeline.generation, - // TODO(vlad): What about import errors? 
ShardImportStatus::Done, ) .await @@ -188,16 +113,151 @@ pub async fn doit( anyhow::anyhow!("Shut down while putting timeline import status") })?; } - Some(ShardImportStatus::Error(err)) => { + ShardImportStatus::Error(err) => { info!( "shard status indicates that the shard is done (error), skipping import {}", err ); } - Some(ShardImportStatus::Done) => { + ShardImportStatus::Done => { info!("shard status indicates that the shard is done (success), skipping import"); } } Ok(()) } + +async fn prepare_import( + timeline: &Arc, + storage: RemoteStorageWrapper, + cancel: &CancellationToken, +) -> anyhow::Result { + // Wipe the slate clean before starting the import as a precaution. + // This method is only called when there's no recorded checkpoint for the import + // in the storage controller. + // + // Note that this is split-brain safe (two imports for same timeline shards running in + // different generations) because we go through the usual deletion path, including deletion queue. + info!("wipe the slate clean"); + { + // TODO: do we need to hold GC lock for this? 
+ let mut guard = timeline.layers.write().await; + assert!( + guard.layer_map()?.open_layer.is_none(), + "while importing, there should be no in-memory layer" // this just seems like a good place to assert it + ); + let all_layers_keys = guard.all_persistent_layers(); + let all_layers: Vec<_> = all_layers_keys + .iter() + .map(|key| guard.get_from_key(key)) + .collect(); + let open = guard.open_mut().context("open_mut")?; + + timeline.remote_client.schedule_gc_update(&all_layers)?; + open.finish_gc_timeline(&all_layers); + } + + // + // Wait for pgdata to finish uploading + // + info!("wait for pgdata to reach status 'done'"); + let status_prefix = RemotePath::from_string("status").unwrap(); + let pgdata_status_key = status_prefix.join("pgdata"); + loop { + let res = async { + let pgdata_status: Option = storage + .get_json(&pgdata_status_key) + .await + .context("get pgdata status")?; + info!(?pgdata_status, "peeking pgdata status"); + if pgdata_status.map(|st| st.done).unwrap_or(false) { + Ok(()) + } else { + Err(anyhow::anyhow!("pgdata not done yet")) + } + } + .await; + match res { + Ok(_) => break, + Err(err) => { + info!(?err, "indefinitely waiting for pgdata to finish"); + if tokio::time::timeout(std::time::Duration::from_secs(10), cancel.cancelled()) + .await + .is_ok() + { + bail!("cancelled while waiting for pgdata"); + } + } + } + } + + let control_file = storage.get_control_file().await?; + let base_lsn = control_file.base_lsn(); + + info!("update TimelineMetadata based on LSNs from control file"); + { + let pg_version = control_file.pg_version(); + async move { + // FIXME: The 'disk_consistent_lsn' should be the LSN at the *end* of the + // checkpoint record, and prev_record_lsn should point to its beginning. + // We should read the real end of the record from the WAL, but here we + // just fake it. 
+ let disk_consistent_lsn = Lsn(base_lsn.0 + 8); + let prev_record_lsn = base_lsn; + let metadata = TimelineMetadata::new( + disk_consistent_lsn, + Some(prev_record_lsn), + None, // no ancestor + Lsn(0), // no ancestor lsn + base_lsn, // latest_gc_cutoff_lsn + base_lsn, // initdb_lsn + pg_version, + ); + + let _start_lsn = disk_consistent_lsn + 1; + + timeline + .remote_client + .schedule_index_upload_for_full_metadata_update(&metadata)?; + + timeline.remote_client.wait_completion().await?; + + anyhow::Ok(()) + } + } + .await?; + + Ok(control_file) +} + +async fn terminate_flow_with_error( + timeline: &Arc, + error: anyhow::Error, + storcon_client: &StorageControllerUpcallClient, + cancel: &CancellationToken, +) -> anyhow::Error { + // The import task is a aborted on tenant shutdown, so in principle, it should + // never be cancelled. To be on the safe side, check the cancellation tokens + // before marking the import as failed. + if !(cancel.is_cancelled() || timeline.cancel.is_cancelled()) { + let notify_res = storcon_client + .put_timeline_import_status( + timeline.tenant_shard_id, + timeline.timeline_id, + timeline.generation, + ShardImportStatus::Error(format!("{error:#}")), + ) + .await; + + if let Err(_notify_error) = notify_res { + // The [`StorageControllerUpcallClient::put_timeline_import_status`] retries + // forever internally, so errors returned by it can only be due to cancellation. + info!("failed to notify storcon about permanent import error"); + } + + // Will be logged by [`Tenant::create_timeline_import_pgdata_task`] + error + } else { + anyhow::anyhow!("Import task cancelled") + } +} diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs index 5b9c8ec5b5..c8c3bdcdfb 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs @@ -29,10 +29,11 @@ //! 
- version-specific CheckPointData (=> pgv abstraction, already exists for regular walingest) use std::collections::HashSet; +use std::hash::{Hash, Hasher}; use std::ops::Range; use std::sync::Arc; -use anyhow::{bail, ensure}; +use anyhow::ensure; use bytes::Bytes; use futures::stream::FuturesOrdered; use itertools::Itertools; @@ -43,6 +44,7 @@ use pageserver_api::key::{ slru_segment_size_to_key, }; use pageserver_api::keyspace::{contiguous_range_len, is_contiguous_range, singleton_range}; +use pageserver_api::models::{ShardImportProgress, ShardImportStatus}; use pageserver_api::reltag::{RelTag, SlruKind}; use pageserver_api::shard::ShardIdentity; use postgres_ffi::relfile_utils::parse_relfilename; @@ -59,16 +61,18 @@ use super::Timeline; use super::importbucket_client::{ControlFile, RemoteStorageWrapper}; use crate::assert_u64_eq_usize::UsizeIsU64; use crate::context::{DownloadBehavior, RequestContext}; +use crate::controller_upcall_client::{StorageControllerUpcallApi, StorageControllerUpcallClient}; use crate::pgdatadir_mapping::{ DbDirectory, RelDirectory, SlruSegmentDirectory, TwoPhaseDirectory, }; use crate::task_mgr::TaskKind; -use crate::tenant::storage_layer::{ImageLayerWriter, Layer}; +use crate::tenant::storage_layer::{AsLayerDesc, ImageLayerWriter, Layer}; pub async fn run( timeline: Arc, control_file: ControlFile, storage: RemoteStorageWrapper, + import_progress: Option, ctx: &RequestContext, ) -> anyhow::Result<()> { let planner = Planner { @@ -81,9 +85,31 @@ pub async fn run( let import_config = &timeline.conf.timeline_import_config; let plan = planner.plan(import_config).await?; + // Hash the plan and compare with the hash of the plan we got back from the storage controller. + // If the two match, it means that the planning stage had the same output. + // + // This is not intended to be a cryptographically secure hash. 
+ const SEED: u64 = 42; + let mut hasher = twox_hash::XxHash64::with_seed(SEED); + plan.hash(&mut hasher); + let plan_hash = hasher.finish(); + + if let Some(progress) = &import_progress { + if plan_hash != progress.import_plan_hash { + anyhow::bail!("Import plan does not match storcon metadata"); + } + + // Handle collisions on jobs of unequal length + if progress.jobs != plan.jobs.len() { + anyhow::bail!("Import plan job length does not match storcon metadata") + } + } + pausable_failpoint!("import-timeline-pre-execute-pausable"); - plan.execute(timeline, import_config, ctx).await + let start_from_job_idx = import_progress.map(|progress| progress.completed); + plan.execute(timeline, start_from_job_idx, plan_hash, import_config, ctx) + .await } struct Planner { @@ -93,8 +119,11 @@ struct Planner { tasks: Vec, } +#[derive(Hash)] struct Plan { jobs: Vec, + // Included here such that it ends up in the hash for the plan + shard: ShardIdentity, } impl Planner { @@ -198,7 +227,10 @@ impl Planner { pgdata_lsn, )); - Ok(Plan { jobs }) + Ok(Plan { + jobs, + shard: self.shard, + }) } #[instrument(level = tracing::Level::DEBUG, skip_all, fields(dboid=%db.dboid, tablespace=%db.spcnode, path=%db.path))] @@ -327,25 +359,45 @@ impl Plan { async fn execute( self, timeline: Arc, + start_after_job_idx: Option, + import_plan_hash: u64, import_config: &TimelineImportConfig, ctx: &RequestContext, ) -> anyhow::Result<()> { + let storcon_client = StorageControllerUpcallClient::new(timeline.conf, &timeline.cancel); + let mut work = FuturesOrdered::new(); let semaphore = Arc::new(Semaphore::new(import_config.import_job_concurrency.into())); let jobs_in_plan = self.jobs.len(); - let mut jobs = self.jobs.into_iter().enumerate().peekable(); - let mut results = Vec::new(); + let mut jobs = self + .jobs + .into_iter() + .enumerate() + .map(|(idx, job)| (idx + 1, job)) + .filter(|(idx, _job)| { + // Filter out any jobs that have been done already + if let Some(start_after) = start_after_job_idx 
{ + *idx > start_after + } else { + true + } + }) + .peekable(); + + let mut last_completed_job_idx = start_after_job_idx.unwrap_or(0); + let checkpoint_every: usize = import_config.import_job_checkpoint_threshold.into(); // Run import jobs concurrently up to the limit specified by the pageserver configuration. // Note that we process completed futures in the oreder of insertion. This will be the // building block for resuming imports across pageserver restarts or tenant migrations. - while results.len() < jobs_in_plan { + while last_completed_job_idx < jobs_in_plan { tokio::select! { permit = semaphore.clone().acquire_owned(), if jobs.peek().is_some() => { let permit = permit.expect("never closed"); let (job_idx, job) = jobs.next().expect("we peeked"); + let job_timeline = timeline.clone(); let ctx = ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Error); @@ -357,13 +409,33 @@ impl Plan { }, maybe_complete_job_idx = work.next() => { match maybe_complete_job_idx { - Some(Ok((_job_idx, res))) => { - results.push(res); + Some(Ok((job_idx, res))) => { + assert!(last_completed_job_idx.checked_add(1).unwrap() == job_idx); + + res?; + last_completed_job_idx = job_idx; + + if last_completed_job_idx % checkpoint_every == 0 { + storcon_client.put_timeline_import_status( + timeline.tenant_shard_id, + timeline.timeline_id, + timeline.generation, + ShardImportStatus::InProgress(Some(ShardImportProgress { + jobs: jobs_in_plan, + completed: last_completed_job_idx, + import_plan_hash, + })) + ) + .await + .map_err(|_err| { + anyhow::anyhow!("Shut down while putting timeline import status") + })?; + } }, Some(Err(_)) => { - results.push(Err(anyhow::anyhow!( - "parallel job panicked or cancelled, check pageserver logs" - ))); + anyhow::bail!( + "import job panicked or cancelled" + ); } None => {} } @@ -371,17 +443,7 @@ impl Plan { } } - if results.iter().all(|r| r.is_ok()) { - Ok(()) - } else { - let mut msg = String::new(); - for result in results { - if let Err(err) 
= result { - msg.push_str(&format!("{err:?}\n\n")); - } - } - bail!("Some parallel jobs failed:\n\n{msg}"); - } + Ok(()) } } @@ -553,6 +615,15 @@ struct ImportSingleKeyTask { buf: Bytes, } +impl Hash for ImportSingleKeyTask { + fn hash(&self, state: &mut H) { + let ImportSingleKeyTask { key, buf } = self; + + key.hash(state); + buf.hash(state); + } +} + impl ImportSingleKeyTask { fn new(key: Key, buf: Bytes) -> Self { ImportSingleKeyTask { key, buf } @@ -581,6 +652,20 @@ struct ImportRelBlocksTask { storage: RemoteStorageWrapper, } +impl Hash for ImportRelBlocksTask { + fn hash(&self, state: &mut H) { + let ImportRelBlocksTask { + shard_identity: _, + key_range, + path, + storage: _, + } = self; + + key_range.hash(state); + path.hash(state); + } +} + impl ImportRelBlocksTask { fn new( shard_identity: ShardIdentity, @@ -665,6 +750,19 @@ struct ImportSlruBlocksTask { storage: RemoteStorageWrapper, } +impl Hash for ImportSlruBlocksTask { + fn hash(&self, state: &mut H) { + let ImportSlruBlocksTask { + key_range, + path, + storage: _, + } = self; + + key_range.hash(state); + path.hash(state); + } +} + impl ImportSlruBlocksTask { fn new(key_range: Range, path: &RemotePath, storage: RemoteStorageWrapper) -> Self { ImportSlruBlocksTask { @@ -707,6 +805,7 @@ impl ImportTask for ImportSlruBlocksTask { } } +#[derive(Hash)] enum AnyImportTask { SingleKey(ImportSingleKeyTask), RelBlocks(ImportRelBlocksTask), @@ -753,6 +852,7 @@ impl From for AnyImportTask { } } +#[derive(Hash)] struct ChunkProcessingJob { range: Range, tasks: Vec, @@ -790,17 +890,51 @@ impl ChunkProcessingJob { let resident_layer = if nimages > 0 { let (desc, path) = writer.finish(ctx).await?; + + { + let guard = timeline.layers.read().await; + let existing_layer = guard.try_get_from_key(&desc.key()); + if let Some(layer) = existing_layer { + if layer.metadata().generation != timeline.generation { + return Err(anyhow::anyhow!( + "Import attempted to rewrite layer file in the same generation: {}", + 
layer.local_path() + )); + } + } + } + Layer::finish_creating(timeline.conf, &timeline, desc, &path)? } else { // dropping the writer cleans up return Ok(()); }; - // this is sharing the same code as create_image_layers + // The same import job might run multiple times since not each job is checkpointed. + // Hence, we must support the cases where the layer already exists. We cannot be + // certain that the existing layer is identical to the new one, so in that case + // we replace the old layer with the one we just generated. + let mut guard = timeline.layers.write().await; - guard - .open_mut()? - .track_new_image_layers(&[resident_layer.clone()], &timeline.metrics); + + let existing_layer = guard + .try_get_from_key(&resident_layer.layer_desc().key()) + .cloned(); + match existing_layer { + Some(existing) => { + guard.open_mut()?.rewrite_layers( + &[(existing.clone(), resident_layer.clone())], + &[], + &timeline.metrics, + ); + } + None => { + guard + .open_mut()? + .track_new_image_layers(&[resident_layer.clone()], &timeline.metrics); + } + } + crate::tenant::timeline::drop_wlock(guard); timeline diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 852005639a..7e4bb627af 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -4082,7 +4082,7 @@ impl Service { /// imports are stored in the database). 
#[instrument(skip_all, fields( tenant_id=%import.tenant_id, - shard_id=%import.timeline_id, + timeline_id=%import.timeline_id, ))] async fn finalize_timeline_import( self: &Arc, diff --git a/storage_controller/src/timeline_import.rs b/storage_controller/src/timeline_import.rs index 5d9d633932..909e8e2899 100644 --- a/storage_controller/src/timeline_import.rs +++ b/storage_controller/src/timeline_import.rs @@ -5,7 +5,7 @@ use http_utils::error::ApiError; use reqwest::Method; use serde::{Deserialize, Serialize}; -use pageserver_api::models::ShardImportStatus; +use pageserver_api::models::{ShardImportProgress, ShardImportStatus}; use tokio_util::sync::CancellationToken; use utils::{ id::{TenantId, TimelineId}, @@ -28,7 +28,12 @@ impl ShardImportStatuses { ShardImportStatuses( shards .into_iter() - .map(|ts_id| (ts_id, ShardImportStatus::InProgress)) + .map(|ts_id| { + ( + ts_id, + ShardImportStatus::InProgress(None::), + ) + }) .collect(), ) } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 2801a0e867..9d86fd027c 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1255,6 +1255,12 @@ class NeonEnv: "no_sync": True, # Look for gaps in WAL received from safekeepeers "validate_wal_contiguity": True, + # TODO(vlad): make these configurable through the builder + "timeline_import_config": { + "import_job_concurrency": 4, + "import_job_soft_size_limit": 512 * 1024, + "import_job_checkpoint_threshold": 4, + }, } # Batching (https://github.com/neondatabase/neon/issues/9377): From 31026d5a3c246956dda9ba4925efdc72ded42de0 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 15 May 2025 17:13:15 +0100 Subject: [PATCH 49/65] pageserver: support import schema evolution (#11935) ## Problem Imports don't support schema evolution nicely. If we want to change the stuff we keep in storcon, we'd have to carry the old cruft around. ## Summary of changes Version import progress. 
Note that the import progress version determines the version of the import job split and execution. This means that we can also use it as a mechanism for deploying new import implementations in the future. --- libs/pageserver_api/src/models.rs | 7 ++- pageserver/src/controller_upcall_client.rs | 49 ++++++------------- pageserver/src/deletion_queue.rs | 2 +- .../src/tenant/timeline/import_pgdata.rs | 2 +- .../src/tenant/timeline/import_pgdata/flow.rs | 32 +++++++++--- 5 files changed, 50 insertions(+), 42 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 58b8d80c0a..e9b37c8ca6 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -342,7 +342,12 @@ pub enum ShardImportStatus { } #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] -pub struct ShardImportProgress { +pub enum ShardImportProgress { + V1(ShardImportProgressV1), +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +pub struct ShardImportProgressV1 { /// Total number of jobs in the import plan pub jobs: usize, /// Number of jobs completed diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs index 779ef3e37d..dc38ea616c 100644 --- a/pageserver/src/controller_upcall_client.rs +++ b/pageserver/src/controller_upcall_client.rs @@ -59,7 +59,7 @@ pub trait StorageControllerUpcallApi { tenant_shard_id: TenantShardId, timeline_id: TimelineId, generation: Generation, - ) -> impl Future, RetryForeverError>> + Send; + ) -> impl Future> + Send; } impl StorageControllerUpcallClient { @@ -104,6 +104,7 @@ impl StorageControllerUpcallClient { &self, url: &url::Url, request: R, + method: reqwest::Method, ) -> Result where R: Serialize, @@ -113,7 +114,7 @@ impl StorageControllerUpcallClient { || async { let response = self .http_client - .post(url.clone()) + .request(method.clone(), url.clone()) .json(&request) .send() .await?; @@ -222,7 +223,9 @@ impl 
StorageControllerUpcallApi for StorageControllerUpcallClient { register: register.clone(), }; - let response: ReAttachResponse = self.retry_http_forever(&url, request).await?; + let response: ReAttachResponse = self + .retry_http_forever(&url, request, reqwest::Method::POST) + .await?; tracing::info!( "Received re-attach response with {} tenants (node {}, register: {:?})", response.tenants.len(), @@ -275,7 +278,9 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient { return Err(RetryForeverError::ShuttingDown); } - let response: ValidateResponse = self.retry_http_forever(&url, request).await?; + let response: ValidateResponse = self + .retry_http_forever(&url, request, reqwest::Method::POST) + .await?; for rt in response.tenants { result.insert(rt.id, rt.valid); } @@ -309,7 +314,8 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient { status, }; - self.retry_http_forever(&url, request).await + self.retry_http_forever(&url, request, reqwest::Method::POST) + .await } #[tracing::instrument(skip_all)] // so that warning logs from retry_http_forever have context @@ -318,7 +324,7 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient { tenant_shard_id: TenantShardId, timeline_id: TimelineId, generation: Generation, - ) -> Result, RetryForeverError> { + ) -> Result { let url = self .base_url .join("timeline_import_status") @@ -330,32 +336,9 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient { generation, }; - Ok(backoff::retry( - || async { - let response = self - .http_client - .get(url.clone()) - .json(&request) - .send() - .await?; - - if let Err(err) = response.error_for_status_ref() { - if matches!(err.status(), Some(reqwest::StatusCode::NOT_FOUND)) { - return Ok(None); - } else { - return Err(err); - } - } - response.json::().await.map(Some) - }, - |_| false, - 3, - u32::MAX, - "storage controller upcall", - &self.cancel, - ) - .await - .ok_or(RetryForeverError::ShuttingDown)? 
- .expect("We retry forever, this should never be reached")) + let response: ShardImportStatus = self + .retry_http_forever(&url, request, reqwest::Method::GET) + .await?; + Ok(response) } } diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 0bbad87c09..7854fd9e36 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -804,7 +804,7 @@ mod test { _tenant_shard_id: TenantShardId, _timeline_id: TimelineId, _generation: Generation, - ) -> Result, RetryForeverError> { + ) -> Result { unimplemented!() } } diff --git a/pageserver/src/tenant/timeline/import_pgdata.rs b/pageserver/src/tenant/timeline/import_pgdata.rs index 602b20df97..658d867c18 100644 --- a/pageserver/src/tenant/timeline/import_pgdata.rs +++ b/pageserver/src/tenant/timeline/import_pgdata.rs @@ -58,7 +58,7 @@ pub async fn doit( .map_err(|_err| anyhow::anyhow!("Shut down while getting timeline import status"))?; info!(?shard_status, "peeking shard status"); - match shard_status.unwrap_or(ShardImportStatus::InProgress(None)) { + match shard_status { ShardImportStatus::InProgress(maybe_progress) => { let storage = importbucket_client::new(timeline.conf, &location, cancel.clone()).await?; diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs index c8c3bdcdfb..3e10a4e6d6 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs @@ -44,7 +44,7 @@ use pageserver_api::key::{ slru_segment_size_to_key, }; use pageserver_api::keyspace::{contiguous_range_len, is_contiguous_range, singleton_range}; -use pageserver_api::models::{ShardImportProgress, ShardImportStatus}; +use pageserver_api::models::{ShardImportProgress, ShardImportProgressV1, ShardImportStatus}; use pageserver_api::reltag::{RelTag, SlruKind}; use pageserver_api::shard::ShardIdentity; use postgres_ffi::relfile_utils::parse_relfilename; @@ -74,6 +74,24 @@ 
pub async fn run( storage: RemoteStorageWrapper, import_progress: Option, ctx: &RequestContext, +) -> anyhow::Result<()> { + // Match how we run the import based on the progress version. + // If there's no import progress, it means that this is a new import + // and we can use whichever version we want. + match import_progress { + Some(ShardImportProgress::V1(progress)) => { + run_v1(timeline, control_file, storage, Some(progress), ctx).await + } + None => run_v1(timeline, control_file, storage, None, ctx).await, + } +} + +async fn run_v1( + timeline: Arc, + control_file: ControlFile, + storage: RemoteStorageWrapper, + import_progress: Option, + ctx: &RequestContext, ) -> anyhow::Result<()> { let planner = Planner { control_file, @@ -416,15 +434,17 @@ impl Plan { last_completed_job_idx = job_idx; if last_completed_job_idx % checkpoint_every == 0 { + let progress = ShardImportProgressV1 { + jobs: jobs_in_plan, + completed: last_completed_job_idx, + import_plan_hash, + }; + storcon_client.put_timeline_import_status( timeline.tenant_shard_id, timeline.timeline_id, timeline.generation, - ShardImportStatus::InProgress(Some(ShardImportProgress { - jobs: jobs_in_plan, - completed: last_completed_job_idx, - import_plan_hash, - })) + ShardImportStatus::InProgress(Some(ShardImportProgress::V1(progress))) ) .await .map_err(|_err| { From a7ce323949d277fa720a612d710b810903c1b1ff Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 15 May 2025 19:48:13 +0200 Subject: [PATCH 50/65] benchmarking: extend `test_page_service_batching.py` to cover concurrent IO + batching under random reads (#10466) This PR commits the benchmarks I ran to qualify concurrent IO before we released it. Changes: - Add `l0stack` fixture; a reusable abstraction for creating a stack of L0 deltas each of which has 1 Value::Delta per page. - Such a stack of L0 deltas is a good and understandable demo for concurrent IO because to reconstruct any page, $layer_stack_height` Values need to be read. 
Before concurrent IO, the reads were sequential. With concurrent IO, they are executed concurrently. - So, switch `test_latency` to use the l0stack. - Teach `pagebench`, which is used by `test_latency`, to limit itself to the blocks of the relation created by the l0stack abstraction. - Additional parametrization of `test_latency` over dimensions `ps_io_concurrency,l0_stack_height,queue_depth` - Use better names for the tests to reflect what they do, leave interpretation of the (now quite high-dimensional) results to the reader - `test_{throughput => postgres_seqscan}` - `test_{latency => random_reads}` - Cut down on permutations to those we use in production. Runtime is about 2min. Refs - concurrent IO epic https://github.com/neondatabase/neon/issues/9378 - batching task: fixes https://github.com/neondatabase/neon/issues/9837 --------- Co-authored-by: Peter Bendel --- libs/pageserver_api/src/key.rs | 5 + .../pagebench/src/cmd/getpage_latest_lsn.rs | 10 +- .../bin/neon_local_create_deep_l0_stack.py | 59 +++++++ test_runner/fixtures/neon_fixtures.py | 11 +- .../pageserver/makelayers/__init__.py | 0 .../fixtures/pageserver/makelayers/l0stack.py | 148 ++++++++++++++++ test_runner/performance/README.md | 3 +- test_runner/performance/out_dir_to_csv.py | 57 ++++++ .../pageserver/test_page_service_batching.py | 167 ++++++++++-------- 9 files changed, 387 insertions(+), 73 deletions(-) create mode 100644 test_runner/bin/neon_local_create_deep_l0_stack.py create mode 100644 test_runner/fixtures/pageserver/makelayers/__init__.py create mode 100644 test_runner/fixtures/pageserver/makelayers/l0stack.py create mode 100644 test_runner/performance/out_dir_to_csv.py diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 0c4d7fd4cb..c14975167b 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -910,6 +910,11 @@ impl Key { self.field1 == 0x00 && self.field4 != 0 && self.field6 != 0xffffffff } + #[inline(always)] + pub fn 
is_rel_block_of_rel(&self, rel: Oid) -> bool { + self.is_rel_block_key() && self.field4 == rel + } + #[inline(always)] pub fn is_rel_dir_key(&self) -> bool { self.field1 == 0x00 diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index 771a7cbe5b..50419ec338 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -65,6 +65,9 @@ pub(crate) struct Args { #[clap(long, default_value = "1")] queue_depth: NonZeroUsize, + #[clap(long)] + only_relnode: Option, + targets: Option>, } @@ -206,7 +209,12 @@ async fn main_impl( for r in partitioning.keys.ranges.iter() { let mut i = r.start; while i != r.end { - if i.is_rel_block_key() { + let mut include = true; + include &= i.is_rel_block_key(); + if let Some(only_relnode) = args.only_relnode { + include &= i.is_rel_block_of_rel(only_relnode); + } + if include { filtered.add_key(i); } i = i.next(); diff --git a/test_runner/bin/neon_local_create_deep_l0_stack.py b/test_runner/bin/neon_local_create_deep_l0_stack.py new file mode 100644 index 0000000000..ebe11f7308 --- /dev/null +++ b/test_runner/bin/neon_local_create_deep_l0_stack.py @@ -0,0 +1,59 @@ +""" +Script to creates a stack of L0 deltas each of which should have 1 Value::Delta per page in `data`, +in your running neon_local setup. + +Use this bash setup to reset your neon_local environment. +The last line of this bash snippet will run this file here. 
+``` + export NEON_REPO_DIR=$PWD/.neon + export NEON_BIN_DIR=$PWD/target/release + $NEON_BIN_DIR/neon_local stop + rm -rf $NEON_REPO_DIR + $NEON_BIN_DIR/neon_local init + cat >> $NEON_REPO_DIR/pageserver_1/pageserver.toml <<"EOF" + # customizations + virtual_file_io_mode = "direct-rw" + page_service_pipelining={mode="pipelined", max_batch_size=32, execution="concurrent-futures"} + get_vectored_concurrent_io={mode="sidecar-task"} +EOF + $NEON_BIN_DIR/neon_local start + + psql 'postgresql://localhost:1235/storage_controller' -c 'DELETE FROM tenant_shards' + sed 's/.*get_vectored_concurrent_io.*/get_vectored_concurrent_io={mode="sidecar-task"}/' -i $NEON_REPO_DIR/pageserver_1/pageserver.toml + $NEON_BIN_DIR/neon_local pageserver restart + sleep 2 + $NEON_BIN_DIR/neon_local tenant create --set-default + ./target/debug/neon_local endpoint stop foo + rm -rf $NEON_REPO_DIR/endpoints/foo + ./target/debug/neon_local endpoint create foo + echo 'full_page_writes=off' >> $NEON_REPO_DIR/endpoints/foo/postgresql.conf + ./target/debug/neon_local endpoint start foo + + pushd test_runner; poetry run python3 -m bin.neon_local_create_deep_l0_stack 10; popd +``` +""" + +import sys + +import psycopg2 +from fixtures.common_types import TenantShardId, TimelineId +from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.makelayers.l0stack import L0StackShape, make_l0_stack_standalone + +ps_http = PageserverHttpClient(port=9898, is_testing_enabled_or_skip=lambda: None) +vps_http = PageserverHttpClient(port=1234, is_testing_enabled_or_skip=lambda: None) + +tenants = ps_http.tenant_list() +assert len(tenants) == 1 +tenant_shard_id = TenantShardId.parse(tenants[0]["id"]) + +timlines = ps_http.timeline_list(tenant_shard_id) +assert len(timlines) == 1 +timeline_id = TimelineId(timlines[0]["timeline_id"]) + +connstr = "postgresql://cloud_admin@localhost:55432/postgres" +conn = psycopg2.connect(connstr) + +shape = L0StackShape(logical_table_size_mib=50, 
delta_stack_height=int(sys.argv[1])) + +make_l0_stack_standalone(vps_http, ps_http, tenant_shard_id, timeline_id, conn, shape) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 9d86fd027c..e413b3c6d2 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1377,7 +1377,11 @@ class NeonEnv: force=config.config_init_force, ) - def start(self, timeout_in_seconds: int | None = None): + def start( + self, + timeout_in_seconds: int | None = None, + extra_ps_env_vars: dict[str, str] | None = None, + ): # Storage controller starts first, so that pageserver /re-attach calls don't # bounce through retries on startup self.storage_controller.start(timeout_in_seconds=timeout_in_seconds) @@ -1396,7 +1400,10 @@ class NeonEnv: for pageserver in self.pageservers: futs.append( executor.submit( - lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds) # type: ignore[misc] + lambda ps=pageserver: ps.start( # type: ignore[misc] + extra_env_vars=extra_ps_env_vars or {}, + timeout_in_seconds=timeout_in_seconds, + ), ) ) diff --git a/test_runner/fixtures/pageserver/makelayers/__init__.py b/test_runner/fixtures/pageserver/makelayers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test_runner/fixtures/pageserver/makelayers/l0stack.py b/test_runner/fixtures/pageserver/makelayers/l0stack.py new file mode 100644 index 0000000000..408ba1254f --- /dev/null +++ b/test_runner/fixtures/pageserver/makelayers/l0stack.py @@ -0,0 +1,148 @@ +from dataclasses import dataclass + +from psycopg2.extensions import connection as PgConnection + +from fixtures.common_types import Lsn, TenantShardId, TimelineId +from fixtures.log_helper import log +from fixtures.neon_fixtures import Endpoint +from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.utils import wait_for_last_record_lsn + + +@dataclass +class L0StackShape: + logical_table_size_mib: int = 50 + 
delta_stack_height: int = 20 + + +def make_l0_stack(endpoint: Endpoint, shape: L0StackShape): + """ + Creates stack of L0 deltas each of which should have 1 Value::Delta per page in table `data`. + """ + env = endpoint.env + + # TDOO: wait for storcon to finish any reonciles before jumping to action here? + description = env.storage_controller.tenant_describe(endpoint.tenant_id) + shards = description["shards"] + assert len(shards) == 1, "does not support sharding" + tenant_shard_id = TenantShardId.parse(shards[0]["tenant_shard_id"]) + + endpoint.config(["full_page_writes=off"]) + endpoint.reconfigure() + + ps = env.get_pageserver(shards[0]["node_attached"]) + + timeline_id = endpoint.show_timeline_id() + + vps_http = env.storage_controller.pageserver_api() + ps_http = ps.http_client() + endpoint_conn = endpoint.connect() + make_l0_stack_standalone(vps_http, ps_http, tenant_shard_id, timeline_id, endpoint_conn, shape) + + +def make_l0_stack_standalone( + vps_http: PageserverHttpClient, + ps_http: PageserverHttpClient, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + endpoint_conn: PgConnection, + shape: L0StackShape, +): + """ + See make_l0_stack for details. + + This function is a standalone version of make_l0_stack, usable from not-test code. 
+ """ + + assert not tenant_shard_id.shard_index.is_sharded, ( + "the current implementation only supports unsharded tenants" + ) + + tenant_id = tenant_shard_id.tenant_id + conn = endpoint_conn + desired_size = shape.logical_table_size_mib * 1024 * 1024 + + config = { + "gc_period": "0s", # disable periodic gc + "checkpoint_timeout": "10 years", + "compaction_period": "1h", # doesn't matter, but 0 value will kill walredo every 10s + "compaction_threshold": 100000, # we just want L0s + "compaction_target_size": 134217728, + "checkpoint_distance": 268435456, + "image_creation_threshold": 100000, # we just want L0s + } + + vps_http.set_tenant_config(tenant_id, config) + + conn.autocommit = True + cur = conn.cursor() + + # Ensure full_page_writes are disabled so that all Value::Delta in + # pageserver are !will_init, and therefore a getpage needs to read + # the entire delta stack. + cur.execute("SHOW full_page_writes") + assert cur.fetchall()[0][0] == "off", "full_page_writes should be off" + + # each tuple is 23 (header) + 100 bytes = 123 bytes + # page header si 24 bytes + # 8k page size + # (8k-24bytes) / 123 bytes = 63 tuples per page + # set fillfactor to 10 to have 6 tuples per page + cur.execute("DROP TABLE IF EXISTS data") + cur.execute("CREATE TABLE data(id bigint, row char(92)) with (fillfactor=10)") + need_pages = desired_size // 8192 + need_rows = need_pages * 6 + log.info(f"Need {need_pages} pages, {need_rows} rows") + cur.execute(f"INSERT INTO data SELECT i,'row'||i FROM generate_series(1, {need_rows}) as i") + # Raise fillfactor to 100% so that all updates are HOT updates. + # We assert they're hot updates by checking fetch_id_to_page_mapping remains the same. 
+ cur.execute("ALTER TABLE data SET (fillfactor=100)") + + def settle_and_flush(): + cur.execute("SELECT pg_current_wal_flush_lsn()") + flush_lsn = Lsn(cur.fetchall()[0][0]) + wait_for_last_record_lsn(ps_http, tenant_shard_id, timeline_id, flush_lsn) + ps_http.timeline_checkpoint(tenant_id, timeline_id) + + # create an L0 for the initial data we just inserted + settle_and_flush() + + # assert we wrote what we think we wrote + cur.execute(""" + with ntuples_per_page as ( + select (ctid::text::point)[0]::bigint pageno,count(*) ntuples from data group by pageno + ) + select ntuples, count(*) npages from ntuples_per_page group by ntuples order by ntuples; + """) + rows = cur.fetchall() + log.info(f"initial table layout: {rows}") + assert len(rows) == 1 + assert rows[0][0] == 6, f"expected 6 tuples per page, got {rows[0][0]}" + assert rows[0][1] == need_pages, f"expected {need_pages} pages, got {rows[0][1]}" + + def fetch_id_to_page_mapping(): + cur.execute(""" + SELECT id,(ctid::text::point)[0]::bigint pageno FROM data ORDER BY id + """) + return cur.fetchall() + + initial_mapping = fetch_id_to_page_mapping() + + # every iteration updates one tuple in each page + delta_stack_height = shape.delta_stack_height + for i in range(0, delta_stack_height): + log.info(i) + cur.execute(f"UPDATE data set row = row||',u' where id % 6 = {i % 6}") + log.info(f"modified rows: {cur.rowcount}") + assert cur.rowcount == need_pages + settle_and_flush() + post_update_mapping = fetch_id_to_page_mapping() + assert initial_mapping == post_update_mapping, "Postgres should be doing HOT updates" + + # Assert the layer count is what we expect it is + layer_map = vps_http.layer_map_info(tenant_id, timeline_id) + assert ( + len(layer_map.delta_l0_layers()) == delta_stack_height + 1 + 1 + ) # +1 for the initdb layer + 1 for the table creation & fill + assert len(layer_map.delta_l0_layers()) == len(layer_map.delta_layers()) # it's all L0s + assert len(layer_map.image_layers()) == 0 # no images diff 
--git a/test_runner/performance/README.md b/test_runner/performance/README.md index 3b25a60e9b..21844648d1 100644 --- a/test_runner/performance/README.md +++ b/test_runner/performance/README.md @@ -15,7 +15,8 @@ Some handy pytest flags for local development: - `-k` selects a test to run - `--timeout=0` disables our default timeout of 300s (see `setup.cfg`) - `--preserve-database-files` to skip cleanup -- `--out-dir` to produce a JSON with the recorded test metrics +- `--out-dir` to produce a JSON with the recorded test metrics. + There is a post-processing tool at `test_runner/performance/out_dir_to_csv.py`. # What performance tests do we have and how we run them diff --git a/test_runner/performance/out_dir_to_csv.py b/test_runner/performance/out_dir_to_csv.py new file mode 100644 index 0000000000..8647ad4acc --- /dev/null +++ b/test_runner/performance/out_dir_to_csv.py @@ -0,0 +1,57 @@ +# Tool to convert the JSON output from running a perf test with `--out-dir` to a CSV that +# can be easily pasted into a spreadsheet for quick viz & analysis. +# Check the `./README.md` in this directory for `--out-dir`. 
+# +# TODO: add the pytest.mark.parametrize to the json and make them columns here +# https://github.com/neondatabase/neon/issues/11878 + +import csv +import json +import os +import sys + + +def json_to_csv(json_file): + with open(json_file) as f: + data = json.load(f) + + # Collect all possible metric names to form headers + all_metrics = set() + for result in data.get("result", []): + for metric in result.get("data", []): + all_metrics.add(metric["name"]) + + # Sort metrics for consistent output + metrics = sorted(list(all_metrics)) + + # Create headers + headers = ["suit"] + metrics + + # Prepare rows + rows = [] + for result in data.get("result", []): + row = {"suit": result["suit"]} + + # Initialize all metrics to empty + for metric in metrics: + row[metric] = "" + + # Fill in available metrics + for item in result.get("data", []): + row[item["name"]] = item["value"] + + rows.append(row) + + # Write to stdout as CSV + writer = csv.DictWriter(sys.stdout, fieldnames=headers) + writer.writeheader() + writer.writerows(rows) + + +if __name__ == "__main__": + if len(sys.argv) < 2: + print(f"Usage: python {os.path.basename(__file__)} ") + sys.exit(1) + + json_file = sys.argv[1] + json_to_csv(json_file) diff --git a/test_runner/performance/pageserver/test_page_service_batching.py b/test_runner/performance/pageserver/test_page_service_batching.py index b17ca772c9..9e2312311a 100644 --- a/test_runner/performance/pageserver/test_page_service_batching.py +++ b/test_runner/performance/pageserver/test_page_service_batching.py @@ -10,7 +10,8 @@ from typing import Any import pytest from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin +from fixtures.pageserver.makelayers import l0stack from fixtures.utils import humantime_to_ms TARGET_RUNTIME = 30 @@ -34,28 +35,18 @@ class 
PageServicePipeliningConfigPipelined(PageServicePipeliningConfig): mode: str = "pipelined" -EXECUTION = ["concurrent-futures"] -BATCHING = ["uniform-lsn", "scattered-lsn"] - -NON_BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()] -for max_batch_size in [1, 32]: - for execution in EXECUTION: - for batching in BATCHING: - NON_BATCHABLE.append( - PageServicePipeliningConfigPipelined(max_batch_size, execution, batching) - ) - -BATCHABLE: list[PageServicePipeliningConfig] = [] +PS_IO_CONCURRENCY = ["sidecar-task"] +PIPELINING_CONFIGS: list[PageServicePipeliningConfig] = [] for max_batch_size in [32]: - for execution in EXECUTION: - for batching in BATCHING: - BATCHABLE.append( + for execution in ["concurrent-futures"]: + for batching in ["scattered-lsn"]: + PIPELINING_CONFIGS.append( PageServicePipeliningConfigPipelined(max_batch_size, execution, batching) ) @pytest.mark.parametrize( - "tablesize_mib, pipelining_config, target_runtime, effective_io_concurrency, readhead_buffer_size, name", + "tablesize_mib, pipelining_config, target_runtime, ps_io_concurrency, effective_io_concurrency, readhead_buffer_size, name", [ # batchable workloads should show throughput and CPU efficiency improvements *[ @@ -63,20 +54,23 @@ for max_batch_size in [32]: 50, config, TARGET_RUNTIME, + ps_io_concurrency, 100, 128, f"batchable {dataclasses.asdict(config)}", ) - for config in BATCHABLE + for config in PIPELINING_CONFIGS + for ps_io_concurrency in PS_IO_CONCURRENCY ], ], ) -def test_throughput( +def test_postgres_seqscan( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, tablesize_mib: int, pipelining_config: PageServicePipeliningConfig, target_runtime: int, + ps_io_concurrency: str, effective_io_concurrency: int, readhead_buffer_size: int, name: str, @@ -97,6 +91,10 @@ def test_throughput( If the compute provides pipeline depth (effective_io_concurrency=100), then pipelining configs, especially with max_batch_size>1 should yield dramatic 
improvements in all performance metrics. + + We advance the LSN from a disruptor thread to simulate the effect of a workload with concurrent writes + in another table. The `scattered-lsn` batching mode handles this well whereas the + initial implementatin (`uniform-lsn`) would break the batch. """ # @@ -114,7 +112,19 @@ def test_throughput( } ) # For storing configuration as a metric, insert a fake 0 with labels with actual data - params.update({"pipelining_config": (0, {"labels": dataclasses.asdict(pipelining_config)})}) + params.update( + { + "config": ( + 0, + { + "labels": { + "pipelining_config": dataclasses.asdict(pipelining_config), + "ps_io_concurrency": ps_io_concurrency, + } + }, + ) + } + ) log.info("params: %s", params) @@ -266,7 +276,10 @@ def test_throughput( return iters env.pageserver.patch_config_toml_nonrecursive( - {"page_service_pipelining": dataclasses.asdict(pipelining_config)} + { + "page_service_pipelining": dataclasses.asdict(pipelining_config), + "get_vectored_concurrent_io": {"mode": ps_io_concurrency}, + } ) # set trace for log analysis below @@ -318,77 +331,63 @@ def test_throughput( ) -PRECISION_CONFIGS: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()] -for max_batch_size in [1, 32]: - for execution in EXECUTION: - for batching in BATCHING: - PRECISION_CONFIGS.append( - PageServicePipeliningConfigPipelined(max_batch_size, execution, batching) - ) - - @pytest.mark.parametrize( - "pipelining_config,name", - [(config, f"{dataclasses.asdict(config)}") for config in PRECISION_CONFIGS], + "pipelining_config,ps_io_concurrency,l0_stack_height,queue_depth,name", + [ + (config, ps_io_concurrency, l0_stack_height, queue_depth, f"{dataclasses.asdict(config)}") + for config in PIPELINING_CONFIGS + for ps_io_concurrency in PS_IO_CONCURRENCY + for queue_depth in [1, 2, 32] + for l0_stack_height in [0, 20] + ], ) -def test_latency( +def test_random_reads( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, pg_bin: 
PgBin, pipelining_config: PageServicePipeliningConfig, + ps_io_concurrency: str, + l0_stack_height: int, + queue_depth: int, name: str, ): """ - Measure the latency impact of pipelining in an un-batchable workloads. - - An ideal implementation should not increase average or tail latencies for such workloads. - - We don't have support in pagebench to create queue depth yet. - => https://github.com/neondatabase/neon/issues/9837 + Throw pagebench random getpage at latest lsn workload from a single client against pageserver. """ # # Setup # + def build_snapshot_cb(neon_env_builder: NeonEnvBuilder) -> NeonEnv: + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start("main") + l0stack.make_l0_stack( + endpoint, + l0stack.L0StackShape(logical_table_size_mib=50, delta_stack_height=l0_stack_height), + ) + return env + + env = neon_env_builder.build_and_use_snapshot( + f"test_page_service_batching--test_pagebench-{l0_stack_height}", build_snapshot_cb + ) + def patch_ps_config(ps_config): - if pipelining_config is not None: - ps_config["page_service_pipelining"] = dataclasses.asdict(pipelining_config) + ps_config["page_service_pipelining"] = dataclasses.asdict(pipelining_config) + ps_config["get_vectored_concurrent_io"] = {"mode": ps_io_concurrency} - neon_env_builder.pageserver_config_override = patch_ps_config + env.pageserver.edit_config_toml(patch_ps_config) - env = neon_env_builder.init_start() - endpoint = env.endpoints.create_start("main") - conn = endpoint.connect() - cur = conn.cursor() + env.start() - cur.execute("SET max_parallel_workers_per_gather=0") # disable parallel backends - cur.execute("SET effective_io_concurrency=1") - - cur.execute("CREATE EXTENSION IF NOT EXISTS neon;") - cur.execute("CREATE EXTENSION IF NOT EXISTS neon_test_utils;") - - log.info("Filling the table") - cur.execute("CREATE TABLE t (data char(1000)) with (fillfactor=10)") - tablesize = 50 * 1024 * 1024 - npages = tablesize // (8 * 1024) - cur.execute("INSERT INTO t 
SELECT generate_series(1, %s)", (npages,)) - # TODO: can we force postgres to do sequential scans? - - cur.close() - conn.close() - - wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline) - - endpoint.stop() + lsn = env.safekeepers[0].get_commit_lsn(env.initial_tenant, env.initial_timeline) + ep = env.endpoints.create_start("main", lsn=lsn) + data_table_relnode_oid = ep.safe_psql_scalar("SELECT 'data'::regclass::oid") + ep.stop_and_destroy() for sk in env.safekeepers: sk.stop() - # - # Run single-threaded pagebench (TODO: dedup with other benchmark code) - # - env.pageserver.allowed_errors.append( # https://github.com/neondatabase/neon/issues/6925 r".*query handler for.*pagestream.*failed: unexpected message: CopyFail during COPY.*" @@ -396,6 +395,8 @@ def test_latency( ps_http = env.pageserver.http_client() + metrics_before = ps_http.get_metrics() + cmd = [ str(env.neon_binpath / "pagebench"), "get-page-latest-lsn", @@ -405,6 +406,10 @@ def test_latency( env.pageserver.connstr(password=None), "--num-clients", "1", + "--queue-depth", + str(queue_depth), + "--only-relnode", + str(data_table_relnode_oid), "--runtime", "10s", ] @@ -413,12 +418,22 @@ def test_latency( results_path = Path(basepath + ".stdout") log.info(f"Benchmark results at: {results_path}") + metrics_after = ps_http.get_metrics() + with open(results_path) as f: results = json.load(f) log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}") total = results["total"] + metric = "request_count" + zenbenchmark.record( + metric, + metric_value=total[metric], + unit="", + report=MetricReport.HIGHER_IS_BETTER, + ) + metric = "latency_mean" zenbenchmark.record( metric, @@ -435,3 +450,17 @@ def test_latency( unit="ms", report=MetricReport.LOWER_IS_BETTER, ) + + reads_before = metrics_before.query_one( + "pageserver_io_operations_seconds_count", filter={"operation": "read"} + ) + reads_after = metrics_after.query_one( + "pageserver_io_operations_seconds_count", 
filter={"operation": "read"} + ) + + zenbenchmark.record( + "virtual_file_reads", + metric_value=reads_after.value - reads_before.value, + unit="", + report=MetricReport.LOWER_IS_BETTER, + ) From 2d247375b3b10d80b1f235aa0e12bd41d626d54a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 16 May 2025 14:21:24 +0200 Subject: [PATCH 51/65] Update rust to 1.87.0 (#11938) We keep the practice of keeping the compiler up to date, pointing to the latest release. This is done by many other projects in the Rust ecosystem as well. The 1.87.0 release marks 10 years of Rust. [Announcement blog post](https://blog.rust-lang.org/2025/05/15/Rust-1.87.0/) Prior update was in #11431 --- build-tools.Dockerfile | 2 +- pageserver/src/virtual_file/io_engine.rs | 4 +--- proxy/src/binary/pg_sni_router.rs | 1 + proxy/src/binary/proxy.rs | 2 +- rust-toolchain.toml | 2 +- storage_controller/src/scheduler.rs | 6 +----- 6 files changed, 6 insertions(+), 11 deletions(-) diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index f63d844afd..1933fd19d8 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -292,7 +292,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.86.0 +ENV RUSTC_VERSION=1.87.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" ARG RUSTFILT_VERSION=0.2.1 diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index 7827682498..3cde34eda7 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -124,9 +124,7 @@ pub(super) fn epoll_uring_error_to_std( ) -> std::io::Error { match e { tokio_epoll_uring::Error::Op(e) => e, - tokio_epoll_uring::Error::System(system) => { - std::io::Error::new(std::io::ErrorKind::Other, system) - } + tokio_epoll_uring::Error::System(system) => 
std::io::Error::other(system), } } diff --git a/proxy/src/binary/pg_sni_router.rs b/proxy/src/binary/pg_sni_router.rs index 2239d064b2..3e87538ae7 100644 --- a/proxy/src/binary/pg_sni_router.rs +++ b/proxy/src/binary/pg_sni_router.rs @@ -394,6 +394,7 @@ async fn handle_client( } } +#[allow(clippy::large_enum_variant)] enum Connection { Raw(tokio::net::TcpStream), Tls(tokio_rustls::client::TlsStream), diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index fe0d551f7f..4cb5ddc335 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -862,7 +862,7 @@ async fn configure_redis( ("irsa", _) => match (&args.redis_host, args.redis_port) { (Some(host), Some(port)) => Some( ConnectionWithCredentialsProvider::new_with_credentials_provider( - host.to_string(), + host.clone(), port, elasticache::CredentialsProvider::new( args.aws_region.clone(), diff --git a/rust-toolchain.toml b/rust-toolchain.toml index a0d5970bd5..c48def3483 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.86.0" +channel = "1.87.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. 
# https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 3d5f36fb98..773373391e 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -628,11 +628,7 @@ impl Scheduler { tracing::trace!(%node_id, "attached_shard_count={} shard_count={} expected={}", node.attached_shard_count, node.shard_count, expected_attached_shards_per_node); } - if node.attached_shard_count < expected_attached_shards_per_node { - expected_attached_shards_per_node - node.attached_shard_count - } else { - 0 - } + expected_attached_shards_per_node.saturating_sub(node.attached_shard_count) } pub(crate) fn expected_attached_shard_count(&self) -> usize { From aa22572d8c7602c1e6b26c0afde2df3a4e90f36d Mon Sep 17 00:00:00 2001 From: Evan Fleming Date: Fri, 16 May 2025 05:41:10 -0700 Subject: [PATCH 52/65] safekeeper: refactor static remote storage usage to use Arc (#10179) Greetings! Please add `w=1` to github url when viewing diff (specifically `wal_backup.rs`) ## Problem This PR is aimed at addressing the remaining work of #8200. Namely, removing static usage of remote storage in favour of arc. I did not opt to pass `Arc` directly since it is actually `Optional` as it is not necessarily always configured. I wanted to avoid having to pass `Arc>` everywhere with individual consuming functions likely needing to handle unwrapping. Instead I've added a `WalBackup` struct that holds `Optional` and handles initialization/unwrapping RemoteStorage internally. wal_backup functions now take self and `Arc` is passed as a dependency through the various consumers that need it. 
## Summary of changes - Add `WalBackup` that holds `Optional` and handles initialization and unwrapping - Modify wal_backup functions to take `WalBackup` as self (Add `w=1` to github url when viewing diff here) - Initialize `WalBackup` in safekeeper root - Store `Arc` in `GlobalTimelineMap` and pass and store in each Timeline as loaded - use `WalBackup` through Timeline as needed ## Refs - task to remove global variables https://github.com/neondatabase/neon/issues/8200 - drive-by fixes https://github.com/neondatabase/neon/issues/11501 by turning the panic reported there into an error `remote storage not configured` --------- Co-authored-by: Christian Schwarz --- safekeeper/src/bin/safekeeper.rs | 9 +- safekeeper/src/copy_timeline.rs | 3 + safekeeper/src/http/routes.rs | 10 ++- safekeeper/src/lib.rs | 6 -- safekeeper/src/pull_timeline.rs | 43 +++++++-- safekeeper/src/test_utils.rs | 6 +- safekeeper/src/timeline.rs | 28 ++++-- safekeeper/src/timeline_eviction.rs | 47 +++++++--- safekeeper/src/timeline_manager.rs | 26 ++++-- safekeeper/src/timelines_global_map.rs | 41 +++++++-- safekeeper/src/wal_backup.rs | 115 ++++++++++++++----------- safekeeper/src/wal_backup_partial.rs | 21 +++-- safekeeper/src/wal_storage.rs | 13 +-- 13 files changed, 255 insertions(+), 113 deletions(-) diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index c267a55cb6..8d31ada24f 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -22,9 +22,10 @@ use safekeeper::defaults::{ DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, DEFAULT_SSL_CERT_FILE, DEFAULT_SSL_CERT_RELOAD_PERIOD, DEFAULT_SSL_KEY_FILE, }; +use safekeeper::wal_backup::WalBackup; use safekeeper::{ BACKGROUND_RUNTIME, BROKER_RUNTIME, GlobalTimelines, HTTP_RUNTIME, SafeKeeperConf, - WAL_SERVICE_RUNTIME, broker, control_file, http, wal_backup, wal_service, + WAL_SERVICE_RUNTIME, broker, control_file, http, wal_service, }; use sd_notify::NotifyState; use 
storage_broker::{DEFAULT_ENDPOINT, Uri}; @@ -484,15 +485,15 @@ async fn start_safekeeper(conf: Arc) -> Result<()> { None => None, }; - let global_timelines = Arc::new(GlobalTimelines::new(conf.clone())); + let wal_backup = Arc::new(WalBackup::new(&conf).await?); + + let global_timelines = Arc::new(GlobalTimelines::new(conf.clone(), wal_backup.clone())); // Register metrics collector for active timelines. It's important to do this // after daemonizing, otherwise process collector will be upset. let timeline_collector = safekeeper::metrics::TimelineCollector::new(global_timelines.clone()); metrics::register_internal(Box::new(timeline_collector))?; - wal_backup::init_remote_storage(&conf).await; - // Keep handles to main tasks to die if any of them disappears. let mut tasks_handles: FuturesUnordered> = FuturesUnordered::new(); diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs index 11daff22cb..7984c2e2b9 100644 --- a/safekeeper/src/copy_timeline.rs +++ b/safekeeper/src/copy_timeline.rs @@ -3,6 +3,7 @@ use std::sync::Arc; use anyhow::{Result, bail}; use camino::Utf8PathBuf; use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE}; +use remote_storage::GenericRemoteStorage; use safekeeper_api::membership::Configuration; use tokio::fs::OpenOptions; use tokio::io::{AsyncSeekExt, AsyncWriteExt}; @@ -30,6 +31,7 @@ pub struct Request { pub async fn handle_request( request: Request, global_timelines: Arc, + storage: Arc, ) -> Result<()> { // TODO: request.until_lsn MUST be a valid LSN, and we cannot check it :( // if LSN will point to the middle of a WAL record, timeline will be in "broken" state @@ -127,6 +129,7 @@ pub async fn handle_request( assert!(first_ondisk_segment >= first_segment); copy_s3_segments( + &storage, wal_seg_size, &request.source_ttid, &request.destination_ttid, diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 1a25b07496..384c582678 100644 --- a/safekeeper/src/http/routes.rs +++ 
b/safekeeper/src/http/routes.rs @@ -258,6 +258,7 @@ async fn timeline_snapshot_handler(request: Request) -> Result, // so create the chan and write to it in another task. @@ -269,6 +270,7 @@ async fn timeline_snapshot_handler(request: Request) -> Result) -> Result bool { - self.remote_storage.is_some() && self.wal_backup_enabled - } -} - impl SafeKeeperConf { pub fn dummy() -> Self { SafeKeeperConf { diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index c955e667bd..14aef1ee5e 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -9,6 +9,7 @@ use chrono::{DateTime, Utc}; use futures::{SinkExt, StreamExt, TryStreamExt}; use http_utils::error::ApiError; use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo}; +use remote_storage::GenericRemoteStorage; use reqwest::Certificate; use safekeeper_api::Term; use safekeeper_api::models::{PullTimelineRequest, PullTimelineResponse, TimelineStatus}; @@ -43,6 +44,7 @@ pub async fn stream_snapshot( source: NodeId, destination: NodeId, tx: mpsc::Sender>, + storage: Option>, ) { match tli.try_wal_residence_guard().await { Err(e) => { @@ -53,10 +55,32 @@ pub async fn stream_snapshot( Ok(maybe_resident_tli) => { if let Err(e) = match maybe_resident_tli { Some(resident_tli) => { - stream_snapshot_resident_guts(resident_tli, source, destination, tx.clone()) - .await + stream_snapshot_resident_guts( + resident_tli, + source, + destination, + tx.clone(), + storage, + ) + .await + } + None => { + if let Some(storage) = storage { + stream_snapshot_offloaded_guts( + tli, + source, + destination, + tx.clone(), + &storage, + ) + .await + } else { + tx.send(Err(anyhow!("remote storage not configured"))) + .await + .ok(); + return; + } } - None => stream_snapshot_offloaded_guts(tli, source, destination, tx.clone()).await, } { // Error type/contents don't matter as they won't can't reach the client // (hyper likely doesn't do anything with it), but http stream will be @@ -123,10 +147,12 
@@ pub(crate) async fn stream_snapshot_offloaded_guts( source: NodeId, destination: NodeId, tx: mpsc::Sender>, + storage: &GenericRemoteStorage, ) -> Result<()> { let mut ar = prepare_tar_stream(tx); - tli.snapshot_offloaded(&mut ar, source, destination).await?; + tli.snapshot_offloaded(&mut ar, source, destination, storage) + .await?; ar.finish().await?; @@ -139,10 +165,13 @@ pub async fn stream_snapshot_resident_guts( source: NodeId, destination: NodeId, tx: mpsc::Sender>, + storage: Option>, ) -> Result<()> { let mut ar = prepare_tar_stream(tx); - let bctx = tli.start_snapshot(&mut ar, source, destination).await?; + let bctx = tli + .start_snapshot(&mut ar, source, destination, storage) + .await?; pausable_failpoint!("sk-snapshot-after-list-pausable"); let tli_dir = tli.get_timeline_dir(); @@ -182,6 +211,7 @@ impl Timeline { ar: &mut tokio_tar::Builder, source: NodeId, destination: NodeId, + storage: &GenericRemoteStorage, ) -> Result<()> { // Take initial copy of control file, then release state lock let mut control_file = { @@ -216,6 +246,7 @@ impl Timeline { // can fail if the timeline was un-evicted and modified in the background. 
let remote_timeline_path = &self.remote_path; wal_backup::copy_partial_segment( + storage, &replace.previous.remote_path(remote_timeline_path), &replace.current.remote_path(remote_timeline_path), ) @@ -262,6 +293,7 @@ impl WalResidentTimeline { ar: &mut tokio_tar::Builder, source: NodeId, destination: NodeId, + storage: Option>, ) -> Result { let mut shared_state = self.write_shared_state().await; let wal_seg_size = shared_state.get_wal_seg_size(); @@ -283,6 +315,7 @@ impl WalResidentTimeline { let remote_timeline_path = &self.tli.remote_path; wal_backup::copy_partial_segment( + &*storage.context("remote storage not configured")?, &replace.previous.remote_path(remote_timeline_path), &replace.current.remote_path(remote_timeline_path), ) diff --git a/safekeeper/src/test_utils.rs b/safekeeper/src/test_utils.rs index 618e2b59d2..e2817c8337 100644 --- a/safekeeper/src/test_utils.rs +++ b/safekeeper/src/test_utils.rs @@ -18,7 +18,7 @@ use crate::send_wal::EndWatch; use crate::state::{TimelinePersistentState, TimelineState}; use crate::timeline::{SharedState, StateSK, Timeline, get_timeline_dir}; use crate::timelines_set::TimelinesSet; -use crate::wal_backup::remote_timeline_path; +use crate::wal_backup::{WalBackup, remote_timeline_path}; use crate::{SafeKeeperConf, control_file, receive_wal, wal_storage}; /// A Safekeeper testing or benchmarking environment. Uses a tempdir for storage, removed on drop. 
@@ -101,18 +101,22 @@ impl Env { let safekeeper = self.make_safekeeper(node_id, ttid, start_lsn).await?; let shared_state = SharedState::new(StateSK::Loaded(safekeeper)); + let wal_backup = Arc::new(WalBackup::new(&conf).await?); + let timeline = Timeline::new( ttid, &timeline_dir, &remote_path, shared_state, conf.clone(), + wal_backup.clone(), ); timeline.bootstrap( &mut timeline.write_shared_state().await, &conf, Arc::new(TimelinesSet::default()), // ignored for now RateLimiter::new(0, 0), + wal_backup, ); Ok(timeline) } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index b7ba28f435..588bd4f2c9 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -35,7 +35,8 @@ use crate::state::{EvictionState, TimelineMemState, TimelinePersistentState, Tim use crate::timeline_guard::ResidenceGuard; use crate::timeline_manager::{AtomicStatus, ManagerCtl}; use crate::timelines_set::TimelinesSet; -use crate::wal_backup::{self, remote_timeline_path}; +use crate::wal_backup; +use crate::wal_backup::{WalBackup, remote_timeline_path}; use crate::wal_backup_partial::PartialRemoteSegment; use crate::wal_storage::{Storage as wal_storage_iface, WalReader}; use crate::{SafeKeeperConf, control_file, debug_dump, timeline_manager, wal_storage}; @@ -452,6 +453,8 @@ pub struct Timeline { manager_ctl: ManagerCtl, conf: Arc, + pub(crate) wal_backup: Arc, + remote_deletion: std::sync::Mutex>, /// Hold this gate from code that depends on the Timeline's non-shut-down state. 
While holding @@ -476,6 +479,7 @@ impl Timeline { remote_path: &RemotePath, shared_state: SharedState, conf: Arc, + wal_backup: Arc, ) -> Arc { let (commit_lsn_watch_tx, commit_lsn_watch_rx) = watch::channel(shared_state.sk.state().commit_lsn); @@ -509,6 +513,7 @@ impl Timeline { wal_backup_active: AtomicBool::new(false), last_removed_segno: AtomicU64::new(0), mgr_status: AtomicStatus::new(), + wal_backup, }) } @@ -516,6 +521,7 @@ impl Timeline { pub fn load_timeline( conf: Arc, ttid: TenantTimelineId, + wal_backup: Arc, ) -> Result> { let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered(); @@ -529,6 +535,7 @@ impl Timeline { &remote_path, shared_state, conf, + wal_backup, )) } @@ -539,6 +546,7 @@ impl Timeline { conf: &SafeKeeperConf, broker_active_set: Arc, partial_backup_rate_limiter: RateLimiter, + wal_backup: Arc, ) { let (tx, rx) = self.manager_ctl.bootstrap_manager(); @@ -561,6 +569,7 @@ impl Timeline { tx, rx, partial_backup_rate_limiter, + wal_backup, ) .await } @@ -606,9 +615,10 @@ impl Timeline { // it is cancelled, so WAL storage won't be opened again. 
shared_state.sk.close_wal_store(); - if !only_local && self.conf.is_wal_backup_enabled() { + if !only_local { self.remote_delete().await?; } + let dir_existed = delete_dir(&self.timeline_dir).await?; Ok(dir_existed) } @@ -675,11 +685,20 @@ impl Timeline { guard: &mut std::sync::MutexGuard>, ) -> RemoteDeletionReceiver { tracing::info!("starting remote deletion"); + let storage = self.wal_backup.get_storage().clone(); let (result_tx, result_rx) = tokio::sync::watch::channel(None); let ttid = self.ttid; tokio::task::spawn( async move { - let r = wal_backup::delete_timeline(&ttid).await; + let r = if let Some(storage) = storage { + wal_backup::delete_timeline(&storage, &ttid).await + } else { + tracing::info!( + "skipping remote deletion because no remote storage is configured; this effectively leaks the objects in remote storage" + ); + Ok(()) + }; + if let Err(e) = &r { // Log error here in case nobody ever listens for our result (e.g. dropped API request) tracing::error!("remote deletion failed: {e}"); @@ -1046,14 +1065,13 @@ impl WalResidentTimeline { pub async fn get_walreader(&self, start_lsn: Lsn) -> Result { let (_, persisted_state) = self.get_state().await; - let enable_remote_read = self.conf.is_wal_backup_enabled(); WalReader::new( &self.ttid, self.timeline_dir.clone(), &persisted_state, start_lsn, - enable_remote_read, + self.wal_backup.clone(), ) } diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index 84c636daf6..e817dbf6f9 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -6,7 +6,7 @@ use anyhow::Context; use camino::Utf8PathBuf; -use remote_storage::RemotePath; +use remote_storage::{GenericRemoteStorage, RemotePath}; use tokio::fs::File; use tokio::io::{AsyncRead, AsyncWriteExt}; use tracing::{debug, info, instrument, warn}; @@ -68,6 +68,10 @@ impl Manager { #[instrument(name = "evict_timeline", skip_all)] pub(crate) async fn evict_timeline(&mut self) -> bool { 
assert!(!self.is_offloaded); + let Some(storage) = self.wal_backup.get_storage() else { + warn!("no remote storage configured, skipping uneviction"); + return false; + }; let partial_backup_uploaded = match &self.partial_backup_uploaded { Some(p) => p.clone(), None => { @@ -87,7 +91,7 @@ impl Manager { .inc(); }); - if let Err(e) = do_eviction(self, &partial_backup_uploaded).await { + if let Err(e) = do_eviction(self, &partial_backup_uploaded, &storage).await { warn!("failed to evict timeline: {:?}", e); return false; } @@ -102,6 +106,10 @@ impl Manager { #[instrument(name = "unevict_timeline", skip_all)] pub(crate) async fn unevict_timeline(&mut self) { assert!(self.is_offloaded); + let Some(storage) = self.wal_backup.get_storage() else { + warn!("no remote storage configured, skipping uneviction"); + return; + }; let partial_backup_uploaded = match &self.partial_backup_uploaded { Some(p) => p.clone(), None => { @@ -121,7 +129,7 @@ impl Manager { .inc(); }); - if let Err(e) = do_uneviction(self, &partial_backup_uploaded).await { + if let Err(e) = do_uneviction(self, &partial_backup_uploaded, &storage).await { warn!("failed to unevict timeline: {:?}", e); return; } @@ -137,8 +145,12 @@ impl Manager { /// Ensure that content matches the remote partial backup, if local segment exists. /// Then change state in control file and in-memory. If `delete_offloaded_wal` is set, /// delete the local segment. 
-async fn do_eviction(mgr: &mut Manager, partial: &PartialRemoteSegment) -> anyhow::Result<()> { - compare_local_segment_with_remote(mgr, partial).await?; +async fn do_eviction( + mgr: &mut Manager, + partial: &PartialRemoteSegment, + storage: &GenericRemoteStorage, +) -> anyhow::Result<()> { + compare_local_segment_with_remote(mgr, partial, storage).await?; mgr.tli.switch_to_offloaded(partial).await?; // switch manager state as soon as possible @@ -153,12 +165,16 @@ async fn do_eviction(mgr: &mut Manager, partial: &PartialRemoteSegment) -> anyho /// Ensure that content matches the remote partial backup, if local segment exists. /// Then download segment to local disk and change state in control file and in-memory. -async fn do_uneviction(mgr: &mut Manager, partial: &PartialRemoteSegment) -> anyhow::Result<()> { +async fn do_uneviction( + mgr: &mut Manager, + partial: &PartialRemoteSegment, + storage: &GenericRemoteStorage, +) -> anyhow::Result<()> { // if the local segment is present, validate it - compare_local_segment_with_remote(mgr, partial).await?; + compare_local_segment_with_remote(mgr, partial, storage).await?; // atomically download the partial segment - redownload_partial_segment(mgr, partial).await?; + redownload_partial_segment(mgr, partial, storage).await?; mgr.tli.switch_to_present().await?; // switch manager state as soon as possible @@ -181,6 +197,7 @@ async fn delete_local_segment(mgr: &Manager, partial: &PartialRemoteSegment) -> async fn redownload_partial_segment( mgr: &Manager, partial: &PartialRemoteSegment, + storage: &GenericRemoteStorage, ) -> anyhow::Result<()> { let tmp_file = mgr.tli.timeline_dir().join("remote_partial.tmp"); let remote_segfile = remote_segment_path(mgr, partial); @@ -190,7 +207,7 @@ async fn redownload_partial_segment( remote_segfile, tmp_file ); - let mut reader = wal_backup::read_object(&remote_segfile, 0).await?; + let mut reader = wal_backup::read_object(storage, &remote_segfile, 0).await?; let mut file = 
File::create(&tmp_file).await?; let actual_len = tokio::io::copy(&mut reader, &mut file).await?; @@ -234,13 +251,16 @@ async fn redownload_partial_segment( async fn compare_local_segment_with_remote( mgr: &Manager, partial: &PartialRemoteSegment, + storage: &GenericRemoteStorage, ) -> anyhow::Result<()> { let local_path = local_segment_path(mgr, partial); match File::open(&local_path).await { - Ok(mut local_file) => do_validation(mgr, &mut local_file, mgr.wal_seg_size, partial) - .await - .context("validation failed"), + Ok(mut local_file) => { + do_validation(mgr, &mut local_file, mgr.wal_seg_size, partial, storage) + .await + .context("validation failed") + } Err(_) => { info!( "local WAL file {} is not present, skipping validation", @@ -258,6 +278,7 @@ async fn do_validation( file: &mut File, wal_seg_size: usize, partial: &PartialRemoteSegment, + storage: &GenericRemoteStorage, ) -> anyhow::Result<()> { let local_size = file.metadata().await?.len() as usize; if local_size != wal_seg_size { @@ -270,7 +291,7 @@ async fn do_validation( let remote_segfile = remote_segment_path(mgr, partial); let mut remote_reader: std::pin::Pin> = - wal_backup::read_object(&remote_segfile, 0).await?; + wal_backup::read_object(storage, &remote_segfile, 0).await?; // remote segment should have bytes excatly up to `flush_lsn` let expected_remote_size = partial.flush_lsn.segment_offset(mgr.wal_seg_size); diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index 71e99a4de7..48eda92fed 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -35,7 +35,7 @@ use crate::state::TimelineState; use crate::timeline::{ManagerTimeline, ReadGuardSharedState, StateSK, WalResidentTimeline}; use crate::timeline_guard::{AccessService, GuardId, ResidenceGuard}; use crate::timelines_set::{TimelineSetGuard, TimelinesSet}; -use crate::wal_backup::{self, WalBackupTaskHandle}; +use crate::wal_backup::{self, WalBackup, WalBackupTaskHandle}; 
use crate::wal_backup_partial::{self, PartialBackup, PartialRemoteSegment}; pub(crate) struct StateSnapshot { @@ -200,6 +200,7 @@ pub(crate) struct Manager { pub(crate) conf: SafeKeeperConf, pub(crate) wal_seg_size: usize, pub(crate) walsenders: Arc, + pub(crate) wal_backup: Arc, // current state pub(crate) state_version_rx: tokio::sync::watch::Receiver, @@ -238,6 +239,7 @@ pub async fn main_task( manager_tx: tokio::sync::mpsc::UnboundedSender, mut manager_rx: tokio::sync::mpsc::UnboundedReceiver, global_rate_limiter: RateLimiter, + wal_backup: Arc, ) { tli.set_status(Status::Started); @@ -256,6 +258,7 @@ pub async fn main_task( broker_active_set, manager_tx, global_rate_limiter, + wal_backup, ) .await; @@ -371,7 +374,7 @@ pub async fn main_task( mgr.tli_broker_active.set(false); // shutdown background tasks - if mgr.conf.is_wal_backup_enabled() { + if let Some(storage) = mgr.wal_backup.get_storage() { if let Some(backup_task) = mgr.backup_task.take() { // If we fell through here, then the timeline is shutting down. This is important // because otherwise joining on the wal_backup handle might hang. 
@@ -379,7 +382,7 @@ pub async fn main_task( backup_task.join().await; } - wal_backup::update_task(&mut mgr, false, &last_state).await; + wal_backup::update_task(&mut mgr, storage, false, &last_state).await; } if let Some(recovery_task) = &mut mgr.recovery_task { @@ -415,11 +418,13 @@ impl Manager { broker_active_set: Arc, manager_tx: tokio::sync::mpsc::UnboundedSender, global_rate_limiter: RateLimiter, + wal_backup: Arc, ) -> Manager { let (is_offloaded, partial_backup_uploaded) = tli.bootstrap_mgr().await; Manager { wal_seg_size: tli.get_wal_seg_size().await, walsenders: tli.get_walsenders().clone(), + wal_backup, state_version_rx: tli.get_state_version_rx(), num_computes_rx: tli.get_walreceivers().get_num_rx(), tli_broker_active: broker_active_set.guard(tli.clone()), @@ -477,8 +482,8 @@ impl Manager { let is_wal_backup_required = wal_backup::is_wal_backup_required(self.wal_seg_size, num_computes, state); - if self.conf.is_wal_backup_enabled() { - wal_backup::update_task(self, is_wal_backup_required, state).await; + if let Some(storage) = self.wal_backup.get_storage() { + wal_backup::update_task(self, storage, is_wal_backup_required, state).await; } // update the state in Arc @@ -624,9 +629,9 @@ impl Manager { /// Spawns partial WAL backup task if needed. async fn update_partial_backup(&mut self, state: &StateSnapshot) { // check if WAL backup is enabled and should be started - if !self.conf.is_wal_backup_enabled() { + let Some(storage) = self.wal_backup.get_storage() else { return; - } + }; if self.partial_backup_task.is_some() { // partial backup is already running @@ -650,6 +655,7 @@ impl Manager { self.conf.clone(), self.global_rate_limiter.clone(), cancel.clone(), + storage, )); self.partial_backup_task = Some((handle, cancel)); } @@ -669,6 +675,10 @@ impl Manager { /// Reset partial backup state and remove its remote storage data. Since it /// might concurrently uploading something, cancel the task first. 
async fn backup_partial_reset(&mut self) -> anyhow::Result> { + let Some(storage) = self.wal_backup.get_storage() else { + anyhow::bail!("remote storage is not enabled"); + }; + info!("resetting partial backup state"); // Force unevict timeline if it is evicted before erasing partial backup // state. The intended use of this function is to drop corrupted remote @@ -689,7 +699,7 @@ impl Manager { } let tli = self.wal_resident_timeline()?; - let mut partial_backup = PartialBackup::new(tli, self.conf.clone()).await; + let mut partial_backup = PartialBackup::new(tli, self.conf.clone(), storage).await; // Reset might fail e.g. when cfile is already reset but s3 removal // failed, so set manager state to None beforehand. In any case caller // is expected to retry until success. diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 41abee369e..af33bcbd20 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -25,6 +25,7 @@ use crate::rate_limit::RateLimiter; use crate::state::TimelinePersistentState; use crate::timeline::{Timeline, TimelineError, delete_dir, get_tenant_dir, get_timeline_dir}; use crate::timelines_set::TimelinesSet; +use crate::wal_backup::WalBackup; use crate::wal_storage::Storage; use crate::{SafeKeeperConf, control_file, wal_storage}; @@ -47,15 +48,24 @@ struct GlobalTimelinesState { conf: Arc, broker_active_set: Arc, global_rate_limiter: RateLimiter, + wal_backup: Arc, } impl GlobalTimelinesState { /// Get dependencies for a timeline constructor. - fn get_dependencies(&self) -> (Arc, Arc, RateLimiter) { + fn get_dependencies( + &self, + ) -> ( + Arc, + Arc, + RateLimiter, + Arc, + ) { ( self.conf.clone(), self.broker_active_set.clone(), self.global_rate_limiter.clone(), + self.wal_backup.clone(), ) } @@ -84,7 +94,7 @@ pub struct GlobalTimelines { impl GlobalTimelines { /// Create a new instance of the global timelines map. 
- pub fn new(conf: Arc) -> Self { + pub fn new(conf: Arc, wal_backup: Arc) -> Self { Self { state: Mutex::new(GlobalTimelinesState { timelines: HashMap::new(), @@ -92,6 +102,7 @@ impl GlobalTimelines { conf, broker_active_set: Arc::new(TimelinesSet::default()), global_rate_limiter: RateLimiter::new(1, 1), + wal_backup, }), } } @@ -147,7 +158,7 @@ impl GlobalTimelines { /// just lock and unlock it for each timeline -- this function is called /// during init when nothing else is running, so this is fine. async fn load_tenant_timelines(&self, tenant_id: TenantId) -> Result<()> { - let (conf, broker_active_set, partial_backup_rate_limiter) = { + let (conf, broker_active_set, partial_backup_rate_limiter, wal_backup) = { let state = self.state.lock().unwrap(); state.get_dependencies() }; @@ -162,7 +173,7 @@ impl GlobalTimelines { TimelineId::from_str(timeline_dir_entry.file_name().to_str().unwrap_or("")) { let ttid = TenantTimelineId::new(tenant_id, timeline_id); - match Timeline::load_timeline(conf.clone(), ttid) { + match Timeline::load_timeline(conf.clone(), ttid, wal_backup.clone()) { Ok(tli) => { let mut shared_state = tli.write_shared_state().await; self.state @@ -175,6 +186,7 @@ impl GlobalTimelines { &conf, broker_active_set.clone(), partial_backup_rate_limiter.clone(), + wal_backup.clone(), ); } // If we can't load a timeline, it's most likely because of a corrupted @@ -212,6 +224,10 @@ impl GlobalTimelines { self.state.lock().unwrap().broker_active_set.clone() } + pub fn get_wal_backup(&self) -> Arc { + self.state.lock().unwrap().wal_backup.clone() + } + /// Create a new timeline with the given id. If the timeline already exists, returns /// an existing timeline. pub(crate) async fn create( @@ -222,7 +238,7 @@ impl GlobalTimelines { start_lsn: Lsn, commit_lsn: Lsn, ) -> Result> { - let (conf, _, _) = { + let (conf, _, _, _) = { let state = self.state.lock().unwrap(); if let Ok(timeline) = state.get(&ttid) { // Timeline already exists, return it. 
@@ -267,7 +283,7 @@ impl GlobalTimelines { check_tombstone: bool, ) -> Result> { // Check for existence and mark that we're creating it. - let (conf, broker_active_set, partial_backup_rate_limiter) = { + let (conf, broker_active_set, partial_backup_rate_limiter, wal_backup) = { let mut state = self.state.lock().unwrap(); match state.timelines.get(&ttid) { Some(GlobalMapTimeline::CreationInProgress) => { @@ -296,7 +312,14 @@ impl GlobalTimelines { }; // Do the actual move and reflect the result in the map. - match GlobalTimelines::install_temp_timeline(ttid, tmp_path, conf.clone()).await { + match GlobalTimelines::install_temp_timeline( + ttid, + tmp_path, + conf.clone(), + wal_backup.clone(), + ) + .await + { Ok(timeline) => { let mut timeline_shared_state = timeline.write_shared_state().await; let mut state = self.state.lock().unwrap(); @@ -314,6 +337,7 @@ impl GlobalTimelines { &conf, broker_active_set, partial_backup_rate_limiter, + wal_backup, ); drop(timeline_shared_state); Ok(timeline) @@ -336,6 +360,7 @@ impl GlobalTimelines { ttid: TenantTimelineId, tmp_path: &Utf8PathBuf, conf: Arc, + wal_backup: Arc, ) -> Result> { let tenant_path = get_tenant_dir(conf.as_ref(), &ttid.tenant_id); let timeline_path = get_timeline_dir(conf.as_ref(), &ttid); @@ -377,7 +402,7 @@ impl GlobalTimelines { // Do the move. durable_rename(tmp_path, &timeline_path, !conf.no_sync).await?; - Timeline::load_timeline(conf, ttid) + Timeline::load_timeline(conf, ttid, wal_backup) } /// Get a timeline from the global map. 
If it's not present, it doesn't exist on disk, diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 56f4a2faf9..0beb272a60 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -2,6 +2,7 @@ use std::cmp::min; use std::collections::HashSet; use std::num::NonZeroU32; use std::pin::Pin; +use std::sync::Arc; use std::time::Duration; use anyhow::{Context, Result}; @@ -17,7 +18,7 @@ use safekeeper_api::models::PeerInfo; use tokio::fs::File; use tokio::select; use tokio::sync::mpsc::{self, Receiver, Sender}; -use tokio::sync::{OnceCell, watch}; +use tokio::sync::watch; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::*; @@ -63,7 +64,12 @@ pub(crate) fn is_wal_backup_required( /// Based on peer information determine which safekeeper should offload; if it /// is me, run (per timeline) task, if not yet. OTOH, if it is not me and task /// is running, kill it. -pub(crate) async fn update_task(mgr: &mut Manager, need_backup: bool, state: &StateSnapshot) { +pub(crate) async fn update_task( + mgr: &mut Manager, + storage: Arc, + need_backup: bool, + state: &StateSnapshot, +) { let (offloader, election_dbg_str) = determine_offloader(&state.peers, state.backup_lsn, mgr.tli.ttid, &mgr.conf); let elected_me = Some(mgr.conf.my_id) == offloader; @@ -82,7 +88,12 @@ pub(crate) async fn update_task(mgr: &mut Manager, need_backup: bool, state: &St return; }; - let async_task = backup_task_main(resident, mgr.conf.backup_parallel_jobs, shutdown_rx); + let async_task = backup_task_main( + resident, + storage, + mgr.conf.backup_parallel_jobs, + shutdown_rx, + ); let handle = if mgr.conf.current_thread_runtime { tokio::spawn(async_task) @@ -169,33 +180,31 @@ fn determine_offloader( } } -static REMOTE_STORAGE: OnceCell> = OnceCell::const_new(); - -// Storage must be configured and initialized when this is called. 
-fn get_configured_remote_storage() -> &'static GenericRemoteStorage { - REMOTE_STORAGE - .get() - .expect("failed to get remote storage") - .as_ref() - .unwrap() +pub struct WalBackup { + storage: Option>, } -pub async fn init_remote_storage(conf: &SafeKeeperConf) { - // TODO: refactor REMOTE_STORAGE to avoid using global variables, and provide - // dependencies to all tasks instead. - REMOTE_STORAGE - .get_or_init(|| async { - if let Some(conf) = conf.remote_storage.as_ref() { - Some( - GenericRemoteStorage::from_config(conf) - .await - .expect("failed to create remote storage"), - ) - } else { - None +impl WalBackup { + /// Create a new WalBackup instance. + pub async fn new(conf: &SafeKeeperConf) -> Result { + if !conf.wal_backup_enabled { + return Ok(Self { storage: None }); + } + + match conf.remote_storage.as_ref() { + Some(config) => { + let storage = GenericRemoteStorage::from_config(config).await?; + Ok(Self { + storage: Some(Arc::new(storage)), + }) } - }) - .await; + None => Ok(Self { storage: None }), + } + } + + pub fn get_storage(&self) -> Option> { + self.storage.clone() + } } struct WalBackupTask { @@ -204,12 +213,14 @@ struct WalBackupTask { wal_seg_size: usize, parallel_jobs: usize, commit_lsn_watch_rx: watch::Receiver, + storage: Arc, } /// Offload single timeline. 
#[instrument(name = "wal_backup", skip_all, fields(ttid = %tli.ttid))] async fn backup_task_main( tli: WalResidentTimeline, + storage: Arc, parallel_jobs: usize, mut shutdown_rx: Receiver<()>, ) { @@ -223,6 +234,7 @@ async fn backup_task_main( timeline_dir: tli.get_timeline_dir(), timeline: tli, parallel_jobs, + storage, }; // task is spinned up only when wal_seg_size already initialized @@ -293,6 +305,7 @@ impl WalBackupTask { match backup_lsn_range( &self.timeline, + self.storage.clone(), &mut backup_lsn, commit_lsn, self.wal_seg_size, @@ -322,6 +335,7 @@ impl WalBackupTask { async fn backup_lsn_range( timeline: &WalResidentTimeline, + storage: Arc, backup_lsn: &mut Lsn, end_lsn: Lsn, wal_seg_size: usize, @@ -352,7 +366,12 @@ async fn backup_lsn_range( loop { let added_task = match iter.next() { Some(s) => { - uploads.push_back(backup_single_segment(s, timeline_dir, remote_timeline_path)); + uploads.push_back(backup_single_segment( + &storage, + s, + timeline_dir, + remote_timeline_path, + )); true } None => false, @@ -388,6 +407,7 @@ async fn backup_lsn_range( } async fn backup_single_segment( + storage: &GenericRemoteStorage, seg: &Segment, timeline_dir: &Utf8Path, remote_timeline_path: &RemotePath, @@ -395,7 +415,13 @@ async fn backup_single_segment( let segment_file_path = seg.file_path(timeline_dir)?; let remote_segment_path = seg.remote_path(remote_timeline_path); - let res = backup_object(&segment_file_path, &remote_segment_path, seg.size()).await; + let res = backup_object( + storage, + &segment_file_path, + &remote_segment_path, + seg.size(), + ) + .await; if res.is_ok() { BACKED_UP_SEGMENTS.inc(); } else { @@ -455,12 +481,11 @@ fn get_segments(start: Lsn, end: Lsn, seg_size: usize) -> Vec { } async fn backup_object( + storage: &GenericRemoteStorage, source_file: &Utf8Path, target_file: &RemotePath, size: usize, ) -> Result<()> { - let storage = get_configured_remote_storage(); - let file = File::open(&source_file) .await .with_context(|| format!("Failed 
to open file {source_file:?} for wal backup"))?; @@ -475,12 +500,11 @@ async fn backup_object( } pub(crate) async fn backup_partial_segment( + storage: &GenericRemoteStorage, source_file: &Utf8Path, target_file: &RemotePath, size: usize, ) -> Result<()> { - let storage = get_configured_remote_storage(); - let file = File::open(&source_file) .await .with_context(|| format!("Failed to open file {source_file:?} for wal backup"))?; @@ -504,25 +528,20 @@ pub(crate) async fn backup_partial_segment( } pub(crate) async fn copy_partial_segment( + storage: &GenericRemoteStorage, source: &RemotePath, destination: &RemotePath, ) -> Result<()> { - let storage = get_configured_remote_storage(); let cancel = CancellationToken::new(); storage.copy_object(source, destination, &cancel).await } pub async fn read_object( + storage: &GenericRemoteStorage, file_path: &RemotePath, offset: u64, ) -> anyhow::Result>> { - let storage = REMOTE_STORAGE - .get() - .context("Failed to get remote storage")? - .as_ref() - .context("No remote storage configured")?; - info!("segment download about to start from remote path {file_path:?} at offset {offset}"); let cancel = CancellationToken::new(); @@ -547,8 +566,10 @@ pub async fn read_object( /// Delete WAL files for the given timeline. Remote storage must be configured /// when called. -pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { - let storage = get_configured_remote_storage(); +pub async fn delete_timeline( + storage: &GenericRemoteStorage, + ttid: &TenantTimelineId, +) -> Result<()> { let remote_path = remote_timeline_path(ttid)?; // see DEFAULT_MAX_KEYS_PER_LIST_RESPONSE @@ -618,14 +639,14 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { } /// Used by wal_backup_partial. 
-pub async fn delete_objects(paths: &[RemotePath]) -> Result<()> { +pub async fn delete_objects(storage: &GenericRemoteStorage, paths: &[RemotePath]) -> Result<()> { let cancel = CancellationToken::new(); // not really used - let storage = get_configured_remote_storage(); storage.delete_objects(paths, &cancel).await } /// Copy segments from one timeline to another. Used in copy_timeline. pub async fn copy_s3_segments( + storage: &GenericRemoteStorage, wal_seg_size: usize, src_ttid: &TenantTimelineId, dst_ttid: &TenantTimelineId, @@ -634,12 +655,6 @@ pub async fn copy_s3_segments( ) -> Result<()> { const SEGMENTS_PROGRESS_REPORT_INTERVAL: u64 = 1024; - let storage = REMOTE_STORAGE - .get() - .expect("failed to get remote storage") - .as_ref() - .unwrap(); - let remote_dst_path = remote_timeline_path(dst_ttid)?; let cancel = CancellationToken::new(); diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index 049852a048..fe0f1b3607 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -19,9 +19,11 @@ //! file. Code updates state in the control file before doing any S3 operations. //! This way control file stores information about all potentially existing //! remote partial segments and can clean them up after uploading a newer version. 
+use std::sync::Arc; + use camino::Utf8PathBuf; use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo}; -use remote_storage::RemotePath; +use remote_storage::{GenericRemoteStorage, RemotePath}; use safekeeper_api::Term; use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; @@ -154,12 +156,16 @@ pub struct PartialBackup { conf: SafeKeeperConf, local_prefix: Utf8PathBuf, remote_timeline_path: RemotePath, - + storage: Arc, state: State, } impl PartialBackup { - pub async fn new(tli: WalResidentTimeline, conf: SafeKeeperConf) -> PartialBackup { + pub async fn new( + tli: WalResidentTimeline, + conf: SafeKeeperConf, + storage: Arc, + ) -> PartialBackup { let (_, persistent_state) = tli.get_state().await; let wal_seg_size = tli.get_wal_seg_size().await; @@ -173,6 +179,7 @@ impl PartialBackup { conf, local_prefix, remote_timeline_path, + storage, } } @@ -240,7 +247,8 @@ impl PartialBackup { let remote_path = prepared.remote_path(&self.remote_timeline_path); // Upload first `backup_bytes` bytes of the segment to the remote storage. - wal_backup::backup_partial_segment(&local_path, &remote_path, backup_bytes).await?; + wal_backup::backup_partial_segment(&self.storage, &local_path, &remote_path, backup_bytes) + .await?; PARTIAL_BACKUP_UPLOADED_BYTES.inc_by(backup_bytes as u64); // We uploaded the segment, now let's verify that the data is still actual. @@ -326,7 +334,7 @@ impl PartialBackup { let remote_path = self.remote_timeline_path.join(seg); objects_to_delete.push(remote_path); } - wal_backup::delete_objects(&objects_to_delete).await + wal_backup::delete_objects(&self.storage, &objects_to_delete).await } /// Delete all non-Uploaded segments from the remote storage. 
There should be only one @@ -424,6 +432,7 @@ pub async fn main_task( conf: SafeKeeperConf, limiter: RateLimiter, cancel: CancellationToken, + storage: Arc, ) -> Option { debug!("started"); let await_duration = conf.partial_backup_timeout; @@ -432,7 +441,7 @@ pub async fn main_task( let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx(); let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx(); - let mut backup = PartialBackup::new(tli, conf).await; + let mut backup = PartialBackup::new(tli, conf, storage).await; debug!("state: {:?}", backup.state); diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index f0bac4b40a..8ba3e7cc47 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -21,6 +21,7 @@ use postgres_ffi::waldecoder::WalStreamDecoder; use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo, dispatch_pgversion}; use pq_proto::SystemId; use remote_storage::RemotePath; +use std::sync::Arc; use tokio::fs::{self, File, OpenOptions, remove_file}; use tokio::io::{AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}; use tracing::*; @@ -32,7 +33,7 @@ use crate::metrics::{ REMOVED_WAL_SEGMENTS, WAL_STORAGE_OPERATION_SECONDS, WalStorageMetrics, time_io_closure, }; use crate::state::TimelinePersistentState; -use crate::wal_backup::{read_object, remote_timeline_path}; +use crate::wal_backup::{WalBackup, read_object, remote_timeline_path}; pub trait Storage { // Last written LSN. 
@@ -645,7 +646,7 @@ pub struct WalReader { wal_segment: Option>>, // S3 will be used to read WAL if LSN is not available locally - enable_remote_read: bool, + wal_backup: Arc, // We don't have WAL locally if LSN is less than local_start_lsn local_start_lsn: Lsn, @@ -664,7 +665,7 @@ impl WalReader { timeline_dir: Utf8PathBuf, state: &TimelinePersistentState, start_pos: Lsn, - enable_remote_read: bool, + wal_backup: Arc, ) -> Result { if state.server.wal_seg_size == 0 || state.local_start_lsn == Lsn(0) { bail!("state uninitialized, no data to read"); @@ -693,7 +694,7 @@ impl WalReader { wal_seg_size: state.server.wal_seg_size as usize, pos: start_pos, wal_segment: None, - enable_remote_read, + wal_backup, local_start_lsn: state.local_start_lsn, timeline_start_lsn: state.timeline_start_lsn, pg_version: state.server.pg_version / 10000, @@ -812,9 +813,9 @@ impl WalReader { } // Try to open remote file, if remote reads are enabled - if self.enable_remote_read { + if let Some(storage) = self.wal_backup.get_storage() { let remote_wal_file_path = self.remote_path.join(&wal_file_name); - return read_object(&remote_wal_file_path, xlogoff as u64).await; + return read_object(&storage, &remote_wal_file_path, xlogoff as u64).await; } bail!("WAL segment is not found") From baafcc5d4108b1be38edf428c3f3dd87cc0c9508 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Fri, 16 May 2025 14:12:39 +0000 Subject: [PATCH 53/65] proxy: Fix misspelled flag value alias, swap names and aliases (#11949) ## Problem There's a misspelled flag value alias that's not really used anywhere. ## Summary of changes Fix the alias and make aliases the official flag values and keep old values as aliases. Also rename enum variant. No need for it to carry the version now. 
--- proxy/src/binary/proxy.rs | 9 +++++---- proxy/src/context/mod.rs | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index 4cb5ddc335..51713902bc 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -43,11 +43,12 @@ project_build_tag!(BUILD_TAG); use clap::{Parser, ValueEnum}; #[derive(Clone, Debug, ValueEnum)] +#[clap(rename_all = "kebab-case")] enum AuthBackendType { - #[value(name("cplane-v1"), alias("control-plane"))] - ControlPlaneV1, + #[clap(alias("cplane-v1"))] + ControlPlane, - #[value(name("link"), alias("control-redirect"))] + #[clap(alias("link"))] ConsoleRedirect, #[cfg(any(test, feature = "testing"))] @@ -707,7 +708,7 @@ fn build_auth_backend( args: &ProxyCliArgs, ) -> anyhow::Result, &'static ConsoleRedirectBackend>> { match &args.auth_backend { - AuthBackendType::ControlPlaneV1 => { + AuthBackendType::ControlPlane => { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let project_info_cache_config: ProjectInfoCacheOptions = args.project_info_cache.parse()?; diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index 5f649d2b21..79aaf22990 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -78,7 +78,7 @@ struct RequestContextInner { #[derive(Clone, Debug)] pub(crate) enum AuthMethod { - // aka passwordless, fka link + // aka link ConsoleRedirect, ScramSha256, ScramSha256Plus, From 55f91cf10b30c3c648ac1301b95cd049bd7f0e21 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 16 May 2025 17:45:08 +0300 Subject: [PATCH 54/65] Update 'nix' package (#11948) There were some incompatible changes. Most churn was from switching from the now-deprecated fcntl::flock() function to fcntl::Flock::lock(). The new function returns a guard object, while with the old function, the lock was associated directly with the file descriptor. 
It's good to stay up-to-date in general, but the impetus to do this now is that in https://github.com/neondatabase/neon/pull/11929, I want to use some functions that were added only in the latest version of 'nix', and it's nice to not have to build multiple versions. (Although, different versions of 'nix' are still pulled in as indirect dependencies from other packages) --- Cargo.lock | 25 +++++--- Cargo.toml | 2 +- control_plane/src/background_process.rs | 4 +- control_plane/src/bin/neon_local.rs | 13 ++-- libs/utils/src/crashsafe.rs | 6 +- libs/utils/src/fs_ext/rename_noreplace.rs | 4 +- libs/utils/src/lock_file.rs | 63 ++++++++++--------- pageserver/src/tenant/secondary/downloader.rs | 4 +- pageserver/src/virtual_file.rs | 2 +- 9 files changed, 66 insertions(+), 57 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f075b45e49..1edd20105d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1112,6 +1112,12 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "cfg_aliases" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" + [[package]] name = "cgroups-rs" version = "0.3.3" @@ -1306,7 +1312,7 @@ dependencies = [ "itertools 0.10.5", "jsonwebtoken", "metrics", - "nix 0.27.1", + "nix 0.30.1", "notify", "num_cpus", "once_cell", @@ -1429,7 +1435,7 @@ dependencies = [ "humantime-serde", "hyper 0.14.30", "jsonwebtoken", - "nix 0.27.1", + "nix 0.30.1", "once_cell", "pageserver_api", "pageserver_client", @@ -3512,9 +3518,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.169" +version = "0.2.172" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" +checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" 
[[package]] name = "libloading" @@ -3821,12 +3827,13 @@ dependencies = [ [[package]] name = "nix" -version = "0.27.1" +version = "0.30.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053" +checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6" dependencies = [ "bitflags 2.8.0", "cfg-if", + "cfg_aliases", "libc", "memoffset 0.9.0", ] @@ -4280,7 +4287,7 @@ dependencies = [ "jsonwebtoken", "md5", "metrics", - "nix 0.27.1", + "nix 0.30.1", "num-traits", "num_cpus", "once_cell", @@ -4356,7 +4363,7 @@ dependencies = [ "humantime", "humantime-serde", "itertools 0.10.5", - "nix 0.27.1", + "nix 0.30.1", "once_cell", "postgres_backend", "postgres_ffi", @@ -7899,7 +7906,7 @@ dependencies = [ "humantime", "jsonwebtoken", "metrics", - "nix 0.27.1", + "nix 0.30.1", "once_cell", "pem", "pin-project-lite", diff --git a/Cargo.toml b/Cargo.toml index 6b87ce549d..d6fffe7768 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -127,7 +127,7 @@ md5 = "0.7.0" measured = { version = "0.0.22", features=["lasso"] } measured-process = { version = "0.0.22" } memoffset = "0.9" -nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] } +nix = { version = "0.30.1", features = ["dir", "fs", "process", "socket", "signal", "poll"] } # Do not update to >= 7.0.0, at least. The update will have a significant impact # on compute startup metrics (start_postgres_ms), >= 25% degradation. 
notify = "6.0.0" diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index 1eac4f7ff0..4f0934e411 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -14,7 +14,7 @@ use std::ffi::OsStr; use std::io::Write; -use std::os::unix::prelude::AsRawFd; +use std::os::fd::AsFd; use std::os::unix::process::CommandExt; use std::path::Path; use std::process::Command; @@ -356,7 +356,7 @@ where let file = pid_file::claim_for_current_process(&path).expect("claim pid file"); // Remove the FD_CLOEXEC flag on the pidfile descriptor so that the pidfile // remains locked after exec. - nix::fcntl::fcntl(file.as_raw_fd(), FcntlArg::F_SETFD(FdFlag::empty())) + nix::fcntl::fcntl(file.as_fd(), FcntlArg::F_SETFD(FdFlag::empty())) .expect("remove FD_CLOEXEC"); // Don't run drop(file), it would close the file before we actually exec. std::mem::forget(file); diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 191a22f1de..98ab6e5657 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -8,7 +8,6 @@ use std::borrow::Cow; use std::collections::{BTreeSet, HashMap}; use std::fs::File; -use std::os::fd::AsRawFd; use std::path::PathBuf; use std::process::exit; use std::str::FromStr; @@ -31,7 +30,7 @@ use control_plane::safekeeper::SafekeeperNode; use control_plane::storage_controller::{ NeonStorageControllerStartArgs, NeonStorageControllerStopArgs, StorageController, }; -use nix::fcntl::{FlockArg, flock}; +use nix::fcntl::{Flock, FlockArg}; use pageserver_api::config::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT, @@ -749,16 +748,16 @@ struct TimelineTreeEl { /// A flock-based guard over the neon_local repository directory struct RepoLock { - _file: File, + _file: Flock, } impl RepoLock { fn new() -> Result { let repo_dir = File::open(local_env::base_path())?; - let 
repo_dir_fd = repo_dir.as_raw_fd(); - flock(repo_dir_fd, FlockArg::LockExclusive)?; - - Ok(Self { _file: repo_dir }) + match Flock::lock(repo_dir, FlockArg::LockExclusive) { + Ok(f) => Ok(Self { _file: f }), + Err((_, e)) => Err(e).context("flock error"), + } } } diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs index 215fa36df4..45acaf682f 100644 --- a/libs/utils/src/crashsafe.rs +++ b/libs/utils/src/crashsafe.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; use std::fs::{self, File}; use std::io::{self, Write}; -use std::os::fd::AsRawFd; +use std::os::fd::AsFd; use camino::{Utf8Path, Utf8PathBuf}; @@ -210,13 +210,13 @@ pub fn overwrite( /// Syncs the filesystem for the given file descriptor. #[cfg_attr(target_os = "macos", allow(unused_variables))] -pub fn syncfs(fd: impl AsRawFd) -> anyhow::Result<()> { +pub fn syncfs(fd: impl AsFd) -> anyhow::Result<()> { // Linux guarantees durability for syncfs. // POSIX doesn't have syncfs, and further does not actually guarantee durability of sync(). #[cfg(target_os = "linux")] { use anyhow::Context; - nix::unistd::syncfs(fd.as_raw_fd()).context("syncfs")?; + nix::unistd::syncfs(fd).context("syncfs")?; } #[cfg(target_os = "macos")] { diff --git a/libs/utils/src/fs_ext/rename_noreplace.rs b/libs/utils/src/fs_ext/rename_noreplace.rs index d0c07353d0..c945ecadf0 100644 --- a/libs/utils/src/fs_ext/rename_noreplace.rs +++ b/libs/utils/src/fs_ext/rename_noreplace.rs @@ -11,9 +11,9 @@ pub fn rename_noreplace( #[cfg(all(target_os = "linux", target_env = "gnu"))] { nix::fcntl::renameat2( - None, + nix::fcntl::AT_FDCWD, src, - None, + nix::fcntl::AT_FDCWD, dst, nix::fcntl::RenameFlags::RENAME_NOREPLACE, ) diff --git a/libs/utils/src/lock_file.rs b/libs/utils/src/lock_file.rs index 6aeeeca021..b3c8d74d7d 100644 --- a/libs/utils/src/lock_file.rs +++ b/libs/utils/src/lock_file.rs @@ -1,6 +1,6 @@ //! A module to create and read lock files. //! -//! File locking is done using [`fcntl::flock`] exclusive locks. +//! 
File locking is done using [`nix::fcntl::Flock`] exclusive locks. //! The only consumer of this module is currently //! [`pid_file`](crate::pid_file). See the module-level comment //! there for potential pitfalls with lock files that are used @@ -9,26 +9,25 @@ use std::fs; use std::io::{Read, Write}; use std::ops::Deref; -use std::os::unix::prelude::AsRawFd; use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use nix::errno::Errno::EAGAIN; -use nix::fcntl; +use nix::fcntl::{Flock, FlockArg}; use crate::crashsafe; -/// A handle to an open and unlocked, but not-yet-written lock file. +/// A handle to an open and flocked, but not-yet-written lock file. /// Returned by [`create_exclusive`]. #[must_use] pub struct UnwrittenLockFile { path: Utf8PathBuf, - file: fs::File, + file: Flock, } /// Returned by [`UnwrittenLockFile::write_content`]. #[must_use] -pub struct LockFileGuard(fs::File); +pub struct LockFileGuard(Flock); impl Deref for LockFileGuard { type Target = fs::File; @@ -67,17 +66,14 @@ pub fn create_exclusive(lock_file_path: &Utf8Path) -> anyhow::Result Ok(UnwrittenLockFile { + Ok(lock_file) => Ok(UnwrittenLockFile { path: lock_file_path.to_owned(), file: lock_file, }), - Err(EAGAIN) => anyhow::bail!("file is already locked"), - Err(e) => Err(e).context("flock error"), + Err((_, EAGAIN)) => anyhow::bail!("file is already locked"), + Err((_, e)) => Err(e).context("flock error"), } } @@ -105,32 +101,37 @@ pub enum LockFileRead { /// Check the [`LockFileRead`] variants for details. 
pub fn read_and_hold_lock_file(path: &Utf8Path) -> anyhow::Result { let res = fs::OpenOptions::new().read(true).open(path); - let mut lock_file = match res { + let lock_file = match res { Ok(f) => f, Err(e) => match e.kind() { std::io::ErrorKind::NotFound => return Ok(LockFileRead::NotExist), _ => return Err(e).context("open lock file"), }, }; - let res = fcntl::flock( - lock_file.as_raw_fd(), - fcntl::FlockArg::LockExclusiveNonblock, - ); + let res = Flock::lock(lock_file, FlockArg::LockExclusiveNonblock); // We need the content regardless of lock success / failure. // But, read it after flock so that, if it succeeded, the content is consistent. - let mut content = String::new(); - lock_file - .read_to_string(&mut content) - .context("read lock file")?; match res { - Ok(()) => Ok(LockFileRead::NotHeldByAnyProcess( - LockFileGuard(lock_file), - content, - )), - Err(EAGAIN) => Ok(LockFileRead::LockedByOtherProcess { - not_locked_file: lock_file, - content, - }), - Err(e) => Err(e).context("flock error"), + Ok(mut locked_file) => { + let mut content = String::new(); + locked_file + .read_to_string(&mut content) + .context("read lock file")?; + Ok(LockFileRead::NotHeldByAnyProcess( + LockFileGuard(locked_file), + content, + )) + } + Err((mut not_locked_file, EAGAIN)) => { + let mut content = String::new(); + not_locked_file + .read_to_string(&mut content) + .context("read lock file")?; + Ok(LockFileRead::LockedByOtherProcess { + not_locked_file, + content, + }) + } + Err((_, e)) => Err(e).context("flock error"), } } diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index c26b7626ef..dd49c843f3 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -668,7 +668,9 @@ impl From for UpdateError { impl From for UpdateError { fn from(value: std::io::Error) -> Self { - if let Some(nix::errno::Errno::ENOSPC) = value.raw_os_error().map(nix::errno::from_i32) { + if 
let Some(nix::errno::Errno::ENOSPC) = + value.raw_os_error().map(nix::errno::Errno::from_raw) + { UpdateError::NoSpace } else if value .get_ref() diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index c707d35114..45b6e44c54 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -408,7 +408,7 @@ impl OpenFiles { /// error types may be elegible for retry. pub(crate) fn is_fatal_io_error(e: &std::io::Error) -> bool { use nix::errno::Errno::*; - match e.raw_os_error().map(nix::errno::from_i32) { + match e.raw_os_error().map(nix::errno::Errno::from_raw) { Some(EIO) => { // Terminate on EIO because we no longer trust the device to store // data safely, or to uphold persistence guarantees on fsync. From 532d9b646e4eaab6e0d94da8a6f890a9c834647c Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 17 May 2025 00:22:36 +0300 Subject: [PATCH 55/65] Add simple facility for an extendable shared memory area (#11929) You still need to provide a max size up-front, but memory is only allocated for the portion that is in use. The module is currently unused, but will be used by the new compute communicator project, in the neon Postgres extension. 
See https://github.com/neondatabase/neon/issues/11729 --------- Co-authored-by: Erik Grinaker --- Cargo.lock | 11 + Cargo.toml | 3 +- libs/neon-shmem/Cargo.toml | 13 ++ libs/neon-shmem/src/lib.rs | 418 +++++++++++++++++++++++++++++++++++++ workspace_hack/Cargo.toml | 3 +- 5 files changed, 446 insertions(+), 2 deletions(-) create mode 100644 libs/neon-shmem/Cargo.toml create mode 100644 libs/neon-shmem/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 1edd20105d..8ca65b58ce 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3794,6 +3794,16 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" +[[package]] +name = "neon-shmem" +version = "0.1.0" +dependencies = [ + "nix 0.30.1", + "tempfile", + "thiserror 1.0.69", + "workspace_hack", +] + [[package]] name = "never-say-never" version = "6.6.666" @@ -8482,6 +8492,7 @@ dependencies = [ "log", "memchr", "nix 0.26.4", + "nix 0.30.1", "nom", "num", "num-bigint", diff --git a/Cargo.toml b/Cargo.toml index d6fffe7768..74b281f88f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,6 +23,7 @@ members = [ "libs/postgres_ffi", "libs/safekeeper_api", "libs/desim", + "libs/neon-shmem", "libs/utils", "libs/consumption_metrics", "libs/postgres_backend", @@ -127,7 +128,7 @@ md5 = "0.7.0" measured = { version = "0.0.22", features=["lasso"] } measured-process = { version = "0.0.22" } memoffset = "0.9" -nix = { version = "0.30.1", features = ["dir", "fs", "process", "socket", "signal", "poll"] } +nix = { version = "0.30.1", features = ["dir", "fs", "mman", "process", "socket", "signal", "poll"] } # Do not update to >= 7.0.0, at least. The update will have a significant impact # on compute startup metrics (start_postgres_ms), >= 25% degradation. 
notify = "6.0.0" diff --git a/libs/neon-shmem/Cargo.toml b/libs/neon-shmem/Cargo.toml new file mode 100644 index 0000000000..2a636bec40 --- /dev/null +++ b/libs/neon-shmem/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "neon-shmem" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +thiserror.workspace = true +nix.workspace=true +workspace_hack = { version = "0.1", path = "../../workspace_hack" } + +[target.'cfg(target_os = "macos")'.dependencies] +tempfile = "3.14.0" diff --git a/libs/neon-shmem/src/lib.rs b/libs/neon-shmem/src/lib.rs new file mode 100644 index 0000000000..e1b14b1371 --- /dev/null +++ b/libs/neon-shmem/src/lib.rs @@ -0,0 +1,418 @@ +//! Shared memory utilities for neon communicator + +use std::num::NonZeroUsize; +use std::os::fd::{AsFd, BorrowedFd, OwnedFd}; +use std::ptr::NonNull; +use std::sync::atomic::{AtomicUsize, Ordering}; + +use nix::errno::Errno; +use nix::sys::mman::MapFlags; +use nix::sys::mman::ProtFlags; +use nix::sys::mman::mmap as nix_mmap; +use nix::sys::mman::munmap as nix_munmap; +use nix::unistd::ftruncate as nix_ftruncate; + +/// ShmemHandle represents a shared memory area that can be shared by processes over fork(). +/// Unlike shared memory allocated by Postgres, this area is resizable, up to 'max_size' that's +/// specified at creation. +/// +/// The area is backed by an anonymous file created with memfd_create(). The full address space for +/// 'max_size' is reserved up-front with mmap(), but whenever you call [`ShmemHandle::set_size`], +/// the underlying file is resized. Do not access the area beyond the current size. Currently, that +/// will cause the file to be expanded, but we might use mprotect() etc. to enforce that in the +/// future. +pub struct ShmemHandle { + /// memfd file descriptor + fd: OwnedFd, + + max_size: usize, + + // Pointer to the beginning of the shared memory area. The header is stored there. 
+ shared_ptr: NonNull, + + // Pointer to the beginning of the user data + pub data_ptr: NonNull, +} + +/// This is stored at the beginning in the shared memory area. +struct SharedStruct { + max_size: usize, + + /// Current size of the backing file. The high-order bit is used for the RESIZE_IN_PROGRESS flag + current_size: AtomicUsize, +} + +const RESIZE_IN_PROGRESS: usize = 1 << 63; + +const HEADER_SIZE: usize = std::mem::size_of::(); + +/// Error type returned by the ShmemHandle functions. +#[derive(thiserror::Error, Debug)] +#[error("{msg}: {errno}")] +pub struct Error { + pub msg: String, + pub errno: Errno, +} + +impl Error { + fn new(msg: &str, errno: Errno) -> Error { + Error { + msg: msg.to_string(), + errno, + } + } +} + +impl ShmemHandle { + /// Create a new shared memory area. To communicate between processes, the processes need to be + /// fork()'d after calling this, so that the ShmemHandle is inherited by all processes. + /// + /// If the ShmemHandle is dropped, the memory is unmapped from the current process. Other + /// processes can continue using it, however. + pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result { + // create the backing anonymous file. + let fd = create_backing_file(name)?; + + Self::new_with_fd(fd, initial_size, max_size) + } + + fn new_with_fd( + fd: OwnedFd, + initial_size: usize, + max_size: usize, + ) -> Result { + // We reserve the high-order bit for the RESIZE_IN_PROGRESS flag, and the actual size + // is a little larger than this because of the SharedStruct header. Make the upper limit + // somewhat smaller than that, because with anything close to that, you'll run out of + // memory anyway. + if max_size >= 1 << 48 { + panic!("max size {} too large", max_size); + } + if initial_size > max_size { + panic!("initial size {initial_size} larger than max size {max_size}"); + } + + // The actual initial / max size is the one given by the caller, plus the size of + // 'SharedStruct'. 
+ let initial_size = HEADER_SIZE + initial_size; + let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap(); + + // Reserve address space for it with mmap + // + // TODO: Use MAP_HUGETLB if possible + let start_ptr = unsafe { + nix_mmap( + None, + max_size, + ProtFlags::PROT_READ | ProtFlags::PROT_WRITE, + MapFlags::MAP_SHARED, + &fd, + 0, + ) + } + .map_err(|e| Error::new("mmap failed: {e}", e))?; + + // Reserve space for the initial size + enlarge_file(fd.as_fd(), initial_size as u64)?; + + // Initialize the header + let shared: NonNull = start_ptr.cast(); + unsafe { + shared.write(SharedStruct { + max_size: max_size.into(), + current_size: AtomicUsize::new(initial_size), + }) + }; + + // The user data begins after the header + let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) }; + + Ok(ShmemHandle { + fd, + max_size: max_size.into(), + shared_ptr: shared, + data_ptr, + }) + } + + // return reference to the header + fn shared(&self) -> &SharedStruct { + unsafe { self.shared_ptr.as_ref() } + } + + /// Resize the shared memory area. 'new_size' must not be larger than the 'max_size' specified + /// when creating the area. + /// + /// This may only be called from one process/thread concurrently. We detect that case + /// and return an Error. + pub fn set_size(&self, new_size: usize) -> Result<(), Error> { + let new_size = new_size + HEADER_SIZE; + let shared = self.shared(); + + if new_size > self.max_size { + panic!( + "new size ({} is greater than max size ({})", + new_size, self.max_size + ); + } + assert_eq!(self.max_size, shared.max_size); + + // Lock the area by setting the bit in 'current_size' + // + // Ordering::Relaxed would probably be sufficient here, as we don't access any other memory + // and the posix_fallocate/ftruncate call is surely a synchronization point anyway. But + // since this is not performance-critical, better safe than sorry . 
+ let mut old_size = shared.current_size.load(Ordering::Acquire); + loop { + if (old_size & RESIZE_IN_PROGRESS) != 0 { + return Err(Error::new( + "concurrent resize detected", + Errno::UnknownErrno, + )); + } + match shared.current_size.compare_exchange( + old_size, + new_size, + Ordering::Acquire, + Ordering::Relaxed, + ) { + Ok(_) => break, + Err(x) => old_size = x, + } + } + + // Ok, we got the lock. + // + // NB: If anything goes wrong, we *must* clear the bit! + let result = { + use std::cmp::Ordering::{Equal, Greater, Less}; + match new_size.cmp(&old_size) { + Less => nix_ftruncate(&self.fd, new_size as i64).map_err(|e| { + Error::new("could not shrink shmem segment, ftruncate failed: {e}", e) + }), + Equal => Ok(()), + Greater => enlarge_file(self.fd.as_fd(), new_size as u64), + } + }; + + // Unlock + shared.current_size.store( + if result.is_ok() { new_size } else { old_size }, + Ordering::Release, + ); + + result + } + + /// Returns the current user-visible size of the shared memory segment. + /// + /// NOTE: a concurrent set_size() call can change the size at any time. It is the caller's + /// responsibility not to access the area beyond the current size. + pub fn current_size(&self) -> usize { + let total_current_size = + self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS; + total_current_size - HEADER_SIZE + } +} + +impl Drop for ShmemHandle { + fn drop(&mut self) { + // SAFETY: The pointer was obtained from mmap() with the given size. + // We unmap the entire region. + let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) }; + // The fd is dropped automatically by OwnedFd. + } +} + +/// Create a "backing file" for the shared memory area. On Linux, use memfd_create(), to create an +/// anonymous in-memory file. On macos, fall back to a regular file. That's good enough for +/// development and testing, but in production we want the file to stay in memory. 
+/// +/// disable 'unused_variables' warnings, because in the macos path, 'name' is unused. +#[allow(unused_variables)] +fn create_backing_file(name: &str) -> Result { + #[cfg(not(target_os = "macos"))] + { + nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty()) + .map_err(|e| Error::new("memfd_create failed: {e}", e)) + } + #[cfg(target_os = "macos")] + { + let file = tempfile::tempfile().map_err(|e| { + Error::new( + "could not create temporary file to back shmem area: {e}", + nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)), + ) + })?; + Ok(OwnedFd::from(file)) + } +} + +fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> { + // Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that + // we don't get a segfault later when trying to actually use it. + #[cfg(not(target_os = "macos"))] + { + nix::fcntl::posix_fallocate(fd, 0, size as i64).map_err(|e| { + Error::new( + "could not grow shmem segment, posix_fallocate failed: {e}", + e, + ) + }) + } + // As a fallback on macos, which doesn't have posix_fallocate, use plain 'ftruncate' + #[cfg(target_os = "macos")] + { + nix::unistd::ftruncate(fd, size as i64) + .map_err(|e| Error::new("could not grow shmem segment, ftruncate failed: {e}", e)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use nix::unistd::ForkResult; + use std::ops::Range; + + /// check that all bytes in given range have the expected value. 
+ fn assert_range(ptr: *const u8, expected: u8, range: Range) { + for i in range { + let b = unsafe { *(ptr.add(i)) }; + assert_eq!(expected, b, "unexpected byte at offset {}", i); + } + } + + /// Write 'b' to all bytes in the given range + fn write_range(ptr: *mut u8, b: u8, range: Range) { + unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) }; + } + + // simple single-process test of growing and shrinking + #[test] + fn test_shmem_resize() -> Result<(), Error> { + let max_size = 1024 * 1024; + let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?; + + assert_eq!(init_struct.current_size(), 0); + + // Initial grow + let size1 = 10000; + init_struct.set_size(size1).unwrap(); + assert_eq!(init_struct.current_size(), size1); + + // Write some data + let data_ptr = init_struct.data_ptr.as_ptr(); + write_range(data_ptr, 0xAA, 0..size1); + assert_range(data_ptr, 0xAA, 0..size1); + + // Shrink + let size2 = 5000; + init_struct.set_size(size2).unwrap(); + assert_eq!(init_struct.current_size(), size2); + + // Grow again + let size3 = 20000; + init_struct.set_size(size3).unwrap(); + assert_eq!(init_struct.current_size(), size3); + + // Try to read it. The area that was shrunk and grown again should read as all zeros now + assert_range(data_ptr, 0xAA, 0..5000); + assert_range(data_ptr, 0, 5000..size1); + + // Try to grow beyond max_size + //let size4 = max_size + 1; + //assert!(init_struct.set_size(size4).is_err()); + + // Dropping init_struct should unmap the memory + drop(init_struct); + + Ok(()) + } + + /// This is used in tests to coordinate between test processes. It's like std::sync::Barrier, + /// but is stored in the shared memory area and works across processes. It's implemented by + /// polling, because e.g. standard rust mutexes are not guaranteed to work across processes. 
+ struct SimpleBarrier { + num_procs: usize, + count: AtomicUsize, + } + + impl SimpleBarrier { + unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) { + unsafe { + *ptr = SimpleBarrier { + num_procs, + count: AtomicUsize::new(0), + } + } + } + + pub fn wait(&self) { + let old = self.count.fetch_add(1, Ordering::Relaxed); + + let generation = old / self.num_procs; + + let mut current = old + 1; + while current < (generation + 1) * self.num_procs { + std::thread::sleep(std::time::Duration::from_millis(10)); + current = self.count.load(Ordering::Relaxed); + } + } + } + + #[test] + fn test_multi_process() { + // Initialize + let max_size = 1_000_000_000_000; + let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap(); + let ptr = init_struct.data_ptr.as_ptr(); + + // Store the SimpleBarrier in the first 1k of the area. + init_struct.set_size(10000).unwrap(); + let barrier_ptr: *mut SimpleBarrier = unsafe { + ptr.add(ptr.align_offset(std::mem::align_of::())) + .cast() + }; + unsafe { SimpleBarrier::init(barrier_ptr, 2) }; + let barrier = unsafe { barrier_ptr.as_ref().unwrap() }; + + // Fork another test process. The code after this runs in both processes concurrently. + let fork_result = unsafe { nix::unistd::fork().unwrap() }; + + // In the parent, fill bytes between 1000..2000. In the child, between 2000..3000 + if fork_result.is_parent() { + write_range(ptr, 0xAA, 1000..2000); + } else { + write_range(ptr, 0xBB, 2000..3000); + } + barrier.wait(); + // Verify the contents. 
(in both processes) + assert_range(ptr, 0xAA, 1000..2000); + assert_range(ptr, 0xBB, 2000..3000); + + // Grow, from the child this time + let size = 10_000_000; + if !fork_result.is_parent() { + init_struct.set_size(size).unwrap(); + } + barrier.wait(); + + // make some writes at the end + if fork_result.is_parent() { + write_range(ptr, 0xAA, (size - 10)..size); + } else { + write_range(ptr, 0xBB, (size - 20)..(size - 10)); + } + barrier.wait(); + + // Verify the contents. (This runs in both processes) + assert_range(ptr, 0, (size - 1000)..(size - 20)); + assert_range(ptr, 0xBB, (size - 20)..(size - 10)); + assert_range(ptr, 0xAA, (size - 10)..size); + + if let ForkResult::Parent { child } = fork_result { + nix::sys::wait::waitpid(child, None).unwrap(); + } + } +} diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index fecf62f756..69d44b82ea 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -60,7 +60,8 @@ lazy_static = { version = "1", default-features = false, features = ["spin_no_st libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } -nix = { version = "0.26" } +nix-2f80eeee3b1b6c7e = { package = "nix", version = "0.26" } +nix-fa1f6196edfd7249 = { package = "nix", version = "0.30", features = ["dir", "ioctl", "mman", "poll", "signal", "socket"] } nom = { version = "7" } num = { version = "0.4" } num-bigint = { version = "0.4" } From deed46015dd5eaa2dcc48f5f17f3e923a13e6711 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Sat, 17 May 2025 08:34:54 +0200 Subject: [PATCH 56/65] CI(test-images): increase timeout from 20m to 60m (#11955) ## Problem For some reason (unknown yet) 20m timeout is not enough for `test-images` job on arm runners. 
Ref: https://github.com/neondatabase/neon/actions/runs/15075321681/job/42387530399?pr=11953 ## Summary of changes - Increase the timeout from 20m to 1h --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 6b19f6ef01..a887db2ab1 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -963,7 +963,7 @@ jobs: fi - name: Verify docker-compose example and test extensions - timeout-minutes: 20 + timeout-minutes: 60 env: TAG: >- ${{ From 8e05639dbf6def383da7b138e28cf930ac506647 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sat, 17 May 2025 22:06:59 +0300 Subject: [PATCH 57/65] Invalidate LFC after unlogged build (#11951) ## Problem See https://neondb.slack.com/archives/C04DGM6SMTM/p1747391617951239 LFC is not always properly updated during unlogged build so it can contain stale content. ## Summary of changes Invalidate LFC content at the end of unlogged build Co-authored-by: Konstantin Knizhnik --- pgxn/neon/file_cache.c | 38 ++++++++++++++++++++++++++++++++++++++ pgxn/neon/file_cache.h | 1 + pgxn/neon/pagestore_smgr.c | 19 ++----------------- 3 files changed, 41 insertions(+), 17 deletions(-) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index ecc55bb540..176fd9643f 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -936,6 +936,44 @@ lfc_prewarm_main(Datum main_arg) lfc_ctl->prewarm_workers[worker_id].completed = GetCurrentTimestamp(); } +void +lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks) +{ + BufferTag tag; + FileCacheEntry *entry; + uint32 hash; + + if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ + return; + + CopyNRelFileInfoToBufTag(tag, rinfo); + tag.forkNum = forkNum; + + CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); + + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + if (LFC_ENABLED()) + 
{ + for (BlockNumber blkno = 0; blkno < nblocks; blkno += lfc_blocks_per_chunk) + { + tag.blockNum = blkno; + hash = get_hash_value(lfc_hash, &tag); + entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); + if (entry != NULL) + { + for (int i = 0; i < lfc_blocks_per_chunk; i++) + { + if (GET_STATE(entry, i) == AVAILABLE) + { + lfc_ctl->used_pages -= 1; + SET_STATE(entry, i, UNAVAILABLE); + } + } + } + } + } + LWLockRelease(lfc_lock); +} /* * Check if page is present in the cache. diff --git a/pgxn/neon/file_cache.h b/pgxn/neon/file_cache.h index c7b6b09f72..d5ac55d5ba 100644 --- a/pgxn/neon/file_cache.h +++ b/pgxn/neon/file_cache.h @@ -28,6 +28,7 @@ typedef struct FileCacheState extern bool lfc_store_prefetch_result; /* functions for local file cache */ +extern void lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks); extern void lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void *const *buffers, BlockNumber nblocks); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 31e47db7d7..5558a903e2 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -919,9 +919,6 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: mdextend(reln, forkNum, blkno, buffer, skipFsync); - /* Update LFC in case of unlogged index build */ - if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2) - lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer); return; default: @@ -1010,14 +1007,6 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: mdzeroextend(reln, forkNum, blocknum, nblocks, skipFsync); - /* Update LFC in case of unlogged index build */ - if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2) - { - for (int i = 0; i < nblocks; i++) - { - 
lfc_write(InfoFromSMgrRel(reln), forkNum, blocknum + i, buffer.data); - } - } return; default: @@ -1617,9 +1606,6 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo #else mdwrite(reln, forknum, blocknum, buffer, skipFsync); #endif - /* Update LFC in case of unlogged index build */ - if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2) - lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer); return; default: neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); @@ -1685,9 +1671,6 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync); - /* Update LFC in case of unlogged index build */ - if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2) - lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks); return; default: neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); @@ -2083,6 +2066,8 @@ neon_end_unlogged_build(SMgrRelation reln) forknum); forget_cached_relsize(InfoFromNInfoB(rinfob), forknum); + lfc_invalidate(InfoFromNInfoB(rinfob), forknum, nblocks); + mdclose(reln, forknum); #ifndef DEBUG_COMPARE_LOCAL /* use isRedo == true, so that we drop it immediately */ From 81c6a5a796d1a4278b320d241c2dcab95982a7c6 Mon Sep 17 00:00:00 2001 From: Emmanuel Ferdman Date: Sun, 18 May 2025 00:12:01 +0300 Subject: [PATCH 58/65] Migrate to correct logger interface (#11956) ## Problem Currently the `logger` library throws annoying deprecation warnings: ```python DeprecationWarning: The 'warn' method is deprecated, use 'warning' instead ``` ## Summary of changes This small PR resolves the annoying deprecation warnings by migrating to `.warning` as suggested. 
Signed-off-by: Emmanuel Ferdman --- test_runner/fixtures/neon_cli.py | 2 +- test_runner/regress/test_pageserver_secondary.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index 4eaa4b7d99..bb07e2b6d1 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -103,7 +103,7 @@ class AbstractNeonCli: else: stdout = "" - log.warn(f"CLI timeout: stderr={stderr}, stdout={stdout}") + log.warning(f"CLI timeout: stderr={stderr}, stdout={stdout}") raise indent = " " diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 3aa0c63979..f2523ec9b5 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -510,7 +510,7 @@ def list_elegible_layers( except KeyError: # Unexpected: tests should call this when pageservers are in a quiet state such that the layer map # matches what's on disk. 
- log.warn(f"Lookup {layer_file_name} from {list(visible_map.keys())}") + log.warning(f"Lookup {layer_file_name} from {list(visible_map.keys())}") raise return list(c for c in candidates if is_visible(c)) @@ -636,7 +636,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): except: # On assertion failures, log some details to help with debugging heatmap = env.pageserver_remote_storage.heatmap_content(tenant_id) - log.warn(f"heatmap contents: {json.dumps(heatmap, indent=2)}") + log.warning(f"heatmap contents: {json.dumps(heatmap, indent=2)}") raise # Scrub the remote storage From 4f0a9fc5698dfcc1a59ce6d32ca2b1e8ebb5de77 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 18 May 2025 00:06:32 +0200 Subject: [PATCH 59/65] chore(deps): bump flask-cors from 5.0.0 to 6.0.0 in the pip group across 1 directory (#11960) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/poetry.lock b/poetry.lock index 1a772d3415..e6440761be 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand. 
[[package]] name = "aiohappyeyeballs" @@ -1145,18 +1145,19 @@ dotenv = ["python-dotenv"] [[package]] name = "flask-cors" -version = "5.0.0" -description = "A Flask extension adding a decorator for CORS support" +version = "6.0.0" +description = "A Flask extension simplifying CORS support" optional = false -python-versions = "*" +python-versions = "<4.0,>=3.9" groups = ["main"] files = [ - {file = "Flask_Cors-5.0.0-py2.py3-none-any.whl", hash = "sha256:b9e307d082a9261c100d8fb0ba909eec6a228ed1b60a8315fd85f783d61910bc"}, - {file = "flask_cors-5.0.0.tar.gz", hash = "sha256:5aadb4b950c4e93745034594d9f3ea6591f734bb3662e16e255ffbf5e89c88ef"}, + {file = "flask_cors-6.0.0-py3-none-any.whl", hash = "sha256:6332073356452343a8ccddbfec7befdc3fdd040141fe776ec9b94c262f058657"}, + {file = "flask_cors-6.0.0.tar.gz", hash = "sha256:4592c1570246bf7beee96b74bc0adbbfcb1b0318f6ba05c412e8909eceec3393"}, ] [package.dependencies] -Flask = ">=0.9" +flask = ">=0.9" +Werkzeug = ">=0.7" [[package]] name = "frozenlist" From e9631296784799269f079af6a3c5b2fe65e3c057 Mon Sep 17 00:00:00 2001 From: Trung Dinh Date: Sat, 17 May 2025 15:30:29 -0700 Subject: [PATCH 60/65] pagesteam_handle_batched_message -> pagestream_handle_batched_message (#11916) ## Problem Found a typo in code. 
## Summary of changes Co-authored-by: Trung Dinh Co-authored-by: Erik Grinaker --- pageserver/src/page_service.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index bca1cb5b49..101e312ec3 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1278,7 +1278,7 @@ impl PageServerHandler { } #[instrument(level = tracing::Level::DEBUG, skip_all)] - async fn pagesteam_handle_batched_message( + async fn pagestream_handle_batched_message( &mut self, pgb_writer: &mut PostgresBackend, batch: BatchedFeMessage, @@ -1733,7 +1733,7 @@ impl PageServerHandler { }; let result = self - .pagesteam_handle_batched_message( + .pagestream_handle_batched_message( pgb_writer, msg, io_concurrency.clone(), @@ -1909,7 +1909,7 @@ impl PageServerHandler { return Err(e); } }; - self.pagesteam_handle_batched_message( + self.pagestream_handle_batched_message( pgb_writer, batch, io_concurrency.clone(), From 81c557d87e2381d653deb0b0b9decbbdfc76f30f Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sun, 18 May 2025 08:02:47 +0300 Subject: [PATCH 61/65] Unlogged build get smgr (#11954) ## Problem See https://github.com/neondatabase/neon/issues/11910 and https://neondb.slack.com/archives/C04DGM6SMTM/p1747314649059129 ## Summary of changes Do not change persistence in `start_unlogged_build` Postgres PRs: https://github.com/neondatabase/postgres/pull/642 https://github.com/neondatabase/postgres/pull/641 https://github.com/neondatabase/postgres/pull/640 https://github.com/neondatabase/postgres/pull/639 --------- Co-authored-by: Konstantin Knizhnik --- compute/patches/rum.patch | 6 +-- pgxn/neon/neon_pgversioncompat.h | 8 +++- pgxn/neon/pagestore_smgr.c | 78 ++++++++++++++++++++++++-------- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 8 ++-- 8 files changed, 76 insertions(+), 32 deletions(-) diff 
--git a/compute/patches/rum.patch b/compute/patches/rum.patch index b45afe2874..aed1badc13 100644 --- a/compute/patches/rum.patch +++ b/compute/patches/rum.patch @@ -7,7 +7,7 @@ index 255e616..1c6edb7 100644 RelationGetRelationName(index)); +#ifdef NEON_SMGR -+ smgr_start_unlogged_build(index->rd_smgr); ++ smgr_start_unlogged_build(RelationGetSmgr(index)); +#endif + initRumState(&buildstate.rumstate, index); @@ -18,7 +18,7 @@ index 255e616..1c6edb7 100644 rumUpdateStats(index, &buildstate.buildStats, buildstate.rumstate.isBuild); +#ifdef NEON_SMGR -+ smgr_finish_unlogged_build_phase_1(index->rd_smgr); ++ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index)); +#endif + /* @@ -29,7 +29,7 @@ index 255e616..1c6edb7 100644 } +#ifdef NEON_SMGR -+ smgr_end_unlogged_build(index->rd_smgr); ++ smgr_end_unlogged_build(RelationGetSmgr(index)); +#endif + /* diff --git a/pgxn/neon/neon_pgversioncompat.h b/pgxn/neon/neon_pgversioncompat.h index b3ed0c04e8..bf91a02b45 100644 --- a/pgxn/neon/neon_pgversioncompat.h +++ b/pgxn/neon/neon_pgversioncompat.h @@ -86,7 +86,7 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode, #define InvalidRelFileNumber InvalidOid -#define SMgrRelGetRelInfo(reln) \ +#define SMgrRelGetRelInfo(reln) \ (reln->smgr_rnode.node) #define DropRelationAllLocalBuffers DropRelFileNodeAllLocalBuffers @@ -148,6 +148,12 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode, #define DropRelationAllLocalBuffers DropRelationAllLocalBuffers #endif +#define NRelFileInfoInvalidate(rinfo) do { \ + NInfoGetSpcOid(rinfo) = InvalidOid; \ + NInfoGetDbOid(rinfo) = InvalidOid; \ + NInfoGetRelNumber(rinfo) = InvalidRelFileNumber; \ + } while (0) + #if PG_MAJORVERSION_NUM < 17 #define ProcNumber BackendId #define INVALID_PROC_NUMBER InvalidBackendId diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 5558a903e2..43fd715bbb 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -108,7 +108,7 @@ typedef enum 
UNLOGGED_BUILD_NOT_PERMANENT } UnloggedBuildPhase; -static SMgrRelation unlogged_build_rel = NULL; +static NRelFileInfo unlogged_build_rel_info; static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; static bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id); @@ -912,8 +912,14 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, { case 0: neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence"); + break; case RELPERSISTENCE_PERMANENT: + if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))) + { + mdextend(reln, forkNum, blkno, buffer, skipFsync); + return; + } break; case RELPERSISTENCE_TEMP: @@ -1000,8 +1006,14 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, { case 0: neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence"); + break; case RELPERSISTENCE_PERMANENT: + if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))) + { + mdzeroextend(reln, forkNum, blocknum, nblocks, skipFsync); + return; + } break; case RELPERSISTENCE_TEMP: @@ -1376,8 +1388,14 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer { case 0: neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence"); + break; case RELPERSISTENCE_PERMANENT: + if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))) + { + mdread(reln, forkNum, blkno, buffer); + return; + } break; case RELPERSISTENCE_TEMP: @@ -1463,8 +1481,14 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, { case 0: neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence"); + break; case RELPERSISTENCE_PERMANENT: + if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))) + { + mdreadv(reln, forknum, blocknum, buffers, nblocks); + return; + } break; case RELPERSISTENCE_TEMP: @@ -1597,6 +1621,15 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber 
blocknum, const vo break; case RELPERSISTENCE_PERMANENT: + if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))) + { +#if PG_MAJORVERSION_NUM >= 17 + mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync); +#else + mdwrite(reln, forknum, blocknum, buffer, skipFsync); +#endif + return; + } break; case RELPERSISTENCE_TEMP: @@ -1666,6 +1699,11 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, break; case RELPERSISTENCE_PERMANENT: + if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))) + { + mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync); + return; + } break; case RELPERSISTENCE_TEMP: @@ -1706,6 +1744,10 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) break; case RELPERSISTENCE_PERMANENT: + if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))) + { + return mdnblocks(reln, forknum); + } break; case RELPERSISTENCE_TEMP: @@ -1775,6 +1817,11 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, Blo break; case RELPERSISTENCE_PERMANENT: + if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))) + { + mdtruncate(reln, forknum, old_blocks, nblocks); + return; + } break; case RELPERSISTENCE_TEMP: @@ -1913,7 +1960,6 @@ neon_start_unlogged_build(SMgrRelation reln) */ if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS) neon_log(ERROR, "unlogged relation build is already in progress"); - Assert(unlogged_build_rel == NULL); ereport(SmgrTrace, (errmsg(NEON_TAG "starting unlogged build of relation %u/%u/%u", @@ -1930,7 +1976,7 @@ neon_start_unlogged_build(SMgrRelation reln) case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: - unlogged_build_rel = reln; + unlogged_build_rel_info = InfoFromSMgrRel(reln); unlogged_build_phase = UNLOGGED_BUILD_NOT_PERMANENT; #ifdef DEBUG_COMPARE_LOCAL if (!IsParallelWorker()) @@ -1951,12 +1997,9 @@ neon_start_unlogged_build(SMgrRelation reln) neon_log(ERROR, "cannot perform unlogged index build, index is not 
empty "); #endif - unlogged_build_rel = reln; + unlogged_build_rel_info = InfoFromSMgrRel(reln); unlogged_build_phase = UNLOGGED_BUILD_PHASE_1; - /* Make the relation look like it's unlogged */ - reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED; - /* * Create the local file. In a parallel build, the leader is expected to * call this first and do it. @@ -1983,17 +2026,16 @@ neon_start_unlogged_build(SMgrRelation reln) static void neon_finish_unlogged_build_phase_1(SMgrRelation reln) { - Assert(unlogged_build_rel == reln); + Assert(RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))); ereport(SmgrTrace, (errmsg(NEON_TAG "finishing phase 1 of unlogged build of relation %u/%u/%u", - RelFileInfoFmt(InfoFromSMgrRel(reln))))); + RelFileInfoFmt((unlogged_build_rel_info))))); if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT) return; Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1); - Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED); /* * In a parallel build, (only) the leader process performs the 2nd @@ -2001,7 +2043,7 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln) */ if (IsParallelWorker()) { - unlogged_build_rel = NULL; + NRelFileInfoInvalidate(unlogged_build_rel_info); unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; } else @@ -2022,11 +2064,11 @@ neon_end_unlogged_build(SMgrRelation reln) { NRelFileInfoBackend rinfob = InfoBFromSMgrRel(reln); - Assert(unlogged_build_rel == reln); + Assert(RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))); ereport(SmgrTrace, (errmsg(NEON_TAG "ending unlogged build of relation %u/%u/%u", - RelFileInfoFmt(InfoFromNInfoB(rinfob))))); + RelFileInfoFmt(unlogged_build_rel_info)))); if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT) { @@ -2034,7 +2076,6 @@ neon_end_unlogged_build(SMgrRelation reln) BlockNumber nblocks; Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_2); - Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED); /* * Update the 
last-written LSN cache. @@ -2055,9 +2096,6 @@ neon_end_unlogged_build(SMgrRelation reln) InfoFromNInfoB(rinfob), MAIN_FORKNUM); - /* Make the relation look permanent again */ - reln->smgr_relpersistence = RELPERSISTENCE_PERMANENT; - /* Remove local copy */ for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++) { @@ -2078,7 +2116,7 @@ neon_end_unlogged_build(SMgrRelation reln) mdunlink(rinfob, INIT_FORKNUM, true); #endif } - unlogged_build_rel = NULL; + NRelFileInfoInvalidate(unlogged_build_rel_info); unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; } @@ -2151,7 +2189,7 @@ AtEOXact_neon(XactEvent event, void *arg) * Forget about any build we might have had in progress. The local * file will be unlinked by smgrDoPendingDeletes() */ - unlogged_build_rel = NULL; + NRelFileInfoInvalidate(unlogged_build_rel_info); unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; break; @@ -2163,7 +2201,7 @@ AtEOXact_neon(XactEvent event, void *arg) case XACT_EVENT_PRE_PREPARE: if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS) { - unlogged_build_rel = NULL; + NRelFileInfoInvalidate(unlogged_build_rel_info); unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 4cca6f8083..55c0d45abe 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 4cca6f8083483dda9e12eae292cf788d45bd561f +Subproject commit 55c0d45abe6467c02084c2192bca117eda6ce1e7 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index daa81cffcf..de7640f55d 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit daa81cffcf063c54b29a9aabdb6604625f675ad0 +Subproject commit de7640f55da07512834d5cc40c4b3fb376b5f04f diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 15710a76b7..0bf96bd6d7 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 15710a76b7d07912110fcbbaf0c8ad6d7e5a9fbc 
+Subproject commit 0bf96bd6d70301a0b43b0b3457bb3cf8fb43c198 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index e5374b7299..8be779fd3a 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit e5374b72997b0afc8374137674e873f7a558120a +Subproject commit 8be779fd3ab9e87206da96a7e4842ef1abf04f44 diff --git a/vendor/revisions.json b/vendor/revisions.json index 0fc2d3996d..3e999760f4 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.5", - "e5374b72997b0afc8374137674e873f7a558120a" + "8be779fd3ab9e87206da96a7e4842ef1abf04f44" ], "v16": [ "16.9", - "15710a76b7d07912110fcbbaf0c8ad6d7e5a9fbc" + "0bf96bd6d70301a0b43b0b3457bb3cf8fb43c198" ], "v15": [ "15.13", - "daa81cffcf063c54b29a9aabdb6604625f675ad0" + "de7640f55da07512834d5cc40c4b3fb376b5f04f" ], "v14": [ "14.18", - "4cca6f8083483dda9e12eae292cf788d45bd561f" + "55c0d45abe6467c02084c2192bca117eda6ce1e7" ] } From cdb6479c8abd87df7c0c535ced25aeef5991a983 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 19 May 2025 11:03:06 +0200 Subject: [PATCH 62/65] pageserver: add gRPC page service schema (#11815) ## Problem For the [communicator project](https://github.com/neondatabase/company_projects/issues/352), we want to move to gRPC for the page service protocol. Touches #11728. ## Summary of changes This patch adds an experimental gRPC Protobuf schema for the page service. It is equivalent to the current page service, but with several improvements, e.g.: * Connection multiplexing. * Reduced head-of-line blocking. * Client-side batching. * Explicit tenant shard routing. * GetPage request classification (normal vs. prefetch). * Explicit rate limiting ("slow down" response status). The API is exposed as a new `pageserver/page_api` package. This is separate from the `pageserver_api` package to reduce the dependency footprint for the communicator. The longer-term plan is to also split out e.g. 
the WAL ingestion service to a separate gRPC package, e.g. `pageserver/wal_api`. Subsequent PRs will: add Rust domain types for the Protobuf types, expose a gRPC server, and implement the page service. Preliminary prototype benchmarks of this gRPC API is within 10% of baseline libpq performance. We'll do further benchmarking and optimization as the implementation lands in `main` and is deployed to staging. --- Cargo.lock | 10 + Cargo.toml | 2 + pageserver/page_api/Cargo.toml | 13 ++ pageserver/page_api/build.rs | 7 + pageserver/page_api/proto/page_service.proto | 220 +++++++++++++++++++ pageserver/page_api/src/lib.rs | 14 ++ 6 files changed, 266 insertions(+) create mode 100644 pageserver/page_api/Cargo.toml create mode 100644 pageserver/page_api/build.rs create mode 100644 pageserver/page_api/proto/page_service.proto create mode 100644 pageserver/page_api/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 8ca65b58ce..d919537818 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4434,6 +4434,16 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "pageserver_page_api" +version = "0.1.0" +dependencies = [ + "prost 0.13.3", + "tonic", + "tonic-build", + "workspace_hack", +] + [[package]] name = "papaya" version = "0.2.1" diff --git a/Cargo.toml b/Cargo.toml index 74b281f88f..a280c446b9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,6 +9,7 @@ members = [ "pageserver/ctl", "pageserver/client", "pageserver/pagebench", + "pageserver/page_api", "proxy", "safekeeper", "safekeeper/client", @@ -252,6 +253,7 @@ pageserver = { path = "./pageserver" } pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" } pageserver_client = { path = "./pageserver/client" } pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" } +pageserver_page_api = { path = "./pageserver/page_api" } postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" } postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" } postgres_ffi = 
{ version = "0.1", path = "./libs/postgres_ffi/" } diff --git a/pageserver/page_api/Cargo.toml b/pageserver/page_api/Cargo.toml new file mode 100644 index 0000000000..c237949226 --- /dev/null +++ b/pageserver/page_api/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "pageserver_page_api" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +prost.workspace = true +tonic.workspace = true +workspace_hack.workspace = true + +[build-dependencies] +tonic-build.workspace = true diff --git a/pageserver/page_api/build.rs b/pageserver/page_api/build.rs new file mode 100644 index 0000000000..ce3c49ed82 --- /dev/null +++ b/pageserver/page_api/build.rs @@ -0,0 +1,7 @@ +fn main() -> Result<(), Box> { + // Generates Rust code from .proto Protobuf schemas. + tonic_build::configure() + .bytes(["."]) + .compile_protos(&["proto/page_service.proto"], &["proto"]) + .map_err(|err| err.into()) +} diff --git a/pageserver/page_api/proto/page_service.proto b/pageserver/page_api/proto/page_service.proto new file mode 100644 index 0000000000..12e4d2f9db --- /dev/null +++ b/pageserver/page_api/proto/page_service.proto @@ -0,0 +1,220 @@ +// Page service, presented by pageservers for computes. +// +// This is the compute read path. It primarily serves page versions at given +// LSNs, but also base backups, SLRU segments, and relation metadata. +// +// EXPERIMENTAL: this is still under development and subject to change. +// +// Request metadata headers: +// - authorization: JWT token ("Bearer <token>"), if auth is enabled +// - neon-tenant-id: tenant ID ("7c4a1f9e3bd6470c8f3e21a65bd2e980") +// - neon-shard-id: shard ID, as <number><count> in hex ("0b10" = shard 11 of 16, 0-based) +// - neon-timeline-id: timeline ID ("f08c4e9a2d5f76b1e3a7c2d8910f4b3e") +// +// TODO: consider adding neon-compute-mode ("primary", "static", "replica"). +// However, this will require reconnecting when changing modes. 
+// +// TODO: write implementation guidance on +// - Health checks +// - Tracing, OpenTelemetry +// - Compression + +syntax = "proto3"; +package page_service; + +service PageService { + // Returns whether a relation exists. + rpc CheckRelExists(CheckRelExistsRequest) returns (CheckRelExistsResponse); + + // Fetches a base backup. + rpc GetBaseBackup (GetBaseBackupRequest) returns (stream GetBaseBackupResponseChunk); + + // Returns the total size of a database, as # of bytes. + rpc GetDbSize (GetDbSizeRequest) returns (GetDbSizeResponse); + + // Fetches pages. + // + // This is implemented as a bidirectional streaming RPC for performance. Unary + // requests incur costs for e.g. HTTP/2 stream setup, header parsing, + // authentication, and so on -- with streaming, we only pay these costs during + // the initial stream setup. This ~doubles throughput in benchmarks. Other + // RPCs use regular unary requests, since they are not as frequent and + // performance-critical, and this simplifies implementation. + // + // NB: a status response (e.g. errors) will terminate the stream. The stream + // may be shared by e.g. multiple Postgres backends, so we should avoid this. + // Most errors are therefore sent as GetPageResponse.status instead. + rpc GetPages (stream GetPageRequest) returns (stream GetPageResponse); + + // Returns the size of a relation, as # of blocks. + rpc GetRelSize (GetRelSizeRequest) returns (GetRelSizeResponse); + + // Fetches an SLRU segment. + rpc GetSlruSegment (GetSlruSegmentRequest) returns (GetSlruSegmentResponse); +} + +// The LSN a request should read at. +message ReadLsn { + // The request's read LSN. Required. + uint64 request_lsn = 1; + // If given, the caller guarantees that the page has not been modified since + // this LSN. Must be smaller than or equal to request_lsn. This allows the + // Pageserver to serve an old page without waiting for the request LSN to + // arrive. Valid for all request types. 
+ // + // It is undefined behaviour to make a request such that the page was, in + // fact, modified between request_lsn and not_modified_since_lsn. The + // Pageserver might detect it and return an error, or it might return the old + // page version or the new page version. Setting not_modified_since_lsn equal + // to request_lsn is always safe, but can lead to unnecessary waiting. + uint64 not_modified_since_lsn = 2; +} + +// A relation identifier. +message RelTag { + uint32 spc_oid = 1; + uint32 db_oid = 2; + uint32 rel_number = 3; + uint32 fork_number = 4; +} + +// Checks whether a relation exists, at the given LSN. Only valid on shard 0, +// other shards will error. +message CheckRelExistsRequest { + ReadLsn read_lsn = 1; + RelTag rel = 2; +} + +message CheckRelExistsResponse { + bool exists = 1; +} + +// Requests a base backup at a given LSN. +message GetBaseBackupRequest { + // The LSN to fetch a base backup at. + ReadLsn read_lsn = 1; + // If true, logical replication slots will not be created. + bool replica = 2; +} + +// Base backup response chunk, returned as an ordered stream. +message GetBaseBackupResponseChunk { + // A basebackup data chunk. The size is undefined, but bounded by the 4 MB + // gRPC message size limit. + bytes chunk = 1; +} + +// Requests the size of a database, as # of bytes. Only valid on shard 0, other +// shards will error. +message GetDbSizeRequest { + ReadLsn read_lsn = 1; + uint32 db_oid = 2; +} + +message GetDbSizeResponse { + uint64 num_bytes = 1; +} + +// Requests one or more pages. +message GetPageRequest { + // A request ID. Will be included in the response. Should be unique for + // in-flight requests on the stream. + uint64 request_id = 1; + // The request class. + GetPageClass request_class = 2; + // The LSN to read at. + ReadLsn read_lsn = 3; + // The relation to read from. + RelTag rel = 4; + // Page numbers to read. Must belong to the remote shard. 
+ // + // Multiple pages will be executed as a single batch by the Pageserver, + // amortizing layer access costs and parallelizing them. This may increase the + // latency of any individual request, but improves the overall latency and + // throughput of the batch as a whole. + // + // TODO: this causes an allocation in the common single-block case. The sender + // can use a SmallVec to stack-allocate it, but Prost will always deserialize + // into a heap-allocated Vec. Consider optimizing this. + // + // TODO: we might be able to avoid a sort or something if we mandate that these + // are always in order. But we can't currently rely on this on the server, because + // of compatibility with the libpq protocol handler. + repeated uint32 block_number = 5; +} + +// A GetPageRequest class. Primarily intended for observability, but may also be +// used for prioritization in the future. +enum GetPageClass { + // Unknown class. For forwards compatibility: used when the client sends a + // class that the server doesn't know about. + GET_PAGE_CLASS_UNKNOWN = 0; + // A normal request. This is the default. + GET_PAGE_CLASS_NORMAL = 1; + // A prefetch request. NB: can only be classified on pg < 18. + GET_PAGE_CLASS_PREFETCH = 2; + // A background request (e.g. vacuum). + GET_PAGE_CLASS_BACKGROUND = 3; +} + +// A GetPage response. +// +// A batch response will contain all of the requested pages. We could eagerly +// emit individual pages as soon as they are ready, but on a readv() Postgres +// holds buffer pool locks on all pages in the batch and we'll only return once +// the entire batch is ready, so no one can make use of the individual pages. +message GetPageResponse { + // The original request's ID. + uint64 request_id = 1; + // The response status code. + GetPageStatus status = 2; + // A string describing the status, if any. + string reason = 3; + // The 8KB page images, in the same order as the request. Empty if status != OK. 
+ repeated bytes page_image = 4; +} + +// A GetPageResponse status code. Since we use a bidirectional stream, we don't +// want to send errors as gRPC statuses, since this would terminate the stream. +enum GetPageStatus { + // Unknown status. For forwards compatibility: used when the server sends a + // status code that the client doesn't know about. + GET_PAGE_STATUS_UNKNOWN = 0; + // The request was successful. + GET_PAGE_STATUS_OK = 1; + // The page did not exist. The tenant/timeline/shard has already been + // validated during stream setup. + GET_PAGE_STATUS_NOT_FOUND = 2; + // The request was invalid. + GET_PAGE_STATUS_INVALID = 3; + // The tenant is rate limited. Slow down and retry later. + GET_PAGE_STATUS_SLOW_DOWN = 4; + // TODO: consider adding a GET_PAGE_STATUS_LAYER_DOWNLOAD in the case of a + // layer download. This could free up the server task to process other + // requests while the layer download is in progress. +} + +// Fetches the size of a relation at a given LSN, as # of blocks. Only valid on +// shard 0, other shards will error. +message GetRelSizeRequest { + ReadLsn read_lsn = 1; + RelTag rel = 2; +} + +message GetRelSizeResponse { + uint32 num_blocks = 1; +} + +// Requests an SLRU segment. Only valid on shard 0, other shards will error. +message GetSlruSegmentRequest { + ReadLsn read_lsn = 1; + uint32 kind = 2; + uint32 segno = 3; +} + +// Returns an SLRU segment. +// +// These are up to 32 pages (256 KB), so we can send them as a single response. +message GetSlruSegmentResponse { + bytes segment = 1; +} diff --git a/pageserver/page_api/src/lib.rs b/pageserver/page_api/src/lib.rs new file mode 100644 index 0000000000..0226d594cb --- /dev/null +++ b/pageserver/page_api/src/lib.rs @@ -0,0 +1,14 @@ +//! This crate provides the Pageserver's page API. It contains: +//! +//! * proto/page_service.proto: the Protobuf schema for the page API. +//! * proto: auto-generated Protobuf types for gRPC. +//! +//! 
This crate is used by both the client and the server. Try to keep it slim. + +// Code generated by protobuf. +pub mod proto { + tonic::include_proto!("page_service"); + + pub use page_service_client::PageServiceClient; + pub use page_service_server::{PageService, PageServiceServer}; +} From 76a7d37f7e266a946a0de91dae89f7ded66ef09f Mon Sep 17 00:00:00 2001 From: Ivan Efremov Date: Mon, 19 May 2025 13:10:55 +0300 Subject: [PATCH 63/65] proxy: Drop cancellation ops if they don't fit into the queue (#11950) Add a redis ops batch size argument for proxy and remove timeouts by using try_send() --- proxy/src/binary/proxy.rs | 12 ++++++++++-- proxy/src/cancellation.rs | 20 +++++++++----------- proxy/src/console_redirect_proxy.rs | 4 +--- proxy/src/proxy/mod.rs | 4 +--- proxy/src/proxy/passthrough.rs | 2 +- 5 files changed, 22 insertions(+), 20 deletions(-) diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index 51713902bc..f40d5041c1 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -161,8 +161,11 @@ struct ProxyCliArgs { #[clap(long, default_values_t = RateBucketInfo::DEFAULT_REDIS_SET)] redis_rps_limit: Vec, /// Cancellation channel size (max queue size for redis kv client) - #[clap(long, default_value = "1024")] + #[clap(long, default_value_t = 1024)] cancellation_ch_size: usize, + /// Cancellation ops batch size for redis + #[clap(long, default_value_t = 8)] + cancellation_batch_size: usize, /// cache for `allowed_ips` (use `size=0` to disable) #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] allowed_ips_cache: String, @@ -542,7 +545,12 @@ pub async fn run() -> anyhow::Result<()> { if let Some(mut redis_kv_client) = redis_kv_client { maintenance_tasks.spawn(async move { redis_kv_client.try_connect().await?; - handle_cancel_messages(&mut redis_kv_client, rx_cancel).await?; + handle_cancel_messages( + &mut redis_kv_client, + rx_cancel, + args.cancellation_batch_size, + ) + .await?; drop(redis_kv_client); 
diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index f34fb747ca..a6e7bf85a0 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -30,8 +30,6 @@ use crate::tls::postgres_rustls::MakeRustlsConnect; type IpSubnetKey = IpNet; const CANCEL_KEY_TTL: i64 = 1_209_600; // 2 weeks cancellation key expire time -const REDIS_SEND_TIMEOUT: std::time::Duration = std::time::Duration::from_millis(10); -const BATCH_SIZE: usize = 8; // Message types for sending through mpsc channel pub enum CancelKeyOp { @@ -231,12 +229,13 @@ impl CancelReplyOp { pub async fn handle_cancel_messages( client: &mut RedisKVClient, mut rx: mpsc::Receiver, + batch_size: usize, ) -> anyhow::Result<()> { - let mut batch = Vec::with_capacity(BATCH_SIZE); - let mut pipeline = Pipeline::with_capacity(BATCH_SIZE); + let mut batch = Vec::with_capacity(batch_size); + let mut pipeline = Pipeline::with_capacity(batch_size); loop { - if rx.recv_many(&mut batch, BATCH_SIZE).await == 0 { + if rx.recv_many(&mut batch, batch_size).await == 0 { warn!("shutting down cancellation queue"); break Ok(()); } @@ -367,8 +366,7 @@ impl CancellationHandler { return Err(CancelError::InternalError); }; - tx.send_timeout(op, REDIS_SEND_TIMEOUT) - .await + tx.try_send(op) .map_err(|e| { tracing::warn!("failed to send GetCancelData for {key}: {e}"); }) @@ -570,7 +568,7 @@ impl Session { } // Send the store key op to the cancellation handler and set TTL for the key - pub(crate) async fn write_cancel_key( + pub(crate) fn write_cancel_key( &self, cancel_closure: CancelClosure, ) -> Result<(), CancelError> { @@ -596,14 +594,14 @@ impl Session { expire: CANCEL_KEY_TTL, }; - let _ = tx.send_timeout(op, REDIS_SEND_TIMEOUT).await.map_err(|e| { + let _ = tx.try_send(op).map_err(|e| { let key = self.key; tracing::warn!("failed to send StoreCancelKey for {key}: {e}"); }); Ok(()) } - pub(crate) async fn remove_cancel_key(&self) -> Result<(), CancelError> { + pub(crate) fn remove_cancel_key(&self) -> 
Result<(), CancelError> { let Some(tx) = &self.cancellation_handler.tx else { tracing::warn!("cancellation handler is not available"); return Err(CancelError::InternalError); @@ -619,7 +617,7 @@ impl Session { .guard(RedisMsgKind::HDel), }; - let _ = tx.send_timeout(op, REDIS_SEND_TIMEOUT).await.map_err(|e| { + let _ = tx.try_send(op).map_err(|e| { let key = self.key; tracing::warn!("failed to send RemoveCancelKey for {key}: {e}"); }); diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 0f2c3def0d..e3184e20d1 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -244,9 +244,7 @@ pub(crate) async fn handle_client( let cancellation_handler_clone = Arc::clone(&cancellation_handler); let session = cancellation_handler_clone.get_key(); - session - .write_cancel_key(node.cancel_closure.clone()) - .await?; + session.write_cancel_key(node.cancel_closure.clone())?; prepare_client_connection(&node, *session.key(), &mut stream).await?; diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index cf331b8bc0..0a86022e78 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -383,9 +383,7 @@ pub(crate) async fn handle_client( let cancellation_handler_clone = Arc::clone(&cancellation_handler); let session = cancellation_handler_clone.get_key(); - session - .write_cancel_key(node.cancel_closure.clone()) - .await?; + session.write_cancel_key(node.cancel_closure.clone())?; prepare_client_connection(&node, *session.key(), &mut stream).await?; diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index c100b8d716..8f9bd2de2d 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -94,7 +94,7 @@ impl ProxyPassthrough { tracing::warn!(session_id = ?self.session_id, ?err, "could not cancel the query in the database"); } - drop(self.cancel.remove_cancel_key().await); // we don't need a result. 
If the queue is full, we just log the error + drop(self.cancel.remove_cancel_key()); // we don't need a result. If the queue is full, we just log the error res } From 3685ad606d11de706b9d0eb5841b7801d6ae8a7d Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Mon, 19 May 2025 10:56:03 +0000 Subject: [PATCH 64/65] endpoint_storage: Fix metrics test by excluding assertion on macos (#11952) --- endpoint_storage/src/app.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/endpoint_storage/src/app.rs b/endpoint_storage/src/app.rs index 0bd7fe5f28..f44efe6d7a 100644 --- a/endpoint_storage/src/app.rs +++ b/endpoint_storage/src/app.rs @@ -462,6 +462,8 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH if var(REAL_S3_ENV).is_ok() { assert!(body.contains("remote_storage_s3_deleted_objects_total")); } + + #[cfg(target_os = "linux")] assert!(body.contains("process_threads")); } From 38dbc5f67f3dfbf501fb289f12f193bdec54ff6d Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 19 May 2025 13:17:45 +0200 Subject: [PATCH 65/65] pageserver/page_api: add binary Protobuf descriptor (#11968) ## Problem A binary Protobuf schema descriptor can be used to expose an API reflection service, which in turn allows convenient usage of e.g. `grpcurl` against the gRPC server. Touches #11728. ## Summary of changes * Generate a binary schema descriptor as `pageserver_page_api::proto::FILE_DESCRIPTOR_SET`. * Opportunistically rename the Protobuf package from `page_service` to `page_api`. 
--- pageserver/page_api/build.rs | 8 +++++++- pageserver/page_api/proto/page_service.proto | 15 ++++++++++++++- pageserver/page_api/src/lib.rs | 7 ++++++- 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/pageserver/page_api/build.rs b/pageserver/page_api/build.rs index ce3c49ed82..e96297f10e 100644 --- a/pageserver/page_api/build.rs +++ b/pageserver/page_api/build.rs @@ -1,7 +1,13 @@ +use std::env; +use std::path::PathBuf; + +/// Generates Rust code from .proto Protobuf schemas, along with a binary file +/// descriptor set for Protobuf schema reflection. fn main() -> Result<(), Box> { - // Generates Rust code from .proto Protobuf schemas. + let out_dir = PathBuf::from(env::var("OUT_DIR")?); tonic_build::configure() .bytes(["."]) + .file_descriptor_set_path(out_dir.join("page_api_descriptor.bin")) .compile_protos(&["proto/page_service.proto"], &["proto"]) .map_err(|err| err.into()) } diff --git a/pageserver/page_api/proto/page_service.proto b/pageserver/page_api/proto/page_service.proto index 12e4d2f9db..f6acb3eeeb 100644 --- a/pageserver/page_api/proto/page_service.proto +++ b/pageserver/page_api/proto/page_service.proto @@ -11,6 +11,19 @@ // - neon-shard-id: shard ID, as in hex ("0b10" = shard 11 of 16, 0-based) // - neon-timeline-id: timeline ID ("f08c4e9a2d5f76b1e3a7c2d8910f4b3e") // +// The service can be accessed via e.g. grpcurl: +// +// ``` +// grpcurl \ +// -plaintext \ +// -H "neon-tenant-id: 7c4a1f9e3bd6470c8f3e21a65bd2e980" \ +// -H "neon-shard-id: 0b10" \ +// -H "neon-timeline-id: f08c4e9a2d5f76b1e3a7c2d8910f4b3e" \ +// -H "authorization: Bearer $JWT" \ +// -d '{"read_lsn": {"request_lsn": 1234567890}, "rel": {"spc_oid": 1663, "db_oid": 1234, "rel_number": 5678, "fork_number": 0}}' +// localhost:51051 page_api.PageService/CheckRelExists +// ``` +// // TODO: consider adding neon-compute-mode ("primary", "static", "replica"). // However, this will require reconnecting when changing modes. 
// @@ -20,7 +33,7 @@ // - Compression syntax = "proto3"; -package page_service; +package page_api; service PageService { // Returns whether a relation exists. diff --git a/pageserver/page_api/src/lib.rs b/pageserver/page_api/src/lib.rs index 0226d594cb..0b68d03aaa 100644 --- a/pageserver/page_api/src/lib.rs +++ b/pageserver/page_api/src/lib.rs @@ -7,7 +7,12 @@ // Code generated by protobuf. pub mod proto { - tonic::include_proto!("page_service"); + tonic::include_proto!("page_api"); + + /// File descriptor set for Protobuf schema reflection. This allows using + /// e.g. grpcurl with the API. + pub const FILE_DESCRIPTOR_SET: &[u8] = + tonic::include_file_descriptor_set!("page_api_descriptor"); pub use page_service_client::PageServiceClient; pub use page_service_server::{PageService, PageServiceServer};