From 4d2e4b19c3dc8816668abc4204b110f1c9fd1b1e Mon Sep 17 00:00:00 2001
From: Shockingly Good <victor@neon.tech>
Date: Wed, 7 May 2025 18:34:08 +0200
Subject: [PATCH 01/65] fix(compute) Correct the PGXN s3 gateway URL. (#11796)

Corrects the postgres extension s3 gateway address to
be not just a domain name but a full base URL.

To make the code more readable, the option is renamed
to "remote_ext_base_url", while keeping the old name
also accessible by providing a clap argument alias.

Also provides a very simple and, perhaps, even redundant
unit test to confirm the logic behind parsing of the
corresponding CLI argument.

## Problem

As it is clearly stated in
https://github.com/neondatabase/cloud/issues/26005, using the short
version of the domain name might work for now, but in the future, we
should get rid of using the `default` namespace and this is where it
will, most likely, break down.

## Summary of changes

The changes adjust the domain name of the extension s3 gateway to use
the proper base URL format instead of just the domain name, which assumes
the "default" namespace, and add a new CLI argument name to reflect the
change and the expectation.
---
 compute_tools/src/bin/compute_ctl.rs          | 34 +++++++++++++++----
 compute_tools/src/compute.rs                  | 10 +++---
 compute_tools/src/extension_server.rs         |  8 ++---
 .../src/http/routes/extension_server.rs       |  2 +-
 control_plane/src/bin/neon_local.rs           |  9 ++---
 control_plane/src/endpoint.rs                 |  6 ++--
 test_runner/fixtures/neon_cli.py              |  6 ++--
 test_runner/fixtures/neon_fixtures.py         | 12 +++----
 .../regress/test_download_extensions.py       |  4 +--
 9 files changed, 56 insertions(+), 35 deletions(-)
diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs
index e337ee7b15..20b5e567a8 100644
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -60,12 +60,16 @@ use utils::failpoint_support;
 // Compatibility hack: if the control plane specified any remote-ext-config
 // use the default value for extension storage proxy gateway.
 // Remove this once the control plane is updated to pass the gateway URL
-fn parse_remote_ext_config(arg: &str) -> Result<String> {
-    if arg.starts_with("http") {
-        Ok(arg.trim_end_matches('/').to_string())
+fn parse_remote_ext_base_url(arg: &str) -> Result<String> {
+    const FALLBACK_PG_EXT_GATEWAY_BASE_URL: &str =
+        "http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local";
+
+    Ok(if arg.starts_with("http") {
+        arg
     } else {
-        Ok("http://pg-ext-s3-gateway".to_string())
+        FALLBACK_PG_EXT_GATEWAY_BASE_URL
     }
+    .to_owned())
 }
 
 #[derive(Parser)]
@@ -74,8 +78,10 @@ struct Cli {
     #[arg(short = 'b', long, default_value = "postgres", env = "POSTGRES_PATH")]
     pub pgbin: String,
 
-    #[arg(short = 'r', long, value_parser = parse_remote_ext_config)]
-    pub remote_ext_config: Option<String>,
+    /// The base URL for the remote extension storage proxy gateway.
+    /// Should be in the form of `http(s)://<gateway-hostname>[:<port>]`.
+    #[arg(short = 'r', long, value_parser = parse_remote_ext_base_url, alias = "remote-ext-config")]
+    pub remote_ext_base_url: Option<String>,
 
     /// The port to bind the external listening HTTP server to. Clients running
     /// outside the compute will talk to the compute through this port. Keep
@@ -164,7 +170,7 @@ fn main() -> Result<()> {
             pgversion: get_pg_version_string(&cli.pgbin),
             external_http_port: cli.external_http_port,
             internal_http_port: cli.internal_http_port,
-            ext_remote_storage: cli.remote_ext_config.clone(),
+            remote_ext_base_url: cli.remote_ext_base_url.clone(),
             resize_swap_on_bind: cli.resize_swap_on_bind,
             set_disk_quota_for_fs: cli.set_disk_quota_for_fs,
             #[cfg(target_os = "linux")]
@@ -265,4 +271,18 @@ mod test {
     fn verify_cli() {
         Cli::command().debug_assert()
     }
+
+    #[test]
+    fn parse_pg_ext_gateway_base_url() {
+        let arg = "http://pg-ext-s3-gateway2";
+        let result = super::parse_remote_ext_base_url(arg).unwrap();
+        assert_eq!(result, arg);
+
+        let arg = "pg-ext-s3-gateway";
+        let result = super::parse_remote_ext_base_url(arg).unwrap();
+        assert_eq!(
+            result,
+            "http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local"
+        );
+    }
 }
diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 0cda36a6e2..25920675c1 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -95,7 +95,7 @@ pub struct ComputeNodeParams {
     pub internal_http_port: u16,
 
     /// the address of extension storage proxy gateway
-    pub ext_remote_storage: Option<String>,
+    pub remote_ext_base_url: Option<String>,
 }
 
 /// Compute node info shared across several `compute_ctl` threads.
@@ -1896,9 +1896,9 @@ LIMIT 100",
         real_ext_name: String,
         ext_path: RemotePath,
     ) -> Result<u64, DownloadError> {
-        let ext_remote_storage =
+        let remote_ext_base_url =
             self.params
-                .ext_remote_storage
+                .remote_ext_base_url
                 .as_ref()
                 .ok_or(DownloadError::BadInput(anyhow::anyhow!(
                     "Remote extensions storage is not configured",
@@ -1960,7 +1960,7 @@ LIMIT 100",
         let download_size = extension_server::download_extension(
             &real_ext_name,
             &ext_path,
-            ext_remote_storage,
+            remote_ext_base_url,
             &self.params.pgbin,
         )
         .await
@@ -2069,7 +2069,7 @@ LIMIT 100",
         &self,
         spec: &ComputeSpec,
     ) -> Result<RemoteExtensionMetrics> {
-        if self.params.ext_remote_storage.is_none() {
+        if self.params.remote_ext_base_url.is_none() {
             return Ok(RemoteExtensionMetrics {
                 num_ext_downloaded: 0,
                 largest_ext_size: 0,
diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs
index ee889e0c40..3439383699 100644
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -158,14 +158,14 @@ fn parse_pg_version(human_version: &str) -> PostgresMajorVersion {
 pub async fn download_extension(
     ext_name: &str,
     ext_path: &RemotePath,
-    ext_remote_storage: &str,
+    remote_ext_base_url: &str,
     pgbin: &str,
 ) -> Result<u64> {
     info!("Download extension {:?} from {:?}", ext_name, ext_path);
 
     // TODO add retry logic
     let download_buffer =
-        match download_extension_tar(ext_remote_storage, &ext_path.to_string()).await {
+        match download_extension_tar(remote_ext_base_url, &ext_path.to_string()).await {
             Ok(buffer) => buffer,
             Err(error_message) => {
                 return Err(anyhow::anyhow!(
@@ -272,8 +272,8 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
 // Do request to extension storage proxy, e.g.,
 // curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst
 // using HTTP GET and return the response body as bytes.
-async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result<Bytes> {
-    let uri = format!("{}/{}", ext_remote_storage, ext_path);
+async fn download_extension_tar(remote_ext_base_url: &str, ext_path: &str) -> Result<Bytes> {
+    let uri = format!("{}/{}", remote_ext_base_url, ext_path);
     let filename = Path::new(ext_path)
         .file_name()
         .unwrap_or_else(|| std::ffi::OsStr::new("unknown"))
diff --git a/compute_tools/src/http/routes/extension_server.rs b/compute_tools/src/http/routes/extension_server.rs
index 6508de6eee..e141a48b7f 100644
--- a/compute_tools/src/http/routes/extension_server.rs
+++ b/compute_tools/src/http/routes/extension_server.rs
@@ -22,7 +22,7 @@ pub(in crate::http) async fn download_extension(
     State(compute): State<Arc<ComputeNode>>,
 ) -> Response {
     // Don't even try to download extensions if no remote storage is configured
-    if compute.params.ext_remote_storage.is_none() {
+    if compute.params.remote_ext_base_url.is_none() {
         return JsonResponse::error(
             StatusCode::PRECONDITION_FAILED,
             "remote storage is not configured",
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index fd625e9ed6..610fa5f865 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -644,9 +644,10 @@ struct EndpointStartCmdArgs {
 
     #[clap(
         long,
-        help = "Configure the remote extensions storage proxy gateway to request for extensions."
+        help = "Configure the remote extensions storage proxy gateway URL to request for extensions.",
+        alias = "remote-ext-config"
     )]
-    remote_ext_config: Option<String>,
+    remote_ext_base_url: Option<String>,
 
     #[clap(
         long,
@@ -1414,7 +1415,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
         EndpointCmd::Start(args) => {
             let endpoint_id = &args.endpoint_id;
             let pageserver_id = args.endpoint_pageserver_id;
-            let remote_ext_config = &args.remote_ext_config;
+            let remote_ext_base_url = &args.remote_ext_base_url;
 
             let safekeepers_generation = args.safekeepers_generation.map(SafekeeperGeneration::new);
             // If --safekeepers argument is given, use only the listed
@@ -1510,7 +1511,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                     safekeepers_generation,
                     safekeepers,
                     pageservers,
-                    remote_ext_config.as_ref(),
+                    remote_ext_base_url.as_ref(),
                     stripe_size.0 as usize,
                     args.create_test_user,
                     args.start_timeout,
diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index be73661a3c..708745446d 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -655,7 +655,7 @@ impl Endpoint {
         safekeepers_generation: Option<SafekeeperGeneration>,
         safekeepers: Vec<NodeId>,
         pageservers: Vec<(Host, u16)>,
-        remote_ext_config: Option<&String>,
+        remote_ext_base_url: Option<&String>,
         shard_stripe_size: usize,
         create_test_user: bool,
         start_timeout: Duration,
@@ -825,8 +825,8 @@ impl Endpoint {
         .stderr(logfile.try_clone()?)
         .stdout(logfile);
 
-        if let Some(remote_ext_config) = remote_ext_config {
-            cmd.args(["--remote-ext-config", remote_ext_config]);
+        if let Some(remote_ext_base_url) = remote_ext_base_url {
+            cmd.args(["--remote-ext-base-url", remote_ext_base_url]);
         }
 
         let child = cmd.spawn()?;
diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py
index 3be78719d7..4eaa4b7d99 100644
--- a/test_runner/fixtures/neon_cli.py
+++ b/test_runner/fixtures/neon_cli.py
@@ -557,7 +557,7 @@ class NeonLocalCli(AbstractNeonCli):
         endpoint_id: str,
         safekeepers_generation: int | None = None,
         safekeepers: list[int] | None = None,
-        remote_ext_config: str | None = None,
+        remote_ext_base_url: str | None = None,
         pageserver_id: int | None = None,
         allow_multiple: bool = False,
         create_test_user: bool = False,
@@ -572,8 +572,8 @@ class NeonLocalCli(AbstractNeonCli):
         extra_env_vars = env or {}
         if basebackup_request_tries is not None:
             extra_env_vars["NEON_COMPUTE_TESTING_BASEBACKUP_TRIES"] = str(basebackup_request_tries)
-        if remote_ext_config is not None:
-            args.extend(["--remote-ext-config", remote_ext_config])
+        if remote_ext_base_url is not None:
+            args.extend(["--remote-ext-base-url", remote_ext_base_url])
 
         if safekeepers_generation is not None:
             args.extend(["--safekeepers-generation", str(safekeepers_generation)])
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index d4a750ad3b..85ad49bb4f 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -4226,7 +4226,7 @@ class Endpoint(PgProtocol, LogUtils):
 
     def start(
         self,
-        remote_ext_config: str | None = None,
+        remote_ext_base_url: str | None = None,
         pageserver_id: int | None = None,
         safekeeper_generation: int | None = None,
         safekeepers: list[int] | None = None,
@@ -4252,7 +4252,7 @@ class Endpoint(PgProtocol, LogUtils):
             self.endpoint_id,
             safekeepers_generation=safekeeper_generation,
             safekeepers=self.active_safekeepers,
-            remote_ext_config=remote_ext_config,
+            remote_ext_base_url=remote_ext_base_url,
             pageserver_id=pageserver_id,
             allow_multiple=allow_multiple,
             create_test_user=create_test_user,
@@ -4467,7 +4467,7 @@ class Endpoint(PgProtocol, LogUtils):
         hot_standby: bool = False,
         lsn: Lsn | None = None,
         config_lines: list[str] | None = None,
-        remote_ext_config: str | None = None,
+        remote_ext_base_url: str | None = None,
         pageserver_id: int | None = None,
         allow_multiple: bool = False,
         basebackup_request_tries: int | None = None,
@@ -4486,7 +4486,7 @@ class Endpoint(PgProtocol, LogUtils):
             pageserver_id=pageserver_id,
             allow_multiple=allow_multiple,
         ).start(
-            remote_ext_config=remote_ext_config,
+            remote_ext_base_url=remote_ext_base_url,
             pageserver_id=pageserver_id,
             allow_multiple=allow_multiple,
             basebackup_request_tries=basebackup_request_tries,
@@ -4570,7 +4570,7 @@ class EndpointFactory:
         lsn: Lsn | None = None,
         hot_standby: bool = False,
         config_lines: list[str] | None = None,
-        remote_ext_config: str | None = None,
+        remote_ext_base_url: str | None = None,
         pageserver_id: int | None = None,
         basebackup_request_tries: int | None = None,
     ) -> Endpoint:
@@ -4590,7 +4590,7 @@ class EndpointFactory:
             hot_standby=hot_standby,
             config_lines=config_lines,
             lsn=lsn,
-            remote_ext_config=remote_ext_config,
+            remote_ext_base_url=remote_ext_base_url,
             pageserver_id=pageserver_id,
             basebackup_request_tries=basebackup_request_tries,
         )
diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py
index d28240c722..24ba0713d2 100644
--- a/test_runner/regress/test_download_extensions.py
+++ b/test_runner/regress/test_download_extensions.py
@@ -221,7 +221,7 @@ def test_remote_extensions(
 
     endpoint.create_remote_extension_spec(spec)
 
-    endpoint.start(remote_ext_config=extensions_endpoint)
+    endpoint.start(remote_ext_base_url=extensions_endpoint)
 
     with endpoint.connect() as conn:
         with conn.cursor() as cur:
@@ -249,7 +249,7 @@ def test_remote_extensions(
     # Remove the extension files to force a redownload of the extension.
     extension.remove(test_output_dir, pg_version)
 
-    endpoint.start(remote_ext_config=extensions_endpoint)
+    endpoint.start(remote_ext_base_url=extensions_endpoint)
 
     # Test that ALTER EXTENSION UPDATE statements also fetch remote extensions.
     with endpoint.connect() as conn:

From 24d62c647fba00d1ac93f4118836ceeddf07b270 Mon Sep 17 00:00:00 2001
From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com>
Date: Wed, 7 May 2025 21:00:41 +0400
Subject: [PATCH 02/65] storcon: add missing switch_timeline_membership method
 to sk client (#11850)

## Problem

`switch_timeline_membership` is implemented on safekeeper's server side,
but is missing in the client.

- Part of https://github.com/neondatabase/neon/issues/11823

## Summary of changes
- Add `switch_timeline_membership` method to `SafekeeperClient`
---
 safekeeper/client/src/mgmt_api.rs           | 14 ++++++++++++++
 storage_controller/src/safekeeper_client.rs | 17 +++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs
index 5849df0343..b364ac8e48 100644
--- a/safekeeper/client/src/mgmt_api.rs
+++ b/safekeeper/client/src/mgmt_api.rs
@@ -121,6 +121,20 @@ impl Client {
         resp.json().await.map_err(Error::ReceiveBody)
     }
 
+    pub async fn switch_timeline_membership(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        req: &models::TimelineMembershipSwitchRequest,
+    ) -> Result<models::TimelineMembershipSwitchResponse> {
+        let uri = format!(
+            "{}/v1/tenant/{}/timeline/{}/membership",
+            self.mgmt_api_endpoint, tenant_id, timeline_id
+        );
+        let resp = self.put(&uri, req).await?;
+        resp.json().await.map_err(Error::ReceiveBody)
+    }
+
     pub async fn delete_tenant(&self, tenant_id: TenantId) -> Result<models::TenantDeleteResult> {
         let uri = format!("{}/v1/tenant/{}", self.mgmt_api_endpoint, tenant_id);
         let resp = self
diff --git a/storage_controller/src/safekeeper_client.rs b/storage_controller/src/safekeeper_client.rs
index 988159af4a..1f3ea96d96 100644
--- a/storage_controller/src/safekeeper_client.rs
+++ b/storage_controller/src/safekeeper_client.rs
@@ -98,6 +98,23 @@ impl SafekeeperClient {
         )
     }
 
+    #[allow(unused)]
+    pub(crate) async fn switch_timeline_membership(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        req: &models::TimelineMembershipSwitchRequest,
+    ) -> Result<models::TimelineMembershipSwitchResponse> {
+        measured_request!(
+            "switch_timeline_membership",
+            crate::metrics::Method::Put,
+            &self.node_id_label,
+            self.inner
+                .switch_timeline_membership(tenant_id, timeline_id, req)
+                .await
+        )
+    }
+
     pub(crate) async fn delete_tenant(
         &self,
         tenant_id: TenantId,

From 7eb85c56acb5f87c730b879c9488e217448ee28b Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 8 May 2025 08:33:29 +0200
Subject: [PATCH 03/65] tokio-epoll-uring: avoid warn! noise due to `ECANCELED`
 during shutdowns (#11819)

# Problem

Before this PR, `test_pageserver_catchup_while_compute_down` would
occasionally fail due to a scary-looking WARN log line

```
WARN ephemeral_file_buffered_writer{...}:flush_attempt{attempt=1}: \
 error flushing buffered writer buffer to disk, retrying after backoff err=Operation canceled (os error 125)
```

After lengthy investigation, the conclusion is that this is likely due
to a kernel bug related to io_uring async workers (io-wq) and
signals.
The main indicator is that the error only ever happens in correlation
with pageserver shutdown when SIGTERM is received.
There is a fix that is merged in 6.14
kernels (`io-wq: backoff when retrying worker creation`).
However, even when I revert that patch, the issue is not reproducible
on 6.14, so, it remains a speculation.

It was ruled out that the ECANCELED is due to the executor thread
exiting before the async worker starts processing the operation.

# Solution

The workaround in this issue is to retry the operation on ECANCELED
once.
Retries are safe because the low-level io_engine operations are
idempotent.
(We don't use O_APPEND and I can't think of another flag that would make
 the APIs covered by this patch not idempotent.)

# Testing

With this PR, the warn! log no longer happens on [my reproducer
setup](https://github.com/neondatabase/neon/issues/11446#issuecomment-2843015111).
And the new rate-limited `info!`-level log line informing about the
internal retry shows up instead, as expected.

# Refs
- fixes https://github.com/neondatabase/neon/issues/11446
---
 libs/utils/src/rate_limit.rs                |  2 +-
 pageserver/src/virtual_file/io_engine.rs    | 85 +++++++++++++++++++--
 pageserver/src/virtual_file/open_options.rs | 18 +++--
 3 files changed, 91 insertions(+), 14 deletions(-)

diff --git a/libs/utils/src/rate_limit.rs b/libs/utils/src/rate_limit.rs
index 945f710b1d..700cd5792b 100644
--- a/libs/utils/src/rate_limit.rs
+++ b/libs/utils/src/rate_limit.rs
@@ -17,7 +17,7 @@ impl std::fmt::Display for RateLimitStats {
 }
 
 impl RateLimit {
-    pub fn new(interval: Duration) -> Self {
+    pub const fn new(interval: Duration) -> Self {
         Self {
             last: None,
             interval,
diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs
index dd04fb561a..d8eb803335 100644
--- a/pageserver/src/virtual_file/io_engine.rs
+++ b/pageserver/src/virtual_file/io_engine.rs
@@ -13,7 +13,7 @@
 pub(super) mod tokio_epoll_uring_ext;
 
 use tokio_epoll_uring::IoBuf;
-use tracing::Instrument;
+use tracing::{Instrument, info};
 
 pub(crate) use super::api::IoEngineKind;
 #[derive(Clone, Copy)]
@@ -111,13 +111,16 @@ pub(crate) fn get() -> IoEngine {
 
 use std::os::unix::prelude::FileExt;
 use std::sync::atomic::{AtomicU8, Ordering};
+use std::time::Duration;
 
 use super::owned_buffers_io::io_buf_ext::FullSlice;
 use super::owned_buffers_io::slice::SliceMutExt;
 use super::{FileGuard, Metadata};
 
 #[cfg(target_os = "linux")]
-fn epoll_uring_error_to_std(e: tokio_epoll_uring::Error<std::io::Error>) -> std::io::Error {
+pub(super) fn epoll_uring_error_to_std(
+    e: tokio_epoll_uring::Error<std::io::Error>,
+) -> std::io::Error {
     match e {
         tokio_epoll_uring::Error::Op(e) => e,
         tokio_epoll_uring::Error::System(system) => {
@@ -149,7 +152,11 @@ impl IoEngine {
             #[cfg(target_os = "linux")]
             IoEngine::TokioEpollUring => {
                 let system = tokio_epoll_uring_ext::thread_local_system().await;
-                let (resources, res) = system.read(file_guard, offset, slice).await;
+                let (resources, res) =
+                    retry_ecanceled_once((file_guard, slice), |(file_guard, slice)| async {
+                        system.read(file_guard, offset, slice).await
+                    })
+                    .await;
                 (resources, res.map_err(epoll_uring_error_to_std))
             }
         }
@@ -164,7 +171,10 @@ impl IoEngine {
             #[cfg(target_os = "linux")]
             IoEngine::TokioEpollUring => {
                 let system = tokio_epoll_uring_ext::thread_local_system().await;
-                let (resources, res) = system.fsync(file_guard).await;
+                let (resources, res) = retry_ecanceled_once(file_guard, |file_guard| async {
+                    system.fsync(file_guard).await
+                })
+                .await;
                 (resources, res.map_err(epoll_uring_error_to_std))
             }
         }
@@ -182,7 +192,10 @@ impl IoEngine {
             #[cfg(target_os = "linux")]
             IoEngine::TokioEpollUring => {
                 let system = tokio_epoll_uring_ext::thread_local_system().await;
-                let (resources, res) = system.fdatasync(file_guard).await;
+                let (resources, res) = retry_ecanceled_once(file_guard, |file_guard| async {
+                    system.fdatasync(file_guard).await
+                })
+                .await;
                 (resources, res.map_err(epoll_uring_error_to_std))
             }
         }
@@ -201,7 +214,10 @@ impl IoEngine {
             #[cfg(target_os = "linux")]
             IoEngine::TokioEpollUring => {
                 let system = tokio_epoll_uring_ext::thread_local_system().await;
-                let (resources, res) = system.statx(file_guard).await;
+                let (resources, res) = retry_ecanceled_once(file_guard, |file_guard| async {
+                    system.statx(file_guard).await
+                })
+                .await;
                 (
                     resources,
                     res.map_err(epoll_uring_error_to_std).map(Metadata::from),
@@ -224,6 +240,7 @@ impl IoEngine {
             #[cfg(target_os = "linux")]
             IoEngine::TokioEpollUring => {
                 // TODO: ftruncate op for tokio-epoll-uring
+                // Don't forget to use retry_ecanceled_once
                 let res = file_guard.with_std_file(|std_file| std_file.set_len(len));
                 (file_guard, res)
             }
@@ -245,8 +262,11 @@ impl IoEngine {
             #[cfg(target_os = "linux")]
             IoEngine::TokioEpollUring => {
                 let system = tokio_epoll_uring_ext::thread_local_system().await;
-                let ((file_guard, slice), res) =
-                    system.write(file_guard, offset, buf.into_raw_slice()).await;
+                let ((file_guard, slice), res) = retry_ecanceled_once(
+                    (file_guard, buf.into_raw_slice()),
+                    async |(file_guard, buf)| system.write(file_guard, offset, buf).await,
+                )
+                .await;
                 (
                     (file_guard, FullSlice::must_new(slice)),
                     res.map_err(epoll_uring_error_to_std),
@@ -282,6 +302,55 @@ impl IoEngine {
     }
 }
 
+/// We observe in tests that stop pageserver with SIGTERM immediately after it was ingesting data,
+/// occasionally buffered writers fail (and get retried by BufferedWriter) with ECANCELED.
+/// The problem is believed to be a race condition in how io_uring handles punted async work (io-wq) and signals.
+/// Investigation ticket: <https://github.com/neondatabase/neon/issues/11446>
+///
+/// This function retries the operation once if it fails with ECANCELED.
+/// ONLY USE FOR IDEMPOTENT [`super::VirtualFile`] operations.
+pub(super) async fn retry_ecanceled_once<F, Fut, T, V>(
+    resources: T,
+    f: F,
+) -> (T, Result<V, tokio_epoll_uring::Error<std::io::Error>>)
+where
+    F: Fn(T) -> Fut,
+    Fut: std::future::Future<Output = (T, Result<V, tokio_epoll_uring::Error<std::io::Error>>)>,
+    T: Send,
+    V: Send,
+{
+    let (resources, res) = f(resources).await;
+    let Err(e) = res else {
+        return (resources, res);
+    };
+    let tokio_epoll_uring::Error::Op(err) = e else {
+        return (resources, Err(e));
+    };
+    if err.raw_os_error() != Some(nix::libc::ECANCELED) {
+        return (resources, Err(tokio_epoll_uring::Error::Op(err)));
+    }
+    {
+        static RATE_LIMIT: std::sync::Mutex<utils::rate_limit::RateLimit> =
+            std::sync::Mutex::new(utils::rate_limit::RateLimit::new(Duration::from_secs(1)));
+        let mut guard = RATE_LIMIT.lock().unwrap();
+        guard.call2(|rate_limit_stats| {
+            info!(
+                %rate_limit_stats, "ECANCELED observed, assuming it is due to a signal being received by the submitting thread, retrying after a delay; this message is rate-limited"
+            );
+        });
+        drop(guard);
+    }
+    tokio::time::sleep(Duration::from_millis(100)).await; // something big enough to beat even heavily overcommitted CI runners
+    let (resources, res) = f(resources).await;
+    (resources, res)
+}
+
+pub(super) fn panic_operation_must_be_idempotent() {
+    panic!(
+        "unsupported; io_engine may retry operations internally and thus needs them to be idempotent (retry_ecanceled_once)"
+    )
+}
+
 pub enum FeatureTestResult {
     PlatformPreferred(IoEngineKind),
     Worse {
diff --git a/pageserver/src/virtual_file/open_options.rs b/pageserver/src/virtual_file/open_options.rs
index 2a7bb693f2..a40dfed4a4 100644
--- a/pageserver/src/virtual_file/open_options.rs
+++ b/pageserver/src/virtual_file/open_options.rs
@@ -110,18 +110,23 @@ impl OpenOptions {
         self
     }
 
+    /// Don't use, `O_APPEND` is not supported.
+    pub fn append(&mut self, _append: bool) {
+        super::io_engine::panic_operation_must_be_idempotent();
+    }
+
     pub(in crate::virtual_file) async fn open(&self, path: &Path) -> std::io::Result<OwnedFd> {
         match &self.inner {
             Inner::StdFs(x) => x.open(path).map(|file| file.into()),
             #[cfg(target_os = "linux")]
             Inner::TokioEpollUring(x) => {
                 let system = super::io_engine::tokio_epoll_uring_ext::thread_local_system().await;
-                system.open(path, x).await.map_err(|e| match e {
-                    tokio_epoll_uring::Error::Op(e) => e,
-                    tokio_epoll_uring::Error::System(system) => {
-                        std::io::Error::new(std::io::ErrorKind::Other, system)
-                    }
+                let (_, res) = super::io_engine::retry_ecanceled_once((), |()| async {
+                    let res = system.open(path, x).await;
+                    ((), res)
                 })
+                .await;
+                res.map_err(super::io_engine::epoll_uring_error_to_std)
             }
         }
     }
@@ -140,6 +145,9 @@ impl OpenOptions {
     }
 
     pub fn custom_flags(mut self, flags: i32) -> Self {
+        if flags & nix::libc::O_APPEND != 0 {
+            super::io_engine::panic_operation_must_be_idempotent();
+        }
         match &mut self.inner {
             Inner::StdFs(x) => {
                 let _ = x.custom_flags(flags);

From 1d1502bc167a2d0372756650581b4666597120c8 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 8 May 2025 08:57:53 +0200
Subject: [PATCH 04/65] fix(pageserver): `flush task cancelled` errors during
 timeline shutdown (#11853)

# Refs
- fixes https://github.com/neondatabase/neon/issues/11762

# Problem

PR #10993 introduced internal retries for BufferedWriter flushes.
PR #11052 added cancellation sensitivity to that retry loop.
That cancellation sensitivity is an error path that didn't exist before.

The result is that during timeline shutdown, after we
`Timeline::cancel`, compaction can now fail with error `flush task
cancelled`.
The problem with that:
1. We mis-classify this as an `error!`-worthy event.
2. This causes tests to become flaky because the error is not in global
`allowed_errors`.

Technically we also trip the `compaction_circuit_breaker` because the
resulting `CompactionError` is variant `::Other`.
But since this is Timeline shutdown, it doesn't matter practically
speaking.

# Solution / Changes

- Log the anyhow stack trace when classifying a compaction error as
`error!`.
  This was helpful to identify sources of `flush task cancelled` errors.
We only log at `error!` level in exceptional circumstances, so, it's ok
to have slightly verbose logs.
- Introduce typed errors along the `BufferedWriter::write_*`=>
`BlobWriter::write_blob`
=> `{Delta,Image}LayerWriter::put_*` =>
`Split{Delta,Image}LayerWriter::put_{value,image}` chain.
- Proper mapping to `CompactionError`/`CreateImageLayersError` via new
`From` impls.

I am usually opposed to any magic `From` impls, but, it's how most of
the compaction code
works today.

# Testing

The symptoms are most prevalent in
`test_runner/regress/test_branch_and_gc.py::test_branch_and_gc`.
Before this PR, I was able to reproduce locally 1 or 2 times per 400
runs using
`DEFAULT_PG_VERSION=15 BUILD_TYPE=release poetry run pytest --count 400
-n 8`.
After this PR, it doesn't reproduce anymore after 2000 runs.

# Future Work

Technically the ingest path is also exposed to this new source of errors
because `InMemoryLayer` is backed by `BufferedWriter`.
But we haven't seen it occur in flaky tests yet.
Details and a fix in
- https://github.com/neondatabase/neon/pull/11851
---
 pageserver/src/tenant/blob_io.rs              | 27 ++++++++++++++-----
 pageserver/src/tenant/storage_layer.rs        |  1 +
 .../storage_layer/batch_split_writer.rs       | 18 ++++++++-----
 .../src/tenant/storage_layer/delta_layer.rs   | 25 +++++++++++------
 pageserver/src/tenant/storage_layer/errors.rs | 24 +++++++++++++++++
 .../src/tenant/storage_layer/image_layer.rs   | 20 ++++++++++----
 pageserver/src/tenant/tasks.rs                |  2 +-
 pageserver/src/tenant/timeline.rs             | 20 ++++++++++++++
 pageserver/src/tenant/timeline/compaction.rs  |  3 +--
 .../owned_buffers_io/write/flush.rs           | 13 +++++++++
 10 files changed, 124 insertions(+), 29 deletions(-)
 create mode 100644 pageserver/src/tenant/storage_layer/errors.rs

diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs
index 8cf3c548c9..ed541c4f12 100644
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -94,10 +94,23 @@ impl Header {
 pub enum WriteBlobError {
     #[error(transparent)]
     Flush(FlushTaskError),
-    #[error("blob too large ({len} bytes)")]
-    BlobTooLarge { len: usize },
     #[error(transparent)]
-    WriteBlobRaw(anyhow::Error),
+    Other(anyhow::Error),
+}
+
+impl WriteBlobError {
+    pub fn is_cancel(&self) -> bool {
+        match self {
+            WriteBlobError::Flush(e) => e.is_cancel(),
+            WriteBlobError::Other(_) => false,
+        }
+    }
+    pub fn into_anyhow(self) -> anyhow::Error {
+        match self {
+            WriteBlobError::Flush(e) => e.into_anyhow(),
+            WriteBlobError::Other(e) => e,
+        }
+    }
 }
 
 impl BlockCursor<'_> {
@@ -327,7 +340,9 @@ where
                     return (
                         (
                             io_buf.slice_len(),
-                            Err(WriteBlobError::BlobTooLarge { len }),
+                            Err(WriteBlobError::Other(anyhow::anyhow!(
+                                "blob too large ({len} bytes)"
+                            ))),
                         ),
                         srcbuf,
                     );
@@ -391,7 +406,7 @@ where
         // Verify the header, to ensure we don't write invalid/corrupt data.
         let header = match Header::decode(&raw_with_header)
             .context("decoding blob header")
-            .map_err(WriteBlobError::WriteBlobRaw)
+            .map_err(WriteBlobError::Other)
         {
             Ok(header) => header,
             Err(err) => return (raw_with_header, Err(err)),
@@ -401,7 +416,7 @@ where
             let raw_len = raw_with_header.len();
             return (
                 raw_with_header,
-                Err(WriteBlobError::WriteBlobRaw(anyhow::anyhow!(
+                Err(WriteBlobError::Other(anyhow::anyhow!(
                     "header length mismatch: {header_total_len} != {raw_len}"
                 ))),
             );
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index 796ad01e54..5dfa961b71 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -2,6 +2,7 @@
 
 pub mod batch_split_writer;
 pub mod delta_layer;
+pub mod errors;
 pub mod filter_iterator;
 pub mod image_layer;
 pub mod inmemory_layer;
diff --git a/pageserver/src/tenant/storage_layer/batch_split_writer.rs b/pageserver/src/tenant/storage_layer/batch_split_writer.rs
index 39cd02d101..51f2e909a2 100644
--- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs
@@ -10,6 +10,7 @@ use utils::id::TimelineId;
 use utils::lsn::Lsn;
 use utils::shard::TenantShardId;
 
+use super::errors::PutError;
 use super::layer::S3_UPLOAD_LIMIT;
 use super::{
     DeltaLayerWriter, ImageLayerWriter, PersistentLayerDesc, PersistentLayerKey, ResidentLayer,
@@ -235,7 +236,7 @@ impl<'a> SplitImageLayerWriter<'a> {
         key: Key,
         img: Bytes,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), PutError> {
         // The current estimation is an upper bound of the space that the key/image could take
         // because we did not consider compression in this estimation. The resulting image layer
         // could be smaller than the target size.
@@ -253,7 +254,8 @@ impl<'a> SplitImageLayerWriter<'a> {
                 self.cancel.clone(),
                 ctx,
             )
-            .await?;
+            .await
+            .map_err(PutError::Other)?;
             let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer);
             self.batches.add_unfinished_image_writer(
                 prev_image_writer,
@@ -346,7 +348,7 @@ impl<'a> SplitDeltaLayerWriter<'a> {
         lsn: Lsn,
         val: Value,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), PutError> {
         // The current estimation is key size plus LSN size plus value size estimation. This is not an accurate
         // number, and therefore the final layer size could be a little bit larger or smaller than the target.
         //
@@ -366,7 +368,8 @@ impl<'a> SplitDeltaLayerWriter<'a> {
                     self.cancel.clone(),
                     ctx,
                 )
-                .await?,
+                .await
+                .map_err(PutError::Other)?,
             ));
         }
         let (_, inner) = self.inner.as_mut().unwrap();
@@ -386,7 +389,8 @@ impl<'a> SplitDeltaLayerWriter<'a> {
                     self.cancel.clone(),
                     ctx,
                 )
-                .await?;
+                .await
+                .map_err(PutError::Other)?;
                 let (start_key, prev_delta_writer) =
                     self.inner.replace((key, next_delta_writer)).unwrap();
                 self.batches.add_unfinished_delta_writer(
@@ -396,11 +400,11 @@ impl<'a> SplitDeltaLayerWriter<'a> {
                 );
             } else if inner.estimated_size() >= S3_UPLOAD_LIMIT {
                 // We have to produce a very large file b/c a key is updated too often.
-                anyhow::bail!(
+                return Err(PutError::Other(anyhow::anyhow!(
                     "a single key is updated too often: key={}, estimated_size={}, and the layer file cannot be produced",
                     key,
                     inner.estimated_size()
-                );
+                )));
             }
         }
         self.last_key_written = key;
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index 11875ac653..2c1b27c8d5 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -55,6 +55,7 @@ use utils::bin_ser::SerializeError;
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;
 
+use super::errors::PutError;
 use super::{
     AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer,
     ValuesReconstructState,
@@ -477,12 +478,15 @@ impl DeltaLayerWriterInner {
         lsn: Lsn,
         val: Value,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), PutError> {
         let (_, res) = self
             .put_value_bytes(
                 key,
                 lsn,
-                Value::ser(&val)?.slice_len(),
+                Value::ser(&val)
+                    .map_err(anyhow::Error::new)
+                    .map_err(PutError::Other)?
+                    .slice_len(),
                 val.will_init(),
                 ctx,
             )
@@ -497,7 +501,7 @@ impl DeltaLayerWriterInner {
         val: FullSlice<Buf>,
         will_init: bool,
         ctx: &RequestContext,
-    ) -> (FullSlice<Buf>, anyhow::Result<()>)
+    ) -> (FullSlice<Buf>, Result<(), PutError>)
     where
         Buf: IoBuf + Send,
     {
@@ -513,19 +517,24 @@ impl DeltaLayerWriterInner {
             .blob_writer
             .write_blob_maybe_compressed(val, ctx, compression)
             .await;
+        let res = res.map_err(PutError::WriteBlob);
         let off = match res {
             Ok((off, _)) => off,
-            Err(e) => return (val, Err(anyhow::anyhow!(e))),
+            Err(e) => return (val, Err(e)),
         };
 
         let blob_ref = BlobRef::new(off, will_init);
 
         let delta_key = DeltaKey::from_key_lsn(&key, lsn);
-        let res = self.tree.append(&delta_key.0, blob_ref.0);
+        let res = self
+            .tree
+            .append(&delta_key.0, blob_ref.0)
+            .map_err(anyhow::Error::new)
+            .map_err(PutError::Other);
 
         self.num_keys += 1;
 
-        (val, res.map_err(|e| anyhow::anyhow!(e)))
+        (val, res)
     }
 
     fn size(&self) -> u64 {
@@ -694,7 +703,7 @@ impl DeltaLayerWriter {
         lsn: Lsn,
         val: Value,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), PutError> {
         self.inner
             .as_mut()
             .unwrap()
@@ -709,7 +718,7 @@ impl DeltaLayerWriter {
         val: FullSlice<Buf>,
         will_init: bool,
         ctx: &RequestContext,
-    ) -> (FullSlice<Buf>, anyhow::Result<()>)
+    ) -> (FullSlice<Buf>, Result<(), PutError>)
     where
         Buf: IoBuf + Send,
     {
diff --git a/pageserver/src/tenant/storage_layer/errors.rs b/pageserver/src/tenant/storage_layer/errors.rs
new file mode 100644
index 0000000000..591e489faa
--- /dev/null
+++ b/pageserver/src/tenant/storage_layer/errors.rs
@@ -0,0 +1,24 @@
+use crate::tenant::blob_io::WriteBlobError;
+
+#[derive(Debug, thiserror::Error)]
+pub enum PutError {
+    #[error(transparent)]
+    WriteBlob(WriteBlobError),
+    #[error(transparent)]
+    Other(anyhow::Error),
+}
+
+impl PutError {
+    pub fn is_cancel(&self) -> bool {
+        match self {
+            PutError::WriteBlob(e) => e.is_cancel(),
+            PutError::Other(_) => false,
+        }
+    }
+    pub fn into_anyhow(self) -> anyhow::Error {
+        match self {
+            PutError::WriteBlob(e) => e.into_anyhow(),
+            PutError::Other(e) => e,
+        }
+    }
+}
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index d684230572..740f53f928 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -53,6 +53,7 @@ use utils::bin_ser::SerializeError;
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;
 
+use super::errors::PutError;
 use super::layer_name::ImageLayerName;
 use super::{
     AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer,
@@ -842,8 +843,14 @@ impl ImageLayerWriterInner {
         key: Key,
         img: Bytes,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        ensure!(self.key_range.contains(&key));
+    ) -> Result<(), PutError> {
+        if !self.key_range.contains(&key) {
+            return Err(PutError::Other(anyhow::anyhow!(
+                "key {:?} not in range {:?}",
+                key,
+                self.key_range
+            )));
+        }
         let compression = self.conf.image_compression;
         let uncompressed_len = img.len() as u64;
         self.uncompressed_bytes += uncompressed_len;
@@ -853,7 +860,7 @@ impl ImageLayerWriterInner {
             .write_blob_maybe_compressed(img.slice_len(), ctx, compression)
             .await;
         // TODO: re-use the buffer for `img` further upstack
-        let (off, compression_info) = res?;
+        let (off, compression_info) = res.map_err(PutError::WriteBlob)?;
         if compression_info.compressed_size.is_some() {
             // The image has been considered for compression at least
             self.uncompressed_bytes_eligible += uncompressed_len;
@@ -865,7 +872,10 @@ impl ImageLayerWriterInner {
 
         let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
         key.write_to_byte_slice(&mut keybuf);
-        self.tree.append(&keybuf, off)?;
+        self.tree
+            .append(&keybuf, off)
+            .map_err(anyhow::Error::new)
+            .map_err(PutError::Other)?;
 
         #[cfg(feature = "testing")]
         {
@@ -1085,7 +1095,7 @@ impl ImageLayerWriter {
         key: Key,
         img: Bytes,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), PutError> {
         self.inner.as_mut().unwrap().put_image(key, img, ctx).await
     }
 
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index 1112a5330b..4709a6d616 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -340,7 +340,7 @@ pub(crate) fn log_compaction_error(
     } else {
         match level {
             Level::ERROR if degrade_to_warning => warn!("Compaction failed and discarded: {err:#}"),
-            Level::ERROR => error!("Compaction failed: {err:#}"),
+            Level::ERROR => error!("Compaction failed: {err:?}"),
             Level::INFO => info!("Compaction failed: {err:#}"),
             level => unimplemented!("unexpected level {level:?}"),
         }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index cfeab77598..c8d897d074 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -987,6 +987,16 @@ impl From<PageReconstructError> for CreateImageLayersError {
     }
 }
 
+impl From<super::storage_layer::errors::PutError> for CreateImageLayersError {
+    fn from(e: super::storage_layer::errors::PutError) -> Self {
+        if e.is_cancel() {
+            CreateImageLayersError::Cancelled
+        } else {
+            CreateImageLayersError::Other(e.into_anyhow())
+        }
+    }
+}
+
 impl From<GetVectoredError> for CreateImageLayersError {
     fn from(e: GetVectoredError) -> Self {
         match e {
@@ -5923,6 +5933,16 @@ impl From<layer_manager::Shutdown> for CompactionError {
     }
 }
 
+impl From<super::storage_layer::errors::PutError> for CompactionError {
+    fn from(e: super::storage_layer::errors::PutError) -> Self {
+        if e.is_cancel() {
+            CompactionError::ShuttingDown
+        } else {
+            CompactionError::Other(e.into_anyhow())
+        }
+    }
+}
+
 #[serde_as]
 #[derive(serde::Serialize)]
 struct RecordedDuration(#[serde_as(as = "serde_with::DurationMicroSeconds")] Duration);
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index d0c13d86ce..07cd274a41 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -2204,8 +2204,7 @@ impl Timeline {
                     .as_mut()
                     .unwrap()
                     .put_value(key, lsn, value, ctx)
-                    .await
-                    .map_err(CompactionError::Other)?;
+                    .await?;
             } else {
                 let owner = self.shard_identity.get_shard_number(&key);
 
diff --git a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs
index b41a9f6cd2..ac9867e8b4 100644
--- a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs
@@ -247,6 +247,19 @@ pub enum FlushTaskError {
     Cancelled,
 }
 
+impl FlushTaskError {
+    pub fn is_cancel(&self) -> bool {
+        match self {
+            FlushTaskError::Cancelled => true,
+        }
+    }
+    pub fn into_anyhow(self) -> anyhow::Error {
+        match self {
+            FlushTaskError::Cancelled => anyhow::anyhow!(self),
+        }
+    }
+}
+
 impl<Buf, W> FlushBackgroundTask<Buf, W>
 where
     Buf: IoBufAligned + Send + Sync,

From 40f32ea326ac9f8b691f179d0ced414470eb06ff Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Thu, 8 May 2025 10:19:14 +0100
Subject: [PATCH 05/65] pageserver: refactor import flow and add job
 concurrency limiting (#11816)

## Problem

The import code is one big block. Separating planning and execution will
help with reporting import progress to storcon (a building block for
resuming imports).

## Summary of changes

Split up the import into planning and execution.
A concurrency limit driven by PS config is also added.
---
 libs/pageserver_api/src/config.rs             |  11 +
 pageserver/src/config.rs                      |   4 +
 .../src/tenant/timeline/import_pgdata.rs      |   9 +-
 .../src/tenant/timeline/import_pgdata/flow.rs | 195 ++++++++++--------
 4 files changed, 129 insertions(+), 90 deletions(-)

diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index b64c42a808..5b0c13dd89 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -182,6 +182,7 @@ pub struct ConfigToml {
     pub tracing: Option<Tracing>,
     pub enable_tls_page_service_api: bool,
     pub dev_mode: bool,
+    pub timeline_import_config: TimelineImportConfig,
 }
 
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -300,6 +301,12 @@ impl From<OtelExporterProtocol> for tracing_utils::Protocol {
     }
 }
 
+#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+pub struct TimelineImportConfig {
+    pub import_job_concurrency: NonZeroUsize,
+    pub import_job_soft_size_limit: NonZeroUsize,
+}
+
 pub mod statvfs {
     pub mod mock {
         #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -659,6 +666,10 @@ impl Default for ConfigToml {
             tracing: None,
             enable_tls_page_service_api: false,
             dev_mode: false,
+            timeline_import_config: TimelineImportConfig {
+                import_job_concurrency: NonZeroUsize::new(128).unwrap(),
+                import_job_soft_size_limit: NonZeroUsize::new(1024 * 1024 * 1024).unwrap(),
+            },
         }
     }
 }
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index ded2805602..7e773f56b3 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -230,6 +230,8 @@ pub struct PageServerConf {
     /// such as authentication requirements for HTTP and PostgreSQL APIs.
     /// This is insecure and should only be used in development environments.
     pub dev_mode: bool,
+
+    pub timeline_import_config: pageserver_api::config::TimelineImportConfig,
 }
 
 /// Token for authentication to safekeepers
@@ -404,6 +406,7 @@ impl PageServerConf {
             tracing,
             enable_tls_page_service_api,
             dev_mode,
+            timeline_import_config,
         } = config_toml;
 
         let mut conf = PageServerConf {
@@ -457,6 +460,7 @@ impl PageServerConf {
             tracing,
             enable_tls_page_service_api,
             dev_mode,
+            timeline_import_config,
 
             // ------------------------------------------------------------
             // fields that require additional validation or custom handling
diff --git a/pageserver/src/tenant/timeline/import_pgdata.rs b/pageserver/src/tenant/timeline/import_pgdata.rs
index 6ab6b90cb6..c4a8df39a3 100644
--- a/pageserver/src/tenant/timeline/import_pgdata.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata.rs
@@ -149,14 +149,7 @@ pub async fn doit(
         }
         .await?;
 
-        flow::run(
-            timeline.clone(),
-            base_lsn,
-            control_file,
-            storage.clone(),
-            ctx,
-        )
-        .await?;
+        flow::run(timeline.clone(), control_file, storage.clone(), ctx).await?;
 
         //
         // Communicate that shard is done.
diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs
index c6d2944769..34c073365d 100644
--- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs
@@ -34,7 +34,9 @@ use std::sync::Arc;
 
 use anyhow::{bail, ensure};
 use bytes::Bytes;
+use futures::stream::FuturesOrdered;
 use itertools::Itertools;
+use pageserver_api::config::TimelineImportConfig;
 use pageserver_api::key::{
     CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, Key, TWOPHASEDIR_KEY, rel_block_to_key,
     rel_dir_to_key, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
@@ -46,8 +48,9 @@ use pageserver_api::shard::ShardIdentity;
 use postgres_ffi::relfile_utils::parse_relfilename;
 use postgres_ffi::{BLCKSZ, pg_constants};
 use remote_storage::RemotePath;
-use tokio::task::JoinSet;
-use tracing::{Instrument, debug, info_span, instrument};
+use tokio::sync::Semaphore;
+use tokio_stream::StreamExt;
+use tracing::{debug, instrument};
 use utils::bin_ser::BeSer;
 use utils::lsn::Lsn;
 
@@ -63,37 +66,39 @@ use crate::tenant::storage_layer::{ImageLayerWriter, Layer};
 
 pub async fn run(
     timeline: Arc<Timeline>,
-    pgdata_lsn: Lsn,
     control_file: ControlFile,
     storage: RemoteStorageWrapper,
     ctx: &RequestContext,
 ) -> anyhow::Result<()> {
-    Flow {
-        timeline,
-        pgdata_lsn,
+    let planner = Planner {
         control_file,
-        tasks: Vec::new(),
-        storage,
-    }
-    .run(ctx)
-    .await
+        storage: storage.clone(),
+        shard: timeline.shard_identity,
+        tasks: Vec::default(),
+    };
+
+    let import_config = &timeline.conf.timeline_import_config;
+    let plan = planner.plan(import_config).await?;
+    plan.execute(timeline, import_config, ctx).await
 }
 
-struct Flow {
-    timeline: Arc<Timeline>,
-    pgdata_lsn: Lsn,
+struct Planner {
     control_file: ControlFile,
-    tasks: Vec<AnyImportTask>,
     storage: RemoteStorageWrapper,
+    shard: ShardIdentity,
+    tasks: Vec<AnyImportTask>,
 }
 
-impl Flow {
-    /// Perform the ingestion into [`Self::timeline`].
-    /// Assumes the timeline is empty (= no layers).
-    pub async fn run(mut self, ctx: &RequestContext) -> anyhow::Result<()> {
-        let pgdata_lsn = Lsn(self.control_file.control_file_data().checkPoint).align();
+struct Plan {
+    jobs: Vec<ChunkProcessingJob>,
+}
 
-        self.pgdata_lsn = pgdata_lsn;
+impl Planner {
+    /// Creates an import plan
+    ///
+    /// This function is and must remain pure: given the same input, it will generate the same import plan.
+    async fn plan(mut self, import_config: &TimelineImportConfig) -> anyhow::Result<Plan> {
+        let pgdata_lsn = Lsn(self.control_file.control_file_data().checkPoint).align();
 
         let datadir = PgDataDir::new(&self.storage).await?;
 
@@ -115,7 +120,7 @@ impl Flow {
         }
 
         // Import SLRUs
-        if self.timeline.tenant_shard_id.is_shard_zero() {
+        if self.shard.is_shard_zero() {
             // pg_xact (01:00 keyspace)
             self.import_slru(SlruKind::Clog, &self.storage.pgdata().join("pg_xact"))
                 .await?;
@@ -166,14 +171,16 @@ impl Flow {
         let mut last_end_key = Key::MIN;
         let mut current_chunk = Vec::new();
         let mut current_chunk_size: usize = 0;
-        let mut parallel_jobs = Vec::new();
+        let mut jobs = Vec::new();
         for task in std::mem::take(&mut self.tasks).into_iter() {
-            if current_chunk_size + task.total_size() > 1024 * 1024 * 1024 {
+            if current_chunk_size + task.total_size()
+                > import_config.import_job_soft_size_limit.into()
+            {
                 let key_range = last_end_key..task.key_range().start;
-                parallel_jobs.push(ChunkProcessingJob::new(
+                jobs.push(ChunkProcessingJob::new(
                     key_range.clone(),
                     std::mem::take(&mut current_chunk),
-                    &self,
+                    pgdata_lsn,
                 ));
                 last_end_key = key_range.end;
                 current_chunk_size = 0;
@@ -181,45 +188,13 @@ impl Flow {
             current_chunk_size += task.total_size();
             current_chunk.push(task);
         }
-        parallel_jobs.push(ChunkProcessingJob::new(
+        jobs.push(ChunkProcessingJob::new(
             last_end_key..Key::MAX,
             current_chunk,
-            &self,
+            pgdata_lsn,
         ));
 
-        // Start all jobs simultaneosly
-        let mut work = JoinSet::new();
-        // TODO: semaphore?
-        for job in parallel_jobs {
-            let ctx: RequestContext =
-                ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Error);
-            work.spawn(async move { job.run(&ctx).await }.instrument(info_span!("parallel_job")));
-        }
-        let mut results = Vec::new();
-        while let Some(result) = work.join_next().await {
-            match result {
-                Ok(res) => {
-                    results.push(res);
-                }
-                Err(_joinset_err) => {
-                    results.push(Err(anyhow::anyhow!(
-                        "parallel job panicked or cancelled, check pageserver logs"
-                    )));
-                }
-            }
-        }
-
-        if results.iter().all(|r| r.is_ok()) {
-            Ok(())
-        } else {
-            let mut msg = String::new();
-            for result in results {
-                if let Err(err) = result {
-                    msg.push_str(&format!("{err:?}\n\n"));
-                }
-            }
-            bail!("Some parallel jobs failed:\n\n{msg}");
-        }
+        Ok(Plan { jobs })
     }
 
     #[instrument(level = tracing::Level::DEBUG, skip_all, fields(dboid=%db.dboid, tablespace=%db.spcnode, path=%db.path))]
@@ -266,7 +241,7 @@ impl Flow {
             let end_key = rel_block_to_key(file.rel_tag, start_blk + (len / 8192) as u32);
             self.tasks
                 .push(AnyImportTask::RelBlocks(ImportRelBlocksTask::new(
-                    *self.timeline.get_shard_identity(),
+                    self.shard,
                     start_key..end_key,
                     &file.path,
                     self.storage.clone(),
@@ -289,7 +264,7 @@ impl Flow {
     }
 
     async fn import_slru(&mut self, kind: SlruKind, path: &RemotePath) -> anyhow::Result<()> {
-        assert!(self.timeline.tenant_shard_id.is_shard_zero());
+        assert!(self.shard.is_shard_zero());
 
         let segments = self.storage.listfilesindir(path).await?;
         let segments: Vec<(String, u32, usize)> = segments
@@ -344,6 +319,68 @@ impl Flow {
     }
 }
 
+impl Plan {
+    async fn execute(
+        self,
+        timeline: Arc<Timeline>,
+        import_config: &TimelineImportConfig,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        let mut work = FuturesOrdered::new();
+        let semaphore = Arc::new(Semaphore::new(import_config.import_job_concurrency.into()));
+
+        let jobs_in_plan = self.jobs.len();
+
+        let mut jobs = self.jobs.into_iter().enumerate().peekable();
+        let mut results = Vec::new();
+
+        // Run import jobs concurrently up to the limit specified by the pageserver configuration.
+        // Note that we process completed futures in the order of insertion. This will be the
+        // building block for resuming imports across pageserver restarts or tenant migrations.
+        while results.len() < jobs_in_plan {
+            tokio::select! {
+                permit = semaphore.clone().acquire_owned(), if jobs.peek().is_some() => {
+                    let permit = permit.expect("never closed");
+                    let (job_idx, job) = jobs.next().expect("we peeked");
+                    let job_timeline = timeline.clone();
+                    let ctx = ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Error);
+
+                    work.push_back(tokio::task::spawn(async move {
+                        let _permit = permit;
+                        let res = job.run(job_timeline, &ctx).await;
+                        (job_idx, res)
+                    }));
+                },
+                maybe_complete_job_idx = work.next() => {
+                    match maybe_complete_job_idx {
+                        Some(Ok((_job_idx, res))) => {
+                            results.push(res);
+                        },
+                        Some(Err(_)) => {
+                            results.push(Err(anyhow::anyhow!(
+                                "parallel job panicked or cancelled, check pageserver logs"
+                            )));
+                        }
+                        None => {}
+                    }
+                }
+            }
+        }
+
+        if results.iter().all(|r| r.is_ok()) {
+            Ok(())
+        } else {
+            let mut msg = String::new();
+            for result in results {
+                if let Err(err) = result {
+                    msg.push_str(&format!("{err:?}\n\n"));
+                }
+            }
+            bail!("Some parallel jobs failed:\n\n{msg}");
+        }
+    }
+}
+
 //
 // dbdir iteration tools
 //
@@ -713,7 +750,6 @@ impl From<ImportSlruBlocksTask> for AnyImportTask {
 }
 
 struct ChunkProcessingJob {
-    timeline: Arc<Timeline>,
     range: Range<Key>,
     tasks: Vec<AnyImportTask>,
 
@@ -721,25 +757,24 @@ struct ChunkProcessingJob {
 }
 
 impl ChunkProcessingJob {
-    fn new(range: Range<Key>, tasks: Vec<AnyImportTask>, env: &Flow) -> Self {
-        assert!(env.pgdata_lsn.is_valid());
+    fn new(range: Range<Key>, tasks: Vec<AnyImportTask>, pgdata_lsn: Lsn) -> Self {
+        assert!(pgdata_lsn.is_valid());
         Self {
-            timeline: env.timeline.clone(),
             range,
             tasks,
-            pgdata_lsn: env.pgdata_lsn,
+            pgdata_lsn,
         }
     }
 
-    async fn run(self, ctx: &RequestContext) -> anyhow::Result<()> {
+    async fn run(self, timeline: Arc<Timeline>, ctx: &RequestContext) -> anyhow::Result<()> {
         let mut writer = ImageLayerWriter::new(
-            self.timeline.conf,
-            self.timeline.timeline_id,
-            self.timeline.tenant_shard_id,
+            timeline.conf,
+            timeline.timeline_id,
+            timeline.tenant_shard_id,
             &self.range,
             self.pgdata_lsn,
-            &self.timeline.gate,
-            self.timeline.cancel.clone(),
+            &timeline.gate,
+            timeline.cancel.clone(),
             ctx,
         )
         .await?;
@@ -751,24 +786,20 @@ impl ChunkProcessingJob {
 
         let resident_layer = if nimages > 0 {
             let (desc, path) = writer.finish(ctx).await?;
-            Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?
+            Layer::finish_creating(timeline.conf, &timeline, desc, &path)?
         } else {
             // dropping the writer cleans up
             return Ok(());
         };
 
         // this is sharing the same code as create_image_layers
-        let mut guard = self.timeline.layers.write().await;
+        let mut guard = timeline.layers.write().await;
         guard
             .open_mut()?
-            .track_new_image_layers(&[resident_layer.clone()], &self.timeline.metrics);
+            .track_new_image_layers(&[resident_layer.clone()], &timeline.metrics);
         crate::tenant::timeline::drop_wlock(guard);
 
-        // Schedule the layer for upload but don't add barriers such as
-        // wait for completion or index upload, so we don't inhibit upload parallelism.
-        // TODO: limit upload parallelism somehow (e.g. by limiting concurrency of jobs?)
-        // TODO: or regulate parallelism by upload queue depth? Prob should happen at a higher level.
-        self.timeline
+        timeline
             .remote_client
             .schedule_layer_file_upload(resident_layer)?;
 

From 7e55497e131f2f26a16ae22bff80cac11951cdd4 Mon Sep 17 00:00:00 2001
From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com>
Date: Thu, 8 May 2025 14:00:45 +0400
Subject: [PATCH 06/65] tests: flush wal before waiting for last record lsn
 (#11726)

## Problem
Compute may flush WAL on page boundaries, leaving some records partially
flushed for a long time.
As a result, `wait_for_last_flush_lsn` gets stuck waiting for this
partial LSN.
- Closes: https://github.com/neondatabase/cloud/issues/27876

## Summary of changes
- Flush WAL via CHECKPOINT after requesting `pg_current_wal_flush_lsn()`
to make sure that the record we point to is flushed in full
- Use the proper endpoint (`endpoint2`) in
`test_timeline_detach_with_aux_files_with_detach_v1`
---
 test_runner/fixtures/neon_fixtures.py                | 7 +++++++
 test_runner/regress/test_timeline_detach_ancestor.py | 4 ++--
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 85ad49bb4f..370eca5130 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -5477,6 +5477,13 @@ def wait_for_last_flush_lsn(
 
     if last_flush_lsn is None:
         last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
+        # The last_flush_lsn may not correspond to a record boundary.
+        # For example, if the compute flushed WAL on a page boundary,
+        # the remaining part of the record might not be flushed for a long time.
+        # This would prevent the pageserver from reaching last_flush_lsn promptly.
+        # To ensure the rest of the record reaches the pageserver quickly,
+        # we forcibly flush the WAL by using CHECKPOINT.
+        endpoint.safe_psql("CHECKPOINT")
 
     results = []
     for tenant_shard_id, pageserver in shards:
diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py
index a71652af8a..d42c5d403e 100644
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -1822,7 +1822,7 @@ def test_timeline_detach_with_aux_files_with_detach_v1(
     endpoint2.safe_psql(
         "SELECT pg_create_logical_replication_slot('test_slot_restore', 'pgoutput')"
     )
-    lsn3 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, branch_timeline_id)
+    lsn3 = wait_for_last_flush_lsn(env, endpoint2, env.initial_tenant, branch_timeline_id)
     assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn1).keys()) == set([])
     assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn3).keys()) == set(
         ["pg_replslot/test_slot_restore/state"]
@@ -1839,7 +1839,7 @@ def test_timeline_detach_with_aux_files_with_detach_v1(
     assert all_reparented == set([])
 
     # We need to ensure all safekeeper data are ingested before checking aux files: the API does not wait for LSN.
-    wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, branch_timeline_id)
+    wait_for_last_flush_lsn(env, endpoint2, env.initial_tenant, branch_timeline_id)
     assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn2).keys()) == set(
         ["pg_replslot/test_slot_parent_1/state", "pg_replslot/test_slot_parent_2/state"]
     ), "main branch unaffected"

From 6c70789cfdf145ae4ca73228884ca1359b80c302 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Thu, 8 May 2025 12:14:41 +0200
Subject: [PATCH 07/65] storcon: increase drain+fill secondary warmup timeout
 from 20 to 30 seconds (#11848)

## Problem

During deployment drains/fills, we often see the storage controller
giving up on warmups after 20 seconds, when the warmup is nearly
complete (~90%). This can cause latency spikes for migrated tenants if
they block on layer downloads.

Touches https://github.com/neondatabase/cloud/issues/26193.

## Summary of changes

Increase the drain and fill secondary warmup timeout from 20 to 30
seconds.
---
 storage_controller/src/service.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 21c693af97..fdb791c2cf 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -8485,7 +8485,7 @@ impl Service {
         // By default, live migrations are generous about the wait time for getting
         // the secondary location up to speed. When draining, give up earlier in order
         // to not stall the operation when a cold secondary is encountered.
-        const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20);
+        const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(30);
         const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5);
         let reconciler_config = ReconcilerConfigBuilder::new(ReconcilerPriority::Normal)
             .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT)
@@ -8818,7 +8818,7 @@ impl Service {
         node_id: NodeId,
         cancel: CancellationToken,
     ) -> Result<(), OperationError> {
-        const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20);
+        const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(30);
         const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5);
         let reconciler_config = ReconcilerConfigBuilder::new(ReconcilerPriority::Normal)
             .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT)

From d22377c754556c95d24970458cb08968828902b3 Mon Sep 17 00:00:00 2001
From: Mark Novikov <piercypixel@gmail.com>
Date: Thu, 8 May 2025 15:04:28 +0400
Subject: [PATCH 08/65] Skip event triggers in dump-restore (#11794)

## Problem

Data import fails if the src db has any event triggers, because those
can only be restored by a superuser. Specifically imports from Heroku
and Supabase are guaranteed to fail.

Closes https://github.com/neondatabase/cloud/issues/27353

## Summary of changes

Depends on `pg_dump` patches per each supported PostgreSQL version:
- https://github.com/neondatabase/postgres/pull/630
- https://github.com/neondatabase/postgres/pull/629
- https://github.com/neondatabase/postgres/pull/627
- https://github.com/neondatabase/postgres/pull/628
---
 compute_tools/src/bin/fast_import.rs      |  1 +
 test_runner/regress/test_import_pgdata.py | 49 +++++++++++++++++++++++
 vendor/postgres-v14                       |  2 +-
 vendor/postgres-v17                       |  2 +-
 vendor/revisions.json                     |  4 +-
 5 files changed, 54 insertions(+), 4 deletions(-)

diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs
index 537028cde1..78acd78585 100644
--- a/compute_tools/src/bin/fast_import.rs
+++ b/compute_tools/src/bin/fast_import.rs
@@ -348,6 +348,7 @@ async fn run_dump_restore(
         "--no-security-labels".to_string(),
         "--no-subscriptions".to_string(),
         "--no-tablespaces".to_string(),
+        "--no-event-triggers".to_string(),
         // format
         "--format".to_string(),
         "directory".to_string(),
diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py
index a26c3994a5..2fda1991f7 100644
--- a/test_runner/regress/test_import_pgdata.py
+++ b/test_runner/regress/test_import_pgdata.py
@@ -641,6 +641,55 @@ def test_fast_import_binary(
         assert res[0][0] == 10
 
 
+def test_fast_import_event_triggers(
+    test_output_dir,
+    vanilla_pg: VanillaPostgres,
+    port_distributor: PortDistributor,
+    fast_import: FastImport,
+):
+    vanilla_pg.start()
+    vanilla_pg.safe_psql("""
+        CREATE FUNCTION test_event_trigger_for_drops()
+                RETURNS event_trigger LANGUAGE plpgsql AS $$
+        DECLARE
+            obj record;
+        BEGIN
+            FOR obj IN SELECT * FROM pg_event_trigger_dropped_objects()
+            LOOP
+                RAISE NOTICE '% dropped object: % %.% %',
+                            tg_tag,
+                            obj.object_type,
+                            obj.schema_name,
+                            obj.object_name,
+                            obj.object_identity;
+            END LOOP;
+        END
+        $$;
+
+        CREATE EVENT TRIGGER test_event_trigger_for_drops
+        ON sql_drop
+        EXECUTE PROCEDURE test_event_trigger_for_drops();
+    """)
+
+    pg_port = port_distributor.get_port()
+    p = fast_import.run_pgdata(pg_port=pg_port, source_connection_string=vanilla_pg.connstr())
+    assert p.returncode == 0
+
+    vanilla_pg.stop()
+
+    pgbin = PgBin(test_output_dir, fast_import.pg_distrib_dir, fast_import.pg_version)
+    with VanillaPostgres(
+        fast_import.workdir / "pgdata", pgbin, pg_port, False
+    ) as new_pgdata_vanilla_pg:
+        new_pgdata_vanilla_pg.start()
+
+        # database name and user are hardcoded in fast_import binary, and they are different from normal vanilla postgres
+        conn = PgProtocol(dsn=f"postgresql://cloud_admin@localhost:{pg_port}/neondb")
+        res = conn.safe_psql("SELECT count(*) FROM pg_event_trigger;")
+        log.info(f"Result: {res}")
+        assert res[0][0] == 0, f"Neon does not support importing event triggers, got: {res[0][0]}"
+
+
 def test_fast_import_restore_to_connstring(
     test_output_dir,
     vanilla_pg: VanillaPostgres,
diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index c8dab02bfc..108856a4ae 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit c8dab02bfc003ae7bd59096919042d7840f3c194
+Subproject commit 108856a4ae76be285b04497a0ed08fcbe60ddbe9
diff --git a/vendor/postgres-v17 b/vendor/postgres-v17
index eab3a37834..b763ab54b9 160000
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
@@ -1 +1 @@
-Subproject commit eab3a37834cac6ec0719bf817ac918a201712d66
+Subproject commit b763ab54b98d232a0959371ab1d07f06ed77c49e
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 74a6ff33d7..4307fd1c3f 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,7 +1,7 @@
 {
   "v17": [
     "17.4",
-    "eab3a37834cac6ec0719bf817ac918a201712d66"
+    "b763ab54b98d232a0959371ab1d07f06ed77c49e"
   ],
   "v16": [
     "16.8",
@@ -13,6 +13,6 @@
   ],
   "v14": [
     "14.17",
-    "c8dab02bfc003ae7bd59096919042d7840f3c194"
+    "108856a4ae76be285b04497a0ed08fcbe60ddbe9"
   ]
 }

From 42d93031a13b31cee2fbb8c2e7f1b094b0f554a2 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 8 May 2025 13:48:29 +0200
Subject: [PATCH 09/65] fixup(#11819): broken macOS build (#11861)

refs
- fixes https://github.com/neondatabase/neon/issues/11860
---
 pageserver/src/virtual_file/io_engine.rs | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs
index d8eb803335..7827682498 100644
--- a/pageserver/src/virtual_file/io_engine.rs
+++ b/pageserver/src/virtual_file/io_engine.rs
@@ -13,7 +13,7 @@
 pub(super) mod tokio_epoll_uring_ext;
 
 use tokio_epoll_uring::IoBuf;
-use tracing::{Instrument, info};
+use tracing::Instrument;
 
 pub(crate) use super::api::IoEngineKind;
 #[derive(Clone, Copy)]
@@ -111,7 +111,8 @@ pub(crate) fn get() -> IoEngine {
 
 use std::os::unix::prelude::FileExt;
 use std::sync::atomic::{AtomicU8, Ordering};
-use std::time::Duration;
+#[cfg(target_os = "linux")]
+use {std::time::Duration, tracing::info};
 
 use super::owned_buffers_io::io_buf_ext::FullSlice;
 use super::owned_buffers_io::slice::SliceMutExt;
@@ -309,6 +310,7 @@ impl IoEngine {
 ///
 /// This function retries the operation once if it fails with ECANCELED.
 /// ONLY USE FOR IDEMPOTENT [`super::VirtualFile`] operations.
+#[cfg(target_os = "linux")]
 pub(super) async fn retry_ecanceled_once<F, Fut, T, V>(
     resources: T,
     f: F,

From 659366060dcef08a46c42c0794a829afb4270b1c Mon Sep 17 00:00:00 2001
From: Santosh Pingale <3813695+santosh-d3vpl3x@users.noreply.github.com>
Date: Thu, 8 May 2025 16:09:15 +0200
Subject: [PATCH 10/65] Reuse remote_client from the SnapshotDownloader instead
 of recreating in download function (#11812)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Problem
At the moment, `remote_client` and `target` are recreated in the
`download` function. We could reuse them from the `SnapshotDownloader`
instance. This isn't a problem per se, just a quality-of-life
improvement, but it caught my attention when we were trying out snapshot
downloading in one of the older versions and ran into a curious case of
S3 clients behaving in two different ways: one client used
`force_path_style` and the other one didn't.

**Logs from this run:**
```
2025-05-02T12:56:22.384626Z DEBUG /data/snappie/2739e7da34e625e3934ef0b76fa12483/timelines/d44b831adb0a6ba96792dc3a5cc30910/000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014E8F20-00000000014E8F99-00000001 requires download...
2025-05-02T12:56:22.384689Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:apply_configuration: timeout settings for this operation: TimeoutConfig { connect_timeout: Set(3.1s), read_timeout: Disabled, operation_timeout: Disabled, operation_attempt_timeout: Disabled }
2025-05-02T12:56:22.384730Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op: entering 'serialization' phase
2025-05-02T12:56:22.384784Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op: entering 'before transmit' phase
2025-05-02T12:56:22.384813Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op: retry strategy has OKed initial request
2025-05-02T12:56:22.384841Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op: beginning attempt #1
2025-05-02T12:56:22.384870Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op:try_attempt: resolving endpoint endpoint_params=EndpointResolverParams(TypeErasedBox[!Clone]:Params { bucket: Some("bucket"), region: Some("eu-north-1"), use_fips: false, use_dual_stack: false, endpoint: Some("https://s3.self-hosted.company.com"), force_path_style: false, accelerate: false, use_global_endpoint: false, use_object_lambda_endpoint: None, key: None, prefix: Some("/pageserver/tenants/2739e7da34e625e3934ef0b76fa12483/timelines/d44b831adb0a6ba96792dc3a5cc30910/000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014E8F20-00000000014E8F99-00000001"), copy_source: None, disable_access_points: None, disable_multi_region_access_points: false, use_arn_region: None, use_s3_express_control_endpoint: None, disable_s3_express_session_auth: None }) endpoint_prefix=None
2025-05-02T12:56:22.384979Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op:try_attempt: will use endpoint Endpoint { url: "https://neon.s3.self-hosted.company.com", headers: {}, properties: {"authSchemes": Array([Object({"signingRegion": String("eu-north-1"), "disableDoubleEncoding": Bool(true), "name": String("sigv4"), "signingName": String("s3")})])} }
2025-05-02T12:56:22.385042Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op:try_attempt:lazy_load_identity:provide_credentials{provider=default_chain}: loaded credentials provider=Environment
2025-05-02T12:56:22.385066Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op:try_attempt:lazy_load_identity: identity cache miss occurred; added new identity (took 35.958µs) new_expiration=2025-05-02T13:11:22.385028Z valid_for=899.999961437s partition=IdentityCachePartition(5)
2025-05-02T12:56:22.385090Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op:try_attempt: loaded identity
2025-05-02T12:56:22.385162Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op:try_attempt: entering 'transmit' phase
2025-05-02T12:56:22.385211Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op:try_attempt: new TCP connector created in 361ns
2025-05-02T12:56:22.385288Z DEBUG resolving host="neon.s3.self-hosted.company.com"
2025-05-02T12:56:22.390796Z DEBUG invoke{service=s3 operation=ListObjectVersions sdk_invocation_id=7315885}:try_op:try_attempt: encountered orchestrator error; halting
```
---
 storage_scrubber/src/tenant_snapshot.rs | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/storage_scrubber/src/tenant_snapshot.rs b/storage_scrubber/src/tenant_snapshot.rs
index 24231e32fc..d0ca53f8ab 100644
--- a/storage_scrubber/src/tenant_snapshot.rs
+++ b/storage_scrubber/src/tenant_snapshot.rs
@@ -24,7 +24,6 @@ pub struct SnapshotDownloader {
     remote_client: GenericRemoteStorage,
     #[allow(dead_code)]
     target: RootTarget,
-    bucket_config: BucketConfig,
     tenant_id: TenantId,
     output_path: Utf8PathBuf,
     concurrency: usize,
@@ -43,7 +42,6 @@ impl SnapshotDownloader {
         Ok(Self {
             remote_client,
             target,
-            bucket_config,
             tenant_id,
             output_path,
             concurrency,
@@ -218,11 +216,9 @@ impl SnapshotDownloader {
     }
 
     pub async fn download(&self) -> anyhow::Result<()> {
-        let (remote_client, target) =
-            init_remote(self.bucket_config.clone(), NodeKind::Pageserver).await?;
-
         // Generate a stream of TenantShardId
-        let shards = stream_tenant_shards(&remote_client, &target, self.tenant_id).await?;
+        let shards =
+            stream_tenant_shards(&self.remote_client, &self.target, self.tenant_id).await?;
         let shards: Vec<TenantShardId> = shards.try_collect().await?;
 
         // Only read from shards that have the highest count: avoids redundantly downloading
@@ -240,7 +236,8 @@ impl SnapshotDownloader {
 
         for shard in shards.into_iter().filter(|s| s.shard_count == shard_count) {
             // Generate a stream of TenantTimelineId
-            let timelines = stream_tenant_timelines(&remote_client, &target, shard).await?;
+            let timelines =
+                stream_tenant_timelines(&self.remote_client, &self.target, shard).await?;
 
             // Generate a stream of S3TimelineBlobData
             async fn load_timeline_index(
@@ -251,8 +248,8 @@ impl SnapshotDownloader {
                 let data = list_timeline_blobs(remote_client, ttid, target).await?;
                 Ok((ttid, data))
             }
-            let timelines =
-                timelines.map_ok(|ttid| load_timeline_index(&remote_client, &target, ttid));
+            let timelines = timelines
+                .map_ok(|ttid| load_timeline_index(&self.remote_client, &self.target, ttid));
             let mut timelines = std::pin::pin!(timelines.try_buffered(8));
 
             while let Some(i) = timelines.next().await {

From 622b3b29936d0496808396e447e678177a58412d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Thu, 8 May 2025 17:13:11 +0200
Subject: [PATCH 11/65] Fixes for enabling --timelines-onto-safekeepers in
 tests (#11854)

Second PR with fixes extracted from #11712, relating to
`--timelines-onto-safekeepers`. Does the following:

* Moves safekeeper registration to `neon_local` instead of the test
fixtures
* Passes the safekeeper JWT token if `--timelines-onto-safekeepers` is
enabled
* Allows some warnings related to offline safekeepers (similarly to how
we allow them for offline pageservers)
* Enables generations in the compute's config if
`--timelines-onto-safekeepers` is enabled
* Fixes the parallel `pull_timeline` race condition (the one that #11786
left for later)

Fixes #11424
Part of #11670
---
 control_plane/src/bin/neon_local.rs           |   9 +-
 control_plane/src/storage_controller.rs       | 100 ++++++++++++++++--
 safekeeper/src/http/routes.rs                 |   3 +-
 safekeeper/src/pull_timeline.rs               |  30 ++++--
 test_runner/fixtures/neon_fixtures.py         |  24 -----
 .../fixtures/pageserver/allowed_errors.py     |   4 +
 6 files changed, 131 insertions(+), 39 deletions(-)

diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 610fa5f865..191a22f1de 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -1417,7 +1417,14 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
             let pageserver_id = args.endpoint_pageserver_id;
             let remote_ext_base_url = &args.remote_ext_base_url;
 
-            let safekeepers_generation = args.safekeepers_generation.map(SafekeeperGeneration::new);
+            let default_generation = env
+                .storage_controller
+                .timelines_onto_safekeepers
+                .then_some(1);
+            let safekeepers_generation = args
+                .safekeepers_generation
+                .or(default_generation)
+                .map(SafekeeperGeneration::new);
             // If --safekeepers argument is given, use only the listed
             // safekeeper nodes; otherwise all from the env.
             let safekeepers = if let Some(safekeepers) = parse_safekeepers(&args.safekeepers)? {
diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs
index a36815d27e..755d67a7ad 100644
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -10,7 +10,8 @@ use camino::{Utf8Path, Utf8PathBuf};
 use hyper0::Uri;
 use nix::unistd::Pid;
 use pageserver_api::controller_api::{
-    NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
+    NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest,
+    SafekeeperSchedulingPolicyRequest, SkSchedulingPolicy, TenantCreateRequest,
     TenantCreateResponse, TenantLocateResponse,
 };
 use pageserver_api::models::{
@@ -20,7 +21,7 @@ use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use pem::Pem;
 use postgres_backend::AuthType;
-use reqwest::Method;
+use reqwest::{Method, Response};
 use serde::de::DeserializeOwned;
 use serde::{Deserialize, Serialize};
 use tokio::process::Command;
@@ -570,6 +571,11 @@ impl StorageController {
             let peer_jwt_token = encode_from_key_file(&peer_claims, private_key)
                 .expect("failed to generate jwt token");
             args.push(format!("--peer-jwt-token={peer_jwt_token}"));
+
+            let claims = Claims::new(None, Scope::SafekeeperData);
+            let jwt_token =
+                encode_from_key_file(&claims, private_key).expect("failed to generate jwt token");
+            args.push(format!("--safekeeper-jwt-token={jwt_token}"));
         }
 
         if let Some(public_key) = &self.public_key {
@@ -614,6 +620,10 @@ impl StorageController {
             self.env.base_data_dir.display()
         ));
 
+        if self.env.safekeepers.iter().any(|sk| sk.auth_enabled) && self.private_key.is_none() {
+            anyhow::bail!("Safekeeper set up for auth but no private key specified");
+        }
+
         if self.config.timelines_onto_safekeepers {
             args.push("--timelines-onto-safekeepers".to_string());
         }
@@ -640,6 +650,10 @@ impl StorageController {
         )
         .await?;
 
+        if self.config.timelines_onto_safekeepers {
+            self.register_safekeepers().await?;
+        }
+
         Ok(())
     }
 
@@ -743,6 +757,23 @@ impl StorageController {
     where
         RQ: Serialize + Sized,
         RS: DeserializeOwned + Sized,
+    {
+        let response = self.dispatch_inner(method, path, body).await?;
+        Ok(response
+            .json()
+            .await
+            .map_err(pageserver_client::mgmt_api::Error::ReceiveBody)?)
+    }
+
+    /// Simple HTTP request wrapper for calling into storage controller
+    async fn dispatch_inner<RQ>(
+        &self,
+        method: reqwest::Method,
+        path: String,
+        body: Option<RQ>,
+    ) -> anyhow::Result<Response>
+    where
+        RQ: Serialize + Sized,
     {
         // In the special case of the `storage_controller start` subcommand, we wish
         // to use the API endpoint of the newly started storage controller in order
@@ -785,10 +816,31 @@ impl StorageController {
         let response = builder.send().await?;
         let response = response.error_from_body().await?;
 
-        Ok(response
-            .json()
-            .await
-            .map_err(pageserver_client::mgmt_api::Error::ReceiveBody)?)
+        Ok(response)
+    }
+
+    /// Register the safekeepers in the storage controller
+    #[instrument(skip(self))]
+    async fn register_safekeepers(&self) -> anyhow::Result<()> {
+        for sk in self.env.safekeepers.iter() {
+            let sk_id = sk.id;
+            let body = serde_json::json!({
+                "id": sk_id,
+                "created_at": "2023-10-25T09:11:25Z",
+                "updated_at": "2024-08-28T11:32:43Z",
+                "region_id": "aws-us-east-2",
+                "host": "127.0.0.1",
+                "port": sk.pg_port,
+                "http_port": sk.http_port,
+                "https_port": sk.https_port,
+                "version": 5957,
+                "availability_zone_id": format!("us-east-2b-{sk_id}"),
+            });
+            self.upsert_safekeeper(sk_id, body).await?;
+            self.safekeeper_scheduling_policy(sk_id, SkSchedulingPolicy::Active)
+                .await?;
+        }
+        Ok(())
     }
 
     /// Call into the attach_hook API, for use before handing out attachments to pageservers
@@ -816,6 +868,42 @@ impl StorageController {
         Ok(response.generation)
     }
 
+    /// Upserts safekeeper `node_id` on the storage controller by POSTing `request`
+    /// to `control/v1/safekeeper/{node_id}`.
+    ///
+    /// Returns an error if the request fails or the response status is not a success.
+    #[instrument(skip(self))]
+    pub async fn upsert_safekeeper(
+        &self,
+        node_id: NodeId,
+        request: serde_json::Value,
+    ) -> anyhow::Result<()> {
+        let path = format!("control/v1/safekeeper/{node_id}");
+        let resp = self
+            .dispatch_inner::<serde_json::Value>(Method::POST, path, Some(request))
+            .await?;
+        // NOTE(review): likely a backstop, `dispatch_inner` already calls `error_from_body`.
+        let status = resp.status();
+        if !status.is_success() {
+            anyhow::bail!("upserting safekeeper {node_id} unsuccessful: {status}");
+        }
+        Ok(())
+    }
+
+    #[instrument(skip(self))]
+    pub async fn safekeeper_scheduling_policy(
+        &self,
+        node_id: NodeId,
+        scheduling_policy: SkSchedulingPolicy,
+    ) -> anyhow::Result<()> {
+        self.dispatch::<SafekeeperSchedulingPolicyRequest, ()>(
+            Method::POST,
+            format!("control/v1/safekeeper/{node_id}/scheduling_policy"),
+            Some(SafekeeperSchedulingPolicyRequest { scheduling_policy }),
+        )
+        .await
+    }
+
     #[instrument(skip(self))]
     pub async fn inspect(
         &self,
diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index 2b2d721db2..1a25b07496 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -243,8 +243,7 @@ async fn timeline_pull_handler(mut request: Request<Body>) -> Result<Response<Bo
 
     let resp =
         pull_timeline::handle_request(data, conf.sk_auth_token.clone(), ca_certs, global_timelines)
-            .await
-            .map_err(ApiError::InternalServerError)?;
+            .await?;
     json_response(StatusCode::OK, resp)
 }
 
diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs
index 1510a51019..66f2877cc5 100644
--- a/safekeeper/src/pull_timeline.rs
+++ b/safekeeper/src/pull_timeline.rs
@@ -7,6 +7,7 @@ use bytes::Bytes;
 use camino::Utf8PathBuf;
 use chrono::{DateTime, Utc};
 use futures::{SinkExt, StreamExt, TryStreamExt};
+use http_utils::error::ApiError;
 use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo};
 use reqwest::Certificate;
 use safekeeper_api::Term;
@@ -30,7 +31,7 @@ use utils::pausable_failpoint;
 
 use crate::control_file::CONTROL_FILE_NAME;
 use crate::state::{EvictionState, TimelinePersistentState};
-use crate::timeline::{Timeline, WalResidentTimeline};
+use crate::timeline::{Timeline, TimelineError, WalResidentTimeline};
 use crate::timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline};
 use crate::wal_storage::open_wal_file;
 use crate::{GlobalTimelines, debug_dump, wal_backup};
@@ -395,7 +396,7 @@ pub async fn handle_request(
     sk_auth_token: Option<SecretString>,
     ssl_ca_certs: Vec<Certificate>,
     global_timelines: Arc<GlobalTimelines>,
-) -> Result<PullTimelineResponse> {
+) -> Result<PullTimelineResponse, ApiError> {
     let existing_tli = global_timelines.get(TenantTimelineId::new(
         request.tenant_id,
         request.timeline_id,
@@ -411,7 +412,9 @@ pub async fn handle_request(
     for ssl_ca_cert in ssl_ca_certs {
         http_client = http_client.add_root_certificate(ssl_ca_cert);
     }
-    let http_client = http_client.build()?;
+    let http_client = http_client
+        .build()
+        .map_err(|e| ApiError::InternalServerError(e.into()))?;
 
     let http_hosts = request.http_hosts.clone();
 
@@ -443,10 +446,10 @@ pub async fn handle_request(
     // offline and C comes online. Then we want a pull on C with A and B as hosts to work.
     let min_required_successful = (http_hosts.len() - 1).max(1);
     if statuses.len() < min_required_successful {
-        bail!(
+        return Err(ApiError::InternalServerError(anyhow::anyhow!(
             "only got {} successful status responses. required: {min_required_successful}",
             statuses.len()
-        )
+        )));
     }
 
     // Find the most advanced safekeeper
@@ -465,7 +468,7 @@ pub async fn handle_request(
     assert!(status.tenant_id == request.tenant_id);
     assert!(status.timeline_id == request.timeline_id);
 
-    pull_timeline(
+    match pull_timeline(
         status,
         safekeeper_host,
         sk_auth_token,
@@ -473,6 +476,21 @@ pub async fn handle_request(
         global_timelines,
     )
     .await
+    {
+        Ok(resp) => Ok(resp),
+        Err(e) => {
+            match e.downcast_ref::<TimelineError>() {
+                Some(TimelineError::AlreadyExists(_)) => Ok(PullTimelineResponse {
+                    safekeeper_host: None,
+                }),
+                Some(TimelineError::CreationInProgress(_)) => {
+                    // We don't return success here because creation might still fail.
+                    Err(ApiError::Conflict("Creation in progress".to_owned()))
+                }
+                _ => Err(ApiError::InternalServerError(e)),
+            }
+        }
+    }
 }
 
 async fn pull_timeline(
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 370eca5130..547c640a40 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1409,30 +1409,6 @@ class NeonEnv:
         for f in futs:
             f.result()
 
-        # Last step: register safekeepers at the storage controller
-        if (
-            self.storage_controller_config is not None
-            and self.storage_controller_config.get("timelines_onto_safekeepers") is True
-        ):
-            for sk_id, sk in enumerate(self.safekeepers):
-                # 0 is an invalid safekeeper id
-                sk_id = sk_id + 1
-                body = {
-                    "id": sk_id,
-                    "created_at": "2023-10-25T09:11:25Z",
-                    "updated_at": "2024-08-28T11:32:43Z",
-                    "region_id": "aws-us-east-2",
-                    "host": "127.0.0.1",
-                    "port": sk.port.pg,
-                    "http_port": sk.port.http,
-                    "https_port": None,
-                    "version": 5957,
-                    "availability_zone_id": f"us-east-2b-{sk_id}",
-                }
-
-                self.storage_controller.on_safekeeper_deploy(sk_id, body)
-                self.storage_controller.safekeeper_scheduling_policy(sk_id, "Active")
-
         self.endpoint_storage.start(timeout_in_seconds=timeout_in_seconds)
 
     def stop(self, immediate=False, ps_assert_metric_no_errors=False, fail_on_endpoint_errors=True):
diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py
index 24c856e279..43bffd919c 100755
--- a/test_runner/fixtures/pageserver/allowed_errors.py
+++ b/test_runner/fixtures/pageserver/allowed_errors.py
@@ -122,6 +122,10 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [
     ".*Call to node.*management API.*failed.*Timeout.*",
     ".*Failed to update node .+ after heartbeat round.*error sending request for url.*",
     ".*background_reconcile: failed to fetch top tenants:.*client error \\(Connect\\).*",
+    # Many tests will take safekeepers offline
+    ".*Call to safekeeper.*management API.*failed.*receive body.*",
+    ".*Call to safekeeper.*management API.*failed.*ReceiveBody.*",
+    ".*Call to safekeeper.*management API.*failed.*Timeout.*",
     # Many tests will start up with a node offline
     ".*startup_reconcile: Could not scan node.*",
     # Tests run in dev mode

From 8477d15f95ffb094c444e658bbcdb95301b1a750 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 8 May 2025 18:11:45 +0200
Subject: [PATCH 12/65] feat(direct IO): remove special case in test suite for
 compat tests (#11864)

PR
- https://github.com/neondatabase/neon/pull/11558
adds special treatment for compat snapshot binaries which don't
understand the `direct-rw` mode.

A new compat snapshot has been published since, so,
we can remove the special case.

refs:
- fixes https://github.com/neondatabase/neon/issues/11598
---
 test_runner/fixtures/neon_fixtures.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 547c640a40..aa468d9386 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1299,13 +1299,6 @@ class NeonEnv:
                         for key, value in override.items():
                             ps_cfg[key] = value
 
-            if self.pageserver_virtual_file_io_mode is not None:
-                # TODO(christian): https://github.com/neondatabase/neon/issues/11598
-                if not config.test_may_use_compatibility_snapshot_binaries:
-                    ps_cfg["virtual_file_io_mode"] = self.pageserver_virtual_file_io_mode
-                else:
-                    log.info("ignoring virtual_file_io_mode parametrization for compatibility test")
-
             if self.pageserver_wal_receiver_protocol is not None:
                 key, value = PageserverWalReceiverProtocol.to_config_key_value(
                     self.pageserver_wal_receiver_protocol

From bef5954fd7b8ea43cac6f43a111d437cd7a360ad Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Thu, 8 May 2025 17:46:57 +0100
Subject: [PATCH 13/65] feat(proxy): track SNI usage by protocol, including for
 http (#11863)

## Problem

We want to see how many users of the legacy serverless driver are still
using the old URL for SQL-over-HTTP traffic.

## Summary of changes

Adds a protocol field to the connections_by_sni metric. Ensures it's
incremented for sql-over-http.
---
 proxy/src/auth/credentials.rs         | 29 ++++++++++++++-------------
 proxy/src/metrics.rs                  | 15 +++++++++++---
 proxy/src/serverless/mod.rs           |  1 +
 proxy/src/serverless/sql_over_http.rs | 28 +++++++++++++++++++++++++-
 test_runner/fixtures/neon_fixtures.py |  8 ++++----
 5 files changed, 59 insertions(+), 22 deletions(-)

diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs
index 183976374a..526d0df7f2 100644
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -12,9 +12,9 @@ use tracing::{debug, warn};
 use crate::auth::password_hack::parse_endpoint_param;
 use crate::context::RequestContext;
 use crate::error::{ReportableError, UserFacingError};
-use crate::metrics::{Metrics, SniKind};
+use crate::metrics::{Metrics, SniGroup, SniKind};
 use crate::proxy::NeonOptions;
-use crate::serverless::SERVERLESS_DRIVER_SNI;
+use crate::serverless::{AUTH_BROKER_SNI, SERVERLESS_DRIVER_SNI};
 use crate::types::{EndpointId, RoleName};
 
 #[derive(Debug, Error, PartialEq, Eq, Clone)]
@@ -65,7 +65,7 @@ pub(crate) fn endpoint_sni(sni: &str, common_names: &HashSet<String>) -> Option<
     if !common_names.contains(common_name) {
         return None;
     }
-    if subdomain == SERVERLESS_DRIVER_SNI {
+    if subdomain == SERVERLESS_DRIVER_SNI || subdomain == AUTH_BROKER_SNI {
         return None;
     }
     Some(EndpointId::from(subdomain))
@@ -128,22 +128,23 @@ impl ComputeUserInfoMaybeEndpoint {
 
         let metrics = Metrics::get();
         debug!(%user, "credentials");
-        if sni.is_some() {
+
+        let protocol = ctx.protocol();
+        let kind = if sni.is_some() {
             debug!("Connection with sni");
-            metrics.proxy.accepted_connections_by_sni.inc(SniKind::Sni);
+            SniKind::Sni
         } else if endpoint.is_some() {
-            metrics
-                .proxy
-                .accepted_connections_by_sni
-                .inc(SniKind::NoSni);
             debug!("Connection without sni");
+            SniKind::NoSni
         } else {
-            metrics
-                .proxy
-                .accepted_connections_by_sni
-                .inc(SniKind::PasswordHack);
             debug!("Connection with password hack");
-        }
+            SniKind::PasswordHack
+        };
+
+        metrics
+            .proxy
+            .accepted_connections_by_sni
+            .inc(SniGroup { protocol, kind });
 
         let options = NeonOptions::parse_params(params);
 
diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
index e5fc0b724b..4b22c912eb 100644
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -115,8 +115,8 @@ pub struct ProxyMetrics {
     #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))]
     pub allowed_vpc_endpoint_ids: Histogram<10>,
 
-    /// Number of connections (per sni).
-    pub accepted_connections_by_sni: CounterVec<StaticLabelSet<SniKind>>,
+    /// Number of connections, by the method we used to determine the endpoint.
+    pub accepted_connections_by_sni: CounterVec<SniSet>,
 
     /// Number of connection failures (per kind).
     pub connection_failures_total: CounterVec<StaticLabelSet<ConnectionFailureKind>>,
@@ -342,11 +342,20 @@ pub enum LatencyExclusions {
     ClientCplaneComputeRetry,
 }
 
+#[derive(LabelGroup)]
+#[label(set = SniSet)]
+pub struct SniGroup {
+    pub protocol: Protocol,
+    pub kind: SniKind,
+}
+
 #[derive(FixedCardinalityLabel, Copy, Clone)]
-#[label(singleton = "kind")]
 pub enum SniKind {
+    /// Domain name based routing. SNI for libpq/websockets. Host for HTTP
     Sni,
+    /// Metadata based routing. `options` for libpq/websockets. Header for HTTP
     NoSni,
+    /// Metadata based routing, using the password field.
     PasswordHack,
 }
 
diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs
index 6f24ad3dec..2a7069b1c2 100644
--- a/proxy/src/serverless/mod.rs
+++ b/proxy/src/serverless/mod.rs
@@ -56,6 +56,7 @@ use crate::serverless::backend::PoolingBackend;
 use crate::serverless::http_util::{api_error_into_response, json_response};
 
 pub(crate) const SERVERLESS_DRIVER_SNI: &str = "api";
+pub(crate) const AUTH_BROKER_SNI: &str = "apiauth";
 
 pub async fn task_main(
     config: &'static ProxyConfig,
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index fee5942b7e..dfaeedaeae 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -38,7 +38,7 @@ use crate::config::{AuthenticationConfig, HttpConfig, ProxyConfig, TlsConfig};
 use crate::context::RequestContext;
 use crate::error::{ErrorKind, ReportableError, UserFacingError};
 use crate::http::{ReadBodyError, read_body_with_limit};
-use crate::metrics::{HttpDirection, Metrics};
+use crate::metrics::{HttpDirection, Metrics, SniGroup, SniKind};
 use crate::proxy::{NeonOptions, run_until_cancelled};
 use crate::serverless::backend::HttpConnError;
 use crate::types::{DbName, RoleName};
@@ -227,6 +227,32 @@ fn get_conn_info(
         }
     }
 
+    // check the URL that was used, for metrics
+    {
+        let host_endpoint = headers
+            // get the host header
+            .get("host")
+            // extract the domain
+            .and_then(|h| {
+                let (host, _port) = h.to_str().ok()?.split_once(':')?;
+                Some(host)
+            })
+            // get the endpoint prefix
+            .map(|h| h.split_once('.').map_or(h, |(prefix, _)| prefix));
+
+        let kind = if host_endpoint == Some(&*endpoint) {
+            SniKind::Sni
+        } else {
+            SniKind::NoSni
+        };
+
+        let protocol = ctx.protocol();
+        Metrics::get()
+            .proxy
+            .accepted_connections_by_sni
+            .inc(SniGroup { protocol, kind });
+    }
+
     ctx.set_user_agent(
         headers
             .get(hyper::header::USER_AGENT)
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index aa468d9386..1b4562c0b3 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -3835,7 +3835,7 @@ class NeonAuthBroker:
         external_http_port: int,
         auth_backend: NeonAuthBroker.ProxyV1,
     ):
-        self.domain = "apiauth.local.neon.build"  # resolves to 127.0.0.1
+        self.domain = "local.neon.build"  # resolves to 127.0.0.1
         self.host = "127.0.0.1"
         self.http_port = http_port
         self.external_http_port = external_http_port
@@ -3852,7 +3852,7 @@ class NeonAuthBroker:
         # generate key of it doesn't exist
         crt_path = self.test_output_dir / "proxy.crt"
         key_path = self.test_output_dir / "proxy.key"
-        generate_proxy_tls_certs("apiauth.local.neon.build", key_path, crt_path)
+        generate_proxy_tls_certs(f"apiauth.{self.domain}", key_path, crt_path)
 
         args = [
             str(self.neon_binpath / "proxy"),
@@ -3896,10 +3896,10 @@ class NeonAuthBroker:
 
         log.info(f"Executing http query: {query}")
 
-        connstr = f"postgresql://{user}@{self.domain}/postgres"
+        connstr = f"postgresql://{user}@ep-foo-bar-1234.{self.domain}/postgres"
         async with httpx.AsyncClient(verify=str(self.test_output_dir / "proxy.crt")) as client:
             response = await client.post(
-                f"https://{self.domain}:{self.external_http_port}/sql",
+                f"https://apiauth.{self.domain}:{self.external_http_port}/sql",
                 json={"query": query, "params": args},
                 headers={
                     "Neon-Connection-String": connstr,

From b37bb7d7edaab870d05bff7286e345066d49664e Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 8 May 2025 20:48:24 +0200
Subject: [PATCH 14/65] pageserver: timeline shutdown: fully quiesce ingest
 path before`freeze_and_flush` (#11851)

# Problem

Before this PR, timeline shutdown would
- cancel the walreceiver cancellation token subtree (child token of
Timeline::cancel)
- call freeze_and_flush
- Timeline::cancel.cancel()
- ... bunch of waiting for things ...
- Timeline::gate.close()

As noted by the comment that is deleted by this PR, this left a window
where, after freeze_and_flush, walreceiver could still be running and
ingest data into a new InMemoryLayer.

This presents a potential source of log noise during Timeline shutdown
where the InMemoryLayer created after the freeze_and_flush observes
that Timeline::cancel is cancelled, failing the ingest with some
anyhow::Error wrapping (deeply) a `FlushTaskError::Cancelled` instance
(`flush task cancelled` error message).

# Solution

It turns out that it is quite easy to shut down, not just cancel,
walreceiver completely
because the only subtask spawned by walreceiver connection manager is
the `handle_walreceiver_connection` task, which is properly shut down
and waited upon when the manager task observes cancellation and exits
its retry loop.

The alternative is to replace all the usage of `anyhow` on the ingest
path
with differentiated error types. A lot of busywork for little gain to
fix
a potential logging noise nuisance, so, not doing that for now.

# Correctness / Risk

We do not risk leaking walreceiver child tasks because existing
discipline
is to hold a gate guard.

We will prolong `Timeline::shutdown` to the degree that we're no longer
making
progress with the rest of shutdown while the walreceiver task hasn't yet
observed cancellation. In practice, this should be negligible.

`Timeline::shutdown` could fail to complete if there is a hidden
dependency
of walreceiver shutdown on some subsystem. The code certainly suggests
there
isn't, and I'm not aware of any such dependency. Anyway, impact will be
low
because we only shut down Timeline instances that are obsolete, either
because
there is a newer attachment at a different location, or because the
timeline
got deleted by the user. We would learn about this through stuck cplane
operations or stuck storcon reconciliations. We would be able to
mitigate by
cancelling such stuck operations/reconciliations and/or by rolling back
pageserver.

# Refs
- identified this while investigating
https://github.com/neondatabase/neon/issues/11762
- PR that _does_ fix a bunch of _real_ `flush task cancelled` noise on the
compaction path: https://github.com/neondatabase/neon/pull/11853
---
 pageserver/src/tenant/timeline.rs             | 12 ++----------
 pageserver/src/tenant/timeline/walreceiver.rs | 16 ++++++++++++++--
 2 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index c8d897d074..d7f5958128 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2127,22 +2127,14 @@ impl Timeline {
         debug_assert_current_span_has_tenant_and_timeline_id();
 
         // Regardless of whether we're going to try_freeze_and_flush
-        // or not, stop ingesting any more data. Walreceiver only provides
-        // cancellation but no "wait until gone", because it uses the Timeline::gate.
-        // So, only after the self.gate.close() below will we know for sure that
-        // no walreceiver tasks are left.
-        // For `try_freeze_and_flush=true`, this means that we might still be ingesting
-        // data during the call to `self.freeze_and_flush()` below.
-        // That's not ideal, but, we don't have the concept of a ChildGuard,
-        // which is what we'd need to properly model early shutdown of the walreceiver
-        // task sub-tree before the other Timeline task sub-trees.
+        // or not, stop ingesting any more data.
         let walreceiver = self.walreceiver.lock().unwrap().take();
         tracing::debug!(
             is_some = walreceiver.is_some(),
             "Waiting for WalReceiverManager..."
         );
         if let Some(walreceiver) = walreceiver {
-            walreceiver.cancel();
+            walreceiver.shutdown().await;
         }
         // ... and inform any waiters for newer LSNs that there won't be any.
         self.last_record_lsn.shutdown();
diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs
index 4f80073cc3..0f73eb839b 100644
--- a/pageserver/src/tenant/timeline/walreceiver.rs
+++ b/pageserver/src/tenant/timeline/walreceiver.rs
@@ -63,6 +63,7 @@ pub struct WalReceiver {
     /// All task spawned by [`WalReceiver::start`] and its children are sensitive to this token.
     /// It's a child token of [`Timeline`] so that timeline shutdown can cancel WalReceiver tasks early for `freeze_and_flush=true`.
     cancel: CancellationToken,
+    task: tokio::task::JoinHandle<()>,
 }
 
 impl WalReceiver {
@@ -79,7 +80,7 @@ impl WalReceiver {
         let loop_status = Arc::new(std::sync::RwLock::new(None));
         let manager_status = Arc::clone(&loop_status);
         let cancel = timeline.cancel.child_token();
-        WALRECEIVER_RUNTIME.spawn({
+        let task = WALRECEIVER_RUNTIME.spawn({
             let cancel = cancel.clone();
             async move {
                 debug_assert_current_span_has_tenant_and_timeline_id();
@@ -120,14 +121,25 @@ impl WalReceiver {
         Self {
             manager_status,
             cancel,
+            task,
         }
     }
 
     #[instrument(skip_all, level = tracing::Level::DEBUG)]
-    pub fn cancel(&self) {
+    pub async fn shutdown(self) {
         debug_assert_current_span_has_tenant_and_timeline_id();
         debug!("cancelling walreceiver tasks");
         self.cancel.cancel();
+        match self.task.await {
+            Ok(()) => debug!("Shutdown success"),
+            Err(je) if je.is_cancelled() => unreachable!("not used"),
+            Err(je) if je.is_panic() => {
+                // already logged by panic hook
+            }
+            Err(je) => {
+                error!("shutdown walreceiver task join error: {je}")
+            }
+        }
     }
 
     pub(crate) fn status(&self) -> Option<ConnectionManagerStatus> {

From 101e115b3885dd966a839ef50b450771988fa9aa Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Fri, 9 May 2025 09:54:40 +0300
Subject: [PATCH 15/65] Change prefetch logic in vacuum (#11650)

## Problem
See https://neondb.slack.com/archives/C03QLRH7PPD/p1745003314183649

Vacuum doesn't use prefetch because of this strange logic in
`lazy_scan_heap`:

```
			/* And only up to the next unskippable block */
			if (next_prefetch_block + prefetch_budget > vacrel->next_unskippable_block)
				prefetch_budget = vacrel->next_unskippable_block - next_prefetch_block;
```
## Summary of changes

Disable prefetch only if vacuum jumps to the next skippable block (there is
a SKIP_PAGES_THRESHOLD, which cancels the seqscan and performs the jump only
if the gap is large enough).


Postgres PRs:
https://github.com/neondatabase/postgres/pull/620
https://github.com/neondatabase/postgres/pull/621
https://github.com/neondatabase/postgres/pull/622
https://github.com/neondatabase/postgres/pull/623

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 vendor/postgres-v14   | 2 +-
 vendor/postgres-v15   | 2 +-
 vendor/postgres-v17   | 2 +-
 vendor/revisions.json | 6 +++---
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index 108856a4ae..06b405bc98 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit 108856a4ae76be285b04497a0ed08fcbe60ddbe9
+Subproject commit 06b405bc982fd53522689aa4acbfd9c44b7993cf
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index b838c8969b..72f83df76c 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit b838c8969b7c63f3e637a769656f5f36793b797c
+Subproject commit 72f83df76c61ce18d81bd371f0afd2a43d59c052
diff --git a/vendor/postgres-v17 b/vendor/postgres-v17
index b763ab54b9..0d59c91c1a 160000
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
@@ -1 +1 @@
-Subproject commit b763ab54b98d232a0959371ab1d07f06ed77c49e
+Subproject commit 0d59c91c1a23e667f1d1169d5f040b3fa0a0ab44
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 4307fd1c3f..10aad7e1a2 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,7 +1,7 @@
 {
   "v17": [
     "17.4",
-    "b763ab54b98d232a0959371ab1d07f06ed77c49e"
+    "0d59c91c1a23e667f1d1169d5f040b3fa0a0ab44"
   ],
   "v16": [
     "16.8",
@@ -9,10 +9,10 @@
   ],
   "v15": [
     "15.12",
-    "b838c8969b7c63f3e637a769656f5f36793b797c"
+    "72f83df76c61ce18d81bd371f0afd2a43d59c052"
   ],
   "v14": [
     "14.17",
-    "108856a4ae76be285b04497a0ed08fcbe60ddbe9"
+    "06b405bc982fd53522689aa4acbfd9c44b7993cf"
   ]
 }

From 5cd7f936f90978673a1f6a1dc64765e701035aa4 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Fri, 9 May 2025 08:48:30 +0100
Subject: [PATCH 16/65] fix(neon-rls): optimistically assume role grants are
 already assigned for replicas (#11811)

## Problem

Read replicas cannot grant permissions for roles for Neon RLS. Usually
the permission is already granted, so we can optimistically check. See
INC-509

## Summary of changes

Perform a permission lookup prior to actually executing any grants.
---
 Cargo.lock                              |  1 +
 compute_tools/Cargo.toml                |  1 +
 compute_tools/src/compute.rs            | 52 +++++++++++++++++--------
 test_runner/fixtures/neon_fixtures.py   | 10 ++++-
 test_runner/regress/test_role_grants.py |  7 ++++
 5 files changed, 52 insertions(+), 19 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index fe4cc35029..7083baa092 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1303,6 +1303,7 @@ dependencies = [
  "futures",
  "http 1.1.0",
  "indexmap 2.0.1",
+ "itertools 0.10.5",
  "jsonwebtoken",
  "metrics",
  "nix 0.27.1",
diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml
index 8ee5dd0665..f9da3ba700 100644
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -28,6 +28,7 @@ flate2.workspace = true
 futures.workspace = true
 http.workspace = true
 indexmap.workspace = true
+itertools.workspace = true
 jsonwebtoken.workspace = true
 metrics.workspace = true
 nix.workspace = true
diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 25920675c1..f494e2444a 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -11,6 +11,7 @@ use compute_api::spec::{
 use futures::StreamExt;
 use futures::future::join_all;
 use futures::stream::FuturesUnordered;
+use itertools::Itertools;
 use nix::sys::signal::{Signal, kill};
 use nix::unistd::Pid;
 use once_cell::sync::Lazy;
@@ -18,7 +19,7 @@ use postgres;
 use postgres::NoTls;
 use postgres::error::SqlState;
 use remote_storage::{DownloadError, RemotePath};
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
 use std::net::SocketAddr;
 use std::os::unix::fs::{PermissionsExt, symlink};
 use std::path::Path;
@@ -1995,23 +1996,40 @@ LIMIT 100",
         tokio::spawn(conn);
 
         // TODO: support other types of grants apart from schemas?
-        let query = format!(
-            "GRANT {} ON SCHEMA {} TO {}",
-            privileges
-                .iter()
-                // should not be quoted as it's part of the command.
-                // is already sanitized so it's ok
-                .map(|p| p.as_str())
-                .collect::<Vec<&'static str>>()
-                .join(", "),
-            // quote the schema and role name as identifiers to sanitize them.
-            schema_name.pg_quote(),
-            role_name.pg_quote(),
-        );
-        db_client
-            .simple_query(&query)
+
+        // check the role grants first - to gracefully handle read-replicas.
+        let select = "SELECT privilege_type
+            FROM pg_namespace
+                JOIN LATERAL (SELECT * FROM aclexplode(nspacl) AS x) acl ON true
+                JOIN pg_user users ON acl.grantee = users.usesysid
+            WHERE users.usename = $1
+                AND nspname = $2";
+        let rows = db_client
+            .query(select, &[role_name, schema_name])
             .await
-            .with_context(|| format!("Failed to execute query: {}", query))?;
+            .with_context(|| format!("Failed to execute query: {select}"))?;
+
+        let already_granted: HashSet<String> = rows.into_iter().map(|row| row.get(0)).collect();
+
+        let grants = privileges
+            .iter()
+            .filter(|p| !already_granted.contains(p.as_str()))
+            // should not be quoted as it's part of the command.
+            // is already sanitized so it's ok
+            .map(|p| p.as_str())
+            .join(", ");
+
+        if !grants.is_empty() {
+            // quote the schema and role name as identifiers to sanitize them.
+            let schema_name = schema_name.pg_quote();
+            let role_name = role_name.pg_quote();
+
+            let query = format!("GRANT {grants} ON SCHEMA {schema_name} TO {role_name}",);
+            db_client
+                .simple_query(&query)
+                .await
+                .with_context(|| format!("Failed to execute query: {}", query))?;
+        }
 
         Ok(())
     }
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 1b4562c0b3..131820f23e 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -4613,7 +4613,10 @@ class EndpointFactory:
         return self
 
     def new_replica(
-        self, origin: Endpoint, endpoint_id: str, config_lines: list[str] | None = None
+        self,
+        origin: Endpoint,
+        endpoint_id: str | None = None,
+        config_lines: list[str] | None = None,
     ):
         branch_name = origin.branch_name
         assert origin in self.endpoints
@@ -4629,7 +4632,10 @@ class EndpointFactory:
         )
 
     def new_replica_start(
-        self, origin: Endpoint, endpoint_id: str, config_lines: list[str] | None = None
+        self,
+        origin: Endpoint,
+        endpoint_id: str | None = None,
+        config_lines: list[str] | None = None,
     ):
         branch_name = origin.branch_name
         assert origin in self.endpoints
diff --git a/test_runner/regress/test_role_grants.py b/test_runner/regress/test_role_grants.py
index b2251875f0..5b13d461f0 100644
--- a/test_runner/regress/test_role_grants.py
+++ b/test_runner/regress/test_role_grants.py
@@ -39,3 +39,10 @@ def test_role_grants(neon_simple_env: NeonEnv):
         res = cur.fetchall()
 
         assert res == [(1,)], "select should not succeed"
+
+    # confirm that replicas can also ensure the grants are correctly set.
+    replica = env.endpoints.new_replica_start(endpoint)
+    replica_client = replica.http_client()
+    replica_client.set_role_grants(
+        "test_role_grants", "test_role", "test_schema", ["CREATE", "USAGE"]
+    )

From 03d635b916ed057826d80bbc709864acb1c108f1 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Fri, 9 May 2025 12:07:08 +0300
Subject: [PATCH 17/65] Add more guards for prefetch_pump_state (#11859)

## Problem

See https://neondb.slack.com/archives/C08PJ07BZ44/p1746566292750689

Looks like there are more cases where `prefetch_pump_state` can be called
in an unexpected place and cause a core dump.

## Summary of changes

Add more guards.

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 pgxn/neon/communicator.c   | 36 +++++++++++++++++++++---------------
 pgxn/neon/communicator.h   |  2 +-
 pgxn/neon/pagestore_smgr.c | 20 ++++++++++----------
 vendor/postgres-v16        |  2 +-
 vendor/revisions.json      |  2 +-
 5 files changed, 34 insertions(+), 28 deletions(-)

diff --git a/pgxn/neon/communicator.c b/pgxn/neon/communicator.c
index 818a149499..9609f186b9 100644
--- a/pgxn/neon/communicator.c
+++ b/pgxn/neon/communicator.c
@@ -425,15 +425,12 @@ compact_prefetch_buffers(void)
  * point inside and outside PostgreSQL.
  *
  * This still does throw errors when it receives malformed responses from PS.
- *
- * When we're not called from CHECK_FOR_INTERRUPTS (indicated by
- * IsHandlingInterrupts) we also report we've ended prefetch receive work,
- * just in case state tracking was lost due to an error in the sync getPage
- * response code.
  */
 void
-communicator_prefetch_pump_state(bool IsHandlingInterrupts)
+communicator_prefetch_pump_state(void)
 {
+	START_PREFETCH_RECEIVE_WORK();
+
 	while (MyPState->ring_receive != MyPState->ring_flush)
 	{
 		NeonResponse   *response;
@@ -482,9 +479,7 @@ communicator_prefetch_pump_state(bool IsHandlingInterrupts)
 		}
 	}
 
-	/* We never pump the prefetch state while handling other pages */
-	if (!IsHandlingInterrupts)
-		END_PREFETCH_RECEIVE_WORK();
+	END_PREFETCH_RECEIVE_WORK();
 
 	communicator_reconfigure_timeout_if_needed();
 }
@@ -672,9 +667,10 @@ prefetch_wait_for(uint64 ring_index)
 
 	Assert(MyPState->ring_unused > ring_index);
 
+	START_PREFETCH_RECEIVE_WORK();
+
 	while (MyPState->ring_receive <= ring_index)
 	{
-		START_PREFETCH_RECEIVE_WORK();
 		entry = GetPrfSlot(MyPState->ring_receive);
 
 		Assert(entry->status == PRFS_REQUESTED);
@@ -683,17 +679,18 @@ prefetch_wait_for(uint64 ring_index)
 			result = false;
 			break;
 		}
-
-		END_PREFETCH_RECEIVE_WORK();
 		CHECK_FOR_INTERRUPTS();
 	}
+
 	if (result)
 	{
 		/* Check that slot is actually received (srver can be disconnected in prefetch_pump_state called from CHECK_FOR_INTERRUPTS */
 		PrefetchRequest *slot = GetPrfSlot(ring_index);
-		return slot->status == PRFS_RECEIVED;
+		result = slot->status == PRFS_RECEIVED;
 	}
-	return false;
+	END_PREFETCH_RECEIVE_WORK();
+
+	return result;
 ;
 }
 
@@ -720,6 +717,7 @@ prefetch_read(PrefetchRequest *slot)
 	Assert(slot->status == PRFS_REQUESTED);
 	Assert(slot->response == NULL);
 	Assert(slot->my_ring_index == MyPState->ring_receive);
+	Assert(readpage_reentrant_guard);
 
 	if (slot->status != PRFS_REQUESTED ||
 		slot->response != NULL ||
@@ -802,6 +800,7 @@ communicator_prefetch_receive(BufferTag tag)
 	PrfHashEntry *entry;
 	PrefetchRequest hashkey;
 
+	Assert(readpage_reentrant_guard);
 	hashkey.buftag = tag;
 	entry = prfh_lookup(MyPState->prf_hash, &hashkey);
 	if (entry != NULL && prefetch_wait_for(entry->slot->my_ring_index))
@@ -821,8 +820,12 @@ communicator_prefetch_receive(BufferTag tag)
 void
 prefetch_on_ps_disconnect(void)
 {
+	bool save_readpage_reentrant_guard = readpage_reentrant_guard;
 	MyPState->ring_flush = MyPState->ring_unused;
 
+	/* Prohibit callig of prefetch_pump_state */
+	START_PREFETCH_RECEIVE_WORK();
+
 	while (MyPState->ring_receive < MyPState->ring_unused)
 	{
 		PrefetchRequest *slot;
@@ -851,6 +854,9 @@ prefetch_on_ps_disconnect(void)
 		MyNeonCounters->getpage_prefetch_discards_total += 1;
 	}
 
+	/* Restore guard */
+	readpage_reentrant_guard = save_readpage_reentrant_guard;
+
 	/*
 	 * We can have gone into retry due to network error, so update stats with
 	 * the latest available
@@ -2509,7 +2515,7 @@ communicator_processinterrupts(void)
 	if (timeout_signaled)
 	{
 		if (!readpage_reentrant_guard && readahead_getpage_pull_timeout_ms > 0)
-			communicator_prefetch_pump_state(true);
+			communicator_prefetch_pump_state();
 
 		timeout_signaled = false;
 		communicator_reconfigure_timeout_if_needed();
diff --git a/pgxn/neon/communicator.h b/pgxn/neon/communicator.h
index f55c4b10f1..5376c9b839 100644
--- a/pgxn/neon/communicator.h
+++ b/pgxn/neon/communicator.h
@@ -44,7 +44,7 @@ extern int communicator_read_slru_segment(SlruKind kind, int64 segno,
 										  void *buffer);
 
 extern void communicator_reconfigure_timeout_if_needed(void);
-extern void communicator_prefetch_pump_state(bool IsHandlingInterrupts);
+extern void communicator_prefetch_pump_state(void);
 
 
 #endif
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 87eb420717..f574517b2a 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -1179,7 +1179,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 		blocknum += iterblocks;
 	}
 
-	communicator_prefetch_pump_state(false);
+	communicator_prefetch_pump_state();
 
 	return false;
 }
@@ -1218,7 +1218,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 
 	communicator_prefetch_register_bufferv(tag, NULL, 1, NULL);
 
-	communicator_prefetch_pump_state(false);
+	communicator_prefetch_pump_state();
 
 	return false;
 }
@@ -1262,7 +1262,7 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
 	 */
 	neon_log(SmgrTrace, "writeback noop");
 
-	communicator_prefetch_pump_state(false);
+	communicator_prefetch_pump_state();
 
 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
@@ -1315,7 +1315,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 	}
 
 	/* Try to read PS results if they are available */
-	communicator_prefetch_pump_state(false);
+	communicator_prefetch_pump_state();
 
 	neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1);
 
@@ -1339,7 +1339,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 	/*
 	 * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
 	 */
-	communicator_prefetch_pump_state(false);
+	communicator_prefetch_pump_state();
 
 #ifdef DEBUG_COMPARE_LOCAL
 	if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
@@ -1449,7 +1449,7 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 				 nblocks, PG_IOV_MAX);
 
 	/* Try to read PS results if they are available */
-	communicator_prefetch_pump_state(false);
+	communicator_prefetch_pump_state();
 
 	neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum,
 						  request_lsns, nblocks);
@@ -1480,7 +1480,7 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	/*
 	 * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
 	 */
-	communicator_prefetch_pump_state(false);
+	communicator_prefetch_pump_state();
 
 #ifdef DEBUG_COMPARE_LOCAL
 	if (forknum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
@@ -1665,7 +1665,7 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
 
 	lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);
 
-	communicator_prefetch_pump_state(false);
+	communicator_prefetch_pump_state();
 
 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
@@ -1727,7 +1727,7 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 
 	lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);
 
-	communicator_prefetch_pump_state(false);
+	communicator_prefetch_pump_state();
 
 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
@@ -1902,7 +1902,7 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
 
 	neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop");
 
-	communicator_prefetch_pump_state(false);
+	communicator_prefetch_pump_state();
 
 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index 05ddf212e2..d72d76f2cd 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit 05ddf212e2e07b788b5c8b88bdcf98630941f6ae
+Subproject commit d72d76f2cdee4194dd052ce099e9784aca7c794a
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 10aad7e1a2..e76510f969 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -5,7 +5,7 @@
   ],
   "v16": [
     "16.8",
-    "05ddf212e2e07b788b5c8b88bdcf98630941f6ae"
+    "d72d76f2cdee4194dd052ce099e9784aca7c794a"
   ],
   "v15": [
     "15.12",

From d0dc65da124d3f84e2f64ac5e3927b0a299c9eab Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Fri, 9 May 2025 18:12:49 +0800
Subject: [PATCH 18/65] fix(pageserver): give up gc-compaction if one key has
 too long history (#11869)

## Problem

The limitation we imposed last week in
https://github.com/neondatabase/neon/pull/11709 is not enough to protect
against excessive memory usage.

## Summary of changes

If a single key has accumulated too much history, give up compaction. In
the future, we can make the `generate_key_retention` function take a
stream of keys instead of first accumulating them in memory, and thus
easily support such long key history cases.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/tenant/timeline/compaction.rs | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 07cd274a41..6b155268d6 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -3606,6 +3606,13 @@ impl Timeline {
                     last_key = Some(key);
                 }
                 accumulated_values.push((key, lsn, val));
+
+                if accumulated_values.len() >= 65536 {
+                    // Assume all of them are images, that would be 512MB of data in memory for a single key.
+                    return Err(CompactionError::Other(anyhow!(
+                        "too many values for a single key, giving up gc-compaction"
+                    )));
+                }
             } else {
                 let last_key: &mut Key = last_key.as_mut().unwrap();
                 stat.on_unique_key_visited(); // TODO: adjust statistics for partial compaction

From d0aaec2abbf502a962351b5939f1fae974053cd5 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Fri, 9 May 2025 11:55:26 +0100
Subject: [PATCH 19/65] storage_controller: create imported timelines on
 safekeepers (#11801)

## Problem

SK timeline creations were skipped for imported timelines since we
didn't know the correct start LSN
of the timeline at that point.

## Summary of changes

Create imported timelines on the safekeepers as part of the import
finalization step.
We use the last record LSN of shard 0 as the start LSN for the
safekeeper timeline.

Closes https://github.com/neondatabase/neon/issues/11569
---
 storage_controller/src/service.rs             | 51 ++++++++++++++-----
 .../src/service/safekeeper_service.rs         | 36 +++++++++++++
 test_runner/regress/test_import_pgdata.py     | 33 ++++++------
 3 files changed, 90 insertions(+), 30 deletions(-)

diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index fdb791c2cf..193050460d 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -3886,10 +3886,10 @@ impl Service {
 
             None
         } else if safekeepers {
-            // Note that we do not support creating the timeline on the safekeepers
-            // for imported timelines. The `start_lsn` of the timeline is not known
-            // until the import finshes.
-            // https://github.com/neondatabase/neon/issues/11569
+            // Note that for imported timelines, we do not create the timeline on the safekeepers
+            // straight away. Instead, we do it once the import finalized such that we know what
+            // start LSN to provide for the safekeepers. This is done in
+            // [`Self::finalize_timeline_import`].
             let res = self
                 .tenant_timeline_create_safekeepers(tenant_id, &timeline_info)
                 .instrument(tracing::info_span!("timeline_create_safekeepers", %tenant_id, timeline_id=%timeline_info.timeline_id))
@@ -3966,11 +3966,22 @@ impl Service {
                 let active = self.timeline_active_on_all_shards(&import).await?;
 
                 match active {
-                    true => {
+                    Some(timeline_info) => {
                         tracing::info!("Timeline became active on all shards");
+
+                        if self.config.timelines_onto_safekeepers {
+                            // Now that we know the start LSN of this timeline, create it on the
+                            // safekeepers.
+                            self.tenant_timeline_create_safekeepers_until_success(
+                                import.tenant_id,
+                                timeline_info,
+                            )
+                            .await?;
+                        }
+
                         break;
                     }
-                    false => {
+                    None => {
                         tracing::info!("Timeline not active on all shards yet");
 
                         tokio::select! {
@@ -4004,9 +4015,6 @@ impl Service {
             .range_mut(TenantShardId::tenant_range(import.tenant_id))
             .for_each(|(_id, shard)| shard.importing = TimelineImportState::Idle);
 
-        // TODO(vlad): Timeline creations in import mode do not return a correct initdb lsn,
-        // so we can't create the timeline on the safekeepers. Fix by moving creation here.
-        // https://github.com/neondatabase/neon/issues/11569
         tracing::info!(%import_failed, "Timeline import complete");
 
         Ok(())
@@ -4021,10 +4029,16 @@ impl Service {
         .await;
     }
 
+    /// If the timeline is active on all shards, returns the [`TimelineInfo`]
+    /// collected from shard 0.
+    ///
+    /// An error is returned if the shard layout has changed during the import.
+    /// This is guarded against within the storage controller and the pageserver,
+    /// and, therefore, unexpected.
     async fn timeline_active_on_all_shards(
         self: &Arc<Self>,
         import: &TimelineImport,
-    ) -> anyhow::Result<bool> {
+    ) -> anyhow::Result<Option<TimelineInfo>> {
         let targets = {
             let locked = self.inner.read().unwrap();
             let mut targets = Vec::new();
@@ -4048,13 +4062,17 @@ impl Service {
                         .expect("Pageservers may not be deleted while referenced");
                     targets.push((*tenant_shard_id, node.clone()));
                 } else {
-                    return Ok(false);
+                    return Ok(None);
                 }
             }
 
             targets
         };
 
+        if targets.is_empty() {
+            anyhow::bail!("No shards found to finalize import for");
+        }
+
         let results = self
             .tenant_for_shards_api(
                 targets,
@@ -4070,10 +4088,17 @@ impl Service {
             )
             .await;
 
-        Ok(results.into_iter().all(|res| match res {
+        let all_active = results.iter().all(|res| match res {
             Ok(info) => info.state == TimelineState::Active,
             Err(_) => false,
-        }))
+        });
+
+        if all_active {
+            // Both unwraps are validated above
+            Ok(Some(results.into_iter().next().unwrap().unwrap()))
+        } else {
+            Ok(None)
+        }
     }
 
     pub(crate) async fn tenant_timeline_archival_config(
diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs
index 5eecf0d415..5c15660ba3 100644
--- a/storage_controller/src/service/safekeeper_service.rs
+++ b/storage_controller/src/service/safekeeper_service.rs
@@ -323,6 +323,42 @@ impl Service {
         })
     }
 
+    pub(crate) async fn tenant_timeline_create_safekeepers_until_success(
+        self: &Arc<Self>,
+        tenant_id: TenantId,
+        timeline_info: TimelineInfo,
+    ) -> anyhow::Result<()> {
+        const BACKOFF: Duration = Duration::from_secs(5);
+
+        loop {
+            if self.cancel.is_cancelled() {
+                anyhow::bail!("Shut down requested while finalizing import");
+            }
+
+            let res = self
+                .tenant_timeline_create_safekeepers(tenant_id, &timeline_info)
+                .await;
+
+            match res {
+                Ok(_) => {
+                    tracing::info!("Timeline created on safekeepers");
+                    break;
+                }
+                Err(err) => {
+                    tracing::error!("Failed to create timeline on safekeepers: {err}");
+                    tokio::select! {
+                        _ = self.cancel.cancelled() => {
+                            anyhow::bail!("Shut down requested while finalizing import");
+                        },
+                        _ = tokio::time::sleep(BACKOFF) => {}
+                    };
+                }
+            }
+        }
+
+        Ok(())
+    }
+
     /// Directly insert the timeline into the database without reconciling it with safekeepers.
     ///
     /// Useful if the timeline already exists on the specified safekeepers,
diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py
index 2fda1991f7..05e63ad955 100644
--- a/test_runner/regress/test_import_pgdata.py
+++ b/test_runner/regress/test_import_pgdata.py
@@ -24,6 +24,7 @@ from fixtures.utils import (
     skip_in_debug_build,
     wait_until,
 )
+from fixtures.workload import Workload
 from mypy_boto3_kms import KMSClient
 from mypy_boto3_kms.type_defs import EncryptResponseTypeDef
 from mypy_boto3_s3 import S3Client
@@ -97,6 +98,10 @@ def test_pgdata_import_smoke(
         f"http://{cplane_mgmt_api_server.host}:{cplane_mgmt_api_server.port}/storage/api/v1/"
     )
 
+    if neon_env_builder.storage_controller_config is None:
+        neon_env_builder.storage_controller_config = {}
+    neon_env_builder.storage_controller_config["timelines_onto_safekeepers"] = True
+
     env = neon_env_builder.init_start()
 
     # The test needs LocalFs support, which is only built in testing mode.
@@ -286,34 +291,28 @@ def test_pgdata_import_smoke(
     #
     # validate that we can write
     #
-    rw_endpoint = env.endpoints.create_start(
-        branch_name=import_branch_name,
-        endpoint_id="rw",
-        tenant_id=tenant_id,
-        config_lines=ep_config,
-    )
-    rw_endpoint.safe_psql("create table othertable(values text)")
-    rw_lsn = Lsn(rw_endpoint.safe_psql_scalar("select pg_current_wal_flush_lsn()"))
+    workload = Workload(env, tenant_id, timeline_id, branch_name=import_branch_name)
+    workload.init()
+    workload.write_rows(64)
+    workload.validate()
 
-    # TODO: consider using `class Workload` here
-    # to do compaction and whatnot?
+    rw_lsn = Lsn(workload.endpoint().safe_psql_scalar("select pg_current_wal_flush_lsn()"))
 
     #
     # validate that we can branch (important use case)
     #
 
     # ... at the tip
-    _ = env.create_branch(
+    child_timeline_id = env.create_branch(
         new_branch_name="br-tip",
         ancestor_branch_name=import_branch_name,
         tenant_id=tenant_id,
         ancestor_start_lsn=rw_lsn,
     )
-    br_tip_endpoint = env.endpoints.create_start(
-        branch_name="br-tip", endpoint_id="br-tip-ro", tenant_id=tenant_id, config_lines=ep_config
-    )
-    validate_vanilla_equivalence(br_tip_endpoint)
-    br_tip_endpoint.safe_psql("select * from othertable")
+    child_workload = workload.branch(timeline_id=child_timeline_id, branch_name="br-tip")
+    child_workload.validate()
+
+    validate_vanilla_equivalence(child_workload.endpoint())
 
     # ... at the initdb lsn
     _ = env.create_branch(
@@ -330,7 +329,7 @@ def test_pgdata_import_smoke(
     )
     validate_vanilla_equivalence(br_initdb_endpoint)
     with pytest.raises(psycopg2.errors.UndefinedTable):
-        br_initdb_endpoint.safe_psql("select * from othertable")
+        br_initdb_endpoint.safe_psql(f"select * from {workload.table}")
 
 
 @run_only_on_default_postgres(reason="PG version is irrelevant here")

From 93b964f829f05b4c7e9bf6408f504bf6b70e033b Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Fri, 9 May 2025 20:07:52 +0800
Subject: [PATCH 20/65]  fix(pageserver): do not do image compaction if it's
 below gc cutoff (#11872)

## Problem

We observe image compaction errors after gc-compaction finishes
compacting below the gc_cutoff. This is because `repartition` returns an
LSN below the gc horizon as we (likely) determined that `distance <=
self.repartition_threshold`.

I think it's better to keep the current behavior of when to trigger
compaction but we should skip image compaction if the returned LSN is
below the gc horizon.

## Summary of changes

If `repartition` returns an LSN below the GC cutoff, skip image compaction.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/tenant/timeline/compaction.rs | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 6b155268d6..e7d39db70d 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -1277,6 +1277,8 @@ impl Timeline {
             return Ok(CompactionOutcome::YieldForL0);
         }
 
+        let gc_cutoff = *self.applied_gc_cutoff_lsn.read();
+
         // 2. Repartition and create image layers if necessary
         match self
             .repartition(
@@ -1287,7 +1289,7 @@ impl Timeline {
             )
             .await
         {
-            Ok(((dense_partitioning, sparse_partitioning), lsn)) => {
+            Ok(((dense_partitioning, sparse_partitioning), lsn)) if lsn >= gc_cutoff => {
                 // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them
                 let image_ctx = RequestContextBuilder::from(ctx)
                     .access_stats_behavior(AccessStatsBehavior::Skip)
@@ -1341,6 +1343,10 @@ impl Timeline {
                 }
             }
 
+            Ok(_) => {
+                info!("skipping repartitioning due to image compaction LSN being below GC cutoff");
+            }
+
             // Suppress errors when cancelled.
             Err(_) if self.cancel.is_cancelled() => {}
             Err(err) if err.is_cancel() => {}

From 33abfc2b741de285846a8cfaef5c2e158d039342 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Fri, 9 May 2025 15:34:22 +0200
Subject: [PATCH 21/65] storcon: remove finished safekeeper reconciliations
 from in-memory hashmap (#11876)

## Problem

Currently there is a memory leak, in that finished safekeeper
reconciliations leave a cancellation token behind which is never cleaned
up.

## Summary of changes

This change adds cleanup once a reconciliation has finished. To ensure
that we remove the correct cancellation token and have not raced with
another reconciliation, we introduce a `TokenId` counter to tell
tokens apart.

Part of https://github.com/neondatabase/neon/issues/11670
---
 .../src/service/safekeeper_reconciler.rs      | 133 ++++++++++++------
 1 file changed, 88 insertions(+), 45 deletions(-)

diff --git a/storage_controller/src/service/safekeeper_reconciler.rs b/storage_controller/src/service/safekeeper_reconciler.rs
index 71c73a0112..17bb132982 100644
--- a/storage_controller/src/service/safekeeper_reconciler.rs
+++ b/storage_controller/src/service/safekeeper_reconciler.rs
@@ -1,4 +1,9 @@
-use std::{collections::HashMap, str::FromStr, sync::Arc, time::Duration};
+use std::{
+    collections::HashMap,
+    str::FromStr,
+    sync::{Arc, atomic::AtomicU64},
+    time::Duration,
+};
 
 use clashmap::{ClashMap, Entry};
 use safekeeper_api::models::PullTimelineRequest;
@@ -169,10 +174,17 @@ pub(crate) struct ScheduleRequest {
     pub(crate) kind: SafekeeperTimelineOpKind,
 }
 
+/// A way to keep ongoing/queued reconcile requests apart
+#[derive(Copy, Clone, PartialEq, Eq)]
+struct TokenId(u64);
+
+type OngoingTokens = ClashMap<(TenantId, Option<TimelineId>), (CancellationToken, TokenId)>;
+
 /// Handle to per safekeeper reconciler.
 struct ReconcilerHandle {
-    tx: UnboundedSender<(ScheduleRequest, CancellationToken)>,
-    ongoing_tokens: Arc<ClashMap<(TenantId, Option<TimelineId>), CancellationToken>>,
+    tx: UnboundedSender<(ScheduleRequest, CancellationToken, TokenId)>,
+    ongoing_tokens: Arc<OngoingTokens>,
+    token_id_counter: AtomicU64,
     cancel: CancellationToken,
 }
 
@@ -185,24 +197,28 @@ impl ReconcilerHandle {
         &self,
         tenant_id: TenantId,
         timeline_id: Option<TimelineId>,
-    ) -> CancellationToken {
+    ) -> (CancellationToken, TokenId) {
+        let token_id = self
+            .token_id_counter
+            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+        let token_id = TokenId(token_id);
         let entry = self.ongoing_tokens.entry((tenant_id, timeline_id));
         if let Entry::Occupied(entry) = &entry {
-            let cancel: &CancellationToken = entry.get();
+            let (cancel, _) = entry.get();
             cancel.cancel();
         }
-        entry.insert(self.cancel.child_token()).clone()
+        entry.insert((self.cancel.child_token(), token_id)).clone()
     }
     /// Cancel an ongoing reconciliation
     fn cancel_reconciliation(&self, tenant_id: TenantId, timeline_id: Option<TimelineId>) {
-        if let Some((_, cancel)) = self.ongoing_tokens.remove(&(tenant_id, timeline_id)) {
+        if let Some((_, (cancel, _id))) = self.ongoing_tokens.remove(&(tenant_id, timeline_id)) {
             cancel.cancel();
         }
     }
     fn schedule_reconcile(&self, req: ScheduleRequest) {
-        let cancel = self.new_token_slot(req.tenant_id, req.timeline_id);
+        let (cancel, token_id) = self.new_token_slot(req.tenant_id, req.timeline_id);
         let hostname = req.safekeeper.skp.host.clone();
-        if let Err(err) = self.tx.send((req, cancel)) {
+        if let Err(err) = self.tx.send((req, cancel, token_id)) {
             tracing::info!("scheduling request onto {hostname} returned error: {err}");
         }
     }
@@ -211,13 +227,14 @@ impl ReconcilerHandle {
 pub(crate) struct SafekeeperReconciler {
     inner: SafekeeperReconcilerInner,
     concurrency_limiter: Arc<Semaphore>,
-    rx: UnboundedReceiver<(ScheduleRequest, CancellationToken)>,
+    rx: UnboundedReceiver<(ScheduleRequest, CancellationToken, TokenId)>,
     cancel: CancellationToken,
 }
 
 /// Thin wrapper over `Service` to not clutter its inherent functions
 #[derive(Clone)]
 struct SafekeeperReconcilerInner {
+    ongoing_tokens: Arc<OngoingTokens>,
     service: Arc<Service>,
 }
 
@@ -226,15 +243,20 @@ impl SafekeeperReconciler {
         // We hold the ServiceInner lock so we don't want to make sending to the reconciler channel to be blocking.
         let (tx, rx) = mpsc::unbounded_channel();
         let concurrency = service.config.safekeeper_reconciler_concurrency;
+        let ongoing_tokens = Arc::new(ClashMap::new());
         let mut reconciler = SafekeeperReconciler {
-            inner: SafekeeperReconcilerInner { service },
+            inner: SafekeeperReconcilerInner {
+                service,
+                ongoing_tokens: ongoing_tokens.clone(),
+            },
             rx,
             concurrency_limiter: Arc::new(Semaphore::new(concurrency)),
             cancel: cancel.clone(),
         };
         let handle = ReconcilerHandle {
             tx,
-            ongoing_tokens: Arc::new(ClashMap::new()),
+            ongoing_tokens,
+            token_id_counter: AtomicU64::new(0),
             cancel,
         };
         tokio::spawn(async move { reconciler.run().await });
@@ -246,7 +268,9 @@ impl SafekeeperReconciler {
                 req = self.rx.recv() => req,
                 _ = self.cancel.cancelled() => break,
             };
-            let Some((req, req_cancel)) = req else { break };
+            let Some((req, req_cancel, req_token_id)) = req else {
+                break;
+            };
 
             let permit_res = tokio::select! {
                 req = self.concurrency_limiter.clone().acquire_owned() => req,
@@ -265,7 +289,7 @@ impl SafekeeperReconciler {
                 let timeline_id = req.timeline_id;
                 let node_id = req.safekeeper.skp.id;
                 inner
-                    .reconcile_one(req, req_cancel)
+                    .reconcile_one(req, req_cancel, req_token_id)
                     .instrument(tracing::info_span!(
                         "reconcile_one",
                         ?kind,
@@ -280,8 +304,14 @@ impl SafekeeperReconciler {
 }
 
 impl SafekeeperReconcilerInner {
-    async fn reconcile_one(&self, req: ScheduleRequest, req_cancel: CancellationToken) {
+    async fn reconcile_one(
+        &self,
+        req: ScheduleRequest,
+        req_cancel: CancellationToken,
+        req_token_id: TokenId,
+    ) {
         let req_host = req.safekeeper.skp.host.clone();
+        let success;
         match req.kind {
             SafekeeperTimelineOpKind::Pull => {
                 let Some(timeline_id) = req.timeline_id else {
@@ -302,19 +332,22 @@ impl SafekeeperReconcilerInner {
                     tenant_id: req.tenant_id,
                     timeline_id,
                 };
-                self.reconcile_inner(
-                    req,
-                    async |client| client.pull_timeline(&pull_req).await,
-                    |resp| {
-                        if let Some(host) = resp.safekeeper_host {
-                            tracing::info!("pulled timeline from {host} onto {req_host}");
-                        } else {
-                            tracing::info!("timeline already present on safekeeper on {req_host}");
-                        }
-                    },
-                    req_cancel,
-                )
-                .await;
+                success = self
+                    .reconcile_inner(
+                        &req,
+                        async |client| client.pull_timeline(&pull_req).await,
+                        |resp| {
+                            if let Some(host) = resp.safekeeper_host {
+                                tracing::info!("pulled timeline from {host} onto {req_host}");
+                            } else {
+                                tracing::info!(
+                                    "timeline already present on safekeeper on {req_host}"
+                                );
+                            }
+                        },
+                        req_cancel,
+                    )
+                    .await;
             }
             SafekeeperTimelineOpKind::Exclude => {
                 // TODO actually exclude instead of delete here
@@ -325,22 +358,23 @@ impl SafekeeperReconcilerInner {
                     );
                     return;
                 };
-                self.reconcile_inner(
-                    req,
-                    async |client| client.delete_timeline(tenant_id, timeline_id).await,
-                    |_resp| {
-                        tracing::info!("deleted timeline from {req_host}");
-                    },
-                    req_cancel,
-                )
-                .await;
+                success = self
+                    .reconcile_inner(
+                        &req,
+                        async |client| client.delete_timeline(tenant_id, timeline_id).await,
+                        |_resp| {
+                            tracing::info!("deleted timeline from {req_host}");
+                        },
+                        req_cancel,
+                    )
+                    .await;
             }
             SafekeeperTimelineOpKind::Delete => {
                 let tenant_id = req.tenant_id;
                 if let Some(timeline_id) = req.timeline_id {
-                    let deleted = self
+                    success = self
                         .reconcile_inner(
-                            req,
+                            &req,
                             async |client| client.delete_timeline(tenant_id, timeline_id).await,
                             |_resp| {
                                 tracing::info!("deleted timeline from {req_host}");
@@ -348,13 +382,13 @@ impl SafekeeperReconcilerInner {
                             req_cancel,
                         )
                         .await;
-                    if deleted {
+                    if success {
                         self.delete_timeline_from_db(tenant_id, timeline_id).await;
                     }
                 } else {
-                    let deleted = self
+                    success = self
                         .reconcile_inner(
-                            req,
+                            &req,
                             async |client| client.delete_tenant(tenant_id).await,
                             |_resp| {
                                 tracing::info!(%tenant_id, "deleted tenant from {req_host}");
@@ -362,12 +396,21 @@ impl SafekeeperReconcilerInner {
                             req_cancel,
                         )
                         .await;
-                    if deleted {
+                    if success {
                         self.delete_tenant_timelines_from_db(tenant_id).await;
                     }
                 }
             }
         }
+        if success {
+            self.ongoing_tokens.remove_if(
+                &(req.tenant_id, req.timeline_id),
+                |_ttid, (_cancel, token_id)| {
+                    // Ensure that this request is indeed the request we just finished and not a new one
+                    req_token_id == *token_id
+                },
+            );
+        }
     }
     async fn delete_timeline_from_db(&self, tenant_id: TenantId, timeline_id: TimelineId) {
         match self
@@ -421,10 +464,10 @@ impl SafekeeperReconcilerInner {
             self.delete_timeline_from_db(tenant_id, timeline_id).await;
         }
     }
-    /// Returns whether the reconciliation happened successfully
+    /// Returns whether the reconciliation happened successfully (or we got cancelled)
     async fn reconcile_inner<T, F, U>(
         &self,
-        req: ScheduleRequest,
+        req: &ScheduleRequest,
         closure: impl Fn(SafekeeperClient) -> F,
         log_success: impl FnOnce(T) -> U,
         req_cancel: CancellationToken,

From 3b7cc4234c8675b777a3f85798734c0b41748d11 Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <matthias@neon.tech>
Date: Fri, 9 May 2025 19:02:24 +0200
Subject: [PATCH 22/65] Fix PS connect attempt timeouts when facing interrupts
 (#11880)

With the 50ms state-pumping timeouts in communicator.c, we need to
correctly handle these timeouts, which also wake up pg_usleep.

This new approach makes connection attempts restart the wait whenever
they get woken up early, and calls CHECK_FOR_INTERRUPTS() to make sure
we don't miss query cancellations.

## Problem

https://neondb.slack.com/archives/C04DGM6SMTM/p1746794528680269

## Summary of changes

Make sure we start sleeping again if pg_usleep got woken up ahead of
time.
---
 pgxn/neon/libpagestore.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index ee4e6ccc5b..3b6c4247c3 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -433,7 +433,6 @@ pageserver_connect(shardno_t shard_no, int elevel)
 
 		now = GetCurrentTimestamp();
 		us_since_last_attempt = (int64) (now - shard->last_reconnect_time);
-		shard->last_reconnect_time = now;
 
 		/*
 		 * Make sure we don't do exponential backoff with a constant multiplier
@@ -447,14 +446,23 @@ pageserver_connect(shardno_t shard_no, int elevel)
 		/*
 		 * If we did other tasks between reconnect attempts, then we won't
 		 * need to wait as long as a full delay.
+		 *
+		 * This is a loop to protect against interrupted sleeps.
 		 */
-		if (us_since_last_attempt < shard->delay_us)
+		while (us_since_last_attempt < shard->delay_us)
 		{
 			pg_usleep(shard->delay_us - us_since_last_attempt);
+
+			/* At least we should handle cancellations here */
+			CHECK_FOR_INTERRUPTS();
+
+			now = GetCurrentTimestamp();
+			us_since_last_attempt = (int64) (now - shard->last_reconnect_time);
 		}
 
 		/* update the delay metric */
 		shard->delay_us = Min(shard->delay_us * 2, MAX_RECONNECT_INTERVAL_USEC);
+		shard->last_reconnect_time = now;
 
 		/*
 		 * Connect using the connection string we got from the

From f5070f6aa4dad26b669811bf72923665f0340147 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 9 May 2025 20:13:35 +0200
Subject: [PATCH 23/65] fixup(direct IO): PR #11864 broke test suite
 parametrization (#11887)

PR
- github.com/neondatabase/neon/pull/11864

committed yesterday rendered the `PAGESERVER_VIRTUAL_FILE_IO_MODE`
env-var-based parametrization ineffective.

As a consequence, the tests and benchmarks in `test_runner/` were using
the binary built-in-default, i.e., `buffered`.
---
 test_runner/fixtures/neon_fixtures.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 131820f23e..8f56ee4392 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1274,6 +1274,8 @@ class NeonEnv:
 
             if self.pageserver_virtual_file_io_engine is not None:
                 ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine
+            if self.pageserver_virtual_file_io_mode is not None:
+                ps_cfg["virtual_file_io_mode"] = self.pageserver_virtual_file_io_mode
             if config.pageserver_default_tenant_config_compaction_algorithm is not None:
                 tenant_config = ps_cfg.setdefault("tenant_config", {})
                 tenant_config["compaction_algorithm"] = (

From 79ddc803af16e35c5d5a9b1c2c520c1fa88adcc4 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Sat, 10 May 2025 16:19:52 +0200
Subject: [PATCH 24/65] feat(direct IO): runtime alignment validation; support
 config flag on macOS; default to `DirectRw` (#11868)

This PR adds a runtime validation mode to check adherence to alignment
and size-multiple requirements at the VirtualFile level.

This can help prevent alignment bugs from slipping into production
because test systems may have more lax requirements than production.
(This is not the case today, but it could change in the future).

It also allows catching O_DIRECT bugs on systems that don't have
O_DIRECT (macOS).
Consequently, we can now accept
`virtual_file_io_mode={direct,direct-rw}` on macOS.
This has the side benefit of removing some annoying conditional
compilation around `IoMode`.

A third benefit is that it helped weed out size-multiple requirement
violation bugs in how the VirtualFile unit tests exercise read and write
APIs.
I seized the opportunity to trim these tests down to what actually
matters, i.e., exercising of the `OpenFiles` file descriptor cache.

Lastly, this PR flips the binary-built-in default to `DirectRw` so that
when running Python regress tests and benchmarks without specifying
`PAGESERVER_VIRTUAL_FILE_IO_MODE`, one gets the production behavior.

Refs
- fixes https://github.com/neondatabase/neon/issues/11676
---
 .../pageserver_config/pageserver.toml         |   1 +
 libs/pageserver_api/src/models.rs             |  28 +-
 pageserver/benches/bench_ingest.rs            |   9 +-
 pageserver/src/virtual_file.rs                | 309 +++++++-----------
 pageserver/src/virtual_file/open_options.rs   |  59 +++-
 .../fixtures/pageserver/allowed_errors.py     |   7 +
 6 files changed, 178 insertions(+), 235 deletions(-)

diff --git a/docker-compose/pageserver_config/pageserver.toml b/docker-compose/pageserver_config/pageserver.toml
index 7d603b6c65..81445ed412 100644
--- a/docker-compose/pageserver_config/pageserver.toml
+++ b/docker-compose/pageserver_config/pageserver.toml
@@ -5,3 +5,4 @@ listen_http_addr='0.0.0.0:9898'
 remote_storage={ endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/pageserver' }
 control_plane_api='http://0.0.0.0:6666' # No storage controller in docker compose, specify a junk address
 control_plane_emergency_mode=true
+virtual_file_io_mode="buffered" # the CI runners where we run the docker compose tests have slow disks
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index ff911499ab..5fcdefba66 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -1832,6 +1832,7 @@ pub mod virtual_file {
         Eq,
         Hash,
         strum_macros::EnumString,
+        strum_macros::EnumIter,
         strum_macros::Display,
         serde_with::DeserializeFromStr,
         serde_with::SerializeDisplay,
@@ -1843,10 +1844,8 @@ pub mod virtual_file {
         /// Uses buffered IO.
         Buffered,
         /// Uses direct IO for reads only.
-        #[cfg(target_os = "linux")]
         Direct,
         /// Use direct IO for reads and writes.
-        #[cfg(target_os = "linux")]
         DirectRw,
     }
 
@@ -1854,26 +1853,13 @@ pub mod virtual_file {
         pub fn preferred() -> Self {
             // The default behavior when running Rust unit tests without any further
             // flags is to use the newest behavior (DirectRw).
-            // The CI uses the following environment variable to unit tests for all
-            // different modes.
+            // The CI uses the environment variable to run the unit tests in all the different modes.
             // NB: the Python regression & perf tests have their own defaults management
             // that writes pageserver.toml; they do not use this variable.
-            if cfg!(test) {
-                static CACHED: LazyLock<IoMode> = LazyLock::new(|| {
-                    utils::env::var_serde_json_string(
-                        "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IO_MODE",
-                    )
-                    .unwrap_or(
-                        #[cfg(target_os = "linux")]
-                        IoMode::DirectRw,
-                        #[cfg(not(target_os = "linux"))]
-                        IoMode::Buffered,
-                    )
-                });
-                *CACHED
-            } else {
-                IoMode::Buffered
-            }
+            static ENV_OVERRIDE: LazyLock<Option<IoMode>> = LazyLock::new(|| {
+                utils::env::var_serde_json_string("NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IO_MODE")
+            });
+            ENV_OVERRIDE.unwrap_or(IoMode::DirectRw)
         }
     }
 
@@ -1883,9 +1869,7 @@ pub mod virtual_file {
         fn try_from(value: u8) -> Result<Self, Self::Error> {
             Ok(match value {
                 v if v == (IoMode::Buffered as u8) => IoMode::Buffered,
-                #[cfg(target_os = "linux")]
                 v if v == (IoMode::Direct as u8) => IoMode::Direct,
-                #[cfg(target_os = "linux")]
                 v if v == (IoMode::DirectRw as u8) => IoMode::DirectRw,
                 x => return Err(x),
             })
diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs
index 2836450a0e..eaadfe14ae 100644
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -14,6 +14,7 @@ use pageserver_api::key::Key;
 use pageserver_api::models::virtual_file::IoMode;
 use pageserver_api::shard::TenantShardId;
 use pageserver_api::value::Value;
+use strum::IntoEnumIterator;
 use tokio_util::sync::CancellationToken;
 use utils::bin_ser::BeSer;
 use utils::id::{TenantId, TimelineId};
@@ -244,13 +245,7 @@ fn criterion_benchmark(c: &mut Criterion) {
     ];
     let exploded_parameters = {
         let mut out = Vec::new();
-        for io_mode in [
-            IoMode::Buffered,
-            #[cfg(target_os = "linux")]
-            IoMode::Direct,
-            #[cfg(target_os = "linux")]
-            IoMode::DirectRw,
-        ] {
+        for io_mode in IoMode::iter() {
             for param in expect.clone() {
                 let HandPickedParameters {
                     volume_mib,
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index f429e59ef3..c707d35114 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -74,6 +74,8 @@ pub struct VirtualFile {
 
 impl VirtualFile {
     /// Open a file in read-only mode. Like File::open.
+    ///
+    /// Insensitive to `virtual_file_io_mode` setting.
     pub async fn open<P: AsRef<Utf8Path>>(
         path: P,
         ctx: &RequestContext,
@@ -95,31 +97,20 @@ impl VirtualFile {
         Self::open_with_options_v2(path.as_ref(), OpenOptions::new().read(true), ctx).await
     }
 
+    /// `O_DIRECT` will be enabled based on `virtual_file_io_mode`.
     pub async fn open_with_options_v2<P: AsRef<Utf8Path>>(
         path: P,
-        #[cfg_attr(not(target_os = "linux"), allow(unused_mut))] mut open_options: OpenOptions,
+        mut open_options: OpenOptions,
         ctx: &RequestContext,
     ) -> Result<Self, std::io::Error> {
         let mode = get_io_mode();
-        let set_o_direct = match (mode, open_options.is_write()) {
+        let direct = match (mode, open_options.is_write()) {
             (IoMode::Buffered, _) => false,
-            #[cfg(target_os = "linux")]
             (IoMode::Direct, false) => true,
-            #[cfg(target_os = "linux")]
             (IoMode::Direct, true) => false,
-            #[cfg(target_os = "linux")]
             (IoMode::DirectRw, _) => true,
         };
-        if set_o_direct {
-            #[cfg(target_os = "linux")]
-            {
-                open_options = open_options.custom_flags(nix::libc::O_DIRECT);
-            }
-            #[cfg(not(target_os = "linux"))]
-            unreachable!(
-                "O_DIRECT is not supported on this platform, IoMode's that result in set_o_direct=true shouldn't even be defined"
-            );
-        }
+        open_options = open_options.direct(direct);
         let inner = VirtualFileInner::open_with_options(path, open_options, ctx).await?;
         Ok(VirtualFile { inner, _mode: mode })
     }
@@ -791,6 +782,12 @@ impl VirtualFileInner {
     where
         Buf: tokio_epoll_uring::IoBufMut + Send,
     {
+        self.validate_direct_io(
+            Slice::stable_ptr(&buf).addr(),
+            Slice::bytes_total(&buf),
+            offset,
+        );
+
         let file_guard = match self
             .lock_file()
             .await
@@ -816,6 +813,8 @@ impl VirtualFileInner {
         offset: u64,
         ctx: &RequestContext,
     ) -> (FullSlice<B>, Result<usize, Error>) {
+        self.validate_direct_io(buf.as_ptr().addr(), buf.len(), offset);
+
         let file_guard = match self.lock_file().await {
             Ok(file_guard) => file_guard,
             Err(e) => return (buf, Err(e)),
@@ -830,6 +829,64 @@ impl VirtualFileInner {
             (buf, result)
         })
     }
+
+    /// Validate all reads and writes to adhere to the O_DIRECT requirements of our production systems.
+    ///
+    /// Validating it in userspace sets a consistent bar, independent of what actual OS/filesystem/block device is in use.
+    fn validate_direct_io(&self, addr: usize, size: usize, offset: u64) {
+        // TODO: eventually enable validation in the builds we use in real environments like staging, preprod, and prod.
+        if !(cfg!(feature = "testing") || cfg!(test)) {
+            return;
+        }
+        if !self.open_options.is_direct() {
+            return;
+        }
+
+        // Validate buffer memory alignment.
+        //
+        // What practically matters as of Linux 6.1 is bdev_dma_alignment()
+        // which is practically between 512 and 4096.
+        // On our production systems, the value is 512.
+        // The IoBuffer/IoBufferMut hard-code that value.
+        //
+        // Because the allocator might return _more_ aligned addresses than requested,
+        // there is a chance that testing would not catch violations of a runtime requirement stricter than 512.
+        {
+            let requirement = 512;
+            let remainder = addr % requirement;
+            assert!(
+                remainder == 0,
+                "Direct I/O buffer must be aligned: buffer_addr=0x{addr:x} % 0x{requirement:x} = 0x{remainder:x}"
+            );
+        }
+
+        // Validate offset alignment.
+        //
+        // We hard-code 512 throughout the code base.
+        // So enforce just that and not anything more restrictive.
+        // Even the shallowest testing will expose more restrictive requirements if those ever arise.
+        {
+            let requirement = 512;
+            let remainder = offset % requirement;
+            assert!(
+                remainder == 0,
+                "Direct I/O offset must be aligned: offset=0x{offset:x} % 0x{requirement:x} = 0x{remainder:x}"
+            );
+        }
+
+        // Validate buffer size multiple requirement.
+        //
+        // The requirement in Linux 6.1 is bdev_logical_block_size().
+        // On our production systems, that is 512.
+        {
+            let requirement = 512;
+            let remainder = size % requirement;
+            assert!(
+                remainder == 0,
+                "Direct I/O buffer size must be a multiple of {requirement}: size=0x{size:x} % 0x{requirement:x} = 0x{remainder:x}"
+            );
+        }
+    }
 }
 
 // Adapted from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135
@@ -1218,7 +1275,6 @@ mod tests {
     use std::sync::Arc;
 
     use owned_buffers_io::io_buf_ext::IoBufExt;
-    use owned_buffers_io::slice::SliceMutExt;
     use rand::seq::SliceRandom;
     use rand::{Rng, thread_rng};
 
@@ -1226,162 +1282,38 @@ mod tests {
     use crate::context::DownloadBehavior;
     use crate::task_mgr::TaskKind;
 
-    enum MaybeVirtualFile {
-        VirtualFile(VirtualFile),
-        File(File),
-    }
-
-    impl From<VirtualFile> for MaybeVirtualFile {
-        fn from(vf: VirtualFile) -> Self {
-            MaybeVirtualFile::VirtualFile(vf)
-        }
-    }
-
-    impl MaybeVirtualFile {
-        async fn read_exact_at(
-            &self,
-            mut slice: tokio_epoll_uring::Slice<IoBufferMut>,
-            offset: u64,
-            ctx: &RequestContext,
-        ) -> Result<tokio_epoll_uring::Slice<IoBufferMut>, Error> {
-            match self {
-                MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(slice, offset, ctx).await,
-                MaybeVirtualFile::File(file) => {
-                    let rust_slice: &mut [u8] = slice.as_mut_rust_slice_full_zeroed();
-                    file.read_exact_at(rust_slice, offset).map(|()| slice)
-                }
-            }
-        }
-        async fn write_all_at<Buf: IoBufAligned + Send>(
-            &self,
-            buf: FullSlice<Buf>,
-            offset: u64,
-            ctx: &RequestContext,
-        ) -> Result<(), Error> {
-            match self {
-                MaybeVirtualFile::VirtualFile(file) => {
-                    let (_buf, res) = file.write_all_at(buf, offset, ctx).await;
-                    res
-                }
-                MaybeVirtualFile::File(file) => file.write_all_at(&buf[..], offset),
-            }
-        }
-
-        // Helper function to slurp a portion of a file into a string
-        async fn read_string_at(
-            &mut self,
-            pos: u64,
-            len: usize,
-            ctx: &RequestContext,
-        ) -> Result<String, Error> {
-            let slice = IoBufferMut::with_capacity(len).slice_full();
-            assert_eq!(slice.bytes_total(), len);
-            let slice = self.read_exact_at(slice, pos, ctx).await?;
-            let buf = slice.into_inner();
-            assert_eq!(buf.len(), len);
-
-            Ok(String::from_utf8(buf.to_vec()).unwrap())
-        }
-    }
-
     #[tokio::test]
     async fn test_virtual_files() -> anyhow::Result<()> {
-        // The real work is done in the test_files() helper function. This
-        // allows us to run the same set of tests against a native File, and
-        // VirtualFile. We trust the native Files and wouldn't need to test them,
-        // but this allows us to verify that the operations return the same
-        // results with VirtualFiles as with native Files. (Except that with
-        // native files, you will run out of file descriptors if the ulimit
-        // is low enough.)
-        struct A;
-
-        impl Adapter for A {
-            async fn open(
-                path: Utf8PathBuf,
-                opts: OpenOptions,
-                ctx: &RequestContext,
-            ) -> Result<MaybeVirtualFile, anyhow::Error> {
-                let vf = VirtualFile::open_with_options_v2(&path, opts, ctx).await?;
-                Ok(MaybeVirtualFile::VirtualFile(vf))
-            }
-        }
-        test_files::<A>("virtual_files").await
-    }
-
-    #[tokio::test]
-    async fn test_physical_files() -> anyhow::Result<()> {
-        struct B;
-
-        impl Adapter for B {
-            async fn open(
-                path: Utf8PathBuf,
-                opts: OpenOptions,
-                _ctx: &RequestContext,
-            ) -> Result<MaybeVirtualFile, anyhow::Error> {
-                Ok(MaybeVirtualFile::File({
-                    let owned_fd = opts.open(path.as_std_path()).await?;
-                    File::from(owned_fd)
-                }))
-            }
-        }
-
-        test_files::<B>("physical_files").await
-    }
-
-    /// This is essentially a closure which returns a MaybeVirtualFile, but because rust edition
-    /// 2024 is not yet out with new lifetime capture or outlives rules, this is a async function
-    /// in trait which benefits from the new lifetime capture rules already.
-    trait Adapter {
-        async fn open(
-            path: Utf8PathBuf,
-            opts: OpenOptions,
-            ctx: &RequestContext,
-        ) -> Result<MaybeVirtualFile, anyhow::Error>;
-    }
-
-    async fn test_files<A>(testname: &str) -> anyhow::Result<()>
-    where
-        A: Adapter,
-    {
         let ctx =
             RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
-        let testdir = crate::config::PageServerConf::test_repo_dir(testname);
+        let testdir = crate::config::PageServerConf::test_repo_dir("test_virtual_files");
         std::fs::create_dir_all(&testdir)?;
 
+        let zeropad512 = |content: &[u8]| {
+            let mut buf = IoBufferMut::with_capacity_zeroed(512);
+            buf[..content.len()].copy_from_slice(content);
+            buf.freeze().slice_len()
+        };
+
         let path_a = testdir.join("file_a");
-        let mut file_a = A::open(
+        let file_a = VirtualFile::open_with_options_v2(
             path_a.clone(),
             OpenOptions::new()
+                .read(true)
                 .write(true)
+                // set create & truncate flags to ensure when we trigger a reopen later in this test,
+                // the reopen_options must have masked out those flags; if they don't, then
+                // after the reopen we will fail to read the `content_a` that we write here.
                 .create(true)
-                .truncate(true)
-                .to_owned(),
+                .truncate(true),
             &ctx,
         )
         .await?;
+        let (_, res) = file_a.write_all_at(zeropad512(b"content_a"), 0, &ctx).await;
+        res?;
 
-        file_a
-            .write_all_at(IoBuffer::from(b"foobar").slice_len(), 0, &ctx)
-            .await?;
-
-        // cannot read from a file opened in write-only mode
-        let _ = file_a.read_string_at(0, 1, &ctx).await.unwrap_err();
-
-        // Close the file and re-open for reading
-        let mut file_a = A::open(path_a, OpenOptions::new().read(true), &ctx).await?;
-
-        // cannot write to a file opened in read-only mode
-        let _ = file_a
-            .write_all_at(IoBuffer::from(b"bar").slice_len(), 0, &ctx)
-            .await
-            .unwrap_err();
-
-        // Try simple read
-        assert_eq!("foobar", file_a.read_string_at(0, 6, &ctx).await?);
-
-        // Create another test file, and try FileExt functions on it.
         let path_b = testdir.join("file_b");
-        let mut file_b = A::open(
+        let file_b = VirtualFile::open_with_options_v2(
             path_b.clone(),
             OpenOptions::new()
                 .read(true)
@@ -1391,37 +1323,44 @@ mod tests {
             &ctx,
         )
         .await?;
-        file_b
-            .write_all_at(IoBuffer::from(b"BAR").slice_len(), 3, &ctx)
-            .await?;
-        file_b
-            .write_all_at(IoBuffer::from(b"FOO").slice_len(), 0, &ctx)
-            .await?;
+        let (_, res) = file_b.write_all_at(zeropad512(b"content_b"), 0, &ctx).await;
+        res?;
 
-        assert_eq!(file_b.read_string_at(2, 3, &ctx).await?, "OBA");
+        let assert_first_512_eq = async |vfile: &VirtualFile, expect: &[u8]| {
+            let buf = vfile
+                .read_exact_at(IoBufferMut::with_capacity_zeroed(512).slice_full(), 0, &ctx)
+                .await
+                .unwrap();
+            assert_eq!(&buf[..], &zeropad512(expect)[..]);
+        };
 
-        // Open a lot of files, enough to cause some evictions. (Or to be precise,
-        // open the same file many times. The effect is the same.)
+        // Open a lot of file descriptors / VirtualFile instances.
+        // Enough to cause some evictions in the fd cache.
 
-        let mut vfiles = Vec::new();
+        let mut file_b_dupes = Vec::new();
         for _ in 0..100 {
-            let mut vfile = A::open(path_b.clone(), OpenOptions::new().read(true), &ctx).await?;
-            assert_eq!("FOOBAR", vfile.read_string_at(0, 6, &ctx).await?);
-            vfiles.push(vfile);
+            let vfile = VirtualFile::open_with_options_v2(
+                path_b.clone(),
+                OpenOptions::new().read(true),
+                &ctx,
+            )
+            .await?;
+            assert_first_512_eq(&vfile, b"content_b").await;
+            file_b_dupes.push(vfile);
         }
 
         // make sure we opened enough files to definitely cause evictions.
-        assert!(vfiles.len() > TEST_MAX_FILE_DESCRIPTORS * 2);
+        assert!(file_b_dupes.len() > TEST_MAX_FILE_DESCRIPTORS * 2);
 
         // The underlying file descriptor for 'file_a' should be closed now. Try to read
-        // from it again.
-        assert_eq!("foobar", file_a.read_string_at(0, 6, &ctx).await?);
+        // from it again. The VirtualFile reopens the file internally.
+        assert_first_512_eq(&file_a, b"content_a").await;
 
         // Check that all the other FDs still work too. Use them in random order for
         // good measure.
-        vfiles.as_mut_slice().shuffle(&mut thread_rng());
-        for vfile in vfiles.iter_mut() {
-            assert_eq!("OOBAR", vfile.read_string_at(1, 5, &ctx).await?);
+        file_b_dupes.as_mut_slice().shuffle(&mut thread_rng());
+        for vfile in file_b_dupes.iter_mut() {
+            assert_first_512_eq(vfile, b"content_b").await;
         }
 
         Ok(())
@@ -1452,7 +1391,7 @@ mod tests {
         // Open the file many times.
         let mut files = Vec::new();
         for _ in 0..VIRTUAL_FILES {
-            let f = VirtualFileInner::open_with_options(
+            let f = VirtualFile::open_with_options_v2(
                 &test_file_path,
                 OpenOptions::new().read(true),
                 &ctx,
@@ -1497,8 +1436,6 @@ mod tests {
 
     #[tokio::test]
     async fn test_atomic_overwrite_basic() {
-        let ctx =
-            RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
         let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_basic");
         std::fs::create_dir_all(&testdir).unwrap();
 
@@ -1508,26 +1445,22 @@ mod tests {
         VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
             .await
             .unwrap();
-        let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap());
-        let post = file.read_string_at(0, 3, &ctx).await.unwrap();
+
+        let post = std::fs::read_to_string(&path).unwrap();
         assert_eq!(post, "foo");
         assert!(!tmp_path.exists());
-        drop(file);
 
         VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec())
             .await
             .unwrap();
-        let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap());
-        let post = file.read_string_at(0, 3, &ctx).await.unwrap();
+
+        let post = std::fs::read_to_string(&path).unwrap();
         assert_eq!(post, "bar");
         assert!(!tmp_path.exists());
-        drop(file);
     }
 
     #[tokio::test]
     async fn test_atomic_overwrite_preexisting_tmp() {
-        let ctx =
-            RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
         let testdir =
             crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_preexisting_tmp");
         std::fs::create_dir_all(&testdir).unwrap();
@@ -1542,10 +1475,8 @@ mod tests {
             .await
             .unwrap();
 
-        let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap());
-        let post = file.read_string_at(0, 3, &ctx).await.unwrap();
+        let post = std::fs::read_to_string(&path).unwrap();
         assert_eq!(post, "foo");
         assert!(!tmp_path.exists());
-        drop(file);
     }
 }
diff --git a/pageserver/src/virtual_file/open_options.rs b/pageserver/src/virtual_file/open_options.rs
index a40dfed4a4..7d478f3600 100644
--- a/pageserver/src/virtual_file/open_options.rs
+++ b/pageserver/src/virtual_file/open_options.rs
@@ -8,7 +8,13 @@ use super::io_engine::IoEngine;
 
 #[derive(Debug, Clone)]
 pub struct OpenOptions {
+    /// We keep a copy of the write() flag we pass to the `inner` `OpenOptions`
+    /// to support [`Self::is_write`].
     write: bool,
+    /// We don't expose + pass through a raw `custom_flags()` style API.
+    /// The only custom flag we support is `O_DIRECT`, which we track here
+    /// and map to `custom_flags()` in the [`Self::open`] method.
+    direct: bool,
     inner: Inner,
 }
 #[derive(Debug, Clone)]
@@ -30,6 +36,7 @@ impl Default for OpenOptions {
         };
         Self {
             write: false,
+            direct: false,
             inner,
         }
     }
@@ -44,6 +51,10 @@ impl OpenOptions {
         self.write
     }
 
+    pub(super) fn is_direct(&self) -> bool {
+        self.direct
+    }
+
     pub fn read(mut self, read: bool) -> Self {
         match &mut self.inner {
             Inner::StdFs(x) => {
@@ -116,13 +127,38 @@ impl OpenOptions {
     }
 
     pub(in crate::virtual_file) async fn open(&self, path: &Path) -> std::io::Result<OwnedFd> {
-        match &self.inner {
-            Inner::StdFs(x) => x.open(path).map(|file| file.into()),
+        #[cfg_attr(not(target_os = "linux"), allow(unused_mut))]
+        let mut custom_flags = 0;
+        if self.direct {
             #[cfg(target_os = "linux")]
-            Inner::TokioEpollUring(x) => {
+            {
+                custom_flags |= nix::libc::O_DIRECT;
+            }
+            #[cfg(not(target_os = "linux"))]
+            {
+                // Other platforms may be used for development but don't necessarily have a 1:1 equivalent to Linux's O_DIRECT (macOS!).
+                // Just don't set the flag; to catch alignment bugs typical for O_DIRECT,
+                // we have a runtime validation layer inside `VirtualFile::write_at` and `VirtualFile::read_at`.
+                static WARNING: std::sync::Once = std::sync::Once::new();
+                WARNING.call_once(|| {
+                    let span = tracing::info_span!(parent: None, "open_options");
+                    let _enter = span.enter();
+                    tracing::warn!("your platform is not a supported production platform, ignoing request for O_DIRECT; this could hide alignment bugs; this warning is logged once per process");
+                });
+            }
+        }
+
+        match self.inner.clone() {
+            Inner::StdFs(mut x) => x
+                .custom_flags(custom_flags)
+                .open(path)
+                .map(|file| file.into()),
+            #[cfg(target_os = "linux")]
+            Inner::TokioEpollUring(mut x) => {
+                x.custom_flags(custom_flags);
                 let system = super::io_engine::tokio_epoll_uring_ext::thread_local_system().await;
                 let (_, res) = super::io_engine::retry_ecanceled_once((), |()| async {
-                    let res = system.open(path, x).await;
+                    let res = system.open(path, &x).await;
                     ((), res)
                 })
                 .await;
@@ -144,19 +180,8 @@ impl OpenOptions {
         self
     }
 
-    pub fn custom_flags(mut self, flags: i32) -> Self {
-        if flags & nix::libc::O_APPEND != 0 {
-            super::io_engine::panic_operation_must_be_idempotent();
-        }
-        match &mut self.inner {
-            Inner::StdFs(x) => {
-                let _ = x.custom_flags(flags);
-            }
-            #[cfg(target_os = "linux")]
-            Inner::TokioEpollUring(x) => {
-                let _ = x.custom_flags(flags);
-            }
-        }
+    pub fn direct(mut self, direct: bool) -> Self {
+        self.direct = direct;
         self
     }
 }
diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py
index 43bffd919c..9b564f0a60 100755
--- a/test_runner/fixtures/pageserver/allowed_errors.py
+++ b/test_runner/fixtures/pageserver/allowed_errors.py
@@ -111,6 +111,13 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
     ".*stalling layer flushes for compaction backpressure.*",
     ".*layer roll waiting for flush due to compaction backpressure.*",
     ".*BatchSpanProcessor.*",
+    *(
+        [
+            r".*your platform is not a supported production platform, ignoing request for O_DIRECT; this could hide alignment bugs.*"
+        ]
+        if sys.platform != "linux"
+        else []
+    ),
 )
 
 

From 64353b48dbd5a73fc2cf9c9eb1bd3c9b442715cc Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Sat, 10 May 2025 17:06:06 +0200
Subject: [PATCH 25/65] direct+concurrent IO: retroactive RFC (#11788)

refs
- direct IO epic: https://github.com/neondatabase/neon/issues/8130
- concurrent IO epic https://github.com/neondatabase/neon/issues/9378
- obsoletes direct IO proposal RFC:
https://github.com/neondatabase/neon/pull/8240
- discussion in
https://neondb.slack.com/archives/C07BZ38E6SD/p1746028030574349
---
 docs/rfcs/030-vectored-timeline-get.md        |   2 +
 .../2025-04-30-direct-io-for-pageserver.md    | 362 ++++++++++++++++++
 ...0-pageserver-concurrent-io-on-read-path.md | 251 ++++++++++++
 3 files changed, 615 insertions(+)
 create mode 100644 docs/rfcs/2025-04-30-direct-io-for-pageserver.md
 create mode 100644 docs/rfcs/2025-04-30-pageserver-concurrent-io-on-read-path.md

diff --git a/docs/rfcs/030-vectored-timeline-get.md b/docs/rfcs/030-vectored-timeline-get.md
index 093a964f38..e933eac5fe 100644
--- a/docs/rfcs/030-vectored-timeline-get.md
+++ b/docs/rfcs/030-vectored-timeline-get.md
@@ -7,6 +7,8 @@ Author: Christian Schwarz
 
 A brief RFC / GitHub Epic describing a vectored version of the `Timeline::get` method that is at the heart of Pageserver.
 
+**EDIT**: the implementation of this feature is described in [Vlad's (internal) tech talk](https://drive.google.com/file/d/1vfY24S869UP8lEUUDHRWKF1AJn8fpWoJ/view?usp=drive_link).
+
 # Motivation
 
 During basebackup, we issue many `Timeline::get` calls for SLRU pages that are *adjacent* in key space.
diff --git a/docs/rfcs/2025-04-30-direct-io-for-pageserver.md b/docs/rfcs/2025-04-30-direct-io-for-pageserver.md
new file mode 100644
index 0000000000..847f5e4040
--- /dev/null
+++ b/docs/rfcs/2025-04-30-direct-io-for-pageserver.md
@@ -0,0 +1,362 @@
+# Direct IO For Pageserver
+
+Date: Apr 30, 2025
+
+## Summary
+
+This document is a retroactive RFC. It
+- provides some background on what direct IO is,
+- motivates why Pageserver should be using it for its IO, and
+- describes how we changed Pageserver to use it.
+
+The [initial proposal](https://github.com/neondatabase/neon/pull/8240) that kicked off the work can be found in this closed GitHub PR.
+
+People primarily involved in this project were:
+- Yuchen Liang <yuchen@neon.tech>
+- Vlad Lazar <vlad@neon.tech>
+- Christian Schwarz <christian@neon.tech>
+
+## Timeline
+
+For posterity, here is the rough timeline of the development work that got us to where we are today.
+
+- Jan 2024: [integrate `tokio-epoll-uring`](https://github.com/neondatabase/neon/pull/5824) along with owned buffers API
+- March 2024: `tokio-epoll-uring` enabled in all regions in buffered IO mode
+- Feb 2024 to June 2024: PS PageCache Bypass For Data Blocks
+  - Feb 2024: [Vectored Get Implementation](https://github.com/neondatabase/neon/pull/6576) bypasses delta & image layer blocks for page requests
+  - Apr to June 2024: [Epic: bypass PageCache for user data blocks](https://github.com/neondatabase/neon/issues/7386) addresses remaining users
+- Aug to Nov 2024: direct IO: first code; preliminaries; read path coding; BufferedWriter; benchmarks show perf regressions too high, no-go.
+- Nov 2024 to Jan 2025: address perf regressions by developing page_service pipelining (aka batching) and concurrent IO ([Epic](https://github.com/neondatabase/neon/issues/9376))
+- Feb to March 2025: rollout batching, then concurrent+direct IO => read path and InMemoryLayer is now direct IO
+- Apr 2025: develop & roll out direct IO for the write path
+
+## Background: Terminology & Glossary
+
+**kernel page cache**: the Linux kernel's page cache is a write-back cache for filesystem contents.
+The cached unit is memory-page-sized & aligned chunks of the files that are being cached (typically 4k).
+The cache lives in kernel memory and is not directly accessible through userspace.
+
+**Buffered IO**: an application's read/write system calls go through the kernel page cache.
+For example, a 10 byte sized read or write to offset 5000 in a file will load the file contents
+at offset `[4096,8192)` into a free page in the kernel page cache. If necessary, it will evict
+a page to make room (cf eviction). Then, the kernel performs a memory-to-memory copy of 10 bytes
+from/to the offset `904` (`5000 = 4096 + 904`) within the cached page. If it's a write, the kernel keeps
+track of the fact that the page is now "dirty" in some ancillary structure.
+
+**Writeback**: a buffered read/write syscall returns after the memory-to-memory copy. The modifications
+made by e.g. write system calls are not even *issued* to disk, let alone durable. Instead, the kernel
+asynchronously writes back dirtied pages based on a variety of conditions. For us, the most relevant
+ones are a) explicit request by userspace (`fsync`) and b) memory pressure.
+
+**Memory pressure**: the kernel page cache is a best effort service and a user of spare memory capacity.
+If there is no free memory, the kernel page allocator will take pages used by page cache to satisfy allocations.
+Before reusing a page like that, the page has to be written back (writeback, see above).
+The far-reaching consequence of this is that **any allocation of anonymous memory can do IO** if the only
+way to get that memory is by eviction & re-using a dirty page cache page.
+Notably, this includes a simple `malloc` in userspace, because eventually that boils down to `mmap(..., MAP_ANON, ...)`.
+I refer to this effect as the "malloc latency backscatter" caused by buffered IO.
+
+**Direct IO** allows application's read/write system calls to bypass the kernel page cache. The filesystem
+is still involved because it is ultimately in charge of mapping the concept of files & offsets within them
+to sectors on block devices. Typically, the filesystem poses size and alignment requirements for memory buffers
+and file offsets (statx `Dio_mem_align` / `Dio_offset_align`), see [this gist](https://gist.github.com/problame/1c35cac41b7cd617779f8aae50f97155).
+The IO operations will fail at runtime with EINVAL if the alignment requirements are not met.
+
+**"buffered" vs "direct"**: the central distinction between buffered and direct IO is about who allocates and
+fills the IO buffers, and who controls when exactly the IOs are issued. In buffered IO, it's the syscall handlers,
+kernel page cache, and memory management subsystems (cf "writeback"). In direct IO, all of it is done by
+the application.
+It takes more effort by the application to program with direct instead of buffered IO.
+The return is precise control over and a clear distinction between consumption/modification of memory vs disk.
+
+**Pageserver PageCache**: Pageserver has an additional `PageCache` (referred to as PS PageCache from here on, as opposed to "kernel page cache").
+Its caching unit is 8KiB blocks of the layer files written by Pageserver.
+A miss in PageCache is filled by reading from the filesystem, through the `VirtualFile` abstraction layer.
+The default size is tiny (64MiB), very much like Postgres's `shared_buffers`.
+We ran production at 128MiB for a long time but gradually moved it up to 2GiB over the past ~year.
+
+**VirtualFile** is Pageserver's abstraction for file IO, very similar to the facility in Postgres that bears the same name.
+Its historical purpose appears to be working around open file descriptor limitations, which is practically irrelevant on Linux.
+However, the facility in Pageserver is useful as an intermediary layer for metrics and abstracts over the different kinds of
+IO engines that Pageserver supports (`std-fs` vs `tokio-epoll-uring`).
+
+## Background: History Of Caching In Pageserver
+
+For multiple years, Pageserver's `PageCache` was on the path of all read _and write_ IO.
+It performed write-back to the kernel using buffered IO.
+
+We converted it into a read-only cache of immutable data in [PR 4994](https://github.com/neondatabase/neon/pull/4994).
+
+The introduction of `tokio-epoll-uring` required converting the code base to use owned IO buffers.
+The `PageCache` pages are usable as owned IO buffers.
+
+We then started bypassing PageCache for user data blocks.
+Data blocks are the 8k blocks of data in layer files that hold the multiple `Value`s, as opposed to the disk btree index blocks that tell us which values exist in a file at what offsets.
+The disk btree embedded in delta & image layers remains `PageCache`'d.
+Epics for that work were:
+- Vectored `Timeline::get` (cf RFC 30) skipped delta and image layer data block `PageCache`ing outright.
+- Epic https://github.com/neondatabase/neon/issues/7386 took care of the remaining users for data blocks:
+  - Materialized page cache (cached materialized pages; shown to be ~0% hit rate in practice)
+  - InMemoryLayer
+  - Compaction
+
+The outcome of the above:
+1. All data blocks are always read through the `VirtualFile` APIs, hitting the kernel buffered read path (=> kernel page cache).
+2. Indirect blocks (=disk btree blocks) would be cached in the PS `PageCache`.
+
+In production we size the PS `PageCache` to be 2GiB.
+This drives the hit rate up to ~99.95% and the eviction rate / replacement rates down to less than 200/second on a 1-minute average, on the busiest machines.
+High baseline replacement rates are treated as a signal of resource exhaustion (page cache insufficient to host working set of the PS).
+The response to this is to migrate tenants away, or increase PS `PageCache` size.
+It is currently manual but could be automated, e.g., in Storage Controller.
+
+In the future, we may eliminate the `PageCache` even for indirect blocks.
+For example with an LRU cache that has as unit the entire disk btree content
+instead of individual blocks.
+
+## High-Level Design
+
+So, before work on this project started, all data block reads and the entire write path of Pageserver were using kernel-buffered IO, i.e., the kernel page cache.
+We now want to get the kernel page cache out of the picture by using direct IO for all interaction with the filesystem.
+This achieves the following system properties:
+
+**Predictable VirtualFile latencies**
+* With buffered IO, reads are sometimes fast, sometimes slow, depending on kernel page cache hit/miss.
+* With buffered IO, appends when writing out new layer files during ingest or compaction are sometimes fast, sometimes slow because of write-back backpressure.
+* With buffered IO, the "malloc backscatter" phenomenon pointed out in the Glossary section is not something we actively observe.
+  But we do have occasional spikes in Dirty memory amount and Memory PSI graphs, so it may already be affecting us to some degree.
+* By switching to direct IO, above operations will have the (predictable) device latency -- always.
+  Reads and appends always go to disk.
+  And malloc will not have to write back dirty data.
+
+**Explicitness & Tangibility of resource usage**
+* In a multi-tenant system, it is generally desirable and valuable to be *explicit* about the main resources we use for each tenant.
+* By using direct IO, we become explicit about the resources *disk IOPs*  and *memory capacity* in a way that was previously being conflated through the kernel page cache, outside our immediate control.
+* We will be able to build per-tenant observability of resource usage ("what tenant is causing the actual IOs that are sent to the disk?").
+* We will be able to build accounting & QoS by implementing an IO scheduler that is tenant aware. The kernel is not tenant-aware and can't do that.
+
+**CPU Efficiency**
+* The involvement of the kernel page cache means one additional memory-to-memory copy on read and write path.
+* Direct IO will eliminate that memory-to-memory copy, if we can make the userspace buffers used for the IO calls satisfy direct IO alignment requirements.
+
+The **trade-off** is that we no longer get the theoretical benefits of the kernel page cache. These are:
+- read latency improvements for repeat reads of the same data ("locality of reference")
+  - asterisk: only if that state is still cache-resident by time of next access
+- write throughput by having kernel page cache batch small VFS writes into bigger disk writes
+  - asterisk: only if memory pressure is low enough that the kernel can afford to delay writeback
+
+We are **happy to make this trade-off**:
+- Because of the advantages listed above.
+- Because we empirically have enough DRAM on Pageservers to serve metadata (=index blocks) from PS PageCache.
+  (At just 2GiB PS PageCache size, we average a 99.95% hit rate).
+  So, the latency of going to disk is only for data block reads, not the index traversal.
+- Because **the kernel page cache is ineffective** at high tenant density anyway (#tenants/pageserver instance).
+  And because dense packing of tenants will always be desirable to drive COGS down, we should design the system for it.
+  (See the appendix for a more detailed explanation why this is).
+- So, we accept that some reads that used to be fast by circumstance will have higher but **predictable** latency than before.
+
+### Desired End State
+
+The desired end state of the project is as follows, and with some asterisks, we have achieved it.
+
+All IOs of the Pageserver data path use direct IO, thereby bypassing the kernel page cache.
+
+In particular, the "data path" includes
+- the wal ingest path
+- compaction
+- anything on the `Timeline::get` / `Timeline::get_vectored` path.
+
+The production Pageserver config is tuned such that virtually all non-data blocks are cached in the PS PageCache.
+Hit rate target is 99.95%.
+
+There are no regressions to ingest latency.
+
+The total "wait-for-disk time" contribution to random getpage request latency is `O(1 read IOP latency)`.
+We accomplish that by having a near 100% PS PageCache hit rate so that layer index traversal effectively never needs to wait for IO.
+Thereby, it can issue all the data blocks as it traverses the index, and only wait at the end of it (concurrent IO).
+
+The amortized "wait-for-disk time" contribution of this direct IO proposal to a series of sequential getpage requests is `1/32 * read IOP latency` for each getpage request.
+We accomplish this by server-side batching of up to 32 reads into a single `Timeline::get_vectored` call.
+(This is an ideal world where our batches are full - that's not the case in prod today because of lack of queue depth).
+
+## Design & Implementation
+
+### Prerequisites
+
+A lot of prerequisite work had to happen to enable use of direct IO.
+
+To meet the "wait-for-disk time" requirements from the DoD, we implement for the read path:
+- page_service level server-side batching (config field `page_service_pipelining`)
+- concurrent IO (config field `get_vectored_concurrent_io`)
+The work for both of these was tracked [in the epic](https://github.com/neondatabase/neon/issues/9376).
+Server-side batching will likely be obsoleted by the [#proj-compute-communicator](https://github.com/neondatabase/neon/pull/10799).
+The Concurrent IO work is described in retroactive RFC `2025-04-30-pageserver-concurrent-io-on-read-path.md`.
+The implementation is relatively brittle and needs further investment, see the `Future Work` section in that RFC.
+
+For the write path, and especially WAL ingest, we need to hide write latency.
+We accomplish this by implementing a (`BufferedWriter`) type that does double-buffering: flushes of the filled
+buffer happen in a sidecar tokio task while new writes fill a new buffer.
+We refactor InMemoryLayer as well as BlobWriter (=> delta and image layer writers) to use this new `BufferedWriter`.
+The most comprehensive write-up of this work is in [the PR description](https://github.com/neondatabase/neon/pull/11558).
+
+### Ensuring Adherence to Alignment Requirements
+
+Direct IO puts requirements on
+- memory buffer alignment
+- io size (=memory buffer size)
+- file offset alignment
+
+The requirements are specific to a combination of filesystem/block-device/architecture(hardware page size!).
+
+In Neon production environments we currently use ext4 with Linux 6.1.X on AWS and Azure storage-optimized instances (locally attached NVMe).
+Instead of dynamic discovery using `statx`, we statically hard-code 512 bytes as the buffer/offset alignment and size-multiple.
+We made this decision because:
+- a) it is compatible with all the environments we need to run in
+- b) our primary workload can be small-random-read-heavy (we do merge adjacent reads if possible, but the worst case is that all `Value`s that need to be read are far apart)
+- c) 512-byte tail latency on the production instance types is much better than 4k (p99.9: 3x lower, p99.99 5x lower).
+- d) hard-coding at compile-time allows us to use the Rust type system to enforce the use of only aligned IO buffers, eliminating a source of runtime errors typically associated with direct IO.
+
+This was [discussed here](https://neondb.slack.com/archives/C07BZ38E6SD/p1725036790965549?thread_ts=1725026845.455259&cid=C07BZ38E6SD).
+
+The new `IoBufAligned` / `IoBufAlignedMut` marker traits indicate that a given buffer meets memory alignment requirements.
+All `VirtualFile` APIs and several software layers built on top of them only accept buffers that implement those traits.
+Implementors of the marker traits are:
+- `IoBuffer` / `IoBufferMut`: used for most reads and writes
+- `PageWriteGuardBuf`: for filling PS PageCache pages (index blocks!)
+
+The alignment requirement is infectious; it permeates bottom-up throughout the code base.
+We stop the infection at roughly the same layers in the code base where we stopped permeating the
+use of owned-buffers-style API for tokio-epoll-uring. The way the stopping works is by introducing
+a memory-to-memory copy from/to some unaligned memory location on the stack/current/heap.
+The places where we currently stop permeating are sort of arbitrary. For example, it would probably
+make sense to replace more usage of `Bytes` that we know holds 8k pages with 8k-sized `IoBuffer`s.
+
+The `IoBufAligned` / `IoBufAlignedMut` types do not protect us from the following types of runtime errors:
+- non-adherence to file offset alignment requirements
+- non-adherence to io size requirements
+
+The following higher-level constructs ensure we meet the requirements:
+- read path: the `ChunkedVectoredReadBuilder` and `mod vectored_dio_read` ensure reads happen at aligned offsets and in appropriate size multiples.
+- write path: `BufferedWriter` only writes in multiples of the capacity, at offsets that are `start_offset+N*capacity`; see its doc comment.
+
+Note that these types are used always, regardless of whether direct IO is enabled or not.
+There are some cases where this adds unnecessary overhead to buffered IO (e.g. all memcpy's inflated to multiples of 512).
+But we could not identify meaningful impact in practice when we shipped these changes while we were still using buffered IO.
+
+### Configuration / Feature Flagging
+
+In the previous section we described how all users of VirtualFile were changed to always adhere to direct IO alignment and size-multiple requirements.
+To actually enable direct IO, all we need to do is set the `O_DIRECT` flag in `open` syscalls / io_uring operations.
+
+We set `O_DIRECT` based on:
+- the VirtualFile API used to create/open the VirtualFile instance
+- the `virtual_file_io_mode` configuration flag
+- the OpenOptions `read` and/or `write` flags.
+
+The VirtualFile APIs suffixed with `_v2` are the only ones that _may_ open with `O_DIRECT` depending on the other two factors in above list.
+Other APIs never use `O_DIRECT`.
+(The name is bad and should really be `_maybe_direct_io`.)
+
+The reason for having new APIs is because all code used VirtualFile but implementation and rollout happened in consecutive phases (read path, InMemoryLayer, write path).
+At the VirtualFile level, context on whether an instance of VirtualFile is on read path, InMemoryLayer, or write path is not available.
+
+The `_v2` APIs then make the decision to set `O_DIRECT` based on the `virtual_file_io_mode` flag and the OpenOptions `read`/`write` flags.
+The result is the following runtime behavior:
+
+|what|OpenOptions|`v_f_io_mode`<br/>=`buffered`|`v_f_io_mode`<br/>=`direct`|`v_f_io_mode`<br/>=`direct-rw`|
+|-|-|-|-|-|
+|`DeltaLayerInner`|read|()|O_DIRECT|O_DIRECT|
+|`ImageLayerInner`|read|()|O_DIRECT|O_DIRECT|
+|`InMemoryLayer`|read + write|()|()*|O_DIRECT|
+|`DeltaLayerWriter`| write | () | () |  O_DIRECT |
+|`ImageLayerWriter`| write | () | () |  O_DIRECT |
+|`download_layer_file`|write |()|()|O_DIRECT|
+
+The `InMemoryLayer` is marked with `*` because there was a period when it *did* use O_DIRECT under `=direct`.
+That period was when we implemented and shipped the first version of `BufferedWriter`.
+We used it in `InMemoryLayer` and `download_layer_file` but it was only sensitive to `v_f_io_mode` in `InMemoryLayer`.
+The introduction of `=direct-rw`, and the switch of the remaining write path to `BufferedWriter`, happened later,
+in https://github.com/neondatabase/neon/pull/11558.
+
+Note that this way of feature flagging inside VirtualFile makes it less and less a general purpose POSIX file access abstraction.
+For example, with `=direct-rw` enabled, it is no longer possible to open a `VirtualFile` without `O_DIRECT`. It'll always be set.
+
+## Correctness Validation
+
+The correctness risks with this project were:
+- Memory safety issues in the `IoBuffer` / `IoBufferMut` implementation.
+  These types expose an API that is largely identical to that of the `bytes` crate and/or Vec.
+- Runtime errors (=> downtime / unavailability) because of non-adherence to alignment/size-multiple requirements, resulting in EINVAL on the read path.
+
+We sadly do not have infrastructure to run pageserver under `cargo miri`.
+So for memory safety issues, we relied on careful peer review.
+
+We do assert the production-like alignment requirements in testing builds.
+However, these asserts were added retroactively.
+The actual validation before rollout happened in staging and pre-prod.
+We eventually enabled  `=direct`/`=direct-rw` for Rust unit tests and the regression test suite.
+I cannot recall a single instance of staging/pre-prod/production errors caused by non-adherence to alignment/size-multiple requirements.
+Evidently developer testing was good enough.
+
+## Performance Validation
+
+The read path went through a lot of iterations of benchmarking in staging and pre-prod.
+The benchmarks in those environments demonstrated performance regressions early in the implementation.
+It was actually this performance testing that made us implement batching and concurrent IO to avoid unacceptable regressions.
+
+The write path was much quicker to validate because `bench_ingest` covered all of the (less numerous) access patterns.
+
+## Future Work
+
+There is minor and major follow-up work that can be considered in the future.
+Check the (soon-to-be-closed) Epic https://github.com/neondatabase/neon/issues/8130's "Follow-Ups" section for a current list.
+
+Read Path:
+- PS PageCache hit rate is crucial to unlock concurrent IO and reasonable latency for random reads generally.
+  Instead of reactively sizing PS PageCache, we should estimate the required PS PageCache size
+  and potentially also use that to drive placement decisions of shards from StorageController
+  https://github.com/neondatabase/neon/issues/9288
+- ... unless we get rid of PS PageCache entirely and cache the index block in a more specialized cache.
+  But even then, an estimation of the working set would be helpful to figure out caching strategy.
+
+Write Path:
+- BlobWriter and its users could switch back to a borrowed API  https://github.com/neondatabase/neon/issues/10129
+- ... unless we want to implement bypass mode for large writes https://github.com/neondatabase/neon/issues/10101
+- The `TempVirtualFile` introduced as part of this project could internalize more of the common usage pattern: https://github.com/neondatabase/neon/issues/11692
+- Reduce conditional compilation around `virtual_file_io_mode`: https://github.com/neondatabase/neon/issues/11676
+
+Both:
+- A performance simulation mode that pads VirtualFile op latencies to typical NVMe latencies, even if the underlying storage is faster.
+  This would avoid misleadingly good performance on developer systems and in benchmarks on systems that are less busy than production hosts.
+  However, padding latencies at microsecond scale is non-trivial.
+
+Misc:
+- We should finish trimming VirtualFile's scope to be truly limited to core data path read & write.
+  Abstractions for reading & writing pageserver config, location config, heatmaps, etc, should use
+  APIs in a different package (`VirtualFile::crashsafe_overwrite` and `VirtualFile::read_to_string`
+  are good entrypoints for cleanup.) https://github.com/neondatabase/neon/issues/11809
+
+# Appendix
+
+## Why Kernel Page Cache Is Ineffective At High Tenant Density
+
+In the High-Level Design section, we stated:
+
+> - **The kernel page cache is ineffective** at high tenant density anyway (#tenants/pageserver instance).
+
+The reason is that the Pageserver workload sent from Computes is whatever misses the Compute's cache(s).
+That's either sequential scans or random reads.
+A random read workload simply causes cache thrashing because a packed Pageserver NVMe drive (`im4gn.2xlarge`) has ~100x more capacity than DRAM available.
+It is a complete waste to have the kernel page cache cache data blocks in this case.
+Sequential read workloads *can* benefit iff those pages have been updated recently (=no image layer yet) and together in time/LSN space.
+In such cases, the WAL records of those updates likely sit on the same delta layer block.
+When Compute does a sequential scan, it sends a series of single-page requests for these individual pages.
+When Pageserver processes the second request in such a series, it goes to the same delta layer block and has a kernel page cache hit.
+This dependence on kernel page cache for sequential scan performance is significant, but the solution is at a higher level than generic data block caching.
+We can either add a small per-connection LRU cache for such delta layer blocks.
+Or we can merge those sequential requests into a larger vectored get request, which is designed to never read a block twice.
+This amortizes the read latency for our delta layer block across the vectored get batch size (which currently is up to 32).
+
+There are Pageserver-internal workloads that do sequential access (compaction, image layer generation), but these
+1. are not latency-critical and can do batched access outside of the `page_service` protocol constraints (image layer generation)
+2. don't actually need to reconstruct images and therefore can use totally different access methods (=> compaction can use k-way merge iterators with their own internal buffering / prefetching).
diff --git a/docs/rfcs/2025-04-30-pageserver-concurrent-io-on-read-path.md b/docs/rfcs/2025-04-30-pageserver-concurrent-io-on-read-path.md
new file mode 100644
index 0000000000..2dc937d298
--- /dev/null
+++ b/docs/rfcs/2025-04-30-pageserver-concurrent-io-on-read-path.md
@@ -0,0 +1,251 @@
+# Concurrent IO for Pageserver Read Path
+
+Date: May 6, 2025
+
+## Summary
+
+This document is a retroactive RFC on the Pageserver Concurrent IO work that happened in late 2024 / early 2025.
+
+The gist of it is that Pageserver's `Timeline::get_vectored` now _issues_ the data block read operations against layer files
+_as it traverses the layer map_ and only _waits_ once, for all of them, after traversal is complete.
+
+Assuming a good PS PageCache hit rate on the index blocks during traversal, this drives the "wait-for-disk" time
+contribution down from `random_read_io_latency * O(number_of_values)` to `random_read_io_latency * O(1 + traversal)`.
+
+The motivation for why this work had to happen when it happened was the switch of Pageserver to
+- not cache user data blocks in PS PageCache and
+- switch to use direct IO.
+More context on this is given in the complementary RFC `./rfcs/2025-04-30-direct-io-for-pageserver.md`.
+
+### Refs
+
+- Epic: https://github.com/neondatabase/neon/issues/9378
+- Prototyping happened during the Lisbon 2024 Offsite hackathon: https://github.com/neondatabase/neon/pull/9002
+- Main implementation PR with good description: https://github.com/neondatabase/neon/issues/9378
+
+Design and implementation by:
+- Vlad Lazar <vlad@neon.tech>
+- Christian Schwarz <christian@neon.tech>
+
+## Background & Motivation
+
+The Pageserver read path (`Timeline::get_vectored`) consists of two high-level steps:
+- Retrieve the delta and image `Value`s required to reconstruct the requested Page@LSN (`Timeline::get_values_reconstruct_data`).
+- Pass these values to walredo to reconstruct the page images.
+
+The read path used to be single-key but has been made multi-key some time ago.
+([Internal tech talk by Vlad](https://drive.google.com/file/d/1vfY24S869UP8lEUUDHRWKF1AJn8fpWoJ/view?usp=drive_link))
+However, for simplicity, most of this doc will explain things in terms of a single key being requested.
+
+The `Value` retrieval step above can be broken down into the following functions:
+- **Traversal** of the layer map to figure out which `Value`s from which layer files are required for the page reconstruction.
+- **Read IO Planning**: planning of the read IOs that need to be issued to the layer files / filesystem / disk.
+  The main job here is to coalesce the small value reads into larger filesystem-level read operations.
+  This layer also takes care of direct IO alignment and size-multiple requirements (cf. the RFC on Direct IO for details).
+  Check `struct VectoredReadPlanner` and `mod vectored_dio_read` for how it's done.
+- **Perform the read IO** using `tokio-epoll-uring`.
+
+Before this project, the above functions were sequentially interleaved, meaning:
+1. we would advance traversal, ...
+2. discover that we need to read a value, ...
+3. read it from disk using `tokio-epoll-uring`, ...
+4. goto 1 unless we're done.
+
+This meant that if N `Value`s need to be read to reconstruct a page,
+the time we spent waiting for disk would be `random_read_io_latency * O(number_of_values)`.
+
+## Design
+
+The **traversal** and **read IO Planning** jobs still happen sequentially, layer by layer, as before.
+But instead of performing the read IOs inline, we submit the IOs to a concurrent tokio task for execution.
+After the last read from the last layer is submitted, we wait for the IOs to complete.
+
+Assuming the filesystem / disk is able to actually process the submitted IOs without queuing,
+we arrive at _time spent waiting for disk_ ~ `random_read_io_latency * O(1 + traversal)`.
+
+Note this whole RFC is concerned with the steady state where all layer files required for reconstruction are resident on local NVMe.
+Traversal will stall on on-demand layer download if a layer is not yet resident.
+It cannot proceed without the layer being resident because its next step depends on the contents of the layer index.
+
+### Avoiding Waiting For IO During Traversal
+
+The `traversal` component in above time-spent-waiting-for-disk estimation is dominant and needs to be minimized.
+
+Before this project, traversal needed to perform IOs for the following:
+1. Paging the visited layers' disk btree index blocks into the PS PageCache (and waiting for that to complete).
+2. When visiting a delta layer, reading the data block that contains a `Value` for a requested key,
+   to determine whether that `Value` initializes the page (`Value::will_init`) and traversal can therefore stop for this key.
+
+The solution for (1) is to raise the PS PageCache size such that the hit rate is practically 100%.
+(Check out the `Background: History Of Caching In Pageserver` section in the RFC on Direct IO for more details.)
+
+The solution for (2) is to source `will_init` from the disk btree index keys, which fortunately
+already encode this bit of information since the introduction of the current storage/layer format.
+
+### Concurrent IOs, Submission & Completion
+
+To separate IO submission from waiting for its completion,
+we introduce the notion of an `IoConcurrency` struct through which IOs are issued.
+
+An IO is an opaque future that
+- captures the `tx` side of a `oneshot` channel
+- performs the read IO by calling `VirtualFile::read_exact_at().await`
+- sends the result into the `tx`
+
+Issuing an IO means `Box`ing the future above and handing that `Box` over to the `IoConcurrency` struct.
+
+The traversal code that submits the IO stores the corresponding `oneshot::Receiver`
+in the `VectoredValueReconstructState`, in the place where we previously stored
+the sequentially read `img` and `records` fields.
+
+When we're done with traversal, we wait for all submitted IOs:
+for each key, there is a future that awaits all the `oneshot::Receiver`s
+for that key, and then calls into walredo to reconstruct the page image.
+Walredo is now invoked concurrently for each value instead of sequentially.
+Walredo itself remains unchanged.
+
+The spawned IO futures are driven to completion by a sidecar tokio task that
+is separate from the task that performs all the layer visiting and spawning of IOs.
+That task receives the IO futures via an unbounded mpsc channel and
+drives them to completion inside a `FuturesUnordered`.
+
+### Error handling, Panics, Cancellation-Safety
+
+There are two error classes during reconstruct data retrieval:
+* traversal errors: index lookup, move to next layer, and the like
+* value read IO errors
+
+A traversal error fails the entire `get_vectored` request, as before this PR.
+A value read error only fails reconstruction of that value.
+
+Panics and dropping of the `get_vectored` future before it completes
+leave the sidecar task running and do not cancel submitted IOs
+(see next section for details on sidecar task lifecycle).
+All of this is safe, but, today's preference in the team is to close out
+all resource usage explicitly if possible, rather than cancelling + forgetting
+about it on drop. So, there is a warning if we drop a
+`VectoredValueReconstructState`/`ValuesReconstructState` that still has uncompleted IOs.
+
+### Sidecar Task Lifecycle
+
+The sidecar tokio task is spawned as part of the `IoConcurrency::spawn_from_conf` call.
+The `IoConcurrency` object acts as a handle through which IO futures are submitted.
+
+The spawned tokio task holds the `Timeline::gate` open.
+It is _not_ sensitive to `Timeline::cancel`, but instead to the `IoConcurrency` object being dropped.
+
+Once the `IoConcurrency` struct is dropped, no new IO futures can come in
+but already submitted IO futures will be driven to completion regardless.
+We _could_ safely stop polling these futures because `tokio-epoll-uring` op futures are cancel-safe.
+But the underlying kernel and hardware resources are not magically freed up by that.
+So, again, in the interest of closing out all outstanding resource usage, we make timeline shutdown wait for sidecar tasks and their IOs to complete.
+Under normal conditions, this should be in the low hundreds of microseconds.
+
+It is advisable to make the `IoConcurrency` as long-lived as possible to minimize the amount of
+tokio task churn (=> lower pressure on tokio). Generally this means creating it "high up" in the call stack.
+The pain with this is that the `IoConcurrency` reference needs to be propagated "down" to
+the (short-lived) functions/scope where we issue the IOs.
+We would like to use `RequestContext` for this propagation in the future (issue [here](https://github.com/neondatabase/neon/issues/10460)).
+For now, we just add another argument to the relevant code paths.
+
+### Feature Gating
+
+The `IoConcurrency` is an `enum` with two variants: `Sequential` and `SidecarTask`.
+
+The behavior from before this project is available through `IoConcurrency::Sequential`,
+which awaits the IO futures in place, without "spawning" or "submitting" them anywhere.
+
+The `get_vectored_concurrent_io` pageserver config variable determines the runtime value,
+**except** for the places that use `IoConcurrency::sequential` to get an `IoConcurrency` object.
+
+### Alternatives Explored & Caveats Encountered
+
+A few words on the rationale behind having a sidecar *task* and what
+alternatives were considered but abandoned.
+
+#### Why We Need A Sidecar *Task* / Why Just `FuturesUnordered` Doesn't Work
+
+We explored not having a sidecar task, and instead having a `FuturesUnordered` per
+`Timeline::get_vectored`. We would queue all IO futures in it and poll it for the
+first time after traversal is complete (i.e., at `collect_pending_ios`).
+
+The obvious disadvantage, but not showstopper, is that we wouldn't be submitting
+IOs until traversal is complete.
+
+The showstopper however, is that deadlocks happen if we don't drive the
+IO futures to completion independently of the traversal task.
+The reason is that both the IO futures and the traversal task may hold _some_,
+_and_ try to acquire _more_, shared limited resources.
+For example, both the traversal task and IO future may try to acquire
+* a `VirtualFile` file descriptor cache slot async mutex (observed during impl)
+* a `tokio-epoll-uring` submission slot (observed during impl)
+* a `PageCache` slot (currently this is not the case but we may move more code into the IO futures in the future)
+
+#### Why We Don't Do `tokio::task`-per-IO-future
+
+Another option is to spawn a short-lived `tokio::task` for each IO future.
+We implemented and benchmarked it during development, but found little
+throughput improvement and moderate mean & tail latency degradation.
+Concerns about pressure on the tokio scheduler led us to abandon this variant.
+
+## Future Work
+
+In addition to what is listed here, also check the "Punted" list in the epic:
+https://github.com/neondatabase/neon/issues/9378
+
+### Enable `Timeline::get`
+
+The only major code path that still uses `IoConcurrency::sequential` is `Timeline::get`.
+The impact is that roughly the following parts of pageserver do not benefit yet:
+- parts of basebackup
+- reads performed by the ingest path
+- most internal operations that read metadata keys (e.g. `collect_keyspace`!)
+
+The solution is to propagate `IoConcurrency` via `RequestContext`: https://github.com/neondatabase/neon/issues/10460
+
+The tricky part is to figure out at which level of the code the `IoConcurrency` is spawned (and added to the RequestContext).
+
+Also, propagation via `RequestContext` makes it harder to tell during development whether a given
+piece of code uses concurrent vs sequential mode: one has to recursively walk up the call tree to find the
+place that puts the `IoConcurrency` into the `RequestContext`.
+We'd have to use `::Sequential` as the conservative default value in a fresh `RequestContext`, and add some
+observability to weed out places that fail to enrich it with an `IoConcurrency` properly spawned via `IoConcurrency::spawn_from_conf`.
+
+### Concurrent On-Demand Downloads enabled by Detached Indices
+
+As stated earlier, traversal stalls on on-demand download because its next step depends on the contents of the layer index.
+Once we have separated indices from data blocks (=> https://github.com/neondatabase/neon/issues/11695)
+we will only need to stall if the index is not resident. The download of the data blocks can happen concurrently or in the background. For example:
+- Move the `Layer::get_or_maybe_download().await` inside the IO futures.
+  This goes in the opposite direction of the next "future work" item below, but it's easy to do.
+- Serve the IO future directly from object storage and dispatch the layer download
+  to some other actor, e.g., an actor that is responsible for both downloads & eviction.
+
+### New `tokio-epoll-uring` API That Separates Submission & Wait-For-Completion
+
+Instead of the `$op().await` style API, it would be useful to have a different `tokio-epoll-uring` API
+that separates enqueuing (without necessarily `io_uring_enter`ing the kernel each time), submission,
+and waiting for completion.
+
+The `$op().await` API is too opaque, so we _have_ to stuff it into a `FuturesUnordered`.
+
+A split API as sketched above would allow traversal to ensure an IO operation is enqueued to the kernel/disk (and get back-pressure iff the io_uring squeue is full),
+while avoiding spending CPU cycles on processing completions while we're still traversing.
+
+The idea gets muddied by the fact that we may self-deadlock if we submit too much without completing.
+So, the submission part of the split API needs to process completions if squeue is full.
+
+In any case, this split API is a precondition for addressing the bigger issue with the design presented here,
+which we discuss in the next section.
+
+### Opaque Futures Are Brittle
+
+The use of opaque futures to represent submitted IOs is a clever hack to minimize changes & allow for near-perfect feature-gating.
+However, we take on **brittleness** because callers must guarantee that the submitted futures are independent.
+In our experience, it is non-trivial to identify or rule out the interdependencies.
+See the lengthy doc comment on the `IoConcurrency::spawn_io` method for more details.
+
+The better interface and proper subsystem boundary is to submit a _descriptive_ struct of what needs to be done ("read this range from this VirtualFile into this buffer")
+and get back a means to wait for completion.
+The subsystem can thereby reason on its own about how operations may be related;
+unlike today, where the submitted opaque future can do just about anything.

From a537b2ffd05cb952a3198ca8b36e0dfdfd26e270 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Mon, 12 May 2025 09:25:54 +0200
Subject: [PATCH 26/65] pull_timeline: check tombstones by default (#11873)

Make `pull_timeline` check tombstones by default. Otherwise, we'd be
recreating timelines if the order between creation and deletion got
mixed up, as seen in #11838.

Fixes #11838.
---
 libs/safekeeper_api/src/models.rs                       | 1 +
 safekeeper/src/pull_timeline.rs                         | 6 +++++-
 storage_controller/src/service/safekeeper_reconciler.rs | 1 +
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs
index cc31b38fe7..8658dc4011 100644
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -299,6 +299,7 @@ pub struct PullTimelineRequest {
     pub tenant_id: TenantId,
     pub timeline_id: TimelineId,
     pub http_hosts: Vec<String>,
+    pub ignore_tombstone: Option<bool>,
 }
 
 #[derive(Debug, Serialize, Deserialize)]
diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs
index 66f2877cc5..c955e667bd 100644
--- a/safekeeper/src/pull_timeline.rs
+++ b/safekeeper/src/pull_timeline.rs
@@ -468,12 +468,15 @@ pub async fn handle_request(
     assert!(status.tenant_id == request.tenant_id);
     assert!(status.timeline_id == request.timeline_id);
 
+    let check_tombstone = !request.ignore_tombstone.unwrap_or_default();
+
     match pull_timeline(
         status,
         safekeeper_host,
         sk_auth_token,
         http_client,
         global_timelines,
+        check_tombstone,
     )
     .await
     {
@@ -499,6 +502,7 @@ async fn pull_timeline(
     sk_auth_token: Option<SecretString>,
     http_client: reqwest::Client,
     global_timelines: Arc<GlobalTimelines>,
+    check_tombstone: bool,
 ) -> Result<PullTimelineResponse> {
     let ttid = TenantTimelineId::new(status.tenant_id, status.timeline_id);
     info!(
@@ -570,7 +574,7 @@ async fn pull_timeline(
 
     // Finally, load the timeline.
     let _tli = global_timelines
-        .load_temp_timeline(ttid, &tli_dir_path, false)
+        .load_temp_timeline(ttid, &tli_dir_path, check_tombstone)
         .await?;
 
     Ok(PullTimelineResponse {
diff --git a/storage_controller/src/service/safekeeper_reconciler.rs b/storage_controller/src/service/safekeeper_reconciler.rs
index 17bb132982..f756d98c64 100644
--- a/storage_controller/src/service/safekeeper_reconciler.rs
+++ b/storage_controller/src/service/safekeeper_reconciler.rs
@@ -331,6 +331,7 @@ impl SafekeeperReconcilerInner {
                     http_hosts,
                     tenant_id: req.tenant_id,
                     timeline_id,
+                    ignore_tombstone: Some(false),
                 };
                 success = self
                     .reconcile_inner(

From 307e1e64c8f9edf641ae92e920821af4eb013b09 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Mon, 12 May 2025 17:17:35 +0800
Subject: [PATCH 27/65] fix(scrubber): more logs wrt relic timelines (#11895)

## Problem

Further investigation on
https://github.com/neondatabase/neon/issues/11159 reveals that the
list_tenant function can find all the shards of the tenant, but then the
shard gets missing during the gc timeline list blob. One reason could be
that in some ways the timeline gets recognized as a relic timeline.

## Summary of changes

Add logging to help identify the issue.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 storage_scrubber/src/checks.rs                 | 3 ++-
 storage_scrubber/src/pageserver_physical_gc.rs | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs
index b151b612bf..40f3523a7e 100644
--- a/storage_scrubber/src/checks.rs
+++ b/storage_scrubber/src/checks.rs
@@ -355,6 +355,7 @@ pub(crate) async fn list_timeline_blobs(
     match res {
         ListTimelineBlobsResult::Ready(data) => Ok(data),
         ListTimelineBlobsResult::MissingIndexPart(_) => {
+            tracing::warn!("listing raced with removal of an index, retrying");
             // Retry if listing raced with removal of an index
             let data = list_timeline_blobs_impl(remote_client, id, root_target)
                 .await?
@@ -441,7 +442,7 @@ async fn list_timeline_blobs_impl(
     }
 
     if index_part_keys.is_empty() && s3_layers.is_empty() {
-        tracing::debug!("Timeline is empty: expected post-deletion state.");
+        tracing::info!("Timeline is empty: expected post-deletion state.");
         if initdb_archive {
             tracing::info!("Timeline is post deletion but initdb archive is still present.");
         }
diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs
index e1a4095a3c..49ab192285 100644
--- a/storage_scrubber/src/pageserver_physical_gc.rs
+++ b/storage_scrubber/src/pageserver_physical_gc.rs
@@ -593,6 +593,7 @@ async fn gc_timeline(
             index_part_snapshot_time: _,
         } => (index_part, *index_part_generation, data.unused_index_keys),
         BlobDataParseResult::Relic => {
+            tracing::info!("Skipping timeline {ttid}, it is a relic");
             // Post-deletion tenant location: don't try and GC it.
             return Ok(summary);
         }

From a618056770cf83e3a6ff44ccea92d0e15cc1c67a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Ko=C5=82odziejczak?=
 <31549762+mrl5@users.noreply.github.com>
Date: Mon, 12 May 2025 13:24:33 +0200
Subject: [PATCH 28/65] chore(compute): skip audit logs for pg_session_jwt
 extension (#11883)

references
https://github.com/neondatabase/cloud/issues/28480#issuecomment-2866961124

related https://github.com/neondatabase/cloud/issues/28863

cc @MihaiBojin @conradludgate
---
 compute_tools/src/config.rs | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs
index 42d245f55a..933b30134f 100644
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -224,7 +224,10 @@ pub fn write_postgres_conf(
             writeln!(file, "pgaudit.log_rotation_age=5")?;
 
             // Enable audit logs for pg_session_jwt extension
-            writeln!(file, "pg_session_jwt.audit_log=on")?;
+            // TODO: Consider a good approach for shipping pg_session_jwt logs to the same sink as
+            // pgAudit - additional context in https://github.com/neondatabase/cloud/issues/28863
+            //
+            // writeln!(file, "pg_session_jwt.audit_log=on")?;
 
             // Add audit shared_preload_libraries, if they are not present.
             //

From a77919f4b2668277795d731a343f0955bf144eb7 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Mon, 12 May 2025 16:48:48 +0100
Subject: [PATCH 29/65] merge pg-sni-router into proxy (#11882)

## Problem

We realised that pg-sni-router doesn't need to be separate from proxy.
just a separate port.

## Summary of changes

Add pg-sni-router config to proxy and expose the service.
---
 proxy/src/binary/local_proxy.rs               |   4 +-
 proxy/src/binary/pg_sni_router.rs             | 106 +++++----
 proxy/src/binary/proxy.rs                     | 212 ++++++++++++------
 proxy/src/tls/server_config.rs                |  33 +--
 test_runner/fixtures/neon_fixtures.py         |  25 +++
 .../regress/test_proxy_metric_collection.py   |   4 +
 test_runner/regress/test_sni_router.py        |  26 ++-
 7 files changed, 283 insertions(+), 127 deletions(-)

diff --git a/proxy/src/binary/local_proxy.rs b/proxy/src/binary/local_proxy.rs
index ee7f6ffcd7..a566383390 100644
--- a/proxy/src/binary/local_proxy.rs
+++ b/proxy/src/binary/local_proxy.rs
@@ -423,8 +423,8 @@ async fn refresh_config_inner(
     if let Some(tls_config) = data.tls {
         let tls_config = tokio::task::spawn_blocking(move || {
             crate::tls::server_config::configure_tls(
-                &tls_config.key_path,
-                &tls_config.cert_path,
+                tls_config.key_path.as_ref(),
+                tls_config.cert_path.as_ref(),
                 None,
                 false,
             )
diff --git a/proxy/src/binary/pg_sni_router.rs b/proxy/src/binary/pg_sni_router.rs
index 19be058ac3..2239d064b2 100644
--- a/proxy/src/binary/pg_sni_router.rs
+++ b/proxy/src/binary/pg_sni_router.rs
@@ -1,8 +1,10 @@
-/// A stand-alone program that routes connections, e.g. from
-/// `aaa--bbb--1234.external.domain` to `aaa.bbb.internal.domain:1234`.
-///
-/// This allows connecting to pods/services running in the same Kubernetes cluster from
-/// the outside. Similar to an ingress controller for HTTPS.
+//! A stand-alone program that routes connections, e.g. from
+//! `aaa--bbb--1234.external.domain` to `aaa.bbb.internal.domain:1234`.
+//!
+//! This allows connecting to pods/services running in the same Kubernetes cluster from
+//! the outside. Similar to an ingress controller for HTTPS.
+
+use std::path::Path;
 use std::{net::SocketAddr, sync::Arc};
 
 use anyhow::{Context, anyhow, bail, ensure};
@@ -86,46 +88,7 @@ pub async fn run() -> anyhow::Result<()> {
         args.get_one::<String>("tls-key"),
         args.get_one::<String>("tls-cert"),
     ) {
-        (Some(key_path), Some(cert_path)) => {
-            let key = {
-                let key_bytes = std::fs::read(key_path).context("TLS key file")?;
-
-                let mut keys =
-                    rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]).collect_vec();
-
-                ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len());
-                PrivateKeyDer::Pkcs8(
-                    keys.pop()
-                        .expect("keys should not be empty")
-                        .context(format!("Failed to read TLS keys at '{key_path}'"))?,
-                )
-            };
-
-            let cert_chain_bytes = std::fs::read(cert_path)
-                .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?;
-
-            let cert_chain: Vec<_> = {
-                rustls_pemfile::certs(&mut &cert_chain_bytes[..])
-                .try_collect()
-                .with_context(|| {
-                    format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.")
-                })?
-            };
-
-            // needed for channel bindings
-            let first_cert = cert_chain.first().context("missing certificate")?;
-            let tls_server_end_point = TlsServerEndPoint::new(first_cert)?;
-
-            let tls_config =
-                rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider()))
-                    .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])
-                    .context("ring should support TLS1.2 and TLS1.3")?
-                    .with_no_client_auth()
-                    .with_single_cert(cert_chain, key)?
-                    .into();
-
-            (tls_config, tls_server_end_point)
-        }
+        (Some(key_path), Some(cert_path)) => parse_tls(key_path.as_ref(), cert_path.as_ref())?,
         _ => bail!("tls-key and tls-cert must be specified"),
     };
 
@@ -188,7 +151,58 @@ pub async fn run() -> anyhow::Result<()> {
     match signal {}
 }
 
-async fn task_main(
+pub(super) fn parse_tls(
+    key_path: &Path,
+    cert_path: &Path,
+) -> anyhow::Result<(Arc<rustls::ServerConfig>, TlsServerEndPoint)> {
+    let key = {
+        let key_bytes = std::fs::read(key_path).context("TLS key file")?;
+
+        let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]).collect_vec();
+
+        ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len());
+        PrivateKeyDer::Pkcs8(
+            keys.pop()
+                .expect("keys should not be empty")
+                .context(format!(
+                    "Failed to read TLS keys at '{}'",
+                    key_path.display()
+                ))?,
+        )
+    };
+
+    let cert_chain_bytes = std::fs::read(cert_path).context(format!(
+        "Failed to read TLS cert file at '{}.'",
+        cert_path.display()
+    ))?;
+
+    let cert_chain: Vec<_> = {
+        rustls_pemfile::certs(&mut &cert_chain_bytes[..])
+            .try_collect()
+            .with_context(|| {
+                format!(
+                    "Failed to read TLS certificate chain from bytes from file at '{}'.",
+                    cert_path.display()
+                )
+            })?
+    };
+
+    // needed for channel bindings
+    let first_cert = cert_chain.first().context("missing certificate")?;
+    let tls_server_end_point = TlsServerEndPoint::new(first_cert)?;
+
+    let tls_config =
+        rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider()))
+            .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])
+            .context("ring should support TLS1.2 and TLS1.3")?
+            .with_no_client_auth()
+            .with_single_cert(cert_chain, key)?
+            .into();
+
+    Ok((tls_config, tls_server_end_point))
+}
+
+pub(super) async fn task_main(
     dest_suffix: Arc<String>,
     tls_config: Arc<rustls::ServerConfig>,
     compute_tls_config: Option<Arc<rustls::ClientConfig>>,
diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs
index e03f2f33d9..fe0d551f7f 100644
--- a/proxy/src/binary/proxy.rs
+++ b/proxy/src/binary/proxy.rs
@@ -1,9 +1,10 @@
 use std::net::SocketAddr;
+use std::path::PathBuf;
 use std::pin::pin;
 use std::sync::Arc;
 use std::time::Duration;
 
-use anyhow::bail;
+use anyhow::{bail, ensure};
 use arc_swap::ArcSwapOption;
 use futures::future::Either;
 use remote_storage::RemoteStorageConfig;
@@ -62,18 +63,18 @@ struct ProxyCliArgs {
     region: String,
     /// listen for incoming client connections on ip:port
     #[clap(short, long, default_value = "127.0.0.1:4432")]
-    proxy: String,
+    proxy: SocketAddr,
     #[clap(value_enum, long, default_value_t = AuthBackendType::ConsoleRedirect)]
     auth_backend: AuthBackendType,
     /// listen for management callback connection on ip:port
     #[clap(short, long, default_value = "127.0.0.1:7000")]
-    mgmt: String,
+    mgmt: SocketAddr,
     /// listen for incoming http connections (metrics, etc) on ip:port
     #[clap(long, default_value = "127.0.0.1:7001")]
-    http: String,
+    http: SocketAddr,
     /// listen for incoming wss connections on ip:port
     #[clap(long)]
-    wss: Option<String>,
+    wss: Option<SocketAddr>,
     /// redirect unauthenticated users to the given uri in case of console redirect auth
     #[clap(short, long, default_value = "http://localhost:3000/psql_session/")]
     uri: String,
@@ -99,18 +100,18 @@ struct ProxyCliArgs {
     ///
     /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir
     #[clap(short = 'k', long, alias = "ssl-key")]
-    tls_key: Option<String>,
+    tls_key: Option<PathBuf>,
     /// path to TLS cert for client postgres connections
     ///
     /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir
     #[clap(short = 'c', long, alias = "ssl-cert")]
-    tls_cert: Option<String>,
+    tls_cert: Option<PathBuf>,
     /// Allow writing TLS session keys to the given file pointed to by the environment variable `SSLKEYLOGFILE`.
     #[clap(long, alias = "allow-ssl-keylogfile")]
     allow_tls_keylogfile: bool,
     /// path to directory with TLS certificates for client postgres connections
     #[clap(long)]
-    certs_dir: Option<String>,
+    certs_dir: Option<PathBuf>,
     /// timeout for the TLS handshake
     #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
     handshake_timeout: tokio::time::Duration,
@@ -229,6 +230,9 @@ struct ProxyCliArgs {
     // TODO: rename to `console_redirect_confirmation_timeout`.
     #[clap(long, default_value = "2m", value_parser = humantime::parse_duration)]
     webauth_confirmation_timeout: std::time::Duration,
+
+    #[clap(flatten)]
+    pg_sni_router: PgSniRouterArgs,
 }
 
 #[derive(clap::Args, Clone, Copy, Debug)]
@@ -277,6 +281,25 @@ struct SqlOverHttpArgs {
     sql_over_http_max_response_size_bytes: usize,
 }
 
+#[derive(clap::Args, Clone, Debug)]
+struct PgSniRouterArgs {
+    /// listen for incoming client connections on ip:port
+    #[clap(id = "sni-router-listen", long, default_value = "127.0.0.1:4432")]
+    listen: SocketAddr,
+    /// listen for incoming client connections on ip:port, requiring TLS to compute
+    #[clap(id = "sni-router-listen-tls", long, default_value = "127.0.0.1:4433")]
+    listen_tls: SocketAddr,
+    /// path to TLS key for client postgres connections
+    #[clap(id = "sni-router-tls-key", long)]
+    tls_key: Option<PathBuf>,
+    /// path to TLS cert for client postgres connections
+    #[clap(id = "sni-router-tls-cert", long)]
+    tls_cert: Option<PathBuf>,
+    /// append this domain zone to the SNI hostname to get the destination address
+    #[clap(id = "sni-router-destination", long)]
+    dest: Option<String>,
+}
+
 pub async fn run() -> anyhow::Result<()> {
     let _logging_guard = crate::logging::init().await?;
     let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
@@ -307,73 +330,51 @@ pub async fn run() -> anyhow::Result<()> {
         Either::Right(auth_backend) => info!("Authentication backend: {auth_backend:?}"),
     }
     info!("Using region: {}", args.aws_region);
-
-    // TODO: untangle the config args
-    let regional_redis_client = match (args.redis_auth_type.as_str(), &args.redis_notifications) {
-        ("plain", redis_url) => match redis_url {
-            None => {
-                bail!("plain auth requires redis_notifications to be set");
-            }
-            Some(url) => {
-                Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url.clone()))
-            }
-        },
-        ("irsa", _) => match (&args.redis_host, args.redis_port) {
-            (Some(host), Some(port)) => Some(
-                ConnectionWithCredentialsProvider::new_with_credentials_provider(
-                    host.to_string(),
-                    port,
-                    elasticache::CredentialsProvider::new(
-                        args.aws_region,
-                        args.redis_cluster_name,
-                        args.redis_user_id,
-                    )
-                    .await,
-                ),
-            ),
-            (None, None) => {
-                warn!(
-                    "irsa auth requires redis-host and redis-port to be set, continuing without regional_redis_client"
-                );
-                None
-            }
-            _ => {
-                bail!("redis-host and redis-port must be specified together");
-            }
-        },
-        _ => {
-            bail!("unknown auth type given");
-        }
-    };
-
-    let redis_notifications_client = if let Some(url) = args.redis_notifications {
-        Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url))
-    } else {
-        regional_redis_client.clone()
-    };
+    let (regional_redis_client, redis_notifications_client) = configure_redis(&args).await?;
 
     // Check that we can bind to address before further initialization
-    let http_address: SocketAddr = args.http.parse()?;
-    info!("Starting http on {http_address}");
-    let http_listener = TcpListener::bind(http_address).await?.into_std()?;
+    info!("Starting http on {}", args.http);
+    let http_listener = TcpListener::bind(args.http).await?.into_std()?;
 
-    let mgmt_address: SocketAddr = args.mgmt.parse()?;
-    info!("Starting mgmt on {mgmt_address}");
-    let mgmt_listener = TcpListener::bind(mgmt_address).await?;
+    info!("Starting mgmt on {}", args.mgmt);
+    let mgmt_listener = TcpListener::bind(args.mgmt).await?;
 
     let proxy_listener = if args.is_auth_broker {
         None
     } else {
-        let proxy_address: SocketAddr = args.proxy.parse()?;
-        info!("Starting proxy on {proxy_address}");
+        info!("Starting proxy on {}", args.proxy);
+        Some(TcpListener::bind(args.proxy).await?)
+    };
 
-        Some(TcpListener::bind(proxy_address).await?)
+    let sni_router_listeners = {
+        let args = &args.pg_sni_router;
+        if args.dest.is_some() {
+            ensure!(
+                args.tls_key.is_some(),
+                "sni-router-tls-key must be provided"
+            );
+            ensure!(
+                args.tls_cert.is_some(),
+                "sni-router-tls-cert must be provided"
+            );
+
+            info!(
+                "Starting pg-sni-router on {} and {}",
+                args.listen, args.listen_tls
+            );
+
+            Some((
+                TcpListener::bind(args.listen).await?,
+                TcpListener::bind(args.listen_tls).await?,
+            ))
+        } else {
+            None
+        }
     };
 
     // TODO: rename the argument to something like serverless.
     // It now covers more than just websockets, it also covers SQL over HTTP.
     let serverless_listener = if let Some(serverless_address) = args.wss {
-        let serverless_address: SocketAddr = serverless_address.parse()?;
         info!("Starting wss on {serverless_address}");
         Some(TcpListener::bind(serverless_address).await?)
     } else if args.is_auth_broker {
@@ -458,6 +459,37 @@ pub async fn run() -> anyhow::Result<()> {
         }
     }
 
+    // spawn pg-sni-router mode.
+    if let Some((listen, listen_tls)) = sni_router_listeners {
+        let args = args.pg_sni_router;
+        let dest = args.dest.expect("already asserted it is set");
+        let key_path = args.tls_key.expect("already asserted it is set");
+        let cert_path = args.tls_cert.expect("already asserted it is set");
+
+        let (tls_config, tls_server_end_point) =
+            super::pg_sni_router::parse_tls(&key_path, &cert_path)?;
+
+        let dest = Arc::new(dest);
+
+        client_tasks.spawn(super::pg_sni_router::task_main(
+            dest.clone(),
+            tls_config.clone(),
+            None,
+            tls_server_end_point,
+            listen,
+            cancellation_token.clone(),
+        ));
+
+        client_tasks.spawn(super::pg_sni_router::task_main(
+            dest,
+            tls_config,
+            Some(config.connect_to_compute.tls.clone()),
+            tls_server_end_point,
+            listen_tls,
+            cancellation_token.clone(),
+        ));
+    }
+
     client_tasks.spawn(crate::context::parquet::worker(
         cancellation_token.clone(),
         args.parquet_upload,
@@ -565,7 +597,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
         (Some(key_path), Some(cert_path)) => Some(config::configure_tls(
             key_path,
             cert_path,
-            args.certs_dir.as_ref(),
+            args.certs_dir.as_deref(),
             args.allow_tls_keylogfile,
         )?),
         (None, None) => None,
@@ -811,6 +843,60 @@ fn build_auth_backend(
     }
 }
 
+async fn configure_redis(
+    args: &ProxyCliArgs,
+) -> anyhow::Result<(
+    Option<ConnectionWithCredentialsProvider>,
+    Option<ConnectionWithCredentialsProvider>,
+)> {
+    // TODO: untangle the config args
+    let regional_redis_client = match (args.redis_auth_type.as_str(), &args.redis_notifications) {
+        ("plain", redis_url) => match redis_url {
+            None => {
+                bail!("plain auth requires redis_notifications to be set");
+            }
+            Some(url) => {
+                Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url.clone()))
+            }
+        },
+        ("irsa", _) => match (&args.redis_host, args.redis_port) {
+            (Some(host), Some(port)) => Some(
+                ConnectionWithCredentialsProvider::new_with_credentials_provider(
+                    host.to_string(),
+                    port,
+                    elasticache::CredentialsProvider::new(
+                        args.aws_region.clone(),
+                        args.redis_cluster_name.clone(),
+                        args.redis_user_id.clone(),
+                    )
+                    .await,
+                ),
+            ),
+            (None, None) => {
+                // todo: upgrade to error?
+                warn!(
+                    "irsa auth requires redis-host and redis-port to be set, continuing without regional_redis_client"
+                );
+                None
+            }
+            _ => {
+                bail!("redis-host and redis-port must be specified together");
+            }
+        },
+        _ => {
+            bail!("unknown auth type given");
+        }
+    };
+
+    let redis_notifications_client = if let Some(url) = &args.redis_notifications {
+        Some(ConnectionWithCredentialsProvider::new_with_static_credentials(&**url))
+    } else {
+        regional_redis_client.clone()
+    };
+
+    Ok((regional_redis_client, redis_notifications_client))
+}
+
 #[cfg(test)]
 mod tests {
     use std::time::Duration;
diff --git a/proxy/src/tls/server_config.rs b/proxy/src/tls/server_config.rs
index 8f8917ef62..66c53b3aff 100644
--- a/proxy/src/tls/server_config.rs
+++ b/proxy/src/tls/server_config.rs
@@ -1,4 +1,5 @@
 use std::collections::{HashMap, HashSet};
+use std::path::Path;
 use std::sync::Arc;
 
 use anyhow::{Context, bail};
@@ -21,9 +22,9 @@ pub struct TlsConfig {
 
 /// Configure TLS for the main endpoint.
 pub fn configure_tls(
-    key_path: &str,
-    cert_path: &str,
-    certs_dir: Option<&String>,
+    key_path: &Path,
+    cert_path: &Path,
+    certs_dir: Option<&Path>,
     allow_tls_keylogfile: bool,
 ) -> anyhow::Result<TlsConfig> {
     // add default certificate
@@ -39,8 +40,7 @@ pub fn configure_tls(
                 let key_path = path.join("tls.key");
                 let cert_path = path.join("tls.crt");
                 if key_path.exists() && cert_path.exists() {
-                    cert_resolver
-                        .add_cert_path(&key_path.to_string_lossy(), &cert_path.to_string_lossy())?;
+                    cert_resolver.add_cert_path(&key_path, &cert_path)?;
                 }
             }
         }
@@ -86,7 +86,7 @@ pub struct CertResolver {
 }
 
 impl CertResolver {
-    fn parse_new(key_path: &str, cert_path: &str) -> anyhow::Result<Self> {
+    fn parse_new(key_path: &Path, cert_path: &Path) -> anyhow::Result<Self> {
         let (priv_key, cert_chain) = parse_key_cert(key_path, cert_path)?;
         Self::new(priv_key, cert_chain)
     }
@@ -103,7 +103,7 @@ impl CertResolver {
         Ok(Self { certs, default })
     }
 
-    fn add_cert_path(&mut self, key_path: &str, cert_path: &str) -> anyhow::Result<()> {
+    fn add_cert_path(&mut self, key_path: &Path, cert_path: &Path) -> anyhow::Result<()> {
         let (priv_key, cert_chain) = parse_key_cert(key_path, cert_path)?;
         self.add_cert(priv_key, cert_chain)
     }
@@ -124,26 +124,29 @@ impl CertResolver {
 }
 
 fn parse_key_cert(
-    key_path: &str,
-    cert_path: &str,
+    key_path: &Path,
+    cert_path: &Path,
 ) -> anyhow::Result<(PrivateKeyDer<'static>, Vec<CertificateDer<'static>>)> {
     let priv_key = {
         let key_bytes = std::fs::read(key_path)
-            .with_context(|| format!("Failed to read TLS keys at '{key_path}'"))?;
+            .with_context(|| format!("Failed to read TLS keys at '{}'", key_path.display()))?;
         rustls_pemfile::private_key(&mut &key_bytes[..])
-            .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))?
-            .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))?
+            .with_context(|| format!("Failed to parse TLS keys at '{}'", key_path.display()))?
+            .with_context(|| format!("Failed to parse TLS keys at '{}'", key_path.display()))?
     };
 
-    let cert_chain_bytes = std::fs::read(cert_path)
-        .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?;
+    let cert_chain_bytes = std::fs::read(cert_path).context(format!(
+        "Failed to read TLS cert file at '{}.'",
+        cert_path.display()
+    ))?;
 
     let cert_chain = {
         rustls_pemfile::certs(&mut &cert_chain_bytes[..])
             .try_collect()
             .with_context(|| {
                 format!(
-                    "Failed to read TLS certificate chain from bytes from file at '{cert_path}'."
+                    "Failed to read TLS certificate chain from bytes from file at '{}'.",
+                    cert_path.display()
                 )
             })?
     };
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 8f56ee4392..2801a0e867 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -3607,6 +3607,8 @@ class NeonProxy(PgProtocol):
         http_port: int,
         mgmt_port: int,
         external_http_port: int,
+        router_port: int,
+        router_tls_port: int,
         auth_backend: NeonProxy.AuthBackend,
         metric_collection_endpoint: str | None = None,
         metric_collection_interval: str | None = None,
@@ -3623,6 +3625,8 @@ class NeonProxy(PgProtocol):
         self.test_output_dir = test_output_dir
         self.proxy_port = proxy_port
         self.mgmt_port = mgmt_port
+        self.router_port = router_port
+        self.router_tls_port = router_tls_port
         self.auth_backend = auth_backend
         self.metric_collection_endpoint = metric_collection_endpoint
         self.metric_collection_interval = metric_collection_interval
@@ -3637,6 +3641,14 @@ class NeonProxy(PgProtocol):
         key_path = self.test_output_dir / "proxy.key"
         generate_proxy_tls_certs("*.local.neon.build", key_path, crt_path)
 
+        # generate key for pg-sni-router.
+        # endpoint.namespace.local.neon.build resolves to 127.0.0.1
+        generate_proxy_tls_certs(
+            "endpoint.namespace.local.neon.build",
+            self.test_output_dir / "router.key",
+            self.test_output_dir / "router.crt",
+        )
+
         args = [
             str(self.neon_binpath / "proxy"),
             *["--http", f"{self.host}:{self.http_port}"],
@@ -3646,6 +3658,11 @@ class NeonProxy(PgProtocol):
             *["--sql-over-http-timeout", f"{self.http_timeout_seconds}s"],
             *["-c", str(crt_path)],
             *["-k", str(key_path)],
+            *["--sni-router-listen", f"{self.host}:{self.router_port}"],
+            *["--sni-router-listen-tls", f"{self.host}:{self.router_tls_port}"],
+            *["--sni-router-tls-cert", str(self.test_output_dir / "router.crt")],
+            *["--sni-router-tls-key", str(self.test_output_dir / "router.key")],
+            *["--sni-router-destination", "local.neon.build"],
             *self.auth_backend.extra_args(),
         ]
 
@@ -3945,6 +3962,8 @@ def link_proxy(
     proxy_port = port_distributor.get_port()
     mgmt_port = port_distributor.get_port()
     external_http_port = port_distributor.get_port()
+    router_port = port_distributor.get_port()
+    router_tls_port = port_distributor.get_port()
 
     with NeonProxy(
         neon_binpath=neon_binpath,
@@ -3952,6 +3971,8 @@ def link_proxy(
         proxy_port=proxy_port,
         http_port=http_port,
         mgmt_port=mgmt_port,
+        router_port=router_port,
+        router_tls_port=router_tls_port,
         external_http_port=external_http_port,
         auth_backend=NeonProxy.Link(),
     ) as proxy:
@@ -3985,6 +4006,8 @@ def static_proxy(
     mgmt_port = port_distributor.get_port()
     http_port = port_distributor.get_port()
     external_http_port = port_distributor.get_port()
+    router_port = port_distributor.get_port()
+    router_tls_port = port_distributor.get_port()
 
     with NeonProxy(
         neon_binpath=neon_binpath,
@@ -3992,6 +4015,8 @@ def static_proxy(
         proxy_port=proxy_port,
         http_port=http_port,
         mgmt_port=mgmt_port,
+        router_port=router_port,
+        router_tls_port=router_tls_port,
         external_http_port=external_http_port,
         auth_backend=NeonProxy.Postgres(auth_endpoint),
     ) as proxy:
diff --git a/test_runner/regress/test_proxy_metric_collection.py b/test_runner/regress/test_proxy_metric_collection.py
index 85d8a6daaa..7442d50f68 100644
--- a/test_runner/regress/test_proxy_metric_collection.py
+++ b/test_runner/regress/test_proxy_metric_collection.py
@@ -52,6 +52,8 @@ def proxy_with_metric_collector(
     proxy_port = port_distributor.get_port()
     mgmt_port = port_distributor.get_port()
     external_http_port = port_distributor.get_port()
+    router_port = port_distributor.get_port()
+    router_tls_port = port_distributor.get_port()
 
     (host, port) = httpserver_listen_address
     metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events"
@@ -63,6 +65,8 @@ def proxy_with_metric_collector(
         proxy_port=proxy_port,
         http_port=http_port,
         mgmt_port=mgmt_port,
+        router_port=router_port,
+        router_tls_port=router_tls_port,
         external_http_port=external_http_port,
         metric_collection_endpoint=metric_collection_endpoint,
         metric_collection_interval=metric_collection_interval,
diff --git a/test_runner/regress/test_sni_router.py b/test_runner/regress/test_sni_router.py
index 19952fc71b..61893f22ba 100644
--- a/test_runner/regress/test_sni_router.py
+++ b/test_runner/regress/test_sni_router.py
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING
 
 import backoff
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import PgProtocol, VanillaPostgres
+from fixtures.neon_fixtures import NeonProxy, PgProtocol, VanillaPostgres
 
 if TYPE_CHECKING:
     from pathlib import Path
@@ -41,6 +41,7 @@ class PgSniRouter(PgProtocol):
         self,
         neon_binpath: Path,
         port: int,
+        tls_port: int,
         destination: str,
         tls_cert: Path,
         tls_key: Path,
@@ -53,6 +54,7 @@ class PgSniRouter(PgProtocol):
         self.host = host
         self.neon_binpath = neon_binpath
         self.port = port
+        self.tls_port = tls_port
         self.destination = destination
         self.tls_cert = tls_cert
         self.tls_key = tls_key
@@ -64,6 +66,7 @@ class PgSniRouter(PgProtocol):
         args = [
             str(self.neon_binpath / "pg_sni_router"),
             *["--listen", f"127.0.0.1:{self.port}"],
+            *["--listen-tls", f"127.0.0.1:{self.tls_port}"],
             *["--tls-cert", str(self.tls_cert)],
             *["--tls-key", str(self.tls_key)],
             *["--destination", self.destination],
@@ -127,10 +130,12 @@ def test_pg_sni_router(
     pg_port = vanilla_pg.default_options["port"]
 
     router_port = port_distributor.get_port()
+    router_tls_port = port_distributor.get_port()
 
     with PgSniRouter(
         neon_binpath=neon_binpath,
         port=router_port,
+        tls_port=router_tls_port,
         destination="local.neon.build",
         tls_cert=test_output_dir / "router.crt",
         tls_key=test_output_dir / "router.key",
@@ -146,3 +151,22 @@ def test_pg_sni_router(
             hostaddr="127.0.0.1",
         )
         assert out[0][0] == 1
+
+
+def test_pg_sni_router_in_proxy(
+    static_proxy: NeonProxy,
+    vanilla_pg: VanillaPostgres,
+):
+    # static_proxy starts this.
+    assert vanilla_pg.is_running()
+    pg_port = vanilla_pg.default_options["port"]
+
+    out = static_proxy.safe_psql(
+        "select 1",
+        dbname="postgres",
+        sslmode="require",
+        host=f"endpoint--namespace--{pg_port}.local.neon.build",
+        hostaddr="127.0.0.1",
+        port=static_proxy.router_port,
+    )
+    assert out[0][0] == 1

From 9971fba5848ca3928b54e123a338d454e6c65283 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Mon, 12 May 2025 12:36:07 -0500
Subject: [PATCH 30/65] Properly configure the dynamic loader to load our
 compiled libraries (#11858)

The first line in /etc/ld.so.conf is:

	/etc/ld.so.conf.d/*

We want to control library load order so that our compiled libraries are
picked up before others from system packages. The previous solution
appended /usr/local/lib to the end of /etc/ld.so.conf, i.e. after that
first line, which allowed the system libraries to load before ours.

Part-of: https://github.com/neondatabase/neon/issues/11857

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 compute/compute-node.Dockerfile                 | 3 ++-
 compute/etc/ld.so.conf.d/00-neon.conf           | 1 +
 docker-compose/compute_wrapper/shell/compute.sh | 8 ++++++++
 3 files changed, 11 insertions(+), 1 deletion(-)
 create mode 100644 compute/etc/ld.so.conf.d/00-neon.conf

diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile
index 6233eaf709..e6e6053554 100644
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -1971,7 +1971,8 @@ COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql
 COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml
 
 # Make the libraries we built available
-RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
+COPY --chmod=0666 compute/etc/ld.so.conf.d/00-neon.conf /etc/ld.so.conf.d/00-neon.conf
+RUN /sbin/ldconfig
 
 # rsyslog config permissions
 # directory for rsyslogd pid file
diff --git a/compute/etc/ld.so.conf.d/00-neon.conf b/compute/etc/ld.so.conf.d/00-neon.conf
new file mode 100644
index 0000000000..e8e4bdcd42
--- /dev/null
+++ b/compute/etc/ld.so.conf.d/00-neon.conf
@@ -0,0 +1 @@
+/usr/local/lib
diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh
index 723b2f8afb..20a1ffb7a0 100755
--- a/docker-compose/compute_wrapper/shell/compute.sh
+++ b/docker-compose/compute_wrapper/shell/compute.sh
@@ -14,6 +14,14 @@ PG_VERSION=${PG_VERSION:-14}
 CONFIG_FILE_ORG=/var/db/postgres/configs/config.json
 CONFIG_FILE=/tmp/config.json
 
+# Test that the first library path that the dynamic loader looks in is the path
+# that we use for custom compiled software
+first_path="$(ldconfig --verbose 2>/dev/null \
+    | grep --invert-match ^$'\t' \
+    | cut --delimiter=: --fields=1 \
+    | head --lines=1)"
+test "$first_path" == '/usr/local/lib' || true # Remove the || true in a follow-up PR. Needed for backwards compat.
+
 echo "Waiting pageserver become ready."
 while ! nc -z pageserver 6400; do
      sleep 1;

From a113c48c43c9ff0130e404e47a55e4721bbb63a4 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Tue, 13 May 2025 09:33:53 +0100
Subject: [PATCH 31/65] proxy: fix redis batching support (#11905)

## Problem

For `StoreCancelKey`, we were inserting 2 commands, but we were not
inserting two replies. This mismatch leads to errors when decoding the
response.

## Summary of changes

Abstract the command + reply pipeline so that commands and replies are
registered at the same time.
---
 proxy/src/cancellation.rs | 125 ++++++++++++++++++++++++--------------
 proxy/src/redis/kv_ops.rs |   2 +-
 2 files changed, 79 insertions(+), 48 deletions(-)

diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs
index c5ba04eb8c..f34fb747ca 100644
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -6,12 +6,12 @@ use ipnet::{IpNet, Ipv4Net, Ipv6Net};
 use postgres_client::CancelToken;
 use postgres_client::tls::MakeTlsConnect;
 use pq_proto::CancelKeyData;
-use redis::{FromRedisValue, Pipeline, Value, pipe};
+use redis::{Cmd, FromRedisValue, Value};
 use serde::{Deserialize, Serialize};
 use thiserror::Error;
 use tokio::net::TcpStream;
 use tokio::sync::{mpsc, oneshot};
-use tracing::{debug, info, warn};
+use tracing::{debug, error, info, warn};
 
 use crate::auth::backend::ComputeUserInfo;
 use crate::auth::{AuthError, check_peer_addr_is_in_list};
@@ -56,8 +56,70 @@ pub enum CancelKeyOp {
     },
 }
 
+pub struct Pipeline {
+    inner: redis::Pipeline,
+    replies: Vec<CancelReplyOp>,
+}
+
+impl Pipeline {
+    fn with_capacity(n: usize) -> Self {
+        Self {
+            inner: redis::Pipeline::with_capacity(n),
+            replies: Vec::with_capacity(n),
+        }
+    }
+
+    async fn execute(&mut self, client: &mut RedisKVClient) {
+        let responses = self.replies.len();
+        let batch_size = self.inner.len();
+
+        match client.query(&self.inner).await {
+            // for each reply, we expect that many values.
+            Ok(Value::Array(values)) if values.len() == responses => {
+                debug!(
+                    batch_size,
+                    responses, "successfully completed cancellation jobs",
+                );
+                for (value, reply) in std::iter::zip(values, self.replies.drain(..)) {
+                    reply.send_value(value);
+                }
+            }
+            Ok(value) => {
+                error!(batch_size, ?value, "unexpected redis return value");
+                for reply in self.replies.drain(..) {
+                    reply.send_err(anyhow!("incorrect response type from redis"));
+                }
+            }
+            Err(err) => {
+                for reply in self.replies.drain(..) {
+                    reply.send_err(anyhow!("could not send cmd to redis: {err}"));
+                }
+            }
+        }
+
+        self.inner.clear();
+        self.replies.clear();
+    }
+
+    fn add_command_with_reply(&mut self, cmd: Cmd, reply: CancelReplyOp) {
+        self.inner.add_command(cmd);
+        self.replies.push(reply);
+    }
+
+    fn add_command_no_reply(&mut self, cmd: Cmd) {
+        self.inner.add_command(cmd).ignore();
+    }
+
+    fn add_command(&mut self, cmd: Cmd, reply: Option<CancelReplyOp>) {
+        match reply {
+            Some(reply) => self.add_command_with_reply(cmd, reply),
+            None => self.add_command_no_reply(cmd),
+        }
+    }
+}
+
 impl CancelKeyOp {
-    fn register(self, pipe: &mut Pipeline) -> Option<CancelReplyOp> {
+    fn register(self, pipe: &mut Pipeline) {
         #[allow(clippy::used_underscore_binding)]
         match self {
             CancelKeyOp::StoreCancelKey {
@@ -68,18 +130,18 @@ impl CancelKeyOp {
                 _guard,
                 expire,
             } => {
-                pipe.hset(&key, field, value);
-                pipe.expire(key, expire);
-                let resp_tx = resp_tx?;
-                Some(CancelReplyOp::StoreCancelKey { resp_tx, _guard })
+                let reply =
+                    resp_tx.map(|resp_tx| CancelReplyOp::StoreCancelKey { resp_tx, _guard });
+                pipe.add_command(Cmd::hset(&key, field, value), reply);
+                pipe.add_command_no_reply(Cmd::expire(key, expire));
             }
             CancelKeyOp::GetCancelData {
                 key,
                 resp_tx,
                 _guard,
             } => {
-                pipe.hgetall(key);
-                Some(CancelReplyOp::GetCancelData { resp_tx, _guard })
+                let reply = CancelReplyOp::GetCancelData { resp_tx, _guard };
+                pipe.add_command_with_reply(Cmd::hgetall(key), reply);
             }
             CancelKeyOp::RemoveCancelKey {
                 key,
@@ -87,9 +149,9 @@ impl CancelKeyOp {
                 resp_tx,
                 _guard,
             } => {
-                pipe.hdel(key, field);
-                let resp_tx = resp_tx?;
-                Some(CancelReplyOp::RemoveCancelKey { resp_tx, _guard })
+                let reply =
+                    resp_tx.map(|resp_tx| CancelReplyOp::RemoveCancelKey { resp_tx, _guard });
+                pipe.add_command(Cmd::hdel(key, field), reply);
             }
         }
     }
@@ -170,8 +232,8 @@ pub async fn handle_cancel_messages(
     client: &mut RedisKVClient,
     mut rx: mpsc::Receiver<CancelKeyOp>,
 ) -> anyhow::Result<()> {
-    let mut batch = Vec::new();
-    let mut replies = vec![];
+    let mut batch = Vec::with_capacity(BATCH_SIZE);
+    let mut pipeline = Pipeline::with_capacity(BATCH_SIZE);
 
     loop {
         if rx.recv_many(&mut batch, BATCH_SIZE).await == 0 {
@@ -182,42 +244,11 @@ pub async fn handle_cancel_messages(
         let batch_size = batch.len();
         debug!(batch_size, "running cancellation jobs");
 
-        let mut pipe = pipe();
         for msg in batch.drain(..) {
-            if let Some(reply) = msg.register(&mut pipe) {
-                replies.push(reply);
-            } else {
-                pipe.ignore();
-            }
+            msg.register(&mut pipeline);
         }
 
-        let responses = replies.len();
-
-        match client.query(pipe).await {
-            // for each reply, we expect that many values.
-            Ok(Value::Array(values)) if values.len() == responses => {
-                debug!(
-                    batch_size,
-                    responses, "successfully completed cancellation jobs",
-                );
-                for (value, reply) in std::iter::zip(values, replies.drain(..)) {
-                    reply.send_value(value);
-                }
-            }
-            Ok(value) => {
-                debug!(?value, "unexpected redis return value");
-                for reply in replies.drain(..) {
-                    reply.send_err(anyhow!("incorrect response type from redis"));
-                }
-            }
-            Err(err) => {
-                for reply in replies.drain(..) {
-                    reply.send_err(anyhow!("could not send cmd to redis: {err}"));
-                }
-            }
-        }
-
-        replies.clear();
+        pipeline.execute(client).await;
     }
 }
 
diff --git a/proxy/src/redis/kv_ops.rs b/proxy/src/redis/kv_ops.rs
index aa627b29a6..f71730c533 100644
--- a/proxy/src/redis/kv_ops.rs
+++ b/proxy/src/redis/kv_ops.rs
@@ -47,7 +47,7 @@ impl RedisKVClient {
 
     pub(crate) async fn query<T: FromRedisValue>(
         &mut self,
-        q: impl Queryable,
+        q: &impl Queryable,
     ) -> anyhow::Result<T> {
         if !self.limiter.check() {
             tracing::info!("Rate limit exceeded. Skipping query");

From a9979620c508a089f3f3d6e020877349ff555b0f Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Tue, 13 May 2025 16:53:35 +0800
Subject: [PATCH 32/65] fix(remote_storage): continue on Azure+AWS retryable
 error (#11903)

## Problem

We implemented the retry logic in AWS S3 but not in Azure. Therefore, if
there is an error during Azure listing, we will return an Err to the
caller, and the stream will end without fetching more tenants.

Part of https://github.com/neondatabase/neon/issues/11159

Without this fix, listing tenants will stop once we hit an error (could
be a network error -- those happen more frequently on Azure). If we
happen to stop at a point where we have only listed part of the shards,
we will hit the "missed shards" error or even remove layers that are
still in use.

This bug (for Azure listing) was introduced as part of
https://github.com/neondatabase/neon/pull/9840

There is also a bug that stops the stream for AWS when there's a timeout
-- this is fixed along with this patch.

## Summary of changes

Retry the request on error. In the future, we should make such streams
return something like `Result<Result<T>>` where the outer result is the
error that ends the stream and the inner one is the error that should be
retried by the caller.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 libs/remote_storage/src/azure_blob.rs | 11 +++++++++--
 libs/remote_storage/src/s3_bucket.rs  |  9 ++++++++-
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs
index a5cddb840f..5363e935e3 100644
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -330,11 +330,18 @@ impl AzureBlobStorage {
                 if let Err(DownloadError::Timeout) = &next_item {
                     timeout_try_cnt += 1;
                     if timeout_try_cnt <= 5 {
-                        continue;
+                        continue 'outer;
                     }
                 }
 
-                let next_item = next_item?;
+                let next_item = match next_item {
+                    Ok(next_item) => next_item,
+                    Err(e) => {
+                        // The error is potentially retryable, so we must rewind the loop after yielding.
+                        yield Err(e);
+                        continue 'outer;
+                    },
+                };
 
                 // Log a warning if we saw two timeouts in a row before a successful request
                 if timeout_try_cnt > 2 {
diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index 918d9d5a6b..d98ff552ee 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -657,7 +657,14 @@ impl RemoteStorage for S3Bucket {
                     res = request => Ok(res),
                     _ = tokio::time::sleep(self.timeout) => Err(DownloadError::Timeout),
                     _ = cancel.cancelled() => Err(DownloadError::Cancelled),
-                }?;
+                };
+
+                if let Err(DownloadError::Timeout) = &response {
+                    yield Err(DownloadError::Timeout);
+                    continue 'outer;
+                }
+
+                let response = response?; // always yield cancellation errors and stop the stream
 
                 let response = response
                     .context("Failed to list S3 prefixes")

From 34a42b00caf9e4c45fa3ce29ba95aa2ae7278d05 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Tue, 13 May 2025 17:49:14 +0800
Subject: [PATCH 33/65] feat(pageserver): add PostHog lite client (#11821)

## Problem

part of https://github.com/neondatabase/neon/issues/11813

## Summary of changes

Add a lite PostHog client that only uses the local flag evaluation
functionality. Added a test case that parses an example feature flag and
gets the evaluation result.

TODO: support boolean flag, remote config; implement all operators in
PostHog.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 Cargo.lock                          |  16 +
 Cargo.toml                          |   1 +
 libs/posthog_client_lite/Cargo.toml |  14 +
 libs/posthog_client_lite/src/lib.rs | 634 ++++++++++++++++++++++++++++
 workspace_hack/Cargo.toml           |   3 +
 5 files changed, 668 insertions(+)
 create mode 100644 libs/posthog_client_lite/Cargo.toml
 create mode 100644 libs/posthog_client_lite/src/lib.rs

diff --git a/Cargo.lock b/Cargo.lock
index 7083baa092..6df5d4a71e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4848,6 +4848,19 @@ dependencies = [
  "workspace_hack",
 ]
 
+[[package]]
+name = "posthog_client_lite"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "reqwest",
+ "serde",
+ "serde_json",
+ "sha2",
+ "thiserror 1.0.69",
+ "workspace_hack",
+]
+
 [[package]]
 name = "powerfmt"
 version = "0.2.0"
@@ -8439,8 +8452,10 @@ dependencies = [
  "fail",
  "form_urlencoded",
  "futures-channel",
+ "futures-core",
  "futures-executor",
  "futures-io",
+ "futures-task",
  "futures-util",
  "generic-array",
  "getrandom 0.2.11",
@@ -8470,6 +8485,7 @@ dependencies = [
  "once_cell",
  "p256 0.13.2",
  "parquet",
+ "percent-encoding",
  "prettyplease",
  "proc-macro2",
  "prost 0.13.3",
diff --git a/Cargo.toml b/Cargo.toml
index 8d4cc4a75a..6b87ce549d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -26,6 +26,7 @@ members = [
     "libs/utils",
     "libs/consumption_metrics",
     "libs/postgres_backend",
+    "libs/posthog_client_lite",
     "libs/pq_proto",
     "libs/tenant_size_model",
     "libs/metrics",
diff --git a/libs/posthog_client_lite/Cargo.toml b/libs/posthog_client_lite/Cargo.toml
new file mode 100644
index 0000000000..7c19bf2ccb
--- /dev/null
+++ b/libs/posthog_client_lite/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "posthog_client_lite"
+version = "0.1.0"
+edition = "2024"
+license.workspace = true
+
+[dependencies]
+anyhow.workspace = true
+reqwest.workspace = true
+serde.workspace = true
+serde_json.workspace = true
+sha2.workspace = true
+workspace_hack.workspace = true
+thiserror.workspace = true
diff --git a/libs/posthog_client_lite/src/lib.rs b/libs/posthog_client_lite/src/lib.rs
new file mode 100644
index 0000000000..53deb26ab7
--- /dev/null
+++ b/libs/posthog_client_lite/src/lib.rs
@@ -0,0 +1,634 @@
+//! A lite version of the PostHog client that only supports local evaluation of feature flags.
+
+use std::collections::HashMap;
+
+use serde::{Deserialize, Serialize};
+use serde_json::json;
+use sha2::Digest;
+
+#[derive(Debug, thiserror::Error)]
+pub enum PostHogEvaluationError {
+    /// The feature flag is not available, for example, because the local evaluation data is not populated yet.
+    #[error("Feature flag not available: {0}")]
+    NotAvailable(String),
+    #[error("No condition group is matched")]
+    NoConditionGroupMatched,
+    /// Real errors, e.g., the rollout percentage does not add up to 100.
+    #[error("Failed to evaluate feature flag: {0}")]
+    Internal(String),
+}
+
+#[derive(Deserialize)]
+pub struct LocalEvaluationResponse {
+    #[allow(dead_code)]
+    flags: Vec<LocalEvaluationFlag>,
+}
+
+#[derive(Deserialize)]
+pub struct LocalEvaluationFlag {
+    key: String,
+    filters: LocalEvaluationFlagFilters,
+    active: bool,
+}
+
+#[derive(Deserialize)]
+pub struct LocalEvaluationFlagFilters {
+    groups: Vec<LocalEvaluationFlagFilterGroup>,
+    multivariate: LocalEvaluationFlagMultivariate,
+}
+
+#[derive(Deserialize)]
+pub struct LocalEvaluationFlagFilterGroup {
+    variant: Option<String>,
+    properties: Option<Vec<LocalEvaluationFlagFilterProperty>>,
+    rollout_percentage: i64,
+}
+
+#[derive(Deserialize)]
+pub struct LocalEvaluationFlagFilterProperty {
+    key: String,
+    value: PostHogFlagFilterPropertyValue,
+    operator: String,
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+#[serde(untagged)]
+pub enum PostHogFlagFilterPropertyValue {
+    String(String),
+    Number(f64),
+    Boolean(bool),
+    List(Vec<String>),
+}
+
+#[derive(Deserialize)]
+pub struct LocalEvaluationFlagMultivariate {
+    variants: Vec<LocalEvaluationFlagMultivariateVariant>,
+}
+
+#[derive(Deserialize)]
+pub struct LocalEvaluationFlagMultivariateVariant {
+    key: String,
+    rollout_percentage: i64,
+}
+
+pub struct FeatureStore {
+    flags: HashMap<String, LocalEvaluationFlag>,
+}
+
+impl Default for FeatureStore {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+enum GroupEvaluationResult {
+    MatchedAndOverride(String),
+    MatchedAndEvaluate,
+    Unmatched,
+}
+
+impl FeatureStore {
+    pub fn new() -> Self {
+        Self {
+            flags: HashMap::new(),
+        }
+    }
+
+    pub fn set_flags(&mut self, flags: Vec<LocalEvaluationFlag>) {
+        self.flags.clear();
+        for flag in flags {
+            self.flags.insert(flag.key.clone(), flag);
+        }
+    }
+
+    /// Generate a consistent hash for a user ID (e.g., tenant ID).
+    ///
+    /// The implementation is different from PostHog SDK. In PostHog SDK, it is sha1 of `user_id.distinct_id.salt`.
+    /// However, as we do not upload all of our tenant IDs to PostHog, we do not have the PostHog distinct_id for a
+    /// tenant. Therefore, the way we compute it is sha256 of `user_id.feature_id.salt`.
+    fn consistent_hash(user_id: &str, flag_key: &str, salt: &str) -> f64 {
+        let mut hasher = sha2::Sha256::new();
+        hasher.update(user_id);
+        hasher.update(".");
+        hasher.update(flag_key);
+        hasher.update(".");
+        hasher.update(salt);
+        let hash = hasher.finalize();
+        let hash_int = u64::from_le_bytes(hash[..8].try_into().unwrap());
+        hash_int as f64 / u64::MAX as f64
+    }
+
+    /// Evaluate a condition. Returns an error if the condition cannot be evaluated due to parsing error or missing
+    /// property.
+    fn evaluate_condition(
+        &self,
+        operator: &str,
+        provided: &PostHogFlagFilterPropertyValue,
+        requested: &PostHogFlagFilterPropertyValue,
+    ) -> Result<bool, PostHogEvaluationError> {
+        match operator {
+            "exact" => {
+                let PostHogFlagFilterPropertyValue::String(provided) = provided else {
+                    // Left should be a string
+                    return Err(PostHogEvaluationError::Internal(format!(
+                        "The left side of the condition is not a string: {:?}",
+                        provided
+                    )));
+                };
+                let PostHogFlagFilterPropertyValue::List(requested) = requested else {
+                    // Right should be a list of string
+                    return Err(PostHogEvaluationError::Internal(format!(
+                        "The right side of the condition is not a list: {:?}",
+                        requested
+                    )));
+                };
+                Ok(requested.contains(provided))
+            }
+            "lt" | "gt" => {
+                let PostHogFlagFilterPropertyValue::String(requested) = requested else {
+                    // Right should be a string
+                    return Err(PostHogEvaluationError::Internal(format!(
+                        "The right side of the condition is not a string: {:?}",
+                        requested
+                    )));
+                };
+                let Ok(requested) = requested.parse::<f64>() else {
+                    return Err(PostHogEvaluationError::Internal(format!(
+                        "Can not parse the right side of the condition as a number: {:?}",
+                        requested
+                    )));
+                };
+                // Left can either be a number or a string
+                let provided = match provided {
+                    PostHogFlagFilterPropertyValue::Number(provided) => *provided,
+                    PostHogFlagFilterPropertyValue::String(provided) => {
+                        let Ok(provided) = provided.parse::<f64>() else {
+                            return Err(PostHogEvaluationError::Internal(format!(
+                                "Can not parse the left side of the condition as a number: {:?}",
+                                provided
+                            )));
+                        };
+                        provided
+                    }
+                    _ => {
+                        return Err(PostHogEvaluationError::Internal(format!(
+                            "The left side of the condition is not a number or a string: {:?}",
+                            provided
+                        )));
+                    }
+                };
+                match operator {
+                    "lt" => Ok(provided < requested),
+                    "gt" => Ok(provided > requested),
+                    op => Err(PostHogEvaluationError::Internal(format!(
+                        "Unsupported operator: {}",
+                        op
+                    ))),
+                }
+            }
+            _ => Err(PostHogEvaluationError::Internal(format!(
+                "Unsupported operator: {}",
+                operator
+            ))),
+        }
+    }
+
+    /// Evaluate a percentage.
+    fn evaluate_percentage(&self, mapped_user_id: f64, percentage: i64) -> bool {
+        mapped_user_id <= percentage as f64 / 100.0
+    }
+
+    /// Evaluate a filter group for a feature flag. Returns an error if there are errors during the evaluation.
+    ///
+    /// Return values:
+    /// Ok(GroupEvaluationResult::MatchedAndOverride(variant)): matched and evaluated to this value
+    /// Ok(GroupEvaluationResult::MatchedAndEvaluate): condition matched but no variant override, use the global rollout percentage
+    /// Ok(GroupEvaluationResult::Unmatched): condition unmatched
+    fn evaluate_group(
+        &self,
+        group: &LocalEvaluationFlagFilterGroup,
+        hash_on_group_rollout_percentage: f64,
+        provided_properties: &HashMap<String, PostHogFlagFilterPropertyValue>,
+    ) -> Result<GroupEvaluationResult, PostHogEvaluationError> {
+        if let Some(ref properties) = group.properties {
+            for property in properties {
+                if let Some(value) = provided_properties.get(&property.key) {
+                    // The user provided the property value
+                    if !self.evaluate_condition(
+                        property.operator.as_ref(),
+                        value,
+                        &property.value,
+                    )? {
+                        return Ok(GroupEvaluationResult::Unmatched);
+                    }
+                } else {
+                    // We cannot evaluate, the property is not available
+                    return Err(PostHogEvaluationError::NotAvailable(format!(
+                        "The required property in the condition is not available: {}",
+                        property.key
+                    )));
+                }
+            }
+        }
+
+        // The group has no condition matchers or we matched the properties
+        if self.evaluate_percentage(hash_on_group_rollout_percentage, group.rollout_percentage) {
+            if let Some(ref variant_override) = group.variant {
+                Ok(GroupEvaluationResult::MatchedAndOverride(
+                    variant_override.clone(),
+                ))
+            } else {
+                Ok(GroupEvaluationResult::MatchedAndEvaluate)
+            }
+        } else {
+            Ok(GroupEvaluationResult::Unmatched)
+        }
+    }
+
+    /// Evaluate a multivariate feature flag. Returns an error if the flag is not available or if there are errors
+    /// during the evaluation.
+    ///
+    /// The parsing logic is as follows:
+    ///
+    /// * Match each filter group.
+    ///   - If a group is matched, it will first determine whether the user is in the range of the group's rollout
+    ///     percentage. We will generate a consistent hash for the user ID on the group rollout percentage. This hash
+    ///     is shared across all groups.
+    ///   - If the hash falls within the group's rollout percentage, return the variant if it's overridden, or
+    ///   - Evaluate the variant using the global config and the global rollout percentage.
+    /// * Otherwise, continue with the next group until all groups are evaluated and no group is within the
+    ///   rollout percentage.
+    /// * If there are no matching groups, return an error.
+    ///
+    /// Example: we have a multivariate flag with 3 variants in the configured global rollout percentage: A (10%), B (20%), C (70%).
+    /// There is a single group with a condition that has a rollout percentage of 10% and it does not have a variant override.
+    /// Then, we will have 1% of the users evaluated to A, 2% to B, and 7% to C.
+    pub fn evaluate_multivariate(
+        &self,
+        flag_key: &str,
+        user_id: &str,
+    ) -> Result<String, PostHogEvaluationError> {
+        let hash_on_global_rollout_percentage =
+            Self::consistent_hash(user_id, flag_key, "multivariate");
+        let hash_on_group_rollout_percentage =
+            Self::consistent_hash(user_id, flag_key, "within_group");
+        self.evaluate_multivariate_inner(
+            flag_key,
+            hash_on_global_rollout_percentage,
+            hash_on_group_rollout_percentage,
+            &HashMap::new(),
+        )
+    }
+
+    /// Evaluate a multivariate feature flag. Note that we directly take the mapped user ID
+    /// (a consistent hash ranging from 0 to 1) so that it is easier to use it in the tests
+    /// and avoid duplicate computations.
+    ///
+    /// Use a different consistent hash for evaluating the group rollout percentage.
+    /// The behavior: if the condition is set to rolling out to 10% of the users, and
+    /// we set the variant A to 20% in the global config, then 2% of the total users will
+    /// be evaluated to variant A.
+    ///
+    /// Note that the hash to determine group rollout percentage is shared across all groups. So if we have two
+    /// exactly-the-same conditions with 10% and 20% rollout percentage respectively, a total of 20% of the users
+    /// will be evaluated (versus 30% if group evaluation is done independently).
+    pub(crate) fn evaluate_multivariate_inner(
+        &self,
+        flag_key: &str,
+        hash_on_global_rollout_percentage: f64,
+        hash_on_group_rollout_percentage: f64,
+        properties: &HashMap<String, PostHogFlagFilterPropertyValue>,
+    ) -> Result<String, PostHogEvaluationError> {
+        if let Some(flag_config) = self.flags.get(flag_key) {
+            if !flag_config.active {
+                return Err(PostHogEvaluationError::NotAvailable(format!(
+                    "The feature flag is not active: {}",
+                    flag_key
+                )));
+            }
+            // TODO: sort the groups so that variant overrides always get evaluated first and it follows the PostHog
+            // Python SDK behavior; for now we do not configure conditions without variant overrides in Neon so it
+            // does not matter.
+            for group in &flag_config.filters.groups {
+                match self.evaluate_group(group, hash_on_group_rollout_percentage, properties)? {
+                    GroupEvaluationResult::MatchedAndOverride(variant) => return Ok(variant),
+                    GroupEvaluationResult::MatchedAndEvaluate => {
+                        let mut percentage = 0;
+                        for variant in &flag_config.filters.multivariate.variants {
+                            percentage += variant.rollout_percentage;
+                            if self
+                                .evaluate_percentage(hash_on_global_rollout_percentage, percentage)
+                            {
+                                return Ok(variant.key.clone());
+                            }
+                        }
+                        // This should not happen because the rollout percentage always adds up to 100, but in case PostHog
+                        // returns an invalid spec, we return an error.
+                        return Err(PostHogEvaluationError::Internal(format!(
+                            "Rollout percentage does not add up to 100: {}",
+                            flag_key
+                        )));
+                    }
+                    GroupEvaluationResult::Unmatched => continue,
+                }
+            }
+            // If no group is matched, the feature is not available, and up to the caller to decide what to do.
+            Err(PostHogEvaluationError::NoConditionGroupMatched)
+        } else {
+            // The feature flag is not available yet
+            Err(PostHogEvaluationError::NotAvailable(format!(
+                "Not found in the local evaluation spec: {}",
+                flag_key
+            )))
+        }
+    }
+}
+
+/// A lite PostHog client.
+///
+/// At the point of writing this code, PostHog does not have a functional Rust client with feature flag support.
+/// This is a lite version that only supports local evaluation of feature flags and only supports those JSON specs
+/// that will be used within Neon.
+///
+/// PostHog is designed as a browser-server system: the browser (client) side uses the client key and is exposed
+/// to the end users; the server side uses a server key and is not exposed to the end users. The client and the
+/// server have different API keys and provide a different set of APIs. In Neon, we only have the server (that is
+/// pageserver), and it will use both the client API and the server API. So we need to store two API keys within
+/// our PostHog client.
+///
+/// The server API is used to fetch the feature flag specs. The client API is used to capture events in case we
+/// want to report the feature flag usage back to PostHog. The current plan is to use PostHog only as a UI to
+/// configure feature flags so it is very likely that the client API will not be used.
+pub struct PostHogClient {
+    /// The server API key.
+    server_api_key: String,
+    /// The client API key.
+    client_api_key: String,
+    /// The project ID.
+    project_id: String,
+    /// The private API URL.
+    private_api_url: String,
+    /// The public API URL.
+    public_api_url: String,
+    /// The HTTP client.
+    client: reqwest::Client,
+}
+
+impl PostHogClient {
+    pub fn new(
+        server_api_key: String,
+        client_api_key: String,
+        project_id: String,
+        private_api_url: String,
+        public_api_url: String,
+    ) -> Self {
+        let client = reqwest::Client::new();
+        Self {
+            server_api_key,
+            client_api_key,
+            project_id,
+            private_api_url,
+            public_api_url,
+            client,
+        }
+    }
+
+    pub fn new_with_us_region(
+        server_api_key: String,
+        client_api_key: String,
+        project_id: String,
+    ) -> Self {
+        Self::new(
+            server_api_key,
+            client_api_key,
+            project_id,
+            "https://us.posthog.com".to_string(),
+            "https://us.i.posthog.com".to_string(),
+        )
+    }
+
+    /// Fetch the feature flag specs from the server.
+    ///
+    /// This is unfortunately an undocumented API at:
+    /// - <https://posthog.com/docs/api/feature-flags#get-api-projects-project_id-feature_flags-local_evaluation>
+    /// - <https://posthog.com/docs/feature-flags/local-evaluation>
+    ///
+    /// The handling logic in [`FeatureStore`] mostly follows the Python API implementation.
+    /// See `_compute_flag_locally` in <https://github.com/PostHog/posthog-python/blob/master/posthog/client.py>
+    pub async fn get_feature_flags_local_evaluation(
+        &self,
+    ) -> anyhow::Result<LocalEvaluationResponse> {
+        // BASE_URL/api/projects/:project_id/feature_flags/local_evaluation
+        // with bearer token of self.server_api_key
+        let url = format!(
+            "{}/api/projects/{}/feature_flags/local_evaluation",
+            self.private_api_url, self.project_id
+        );
+        let response = self
+            .client
+            .get(url)
+            .bearer_auth(&self.server_api_key)
+            .send()
+            .await?;
+        let body = response.text().await?;
+        Ok(serde_json::from_str(&body)?)
+    }
+
+    /// Capture an event. This will only be used to report the feature flag usage back to PostHog, though
+    /// it also supports a lot of other functionalities.
+    ///
+    /// <https://posthog.com/docs/api/capture>
+    pub async fn capture_event(
+        &self,
+        event: &str,
+        distinct_id: &str,
+        properties: &HashMap<String, PostHogFlagFilterPropertyValue>,
+    ) -> anyhow::Result<()> {
+        // PUBLIC_URL/capture/
+        // with self.client_api_key sent as `api_key` in the JSON body (no bearer token)
+        let url = format!("{}/capture/", self.public_api_url);
+        self.client
+            .post(url)
+            .body(serde_json::to_string(&json!({
+                "api_key": self.client_api_key,
+                "distinct_id": distinct_id,
+                "event": event,
+                "properties": properties,
+            }))?)
+            .send()
+            .await?;
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn data() -> &'static str {
+        r#"{
+            "flags": [
+                {
+                    "id": 132794,
+                    "team_id": 152860,
+                    "name": "",
+                    "key": "gc-compaction",
+                    "filters": {
+                        "groups": [
+                            {
+                                "variant": "enabled-stage-2",
+                                "properties": [
+                                    {
+                                        "key": "plan_type",
+                                        "type": "person",
+                                        "value": [
+                                            "free"
+                                        ],
+                                        "operator": "exact"
+                                    },
+                                    {
+                                        "key": "pageserver_remote_size",
+                                        "type": "person",
+                                        "value": "10000000",
+                                        "operator": "lt"
+                                    }
+                                ],
+                                "rollout_percentage": 50
+                            },
+                            {
+                                "properties": [
+                                    {
+                                        "key": "plan_type",
+                                        "type": "person",
+                                        "value": [
+                                            "free"
+                                        ],
+                                        "operator": "exact"
+                                    },
+                                    {
+                                        "key": "pageserver_remote_size",
+                                        "type": "person",
+                                        "value": "10000000",
+                                        "operator": "lt"
+                                    }
+                                ],
+                                "rollout_percentage": 80
+                            }
+                        ],
+                        "payloads": {},
+                        "multivariate": {
+                            "variants": [
+                                {
+                                    "key": "disabled",
+                                    "name": "",
+                                    "rollout_percentage": 90
+                                },
+                                {
+                                    "key": "enabled-stage-1",
+                                    "name": "",
+                                    "rollout_percentage": 10
+                                },
+                                {
+                                    "key": "enabled-stage-2",
+                                    "name": "",
+                                    "rollout_percentage": 0
+                                },
+                                {
+                                    "key": "enabled-stage-3",
+                                    "name": "",
+                                    "rollout_percentage": 0
+                                },
+                                {
+                                    "key": "enabled",
+                                    "name": "",
+                                    "rollout_percentage": 0
+                                }
+                            ]
+                        }
+                    },
+                    "deleted": false,
+                    "active": true,
+                    "ensure_experience_continuity": false,
+                    "has_encrypted_payloads": false,
+                    "version": 6
+                }
+            ],
+            "group_type_mapping": {},
+            "cohorts": {}
+        }"#
+    }
+
+    #[test]
+    fn parse_local_evaluation() {
+        let data = data();
+        let _: LocalEvaluationResponse = serde_json::from_str(data).unwrap();
+    }
+
+    #[test]
+    fn evaluate_multivariate() {
+        let mut store = FeatureStore::new();
+        let response: LocalEvaluationResponse = serde_json::from_str(data()).unwrap();
+        store.set_flags(response.flags);
+
+        // This lacks the required properties and cannot be evaluated.
+        let variant =
+            store.evaluate_multivariate_inner("gc-compaction", 1.00, 0.40, &HashMap::new());
+        assert!(matches!(
+            variant,
+            Err(PostHogEvaluationError::NotAvailable(_))
+        ),);
+
+        let properties_unmatched = HashMap::from([
+            (
+                "plan_type".to_string(),
+                PostHogFlagFilterPropertyValue::String("paid".to_string()),
+            ),
+            (
+                "pageserver_remote_size".to_string(),
+                PostHogFlagFilterPropertyValue::Number(1000.0),
+            ),
+        ]);
+
+        // This does not match any group so there will be an error.
+        let variant =
+            store.evaluate_multivariate_inner("gc-compaction", 1.00, 0.40, &properties_unmatched);
+        assert!(matches!(
+            variant,
+            Err(PostHogEvaluationError::NoConditionGroupMatched)
+        ),);
+        let variant =
+            store.evaluate_multivariate_inner("gc-compaction", 0.80, 0.80, &properties_unmatched);
+        assert!(matches!(
+            variant,
+            Err(PostHogEvaluationError::NoConditionGroupMatched)
+        ),);
+
+        let properties = HashMap::from([
+            (
+                "plan_type".to_string(),
+                PostHogFlagFilterPropertyValue::String("free".to_string()),
+            ),
+            (
+                "pageserver_remote_size".to_string(),
+                PostHogFlagFilterPropertyValue::Number(1000.0),
+            ),
+        ]);
+
+        // It matches the first group as 0.10 <= 0.50 and the properties are matched. Then it gets evaluated to the variant override.
+        let variant = store.evaluate_multivariate_inner("gc-compaction", 0.10, 0.10, &properties);
+        assert_eq!(variant.unwrap(), "enabled-stage-2".to_string());
+
+        // It matches the second group as 0.50 <= 0.60 <= 0.80 and the properties are matched. Then it gets evaluated using the global percentage.
+        let variant = store.evaluate_multivariate_inner("gc-compaction", 0.99, 0.60, &properties);
+        assert_eq!(variant.unwrap(), "enabled-stage-1".to_string());
+        let variant = store.evaluate_multivariate_inner("gc-compaction", 0.80, 0.60, &properties);
+        assert_eq!(variant.unwrap(), "disabled".to_string());
+
+        // It matches the group conditions but not the group rollout percentage.
+        let variant = store.evaluate_multivariate_inner("gc-compaction", 1.00, 0.90, &properties);
+        assert!(matches!(
+            variant,
+            Err(PostHogEvaluationError::NoConditionGroupMatched)
+        ),);
+    }
+}
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index f3d8b951a8..fecf62f756 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -39,8 +39,10 @@ env_logger = { version = "0.11" }
 fail = { version = "0.5", default-features = false, features = ["failpoints"] }
 form_urlencoded = { version = "1" }
 futures-channel = { version = "0.3", features = ["sink"] }
+futures-core = { version = "0.3" }
 futures-executor = { version = "0.3" }
 futures-io = { version = "0.3" }
+futures-task = { version = "0.3", default-features = false, features = ["std"] }
 futures-util = { version = "0.3", features = ["channel", "io", "sink"] }
 generic-array = { version = "0.14", default-features = false, features = ["more_lengths", "zeroize"] }
 getrandom = { version = "0.2", default-features = false, features = ["std"] }
@@ -70,6 +72,7 @@ num-traits = { version = "0.2", features = ["i128", "libm"] }
 once_cell = { version = "1" }
 p256 = { version = "0.13", features = ["jwk"] }
 parquet = { version = "53", default-features = false, features = ["zstd"] }
+percent-encoding = { version = "2" }
 prost = { version = "0.13", features = ["no-recursion-limit", "prost-derive"] }
 rand = { version = "0.8", features = ["small_rng"] }
 regex = { version = "1" }

From cfbef4d586f96b9f5e0648d0a7ea04db54b86962 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Tue, 13 May 2025 14:02:25 +0100
Subject: [PATCH 34/65] safekeeper: downgrade stream from future WAL log
 (#11909)

## Problem

1. Safekeeper selection on the pageserver side isn't very dynamic. Once
you connect to one safekeeper, you'll use that one for as long as the
safekeeper keeps the connection alive. In principle, we could be more
eager, since the wal receiver connection can be cancelled but we don't
do that. We wait until the "session" is done and then we pick a new SK.
2. Picking a new SK is quite conservative. We will switch if:
a. We haven't received anything from the SK within the last 10 seconds
(wal_connect_timeout) or
b. The candidate SK is 1GiB ahead or
c. The candidate SK is in the same AZ as the PS or
d. There's a candidate that is ahead and we've not had any WAL within the
last 10 seconds (lagging_wal_timeout)

Hence, we can end up with pageservers that are requesting WAL which
their safekeeper hasn't seen yet.

## Summary of changes

Downgrade warning log to info.
---
 safekeeper/src/send_wal.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs
index 33e3d0485c..05f827494e 100644
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -513,7 +513,7 @@ impl SafekeeperPostgresHandler {
         let end_pos = end_watch.get();
 
         if end_pos < start_pos {
-            warn!(
+            info!(
                 "requested start_pos {} is ahead of available WAL end_pos {}",
                 start_pos, end_pos
             );

From 25ab16ee248e0873939569075b836f5d85d3d5f8 Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova <anastasia@neon.tech>
Date: Tue, 13 May 2025 14:30:09 +0100
Subject: [PATCH 35/65] chore(compute): Postgres 17.5, 16.9, 15.13 and 14.18
 (#11886)

Bump all minor versions.

The only conflict was in
src/backend/storage/smgr/smgr.c in v17,
where our smgr changes conflicted with

https://github.com/postgres/postgres/commit/ee578921b60ef9a14eaea54b608549e4f8b14f26
but it was trivial to resolve.
---
 vendor/postgres-v14   |  2 +-
 vendor/postgres-v15   |  2 +-
 vendor/postgres-v16   |  2 +-
 vendor/postgres-v17   |  2 +-
 vendor/revisions.json | 16 ++++++++--------
 5 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index 06b405bc98..ead1e76bdc 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit 06b405bc982fd53522689aa4acbfd9c44b7993cf
+Subproject commit ead1e76bdcb71ef87f52f0610bd7333247f75179
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 72f83df76c..052df87d33 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 72f83df76c61ce18d81bd371f0afd2a43d59c052
+Subproject commit 052df87d338dc30687d0c96f1a4d9b6cb4882b2e
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index d72d76f2cd..bb5eee65ac 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit d72d76f2cdee4194dd052ce099e9784aca7c794a
+Subproject commit bb5eee65ac753b5a66d255ec5fb4c0e33180e8fd
diff --git a/vendor/postgres-v17 b/vendor/postgres-v17
index 0d59c91c1a..e5374b7299 160000
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
@@ -1 +1 @@
-Subproject commit 0d59c91c1a23e667f1d1169d5f040b3fa0a0ab44
+Subproject commit e5374b72997b0afc8374137674e873f7a558120a
diff --git a/vendor/revisions.json b/vendor/revisions.json
index e76510f969..cf9f474e1a 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,18 +1,18 @@
 {
   "v17": [
-    "17.4",
-    "0d59c91c1a23e667f1d1169d5f040b3fa0a0ab44"
+    "17.5",
+    "e5374b72997b0afc8374137674e873f7a558120a"
   ],
   "v16": [
-    "16.8",
-    "d72d76f2cdee4194dd052ce099e9784aca7c794a"
+    "16.9",
+    "bb5eee65ac753b5a66d255ec5fb4c0e33180e8fd"
   ],
   "v15": [
-    "15.12",
-    "72f83df76c61ce18d81bd371f0afd2a43d59c052"
+    "15.13",
+    "052df87d338dc30687d0c96f1a4d9b6cb4882b2e"
   ],
   "v14": [
-    "14.17",
-    "06b405bc982fd53522689aa4acbfd9c44b7993cf"
+    "14.18",
+    "ead1e76bdcb71ef87f52f0610bd7333247f75179"
   ]
 }

From 290369061f22c18850e76355d2be885ee82d1302 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Tue, 13 May 2025 17:13:42 +0300
Subject: [PATCH 36/65] Check prefetch result in DEBUG_COMPARE_LOCAL mode
 (#11502)

## Problem

Prefetched and LFC results are not checked in DEBUG_COMPARE_LOCAL mode

## Summary of changes

Add checks for these results as well.

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 pgxn/neon/pagestore_smgr.c | 262 ++++++++++++++++---------------------
 1 file changed, 116 insertions(+), 146 deletions(-)

diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index f574517b2a..31e47db7d7 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -1281,75 +1281,24 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	communicator_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL);
 }
 
-#if PG_MAJORVERSION_NUM < 17
-/*
- *	neon_read() -- Read the specified block from a relation.
- */
-#if PG_MAJORVERSION_NUM < 16
-static void
-neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer)
-#else
-static void
-neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer)
-#endif
-{
-	neon_request_lsns request_lsns;
-	bits8		present;
-	void	   *bufferp;
-
-	switch (reln->smgr_relpersistence)
-	{
-		case 0:
-			neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence");
-
-		case RELPERSISTENCE_PERMANENT:
-			break;
-
-		case RELPERSISTENCE_TEMP:
-		case RELPERSISTENCE_UNLOGGED:
-			mdread(reln, forkNum, blkno, buffer);
-			return;
-
-		default:
-			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
-	}
-
-	/* Try to read PS results if they are available */
-	communicator_prefetch_pump_state();
-
-	neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1);
-
-	present = 0;
-	bufferp = buffer;
-	if (communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present))
-	{
-		/* Prefetch hit */
-		return;
-	}
-
-	/* Try to read from local file cache */
-	if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer))
-	{
-		MyNeonCounters->file_cache_hits_total++;
-		return;
-	}
-
-	neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer);
-
-	/*
-	 * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
-	 */
-	communicator_prefetch_pump_state();
-
 #ifdef DEBUG_COMPARE_LOCAL
+static void
+compare_with_local(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void* buffer, XLogRecPtr request_lsn)
+{
 	if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
 	{
 		char		pageserver_masked[BLCKSZ];
 		PGIOAlignedBlock mdbuf;
 		PGIOAlignedBlock mdbuf_masked;
-		XLogRecPtr  request_lsn = request_lsns.request_lsn;
 
+#if PG_MAJORVERSION_NUM >= 17
+		{
+			void* mdbuffers[1] = { mdbuf.data };
+			mdreadv(reln, forkNum, blkno, mdbuffers, 1);
+		}
+#else
 		mdread(reln, forkNum, blkno, mdbuf.data);
+#endif
 
 		memcpy(pageserver_masked, buffer, BLCKSZ);
 		memcpy(mdbuf_masked.data, mdbuf.data, BLCKSZ);
@@ -1413,11 +1362,105 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 			}
 		}
 	}
+}
+#endif
+
+
+#if PG_MAJORVERSION_NUM < 17
+
+/*
+ *	neon_read() -- Read the specified block from a relation.
+ */
+#if PG_MAJORVERSION_NUM < 16
+static void
+neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer)
+#else
+static void
+neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer)
+#endif
+{
+	neon_request_lsns request_lsns;
+	bits8		present;
+	void	   *bufferp;
+
+	switch (reln->smgr_relpersistence)
+	{
+		case 0:
+			neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence");
+
+		case RELPERSISTENCE_PERMANENT:
+			break;
+
+		case RELPERSISTENCE_TEMP:
+		case RELPERSISTENCE_UNLOGGED:
+			mdread(reln, forkNum, blkno, buffer);
+			return;
+
+		default:
+			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+	}
+
+	/* Try to read PS results if they are available */
+	communicator_prefetch_pump_state();
+
+	neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1);
+
+	present = 0;
+	bufferp = buffer;
+	if (communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present))
+	{
+		/* Prefetch hit */
+#ifdef DEBUG_COMPARE_LOCAL
+		compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
+#else
+		return;
+#endif
+	}
+
+	/* Try to read from local file cache */
+	if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer))
+	{
+		MyNeonCounters->file_cache_hits_total++;
+#ifdef DEBUG_COMPARE_LOCAL
+		compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
+#else
+		return;
+#endif
+	}
+
+	neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer);
+
+	/*
+	 * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
+	 */
+	communicator_prefetch_pump_state();
+
+#ifdef DEBUG_COMPARE_LOCAL
+	compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
 #endif
 }
 #endif /* PG_MAJORVERSION_NUM <= 16 */
 
 #if PG_MAJORVERSION_NUM >= 17
+
+#ifdef DEBUG_COMPARE_LOCAL
+static void
+compare_with_localv(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void** buffers, BlockNumber nblocks, neon_request_lsns* request_lsns, bits8* read_pages)
+{
+	if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
+	{
+		for (BlockNumber i = 0; i < nblocks; i++)
+		{
+			if (BITMAP_ISSET(read_pages, i))
+			{
+				compare_with_local(reln, forkNum, blkno + i, buffers[i], request_lsns[i].request_lsn);
+			}
+		}
+	}
+}
+#endif
+
+
 static void
 neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 		   void **buffers, BlockNumber nblocks)
@@ -1460,8 +1503,13 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 													blocknum, request_lsns, nblocks,
 													buffers, read_pages);
 
+#ifdef DEBUG_COMPARE_LOCAL
+	compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages);
+	memset(read_pages, 0, sizeof(read_pages));
+#else
 	if (prefetch_result == nblocks)
 		return;
+#endif
 
 	/* Try to read from local file cache */
 	lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers,
@@ -1470,9 +1518,14 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	if (lfc_result > 0)
 		MyNeonCounters->file_cache_hits_total += lfc_result;
 
+#ifdef DEBUG_COMPARE_LOCAL
+	compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages);
+	memset(read_pages, 0, sizeof(read_pages));
+#else
 	/* Read all blocks from LFC, so we're done */
 	if (prefetch_result + lfc_result == nblocks)
 		return;
+#endif
 
 	communicator_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns,
 							  buffers, nblocks, read_pages);
@@ -1483,91 +1536,8 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	communicator_prefetch_pump_state();
 
 #ifdef DEBUG_COMPARE_LOCAL
-	if (forknum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
-	{
-		char		pageserver_masked[BLCKSZ];
-		PGIOAlignedBlock mdbuf;
-		PGIOAlignedBlock mdbuf_masked;
-		XLogRecPtr  request_lsn = request_lsns->request_lsn;
-
-		for (int i = 0; i < nblocks; i++)
-		{
-			BlockNumber blkno = blocknum + i;
-			if (!BITMAP_ISSET(read_pages, i))
-				continue;
-
-#if PG_MAJORVERSION_NUM >= 17
-			{
-				void* mdbuffers[1] = { mdbuf.data };
-				mdreadv(reln, forknum, blkno, mdbuffers, 1);
-			}
-#else
-			mdread(reln, forknum, blkno, mdbuf.data);
-#endif
-
-			memcpy(pageserver_masked, buffers[i], BLCKSZ);
-			memcpy(mdbuf_masked.data, mdbuf.data, BLCKSZ);
-
-			if (PageIsNew((Page) mdbuf.data))
-			{
-				if (!PageIsNew((Page) pageserver_masked))
-				{
-					neon_log(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
-						 blkno,
-						 RelFileInfoFmt(InfoFromSMgrRel(reln)),
-						 forknum,
-						 (uint32) (request_lsn >> 32), (uint32) request_lsn,
-						 hexdump_page(buffers[i]));
-				}
-			}
-			else if (PageIsNew((Page) buffers[i]))
-			{
-				neon_log(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
-					 blkno,
-					 RelFileInfoFmt(InfoFromSMgrRel(reln)),
-					 forknum,
-					 (uint32) (request_lsn >> 32), (uint32) request_lsn,
-					 hexdump_page(mdbuf.data));
-			}
-			else if (PageGetSpecialSize(mdbuf.data) == 0)
-			{
-				/* assume heap */
-				RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked.data, blkno);
-				RmgrTable[RM_HEAP_ID].rm_mask(pageserver_masked, blkno);
-
-				if (memcmp(mdbuf_masked.data, pageserver_masked, BLCKSZ) != 0)
-				{
-					neon_log(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
-						 blkno,
-						 RelFileInfoFmt(InfoFromSMgrRel(reln)),
-						 forknum,
-						 (uint32) (request_lsn >> 32), (uint32) request_lsn,
-						 hexdump_page(mdbuf_masked.data),
-						 hexdump_page(pageserver_masked));
-				}
-			}
-			else if (PageGetSpecialSize(mdbuf.data) == MAXALIGN(sizeof(BTPageOpaqueData)))
-			{
-				if (((BTPageOpaqueData *) PageGetSpecialPointer(mdbuf.data))->btpo_cycleid < MAX_BT_CYCLE_ID)
-				{
-					/* assume btree */
-					RmgrTable[RM_BTREE_ID].rm_mask(mdbuf_masked.data, blkno);
-					RmgrTable[RM_BTREE_ID].rm_mask(pageserver_masked, blkno);
-	
-					if (memcmp(mdbuf_masked.data, pageserver_masked, BLCKSZ) != 0)
-					{
-						neon_log(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
-							 blkno,
-							 RelFileInfoFmt(InfoFromSMgrRel(reln)),
-							 forknum,
-							 (uint32) (request_lsn >> 32), (uint32) request_lsn,
-							 hexdump_page(mdbuf_masked.data),
-							 hexdump_page(pageserver_masked));
-					}
-				}
-			}
-		}
-	}
+	memset(read_pages, 0xFF, sizeof(read_pages));
+	compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages);
 #endif
 }
 #endif

From 234c882a0768876aa4616420af9a5fb132bb7b38 Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Tue, 13 May 2025 14:58:37 +0000
Subject: [PATCH 37/65] proxy: Expose handlers for cpu and heap profiling
 (#11912)

## Problem

It's difficult to understand where proxy spends most of its CPU time and memory.

## Summary of changes

Expose cpu and heap profiling handlers for continuous profiling.

neondatabase/cloud#22670
---
 proxy/src/bin/proxy.rs          | 4 ++++
 proxy/src/http/health_server.rs | 8 +++++++-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index 7d4b44841d..d60d32eb3b 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -1,6 +1,10 @@
 #[global_allocator]
 static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
 
+#[allow(non_upper_case_globals)]
+#[unsafe(export_name = "malloc_conf")]
+pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0";
+
 #[tokio::main]
 async fn main() -> anyhow::Result<()> {
     proxy::binary::proxy::run().await
diff --git a/proxy/src/http/health_server.rs b/proxy/src/http/health_server.rs
index 5278fe2a3e..b0b5a598d1 100644
--- a/proxy/src/http/health_server.rs
+++ b/proxy/src/http/health_server.rs
@@ -3,7 +3,7 @@ use std::net::TcpListener;
 use std::sync::{Arc, Mutex};
 
 use anyhow::{anyhow, bail};
-use http_utils::endpoint::{self, request_span};
+use http_utils::endpoint::{self, profile_cpu_handler, profile_heap_handler, request_span};
 use http_utils::error::ApiError;
 use http_utils::json::json_response;
 use http_utils::{RouterBuilder, RouterService};
@@ -33,6 +33,12 @@ fn make_router(metrics: AppMetrics) -> RouterBuilder<hyper0::Body, ApiError> {
             request_span(r, move |b| prometheus_metrics_handler(b, state))
         })
         .get("/v1/status", status_handler)
+        .get("/profile/cpu", move |r| {
+            request_span(r, profile_cpu_handler)
+        })
+        .get("/profile/heap", move |r| {
+            request_span(r, profile_heap_handler)
+        })
 }
 
 pub async fn task_main(

From 045ae13e060c3717c921097444d5c6b09925e87c Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Tue, 13 May 2025 18:49:49 +0100
Subject: [PATCH 38/65] pageserver: make imports work with tenant shut downs
 (#11855)

## Problem

The lifetime of imported timelines (and implicitly the import background
task) has some shortcomings:
1. Timeline activation upon import completion is tricky. Previously, a
timeline that finished importing after a tenant detach would not get
activated, and there are concerns about the safety of activating
concurrently with shutdown.
2. Import jobs can prevent tenant shutdown since they hold the tenant
gate.

## Summary of Changes

Track the import tasks in memory and abort them explicitly on tenant
shutdown.

Integrate more closely with the storage controller:
1. When an import task has finished all of its jobs, it notifies the
storage controller, but **does not** mark the import as done in the
index_part. When all shards have finished importing, the storage
controller will call the `/activate_post_import` idempotent endpoint for
all of them. The handler marks the import complete in index part,
resets the tenant if required and checks if the timeline is active yet.
2. Not directly related, but the import job now gets the starting state
from the storage controller instead of the import bucket. This paves the
way for progress checkpointing.

Related: https://github.com/neondatabase/neon/issues/11568
---
 pageserver/client/src/mgmt_api.rs             |  22 ++
 pageserver/src/controller_upcall_client.rs    |  40 +++
 pageserver/src/deletion_queue.rs              |   9 +
 pageserver/src/http/routes.rs                 | 105 ++++++
 pageserver/src/tenant.rs                      | 222 ++++++------
 .../src/tenant/remote_timeline_client.rs      |  29 ++
 .../src/tenant/timeline/import_pgdata.rs      | 284 +++++++--------
 .../src/tenant/timeline/import_pgdata/flow.rs |   4 +
 .../import_pgdata/importbucket_client.rs      |  25 --
 .../import_pgdata/importbucket_format.rs      |   6 -
 .../import_pgdata/index_part_format.rs        |   8 +
 storage_controller/src/http.rs                |  30 ++
 storage_controller/src/pageserver_client.rs   |  19 +
 storage_controller/src/persistence.rs         |  33 ++
 storage_controller/src/service.rs             | 328 +++++++++++-------
 .../src/service/safekeeper_service.rs         |   7 +-
 storage_controller/src/timeline_import.rs     |  22 +-
 test_runner/regress/test_import_pgdata.py     |  91 ++++-
 18 files changed, 859 insertions(+), 425 deletions(-)

diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
index 4a87a91910..219e63c9d4 100644
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -1,5 +1,6 @@
 use std::collections::HashMap;
 use std::error::Error as _;
+use std::time::Duration;
 
 use bytes::Bytes;
 use detach_ancestor::AncestorDetached;
@@ -819,4 +820,25 @@ impl Client {
             .await
             .map(|resp| resp.status())
     }
+
+    pub async fn activate_post_import(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        activate_timeline_timeout: Duration,
+    ) -> Result<TimelineInfo> {
+        let uri = format!(
+            "{}/v1/tenant/{}/timeline/{}/activate_post_import?timeline_activate_timeout_ms={}",
+            self.mgmt_api_endpoint,
+            tenant_shard_id,
+            timeline_id,
+            activate_timeline_timeout.as_millis()
+        );
+
+        self.request(Method::PUT, uri, ())
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
 }
diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs
index 468e5463b0..6d186b091a 100644
--- a/pageserver/src/controller_upcall_client.rs
+++ b/pageserver/src/controller_upcall_client.rs
@@ -53,6 +53,11 @@ pub trait StorageControllerUpcallApi {
         timeline_id: TimelineId,
         status: ShardImportStatus,
     ) -> impl Future<Output = Result<(), RetryForeverError>> + Send;
+    fn get_timeline_import_status(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+    ) -> impl Future<Output = Result<Option<ShardImportStatus>, RetryForeverError>> + Send;
 }
 
 impl StorageControllerUpcallClient {
@@ -302,4 +307,39 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
 
         self.retry_http_forever(&url, request).await
     }
+
+    #[tracing::instrument(skip_all)] // so that warning logs from retry_http_forever have context
+    async fn get_timeline_import_status(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+    ) -> Result<Option<ShardImportStatus>, RetryForeverError> {
+        let url = self
+            .base_url
+            .join(format!("timeline_import_status/{}/{}", tenant_shard_id, timeline_id).as_str())
+            .expect("Failed to build path");
+
+        Ok(backoff::retry(
+            || async {
+                let response = self.http_client.get(url.clone()).send().await?;
+
+                if let Err(err) = response.error_for_status_ref() {
+                    if matches!(err.status(), Some(reqwest::StatusCode::NOT_FOUND)) {
+                        return Ok(None);
+                    } else {
+                        return Err(err);
+                    }
+                }
+                response.json::<ShardImportStatus>().await.map(Some)
+            },
+            |_| false,
+            3,
+            u32::MAX,
+            "storage controller upcall",
+            &self.cancel,
+        )
+        .await
+        .ok_or(RetryForeverError::ShuttingDown)?
+        .expect("We retry forever, this should never be reached"))
+    }
 }
diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs
index 4d62bc4ab5..65b2de28cd 100644
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -663,6 +663,7 @@ mod test {
     use camino::Utf8Path;
     use hex_literal::hex;
     use pageserver_api::key::Key;
+    use pageserver_api::models::ShardImportStatus;
     use pageserver_api::shard::ShardIndex;
     use pageserver_api::upcall_api::ReAttachResponseTenant;
     use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
@@ -796,6 +797,14 @@ mod test {
         ) -> Result<(), RetryForeverError> {
             unimplemented!()
         }
+
+        async fn get_timeline_import_status(
+            &self,
+            _tenant_shard_id: TenantShardId,
+            _timeline_id: TimelineId,
+        ) -> Result<Option<ShardImportStatus>, RetryForeverError> {
+            unimplemented!()
+        }
     }
 
     async fn setup(test_name: &str) -> anyhow::Result<TestSetup> {
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 8b6500b020..2edec9dda1 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -3500,6 +3500,107 @@ async fn put_tenant_timeline_import_wal(
     }.instrument(span).await
 }
 
+/// Activate a timeline after its import has completed
+///
+/// The endpoint is idempotent and callers are expected to retry all
+/// errors until a successful response.
+async fn activate_post_import_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    const DEFAULT_ACTIVATE_TIMEOUT: Duration = Duration::from_secs(1);
+    let activate_timeout = parse_query_param(&request, "timeline_activate_timeout_ms")?
+        .map(Duration::from_millis)
+        .unwrap_or(DEFAULT_ACTIVATE_TIMEOUT);
+
+    let span = info_span!(
+        "activate_post_import_handler",
+        tenant_id=%tenant_shard_id.tenant_id,
+        timeline_id=%timeline_id,
+        shard_id=%tenant_shard_id.shard_slug()
+    );
+
+    async move {
+        let state = get_state(&request);
+        let tenant = state
+            .tenant_manager
+            .get_attached_tenant_shard(tenant_shard_id)?;
+
+        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
+
+        tenant
+            .finalize_importing_timeline(timeline_id)
+            .await
+            .map_err(ApiError::InternalServerError)?;
+
+        match tenant.get_timeline(timeline_id, false) {
+            Ok(_timeline) => {
+                // Timeline is already visible. Reset not required: fall through.
+            }
+            Err(GetTimelineError::NotFound { .. }) => {
+                // This is crude: we reset the whole tenant such that the new timeline is detected
+                // and activated. We can come up with something more granular in the future.
+                //
+                // Note that we only reset the tenant if required: when the timeline is
+                // not present in [`Tenant::timelines`].
+                let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
+                state
+                    .tenant_manager
+                    .reset_tenant(tenant_shard_id, false, &ctx)
+                    .await
+                    .map_err(ApiError::InternalServerError)?;
+            }
+            Err(GetTimelineError::ShuttingDown) => {
+                return Err(ApiError::ShuttingDown);
+            }
+            Err(GetTimelineError::NotActive { .. }) => {
+                unreachable!("Called get_timeline with active_only=false");
+            }
+        }
+
+        let timeline = tenant.get_timeline(timeline_id, false)?;
+
+        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn)
+            .with_scope_timeline(&timeline);
+
+        let result =
+            tokio::time::timeout(activate_timeout, timeline.wait_to_become_active(&ctx)).await;
+        match result {
+            Ok(Ok(())) => {
+                // fallthrough
+            }
+            // Timeline reached some other state that's not active
+            // TODO(vlad): if the tenant is broken, return a permananet error
+            Ok(Err(_timeline_state)) => {
+                return Err(ApiError::InternalServerError(anyhow::anyhow!(
+                    "Timeline activation failed"
+                )));
+            }
+            // Activation timed out
+            Err(_) => {
+                return Err(ApiError::Timeout("Timeline activation timed out".into()));
+            }
+        }
+
+        let timeline_info = build_timeline_info(
+            &timeline, false, // include_non_incremental_logical_size,
+            false, // force_await_initial_logical_size
+            &ctx,
+        )
+        .await
+        .context("get local timeline info")
+        .map_err(ApiError::InternalServerError)?;
+
+        json_response(StatusCode::OK, timeline_info)
+    }
+    .instrument(span)
+    .await
+}
+
 /// Read the end of a tar archive.
 ///
 /// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
@@ -3924,5 +4025,9 @@ pub fn make_router(
             "/v1/tenant/:tenant_id/timeline/:timeline_id/import_wal",
             |r| api_handler(r, put_tenant_timeline_import_wal),
         )
+        .put(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/activate_post_import",
+            |r| api_handler(r, activate_post_import_handler),
+        )
         .any(handler_404))
 }
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index e59db74479..441049f47d 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -50,6 +50,7 @@ use remote_timeline_client::{
 use secondary::heatmap::{HeatMapTenant, HeatMapTimeline};
 use storage_broker::BrokerClientChannel;
 use timeline::compaction::{CompactionOutcome, GcCompactionQueue};
+use timeline::import_pgdata::ImportingTimeline;
 use timeline::offload::{OffloadError, offload_timeline};
 use timeline::{
     CompactFlags, CompactOptions, CompactionError, PreviousHeatmap, ShutdownMode, import_pgdata,
@@ -284,6 +285,19 @@ pub struct TenantShard {
     /// **Lock order**: if acquiring all (or a subset), acquire them in order `timelines`, `timelines_offloaded`, `timelines_creating`
     timelines_offloaded: Mutex<HashMap<TimelineId, Arc<OffloadedTimeline>>>,
 
+    /// Tracks the timelines that are currently importing into this tenant shard.
+    ///
+    /// Note that importing timelines are also present in [`Self::timelines_creating`].
+    /// Keep this in mind when ordering lock acquisition.
+    ///
+    /// Lifetime:
+    /// * An imported timeline is created while scanning the bucket on tenant attach
+    ///   if the index part contains an `import_pgdata` entry and said field marks the import
+    ///   as in progress.
+    /// * Imported timelines are removed when the storage controller calls the post timeline
+    ///   import activation endpoint.
+    timelines_importing: std::sync::Mutex<HashMap<TimelineId, ImportingTimeline>>,
+
     /// The last tenant manifest known to be in remote storage. None if the manifest has not yet
     /// been either downloaded or uploaded. Always Some after tenant attach.
     ///
@@ -923,19 +937,10 @@ enum StartCreatingTimelineResult {
 
 #[allow(clippy::large_enum_variant, reason = "TODO")]
 enum TimelineInitAndSyncResult {
-    ReadyToActivate(Arc<Timeline>),
+    ReadyToActivate,
     NeedsSpawnImportPgdata(TimelineInitAndSyncNeedsSpawnImportPgdata),
 }
 
-impl TimelineInitAndSyncResult {
-    fn ready_to_activate(self) -> Option<Arc<Timeline>> {
-        match self {
-            Self::ReadyToActivate(timeline) => Some(timeline),
-            _ => None,
-        }
-    }
-}
-
 #[must_use]
 struct TimelineInitAndSyncNeedsSpawnImportPgdata {
     timeline: Arc<Timeline>,
@@ -1012,10 +1017,6 @@ enum CreateTimelineCause {
 enum LoadTimelineCause {
     Attach,
     Unoffload,
-    ImportPgdata {
-        create_guard: TimelineCreateGuard,
-        activate: ActivateTimelineArgs,
-    },
 }
 
 #[derive(thiserror::Error, Debug)]
@@ -1097,7 +1098,7 @@ impl TenantShard {
         self: &Arc<Self>,
         timeline_id: TimelineId,
         resources: TimelineResources,
-        mut index_part: IndexPart,
+        index_part: IndexPart,
         metadata: TimelineMetadata,
         previous_heatmap: Option<PreviousHeatmap>,
         ancestor: Option<Arc<Timeline>>,
@@ -1106,7 +1107,7 @@ impl TenantShard {
     ) -> anyhow::Result<TimelineInitAndSyncResult> {
         let tenant_id = self.tenant_shard_id;
 
-        let import_pgdata = index_part.import_pgdata.take();
+        let import_pgdata = index_part.import_pgdata.clone();
         let idempotency = match &import_pgdata {
             Some(import_pgdata) => {
                 CreateTimelineIdempotency::ImportPgdata(CreatingTimelineIdempotencyImportPgdata {
@@ -1127,7 +1128,7 @@ impl TenantShard {
             }
         };
 
-        let (timeline, timeline_ctx) = self.create_timeline_struct(
+        let (timeline, _timeline_ctx) = self.create_timeline_struct(
             timeline_id,
             &metadata,
             previous_heatmap,
@@ -1197,14 +1198,6 @@ impl TenantShard {
 
         match import_pgdata {
             Some(import_pgdata) if !import_pgdata.is_done() => {
-                match cause {
-                    LoadTimelineCause::Attach | LoadTimelineCause::Unoffload => (),
-                    LoadTimelineCause::ImportPgdata { .. } => {
-                        unreachable!(
-                            "ImportPgdata should not be reloading timeline import is done and persisted as such in s3"
-                        )
-                    }
-                }
                 let mut guard = self.timelines_creating.lock().unwrap();
                 if !guard.insert(timeline_id) {
                     // We should never try and load the same timeline twice during startup
@@ -1260,26 +1253,7 @@ impl TenantShard {
                     "Timeline has no ancestor and no layer files"
                 );
 
-                match cause {
-                    LoadTimelineCause::Attach | LoadTimelineCause::Unoffload => (),
-                    LoadTimelineCause::ImportPgdata {
-                        create_guard,
-                        activate,
-                    } => {
-                        // TODO: see the comment in the task code above how I'm not so certain
-                        // it is safe to activate here because of concurrent shutdowns.
-                        match activate {
-                            ActivateTimelineArgs::Yes { broker_client } => {
-                                info!("activating timeline after reload from pgdata import task");
-                                timeline.activate(self.clone(), broker_client, None, &timeline_ctx);
-                            }
-                            ActivateTimelineArgs::No => (),
-                        }
-                        drop(create_guard);
-                    }
-                }
-
-                Ok(TimelineInitAndSyncResult::ReadyToActivate(timeline))
+                Ok(TimelineInitAndSyncResult::ReadyToActivate)
             }
         }
     }
@@ -1768,7 +1742,7 @@ impl TenantShard {
                 })?;
 
             match effect {
-                TimelineInitAndSyncResult::ReadyToActivate(_) => {
+                TimelineInitAndSyncResult::ReadyToActivate => {
                     // activation happens later, on Tenant::activate
                 }
                 TimelineInitAndSyncResult::NeedsSpawnImportPgdata(
@@ -1778,13 +1752,24 @@ impl TenantShard {
                         guard,
                     },
                 ) => {
-                    tokio::task::spawn(self.clone().create_timeline_import_pgdata_task(
-                        timeline,
-                        import_pgdata,
-                        ActivateTimelineArgs::No,
-                        guard,
-                        ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Warn),
-                    ));
+                    let timeline_id = timeline.timeline_id;
+                    let import_task_handle =
+                        tokio::task::spawn(self.clone().create_timeline_import_pgdata_task(
+                            timeline.clone(),
+                            import_pgdata,
+                            guard,
+                            ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Warn),
+                        ));
+
+                    let prev = self.timelines_importing.lock().unwrap().insert(
+                        timeline_id,
+                        ImportingTimeline {
+                            timeline: timeline.clone(),
+                            import_task_handle,
+                        },
+                    );
+
+                    assert!(prev.is_none());
                 }
             }
         }
@@ -2678,14 +2663,7 @@ impl TenantShard {
                     .await?
             }
             CreateTimelineParams::ImportPgdata(params) => {
-                self.create_timeline_import_pgdata(
-                    params,
-                    ActivateTimelineArgs::Yes {
-                        broker_client: broker_client.clone(),
-                    },
-                    ctx,
-                )
-                .await?
+                self.create_timeline_import_pgdata(params, ctx).await?
             }
         };
 
@@ -2759,7 +2737,6 @@ impl TenantShard {
     async fn create_timeline_import_pgdata(
         self: &Arc<Self>,
         params: CreateTimelineParamsImportPgdata,
-        activate: ActivateTimelineArgs,
         ctx: &RequestContext,
     ) -> Result<CreateTimelineResult, CreateTimelineError> {
         let CreateTimelineParamsImportPgdata {
@@ -2840,24 +2817,71 @@ impl TenantShard {
 
         let (timeline, timeline_create_guard) = uninit_timeline.finish_creation_myself();
 
-        tokio::spawn(self.clone().create_timeline_import_pgdata_task(
+        let import_task_handle = tokio::spawn(self.clone().create_timeline_import_pgdata_task(
             timeline.clone(),
             index_part,
-            activate,
             timeline_create_guard,
             timeline_ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Warn),
         ));
 
+        let prev = self.timelines_importing.lock().unwrap().insert(
+            timeline.timeline_id,
+            ImportingTimeline {
+                timeline: timeline.clone(),
+                import_task_handle,
+            },
+        );
+
+        // Idempotency is enforced higher up the stack
+        assert!(prev.is_none());
+
         // NB: the timeline doesn't exist in self.timelines at this point
         Ok(CreateTimelineResult::ImportSpawned(timeline))
     }
 
+    /// Finalize the import of a timeline on this shard by marking it complete in
+    /// the index part. If the import task hasn't finished yet, returns an error.
+    ///
+    /// This method is idempotent. If the import was finalized once, the next call
+    /// will be a no-op.
+    pub(crate) async fn finalize_importing_timeline(
+        &self,
+        timeline_id: TimelineId,
+    ) -> anyhow::Result<()> {
+        let timeline = {
+            let locked = self.timelines_importing.lock().unwrap();
+            match locked.get(&timeline_id) {
+                Some(importing_timeline) => {
+                    if !importing_timeline.import_task_handle.is_finished() {
+                        return Err(anyhow::anyhow!("Import task not done yet"));
+                    }
+
+                    importing_timeline.timeline.clone()
+                }
+                None => {
+                    return Ok(());
+                }
+            }
+        };
+
+        timeline
+            .remote_client
+            .schedule_index_upload_for_import_pgdata_finalize()?;
+        timeline.remote_client.wait_completion().await?;
+
+        self.timelines_importing
+            .lock()
+            .unwrap()
+            .remove(&timeline_id);
+
+        Ok(())
+    }
+
     #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))]
     async fn create_timeline_import_pgdata_task(
         self: Arc<TenantShard>,
         timeline: Arc<Timeline>,
         index_part: import_pgdata::index_part_format::Root,
-        activate: ActivateTimelineArgs,
         timeline_create_guard: TimelineCreateGuard,
         ctx: RequestContext,
     ) {
@@ -2869,7 +2893,6 @@ impl TenantShard {
             .create_timeline_import_pgdata_task_impl(
                 timeline,
                 index_part,
-                activate,
                 timeline_create_guard,
                 ctx,
             )
@@ -2885,60 +2908,15 @@ impl TenantShard {
         self: Arc<TenantShard>,
         timeline: Arc<Timeline>,
         index_part: import_pgdata::index_part_format::Root,
-        activate: ActivateTimelineArgs,
-        timeline_create_guard: TimelineCreateGuard,
+        _timeline_create_guard: TimelineCreateGuard,
         ctx: RequestContext,
     ) -> Result<(), anyhow::Error> {
         info!("importing pgdata");
+        let ctx = ctx.with_scope_timeline(&timeline);
         import_pgdata::doit(&timeline, index_part, &ctx, self.cancel.clone())
             .await
             .context("import")?;
-        info!("import done");
-
-        //
-        // Reload timeline from remote.
-        // This proves that the remote state is attachable, and it reuses the code.
-        //
-        // TODO: think about whether this is safe to do with concurrent TenantShard::shutdown.
-        // timeline_create_guard hols the tenant gate open, so, shutdown cannot _complete_ until we exit.
-        // But our activate() call might launch new background tasks after TenantShard::shutdown
-        // already went past shutting down the TenantShard::timelines, which this timeline here is no part of.
-        // I think the same problem exists with the bootstrap & branch mgmt API tasks (tenant shutting
-        // down while bootstrapping/branching + activating), but, the race condition is much more likely
-        // to manifest because of the long runtime of this import task.
-
-        //        in theory this shouldn't even .await anything except for coop yield
-        info!("shutting down timeline");
-        timeline.shutdown(ShutdownMode::Hard).await;
-        info!("timeline shut down, reloading from remote");
-        // TODO: we can't do the following check because create_timeline_import_pgdata must return an Arc<Timeline>
-        // let Some(timeline) = Arc::into_inner(timeline) else {
-        //     anyhow::bail!("implementation error: timeline that we shut down was still referenced from somewhere");
-        // };
-        let timeline_id = timeline.timeline_id;
-
-        // load from object storage like TenantShard::attach does
-        let resources = self.build_timeline_resources(timeline_id);
-        let index_part = resources
-            .remote_client
-            .download_index_file(&self.cancel)
-            .await?;
-        let index_part = match index_part {
-            MaybeDeletedIndexPart::Deleted(_) => {
-                // likely concurrent delete call, cplane should prevent this
-                anyhow::bail!(
-                    "index part says deleted but we are not done creating yet, this should not happen but"
-                )
-            }
-            MaybeDeletedIndexPart::IndexPart(p) => p,
-        };
-        let metadata = index_part.metadata.clone();
-        self
-            .load_remote_timeline(timeline_id, index_part, metadata, None, resources, LoadTimelineCause::ImportPgdata{
-                create_guard: timeline_create_guard, activate, }, &ctx)
-            .await?
-            .ready_to_activate()
-            .context("implementation error: reloaded timeline still needs import after import reported success")?;
+        info!("import done - waiting for activation");
 
         anyhow::Ok(())
     }
@@ -3475,6 +3453,14 @@ impl TenantShard {
                 timeline.defuse_for_tenant_drop();
             });
         }
+        {
+            let mut timelines_importing = self.timelines_importing.lock().unwrap();
+            timelines_importing
+                .drain()
+                .for_each(|(_timeline_id, importing_timeline)| {
+                    importing_timeline.shutdown();
+                });
+        }
         // test_long_timeline_create_then_tenant_delete is leaning on this message
         tracing::info!("Waiting for timelines...");
         while let Some(res) = js.join_next().await {
@@ -3949,13 +3935,6 @@ where
     Ok(result)
 }
 
-enum ActivateTimelineArgs {
-    Yes {
-        broker_client: storage_broker::BrokerClientChannel,
-    },
-    No,
-}
-
 impl TenantShard {
     pub fn tenant_specific_overrides(&self) -> pageserver_api::models::TenantConfig {
         self.tenant_conf.load().tenant_conf.clone()
@@ -4322,6 +4301,7 @@ impl TenantShard {
             timelines: Mutex::new(HashMap::new()),
             timelines_creating: Mutex::new(HashSet::new()),
             timelines_offloaded: Mutex::new(HashMap::new()),
+            timelines_importing: Mutex::new(HashMap::new()),
             remote_tenant_manifest: Default::default(),
             gc_cs: tokio::sync::Mutex::new(()),
             walredo_mgr,
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index ea29f51956..21d68495f7 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -949,6 +949,35 @@ impl RemoteTimelineClient {
         Ok(())
     }
 
+    /// If the `import_pgdata` field marks the timeline as having an import in progress,
+    /// launch an index-file upload operation that transitions it to done in the background
+    pub(crate) fn schedule_index_upload_for_import_pgdata_finalize(
+        self: &Arc<Self>,
+    ) -> anyhow::Result<()> {
+        use import_pgdata::index_part_format;
+
+        let mut guard = self.upload_queue.lock().unwrap();
+        let upload_queue = guard.initialized_mut()?;
+        let to_update = match &upload_queue.dirty.import_pgdata {
+            Some(import) if !import.is_done() => Some(import),
+            Some(_) | None => None,
+        };
+
+        if let Some(old) = to_update {
+            let new =
+                index_part_format::Root::V1(index_part_format::V1::Done(index_part_format::Done {
+                    idempotency_key: old.idempotency_key().clone(),
+                    started_at: *old.started_at(),
+                    finished_at: chrono::Utc::now().naive_utc(),
+                }));
+
+            upload_queue.dirty.import_pgdata = Some(new);
+            self.schedule_index_upload(upload_queue);
+        }
+
+        Ok(())
+    }
+
     /// Launch an index-file upload operation in the background, setting `gc_compaction_state` field.
     pub(crate) fn schedule_index_upload_for_gc_compaction_state_update(
         self: &Arc<Self>,
diff --git a/pageserver/src/tenant/timeline/import_pgdata.rs b/pageserver/src/tenant/timeline/import_pgdata.rs
index c4a8df39a3..53e15e5395 100644
--- a/pageserver/src/tenant/timeline/import_pgdata.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata.rs
@@ -3,6 +3,7 @@ use std::sync::Arc;
 use anyhow::{Context, bail};
 use pageserver_api::models::ShardImportStatus;
 use remote_storage::RemotePath;
+use tokio::task::JoinHandle;
 use tokio_util::sync::CancellationToken;
 use tracing::info;
 use utils::lsn::Lsn;
@@ -17,6 +18,17 @@ mod importbucket_client;
 mod importbucket_format;
 pub(crate) mod index_part_format;
 
+pub(crate) struct ImportingTimeline {
+    pub import_task_handle: JoinHandle<()>,
+    pub timeline: Arc<Timeline>,
+}
+
+impl ImportingTimeline {
+    pub(crate) fn shutdown(self) {
+        self.import_task_handle.abort();
+    }
+}
+
 pub async fn doit(
     timeline: &Arc<Timeline>,
     index_part: index_part_format::Root,
@@ -26,173 +38,161 @@ pub async fn doit(
     let index_part_format::Root::V1(v1) = index_part;
     let index_part_format::InProgress {
         location,
-        idempotency_key,
-        started_at,
+        idempotency_key: _,
+        started_at: _,
     } = match v1 {
         index_part_format::V1::Done(_) => return Ok(()),
         index_part_format::V1::InProgress(in_progress) => in_progress,
     };
 
-    let storage = importbucket_client::new(timeline.conf, &location, cancel.clone()).await?;
+    let storcon_client = StorageControllerUpcallClient::new(timeline.conf, &cancel);
 
-    let status_prefix = RemotePath::from_string("status").unwrap();
+    let shard_status = storcon_client
+        .get_timeline_import_status(timeline.tenant_shard_id, timeline.timeline_id)
+        .await
+        .map_err(|_err| anyhow::anyhow!("Shut down while getting timeline import status"))?;
 
-    //
-    // See if shard is done.
-    // TODO: incorporate generations into status key for split brain safety. Figure out together with checkpointing.
-    //
-    let shard_status_key =
-        status_prefix.join(format!("shard-{}", timeline.tenant_shard_id.shard_slug()));
-    let shard_status: Option<importbucket_format::ShardStatus> =
-        storage.get_json(&shard_status_key).await?;
     info!(?shard_status, "peeking shard status");
-    if shard_status.map(|st| st.done).unwrap_or(false) {
-        info!("shard status indicates that the shard is done, skipping import");
-    } else {
-        // TODO: checkpoint the progress into the IndexPart instead of restarting
-        // from the beginning.
+    match shard_status {
+        None | Some(ShardImportStatus::InProgress) => {
+            // TODO: checkpoint the progress into the IndexPart instead of restarting
+            // from the beginning.
 
-        //
-        // Wipe the slate clean - the flow does not allow resuming.
-        // We can implement resuming in the future by checkpointing the progress into the IndexPart.
-        //
-        info!("wipe the slate clean");
-        {
-            // TODO: do we need to hold GC lock for this?
-            let mut guard = timeline.layers.write().await;
-            assert!(
-                guard.layer_map()?.open_layer.is_none(),
-                "while importing, there should be no in-memory layer" // this just seems like a good place to assert it
-            );
-            let all_layers_keys = guard.all_persistent_layers();
-            let all_layers: Vec<_> = all_layers_keys
-                .iter()
-                .map(|key| guard.get_from_key(key))
-                .collect();
-            let open = guard.open_mut().context("open_mut")?;
+            //
+            // Wipe the slate clean - the flow does not allow resuming.
+            // We can implement resuming in the future by checkpointing the progress into the IndexPart.
+            //
+            info!("wipe the slate clean");
+            {
+                // TODO: do we need to hold GC lock for this?
+                let mut guard = timeline.layers.write().await;
+                assert!(
+                    guard.layer_map()?.open_layer.is_none(),
+                    "while importing, there should be no in-memory layer" // this just seems like a good place to assert it
+                );
+                let all_layers_keys = guard.all_persistent_layers();
+                let all_layers: Vec<_> = all_layers_keys
+                    .iter()
+                    .map(|key| guard.get_from_key(key))
+                    .collect();
+                let open = guard.open_mut().context("open_mut")?;
 
-            timeline.remote_client.schedule_gc_update(&all_layers)?;
-            open.finish_gc_timeline(&all_layers);
-        }
-
-        //
-        // Wait for pgdata to finish uploading
-        //
-        info!("wait for pgdata to reach status 'done'");
-        let pgdata_status_key = status_prefix.join("pgdata");
-        loop {
-            let res = async {
-                let pgdata_status: Option<importbucket_format::PgdataStatus> = storage
-                    .get_json(&pgdata_status_key)
-                    .await
-                    .context("get pgdata status")?;
-                info!(?pgdata_status, "peeking pgdata status");
-                if pgdata_status.map(|st| st.done).unwrap_or(false) {
-                    Ok(())
-                } else {
-                    Err(anyhow::anyhow!("pgdata not done yet"))
-                }
+                timeline.remote_client.schedule_gc_update(&all_layers)?;
+                open.finish_gc_timeline(&all_layers);
             }
-            .await;
-            match res {
-                Ok(_) => break,
-                Err(err) => {
-                    info!(?err, "indefinitely waiting for pgdata to finish");
-                    if tokio::time::timeout(std::time::Duration::from_secs(10), cancel.cancelled())
+
+            //
+            // Wait for pgdata to finish uploading
+            //
+            info!("wait for pgdata to reach status 'done'");
+            let storage =
+                importbucket_client::new(timeline.conf, &location, cancel.clone()).await?;
+            let status_prefix = RemotePath::from_string("status").unwrap();
+            let pgdata_status_key = status_prefix.join("pgdata");
+            loop {
+                let res = async {
+                    let pgdata_status: Option<importbucket_format::PgdataStatus> = storage
+                        .get_json(&pgdata_status_key)
+                        .await
+                        .context("get pgdata status")?;
+                    info!(?pgdata_status, "peeking pgdata status");
+                    if pgdata_status.map(|st| st.done).unwrap_or(false) {
+                        Ok(())
+                    } else {
+                        Err(anyhow::anyhow!("pgdata not done yet"))
+                    }
+                }
+                .await;
+                match res {
+                    Ok(_) => break,
+                    Err(err) => {
+                        info!(?err, "indefinitely waiting for pgdata to finish");
+                        if tokio::time::timeout(
+                            std::time::Duration::from_secs(10),
+                            cancel.cancelled(),
+                        )
                         .await
                         .is_ok()
-                    {
-                        bail!("cancelled while waiting for pgdata");
+                        {
+                            bail!("cancelled while waiting for pgdata");
+                        }
                     }
                 }
             }
-        }
 
-        //
-        // Do the import
-        //
-        info!("do the import");
-        let control_file = storage.get_control_file().await?;
-        let base_lsn = control_file.base_lsn();
+            //
+            // Do the import
+            //
+            info!("do the import");
+            let control_file = storage.get_control_file().await?;
+            let base_lsn = control_file.base_lsn();
 
-        info!("update TimelineMetadata based on LSNs from control file");
-        {
-            let pg_version = control_file.pg_version();
-            let _ctx: &RequestContext = ctx;
-            async move {
-                // FIXME: The 'disk_consistent_lsn' should be the LSN at the *end* of the
-                // checkpoint record, and prev_record_lsn should point to its beginning.
-                // We should read the real end of the record from the WAL, but here we
-                // just fake it.
-                let disk_consistent_lsn = Lsn(base_lsn.0 + 8);
-                let prev_record_lsn = base_lsn;
-                let metadata = TimelineMetadata::new(
-                    disk_consistent_lsn,
-                    Some(prev_record_lsn),
-                    None,     // no ancestor
-                    Lsn(0),   // no ancestor lsn
-                    base_lsn, // latest_gc_cutoff_lsn
-                    base_lsn, // initdb_lsn
-                    pg_version,
-                );
+            info!("update TimelineMetadata based on LSNs from control file");
+            {
+                let pg_version = control_file.pg_version();
+                let _ctx: &RequestContext = ctx;
+                async move {
+                    // FIXME: The 'disk_consistent_lsn' should be the LSN at the *end* of the
+                    // checkpoint record, and prev_record_lsn should point to its beginning.
+                    // We should read the real end of the record from the WAL, but here we
+                    // just fake it.
+                    let disk_consistent_lsn = Lsn(base_lsn.0 + 8);
+                    let prev_record_lsn = base_lsn;
+                    let metadata = TimelineMetadata::new(
+                        disk_consistent_lsn,
+                        Some(prev_record_lsn),
+                        None,     // no ancestor
+                        Lsn(0),   // no ancestor lsn
+                        base_lsn, // latest_gc_cutoff_lsn
+                        base_lsn, // initdb_lsn
+                        pg_version,
+                    );
 
-                let _start_lsn = disk_consistent_lsn + 1;
+                    let _start_lsn = disk_consistent_lsn + 1;
 
-                timeline
-                    .remote_client
-                    .schedule_index_upload_for_full_metadata_update(&metadata)?;
+                    timeline
+                        .remote_client
+                        .schedule_index_upload_for_full_metadata_update(&metadata)?;
 
-                timeline.remote_client.wait_completion().await?;
+                    timeline.remote_client.wait_completion().await?;
 
-                anyhow::Ok(())
+                    anyhow::Ok(())
+                }
             }
+            .await?;
+
+            flow::run(timeline.clone(), control_file, storage.clone(), ctx).await?;
+
+            // Communicate that shard is done.
+            // Ensure at-least-once delivery of the upcall to storage controller
+            // before we mark the task as done and never come here again.
+            //
+            // Note that we do not mark the import complete in the index part now.
+            // This happens in [`TenantShard::finalize_importing_timeline`] in response
+            // to the storage controller calling
+            // `/v1/tenant/:tenant_id/timeline/:timeline_id/activate_post_import`.
+            storcon_client
+                .put_timeline_import_status(
+                    timeline.tenant_shard_id,
+                    timeline.timeline_id,
+                    // TODO(vlad): What about import errors?
+                    ShardImportStatus::Done,
+                )
+                .await
+                .map_err(|_err| {
+                    anyhow::anyhow!("Shut down while putting timeline import status")
+                })?;
+        }
+        Some(ShardImportStatus::Error(err)) => {
+            info!(
+                "shard status indicates that the shard is done (error), skipping import {}",
+                err
+            );
+        }
+        Some(ShardImportStatus::Done) => {
+            info!("shard status indicates that the shard is done (success), skipping import");
         }
-        .await?;
-
-        flow::run(timeline.clone(), control_file, storage.clone(), ctx).await?;
-
-        //
-        // Communicate that shard is done.
-        // Ensure at-least-once delivery of the upcall to storage controller
-        // before we mark the task as done and never come here again.
-        //
-        let storcon_client = StorageControllerUpcallClient::new(timeline.conf, &cancel);
-        storcon_client
-            .put_timeline_import_status(
-                timeline.tenant_shard_id,
-                timeline.timeline_id,
-                // TODO(vlad): What about import errors?
-                ShardImportStatus::Done,
-            )
-            .await
-            .map_err(|_err| anyhow::anyhow!("Shut down while putting timeline import status"))?;
-
-        storage
-            .put_json(
-                &shard_status_key,
-                &importbucket_format::ShardStatus { done: true },
-            )
-            .await
-            .context("put shard status")?;
     }
 
-    //
-    // Mark as done in index_part.
-    // This makes subsequent timeline loads enter the normal load code path
-    // instead of spawning the import task and calling this here function.
-    //
-    info!("mark import as complete in index part");
-    timeline
-        .remote_client
-        .schedule_index_upload_for_import_pgdata_state_update(Some(index_part_format::Root::V1(
-            index_part_format::V1::Done(index_part_format::Done {
-                idempotency_key,
-                started_at,
-                finished_at: chrono::Utc::now().naive_utc(),
-            }),
-        )))?;
-
-    timeline.remote_client.wait_completion().await?;
-
     Ok(())
 }
diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs
index 34c073365d..5b9c8ec5b5 100644
--- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs
@@ -53,6 +53,7 @@ use tokio_stream::StreamExt;
 use tracing::{debug, instrument};
 use utils::bin_ser::BeSer;
 use utils::lsn::Lsn;
+use utils::pausable_failpoint;
 
 use super::Timeline;
 use super::importbucket_client::{ControlFile, RemoteStorageWrapper};
@@ -79,6 +80,9 @@ pub async fn run(
 
     let import_config = &timeline.conf.timeline_import_config;
     let plan = planner.plan(import_config).await?;
+
+    pausable_failpoint!("import-timeline-pre-execute-pausable");
+
     plan.execute(timeline, import_config, ctx).await
 }
 
diff --git a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs
index e7aa8f6038..34313748b7 100644
--- a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs
@@ -190,31 +190,6 @@ impl RemoteStorageWrapper {
         Ok(Some(res))
     }
 
-    #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))]
-    pub async fn put_json<T>(&self, path: &RemotePath, value: &T) -> anyhow::Result<()>
-    where
-        T: serde::Serialize,
-    {
-        let buf = serde_json::to_vec(value)?;
-        let bytes = Bytes::from(buf);
-        utils::backoff::retry(
-            || async {
-                let size = bytes.len();
-                let bytes = futures::stream::once(futures::future::ready(Ok(bytes.clone())));
-                self.storage
-                    .upload_storage_object(bytes, size, path, &self.cancel)
-                    .await
-            },
-            remote_storage::TimeoutOrCancel::caused_by_cancel,
-            1,
-            u32::MAX,
-            &format!("put json {path}"),
-            &self.cancel,
-        )
-        .await
-        .expect("practically infinite retries")
-    }
-
     #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))]
     pub async fn get_range(
         &self,
diff --git a/pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs b/pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs
index 57c647cc7f..d9f4da4748 100644
--- a/pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs
@@ -5,9 +5,3 @@ pub struct PgdataStatus {
     pub done: bool,
     // TODO: remaining fields
 }
-
-#[derive(Deserialize, Serialize, Debug, Clone, PartialEq, Eq)]
-pub struct ShardStatus {
-    pub done: bool,
-    // TODO: remaining fields
-}
diff --git a/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs b/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs
index ea7a41b25f..371fc857dc 100644
--- a/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs
@@ -64,4 +64,12 @@ impl Root {
             },
         }
     }
+    pub fn started_at(&self) -> &chrono::NaiveDateTime {
+        match self {
+            Root::V1(v1) => match v1 {
+                V1::InProgress(in_progress) => &in_progress.started_at,
+                V1::Done(done) => &done.started_at,
+            },
+        }
+    }
 }
diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index 649113b8ce..8d459cab9c 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -157,6 +157,29 @@ async fn handle_validate(req: Request<Body>) -> Result<Response<Body>, ApiError>
     json_response(StatusCode::OK, state.service.validate(validate_req).await?)
 }
 
+async fn handle_get_timeline_import_status(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::GenerationsApi)?;
+
+    let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
+    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
+    let state = get_state(&req);
+    json_response(
+        StatusCode::OK,
+        state
+            .service
+            .handle_timeline_shard_import_progress(tenant_shard_id, timeline_id)
+            .await?,
+    )
+}
+
 async fn handle_put_timeline_import_status(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::GenerationsApi)?;
 
@@ -2008,6 +2031,13 @@ pub fn make_router(
         .post("/upcall/v1/validate", |r| {
             named_request_span(r, handle_validate, RequestName("upcall_v1_validate"))
         })
+        .get("/upcall/v1/timeline_import_status", |r| {
+            named_request_span(
+                r,
+                handle_get_timeline_import_status,
+                RequestName("upcall_v1_timeline_import_status"),
+            )
+        })
         .post("/upcall/v1/timeline_import_status", |r| {
             named_request_span(
                 r,
diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs
index 554ca375f5..817409e112 100644
--- a/storage_controller/src/pageserver_client.rs
+++ b/storage_controller/src/pageserver_client.rs
@@ -1,3 +1,5 @@
+use std::time::Duration;
+
 use pageserver_api::models::detach_ancestor::AncestorDetached;
 use pageserver_api::models::{
     DetachBehavior, LocationConfig, LocationConfigListResponse, LsnLease, PageserverUtilization,
@@ -212,6 +214,7 @@ impl PageserverClient {
         )
     }
 
+    #[allow(unused)]
     pub(crate) async fn timeline_detail(
         &self,
         tenant_shard_id: TenantShardId,
@@ -357,4 +360,20 @@ impl PageserverClient {
             self.inner.wait_lsn(tenant_shard_id, request).await
         )
     }
+
+    pub(crate) async fn activate_post_import(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        timeline_activate_timeout: Duration,
+    ) -> Result<TimelineInfo> {
+        measured_request!(
+            "activate_post_import",
+            crate::metrics::Method::Put,
+            &self.node_id_label,
+            self.inner
+                .activate_post_import(tenant_shard_id, timeline_id, timeline_activate_timeout)
+                .await
+        )
+    }
 }
diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs
index 9ffcf9b9e6..052c0f02eb 100644
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -1666,6 +1666,39 @@ impl Persistence {
         }
     }
 
+    pub(crate) async fn get_timeline_import(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> DatabaseResult<Option<TimelineImport>> {
+        use crate::schema::timeline_imports::dsl;
+        let persistent_import = self
+            .with_measured_conn(DatabaseOperation::ListTimelineImports, move |conn| {
+                Box::pin(async move {
+                    let mut from_db: Vec<TimelineImportPersistence> = dsl::timeline_imports
+                        .filter(dsl::tenant_id.eq(tenant_id.to_string()))
+                        .filter(dsl::timeline_id.eq(timeline_id.to_string()))
+                        .load(conn)
+                        .await?;
+
+                    if from_db.len() > 1 {
+                        return Err(DatabaseError::Logical(format!(
+                            "unexpected number of rows ({})",
+                            from_db.len()
+                        )));
+                    }
+
+                    Ok(from_db.pop())
+                })
+            })
+            .await?;
+
+        persistent_import
+            .map(TimelineImport::from_persistent)
+            .transpose()
+            .map_err(|err| DatabaseError::Logical(format!("failed to deserialize import: {err}")))
+    }
+
     pub(crate) async fn delete_timeline_import(
         &self,
         tenant_id: TenantId,
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 193050460d..05430733c2 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -35,12 +35,12 @@ use pageserver_api::controller_api::{
 };
 use pageserver_api::models::{
     self, DetachBehavior, LocationConfig, LocationConfigListResponse, LocationConfigMode, LsnLease,
-    PageserverUtilization, SecondaryProgress, ShardParameters, TenantConfig,
+    PageserverUtilization, SecondaryProgress, ShardImportStatus, ShardParameters, TenantConfig,
     TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest,
     TenantLocationConfigResponse, TenantShardLocation, TenantShardSplitRequest,
     TenantShardSplitResponse, TenantSorting, TenantTimeTravelRequest,
     TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineCreateResponseStorcon,
-    TimelineInfo, TimelineState, TopTenantShardItem, TopTenantShardsRequest,
+    TimelineInfo, TopTenantShardItem, TopTenantShardsRequest,
 };
 use pageserver_api::shard::{
     DEFAULT_STRIPE_SIZE, ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId,
@@ -61,6 +61,7 @@ use utils::completion::Barrier;
 use utils::generation::Generation;
 use utils::id::{NodeId, TenantId, TimelineId};
 use utils::lsn::Lsn;
+use utils::shard::ShardIndex;
 use utils::sync::gate::{Gate, GateGuard};
 use utils::{failpoint_support, pausable_failpoint};
 
@@ -98,7 +99,8 @@ use crate::tenant_shard::{
     ScheduleOptimization, ScheduleOptimizationAction, TenantShard,
 };
 use crate::timeline_import::{
-    ShardImportStatuses, TimelineImport, TimelineImportState, UpcallClient,
+    ImportResult, ShardImportStatuses, TimelineImport, TimelineImportFinalizeError,
+    TimelineImportState, UpcallClient,
 };
 
 const WAITER_FILL_DRAIN_POLL_TIMEOUT: Duration = Duration::from_millis(500);
@@ -3905,6 +3907,38 @@ impl Service {
         })
     }
 
+    pub(crate) async fn handle_timeline_shard_import_progress(
+        self: &Arc<Self>,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+    ) -> Result<ShardImportStatus, ApiError> {
+        let maybe_import = self
+            .persistence
+            .get_timeline_import(tenant_shard_id.tenant_id, timeline_id)
+            .await?;
+
+        let import = maybe_import.ok_or_else(|| {
+            ApiError::NotFound(
+                format!(
+                    "import for {}/{} not found",
+                    tenant_shard_id.tenant_id, timeline_id
+                )
+                .into(),
+            )
+        })?;
+
+        import
+            .shard_statuses
+            .0
+            .get(&tenant_shard_id.to_index())
+            .cloned()
+            .ok_or_else(|| {
+                ApiError::NotFound(
+                    format!("shard {} not found", tenant_shard_id.shard_slug()).into(),
+                )
+            })
+    }
+
     pub(crate) async fn handle_timeline_shard_import_progress_upcall(
         self: &Arc<Self>,
         req: PutTimelineImportStatusRequest,
@@ -3943,6 +3977,16 @@ impl Service {
         Ok(())
     }
 
+    /// Finalize the import of a timeline
+    ///
+    /// This method should be called once all shards have reported that the import is complete.
+    /// Firstly, it polls the post import timeline activation endpoint exposed by the pageserver.
+    /// Once the timeline is active on all shards, the timeline also gets created on the
+    /// safekeepers. Finally, notify cplane of the import completion (whether failed or
+    /// successful), and remove the import from the database and in-memory.
+    ///
+    /// If this method gets pre-empted by shut down, it will be called again at start-up (on-going
+    /// imports are stored in the database).
     #[instrument(skip_all, fields(
         tenant_id=%import.tenant_id,
         shard_id=%import.timeline_id,
@@ -3950,59 +3994,80 @@ impl Service {
     async fn finalize_timeline_import(
         self: &Arc<Self>,
         import: TimelineImport,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), TimelineImportFinalizeError> {
         tracing::info!("Finalizing timeline import");
 
         pausable_failpoint!("timeline-import-pre-cplane-notification");
 
-        let import_failed = import.completion_error().is_some();
+        let tenant_id = import.tenant_id;
+        let timeline_id = import.timeline_id;
 
-        if !import_failed {
-            loop {
-                if self.cancel.is_cancelled() {
-                    anyhow::bail!("Shut down requested while finalizing import");
-                }
-
-                let active = self.timeline_active_on_all_shards(&import).await?;
-
-                match active {
-                    Some(timeline_info) => {
-                        tracing::info!("Timeline became active on all shards");
-
-                        if self.config.timelines_onto_safekeepers {
-                            // Now that we know the start LSN of this timeline, create it on the
-                            // safekeepers.
-                            self.tenant_timeline_create_safekeepers_until_success(
-                                import.tenant_id,
-                                timeline_info,
-                            )
-                            .await?;
-                        }
-
-                        break;
-                    }
-                    None => {
-                        tracing::info!("Timeline not active on all shards yet");
-
-                        tokio::select! {
-                            _ = self.cancel.cancelled() => {
-                                anyhow::bail!("Shut down requested while finalizing import");
-                            },
-                            _ = tokio::time::sleep(Duration::from_secs(5)) => {}
-                        };
-                    }
-                }
+        let import_error = import.completion_error();
+        match import_error {
+            Some(err) => {
+                self.notify_cplane_and_delete_import(tenant_id, timeline_id, Err(err))
+                    .await?;
+                tracing::warn!("Timeline import completed with shard errors");
+                Ok(())
             }
-        }
+            None => match self.activate_timeline_post_import(&import).await {
+                Ok(timeline_info) => {
+                    tracing::info!("Post import timeline activation complete");
 
+                    if self.config.timelines_onto_safekeepers {
+                        // Now that we know the start LSN of this timeline, create it on the
+                        // safekeepers.
+                        self.tenant_timeline_create_safekeepers_until_success(
+                            import.tenant_id,
+                            timeline_info,
+                        )
+                        .await?;
+                    }
+
+                    self.notify_cplane_and_delete_import(tenant_id, timeline_id, Ok(()))
+                        .await?;
+
+                    tracing::info!("Timeline import completed successfully");
+                    Ok(())
+                }
+                Err(TimelineImportFinalizeError::ShuttingDown) => {
+                    // We got pre-empted by shut down and will resume after the restart.
+                    Err(TimelineImportFinalizeError::ShuttingDown)
+                }
+                Err(err) => {
+                    // Any finalize error apart from shut down is permanent and requires us to notify
+                    // cplane such that it can clean up.
+                    tracing::error!("Import finalize failed with permanent error: {err}");
+                    self.notify_cplane_and_delete_import(
+                        tenant_id,
+                        timeline_id,
+                        Err(err.to_string()),
+                    )
+                    .await?;
+                    Err(err)
+                }
+            },
+        }
+    }
+
+    async fn notify_cplane_and_delete_import(
+        self: &Arc<Self>,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        import_result: ImportResult,
+    ) -> Result<(), TimelineImportFinalizeError> {
+        let import_failed = import_result.is_err();
         tracing::info!(%import_failed, "Notifying cplane of import completion");
 
         let client = UpcallClient::new(self.get_config(), self.cancel.child_token());
-        client.notify_import_complete(&import).await?;
+        client
+            .notify_import_complete(tenant_id, timeline_id, import_result)
+            .await
+            .map_err(|_err| TimelineImportFinalizeError::ShuttingDown)?;
 
         if let Err(err) = self
             .persistence
-            .delete_timeline_import(import.tenant_id, import.timeline_id)
+            .delete_timeline_import(tenant_id, timeline_id)
             .await
         {
             tracing::warn!("Failed to delete timeline import entry from database: {err}");
@@ -4012,14 +4077,113 @@ impl Service {
             .write()
             .unwrap()
             .tenants
-            .range_mut(TenantShardId::tenant_range(import.tenant_id))
+            .range_mut(TenantShardId::tenant_range(tenant_id))
             .for_each(|(_id, shard)| shard.importing = TimelineImportState::Idle);
 
-        tracing::info!(%import_failed, "Timeline import complete");
-
         Ok(())
     }
 
+    /// Activate an imported timeline on all shards once the import is complete.
+    /// Returns the [`TimelineInfo`] reported by shard zero.
+    async fn activate_timeline_post_import(
+        self: &Arc<Self>,
+        import: &TimelineImport,
+    ) -> Result<TimelineInfo, TimelineImportFinalizeError> {
+        const TIMELINE_ACTIVATE_TIMEOUT: Duration = Duration::from_millis(128);
+
+        let mut shards_to_activate: HashSet<ShardIndex> =
+            import.shard_statuses.0.keys().cloned().collect();
+        let mut shard_zero_timeline_info = None;
+
+        while !shards_to_activate.is_empty() {
+            if self.cancel.is_cancelled() {
+                return Err(TimelineImportFinalizeError::ShuttingDown);
+            }
+
+            let targets = {
+                let locked = self.inner.read().unwrap();
+                let mut targets = Vec::new();
+
+                for (tenant_shard_id, shard) in locked
+                    .tenants
+                    .range(TenantShardId::tenant_range(import.tenant_id))
+                {
+                    if !import
+                        .shard_statuses
+                        .0
+                        .contains_key(&tenant_shard_id.to_index())
+                    {
+                        return Err(TimelineImportFinalizeError::MismatchedShards(
+                            tenant_shard_id.to_index(),
+                        ));
+                    }
+
+                    if let Some(node_id) = shard.intent.get_attached() {
+                        let node = locked
+                            .nodes
+                            .get(node_id)
+                            .expect("Pageservers may not be deleted while referenced");
+                        targets.push((*tenant_shard_id, node.clone()));
+                    }
+                }
+
+                targets
+            };
+
+            let targeted_tenant_shards: Vec<_> = targets.iter().map(|(tid, _node)| *tid).collect();
+
+            let results = self
+                .tenant_for_shards_api(
+                    targets,
+                    |tenant_shard_id, client| async move {
+                        client
+                            .activate_post_import(
+                                tenant_shard_id,
+                                import.timeline_id,
+                                TIMELINE_ACTIVATE_TIMEOUT,
+                            )
+                            .await
+                    },
+                    1,
+                    1,
+                    SHORT_RECONCILE_TIMEOUT,
+                    &self.cancel,
+                )
+                .await;
+
+            let mut failed = 0;
+            for (tid, result) in targeted_tenant_shards.iter().zip(results.into_iter()) {
+                match result {
+                    Ok(ok) => {
+                        if tid.is_shard_zero() {
+                            shard_zero_timeline_info = Some(ok);
+                        }
+
+                        shards_to_activate.remove(&tid.to_index());
+                    }
+                    Err(_err) => {
+                        failed += 1;
+                    }
+                }
+            }
+
+            if failed > 0 {
+                tracing::info!(
+                    "Failed to activate timeline on {failed} shards post import. Will retry"
+                );
+            }
+
+            tokio::select! {
+                _ = tokio::time::sleep(Duration::from_millis(250)) => {},
+                _ = self.cancel.cancelled() => {
+                    return Err(TimelineImportFinalizeError::ShuttingDown);
+                }
+            }
+        }
+
+        Ok(shard_zero_timeline_info.expect("All shards replied"))
+    }
+
     async fn finalize_timeline_imports(self: &Arc<Self>, imports: Vec<TimelineImport>) {
         futures::future::join_all(
             imports
@@ -4029,78 +4193,6 @@ impl Service {
         .await;
     }
 
-    /// If the timeline is active on all shards, returns the [`TimelineInfo`]
-    /// collected from shard 0.
-    ///
-    /// An error is returned if the shard layout has changed during the import.
-    /// This is guarded against within the storage controller and the pageserver,
-    /// and, therefore, unexpected.
-    async fn timeline_active_on_all_shards(
-        self: &Arc<Self>,
-        import: &TimelineImport,
-    ) -> anyhow::Result<Option<TimelineInfo>> {
-        let targets = {
-            let locked = self.inner.read().unwrap();
-            let mut targets = Vec::new();
-
-            for (tenant_shard_id, shard) in locked
-                .tenants
-                .range(TenantShardId::tenant_range(import.tenant_id))
-            {
-                if !import
-                    .shard_statuses
-                    .0
-                    .contains_key(&tenant_shard_id.to_index())
-                {
-                    anyhow::bail!("Shard layout change detected on completion");
-                }
-
-                if let Some(node_id) = shard.intent.get_attached() {
-                    let node = locked
-                        .nodes
-                        .get(node_id)
-                        .expect("Pageservers may not be deleted while referenced");
-                    targets.push((*tenant_shard_id, node.clone()));
-                } else {
-                    return Ok(None);
-                }
-            }
-
-            targets
-        };
-
-        if targets.is_empty() {
-            anyhow::bail!("No shards found to finalize import for");
-        }
-
-        let results = self
-            .tenant_for_shards_api(
-                targets,
-                |tenant_shard_id, client| async move {
-                    client
-                        .timeline_detail(tenant_shard_id, import.timeline_id)
-                        .await
-                },
-                1,
-                1,
-                SHORT_RECONCILE_TIMEOUT,
-                &self.cancel,
-            )
-            .await;
-
-        let all_active = results.iter().all(|res| match res {
-            Ok(info) => info.state == TimelineState::Active,
-            Err(_) => false,
-        });
-
-        if all_active {
-            // Both unwraps are validated above
-            Ok(Some(results.into_iter().next().unwrap().unwrap()))
-        } else {
-            Ok(None)
-        }
-    }
-
     pub(crate) async fn tenant_timeline_archival_config(
         &self,
         tenant_id: TenantId,
diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs
index 5c15660ba3..cd5ace449d 100644
--- a/storage_controller/src/service/safekeeper_service.rs
+++ b/storage_controller/src/service/safekeeper_service.rs
@@ -10,6 +10,7 @@ use crate::persistence::{
     DatabaseError, SafekeeperTimelineOpKind, TimelinePendingOpPersistence, TimelinePersistence,
 };
 use crate::safekeeper::Safekeeper;
+use crate::timeline_import::TimelineImportFinalizeError;
 use anyhow::Context;
 use http_utils::error::ApiError;
 use pageserver_api::controller_api::{
@@ -327,12 +328,12 @@ impl Service {
         self: &Arc<Self>,
         tenant_id: TenantId,
         timeline_info: TimelineInfo,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), TimelineImportFinalizeError> {
         const BACKOFF: Duration = Duration::from_secs(5);
 
         loop {
             if self.cancel.is_cancelled() {
-                anyhow::bail!("Shut down requested while finalizing import");
+                return Err(TimelineImportFinalizeError::ShuttingDown);
             }
 
             let res = self
@@ -348,7 +349,7 @@ impl Service {
                     tracing::error!("Failed to create timeline on safekeepers: {err}");
                     tokio::select! {
                         _ = self.cancel.cancelled() => {
-                            anyhow::bail!("Shut down requested while finalizing import");
+                            return Err(TimelineImportFinalizeError::ShuttingDown);
                         },
                         _ = tokio::time::sleep(BACKOFF) => {}
                     };
diff --git a/storage_controller/src/timeline_import.rs b/storage_controller/src/timeline_import.rs
index 6dcc538c4b..5d9d633932 100644
--- a/storage_controller/src/timeline_import.rs
+++ b/storage_controller/src/timeline_import.rs
@@ -46,6 +46,14 @@ pub(crate) enum TimelineImportUpdateFollowUp {
     None,
 }
 
+#[derive(thiserror::Error, Debug)]
+pub(crate) enum TimelineImportFinalizeError {
+    #[error("Shut down interrupted import finalize")]
+    ShuttingDown,
+    #[error("Mismatched shard detected during import finalize: {0}")]
+    MismatchedShards(ShardIndex),
+}
+
 pub(crate) enum TimelineImportUpdateError {
     ImportNotFound {
         tenant_id: TenantId,
@@ -151,6 +159,8 @@ impl TimelineImport {
     }
 }
 
+pub(crate) type ImportResult = Result<(), String>;
+
 pub(crate) struct UpcallClient {
     authorization_header: Option<String>,
     client: reqwest::Client,
@@ -198,7 +208,9 @@ impl UpcallClient {
     /// eventual cplane availability. The cplane API is idempotent.
     pub(crate) async fn notify_import_complete(
         &self,
-        import: &TimelineImport,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        import_result: ImportResult,
     ) -> anyhow::Result<()> {
         let endpoint = if self.base_url.ends_with('/') {
             format!("{}import_complete", self.base_url)
@@ -206,15 +218,13 @@ impl UpcallClient {
             format!("{}/import_complete", self.base_url)
         };
 
-        tracing::info!("Endpoint is {endpoint}");
-
         let request = self
             .client
             .request(Method::PUT, endpoint)
             .json(&ImportCompleteRequest {
-                tenant_id: import.tenant_id,
-                timeline_id: import.timeline_id,
-                error: import.completion_error(),
+                tenant_id,
+                timeline_id,
+                error: import_result.err(),
             })
             .timeout(IMPORT_COMPLETE_REQUEST_TIMEOUT);
 
diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py
index 05e63ad955..0472b92145 100644
--- a/test_runner/regress/test_import_pgdata.py
+++ b/test_runner/regress/test_import_pgdata.py
@@ -130,9 +130,8 @@ def test_pgdata_import_smoke(
     elif rel_block_size == RelBlockSize.TWO_STRPES_PER_SHARD:
         target_relblock_size = (shard_count or 1) * stripe_size * 8192 * 2
     elif rel_block_size == RelBlockSize.MULTIPLE_RELATION_SEGMENTS:
-        # Postgres uses a 1GiB segment size, fixed at compile time, so we must use >2GB of data
-        # to exercise multiple segments.
-        target_relblock_size = int(((2.333 * 1024 * 1024 * 1024) // 8192) * 8192)
+        segment_size = 16 * 1024 * 1024
+        target_relblock_size = segment_size * 8
     else:
         raise ValueError
 
@@ -413,6 +412,88 @@ def test_import_completion_on_restart(
     wait_until(cplane_notified)
 
 
+@run_only_on_default_postgres(reason="PG version is irrelevant here")
+def test_import_respects_tenant_shutdown(
+    neon_env_builder: NeonEnvBuilder, vanilla_pg: VanillaPostgres, make_httpserver: HTTPServer
+):
+    """
+    Validate that importing timelines respect the usual timeline life cycle:
+    1. Shut down on tenant shut-down and resumes upon re-attach
+    2. Deletion on timeline deletion (TODO)
+    """
+    # Set up mock control plane HTTP server to listen for import completions
+    import_completion_signaled = Event()
+
+    def handler(request: Request) -> Response:
+        log.info(f"control plane /import_complete request: {request.json}")
+        import_completion_signaled.set()
+        return Response(json.dumps({}), status=200)
+
+    cplane_mgmt_api_server = make_httpserver
+    cplane_mgmt_api_server.expect_request(
+        "/storage/api/v1/import_complete", method="PUT"
+    ).respond_with_handler(handler)
+
+    # Plug the cplane mock in
+    neon_env_builder.control_plane_hooks_api = (
+        f"http://{cplane_mgmt_api_server.host}:{cplane_mgmt_api_server.port}/storage/api/v1/"
+    )
+
+    # The import will specify a local filesystem path mocking remote storage
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
+
+    vanilla_pg.start()
+    vanilla_pg.stop()
+
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    importbucket_path = neon_env_builder.repo_dir / "test_import_completion_bucket"
+    mock_import_bucket(vanilla_pg, importbucket_path)
+
+    tenant_id = TenantId.generate()
+    timeline_id = TimelineId.generate()
+    idempotency = ImportPgdataIdemptencyKey.random()
+
+    # Pause before sending the notification
+    failpoint_name = "import-timeline-pre-execute-pausable"
+    env.pageserver.http_client().configure_failpoints((failpoint_name, "pause"))
+
+    env.storage_controller.tenant_create(tenant_id)
+    env.storage_controller.timeline_create(
+        tenant_id,
+        {
+            "new_timeline_id": str(timeline_id),
+            "import_pgdata": {
+                "idempotency_key": str(idempotency),
+                "location": {"LocalFs": {"path": str(importbucket_path.absolute())}},
+            },
+        },
+    )
+
+    def hit_failpoint():
+        log.info("Checking log for pattern...")
+        try:
+            assert env.pageserver.log_contains(f".*at failpoint {failpoint_name}.*")
+        except Exception:
+            log.exception("Failed to find pattern in log")
+            raise
+
+    wait_until(hit_failpoint)
+    assert not import_completion_signaled.is_set()
+
+    # Restart the pageserver while an import job is in progress.
+    # This clears the failpoint and we expect that the import starts up afresh
+    # after the restart and eventually completes.
+    env.pageserver.stop()
+    env.pageserver.start()
+
+    def cplane_notified():
+        assert import_completion_signaled.is_set()
+
+    wait_until(cplane_notified)
+
+
 def test_fast_import_with_pageserver_ingest(
     test_output_dir,
     vanilla_pg: VanillaPostgres,
@@ -520,7 +601,9 @@ def test_fast_import_with_pageserver_ingest(
     env.neon_cli.mappings_map_branch(import_branch_name, tenant_id, timeline_id)
 
     # Run fast_import
-    fast_import.set_aws_creds(mock_s3_server, {"RUST_LOG": "aws_config=debug,aws_sdk_kms=debug"})
+    fast_import.set_aws_creds(
+        mock_s3_server, {"RUST_LOG": "info,aws_config=debug,aws_sdk_kms=debug"}
+    )
     pg_port = port_distributor.get_port()
     fast_import.run_pgdata(pg_port=pg_port, s3prefix=f"s3://{bucket}/{key_prefix}")
 

From d47e88e35305da95b4674d6ef48f6422df7d9dab Mon Sep 17 00:00:00 2001
From: Elizabeth Murray <52375559+bizwark@users.noreply.github.com>
Date: Wed, 14 May 2025 00:00:59 -0700
Subject: [PATCH 39/65] Update the pgrag version in the compute dockerfile.
 (#11867)

## Problem

The extension tests are hanging because of pgrag. The new version of
pgrag contains a fix for the hang.

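## Summary of changes

Bump pgrag from v0.1.1 to v0.1.2 in the compute Dockerfile and update
the tarball checksum accordingly.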
---
 compute/compute-node.Dockerfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile
index e6e6053554..17e50697db 100644
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -1117,8 +1117,8 @@ RUN wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar.
     mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \
     echo "#nothing to test here" > neon-test.sh
 
-RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.1.1.tar.gz -O pgrag.tar.gz &&  \
-    echo "087b2ecd11ba307dc968042ef2e9e43dc04d9ba60e8306e882c407bbe1350a50 pgrag.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.1.2.tar.gz -O pgrag.tar.gz &&  \
+    echo "7361654ea24f08cbb9db13c2ee1c0fe008f6114076401bb871619690dafc5225 pgrag.tar.gz" | sha256sum --check && \
     mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C .
 
 FROM rust-extensions-build-pgrx14 AS pgrag-build

From 81fd652151c9dce2d188ff2ba7c0ed2723640efb Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Wed, 14 May 2025 16:32:55 +0800
Subject: [PATCH 40/65] fix(pageserver): use better estimation for compaction
 memory usage (#11904)

## Problem

Hopefully resolves `test_gc_feedback` flakiness.

## Summary of changes

`accumulated_values` should not exceed 512MB to avoid OOM. Previously we
only used the number of items, which is not a good estimate.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 libs/pageserver_api/src/value.rs             | 18 ++++++++++++++++++
 pageserver/src/tenant/timeline/compaction.rs | 12 +++++++++---
 2 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/libs/pageserver_api/src/value.rs b/libs/pageserver_api/src/value.rs
index 883d903ff3..e9000939c3 100644
--- a/libs/pageserver_api/src/value.rs
+++ b/libs/pageserver_api/src/value.rs
@@ -36,6 +36,24 @@ impl Value {
             Value::WalRecord(rec) => rec.will_init(),
         }
     }
+
+    #[inline(always)]
+    pub fn estimated_size(&self) -> usize {
+        match self {
+            Value::Image(image) => image.len(),
+            Value::WalRecord(NeonWalRecord::AuxFile {
+                content: Some(content),
+                ..
+            }) => content.len(),
+            Value::WalRecord(NeonWalRecord::Postgres { rec, .. }) => rec.len(),
+            Value::WalRecord(NeonWalRecord::ClogSetAborted { xids }) => xids.len() * 4,
+            Value::WalRecord(NeonWalRecord::ClogSetCommitted { xids, .. }) => xids.len() * 4,
+            Value::WalRecord(NeonWalRecord::MultixactMembersCreate { members, .. }) => {
+                members.len() * 8
+            }
+            _ => 8192, /* use image size as the estimation */
+        }
+    }
 }
 
 #[derive(Debug, PartialEq)]
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index e7d39db70d..37c1a8f60c 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -3435,6 +3435,7 @@ impl Timeline {
 
         // Step 2: Produce images+deltas.
         let mut accumulated_values = Vec::new();
+        let mut accumulated_values_estimated_size = 0;
         let mut last_key: Option<Key> = None;
 
         // Only create image layers when there is no ancestor branches. TODO: create covering image layer
@@ -3611,12 +3612,16 @@ impl Timeline {
                 if last_key.is_none() {
                     last_key = Some(key);
                 }
+                accumulated_values_estimated_size += val.estimated_size();
                 accumulated_values.push((key, lsn, val));
 
-                if accumulated_values.len() >= 65536 {
-                    // Assume all of them are images, that would be 512MB of data in memory for a single key.
+                // Accumulated values should never exceed 512MB.
+                if accumulated_values_estimated_size >= 1024 * 1024 * 512 {
                     return Err(CompactionError::Other(anyhow!(
-                        "too many values for a single key, giving up gc-compaction"
+                        "too many values for a single key: {} for key {}, {} items",
+                        accumulated_values_estimated_size,
+                        key,
+                        accumulated_values.len()
                     )));
                 }
             } else {
@@ -3651,6 +3656,7 @@ impl Timeline {
                     .map_err(CompactionError::Other)?;
                 accumulated_values.clear();
                 *last_key = key;
+                accumulated_values_estimated_size = val.estimated_size();
                 accumulated_values.push((key, lsn, val));
             }
         }

From a8e652d47e3dec7e588b3bb3dddecc20302a0f98 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Wed, 14 May 2025 17:25:57 +0800
Subject: [PATCH 41/65] rfc: add bottommost garbage-collection compaction
 (#8425)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add the RFC for bottommost garbage-collection compaction

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
---
 docs/rfcs/043-bottom-most-gc-compaction.md    | 194 ++++++++++++++++++
 .../01-basic-idea.svg                         | 135 ++++++++++++
 .../03-retain-lsn.svg                         | 141 +++++++++++++
 .../05-btmgc-parent.svg                       | 187 +++++++++++++++++
 .../06-btmgc-child.svg                        | 184 +++++++++++++++++
 .../07-btmgc-analysis-1.svg                   | 180 ++++++++++++++++
 .../08-optimization.svg                       | 158 ++++++++++++++
 .../09-btmgc-analysis-2.svg                   | 184 +++++++++++++++++
 .../10-btmgc-analysis-3.svg                   |  81 ++++++++
 .../11-btmgc-analysis-4.svg                   |  81 ++++++++
 .../12-staircase-test-gc-feedback.png         | Bin 0 -> 145516 bytes
 .../13-job-split.svg                          | 176 ++++++++++++++++
 12 files changed, 1701 insertions(+)
 create mode 100644 docs/rfcs/043-bottom-most-gc-compaction.md
 create mode 100644 docs/rfcs/images/036-bottom-most-gc-compaction/01-basic-idea.svg
 create mode 100644 docs/rfcs/images/036-bottom-most-gc-compaction/03-retain-lsn.svg
 create mode 100644 docs/rfcs/images/036-bottom-most-gc-compaction/05-btmgc-parent.svg
 create mode 100644 docs/rfcs/images/036-bottom-most-gc-compaction/06-btmgc-child.svg
 create mode 100644 docs/rfcs/images/036-bottom-most-gc-compaction/07-btmgc-analysis-1.svg
 create mode 100644 docs/rfcs/images/036-bottom-most-gc-compaction/08-optimization.svg
 create mode 100644 docs/rfcs/images/036-bottom-most-gc-compaction/09-btmgc-analysis-2.svg
 create mode 100644 docs/rfcs/images/036-bottom-most-gc-compaction/10-btmgc-analysis-3.svg
 create mode 100644 docs/rfcs/images/036-bottom-most-gc-compaction/11-btmgc-analysis-4.svg
 create mode 100644 docs/rfcs/images/036-bottom-most-gc-compaction/12-staircase-test-gc-feedback.png
 create mode 100644 docs/rfcs/images/036-bottom-most-gc-compaction/13-job-split.svg

diff --git a/docs/rfcs/043-bottom-most-gc-compaction.md b/docs/rfcs/043-bottom-most-gc-compaction.md
new file mode 100644
index 0000000000..4bba758b31
--- /dev/null
+++ b/docs/rfcs/043-bottom-most-gc-compaction.md
@@ -0,0 +1,194 @@
+# Bottommost Garbage-Collection Compaction
+
+## Summary
+
+The goal of this doc is to propose a way to reliably collect garbage below the GC horizon. This process is called bottom-most garbage-collection compaction, and is part of the broader legacy-enhanced compaction that we plan to implement in the future.
+
+## Motivation
+
+The current GC algorithm waits until a key region is covered by image layers before collecting the garbage in that region. Relying on image layer generation to produce covering images is not reliable. There is prior art that feeds information from the GC algorithm back into the image generation process to accelerate garbage collection, but it slows down the system and creates write amplification.
+
+# Basic Idea
+
+![](images/036-bottom-most-gc-compaction/01-basic-idea.svg)
+
+The idea of bottom-most compaction is simple: we rewrite all layers that are below or intersect with the GC horizon to produce a flat level of image layers at the GC horizon and deltas above the GC horizon. In this process,
+
+- All images and deltas ≤ GC horizon LSN will be dropped. This process collects garbage.
+- We produce images for all keys involved in the compaction process at the GC horizon.
+
+Therefore, it can precisely collect all garbage below the horizon and reduce space amplification, e.g., in the staircase pattern (test_gc_feedback).
+
+![The staircase pattern in test_gc_feedback in the original compaction algorithm. The goal is to collect garbage below the red horizontal line.](images/036-bottom-most-gc-compaction/12-staircase-test-gc-feedback.png)
+
+The staircase pattern in test_gc_feedback in the original compaction algorithm. The goal is to collect garbage below the red horizontal line.
+
+# Branches
+
+With branches, the bottom-most compaction should retain a snapshot of the keyspace at the `retain_lsn` so that the child branch can access data at the branch point. This requires some modifications to the basic bottom-most compaction algorithm that we sketched above. 
+
+![](images/036-bottom-most-gc-compaction/03-retain-lsn.svg)
+
+## Single Timeline w/ Snapshots: handle `retain_lsn`
+
+First let’s look into the case where we create branches over the main branch but don’t write any data to them (aka “snapshots”).
+
+The bottom-most compaction algorithm collects all deltas and images of a key and can make decisions on what data to retain. Given that we have a single key’s history as below:
+
+```
+LSN 0x10 -> A
+LSN 0x20 -> append B
+retain_lsn: 0x20
+LSN 0x30 -> append C
+LSN 0x40 -> append D
+retain_lsn: 0x40
+LSN 0x50 -> append E
+GC horizon: 0x50
+LSN 0x60 -> append F
+```
+
+The algorithm will produce:
+
+```
+LSN 0x20 -> AB
+(drop all history below the earliest retain_lsn)
+LSN 0x40 -> ABCD
+(assume the cost of replaying 2 deltas is higher than storing the full image, we generate an image here)
+LSN 0x50 -> append E
+(replay one delta is cheap)
+LSN 0x60 -> append F
+(keep everything as-is above the GC horizon)
+```
+
+![](images/036-bottom-most-gc-compaction/05-btmgc-parent.svg)
+
+What happens is that we balance the space taken by each retain_lsn and the cost of replaying deltas during the bottom-most compaction process. This is controlled by a threshold. If `count(deltas) < $threshold`, the deltas will be retained. Otherwise, an image will be generated and the deltas will be dropped.
+
+In the example above, the `$threshold` is 2.
+
+## Child Branches with data: pull + partial images
+
+In the previous section we have shown how bottom-most compaction respects `retain_lsn` so that all data that was readable at branch creation remains readable. But branches can have data on their own, and that data can fall out of the branch’s PITR window. So, this section explains how we deal with that.
+
+We will run the same bottom-most compaction for these branches, to ensure the space amplification on the child branch is reasonable. 
+
+```
+branch_lsn: 0x20
+LSN 0x30 -> append P
+LSN 0x40 -> append Q
+LSN 0x50 -> append R
+GC horizon: 0x50
+LSN 0x60 -> append S
+```
+
+Note that bottom-most compaction happens on a per-timeline basis. When it processes this key, it only reads the history from LSN 0x30 without a base image. Therefore, on child branches, the bottom-most compaction process will make image creation decisions based on the same `count(deltas) < $threshold` criteria, and if it decides to create an image, the base image will be retrieved from the ancestor branch.
+
+```
+branch_lsn: 0x20
+LSN 0x50 -> ABPQR
+(we pull the image at LSN 0x20 from the ancestor branch to get AB, and then apply append P, Q, and R to the page; we replace the record at 0x50 with an image and drop the deltas at 0x30 and 0x40)
+GC horizon: 0x50
+LSN 0x60 -> append S
+```
+
+![](images/036-bottom-most-gc-compaction/06-btmgc-child.svg)
+
+Note that for child branches, we do not create image layers for the images when bottom-most compaction runs. Instead, we drop the 0x30/0x40/0x50 delta records and directly place the image ABPQR@0x50 into the delta layer, which serves as a sparse image layer. For child branches, if we create image layers, we will need to put all keys in the range into the image layer. This causes space bloat and slow compactions. In this proposal, the compaction process will only compact and process keys modified inside the child branch.
+
+# Result
+
+Bottom-most compaction ensures all garbage under the GC horizon gets collected right away (compared with “eventually” in the current algorithm). Meanwhile, it generates images at each of the retain_lsn to ensure branch reads are fast. As we make per-key decisions on whether to generate an image or not, the theoretical lower bound of the storage space we need to retain for a branch is lower than before.
+
+Before: min(sum(logs for each key), sum(image for each key)), for each partition — we always generate image layers on a key range
+
+After: sum(min(logs for each key, image for each key))
+
+# Compaction Trigger
+
+The bottom-most compaction can be automatically triggered. The goal of the trigger is that it should ensure a constant factor for write amplification. Say that the user writes 1GB of WAL into the system, we should write 1GB x C data to S3. The legacy compaction algorithm does not have such a constant factor C. The data we write to S3 is quadratic in the logical size of the database (see [A Theoretical View of Neon Storage](https://www.notion.so/A-Theoretical-View-of-Neon-Storage-8d7ad7555b0c41b2a3597fa780911194?pvs=21)).
+
+We propose the following compaction trigger that generates a constant write amplification factor. Write amplification >= total writes to S3 / total user writes. We only analyze the write amplification caused by the bottom-most GC-compaction process, ignoring the legacy create image layers amplification.
+
+Given that we have ***X*** bytes of the delta layers above the GC horizon, ***A*** bytes of the delta layers intersecting with the GC horizon, ***B*** bytes of the delta layers below the GC horizon, and ***C*** bytes of the image layers below the GC horizon.
+
+The legacy GC + compaction loop will always keep ***A*** unchanged, reduce ***B and C*** when there are image layers covering the key range. This yields 0 write amplification (only file deletions) and extra ***B*** bytes of space.
+
+![](images/036-bottom-most-gc-compaction/09-btmgc-analysis-2.svg)
+
+The bottom-most compaction proposed here will split ***A*** into deltas above the GC horizon and below the GC horizon. Everything below the GC horizon will be image layers after the compaction (not considering branches). Therefore, this yields ***A+C*** extra write traffic each iteration, plus 0 extra space.
+
+![](images/036-bottom-most-gc-compaction/07-btmgc-analysis-1.svg)
+
+Also considering read amplification (below the GC horizon). When a read request reaches the GC horizon, the read amplification will be (A+B+C)/C=1+(A+B)/C. Reducing ***A*** and ***B*** can help reduce the read amplification below the GC horizon.
+
+The metrics-based trigger will wait until a point that space amplification is not that large and write amplification is not that large before the compaction gets triggered. The trigger is defined as **(A+B)/C ≥ 1 (or some other ratio)**.
+
+To reason about this trigger, consider the two cases:
+
+**Data Ingestion**
+
+User keeps ingesting data into the database, which indicates that WAL size roughly equals the database logical size. The compaction gets triggered only when the newly-written WAL roughly equals the current bottom-most image size (=X). Therefore, it’s triggered when the database size gets doubled. This is a reasonable amount of work. Write amplification is 2X/X=2 for the X amount of data written.
+
+![](images/036-bottom-most-gc-compaction/10-btmgc-analysis-3.svg)
+
+**Updates/Deletion**
+
+In this case, WAL size will be larger than the database logical size ***D***. The compaction gets triggered for every ***D*** bytes of WAL written. Therefore, for every ***D*** bytes of WAL, we rewrite the bottom-most layer, which produces an extra ***D*** bytes of write amplification. This incurs exactly 2x write amplification (by the write of D), 1.5x write amplification (if we count from the start of the process) and no space amplification. 
+
+![](images/036-bottom-most-gc-compaction/11-btmgc-analysis-4.svg)
+
+Note that here I try to reason that write amplification is a constant (i.e., the data we write to S3 is proportional to the data the user writes). The main problem with the current legacy compaction algorithm is that write amplification is proportional to the database size.
+
+The next step is to optimize the write amplification above the GC horizon (i.e., change the image creation criteria, top-most compaction, or introduce tiered compaction), to ensure the write amplification of the whole system is a constant factor.
+
+20GB layers → +20GB layers → delete 20GB, need 40GB temporary space
+
+# Sub-Compactions
+
+The gc-compaction algorithm may take a long time and we need to split the job into multiple sub-compaction jobs.
+
+![](images/036-bottom-most-gc-compaction/13-job-split.svg)
+
+As in the figure, the auto-trigger schedules a compaction job covering the full keyspace below a specific LSN. If we cannot finish compacting it in one run in a reasonable amount of time, the algorithm will vertically split it into multiple jobs (in this case, 5).
+
+Each gc-compaction job will create one level of delta layers and one flat level of image layers for each LSN. Those layers will be automatically split based on size, which means that if the sub-compaction job produces 1GB of deltas, it will produce 4 * 256MB delta layers. Layers that are not fully contained within the sub-compaction job rectangles will be rewritten to only contain the keys outside of the key range.
+
+# Implementation
+
+The main implementation of gc-compaction is in `compaction.rs`.
+
+* `compact_with_gc`: The main loop of gc-compaction. It takes a rectangle range of the layer map and compacts that specific range. It selects layers intersecting with the rectangle, downloads the layers, creates the k-merge iterator to read those layers in the key-lsn order, and decides which keys to keep or insert a reconstructed page. The process is the basic unit of a gc-compaction and is not interruptible. If the process gets preempted by L0 compaction, it has to be restarted from scratch. For layers that overlap with the rectangle but are not fully inside it, the main loop will also rewrite them so that the new layer (or two layers if both left and right ends are outside of the rectangle) has the same LSN range as the original one but only contains the keys outside of the compaction range.
+* `gc_compaction_split_jobs`: Splits a big gc-compaction job into sub-compactions based on heuristics in the layer map. The function looks at the layer map and splits the compaction job based on the size of the layers so that each compaction job only pulls ~4GB of layer files.
+* `generate_key_retention` and `KeyHistoryRetention`: Implements the algorithm described in the "basic idea" and "branch" chapter of this RFC. It takes a vector of history of a key (key-lsn-value) and decides which LSNs of the key to retain. If there are too many deltas between two retain_lsns, it will reconstruct the page and insert an image into the compaction result. Also, we implement `KeyHistoryRetention::verify` to ensure the generated result is not corrupted -- all retain_lsns and all LSNs above the gc-horizon should be accessible.
+* `GcCompactionQueue`: the automatic trigger implementation for gc-compaction. `GcCompactionQueue::iteration` is called at the end of the tenant compaction loop. It will then call `trigger_auto_compaction` to decide whether to trigger a gc-compaction job for this tenant. If yes, the compaction-job will be added to the compaction queue, and the queue will be slowly drained once there are no other compaction jobs running. gc-compaction has the lowest priority. If a sub-compaction job is not successful or gets preempted by L0 compaction (see limitations for reasons why a compaction job would fail), it will _not_ be retried.
+* Changes to `index_part.json`: we added a `last_completed_lsn` field to the index part for the auto-trigger to decide when to trigger a compaction.
+* Changes to the read path: when gc-compaction updates the layer map, all reads need to wait. See `gc_compaction_layer_update_lock` and comments in the code path for more information.
+
+Gc-compaction can also be scheduled over the HTTP API. Example:
+
+```
+curl 'localhost:9898/v1/tenant/:tenant_id/timeline/:timeline_id/compact?enhanced_gc_bottom_most_compaction=true&dry_run=true' -X PUT -H "Content-Type: application/json" -d '{"scheduled": true, "compact_key_range": { "start": "000000067F0000A0000002A1CF0100000000", "end": "000000067F0000A0000002A1D70100000000" } }'
+```
+
+The `dry_run` mode can be specified in the query string so that the compaction will go through all layers to estimate how much space can be saved without writing the compaction result into the layer map.
+
+The auto-trigger is controlled by tenant-level flag `gc_compaction_enabled`. If this is set to false, no gc-compaction will be automatically scheduled on this tenant (but manual trigger still works).
+
+# Next Steps
+
+There are still some limitations of gc-compaction itself that need to be resolved and tested:
+
+- gc-compaction is currently only automatically triggered on root branches. We have not tested gc-compaction on child branches in staging.
+- gc-compaction will skip aux key regions because of the possible conflict with the assumption of aux file tombstones.
+- gc-compaction does not consider keyspaces at retain_lsns and only looks at keys in the layers. This also causes us to give up some sub-compaction jobs because a key might have only part of its history available due to traditional GC removing part of the history.
+- We limit gc-compaction to run over shards <= 150GB to avoid gc-compaction taking too much time blocking other compaction jobs. The sub-compaction split algorithm needs to be improved to be able to split vertically and horizontally. Also, we need to move the download layer process out of the compaction loop so that we don't block other compaction jobs for too long.
+- The compaction trigger always schedules gc-compaction from the lowest LSN to the gc-horizon. Currently we do not schedule compaction jobs that only select layers in the middle. Allowing this could potentially reduce the number of layers read/written throughout the process.
+- gc-compaction will give up if there are too many layers to rewrite or if there is not enough disk space for the compaction.
+- gc-compaction sometimes fails with "no key produced during compaction", which means that all existing keys within the compaction range can be collected; but we don't have a way to write this information back to the layer map -- we cannot generate an empty image layer.
+- We limit the maximum size of deltas for a single key to 512MB. If above this size, gc-compaction will give up. This can be resolved by changing `generate_key_retention` to be a stream instead of requiring to collect all the key history.
+
+In the future,
+
+- Top-most compaction: ensure we always have an image coverage for the latest data (or near the latest data), so that reads will be fast at the latest LSN.
+- Tiered compaction on deltas: ensure read from any LSN is fast.
+- Per-timeline compaction → tenant-wide compaction?
diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/01-basic-idea.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/01-basic-idea.svg
new file mode 100644
index 0000000000..7107198c0a
--- /dev/null
+++ b/docs/rfcs/images/036-bottom-most-gc-compaction/01-basic-idea.svg
@@ -0,0 +1,135 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xl="http://www.w3.org/1999/xlink" xmlns:dc="http://purl.org/dc/elements/1.1/" version="1.1" viewBox="82 284 863 375" width="863" height="375">
+  <defs/>
+  <g id="01-basic-idea" stroke-opacity="1" stroke-dasharray="none" stroke="none" fill="none" fill-opacity="1">
+    <title>01-basic-idea</title>
+    <rect fill="white" x="82" y="284" width="863" height="375"/>
+    <g id="01-basic-idea_Layer_1">
+      <title>Layer 1</title>
+      <g id="Graphic_2">
+        <rect x="234" y="379.5" width="203.5" height="17.5" fill="white"/>
+        <rect x="234" y="379.5" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_3">
+        <rect x="453.5" y="379.5" width="203.5" height="17.5" fill="white"/>
+        <rect x="453.5" y="379.5" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_4">
+        <rect x="672.5" y="379.5" width="203.5" height="17.5" fill="white"/>
+        <rect x="672.5" y="379.5" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_5">
+        <rect x="234" y="288.5" width="127" height="77.5" fill="white"/>
+        <rect x="234" y="288.5" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_6">
+        <rect x="375" y="288.5" width="127" height="77.5" fill="white"/>
+        <rect x="375" y="288.5" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_7">
+        <rect x="516" y="288.5" width="127" height="77.5" fill="white"/>
+        <rect x="516" y="288.5" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_8">
+        <rect x="657" y="288.5" width="127" height="77.5" fill="white"/>
+        <rect x="657" y="288.5" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_9">
+        <rect x="798" y="288.5" width="78" height="77.5" fill="white"/>
+        <rect x="798" y="288.5" width="78" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_11">
+        <line x1="185.5" y1="326.75" x2="943.7734" y2="326.75" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_12">
+        <text transform="translate(87 318.026)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_13">
+        <text transform="translate(106.41 372.886)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="8.39" y="15" xml:space="preserve">Images </tspan>
+          <tspan font-family="Helvetica Neue" font-size="10" fill="black" x="29132252e-19" y="28.447998" xml:space="preserve">at earlier LSN</tspan>
+        </text>
+      </g>
+      <g id="Graphic_14">
+        <text transform="translate(121.92 289.578)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="8739676e-19" y="15" xml:space="preserve">Deltas</tspan>
+        </text>
+      </g>
+      <g id="Graphic_15">
+        <path d="M 517.125 423.5 L 553.375 423.5 L 553.375 482 L 571.5 482 L 535.25 512 L 499 482 L 517.125 482 Z" fill="white"/>
+        <path d="M 517.125 423.5 L 553.375 423.5 L 553.375 482 L 571.5 482 L 535.25 512 L 499 482 L 517.125 482 Z" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_26">
+        <rect x="234" y="599.474" width="203.5" height="17.5" fill="white"/>
+        <rect x="234" y="599.474" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_25">
+        <rect x="453.5" y="599.474" width="203.5" height="17.5" fill="white"/>
+        <rect x="453.5" y="599.474" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_24">
+        <rect x="672.5" y="599.474" width="203.5" height="17.5" fill="white"/>
+        <rect x="672.5" y="599.474" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_23">
+        <rect x="234" y="533" width="127" height="52.974" fill="white"/>
+        <rect x="234" y="533" width="127" height="52.974" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_22">
+        <rect x="375" y="533" width="310.5" height="52.974" fill="white"/>
+        <rect x="375" y="533" width="310.5" height="52.974" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_21">
+        <rect x="702.5" y="533" width="173.5" height="52.974" fill="white"/>
+        <rect x="702.5" y="533" width="173.5" height="52.974" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_18">
+        <line x1="185.5" y1="607.724" x2="943.7734" y2="607.724" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_16">
+        <text transform="translate(121.92 538)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="8739676e-19" y="15" xml:space="preserve">Deltas</tspan>
+        </text>
+      </g>
+      <g id="Graphic_27">
+        <text transform="translate(114.8 592.86)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="3488765e-18" y="15" xml:space="preserve">Images </tspan>
+          <tspan font-family="Helvetica Neue" font-size="10" fill="black" x="4.01" y="28.447998" xml:space="preserve">at GC LSN</tspan>
+        </text>
+      </g>
+      <g id="Graphic_28">
+        <rect x="243.06836" y="300" width="624.3633" height="17.5" fill="#c0ffc0"/>
+        <text transform="translate(248.06836 301.068)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="13" fill="black" x="233.52364" y="12" xml:space="preserve">Deltas above GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_30">
+        <rect x="243.06836" y="335.5" width="624.3633" height="17.5" fill="#c0ffff"/>
+        <text transform="translate(248.06836 336.568)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="13" fill="black" x="233.89414" y="12" xml:space="preserve">Deltas below GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_32">
+        <rect x="243.06836" y="550.737" width="624.3633" height="17.5" fill="#c0ffc0"/>
+        <text transform="translate(248.06836 551.805)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="13" fill="black" x="233.52364" y="12" xml:space="preserve">Deltas above GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_33">
+        <rect x="304" y="630.474" width="485.5" height="28.447998" fill="#c0ffff"/>
+        <text transform="translate(309 637.016)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="13" fill="black" x="63.095" y="12" xml:space="preserve">Deltas and image below GC Horizon gets garbage-collected</tspan>
+        </text>
+      </g>
+      <g id="Graphic_34">
+        <text transform="translate(576.5 444.0325)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="12" fill="black" x="0" y="11" xml:space="preserve">WAL replay of deltas+image below GC Horizon</tspan>
+          <tspan font-family="Helvetica Neue" font-size="12" fill="black" x="0" y="25.336" xml:space="preserve">Reshuffle deltas</tspan>
+        </text>
+      </g>
+    </g>
+  </g>
+</svg>
diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/03-retain-lsn.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/03-retain-lsn.svg
new file mode 100644
index 0000000000..792db6d69e
--- /dev/null
+++ b/docs/rfcs/images/036-bottom-most-gc-compaction/03-retain-lsn.svg
@@ -0,0 +1,141 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xl="http://www.w3.org/1999/xlink" xmlns:dc="http://purl.org/dc/elements/1.1/" version="1.1" viewBox="-104 215 863 335" width="863" height="335">
+  <defs>
+    <marker orient="auto" overflow="visible" markerUnits="strokeWidth" id="FilledArrow_Marker" stroke-linejoin="miter" stroke-miterlimit="10" viewBox="-1 -4 10 8" markerWidth="10" markerHeight="8" color="#7f8080">
+      <g>
+        <path d="M 8 0 L 0 -3 L 0 3 Z" fill="currentColor" stroke="currentColor" stroke-width="1"/>
+      </g>
+    </marker>
+  </defs>
+  <g id="03-retain-lsn" stroke-opacity="1" stroke-dasharray="none" stroke="none" fill="none" fill-opacity="1">
+    <title>03-retain-lsn</title>
+    <rect fill="white" x="-104" y="215" width="863" height="335"/>
+    <g id="03-retain-lsn_Layer_1">
+      <title>Layer 1</title>
+      <g id="Graphic_28">
+        <rect x="48" y="477" width="203.5" height="9.990005" fill="white"/>
+        <rect x="48" y="477" width="203.5" height="9.990005" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_27">
+        <rect x="267.5" y="477" width="203.5" height="9.990005" fill="white"/>
+        <rect x="267.5" y="477" width="203.5" height="9.990005" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_26">
+        <rect x="486.5" y="477" width="203.5" height="9.990005" fill="white"/>
+        <rect x="486.5" y="477" width="203.5" height="9.990005" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_20">
+        <line x1="-.5" y1="387.172" x2="757.7734" y2="387.172" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_19">
+        <text transform="translate(-99 378.448)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_31">
+        <rect x="48.25" y="410" width="203.5" height="9.990005" fill="white"/>
+        <rect x="48.25" y="410" width="203.5" height="9.990005" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_30">
+        <rect x="267.75" y="410" width="203.5" height="9.990005" fill="white"/>
+        <rect x="267.75" y="410" width="203.5" height="9.990005" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_29">
+        <rect x="486.75" y="410" width="203.5" height="9.990005" fill="white"/>
+        <rect x="486.75" y="410" width="203.5" height="9.990005" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_34">
+        <rect x="48.25" y="431.495" width="113.75" height="34" fill="white"/>
+        <rect x="48.25" y="431.495" width="113.75" height="34" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_33">
+        <rect x="172.5" y="431.495" width="203.5" height="34" fill="white"/>
+        <rect x="172.5" y="431.495" width="203.5" height="34" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_32">
+        <rect x="386.5" y="431.495" width="303.5" height="34" fill="white"/>
+        <rect x="386.5" y="431.495" width="303.5" height="34" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_37">
+        <rect x="48" y="498.495" width="203.5" height="9.990005" fill="white"/>
+        <rect x="48" y="498.495" width="203.5" height="9.990005" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_36">
+        <rect x="267.5" y="498.495" width="203.5" height="9.990005" fill="white"/>
+        <rect x="267.5" y="498.495" width="203.5" height="9.990005" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_35">
+        <rect x="486.5" y="498.495" width="203.5" height="9.990005" fill="white"/>
+        <rect x="486.5" y="498.495" width="203.5" height="9.990005" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_38">
+        <line x1="-10.48" y1="535.5395" x2="39.318294" y2="508.24794" marker-end="url(#FilledArrow_Marker)" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_39">
+        <text transform="translate(-96.984 526.3155)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="40500936e-20" y="15" xml:space="preserve">retain_lsn 1</tspan>
+        </text>
+      </g>
+      <g id="Line_41">
+        <line x1="-10.48" y1="507.0915" x2="38.90236" y2="485.8992" marker-end="url(#FilledArrow_Marker)" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_40">
+        <text transform="translate(-96.984 497.8675)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="40500936e-20" y="15" xml:space="preserve">retain_lsn 2</tspan>
+        </text>
+      </g>
+      <g id="Line_43">
+        <line x1="-10.48" y1="478.6435" x2="39.44267" y2="453.01616" marker-end="url(#FilledArrow_Marker)" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_42">
+        <text transform="translate(-96.984 469.4195)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="40500936e-20" y="15" xml:space="preserve">retain_lsn 3</tspan>
+        </text>
+      </g>
+      <g id="Line_45">
+        <line x1="-10.48" y1="448.495" x2="39.65061" y2="419.90015" marker-end="url(#FilledArrow_Marker)" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_44">
+        <text transform="translate(-96.984 439.271)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="40500936e-20" y="15" xml:space="preserve">retain_lsn 4</tspan>
+        </text>
+      </g>
+      <g id="Graphic_46">
+        <rect x="335.46477" y="215.5" width="353.4299" height="125.495" fill="white"/>
+        <rect x="335.46477" y="215.5" width="353.4299" height="125.495" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_48">
+        <text transform="translate(549.3766 317.547)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="6536993e-19" y="15" xml:space="preserve">Dependent Branch</tspan>
+        </text>
+      </g>
+      <g id="Graphic_50">
+        <text transform="translate(340.43824 317.547)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="40500936e-20" y="15" xml:space="preserve">retain_lsn 3</tspan>
+        </text>
+      </g>
+      <g id="Line_57">
+        <line x1="323.90685" y1="248.8045" x2="714.9232" y2="248.8045" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_56">
+        <text transform="translate(165.91346 240.0805)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="35811354e-19" y="15" xml:space="preserve">Branch GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_58">
+        <rect x="493.9232" y="301.6405" width="107.45294" height="9.990005" fill="white"/>
+        <rect x="493.9232" y="301.6405" width="107.45294" height="9.990005" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_59">
+        <text transform="translate(358.9232 277.276)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">Partial Image Coverage</tspan>
+        </text>
+      </g>
+      <g id="Graphic_60">
+        <rect x="354.1732" y="301.6405" width="107.45294" height="9.990005" fill="white"/>
+        <rect x="354.1732" y="301.6405" width="107.45294" height="9.990005" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+    </g>
+  </g>
+</svg>
diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/05-btmgc-parent.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/05-btmgc-parent.svg
new file mode 100644
index 0000000000..9593ed969e
--- /dev/null
+++ b/docs/rfcs/images/036-bottom-most-gc-compaction/05-btmgc-parent.svg
@@ -0,0 +1,187 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xl="http://www.w3.org/1999/xlink" xmlns:dc="http://purl.org/dc/elements/1.1/" version="1.1" viewBox="-235 426 864 366" width="864" height="366">
+  <defs/>
+  <g id="05-btmgc-parent" stroke-opacity="1" stroke-dasharray="none" stroke="none" fill="none" fill-opacity="1">
+    <title>05-btmgc-parent</title>
+    <rect fill="white" x="-235" y="426" width="864" height="366"/>
+    <g id="05-btmgc-parent_Layer_1">
+      <title>Layer 1</title>
+      <g id="Graphic_23">
+        <rect x="-83" y="510.15" width="203.5" height="26.391998" fill="white"/>
+        <rect x="-83" y="510.15" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-78 516.178)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="12" fill="black" x="51.714" y="11" xml:space="preserve">Append C@0x30</tspan>
+        </text>
+      </g>
+      <g id="Graphic_22">
+        <rect x="136.5" y="510.15" width="203.5" height="26.391998" fill="white"/>
+        <rect x="136.5" y="510.15" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_21">
+        <rect x="355.5" y="510.15" width="203.5" height="26.391998" fill="white"/>
+        <rect x="355.5" y="510.15" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_20">
+        <line x1="-100.448" y1="459.224" x2="626.77344" y2="459.224" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_19">
+        <text transform="translate(-230 450.5)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_18">
+        <rect x="-82.75" y="426.748" width="203.5" height="26.391998" fill="white"/>
+        <rect x="-82.75" y="426.748" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-77.75 432.776)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="12" fill="black" x="52.602" y="11" xml:space="preserve">Append F@0x60</tspan>
+        </text>
+      </g>
+      <g id="Graphic_17">
+        <rect x="136.75" y="426.748" width="203.5" height="26.391998" fill="white"/>
+        <rect x="136.75" y="426.748" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_16">
+        <rect x="355.75" y="426.748" width="203.5" height="26.391998" fill="white"/>
+        <rect x="355.75" y="426.748" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_15">
+        <rect x="-82.75" y="464.645" width="113.75" height="34" fill="white"/>
+        <rect x="-82.75" y="464.645" width="113.75" height="34" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-77.75 467.309)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="12" fill="black" x="7.505" y="11" xml:space="preserve">Append E@0x50</tspan>
+          <tspan font-family="Helvetica Neue" font-size="12" fill="black" x="6.947" y="25.336" xml:space="preserve">Append D@0x40</tspan>
+        </text>
+      </g>
+      <g id="Graphic_14">
+        <rect x="41.5" y="464.645" width="203.5" height="34" fill="white"/>
+        <rect x="41.5" y="464.645" width="203.5" height="34" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_13">
+        <rect x="255.5" y="464.645" width="303.5" height="34" fill="white"/>
+        <rect x="255.5" y="464.645" width="303.5" height="34" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_12">
+        <rect x="-83" y="548.047" width="203.5" height="26.391998" fill="white"/>
+        <rect x="-83" y="548.047" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-78 554.075)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="12" fill="black" x="26.796" y="11" xml:space="preserve">A@0x10, Append B@0x20</tspan>
+        </text>
+      </g>
+      <g id="Graphic_11">
+        <rect x="136.5" y="548.047" width="203.5" height="26.391998" fill="white"/>
+        <rect x="136.5" y="548.047" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_10">
+        <rect x="355.5" y="548.047" width="203.5" height="26.391998" fill="white"/>
+        <rect x="355.5" y="548.047" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_24">
+        <line x1="-104" y1="542" x2="610.5" y2="542" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4.0,4.0" stroke-width="1"/>
+      </g>
+      <g id="Graphic_25">
+        <text transform="translate(-139.604 534.5)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x20</tspan>
+        </text>
+      </g>
+      <g id="Graphic_28">
+        <text transform="translate(-139.604 452.556)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x50</tspan>
+        </text>
+      </g>
+      <g id="Line_30">
+        <line x1="-100.448" y1="481.145" x2="614.052" y2="481.145" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4.0,4.0" stroke-width="1"/>
+      </g>
+      <g id="Graphic_29">
+        <text transform="translate(-139.604 473.449)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x40</tspan>
+        </text>
+      </g>
+      <g id="Line_48">
+        <line x1="-99.448" y1="701.513" x2="627.77344" y2="701.513" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_47">
+        <text transform="translate(-229 692.789)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_46">
+        <rect x="-81.75" y="670.496" width="113.75" height="26.391998" fill="white"/>
+        <rect x="-81.75" y="670.496" width="113.75" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-76.75 676.524)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="12" fill="black" x="7.727" y="11" xml:space="preserve">Append F@0x60</tspan>
+        </text>
+      </g>
+      <g id="Graphic_43">
+        <rect x="-81.75" y="708.393" width="113.75" height="34" fill="white"/>
+        <rect x="-81.75" y="708.393" width="113.75" height="34" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-76.75 718.225)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="12" fill="black" x="7.505" y="11" xml:space="preserve">Append E@0x50</tspan>
+        </text>
+      </g>
+      <g id="Line_37">
+        <line x1="-101" y1="777.2665" x2="613.5" y2="777.2665" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4.0,4.0" stroke-width="1"/>
+      </g>
+      <g id="Graphic_36">
+        <text transform="translate(-138.604 769.7665)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x20</tspan>
+        </text>
+      </g>
+      <g id="Graphic_33">
+        <text transform="translate(-138.604 694.845)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x50</tspan>
+        </text>
+      </g>
+      <g id="Line_32">
+        <line x1="-99.448" y1="755.089" x2="615.052" y2="755.089" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4.0,4.0" stroke-width="1"/>
+      </g>
+      <g id="Graphic_31">
+        <text transform="translate(-138.604 747.393)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x40</tspan>
+        </text>
+      </g>
+      <g id="Graphic_40">
+        <rect x="-82" y="770.909" width="203.5" height="14.107002" fill="white"/>
+        <rect x="-82" y="770.909" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-77 770.7945)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="12" fill="black" x="70.836" y="11" xml:space="preserve">AB@0x20</tspan>
+        </text>
+      </g>
+      <g id="Graphic_39">
+        <rect x="137.5" y="770.909" width="203.5" height="14.107002" fill="white"/>
+        <rect x="137.5" y="770.909" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_38">
+        <rect x="356.5" y="770.909" width="203.5" height="14.107002" fill="white"/>
+        <rect x="356.5" y="770.909" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_54">
+        <rect x="-81.75" y="748.5355" width="203.5" height="14.107002" fill="white"/>
+        <rect x="-81.75" y="748.5355" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-76.75 748.421)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="12" fill="black" x="62.28" y="11" xml:space="preserve">ABCD@0x40</tspan>
+        </text>
+      </g>
+      <g id="Graphic_53">
+        <rect x="137.75" y="748.5355" width="203.5" height="14.107002" fill="white"/>
+        <rect x="137.75" y="748.5355" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_52">
+        <rect x="356.75" y="748.5355" width="203.5" height="14.107002" fill="white"/>
+        <rect x="356.75" y="748.5355" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_57">
+        <path d="M 211.32422 585 L 265.17578 585 L 265.17578 611.332 L 287.84375 611.332 L 238.25 633.117 L 188.65625 611.332 L 211.32422 611.332 Z" fill="white"/>
+        <path d="M 211.32422 585 L 265.17578 585 L 265.17578 611.332 L 287.84375 611.332 L 238.25 633.117 L 188.65625 611.332 L 211.32422 611.332 Z" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_60">
+        <rect x="359" y="692.858" width="203.5" height="14.107002" fill="white"/>
+        <rect x="359" y="692.858" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_59">
+        <rect x="41.5" y="693.858" width="303" height="14.107002" fill="white"/>
+        <rect x="41.5" y="693.858" width="303" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+    </g>
+  </g>
+</svg>
diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/06-btmgc-child.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/06-btmgc-child.svg
new file mode 100644
index 0000000000..b8a93d5b5f
--- /dev/null
+++ b/docs/rfcs/images/036-bottom-most-gc-compaction/06-btmgc-child.svg
@@ -0,0 +1,184 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xl="http://www.w3.org/1999/xlink" xmlns:dc="http://purl.org/dc/elements/1.1/" version="1.1" viewBox="-413 471 931 354" width="931" height="354">
+  <defs/>
+  <g id="06-btmgc-child" stroke-opacity="1" stroke-dasharray="none" stroke="none" fill="none" fill-opacity="1">
+    <title>06-btmgc-child</title>
+    <rect fill="white" x="-413" y="471" width="931" height="354"/>
+    <g id="06-btmgc-child_Layer_1">
+      <title>Layer 1</title>
+      <g id="Graphic_47">
+        <rect x="-412" y="594.402" width="928" height="28.447998" fill="white"/>
+        <rect x="-412" y="594.402" width="928" height="28.447998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_46">
+        <rect x="-205" y="555.552" width="203.5" height="26.391998" fill="white"/>
+        <rect x="-205" y="555.552" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-200 561.58)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="12" fill="black" x="52.158" y="11" xml:space="preserve">Append P@0x30</tspan>
+        </text>
+      </g>
+      <g id="Graphic_45">
+        <rect x="14.5" y="555.552" width="203.5" height="26.391998" fill="white"/>
+        <rect x="14.5" y="555.552" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_44">
+        <rect x="233.5" y="555.552" width="203.5" height="26.391998" fill="white"/>
+        <rect x="233.5" y="555.552" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_43">
+        <line x1="-222.448" y1="504.724" x2="504.77344" y2="504.724" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_42">
+        <text transform="translate(-352 496)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_41">
+        <rect x="-204.75" y="472.15" width="203.5" height="26.391998" fill="white"/>
+        <rect x="-204.75" y="472.15" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-199.75 478.178)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="12" fill="black" x="52.158" y="11" xml:space="preserve">Append S@0x60</tspan>
+        </text>
+      </g>
+      <g id="Graphic_40">
+        <rect x="14.75" y="472.15" width="203.5" height="26.391998" fill="white"/>
+        <rect x="14.75" y="472.15" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_39">
+        <rect x="233.75" y="472.15" width="203.5" height="26.391998" fill="white"/>
+        <rect x="233.75" y="472.15" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_38">
+        <rect x="-204.75" y="510.047" width="113.75" height="34" fill="white"/>
+        <rect x="-204.75" y="510.047" width="113.75" height="34" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-199.75 512.711)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="12" fill="black" x="7.061" y="11" xml:space="preserve">Append R@0x50</tspan>
+          <tspan font-family="Helvetica Neue" font-size="12" fill="black" x="6.611" y="25.336" xml:space="preserve">Append Q@0x40</tspan>
+        </text>
+      </g>
+      <g id="Graphic_37">
+        <rect x="-80.5" y="510.047" width="203.5" height="34" fill="white"/>
+        <rect x="-80.5" y="510.047" width="203.5" height="34" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_36">
+        <rect x="133.5" y="510.047" width="303.5" height="34" fill="white"/>
+        <rect x="133.5" y="510.047" width="303.5" height="34" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_33">
+        <text transform="translate(-261.604 498.056)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x50</tspan>
+        </text>
+      </g>
+      <g id="Line_30">
+        <line x1="-224" y1="607.9115" x2="490.5" y2="607.9115" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4.0,4.0" stroke-width="1"/>
+      </g>
+      <g id="Graphic_29">
+        <text transform="translate(-261.604 600.4115)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x20</tspan>
+        </text>
+      </g>
+      <g id="Graphic_28">
+        <rect x="-205" y="601.554" width="203.5" height="14.107002" fill="white"/>
+        <rect x="-205" y="601.554" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-200 601.4395)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="12" fill="black" x="70.836" y="11" xml:space="preserve">AB@0x20</tspan>
+        </text>
+      </g>
+      <g id="Graphic_27">
+        <rect x="14.5" y="601.554" width="203.5" height="14.107002" fill="white"/>
+        <rect x="14.5" y="601.554" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_26">
+        <rect x="233.5" y="601.554" width="203.5" height="14.107002" fill="white"/>
+        <rect x="233.5" y="601.554" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_25">
+        <text transform="translate(-407 599.1875)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">Ancestor Branch</tspan>
+        </text>
+      </g>
+      <g id="Graphic_24">
+        <rect x="-411" y="795.46" width="928" height="28.447998" fill="white"/>
+        <rect x="-411" y="795.46" width="928" height="28.447998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_20">
+        <line x1="-221.448" y1="755.528" x2="505.77344" y2="755.528" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_19">
+        <text transform="translate(-351 746.804)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_18">
+        <rect x="-203.75" y="723.579" width="203.25" height="26.391998" fill="white"/>
+        <rect x="-203.75" y="723.579" width="203.25" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-198.75 729.607)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="12" fill="black" x="52.033" y="11" xml:space="preserve">Append S@0x60</tspan>
+        </text>
+      </g>
+      <g id="Graphic_10">
+        <text transform="translate(-260.604 748.86)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x50</tspan>
+        </text>
+      </g>
+      <g id="Line_7">
+        <line x1="-223" y1="808.9695" x2="491.5" y2="808.9695" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4.0,4.0" stroke-width="1"/>
+      </g>
+      <g id="Graphic_6">
+        <text transform="translate(-260.604 801.4695)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x20</tspan>
+        </text>
+      </g>
+      <g id="Graphic_5">
+        <rect x="-204" y="802.612" width="203.5" height="14.107002" fill="white"/>
+        <rect x="-204" y="802.612" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-199 802.4975)" fill="#b1001c">
+          <tspan font-family="Helvetica Neue" font-size="12" fill="#b1001c" x="70.836" y="11" xml:space="preserve">AB</tspan>
+          <tspan font-family="Helvetica Neue" font-size="12" fill="black" y="11" xml:space="preserve">@0x20</tspan>
+        </text>
+      </g>
+      <g id="Graphic_4">
+        <rect x="15.5" y="802.612" width="203.5" height="14.107002" fill="white"/>
+        <rect x="15.5" y="802.612" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_3">
+        <rect x="234.5" y="802.612" width="203.5" height="14.107002" fill="white"/>
+        <rect x="234.5" y="802.612" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_2">
+        <text transform="translate(-406 800.2455)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">Ancestor Branch</tspan>
+        </text>
+      </g>
+      <g id="Graphic_48">
+        <path d="M 89.32422 639.081 L 143.17578 639.081 L 143.17578 665.413 L 165.84375 665.413 L 116.25 687.198 L 66.65625 665.413 L 89.32422 665.413 Z" fill="white"/>
+        <path d="M 89.32422 639.081 L 143.17578 639.081 L 143.17578 665.413 L 165.84375 665.413 L 116.25 687.198 L 66.65625 665.413 L 89.32422 665.413 Z" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_49">
+        <rect x="-204" y="762.428" width="203.5" height="26.391998" fill="white"/>
+        <rect x="-204" y="762.428" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-199 768.456)" fill="#b1001c">
+          <tspan font-family="Helvetica Neue" font-size="12" fill="#b1001c" x="58.278" y="11" xml:space="preserve">AB</tspan>
+          <tspan font-family="Helvetica Neue" font-size="12" fill="black" y="11" xml:space="preserve">PQR@0x50</tspan>
+        </text>
+      </g>
+      <g id="Graphic_59">
+        <rect x="14.5" y="723.579" width="203.5" height="26.391998" fill="white"/>
+        <rect x="14.5" y="723.579" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_58">
+        <rect x="233.5" y="723.579" width="203.5" height="26.391998" fill="white"/>
+        <rect x="233.5" y="723.579" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_63">
+        <rect x="9" y="762.085" width="203.5" height="26.391998" fill="white"/>
+        <rect x="9" y="762.085" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_62">
+        <rect x="225" y="762.085" width="213" height="26.391998" fill="white"/>
+        <rect x="225" y="762.085" width="213" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+    </g>
+  </g>
+</svg>
diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/07-btmgc-analysis-1.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/07-btmgc-analysis-1.svg
new file mode 100644
index 0000000000..65034226da
--- /dev/null
+++ b/docs/rfcs/images/036-bottom-most-gc-compaction/07-btmgc-analysis-1.svg
@@ -0,0 +1,180 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xl="http://www.w3.org/1999/xlink" xmlns:dc="http://purl.org/dc/elements/1.1/" version="1.1" viewBox="-556 476 923 411" width="923" height="411">
+  <defs/>
+  <g id="07-btmgc-analysis-1" stroke-opacity="1" stroke-dasharray="none" stroke="none" fill="none" fill-opacity="1">
+    <title>07-btmgc-analysis-1</title>
+    <rect fill="white" x="-556" y="476" width="923" height="411"/>
+    <g id="07-btmgc-analysis-1_Layer_1">
+      <title>Layer 1</title>
+      <g id="Graphic_85">
+        <rect x="-404" y="609.062" width="203.5" height="17.5" fill="white"/>
+        <rect x="-404" y="609.062" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_84">
+        <rect x="-184.5" y="609.062" width="203.5" height="17.5" fill="white"/>
+        <rect x="-184.5" y="609.062" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_83">
+        <rect x="34.5" y="609.062" width="203.5" height="17.5" fill="white"/>
+        <rect x="34.5" y="609.062" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_82">
+        <rect x="-404" y="479.922" width="127" height="77.5" fill="white"/>
+        <rect x="-404" y="479.922" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_81">
+        <rect x="-263" y="479.922" width="127" height="77.5" fill="white"/>
+        <rect x="-263" y="479.922" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_80">
+        <rect x="-122" y="479.922" width="127" height="77.5" fill="white"/>
+        <rect x="-122" y="479.922" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_79">
+        <rect x="19" y="479.922" width="127" height="77.5" fill="white"/>
+        <rect x="19" y="479.922" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_78">
+        <rect x="160" y="479.922" width="78" height="77.5" fill="white"/>
+        <rect x="160" y="479.922" width="78" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_77">
+        <line x1="-452.5" y1="518.172" x2="251" y2="518.172" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_76">
+        <text transform="translate(-551 509.448)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_75">
+        <text transform="translate(-531.59 602.448)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="8.39" y="15" xml:space="preserve">Images </tspan>
+          <tspan font-family="Helvetica Neue" font-size="10" fill="black" x="29132252e-19" y="28.447998" xml:space="preserve">at earlier LSN</tspan>
+        </text>
+      </g>
+      <g id="Graphic_74">
+        <text transform="translate(-516.08 481)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="8739676e-19" y="15" xml:space="preserve">Deltas</tspan>
+        </text>
+      </g>
+      <g id="Graphic_73">
+        <path d="M -120.675 651.5 L -84.425 651.5 L -84.425 710 L -66.3 710 L -102.55 740 L -138.8 710 L -120.675 710 Z" fill="white"/>
+        <path d="M -120.675 651.5 L -84.425 651.5 L -84.425 710 L -66.3 710 L -102.55 740 L -138.8 710 L -120.675 710 Z" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_72">
+        <rect x="-403.8" y="827.474" width="203.5" height="17.5" fill="white"/>
+        <rect x="-403.8" y="827.474" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_71">
+        <rect x="-184.3" y="827.474" width="203.5" height="17.5" fill="white"/>
+        <rect x="-184.3" y="827.474" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_70">
+        <rect x="34.7" y="827.474" width="203.5" height="17.5" fill="white"/>
+        <rect x="34.7" y="827.474" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_69">
+        <rect x="-403.8" y="761" width="127" height="52.974" fill="white"/>
+        <rect x="-403.8" y="761" width="127" height="52.974" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_68">
+        <rect x="-262.8" y="761" width="310.5" height="52.974" fill="white"/>
+        <rect x="-262.8" y="761" width="310.5" height="52.974" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_67">
+        <rect x="64.7" y="761" width="173.5" height="52.974" fill="white"/>
+        <rect x="64.7" y="761" width="173.5" height="52.974" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_66">
+        <line x1="-452.3" y1="835.724" x2="251.2" y2="835.724" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_65">
+        <text transform="translate(-515.88 766)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="8739676e-19" y="15" xml:space="preserve">Deltas</tspan>
+        </text>
+      </g>
+      <g id="Graphic_64">
+        <text transform="translate(-523 820.86)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="3488765e-18" y="15" xml:space="preserve">Images </tspan>
+          <tspan font-family="Helvetica Neue" font-size="10" fill="black" x="4.01" y="28.447998" xml:space="preserve">at GC LSN</tspan>
+        </text>
+      </g>
+      <g id="Graphic_63">
+        <rect x="-394.93164" y="491.422" width="624.3633" height="17.5" fill="#c0ffc0"/>
+        <text transform="translate(-389.93164 492.49)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="13" fill="black" x="233.52364" y="12" xml:space="preserve">Deltas above GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_62">
+        <rect x="-394.93164" y="526.922" width="624.3633" height="17.5" fill="#c0ffff"/>
+        <text transform="translate(-389.93164 527.99)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="13" fill="black" x="233.89414" y="12" xml:space="preserve">Deltas below GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_61">
+        <rect x="-394.73164" y="778.737" width="624.3633" height="17.5" fill="#c0ffc0"/>
+        <text transform="translate(-389.73164 779.805)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="13" fill="black" x="233.52364" y="12" xml:space="preserve">Deltas above GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_60">
+        <rect x="-333.8" y="858.474" width="485.5" height="28.447998" fill="#c0ffff"/>
+        <text transform="translate(-328.8 865.016)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="13" fill="black" x="63.095" y="12" xml:space="preserve">Deltas and image below GC Horizon gets garbage-collected</tspan>
+        </text>
+      </g>
+      <g id="Graphic_86">
+        <text transform="translate(263 499.724)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="32" fill="black" x="0" y="30" xml:space="preserve">size=A</tspan>
+        </text>
+      </g>
+      <g id="Line_87">
+        <line x1="260.87012" y1="479.068" x2="360.71387" y2="479.068" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_88">
+        <line x1="260.87012" y1="561" x2="360.71387" y2="561" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_89">
+        <rect x="-403.8" y="569" width="161.8" height="28.447998" fill="white"/>
+        <rect x="-403.8" y="569" width="161.8" height="28.447998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_90">
+        <rect x="-229.5" y="569.018" width="277.2" height="28.447998" fill="white"/>
+        <rect x="-229.5" y="569.018" width="277.2" height="28.447998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_91">
+        <rect x="64.7" y="569.018" width="173.5" height="28.447998" fill="white"/>
+        <rect x="64.7" y="569.018" width="173.5" height="28.447998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_92">
+        <line x1="262" y1="602" x2="361.84375" y2="602" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_93">
+        <line x1="263" y1="625.562" x2="362.84375" y2="625.562" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_94">
+        <text transform="translate(264.53787 562.276)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="32" fill="black" x="14210855e-21" y="30" xml:space="preserve">size=B</tspan>
+        </text>
+      </g>
+      <g id="Graphic_95">
+        <text transform="translate(285.12 599.5)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="24" fill="black" x="0" y="23" xml:space="preserve">size=C</tspan>
+        </text>
+      </g>
+      <g id="Graphic_98">
+        <text transform="translate(264.53787 773.772)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="26" fill="black" x="8881784e-19" y="25" xml:space="preserve">A</tspan>
+          <tspan font-family="Lucida Grande" font-size="26" fill="black" y="25" xml:space="preserve">↓</tspan>
+        </text>
+      </g>
+      <g id="Graphic_97">
+        <text transform="translate(265.87013 815.5)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="26" fill="black" x="6536993e-19" y="25" xml:space="preserve">B</tspan>
+          <tspan font-family="Lucida Grande" font-size="26" fill="black" y="25" xml:space="preserve">↓</tspan>
+        </text>
+      </g>
+    </g>
+  </g>
+</svg>
diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/08-optimization.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/08-optimization.svg
new file mode 100644
index 0000000000..16a17ec56e
--- /dev/null
+++ b/docs/rfcs/images/036-bottom-most-gc-compaction/08-optimization.svg
@@ -0,0 +1,158 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xl="http://www.w3.org/1999/xlink" xmlns:dc="http://purl.org/dc/elements/1.1/" version="1.1" viewBox="-235 406 586 424" width="586" height="424">
+  <defs/>
+  <g id="08-optimization" stroke-opacity="1" stroke-dasharray="none" stroke="none" fill="none" fill-opacity="1">
+    <title>08-optimization</title>
+    <rect fill="white" x="-235" y="406" width="586" height="424"/>
+    <g id="08-optimization_Layer_1">
+      <title>Layer 1</title>
+      <g id="Graphic_22">
+        <rect x="-100.448" y="509.902" width="203.5" height="26.391998" fill="white"/>
+        <rect x="-100.448" y="509.902" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_21">
+        <rect x="118.552" y="509.902" width="203.5" height="26.391998" fill="white"/>
+        <rect x="118.552" y="509.902" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_20">
+        <line x1="-101.79572" y1="420.322" x2="349.5" y2="420.322" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_19">
+        <text transform="translate(-230 411.598)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_17">
+        <rect x="-100.198" y="426.5" width="203.5" height="26.391998" fill="white"/>
+        <rect x="-100.198" y="426.5" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_16">
+        <rect x="118.802" y="426.5" width="203.5" height="26.391998" fill="white"/>
+        <rect x="118.802" y="426.5" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_14">
+        <rect x="-100.198" y="464.397" width="108.25" height="34" fill="white"/>
+        <rect x="-100.198" y="464.397" width="108.25" height="34" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_13">
+        <rect x="18.552" y="464.397" width="303.5" height="34" fill="white"/>
+        <rect x="18.552" y="464.397" width="303.5" height="34" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_11">
+        <rect x="-100.448" y="547.799" width="203.5" height="26.391998" fill="white"/>
+        <rect x="-100.448" y="547.799" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_10">
+        <rect x="118.552" y="547.799" width="203.5" height="26.391998" fill="white"/>
+        <rect x="118.552" y="547.799" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_24">
+        <line x1="-104" y1="542" x2="339.4011" y2="542" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4.0,4.0" stroke-width="1"/>
+      </g>
+      <g id="Graphic_25">
+        <text transform="translate(-139.604 534.5)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x20</tspan>
+        </text>
+      </g>
+      <g id="Line_27">
+        <line x1="-101.79572" y1="459.098" x2="341.6054" y2="459.098" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4.0,4.0" stroke-width="1"/>
+      </g>
+      <g id="Graphic_26">
+        <text transform="translate(-139.604 451.402)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x50</tspan>
+        </text>
+      </g>
+      <g id="Graphic_28">
+        <text transform="translate(-139.604 413.654)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x60</tspan>
+        </text>
+      </g>
+      <g id="Line_30">
+        <line x1="-101.79572" y1="481.145" x2="341.6054" y2="481.145" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4.0,4.0" stroke-width="1"/>
+      </g>
+      <g id="Graphic_29">
+        <text transform="translate(-139.604 473.449)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x40</tspan>
+        </text>
+      </g>
+      <g id="Graphic_77">
+        <rect x="-100.448" y="765.19595" width="203.5" height="26.391998" fill="white"/>
+        <rect x="-100.448" y="765.19595" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_76">
+        <rect x="118.552" y="765.19595" width="203.5" height="26.391998" fill="white"/>
+        <rect x="118.552" y="765.19595" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_75">
+        <line x1="-101.79572" y1="637.317" x2="349.5" y2="637.317" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_74">
+        <text transform="translate(-230 628.593)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_73">
+        <rect x="-100.198" y="681.794" width="203.5" height="26.391998" fill="white"/>
+        <rect x="-100.198" y="681.794" width="203.5" height="26.391998" stroke="#b1001c" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_72">
+        <rect x="118.802" y="681.794" width="203.5" height="26.391998" fill="white"/>
+        <rect x="118.802" y="681.794" width="203.5" height="26.391998" stroke="#b1001c" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_71">
+        <rect x="-100.198" y="719.69096" width="108.25" height="34" fill="white"/>
+        <rect x="-100.198" y="719.69096" width="108.25" height="34" stroke="#b1001c" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_70">
+        <rect x="18.552" y="719.69096" width="303.5" height="34" fill="white"/>
+        <rect x="18.552" y="719.69096" width="303.5" height="34" stroke="#b1001c" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_69">
+        <rect x="-100.448" y="803.09295" width="203.5" height="26.391998" fill="white"/>
+        <rect x="-100.448" y="803.09295" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_68">
+        <rect x="118.552" y="803.09295" width="203.5" height="26.391998" fill="white"/>
+        <rect x="118.552" y="803.09295" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_67">
+        <line x1="-104" y1="797.294" x2="339.4011" y2="797.294" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4.0,4.0" stroke-width="1"/>
+      </g>
+      <g id="Graphic_66">
+        <text transform="translate(-139.604 789.794)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x20</tspan>
+        </text>
+      </g>
+      <g id="Graphic_63">
+        <text transform="translate(-139.604 630.649)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x70</tspan>
+        </text>
+      </g>
+      <g id="Line_62">
+        <line x1="-101.79572" y1="736.439" x2="341.6054" y2="736.439" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4.0,4.0" stroke-width="1"/>
+      </g>
+      <g id="Graphic_61">
+        <text transform="translate(-139.604 728.743)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x40</tspan>
+        </text>
+      </g>
+      <g id="Graphic_79">
+        <rect x="-100.198" y="644.393" width="168.198" height="26.391998" fill="white"/>
+        <rect x="-100.198" y="644.393" width="168.198" height="26.391998" stroke="#b1001c" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_78">
+        <rect x="80" y="644.393" width="242.302" height="26.391998" fill="white"/>
+        <rect x="80" y="644.393" width="242.302" height="26.391998" stroke="#b1001c" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_81">
+        <line x1="-101.79572" y1="714.139" x2="341.6054" y2="714.139" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="1.0,4.0" stroke-width="1"/>
+      </g>
+      <g id="Graphic_80">
+        <text transform="translate(-139.604 706.443)" fill="#a5a5a5">
+          <tspan font-family="Helvetica Neue" font-size="14" fill="#a5a5a5" x="0" y="13" xml:space="preserve">0x50</tspan>
+        </text>
+      </g>
+    </g>
+  </g>
+</svg>
diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/09-btmgc-analysis-2.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/09-btmgc-analysis-2.svg
new file mode 100644
index 0000000000..243f038c88
--- /dev/null
+++ b/docs/rfcs/images/036-bottom-most-gc-compaction/09-btmgc-analysis-2.svg
@@ -0,0 +1,184 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xl="http://www.w3.org/1999/xlink" xmlns:dc="http://purl.org/dc/elements/1.1/" version="1.1" viewBox="-562 479 876 429" width="876" height="429">
+  <defs/>
+  <g id="09-btmgc-analysis-2" stroke-opacity="1" stroke-dasharray="none" stroke="none" fill="none" fill-opacity="1">
+    <title>09-btmgc-analysis-2</title>
+    <rect fill="white" x="-562" y="479" width="876" height="429"/>
+    <g id="09-btmgc-analysis-2_Layer_1">
+      <title>Layer 1</title>
+      <g id="Graphic_85">
+        <rect x="-404" y="622.386" width="203.5" height="17.5" fill="white"/>
+        <rect x="-404" y="622.386" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-399 621.912)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="90.974" y="15" xml:space="preserve">C</tspan>
+        </text>
+      </g>
+      <g id="Graphic_84">
+        <rect x="-184.5" y="622.386" width="203.5" height="17.5" fill="white"/>
+        <rect x="-184.5" y="622.386" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-179.5 621.912)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="90.974" y="15" xml:space="preserve">C</tspan>
+        </text>
+      </g>
+      <g id="Graphic_83">
+        <rect x="34.5" y="622.386" width="203.5" height="17.5" fill="white"/>
+        <rect x="34.5" y="622.386" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(39.5 621.912)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="90.974" y="15" xml:space="preserve">C</tspan>
+        </text>
+      </g>
+      <g id="Graphic_82">
+        <rect x="-404" y="479.922" width="127" height="77.5" fill="white"/>
+        <rect x="-404" y="479.922" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-399 509.448)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="53.316" y="15" xml:space="preserve">A</tspan>
+        </text>
+      </g>
+      <g id="Graphic_81">
+        <rect x="-263" y="479.922" width="127" height="77.5" fill="white"/>
+        <rect x="-263" y="479.922" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-258 509.448)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="53.316" y="15" xml:space="preserve">A</tspan>
+        </text>
+      </g>
+      <g id="Graphic_80">
+        <rect x="-122" y="479.922" width="127" height="77.5" fill="white"/>
+        <rect x="-122" y="479.922" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-117 509.448)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="53.316" y="15" xml:space="preserve">A</tspan>
+        </text>
+      </g>
+      <g id="Graphic_79">
+        <rect x="19" y="479.922" width="127" height="77.5" fill="white"/>
+        <rect x="19" y="479.922" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(24 509.448)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="53.316" y="15" xml:space="preserve">A</tspan>
+        </text>
+      </g>
+      <g id="Graphic_78">
+        <rect x="160" y="479.922" width="78" height="77.5" fill="white"/>
+        <rect x="160" y="479.922" width="78" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(165 509.448)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="28.816" y="15" xml:space="preserve">A</tspan>
+        </text>
+      </g>
+      <g id="Line_77">
+        <line x1="-452.5" y1="518.172" x2="251" y2="518.172" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_76">
+        <text transform="translate(-551 509.448)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_73">
+        <path d="M -120.675 651.5 L -84.425 651.5 L -84.425 710 L -66.3 710 L -102.55 740 L -138.8 710 L -120.675 710 Z" fill="white"/>
+        <path d="M -120.675 651.5 L -84.425 651.5 L -84.425 710 L -66.3 710 L -102.55 740 L -138.8 710 L -120.675 710 Z" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_89">
+        <rect x="-403.8" y="582.324" width="161.8" height="28.447998" fill="white"/>
+        <rect x="-403.8" y="582.324" width="161.8" height="28.447998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-398.8 587.324)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="70.42" y="15" xml:space="preserve">B</tspan>
+        </text>
+      </g>
+      <g id="Graphic_90">
+        <rect x="-229.5" y="582.342" width="277.2" height="28.447998" fill="white"/>
+        <rect x="-229.5" y="582.342" width="277.2" height="28.447998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-224.5 587.342)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="128.12" y="15" xml:space="preserve">B</tspan>
+        </text>
+      </g>
+      <g id="Graphic_91">
+        <rect x="64.7" y="582.342" width="173.5" height="28.447998" fill="white"/>
+        <rect x="64.7" y="582.342" width="173.5" height="28.447998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(69.7 587.342)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="76.27" y="15" xml:space="preserve">B</tspan>
+        </text>
+      </g>
+      <g id="Graphic_97">
+        <rect x="-403.8" y="564.842" width="490.8" height="12.157997" fill="white"/>
+        <rect x="-403.8" y="564.842" width="490.8" height="12.157997" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-398.8 561.697)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="234.624" y="15" xml:space="preserve">C</tspan>
+        </text>
+      </g>
+      <g id="Graphic_109">
+        <rect x="28.6" y="889.964" width="203.5" height="17.5" fill="white"/>
+        <rect x="28.6" y="889.964" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(33.6 889.49)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="90.974" y="15" xml:space="preserve">C</tspan>
+        </text>
+      </g>
+      <g id="Graphic_108">
+        <rect x="-409.9" y="747.5" width="127" height="77.5" fill="white"/>
+        <rect x="-409.9" y="747.5" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-404.9 777.026)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="53.316" y="15" xml:space="preserve">A</tspan>
+        </text>
+      </g>
+      <g id="Graphic_107">
+        <rect x="-268.9" y="747.5" width="127" height="77.5" fill="white"/>
+        <rect x="-268.9" y="747.5" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-263.9 777.026)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="53.316" y="15" xml:space="preserve">A</tspan>
+        </text>
+      </g>
+      <g id="Graphic_106">
+        <rect x="-127.9" y="747.5" width="127" height="77.5" fill="white"/>
+        <rect x="-127.9" y="747.5" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-122.9 777.026)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="53.316" y="15" xml:space="preserve">A</tspan>
+        </text>
+      </g>
+      <g id="Graphic_105">
+        <rect x="13.1" y="747.5" width="127" height="77.5" fill="white"/>
+        <rect x="13.1" y="747.5" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(18.1 777.026)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="53.316" y="15" xml:space="preserve">A</tspan>
+        </text>
+      </g>
+      <g id="Graphic_104">
+        <rect x="154.1" y="747.5" width="78" height="77.5" fill="white"/>
+        <rect x="154.1" y="747.5" width="78" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(159.1 777.026)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="28.816" y="15" xml:space="preserve">A</tspan>
+        </text>
+      </g>
+      <g id="Line_103">
+        <line x1="-458.4" y1="785.75" x2="245.1" y2="785.75" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_102">
+        <text transform="translate(-556.9 777.026)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_99">
+        <rect x="58.8" y="849.92" width="173.5" height="28.447998" fill="white"/>
+        <rect x="58.8" y="849.92" width="173.5" height="28.447998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(63.8 854.92)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="76.27" y="15" xml:space="preserve">B</tspan>
+        </text>
+      </g>
+      <g id="Graphic_98">
+        <rect x="-409.7" y="832.42" width="490.8" height="12.157997" fill="white"/>
+        <rect x="-409.7" y="832.42" width="490.8" height="12.157997" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-404.7 829.275)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="234.624" y="15" xml:space="preserve">C</tspan>
+        </text>
+      </g>
+      <g id="Graphic_112">
+        <text transform="translate(273 797.5)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="26" fill="black" x="6536993e-19" y="25" xml:space="preserve">B</tspan>
+          <tspan font-family="Lucida Grande" font-size="26" fill="black" y="25" xml:space="preserve">↓</tspan>
+        </text>
+      </g>
+      <g id="Graphic_113">
+        <text transform="translate(273 833.974)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="26" fill="black" x="42277293e-20" y="25" xml:space="preserve">C</tspan>
+          <tspan font-family="Lucida Grande" font-size="26" fill="black" y="25" xml:space="preserve">↓</tspan>
+        </text>
+      </g>
+    </g>
+  </g>
+</svg>
diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/10-btmgc-analysis-3.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/10-btmgc-analysis-3.svg
new file mode 100644
index 0000000000..1e49ec017b
--- /dev/null
+++ b/docs/rfcs/images/036-bottom-most-gc-compaction/10-btmgc-analysis-3.svg
@@ -0,0 +1,81 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xl="http://www.w3.org/1999/xlink" xmlns:dc="http://purl.org/dc/elements/1.1/" version="1.1" viewBox="-12 920 809 269" width="809" height="269">
+  <defs/>
+  <g id="10-btmgc-analysis-3" stroke-opacity="1" stroke-dasharray="none" stroke="none" fill="none" fill-opacity="1">
+    <title>10-btmgc-analysis-3</title>
+    <rect fill="white" x="-12" y="920" width="809" height="269"/>
+    <g id="10-btmgc-analysis-3_Layer_1">
+      <title>Layer 1</title>
+      <g id="Graphic_13">
+        <rect x="433.7" y="949" width="63.559346" height="77.5" fill="white"/>
+        <rect x="433.7" y="949" width="63.559346" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(438.7 978.526)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="8.107673" y="15" xml:space="preserve">1/5 X</tspan>
+        </text>
+      </g>
+      <g id="Graphic_12">
+        <rect x="503.7654" y="949" width="63.559346" height="77.5" fill="white"/>
+        <rect x="503.7654" y="949" width="63.559346" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(508.7654 978.526)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="8.107673" y="15" xml:space="preserve">1/5 X</tspan>
+        </text>
+      </g>
+      <g id="Graphic_11">
+        <rect x="574.8318" y="949" width="63.559346" height="77.5" fill="white"/>
+        <rect x="574.8318" y="949" width="63.559346" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(579.8318 978.526)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="8.107673" y="15" xml:space="preserve">1/5 X</tspan>
+        </text>
+      </g>
+      <g id="Graphic_10">
+        <rect x="645.3977" y="949" width="63.559346" height="77.5" fill="white"/>
+        <rect x="645.3977" y="949" width="63.559346" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(650.3977 978.526)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="8.107673" y="15" xml:space="preserve">1/5 X</tspan>
+        </text>
+      </g>
+      <g id="Line_8">
+        <line x1="92" y1="934.276" x2="795.5" y2="934.276" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_7">
+        <text transform="translate(-6.500003 925.552)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_2">
+        <rect x="113.2" y="1033.92" width="321.3" height="12.157997" fill="white"/>
+        <rect x="113.2" y="1033.92" width="321.3" height="12.157997" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(118.2 1030.775)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="150.762" y="15" xml:space="preserve">X</tspan>
+        </text>
+      </g>
+      <g id="Graphic_17">
+        <path d="M 420.125 1062 L 456.375 1062 L 456.375 1120.5 L 474.5 1120.5 L 438.25 1150.5 L 402 1120.5 L 420.125 1120.5 Z" fill="white"/>
+        <path d="M 420.125 1062 L 456.375 1062 L 456.375 1120.5 L 474.5 1120.5 L 438.25 1150.5 L 402 1120.5 L 420.125 1120.5 Z" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_27">
+        <line x1="93" y1="1164.224" x2="796.5" y2="1164.224" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_26">
+        <text transform="translate(-5.5000034 1155.5)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_25">
+        <rect x="114" y="1173.5" width="641.8" height="12.157997" fill="white"/>
+        <rect x="114" y="1173.5" width="641.8" height="12.157997" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(119 1170.355)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="306.564" y="15" xml:space="preserve">2X</tspan>
+        </text>
+      </g>
+      <g id="Graphic_33">
+        <rect x="715.96355" y="949" width="63.559346" height="77.5" fill="white"/>
+        <rect x="715.96355" y="949" width="63.559346" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(720.96355 978.526)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="8.107673" y="15" xml:space="preserve">1/5 X</tspan>
+        </text>
+      </g>
+    </g>
+  </g>
+</svg>
diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/11-btmgc-analysis-4.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/11-btmgc-analysis-4.svg
new file mode 100644
index 0000000000..510d7a0c3e
--- /dev/null
+++ b/docs/rfcs/images/036-bottom-most-gc-compaction/11-btmgc-analysis-4.svg
@@ -0,0 +1,81 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xl="http://www.w3.org/1999/xlink" xmlns:dc="http://purl.org/dc/elements/1.1/" version="1.1" viewBox="-12 920 809 269" width="809" height="269">
+  <defs/>
+  <g id="11-btmgc-analysis-4" stroke-opacity="1" stroke-dasharray="none" stroke="none" fill="none" fill-opacity="1">
+    <title>11-btmgc-analysis-4</title>
+    <rect fill="white" x="-12" y="920" width="809" height="269"/>
+    <g id="11-btmgc-analysis-4_Layer_1">
+      <title>Layer 1</title>
+      <g id="Graphic_13">
+        <rect x="113" y="949" width="127" height="77.5" fill="white"/>
+        <rect x="113" y="949" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(118 978.526)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="39.084" y="15" xml:space="preserve">1/5 D</tspan>
+        </text>
+      </g>
+      <g id="Graphic_12">
+        <rect x="253" y="949" width="127" height="77.5" fill="white"/>
+        <rect x="253" y="949" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(258 978.526)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="39.084" y="15" xml:space="preserve">1/5 D</tspan>
+        </text>
+      </g>
+      <g id="Graphic_11">
+        <rect x="395" y="949" width="127" height="77.5" fill="white"/>
+        <rect x="395" y="949" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(400 978.526)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="39.084" y="15" xml:space="preserve">1/5 D</tspan>
+        </text>
+      </g>
+      <g id="Graphic_10">
+        <rect x="536" y="949" width="127" height="77.5" fill="white"/>
+        <rect x="536" y="949" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(541 978.526)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="39.084" y="15" xml:space="preserve">1/5 D</tspan>
+        </text>
+      </g>
+      <g id="Graphic_9">
+        <rect x="677" y="949" width="78" height="77.5" fill="white"/>
+        <rect x="677" y="949" width="78" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(682 978.526)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="14.584" y="15" xml:space="preserve">1/5 D</tspan>
+        </text>
+      </g>
+      <g id="Line_8">
+        <line x1="92" y1="934.276" x2="795.5" y2="934.276" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_7">
+        <text transform="translate(-6.500003 925.552)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_2">
+        <rect x="113.2" y="1033.92" width="641.8" height="12.157997" fill="white"/>
+        <rect x="113.2" y="1033.92" width="641.8" height="12.157997" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(118.2 1030.775)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="310.268" y="15" xml:space="preserve">D</tspan>
+        </text>
+      </g>
+      <g id="Graphic_17">
+        <path d="M 420.125 1062 L 456.375 1062 L 456.375 1120.5 L 474.5 1120.5 L 438.25 1150.5 L 402 1120.5 L 420.125 1120.5 Z" fill="white"/>
+        <path d="M 420.125 1062 L 456.375 1062 L 456.375 1120.5 L 474.5 1120.5 L 438.25 1150.5 L 402 1120.5 L 420.125 1120.5 Z" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_27">
+        <line x1="93" y1="1164.224" x2="796.5" y2="1164.224" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_26">
+        <text transform="translate(-5.5000034 1155.5)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_25">
+        <rect x="114" y="1173.5" width="641.8" height="12.157997" fill="white"/>
+        <rect x="114" y="1173.5" width="641.8" height="12.157997" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(119 1170.355)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="310.268" y="15" xml:space="preserve">D</tspan>
+        </text>
+      </g>
+    </g>
+  </g>
+</svg>
diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/12-staircase-test-gc-feedback.png b/docs/rfcs/images/036-bottom-most-gc-compaction/12-staircase-test-gc-feedback.png
new file mode 100644
index 0000000000000000000000000000000000000000..c106f3ee899d46d90c9e1c0b630d3e73075b1f67
GIT binary patch
literal 145516
zcmeFZbySqy*D#ELA|NFtl7fgx2`C*RAc}N{h=73P&>bQjf+8S|N_R=ijC2i*gft8x
z9m7b>(BB#FUp(vk{<`n)UF-elVJ(<zuIrqA_St)%-4~G>YD#1z*Gcg3@W`G#R?x!3
zBhJLbBPh6h3EVLza2>|OBPq3!m)CeAFVCjo0<p5Ox5UGH9GUo$NJo3^>g#jgVflwo
zA4q%_m6N!T6T<Sqr;aKf?{mmI%2Z~<XDzgf3WR+mB_y}&=^l(mT_lZu(Gter&Hky3
zk@U-#-~3LOn~n#3QQyR^2mDgtvcl)mm~+H4yl$PO2k$QR5xg5fTn#()`C)Fp_^Mv>
z^#$eEB=DApbjCMdgoF^{D=cp!m*iPr=WQ|Xu01<HI}fD2NUq<036DfhEQrHApTqB}
zCb2Oa93M|N9J}vQML6t7W>!RLLpgtDqn2vfbB_yt_qsGSHPv!t1qq(l)o4x?{MR{g
zSeuE>n>Q(XNX?$ggm)=jJ0yEHJS;4BnZrFu@G8dxF9})cc>(YEvm{};)DIoX@xM2p
zTM?o<sELORyw6t_vB>z$TbadX5x2PwSLwqHas*c8Y#+T&I3$*9afL6!+YwGx;{q<R
z6YPk3@#Ttd_&IjNZ}3;d-mkPJEcci{f(z>ZVwUM@jk#-6s_>c1BG$0=uyJAJaiMHV
zU&1m%JvJh9(f+y452k3XHiBgfsoa3iVk(oeA5j~ErejKt7`H*wgFXx8F)L0b>=VpD
zDk`hb8ool)@z{6KsrlQA1$!>~4KIZapPW&|z09;_p8e>mwp_dFcl0g+4iS3{c@-z}
z%h$Z{_~(K(9uUm3T{j>gdO(!KCKR0}c^Zq4xFVPN?qfo_q6J%i%u|cs6Lh3Dqe-K+
zcq&;F6Z#7?!70_3xffUb-%#N((Q^!_?m}tF@V>GgzyFzxVYqSHYmM(@BW%nI&)ipK
z<@&|XhA;k0MB}2-1Bz=Lb#Lxh5Dk21UK?R048BZBs)r}?_|n>g%P!w-D=yFkU9BM7
zB-DK&TW~SFMbnAt>l@)Y_IeV>H!MyRQkMu`T>D0Nb?1W0L&}igZpw>Vp<izhbYH0o
zTYbm#i|pb<GYwuE3Qjq{$5%?jL>_W#Fz8%le!c!V>+|pr#Z`$BQt6OOp9NR>_81<u
z+)O61YYp3BPbL?h!xO`QPR`Ym;w1CzG9;8|{yO&ZG`?vFq!lxF0m~^Bi+`ITK}ea%
zGWJWhlx9nTm=3#ggi-XJN4lSEHB<{cbhy^ZZSnR=_KDvrYX7+RReX|RD?C=AdN5_R
zW|eRC&YqbY>5<S8xeq0uTwvQf!RyzAW2nO-BU^5ox!hGuCV|Vrx#6^MY53h`0=j73
z&g*8(6#?T0_9Py(9;6-uwG6%&rlOf!V;7yobki9IF4D(X%vU)n*3Z;))bG|$)tgN#
zeY^6U5%Lz((lvjG#rL}iy%0zq$SC_J>t;9gmCI@&FYv?PDc$BAp_(MFyWAG?<O}{_
z#OlKswmeEvN}fp7cAoZt1xW@51``IQ6b*(<xKE05iiObF%S)Cr?`fXo-BceEickKU
z=oaA?wD)c=R^=9B{7jGYFW%pJF4Bir54jJsc@;+=CUN(@H{mkon_~AhKB!@m8+Z_V
zqwXQUY1zZ=kZHaK{ojP%^4{v+Nhd~!wui&NF-u*vw@LI!T1eJNjA^dZl<~yeS)d`K
zZKf5E3XQ6X`bZOg`*svd$A{KD`<xNYGL5i8gCeTLC#s*5G~%?H3a@f1avJB}&V80^
zn_I0+(0kib-ZIZ6)ur&a^ODRGg7hB!T{<Uv`{zHt&K<lVrgf+Fq%BjfRIcuoPDB*l
z(}n!h`MFh8TBJVm!#2<Mv#q%8@km)gvsQkQWO4P+=$~=LvO}-$tK4tBr)M)_>t+*U
zXKQn0OLEWC?)i{kZf?m^S>hMnGIupQm9#HgMg7D4V>0EZZ}Jo@ehj@z{MvUfGBzwW
zE7oA;wOZXkPT70M9LFHX>TPQ2FDxbkn&r>R<<-BGm%<bRYXY**I0)Wcm|>fE@lh_n
zCFJvav#;~F+Vt8qIcPcZG@+W=nlQnM@qV%4`c7;a@o0!+ylvH3P8W?xjj*ansYtWs
zrL>7OH2ffSpTC{g?OC^}LG=z)yeHi((_!|N%&X7#RIfB&4BDc7-k*J`SDd1E=RdNt
zu}-lvv_2eH9_&RHeJ`3>FRdxOKh-#SR7N(@*=3cXn%2)~ENoo1B{N0jLDR(Pk+vzZ
z37wQ2C*4Zh>|)wuUSK{Do6gr>yRpZ&XBT@-iK5Fr)|yFHd_vMvoZ0hecw$=oXG4p(
zTjikn_CjNaJbxoU??(cDg0g#KUSq5#MAL5529Ea~5!3tA;@d<sO4n}1&~#FD+EF~E
z_}s-EN370;xSvV?P2)NAdB^ja*{Ip~v%0hD{%-#9e%JkPLOr0xegast6a16f;~8sP
zp$WK8UOTcS@a_2$p4J1e2fE*@9}GV*dvSc#EAl*^`RP&cl{Z+q(iXO4hHo6$ClQ*D
zt-1w1kN<f0WBtd9nVwm3n?xJ;JWm^AtE_^TLc$}d*n6=wN*)S5ihR*Uaj6jpQT)*?
z5r#C?x0bC+jCciV`N-+%X+7^c`W0e^GM<KioZ#az8m*gt<-PrTgJ(o9r&O#-%aPM=
zDbh>D&&^N%gkJA%aoGLL?<(UT*vx#ZJS$CIrp1@i1<Oe5VkF~N;|-F;Qh&vN<*?zM
z;A>z%q(ywt@9_Pw@UcqPlKsBezQ($ak(-gLgp9Pz={?i;S-r6I&x!BT26^sM(-2=0
z$`JCjQVzM$8qEIq>$8z0Ma8ge{>n<MB>F|Iorn8DS9(D@bpuJm$#j|2@V9<%jzgJ4
z+0uvQ=H=@KrPV%d>n-<V?h}nZw4)yNC>b`^uXprnMjf>=MlfoLo*2C_)X>`ru-NN7
zc-Y^%A1aSL3>Y)GX?&Bn5!T~9e1;lC?m|(s_SN2)Wk^T*0wZS<CoQLGZAINQwq-rM
zmo=~Umg5$GjlfJDS<_osDWqxYpv{uR^3YOVGHLS_cYY%MM;NlfWo4gaD(bW;qN%;<
z(j5YxDUS`*Z0+$=U6-HDwsGsr(${@OP$}bm6TZG#$I!gf)Z?cIfoJbAi@^j?A|u-?
zc6V$mYbI(C8(-EX0>7N=1Zr%(=xHer$A(`b4`+rxLur^IKMiV3UKWjF?26@;9C97m
zcxC*-dpO8yX*u3vE*U!>owXRaEy?2aZISR)Vfab#2VXT!HQKrdeBONVqrXOjYR7A5
zhtKjQCo`thPtDPNpM@o?ix77*_@tGj^t~L{{mu^2q;E*e`EQvpts6|Bv0CulrjPOh
zo~Fe0E?Y&2pHDMl26o4um1Q(+yPfWl9lae*I}kN)+-y`kwZ&`{Iv#H)8I#vd`Y`!U
z`qE=Eo3=cT7nsW>^-)5o*>Ci2<#hs8sFqVp3=xYHON@+)FLqh;AiJk)<5=*tL;Y8l
zj?_;tqusO~NI$^=fi+R{rKZeKSzoDk7N5hWRg6GeD53|!^XRM6x>BhUBIYn=|C_;)
zIW~80=~wX=S(>x)-AU|zkL=9(W_<hOFyv*VO7qNbwf&jTm|B!%r+jBtz~l3~;+iL#
z`-}7Hd+Hfk*|Kpe%YH5Xg(tSZYfg>7*PRva6>?vA+MUX9Q6r)?G7|5bG$Eci9iB1;
zUVCn}@P5pLYUBlB!ZT{bz>Kgt1ztRfLKzX;<-y6d_jHvm+jt>t*XPM!FftYn^uvU!
zkG@j+T<Dg!O1_K=0EadoK7bBfJI<neTz%GmM%tuI&;=jr8wLJrl%>HFD^*pzJK*{<
z9^r-Sco)Id1@J3#f$=}rk1pKCBlvS49}h3w29NMxW7NPW?&lr&#m)KalOR3}j|hCb
z3Vyvl;s1Lyapot2e_s<6fM<Ae+VW4HfKP1;7fVY=*H;iXA-P#YaD&+Cv7sv-9vvs{
z_repcn>zshn2nBsn}Mo|gayQb-|QvC+>+nR!3hTkPs&RITsl~~nX!2}*gLvPcuBMW
z86g3#akmB7+5QZ1vy)~wP}N|Qhqzd>iSXa$zsoK|!p6oX<?_->LQCP%zovt4((JF?
z+?*r?1Ux-G`8|dCAuiSeg5u)h0(XT3goOCO2tHSDM>jJsK1WxMzkvK3j)JADg^P`o
zn+?Q~4F}iE9OCXK&CZTn=s!Py;c4k*^WT*mUH`Q$ut5Rb69GZ~y8{1#4W>%r?n-Fb
zcv;#TD%dyxnt?TB#Dztq{*3?sJo)d6|6``Xe`gBa6A}8KQ~$@K|Myf~S4$Uphyz&D
zP3FJ+`q$+D`S4#8r37$W{~x6Ii_U-U0z%7>ND2IhX)+`Q9ZIZ#k4!cS>N?;PNEz<u
z!U*_t`>#)MeWA^;#?d+)5AOlq69qXPuM3+QMBjAoA9NAmduZD8)bkwFL^qd^wa;p7
zp0&)9CMj-4(b9H2*Hi3xvsx4}Q`6*88QolG;!98X8zCAu{juj4Ib=`z0mk#Bd-~=|
zf5xF|ysOLaCa%&FY7Q#=hYv1azHoz$Fyu8Q<=^}ALxIpJ(rZ*9C+F5bnEkI6SuRi^
zMyUS&;{QhY6F;GXR52+(H|amy1P)Gw^*X^nCxZq(K6~cRiysq0|275xy@?0y!u$Um
zeaZq-K*aSoPkaCQCfF8Ij{Z6Nmq$VY5#3ujEpPm1OMhFc&?^4_SQOa^*B?G;uWp=+
zull=E{9lA4yz*rJ`|WQZ3ilgq3x+06PYnO|&hh`vHw7te*1vfO{|CZBjg*w=68`!e
zULe!|kPPgZg%p2#=l@1O<blcM2roVQXIgas+0x&}1gHa|zdh;3zsc~w>frVNsskVm
z|Emrz|F1g0fB1h@9lQ{_ao1<>mFsefb*lbz?~aq-eF9eR73l<<wi}Dr#w!<J9w{`I
zQogzl{D=EwTsFl2#03blQk?6cH@zqAdvLFhuP)utW5QCk$zuy%pqUMKA2YEIyTV{T
zY8rTI)toh}a`_$?ZZWP@AA$d<%)euFfy?JHZbmd*sLdt(;`{3;m>BBYF1*654IWG(
zW1w07hB7%G7$JM*qx*Y?|3r(QaZz5A4fvK~TT}>N$qIC|%5WboLQQ{`C)yp+R!=!x
z$$}NB(ms5l`~aAH?s@$m*8jv_VzVC3ZPg(~Od8z;k$$_X2fhN;aG~X|!)n4ajqZK5
z#afqD9D!e%HVu2dE$|Pl>5gpp_wM%`s)$T!GQzLvZm8Lr&B`v8=UiiD5;yKPP`$eE
z7^&;9Eg!=uIuOGk+$maPH#VAkVD~|L*By59J9^AvIPa~^SXqY6&m!Hz?bX5T{_#qS
z=I2wj4o~;*ppXOkbiCS*N2jF>(mwE)O|bi;F^8s5m^%`=U)ZyMJoUu+ywK#_^L**0
z)^wn)XP~elO4fb5__*=O?rGY_nT3hzX-A;r@;$Azq$g76D$?%M+p+`wfeuqCle?zU
z7}iBuo?Xb+aUA4Ypra#X@^t0|wd`_!YCLvyrrmuy)rdlt(VdYYS*Kb(ZHM!cPA{Ab
z&L-6?VIaMjtnSmUsW32g;Z&*VxGwW`;Kr?KSZPyW({&^)^El4kKLLeIIL_|6_G;)f
z<CH`9u8bnz2_p(QxiNZlrV52M2{fuU!ge)JFc1^O>C{nxu86u5l6&QB@7yJeoN)$w
zI+Zd?9xfzTK94D1Io>#HH9al2SE+$xEmSdmA%_r`7?{sCBK0@DEb2wE9YQvH=0te(
zSbH_82aBfOMr~Hqo!=k*YSC3Y0@iolc$YVWoG@MSJVGIhMo)|p7|7Xi9LwV99(ka>
zkMYV_5;1mhIGGp;(^I){GU%`fl|9qn0DCFKu&xB2gUQExuzS<2C&sEh7y<J;HE0YH
zd1(5$uifpI%Q^R*yqsh`?Xj<9r?8uOf(;f%AK3F2y8?SKSu_}Zl|p!qnD)?b!H!A6
zL>$iRadg_LJEv((!$*sq2|NC`ScZ#tk9qQb`6d2Ner};soSCd8ui_ftl@L)VejZ(O
z<xGEg+f-e}l3lPSDp~F6G%WM}(l6+F`)ERx4R@ZBN>e6UU;SMFyqI~|W*uu+n5>n7
z;mZ598Mu|aO+%yYl2s#e2FpsmdQ4G5V34oIoA(QIFO^c;E;3n*X*ktDrO>Tq@%r2k
zzu!Gvd9Eu1hBa|AN8Mi`$XWQ^wks0{J-TZaw(Wb9_uB8-b&T8^vmWthq}-`A8bfrx
zWklY5Y``&QbesbjvYmEgu5Pc^(ft1tAmFp%qLiS2=-C1%_5VPW^2)vKC~dOPNZnFo
z#$J1Of6O#3J5FXY-Tz?1@MWyj!sF_$Oy6YL^Fzck)E6bp>bDcL)@*^gGvhfMF!juP
ziP9Bb*@dkgR+pS90cOA|lJjH1Q1g5kBYb2lPlORtRe1=kgzEX(Y5ym=;G(?(v6G#)
zei>hsg+%{M&&h`Axe#mMN#)wlV*SKzV@R`Q8Fby&bx%;~xK>+T`lo$;%O$JI4*K$;
z=Ch-X{;$I-b|RBr9J|0mBvY%2wWND5r|A0VFpZ+_RwK`jQ(~DVQ+>A^x=vPx)%(fM
zF@d^`5a?lE!$GQDzoruF($uANq3`zJ(4AI;9dxBb%pNm$g-@>xr;&+H|L{ATSXut{
zL0yCZam^rCNf9Q0oI-HK^kO;Hj^7ny;+ck!IfmQA+71BBZ)af*j^pmj-5hay4*BVR
zQ4?;%s{QQI{93t^ZmSx@0eJ^jr(~m|S*I9bU|8^6kG>BV?-i^bW1!1(^5i0}1LFA&
zy91K_`rDXEkEX36H&frO`@0CnGaGW*1t+A(P$X9hU+s7+@5xwbv0IBx5_xs<{-`nH
zWJUH|kU(8_Cf#S!2JHOg^c^e>hB<9Yv4|CiGuI)_pohzh)%~?oI%`mgbzRqCRne3-
za#?s?hF5E(6|kuHNcp~LByQi=H?;u9NUwU^XCqZz_Hg9<fIy8|BUh>81`(?{x%8ff
z@G(<$+7zT|#t$>&X|>@{TQ<bJ-+Now59>6HstAB?g)FMy1uUyEbozX;*ljJp8X?nU
zXg%W&=VA>sACTDKb?tqvjbr)h_h2}c5|f!#hJ&8#RB2rbYXG*zzGh!fw^0DLSJkjp
zJEek@KR&8G>)IEcg20+n0)q~V-NNggl>?2MjwhG1DP)CKGCVWI*M5D#0jxj-Ef-|R
z3Nc%4y?8V&)S7jF#<edKKx@XvH!Q|Vo<I~Aox519&k%1v94=>-D`R?j%I;-IxC_mU
zZd=Km%*ng9T%=E?R+p$$j*F5Z+7s+za*aU=)LnsfI9ct{N!Jw&`J+SW`%OG&EXfa#
zZ$P&UFrv-As8`Dwkl8ykJp8a$gtbYN`+e4L=RpmtU!vi}T$woHogHsMVr9>~h69!l
ztg!u}Y0isXY5k(eZ_1d%l}u60tZ%va>Rhnj)Swcx18{BVh1FNQi`<AzMaJQhsBc#N
zHKX@X&BvJydqYoE&livN*EB1U(Bl+pb*Vx4NWd-E;kTkGdZk7w{XX`3xyRGhh?HX#
z49Vp6BM7=xz!4|Zv{jpS*e4ZOIMlp3)CWJ=ML<RkT*-E~B~hs@!9;0`N38k)Pw4Tc
zg__I~jqqZOa4O^Zp?YAZga=2WwUUe-i{Fmi@XB`?XrxTjN~W(*y7tUd{)TqmuRk7I
z?f=vl9e>MH_HBsCceAoERRGnDnGI`5ukNd4O#cAwxs|4~CbPRIyo@|QtD87~ed^EX
z5e&-!N3z)M+!-wkXOHGJKdYTY!i^ykP0sGwGZBehO-LW4|H1wIrbC;?wS3{qnTY-R
zfTOkiev3GnWrvGPZAbgAW5>Qta^gwRCV_4IrH$)F%l)G1GxfEDj_p2+oo{zxvbAQD
zHu?Aen!DbU52sd_F*}&?G)2-(z16|Ak_y}do5(S=Ow@&Ml*6`>nC|TBn~zE)4?b2}
z1!9;7q;{_e@826v-UxecZIy2Gs5+>+xKQe_{lju$ewNU2{n;C)NsNZ;yNzhqvo-a?
zB36`1(zdytkvC`XO#S>3BM@*F&0NvW3^D|k)oqmRys68BLpR{0va7!uA#HasX(#X(
zDkr-GqJ8{zC5Fx&wEZ}zwqJDS+4x%n&pkwTe-IJNs{x5)-oT`WIz~s9&hjUr$g@NG
zOpJe~$mSzhKzQoJv{Y8F_e!R<o`IQn^$7`2GR_shTYj;LYUAu=7(w0^3>RNXx5%><
zPQ*Z#ojeMTB{te^&yEHIWo@7PtSMF>v-o0vw-7OJ`g3hQy0en*A8Y8HnAq>q$7fx^
zSU2laEI#5tsuOTBujJZ&BRts^AvyNdac0u1X>f4jZOxdPZa)Wwtofs6iC%>GJ)fp|
zzt7~%5;|*{{-YCCKVhY|NhThKy9?0*sgkJr>I%ln-xSF6xebKpL6556r6$3;U4kZ$
z4Z7mPjc`_<@oxu5c2;91p8J+ZWUjsNYWTBU-_cLiC)2~0Dg3u;z-$cC9~Ic<+5n^8
z>@+?5O<%1yR%3oY@bpJ81%u!B49|&)j|`$+7)S+U;cvDTyRwF5Bdaz3r)l(=fwPB+
z=f7q2iZENE$Rp<#RNP&4&+X}nk{Z4JQ`Y+P3%dhNsN@Wf$({{!Q8kIRo3hXS755Mf
z1PCXXnP~Mdt*0p?jtaKRY2`5VnY{sHy9dYU;?ddhwkQgA9#AO~vGu4Z8df(TL1q^`
z;@IA38n|C`PR4Nl*k06Jx(wx7i;-XH+gqRM8%K|BR-<G>XCAw&yF!<bHcBCMrsmGO
zaA8EU5{q{`lhgMLg{h`#JB*p)1!fLAae*goC;R1r_lrwwjS{zE(-7=heipW-1?xQE
zSnrM98&-GqusFIiqfL}<D>C_t1192i#BD9gV)qpgj6N%HKCrC<hh9n7zO*OO;cUat
zd-RgoEV&2APw9?{T~$#V14gO+Umpe>r3``SVO;A*#X3qYS|{0R87o=Fu+RNww`!3C
zCqm2gaX=^v`8P6H()RBLn*1tXj^xVRsTIb}f}Nk1i*Jn#!)?fw2xTk#hMr0`Rl1Zc
zm#4@Z^i_#m6P}!SD+3U|Z`o8hrMtGE?l4gec30m93~eS4IDBv;RE+SzJu`>bS@878
zna4{9Q!|ruQ(-kNt66~_9*dt^9djB>MC)dYz_iuuxK#S*UY~(PZF6PS*94^%w@%JY
zyCUxMNTdbLJids`I*hMrQI|b^nS34$NWC^Z<ycp0_#)`yH^koWrvkKqeU_zfOT`Wo
zKKR$vF_IrIC7JT;?T*#O*8e=&+f@(f*BvLrcpIz#Y*fb7FHTRs2oI|Xs$(3ugsMGu
z)b(KvSRK8zB3edfz;(I>3ETze8@fbSC*pJ(ThoF;211Sww(Q=3l`Bb`efJOSg2P#S
zx{n{U0MTfd;BTyOcDTC=m$Q0%fAsG7Tb(eZ)O0CX=V8qSJB1V23sJ2Zi&&Y?&9A#^
z0sMhGK}2Mg(Jw^JXD~=#<jG=uAa<W<-?|anJsk(O%G%j@?z6XQ_HD-L3?sfG0>FK&
z=_za4UKqW!dkrEw>+I3ypQ$tJKUPXMZNynZR>3%vzGY&&maC=fy7*Q!bvcf^d2#fT
zz8B+nGP{n12<fj5W25C{(4*fx@~osD9H$jZOaUPicBtl)$+_9U89SMyf)>Xd9|&x}
zmcb=4+NPoAX6v)BXGdf}q&mVge6G_Ey9j0pz_3H-R$yRbIIBl}7DUox{Z$2H%<lQd
zeRT3wkKp7%^Jsx2sAb3G3zznUg_@Rjb7l18T$yCXjFX@k&_?C3gT{ulunqr{;Mx!B
zE*@?B5O7z%oY~eY)7z?g_GlyJ{H$w&%z6>W;p4MY)IhnrV6OPe4s=D|)PHHEO$Sw@
zZ=%vy>MgHA+B8^gtzNy;N}6AEE0)_#^ipG`%eqaz-kK%|T$IPd+Q=p0rK(oN4B9?R
zOZxB#Dvqd{7Bs)w(@lz|_r1FEVc44InC+SVE>>}%=0*XJOh$Fuxg%T9XR?yMPfGY{
zH-{Vs%7i|0#H=xUjJI_0Yt8VOTnW0yX9tuOi0x7S`8shfiYh%5WTPo3WraC*!B)}S
z4K)`8AxCzl$y$vfY-%vz2KqL=4;b}w*g5k>i8nbvF$2PIdN76P`ZiSuL0RN|6)w~A
z9ynA4epr@i@X#YBC($}6I2$e#mLF3Nm(viXcBA*k$$I^|(%bEVFEtHLZYcuqAhnF_
zmbVJGA>&94x1t@v=Ao{>iSGDYZ1iDGWc2%x8o57u%3oze6EN@eBHEW~yRHpDMJv*$
zb<i2Z%ZnKg*vp*l>cX1e8HkpsULtoB%w5a7KSt3st0Fq%X4UpqB<AOd`*rQs{E5K8
zI{JBYyux-n0#@zoA|>O|5$<i1H%rLOfnjnsQCC@OCFQ^OR1y|a6BNhYU~uD<GqFO1
z*e-Y+NQ=7ESqOvpZ-y0~&8;-^ruXa5=a$nflbi)}6ZP(`Jy$4U9KD6D!t#tgcYPcE
z{@$iyyU(*~5$}PHM9j3EI#Pl6(p1lLg*(nFk<Afl;Z4a&X^Yx6g~>w=E9rK6MmC%<
zkpf!nqcx>E5pnE`Hlwhb3m%K=fwtq(k7YMttcP`RaiQQ=aJ`5xRUu`zfjcd|wu`g$
z0lOnC8+aIvvW=(I_Of;jdZi|Sm2cIU0bPKGXAKzdNJyus*l?saS9Vm5L=DX9-k}xb
zG&gcA-$V@{9$UI}B^YE20pNhtLx`jA1pIk>9*dKBO)b#tV5xNHFk?nH2Lr|&oN79c
zMec064jlaa<HT6W2A{rX_M99PrHbbAcyL7`F({wH&3SNJrtEaH+EJj@|3Ib;mJ{EZ
zz~B5!t9tGYY3k2018Loi6G6^8Wd9kPUGQB95s<6#u+wd@VPC3t+BY*1?&z9BGK7<$
z|II&|p6qz$+*z=c8D3urK%SoAj&1;KYHLNHarFzCFBN}TH7WpnR+gEj)qCNif-9iU
z%_BMCcZ$od0t+6M96yDT{64{QG;wcOO*?mrU62!ooT(7Gyz5cHxJI;dJi~wReeyd>
zQA)B3C~n8L0L*uDhdB)UGbW~{_fbgH+31!hr1~xQ0sAeW#KusbkR?Dq(|(j<eZ32q
zQHDf$pj8e|s5U!690}1gsPs_b9w%F#5OJ7lhpd%zd8+iFJe$V3j#F#Ql>=bMes#z#
z^Th~cw9~N@fOrysNgfQqtvKz+viYc}FThhkRBgpwrUEy@2&bFDq=(=D*!B2?_^(Hw
zJ3AMuBQR|I8PTdgU9*;dl+tSdKA!p5aJn5|Bj-A8iW+nyj94rCwVe4(rC|+aelmRp
z!&8UU{}$(ZRKk&K(C*-4AZmEDF)hpg!xjXeR>)r>a;2o#evOonYE9YXrR2SGWjs3o
z+CiWf9qQUx$`N;Xr*6$6qcSRur_AKjMB`jYxoevI=*0^>lkzPW8B2^Ggnn^vzkWOf
zY}gXE^*wwuR&sl__}Sb9KNP#*?AlQ?m0rhRfZ9l(ui?jP4ziFq4);sADBLzu8mV&B
zZn)h=Kd@><|6oUVprJaJkWy6tnWXis#KSnSjPU6D7k9uid<R=9!u>Q+qf}{|#pO_=
zghkSo90Mg~v@U<ei;uu>tZ!JDWZ?7#D(F~3>1O4?cm1#7Z;b7y46RZ)X-YR<kMg~a
zcJZ4Rfo`(+Zq65{yIRa#a+`kUteVEbbksQGiJXxLHUy|azM;e1gGw~M@)uYj1CF=8
zFqw*FTQJMD=!>s({$}pWdHRu`E{3vB+|1;GaPc}_6U*+8Zb%Cxp1KbG@{o@zz_{*x
zBFHN4ktwaB$MzVG)YMk(0*{8XqVj-`G~4+01AzD=JVSfEm}%|11ilKup?}<o^sebx
zlP6?@sXXdH<3&lscT+%}gvywn^t)j*ou42sjE_bc;OVL57LHCK?15Vm_DKyKT#mZ~
zLMwzRKI?_2=`_9QV$x5B%R(UyZ^g@0k{LnX!WjuqU&Ajl)U6w@1m2wb;AoyI<3>6h
zpim#0<sPkNx`aP9>Ts*0pLWbq^JSLb?<b}!Od1>bMYLO==R|&#a#RD>Do|}aE>aYS
zu8x&wxxZ94UFz2P*PMa()s8n7H)<zsc{cD>z?>bBG&Xb#7PGLJVdtXZd2Yfs(fBE%
z9AsWLN=Wtezjj@O4ud?ce<CUQRI~t8%#pwB@NN3=R$pv|Wdm$V<M!ktDLwY~=J@@w
zXBNyCL>$ZNJrq8)#x~5AxZi96q;VpZ-grz5UB#a`I|J--cva@*et~A=O*eGl;_Xrn
z7hSN#(N~TY>>Z_>wfq{CqV!{x%<e&cK>c4Z{@O*4`$E6*m=Z|keT$=|kERFij@tdt
zL97F?r<@uDL>okAjirleg}KM(LmEL1p_Jq5@`xX^$GveuC;amCdbUs#+rnJG4^!BE
z46C${o7zu_hhHqoA7P~rP5KBW*YOLXDFF`ydx%Bwjf3jvqF^wNo@Jfpt(HqSV&Ue@
za+1YLoFaVVgYKh-T@pi}e9OqNdHXc;3?bJ2#9oPcZM0D}3oyDZ778ryS*rJ&Lw2t5
zL0Ub77E%AR!<B%uo;6MSDa~c!<r<-uSSSeDp()<+heBNAntq(^RC>I-kd$^08dEP2
zYV&CC%a~MTj}t37BGwl;7I@gR0*(sZ2`5Te60w=sPZ7r5#&s#n<*w&1l+dMAn>ieH
zD<k&yN;8id>tKe8`jN~_(92l_P0w$QYeFenF>R(e99u}wvsEgx=VIp&VpY#liX5hD
zOV6b>=P>SsStVst-|v9^bWr71`{+0V9735pvRnz#S~G=tsl7v4R7Z8Bx2@e7=pBcL
z>ja-q=RMbpQ{KG2NZvwkCAHj6m31mc?h%823rnZoyuC}%>KAlw0#;vU4)MMWli{-H
zirvRp%g84qnHpsBZJE6rE3)AqtNTTa0T`ue0Wjz8uCv-3W=k2RWMs;U>l+iNoGZwX
zW+#fGUpO|>GzHI*TEw|5Tz+<TOe7g5z~qP4VU>*XSE%?{BHKqOy+@_=<_@@rlzzE@
zij!8_2@Mv=T>tHleSZ1ujc)~qE_d%7&F1ZZ$2%RQp~UnpvA-qRu%a#WS)a7N)suy^
z##U{6QWViTtX=d-ID12>ei`ajv=R~W-^D(jslXoX&`Wx5rx(#!b`mdMY@r~7dVRh-
z-zrVBf?{1qLxKfZ)?0ZNQ=H$<2{fr<f9UM@N>PC|y>>dZJ4Rv`NQq3Pe!AztTVsDS
zFpJ*=D48^Q@NM2kN@Ev7TSG`k1>i6_10<oT)JW)9QUtg7!=#Qg!a`c@)=aZ8%A*3`
zikl~fKY2W8d?SBg!1nIs0f7<2E`UgC;e`@Ib}ZpLfZFI(OKcUnHF@`w>)Pi2tbitg
zz+<S$#=3U{$9P=xL@`7xhP3r~V~=$~6q_Z7qZAHr=g&ZzLO%l43lLxLZVP4RB-f=I
zR^GaeE}e_chg=8g^pDPQLQgD|+ZSEpqq#4F-IR0Xg)nn&N6e8n5fu;B?OI|qV&g2r
z9xbG`&{Me2C+;($H3?0<mJ;>JRdvf9vj?l5E9`{MSeYl1(-tywLK*$gdlc8?OYEo?
zI%>>~fUhOyW~jnLuWhc@`$SrhbMv;d=gouAVT?N?HVG?Fxx7%79M3UfprnE>suW;4
z6A9o{Tey4<y>k$1!Ca;-871(HPP;XXa!hWT;Eif5cSE>q{&@1$YxAybH}XQ%<EblF
zL?4)&iseT$FV!i6i=S^XRuP;Pc>>Yg^YVj4R2VkP{^YAr27ks0`|AEw8g2HZ)>9D7
zykU;^Hw*mmE9+h!jo-XNKZ42(a;rUIr^odws=poTq((2<I~@itiWpJ-JI#k@mV&un
zkXty^bKsL94b0982SdqM?J_Dv+fW0iX;-yfC|(@~EJ5BK=5ylMGFtVAJ%bgZ!2(kD
z;}!7yuK`Hz&3?+K9uy|l`vos$d4Q@gTTT=*b6`2J+6hf5cWE_}-R!@M5<E|Q!ldo;
zi;&gg_Hak+B}ox+97I3k96c3c$=)-b_-)it?<(u&tMRm!%&^m$$GP<{v&6<VH{HgO
zd^ES7QEs?U<R5D_!qV2S%9HvEW2wyoMSo^9btUX*x1Ko`F-{HkP-#bPtpWk6WbU9R
zKW<v{`~nv<cZi<L4?8t6r8;e^zv3&rcIKF~ED(7kyqV!-tpY&H^`McJ-bCd!Rx+z%
zJhf-WV@32;xNEbiqq#e9vI`z%34=XO{gJ}Vb#exwobj|LV5rG~M(gYq51e#Aw^cIC
zzXXKs?ba(Cik%*AAkH(FG~&slIk1XwV8^z$(5u6!J??Gx=NvZ`mOXu>vDse%1G1$f
zfCFBtG;}(%2U$$kniMJQ10EgqR9ebN)_&k=0Jhaqu+*obB{pfRKLj&x6Ktv~MH-#9
zV>M5!#Lk%++M)V`xEvPhMMoz(-`vU*tD`K%LpoGLh`ypE^ye`)X1<2=O^DbdZZt2i
zW2=9g-BU`0!Q-5E;q=7xuY~BB*DwA|>5yw(X3wOe49%guIeQlzjhak0Gwn$0y_y6s
zy+<du#0v}-e1Aclolg{G5@}i>+x@`;>y&HtYLgnBPwf%@^Ry`1g67XN5CSSokuok9
z@(10-SC(UwdGXTq;p;p09zUN-S!iZZzX<^t*(c{85fZmq={L7mSD0l&xfsp=;HhP@
z%`DQi4j_{4?nx70&1<hVl52hN$&r77VZiPW+%@Z5RA_%0K)Q0G(?D*hL)9m+ljWCs
z$R|@*O2oK*G1ueq>2_f{$e(471+JgAeR(H-&F0C1p~195-N8ruT2+z-!-<7Q?bVO)
zMRyw_tec1VjS>{4()`+m0|-SJZ_%T935mJDU;EmNl9O8P)m3c3><OYvcpuY4YI7Nv
zJ6@zZy}ycR4f8Ide-k2zzVgOfZGp|Kxm`G7N1Qm#p(S<!`Je*v<lBUv06K=~g3DFh
zs6V^L@k|6hjJYD$su361M?fcj$k#5+Lf8YQNj<nqc%F&(QZ0ZK9rM6O%>V}2g<m0V
z6PbK9WCiepPWy<jy?Tx`9~s>*5`hFdIY}Bmezm>&US9iAfR+s0KwYJfGs3zZqP4)l
zs3C3xxQ8xI{;uj*%$rBsm5>_kl@!elXiF39W3H^qi6ddKtXu@-VHo@gkdM_3!Jss(
zqoGQ4vQ|ZQzoJxMM?$OoHSSCM?Fp&MQf;w-A63a$(J>Vhw2<WdtPd_gQsDc7L6QNN
zIs8rSA6I0f0Zo(`B(Gl{!Cgrfnk$>NNj1w7ZYOxr`OWKm^m^_}jbNJy&%rAcNS|e8
zy|dZh4upkeDeo#`Az6#<59j?OxkH;%#_84eZ;K}4P`+p?=5Km=`UdxdP11aygvxB7
z<Yw=xyL)%vym|4lKY$yq|4{4opBd+L3x?X}%6w0xc%r#MhwC-%t7w(>>SeEF!P@LV
zngv6zWTDM|725?vTP2`C!Fd%xmu#AKj>JBc;_*#J*HI!Kn!g`83kl~Y2x0`VSgcAu
zdWjTTD02e|$fLPD4cJTiQ5udlj<5#$s6$pt#n4Jg`&8qsnF`Tsu1r~P$9eHGQosDe
zr6420TpyXB2y5ye>@RLC7`AEOw;{TUE;$s=TzV5?sUAK`eUk=VBD84OJraMg_UxTT
zdv%x!AGaQRCiBoH0|Ivv*gi*C_kX}`VLis$GMbSJaXmPMJ(GLU(3Ozbn0=AK!?<18
zLQ(2?qREM}Pf^MafI9lho_Tr%hRZ#qLMTYtt+&O1#*xkhaG4on&7K+QCBn^OMT53?
zT30))iU4<V4fp!YmH#z}8eIZ3HTYsR2r0vjzyA_Vk1i2hOo^~I#Gv%`A2$A>?Lo1b
z!+Tk<0m9L}g8m&H)gmyu*DEi^I`B@|MJ1C&Y33r_jRROFnpY}AqCp6(Jc>l<N@Ywk
zDLF)W-C9VAoqX0G$L%*Sru}rDMx{PNIDEaJpBq!48o-Z3oz##;LCUy7jGZ@QSV4*!
zy*b8GQy632mEYF$F%hl8fJi{wZ=2h<PXtEy64U=#`$irOD^=0~b+4+ltu_OqsCt#i
zRIEdn0dc*06?j3hlfPDbga$R!9HNI`xs5F6{26sUwuC%IJ`r8wT?GbBOeTeA<r7kY
zUKlKG>H&j-6i>^O_Ugn<X3y#Q!TE@acLU;~#hGlMg);BUWE5qKKx&H18f_;tqJ+9u
zLUl@-eKEC6-0{@RHHFzCTph!O{q7@Hm#;>7<z<Wb?ybI08}eiB;`-2k)ezFW&wI+r
zg-dpA4dq5dNR&cC-3AJwv6It3<8Kg%pP%e*iA>plc&L-G$Wc6IWXIPx*e>3pgp+CE
zqf5{okW)_HxCM2ba;Vz}FLs_*h-a$lc2;-hYo`cUr#aMWorog+Tm#z$NnW1M_@bJt
z8~3F$l$SMP`{izV@V4fqIojTk;*bjYoGs!$l83s|BP(I6t>n;2=*V0n-sHbk<45Oj
zDP<SK&Apf+lv!HYc|G#J4exq^Ds#;t$VVTNQysUymCl)0ZWpFPBt7@BK{5<+v<^Y9
zb^&Rl#)&QS@C~hmsajg}W*;bkdhyn!M_7Xda^Rro0x5HicLS$h*lIz)^(g^?^La!$
zD36%ns>-H{wF2O&*A;c=(irQDG}`T<5!Rft!8l<(1(o#Q%^Wi9yj%RZinng3u%#5~
z>6?6Q_mSlU-w`WV!0Cy)`;7b8#P&>6cvk7U`>gj$4Tn<f-acrx0*8aFtTiGEYe1Mu
z!l|w1Rdnk#SjPWo{N2)gFOMR?+P?2=G|f_qb>;*vY=Tag8rKcw+p;;8SG1BmN9`hH
z>%blUS&rs$cYP{E_q{MDH@8x@8;nR?d4J5rq2}l2&V1Xc`w@b#Zra>NDFH_ZP&mpl
z$+_L-v6u5ftqN2Ltq18$aXmy+R1se<w@(7*i0W~dE%H*ZXExifq}r9?r|p@81a|GX
zNywF1A||H?MGJkhXV~m6*_YlLxIO!V?zAfZr8lHt0S1J}fq(EO3{?JJ)@ErC%fT90
z!4U@wKbV7ViQo>?hMvUv&g+F?b=j>cQ2Vm;oe!nAavv;QG3zvC)E?4I3$Dg?3FZ!F
z^9a}E1U^vVPJa{(iH<YAMOp8y!4L`bY!*?b_YT7oaHWqsdNCm+m!Wrel~Zs{CD7yT
zCTN4*SXmMWnX3z^U{BKOKN}RSG?V$MqrEiZH!~o)QP5vKAkk+b=kn%xvrnS={r-Ey
zG`_+&rLPH^E9iYR7+Wtc-<}d{4p(uoe0XIm-E{!nZw(TMGtNptwN}{vEJ_4oBuRnL
zddg?1DCH|QrRyc8-CoTnC4!LQSum{q@bXR~nwOxL#)xylu!n+U!El#|7@TUs#|1b=
zq{s8P9%RU;oGxJxrXUc6G-i5JrGq*q=Lf}5o@yF<hHC+7`O7Ji7%w60yPkX%x@S&u
z$Dm8%$V!NyvS5Jt7w=;uIktcYcs)I4;mlUY2U{Cy-6Qy)b5s`C=JkvnEOD8{lJy)C
zaMVS_q63+LcHfkQnaNtglX>-Cl5-A|yR7AYG{P~X&x1x^yzPqcIE4)MyQl@1W8+$0
z$(=NZ;*zQ;$T6sTIe<pMDj%JzG(a7<#F8_K{n!RQZn*{>>iJb>AemHW?GExwovpEV
z7AMm5+N;H@PD7L}rKWzw-yjDnihx3=Q<&n>?>He~E5On1*NGviItOEoFS4L75;|55
zu=N53Y*?Bck5kJ)X|rdV@hot(-tNM`@sI(Lgn8$O?x3e@RY}>bopJ;V<VnK|vPFQ_
z#U1tJr}W15l?w{RZjOK@SDQNMfDt)0H@Boh+zR`VFw76=54o`bEC$g3pIDjE+V|I-
z81CC9SY9O-I^7#K&b-<~+(n8Gx#qG_3SPm$k!<uD&KfE?D;P+WfyA2N(+3PeI|`kZ
zD5#ymbWS4?v)eoA^@41_`;6Zi?JMMpv=Ttc-ticeUyp<smxGBY_$L=gsSqp`Qq3W+
zC9%<7A~M_T9q#GkH0*0h#!+5+uRlDC@DiyBeH{Lv1Y{q2K(B&*Fmq@@__^jS!64)I
zYHi-dN|od;w(o3x{7ogz0DM)^OL7zI40k`qi9noc;GNQZ#;Ft^<>iqP&IR0_j;A<k
z;8YzioD}pSM*DB@Q6cbI&~~5=D4L@f2;U%mtRTtQAKRLME<ukU{*Y}gj-3ST!A-P~
zc{LEufCK#(UG;&zD|pO{&r<31vL?b>kX^Nj4qaj+rb=wYo|$+;g0tShZbn$EP%!1T
z4!#r*N+)>2)twM9vV0?)X+gF~D08hql?ow3+*Z5pJ|MpGwH$g>R8Eu+9PK+U-`+)7
z_xTvKh8<$jIEIC|sLb68Uh<f7_z3chIM{ST;VuchU65ZP@XvPT&9#t)S4OeBw?8C*
z_wVIq&)fn%5QYT-hf5zx@I6Z{s47S)izqicm|P>gk}m$#jq<9E4K0X_@H+7H#m<NO
z@(u-_9h?4(Krw(hRu#B<^6Tsc-S+BiK8l1JX!&cjY^t{-aGS!3D#LIPurm(fWQ<8^
zxBQoXEPbR$UvW~98rDn$A(CK_@AuailCHeA{E{ug_yK<;ird70|EHqX>&T7-U?{8E
zx$&SoKozBh%1T%HL~<roP{@pfnD4CR-3|H8M~n`?icP(j5~JGL3%75j*_)TwFOZn_
zzaP6%`M3xJG~;3N_40^69+CgAx+LIMm0nhBgaPk|(C)8Y050W)FW<}zT@1ZNchep?
zBkY;`yTUSzuV<P(1yW8+Wn~g96y9e_`x*fYI&Sg)EE_lEJryw(dJ~qE9owhyC-P7V
zH~_}!${o@mW8j*^mH;M41KRpl`!7w}mDg;P)@4F`Pxr^RK)j#yGuvgipRroX=~rch
zHPOq;BXk$5|GRhD)$bMQO-5vXcXJrd0CUFDx;j#*4RTjFR2PHjbM$Z?Wn3w4KZijJ
zXH+<IFh?3LL-ky|%BZ~>$`Z@ySl{BYG0OKqdtc)G<S*f3AdtriHBKWHiN})&g~wa6
z;HH9mR{It=FzQd+fpKq|cIhSm4f6C8pb>5P6UP06WFF1bAV-|k5a_CDnLr$YF5RQ#
zNznW1fBcX%63EB8sLp5<H)yxBzK^jAXmi-lhv9>_pOhc`OZdTQ@Bt0TjJ}{@>7})D
zw;`~;kipSLcRNO*8k$6E{smr&m$o*x@9qwa#6v;bo2&!z?Mr|8V<0yrPa!>Wu)(#!
z-~}9tK0fu0gh-(GY{exg=5IHDmJhf3-1-OzXna=~nJX0{um6Dhj)ml%g$Wyvk0u`6
z9iVMF@6Kicj#$!rKjbH%E+9WN`BrpxysjCfZaqM^Men+1B+Jod^?Vj5@Xi!V%&#oq
ze{>QNcfDm?G5%vM+NsrV8j&foJf3SIBs8rDZI@(ZDS?&QI9<zNlC7R4bC7Hi+p~Ao
zPWH^8y*kC}Dw@}g{4dcenZFTp_f{*9q44nmE&7VoLik;L8T5wKE*$vx)aV!)DoApd
zf5--|2?}^%mQvXQe~t|fV?p{~3Ji$Ry0K@jZ^J=s`-~#R4V-6HiITvL4B)@YY5b+9
zju)vAtyWVwzeL~Ip(cJ1`EDu5UhVNWv&=KOW+jI^i-z(zp|Oin;j_v5a`F*&(jf!9
zP-vl29LoXVc7NeS7mF|4CFGKj75o~^><E-QEZx0@jzEvvZwH(J)q;BJ%VXli>H$Do
z;MJ_44ed&6>_J?D@6pBVIi9F@6@BHI%+YE#Sc6-yl9?7KmcSMZZ?QwK%oE{@6SCzz
zrN`04cLWo<mj!e&e1Y`__G#eRZkBf97Zoch5$pFxxZnm_0ixy*qNkalH&Ua-Ybhwf
zp?0EkO#1NG2alkObOo>CNdF`=UzOsVb)4oj3^!i|M!=F9(TaY^@hGMO=eeVsa_L`^
zlL_Vmw`4*t_)_j2XgV%>1jEa!Uns?iEnG(4Trgaezb0PNo!;8U@c4xCQPJ^E8^sjp
z&Yac<C1Guo&NJsO=9S{bO;{m|rO4Eyx7U}_7F75O`ISELHT;T)9s#~#>~5I*iqF5Q
zS&WmN73}~CI?q_}?_<iF4^kn(r~ebrC^#mM&$oq7{nR&|sq@Ez_zy?bB<9;?S~wn#
z_hnoK8MG%1h`bQf0<>_1*Jy-dN8(?jfRKH7_|5FO;FJ1?{Ex9cTv<7`P<Z8K9E3;K
zbKH?m3I=7>;5{#fo8K0qcwFvIJbSm3pB2!%zFmh5s0}<ngTDQXQb}94%i`avQz2vy
zb{_n=b?qat0DOR-9-R!<@Rna2wKE>ZzMzC~yfLiy$Im~l@(<<UzcE|67Ft34$3is>
zxC|=N8w%lbygbhU2gSt}>bO-iR(s>|HS2TL^SNjNprr~{*jy3Pb^gE?eOS4$zgdkq
z*pR<-BayeNo1J#c`GRCrvjPb6vdr70IwR~({)!4u&BH)Y_Gj6<EDOK~Kf-P{zqo3B
z{`QZhjr2MI{@wBR>pv;WANTJ(M?sGrn@)cBdR@AWJBh%2vS;e5ywjA!QS_k-_d4B&
zC6MXq!9Ol<EChi@#lMU!I9Sr))xvBMaV>5xrj~Z^t;gLXzd`5lip~}=w^~veK*ke2
zvfqmIE+|-D$N77yyn48B4slt*qI*QhV{^P@`?>Fyr8?-jLOu%A>%>L);uYZigi+SR
ziz4iqr7t0{(}QQCm7=p=*_<3Ktziuj)^8b?xnD3nxdti_z+wYS?H}Tjx)c;s7AJFO
z!H}GX3NcWFYaR^`+k}w{d~+hdWm?yH-EKS$0-Z4Jng|@0eNnC2Ud=#aV0)36h@9z_
z=d8DN`SEuG3Yrw7DA`#4?ki8j3|?mWD@aL#v%=waT>$B`!8i3;KARbYkCW}fRcCB<
zGahLsz*sT{sgxxdm`A?(-VFLpCxr*ZI0j$chW_s5p;gjT2h9;la4zxrZd*$!$#!|_
z<5uyKhz}tg#f#y0wPvid{A(Aw*RkV=I{G;6!eBv31^n@2h=kg05NPuPzmmdl>^r?4
zE}rS@rxu-d3f>wmO@sJ=2%-eH^`#@?KaL4FixaCF{tWoQ4X_OSP;A|7S}RJu*Wa)r
z68P@j=EGqXVb%KZ(tWpeL+cF7C(f}_z$_+>tXQ)Di2~H6_jQ`49GkqmC+?d1tmXF2
zVbs%iei64`(lvY+8%h||z(%-vd3xzrg0@;E==ThVGy?Z;v#O6j4R}Uhaq_^B@kd-h
z-cP~|csQumSinlzlO1`I4+33=EVuxF6r3;ek_qdq87S!ChDO4@C}~~U7bZ+RLTUAO
zK-=}<3awH|uGd!MMp=@I_T`lPRw^@~F8&zoZqX33-Kq}NZxgo`Cemmbms!58vI0-8
z=H;Y<SZ4T@bU87Ww3;jDu%q>2nMDT1xrB*|W>5`{I{4JZ(@ycG{aN*Mb60N1xbi{y
zEQ#23>s@I_im~aBF?N3B>tZtR=d@nMnS!1%x2Feq>)YRU)g)KaU9I+Cg@5XyYI?Ri
zbEBjkMOy}l!x<R3$?6W`T$t<^!LMEACER0%*39Ky*D^rMVHK(e>EuVkP*fjQ@>)6q
z$@A>POcd&TISsUg4P;0jtEPhda25a3IQzO|^S&NyV8qm--Q{m!so1elLR^$+jXKAr
z4vxcNoX30&T=L=pq2GHdRY%^dMrS`-Jiw+}%neAQq>uv=$?opcEM-cuOlw=OZtrF?
zanSdoH;B2?=T61e+1&eu5mJWIlf+k@?{ST_UpSccMV0WUOig~Z(f;bbKGI{sh!lM$
z94j*8rd{6WJpR&;)M0HSil^+=jde{JFe+G?ck3F<-9J3GCTUUf@2<4!s9}I{Mk_TB
zb%v2yaONv98{RWea%vt{lk{#?R>J_V3nN(iqca(Buc-?4lx_<|5Yu(<TP%RX0$)cz
zZ@<Uiyr+en1=aT~cfM(kfcZ!%lYjp&fY4vW6Sqd@qlM~1TuyogA;yjkxsb&I(Z^q$
zef|31_kaNAV5_9?#(BrqeCjxpVcEl!<N@Fk8Yjq0mxnWZB+vM4l=?Hu2njaVOGBT7
zx-l<Bg$9w~d!nF)11D9FCnqosoE_Xc#UM>_b54);cLQ#Y>6^GGn%Gx%RQY_B_q#6i
zG6E9rHCzWlmhdZ;IwPD|Ma~LHm#~TX-wydlD)Sat05)VIFBF)QgG>PN4Ygl;-`+KT
zXDE@dI0jx68}CvoXse|fjs{$<x-g_vkd1h7J!-1$zH{A#Ra)t*Y=L;yyFh04ZkRWY
zIFB&-^8$Iik$Xw9S6-B<j;StuV<c8ha_cLdmK~5r!{JMkK0TZtv4V7UU1M-EIMs-t
z6+@G=Z+&O1&`OAqE0wF(?XGfAvK3lSo=RR^IKWEOX2Vn98GnZSp3}cAOfSq8?s5{v
zY{{R$4SJvVpf1xarmioAUlOg0t-s^IUp#QinAD809?t=SX)X_{xvF)&cZy%`V;MJZ
zi^SX6<Jvrve<~8qII;R&=yv+d1T$BvTpBT(nh*uDyD=WSApD{UKzYmDvGOT~qV*L0
z<e~2g$-z6X!nV4Wu1G+M&D+0J)<<!<EAVDr3BO(H3nir2B3WRdUlq2w^}Y;^|7#(o
zF5p3?0)a*MzhT~*ZEiA+4#z~AT1fT|zVi!=?T&G8y$`BSNiLhD;Rje+$?Ka2o1|S5
z&n%SC4Gn4_=K^YZ+;k(0q`S>p85*6jRj3Ez2ch!5Ud$41ugsLRO~qD2^p~M8lq^ie
z00Tg##m-FW21_L?9U%I6tjiZ7>@WpDJPLFVXm*{EoZbPPskC`FTo~;?Zx9alV3^rJ
z4!!Fu0Tj~a?FruoD=CMHmP-I*3sAv?2fW;tuFPg~S7zK*E7mIo;zTXqN;G#JSL<!S
zEPeipE7XGWpHC(8QiSg^ES<$2RQv9hiT4M2*Qn4-MnRN(LH_<|e9$t<uye!AbF$XK
z_$YIHFgsSFw{DRXQafRl3rcbw^q|uxy1H?Ea4E@npl+UYm-6uugFJ&|uRnJe)Cow9
zH|xUw$ULcQR=xByP{HBW>fGLKkT*W+#e`y&rOybxawI-8O=+IcIFUP`?d$<sz{NQ#
zVF{!LH^r+=yHK>I!*Q$X0LCx^oE_#VP{KTNc=Z%8<N9$c*nX!bs7x2eRHw5n3W3`6
z^G*k~IEhYQaB~<=ByO@yv0Mq>aGb6Ce;9iYa4g&Zf1D>|@0nS)$jV)IC?wg5lszh&
ztgN!LLn1pNrOb@%oz<<Z+ltJH?9J`}zVv)PzwhxM$M=79JU!%gUFUgT=lLG5_v`h(
zrt`$nGml_-CfE<A^8Juod0@;!m%RV2fiHNs!6Qxbk{%$M#YDHk=qU>GVTNsAgSK~H
zoEH1+zPvbE^b+#{PBhPU;CME<38Ob-MW}6b5`n*QzC~Ji8rkxiEsL)CxHNbB-|Aw*
z2#ofyVAmUYWZrTKYr$)yibnfq+ROg33%;#1)^|te?q{iw<>ia!tX}%L1<@w?&z|e+
z0LRDU%~@AHgAhmL1b;_7EE3|J#6*FmiE(64uT0VdYPB*J!z9iVL5FJ|pi9X7MD{U~
z;CJ3C!OTC94&rbg)|C)czb0eUI^CbzevZ<Pgha&($x+DSQ;VLHterNC7la&bF~U>-
zx(vw-GisZNxVVu88`FhnM}!S6-+f^UStnRCsb6i&YZ8xt>8en^ItJT+XUFkSHt0BE
z^!Qiv&E@06=$i}2k=eQ1{^Ruzc1su9SMbX31~S*<4=o&bD^0R5HmA~euAN_L^wwsW
ztA|A-pHBKb(RZX|5?Ozv(jk9^LjNb*)>y{G<->dXcPOx*{rgt)*xQfl^LMjv+a5~L
z9Y0<{V{%OnfBebVp4Cd%n~)z9_O*X@khgJPhC%b$4#iQ@(6g=M+|ByO^~Z&^{zbw3
zxu5-;rFbUAim^j_hZfSx^2z<XL$1fYF}fyvrqsqv$3kmI{>w*uGCxam3gf2^jtj>#
zCit;ODf>qjCW1$4j0`uAj(*qrR}QT#V2g!m<@`O<Xea!2CNi+J$5`F{qh=Y6i422d
zih83D*xM^R$Fe`<Cq;^HFQHrfj#rQRcBU#O9laQd;;e2&gxLs1Ic?O|!}B!we|?@1
z)Sf=<vv{I3cqZWdx%4RXHeI^<Jil#F-iFtmQgsMJ8~s)rXQb&BT1w0Hnb39X&GC7G
z*JP^YrEjhE3u+#nvn1r4Hb{=O;`-{@7XQZbY@_urf3r-v|NYOZ_tL4@$S>s`>6$mf
zVW7_(HtH&1pq~F91BFPOBbUR(ck2bORwTRdl%YeBQyEM9Q;s+B6Ttm}>>*lE^<28|
zV)5iIi%#@>ki`R4kLV{1?bHcG?Ez1un^qb`g{+!%@17(($CC!c(RaUe6mqA3WAIX3
z-C<PyZljN*=X>RU{ll?*ie+Ua4&ufeuSaWEgr*FM(vfSPoDp?^=aDamCkt{AcpXNq
zzix5rHT8qf2(oiribTAj^B{L9gEG*Cz|MG=%TlyZO2YSsh}ODzig4u9knw1R^F@2W
zj}cnarvd#|U7NgAmvHV=X33NnpQwiOO&e~|pfAfM>1{r6Qntch>OcLz|M83Q%^dup
zYRK|A{V}wZQLtTK1k5u4_Qu~4GxeQRPoVgB*_QwR0FEq>iF0;QrCCvrYEz*#N~@Pj
zZZPSkCu{<16L1K9%C#j3ta{AdAaxr-WO)^cJkcGDb=RVH6)>_#ZwS}+S7bOf5xIjx
z>gysF$gAm;ZQnXnZ&=C%;#~%iA5tg_<$?bnbHGyiX6oi?413zT0+u*<SGD}`0J7D9
zHTD;t^wA8DC!d~Pio0>aaDDm1y$9TXVUtK)YN$q^nuAQd3{puC?Vh#k2>VbwS~yy6
z|Gqc;&i=dIup$HAyYY#eNDS8s-}SN?XL$dSlt#;J1ieOA(nJ%U7vR0i;@m>D9jipX
zmV;mo?jJr4s@)1-4<Na4|Af!@!y4+ZS7n&*Zbt1YB99(PLL(*N?CdN-C5}-uhHeOQ
zj1RJanD~xA$OAA0k_i1t8e&Aa^)wMdi*CCud~d~m3$uantP~WBX=xykJf>90nV6<Y
z+V6hsV(`Ci&`huW21t}0hL0VtLpH_z&Hl#@;D>nV#B&HP$@ciy*=lg1FUM+sdS=0R
zdMZ{RyWg&yj_z87(|+1Lgvpw31oMSa;-Ma_5&`6g*JMVUm0X}AvdbA{?lWbJ>iY{h
zoj~|dFpJJ>pSttk855l(LQ+tt!-Q<Uwn3+(FyG=eq&l#|G-w0#^QUAwtaulrRfEeI
zvAmOXAB`#;mfaL3{kUz<@Gs@3fZoj>0*LVXh^<Ox3g3@jS6%~1lk-`~*Z<EwcmKgs
zAvb?9vGpUPMeLzI|DP;UtdN)gzV?R$*RtO$TiDIzCl-fl{tiGCbhOQ1K_>}R%TNE>
zdF(Ie9cfi*eb~C8dc((G<b1WiRLoKOA2cJ$<=n!uZz}nY&%oAybUVgGgSpqtP+xn)
zGFqH?h>hRCVwy@0+T9C&tg!n$rM=x|VyA))+lleb+C0c}f0GXXTD&RCANZh%)4%1Q
zz3%>>gZ2$iS3e*+-dV?1tthch7jmzlD`R|rZo0N?c;4_iPOR}?z1FwVvP<G#kn;z9
z@N!%}UB%yVCr|#3XNcHz%!*PKcb)vSsal=COw7>6QKOJ)Qr)_5AZr?C=U8G#)AuL;
zGImWyPO|RUvC1Eqdg@HaHNsmo&BsGO<hNV<U;w)-%RuL1t+TLezcU4+atePS6>|&|
zwz*la$=7!r!1{h3a;yxD?IJ{7pXR<H1x<g4{SEBFN=8UvO0D3OVX4xHx#MO2XEq<c
z%I@4fz*xj210L2{X(wk5P~AnObdVDB&Z<XZ=;8(G*c0Jz<7sxgF+98dofVTDM5^`5
zYd&r05DHdx%V>XWX%&b$dZ=#*YUg+Z&YJ>(*9b_bU$ogLr69!U8bDG(%3A<5-??5T
z3JGTWvgZqZc`?Zl6Ly0#RX67VPAKzB)-Rx)rl|V>6wNlO#O~J-7w+Up-|yv^7Nbpu
zI`Wtd{J4#E%lvNJ8v$7X{d;b=ke`sDFb$OowqsOcZKBHWA0fGU>6dd6j2?oKj;``=
zork--O^)}61jYfwXs!X-=^W(u;!uIh^q+uut_}pUFK!zT`<r+~Mh$yR_>!%7&pcPr
zU#L6A9<bTIOlzp<ficcKS78$+5h~Mq@v&>&^asTGu23mUInky*>Q$J7^#8;6>?T%W
zOAVQ|t9`ZJ9;HLZ-b^nCIw}QmZ+(WY8_~|p6LUy8+#aIz_g;(ndEGu6J-2fpd%y|A
z1~VA{^5^^!gN=<xBH+x44@Q?t0d!3Qi#)LjwOh5V4;4Se&VQ_Y?c9ED54BG;kq<!g
zt<Ss}J|)gq6{bs!ld4zhMyD4_)NX;Mwp*-=QHA#6h4U|l?JLG3yPjCxEq^C)Z}m-K
zrQq$l9(&V0xh=qXbJQ4Q?x8MBw`HBEwV~z3?{>2nuirbhlO%(QYA>kUUu<I~vZ6P#
z6Fv6M`5b8?$GH-5`IIBTTN2aXALhREJM@Evm@6p#Di*Expdqj>j;3t|pW&`43vNG}
z#v$-4BL2JeuCQi3e1+4YZ?r;E>A??Z;n93qz14m@8FpKz2)RlmquH;RXq;E?_QWQJ
z?BibD(l>0~TpZ{UgytG;R9D_VV$uXvYu)j^2al8oYXDF1t9jGe%^s=S_7ke^QPK|G
zF#~!rV$(a2>aY6RLgu_W?VRDeov&kcD5m0~*&-A~>KbQ}bl`GGG3~MW>l@Y^a<!;B
z_Az+k#cIwyyR`tY9PPL#-|0SMTVOV(?EH=G!mr;@?+o^Wx~uOC*Q-7zmMpn0P~l>J
z+0zp99;zr=jG;hd!lyTAxa%?Q=4S`YjzK>bF_XM?2n85N_Iju)R5x95r$y-6q?b3V
z&6-1!5v6fVGYoYjw2Rp0mj?m}7{V|2fVk~WPUo&$%hieb_84}@T`cr~`DDhDJH^5|
zO>fi2x?wd&1ro<`0GvUWEI#o~<(|IUH0o_F>j<32_m7IRqt)$LhB1|#O_8p0zU&q(
zx!~RXMghKQJGrBK0($PtH*26eZyZ|`>UwQ{Q#m>eV$E^<c3d1B*{oMIKiO0xZuy1s
zn0b}ewdwHZm&PB_4kaBtZO`csFRQyH%NJ2Va)<M$>wE?$9Wbt^5cuCBLbTy>H_<jZ
zb4N`z^hJ0DV)w6=-GR5>{Spy28>&CXdQY>sibLH%o?O>z>DOJ?p4nw7)b5~>ql8aL
zD&&vat3pX8NrAAIrYd{)0Qh-OceHI3vw{?n<kj8QF74i<kX{m**eS-1Y)+kYlv!(`
z`SIu5&VqMjsoq_lb1S@NDhoT2(rviYf-fL(KQ+lKO72QM`VEP>(6XCP^@_3&(|z>q
zNGe6!XSm9<G4ykOZDtzg(j%X2Z?BvZCD+AOS|avahA3`snS5G=x0qEH4lJofsB;p{
zK9$9YRo}(krrk_fqgA)!-<vOvp-TZlH5#V9f#;sK(^M|*0e5V=+*FZO54ZOgA=iDj
z%;z5Jb<oc4uRh=-S)N@ioT_Vmh>-2uWL1A2G25M<7C{C}VsVKnRPx#K{Uwt5N69K{
z{7J&A!E3clq4{Ibae?i|5+9H)xM57)XA90J6>0#ISHImzFx!Gpd${t9wYA=I8BS90
znm3^CdsWozroc7Wo)!VdEabOFb}0W8nl|COI1)BSpP#xKlC3Z_v%}BR=Mm-q>;Z|N
z^vQHv2?7~lJf<p!y4@H1)PFD#2)CIDD20-5xk{E3n=txE!Acy`?o+xlS%hFa;f8xO
z^28=&(!{x&x8akKCUcZ?E|4-4!{s6;*km6w?yNpPmNgVF;+b{r^Jt!*!(z^~@f^h#
zhps#~>MM@6{9|U{<UMIrRM*jwpO?OPAm$tE*L=BS_|X!B9NF(jP7QUF-ex4ajabo`
z$<d;6=b)w~$Igjw^9JuH`}!RH08}lwX@fQY*dX7S(d1~_Ju{d<QTgdY`RT}Cnwz?V
zw<WXCKij{ntO_;nz#d|62(D|r+)y`tAYPHvRN1~=nrL<SP1De1Hb_o|-C8aS-HUdW
z!^hJ2CZNM6D@1JO2g@)fKI|h*=mP$vQCGzYQ_^a?IX0t5N%1;aCcP88Si*_46_r$O
zI-iS!rlvu)newG|T^-=ZR^~<K2>0n-Q@e8S=5!u*+zFguHyc+vUQ9LA6<?|3bU(7O
zWLX;xRM05ui}arpH@|{m^q8@KA+SNdGb4U(Q`RSU?A4hA^vJ1+Xmlf{xqK#Qiyf8V
ziEk@yC+KP}&|Q5v*N6(lx4q=^R4-{K?7`${F}+QX+875fMr4k|oaALSBdx2^*F2z$
zCryi=+qFNmcesg`N_R%vykRG0lfe`UyY9?OJIT-K9-7$Q_?l;KY$>zBM0JQ-Vy{yb
z*k$jY_E8?>IQ-ZxE_pURgv|10@OW#v7<*omOt(*`hXNx>B<mnv{ul`ALm0U>IE!)i
zNnN!xekny>GG{*2*nL+pu6s5y@Pa#?*@a2jZ<?)E8au_ja??<=n4G7Zt4gIrTN-nh
zKON^)E?H}1zMa4ByPCfjN$7fZ<BV;9-m3qvLLcS1MzHrkMhC%CCaoK{C^PYH+DyIA
zU-d3gwv2jWD$Jx?mGqD8qd5{EBXg>Pyd)c!CYAt-XjZFq)2Or!{<VW&j*XZ85nrn3
z9JpO;Z>M4yxaK`G)qi-_J;5)WwRy!<4!xHEsEp$^c?R7uIIOX2NfElP{%sdG@NCuW
z@G>~(b<I_nlo~I+Mz4GSjK~RyGOum6;ri3*lMtc1;ep|syS~Kd9#AQ<Hm;{5-TC=t
zUBfFrIy>6mvP*on4Ml9~TR{6tn4mF{o$oVOc`z@2uA8a*_L9PzfT$*wN%5)BpA;hK
z89Zq<_J-(h8HN#5CK}x3*$(sVqpEX?6^%*n^nr*UziZLWD%qGTO0cklAzZ>K9@Q(d
zIo!r*7tQXp)6bX-VED}Mz~?e|v#T!4?qsff3jy8A+1zmXL}K@Z;&vNSupUx7*Mrfm
zD*^WqHdl;`9fB1Ov__k=??yJNkb+F0Yeli%(jG{C2nZ%;5$|<rJ6QML;L+x#sXn0W
z0?Yj({DW*KhR=rs(9?iV#8^)OjI-0MI02Z30sAL;@7Q>k{_(>v=9{~+@2OFwz1vW*
z@b@kyAE)3`2zTBA0m_P>cDw$yc6py`PVZO<#qT%eU)Z4bG>0FqDJ0Fi1Vp`_pwewn
z{s2*upMAz6b)Sv#Y0kmSLx;%>nmH@;#~9H$x%~?pcfo$E$(uSp7fn(_uFO<n<qW!^
z4W<>kgQf*<9l%Oa9OVo!sSCbgmE;*z69Hc0e}_>l+dW?8ON}LGZXF@57e|B<mYTTr
z)a-aBNfYsVyu3B++qBZ*%iPxZ#xPhuEM1-O0ZD#~`^BwbL^!aXymk*p3f|R@))j@~
z13R9Js4Ygnov@Kh2b=S(>e`goydZ=((+tFj{Koe&vm<~Z7irg=c2AHCl=~g57r@$@
zXEWaF{hfUkSEn&r>?O5p3&WgiLhFJ#Ag}8FjqU2(pAf%+tpDE|PmzclC(r9#Uk0b}
zc-IvCW#f}AMTqo2ciXhk6mcUT#%CABXxrh{>t4vE8~ygIwagrWHy4g{rAJalb4*o0
z?m>(^&ptGu=j7y~+=ov<Qu7(ZHx<`XR%Ceib@R7gScipw=hFRm?42$2tGAHf@TW0e
z8#0(K{?WDfQK7KPm`S(gd~r&?NQ`-2*S58e?+YV(TtAXQwW__mbm_j8;+!hCM{9Cl
zRgEgHYzj(MsFt$u-sW*oFqe!bcRtno`4ak_OF^0uC7u1pY@_+h^q~5bl$G<Qa&u@N
z`HV{i0?rfGF3_c{WGr+}c#p2Q(ZmOo#17A`@8JwFj88Sm5Bwy&oK-b7V4rJHF}dM(
z!Sn%Ze#L9q3zd_pZn%UdvnZ7UqoUTHRtfih$(6dL081?Pf`Rs54Ic^4?*M#6!{?r0
zVM$dXX+9p%W0H4Vc}>R*v}+~x^(6E;59^jv?SKD6jvW!2`7rwE%cAx|fa@T>vrje2
zq5%V8*W%}EI?pBhC5MpfCJwmalKw{F`=pRC-l-r;8pw|8&bkmbJ=#cA<1$76c2MYt
zFHNPiP5Zj^Obh<i0us+X9zRXLAu;Dh{@?!;{Z;%ubN%5n$TBkax^H4aNY3Tmpnc{Y
zF|xCpHVlG2aCekf0m+pEcLt79hsL;pR%HQ%<M&PSZfujh?+Ym1i<M_zFtxnLQKmj1
zf9+3zT-~_Pg#ylPOE;hqkB6lBD~i7VC1B`XM4)iD;q4df@AGl~WB&|(=144@#dxW#
z-ne)s+#$R4a>)oj>k*AHsP`8Bv6p)23dIvP7u@fHday||&RXIhWV3!+2w0@|PKY;E
zDYgkCiM}IdIgj^O0q3BQ0j1MlSuvHFpWQ6a`ln$*g_m*4EUo*oBnS)-PvQADKGy|Y
z@2h<@t$RN{Ho{`QPu?j-md>5_xD8)XSHi>{ccCeIO1SD~5{L{j+6=oJ%NLUD?yZWy
zhuX~%r86@Ly}U47t72gxWh~^GTZ)LJgh?o?TjO>A<$qQN@jFXCJCwHiANT%ZdtBlm
zehy@$;{+evm<j8?Qc)CeitB;Iv!cU+RkY=HRc47><4$E&rF)BZMjE%?<)&n}6wxe$
zwZ*XPL3_SkDU~Z_(Ry0n*cezAzJ8vpEOdG5`1_v&XIJN*nyy3U7W)JvhA^_$a6}?!
z=5e>TtWp!Wm7}}G7kYoYtp(rX_^?~O*+eDxVKjAZ06CpiJt4$!x|dKE$V8dt2kn3D
z+4`8N2ms4y0hLT5y@=vHVu{sli<Z*4^KFLk_Mp8o!|d+4^6cFv2O2(*v$jQl?olua
zg-e)#d>U7JQT0OB{k34&`XGb8z~?xw98=iD;Qpo@5j0b=ec%+>m+~=V$ozPp0-wRI
zLgoW2o(bmp8Kq)4I6ejyPp5vb2D2t-!@0TKl+E2IH>FBAXkXCY)B1@s9dp_B?daQ2
z{j(4>J4_F|#DHEjfNDw=XVCtP)h$DhCtj3(Bq{{Nt>L!^#T~K-hSo-<;(V@o-!)CU
z%C0b$`_e7N8d<B)5nHz@_g`}=1)H~?|1GIvz3%2PJtU>$2p@F{s&9&x`X|+mSpYtI
z(<EjYZdvAVs^ET%a8Y(7cZ>bu{z;(_>VbqoqFtCCBKHJIUP4#x)nKK2o~_X(A5}m+
z2gNUp_7uMAztyv6@_69wv>1$wvs)x$F%0C(Yv-NgPRe8;V5*WU&$hhf#H<2|lm!$A
z-Iy~5?XQ5cttHx=lS+lBG`j3(VV{4m!X-l6#}!#D;dF&iqAHg@cMk^K5Ix1ou}d0V
zMh)6v$cM<ewt`7=h!`;6D0Q1E5Q${8l*`-W8!C_aqWG(#j?Eq0?EAe;-U#LfQP6E$
zvq<Jxs}oHKF7;e~trHJ@6`C<fr^08VVJ&q<g{oN4G&VA5^xIuP#C>`%i(`ai5Gfxb
zu+hjyJou1(4a12;xc6_}Zy<XqtScBM;g$of5!UWuv{e?jb}w<~&U|5#i*Wt(qyN)Y
z5Z^7T4r3E{o1(v%xO+#iy0vgkL<=v45gXoHddq3Khk{Lr7@23w6eq>29Ca4wN+^9b
zb>EhB#`_3!nu<YdB=%*KONUd0Ty_bF5#3dsV~6yFwF&!<v(vJypf#G>RiQ}Q@;g5q
z`C=B(O1kc2w9T&)))jFQ&-T}_S{>6hGDPoq^vO3wmlQ^vOe=ZEte&xvq0M+c2tE6S
ztB$A|J%545$6uv$wP%-RQkw@BUK4jdwdG%>9eBoiq4<-snns2*BBK?U7UYfWf;`e2
zLVzYu7{33jJr3SKXNBcsf<gVaiKzeD-D&GHyxyyyIKzCuhY&C%+dz(p$QYN!PSUTu
z{l7)JnU?1r5w%P{W=s0XMWNK&)`&FIMw37ubW24VtzYNhFcf$fj{<i7SBiKa8l4~)
zTq|`6U?l^|V$-g@)f8c2>ko*&u%;Tnc0lI*Rwe9UvLX@0kpg|mdRxF}!D#PfPQTgb
z`;5Ly3EewheaX<jXrTh$G~k?_>u-aJIzdD!Q$WYnvPKb2#H?qfAVp7IY@yijQBbx0
zAMVRMcsR`=UcWNe&f7PNI%}pHq`pfnl4O_W!4q^IbWx0reZs4Vb5YR2<@1BsSR7jo
z7X`)Cfz(Mxs?}>iV&!_m!V>T49A76)vVehK7f&EqFvtw$U}f&Od~DTZZRDhm$-F@F
zy?B^jn&%BGSwzZa?Q{<cg=27-aGixK@=~|%(cM@PK?j|2jOZ(aSV|oSCw1T*5plIf
z6SYZz+HZ&V3SB^u9ZppqE5hHKjKE=RYNL)=Or=zfV5Rxql!#A+T(IHQ87@nUs7yr-
zo?8eBu~Bl@mqV7a66OZ{y)YYIPnK&k?^(Gbm-#lXVnj5hA%Z8n7UUJWm#2=gBXRGS
z%{R15n%^9TuM@n);KrXZ+9*A$x$!#@DJT16KRtohBbHKHt(PL74z6>;_H1RItJ}DG
zTr^C-bic-oEq}M`5s;@&(0y+KRvuV<!;SySS%xF_l#!o-(hDfv)RhQKE+`YEt)Bi3
z6e>N(BH4fIE$($0RzTo<{UEP0l4z_&{p{tr6XGvaz<}=lI7z#uv@Mj$)wSbI0SHcX
zULTGA0nGxDU#I8bp=RE?rK+dYBXi9>sNn29dI)o^e*gEngX@}kA!Hkm4o_J9{L{lF
zeg;A6LROjsEqH&P$s&BT8DFVeEl9M42z#i+KJAA8Vn>aBnF(5@3<<ND1^objYhP*@
z!nOwC*_8CvP0gb0EHx(dLmL^}KmUV_-gNjr(k80Tse!BJq-H0U&N?5el$y^S8M6y_
z0Os7BmX(Yy5}E0?m15+AY(Xm2{kjm*TF|$(witt|l2K7tqZ+Dy^{_gKY;8Eu?@>$g
zo$o(iN7@XupMy#l)&S26p?9CCw=k3-a0B@X>O~2NtJs9!WTUz=G`MLv7QRx6Ee}f3
zpZ&_BwrvuYX{C6_9r&LctNl{+PdSsW@5HDt8Q@W&LDI#(_O{2VrgsLEOf99xepnED
zors!%Gb_(z`QLBrmQ<8|mVBi29`)x&kC5hl`-P~QK(Pg4t%sb7hAQ?TMMDrXkJGH5
zn3P**yy({K)+e#He}IBV^zeUF1W&FG(h4XE9`4ltt4oV)1WlEEiKW_kFi{9ECy}$P
z%W^UQnhc4G{*V+>aBZ`o`ht}P*7^%K{!wa#&w3_9V-~ZgktX$Gib8>+CKh;en-qXP
zdy}j=cPEL~JtK1kR{1R3C;q8n^16&E5@Dc8KJd8E$zs^^tzR`m&ezV9Fk36hsgVkL
zRV-tQvu8V2Qc2HT{Ib2sJ;j7k7mSc|PskpAUb{WgmF_w93@deX`k#uEt?=uwe$K>i
z&*~RBU12EOiFgl$A*A65Qdw9mK%lCZ<feb}uw#5;8W`B*d_4Q5&j=%sEyyEQgY%ND
zU$wQ(5KjQt4_5ngD-claBp+YmO>I(8{|Q<Ac2AdJX(SoQp|PKaP;Ch(OW&`ZvXQIw
zhHs-+T=W;e0@S3b{ep9S34j^ybRyoIOlkqr+&GlB47Bi{#SoIX{oDc()jF<bC*Bnh
zRc6WhEq}kJV!uAa)I=bGyPaBdSQ0$~mz8=LU)-oH6!9J;B)~t8ei=wE^(6$9TK`i(
z(xoh>n41XwB9Yz$h<pTjvTc==2rHS%CRH@;T*NvkV4xXsy%!?Alpr7i?a}a8Xe)Bf
z|AsJd(gB+<kez&TN{7w_3#d;m`x<cP4%VS)@sQnX=6QjG52Cx^w4eW)R(e^x<9V6T
zbI_OxRc!9~K<me41p0q`d|8zd%9#FF5<>?+7%>b0^}Wy+^j+aDKF+`{GeByM*_tw|
z4X50JFBMs>(z>N~Ucz5}0YpJa%{HPwCd|ge=;YyqD=^TVNWxD3N}`4q;=aJ3zp5Kl
zUvumSS~+j=lf!oM*8&Sur_h>6cJzpyPm=b@rWYCH*#<k(Lix&N<eJR+Js}W|VmW>S
zHuunD)c#V;ZVO1uu~5wcAyvBfpM~M2s*FWo`<H_N=Ek>vX!v~qkS_G^378K4hZ74Q
zDek&3T4cudJIHnf-jmTLzK>Xu&6w3rYi6!A4$FrpM^t<hUho;K^r+wZ%evDE_Z~IE
z7m_#iN{}K}jSAHbT8(gM+wz4qVil<W!E@n5NaL{Wm5GU=VzZNeoMr97&=0bYJx!Sv
zuQoQH8{)g36Y1N3newcR#D)vQrFmq@N?wGgpa%$2m~F{J3bU6sDBFN;m!kV=tuG(h
z1Ny%g`jLY74ug~qNL_w;Q0^{-emN-ydrlESI-}(O!#qOBXY0%JbLm1w4gc_8y@m+j
zwlj_o^Mr?psm)cO=eN-zE5)G9!BQ!7y}7(I*2ejs9kfl=phIhiPC2#(#UZs&MEAW1
zjPl^P^T>usbh}GA=pkIQs>r8blMZqT_fXcWGN)*%&{(UEH(ETgO<~e!Te80N#?nmP
zZYd6+oDhh3(CVJxc|vOr_KUCkmtheF{kgOxJ5E&G`B*6XImSq9Frgc+f#QU{N%eqe
zc6)jBWXp)}H+7ybv~#RpMqPM6)5h#r29*88LVqaeFTIK8A9rddD~BpB7HfH&Gm%I{
z!Ftd0XK<YtXmMJx4jT2h-s__*$y+s%gZ&3O%G*IFIbR7s#~KK?)_Vt_GQWf<q3WRm
z=&Kv%;{`<*x!kp83i;i2C>5Hq`xJqa`Y7<MQV7Ta3v0kNoM?Tj_0<=E<-D~U!zw-4
zT-&QiG7KQPOz@AF7Sd{CoEn00*hZjY+J=&|F21LbKXkfrmz&pepELK`T>ayq)8t;O
zMe0+bOkIz7Zux#l`VN@B^=R1x@qD@PKjKu-BK|zC>R&f+>!j`u1Ziof$%ZA>|DjSF
zg^P+n&S;ofs-=RkS*sV{z1N3bhOquYDiLT=0$b1sW#FR7!SAjv^3iT=W+sd?x+pe-
z67{5MiYKqjdNf{BZXu<7SXsp${5I`*6qob4=WrIjOXhtBW5OoPH8q@54(f^r4E&u)
zTj5~Ur$uJQ+m8hB9Kx7XF(MQZ%qC1n&1;HwdS^jZ|3>F-c}p>&^IUJlh0kArf!umC
zJL0k{pF(WOqpz0avxMA1*+kRoLf=OBx&YWmWW;ATXgBU8BWQkNCPkAB6AA)0=|7dt
zaUG{lBth3%xlZ7fz@(@v{lHL{_ZD<H23>)tU;sE_>*JdTz~5oVFj*&MJ;~9Jjje0|
z2Iq_$68hM+(%lx^zxlZFl+-3wP||lO2&cc>U5B<>54~1-34Paqj7kAHAsZKQioM-T
zNtO|hT{$AJsq=yYa~Y^RmGxirlw+b#B*ieYK5`)Gl_bMb0o=>%8QVw2)<g&a+PVGE
zO6IjoTrEgTg6FU5ElZ;+*{J1%egbk4MuTj4rtx#o6yWnS;ICJMw);N@u%+}-G^v%9
z%;+0c;w$t=(12r=sF3T_lU>@le+0`h=CFaGnT9j{(eXf}&E|Os(t6MHU=i~t`by{G
z!`6hwy~u(cgron;-~7(hf?f$ptId<4$5Jm!8#;`GoE{OXge3r>?q8D4M&Mta2?PWA
z;DI0z5)1W%R>Be)?|`cDyn*$gsp@H~_2_AKpGgppx&Q-Yw*OE!{zUg_(Od&e2t!$c
zVLNH)vE}OxR&wVw(2qYH?M}z|_C0+6!8(+JJzcl7V$3BXhjs54C*l`uyZhtZ8DWb3
z)Ot#ugn^`~l4q;`QILwRx#~&CiFDhbtUMlihUmeQLua9iT=6Mg15w>T;8r~lLx&>7
z@O>fj3k|aoXXss}gyUbMoiH77x$D&0IV&W-{lmT{v1UEkrCMlQy9B+>r{<=lUS4CL
zkmFTh_buc0=4={kyOX7Uvg9X}m_3MYJ_#kia8cypVf{1=efkLRwuHq(KLjdHWrJ-K
zDs!zh?8m}LEHniwj4;*Ye@V1Lqmg}f0e7c^sQ4oyeYy1f>Iuul<tRuBEMF1`P}_h2
z2|<Zfw&nMJP<K9Xa)_=**g>!VyZZkx6eoy-T|iX!ojoEGiAzyc&ul=efa;cZTTIxZ
zc{>KC4Ro5rBJ$u`OFvKeAJ_8!h=SKTRFs-wp%1IIn{f4#*l0hJfBgH1X^4jXHKFpj
zlnN3e%P}(G_p7&3nc>E-K5io7#efePW$3^{=sat92o%Y;mrEWa@_fTQCHZiZj`iLu
z*ZJ3o*lc~cAR_!1_(<rb`r8(yx946=xnTE@V+D_f;40{NNMn(%1|WU7A=Aw_WoR*{
zbrPXMzUl&dY82r+TzY<lXo$Y#?@5`nBOLM}=k1A23O`4%H%Z@TAfM<@(y>#yCUe7k
z{EZN@Y-?B{4-ggO$^PXp!M$=%|NYX=X;9~1-vP8xNrV<^7)svhnJAI)5q7E(&-5@N
zp>&k2f`A!`SC#c)cO99+i$023+Jjl({UWVMR}`_(kjJR*4Co~XqO7Qx0;I$BNtgrZ
z*`P5rtUzvOp&sU0{$6&gUw3*TtI`kAIs&Q5I6=}K3w}Zf8U}(6rk&)uLa@33h&aHx
zNj==amzSA>?5CzhH0M#X%)oMAp62r^@vqpFMeRQj?aD`|+p-N_k>>@&U4#}>XEqG&
zkW;6hN{y9FSZY=ExoV*CvZq`-TX`>c_iUuz4v<;OkY3q^8sHDzB&~PhnC%S61bxd^
ziP6j9Q!@1g)|-#y4-sMLNAxdCw$roiQG*iW^62?#Ln)OI$WuP4x0O;tpOGcQE~*8q
z5|Ct!fZPx(?`^|&-(moJq^=!CTuYmyZlxNi?kE<dGe`PUl=ltk-gTnuaaUEpL<ADw
zv`qBV_9VS_ZiBK@Ni#&zy2^Osx519Fx8aUld!J-<UfmG8_k=BG-YL!7;NVWiZ|?<>
zI1wC^y(J)aMHoi4?J#YAxbsQvMoVbasAx`+<jZ#%lfyadn;FNlcxXi*)z!#Jrbh7q
zn@U+q8BR~-Rl%<ZZ^{CLa<2mYTBX*Nifd`Sx+41QFB??1Z~QpN&mbZipS~6bE*ZbW
zN{<U_DQC|UhHzx<xlE_{G?QH-EQvDxBKH`g1E0Q`E@7f{D!6D!MBv|h{2b25L%)Ta
zEYaLkX(gsc2!U}I?tM0`4Q@EE=j3DkvcS09E}2%)p!d?v1Feu~<kwP3@z8|nWKfdy
zF2od>1<rIQi!%o5kf=HAk<LEE&1fJ@zxjfgPGEQJjW0g&-${@%oN6o;Xbh?HZf>Z%
zVQ!(ODYy5g2HKDBb1rd_?%2GB{uM)=uoW_uFd{bjJrCqAbu0~+2A|xI1rJ?LOdHd@
zR7jlua}Tfi4r8opk{LzoQV+(~%tLP562~c}Hn}O7<oAy#X$pI9#}MS#CM2hRN6_hx
zR5)h5xtTJ!Bf6dQ<TPP=@a43>bH_tFd2_(jBPR`0cdlPY{%;-_WIs~&9QPE#FRuix
zSjwDe%FQRY-sxS}(&>eeWIGw*ok*4WANyTTlltWNYWtJJ_#NQny=@11Vr&Q*bI8}=
z&gPHzsvohW@Mri?*PH$E!<h|MdvVT_pdML>Z4W#&wGsW(|BPU@pe$abx7;3HVB<1y
z>o)#9)ut;rKWvmjNS{NOgGI_X5z1Z)@wcKqudc0<V$Yt8cGzj<gL%&@i8x-(bW>7F
z5mVEQtQxJMkN4c}FX{!@yf}Y*yXyME`drU><T3rUMCM4E^VApR_@gG%QUcYj{qS=4
zopL+=KmxMwKVefEX*FH$7PIQ)l0VvvC+8%x7`Yf=BWH3mL+)PZ3r<gh2}+q8Osk~f
z2}KD4pRTVKB<{1@{v4}v{aw7$s!x7Yoj5{zs?ry$_Ej#xrek6J&CxLI<3nUY{~!C9
z>tqJ-&toivCp%HOReVM9=iR*8;lcN}g%`%EN<H2g8Nb!aQY?A2ptEu}fkAPY$3Rh`
z37S@`@@S?PxLV*tOe^(fxXS-<E%a+ohJ1a2L9t@YI70ym(A3|LWAy5PiZ#ZskdlKs
zPk`f5z;Tui)&ckMQb<Wm?$x9P@fB6B`_P{P#UN<F(Hg;UNr|C`QvvrGu}FXovSL&+
z)ssBVA5U?=*ZwiWm>@;YCUgIl_6iU#krxczJHUBCBlDhqmc9BZ+1MvrqVz`I?K_xl
zs%XQ9@!>YyHxm2%0w#yqI%3ZTdOyRn=ekGADy28-POU7`DAj--uJ$P0<{&ynoFfOj
zp_u2O=@N2J?X}3s<Q2JRbq1YTW*OH%Vhu#O1=6`+4jq?f+*lQCar`?lIJu*z5Zi!<
zo~qGQ`_FirdyoxPnt+3Tr~ly{I#<Z3T(#V8xCYJth)2Z`FTpf-%8+bp4_dhhgL2X0
z?eTICD!>MPqrzdlrfwf3l5{@effzhQ7^W`1IId6L8nOqpwKEb9mngX|c4lpdRLP!h
z2W?`Q4tcOn*SlNq#~zHZNxHnbc)#~bOg(nrw&SI6h6kI}d3`QVltmlq76lzTJ+998
z;4zwxuZLQu`ob!iT9Exjo$US5(+E?s%zjkAdJdf-49uEBSr#`RXeL86DZ|+B5A><6
zx6XYHuCzrNqK886f5slb;v+mX=v-oakcqWLb!YRWIgSB)Y8@JRJ|s~SLYlWazIrEv
z1PG0c33vPs*V;4f*%1QccgS#;6`vJ-&$%($tlfSsg8l{k_rPRht_QI#rEh6wd;%wj
z*E21*fZOOM<TCSxD&zeO@B&{zUw}mnFa_BQf_>wO(vR4OE1k1Hw;7XE*PVlH+%2Pm
z7}Ks)$x`nK!V#@Xr7eP_v>+<=BJ#}C(07y~TxM4c(KcFBN-u~o6Ld1-qB$Ro{+i#p
zYR$};Ny8)?413opI4d<u|La*fl5G^vUN5pAMS&6c@0mf2*DylRki7*)y!B*bbi-ya
ze7pytW0dskA8RZ~v!ZQZ@ah5756uW>d#$<>7n~rKw%T<<PC4QZ^xvpb$x1+PfZWOP
zXEeM%dEp>r7yq88$<E3QZwc!!Fzl12NW<81dY0cjI<+;yubb=X8|?eCr7a;H*xO!U
zx;N)B*BR^BNuL~qvitRB88FY`RNhG7%i4$ydM<EFU?QfkjyK)&98Tpj7wN?563Y&1
zoH5lV)BK(Vxl8%fOxq@AeA1e&rS<HdA-`!tf=KRGnCTZ9%ffrN%E)(CCQO{<SBHm7
zZMNjFkYz5cB2kC6|4<smtw#YiF>j4fc9=}G(2z>*y=-N&WD&Cf6rzpvWqT!haJ0KN
zz35-lOw4dWFoHq2JP)tkGkR*i37{L^DwkQ}wz0s3X#V2<xxgl!ls|?Y?uu+2p7c{^
zk;B#91fByW22{_L<ba-XK*i)3yIx7Ww<f}49mFls)t<r)iKCe7ijv^I{HRp-dC#P{
ziMxPUh;PVJe_r}ywNUC#4I%x4BsD(^u@c?WWM^g$p6_29;h6$r9bdJ_%J$b*hDScy
zp<vB#aCop-?@l?=^T0PWU-%B}fOM`+5)T|-wMz}YuEipY5HyjW-8ckrVCQCE9b|<{
zI0N8oktax>HxzK7_^%7wcD<Ozk#)p!Z97f-p)*=uM&)_!X(g1&Vt@WPzXG!W#hhfg
zU?X@d0*gPmD7hZjGR0tGg=ZojZ0XrKYE&|TCWPrZq@IbGrT+b`-y|QU83VQ|*_ugb
zgEMRJnzxQ%YkB95%mx#t+sCm;5~h1;Cb*{YvvOJH-xNs4{rUuVP()2=#H<!)9YHyJ
zk$3&zb2kNi-8kJGLKe?+^VNM4+n?|l$J40G)_lQex(=|1DQT@0_&YgZPGjs!aUBFs
zETUnJPK0mTB;#s{Hu<L)cgDQ#D@oF7F_8+SWh+O0x7ay}4gze}xZhK-%M~;<B#YaO
zbzL3Bpd0E4>;Qv(C-8v`O|pG`m6ZxPDNJg2_DQ)#h|+Iq4RH|cdFUYyxd?l7$v&J_
z7mQkBOKa=GU8AGh(giQ$c{LY(58;JyBP3fUe75^bu;T^{$l3_gKTz>~3OR9XX7dCG
z&9Ya(JfGNbDwznABubfsP}9Ih8$RKnC?X3?5eDr21LfLL0SY23c$u7mR-&-U$BdX4
zX-G2$q^k|fVS3qvK4RKaK7-jR6bmDt-jn;nTfqLwJlDNGj~3=jJ9R4asojMPU!b!F
z=*YC;_%TY_^x6E#@RxhNjuYErmz3gsFiuWA!aTRl!ea7{<7{K;<)N(+p8tAUQo{6m
z`^bK{`BpICh=tpeo}X$?4l}l{$VXM~^i$E}Rk`6q@wl{3os@}A=r%DWtJ(Xk2leEY
zh__Gy>mbPud+fP|?>Qtg0V|i@-!3E{*~ZoLJ|pq)%{{G+#+_V|z%!JMrG|@aTu;tY
zb=e>>#1*Sf18}?4-|v*&&(hUarXsR6Be=qzDF@an{-0oi3@VvXF2PSsthy(L)T3}W
z<9Lr<;VuP14t>LxS;#!!RIo_(QjIjbv&sgG+fdV=yNM%jrode9#trDnL~xl^?hql(
zau%=9;#_CMLt7t=ae)?v3^*;sGJ)sV1+$Xk$eff?PZ2)>lh**L)}n}7lagVtjg}U2
zOVW7)I<&sZ^2+BSl7ziT7rr==BDPJY1Z=E2>1=&OF@R*(;D;GeIV_7`Z~j4HSIh+V
z`3$3KokwPWJ~a@{iP&REym0T%Tz4|f2Ip-ql~~HyP@&c{=cV0#^=7>@4)ULk<Gz9S
zPT35?_l3Zq!g}oEA`!zP)2Fn;Y#Fzv516lzbqlJ4s*?H4o4`jULapKI+IW9_LV8e~
zt;G1JOU?eIM2I35$jLzqLIarj<q3P8Tev;0w6laol@f*$e@;ueEpmZwY48u2>`r()
z+M)*0NbK#9S7!?zZ!JDebrtz}h^t36zNE)Fzw{Mx0QiOqB1T__(?U@wxmbHV&~?1R
z5m{wIjpFb$M6*b~K0byu+{<0GX4d0112+E)AliD*XNh}nFa0BNgE623&Murfe6;pk
zizcK;TD`pJaQ@{C2)lk`D>KtrhV$X!@P&@p{%lwn#6|fP<B~Rq?7{Xf`1a)|MDsc7
z$zmf<d-Bg0)`*GyVv%tCaTno~FT>eReJ~Q_!q>^tV2)2@KigY>-c!HM$7@Sh4UdQ@
z{aWX_Nxx(vBh-=6uJKY2YAh6a0YwVWA8Y2uQt8cu613sAYXY)LK6~qX8Z^tz)&$hX
zro{n4f}c8cQcnJQoPBu*3_R?*#BNo@Qxaq<s}mK!7IC#uOIi^(QGb?$Lhs(qp}UJ%
z)aH)oTI$ax4L`#MW1b5e-y@wgRBKDfWKml!c!zyZ?l6Al*0;_hZ*CR!1%S;M_#QOA
z^=#8gQDr2AOH-j+!ui1*>@K)mKIcile*15lfrPzm|Jv($UmYDJ=EJRw(z#)vPlg0_
z%96A=ENb^dnG%$uI!`r7CCqIz(b{#sIG?A4v!%5t-EB9kWt87~i9K8zKB^{UW+(wx
z7YV7mamH{yxz2SjNT+?GNO1wO_yy?|ze%8C6N{w4QJz)pUx(EpY#`e*5rFA!+~fUa
zk>B$shCl1kW7r4%vqaqDueqevN}ILL6`0ifJzlE2GK-t@BkC;CDP}FX(pzkl!k>Y^
z@7q^KAwZF9DM8JV@3|c^bQ@!rJ;g+a?(TRFc3q3k_isqLHE_dftkPLDp2>CQ%h{Am
zj)PV1&qQ3)l0Du^Gi&1{j>vFmJtP+AlTpyZg|I~+{cM$Z_fisdBYYX0|BFT@;_WZp
zwa9QAvmM=vjLGER&$}vat_sdLDxDlBJsIy?r_wLaQsM-^GfGluC3vc50GkjWS3g5x
z7I*P-TP#26EE)1O_XSfvFYOY`>k}83l{!duPo?IdAWqLg9cS#^F)TI@rsdPR(vIDo
zrWh{@F8F}fn(I%v_9Q@Xwn_Cry#Q_%ln{EY&pel~9sCvZRE(as#3y7O946iOvwZdn
z&ZM12a=#(O+FTme4PBx(tE<u?4-OzIu<m(9&A^b?{)gmmysoxjg{#G%m|A+ok^`1Z
z9FF)8-EZZWpw=Si_(Nvm;@{-XP^cDFWZao1U9lsjz&mKfr-JVdLJfSY*FM$>O2z4c
zSxYj5UUH=`l!(%Oh4f^q$30%%XKyTpM@a>%qD6%hLX7)lw5dCaKbg$cqfR)t%ynJ^
zOgtQ_AagYso8m=M+n<+yaCqi^buwP9WuB;y8oth$i>}#O9JsMs%ND{#`j#`4X$mil
zp#f5J54rU0FnPw(OWij!yxLu9oyn5!o$HPqT?va%n*RlA<qJq>p3p#xgj?mf4&Fj6
z>Qeq!{u|D+)<AJg2Librda5#QrqgsTrp0POKP4Y-Ta&YJcqZxz>>u9@w=sC%?7s5j
z1GnO*pI#5Xzj;;zwHzri&sEd2$e~K_n|ZCbiG<F9acnU71U;G2N@ojNlgac^Q^KYM
z=JBeIaURdaF*c3Q8JggZU*e(T<WvzLWf=NSWZ!0?Rqsp`Vs_bjmWU3VktFTpg9LjG
zQuR|@{ZUz08P)vuy7j$^ZUpVkDbC_*k-^5(lgXYkOf@F?&*Q`Xm}DC4JV}6COVt+r
z`94{8#9fQ=72f(iG&CvGrsd8?9ArlDgo!vr<nDxWerwt>09LBAoi9FlLuDyIXUA+C
zBwNqCqF>8I42yWoGe2R*p(JIVGx1pqmcBWP^NiyXg#{8oLHc>C5PyOM%<{V?sK-ez
zK>q>9SEeAfTkLN%_)+6!1uv^r7?mr!#a4%XH95RoK_gR&rZ`~i%z6@Us`}*j=q11A
zvGhO-uCK-Y)-#gfWE&tvs&@EZjyQjmQ6?9)^}l)0f(R<CSbM3Ij2U&6Kh(ULg}GNw
zNxh?`%=&98lX$KbG$KaXpE=Wa;xmh+yYG=y&vT$g)x|gmn~LL@il6R56tfTmD_14&
zy0<>7Q)H^J?B4aY+7)*t#ZuWW>Gpg59*FXaQAXA7AIfbZ>wat52C>ycu~`7K0y8Yw
zA1(G@Ke4!y3ad-@xV_BQe1>GVJQ;CjAN1vV<^99;q*I3U-3C8ydU9<Rv;<X!7cJKu
zYdo5gL^O@8?QB#dW+54+9xKMIZRqxm5gULRZ@ZaCg860Deb}4%H1dpu+W7tV$NPuG
ze<J}(YOW&VtzpPh-kS2Y75!=5ztoNMDw<99PG}YfDYsch7V&BGJIb^;oh`YqUr0}!
zBnJezuxb(y`(ue{czzYt(!0;I%lq9uit#Y`*%aKmwDQ^2p3YYwJx9>f8tY(YPOp{Q
zHu1{^5QUe`CTE?G2?A`+`knJk+*D9RETnpD^rYd^k{+HPB~CzVr^Ypo4xXv3N(;51
zstOd|TXpwE(1)F)6VP+Y!?VY$D|(J+i;0~~9u}csd4zouZgWhF*Kfwx6D;fjJ`RZi
zs`&P-67G~Wd*P}RzoiYKHiMi*{APU(Ex*py>YL9*HfGz>kWQ6dpkyl;-+syL!8T%`
zl|4g3+Nj6ZvPa5v_+=*4%sKj4K4Bh-Liq3+GEGl&SBnIOb6+6_*RS<ugGo;m*SaB6
zrwhVtAKhE_G|aa|5O>5h8;ldz@J=uPoNliA+RZ-U6+E<Bz9aTyILx&3_XIoh9Q?+%
zG4HRkTWoeY^b@D;UYthuxj6IOpDJ^Cl@s8k;)mL5cdfEIbBetde!%czz4cg)MJbtp
zQd_2+B?>V<Dhv1DvNgm8DXbPGIrrehT?`P!WvwF;fiSI6-Q&#MQx1TPSg7Hl{bHDO
z7kjgmaGXl}c@X@EO2##VH#ZHF^d0Jwf^_Hg`(wBhx>Z*TG_<1H^p7RvF0EW>h>|^(
zIC9r+jAqMWr3X+i!&OrMB()a;Uh$8dGDLp&zm+#7mVo3acKSHb1i@t<Ah-<EIkry9
z%st<b9xuD^%t(G{T0AFiw_jm9q!D^DMhqbl`1gtq@Z|%t<vJ0+g}afriv~D6k>`2O
z^Gn9x^t}v5P3VtIS=RcK@7G~juSpj${Pm^d*|+g%<D8DpR(IPVncb4!fn>O_9GIA+
zD$nSDx}Ppj)XS@Z!LfZ=pq$Kr^T`ZATY^jytsxt&)rJrO!JGy_4q_9A^R>vYmGhHY
zosPv{6z2=2hU_QAH89*Jg$u-6bzN5vv`Wq>NEkrMhR0EcCT$>#Jws3g_C+V-Jsvz1
z^b^RD)y{g-7XlB-kTCrP9hXmIERJ=8`CStQx>JjcZ4kOaipwQ*r>r;sNq9INr>FJd
zT%FQvBVVXuGA=QhTwrJ52Nt!~QuTH48~b;2o?U$+Y^wE8)YfdmQXsv!Kld5`FtR6H
zpcy$l$Gbzeqx#=?A-)lW*API|>CsFb4w<_36a*Zu7x^vJ7<7>xz|R1A1ikjRJR}k^
z&;JEl(&ndpIGtaWa0hLm1~*ts#Yz06WJB4YB@Oz!GFIJHv!9wEuu_x>Y!v1oud*8!
z^Bpvg;ZTolK9au|j%qBu@pJag9&53=8p8&2kp}EujqaGcqVR^<2mNoKYczG(&ZPQg
zsM*C*e$OwbwJliEQ!ROI$7a_N$Nf|%Vr7R0mKGrO^R*Wdn3YiEfK(#WnMO2G)S|7o
z1P=+B3~k)##8w~^70ZL2{G>xYHg?i*nd5z%8*O?v5?eU%*_%CO<cY9-FLR~wyuKlx
zjEeU9DJZp8R$i8MH(BDy8WuU3@7M<3a=n13KEyajE7URvFDJY1Ck=DIN8YzCl*Ni-
z2p1=O+@!VLXqQyIvBqMR-?KY^UvW4e0_Gf)$#Y;8ROiK)3-d@1_(dz8jh`HjGR}Ct
z0kT=51SRjSMea1|_QK$CibJgTrZw~6r5i7)Wn7Pf%4>&1w#1Ogrfvhz7X!LU?EstQ
zHxM}*-~K5X-g4SPRj`F10ll=Y7{-tlq?rU>P4@(%y31a^U+7(1VBDjVqI^973CCze
zYef*_c_h&&9<N^II3f2`=T_3Nx42_{+M^s4QTlk{1)Kbvi6dpf*T#PLUNF4bTB<ti
zHKdtqnVqlPtdnvqJIRDigg7?tv)k`?uf6Mh_R5H<;;_QJoSyEIM-t}8Mu7Y8hC+q_
zBSvulqD$T5qxh37SESmC%*YuBZMKnAB1W%iV4imqK)q~YwpDyJvDX)~;d~3}Ut`=2
z$w5(Evl^(JC+SBWhv%RWaO-#=K?lJQ{0EMY_Zb?+6Gv*k)-L1O8%fgcWR=^G5?E|M
ztsV~1X+oU5EcfMf8OgY`@b4|dB~~N=JIxSMQ_C<5gICeIZ6V|{9fgM;+xS|hU2;aP
zx(GDR#xNV}kk>z=T;49U+}VYOaiTqMJpyd5XLNtjNkPzyMA=I8l_L)zD<Sc~rBOgf
z2b;E8TuH6&pUhK^QpQ{TttY*W1DW=UyBH@O`gOd>YZkYU3J@D5Q{i)XqONK+Np*8*
znQvRi28p`Psjuk0Jy4ouKfy~HOT&u7yc;>T#m$=VmO}!{spy^OsbmgTZppYFC%&{z
znH-+oe_6FUacy-;G2=eP-_?bhgY}iwf~-gu$?x8IS0&XFaxNxHxjhu}vKiT~hK=*x
zi!Y188nK)oOzMp_V-^SApEhwA+pt<Q53s=zHw`6+1nh{nJMMjd?l6*H{Jn0|A9=w~
z*OAl>03WCVtg+mDdEuTD(>Rc_{-I;Z;0_I!`_BlH4*+jAJ2iqlS5yf&z=^Sc@v<TX
zInc^ZT)cJ0Bc3U0zwiK;OI)#oB~eR^GePP3TLRm$%Hqi%3*rd{Um;Er3Gx>G(;=ti
zl?6qe#a?LHnggMK-Gh<Pb;a}g1-%x?GE-;2)`aymR7)5MlHy-#czP-hAWzxtT14`?
zzW=-MH>LRu-%n<1!xBp(oIl}o=ZV$M?At*kr`TFrviJ(3nya_UT#kb0`2-9bS7o?8
z2Rq%gdqZ5<;#|!x*E^-}MdqM5J0ef=SpOZJlsfO0Sq#JZT)f#{NUn{ekC*Ca>mghH
zlJS2lZ3z}6O8%2i__hhWHqat+?RW5TM?mIzf9K8h_?fIwHWwtm@PniQDL^oNWylaR
zD7vC^Mn-%#7=8r+SKs4)%k!}6rQa7F?KF@UgT@otrCu9z0n{&nJrr^sFo8|XPC#5T
z;eKQ6(KA7<jGiP>r6PT}qTNfR<TY2V(W~;`gn$8w_a5*3iE1VxMc=Eobo+R>%5wa(
z8$-8JC~OTb(gYDA<Q!?Ldr`hUF6p;Egpy(170HX<?g2J!!pNp!mGSINCI24<nN_>O
zyHzgzmE%_ec&H`Jw2pHibl-3C&6*m%ty5qi>K0+-{$p#v<kp&Cv1kr&hB_OviLSHj
zHo4M@o--&wup%0n>P=B@<Y}^b%}~{9T(U!Qc6H6*E^P4Ru+=dyG4?4Q`k&iZ+5grv
zVYU7c|1HWyMEG4vcN|t|qT$uJpWhDU=*ROTDfaMP0#GjcPmuHMsbTsp3Y=DhNyD|N
zRySic{&a4XDh6K5#w23;1Z%WRZ8q|==8Cg<do=3^iPcp;u6<JGXaA-#f+fTH@AUY=
z4MneP(!7;P5H>MxcVGSt!6aCkA(E1J%>{l(`vJenp`KdEPk08!U^s_!-E3#fH3n&8
z_#=PY8gh`)na{rgtYvlk82?{G=wAfj+7Z;NE+}}ZSUkyu8lfGOTTY!OPtgA`b&D-F
z$ZW7lJO1mX%=Ig)c26IlTC9h_V~eh_lq^o%=4*ox9{L=hJ^b?&ND;XJ+KW6!)j$UZ
zoey^;c4&!4FbzPGISKr^A!!$0jq+U`xD6+U89er&&Pi-k=?_KR13&ZVd{6gaMs1ui
zlWy$CwHU7qnPddfx${`+${|!V4qrSKYu$~GTImgQGjRa#>T(bfbnK1T^^<CEsYo%{
z^gjF)dv`J?@8+?z1L#msEM{*8#Ve=@zr{@QP?X*XdJwubMM51SOKa>kONVrG*zALT
z40O8Zy5UXCXT?6Q;xi7GSj}|Z1oW)2Es7cc5|ss$xHYNJjov)Xj7);v0>e`CL~cHv
zH(?xs%!BY9c=RW_IQfjtAypmkM1AL2t*E5jm)_pw`^52^r+cY{@ZVjsVJv`$u1kQo
zsq6XzDnf<70^H%*ExJFom8YBAqFLXox&L@JFI7Ao;w^oG_(A@&P&HZCoi1yn_L2t*
z$VlgofGw4z;n`ry0{d^TuVM6rwyZlzkOBY#<gQe$L;D|CffN)1%)5BbXZ@ciEzIG$
zod6vv7(P}uP}W0*1Gf@d+1?q3A|1Y;;%Rd=c$?cm@!rOKg!aR=o9;*qO#rB>mKS8{
zysKn4?2XUr$w*VDQfxN@^9@SN`)_Zw#jqFJ`i#nNi%!jiy=*Qjj#(i`%9{0xt4ngS
z5?bd-8?^rV@uTw0Am=2j&V=h0T#WwE=4VEyJpmjOZY5Qz($ZHO{<OcK9I{r!J3M>`
zul1~8f)Z3rJ1$N{e8GNIjE$vTv%=3;>jNhZ!(#rTXaM9^oJP(pVS1}qMglA^Y3DgW
z;BPfbCys0y3uxo)K%0pi;E-US%|F(aQdNb(0V1svhjOE4N5HOp;H|b&JHpzo0!?%6
zM(+d1@-4tYq&)sCrTtE15YT%YD|qj7zqALyS^sR28bIWPQ~A7y6zC49o4x?aXiKps
z%KQ5%TzAVbyJt}0aF11y=>xQfXa&9mk`p0=`&)xAfR?>Kyzx22jU*E$4SDtu`f=T3
zXwQy6J7?-n9pWg4x=Al|=ZzAdw7jQCqj*NfDoam%=w$L5S-(Gmeu&Qp9$_KWBBXw2
zhKFubO_Or7+qufhslfhhU!)*7Pd%Ah!r=_Y<Nu=Ty92S_`@b(MM3H19BqNE?Fd{;-
z8ZwHKh$tatZ^_80kV3K&QId#Ah^Um1l#$Rt$rg(6ygrxCx$ocad7eM+bIyG_>AJq-
zJzncWkR;YVI@~g^e*igAobZZ}7RGnSkJiL41zF-PY+HZ*&2zJ}v<%i*+<T+BCsb^9
zeN}wqHDO1RqSBuY?gY$muJ0S@#nl1#d=_P>=2$(QwDC!i_02Lk*V+dDY<FC4D4w;I
zOl|aZ>pVb&FeLd$S6ssldiwvlK}@9V9bX@s-z-+SNjxFMbo%G_9*jEgKS6)M#A)C=
z300ylXgQbj;kJ^61oQ1Wal^Nj^K6p{>&wf)F<PvqdVYR?yYeQDHtV??qNL#uc?M!;
zk8Vm;qjP=Ta>QQ>UuYZ>2#NhU;we7rBdqlc_-h*<E`O}Xsp3x|7OaFe_}o%;*05fX
z(mezKuPBczBxZL!O*av|Ed9`5G2|0yyS@i<a+VG}&x>XMQ|DN3C(3v~{qyJhvI*=a
z5d$41hQ5J?G8Dm-*NO6b^|@B<tSQ(QBc!n?CGi2<$kWC&$wFq^yk;AJweN>hXo@SH
zfkIxo?Tz9paj<003mL>)bl7M0=L^>#EOCx@Q6SUSPR(@zttv{;^h^s5=Rw=TxZ-Zd
zZ?}zKFcjXvZ*w->nlL&{N&Zwa7~#K0ZE;I%*xOS7L_GjuZrR)BQsyDX5Gp5o2q`B2
z19&!H+Ra;D{RDh3*F?{=Sw>JcZv$C9<7WX95N;rkh8DB*!Q-Yu(iH?<&bgVqR{S-S
zV!uqmt2O$g17xr0T_0$B!WsuB^b63ZEsLCNL#q?HB4ON=#;-)~{j2XvyC2*G5wY>~
z=N;RM00{}oG`vAi{Ko#WUfNWBq;6t)J9#&@oM_sycS%k##uGPkt07XT0ClHvX@M#S
zw(VXJaOT@?`H!5eZ6`maj{l*lEa1Kgg>(OsfK@h91Jle7PpsD;$ZZyS(ga#?Z1-D5
z`i8}~Pj!f}w%|1(AIAHzbVRhU@JUNxs!DoXfwl-1s%YL~!h($Vk9}eZiq4s_Qit|G
zr!FYsb0MZ>UK7?3K5vbj*j!n#)cio2$ePt@>EQb-)pRm$%pYKEz!Bg6dLUHG`Z%{n
zALq&J;F?aeUWr<{>*DSZK>`gxShesH{13^7H?X#Mz+a~5$YlCN(D&y)ciqPNE9>I`
zyPJP#m612yQW-SP2&Ve|0Pp74q^X?l<4``iR!sZ&srXVqbtT1o!)!FqMN3xR93m`V
z4%n)4y}dI{H-$VxVnXw9>Z%@MIa#nK>Tg|kW;_pa5GLw-qIk4jvha5*#V8mN?#@j}
zJNj}hYYNSN2Lm4j5$kc+!>?Jb#3v(&2{4kn>nph*r&~`z>{B?OeQV+SxK(PN$%SU3
zKP%=PlrIz9qyzm(bF4hbbMm^3Mr|e%y!V7UOdN4nk>g<D5<%yRkB`x*a`LS)NQ{o>
zAU`_+ny;;Dq($RyRAimhjoWZHqte&!+E&-wWV{J=8zh*cWa#kbH{}Nl?R3n{t_yrP
z*7uPO<^AwfCR%4ku8>D>ve2B|XO;8wL2c*QcH#sx=MBcHSl@fTt1gTN*s7ZtnV~6h
z_52*R7J;l>l2M;)B0@cm+}shyvC)6dhyN79$$}WZ`~CXFc%NGgI+UdCv`x08H*T-?
z$Mi%^TFX2k2<n&cLIm4x>tRh!SS`f$j#5$ea!sm;7UWwKb0pL_J0?y}S^7osPRgqN
z1vam{UtLhT(hXFC`eb!*vl~5zD!t>-j>e*t^i$<j&Dn?UjyyKPK=@u<rmU3dxDKO*
z<iQ6o-1a%y(@|+15f3hf=Za|A%-9zra)NQbmY&x3V=s8#SHZ%t<#pB+2JtYQHrYZQ
z)v^FH%8Hgg)>oGm6%X}8wAXaY@s`}ApFdg}rffOHLqSb!;nCoDh6;YJ$X5n2xT`aR
zJ)%unJ~-EnLK3_Fyi#Rq9Xr=s^*@VGOIn>j6<qTmN;~iu59{d0KNn@_o-7fpJowO2
z>iqNHtQ7ajfwpsL@Lq|DFXM7aR?XAfPbVR1y3tQ+pk9Md^iCT4l5^9;e5crz4Q&;u
z^LtN2%8`b}?otESC7`Jg*v6DxDQ&_ctRq|3VZAeb?nk{A;N=cLr=b(2tr&XE<WKN5
z?7IxC4m+zU%gi1;GT-Tqms!v^3@w>-|LEKUcLFCxN+!p9aQ3kK2J(DaO0z7Q=t9PI
z>8<mK%EwwXxoF>+$)mV;H69Vp%YV?IIwt=E$lhCAo>j?CVKC#iag1UIi}X7%=tay3
zFaEkYM3hrxM{j;Mjbj{>v;PoA!gRX;+q(FDAS?#=ymbRy7Pzt-_j>WySl`Q|HG(u0
z6Z}8x7g0jxCvJxa1q904`y}S91Z6)6#@#dULV%2hr;xqKlL0C}qg@Q>8vM*)@EZiM
zNw6u81VBhdvz(XtE&ePP&NDtCtUIkYuGGku2Z;MK!JR$}Nwl|LLqE9#(}_jM1`n-n
zmc@(mLY9z<8S&P+()Ygnhb#{_5s5p`{xe>#FFe`ehHcOOPj}~1u(DVH17&BPblTHp
zot5SeW9BdzwGZlSu|LxOywx6XPRH8Q6+aVZ0T}HLv?cg$l34o`B)6~5FBAUR+9@;T
z`((eT#R6e*sK0cVH3p=>oFlE*wwx4kAL{7z{6<;_YgZ{5ZQM+xRp<W{uq(gf^bpw}
z#lAR~fP^J(ZNIypzNcA$XCp4H+v3o^d2?nD4pL42iY1WB_{aTh0!p#LX4g8Pl=+_H
zHWOFCKb75+G%J>2XHf8o?>5DlWya0mz_gAh;KTm;C1NIwRGgqdo+_&vDp2vBpPgcu
zvc<ZUNIx|wgJn!ueZ)RISwMeYb4_3aPH2+<$hXF$cYNv0hC)V6NE>>)0{gyQS9Ct=
z_jPgU<Hr1BuV@u*yKYgAiHK=4y$}X{RC($Al_Wwz1Z&H90{EjX7O5i@0zdqKmnV^d
ztS<OyTTbK{vnmH_;W879)!bg2LeS*Cv~i3Tm!Z_`_khZ$C&|5Yw7zCl-Ihfoz1@}P
zGI=K)?<o5q{m}rEP}b>E8qYSg$Ni<`q$L2bwxw$oQ21*A>yAK@^z6?WU|^DkCz8{R
zl=;L#!5^?=77*)WGrPh_E@E3#27sj>^bR=@(GyFa9wZrF@2t8a4s%sP%jOL9z1uW%
z7sO7kYJ`dn7ByUxu&ig&0{dNL!mbl*H=*c=SjPXTdwkCv$Qvm3Y^9FEywPaSTc1tW
zR&HS;Lq=Ur@aqZ73JpbOP<$pV8m6_jMQ6m{SJ5}*{=_JDP5#t|^F2@OCyop{nNy2l
zfjr$YbHg_;6?gx9L~it^>)fDlmme~z-gZF1GpM^Qb{&8^^StIvZ&$N&2fVKSd<&yK
zs_@`h!q7qX1R9;-*cfs$<W4(7PIefOiguXH)LBcq4p7eS$)#<7p{!Fx8bsd^qK6@&
znOo-y7LJ0Hpl)70hiiOGF)Q=m{=9c&pk}+KKiayi@lO^`@zn(4dHWLVKwpAq#i}Qp
z_`3JxT0aAGl{kN<SxI6EyK)3KgM#PSO|b(0ZC;ySYGuC3US3&%o$9ty>Kvb*W(wgz
zAgN!w?ZD~*Sv=m(yqaa`7KilIZY)3fY5hIa_MX*u&NkgZ2;J35#5|c$J_nmVav{S5
z|C_gT$NI88idwz1BL!8>?V7XFtmvUz0a(PpztGMY&%*Eu%)f1?e-|+)2mf(cXwd)H
zTD0)5LlUXd(U!wnBXu*W$}Oe^HoMjImxSbx<Dh{D0;STK))V)wp1Uv|>!UI<)Ug`l
zwf^2fDf7oB(9h*OFOS2k@Df|E3zNj|o$H<@MZDG_ccspgiYTsIW>e2RKKD6-zTuJk
zaPCmC30Xi+_ihgN(%MyPlrH2kSsLhmYfn=6So8^(Z;fh9$z_>Cz#yKfPAaWv*vMR2
zQ0Ap;A;lVOkV53gLlyIW{DDpDV%w{Luh9{^*CSgv*#esipzGOA{$wY=Zf}Y5d()`w
zJlTe~E<ReM!-I1a3mSH}YmG>LUI{hE5wjsJs4VBy|DO}tzYi5vM2qjM__#sGg+p1d
zEuV>P+s~+7$;zhlDy=g9XryKScaX03-9|8W$^xr0TkOFOHVf+3H@Ne}B-#2vcHY6(
z*Lppq?O5MA=03JvwRUeW8H{!LXw+=_!ZzbeRG0J(2fweoZ&69R&irw)uh$O#F38Hq
zpgBEp1SiQ66n)>9Ox%j4wb!m-{bO7}HJ4s{CDi{B==aEgZ#L%hSpR{YLmef@2HOkU
zF#<}3MMM;UhYz$pqI3UfO|xLd{2qw5&jFKJQqMNi4S#QJi#g=nDUV@<qZO@{(bz~s
zezJmU9#^vbspKi6->B~yM{B~jh7>wol{9@k-q@CkX&ay9wm-p3PIQ%*wHyEr@;<&Q
zTA03}dv%~Kb$#q~K32yQXlwplq)}%?l7{i{&Zp#pW5xRW=>7jy%lH8F4UbuZos%vp
z69Hq}S}(8a#T>L}wwtcBdhtuM_a1)6l4aCe?NFDZdSs|<%Dn5?4PVg23T))SDNoo@
zCR%*~sz+Gk)%y02@~xmd#DbsFISdQw<l}ij!y<7(5Ie~^c4R%p+?q81Uq}=grf)&U
z_Tpe`S!QEBwA%w8^+s6BS9`cUjGsOH>#3Z`Ej69RMXo>gmmpGk@KctRxZyJ8#8j30
zuj;iFzngu5%5-t(Cw@|F5|Wv8esQeNgkV`k2m3L{9L{Y}{X#~LPHL@$^6S9~;i%||
zHlT7hxWrz>YF-U!F!!A>R?C!PO?W7(8#}W5t%|(1f>ZbQVY4GqPEhxEmaHCwO=lf{
zqLO=VtRyeI1&NW7>wRcK3hjB}_JEZ1M8}TEV;D_V?;Y3#;d~eylnvX56zcTNUufT#
zJNbSWxbx3RkNjr6YSDXS+~7F)zR07|lHbCz`M#CbV1lgG&egZ<cQY>)npm2(=x^av
zDC6p&a*-T-A`ep1nXhQ(rKDrki@BsS{bJk10|!+tD0HRq@=exjtd;vv=j>8_epwUH
z;l>g&O1yy;<i*;QYHTlp_?@lS!?1PwU4&q9s9}dvMuFkzs*eS>m*(E;yHR==eW3n(
z{}R*R!m~5}ac!gRe_=BHx0N|1jI^(Az+942Fwpl6lxhze+5{A_@af5TTx<uA3aPpR
z`@|}2<M(n77QRf%9E=v$vU{(zl7w@9eHGULQ1-<Q`Gz+pgpc>8Z#ZK0%dgb&<JPy4
zUeBk%LZGe~uq!7@W$49Se55k46Ij%_n0NSjaWJ2Mcv?9M^{faG6g9q#qN<|@u`%ho
z*My&FlNB{9u#~-vgY(SEnQYxtcv%o@@VgIk^lruW3_b3_X`pyLXBh?ZLenpft~v~`
zVRDOOhsg+kwe89GInm0VWAqvU0c4hA*~9cB01}qKVmVgPDqvi3Oa*(QyV)s>lX@2V
z^GO?z>B`s$T6BrtC&+bl887})nCGjKfEsXuUe<tzyxmYqn+f*LF34BgO;|q6l-gYM
zytgksWLNxlcuLM^-0=T3V#zREe_b&>cfEc<cY<PQO#e1y+jN^6cq*RoTUuGbuKKsj
zcJn&2E!>!hag}xCL+aTR>QV%1n}OXl`T{nSHir^69^*8cdR=34hyXcE$3!UQp$P!@
zB|ib)3fgU4!(Z?(pzS_HI2FtkTNmkreyYRVxPzCKa!G~~b@yP+srZwj<1u2pKL-X;
zHqzKsaY#hySNQqTWYKDPa+qDR3>gy>|IJv6;O?&&u%`ADzGAU7q?&7n!_E+xThD<*
zIWa9x3oYlS`*3Hc9>Wv)%7T-$UE6(6V&f&OSxWj}R*9`OPODkAvfu-~lU&_~Wn4Ws
zotJQTKRfuVTP@HwH>Hu(6;|d6nbIBcZe)|me}Vnt-Fi57bTm@Z=+``1%_CR0jD=~F
zQbIIAu5X?PR}Jj}`<=7Pln)7uFB7mKR<~W(HtU3<TYG+Q`U%!yv)`WZE_L7q=$ZfD
zVu_#QxReY$)biq<=JekXe!!SlSGjN5!^M;i{p(wRj1y3|^t4v9lH4p{|2w`!%Y+^D
zGpN4}G2<6LI0B-@^N=kMxmxZ$2$DB2uy8dGK{#{A;5l9tm^&6~sHsn_-J#hQ_xskh
zK+!;zNUrb&<DkQ%FTnx@0VJSCWY6#NfmLM#^-JS{EBwhtJ8>k?*3<W);b4LBPmQ&Q
zZ=m$Wy?CQDa+27u$i<;n{r_#hqW<e*#eWEvg#YzD0bt+!i39=fJM`8zs#^ETu6DOw
zQl0xY@eZOENt~?Gc)zXS@>1o3S1h<*dJa^5<?y1Pb(<Eh?9)P7`&JnuFB{(YMuoUH
zW^a@SY+V0-y^7w6e}Ga9mt`jae8$L2!$IbI&3C?pc*cHFjc?St@V`7`|Mj75P~q4j
zkub5V&bfKN-RZtrK)ko}oxU{+@sneSzD7uwJKZ|rJ@!>Q%IVcrsTiP}!^$r05^R6%
zEuBg&#-7Dc%pHTMsIvr~y=0?1`*%N=b7&V)Jj1R$zO~<M?ZUMKs60o7B47ai<ZD#i
z@CMpij87?L<)BmJT)sDHKG%0kCS&q=vUuNCq$b#a5A$-}3wZn3=jt%?17xTR=UjIK
zN?-oB!)w;=xjs@4>TwTw3ZK@fjo9aHKGoUbowD8bk;}3GaJ^-;!R9HxNK&R173GPM
ziSk%xgCBMo<M0m|4n^Vw8E!7HZGdhbrn0xEt8-q@<<8J3Wj;XHR152t(YqMm1V4?m
z_$MZ3&`uTU7-EP<@*_4xrQxr&m#61dy?$mbT_JVQsC-(&5B?5FPKWJs7R~;BMTb^Y
zO{PpxP>|S2L9*cz`)k+pXov-Wp((X1-{!T1lH0%ty5R#H7WKUqwmmV@PpPd50{ql3
zI1u{kSvp3qzM}<8GQ=a(b};A~|6)yaVER?nTc5OERPWiT_?cd=_4_4F;vd+z5uojB
zq8OjNBd3zmW~*nB`QB5G!yPz<nyEbNDX<h#IBF*F<I8^uQz$6m3EOwJ+5Tv`7Y9<3
zK?6rR8All}2QuP;jNhBM*zMP3twLUID|OyAb5UuCeft(_v#Sd!QB<v)0Xckg`&rud
z&3wI(a3xrEVjz^*=Jm5ttiMVXUVNKxfm3J^OxAvuSm>`}G&7fB<j2Fr@^a1ft**Ki
zW<ggpw+B4J)Hm%6grSSA{Cj5NS6ud%G_2-}z_XY9TWDrz|9O~7PSU^qXKvcm!3M`b
zXLrI9Ct<33)S2&+*F%~i$VVx~2=>;SRVMPBKeAFj!_THpS;03Im*Ou5v5#d%1+p@l
zkbyoJXrPyY|Hyv6^*MdR8RpM5_>ycaN263UVs;Y-^c#qom4?dBy^*JG580AeKIAg-
zq!z#>R7IXSEcMmmp`G5+l?8d$I|FUMWZ;xWF9C9zx1E2>wi1J_ne^Kmm9-$P&{X?=
zI;Esx-vIB8n<Wj;B6-tw@_7!Gc@>JW{_0J;hqI_bE^zWcJIK=`<{PrD7HRD26btjn
zeWL?MAJ>$GQ6H$ighfMn4gVj$+yC3II8s-tuh5dvq^2G)_rfLwo>}!1Y-(FYIGzx@
zhBox{?}J@`ynmNQHg?C79xf=4?-=}bsuc4qlz8mRQ+d@kiST4x7P~OeRnaAH%#=L4
zH>&P@;qDyrb?F;G#pbj`mst*(o6tA(vOM8w#Y4*(f7;i&o#J5b4@m0|*|~7ZCVStr
zPx#47Op_a{ro#r@w0~oG8^tAbm`(*sd8++YzPg-qguFSo7Su`jo&UR)lkd+e|CdmM
zNPGilz3abye+Yg5=s8<APoY)mU!xiZV%~`#hAz+7`S1!~XqoF>_4XT%4)p7<@Xz|j
zw_+ZM(KDmbhsHk%KZMPiFz%pb-o9$e`j(hKK2#X~T8_vj>~BaUl&EZUFnX3s1p(N^
zdqQLUR0>kku{U3ZQJ|?$Y*6DwEmk_i|E2^KK#Yiuf8yU??DQ!WDoy`C9^|5R6s-9K
zE-1*?rKApk=~&$VQ+faImvUVC&%4ljSp52r|J-dsa-XPO)8{0E1eDgm8XFD2phi8i
z^!@p5Usv`Tux}hsQu4x<Gbg{xJ^bOvT9@7z(HdDEljo*GyW8kCV%Z-q+2}oKzwYp(
z2N%<-H&qtgDf3geg<#?<%#}@}mFE^Hj^0uerew{y9GmRjU8ZAw(|8aR=MTT!yIh%N
zRPLV_^wZ>msJcL5_;Sp@4b=JHuSZO@|J53tO3v95wTYt@_^>kjkB_#%cuBX2mA#-X
z<S)`I+x!Bj#X38v{1#dGL+T1GE6B64$aOo&)~*$gN$i4e82xj!)+j(4cHV&a)JJm6
zyZNRlFVj^2V0`rR>_%_b7oTVYC)WdKT1y(vR(|#Fz;fmB-^{*CR&I_0`}!S%*~!;?
zuSi^3rR<>}3ZgAPwIOu}lMLsgj3XE@l8*ef`neISUtaFyg4J)DSp8()f2SJ#dK%%-
zvXg&5YcdamL^dJJpDwh`y(4vfV59sGk4WxPY<3qAv4tM~>rcm~IeTneb0QvBJw_xk
zTAt-ba}uzAN1xQkwtDbXpvzw}pwuwXEsR4nFjxoE5<|q|lur>sWipk|4w#q*;xGs)
zllavX4|3d!-_1!&;@ZLPW@7Z)9{)A`dB8$mf=Tl+WH`1AewkZV#3tT7&tv0g8brFx
zts9cGK8~!%mZ2LRgS5hgN2r_S-xW?^%Xgo2llw=<$S&Hwlsv+i^OGtz?WiF2$VTo1
z!WIt9p1CCbo4gZ<x4*dhIJ?cH^8<qn-LW@Xf}Xe98{W*$Oi{Tg?CaQvWgFfgxkDRX
zJB>d4BDSyd5S*)8sA~ED?%~EldRIyG_``+9c5@KN*Ahf5P<CoH<hx@?YdYZ$)6Km{
z{SuJ>T$=Nv|HK|S-WJKbiT>m<>PA66;_sp5-$=QD3HCoqtd&$H7EdK)sxH{Zi5MKk
z@2N8u9@TeWkSd^!^}NhEZ7d4S(_$bq(J=-+Y>~svlMT|=_ksr%dwCl6DN9@G+kHpE
z0RSnFq0hfhV)X%0VT(%#)OA_x;5jn4=*92YowpK=_X<#~6KpT(R%Y;=3>P^1@$5wC
zeJpi3wp8m`#h;D&TyiOeU3yn^rxC8@jNpZ)BbW7eeaGRJ%X7Fb&p!n#vKQv@>QhX~
z$5=Y34xupb{04Kh6{;C;chXklk;CiiQpfU+)-2}tgSD8UtJ3H1ZhX}1#Gg5($^z0-
zs?Z;I=iQ^z6VKc+B6-XT-I856uqJ#R)R_0V2&UloY$~vD7^-B~7v?N8y`TG#<ZuA5
zgP4b0L!a2c=u2-d1ec^G%7F;pCN+R^iBqwFD|qh_aaG#C3owkY?37K+bXQdh_Xj%2
zRPU?uZd-ipy`ppP4)?8h*T)EJpAW_}9EKE1S_XDzB1}D6-k~tR$6-$XtS!;A%(0N;
zTkSiFWl<RZx`<e5PVvxooBRnijb5V(QiB*#eX>W|U;<@0>uJT*nkX#~pNJI|EO0nV
z_r$s|{`3t0)FO}>3w@1hLp^$`C`sY;n#z18n}Q|BmI;93oO;gSVD`4S$nxEt1Ki_1
zV)0IMU>-21#n1Wq)Q+~{0g;lf_>;f%!UD|sDKXR6<Dz5x1D#8EjPHn%LD)UFtj!Iw
znb}#(w(I~un)#|I(G}~JR>;Vv9&N>O;vx&?v>N<amv2#Xb3m_)-#)nTTJA}M%=HG+
zk*&H|@*h)-pAuCDRjF>MkVv>WCrsAV*592lrFQzC&1#;%JAvOYe1*ua@9-rR9AF4b
zkP+A3uCs=)RMlU6LWcR<@}bEnuMAO)-Ron6BM}jORN}UanXmUR0>J)UZV?lgOilx6
zaM6I{mH~D&Eg`09>cR9*8*)y6c^2sLxB&_xVYrK!Lnt(GJps7CerB$_q%Zk0PMkma
z1BZA<0pK7N3TnX~)Wdg^R+qMMi5Jo@uNU+of?2Q}q_@&HG!;KQpX#u76HejyK$|LG
zXRs9;nJKY<ILB_D;H3<W^}PDQzC*;`lFhw*iPHxpX9%=o8(JpPR|V<kDS^sOt|8-q
z&)v7|r+aAMnx{50D_xft3T5a(VV+V7X*yHawfoUtZ%!T`n|(~RLEiO0IZ{`@=Qa9$
zp5@Ti5ja{ow{Omazl6E_n#_7!=RBFp5KpjVj}4=f>=@zZgVYeJeK;5yG)8i?qLRXw
zq?)v415V|pe`2A(XeDw5e(Z4&jtU95CM&))f%?pWJsKx-rGt7UlkQGSH-z*MF}*9Y
zXr`~d?oEtPj6cb$wph5&e!6ZZh02p|a*Xs^3t2b>OjL`RX6(Tmj+}if<(@b|(Cm5q
zL!yHT3ttA8`v$oW_))h$4gnK9NxhBG;UghIyfEn4yPGaijeZx!Vw3DtwW39rGE~Vt
zb>@V}Cu{y1hvkqD==@X((msd4t^YGigv_X^XUGk{E;5mrp0r6!3bGthB=GhS$1kQu
zWHmlF_*PiKP5uMgzFZyM+9Xx#?32D~8V!^s=4X{82(+_jj-yzKY}in*w1U7pHO=y1
zcK8i%d^X*onbK{#1ADnu8kw2krn?<SMX18}4;79LmC`rlshovz@5facZ)|IH`zSfV
z=t`!;%})V5c=BHkN`kVmF`@QPI@!m69pywthx%Ue0ssNL;04V#f@zSbZ{vr3U9&d6
z=T+PfGf3*L>-Em!u%hL$H)I!$2uUgxYOz5|al%ak?o{V+%`)m>NhqXGncHg$^iq;^
zN9^;B?jAHopoM{6^SOSrbigpu5fsr~1#OFh>xg%}#7`dF4!Lro*xZf(7~W{kOseBq
zuYH`Z148kV-a&V$LFFmM_bods{QW;{s~m-7o#9wz<R>0qc$LD!ZTXwG=(DRWa*4Mr
zSi)S9*ZVs0Rd*$QgHQR5&uzLTaFWrrh#c7X1zzrRU6z8xP7C870KVO;f8%!>DHjZ1
znv~UZ*r~h!gAPy8r8?Oa4Y};(GdXR8wi?do-lp?jg7S{n!S=W_$0iS&?1)XT-x9Uj
zqTH~=D((^RcmwZ0s`I{7zks2k(I{&_(=&q`MB)@^+xrJHZP^EsG)6Pj#k%n5srbZ`
z&;7ui3A`=}*sCA2EQ8lKq=`3x(^7o&{OTpK4}C&nRzZ&sj`ekoGpDGk1+0gW2}xdX
zfd5dw+HRfw8XZntatV*j_J%7sEnEv5|E=fc+%t-ckL~D?_5o3mqJxRXg-JI;-;8xr
zmhx%^IzHK>K@y4~ugiL`oy20L&f7!`0Z@)(;pL!O{;2~m-f@8w)F(Ni?>O;r+omd<
zU=7;><F_2d=dlqd;eeQg5GPBgk5619G)OSMj7vpz)#J3Bk2sHu_kmSSEU;0SK{vlC
zLFmbd<&SPW)p5L=K=VTb3g`)$WhMu-zBV!x?^`nfGfw2Lyo=^YGX35N<jyPxLTua&
zqn~`6kFUmHl3#!Gg%EN7)jW^glcK`EDU&{!Z8=_pHjfKeH0y7E6SgmM=qTQ&G+5~L
zQ*xgu2p{)5hnSKF-*SYJmr}B@%_w9k|Lvje-|bL?silC7Rv|M8B+4uO8a&&567z+7
zCB>*Cb&<oQ&W3oWFePxnc+A$`zx~GSvwwfrTno@xMTRVo{knPCi}hf#zd4(NV9D78
zpy5w!#{T@qK?WPQKF!zg#|xJ=pl|OJd@S_FWA=@^EyekRIMgje?$6J5&A+oG%G66^
z>X^!X8y!x?Gh!A69XgWZY>U7rWa8|OLD0>MGEMqHWzPkD{OrCmFIPlbsCV<tSYUNF
zD0Z?Ogu*m_k<KMSRmZ)|7hxFSTah>V<+<L4m0K{#Q7~`N(4M=oWc#+1HVHOZo|TBZ
z12nFE{7n6N`=T@#5jYRuX6_!}{r1ej&YsGTm=7se7-Nfri2p|ULG-Ts7?t^}LVjbc
z_kHUk%>Q4rpXVvgzOATk=4D=CrYxo7AIH>nFlcz49Q#^3{p08rO$9XITwx+<Ns7oH
z2Z!o3pU@t{;Ru=c1kbU)Q&@@td+pCAGY#sO8H>&e!+c`_s<#KY-L(PWiCeg~m#R>u
z|3vs9sgfCVQ#VB@dL{Eand;c6o4l|O<Z=(hJppyK@mP3^ajUydPPmGm0}&P$Ec4=T
zgQqB1lN2NVZMq|@EjtZrV(0oZ<UZYjkl*X9d3EE_FvjkJxF$dyx~Jw#U}2D~_Ft&r
zGrISdD&2WjW)@W$oOoU~_FI7REhu=n+U)hjm99XF-<A#6CwG8^iiVp+^P5P~1E%y>
z+aot^xajrJSb)lI!wGl+yg|t9DtOxm6nmjk@S@RXL_dtIySC+3u~<KU!<#cbFHZ<=
z-&VTAq`BI8NvU-qgbfhjak_&ujAcrM-22qET^K)K#!26^dhzGwb;TQVKm58!pI<JQ
z3|!)bleRPvr1E*LJrxv8bPx^u<~k>S;SZ~zq8G9IHSl+M_`xo>f>fUIq_xzMUGjc+
zLQ)c3SZ1A(R%V7pFV7_fot14N23cjT`<8P}deS9^4+d4pU_aQGl@#=8rjqJBP|yOg
zZ#Zgyet6@)jMG_?E6;rnFl9+p8DAQ}*MA`o@DLhQHBt4hQ#w^IIt~}k*mkrk++rVY
zg7heEyV3cf@Z?(UtWc!(?F7xf90@~FsXzg4J-P|raSV9rhau*ReXkSO27JXHKqJFx
z<F@NN@@l!Nd8#BctaSCCtDRXRxqssqWR5YMH0FsBw7TN8U}gXGxcWCUhHq<Ct$nYn
zm?FM~CdC<L8`OfaDMZZB@RA_S>nFxOg$lmJmC5?(*yicJ;R&~4{>slH#2E%5lj&|T
z>LYiZ4@km#k;fNy=*J3_0gv4ha`7g=k*8XRFOarRf{&|(na|qZPfm~fl`{Qs-jx*o
zFRRdphwnl&5{}Rnex|}OK!YdbHx2`0{fuOWR{O+l-bF=e%}4X?0>KM+tQLwr7<s)w
zNj~FT5HYlmXfCmV#Atrc@h;Z`E^fCG(otV?K^5T(Y7Y;uk%UQb1i7!rGi9$4U4uY-
z)bw0G*4~jC)wxkN<MWmGPl{`OTf~ew*B^V{7Tq0-(qABEh-&bnDDhDnDX-6s3*Kj^
zvn#3*@btZP4=xK+CDEjVgZXryYfE$h(6O98-FN5T)_wR~p%bE{d>orp?8D<@-kGrA
zIC|jJAMG`sETGhGwRR78#Lv)V32k3N6Ef`l%e-$Bos&l8@{r*$;)7O&0F)eBSIF0;
zG&;MfE_GG-Z<+0D`IX#$?%DD(ZD-FXprBF6&GP7Lf0<^&LJs)*s~$pmz<4f=>4@++
zsM+3UPm15MUmr?1z7B444>gEkGe5S<+%w9K!BQ)%wrq4we3_-dE(QN528oa*bVinP
zxizdHZqujbT=rjIM0*`M-LUNKPJ`BZ3_;x(U^33l9(MMNpNidM12IO}ItPFyw{Kke
z&6hCs_58Xt>$651TK>rb*b@7|g1IJ^LSI9YVvtY*VG~l=j7Bbo+umx1C_d-1aCE2z
zSJbyfTtbq|l%rK8Of$ieaXl1!#fIRO&-<ytHbfLc$dC8>6-G0TSHz`b<xh25h{NBV
z4~;meLS>A*YEg}yG7tOWt(3s6<+$ZpI{vZhjFVc}RCse4JKUY-nG4Xme#D7IUgD8e
zUnytLBWG+MPzs0AQMh5*S}F3J`7qqN=i+@wtu=+dv%^q?OujGt#5^W7(NQQFz^`y|
z6wh*4BJ8qg@1-oOtL;AedL2*K{Y^aE_EI(TboUb)HMd8yIl~9;9waaZ93|*sL}?wd
z@*&?pT<VmfE-ASVosZj8t^`_@K4#Z*sOJ{f2}($;JC2X^8y4HbJYs)ur#vJk>a-;R
ziLf&pwQSF3NK{Z67u{qqS|<o|$fRl$;&8TI$m4ppa&6tns#y>LE|8tM_n`uvUcGqx
z$Uq(~OPG!gRV0RSqxj+FRH&2AhpXVV%G2&n9cNrfHp~ezUjk*(WuWmTaoUZ0zFR-5
z(8zxMq%H2ZT28!y`pdMq-~NNrQw@+4jja{+xT@UEE9Y;?%)ktb-!kp{py>CnPVX-e
z^!ud-c3~xJS~J^MQ(v1nZuRf0W7!vPiWKD{oYSW+TF-+iYB4M4xlIftlF@kKJefIF
z98dD+hS+151(C4>ui<GgsbXp9a5HXeR7q=H%p+V?rmJ*xDiSVktg^c^zi4UxQiwRk
zD8;ZoiXQP%8>xGw9H=3YeBtdziqnfDOO!P{*+IaR|L)}@BHuuPo}CuC1`$f!J$g|i
zG{PS5HN7SWzCY6b5TKm<c<iMGYxsKzMVGWsR)UnTwl6|+^uDMV^73gUuaWXO)xRAJ
z%k0$7nWS*rWdSqb)$~Ug?Rr(75?^E-(MoH)ldsD=>u~!`E0M;><{(q<wCk%+5PFaR
zM>Dk(YIIH*pfNpUlW(bg8>}Y^Gu7F7A#NpX)1Oq%7Rqoes$$i7h`sQQrHG})wr`ta
z<1ZA-f0>E=5W2CpTZ|;?yi_PoDT&2e5ZCm<f5;u9*QnqEM49zJ^Y6S1UdJ+RI#d6j
zEF4ENUxgO&iCjkp*r{u)NqPIVT8NN=b$EEB<FOsE1W2Ps4ApDQ+`f|=1M}TCeMEe;
zYCk{78zP<g%PhW)m|5w9v83Yb{Qm4%q?Qd2*qWBz?NIH)l<*+jc4v=}|J<K^AVT-%
z2V>qrx{nA1!7PKG&$_5o5mT%GN`Cz}W+c@q_^YgF$V=lRsS8VV_ghIRBiUlO8KDbI
zv|pa3HWNY#!UHXI{SW6}Usp6Fy%?QQKfhfB@58`p<D{ZltBch4w2;<WFDSve`mu0k
z&`EYw>Fdn^pNo3~r(pOw?@3iZ?4T^GiCB)A-fZGX3vpWsVq8WgMUY`u9DA(ieQNpv
z=uv7NauC_#5z^Z4e-uuwch<^T`!zVlaeJC>XtGo$clBUX+7MT<qiP=Dq>u^`WmAW?
z4gru$Q*NKuN8~N&R~Nj(?(e@qOlBnD(M^L9kK|pg_GBZL%GAw?4Xcw*Egu!1u*tXU
z`28ZXnwB;sMqXTq+osiC<NGQe-6o;LSMYq4s4wOePtBF(>wU3dgxBX(zTw*4*EkC#
z)svMyjVkF>87pss^_up(i5I7G({gJ54QkuYVPB3U0FRBSYST4}_GccsM6b5YHptlg
z>Xp2F#;Y~0kRm^b$bpAJZ$Hc(<-cECoI&!TD^q{P3zzXWXh&8<6Z1X|G-B9Yh>-mU
z?mw@lbP)GggR&O#cq$9lAinemT5jzvTm{KFdum(<&9hOcWHos;#WO;AlhehYGf`=S
zEDyPA@9bYkb*ZCM`DHv!uuM%3-nsRx_%wY(<n+DagC(%*F+eV<KhMzp8QMZI4=()4
z7@=c$k$FVMDHQvff|{Yh9Us0tpLbo+fCcF8yw(8kwb{ch%x<;)uV|Dta*CmkuPj(T
zZ(eOflU#`IJll86B(d8xykzc%KlBXjap*wPtA#|usL3~{Yk}*nw{a7{S)XP@PxwlR
zg4mkWdMa~ime#BgzxRtZ;WWB)vygb<_FB<8#8i-F82luBn9R`q5B75rv%Za_Vjqa<
z7XdVMKh5sr@o(X5$47#`6=823CHTkLnZE>wxp3xg1T9dUpy0pYU#AqPQG6-QWh1Mp
zpQTzAMzg)=HpV|Z@mB2Y?d|dJa!d``jm{yBQ7hSR`oyr&_^NGKb}m{;JwE=XYwswl
z9d{lYTrC0}A^NJcIRupCfl@XZ5F)1j(j`TY9tt>EZDV=2oQGot+z5{F;%@J$q;%|6
zA(Hz9)lAJ1d)L@A$FFUU2Ds`PJf@D_<n0iup2^5z**XLv^kiB7B`-X0JtFw+W+E#B
z>qWR~HU~uNWidlS4^RQ0LaoY{Sk6D}u5yA^gLLSDEoG9}XW)aOhN@04))~gS<$P*w
zJ5nW!t_JRSDusZifNs;}o<HtBY?jj34_2+JBNGUoEXmAjgTz|KWKq{$wfmR)BL1fb
z8~ud&90Z#Fn1Za2id-pKbe`|zm%N)i?tW(;GAD~Z481(mM;IfjBZ{i{_-}d-0dGi)
z)2Cq>MZ}k<FT)i^3Y(00YmG!i)HZK7BAu)E{gxI=zt~gd5MX<KE(mNFmLVd*qS|*k
zoBk@fDV-y$h%ae!4YL)1<@b><(vkq;jYEkbi&U>|V=shh_1SFh%x``B6>grEu9Rjq
zb!9EgNjVi8_D41PUWBdJ+_cc{%8YH3Tx`_+FgBzM6+ZRMHo1R8LMmMe0`c(I`i^qw
zB2YE!qTE$IW?RLx&J@7i>*86zF5f9;jujTrJzfU<*Y>pX{+(4`5COh=e-{>AMpH()
zCrh@JB7p5+`kPQn@#{r9F|`v0w&2?)k#d+&NOH&~bH(PI;HO7*!EP;}J$*g^SbDI6
zOno#cWb1hYBIZE4nvQ+!vq{2k(e0y+Z4EIv6u;#J+)I=WJO~X(P+%5Q91(!KmwM}R
z;7jph&Uo;-tzv%ml-g&lcp*71c!MSrzV=_DMiC#Wo7XxsoA}Fl7AcdxQxX3_&;J)0
zyGepg{57KmiB0QtM)i<l$jG)3PwL3pCwwoZk?_yrp+jv2TJyg8i!^2GZS|Md9OPY;
zvCn5i9aM413dq*Yv~-51N6^~f$~wcHD-3*{_vJsr6tp!c)$SQGQ+)i?|F%`MW^;@6
zg{cmShs@!-qdq;An`Q(o(R0c~8kEN)AJOr(wKgu@bV#!Tw$wC<xxbpCp~hECwT$Q2
zt=i_*>}~4WzheQ5b~??j3_*Qvh$$3@*2CslQ`ll;(0lt_SHNX2LhpQ8rh~2ZWg2~h
zIkVI@4{+E1$42V1esOOwQ^-2rlWh3EjBfXGJwk#ss{N~2MenFie@oB%q}L9$cFd!h
zy|Ww9n1=q2#l+~g(3n^_u1U)enH}n(;p>emU0M*xz_Lih6S=1=j7DCQX_7vi@+_5;
zQbS3Z`1VGFcq%kfe8$3zvh*uq-Q1q9QAv59Cq(9s(_R~jsm>MHD-L)%A&3&3FzE1*
zY~W0Ql@Y<=0J{y)vhtsaW9pb3AuV!0Yim;=m7$BtV<6J}8o6F=;0Jvlp5T?UkKaB)
zvAKs>SsEw0<`B!LkD*ON<QnDIGqHsGW+h}GD;Vr`Iq&6<==cn@q;`2&mCnZOi2vx?
za+uawfyABlpnXm432r)r{Vec495_1>mdpEKkdBn=UAs;Y8M~D(!(z@TIJv%kr#qGm
zTCnw&fzvKBqf=4%Vj+>(y7Mvb?v;W0FDK2Nn8_4YJigjWNRL_rh|DL1VlSq7tg_M1
zRG;7g*yq-FWJ5PE=VnWiA$6}jysOJ~{w|7aF6CRG&?hD&a}!Uk*sl20r}}T?F+urM
z$55h{D=Xd2tvsl(YaI#ZJX0c$rNg<wy6AEh9SOkwe}|sSf7o+fA=<+2cANdVF>1u&
zY{Iu{n-3tQTpxo}sU$u+IOVvk8=iCkj_-hj<{*I*2a5{qkQUm7Jm7rv6@J7bAJhl9
z644}E6JEo}Ox3qzjh-@tBAxAHRwxor7xIW)$ACzFbEX$%vuu@_{~RF=G2G7YJC=*M
zz+KKpf*v!3M|}c5c|s?*bgE_1AVf8bla=y_94J{!#hK_9sLjbYHwF}s4!V<zJsNI|
zlDi`&B2+#Rr7PjKhi^il)9FmCuUw_p6T9W9w*y+O!;;XFl5rInhj3-V<1t_#{l|VH
zI;<AG#y8H$sMZeFDL;w?o3?ncrBvjWJ@G<nE5dFgtaG6Ld5Ad25z8UWEK}$hOOn&6
z;W{MiDUmmT?_?3eW4bmoV6DK$SF1K6iA$gavXV@61#8P1IE_A0e*Gk@yFyIHU-1_C
zRnr{e+FK(lI<@stvx7wPVY^H7eHBPBK0i#oZY;~8iS&L1MtpZJ+=r=3m?|Xp^0}BW
zy1DiL6=3D<OlSKz&<sj^=U#=bGx8szk{+FG*)ZG@!OTIMDYN4-+^4gZP+3K;`Mj^+
z4lu^hW(TW+*X*NkQ-zyer6f88=vv@PvPkJs6d0nyWn-t+Yo$}pz5otJa&cGjenaw+
z=7J&uiS7o}Ks{2;5RaW9=dOd*qHIcs$n8uTL^Y$L!fnxG`Bi*O=C1-ozXd~)K8Ro_
zjTlVtf#2x_4v`xX=%!14Ce?a});s)E#aVCvbBKN_3-SC<TVeO<YLGIP_->^;>C9<o
zjy&<HS-Zh=XT(2m$OGlCQ{gxJv(@+X6^%5HfF<H7#JexO5#Xd*R<tj4!CUDwuFlQQ
z%w*OAZ|*^S!;)uCkt!(=5=$h%zKS7d1|j<To+b~DM+*QXFk8eX0Au=XWi#K2cWJUx
zhkEeUvOV^foH;hE7`!J|5<y7!5K=i1-3XTQ<L-^ZoOYC55Qe?<Hb*q68Z~i20lW)F
z!tjNgnsIYc1IKbh-FgBLDP=djQ4fw0Bh;Zt=GViiW>tHzhc&kOj~xJ$eYoz@v#D9o
z%+-HhtauvA76(n%_nV;M8BDj7?@eRnfBB~Jkqa{$$BmrBaCV;)T=s7wYOUT?$z_*~
zm;1DCg-aIE71}czp9QZXWM7|)_JoYM@Lvxm&A>5Mb`z7Rr7?8@FD#fSoz4y0dabx0
zTP=V=M;98V<WQiu$X4pZm*{|G-FC_pf)#`zM;uQ!{PuvxF_weg;tOVKkVX|Z_awEW
zq~}e_P~=}eP7QL3Z==ZP|Jrx0)wbv*Z1Qt@6u(dm@g$eAAc(VJmzy2Q=MbYN2$~Yq
z`fBLz9=JP12PW6zG=OL}m0#PRr})o#et&;A(LHx&RT;v(zM|)j))%Iv!;2hy=v3J|
zXw2DEcnBBCa%mm3W^2`G6NokeyXDugwaNMi;2$7r!?l+2&d>cApDa;W7GnJj0qsu(
zmR>yE(+ELlEhbW)$PbMAd5e5#61Rb_{@iPcc(&?~mKXTsxvIbTBG8+v7za(Y+~?tF
z8ziRUpgayAK4%MAf(OL8C7^WYg|(YQkn)&&4-JdrhypT=2%|%b-&;_%heYldZW|Ux
zGE1vt0V#2dN`CH~AM87hLu76uA7hBqfiGa@E37`|jKQRcowsV8y`{{)KGQ{3HNLye
z^HdM;Sn*}OTN^v=HG27tlB#m>@mFeBD(L2KmwIXI=q7<V4h{KF>{471Z)2gYe-4vI
zJsc!QR%IKz0P<38737$Hrr$3msm-M0w_Jo?d)XaH^|?R4jmHF4xdWcPh!%c*@4VM@
z8@`Q-7ij)aD4#b00@!ai3avSxT$2Ss<?Zu+vpUPP@Z{66E#&qCsj69oj$EVpyJ<6d
zV#rbjjU$v=Ee6VPI*M5GWN<lm$q!FDHyh7yajsK($&%2Qb+`HGv)85Tq<4%AWIYPX
zkh?5K3OBat1t+R03^H|JwBfV70RR7{kx*YJsseAQafj5edkaGn0@r8@WkvRXft}I5
zNJy41#2aXeu8RDLlz+1cUn;dQ6klFXM*h?0^TL_dJ81IuvCa!_TUo*7rrMZoxK{RM
z63LPQ$p;;PoP<@X`XbhGnJ(L`2bPmIkRD}L%vXmkFV8+$`E=kl)hG5o=X+A<9{Zw%
zGG5>zvlKGK=-UPlet2$>$h*<n30TvGK5=?Y3yl!+I&twS)4f=%QP<8CaXfS)2H^zE
z$^3?U7iO_zCFX3StF{nvAvG|${Pw}Z+qk!yrAItIF`Vs>Yhc%_*}C}tJCaw-{Y9#N
zcMcIudxVq9yNP8v5(2j!?szSSzY}(yiV2yhRbd=kOg>g|DDaCzP;>q0-1ER;eW?0H
z!H8GFS{JEcFp~04ZKUmD3V6!CD<UG|?Z1iNfKQK*LSKiNTb1=p$?rJn-&wTgsH;ae
z<kcP{e+qSTepjxiY3v<yaDjvfv~3Igv82C(zQK)gE{}~`3O|+_j<UAU#~!?Goc=k%
z*FO&$!^6_8(r49XTkPXMs8ro2fpz$t^bH$JfgV!jKkxV~;9Ru`bd3V;h=5>=FONOP
z5pisJ1{8o(cfj15*d+NCch^)X&V7S5*~r!LK?l}MGo#liK(~*o(Y&!m_kS0^7%IDW
z+r*XFp0wGL0ZQV`(K@V!^^B{D@Mo|H8tKPho?p5%Pjd!&8Y`G4-47r6OPr*CyaS1y
z^X&5XHh|a~%=~7F-%4y@WkU>ks$^iavzs)xwbLiDqpYG^-G(O_zvs10B0c0=vuOdf
z{+qc;P_6jV^1UinozY3*(y}s_@mEEar#n@33!V(*tn4xq54}MRi45>k(fA$@j^;ai
zfGUNKLh+q!Hr&h1Nozr2rr``Rg$o>QKx!*%##KZ@L6wF~c9IZ1kGjxVf>mufbU5N{
z2?)ro)%VoyU!4XCifENjMc1NYQ@e4R1UZLm4ZHXC1b3h)(7|ZKoi$YTmq<Ph+j4AO
z2h?aqFxL1bsQ7xp_d-G<5Hp2Fc5sBXtv(Nx^^l?qwCo_*e}vuz8*Ueu^e~e?wD=N=
zZ{9SWt`rOR238*{!B)z{Lu=}iL#-$s$N))6U$>8}yVMC?vP|leck#aP)5C_j;eI)*
zEp_jF16&Tf*sN+A-WiTLa9~plk})dS9cXJQE9G6_fZ@WR$li3yWpTUyY4saiz}BLd
zj6YH9W6Ntp9%B)v#}a){86XLk7l*?^Ll-16tl+R{ZDrN7*-^C~QybM*r*5+G3yIeO
zD%lL3HZ|6ABFM=+Mwk=`SbT_~*Hry)u}+lKj|l`{VZ7o}(jn8v2Px@A)29$zbk_4C
zF+cF`38Q6z#G>Uzio<f{nF4{wE-BbTAI1AK=s6@$ap^hO#NkSNH}Dft0c_r*TV5=e
zh}d2ONr_p3&4o@aOcvW-+&(QRN#DSw4et3Jbg+{wy0ams*gv+cXaDkoV+ZpmnR5$g
z-tdTqk)DcvaV17H8|TLSdH1Bie}1NGdabna?N~es&!_t0;1kmYZQ#n<r`KDj6PI)i
z0C^>1dDv4Oi|m(Wa80pYT}7H?bUu^5b{SH64nJhXdIfV{-?@b=3SYB*JT)E~7S*D|
z=2S5qZMa$$Hx-(MIIC9$AKlIM_(q^13r7j?4y9{LSZ2TWK>nz6aJT8z?TQcZG}AtL
zZmWOeDE@r_U+pD-(V9)m4Z(V2>Gt)X<0P5NXoxuH;5-M{>AfZr$=g<)r+xhEa|A7W
zmn~&!bR7!?Ckxc|WvI4jjSMvW9yahJ={FCv7bHjfMp++*`v3VNc$@u8sj&sraLgAn
zU<jKW`goGcp`aUF;z&xeTVJ?C@u!&;(t-moC|aTH;gA}_zWlj`HGju%`<n0-kIq-~
z^xyh2m7e^l0m~}Y6CfmRqIOoe4!pZCeZ%`Z-nS1REVHKI(&M=B!}(tGeu1_J%sn$&
z_I|@NV<q)^(Gg^f9xp|3{Z?6Wg{NL%{TKpGAm}r4)+pD^^wLL^DMo5x+^N_Mb9i4Z
zUbx&JSxCa`$B{g6VG#JtI|)>UeJT%0aed7lE3>$OM2>U|gOY$%bMnzkDQTC9chGM`
zZhG%tURy}<BE;=zH!+hYXike1u99)2hZ7^--j7F*>S{}j!<UJT6<-7!t9;k4nu3WR
z!m=3%wx2eWpcL=x6tFS+x%axlrzDmDzAMT|eprLrQBM|D*4nT<=&WEiNA5=oH6(&J
zhTd}Mpo!95s%tUokB=e=kNR{ks@NYWF1sqVJCLP^AY?0FVyY-}e@W!}I5UYaQtdPo
z<$OajIbH#n$gke^W6Ov-3M5T_bLOxLd@)6<y*VZitS>x48G^BZ9>Aa?C7=|@O8ekd
zCg(mm$6X_=qK=lJc*vPGo!c}Y_{ttzMrB#RHlMU4%hDO+Yy(Y-T@{hmT?)6Y(1xm*
z?xDrgRWgAMA#8#hpyX&uBZD=*r?Vj7bW0Mo@Tmid3($`-LSFrGZS+V;wY-&f6-=s)
z#lrow4I-P6V!XBAHsM*zk>$!7rU!Xq*rMuigy+65$yrmtrWi!iX<~$sv;paXC@cnd
zyaoIe@-2s^Pr`6?HoSbF9cKGT#8G=(#Ni5eyp(NND+}O%@t6&&mqg}k#%7O2QA?m<
zqd2N0s`x$%Jk6sK<`B?l`TnL370Gnr`N1sW9+E)fyb8RJVRSP;KpaD3-RIc)TmP;&
zFw58?VW(PIYi4R#TD<H>0dWmE(|6sqgb$kdZQWG}s3zhOGEPf9X{z*=Hc3n|QfBFh
zS;I)*U{F0c|3;b+s&Z6py6Qj2w0yp;Z2kJ0vD-lFr=$q!<0REg7DJGH+v`323Qnn~
zuKjwsCLUM;<G{+V3q(3zuFLlTek?jd-7RVg6wDTLx2n$n-dh4GWuc&H8W0`tsiC_#
zdd&w!w)8L0%%@8*#X_H5zhvg4ABydOtqv3w)%46r1`#q@|1{<i(V_VEABg?9{r8d-
z>>KZsm>ff4sUae{_b}TIXCE~ZxG1Qa`=yoMEZ;iJeS?ue)Wl+N3OEQLyT&~~^>NfX
zQ~vdcn?U+q$dh0bmV5BNK&?v}RYKT0OJO`WV5lay9K9I9B_)CXHHpccXHH%spBo0{
zW03()L4E&ZZEePlQ*w3ZjvzR?yG<3PPj(w$<n%DgaL3ynAO33l`$vmUf4PZ$70i2G
zv~Z|>C!XPQN-OP?8mPH{{*APRn%f&OI<!6dBDTwwPjd9Hr<h@D5JeCz7ALEJ2#bSW
zf(WFO&^4(pEBQhg;g<FGYrhivv%r7xM+>9)LW0ALfzV-vKD>yFJA)aRiOZSh4-21?
zn|<ACOH39kB%YbYS|hslQg=8D;E1um+nGHzE2u$_LhL5x-K)0n!=56-f!k}-;kTDK
zYd*v|EvO*AiCMPJRiB1w<7at<l@c?Z1Bpo4m12bnz*qp*9l=<pf`-fU4V|To{YDO_
z=b7J}UdMb-U*e{jl4Ff@8v9|hs(q_Cuk3K-k82EGb=c^pi|2iv8(X#4-A;*JwX9@$
zSjS%7CS#ooQ$BriEt`@pb8Vg5?ccfo8C5=6za<5Oc<z>#P0wuo7%z^R;oWe#!k8rU
z&`%w<I?=!h4jg<^!Ti)UiyHuJw`rQVWp2mUr4w6mBn!;()Ay3r9$&7!e!YHh&<Rj_
zY3AhGVc#9^h#YCCI`0sa1N?B~@y6Bec1<^Rns6%{Wm*_}qCjXp`hw6C-Gc`YI!O}K
z7Lsw!m`of|vP^Xf+HEA0<*7llu0HiTQbfhv5%rw0(Rcc1>K$|M!Ee>EF17tR<h(b@
zF9ZwRvFijJ1%Vuur`<Lm)iu#dyUBnIt|YpUFMbu~T8wN0l9XtT-YXh38=sCy4g_ls
z{6h7nj`;E1f_~Z~b>@Qh^KOu}M~qTIRq|Q>HMKnG)+L>V*ZP$uF9UPond3Xv)bFnH
zYC9HJGj#j6@Ht!jQLcWPVR=^kBFewNUS>P#E<v^jEPU?c&9~<t?U^8aC1Zc8-ktf|
zLPH~QNBmBAzwIpZKFrr&0q@8Cr)Gx;j#IQVoJ(ru&*fY9g)r%+=Xnp6Xk~f4|M0u2
z+y?}L<J|8tghbT4(P;lz`=F3;enHcxUL4TILXp^Gc5XM1d3Me8OWKaspihu>nFQK0
zukgj5h&*KDH~z?L(RdmtF^%IcmR@{Eao;I1O<eN8mY&aEC+D=Ht_%E0kZZdtf69Ec
zp=y$DYg^{>Ubj_Lxuuk8e)aAp;>*b)n`tN3@Mw^&BYu_nEVtd9P$>#g4<t;kw^HXm
zAh`#(H`MwABG<lNjBfQGD(MTfqM=Xy%!H})NQv#~*q1DLA^Ccsb#luk!B_N&A3pE8
zmu|WY%8ILoc+Gp}l&Qp|_=#13&+#q9eXYJ2U;pm<698TeII2aV)2^HpfgJm`{Ctbb
zz-|-qP}*<jfKqmwI>qFT%X@;mili!93G)7L3}Xhv0qUuAb6MR9dyC#du`RL8Mw&Ls
zzCg5Uz(nMOPt*tqmN=x=bS@2RHzlW+TxH@gcX^{pa&oo&`Fv#@R+2mWU4l){GPjTU
zVsxzfnZ+U2#yG34<G$~G(L1l=qy?`q6<E2}_HTioSj||tK6TY%{bh2<N;rE^>jjj=
zaok&H;QhhPs*fXf-1ef~($Y6mf(I2IFee}LgR>o)w0lT29l)6!?6%rP<a_b<hVHk!
zKTbqVKupH!G3u7<v<F%Cy@Xq<Uv62NKQ<0W^ETP?B|WR{_XKXi*>{<{5<c%0EfXF-
zQ%p44VdO<w)&zy&&61mZkkf!D#SG|7JH?&S_0F08xz{E{Fuz~CI$EI=GF<DbAm*XX
za+_$Ym-04`@5^8HV_goW2omBM9P)^V*KY&jFg7aw$SJrktu76@K;-*3BtCR*^pby2
z2(#R1)qHESw+JRa;Zd&V0=d|`YdpfcO?w%c??254(O3V4_ljl~j`^`*T$_Wjd?WnN
z?)FDkC$YM6p1PIfb`AcIH}aP|g=$&ft!M>iB}&e^+<K2sp-*tv1A_>Gb5?i8QpuBk
zm-;t_2ZKXK^d=fM)QTzHD!VFQxGgE1|A2Y8@!Gn<hMJ`tW-V|mr2sLY^Y^xE7yV|u
zjMA(N6g==tW0-ZKzYNI`iR6)8qrg=wx4ZgBP|0k-e$OZM&LXs*Sq*S9<6kGq)!tl&
znH!fK<SZSZ_m8P>#oy8{oLt!=$Eqte_v*qAed%JYZavmIMrPkInJsnvYu;SqACsX_
z$gjyrYld^4-q`YO=kN2yAG2F4huv=hv+Dnt?Zv-sRn4X&Ttat<<bBZTfadLdrhk0(
zypEg@NX1|)DDmF;P4tfYEgW2^Y_JrellR~flKyEML-G*ansnA`mLs{v!)T3UzcOMk
z2P{iIh45XM|5=cSP(w;FP5)E$7Crl2ETijV8OB4t6vtn^2n0}a-<rCaJr&qoh_3Bw
zsa{~Wsru}JE@7w0c#h8vGDN0f>ec4i$2oz(NwLwFX$NNyl!$Lt?uj-03X6=ZLu`_k
z&eN#Z$Pc=G;NH+4LJboE3(CPQXR6EHb)Z$&E!RT8Ql(O+LZ(5TN}=KiB!J*$7$d$d
ztu62BM<<pQz>D{rECFF3sX>Jq?YNJjt)5IL-)3IeF0|Gfui4%;pULq(`kl;eAo;&S
zrss4E$$Jv|V__s-<ohbHP^64!l$2Od2^6Ic^20bvkxw|8MG?PLfTy|Ee{NjR{Sm~F
zwEEkuiOGiuR+4b^RU*ab(x$SD>Ynv$>omSN_adnZh3ERoJRCW+MSZf0MVg6Jf_R8A
zuj+zm)`$!HaqcBuF1HC{C$UDU47HZG#=JxLL$0Lx<JP6l4uBLmEGOyWjv)Bv2Sd(*
zd*-Am{#es8htrCKaJ(!3?iC`<Kd>nT8R#89YK)NLgIa0L91jy>y4$v{Gb?ubs(yEl
z&s9JfNm8X9WvoL_%A|~m$Dr@g#%m)h^c>{kpgpNt5z0f56XRU7w$~AG)?2Mh_6*=_
zsFFtPqL8y7|H|&7M416b$46n^ZRB9CjvMlpCaP&jyhnwgYLuxAk4sAP%d}Tl@nC#S
z*B5N=Or7S+kE9GW+P1p?RF0C!bP=go@#DDJmX-6)dm3xywEeslZB@5dd|~<diJm$7
zx_ZpZL$^O>11xjt*W^(m1mW#{*Z)Fa?jL*7+~2kS0CRkYernaJt(HSGe@b)JlJ}8$
zXpMI3ZR7Q7fl}7mfF=8uoXJ<A{`!8;MrLP!^NRGxji4S5AHYnB%XZCeugk8HflDmO
zYBsB`nv#8JnX-lxM~H<e5Ug^giHtaN4sqJ5yu_8Ix4)pz9GsQkY<>?+n1qg56uWM`
zesvHv2?K{lEdIF1OwdRm?-RpS3GL&0`_r50Y4e1XR+92rOL3<jU&lyaYa5RMj%#=D
z6PUYq@Ep|z*T7ctJ<1iqxf?3Ct+PJAj^;-2F;3FX&SOc3Y4zP~6cWdCS4oXX7Fg+O
z9Ym^G+kW|;=p#vhi;ho6Y%p~|17JjATlZ1LZ)-g!&p#17ohgrSGXS96hSxTqh}mq{
z{9<={<WXIm`FQ2QgU#~$PgUhPkAPG>gYj%;NXdOMghMy3@GC*m3hT{^1{vw4TBDLn
zyNkyAk3Tf50aC#4Ff7>Cuy=*w)|eC^$Egp=yn%MLw7Iqjsp*<$>e27@J)h9wq?s!H
zOd0F_IHanILKiG~hsc1T9ny1{bcwUU0EBmQb=scz4>Kz(3+SJe`Oo`J_0Wn6YGfJ<
zxKt@`=8_TR++_iuMOb-9>wWSb?i<YsDV4OZ>M&owvim)=v(i$BB>Pw4tJ5}^;?d*A
zD3u_0fzU;0*i(|EHRTws0|d*Z9^%TGk!|^C;4yzGPK&BdQgUy;kUtL3$Hn$(i|X?`
zJcSOsnO_=xUTF6&JXT|L+aH`uG<qwh(_z;-<0zRsR=Ol8>(`h%32}{;**nd~Z=d>@
zJ+{|Ue3=|#zRuakbV)ur3}MMA^Z3{3HEhZ_A3!f|L%o!E`oZ*_)6}|oMFOPc)G-qT
z9?-WqzR0`S{u&|$w@|Yl&ZUpfi0cOZ-p!L%*_w)@4_+mm1W1t~2MXqQ<FU_jZS2Z-
z`Vp{IU`<9_rsTpaFdZvCR31YC!B}0t{6WF7S7OD<7Og=v$10;}tk13ys=GbkK8j@2
zSleXJXRaD*Jge<q5U*_<-`Q&ltg<JiC67Men_Nc*a15%CR(seCFSWM~VT6z1;#Vkb
zl)gy?+6G%AzOS<6KgYK!9{)dVy>}qiZT~-fDnwRMNm40FDwINqP>2#4Q6WVd$R3rE
zjFwRu$xcFKW|O2M$%>F!X4#Z%&+9mKeZSA|d47N0*SN3Jd44{}@qWK%Y*6)Qd17<h
zcW>9dC{oik*YWzyP;<8M-q`hTeDvjrQRCCThBF-Nz11i!jpnq(s8Q(ub4;ExAGOM(
zNTJLke^4wJfN;z9fZU5P2wr^Lze7tP&DX%{{c3hjK0Hb26Z|$ekKk5XSD}aI3$wkl
zt@{Ng^{)?R7F+ezeH0UnsP_ptl~iJz?$go4N|U<Al)!MU$+}?;p(65~nbrcT<~qUf
z_j<Pkyz>1<E@1<Lm_<%{<o@|mCA%;%6eA$IRN<|(g~JQ_9MNy#%iXy9SrJK}8ZBTf
zJ3kpSGuQ6AP$;F3lD=ZaSbw?9?+{a3#}PZ@<UE(@NyBjEEE-8vS6#R4W0U1eiBYHb
zmUk@sElGsP7*5B@X%1)96d@Mack@HxzL!zBU_`$7FP$#ZdT2*;{GH*owMK8DQXGj^
zz*vo^k<xBDEBPYQ^UqpW97#)f9ka@gc_UjzP0??|V4PyDRNXGD4+M?=Yk`b^HOWS|
z@)F!$XP6jm-Yx&Zi~~0q(nwcwS922PiC$ze=Ss!ngI;qMEduO+ZSUPs$hf6T*xhtn
z*>_@S2rB!0wq&)K!Fw*s{>adp)v*JP0)@QdOS{EUSd13wv);KASh{FctWzPiP=@uY
z#j204+*U?h6hkS@%nlbH&*XRh-IB#X93WB8oXT9Tk{5B!^p14lK<eD)9V;d(3&wHh
zO1^iM&B&~HUm>%So9~d@KI7<3--C9YmRI2UZJ8tY76~DWCz4Jjyq`Y{y1~{#uzbPm
zsO(zmc(Jn!@<h`-Z-YsCH}-?E{)j$44sT=cne!YyUk#OiZ)9}DiiT&?9~Hj34tZHZ
z!XB;0lh&TCCcHo%sNS;Q#-VDDy^~k-HE&{LMa+3|#(rM$J8j{Y$?^HB{1#3m2pF+Q
z@{Ci6I}SPRzddrpapWjovd2z)p2jlfofHW>|AEO-BH#Cio+F@R`^Q&eXb;#smqhhg
zSMKl4>8xh5e{jBJ*-?P>=greuOnTGC_ZgE`_$V_j;$5MNvgFY&!>J!IwVS`t_ZQxO
z*QMm*#bv9qOI?7G2x{nVT3uC^zS{n}6@x5|<;>>*(l32bTcnag#ag_kkll4p8LI$i
zp)aYoAClT97U+pR#!oqyBUFSE@Scmku`vv?=^*9Dr(KO6v5?=w2+5N@_PmbvcMA_6
zGS^QBtb2|_u`BA<G9`B0IygboRMv_aoh>;SwhvQv`DfBc?cQ7LKiHbza6svY)0i@r
zF{K~7d=H$VF1@RYGb-4qpi$2)TbPrNay`Q^V;@nBe)^54gD5v?F%4sSsr?G!my<hB
zpVjkQ9{84h*#e3Fpe_8xBXih9WV(myM6385@h>^H9!(97xYAEnR?M4N^XPjSC8NhW
zXe<V@u7b8EW4n4b@^CLyJYM;G^;Zq)9VPYyQ9tw`R&;lfk0ZDJcr~8B@J$RsZ&Df0
zSFU$AjV7feCGFwM_pNZdo2r$s`4Y$~#xStj%=#{B#T+k|iRjzQ?0tOKG-RV@Eg7c7
z^%-NTTNEN&7c=~$bC(1zd$+Lwl(1%0&-y({F%B(O-cp|F<CDp&Ox2x6DA_J+vN{tS
zgNVR=u8aM-!CXV->VTvP{ussxAJv4Xh6weDU+05$Oa^*<U8TXT%>6q8*8J{|Z}TuJ
zjdm{I9><{d(FklmM*ux5mskkRLvN#Tu5&b2&kxf?z?a8){x&zx3j@R<BuY6Wcc&Zz
zzFP{0*9Z5>G47zI7(C9lg4)T6Z@Ir-DCD-lQw-jSn&ptYucceK?9t3;>9kwEHH-%x
z+=k_YY9!Z9Gtc-&kXLmwh`M4eYFja2=X8@9qDR%kGwAG_Xa0I#MgS)=dKE0JV{K-K
zv_eI)H&Rhs0V~E^V~>wNZU$*VpYuQjGsv~FWA4VM0ZO&(CpGC(yZW}-qJeO^hDgBd
zU`uX>RF<#|@=FJ<_pS^2RuM+JByeKIUeX0GZw#d~nShN?<L!bz{Z#+f28399cYASL
zSeE7u+_)&Ry6Qzawz}>!HlKE7mqGh4Z}AYi0TqP)KedriG{kWXgz1ilQ`t#$&E0;;
zrzL%53nT}QWyjPmg=0C{zv@>U3`mUADqv<mzk=El_X~PjwKR<Ie-*rR=~5-<1RX`3
z{fS7x2loZWh{0?N>p|?l<^H@1ExT1bq9x@qk*X%es4w{JSoypG!uo$vPF8>GVxp<N
zIZdNWB8=L4%L7$6rJiG%+S<Z>pu((bV?T`M=Lbg--3t<Cl!rWn0$b?607tL|0(89V
z)fe`US(?WW?E6-KuJqTU#-ACd%>XJV?_GAwzQnPS?+mV|hpVoj-1eM99P3(<P2ccY
z|Edef*Nh&cd~{tKa5FV@hcUx5-l!gyy9)UZnxhpn3PtBlMds;6>O7S;@2&J*_Fgp7
z><VDD@=f2TdgFpQrvYGyGPAci|7{&AN(wwBFSE%}M*!=Sllv6=o|mL2B^x36&#L!Y
zITPSW^?24#e8NWRPK}vYf~7n?rutJL)SWuAt$83LmFM_K(jAcYZMg*BSsmcQ8lsP6
zGAd1O60Q2an1ckkNX31s^rr8eSyP-5Qn{~@(i3<tDFw!qt%we|lc!0?_y_muz(R1*
z{GXHu?E`Z4FKDYolu)h4C37Rk?!grx$7+-#j$c%Ey{;;PE<%@L_y+sEY)U&LdfZyJ
z7@=@YOInqhElN8z(R}q!>?$5eu%wpYIlYS57u(7W0RkGEm*j+s@jFhm*i`9p=1#y1
zz61B2Z7Bq~;r9{-+tIz*wUg39BK)Od@e_ZzjMy-Y$?EH*j2n%ACVoFSN;<eQs-@)+
z(Q;+821Vb#ZTbr0EG&2wwAbk(F7WAH9s8Z<j2q6_`1E^Cx+PyUDMj*V;E#qH!ZBT&
zILbMZ+Bqw^+@D7$%dF>Jz$@#Ef~)7SgMFA=(6C;O&~HyB&b;$8d(iVHo>CoIA1gL`
zl~V2b!tksACo}X<4pye)!P;pvw{qc!*6!FXQ%33+d_H}ijI>!|uWMrOzrtkjpz$@;
zOes5<#cJsG!sc=Dq|ZW<3+bT)9s_u++PZRjUGXSsJjl7@sPJYTTR*Qtx(`)q))GZ5
z`GOJeht+mz-R@`t`r5L`8bD_5%;Ag2YAE7vCkbOg?lmg<7d?vcN01G;%N-1CDGU(@
zIQg3~!cQ-!AgJ)S@o61eMk<>~{wqsEuW1q=GPvtorUt8OFsGnyY7~+hcqJwfD7hk_
z*`3$m&`6SE_3^$WIN}gOojraW^RzMYIOPoG^eI+mS8rcc@Z4<-{I-puq;OaL1x3DQ
zG@MzzuH}Dw#`~IHIuen^UcdDhpmPMHskBdl-TSubmSmQai~|v)oy=C&?>-q3FouJ?
z)P~M~0z+Mk=BE?w9$i6}p~i|&q$H4`IB%V)09Q6v35Pr=FOjuvU#%@bbMHi|$jnYN
z*{BAMxt(UE(c#FyyXn4(IaDNS=h{62E0-{P>S2I?4ccoALWLX)0ftVO6?<A_Q+>a?
zEGe<cs~Y|`3Mw)Px_~?j>A#&*vXx{N9e+x3)f!>GobkBV?K`i$`*kzyDJ>2@Im7R*
z8a~wwY;&~VTNs)v3s1y-S@Ld;vu(xGej_(67}JfyEe|&%h3AjVfjnabPToXeSgP&U
zYQ|gg7}7IwNxN6}antNZHNujPsTXTDXJ4l|2K9jNffMgV<C}^%MatR_=+1PWIh`nd
z_N4I(EL*}J`kNE%<M8a?HT8+n-#6hGh5F<$&xKC=xHcqLSmSlc-FUOe6xb7Ol^Oz^
zPq&_t+j9wmIs0F^Stz2#8Mh2mr#dUYqkWO)Sf<UpZ1RF9J*)_qYRKREQ=jbCIwa-y
z;cN^yo+C03uiyA`5y~O!u-A1EcWQ)n16a{W8ZWWMUaN~lQ%zHWlTF>%Ws@>rkVe{o
zXn~yx8n-$^Cj4X+rw&qGYSdx+Wh)1dLU%M;ZZ)KE1*8d16OkmY(NdqoZ`X$g{9v})
zjQby3W<*DmJg=ip+c74AR~`8)v~DZl#?q1F=X|VlnQ=;GX$vt@VBYlb;-`V&;`NG_
zg&#-3pyS5`5muU$@S~@O)9}XkO5ip67pmZbvBo)fhxNYK)vw9=p?o{f&C!{pqjxpO
zLNJ15mB1UXfxk(PJqxlOypSJuA2hwuAk~I9b9939&ca_pFtCuzf?SoT-Y{0&czqzP
zBYg8KG%+*{MWsyt9o|UC=*o826pp^=8P`4xU9$Lz+mZeAlVBUMn6+qz?xWbj82KvL
zkK3`;XkMs1imQ7@_@Fz6&U_2<2%u(cNrxf{BE={%Kr5HZ02;^Lbo`d);o!q>GDJX{
zyls(W{)jf!x7as$OdpvmKhCJ@*@K%}W}V>lnvSo4iDZ|wGa*W<KaVDG($ok$d%!Rb
zypt^n=xMtxEOoV1K9J@YbDM3GQ+scF^ph^_4Y(){oP7H(GGiFSR6kZZBKBz7GQn<T
z9%`nL@KR#6Yq;4h_vOiH5?dT~6jun})OTo&<dFWdn?I%QT<<61b&6F9v~G0~?ZoY3
z?wu?4&|JWYR^jb>OXsj~yRUjb#6>U1u94cnA4T4;l7bP&aV(pSmwo`<WQ>Si3M0+b
zGs;CD(fD|Ix#jezs!8Wvx`I}OfnFW-)epVKX{M^z@y2vts@?+b(31W%)WZQABPL??
zugp#F?$B1LFJF(~ab%*Jx_UwRw(z&}W#s?mD_rD=k}^9~?aH8A5QLc6-gbHHkouUA
zu_Pv=ot-B_(e(U2BM~o}y}?i8(Dv{=#4bx!WaiuTZ@smQ&AwF9Xv4ZQ&y*j;?_3h_
zJ<5z@4>AE<^k_kQbgPGpuzgO1daImRM>-HNKbo)kNAih=9xZfVBRk>NPLVYvl-#uB
zU5}~RDVQCl<+xRHkPG%w%G1;c<PVO*#BBiE?JSUjFgrV5#gC$b2Q&U6-pkc6V7;h!
zRX86_L}Y>yMi%PCbVq`&ciKiCN*}~2cV7*S-a{XG(Fq}tTY~FiE3%4MSZth4%E=p&
zRr{Ha&;Cnot-B58tcd+aDC>Eoxn3bR&Yz}(ewOh%!allDqr!^+f|Apaq}S=Yo3&HH
z>+y-HvBf`;VIKR<XeW1>fTjs@rM%mE-@j<@8=qrqMK4q^u-Nd69C%|cZq$e+MU~P*
za2))MNUz!Fo@J;My_GFdJ=%SR^y!WpMuzC4*anO9$wR4WGd|*NmZ1&mOa3}$;1&gH
z**Ul{rbK=HrkJQfF#icG(&?qg1=(<CTy0RwsC&-J4k-SKUE!50$eY;Wc0vtD=1Hf6
z)x*FKmd_uxy&SlVmaz~u>BuU~j)hL4a#!POmW7R#Lt0oaWe(3=44`VfD|UFxzz9-{
zV7lr$8)u9T>_qsC7XcQ0cN@Hc)MFG2tI)~|r5MA6g}Z0~3B+!}gME_V5aipP^AOY8
zK#((L8yb^dZ)Ps{5(P8p*d7WSzK%Bx4)>E}+t6p++<bLg1YwWR&iiIR!7p$u_~l<I
zem(o3SahRi`HK^wlFtvl&)6&V=L{I#{1a*C^6tJCmMS84?i9#%*1e2TetiDX<*?O4
zXLP@&7!IplA@A3(N*SvBzUj!eaN{`Tx}oamTbqKGe)&~YL$O1jR@}h1%uh~TuptED
zi96=B&5FS-A3BcY5*em#im@9nb*ocvQkHJy!pxTE_N2HH5i@?e=$S+DZ2X7to*Eb1
zX~rnBe2kPW>S!B2?-5<Kz8~FCrQbRiW`==%c1n@6lKP$xey>^{F_q5n@-+DdgVvnc
z4*w4a9kENqie;X-S9sQ<fEsOFV{yt@hbZVD#iDwqG7jn^WvtnQ=G!jgYmGl-JKT0a
zGL9{2adniT5_y@6-$e8=($!%4`APS4`r?e!#*L{)(Vq^bh87_8v}XVM(m@fWkPK4X
zl7^w~InsmnO@|%GL98g9<%T59ZHqr^(-jn%T#)V!vw5nCO{dOjBnk*XZHl36gFM$D
z>><s(8r*?2vy$Jy3iIMmGNnlEZS<lQuLufn7^f}GZn3_1u#kHK*>tM-zFV#O4yVL`
zHO?jKu4tpD4K<71R|CSuPVSW$waeo*Vp{mW7_|SF1>huoobur|wMcoUHg{Q7LQbW;
zS3i5hwh9@h=FyzCImx@dms;55CqQ=Q9c#*o1hWN%x_Ak`oGb4qzUSqQmG6kzQzPMb
zhI5Ro@r_*pR7#zwxEm!EclZ6sH?qjBdM7QZQkSIvind6(BO(6L>$j-XI4+%BadM3j
zx)5+y39vmwFm|5pW}Jvgo;wn{oRivVdZRsP2M7;tXaT)hqW7Uvfg?AIXa=Sh+Z}Mw
zWtNw$K6&PQkLunBbvJdKqJA`4$R)H(D5SHzVKYPhmHW)8Fe9ZZ*tS|PwQMUqagWFE
zkf1%K*bJ@_WG@puaT5GOh~^q<<aUwo)B8~Au}@@iXQrV`m<=`Ljqkbk<oo`@y@n{_
z<IC&LL%KD;3$nA<;hQzMyFCq@@C#T{c^*Lge#tD(sYqeJDd{JU&EW6UUu1VMX}O+N
zUvcxHa#(ahHIC|*OMDQycrL0<?Ax4rbNk;VE^Mw@yGL39lSWxM70S10k=R@DCs05`
zUMCsaMv0|&T0j&D_3aR-!THv~wN*)x|E=xOpAh1&5IFlj#6U^I-USm_ar8BwIVxL(
zJ*DH&)YUjRFws|d=4Vq*8hnx*Zeftj2hx^5<Smh0epuzPGoTE+kC(}7h|$+UNPL;>
z8~C@81u?LbG5a{^{Ihyr;fb!Bsyn%r(RI4<5B7Zf*Yht)Ve9>6izvcB2A$yldZ#R7
zk!b+8wu6Hp^=hZ?74lC3roBB!V<C`<BDrf=5LAMbwmN3X`?uW3>mJN>P#fb`+I+eo
z8v%`Ce>9y}`$KW5se1eMIJ6_5W*<M)VXX)FJC7!15{L20QFP1Vm{gw6j}H)WJ$}*r
zhb@5M=faZPnoIJ8kf}%IZK+a&N=SZ4MkM3(a;M-DA3|snJ%N}!1|rTd%{~mL*^t72
z2~W{Z3@U(ptT+u<jAhj?iCpIyd$~K+60pFvcO}F{SPubMi$q2>zl65*8=u<usVaeJ
z0>N)9XK2Jj5~^Bqkreoe%QS2yQ1U|GHn`quArkA*H3SmZBrLb^&Q3mhCI7Wf=FmOR
zsjqdIemnt8{`t`Su_8cwM44dX;txf}MqBm;OD9FSw?I@6pGXqmUvV<%EUdUoHa9e1
ziVI%@<n3XFvP((Zwf<c&bKYHO<Q<kET27XT&ylk2pP3d9oLC-fzeC<?S-cY7>8pTz
zJ6oFhZ8*>c8=-wKA!F7WijBlB#RCsej)8^ahST<w$zf8xw>+%uddVQtxSIBhp~V?*
zC$xJhui|?r)COXDxIkfK!19&{HbBTcbq!qS{#^LV|3t4)>JI>pAb7BITvxF7FnsW2
zUM}RQ<7Ff*!)bbr2PgdMsvCF*!MXCM#2}r`kF1W1(0T}NyWZvHXx_cvlpBkQ`oW2N
ze;#Q&34pJDhGAvNGsQ7R><j-dDFk`Ql7jPF3uebYE#VK9fSUJveBb}v;X_*}{42gl
z0mvT7azAmp@enPy=!N@;2A7s#4M%W%0E7uS<1Fj2Z8j9mul`*R@hqLcH0B&B9+5S+
zb*@r0Pkc}X9ll{V_-^R4JI{1e*ekZJ?cwavD-#j!uTSh%ldJCw&{Si@5CdFE2qD>a
zuXBcROHtAkyuz5juI##awhSx})6&z|cu<+3cjEAl{${L!ij==7PulL5BJU-5bVT-C
zqS8457C#?vtLNuctj_zAGPv{IKilW86Wk8OC!`TDOS^kqEg1EpB;P|c8m02qPj(~w
z+5g#*eAtanXPMpkHhoZbE&k25`?6kj;(SF$rz)Ew{|+~jNT_UBZgTWDzkb=CCR!o;
zxB`(4sr`JF{$^M#p%&o0k-)(9(EQ2m25d+3pvoQ~%rW1Bui^p2?F=TNoP$`nt%o4!
z4ElCdRh7v=kW8`zr~g_mjtlAr41dqAeVrtB1>mBb(p)x&$o?qnMw4LuHT)I&-q&jS
z`-WN(LjfX|eCGq3NdZcIlInW3P_bh$eRfKYv;<;)d3VIdW6xTF(dAzlZA2^frF|N(
zboZdDeqa!)(-&2ShI$~Zdhp$!pCqCLqxOtt83C0hHw^F6_8)<ID2Ur4r<euD#;oa|
znZulr&KyaJ;W9_)ik|gRMo=`K=Tu&UHFV|M%^xp)*7wZ{O=zhtk6uB7{*uP})5&LY
zX+_M+qQ)q<*Wiz6IFx^`?0BCP$_GsVS`NJemoTatM3r44ql>OMLzs?#Nhe8$WiRr*
z{HMLzks?iTI!nRV=P52Kf2X3?D@vU{HXVCS5I6y<;^v?pCBuro8W$36?}i(Zzo+#3
z!QA|$hzAUxMd9on?d8yEnpS<!gsCMW($`x9SeMe+a6az&^ERUMy<ynSQ^(6)b4Qo{
zma@0_esAy2`XQ~l;P;$1SFy&s-}nBtgjt=j<017oCA2KE&u<}Wzb;?X>Z){DYr_9Z
zC5R_%Bunh<8%l1cuT)De(YgbOztvgI$XI{)9wP8Kcby`C+==zi-|_-;Xm>%&O#WpM
zj|~d59;{TYI<en;H$Z;T-{q?$6fV$U>kSOVi{^2LANJ)(&yS&oazAu2=$aWASxkul
zB@8&P@VPCBS}djt`8e&=E^RDDC12c+BCAft?dnk!ul*&vr{$kF)F4ay%bahu#P6oh
z_ho?PEMSN&3Q0$^5v==ujSG2KQuOo1E{XE1m`8W%l-Z^qgFxf_%Bvp~cdcdbu?odh
zxa$$@^@>98wlbRuF6YwPfSa>Zjn~Ncm<m~icghv4fU@nYp+I<-jp%C-_}skxV8NcP
zm%LQVcT@64n33ii@KUXpq%&ggZl7vlRyoi6++XEz`_ek~1y4zrFr!<0!~{!3$R@*5
z<=ErpwAz{Ke7KmrLb`vGdW!d@KkfcF@}684jWGaT$}>aAVupxl4}LO!^{?aK?YwUt
zxs=Heb)omr1%xD+WGVTzk(i&()8tFky*#e(U<asnct};dPiG}chrC!qAd#pULgEE<
zF;Fc%1Z*PNbt2eI;b0B7yB2_u(a+BuUXEcN6;=lK`>3*?zRQGHlnGb{pNann);|ZA
zbVi$o1Ouv!Ub(K=@&L(N{_NX@0H5wa<kw5ao#prdAmtH{ZR#I5J{j$u-<Z9f?eOno
z`IrS^F}%OjxqpgPWfvHq9Z!^|52nNrZ+cXnGbDOikXkuX9UIc6ygwNf?qDGKsj_S0
zR1G0xDw--THq>Z^a@nOHOI-L?#DN6F5WFj>=nxZ~-|!{_va~$aEmQ$i;qYubhRR~a
z5&wVs3@rPFR4>dYRDDoiuXFadTOm%+{=GP<*gJ-Daqqj#2adQP|2}$l1Sq2lJq;S@
zjVpMx-x)TkQoqt4aRi0gLuI`1r&-BY$NyGdvwBGFFkv%W6A<#UrAv_JK^0Y))J0e*
zV<1S<-@?B=!;94$fib90vPO`N0F2GyumkwVJyg}Xf&-zizzGqiZuvw{<XMLu7-U11
zl!>ZG(<pE-{EyYe-ii<g5N(O<+cR|ye?l@smG-3CY1Dm?lysSOY*1`+5{&qB|3(ZO
zQlO=>eX;qTWUW`(_3b!n^(J!8;2y5r6%tFx2{L<09muQiLTBZkL=rfIE&klk^zoWQ
zQjV!+xQLN=(p~Uf+=%E$zaC5WG3FNJdb9O-#LkmvtURSy3&ZTz5Slrw?HCL8R~x#&
zq_HU9B2q-w((SG(hBF+(BSeOxN{lAjtIYx!s3uPJUM~WMZU_yIbn4-1V4*V-VDx6*
zeaY;C!u@p<xM{TuQg8G4*}cL7mK+LuV!i9j6nG3q2B5}C|FT&Xs?v1G<i8M^e5csr
zsdWd_LEOD#2sG-3o-7lh*hFd69Bl_)x_p{3++WhWYk!jS%tV?A;|gUrk38|0bw6fi
zrAWr?KtoDoC@W+{z1~zPd;r2GYt1FyTaZ~<6RZS*kXcY&yhh2?Qhr~JUlNtCh#^C%
zHE%<2B2-}bR)xdhPe~EN6vvvk+;H02iC5a;7ghepT8PGY{hz0$K4gQt4-`Z+#oLyd
z(xUVFBi)`~UjoBQ(%n7$M4>J1-d7?nYG8$UjT!8&lqzM52lPg?Rc()e6nM6DO^-M?
zFKt2nNPhF})$7b?hP345SuFe0$D<>e+^z%18>%zUZrzF`aYhynvoM_8hPKIYwxqi_
zUwbP})O=9!U-(d>Ax+{J$In2Cz_&xieA+I6)KTxhW(#@V^<ywo;SSgDcn%-c^Q&(g
z;V~{edL|zkTQ!n*)5nQP-4{(*?<dK_*S+KozcH4IF;#N94+uwJ`Y{dl+YV%#T9*p6
z`n*{3pA4x7W7!S4kC$5_jkw+u`8C;ozTWbmvRjq5TGw;=YK-aMM&SirMU|C~FVT+l
zXYvuHJcWk6pk!Wx?b+S!@13{S`PuDFUgxvX^=%CU`{w<ge1X|oCMy+v*KenaK&p|n
z{E3e987{oY`eD4VP#wI_I0RW3fP(<_d-$%AFpH@&|L4uKwT)@#0cUP&UJ<rLWCU3R
z^raK1M<<mim0-0lrV`l*QmD#TVXN(@y@P{uA=2hET&c7j4q@M$xMM$_6_{$8FFNQ1
z9`KV82{U6JU;3*E_fGFUhzjam2OF_qD)sk)Pur%SXPc&QhTD2lS1GM|qfWvJ{yR+U
zzwGwLEDk6}B}<8}pcbJfkg@(l7obwg`tmMQ>Hb<?*bHq-8{-k>sh_}4n5f(NZ(xGZ
z;gYnyc=2(wL_Ez9@tp>+$$Qm~svo%d6N-W~1YY<AC`4CKJiYw&0=<{eEtlx_uGTFK
z(IB@T6?eb2Nn?F_=9Nd(Tb9S~^L;l7Bd826mwyhpEgrL8n7LtRax40(GF3+Anz<u9
zqu;LGLur{bV7oPa^~Ga$6`d^x6ks~Z3We0LH)NEmn*Rub&yKv63=s*Xx1N<0%?(~1
zgQYT8W^QNL-oCS2FoZ7g=kPK$g2~G6bt(c#J}?E;ot<n;^gNuDl73AzhBzm`%-i{z
zgJ>xI+FsP1O_);Tc)R1&y56AAgH6ytNS*HOv3I8to@G2UG59*<x9^y|jIwulgBH4{
zKFa3(>k#>E++2Usa2M8>YP`%zkIZFfM{X^phhv8`ykVJuxtG42NPzUXp~dgDt^iY4
zA|Lu%`b)W6^dNU$M6Nr62gzV1^aWT+c4YD$3s?}tEJD!H=<owDq8DEOE7{Noanb7H
z4AX8R6|%TFeHD<=%X?47zSaQN{hFCMeRXd~$4z~u2O_14Fa3Yotzxvn=);`*@sAM?
zbO5h?pD$-W)vX>t61G4hdd4=?;7ykjT)Af5b!j(KDpRfq#?!H{%Jw$UCaJ5knC)O@
z*jLQjC-Fs)O?@lkJ{+sH7vV0jB#Vuh<ZtOVu9|O#*I}G`6n9;~h|cj&_6IY+k+Vfl
zrPT1MciQq^x`JX~p4!elfcn$--Rbq$bCBV)rV|pn#b2;wP;N(NWAovQ4ljg(IYPF~
zHK@i<r48|FwJ0R#{8;XU%!k3jcbRtSgTRIC4TVTk-Fl|LWdeL}=7$&NimV4S8|X`;
zHOax8xI+^fWTs<5Ck=2t#;Ni2t^@BdeXm20$)WwGAx6GG9PV2f2+`M;OK;MP+*H%n
zu5Xl&s8zRneei7oI<DnpqdEEo4j$HpRMehl7rIDxqFma!@^zmb7Vy{;>1hAx-_o9p
zRq+)fg8O-i1<T=B?F=wDe&mpAY_z|D$VKNaHfPIc9U;Pc;o8In-;*c1k%~@0g;$fE
z-Zyy*hUzaTV@CPkzFv@-)042SExrW?;}Cz$=+66}S6e}17%F=98iyBoe=yNO%|~=(
zP>DAx`>@oDn7ws-VHL*P)bk(pP%WsF)R*Vn<OF21KJQ~DjV1Q(xuMCQagjlgLy^yD
zhxgM%YgNtF*rvfUslzV@LO?=;e2;J`a?(}*DzbllkeRakKvJBDfX(@EmXJ28K~V3a
zug5fmeCh{iLvt^cdJ|R0EDuifRrP8l%E1`r&2q<^j87x09|}KMh+@U4kue8a?I7yp
z&mBnr<o<V+ExuvA^z5{&<+F@GvWtI>EGZd;g!zW(&%*!U$YtGk07q^MZ(yn0w``E@
zO3mg6A`Ry)?|T@dCGSl3vB%A6L><GtR^*M{Qa*Vo4l1R`kCNyT=5b`LnT#Qhmri&!
z?3|$Et%=5`G@7H^(?8yW_h${tJ{;CxBgn295;wzNoDS|FMIG`M?Cz#tAl#jnoJQ+s
z8NoTiw)Ihk&)4aehw&;rWUAUtUYDe#ow});x&*n9ZobzXImwm986*39!#LoOd0;l)
z9}l6<#)JD;1som)X5bv@>N@suoBkM$dAP{m-amM+Ra>qxMs6~i#}fhg!cLrj&kj<4
zu@Y+_P$mFk-*J(>J(Hp#k=5T<b9eiJab`Sj*I#M)29fKgg6_2a2Ip@K5v>BG8qek<
z$8qWJTJ@Q(ztePB-%IXxmMj(>L_)Z31Ap+DA5Y@jZk=eh`kG~anl!;mvCwG?Bz(|A
zaiIzNF^BdB=+R7intgkf<g<OQ-*AEb#>rdGkPklnRN3sBVxR<@xeT88d(cB5UbRZ-
z`BJ(gy5PZ#QKAYV4V9XK9opf`5|v@3!k<*mK_KTqaA#U@hhBvtwZVyc?`FgHnz@GW
zu{?d?GQQ^pB!_}M9T_gFC^=}^zGz@z`pohB(qjE`2x!A85IzsgK2@|PY>T{+z9XE`
zbqfR9X%FBMG}KuJ^~MtofAys;jr=a-pT(OO)at%|%o5aoQ3mnO>hLei)*%`&ON|up
zwm!0D-bXvo;~~cnIsf$5`+VExvcGF@h_a{IaT+Tyy6wY-2kwDG2K0cxk9B8}O+`@m
z$0)g5N%zh)C&wV5CK$dj`cj#R#`Er79L$X4s7{mxmZPgga~bEiN#zA<{}fn@U&-4s
z07_OcshS4`Le<gk6bote*kmVS*`D9N^36_57Em{|##EYbUB`#5%xIqE6(>pZA+vu{
zT<vTCWrIl+@Mf3h{@GKG?(k=bnJm7(Trzqy^FAY8ygpsHckD_hsGo0uh=+M&YO(f0
zDkyIn1|J@&ult7c{qQoc8J!=+w=(nLrZ;7U(TYUf9Ls2j<Fzx7cb~_D{!o=QQ4Wc)
zEw*Qv%a4vgkv*|c!!dIBFHA)a<~L{*_vcD1ec&n~Cm)dc34i+E=TFcvf~NbqiJdtD
zocOE6t<$NJ64;L5R>^Z+$-c^<Qg4o<^*i7YzT*+n8V+bg*}T6ue!PQbgZkB+@AM7V
zdcz$y?>CiV9lR&>&()YrM{WZ36WfiUF5qiU!toremPODdV)0@xk0$tTQ(7Cscz32r
zY2&lIj_VL|Dsp4Pg_;qpT=%ux)3#~q;+tZmcp2ty>P{(o9?V($tLQ6oJom^x|E)w{
zbt9dHaY=w8V~Ef8*yZdO>=x}z?xuw;CSecm!Z}OOk8FU!e^HRk6w`A5<Ygl1c^P^p
zA2z|2Z&+f3z~n8%Ibu?M;0;UQw^y%Tb->$>6|gwt*U-1J_(NbK?A;59{YarU?YvU&
zy=o?v&+wFx0iNJJs_!$<2sxl4z6%Wfj>&mWQzF=bVsgCl0EpsM(rlj0Dh`crKw))-
zSNsH8v)@H}jV!<R(VxmLu7cg_3pOqe9U<-E-!K?7ZfYg-;t|QKBvA~4gvkrFGuSQ!
z0i`)OqotE0K>bbz$=QO`F#Nit!_ZDR5t;HX1ID%fl@O=ctNzOr$cV=2jTdo`N*w&i
zTWLw{Ypg)LGr5{$>8=Kz>b}|0ls)w6J>Jw?ujQS12kyEX1#2N(3J7<Cs?H+bxzm}$
ztMIJ9St7Q|{w%yhI*HO4TX?*)gzlk$>#nJl$w~lvxdKgD7F0<Zzd|-gLqn9@(wl<I
zk0vhUvHm%y4MH)TUts^r^q91e(Tnh*?bm{lU_m&3&%yBG&^83B&*rOT5xDzA@|pdv
zzx)9xz(qvH)IMGrd2emjUC{Mt$}q6tX;>jVhV>YBjl|{UMsL+O?-Y9OIjKz)d>d~P
zV-Z!`&-kak%;^{QNw`JH-sN%PUf!V=-T;XL0nn%$&m3Kr_R`;o=g(7$x~LcRnw>y*
zRPGmY9WOv2`qZvYhUv87KM=sjKG+jN%M=c{<-$8u{=VgORQo%n|0?rX1YE)W9C$32
zFRSREiuPMX0*;Ryo#9JFR-j2>uixxn1aXdX5r0)IB9h<Upx&kCP3ecxKf@XRg>f7&
zQ8Dzr0+8Zazr+A{TJU7DnfIYVg3!xJx|@d6vIX+afR5+}Ip?R=jn#WZoG~QGJ$&C4
zvSMxlv7?=&t1&{i_PIlS4(lIx;o2v+y{FqStk&Ot$NM_!`gWazfeap{Tv119YmMCY
ztvt)dwdpu*PvIq`KlHYzm2j`v&fFlT68Wx)j`K{1)=I00MNaopu__GGkUi$RrzPUA
z@=n|f7ZDirKE!hrlbzMYrCXP2u__!f2>KQ_1VS7mF4UdpQ8=`zah5`)i4w1<rd`HA
zhW$IpWl=%KxYMxOGek69e|EJY@#JwhsD3?3aQ6UKm<x$niR;ly$$K45i1*pz+K0EE
zG0+<plvCZ`zn*4{;N8HPu730qUV`Xoi!H~3h4fo;@o(WPWoJV><K7#SBOO(*`Y)m#
z$OMrb#c5DHESe4<EmheynY`AN+#MJuUd%v{*>rEo><CKW-@PJJr*+_29=cg({Mhoh
z_7On<;dWST4&Gh{jR)#r@Ca;UR=GmNtzJ$yeb!Sn6zKn0wv<}RBH1}{MYqGtqOEFS
z0<=|6B(sxrvDfkc=p2B++VAt{cg{T*|HR$hZt`sXwO8}UW^;|$);Dy7oOJ2~t;5%A
zw!jT30KgQuyWmW+=#AVZ?BTKBzQXJ1u$ptHk2tNRewe=FcTe|&g3PJZg(f;T{l=7a
z@Z-^5|5UjjTaD3W=PxysQxYz}Bp$^Q(I~RTIP1RWVd~eH1gTQBu_=B{kXHK~Ir%~i
zwyN-VBcWEZ39^%Uy+$PBBLynbEJJ0Nee$cz_2o2?eGQlw*MDQE9c;=d4R@awUgfSp
zPZx*rEnjm2{{_|+&q?Z)XN4Bbdoy#$1h9)6*(q&WxgL6xKZ{b|i*3>m%)UU^Bat#j
zak}vs5O<oEG`FxoHsy3C%$Xk_f*IWRqq%gcV-Ik;-TX|JoWcguf-8F#gy;2_woL0a
zMm7wu>p;AAkJ*Jfk~7)1D*gpwC<)s(p<J!4)BRo8bxL*Mf{NLN-=W(zK8V$mzkGLK
ziI;~{%LSFBn4DfmTyw!45v%qDBz-7k29ZEp73I6Zl#;Vj*yHCS9gDL<z)g51@=FZ(
ze-4IP<ZVBt1eEazP4zm{xG-y#0PeRLA_trNQEj}*Nmr54X4mqOcO1vdz{PP+KS&ma
z?dKX#J$%d?lNvM<=dYB)E(sgPRy-|;7EEI-G1x}LD)$nw**gpc=qzNOhrF93V2Sri
z=^1sr5IZ~u|CtfNC&+gLcVvVpLf;Ig6##3L5-(=Cw-~4MG@;DnwF5>poS&9==<yta
zae^4^)31Toz}t+ib_bAHn`4VJ>4q$s*%5_?SdN}+%)T_eh(7&b*NhXKsgG@BBlB1R
z-aI&dF0A4^bVM=W0y9_7Z4?s4?wVbQE?B)M+zjVawQjhX!jv0s#cB~wWzsy=x}Sj0
zTs<3cFoIoty3-x*v@vZ=ZV<6A+Nt-FY?KNd5?8cSVsya3zL&gfEsYpK`~I(=S*^_T
zedV2v@Rk;*shj8gdXd<&KYjsi*n2T|oiudLbPQbfa{h{JhxP^>V!di``Ae-?*ywKB
zAep^WKTiwBxpYmw%aB+eLUOGQ*->ZWqBc{Yq|m#}5vuhg$xQEGhEI+Rq-KQ%$e$K}
zp$L9a^<&VkP>}7ArF>5Ws|T1-==J2e;^#H~CYg6-+>|jTo4g9q*x~&PDA@vMatu70
zwc)iUy@QQV<!MAcqs<_Hjg_lhv`V@&kGx&uQ`%C;DJ{I>G!!;3)9)ZB8jX?<SgbD=
z9bVgY5CHAfkpmepdF}l0h#Y-W>pJk3$1xEc@m0PiWa3%Dcdn4GN5UKUYDiY?$n>D^
zPiG+1wLX!c7H3#noN6St66N19dzp>`@|JjYZ`JnA0$Z3Bw(q0R^Iq@eAqVWvLgUO`
zpge&kfeO<&Dc?L@mJJU?Lk$@iRBXJNDGq>zS`!ci!l>e08bY4~*&o%g`@>76GVBh*
zho~@2&$5)&h1A$2y@V49#2)VdVyC;lk|++U)l~*|4ZF3TP$Io5l4I}$QJ?$h$i#-4
zif@aNGh`QL{>zPvA<UYXyen6jRl2Q|zS1RDBx(AX^F{*zlRRsIgXn1A7Q*%LqJ1B9
z;<(HzdHtlvs8+=Dh-Y|_=floxh_C@Z<iA)af4nHVB_J1XDWHrwD9FHv4sY_c;seO5
zT<|{4yZ<J%S0!^?YlAQI{E_XY#pA28%f-#5q1zMK0NmRk7%_kXb|KFy?;zV9S6xMu
zmFiyXQW8duHH=RV7^9w*J^sFv)ZnWx>-mS9r}h8U8r&5P151@`<EI8s3{B;mWAC`x
zi_j(e7xYKG^kh1%GGCmaDmsG(q;DZ}TqbKL%@Rh_Fe}C1*I%LQ%W33bqIA@Kp{|I|
zqlwb9bHlEj0a}cD@jKnI4UPT_HqUQ0iH54d&Rs0|C+>xOO$K(g$0EFYD_Xad2g1_X
zOU-0BS@k{O@WGGyS5l6l9&djZ(g@OxO6VP40DMCHPC&Itwj%Y8`Uo*8Yw<KUC~{N|
zx|@POzi$(0Tout8*z|@%f>?a~FS$v@v?fqDmgvZPn}FW}jO+2jnoB~V&hgA05QyN7
zX9HL_arN>eb70NB_1&H_lO0Q;B1)dWf}{r_VI*;(Q;G8nv=&I$z3p@9=fircb#rx<
z5L4?l{m8b8c>KARrG|`#*;L-i`;%cwK9H0cr58_!@h__C`r}X5lse3SzL?m}9}s?=
z$~AP5qQ%y|OB&r?F!5q!79ft*mqi5A$Hb*Mln#N8Gi<q>*Y8;lVtQzL-xIxwF%d%V
z4jJzBBah&VTJ7BBX6w>0-H)c2`-f<3UFv<YaTtg%_|iM}oLBkUTbdVsS@gUjf1=-?
zqCZvhkV(9+ppf^7Dr#$iv9zU42-w%LhvyEAzT<$24owzL?qV>d`a|yDoa@!mq$|f*
zFR?=_-XFyZ?jNlPz|JAvyentn9aoftfbknR@pggc^5qr;(o6!z^2Z@*3<MWSUt0NO
zpX%khP55qCNH=dr3>Z@-hpmA_tDQyLcU_V|4Zk8|IvNFfgP4R1)RC%oUcU=D0`OQ>
z^SJ)=17dZ_fR|R|<FA>aPhrIQfn=s-Ic&?M$FcV;WO7U-ro6V6lQ&F1g>$136y7Oy
z3)%~a+SpkhJW2J`T8R<#BPZ!hZY`jtG+uH%3{AE#%x6#H6hwO*cCBYbroMixh(71)
ze;9HjZ@1^~t;{ntxFHJ4Fw}QeV?;;Gb5P^w0Epm@AUt9VCk|4L-r}V%$J}bQ8)<k<
z;jQs^Xg}t6NiKwTIX~~|e0%fS3o3-DExn*v6oT$o!(+K2>=f~#l_WCuy~T6p_W?39
z1EMpD`rIGI%qqO(=7Pz9D$cb8Ffh(jk~8rR$Gq0wH^kM9AQJMALEbs@W{{W{7%JUO
zKhaj1TX{9c`E757H*LwJpz-b44ZGe$uef}!?R*nNF1pudvgd@51~5<fbUAr&gyowH
z5V^KQJDbSBZfcS7WmK%m%VP@X|A0utS2jheoEFk^0c9nkX_af*1QYqsbshL!8Epvl
z4xa*>mk*EXNCiH9yxdwxqc1GS&@3S339g1>fIUHt=xbka^h{i(bgNTQ<CW64eprTO
zUaXOidc}ZK%)a0xiW?^iZCL7D|G}E;I=rZmG1&BkPfWNXXJ}5RKe=m49EgWOy}cyP
zMruOI`wlyIHJp6`oSPl(W0*I}NK2eFbK-S3%>`Bf-P1#J#N1eo-zFAJFVhx@Th_~|
z=l8e*zS40V1WP(T<k{373E|MJg(v;-L;4&bq1#D6j_cES7Psq|+mjERd&jWMR%zR3
z8W&cV4b1iSBiH4{P!Au!j`hXgj&>f?Mq6sZS&w&R%C{+)>YQ0`$eS^G3GRc1{7;}x
z%{KG`I(mK127S9=DU&+Rh-78Yf`@h>QP14~_eh2ys)T)Y)hYNCV6xN~^7r2f11WVg
zr2=RuRnPe(kFlel*4*8cx&u`1gO3}(zEfER3DyfsN6K=q1-QrgNMtP5C1{q~EuAEZ
zO`=jtH}VnI=pMC>*@UQ+dJtu@fW1pd`S09ch=(7in|vw<Ve;@s7x$v_AT+Os6x0J&
zp0DE_KDFPr<IT-lpb82fqDln4gM?6ph`>u`5-tlXMG1>|PVOkca|W>_GQVEUhx;5j
zDsm`JMgM5pvmFi9UF+f(J|ti+f&J9%shbf8M4^tWo#T~HjGD!SLPaMOh<<q~!^Ys4
z)r)R_xscT^ta>`70TfHDQ&n?cWafTr?ngw1{KC<OxJ8zD8GiDu>MKZElGK{zDmgMn
zIylMS_m~)_udDjfSiN#jytl$A<K-VB0vt25$vi3Azs1cOH>peeW)M@_`;7MBA6)YW
zsTj`#bl=Y|bz@Mdw8Xm&m=O5t2W``X3+X11Lc9Kexc%OE#b^VpPd!)3_Z{LYk~`HC
zO2?+nI|CKhp+xCTM!w#e{y+m?*0HYxax%7ZB^*yPjp?+F#m}#}Ap+%3qK=c>2`2m2
zcOg2Yu9P}R63kG;ploZ!%am)s7gMddlJ7yLbSAlI{@V3Bhs+6Tkn|ktBhfJ*nS)Nz
z#F&tbgX{>%p5H&8b)ua^q9|q@I-WI=BFER+rallg-K-psbI=UqKr}7G;e%@<q~IPq
zoq3s`B4b2+TOGM#l`Hs2^RtwGtjcQAPT{UVl3o)Gi(X5a5)UxBkeFkSz}x>d!}Pe+
z;WIP18EoY8B3-6GLSz%Z1BvXc1d&Go-s>>9MnFMmro;#w7r$5?)uBuF9b`=l$gbT(
z_A-3$hOm>K1l-Aoh#Djuk<Wi6uQ(%_%ZN?ErsV1cTIP*=_?qdN*LFZ<!kQyfNmW;n
zL+|^16n>~Vck;G?_OK0xil{4ne8Rx$^S*{M4@QnN*_fV62b&#!ud2en+UKIL^DaO=
zecF#XBKqzT!S_eHK6bv?-MHWKRWogsZ<e>{Fx>t37JD@Qc}Nx~#9H?q-9-z>MzGV9
zrU)UU*Hm?P6ut!_C|w4+n+`N5LL61+$0HYVzU;&9jM6uFpJ=PLs+M^DL_(>3o8-X7
z0W~`1`W&nCe-k04oteR_5^HQi<kZ1T_K!-?5<?c649&XCv>iP8UkbyWCz^161h(-U
z8TLDXwUv$PY{mtqQ#&Uo?*FX#ugO|?E%)jo{D|mX>~H4pO`W2Io<($6p+RcMgS(~*
zVW$xtm4pvzVEyPIAvx*zQPhNA4CNK2#N)o0qY<QXc)^b~!sz9poi5$w3Y{Ctl529?
z*<{WA&nmbj0^4NM+kuOJFkHNGfDJSQQ6Q&2J|E<PAS;53T<$jhtnfTgArnC>JE>C^
z6aX!;q|1`mQEZ!S=n8$PhMU%HM^88GYKOQs7fNN~_bN^!#ACkS`Cvi7Y2VnQYz6EK
z-};LP>m)tqVb4q{KvGM<cFizZ*m#6-ir;~XL!FiOqfU`8<h`&OT5mR9*Wt3@4CZjh
zNujQ-4JQ+Ic8z|_LaO}%mBwZ^-R^2=0+8gnNxaqekqz&U!vfzqO!Ep>s4M~erE2cg
zLk%sXK3Z)Z^{$;_qPUZ_y_k6s5V)8)lDV57mrHyfdUCa$0N7hKZ8!&Uz!A#c;yvFg
z8ed(~Jff;<bDm_P>9Wc9&{!mGOGOKNS^n_zUWY+PGuA^%^z^Y;ZY<mredn@bsXJ8R
zQNLlf?0eCECE?ht>590^Fa)jg8(WYI$W=;c?R<}15n<Om_ZN7eUE^|<+`YM8@|RM$
z<z9CWWwo1Szk^=ve2efro<r_w<Goc5G<FO7XVl0;NxqmI;YEGj&d)**k|Dk)!CXz)
ztVC$5ENuBVeCLEp3N{FG0S?>~#tU`oVAIrooB6dJA#Cih8Wq+e#cTF2>+aL(d1!P@
zv!1AunMpchSFb%}8Z{sd!MFG17Y8bPg0>NdF9@VZ1|g4RUHdu&gnaz<k8z)Pvy`?f
z&-Uk!GorAZQPWf88yKg{1GaBL%8YefO8$@}*MV!cmI9kPiMFga-Ly&wR}0B}_$bU~
zsgUqKIK1KfY#OmnUkB_voAAuxbo!~_@Z;+y+r1!e#+EAa3egtjm9pa%qR~cUQ2$bu
z<&W2+?idyqCVC1kw)SDHum^Z^Qa+GNguF1fo`@YK7k?o6z=}ew;48QOz-8LFojo(-
znrLUF+$b}*`e?yj31j3#t@qHuxtM@Jx@Vi#&B>*YAP8-Z<N|~1zT%9vKsJbEAD{ex
z<vXfO$rudD?~?dE*$1krGu9|EFHoy?>hu9u@Pv7y50zp~|3LPK=E&ETY#?74*>kE*
zHVeQ_SVT>gpt=RgFLb?Le+S5WZFoiddfc7U%A^0y+091D&hQpOYttMq9JQ4}yI2CJ
ztGnrI_}B;}TxhUD^KIsmag|*VV6|iJCDIM%7a6JBA~rdCv46D+LP<qau1%cOOUN=N
zQ4?QAXtz8xJ`R<CB*AC=Q3zJa?b+fdy&1ugBXO)wRUwbawtlsxARdbdB;bGj_q+QZ
zdF0D1t-wr+XtI3LbOD$6`=u?85pIFIPNUc2`EOH7Re^B3%Zz#UhfVeiXat@d>g);c
zXtElQD;wE}oA1Nn|NHquJ!$Ze_M~Fj+Ycl7t-p-6zlhF8K^`r-mdOw?3n$lgv=xT1
zMIbEu!<-sMpM~3GUz&3%>ns8B6M21T|0Q&iaR+PYc_7035D90sA0R^P#q-lE^v}y|
zq|}edxtm&i7Zc7Z@YlJ#Tj5bT6ISyctrbarHz2t%#ZVS#AqoGWi_4^gyXmkBIGk{)
z8*dSy?)vA;mY=B7yF+A&iVP$sDP(1NG@dM*h7j$AV|kPF&DA8|tHD=63QCVLC9E8z
zio>ixeKaXI-y>@N`d2;E+|Fv;8!RRW#vNK!=SpMf$(*Jo@ON|8w9-Cz)1t_toy8fC
zi&ih5zxY%pqA*k2r(LcWlyfq?Dll_ztcW+iKN(K5?qI_pRpG+Dz|J9<b{ZpK8W|9B
zYR#gHiGcN*$ju+D0S@y2BVJ9w!uFh*oumZ|-mA`|S|Kk*Z>M6}<S{4Wi43Yyssqhy
zc?H!9Y;*gIMGX!5A{S=kY==O@?bU<EQi_xB)4>1RnJNTlD#)V;)ngd%bjm_ZBe<sM
zRd!kJjwR(FyJIhXL7+RVJY%5;5W~fuZjBJ<3Z?*tlIMZ`g){iC;XNQffS7fcvICdT
zT7ZqBVnD<~cT%pM<rXvZ$BC1N4-TL?Vi6pBpOBQtsAoKT0<O?b{>FTo^96VGBhW9x
zd$m6JZ3A9O2T^F)=3+H>Q@QN*!QmLd=Xd_k!|;FHZ+QN^x%mpngqQ#0EPNfWUYRxR
zk%+5y!jV)?+}=~4gz}FXT>BZ@p{p!|%ja`=bK&*uFvlNvkex74m+*)(dDCaZfm4*a
z4<SOrTG<x*H1XW&dKnjv_^exW?7<WaJO;r=(;yfS)2xE*Lw9OB53}4iL)T*`oh1G#
zsEc!9R6=!X8R6{LdsluEUUaE*{i7n+?Jy>P3`l9Ni~>oO*F5}hxUlIpz)VNQG`OwT
zlt)3)R)*bPZz)s>Di@zHj0t&jQ)5nmJY00(G5LDoLpwL>Pu?Iz!i#Ze8E=WhrKC<G
z&y@0+xkr52>i*cZVr}lh5A)i657)un;1J1fa24@uihpP;WVJNi<mMa}hiHm1pW}tX
zhgNW8e~^(hP=s7`N|l^?JsXb#f=~;FZ|#sRcut$LFJ4EtH4}>k34~i!r{x7Wi)RKI
z44)tb*<iQDh}gyBU$Wt&3Wu^{cK5T_Y2(Z7JALj5i_aIhF7(Ct(TcCUilBrC;1g#q
z)S}(NX|oH-YsYE+Y&=f`X(+dEL$6VIR$R4=1Q-z*3ntUo{uDN@4fpdLhKJM&;nlMK
z$RF(CmHV0~_<O1%hpB~(7>U*t=#XPWs%2BOfs!45(lmyDLf?jiTwk_fekT&7T<Fd9
z|B^DCZQ-^LG0G6u2$3K#Czn3M&|b6uI(+d-k6@Weg#@l&AR^ER^Qh~{05QzCCe92O
z8Co9r5TcevlA~c)dD)?kSWOZmN~Fv-Ki=K3ICT;G&hZ;gywtwimV*h51G6kmoqn=H
z)DQ_e&x-<wOR<QNuEtyREX!Fl`vq3Lbbj070&oh%gRjErl$Zn8eiNuiYVT3hNs*n`
zxAObUAn2)>ESYLSwnjPM@GdK_;bwtl3?+XpZ){|30!dEW-dPAT8qj-*HCg+jhYKD#
zu9<+qa524y;3dx>(VRZHm1oiB9WjW)B=Q<hIM)98&b>p2eEraPO{b6JUU(U~;mEM^
zD`b!&|GH`FncOm@0M7euGGL`z;@pmxjof!PT}=>bUMwqYFpfxymk>Ger{(W6AhM|a
z{GJ+KYZf-X0J<?2+RWnXbmz9;{>huQ1fEv;fcDId2k+A_vYEv;s$u&s*C~&*EWI@?
zT$~|soM#T{=}PUTA%D23iC+$z{(7$06_omZk(N>8D{J>)0sgq|_m>@7vQ%qH4j*y|
z;5S-PJMR{M7HmcIxv`$mt9fYxAD04SRX;5E@EiB0R79^#U#JE3%lp8~<5bvpz~vyo
zHZC4Cs!VNycIlr5GaK1pI3*nbC-v<D@ti;@27hl5{O^Z)f8SKglKa4An7pZf{T_je
zj!#@iKkfJaux9%SzUAzjAFn&PMvh1|-w(GJxw`f#l@qP2o2wE0G=UII(*ZZ5YF+29
zIT!>pWNfVN@56&J=(2!=7vPR!I?ISC^Y|1r>2{^joMrbEF0B6=CD%hsJTnvv0*1M^
z6+oT)@fQKt5B~9wHnt<+`gX9O!~}<%wzFE%DH;-wTumVUrB4gg!SbFFcU$NzB<h0Q
zLsw%lh!rIq&oT<$oOB{qWsk$q8E83Eg(i)gQRe&%TeiJ&okcB`1F#MB{->*Xnj6iQ
zx`VrFC6pC^sRR1Fp043G=(Pcp`Z(&rZ$gxWVxpBO8OU7ejk!lK;_xE{9%#FvBszu4
zGk4Qp>~~mcEYB<KW8Td#-B}8Y4lZ4B;@ds+pN1lv_G%G9r@eY}Du`BWr(>?kk!{Ee
zv6@-<t%?$v9B@;M*DWb<CZirP>_LyLj>SDFOeC8Fr|4$ncnxUWe0V9$Nc7`Il9jj?
zb_m{$XM;uUrAh$J^W0;9C2*MOR_)OCxUh5JzLAna-FF|aKz8DjP0QGJz9BQ1E{kOf
zRHeXNE_fd@A9zZ{gv=b9Va|bOCm4;dhsM1Yf1dB315Tml1bL$3c~0@b9w~ne39fte
zm|tIY5S)Vc?r=JkUUVT8Go}*oXcG@)W!91GBYpxI&WCmAT__FkeMRUFd@8hbyC}y}
z?~93INYMJ=1Igu$z||zItz!y9)cb%vCs+^2%rA3|aK*y?xQIF)kSF>7m+7o({?)R<
z3qm(0MUucAw6CLySZO2qVL{kKx~<L9_X6FWJ7x}P_1r#mT=}oRNz%*^4Fij0y7k4y
zkg<OdA1H7^+<45s3UFq^<|x|*fQfm!JQ*tR#RFwco?hRWJPAO;bd>IH{Z0hbn)6`-
zR!%*vH&}#HuSXs{P3W-eC*~)zi->YObvtut$yt&q1rp~w*nM@@4JO}>vLJV9PS%%_
z<KOX^0144PV}B-{5P{WZ7a#oJh<q*UYP*o@)F;EbrmhlV7C!HHphuD>ed$F%(U(sn
zUmR+D@S@Mu9#5Ix=I0(SHc~Eh4qPO9f1hCV_I=2`f#25sm$+3>J^4P~&zbl-6X)9S
z_`)A?BB!996#v&xm%8EYA81TdP?MSJ>Tq5H8JDyFNDVwOCW<8FfDl|6`%b*NSku^l
zJQ%IbP1HcT>XTVyE6C`k;&sdOTb<JfA<FQ_D~V%5b6Yd!WbkyhNA?UWQ?0lTAeT&f
z=bYw}ADOFPE<UiyILp`$-hKKUyQZn?s(SuJ7-*sOh&DQ>ssq9O#L`pywUF%wguFaj
z`7z31_U;{6(X9?mu<Xd2rEF!##9O=aRtk?%wPOOe7`6AT4LAi$UytwLC+L%6;Rn|c
z_pRYXTmwrGTpnaCx~?Zw_`8-QE17&^mp)zKm^M!Lb!W=_6&`FFo>%}1Lf5!(eDITH
zm(FNlgy_oRu8K~->39DWhKRYt6zgJ&J<Z5+>?@Z}P}Q_ZqnWcqr6N|QxbV{>(muVv
zh!ck36+MhtB#JzQ{A3RqLc<NkZ;Gtdi^sJOW>qz5TK+$_vDXI?N84(LZ;?pjpB+~!
z6)3%xF5PyQQ&<SWX}>|5@fG^dy2Daxs4h=#i^WBe`R}s-6iUI=-0U0agNZ4+mo;%Z
zb21Y^3yDNTn4giDoLAm@`D+q%z)ue`UT^mKM*JnDCqDUr<)-2cL~|W}U9-&4Wh&Mf
ze)eC~0BkF`@_0}+3GL>V-h`#$X8koW-YuZxf0_NLegS354kUgxKYqOF*KKl&yel;l
zE;>3=5gdLpSlQMQvEoo%bysdgj1;2yySSZGE%7Xkbu&g7soenX;qEbqi{tmIJJd<2
zWUKsRmo>gyZ%evaH`Y+9Qm2+Y7dMJG<mQ0|D4+7i4Ql%bw^%o*6Wj39A5Eud_}ER=
zw{pw66iE5_9Br&#l@8}gt{v?ranLf>?mj1YOR8W5vgv(-36F7Adna@K1{iAw)D+}4
zjA)m^)ZRZ!5~p}3c&<<*_9Xsy528YH#Tm$zPk3bBxccs<dVrf7*C#$QC*Z{-aW1H3
zK@CLoU6ZW|lMgx4p(0^K>(TGv(CWs`hwqk=bal0T7A?9&?^P`VH<EzT%>_gc&)hnX
z90>Mh$nE$q3*f2<t#jzUD{Ej@Mp8C*u%)NEi*c2wNh-%vJxNGKAbd|fk#oodck)Op
z?5F4=<5eJ7u9z8j1FQikYDajZ3AE)|${0xl93HayeV{tg7R?WsFt<%Tk2&>UNI`NO
zW>%>0{m0aPj9^}P^hix|`!D6p5DhZRNkGq7P4v4u3N853x`=M&v?2T_k+r!troC(8
z6XdjaRSF?+6N*P)sa7m|$Vl|f?!So64il_S$ZPC>SPyyJLo|^f!{pYE{{UJ91x}89
zk2b<5M$JsN6}sA1kPeCuQ#kfLAzsI?rN<cgD$t}AE$sK~JfBD~))M--P0u~nYL%_P
zTA|ZKc8#&E%u<F?_R|O{${UhR{L%FN15U*505Uz~K2%BYj1@Cnb>k$-go2&!RF>KH
z<Fd!zi{>1XW!k}+TN3F)Ej33)QVLU6#s^Hn=Vt#G^g8el^xBnk;+HWuc`zoem+6iB
zx8lKduRSC}%d}$e?F_`nY2$wDQri+jd9&4+xPrCF#s*m;bz_69i0z{opP*5Ul3x&d
z@$x&X@2Qbk`;$;{ty{NMvgrc8UJPuZpdE42eH6o0Lxle%w%i_li>%^{k)Cz@Z<CH^
zNaAmRzb(s2fN`Ik0U3WUwtwwMf|+wZpzs;lbo;?m!yleGP%St@`gu9~qa!^ZZT(mb
zR)i@`aC1uyZlbtq_OUO}1Cm(h_`aLhPm~Si!4eQDgUmPas2ewrT_zq(=&bi(IEhh4
zwj$@7;=moG5I$)-<xl-6g#>6`{?*vB##E1IkyL?TDj+<2e069HFm!8+v`~?xlqX{5
zf}nkDT4}4Ol>ny@`37@9?;#pWN3puSOR+m0_>eb|BRkMfRJS`8VN4&<P0-S)B{;pI
z13rhy)<zKY3;`wzVn}1Vxp^1#le}!6Gmx<SpR#HyXZU}NQxGh{wwO%9^t>s$_e7Io
z1RU5DoV=ip!FRxf%-g9d*Oml~q2;KRQYbmI$bQ%WH+@9<5HaaOsl4ZMoq&S@Y&;vw
zLPS@hYe0v(IQfTmd2$-~6JeJPu~mlIf4hX;+zk3IX&{F~1#rp9QuaGilKj2B$svKL
z`m)a@(DU-#JIm@WDMEZLyb>PmerHz_Ws0tVk^D-IAPxUiuxSs2f_gp>*mMb9rN5pV
z_b(>AOiV0xFIZ@+ScR420~J8Hy!@Y`MsDFscXh04FbOy(#AcuZ>BNCMO7?7^f*nl$
zI!jy7ocX|}lGpKqa~CLz%OAmR|7yAUP(Z+k9pM@<#r(3<N?h=ANj6Z5o|l4op<1LV
z9l07roN(jTc>!7Ck%J@Xn18iT#y=ps^c8q)sSW=8Jq6+rPC3s*`0+LqUZ`ErqyA+Q
zLOFGV28F$GmIJ&Ce72x5il0CAk1p^xq&mKs59)=<Vc=}K*1LhP0xk{0{31{ZDc(@g
zCHIk^=$qBWe(L|GUE|5bKTV=mz487;Un|v&iR?+k{jQrX436iMi=mHXqi-e06j36J
zN^MJj0iq#M+O?}loGKhXCFs_?5XSMUyadazF!a|EvJPc4-|7E13tjmd?-X6Atn2Gd
zD^^e`@u_7T__BKc56j~+MnX!6Ao41Po-oZx#1Bb#v)TWr@oB|;%^f;PYw3PQ)wxq_
z%G7<;mi_dfY<=x~{bKy4rk*QqTONc7DtjA;rqKA(?t87}!F-dJc2xl$5-8DD@(r{V
z=3jv)<@M}4t|68jB}BfEi=J9$K*+(SVMgh3f;LV(er4YL)u2)X3peycU=OIy{aJ_<
z9*DASfK7}XSsbnPl-Qv2g-9+#EVvp*jL))Uwda<!jn{vw&pjHdskWbA65)_a;mg>f
zI!NLDkHhnNlN~>wi3b$ZuBXrM4x&cv9O-o~2?86A_lm4HJ;Q9%JRh@YmeS`4fePsl
zsCec2dv|fh+jN%$0NWUm_Fc7d{S~(v%QvDaKO(gWof{N^MsKViRrS0It6Bda_TD?J
z>1^8@pHURiQD(q~3g}1?r6>qUjSMQ%6;!0Fh)4+tNRtu;M~0E2lmw*2LJ?_FLyrYP
zYLreA$^aokC{jX6NO;#5&zyVSQ|{sSd!P5Y&wbwej}OTAyT5y{wfEYqewK?wO}Z?&
zc%TBDkq4b#8sf}sM#}-H6{-$EmH1ocLHR`ZUsQ7lH&G!wdX*-t=t7W6R<D%ReogX+
z?<Rq%{ay$x(zb~g7ZLdOgDt(^3aCC5LX(5iGU&CwgY<OWkDw7o?BWrH03;NK=08fb
z5)Dq+<n(S8#>q+XKw-nPLqBcYk@QG7qB$I-NcO%bhoQs~=qX+7S5cwp2oT*>(4Nx%
zXWBWWD*9@784@Zu3-BLS1hbW2H#<WZjrm)w8a@z^Uf~A(rLBg!Lw322hot@{`MiPd
zdO)$0-2uSjM)6%X`iDZvkWY#I-2uHJ|9Cgz8vm-3<hX-#K~JVF-G1W6nM0wSNSs=E
z91=$`Y)U^@;yc-^Nal5@H+1f{8nxKRFx-99*)FqLcim~d%g{CkCcPSCYBzK36anxS
zixts&w{Mv8F-WFY=<g1%AYheovO}r`{9duw?^#^<=wf~!)Rs*bUjBmZ(E3Fz@LAu6
zkSAu~>C}d}xET~$-1NCgZ%MZE33VkG)r)6x_<za7mqH@for6nTwMX}hwGaM4M|kso
z`A@_4>!sDMJV@GJP4m^dZGb@w1+!l@!;FA#(MWcbYLg@d?)+7JTInNOc1pj7Qp`HQ
z3ENct4w!R5Trc((xDpUc6rP00F=B;j>@GJHF)089ywz4Y8Ui~4t&!iLUkmq2qDx~U
z?prs;ges#?Nb+(MF1r>%)5AV%tYN}UmHndpd8`*~KB4o9`z}l&$THd;-yxUCQ4}OU
zCd)z2W}&GS8(c6{DFg$+=*g+CO>Q@=Aoz#?hh$|VrhOClbOFS}q!|I2RAK_`(Yn>?
z{o`FU&t8YrT&(&<wsH+WL-NUAdcO$l%WB*LI+bVr<AXX7#RTMai1D-T8kN|u_LpA8
z^QlFmjj;T@7l3#VjS=-9xYBct^ImY+t1S$Dc<Bv)DYg5o61z$JJI(9hONaYI5?Hqd
z{8~_(E$~wQtt&AZatn5PZ#yq{iKd0xl67_R9MmSj*4qY&0Du+&X*UxO0N)%6d?0Ba
zg3t&rA4=SZcFU`7iaYgAK>9zF2$>Y^)~l=e6W<R9TZ30q9<o+aF?hF|v~z6f@w&wY
z2zZ9s>M#pm13>JwK?I%`aQq;fc6#a^ksnIz%El`_f4>6>jh<lthR;HD!=JjKr>OD#
zJt*)hUnfIFgKh;bru;sYbzm<_6UkaGF?hNt;s8J~+hZ?>9!NoOvabrt=`~+7of7Q;
z(=3QvfcV@m{b|DgT}=7ldQ!lnTVUi$`DE-oeBmqtpcaq|nRKFBPy)D*x7upoiCb^B
zc}_sR)WrY<^ZuqIz-}NMIn4iexo3dSngpE!WFVm47Tm?BYFPw;*cKK~X8E^gKf5km
zAc|T3+*?On0lccO#ytfW5G&LD=BwX~N}xm2DL^hy8aROz{=rh?!06gg2VD+vxzfL*
z`~Zos5tjZXQFrQ4Ves&J1KcKkNV>Q;BXekIo&FB7>qsnaHFK0V!(?tA;w8F(hGT7s
z+M=_*T?=eSdw{La)dAKXn0z$Q^9}mm6%d?|WHPY6R}EzgnihmA?U2Zw!FnpVK5@Mj
zd?`Jv=|A9^Lwc1UfM(DsuUGPu4szun(3<I?kl{EB3bo{xyCU6SWK$(vpy<=JFKGZg
z_1zi^fU413K|tzw&y{PtZfx**4Vv4>cfW3lsa@Q7@=Vi_ZY1BO(pLTsh@p3Z{QU=S
z{zuv5Uu;>zUu{{l1_&06fE&PC@!3keta&fwI@~KT{MhFQ@W1+K0rmIAt7XS}y>(#P
zN`MjbJW&La0C98UJqjl-BB%_zQrY0&r}_muAV#P?1Ullh5!@ve=z4|}?8IrW7Gwqq
zK0pH5n^F!Ooiv1QjhZXqo`5W3NTJ_lb?e>$TOJBGz8pINZjS;nL_3k(v<m*d>cc~t
z;`|XN2{~3?iY<@<VZA9Oke1GSty7ceOy9CS6$(N}sfbFBplF#L3e)}S67AUtz3_e=
zNJu_V2f|~#dx=HhN_^|51A+<_PetsD=*{19e<UeJ8q5n?1M=6A77b}gBkgu*1a<(t
zmB`87W#jwo3a?x?fJX)=S0N=YfJF<zK)$VpuMiZHcJTW(R7?q|haCI{z5$^IK-Ty(
z!n_SxH{|kHTtWU%$^GZI3Rb%y=Z4>G&%3^ZpVASSsF%K%_Lm>uy#3Jry~?iPfN4}0
z+ZQlvXx3ti@2dMvsu{ecUI)_WknW6apWP3o7eQX-mr6}I5|EES&~yBr>g^;g+Eqxf
zo1BBZe@U=|wr%^h0Z{n$RQjJD7l@UIG9S?N_}X6ZezU%BS^ucM_m8`-{!w2cBMsnQ
zS-%em<%WXVd1}cT6S(M8b2Ar`IE0oVg>bXf`RDh2nh#7twsSO+z$!c#FZ~64gd2=j
zL@~5u8%XCucB&a(z3X}VRy~&u@>8b!?Pl$}JgwsgwEg%v;yaG=KYap9DUXEY4{v_l
z|LlZz{pQ%nyC7%`T+2t(2hN3mrpY|IfBL&a*B5&imEZhG)s0JiyTZ4kyxaRmpGTt%
z7%(Su^Dku1$z`4}0}RQ_(R+`}uIoGcvBy~@5Uft57}vb~b|&NPuD!7mn|dI$nB<a*
z;=NM8-V~s;hxC7bd1d$6F*qn})6mLYM=mw>&bhCv*gVwG`qXF+Od!!fk>VGSz__cJ
z`4-?4WV;>t-{-!Ajs#<Ex^*W)j~xw2hpZP+&(`1C8I+;CY>7mcK!Sh~?6tD%_BE~d
zE5}-%mVy?t$7he`LsQ;(HJ@Dn-@c-F)vBkJ`OLu`dj0Fb+sW<0YMDDPG;MAAX?HH=
zMD*TUBI};?9o-dptA1h0r|Bw>-qm=@N>r8F_XTOMrg(!!-mZW9+acMz6}9`HUNYI^
zz6J9_^OV(_4Y$m!u9b;h3$Epfu*`mbj^{YmGV?d79YewX&ue!pJ~?&z+7E{g@82^N
zyyfsAO5zHqB*l!*oJ&@Y@|lwM(X`&6kT+EpSdQw=U?=r2^{Ooi$_`<RB8nbMJ1UR@
zCp~!|rswpI8u^fCEGVvo?BxkL$M=#+w|-g33#ypFHCG*2_)QuOKQ-bfZwhXrcvIvO
zQG6yLRUG6ee>=k~E)+D88j4!}W2vH#nQM88o-{(;CS&aTPH@Cnbu@J{P3#Am`HAnk
zs^^R8&g<+bu560oYv!a6k&W#@SM(Ol5QxKVXCXf1@$Hkn5Fr^M&L&UY&O*Mnz+E6>
zG<Dy3Ia1KpW*E(pV^IkxXvkNAnWvy-OrcpWBiH)p)7;(?3hNGMsc)g0z?S%xHC58v
z>BV#i&dCd6W>sKDL=yaIrS^z{vH}D4S0(woZIxv<(w+0Cyz>0f=J2}A5e!?OD`pZb
zXx{2No4uO=Fy7A+ziVPR5{9aU2dt;=bOmtln^=>*ExS7KOM8}=9n+;935r)Ut8Fly
zSDZPsdmIUV8~J6X0_bWLr~?;momZzk!g$#K^?~y36@!W@SeBJ7d5jJqkDoXAGxjS6
zMv+|!hflC{HVQM_&imz{37i6*eeN{@)n`=n$!LyX1jmDEoYQf|lRR<6X8QM_{K<YH
zu^V4aao-lGnH-C%dn7(F@9I-w8R(dW$60%u`z#X|Kgg9c9SL~>v?GMWrYqB(glezG
z{fZ2jd}BLas3|=FTSW1v4v8?+a)Q$PcqoOU<|3RgI&O{x%N_eT2kT;nUP07+Tz;i$
zDlm`hnT2<M6~;g1-_IW5hur%do4l2mn1bW<1C1|4m@j1}O0bzIPfbr7MuGcKE&Q6B
zh4M^p*K*?CcA&UACoDI4;f6?F@!}YPq7sVh=~<!`iRoNcsM^2nQ5gG~J=BfPE*-bU
zVirR&-LHB}Xl;`w81hx@i%3-`{R(s=ZJ~(HqRTi{G~=E2T6o|;asW*SJ@0U{o=fMN
z;zBjXuUe9k{ZX67%_Iaw_{~?8P%P59;XWKwH#bo62~LD!gP@V>M7Eo?$jqZq7e|7Z
zH79}XH_0%c*=m?cTz$NWiKU;$283MuHfLrcqf{33hehlv%b^0RAt1%s9Cnng7qe@X
zl8$bUKIX04L>J%icH&A{33OMoyR}Jf38g%j+Z6aLsG&5-^dcI*4c{`HR5PDxwnbOu
z*v}|%BB^y_(QZlE-)Lpq#5tzVQDu{XSS*DF3e~4@*jTvE>*i>b;cP~vGn_z@=QCIv
zo-^){j6t@EUtc?l+37|=_lOA<*|n7b)n@!zg<dXCyt_&$K|S6stn#j^Ty|RqM}EnN
zTX{-NsR~ZK0tOPPe%LX=nuRBgO-aqoTd|D|-Y!pc=~S}^*yXTM__=<Zn)d)JrrDl9
z${+MK<TTL;^|mt{S1_DdP|L}rg&Vd-W$9B~8Raa6{-;m4SFu5`s4)^)QuCI<RimLJ
zWlBW>K&D<NmnYl_$R|;`aGU$9T)-aztUB{*cJEY+gNHpe+KsA9RMThuv2?Qe(@b1m
zfPD}oOZCo}uoe)4mMm9rXdC2w-(vHpPw%omj(6kTLl{nEjCG$FdQ^wvPKTUk+qH$e
zY0f?4--=B>V<3gOG`Evb`T@(@`A~{RiW31G2kV|93d3fh+_JB0g{DL(YsTiglhM(b
z2iSlLYc-{oMJ3S-doAQ;bM1zTC}L6(mIckMW<Q}&`g$*IeT>VKX%6zGXV0yzu87;|
zg+1H{B@eyT=p*U3+Y<Y3BVFt!8`66<hlEJG)zhtBlwD#(dndXniC75i(%btn+Pk8g
zus+o=4Q;PmA-t<iZ}~8~SavDU${<TP5d>HuhSL*iC};}X<fhQ?QXOt%W4Qj5coXjR
zu`+rJ1^pZtjh0Z6#b66%(bA=Gs%EuX)YlixeUp8~T{g3RCD%bT23B#Kiu-DdzMXGJ
zZN>BsH)Pkr=oD{D+dNe0HN7EjyE`g<dzQOC3=bC}NWplMmbaf~9!CXS&&qea_IBu+
zUfn*=CTzarVC2IOu`?NcPV4N*=!gvq3kMchurIt5eMdh<JDgpeJTX8`2@TTU3gc3q
z6K}#AFqhb2+R!~0#r+ahGJc7So<wWwt0_$oC_zi%hAUGm+&oK=vq5Ft?jmFZk7}pM
z#ijMTvh5X=3`J#NKe1`@XAJO#vLID=T7i&l`(3IS+1Z_U7^HB+S$a#M^iI=Pc5V3O
zVM&Cs?xHuQqF{lPs<I{9+H|INz9V5t@Q${=aN>>dXz>YeYG@B^k&@ndLT_zst1IR5
zZA)(p0LM0DC{F?nEiuB0NM2CySa^c-s+b8k6rFp@3->{7@aknKl$a948to;v>bSw<
zC6R^-XGqc2D_J(oS0x{@{V;gVI;V<WlP?5f(Xq53>2jHh)?;a^N>xx$Sl(+Po!%a4
zsO;89jgWGdF4ah1?n6cZkvc_@Yvoi#8WvVbFa2l*SKjX<0K0UYop{U(O)$)#>;htU
zlMjUMYek+DHPveZnZn(BNfjB8u*1};F5QtAt{#j5C*P55hL96ppHSo$_s%z#*tNOj
zp+PWkVMKG7w=9uxEN#!j+=&ufWcg2~UIV|>l$x2b^tYd)gKXhPuP5}TPtkM};lz>h
zC`7)@Amd4q-aixf%tU@Xer({GIA?T^>8t!NWZ;w;Ol3q3_|Z_B(d_e)ke7j1EV>?9
z9ck#<Z%jl}qvJ2ZSbovmfltv^!5?bVwWrLT8E`P|sd)GM_FkLB(k<xMyst-j7-!nC
z+$mB*VnxY0Jei>zFN}&6g{1y)s@Aa8elR~!#w$EAwfTLu(Ll$x51))OVm=vFW@vxj
zcAvbysm4xiIz`pV<1{SMA-KS(va5cAXA4{3RjI6}tL2h}2KnAOOHKa7lESz@pL(9=
zT{1T8ZJ6uWIFfjT3=aj?dc7>DnlJw06bw|+yqu!-OR{g?`o~YK+OUR_2bf)amA4PU
zWYS`~tFhZoYt&6B&H8WT^BS!3KIG(KXsPt}al5VL*O?S|;aBP|<gNla<~{<YX)58(
ztK=NjGV2&;a=|fOPI2LmLU`gbA<url3CSJ#(Ch^@ORxRWp1rmnhN)cDdpO+!_n6y4
zmBoTnWjpJ<Ud~!1ziBK^o<|*js`qGhBEFG~bK+mxfdVo&29%11i&dvn>L-+bY{eSC
zvmVP!&apBcQoT^tb^S+E9QC36En$lpWDE_7g;G{W9eRl>NC{g=8d5Fm$2LwSEM^N%
zE%|J$9+C*7BnGzlOJqKX<`FpQCfVm1Nl`U1wwAwVBH33c8$*F76wYSCv?djF6=TI7
zeRl^4?@8@E&uAgtHlgrU@<=ljJ7G+r)ONl~SoG8}bkA)QGGArVP)THoOK_jppaw(X
z$IPzFz$(c@rjmUMKb}3A*=I*C&|O3$zwK7mjW=>i_9ZFFms#iKdnYbwXl26klv3_m
zRMi#P$80A{{3<!>OclEt9n9D-Mfo^9U#dGeZfUqVFRV*@eYx1LB_XL|RpRHAio~Q!
zw?)s~+4%F2>Rf`sOWi*dr^~r2dES%+UuVfaJ8I`=tl`(nZP5jq>+kVIhGM-R#ojx)
zu7U6DJdWopP-u78aBxtMfk=DULM2x0%M+_sIm)(Zr$H}6S*J#cyZzguuTe$pER_UL
zcs+_`>Q?4<`64^8e~fTb7VDcVk2dj@*eQ5O)FOJjm1N)S9nFjQ@5Vnx2WAOQIoc(h
z6S9zh<Q(%(A)hQ_$4)U&B?$WdbqteqqX8!BC`{1ADRFyB4Z}Sqc4@y3=crD};~K`D
z2xMnX@bp~^>-x49b=hf62BXG*>?3<j`NK&61bjtzbm?4uscykIMs2zeRqLJT4y-d5
zCAf<QoHg-H_O+FsRu$Zo;F7K1GF4i`Hds#I+AiK@Xy}^B7$aG^d98ELuVGjba}0B*
zl0u$|qlre9bY@YadrW%|0XX8UE;y#GPj-VX;7ru0o!2On143n!0$)hrif95S8mc?O
z7Cs2%#SFK?i@WO#qi+l<$1Hl)2Nss(m&>BsaE%<Lz(nb?GnD!XSDM=jPV*P%7`uJS
zx=;90Syv+~HAoTaD3*ST!s_QvMZ1S}`2`iTu@1(ZHwKsrKiN)%FZ3H^Z%n<X68lnJ
zMVU^7er3yS1#F*L4boGPRKYOOt~R%qqVZ0R4wX3W8R4mPQF+&B=^@Vid$G~}qS^`X
zHn~@#67{UP#4+A<vd5@{eg9aEf9|w<%=J+sd`{J3$h%^6SBaqSsB))LCsrR-2xm#L
zGw8|-nMAI0g<IweMc2l3TPu`7G;>tBYxNN|&O~@lVXA1~x2OF7^3)BhPqk|Dt}oHq
zGC21$F7n?$7T3xM*Li>qw}nH}eoS^6vj;Aweu3S&jo09ZEx=O4%3RUzS5qX?lkMh`
zeS){$hR&dS!fh#T+w_DL%bv^N`W0kz`}bXtB!o(gDzvotRzxZ*jDD((AWzKO>1Enc
z#U?yH#F9&|R^){Rw)<*FNOdUnVF6~`j)ibhdOH4@`1FJM;+mW>l}ViOct-8szevU#
zm7s-wpDf;WdGNfp{@~UL=cTHYp`azDboiL_Ubn`{n+8%bZ{bgdoOl!T5PC_>eq@p1
zD|al-J&Yb^TQZ$qqiGs+v))-Z_M-{RPt!M=ZZwuRRJG4|FX-aQZ1cTtdgFvz8H&e}
zMqUSBWl>HIT7WVt2g9{Ez$1-QHw_%Wgh-D1q;fz2f4HXSmr|_zN11U~seJ{0Nn@d2
zwJ*va8pKk|WfG@d2hm>jDz1!|j9G>0NF^zCQs$kAV$g?E1idQp4B2{n{V*xoK7v;f
z#d_<pG^M^mxBv||8}Bt1%>WT(r#E;7&Klj3^F|NOF<m<nT`9RL^a|Y+r79(C_#wM$
zQg<aocGS4Yy|7#p#Xgocx8a+_J0fhVCt5A>s~mC_5|VwR++mcJc=;5kn0JB%bLsR8
zG4GrQy&@6`lxANwz5p-XK_+mRg)=wNw0o9T6YDg{^r(#`r+$S+3JDSfr-Vqw?uBE~
z_%O+$X>1MJiCP4#rlfx5@=A~k!W@f5M?U&`TNz}Ey6H+^kvf(JN6qJW`h~pd`S=;O
ziJhL1o19{LtgrrFQ$1?B%@Bw6*LS7t)x31o9jG{!xgnV_O1s<dIa7Hk!Hic!6I|S_
zcKM~AD5<o`jy<M&*~JkAF;~-}l8Ixq1u@U|)VWwk;(gfn4yO`hQ%cP_0@xC(AW~Jv
zUmipE%-Of5JDIRRF!`M+u{{@Umn|CaNJ!`hQi<6ZA53qrogfFEt(07#m&3rs`x&{7
zBq|*U9Qt#Y?_Ppq<BiJxpwC*995gjoCehcJeVP|$SPI(9?0diT_G#DLX(iV-Y_l(F
zR@AG$FfW4A!CXqH*4aDtQQbU(QbJ4LVX8+=jqnjmzc5sS<_JyM_qR^fM}uant*y^8
zp4b(e8rfMIXw0%lR~`%`lWXC~)6<B!AX}{Z3ktu>ZEye(!_U`FSX#QutW;k%kQxeg
zpUy1~Xma-ISIAIsT_&{KH7NGB#6<Yp>ef{VZ%><{UT5j63QxQ_lIQSCK5DUf*iYgt
zIc)y+E%~L-{M;J@(TUg94mG5jt2x>%*xe+yz8#gkE%pY-apFBa`ZO(s!q1h6PYFG>
zwK6sLwri+FQQP#GaZyG{n`nT8zyZNzuYqKAddQPdcTP!ecB8-x^*7D~;*8d5;XMJV
zLmi^q)7m<XA){3?CT!yAp_m(^;d3D*z7Yo^OhJ8_^1ac#l)kF?L{M7D<1N|vbJ4u6
z^I5{%X(RZ^Yk|90Zakc|zI8~@4PF)0(-tI~cNrDYHT>MR9_>I&#7CE#I9M0R^<=2N
zz0l>*&yEnQi?^GQJLbrH`j|dF5C2JE=rSSXdD6qA7_!hyN=uou8rK}}oR?gbXU}HG
zIZJfirnH!pNXQk1JhysSCc)q_>%EcNJ8o7nyvI#)g5lhg=RJT^iYbXvYL^|ob%RLO
zI#0evzMeeriq_^$sUOZJRY{_&h{GwLU|K1U{dFiUvnf<Nk33`s4H}vFPYM2JY6(dT
zbdj!YuM1@LF+_PsY>NY>#Z)%=@wl1V=M?ATA%BpC^va}lQ(Pot3S|W?CQ<`C({h*#
zN8YjwZg!^T$2LZ{T%H`AIOU%^%d?*Kc7=ydiNy5b`Et1Ty~!4;Ht?5qVWU@y7ZNH_
zf=Q#>VaZnX(JKofD}4JFcuyyMa^k(*b3@|gZ00@hbFp{5Rq+#n683FEEkVVC6J;%}
zZzm;-ByRP2FVAwtq)XOkXE$DXPPF9{%a}D^A1;LxU*9)w#@-sLLgIVCloX8)CnHZ9
z++<1?wRI|d?%ejfz**yyWmO~^N9YUiRqK3<XYf5veJBr$9rB!Kfh|ZDYN|;S-rivx
zShP^qj=xiA>otJ)O1Q7bTY*29coKtot`W1B(}LNkz~MCHi{0!qzWKX<v<@X~exHLx
z1Up7F!N5tfurM#-IK%iR-6n?PBuVy^RdD>&=P=8)@&PWLi`6YO*s~ybN5Q?*=mwpW
zaNL>K*Btlug2a6b+>_su@7Y&~?S2%7g$IY4uAfZ4l0#E)?>4&O5Exy=#<L7pUK4H0
zaMxR3;?Kohxz<St%xRCQf5aWumr!M`j}p5^SYJ7AD!@}<JMpGa;u>LLDYiLKl1T_m
zHuxm*w$$irx)nBi%DBj(HGAbi7}Z%Jp=;92ux3*7Hmy35)yAKxFL6O$2+l!EjTig-
zh*;w?CC3Ad89o9xh-RL0hfFtU80k0fpI7uy_rt6RKF`cReNN$w8ozWP7TQj*=4d9y
z>?B<d-^Lq(YM(_2i%dy;fh{L&J}HWO|Bghn{AhnJX~|H78DpQRrk@*{?S^NTxylP!
zNtR+vJ*xGM98pgnGaV&4{9(i8%}k0>Q3gKl4vWv;k>9*u(>X>gX}&G8(8P>vu9lGE
z<rsz5Fvoc(B2;c8)pIgLm-UH9m;@4;O7gH}NL~OQ*Uz=bvCmr?chbybhMC^aHNsAn
zk%~z@vCW*L@MJkv*>j~SFxxu1GKnYHyPk7mA<MfyP~mMNJi5?DrBmoGeKB;c`K-WB
z@hkCkos@$U(QHTfde@A)Q<MT@hT1Tjm3)&lD?=)qS4k}J6Os6w5!gBGmqNdz)?&~z
zZU%dXbDtY}-9wu15o}|oO26?-{i7;jMsONf3ZtlY{+!fFX+HURScK>(=BxzY&wMuN
zqNcdg41Aj=eb$1dt$V|1iaAP1U1`S$WU((267af{&dG(coRV>~z0M!K@If}H#RTq^
zd6<bYMu|iAPD-4(L1f*-&c!&l8Z=zjwQL9~w&N7B=ZKV`GYt}cGBITnRZ5(tIR+#2
zhKa0x@Hj9jT+pet$xW}>N2^JU)O9@)lu$X&atZ!1Q;1*eE4h#1G@dSTBGn2zZnn}^
zDil>jSE0$f*2BoX>JIWPj%*>R#OY6wwXxCPwaY4|w}YrDOPqE<rN^Fhn9r)}`r(6X
z0mnd|DM?l^a?LA@Z7fw3JU($*d10x>b&N;9!M@)2czyOn)r~{mF$D=<oKR1ry@a9&
zUFMxzKg709=`UYRyQWu066dzg7Z&8H==9{3<Sf`JOX1R>M^$-^E199r5FqF-rw5&s
zVtC;7$I~Yq>;oeli_tlv6}n~8Va3JuJdUDaz0%_8v1}?_?wNLF*Yxrkb+H1<E_TX9
z6;{8&2=EBzUQ!s_tcX2vRjpH3&3GKx(Qui_x%*3Be^l>vYiv!d1x{U5u)>jI-e52N
z*!h((BQ+V(6O>9~<4wpe<VnyK;S2(Y9m;%0y2;`T-K#>Bp`M=@WFWQB;w`U2sV~fk
zfYEZ9R5si(;e`Sfqjg2CUHaKs!p>spvE7CgQnUB$_?NoanQ(JG11VCrdIjRN=yUKe
z7`3}jBv;Ru!iBue?-D^D<sH-IfoUov!d_fF9(I!xe~EQ@)U-%&LWLC8Tf`Z6Q4Y+Q
zUoT*e>ux@0AQgyLN{`jFWVs@$oVxK5(5prYrp#OGRz<CQ%jIX_Ch0FjeeQ$xCYE)j
zP9<vIvM$I87iJyHne%GPaAiLkJ&0&>-Zw$uO@vpGyA3;mtksyrrbj{E*OVs$mFyL2
z8{(f&Fzls^+T%@XFSq?BCGOo8B_-}xvO4f^yhmPRP%)<XkKoVO18XBEs9Fu3@g|fm
zJ&mv(MML4S&D~`yIUG*6VFS$um+!_l+v_1`$bqFrNEHX>)NI0nZWVqNRDIyRyHVtk
z{B+O&1s=kV6#bNWENxyT1`Xq>VX)ojVL~S)?FA3Qrh4qP<g-_;p}bOV%FWKYCuo@h
zQf>9SAA_PEN{L_UkOhk6zo=B&ZmMT1&y>>vhOvNaoTT>tmgMWU&IXx*on76AsPqeV
zXg^!)RuR^*w4%f!e9?>Y)W9@ukU<tW$TS%ycFLfEUqd@ify{Voa!qfPKFlS+2ARwV
z3nL6wwwo_x<zQf<;WSS?8eiCBbWN`(kBVnRt3*?Y6zZ>)mlm?HjU;t`7{4Y|z|_f3
z#;=90(iY{|5pTlI&O-K5W-Xb_npGdKi!I10Ey|Bnw^i^df^Uojg9_(_pH+!`2ES}-
z!#H|Mnkcb5f=17-9w#@wl}up<^h!WwiMHusq&WrN+^4m!SOHT#%xkX}P4V<N&KNy!
zxY*qC0jSc-*OxzuI+Muxd1*uS#Yy>=nC#c>c4vti$MU(0eqT6gn8Dfox}oadcT+D5
zyf*x#ENI5jGIHI-7n`;|Y2b{6S?fs(&piuA*WdhJ+CLZPJC$%&_tCShH_}~H?<a_z
zwQG(@>%<$KQA>DKAhciN<6~au;N6+1^|9Rj%J#EQUos_sjAg~_Z{shx{xy+TF9#L!
zp1QxFe<A`!Y5Gn@E2V^MaHc4v9yAc%(-ObWoXcfO#<%hpq-tY%orv5@$Zb#F@43#a
zn0q^Qc3@1>HI-{+;OBcoP4`;E(*ti)xu>IbYk!E1e)PCLcpoMx3XOU6_&eQTOwXkf
zu-$~(jj3lj<~S{DOUybPXTQGcDWOO0tU~@J4F-8HrX(Sdq~yfu@gBP=C<d~0V|Nc>
z+)jf#?vHYp1U=6%S!Lb=W57yYPO8{l!mRUs`Nv4oFcqedrQ)FC1LL*0M;`35%7N}~
z8%dlnir59(o`&Ki=fhKA9~<whpUyBGCO74GuskQ`L!`@UJ#)HPZ5vH7<Wp)1kJ0H9
z>03Kkv(-iU-ba%?%sq_HQd}6ulxB@o>((-TOOl2pn%If=|AiQn>~A)CAQQC^Yk^7J
zZ$e1c#ZnVZFq0R{9B6$V`~|}zxbMw#Pz!IUTR`QQ8kShZ)<@kQps?6sM|wgHxYvJ?
z?sCsT-J=9O5|6PGdX&cH8F%808Zgz$_+}*bn=_&;mnm-pG`Kvw+cIA<9gaRqt$;^`
zgMXm5N@X!KLLPCx_;@I2Mc=)?@f_b|7mKYGIj+5q$kzuWJvYUys{>MOG!r`4#qy*k
z%<UPwp5BH%iOxaMW$OARUbYVz-&Gd;)v>Oa%X1T66*IU#kis{><6??YHFA8&F}e|q
z;cHLlilueJjMaKV#SGJ3#*952f-m*BqCP%;>s8X#iRTzt-jp;gXy#e*FzD$o;VNlK
z&<d{49HGC;LFIC45`N%%Y5<{o>FP9#FUD3kf{(`#i@9?@4fbzT2r^~636v`wh<C7z
zM1|y9nQ``O3s|(KC|uSnup1*?ANVABuPuS3n9NcryU1`05h~^yyiUzY<?<{8v%F9r
z9}SpHHp6JYwi>Z@7kXTSD;?YTvWu7tA<|NbU7mR#11mf64hG1B+LMN8PMI-N@@!%A
z4;G%rSxkEMPb)llmVR#3?aj^{q1z)O{i4GTKP#gZv}pT0BT<4bJ#=pLTbIdE$GmCK
z$_0I9s;Q#0*=ISr;8-ZDWAZK~41X-kgn{t%3#d^zIaf4EMI~JI6_yq@wt4m#sX@KY
ziBnd~4m-%FdA;8R(`1TCX)AM>!-;(yPOxE+0u@LrO7W)sF`7E8!<i_G;nAPzrKpd_
zx76}zL<@|B_H5In9N&o8x#&9z_i*?nA2<M0jS7;K`ssF-XE^$(22K78DX4>0NbH#!
z$6&%dS(e4z6RA<i?hBmO$7JQ8S~xWkNl{6-O6+VbtF5kKi6K|kGMLQr<fL(vqF_Hw
zH8NuuhnV#8mVCM@10!e}tm!uo!&i-z3$u=%2~H^M^XP>GB0|1m(o08#a#4l_Cgpoh
zO_+genso^?qqju#&Pz&Dpu~g*6G+BBxlV^ulN?J@F~($d;yshR`!>#+%4*e7ao1u3
zTc$Ij;wBbwR0lSb9@LIZRAq!(2j_+cN2y0C9`8{mnn5&YkT|Lf&5PjkoP|I-yN9UG
zJEQ(Za&kB@Q-SnY-55>^VL@=$jCzBo6qEi$m*eNU)sRY8v1E{z;1`j3;1{)NR)Le#
zjt@GWJFQM9UkRF5APuSGJkjl}S*i%@s2)ow6UB+Nz`)gxE7I3I@rbLPGKA)gIu!A}
z@-r4t77TdF%pg9}yu^MMRX#CSNM;pQ&FPS@*fgZ~$8t?sYNN&-Ls4cc?+=YerCH@R
zQx!}wB|S!;x>$k%dCZK!bndt_@a%2<oT#T&CW`(uT74qT@LaQZn`nfoCB`6BU{@>2
zh%>s-$@0u?m2<<*CAMK&{F+G?m`O`c7<;Y`0@bWG+ZM$RiB0WjhosM)_#n&Nj-Wv!
zfx{kD#q=1~?09qF3uH(I5nZ*+W6AA!O(T^10eivg#vCCZ+ENOu?NY{1rem1hlFS9r
z&@hM6W8Lj0*^23{;Mb|Y@1meJLY4Gf24%~C?3{r!{Mnyba!n7Iu~-Uv;!M)ZW69xW
zUB7_7M5h|#_Lw$IOYUry_*je5l^c~DUZljEB<_FgVU8#d)I(kFDb8W^_IRUCrev;Q
zlpZQGVl*{MVSAf}lYx}iuzGHKyve*csf1fI4Z7AKjw9~gVwH$CI1-o*8bORpS*K^q
z>^}J9+yrPwMaiI6#b25{rhP02u3a*$OwxTQtu|JmO>b|?4Fjn<yuUl^7m)D)2M{!<
z*K&HYp0d}pp~@jY-Xuj>`&inu_)DNK``<_r2Gv0#dp0`@NQ9D2k9BZJx$1|_gU+|0
z(DEHF77l+XRm>MHm8faNQdQx?Vr<10o+)W5+qe0=I+nH%6Ct(rLs9fOYdgKsBZfbN
zW;SFIAw^pRg6N4Jp+6@MzZV7?YI;RTF$F_ss>915s>$KE6IZDK*Jk{s+EW~HyNqY?
zt)=mupt=az$$%PUmrfm`@HFr1M^wn{@F}*wz?l34qyEaj3{n&Yt45{A-N%kw!pEu;
z={4|Veh83?gm*ud1L|4y(fH}*<92%aR0z$oE{E|kaqtcc#i@B#DaRR_izCNhip<BC
zK8`o(gg{%R8!))r9BAZem%mCIRiJZCH^MywXY#Q$Us`#6d?pbf<vcV>M9Z_LgmxjD
z=g`D~p(>^H<Fe^--R`*yd)oBW_U&qGQ0%IW^@yy4hi!pp;3EV^0V`^X(uHB4(1WN!
zU-DRpB2U+1`7;;Wrdm8-a6X;xy)j}mh^HGxjD|mSq%B>v(^E{33vyoBViR4OK))6-
z+8*C|4pgxsM19(K6{a<ph(zaHvqepvP;>jbkX_gX%QV!seJu}aWlvD>Wfppr2zp%0
zX&|1R#VG#`l(RubZ4qc&Ox5@_JZuK5Hf`Il^7?gEJEOz{JE;C1)(+Y6{xv0BeUuD`
zQllGsZAyEar@2b`XPL`q1i!Q2zFqD`{iOWnc<g!JD_RNJyT!F*KS$&oQM(f#_jzk>
zxX8&6dmZ5c`6pivy%>&^SA8vCu8aM=)%W%ImmAh+w5HLWD7}qRFY1HEJ5W#BWtQ?f
zQJXJdKkxT(;izywr)v2uztGKj>+#}^mKkQ=t3BaMB<6Ug{AHy&k&t%LwtXv+@cb+6
zWqVs4Eu#FLH*x1BonLc5fA=&Cv;DWxWt|Xqz)<g%p7@b?55Hln2sQ3`d6W9$=z>hl
zcB;&ay<Z2EbcCPFm%r8@BTd&~@>!k+<|f|l-|wc59J+P!Ow58c=LKJDE*BZqWh`pt
zP&WmwP=~tn@&|0QVzOOB!DY^iH`YRionG7*Ew_a>%r(4>Q7mo#Eam|7K56Bkdgmqf
zv$of37qsaMqg!U))afXJ4Hc}yycb5ku%4?og|^;pxcMCJ>wTtUqP@}&es^_j%Oqlj
z51-JhJ@Vq-4H$-xb($e+DDT6J3U0QHbcEFx^~gWD%UC85xZU(U`z+3^kCq>cjkCby
zj(%Q$>*Y-|OfK>BQ-{W4`3twUw@*f*xG$bM`%JcN48?xFo9(H`Io7P<)t)JGxO|wU
zqctJF=kjfe@YZzwTn;q^9{KL|5!^zmVbsQ5_VBF>Md1?r)mF~`tYxNzw!bMP?u0!*
z^AXExoA-3l*0;noq;fw$y@7mmStpFAcitm!x5YJchUnH`%KeQa(Xhai6?P3O`U&Uh
z<rMC@p7Xz4HuJV*MhTp5JY@b%joXg*pSn4C`(}(dlOErC!(`>qOY=dsqIQ-}NK5W`
z>5#Fd5vPoMhR>vTIw}hj+x|IDpEGUDDXR{}w&Mevmvnp7vmRin`V4PVOeChgUgR)2
zu2IR8)HEF>u|to%@JW7;^lf%|4r^9Aldj05_o(gpVJq4r3pkyLi9G>3iRob5rfKSZ
z{3pfBvP}A-4qu4<_S;iqqFXoeHg?X&4v~~}1eHQ)1s<8qC<&vAA-J@r;yAl4#=M?5
zcrWkD!W_Rm1{-a1b+)z=6RJ#lsVS!A{ioJWTTW<$z4cfWW|AbOHrBmur*NRhOL(Ds
zd`xshiXQooPtojCWZg)N2@E}{KgrO~LDhkcT*VEYS74ms`?0rrWGOyScta%b!$=7)
zy%gt6R2Sz^>4lknyDestK2B$FT|AEqow$%~iGxiwDk^<x<&gVx<O?<?oSqx3aTV@|
zG10(=*Oab_9N6-sDetv;FX3b0J8Ve%iNtI_zMFhqNL<gS4yAWxbkZO0dL;Okx(FjH
zEcCnUM`EOy8GUnEPMjA5dR_ec$gUF^5d!B`S72O25pSUE5XsM&D9@yq#eg$U#DdK7
zT=wHeTABbmVn4Q-+gE7o=d#=JvyWJwDe~KPTeKQS!JqP33L}|Z$}7}D)f~f_<Cr1D
zyH&<3=_qR?zXgLFlMf|DE8+Zwd1bi|u=|W_6*E!j!*#@1R$=q`yH{_<^u|~8VH7lN
zv$;tx+VSj@aDE;3sTZ}olMnrreMAk8>~k`f-d{3ijL{q-gEA%<ru~CVCW@`Ld5?Ep
zESLT%y32XR?s$GzTyvn;O<j;S)X<?=mQL6f?bC$t7{WzNZqytRPU@*wrx_$)**ZM^
z-12Nvi$8??+6tR%Zkw#0W?ISIil?+nuDV$$Hk>Zlb(SNAW7#Pc6)`i$;r1pw)M9$N
zk{UmZ*`kl7p+6LP7=%fEfCc8!z&#;V=CDmhF})}!p8Vx~hP7o_2WS<Py=@Ijg~Lzw
z>@w}?i#OTZ23aT<hGHqNFi(!9smsZa)H9VQL8WLdxn=?Gjbf(Tk@VtAc6!ss%jlct
z`e51z&is({0lwXE^ID)TaxYIxK{n8sLZClnKxNYI2w5PZ7K4Ke4QL220>5H@L#_kL
z@8Fl|i0>4GOBpU;J3SShuuLRDq*`L+Nx{%Hr$P}^@+E_u2q|%FTs0kT=nl-w<m=YL
zZ7yyGj`{JOaM^!wvNDhfT!OpLK#E(&538HktcR7FWFk(7qRhDjPXm}suA_l-^s%&Y
z=aFTPJS*w+*dW75RQ_Cth-t$y8S&NSJp53EJ&2IzQc4@l%)aTO-7AV<n_i;DWV}gk
zMoqhufrAvXIC48oc-qCzLVxx1;Hxodh<rG@wb!AGRj!1L&}o6{H1A%ab_rP4TlJV=
zhm%s1BOhwa6OJiOXWM=v^8L8~<lXPSU&piQ-~4crM=Ygm+rNA2&ylWFz~ks$xOe|{
z+`qiBf4WI=Z4-ePDB}C4T`EeqORV*cFo)bVEb-e^x@Er)bD(@kJG$04&Z>KE{Wrw>
zlPq`Nc=MfAr^w3wx>mZM987Rt6Mv9Tl<1!v?D2M&GFa;y0wk;TYvRzQvjdW-C0&)#
z)R+b+S~1Oq`f`s7N^h<@Gt*dxP&I)zyBT#s|BB-}PXHyFUvB;V@aeTo;tFlU$jC^;
z6nAn8+Y9BupOcYpSPf86dl{O{5DVIj)MPB?c~Og6WdOz2*z6WGd%BVK!7X5DFYSSH
z<<%Zfk_N%tJAa++^5Cw<pLF(}-=;SH+5=ZXO>+TEwj?OFr$HM$nJM2oweqf={wsP_
zNR#)ty}e(~s0Az^DPCcu_1SV~aJ1(SWe-EmAVxPaVNTvB%d%H*uqX8*I_j^i``15Q
zaasqc`(C(nvvV1!s-#=*_-LwKWi6My#Gf(DNp~(SQR$jy^H9Y*1IXn>_2obK<&cgD
zW%Y)x1s8jitEVGqfRihSg1`=vw`>yqAVc#{EC6J-Gw?pavTJ7(;ju6PJkbo@?wFR%
z7w1{e7~1OwH7bdIy~|s48n?_ouOr+c{-J6vO|ExLqbH+%7V;LSEOTt@F=A43=^4r1
zWbBafFSd~@P%UI#Z*A&hJWz<|xcKC@s4}u;wZ0BMl#`7L7(Q6x=@qh<`qYf|WTg#H
zt{n01(_I5IU;nahNurRrlt`57Umj6z%+O?J$~sC{%)AigykvJ4?5h6yD@};xdj=Pa
ztw|(uC`6eRR^Ez5^)gM)t!Ic(Nt2;LM~HC&4<y6hi!pmgplKT<Y`03>ha72s#+dJi
zBQ~GK^TdB*Ot@kE{YDh<=9)|O#@LW;sy7?GaWjn<&8#~D-<u0`=lWDJWPJ4n?rTD2
zcit0Jncw0cp+4AYGHkKNJPe01vw&GyNOl)kp1hr*CUaur5v=V%@D|xaglc9^BXl{l
zN|}kxxUCz!q(m$VT~_Hb#<1E1DN<n$f-i~={Ci>f-96gQkID(Xlimb|K%>cmXUc3+
z^r=MXk{b9nS4?F*(>Tz{Z%`25U4=zyEe`G?mJTP@k!kw2ER!-y?_{TVzV$+Rm(0?}
z%uw~=12iNzIT$sSu&~mF;h6bBDm%sXS|U;M>LmZlSN;k&i@k@FPUQw9QUEdTfsIY4
z*UF6CiU}mQ>v+}S-F|h(?W!EuK!YY`cl^34`Z{vw*_-*)CmfBB0KVO!Jilh=o#Sus
z@rra?PdO^YToy|){N=OKS|t6YQ~Uc%n7`?$KWDck_S}2FO)XoU^*L&i%b5B!o3c<@
z0QEnUmMyYn!_fuw6n|`hr#UL&@Ki-J+SRmgsl~^;{_0X8PL%a2w1ExP4_t{xWRK)>
z-2xewTWl;{PIG^_CUCa}3NYVZ(><XdO-+ryzwwCqU_)2cq*pJYe5*L_11c=QmQC)$
zmMVTom%2sr?<-`R<|B#UyDr3_3=`e5`~g0fI8f|d@XKJ2;`v#TwQ%p^fm<JTn8jNA
z^cGur_t@JLdcBYE?z?r;@BQAsbJ2I(Y@<)cxlO4Kg)20eRnNr9>P}Sjb6km+7pB;7
zR>k{8;%f2&vbvzJO5-GH*ZdZ`vHv81YKUW0J^iu8vA&%W<IJdOs7SEEr7mRIa$c=O
zWAg64frH(KWHhHzrz3r=&iQi<#t`W+Q>$F<{?@tc`mZ&G&d*0Q)*`X9ei;DN{_*Pk
zacKEB!Tn(*h&|E8NoZa+Z{|k3$+Exbpe*`0lRh|@Rs|Go{6qslyzyNOZs3<^p;)^}
zmGpgqbDvk`Ql?~>8YeBt)1q1z?dqDlGSiL8ORFiQYc7=81EK@XUe3QKcnCqHtEj11
zynM-KzwRR>%UPcr?t(!;H5=iqVQ|XCrDhH;_cKvE#rmL<+s|vbyDx8DCqPQa?1UZJ
z?`xuXRy{E`bz&`7ZBx<Rn!-`MO_)QQa>|9R0wf7>N42$(O+f}U3s#2NV9#8H<i(Jh
zLS0?*wlydMZw$7;?6{+GzDj{Zx*afxNpy+!8ieN2w$&xx5&wT?i7rQ`$B)=9>G?HU
z_Q^jU8yV4QJXLc2;99uDgEa>QXYTEP3u3vr4fg10&ODw0L70&Qpxd<H4n`h=D$PHh
zlAeW%SVXFTV`lEN4h|3%t5Z{;SEajFkG^bK>Nu27AecG!uCA_c8AF$nygJQkfAC{`
zRREuf_aU^T%tW-ViH09`UapN3AYp&<=go@tEeW~XXS9YL_gWKFbHf#Nef7lZ=~Smj
zzDiQMlmlxxjq#447`LIS;7GcuXYPSP^jf{wa!uT@K)7a{2S;+R@vDtt4l&V1!fO%3
z2|HnSoVFuyBx_?{T{RaZRfBHRZ0uBr4*l2a*niOAjmKsLS(^4VWZU7DdAF5Ixn4QW
zajREqnp~h`88k^IUQzCYvN{|oXugm}lf(ESCBY5&l_flmKk$q8mIiz1>$TK0Gr)@~
z-RJS~08KH^&rnx`bf>d$XDkO^Fz5EKuraiUw#(NN5)&=IvPyR#_fZ3K<<Gq354!dG
zV_V96(ah^mjcn>d^@4Cfpo&?_ov}=$qED9OTdT110voijYy}o+Wtp@g&-{G3v&RY`
zOOF)0e?Gk{?x9?@Z)m+oRW+BHCr2pn$l2+#s{i=(62F|5v-nD$&+dMGRl97d+0AAo
z*E5*`28M_Vv|Ru9hTWyOH*!JK59J%~jn0oE_e69cfY)SfV8^@Z=&$Q>t(dr^1PFRa
z(O8@_tO}@c+J*w<*SK3=P^a!=?+s{Y8=zA#T|b)S#M!br+}xenVTQ9^VRhLth=?W+
z`fO_l>V{xr$(=oQL>D#UyB#!j0MTuYNx5Kz)L4?8eMX@(pByrgM$r3uCj;D!XRcKw
zv;1>EfJsm#rjkvxa{sWS*9*_nENnieh(E3$=AicTXw{lT#-<-HpBmVxT&mUMoL7a|
zE`{P!(wKM0`;F!SRzKjePi`MhEG4JU(~h*3y}aqi_@C}Epr8_|?$Yv`FT`61Exs4U
z?(JW!b!L5pkw)7b=xZT9Ozg-ysL=CO0hFLd2hG`pnWwxOFRcJV!P&lR*LX%v;Rdlw
zYv3X>?JEWAmL3$=X-v8!DztPzC|3<KZhno323CKmAlx#>w<Ns#)mkoH7>vU=UOC&+
zi|<sYCwVrA_>X`7ejj3ZAbdislrFK4R46^|T~aKgdp}vzNj$qBlY6vY-P6;Pwu`uU
zx(90I8WCLLHzU6v$vp(%q!$-h(7DDM_Dg|BGQ-M2t1P>8u2p~4AW{O{v><50>tl}`
zV3|W|D3omwsCBsW+ggIVc3S3ZFy%^;X})OKV*A)td&WL0CNMmqFp~xbqvxuf)#1uv
z+OBvlRS?+~ljLT_iWmKNvCh*zL3E!6KrkqICAm@x7G}l76x4Wbv7w~w=pos1#?0&|
z4~j1RR#Q0f*c7{lg<k2{B=&e5Ire`2>^?K(e1IXLp0*Fl`Rs|DdCS*PqU`Rv21L*G
zfCfZ3K;9%)4vWi4A=X*2hxs&=_7hhH4OG#~B8qUu{bEUtQQa*K5?T`<4@OCga=(UF
z3_shjp*DMQM44!AGhcv<LyV^o@#sglyRU}E`S;DHgyhN58<eX?4aNJrnTvI(AgyBE
z+jr((h+1ndzU7zoZT{UPI@@>P9xoO#g&%qZOh2@hMuGCaQ>HL>m#3`Ji9hap6ejk%
zXiY<(wnD30Z#U3-Ku896l{lP^CcC;nw2Un~-wAr)+9_k@M~mxKjuHP}y8kxze&O=c
zJH&E(t0{5MX-?dbrZMGbtim2ITs7kXOYYqUTEQS#sxna5XQ0CR@}lBYal0opt(HBM
z=vGJu%fG@cx`ALR<E6sLE^K%YgpvK;^^3;RgKWg~v`%=|h4bw7#IvcBiT%-sx@@6S
z-CzMA_QkF$sr11Z{POi4=mNfLya_S+K#|W2H}tmp`T(8%$hTzcemTK^FG4qE+pOM-
z%Kufjq87L*#MX2jUP<48T>Nu-|J^SB{{~B(DiI(d9v+X5i)Mxxo%Yv;h!auF<Ue$g
z|3rA4ltQeW9biEGe^totbwvAmBR4K!R7m%*vI|=d^J5m@H$sIhYB2>c>*7#LF(@e=
z5x?O9Mq`=DV2_8>)MaY{l6MoSVl9_cXV|JCb@P7=see>9{wr)nXT@pcYCeJ}N{t_O
z*{b^nZBza7bmsZ4s+UyIV?`0|QcvB|XhJwkf3PQV$xnGL)~o7Im$Skg01MQwtg|@x
z;;O@@PQ+jOgeBsCSoP!;K*v-$Uv!z-S#?X-X=fWe_##~C$^S@}fBYZD*(*5`DYWex
z1K;x>;uQ_JwbV#lZp>-vR_mLNoGHLnWq+%;=jd^?2YdAD-7sKdl=L^Ltwka0eD=n|
zNSzIyS8D`YIuhFWPLf<*!TGDN#C}LzeCWX%&A>r_`uwcHaG=bNjH|k82c}(g4L1L@
z7V++LuQ<sU>$w2^=fAmjsh%VmK+8R?GH`c&LmQquC;Jn+ILR{1A&X|<;*Wx`2jwi^
z8myb|*V*StrsVu;dT823CMm6d{6$S+v1jhM)LIy}c{4~0&FcR{i|W6DVEsQe8vkdB
zf3wN|D>Q&BPr@7!M4g6%BGbREs=VKHxYw1#(fSR5hHRc&%^HIqtPeKik=|_%b8x0h
z!!cE(H>3#s&D8(ZMVkE-xuAyr+c`7-XJ7sebN?&$<^OM&RO#UFM;dY~CM^)~_yYE1
z1&%XM$wN?oX#iwI2_6a>+S`AbJ1x4BUrE=Pl7e`sw+eJJX(SLi6NpLkNPC-<@txL~
zhR-^p){vpJ=mEYIoLS}Ijx7z`VM&yf|3tA_Po<ydH-D0J_SwN6Fi{)$B$Hkb+=9r4
zXl!ENT)H^lU2NR0ZX}|G+-@F#1se3WCFypnE)60IPKbN80110sa}6r8ic!A9zlM9|
zyL)jv7h7@F7epYW7(0boVfW@zr-Bxad^^FSOCL8>V4>O#{5#@&CsqTNa4&LXuR4J<
z%jVqtLeSJg10e0$Oy23Lz$#b%vDI3h1fh}14KyUgIP)n~v&hQ#k6*XgB*yJP0#6MN
zD0a_Gq)(;Y+XI|B$fNRWvJ)`{ox@EUOjO#gv+ro&rCH83oO51jLvh+Xkho;!*I(tv
ziMJ!TGSlOG^8EzYu&_5yKc0;?7x-wi)L`aaG27&zS!fl=?4T9r1x^I_RxDIaSx2hR
ze4HuD(8%$h0JGZ9Og7AUVg~-6GfrILiuaFrz1Z9Pa(NcD;)z7=PApE`T-sR_g;^e=
z;kl!DvSCyE)DpI|G-x)l-|ChX{7ozk9EhS8U!Uw~l>1s*r;$5ewa|ySXROXGSx<W4
zW*ji{4%W{Hf_LS;1-aMtU9;s{6rM|$_V-)O!`8}UOB9G{iZ4m)b8n9z&Ynrj1|)3h
z(o>HEKPe6E=5L93GfT1?mHlXzl}6Ck)Pt_pUuKCx{EE<2qJ(nB@J8d71>y>)5$)=k
znCrrMP{bcFu%1?L9OOZWK!RD8C29|eh|hW7tvBCW2ls>1rd@at3D?OUn1XmVMrCmz
z#9{h%7xfs;*s<@AZ9ZSBbEfj60>`l&5`hy4E(e*V2i3YVo-YiPFixH(Pu3O}@wylA
zq6ZwJE3v#9U$lDh&6PdPt|b?fmmcJbMR6do?S_-~1^~@i1PN?4#_v-(BuqG*=r+#E
zBC@gyrmET4)d1d~t_!T&xvXDjGt>*46<SWbph9f+2;%T)7->$dK$q}o7B6RW!tEnb
z7}uhpy;L}H<)<oj;E6)-QfHQL^rm>vIFw{!iTh+39V@nNghj$XN^X~5fTbFM4SsxW
zHu%^&UElHvw43EjhWswe=u_C0=tITelL(i&LyI%>u2(^n@P!;31aqzq%bAYCP~)op
z*y^^L4v|m0!Tc5c!KG?k#NQ}rc<sr>yg2n-#7!jLKY(+4)wuLzZqOKQWYIpqmc0be
z936rH)Qx2xHT5;=tUQ|x5O03!={03WAh))HEJVX<jbFb%xP}-5TncYe5jVyE)dgIc
zH1a8LTTWGW^8ZI`<)8dd=aV;(alrP7@~6H~r%4%`J6E`3_@pb|Pyt}eJ<Or8CHI$q
zLcO+K@rj9vLE`$^SA%SJdxTy1{eQDK-fZT(K7I#qUU87+lCuw-aWn*|Z}UW$6JPe$
z@8PVVlO+AulzH*PI>0`bMFJL$aOx>DGdN;^kkf??_ll)}_~;a6f5U&JK>dw_`Kb6k
z=tlOG%IYM@0&rMVBO=?SI<6J+be4dYRUF72*I0|J-Oe)!={DFHW6d%UAPWiaI?>T@
z$h*L&HcxtX@Ne$@e>tc4*KhtdL}u*UsHAg63F21GwSKp^{}T@V!)6!U4xuA0gTyDg
z;D}N1vk>0CmYGLxg5b$d8dKK%a40(8n|p_^<y(KIquGQaZ~dX`^KmV2J%os{CABUK
zR#mp~ALjqhMwo*~nJ1THP8y#aH0Q5Dt;K-@3xe#l%T{n^2Q_WP*Kh<c|8*6Au4^r6
zc9;K3F}0sz%jwKj;{Fql*Pnm7A^~T;dDzkG+@JhfON1hREAX_?T}1c;LJ>n!Cw48A
z=zIx#Sn<x__<RB6;8`j;?fBnL<DXOdyFHn`1HR=<Gw3133BU$6nHgIX4U1tXT$;$t
zx63*Ols~NUlXu9!_`u(szmr#(z%EM-`n!~DBlv9AY-RV6e<D}@esA2{0h_%fc=45u
zAhH2#91CLp{*iyS`5O`0pi<mXyc3v^^P%ttel287KMZfZs#MZk58t{mAg0Q*mKoL~
z{)+63oaPi@LXTQT<Zo*s2QN%)V*gVDsqzh#j#^n-N<iuHp8Y!x{Vf6i`ll;uJdqmn
z-7{}^HD4o9*WO4Usd5n>C714VuArgeKoj%V*f-g2mqY<QNGv6SHofkj(dh5!_~y>`
zQY5Mg9i=K(I)|p!MyA)z&d!Q=)ghU7IOv~{Muo(ueYU<2Y!mhhC)N}SX};YSv4*wY
z{qA1<7Ap|{M^?3JA%~>%k*Z@a>&;#_5Wqnchu>eJ^Vik=XFt3d+tCd%jYcHWdB^9L
zp1;Jji{5J`{r~Kpzs~Q?_?D*T=D6lWWfJ$^eB`;ci<uwyzrJwKvY*vSL;f*&NHS0X
zx$AjYc+(ol3g2&(dJVT=91?3%|9X$#s>R9dw6!Oa=S4UBkkY+2hg#OD0f=JDKnPL)
zf3^4BVKu*h<8_>9811Cuw9(K|s8fy+QmIH9($Jt%Mx{mGXr~eyTE{9>T8j2jR?;AC
zZAC+il8m0$yU&Nm$0xeKzw7!v*K<7=f4#5V_qp%;b-(ug8Vz?3buMoZ&Pz9%y7Z}M
zMcg9FpQuTdj#fNiB$bz){7LqMzqTjDg=K^#1$_s?uRqMPSjdBJl@h*--S_3d-m)5x
z)_j%9q_9^`y@!4FFCWc+S{@QUI=8JJg_aoKGlV(0Je7TP{z`~yF2sOwkYs?%bjzX0
zNKzF?sB7)4VQqp6@Cp^yNf89*PMV6dHai3}0zbK4`1Mi0ql=jUkgl<>c(b5m1kmo=
zj_r{svs7?Sw!o%AcEAs22|RnZ2Zc>U;ZpltV$C`&9blV8*#iM6YkgM?ssSR3D#pf0
z0_zISE8qBMHkFA1M{BsgbL00j|N0|MA0$r71$nErySBm(x0WU$iMbjahFOUZgHzaX
zzW!=s{X7(K)Cj}QMq^k!GL&!@UGdH$6-80d$AQP(k>sw0Ujj#B={6Kknq(s2aoo^p
z@$;L>9c*QWDY=Zvys%rlZoLLwl-+jW0&@5BUc)3Dvb|B6tv_+kY436eerF0sd~s~B
zmi%EMf!PQQgZ2s+{1wl`t#`mt?c;WBMu|30!$p=0U8QlIH{hi6p^E|{C~J)Y6q^=H
zS=C*ES);DC0`e%cF2h>fUBg0{(C||=6w0Jf#Yw}-b2{RQ?l9Gz^c6+(r?f5j+P2_{
zzj&MhynYZ*(P8ue##u}Ck#rkV463<$bcwCXQU3@5W(E$fC49F`88>e;!!IMS&DDR}
zEBoiQLbM3K^9x-o2>V}_2!Tx#%C4I=&HM>Rz;E;Vb%cSwECTX?nYc^_qB<&6KlP7*
z`z2?}SMpT8QW9cBsx=oUof48^!g9dVrrpsXB60vpo6=%=3GRrcAHetWB>mCiyo8XF
z3J!z$xMj8Ge9gLFQ~8hAh4BJfD{c_4zBH|(@$VloTLZW$<(r{|LaH!EW&>Qe^&3ki
zP9!)H86|xI54&{q=OCaVet*_m%m$!EW=#C0&O=N8d3XQbJ^mMYX}mA@;YZ3X>EyBW
z5UCH~!L$CnPhc@tvrOhgRHf&pj)q?Hsxc(yOlgCD!_p9h@%jl~d3XdIoH_Xr#V4@T
zii@65%{i2)n0Ujk31we9IPSi=9QXt;K|B%`k$;uks~d9BNJUZe1Z_RwD~nb#N*JSo
zFab>DrYCj~&^EwVRr^@FNsAya2%J`I+m%xYk~di!`xd6zg$FM|FO1#sjKLRa%s9KH
zciM@<Z8bOH@!68Ns*;0;ISu3usz#k@WQOD>T$$GrVn9J3{Uktz8h7jdx@0{5cvQZB
z9ATQgBplX9`_=~I8bCe$(Vp}pDBuso%_hx9cEki=RQB%K*(hm7GFEUiJ7#CQ5%vOE
zoWasj!o|!+aohOI>&J^L65xOh<kVmUTuYP2WpiM4KWWz`n6=Y&dp|n@iO`<o^jfaI
zjBpMQH{Gw>gyN_&P=s@pZAuX$7~>gT0VQh@08gw9gN<c5W%u_<!mQ|PnLP#w)A}?)
z$SElUFRQ^ZOd+v27=;(vr@~@yxpu_SEa8M&51;8HtVPtP;K~CvDU7xSa?i@9=Meuy
z7qfC)y~#|h2u{TnOS{j+#$z+{GMYPpJ3FqjaIY!uqmj6g>M-*pI!pl9GB$X&Xtpf^
zB*}Z|)PiCVrai|oePZr)uL69psWvT#5Q4j`aahca+31^w`zcaNtF04J)Ov7$Jawj+
zS>fmylxKu8MGO>kdxD?hy0%4(Pme)nQ&iFH!FSvU0HG&HNFRi#qYG=lcCM9<AHOe-
zKThR#eA+vw_7)BI;2oyX&q;2cWzek>|Cg~VC7;`if(l~dL-6>Lzx%aP-7*N=JlAwx
zDn%I+ksW{#$*#Md!4i;d^RBje*m00PF-x7?v`3)}<*X1wtBw_nH74=9)SKfmrSYYm
z(J$7iBWHJj%;X^8obx02Is3I8r4K8L#z~#*6h#6Tu{NFtA+5SC;N}b$4qk2XMc^}g
zh(SjK`z1~k%7^&aV~qXKx}^)ktwzq+1jaZZ>q=ndkBS~F=rc*>cYAfV3%{KSrJ=Xv
z#++w$#pRB#JvhaG)IEO1z0z!n5dJz@+0xu8a!1Xau7&jyNmg)vasBnZRR)o(iC*j)
z;^y`*E*7t*i#;7Xua_J&t4(Yr>2PYRRN<zg_>%UR%n!$}2c>;5>NDhgn2&iC_x9TE
z%Tir?s`L~^77GapNz2G!uc)XfuBq9Xt|6yX<L>FnNiN&hz6rtmbyZ~O>(|9&V`If7
zB^$;+yKh`TP-CdAtu5~Ewtm*x5NlUH>0!Al3q!$`FxJRziI<w*`V71ByBF+Xm*+i6
znM;5US@!B7|71ebGb&SN=)zUn5kU-x9;De$7D!SxTolGi8h&6O&pOf8ka~=a;ykG`
zqraCqd&Ooubr!}r(SkiC2pafvZdS5}M^S>^trKmAM~+^RM?he#l@VIp&3;@>3>SfI
ziWt<Ps2h@|(EIk|!{w~wTPZ{R5?n<nnC`cpG9($GuwQN+9E})ZD>C!;DAI5_Irfb)
zS{$royT(oLQF5S$9>F9p%`R`^Rxaowexce5<!X1r)$Y8J!i~kl?6p##QOtMNC;+r2
zOSOX>%qdNNPMUck>@$&AIntElA1z37tqoFna}vQOqMm?t=vQuLJWnFI>Ju(Yps?Fs
zHdb=dBa@x5wHBnT>}LQ0wLGOWn*q1EKN7OEr>B7*&-mC7uJrZW=z?u<rAnhUNS=}{
zfIEV!zD%5et`eHBNxx!CKd>%JGn~TNk@96X1F`aObAAp2u`>Z;i>7T9-na~o)77{V
z<;iMjnarXy>x)_!oGTS^QW603Mh~b$pY68XkEi~XQs2e$vbov2si|q`{d>KyT@E6Y
zHe-N`<9;%Vaad)E)z#Id<>m27OG_8Gx0|0cuHnO85sgaP55~RJn~n06u|?lZG5X8+
z>CAg(tlPA;OS-zOGBOthQFbsG@$vDUw9#q~F8l#^mP%IHW--S*hfd+(RUw)9Pj#lt
z^H$EminxY847CQheSe^%*c0BmDBB-6;C=4gxwN99CB~m7L_<%XNNv=#IOFpxi2MXK
ziZfhejbRcBZNcqhB0wu)23G`s*z)gI(uoB!ajspCz$)=yBr(C|h}7<5oS6?z!ADQL
z%E;6-e%G#TXqrJF)9t3WX$1uW!66|f_wL=Zh*+a_f+C33mX$$AbU`Iwlw<X>MwVNf
zujco;XEr44>2mCt$!0!V_QyuFT?ecSj+Y>4ogG}R9v&>p%F6AZKa<P*laJHU5uT}U
zT&Kq`An-uhuYzggQXr7_=eJSpuc0E^moLZe=HxI{R#t9dWWBCK8A@Jne+}g*Pq-GI
z*>tIO(6`i*T`6{3PDI`rI?NFWy71aC5rqj<GbB{+f8&y8PG0Q0C$;Xq%jjkjFWjM}
z{(jb(?~rmFkniK$m-owAL6$sR?1Vy;Iutk`-9=&IR+uC~|BUM=jGmTxj{@hZ>Aba^
z{+|3G!nx?MeE@{ZCb18Nd05GTo>(J-*7iygI#IqGE8mj{I3GJ<i+~7>lDLFtjV~cG
zgHFMfaO5wt|Gn);bpQb8uA&~l1<=cJXn8p^0HEnOv69{3GTUWNIL_e%7g40bIE;O#
zO412|<ERYWL{UP%3qwxUaUMFl$;nH%`>EdwUGz8G1;ttwC=PkZR?oU7uemNcTkEQ>
zzeSp`!8&0otH^A&Y6=^Xv#3q{%KbJb=0$8;I!wM=q-WO!1)1r)s3f9bz*0fWn<q}6
zycikjw?0t*_SGE*gPNuv94=KyDcC{A=0HiR=~!BA=D?@PNt3<V%Od1c#$Lp?p~b}!
z8N|fJOQ17n3B=zl43{(vYn#1EEHr_lF)Uf$rimh}SWk9D4H5|Dj7#GbItTP;CIC8&
znaEkWoZ>{pq~Rl%@{SE<(rOsn@%SER7t_+X3lTYI5JYbS?8%Dbni@NY4p~D|rW55h
z``LBKO;+s^gfE;r<i5o0JQS9)oJ)tCbiUf0sKgDEBA=X^89V6->2Qz-;eKvg*>uPf
zqWe7%b|(jWDwENZ5EZP-g+Q=IbxqqX2(Y6kq_Wkg@7+8Ofr^~RKbD#Ls0DHtbel{X
z<kC7G1eQQh%Azr79Q=IQ^41~+GCA2*e@)V?9{?Q5y)Vm%^cN(s*$j;byhiItMM>XA
z$TN?B3gb1*2bTpF#{j9|-ZC}V@Q3Fr8r?@}Wh;vK#l*ylCr0~9pppsK9*DAheaCVG
zpVp)?$?MY#)<U~)zw>*jT{mq5fA3t4a7iX@^6*B3T&MkCc(@Oq>M*1iG(I~m2mI*=
zDLn*XVc~YDh{A{$$I(c<Zt3+PbL&EZ^)suOAn+;u1U$>JIOR^uqEC_PPZM3kH){GI
zy-!l8JXlv*gfC?Gt-?!h6!3RuP6E=rOZh>&{LK+4wFC&sb##Ro+=XD}r^|fFT3f(!
z+sD_psbEPbN~{7sP$(o}PBLxSu;Eh8#jm`YUs>x#s~ql>jE}pnhW4!`qjjcB!ljI}
z`?O3)MXKC~?#<z<bQgMWj(}iT0@0<vo(+#pI0*eBy#c)x6ns2{xeTB$xGRM44kX6Z
zglgN}tVwj$pOq|(a^bx^kufoIS$L%9t|%P0Fvze}|H6}t(P6QK0GmQK6simZ0Ba~J
zof4O(x`w-+c!(g*&P!5V^$$<DIBus5C#xSjiXzV5i%?x_IpP;^yfK1Fc1lZ7Fk&A*
zwukuA^yY~U!yBbd(Z9L;zyB)~gQ2qpjRe){A7qN`m0yV<(`mO)0C%4Chdckza%aFd
zM9vVYPJtI1u2i<`rhpK~_pbWA8uS;RdI1=x&yl>B>8T*qRdG(}9Ta-%0BQ(a#doet
z3U+t2FdM;DVu=rR&JPbXpbSfP91Ploz$Rt#PINT%tG0;hL;$*)WK<xy)>yBbz~Nmi
zcvvHK0UVpSS0Knj#;c?>tFrJk`-Th|6AB+czoxmRJp1GIu+_kYQV2@*(j9FO2d#Ve
z>qKA4sRgPAVkc6VR^qhGnvW!r+&RXff)zKOns>10T7(ZCetoCJ^CBJQ#&mlCqU*=<
zRohXxCp}>vb3SD0w}TvJ-nGZgYBhwn7uOYi9i5$cNCA^$Fv9y+b5%Zh+EV+IL<80k
z#d5ppYwwD8ekngjQehShvuDpPHun|a#vh-dc$+w9pV=XT7W4SdXszBn1b#CmnV6WD
z#)&Sxcoqf2fG1a7g-bs-F@U0K;}Z2GFmom!YfA(fgLAu;%iwyh79!>N!fryTl)e<l
zV|L2-a?YSwl{(jH&#!5eym_+^-Wyce@FbS?T*+EgL#(!B!P;ckTKm}!e+iLeJp1l^
zDVi+2lJ^_qg(`<Z$eC|?diufP2Ry!bZ})ZxBh$CCQTC(5Du|zNmRH1(Q9?Xeqi9eA
ztr!a7ap8$Fry(LwG@~bI1IZCYLH@feftf`C$qrjGIsT{}`of#v1>G^DrTQ?@0u<vw
zDYkbgOvg!b2ucpV{d~iwa_yw&qxfMGJ)wYQY22v88x;Km^C99|U{#e4F4j+-IyFzL
z-H7l<na7p+KjPwlL|g!v*Kf6c3L-}Gk=|Pb=ikzeAeKU@DZH`3)tt^QM2YUD?f{b9
zc8$E5Y6Ks)DzPX+vuZPpRkg?qpg90T%rhSkn^_&CkfdPJ?Gd>HsxarH<$jKtnJDED
z41J*H(7pwXX(6WKh5r^O|NhH69vyOYQIt08b&##e%xX3$++w8=aB9Xp4W`qz5R}~1
zJ)M~Yp0KysMH6K!cJlqhRZ6pC$Ue}-tqNI8KJex%F?a6V{ILU*F_)%{)h8DuN_uOO
z<X=k1UN9;tDfv3KEH+<rD}rj07EOCUIb`eUN!D~WbMdH&Si^#gUGu|-<${8OPD9O_
zwRpXlr*6&rj@oHyCAe}QAkRinlK>uOVpvayVNy+_p=#bxF^i4+eLDq^oHc%Z`)0ls
z{H}YsTp^B^9nbY26y4Uk0o0%c>?=?p4Pq{U*4EZmVOkLNBtPfr4cBmuu(99yr@!BY
zP9%*+qZkypXT`hLQU(HS=27!dFb~Z976u<5pJJ|EyV+Esa?*$6ksRHkRgI2CUmD*{
zoc2Z17B&ju)@N;saf#j>u`vH2-8;~;GoKy5db~dB5Y#eT7IU=4&Dpf)GcVC7ggoL0
zVGA;QP#z%sOwDqqLXc0lLiq^ug9l~4Vp{c?w99IN#W!~*6+eEwrP5{%)dQ>5>)SD$
z?6Ec;$!OCtv#=1-oXmajw&{*90}J=c)&5CL=S4vq*Kf&2P;@O}X!y>UaP=*~gD#~7
zp<wNhEs*&xk8yFr7jRAWMcnp4v3-{V(I&;bz=%rtB)`%P|EBAI|3wy1_CaJo)K*sT
zNC{dgBWU4rU{#c@G_X@!L3QOmwsAHJ80(|KL};<h-$va7M{_?Gi&EcD=L+!PGk1f<
zWCCHf<mwhAG%yA@4|r-uq{8IFhDL;F_|O6vI>u%!vmN%f<&-S<%&OH1+Dl-oNBG=M
zQUu_``MT0M1oll`0b`GuyD(uYz>LR8NSPXx<uK`nh*iZ)m~&X;h3daA@#nu_Ct)Zv
zbJI;-@HeV)`I@7kPq#1_mP@26*{^Q{p0A`K{tpKIi_^U~gt3)vw}gjG=^x0qGXK1j
z=^$B(+ZTMX_c#DwHOQ1s1c%F3@Ucp)IhMETKQIQ|aVqey^ks4aleShNb^!uvGp@wP
zIuQm|?1Xc@c=Z;+8~&KmR_XQi$Z9C-RRZb9gLAXf378XH!1K~4S5e@R%64}kx39Q-
zRVzOv<i^$IcP?RKy}koBu+78<0XU2=#D?Bn3Gu;`pWDIR=LQAOIBfGC-6JygFRRK0
zCt{OFVLum=P&&j6wzX)`nKRx{u1<2dd(X9YyuxYBsfz)WhlG?YySn9ZR{Uu-2ndzK
zBx#bnGAKCM2Z}|j;88+3DK<+8>WpBGs^7Onb7B)2?*m$LYjcaC1jORzB|A{d|E8a>
zQi81Vrs1Hy>_``o$uf7|JZ}ZN+S8d8N)j=y9sZeX`%MT$Tef{FfMEePSyf8se?g2t
zst(lo-waVa0(uBSoyB$^|JdccyHgY5sM&^+6m9MeqcA#3J^`wXo0j0E_zpr5KyT;1
z+6T(lOBpSff?zYhu8+W!&kG$anYZdRUWK6@d>qcpB%01qIN_pn@0+1KfnyFt(2{Py
zON63t6Ix!3tZ(9hV^1bGBPjGFopjp$`x_T7TEq<D1E3b9byN#%K0#~2Ij~r#Gs3N4
zZx}c$FNZun5e1yfYEg0)0``l-JJf~H41vAyd=n_kC6B^Pop4VS+c{rGJ@8)ZuetH#
zbIdQ5y9v-QYFF(4!0MU>LNZC^GfEK&%6b4kleg82WPn0OKs7>)36=pvpFC;G9)^=W
zIOna700d$;@M}eFnGlZap#3BZZF%$~ivCJ{e~-IGC%YX)WPllWfBMz1AAgEu2$I|K
zio9zFoXdaCKF^tNl~M{rMH3xAh{5Gvd~p<|x)?<p=BNyRl)-Mnq}+se>t<eu4okon
z*hc$u4Y23J#ry&UO_>*e#bmwA6$aR&A*Z+jMG9zwW9FkOsVWFaLw>*kK{aK)P60?0
zHg0BpgPVCQE0>@!SwApTB!l@ib>m<mx2PtBB)ZTuFjP_3NJJK{={{s-Os5w7Br0jr
zFxG^-P^d~9K91Yw1)^|0T!Y)XMqm?lFIEdBGinCL_fISLLxU)JZ&Sa9F_lsOv2Ils
z4UCcET<5-Gfuk?kz@M6{`e;rxk3qVHnn9NB&ec#V?c$P$#3Ri3L)@3Q)iIije@U7@
zP>7$d*@Ty?f~xA`1SJ{SV4DOt#{T+rVf*G>g=Etr)!k4ihk4(-G`o&xwJ)v(*l0b@
zL(wO}C{rP}K!*xnyg*GmUY2L^pqt=g0D^0V3=;yB0j4_$f!tEeQ2k{|iJ&ikvG9MO
z1k+*L84|pld*`is$+}JT<E!mVms^d8B`5smqC~P&sFIjB3y+U@yr*39o`T%u2ON$1
z$Tkv%bQHs@!kWKr%xX`Fc6f5E=IRGKDZ`AbvF~U5%>3?`3-8+rUe;&U5}$MJyVGRe
zSDRuEWo<Q%GMyXbjI1~WZ<_O<X7IMV-52eO!JWQA%mS;euaE37yid=Ik_T-zNT{~D
z2K9K3wV&O*m<imU9$r<-&sdTT(uMe)s@6TL+)tZaQFYPw*z&lK4k$>pWjk(*;`+hg
zwx~$XsY^^(aXD;ZYmMT6uLpkI&4*Fsfsd8n%|pqWr4j&}m{|qZQCZ+ay2|r30v(d(
z!`Rh_Wz^hYp)M<jy)*MkiXsec*tq}feULNkEKl$LgD$`OFaG(F2V-3f#{(#9fyYkV
zcq={gI&|1p7`wd7X(1L2A4&-$Bm_Ev7{maZbWw}A?Xcoj^~SiFcZbb|pO>3kvsE3K
zxhUH?K?J@}v<9OkTK$0?u@^VyTkGe|ybe7<0j#_lVxIxw2&`02|LOn=6kK6<9pw)#
z-9T(~#iqXffm!_u7Y$*kl|X8j5Ljb@tNE{>=rm73l~K$KIm+|`cbP;aA3=eFEq?ge
zdIq93$iL{b4+2r3unZ1=Ei{bx2p%vayHgoqcg%Pgj>tZ{P2yl<n1~?7xxUc@QMqGT
zCEIm<_%PB=L|BLMJvg9~L+Wn{rQv=MhlPbGK9emPAZSzaHKRj7da4>WA`qTV3=O(3
zOa)~OK}JT>-DW9GDjqu^Hrk9}%95(i(X#6H@Hymkg&(X``S9K;cmX8xuZ17@=;nr!
z??z(Bpb-3OyyHqcULAe)*YQr_s3!}FPgwUgyS<KmE2p0&ib8Mm@j5{B_wL=>K2o+~
z0dx(zQ^LTrJG8{eg%o<8(g$2MwZm}xS>At@GS>;GB-JdpGjICY+&{wn%^n@?tMkRv
zWh!2)fWbIQI9_8tny?M!N`NvTEGNLm@R8jb1R}!^ss~;&{el}DD0mMv7z$dvVePO{
zFanpwHRBqf$(@e98xeP7DOF{ZbJ$9`Z=TeTPshKyk<be?c)eRFATZA+)>(fw^i0cW
zESDp0XWIuJV6W|D4N<8hKf`%CT0Q;kQ54FxdE)_5p3wQ}%K8(Ocm{Y8RXvr5a$M>n
z4EL{034F-JoewG6whF#V-gH<nFl8l+5!PQF;|HNZ2_md&aos!Rae>YhOFCVxg+fyR
z=Go7TP+WyYH1K8!;QfYtA#UnbQSv;djbOHM>n_}cqC7{&|I-*fcznqi1ofD}Ejnwx
zMv5WSg0Tv*Cq8L_8&*QW%>%(UkOSv#8EK=02#Ei1OM4#_RUs0X{JUcmDlZVf^~+`y
zx3vx)t;FCLOsevR51xIhC^2@dzA%=5P2^=VEHol_M|S2*B=Nye`^&etSO6*dGg$9N
zKz_^w#^y<Ss!`+N<I4*do|)G<LAeHFFC111qCA2RfgNtYdHMftrjit4clGSD+RI?h
zHIW0_Gw+V#14DUzY_Ev`B8T~vhE7LAKia$aV)7vM%dd7Q5JmwZY8?A#&%6#Db`ch^
zJ?Siot$~kjx)DVbDCEQLhOPHz<b%pEQc>qJ^X{-@7@84WcbE<Krp6!q3_-gfT7%S&
zZhn+U>;)w#NuxGr=5^=^S@_|%^%Ds4U|p9#?odO40w)}P`<Bw6H87_u@@6c;?z};}
z;9|e6a0L&akY=BSBK4h?gKOr-`PYnk5>>fS@S&+@anH)%?avL5>)|qPVQQM<huhg*
zA;@SdiWZMQ1!Hvkq5?m|2q|N3{ad?v^;7ql+%FWBYFM;*@mOns#@3g&W`-CRL`-05
z@9gX}zd7sNGyRXra%<M`?tyj|L)mu<r0na{1e1;LNXK_Ew_a7qD(Wz7Zza~wY)k<F
zbrNYKU1cS}+<Bm_$J)~nJ(FLN6w@pY)S$X9&j&@Gflmf+TsdCMQ^<<9%?RVoXfPR;
zj9w~*!bvD32Mq7|*be<RLp-)rJAZSh{Y*L2Lg|f~S<TIz?fsdR^YNaK9%~-j2!nbU
zD8jfGY5wD#W|o#B@NRwj4BslB3lh7!bFFnfvc{nCkBcs!@xSzxA7s)u(2yjs15Mf9
zH|A^VD|cl)9edlfA+LG9c$}-ncw=25ckL~c=rY>}gU{Td-3nJ_z@#GGG(Ym45B>eI
zWrrD<w#sFi)oa!5;8C<(R4&eaLOeDF%+StxWhN;KHB0r#605wT5d=vXcVuKFBZL#1
zPl($|8KjYVOy+8nMB$3hKbMn0;3c#qAiX^g+LtiC*8-K;@r5*k`<V4Q9yx2P`<m^I
zCVa??Yok!#MzD*y&6jRkj{CO!BBc<D&^(4m7s<-q2e^trCVl3OLSgr2_?w;wHY|r-
z1fFL*+l7n(2Ffv*G}@pPMZ@jN%pO;i=By<ARrq?R?Cys!hu7Cm;diX$FC3=~z|b6V
z`yvC_n}ycH-P5A}PxjssoRg}*0^SNduco+HXXbV2u;VaxbVGvxaSY6_rt13=C@cBH
zBvJPBvYEhC%cQrnAYAJc++-8Ci+%{^B=@ex2}nv8s2C08H1byJdVnCatN=><T@^fN
zW`1?gJ42fUczj7cJ&BS#9U23ebNDRpY7UsKdGa<w2`K7D7|Oom&XKurSF%^lkmUF+
zAfsH1cY*{Jkbk3^Z|G3&C;=v23_I%B0}EaAno}SDjq&`<GyWSPLet<p#qGQ*`(cw;
zJLM6On`)0eTH;0agrLQ_P!8C>fi+va$_A#Rs`*M-Qfi^nOP8ht?YoL5b~^hi`O{%*
z0D-$jMrPhISZG0nqEF3(#j=n1IR+pS35xJ}k{+^^ZzLIGwRm?tm+U4(1YMs(Ql5H4
z!}+<yt-C@HklV`iI^OHJi?Wd)B!;Bis7>rJ>|YjzLcDF0khpKMsAcrlQE$8zbh2S)
z_vD@r4D^IA9Pk+T$>yyX!WH7I#S)JlR0|6K?@Riv6rfDPtxx<f`uQrOLQgm4KrKmx
zcx{r-Y1{xc$h6j44gIuAmL<k)zh4n9!WvEKgGN_+5Xs8*j&nto!>16d;Z`Eu42qmt
z>UXaKG|6VRi8-|CZtu;cv5DkWkWadqmfD#7sOAO==Tcw+fB7V%UEQsDP!s-W30^7w
z-GN|nirG=X^kxN56mLuDc^cl0`Cz$1-RU`tT}d}`=4p_eVg3t_m1}2~B2+WXCeBTa
z|M+-&JG2>JYH+WYI04><TCXFkP>gat5gA!oHaw>5)ho+|{QSvp^x%C?24eUdcCT+i
zQTqJA1t!_JZ*F)FaEWeuV+2A7&4i&9J&jV<aL&kTS-zRid4iS<3@dZQ^&I5@d}KXS
zLU?kbK=$Z^%v|qSZ8LnOD3g2?3pVrw?2aJFE2{x>B==?^lv<|Fg`skXv#mT~Z^Kp=
zLI`Ax%?EC#bb4eV762d1XFKyFK;gd$IS+XL+gt5+C@WnB>JJMjOXCG=(TX!+=L{5r
zOM+`K^g{UEV|#J+SCr`piu(H$qOR;)ELAB~b^r%rdR^6NfHp_(fXLtZq%#v9W(n{L
zJ5$l-4<Ha-OdUq>s)@qT3o(Vw(V&>vZEtO!c@=ts3V>kr{_7HAE_gxuE>_GzSciBK
z#@Z>-e~19HPv3TA48hA{NdjKJ&QE^}<eo{E*3U)Z8h^iv|3ALrV6{?rUOpe_ID+Jy
zp->ND5qsR1#kh4x;g4+0vu4dYQ_0ap2RW7IT~98wLkAH511sN&NLPUost@5@8<E1f
zre56s3d-Fo8tFV(T3Q-An7XGx2{%Avj!*Ptst#%qVwrCzEY>6U>6IZ>btAHKx;FRi
z;ek#(IYZEG(OWo#EO&2S5I^|b4ek9Sks9q%<`Jsbat%PWD9^jl*2i{r^Vx6ofPw=7
zT)2}Ie>>Lg|H_^EA4Jlz`sq&@piqj5Il{!n@V5LSNdcfGrwloiK3a6xEh1!ca^o35
zP}8|2fJ-7*sl=DDU%JW&5;8qL@F0mK3Q2V8!Xa9E`=T7jOo!G&6)=fa+o31r!Mo5H
zy@YKu|Fb)JCg<-^`fen~Z6}v0?`=ZR+cUW8bDiyQPcoj!3)1GPOQi!3A1^P{904bt
zJ4-&SA$V}@%S!y(KWgHL!f{yr48FaF_QdyRncX|-C$6vv%Ljfp@}ydEW-7-n!_a_$
zrVKV9b~R2PT?E!h?1q;j#kDK6Ij(O7t8%&e>uI*?;_G*Vj!xVcFLd!QCG9<)_BQxR
zS|kl@&8MqSN_A5*K(dr>vKhJ$%#VFR)i}c0!eHpuZLI@ZV7n;X7(}W^7YFcg?|5ha
z#~b~}8~uOH8;vK>9|X6VuNO31HXPi)=6fjy6slL^zB}?nq9pcc3-3VTr-9(Lo$Clh
zcW_ttHCHjsOpTE2$p|jRA|_}=X^(d*E{1+t%{1aI5uIC<J`k(65qX8>DB@*1M}3Bs
z#zH<mmPu&e#BmEccnv`>W5q14ry3+TtoG0&0jN;m8N^MZP+JKp`bn+3a(`0FpBht}
z$98$W+w=}}&H%2VqsJd?ay;~xDoN7Cu4zR0JbU)+AT&?+#@i72JDA%O_On3D?*7*{
z?g$zuuE~C9a=*1qzuWSZMqqINn|_|^4233de>$GTf^sR0PRnFguH@Xbd2<Q0D&T<Z
zmb$KGs+&$wRuEDW<BuB+C-ba}M+wJ!lSG4rq$SWJd|@FJ_xIVMt)j5|NV&qXf>x*_
zce-uYfeGn<36yCB(3b6O4L0%f@Q&<wC-4$z?K3QLRAwcn2U20X79<iNAbopxwB!6;
zrpKJ`Lc7%ckmma>U+w#~{rZDygj=T8ueDV?z%sq5Qoo<b_!P8aMTwAAC=j*aUbtk*
z0!Z{Se-tcWZe~X6)-B&ek^pTex#RZSnQbMiBn-_M$v!~FZ8A%9l+bI;4R?ETJnbYd
z$Df}VpUa{^=?l&TigNgv{*JyxK=>`V(cJK6Zroi^OFQTnA$Z|*GGJ(FPgB8v6v_c}
zv^`ZdsdfO2TRomA&TJ~~=1br>=yd}CcTLREab|+2%!Z-7r}ue_;HFtnc@+xSs$ovy
z0>+gP+g}@$XN-aq%i#?>iS8L2F&94i2<4j*aG2NuV*`cf>^Ki<oBp0|MHG_^_7tvl
zo?9KB8YI(cK`36|G>Fu(kMK@qe2-^s#5WEgJ!;^AMBBIdvb}*)3(0$zPaCKDGd|rH
z1Gl%aQ1>!5m~;N>Ta-r)9XLpdPRe=Q+GdyKMalLv69YVm(#zjOJq{leXPSTB$244r
zq;n1y&`JvymBIb({FD|1XtVn5g59;)Ylee8A-?vby7C~`foURBsG695($4OV?7x5W
zGAAVC+m$XoC5yXg<XG#o4;YWXNL1x=$RB=~25x7|G~Fx-N#ktE<lQ-P{O)fq)hzD6
z-N7)GJ<7LdxOMhFe1)MQC%n#{)ecvj<~gCtpZ30$X`80z=}VU`ZCkuzAWA^lrC;)u
zRz#(A;dsZv?kF2>8?$zIV#1_&2)ydlE1`r{o`C!Z@cEg!Lds9^h6O5|c#oW=)^Z#F
zBUvcGq&cZ~7tdndeYjLqKxI&nkO|gIc|xbk%#;iKo0loZbELE#svwu`6HZ!*rxKjc
z_dp>>2x*%#SI@<Nwj6)Bs_@w(W0MO>fB?5j8&M*)Ptd#==97nLGOJfF#06gX3eyU2
zN+lr$B7(C$R3Ffzuo2=S_WHtO9+ADYl};Wn>$WvX8r@!s_YH-VmwD#p7ZnaIu+Oz5
z8hJteD_~CkKLvaL0;%&@^1;w;Pb-<v0_}Bhmq%VvSKl%WDvx-dJ<G;TT+V|Vs(Qz7
zA|+@rzF%hsu8z427Wm?Bj-ss@3wGt6q{Eis33Ibij`@EsVGc<C6l+Q-UN|Hyofjpp
z&<tLHpMIdntieXTc<3m~Q!|ADgbWu+=_G{*K1`?|QTjJg_+jj!S7retu+WpTmk(#Y
zAIu(xX81oB@(0Qj<&HdrLOw1q_HD)x7j_)9j?o=WCNr-?Pq+YMcRuNnAqap=(^<1?
z9RiJG^{_h&jr-(l@YZ@E?6wNR?s#DxtA#f&gn{(b5E9#f@|f@xf-Mx!1=KOVz~dc@
z()m!tvn{S^G1aCMyJ4m1S&xx=XAHyc9z>rh9RYW1nX)O8^9diSw5LoBN_}x3TD=J+
z!9NllHTEt`2HCHJa8H&te2oHy7kJ{9aykp+QwTm#6b|J12JOkzg1kC!DdrNod>@^t
zO%E97`5Pt05_#GdaSjyE(@9ET_vh2u;<PB%PlBY|yHe=LY6bt+(YKCn=PUXPT1NA|
zJMyt#>)R(Nv$4LuzKryA-GRYD`TG~nf_K=*rd+Q_XlA^^io~I)t|`<=f6~*_)0&sh
z$HB-cLuR>u6sg)+9YaNBW!b^ik~};-R>zKg>02=Zy@=wu-v}&O60-Ko`8g<vz(`~w
zlga0kl9J>Woq2Na-nxe7=6PE?3<zGksCRO5_PdT9{dDxm%;-*0!lI(0GP1Mvv+vv~
zgHrIhp9viM*3se{?JubvMX{Y=3`LE;Br~8s7)6Q0hqC~vrEb5OKQpMOESKv`&7iDv
zlP=<L*!@tlH+S>zuK!2O;9{6tx!gh-q07jW_>*E>7?)LbS$R5zZW{SWy?mnGP|bcb
zg2xxXcPayblH15>&dgUgbp|~^yd=N3OA&|PCYia`t`8iTW~0-EG++U6Q(Wmt;UDmm
z7&mclrP01;0!SDP5pcS2Q$pES5QkTt1W>X_<<6Sfo}D_(a^{6`JMX(7JV{OsH)53`
z+x<cMA4%go#r;A0@1Xek<NrI-QwE{d+RI|KmoqW6++{M0KRNa|C<;S&Eh3|nx|zB8
zB}r-N@UE^d=?CtsG*Vj!KQT>8hj=0Kb=n{^dF|Q=P{K#X#|?Z+%z4jjvXWW&dEip3
zr^jHITS=y)kbBIu2=g<P=m~2DhbH(ct!-?|U_09R19dItu48t$_n_3A!+A?3i>9`A
z1T@vo$jBJ^=qsUO?yAq!h~#sE&5Zj|idt8Qi;G*?+4Tu8S@1Ea-H<VF!#8c?-`oAi
zUxu>rB!ry2JPGLd>gYwC)v7@5qc8~}%7C_=X8ZT|Ogbtllga%nw>a9`+P>^LU5qLc
z4q=IjiE^t}?Q(H(dD7Cd=e1)~UO@ruz*CEWqGN$8GJ|F&vozJqLcVO5<L8kJqRBD;
zINxu?0qY7gSMF9Q6OP8Dw}_g#hToJnM+EGjU;mXX`vaLjkYW8pO}`Q@e<1S*GNeB|
v@(+*v!y~6N+^6v350CuABmZ+AIkcdvrYv)0Rxd9d{8Lxkw(-__vy=Y?XwXX)

literal 0
HcmV?d00001

diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/13-job-split.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/13-job-split.svg
new file mode 100644
index 0000000000..37c38c727c
--- /dev/null
+++ b/docs/rfcs/images/036-bottom-most-gc-compaction/13-job-split.svg
@@ -0,0 +1,176 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg version="1.1" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns="http://www.w3.org/2000/svg" xmlns:xl="http://www.w3.org/1999/xlink" viewBox="210 271 870 514" width="870" height="514">
+  <defs/>
+  <g id="gc-compaction-split" stroke-dasharray="none" fill-opacity="1" stroke="none" fill="none" stroke-opacity="1">
+    <title>gc-compaction-split</title>
+    <rect fill="white" x="210" y="271" width="870" height="514"/>
+    <g id="gc-compaction-split_Layer_1">
+      <title>Layer 1</title>
+      <g id="Graphic_12">
+        <rect x="241" y="272" width="213" height="50.5" fill="white"/>
+        <rect x="241" y="272" width="213" height="50.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_11">
+        <rect x="468.72266" y="272" width="213" height="50.5" fill="white"/>
+        <rect x="468.72266" y="272" width="213" height="50.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_10">
+        <rect x="695.72266" y="272" width="213" height="50.5" fill="white"/>
+        <rect x="695.72266" y="272" width="213" height="50.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_9">
+        <rect x="241" y="337.3711" width="303.5" height="50.5" fill="white"/>
+        <rect x="241" y="337.3711" width="303.5" height="50.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_8">
+        <rect x="556.2617" y="337.3711" width="352.46094" height="50.5" fill="white"/>
+        <rect x="556.2617" y="337.3711" width="352.46094" height="50.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_7">
+        <rect x="241" y="402.7422" width="667.72266" height="50.5" fill="white"/>
+        <rect x="241" y="402.7422" width="667.72266" height="50.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_6">
+        <line x1="211" y1="355.5" x2="947.4961" y2="355.5" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_5">
+        <text transform="translate(952.4961 346.776)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">branch point</tspan>
+        </text>
+      </g>
+      <g id="Line_4">
+        <line x1="212" y1="438.5182" x2="948.4961" y2="438.5182" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_3">
+        <text transform="translate(953.4961 429.7942)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">last branch point</tspan>
+        </text>
+      </g>
+      <g id="Graphic_13">
+        <rect x="241" y="272" width="127.99101" height="181.24219" fill="#3a8eed" fill-opacity=".5"/>
+        <text transform="translate(246 353.3971)" fill="white">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="white" x="38.835502" y="15" xml:space="preserve">Job 1</tspan>
+        </text>
+      </g>
+      <g id="Graphic_57">
+        <rect x="359" y="647.96484" width="551.72266" height="50.5" fill="white"/>
+        <rect x="359" y="647.96484" width="551.72266" height="50.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_54">
+        <rect x="359" y="517.22266" width="96" height="50.5" fill="white"/>
+        <rect x="359" y="517.22266" width="96" height="50.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_53">
+        <rect x="469.72266" y="517.22266" width="213" height="50.5" fill="white"/>
+        <rect x="469.72266" y="517.22266" width="213" height="50.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_52">
+        <rect x="696.72266" y="517.22266" width="213" height="50.5" fill="white"/>
+        <rect x="696.72266" y="517.22266" width="213" height="50.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_51">
+        <rect x="359" y="582.59375" width="186.5" height="50.5" fill="white"/>
+        <rect x="359" y="582.59375" width="186.5" height="50.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_50">
+        <rect x="557.2617" y="582.59375" width="352.46094" height="50.5" fill="white"/>
+        <rect x="557.2617" y="582.59375" width="352.46094" height="50.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_49">
+        <line x1="212" y1="600.72266" x2="948.4961" y2="600.72266" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_48">
+        <text transform="translate(953.4961 591.99866)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">branch point</tspan>
+        </text>
+      </g>
+      <g id="Line_47">
+        <line x1="213" y1="683.74084" x2="949.4961" y2="683.74084" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_46">
+        <text transform="translate(954.4961 675.01685)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">last branch point</tspan>
+        </text>
+      </g>
+      <g id="Graphic_63">
+        <rect x="376.72525" y="272" width="127.99101" height="181.24219" fill="#3a8eed" fill-opacity=".5"/>
+        <text transform="translate(381.72525 353.3971)" fill="white">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="white" x="38.835502" y="15" xml:space="preserve">Job 2</tspan>
+        </text>
+      </g>
+      <g id="Graphic_64">
+        <rect x="511.39405" y="272" width="127.99101" height="181.24219" fill="#3a8eed" fill-opacity=".5"/>
+        <text transform="translate(516.39405 353.3971)" fill="white">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="white" x="38.835502" y="15" xml:space="preserve">Job 3</tspan>
+        </text>
+      </g>
+      <g id="Graphic_65">
+        <rect x="646.06285" y="272" width="127.99101" height="181.24219" fill="#3a8eed" fill-opacity=".5"/>
+        <text transform="translate(651.06285 353.3971)" fill="white">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="white" x="38.835502" y="15" xml:space="preserve">Job 4</tspan>
+        </text>
+      </g>
+      <g id="Graphic_66">
+        <rect x="780.73165" y="272" width="127.99101" height="181.24219" fill="#3a8eed" fill-opacity=".5"/>
+        <text transform="translate(785.73165 353.3971)" fill="white">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="white" x="38.835502" y="15" xml:space="preserve">Job 5</tspan>
+        </text>
+      </g>
+      <g id="Graphic_56">
+        <rect x="243.5" y="517.22266" width="125.49101" height="181.24219" fill="#ccc"/>
+        <rect x="243.5" y="517.22266" width="125.49101" height="181.24219" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_55">
+        <rect x="243.5" y="673.46484" width="125.49101" height="17.5" fill="#6b7ca5"/>
+        <rect x="243.5" y="673.46484" width="125.49101" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_68">
+        <rect x="379.22525" y="517.22266" width="125.49101" height="181.24219" fill="#ccc"/>
+        <rect x="379.22525" y="517.22266" width="125.49101" height="181.24219" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_67">
+        <rect x="379.22525" y="673.46484" width="125.49101" height="17.5" fill="#6b7ca5"/>
+        <rect x="379.22525" y="673.46484" width="125.49101" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_70">
+        <rect x="514.22525" y="517.22266" width="125.49101" height="181.24219" fill="#ccc"/>
+        <rect x="514.22525" y="517.22266" width="125.49101" height="181.24219" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_69">
+        <rect x="514.22525" y="673.46484" width="125.49101" height="17.5" fill="#6b7ca5"/>
+        <rect x="514.22525" y="673.46484" width="125.49101" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_72">
+        <rect x="649.22525" y="517.22266" width="125.49101" height="181.24219" fill="#ccc"/>
+        <rect x="649.22525" y="517.22266" width="125.49101" height="181.24219" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_71">
+        <rect x="649.22525" y="673.46484" width="125.49101" height="17.5" fill="#6b7ca5"/>
+        <rect x="649.22525" y="673.46484" width="125.49101" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_74">
+        <rect x="785.23165" y="517.22266" width="125.49101" height="181.24219" fill="#ccc"/>
+        <rect x="785.23165" y="517.22266" width="125.49101" height="181.24219" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_73">
+        <rect x="785.23165" y="673.46484" width="125.49101" height="17.5" fill="#6b7ca5"/>
+        <rect x="785.23165" y="673.46484" width="125.49101" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_78">
+        <rect x="241" y="731.3359" width="125.49101" height="27.26953" fill="#ccc"/>
+        <rect x="241" y="731.3359" width="125.49101" height="27.26953" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(246 735.7467)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="17.297502" y="15" xml:space="preserve">Delta Layer</tspan>
+        </text>
+      </g>
+      <g id="Graphic_79">
+        <rect x="241" y="766.759" width="125.49101" height="17.5" fill="#6b7ca5"/>
+        <rect x="241" y="766.759" width="125.49101" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(246 766.285)" fill="white">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="white" x="13.737502" y="15" xml:space="preserve">Image Layer</tspan>
+        </text>
+      </g>
+    </g>
+  </g>
+</svg>

From 68120cfa31b10eda7f807c74b6049f60d7400a45 Mon Sep 17 00:00:00 2001
From: a-masterov <72613290+a-masterov@users.noreply.github.com>
Date: Wed, 14 May 2025 15:19:53 +0200
Subject: [PATCH 42/65] Fix Cloud Extensions Regression (#11907)

## Problem
The regression test on extensions relied on the admin API to set the
default endpoint settings, which is not stable and requires admin
privileges. Specifically:
- The workflow was using `default_endpoint_settings` to configure
necessary PostgreSQL settings like `DateStyle`, `TimeZone`, and
`neon.allow_unstable_extensions`
- This approach was failing because the API endpoint for setting
`default_endpoint_settings` was changed (referenced in a comment as
issue #27108)
- The admin API requires special privileges.
## Summary of changes
We get rid of the admin API dependency and use ALTER DATABASE statements
instead:
**Removed the default_endpoint_settings mechanism:**
- Removed the default_endpoint_settings input parameter from the
neon-project-create action
- Removed the API call that was attempting to set these settings at the
project level
- Completely removed the default_endpoint_settings configuration from
the cloud-extensions workflow
**Added database-level settings:**
- Created a new `alter_db.sh` script that applies the same settings
directly to each test database
- Modified all extension test scripts to call this script after database
creation
---
 .../actions/neon-project-create/action.yml    | 20 -------------------
 .github/workflows/cloud-extensions.yml        | 15 +-------------
 docker-compose/ext-src/alter_db.sh            |  8 ++++++++
 .../ext-src/pg_graphql-src/regular-test.sh    |  1 +
 .../ext-src/pgrag-src/regular-test.sh         |  1 +
 docker-compose/ext-src/pgx_ulid-src/Makefile  |  1 +
 .../ext-src/plv8-src/regular-test.sh          |  1 +
 .../ext-src/rag_bge_small_en_v15-src/Makefile |  1 +
 .../rag_jina_reranker_v1_tiny_en-src/Makefile |  1 +
 .../ext-src/rum-src/regular-test.sh           |  1 +
 10 files changed, 16 insertions(+), 34 deletions(-)
 create mode 100755 docker-compose/ext-src/alter_db.sh

diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml
index a5b4104908..d7ff05be1a 100644
--- a/.github/actions/neon-project-create/action.yml
+++ b/.github/actions/neon-project-create/action.yml
@@ -49,10 +49,6 @@ inputs:
     description: 'A JSON object with project settings'
     required: false
     default: '{}'
-  default_endpoint_settings:
-    description: 'A JSON object with the default endpoint settings'
-    required: false
-    default: '{}'
 
 outputs:
   dsn:
@@ -139,21 +135,6 @@ runs:
             -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \
             -d "{\"scheduling\": \"Essential\"}"
         fi
-        # XXX
-        # This is a workaround for the default endpoint settings, which currently do not allow some settings in the public API.
-        # https://github.com/neondatabase/cloud/issues/27108
-        if [[ -n ${DEFAULT_ENDPOINT_SETTINGS} && ${DEFAULT_ENDPOINT_SETTINGS} != "{}" ]] ; then
-          PROJECT_DATA=$(curl -X GET \
-              "https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/projects/${project_id}" \
-              -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \
-              -d "{\"scheduling\": \"Essential\"}"
-          )
-          NEW_DEFAULT_ENDPOINT_SETTINGS=$(echo ${PROJECT_DATA} | jq -rc ".project.default_endpoint_settings + ${DEFAULT_ENDPOINT_SETTINGS}")
-          curl -X POST --fail \
-                "https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/projects/${project_id}/default_endpoint_settings" \
-                -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \
-                --data "${NEW_DEFAULT_ENDPOINT_SETTINGS}"
-        fi
         
 
       env:
@@ -171,4 +152,3 @@ runs:
         PSQL: ${{ inputs.psql_path }}
         LD_LIBRARY_PATH: ${{ inputs.libpq_lib_path }}
         PROJECT_SETTINGS: ${{ inputs.project_settings }}
-        DEFAULT_ENDPOINT_SETTINGS: ${{ inputs.default_endpoint_settings }}
diff --git a/.github/workflows/cloud-extensions.yml b/.github/workflows/cloud-extensions.yml
index 4114f0f9b4..25fe0877d9 100644
--- a/.github/workflows/cloud-extensions.yml
+++ b/.github/workflows/cloud-extensions.yml
@@ -35,7 +35,7 @@ jobs:
       matrix:
         pg-version: [16, 17]
 
-    runs-on: [ self-hosted, small ]
+    runs-on: us-east-2
     container:
       # We use the neon-test-extensions image here as it contains the source code for the extensions.
       image: ghcr.io/neondatabase/neon-test-extensions-v${{ matrix.pg-version }}:latest
@@ -71,20 +71,7 @@ jobs:
           region_id: ${{ inputs.region_id || 'aws-us-east-2' }}
           postgres_version: ${{ matrix.pg-version }}
           project_settings: ${{ steps.project-settings.outputs.settings }}
-          # We need these settings to get the expected output results.
-          # We cannot use the environment variables e.g. PGTZ due to
-          # https://github.com/neondatabase/neon/issues/1287
-          default_endpoint_settings: >
-            {
-              "pg_settings": {
-                "DateStyle": "Postgres,MDY",
-                "TimeZone": "America/Los_Angeles",
-                "compute_query_id": "off",
-                "neon.allow_unstable_extensions": "on"
-              }
-            }
           api_key: ${{ secrets.NEON_STAGING_API_KEY }}
-          admin_api_key: ${{ secrets.NEON_STAGING_ADMIN_API_KEY }}
 
       - name: Run the regression tests
         run: /run-tests.sh -r /ext-src
diff --git a/docker-compose/ext-src/alter_db.sh b/docker-compose/ext-src/alter_db.sh
new file mode 100755
index 0000000000..6df37e1c9b
--- /dev/null
+++ b/docker-compose/ext-src/alter_db.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+# We need these settings to get the expected output results.
+# We cannot use the environment variables e.g. PGTZ due to
+# https://github.com/neondatabase/neon/issues/1287
+export DATABASE=${1:-contrib_regression}
+psql -c "ALTER DATABASE ${DATABASE} SET neon.allow_unstable_extensions='on'" \
+     -c "ALTER DATABASE ${DATABASE} SET DateStyle='Postgres,MDY'" \
+     -c "ALTER DATABASE ${DATABASE} SET TimeZone='America/Los_Angeles'" \
diff --git a/docker-compose/ext-src/pg_graphql-src/regular-test.sh b/docker-compose/ext-src/pg_graphql-src/regular-test.sh
index 85e1ae057a..9e7d63b612 100755
--- a/docker-compose/ext-src/pg_graphql-src/regular-test.sh
+++ b/docker-compose/ext-src/pg_graphql-src/regular-test.sh
@@ -18,6 +18,7 @@ TESTS=${TESTS/row_level_security/}
 TESTS=${TESTS/sqli_connection/}
 dropdb --if-exist contrib_regression
 createdb contrib_regression
+. ../alter_db.sh
 psql -v ON_ERROR_STOP=1 -f test/fixtures.sql -d contrib_regression
 ${REGRESS} --use-existing --dbname=contrib_regression --inputdir=${TESTDIR} ${TESTS}
 
diff --git a/docker-compose/ext-src/pgrag-src/regular-test.sh b/docker-compose/ext-src/pgrag-src/regular-test.sh
index 6cb1b049a4..22eb7498fd 100755
--- a/docker-compose/ext-src/pgrag-src/regular-test.sh
+++ b/docker-compose/ext-src/pgrag-src/regular-test.sh
@@ -3,6 +3,7 @@ set -ex
 cd "$(dirname "${0}")"
 dropdb --if-exist contrib_regression
 createdb contrib_regression
+. ../alter_db.sh
 psql -d contrib_regression -c "CREATE EXTENSION vector" -c "CREATE EXTENSION rag"
 PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
 ${PG_REGRESS} --inputdir=./ --bindir='/usr/local/pgsql/bin'    --use-existing --load-extension=vector --load-extension=rag --dbname=contrib_regression basic_functions text_processing api_keys chunking_functions document_processing embedding_api_functions voyageai_functions
diff --git a/docker-compose/ext-src/pgx_ulid-src/Makefile b/docker-compose/ext-src/pgx_ulid-src/Makefile
index 6480c48441..00975e8c48 100644
--- a/docker-compose/ext-src/pgx_ulid-src/Makefile
+++ b/docker-compose/ext-src/pgx_ulid-src/Makefile
@@ -20,5 +20,6 @@ installcheck: regression-test
 regression-test:
 	dropdb --if-exists contrib_regression
 	createdb contrib_regression
+	../alter_db.sh
 	psql -d contrib_regression -c "CREATE EXTENSION $(EXTNAME)"
 	$(PG_REGRESS) --inputdir=. --outputdir=. --use-existing --dbname=contrib_regression $(REGRESS)
diff --git a/docker-compose/ext-src/plv8-src/regular-test.sh b/docker-compose/ext-src/plv8-src/regular-test.sh
index b10cc65e8a..d5224e341c 100755
--- a/docker-compose/ext-src/plv8-src/regular-test.sh
+++ b/docker-compose/ext-src/plv8-src/regular-test.sh
@@ -3,6 +3,7 @@ set -ex
 cd "$(dirname ${0})"
 dropdb --if-exist contrib_regression
 createdb contrib_regression
+. ../alter_db.sh
 PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
 REGRESS="$(make -n installcheck | awk '{print substr($0,index($0,"init-extension"));}')"
 REGRESS="${REGRESS/startup_perms/}"
diff --git a/docker-compose/ext-src/rag_bge_small_en_v15-src/Makefile b/docker-compose/ext-src/rag_bge_small_en_v15-src/Makefile
index ac87cc511b..de6bdd06c0 100644
--- a/docker-compose/ext-src/rag_bge_small_en_v15-src/Makefile
+++ b/docker-compose/ext-src/rag_bge_small_en_v15-src/Makefile
@@ -11,5 +11,6 @@ PG_REGRESS := $(dir $(PGXS))../../src/test/regress/pg_regress
 installcheck:
 	dropdb --if-exists contrib_regression
 	createdb contrib_regression
+	../alter_db.sh
 	psql -d contrib_regression -c "CREATE EXTENSION vector" -c "CREATE EXTENSION rag_bge_small_en_v15"
 	$(PG_REGRESS) --use-existing --dbname=contrib_regression $(REGRESS)
\ No newline at end of file
diff --git a/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/Makefile b/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/Makefile
index e81f94ef47..7adcad32f7 100644
--- a/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/Makefile
+++ b/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/Makefile
@@ -11,5 +11,6 @@ PG_REGRESS := $(dir $(PGXS))../../src/test/regress/pg_regress
 installcheck:
 	dropdb --if-exists contrib_regression
 	createdb contrib_regression
+	../alter_db.sh
 	psql -d contrib_regression -c "CREATE EXTENSION vector" -c "CREATE EXTENSION rag_jina_reranker_v1_tiny_en"
 	$(PG_REGRESS) --use-existing --dbname=contrib_regression $(REGRESS)
diff --git a/docker-compose/ext-src/rum-src/regular-test.sh b/docker-compose/ext-src/rum-src/regular-test.sh
index d1d45a36ef..815c1adb53 100755
--- a/docker-compose/ext-src/rum-src/regular-test.sh
+++ b/docker-compose/ext-src/rum-src/regular-test.sh
@@ -3,5 +3,6 @@ set -ex
 cd "$(dirname ${0})"
 dropdb --if-exist contrib_regression
 createdb contrib_regression
+. ../alter_db.sh
 PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
 ${PG_REGRESS} --inputdir=./ --bindir='/usr/local/pgsql/bin' --use-existing --dbname=contrib_regression rum rum_hash ruminv timestamp orderby orderby_hash altorder altorder_hash limits int2 int4 int8 float4 float8 money oid time timetz date interval macaddr inet cidr text varchar char bytea bit varbit numeric rum_weight expr array
\ No newline at end of file

From 32a12783fde3aeb246457ae79b18dc00f85f8896 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 14 May 2025 18:30:21 +0200
Subject: [PATCH 43/65] pageserver: batching & concurrent IO: update
 binary-built-in defaults; reduce CI matrix (#11923)

Use the current production config for batching & concurrent IO.

Remove the permutation testing for unit tests from CI.
(The pageserver unit test matrix takes ~10min for debug builds).

Drive-by-fix use of `if cfg!(test)` inside crate `pageserver_api`.
It is ineffective for early-enabling new defaults for pageserver unit
tests only.
The reason is that the `test` cfg is only set for the crate under test
but not its dependencies.
So, `cargo test -p pageserver` will build `pageserver_api` with
`cfg!(test) == false`.
Resort to checking for the `testing` feature flag instead, since all our
unit tests are run with `--features testing`.

refs
- `scattered-lsn` batching has been implemented and rolled out in all
envs, cf https://github.com/neondatabase/neon/issues/10765
- preliminary for https://github.com/neondatabase/neon/pull/10466
- epic https://github.com/neondatabase/neon/issues/9377
- epic https://github.com/neondatabase/neon/issues/9378
- drive-by fix
https://neondb.slack.com/archives/C0277TKAJCA/p1746821515504219
---
 .github/workflows/_build-and-test-locally.yml | 22 +++++++------------
 .github/workflows/build_and_test.yml          |  2 --
 libs/pageserver_api/src/config.rs             | 20 +++++------------
 libs/pageserver_api/src/models.rs             | 11 +---------
 libs/utils/src/tracing_span_assert.rs         |  4 ++--
 5 files changed, 17 insertions(+), 42 deletions(-)

diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml
index 7cede309f3..663afa2c8b 100644
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -279,18 +279,14 @@ jobs:
           # run all non-pageserver tests
           ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E '!package(pageserver)'
 
-          # run pageserver tests with different settings
-          for get_vectored_concurrent_io in sequential sidecar-task; do
-            for io_engine in std-fs tokio-epoll-uring ; do
-                for io_mode in buffered direct direct-rw ; do
-                  NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO=$get_vectored_concurrent_io \
-                  NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine \
-                  NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IO_MODE=$io_mode \
-                  ${cov_prefix} \
-                  cargo nextest run $CARGO_FLAGS $CARGO_FEATURES  -E 'package(pageserver)'
-              done
-            done
-          done
+          # run pageserver tests
+          # (When developing new pageserver features gated by config fields, we commonly make the rust
+          # unit tests sensitive to an environment variable NEON_PAGESERVER_UNIT_TEST_FEATURENAME.
+          # Then run the nextest invocation below for all relevant combinations. Singling out the
+          # pageserver tests from non-pageserver tests cuts down the time it takes for this CI step.)
+          NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=tokio-epoll-uring  \
+          ${cov_prefix} \
+          cargo nextest run $CARGO_FLAGS $CARGO_FEATURES  -E 'package(pageserver)'
 
           # Run separate tests for real S3
           export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
@@ -405,8 +401,6 @@ jobs:
           CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
           BUILD_TAG: ${{ inputs.build-tag }}
           PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
-          PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
-          PAGESERVER_VIRTUAL_FILE_IO_MODE: direct-rw
           USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }}
 
       # Temporary disable this step until we figure out why it's so flaky
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index e0995218f9..6b19f6ef01 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -323,8 +323,6 @@ jobs:
           PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
           TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
           PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
-          PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
-          PAGESERVER_VIRTUAL_FILE_IO_MODE: direct-rw
           SYNC_BETWEEN_TESTS: true
       # XXX: no coverage data handling here, since benchmarks are run on release builds,
       # while coverage is currently collected for the debug ones
diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index 5b0c13dd89..7e0bb7dc57 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -639,23 +639,15 @@ impl Default for ConfigToml {
             tenant_config: TenantConfigToml::default(),
             no_sync: None,
             wal_receiver_protocol: DEFAULT_WAL_RECEIVER_PROTOCOL,
-            page_service_pipelining: if !cfg!(test) {
-                PageServicePipeliningConfig::Serial
-            } else {
-                // Do not turn this into the default until scattered reads have been
-                // validated and rolled-out fully.
-                PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined {
+            page_service_pipelining: PageServicePipeliningConfig::Pipelined(
+                PageServicePipeliningConfigPipelined {
                     max_batch_size: NonZeroUsize::new(32).unwrap(),
                     execution: PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures,
                     batching: PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn,
-                })
-            },
-            get_vectored_concurrent_io: if !cfg!(test) {
-                GetVectoredConcurrentIo::Sequential
-            } else {
-                GetVectoredConcurrentIo::SidecarTask
-            },
-            enable_read_path_debugging: if cfg!(test) || cfg!(feature = "testing") {
+                },
+            ),
+            get_vectored_concurrent_io: GetVectoredConcurrentIo::SidecarTask,
+            enable_read_path_debugging: if cfg!(feature = "testing") {
                 Some(true)
             } else {
                 None
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 5fcdefba66..89d531d671 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -1803,7 +1803,6 @@ pub struct TopTenantShardsResponse {
 }
 
 pub mod virtual_file {
-    use std::sync::LazyLock;
 
     #[derive(
         Copy,
@@ -1851,15 +1850,7 @@ pub mod virtual_file {
 
     impl IoMode {
         pub fn preferred() -> Self {
-            // The default behavior when running Rust unit tests without any further
-            // flags is to use the newest behavior (DirectRw).
-            // The CI uses the environment variable to unit tests for all different modes.
-            // NB: the Python regression & perf tests have their own defaults management
-            // that writes pageserver.toml; they do not use this variable.
-            static ENV_OVERRIDE: LazyLock<Option<IoMode>> = LazyLock::new(|| {
-                utils::env::var_serde_json_string("NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IO_MODE")
-            });
-            ENV_OVERRIDE.unwrap_or(IoMode::DirectRw)
+            IoMode::DirectRw
         }
     }
 
diff --git a/libs/utils/src/tracing_span_assert.rs b/libs/utils/src/tracing_span_assert.rs
index 3d15e08400..857d98b644 100644
--- a/libs/utils/src/tracing_span_assert.rs
+++ b/libs/utils/src/tracing_span_assert.rs
@@ -127,12 +127,12 @@ macro_rules! __check_fields_present {
 
             match check_fields_present0($extractors) {
                 Ok(FoundEverything) => Ok(()),
-                Ok(Unconfigured) if cfg!(test) => {
+                Ok(Unconfigured) if cfg!(feature = "testing") => {
                     // allow unconfigured in tests
                     Ok(())
                 },
                 Ok(Unconfigured) => {
-                    panic!("utils::tracing_span_assert: outside of #[cfg(test)] expected tracing to be configured with tracing_error::ErrorLayer")
+                    panic!(r#"utils::tracing_span_assert: outside of #[cfg(feature = "testing")] expected tracing to be configured with tracing_error::ErrorLayer"#)
                 },
                 Err(missing) => Err(missing)
             }

From 48b870bc078bd2c450eb7b468e743b9c118549bf Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Thu, 15 May 2025 07:45:22 +0300
Subject: [PATCH 44/65] Use unlogged build in GIST for storing root page
 (#11892)

## Problem

See https://github.com/neondatabase/neon/issues/11891

The newly added assert is fired when the root page of a GiST index is
written to disk as part of a sorted build.

## Summary of changes

Wrap writing of root page in unlogged build.

https://github.com/neondatabase/postgres/pull/632
https://github.com/neondatabase/postgres/pull/633
https://github.com/neondatabase/postgres/pull/634

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 vendor/postgres-v14   | 2 +-
 vendor/postgres-v15   | 2 +-
 vendor/postgres-v16   | 2 +-
 vendor/revisions.json | 6 +++---
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index ead1e76bdc..4cca6f8083 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit ead1e76bdcb71ef87f52f0610bd7333247f75179
+Subproject commit 4cca6f8083483dda9e12eae292cf788d45bd561f
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 052df87d33..daa81cffcf 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 052df87d338dc30687d0c96f1a4d9b6cb4882b2e
+Subproject commit daa81cffcf063c54b29a9aabdb6604625f675ad0
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index bb5eee65ac..15710a76b7 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit bb5eee65ac753b5a66d255ec5fb4c0e33180e8fd
+Subproject commit 15710a76b7d07912110fcbbaf0c8ad6d7e5a9fbc
diff --git a/vendor/revisions.json b/vendor/revisions.json
index cf9f474e1a..0fc2d3996d 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -5,14 +5,14 @@
   ],
   "v16": [
     "16.9",
-    "bb5eee65ac753b5a66d255ec5fb4c0e33180e8fd"
+    "15710a76b7d07912110fcbbaf0c8ad6d7e5a9fbc"
   ],
   "v15": [
     "15.13",
-    "052df87d338dc30687d0c96f1a4d9b6cb4882b2e"
+    "daa81cffcf063c54b29a9aabdb6604625f675ad0"
   ],
   "v14": [
     "14.18",
-    "ead1e76bdcb71ef87f52f0610bd7333247f75179"
+    "4cca6f8083483dda9e12eae292cf788d45bd561f"
   ]
 }

From 9e5a41a3423782b1ab5f097e04583f38b78d9ba9 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Thu, 15 May 2025 15:02:16 +0800
Subject: [PATCH 45/65] fix(scrubber): `remote_storage` error causes layers to
 be deleted as orphans (#11924)

## Problem

close https://github.com/neondatabase/neon/issues/11159 ; we occasionally
see layer files that are still in use being wrongly deleted, as well as
errors in staging. This patch fixes that.

Example errors:

```
Timeline metadata errors: ["index_part.json contains a layer .... (shard 0000) that is not present in remote storage (layer_is_l0: false) with error: Failed to download a remote file: s3 head object\n\nCaused by:\n    0: dispatch failure\n    1: timeout\n    2: error trying to connect: HTTP connect timeout occurred after 3.1s\n
```

This error should not be reported: the file may well exist, but we
cannot tell whether it does because the head request itself failed.

## Summary of changes

Only generate "layer not present in remote storage" errors when
`head_object` fails with `DownloadError::NotFound`; any other failure is
logged as a warning instead.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 storage_scrubber/src/checks.rs | 43 +++++++++++++++++++++-------------
 1 file changed, 27 insertions(+), 16 deletions(-)

diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs
index 40f3523a7e..865f0908f9 100644
--- a/storage_scrubber/src/checks.rs
+++ b/storage_scrubber/src/checks.rs
@@ -13,7 +13,7 @@ use pageserver::tenant::remote_timeline_client::{
 };
 use pageserver::tenant::storage_layer::LayerName;
 use pageserver_api::shard::ShardIndex;
-use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath};
+use remote_storage::{DownloadError, GenericRemoteStorage, ListingObject, RemotePath};
 use tokio_util::sync::CancellationToken;
 use tracing::{info, warn};
 use utils::generation::Generation;
@@ -165,23 +165,34 @@ pub(crate) async fn branch_cleanup_and_check_errors(
                                 .head_object(&path, &CancellationToken::new())
                                 .await;
 
-                            if let Err(e) = response {
-                                // Object is not present.
-                                let is_l0 = LayerMap::is_l0(layer.key_range(), layer.is_delta());
+                            match response {
+                                Ok(_) => {}
+                                Err(DownloadError::NotFound) => {
+                                    // Object is not present.
+                                    let is_l0 =
+                                        LayerMap::is_l0(layer.key_range(), layer.is_delta());
 
-                                let msg = format!(
-                                    "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {}) with error: {}",
-                                    layer,
-                                    metadata.generation.get_suffix(),
-                                    metadata.shard,
-                                    is_l0,
-                                    e,
-                                );
+                                    let msg = format!(
+                                        "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {})",
+                                        layer,
+                                        metadata.generation.get_suffix(),
+                                        metadata.shard,
+                                        is_l0,
+                                    );
 
-                                if is_l0 || ignore_error {
-                                    result.warnings.push(msg);
-                                } else {
-                                    result.errors.push(msg);
+                                    if is_l0 || ignore_error {
+                                        result.warnings.push(msg);
+                                    } else {
+                                        result.errors.push(msg);
+                                    }
+                                }
+                                Err(e) => {
+                                    tracing::warn!(
+                                        "cannot check if the layer {}{} is present in remote storage (error: {})",
+                                        layer,
+                                        metadata.generation.get_suffix(),
+                                        e,
+                                    );
                                 }
                             }
                         }

From 42e4cf18c97dad427f882c04a70bd33a54503e26 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Thu, 15 May 2025 10:53:59 +0100
Subject: [PATCH 46/65] CI(neon_extra_builds): fix workflow syntax (#11932)

## Problem

```
Error when evaluating 'strategy' for job 'build-pgxn'. neondatabase/neon/.github/workflows/build-macos.yml@7907a9e2bf898f3d22b98d9d4d2c6ffc4d480fc3 (Line: 45, Col: 27): Matrix vector 'postgres-version' does not contain any values
```
See
https://github.com/neondatabase/neon/actions/runs/15039594216/job/42268015127?pr=11929

## Summary of changes
- Fix typo: `.chnages` -> `.changes`
- Ensure JSON is JSON by moving step output to env variable
---
 .github/workflows/neon_extra_builds.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml
index 9c504eb5bf..3427a0eb49 100644
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -63,8 +63,10 @@ jobs:
 
       - name: Filter out only v-string for build matrix
         id: postgres_changes
+        env:
+          CHANGES: ${{ steps.files_changed.outputs.changes }}
         run: |
-          v_strings_only_as_json_array=$(echo ${{ steps.files_changed.outputs.chnages }} | jq '.[]|select(test("v\\d+"))' | jq --slurp -c)
+          v_strings_only_as_json_array=$(echo ${CHANGES} | jq '.[]|select(test("v\\d+"))' | jq --slurp -c)
           echo "changes=${v_strings_only_as_json_array}" | tee -a "${GITHUB_OUTPUT}"
 
   check-macos-build:

From a703cd342b1f7f8faf5920cec8ef09902f94eaa8 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Thu, 15 May 2025 11:02:11 +0100
Subject: [PATCH 47/65] storage_controller: enforce generations in import
 upcalls (#11900)

## Problem

Import up-calls did not enforce the usage of the latest generation. The
import might have finished in a previous generation, but not in the
latest one. Hence, the controller might try to activate a timeline
before it is ready. In theory, that would be fine, but it's tricky to
reason about.

## Summary of Changes

The pageserver provides the current generation in the upcall to the storage
controller, and the latter validates the generation. If the generation is
stale, we return an error which stops progress of the import job. Note
that the import job will retry the upcall until the stale location is
detached.

I'll add some proper tests for this as part of the [checkpointing
PR](https://github.com/neondatabase/neon/pull/11862).

Closes https://github.com/neondatabase/neon/issues/11884
---
 libs/pageserver_api/src/upcall_api.rs         |   9 ++
 pageserver/src/controller_upcall_client.rs    |  22 +++-
 pageserver/src/deletion_queue.rs              |   2 +
 .../src/tenant/timeline/import_pgdata.rs      |   7 +-
 storage_controller/src/http.rs                |  12 +-
 storage_controller/src/service.rs             | 107 ++++++++++++++++--
 6 files changed, 142 insertions(+), 17 deletions(-)

diff --git a/libs/pageserver_api/src/upcall_api.rs b/libs/pageserver_api/src/upcall_api.rs
index 7ee63f9036..4dce5f7817 100644
--- a/libs/pageserver_api/src/upcall_api.rs
+++ b/libs/pageserver_api/src/upcall_api.rs
@@ -4,6 +4,7 @@
 //! See docs/rfcs/025-generation-numbers.md
 
 use serde::{Deserialize, Serialize};
+use utils::generation::Generation;
 use utils::id::{NodeId, TimelineId};
 
 use crate::controller_api::NodeRegisterRequest;
@@ -63,9 +64,17 @@ pub struct ValidateResponseTenant {
     pub valid: bool,
 }
 
+#[derive(Serialize, Deserialize)]
+pub struct TimelineImportStatusRequest {
+    pub tenant_shard_id: TenantShardId,
+    pub timeline_id: TimelineId,
+    pub generation: Generation,
+}
+
 #[derive(Serialize, Deserialize)]
 pub struct PutTimelineImportStatusRequest {
     pub tenant_shard_id: TenantShardId,
     pub timeline_id: TimelineId,
     pub status: ShardImportStatus,
+    pub generation: Generation,
 }
diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs
index 6d186b091a..779ef3e37d 100644
--- a/pageserver/src/controller_upcall_client.rs
+++ b/pageserver/src/controller_upcall_client.rs
@@ -7,7 +7,7 @@ use pageserver_api::models::ShardImportStatus;
 use pageserver_api::shard::TenantShardId;
 use pageserver_api::upcall_api::{
     PutTimelineImportStatusRequest, ReAttachRequest, ReAttachResponse, ReAttachResponseTenant,
-    ValidateRequest, ValidateRequestTenant, ValidateResponse,
+    TimelineImportStatusRequest, ValidateRequest, ValidateRequestTenant, ValidateResponse,
 };
 use reqwest::Certificate;
 use serde::Serialize;
@@ -51,12 +51,14 @@ pub trait StorageControllerUpcallApi {
         &self,
         tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
+        generation: Generation,
         status: ShardImportStatus,
     ) -> impl Future<Output = Result<(), RetryForeverError>> + Send;
     fn get_timeline_import_status(
         &self,
         tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
+        generation: Generation,
     ) -> impl Future<Output = Result<Option<ShardImportStatus>, RetryForeverError>> + Send;
 }
 
@@ -292,6 +294,7 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
         &self,
         tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
+        generation: Generation,
         status: ShardImportStatus,
     ) -> Result<(), RetryForeverError> {
         let url = self
@@ -302,6 +305,7 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
         let request = PutTimelineImportStatusRequest {
             tenant_shard_id,
             timeline_id,
+            generation,
             status,
         };
 
@@ -313,15 +317,27 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
         &self,
         tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
+        generation: Generation,
     ) -> Result<Option<ShardImportStatus>, RetryForeverError> {
         let url = self
             .base_url
-            .join(format!("timeline_import_status/{}/{}", tenant_shard_id, timeline_id).as_str())
+            .join("timeline_import_status")
             .expect("Failed to build path");
 
+        let request = TimelineImportStatusRequest {
+            tenant_shard_id,
+            timeline_id,
+            generation,
+        };
+
         Ok(backoff::retry(
             || async {
-                let response = self.http_client.get(url.clone()).send().await?;
+                let response = self
+                    .http_client
+                    .get(url.clone())
+                    .json(&request)
+                    .send()
+                    .await?;
 
                 if let Err(err) = response.error_for_status_ref() {
                     if matches!(err.status(), Some(reqwest::StatusCode::NOT_FOUND)) {
diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs
index 65b2de28cd..0bbad87c09 100644
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -793,6 +793,7 @@ mod test {
             &self,
             _tenant_shard_id: TenantShardId,
             _timeline_id: TimelineId,
+            _generation: Generation,
             _status: pageserver_api::models::ShardImportStatus,
         ) -> Result<(), RetryForeverError> {
             unimplemented!()
@@ -802,6 +803,7 @@ mod test {
             &self,
             _tenant_shard_id: TenantShardId,
             _timeline_id: TimelineId,
+            _generation: Generation,
         ) -> Result<Option<ShardImportStatus>, RetryForeverError> {
             unimplemented!()
         }
diff --git a/pageserver/src/tenant/timeline/import_pgdata.rs b/pageserver/src/tenant/timeline/import_pgdata.rs
index 53e15e5395..5fac9e0ce7 100644
--- a/pageserver/src/tenant/timeline/import_pgdata.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata.rs
@@ -48,7 +48,11 @@ pub async fn doit(
     let storcon_client = StorageControllerUpcallClient::new(timeline.conf, &cancel);
 
     let shard_status = storcon_client
-        .get_timeline_import_status(timeline.tenant_shard_id, timeline.timeline_id)
+        .get_timeline_import_status(
+            timeline.tenant_shard_id,
+            timeline.timeline_id,
+            timeline.generation,
+        )
         .await
         .map_err(|_err| anyhow::anyhow!("Shut down while getting timeline import status"))?;
 
@@ -175,6 +179,7 @@ pub async fn doit(
                 .put_timeline_import_status(
                     timeline.tenant_shard_id,
                     timeline.timeline_id,
+                    timeline.generation,
                     // TODO(vlad): What about import errors?
                     ShardImportStatus::Done,
                 )
diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index 8d459cab9c..02c02c0e7f 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -31,7 +31,7 @@ use pageserver_api::models::{
 };
 use pageserver_api::shard::TenantShardId;
 use pageserver_api::upcall_api::{
-    PutTimelineImportStatusRequest, ReAttachRequest, ValidateRequest,
+    PutTimelineImportStatusRequest, ReAttachRequest, TimelineImportStatusRequest, ValidateRequest,
 };
 use pageserver_client::{BlockUnblock, mgmt_api};
 use routerify::Middleware;
@@ -160,22 +160,22 @@ async fn handle_validate(req: Request<Body>) -> Result<Response<Body>, ApiError>
 async fn handle_get_timeline_import_status(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::GenerationsApi)?;
 
-    let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
-    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
-
-    let req = match maybe_forward(req).await {
+    let mut req = match maybe_forward(req).await {
         ForwardOutcome::Forwarded(res) => {
             return res;
         }
         ForwardOutcome::NotForwarded(req) => req,
     };
 
+    let get_req = json_request::<TimelineImportStatusRequest>(&mut req).await?;
+
     let state = get_state(&req);
+
     json_response(
         StatusCode::OK,
         state
             .service
-            .handle_timeline_shard_import_progress(tenant_shard_id, timeline_id)
+            .handle_timeline_shard_import_progress(get_req)
             .await?,
     )
 }
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 05430733c2..852005639a 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -47,7 +47,7 @@ use pageserver_api::shard::{
 };
 use pageserver_api::upcall_api::{
     PutTimelineImportStatusRequest, ReAttachRequest, ReAttachResponse, ReAttachResponseTenant,
-    ValidateRequest, ValidateResponse, ValidateResponseTenant,
+    TimelineImportStatusRequest, ValidateRequest, ValidateResponse, ValidateResponseTenant,
 };
 use pageserver_client::{BlockUnblock, mgmt_api};
 use reqwest::{Certificate, StatusCode};
@@ -194,6 +194,14 @@ pub(crate) enum LeadershipStatus {
     Candidate,
 }
 
+enum ShardGenerationValidity {
+    Valid,
+    Mismatched {
+        claimed: Generation,
+        actual: Option<Generation>,
+    },
+}
+
 pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128;
 pub const PRIORITY_RECONCILER_CONCURRENCY_DEFAULT: usize = 256;
 pub const SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT: usize = 32;
@@ -3909,19 +3917,36 @@ impl Service {
 
     pub(crate) async fn handle_timeline_shard_import_progress(
         self: &Arc<Self>,
-        tenant_shard_id: TenantShardId,
-        timeline_id: TimelineId,
+        req: TimelineImportStatusRequest,
     ) -> Result<ShardImportStatus, ApiError> {
+        let validity = self
+            .validate_shard_generation(req.tenant_shard_id, req.generation)
+            .await?;
+        match validity {
+            ShardGenerationValidity::Valid => {
+                // fallthrough
+            }
+            ShardGenerationValidity::Mismatched { claimed, actual } => {
+                tracing::info!(
+                    claimed=?claimed.into(),
+                    actual=?actual.and_then(|g| g.into()),
+                    "Rejecting import progress fetch from stale generation"
+                );
+
+                return Err(ApiError::BadRequest(anyhow::anyhow!("Invalid generation")));
+            }
+        }
+
         let maybe_import = self
             .persistence
-            .get_timeline_import(tenant_shard_id.tenant_id, timeline_id)
+            .get_timeline_import(req.tenant_shard_id.tenant_id, req.timeline_id)
             .await?;
 
         let import = maybe_import.ok_or_else(|| {
             ApiError::NotFound(
                 format!(
                     "import for {}/{} not found",
-                    tenant_shard_id.tenant_id, timeline_id
+                    req.tenant_shard_id.tenant_id, req.timeline_id
                 )
                 .into(),
             )
@@ -3930,11 +3955,11 @@ impl Service {
         import
             .shard_statuses
             .0
-            .get(&tenant_shard_id.to_index())
+            .get(&req.tenant_shard_id.to_index())
             .cloned()
             .ok_or_else(|| {
                 ApiError::NotFound(
-                    format!("shard {} not found", tenant_shard_id.shard_slug()).into(),
+                    format!("shard {} not found", req.tenant_shard_id.shard_slug()).into(),
                 )
             })
     }
@@ -3943,6 +3968,24 @@ impl Service {
         self: &Arc<Self>,
         req: PutTimelineImportStatusRequest,
     ) -> Result<(), ApiError> {
+        let validity = self
+            .validate_shard_generation(req.tenant_shard_id, req.generation)
+            .await?;
+        match validity {
+            ShardGenerationValidity::Valid => {
+                // fallthrough
+            }
+            ShardGenerationValidity::Mismatched { claimed, actual } => {
+                tracing::info!(
+                    claimed=?claimed.into(),
+                    actual=?actual.and_then(|g| g.into()),
+                    "Rejecting import progress update from stale generation"
+                );
+
+                return Err(ApiError::PreconditionFailed("Invalid generation".into()));
+            }
+        }
+
         let res = self
             .persistence
             .update_timeline_import(req.tenant_shard_id, req.timeline_id, req.status)
@@ -3977,6 +4020,56 @@ impl Service {
         Ok(())
     }
 
+    /// Check that a provided generation for some tenant shard is the most recent one.
+    ///
+    /// Validate with the in-mem state first, and, if that passes, validate with the
+    /// database state which is authoritative.
+    async fn validate_shard_generation(
+        self: &Arc<Self>,
+        tenant_shard_id: TenantShardId,
+        generation: Generation,
+    ) -> Result<ShardGenerationValidity, ApiError> {
+        {
+            let locked = self.inner.read().unwrap();
+            let tenant_shard =
+                locked
+                    .tenants
+                    .get(&tenant_shard_id)
+                    .ok_or(ApiError::InternalServerError(anyhow::anyhow!(
+                        "{} shard not found",
+                        tenant_shard_id
+                    )))?;
+
+            if tenant_shard.generation != Some(generation) {
+                return Ok(ShardGenerationValidity::Mismatched {
+                    claimed: generation,
+                    actual: tenant_shard.generation,
+                });
+            }
+        }
+
+        let mut db_generations = self
+            .persistence
+            .shard_generations(std::iter::once(&tenant_shard_id))
+            .await?;
+        let (_tid, db_generation) =
+            db_generations
+                .pop()
+                .ok_or(ApiError::InternalServerError(anyhow::anyhow!(
+                    "{} shard not found",
+                    tenant_shard_id
+                )))?;
+
+        if db_generation != Some(generation) {
+            return Ok(ShardGenerationValidity::Mismatched {
+                claimed: generation,
+                actual: db_generation,
+            });
+        }
+
+        Ok(ShardGenerationValidity::Valid)
+    }
+
     /// Finalize the import of a timeline
     ///
     /// This method should be called once all shards have reported that the import is complete.

From 2621ce2daf2a49408f54a687e9e691b87f3477d0 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Thu, 15 May 2025 14:18:22 +0100
Subject: [PATCH 48/65] pageserver: checkpoint import progress in the storage
 controller (#11862)

## Problem

Timeline imports do not have progress checkpointing. Any time the
tenant is shut down, all progress is lost and the import restarts
from the beginning when the tenant is re-attached.

## Summary of changes

This PR adds progress checkpointing.


### Preliminaries

The **unit of work** is a `ChunkProcessingJob`. Each
`ChunkProcessingJob` deals with the import for a set of key ranges. The
job split is done by using an estimation of how many pages each job will
produce.

The planning stage must be **pure**: given a fixed set of contents in
the import bucket, it will always yield the same plan. This property is
enforced by checking that the hash of the plan is identical when
resuming from a checkpoint.

The storage controller tracks the progress of each shard in the import
in the database in the form of the **latest job** that has completed.

### Flow

This is the high level flow for the happy path:
1. On the first run of the import task, the import task queries storcon
for the progress and sees that none is recorded.
2. Execute the preparatory stage of the import
3. Import jobs start running concurrently in a `FuturesOrdered`. Every
time the checkpointing threshold of jobs has been reached, notify the
storage controller.
4. Tenant is detached and re-attached
5. Import task starts up again and gets the latest progress checkpoint
from the storage controller in the form of a job index.
6. The plan is computed again and we check that the hash matches with
the original plan.
7. Jobs are spawned from where the previous import task left off. Note
that progress is only reported each time the checkpointing threshold is
reached, not after the completion of each job, so some jobs might run
twice.

Closes https://github.com/neondatabase/neon/issues/11568
Closes https://github.com/neondatabase/neon/issues/11664
---
 Cargo.lock                                    |   1 +
 libs/pageserver_api/src/config.rs             |   2 +
 libs/pageserver_api/src/models.rs             |  15 +-
 pageserver/Cargo.toml                         |   1 +
 .../src/tenant/timeline/import_pgdata.rs      | 270 +++++++++++-------
 .../src/tenant/timeline/import_pgdata/flow.rs | 188 ++++++++++--
 storage_controller/src/service.rs             |   2 +-
 storage_controller/src/timeline_import.rs     |   9 +-
 test_runner/fixtures/neon_fixtures.py         |   6 +
 9 files changed, 357 insertions(+), 137 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 6df5d4a71e..f075b45e49 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4331,6 +4331,7 @@ dependencies = [
  "toml_edit",
  "tracing",
  "tracing-utils",
+ "twox-hash",
  "url",
  "utils",
  "uuid",
diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index 7e0bb7dc57..f2ba50a86f 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -305,6 +305,7 @@ impl From<OtelExporterProtocol> for tracing_utils::Protocol {
 pub struct TimelineImportConfig {
     pub import_job_concurrency: NonZeroUsize,
     pub import_job_soft_size_limit: NonZeroUsize,
+    pub import_job_checkpoint_threshold: NonZeroUsize,
 }
 
 pub mod statvfs {
@@ -661,6 +662,7 @@ impl Default for ConfigToml {
             timeline_import_config: TimelineImportConfig {
                 import_job_concurrency: NonZeroUsize::new(128).unwrap(),
                 import_job_soft_size_limit: NonZeroUsize::new(1024 * 1024 * 1024).unwrap(),
+                import_job_checkpoint_threshold: NonZeroUsize::new(128).unwrap(),
             },
         }
     }
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 89d531d671..58b8d80c0a 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -336,14 +336,25 @@ impl TimelineCreateRequest {
 
 #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
 pub enum ShardImportStatus {
-    InProgress,
+    InProgress(Option<ShardImportProgress>),
     Done,
     Error(String),
 }
+
+#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
+pub struct ShardImportProgress {
+    /// Total number of jobs in the import plan
+    pub jobs: usize,
+    /// Number of jobs completed
+    pub completed: usize,
+    /// Hash of the plan
+    pub import_plan_hash: u64,
+}
+
 impl ShardImportStatus {
     pub fn is_terminal(&self) -> bool {
         match self {
-            ShardImportStatus::InProgress => false,
+            ShardImportStatus::InProgress(_) => false,
             ShardImportStatus::Done | ShardImportStatus::Error(_) => true,
         }
     }
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index 8abd504922..b7b3e0eaf1 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -96,6 +96,7 @@ strum.workspace = true
 strum_macros.workspace = true
 wal_decoder.workspace = true
 smallvec.workspace = true
+twox-hash.workspace = true
 
 [target.'cfg(target_os = "linux")'.dependencies]
 procfs.workspace = true
diff --git a/pageserver/src/tenant/timeline/import_pgdata.rs b/pageserver/src/tenant/timeline/import_pgdata.rs
index 5fac9e0ce7..602b20df97 100644
--- a/pageserver/src/tenant/timeline/import_pgdata.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata.rs
@@ -1,6 +1,7 @@
 use std::sync::Arc;
 
 use anyhow::{Context, bail};
+use importbucket_client::{ControlFile, RemoteStorageWrapper};
 use pageserver_api::models::ShardImportStatus;
 use remote_storage::RemotePath;
 use tokio::task::JoinHandle;
@@ -57,115 +58,40 @@ pub async fn doit(
         .map_err(|_err| anyhow::anyhow!("Shut down while getting timeline import status"))?;
 
     info!(?shard_status, "peeking shard status");
-    match shard_status {
-        None | Some(ShardImportStatus::InProgress) => {
-            // TODO: checkpoint the progress into the IndexPart instead of restarting
-            // from the beginning.
-
-            //
-            // Wipe the slate clean - the flow does not allow resuming.
-            // We can implement resuming in the future by checkpointing the progress into the IndexPart.
-            //
-            info!("wipe the slate clean");
-            {
-                // TODO: do we need to hold GC lock for this?
-                let mut guard = timeline.layers.write().await;
-                assert!(
-                    guard.layer_map()?.open_layer.is_none(),
-                    "while importing, there should be no in-memory layer" // this just seems like a good place to assert it
-                );
-                let all_layers_keys = guard.all_persistent_layers();
-                let all_layers: Vec<_> = all_layers_keys
-                    .iter()
-                    .map(|key| guard.get_from_key(key))
-                    .collect();
-                let open = guard.open_mut().context("open_mut")?;
-
-                timeline.remote_client.schedule_gc_update(&all_layers)?;
-                open.finish_gc_timeline(&all_layers);
-            }
-
-            //
-            // Wait for pgdata to finish uploading
-            //
-            info!("wait for pgdata to reach status 'done'");
+    match shard_status.unwrap_or(ShardImportStatus::InProgress(None)) {
+        ShardImportStatus::InProgress(maybe_progress) => {
             let storage =
                 importbucket_client::new(timeline.conf, &location, cancel.clone()).await?;
-            let status_prefix = RemotePath::from_string("status").unwrap();
-            let pgdata_status_key = status_prefix.join("pgdata");
-            loop {
-                let res = async {
-                    let pgdata_status: Option<importbucket_format::PgdataStatus> = storage
-                        .get_json(&pgdata_status_key)
-                        .await
-                        .context("get pgdata status")?;
-                    info!(?pgdata_status, "peeking pgdata status");
-                    if pgdata_status.map(|st| st.done).unwrap_or(false) {
-                        Ok(())
-                    } else {
-                        Err(anyhow::anyhow!("pgdata not done yet"))
-                    }
-                }
-                .await;
-                match res {
-                    Ok(_) => break,
-                    Err(err) => {
-                        info!(?err, "indefinitely waiting for pgdata to finish");
-                        if tokio::time::timeout(
-                            std::time::Duration::from_secs(10),
-                            cancel.cancelled(),
-                        )
-                        .await
-                        .is_ok()
-                        {
-                            bail!("cancelled while waiting for pgdata");
-                        }
-                    }
-                }
-            }
 
-            //
-            // Do the import
-            //
-            info!("do the import");
-            let control_file = storage.get_control_file().await?;
-            let base_lsn = control_file.base_lsn();
+            let control_file_res = if maybe_progress.is_none() {
+                // Only prepare the import once when there's no progress.
+                prepare_import(timeline, storage.clone(), &cancel).await
+            } else {
+                storage.get_control_file().await
+            };
 
-            info!("update TimelineMetadata based on LSNs from control file");
-            {
-                let pg_version = control_file.pg_version();
-                let _ctx: &RequestContext = ctx;
-                async move {
-                    // FIXME: The 'disk_consistent_lsn' should be the LSN at the *end* of the
-                    // checkpoint record, and prev_record_lsn should point to its beginning.
-                    // We should read the real end of the record from the WAL, but here we
-                    // just fake it.
-                    let disk_consistent_lsn = Lsn(base_lsn.0 + 8);
-                    let prev_record_lsn = base_lsn;
-                    let metadata = TimelineMetadata::new(
-                        disk_consistent_lsn,
-                        Some(prev_record_lsn),
-                        None,     // no ancestor
-                        Lsn(0),   // no ancestor lsn
-                        base_lsn, // latest_gc_cutoff_lsn
-                        base_lsn, // initdb_lsn
-                        pg_version,
+            let control_file = match control_file_res {
+                Ok(cf) => cf,
+                Err(err) => {
+                    return Err(
+                        terminate_flow_with_error(timeline, err, &storcon_client, &cancel).await,
                     );
-
-                    let _start_lsn = disk_consistent_lsn + 1;
-
-                    timeline
-                        .remote_client
-                        .schedule_index_upload_for_full_metadata_update(&metadata)?;
-
-                    timeline.remote_client.wait_completion().await?;
-
-                    anyhow::Ok(())
                 }
-            }
-            .await?;
+            };
 
-            flow::run(timeline.clone(), control_file, storage.clone(), ctx).await?;
+            let res = flow::run(
+                timeline.clone(),
+                control_file,
+                storage.clone(),
+                maybe_progress,
+                ctx,
+            )
+            .await;
+            if let Err(err) = res {
+                return Err(
+                    terminate_flow_with_error(timeline, err, &storcon_client, &cancel).await,
+                );
+            }
 
             // Communicate that shard is done.
             // Ensure at-least-once delivery of the upcall to storage controller
@@ -180,7 +106,6 @@ pub async fn doit(
                     timeline.tenant_shard_id,
                     timeline.timeline_id,
                     timeline.generation,
-                    // TODO(vlad): What about import errors?
                     ShardImportStatus::Done,
                 )
                 .await
@@ -188,16 +113,151 @@ pub async fn doit(
                     anyhow::anyhow!("Shut down while putting timeline import status")
                 })?;
         }
-        Some(ShardImportStatus::Error(err)) => {
+        ShardImportStatus::Error(err) => {
             info!(
                 "shard status indicates that the shard is done (error), skipping import {}",
                 err
             );
         }
-        Some(ShardImportStatus::Done) => {
+        ShardImportStatus::Done => {
             info!("shard status indicates that the shard is done (success), skipping import");
         }
     }
 
     Ok(())
 }
+
+async fn prepare_import(
+    timeline: &Arc<Timeline>,
+    storage: RemoteStorageWrapper,
+    cancel: &CancellationToken,
+) -> anyhow::Result<ControlFile> {
+    // Wipe the slate clean before starting the import as a precaution.
+    // This method is only called when there's no recorded checkpoint for the import
+    // in the storage controller.
+    //
+    // Note that this is split-brain safe (two imports for same timeline shards running in
+    // different generations) because we go through the usual deletion path, including deletion queue.
+    info!("wipe the slate clean");
+    {
+        // TODO: do we need to hold GC lock for this?
+        let mut guard = timeline.layers.write().await;
+        assert!(
+            guard.layer_map()?.open_layer.is_none(),
+            "while importing, there should be no in-memory layer" // this just seems like a good place to assert it
+        );
+        let all_layers_keys = guard.all_persistent_layers();
+        let all_layers: Vec<_> = all_layers_keys
+            .iter()
+            .map(|key| guard.get_from_key(key))
+            .collect();
+        let open = guard.open_mut().context("open_mut")?;
+
+        timeline.remote_client.schedule_gc_update(&all_layers)?;
+        open.finish_gc_timeline(&all_layers);
+    }
+
+    //
+    // Wait for pgdata to finish uploading
+    //
+    info!("wait for pgdata to reach status 'done'");
+    let status_prefix = RemotePath::from_string("status").unwrap();
+    let pgdata_status_key = status_prefix.join("pgdata");
+    loop {
+        let res = async {
+            let pgdata_status: Option<importbucket_format::PgdataStatus> = storage
+                .get_json(&pgdata_status_key)
+                .await
+                .context("get pgdata status")?;
+            info!(?pgdata_status, "peeking pgdata status");
+            if pgdata_status.map(|st| st.done).unwrap_or(false) {
+                Ok(())
+            } else {
+                Err(anyhow::anyhow!("pgdata not done yet"))
+            }
+        }
+        .await;
+        match res {
+            Ok(_) => break,
+            Err(err) => {
+                info!(?err, "indefinitely waiting for pgdata to finish");
+                if tokio::time::timeout(std::time::Duration::from_secs(10), cancel.cancelled())
+                    .await
+                    .is_ok()
+                {
+                    bail!("cancelled while waiting for pgdata");
+                }
+            }
+        }
+    }
+
+    let control_file = storage.get_control_file().await?;
+    let base_lsn = control_file.base_lsn();
+
+    info!("update TimelineMetadata based on LSNs from control file");
+    {
+        let pg_version = control_file.pg_version();
+        async move {
+            // FIXME: The 'disk_consistent_lsn' should be the LSN at the *end* of the
+            // checkpoint record, and prev_record_lsn should point to its beginning.
+            // We should read the real end of the record from the WAL, but here we
+            // just fake it.
+            let disk_consistent_lsn = Lsn(base_lsn.0 + 8);
+            let prev_record_lsn = base_lsn;
+            let metadata = TimelineMetadata::new(
+                disk_consistent_lsn,
+                Some(prev_record_lsn),
+                None,     // no ancestor
+                Lsn(0),   // no ancestor lsn
+                base_lsn, // latest_gc_cutoff_lsn
+                base_lsn, // initdb_lsn
+                pg_version,
+            );
+
+            let _start_lsn = disk_consistent_lsn + 1;
+
+            timeline
+                .remote_client
+                .schedule_index_upload_for_full_metadata_update(&metadata)?;
+
+            timeline.remote_client.wait_completion().await?;
+
+            anyhow::Ok(())
+        }
+    }
+    .await?;
+
+    Ok(control_file)
+}
+
+async fn terminate_flow_with_error(
+    timeline: &Arc<Timeline>,
+    error: anyhow::Error,
+    storcon_client: &StorageControllerUpcallClient,
+    cancel: &CancellationToken,
+) -> anyhow::Error {
+    // The import task is a aborted on tenant shutdown, so in principle, it should
+    // never be cancelled. To be on the safe side, check the cancellation tokens
+    // before marking the import as failed.
+    if !(cancel.is_cancelled() || timeline.cancel.is_cancelled()) {
+        let notify_res = storcon_client
+            .put_timeline_import_status(
+                timeline.tenant_shard_id,
+                timeline.timeline_id,
+                timeline.generation,
+                ShardImportStatus::Error(format!("{error:#}")),
+            )
+            .await;
+
+        if let Err(_notify_error) = notify_res {
+            // The [`StorageControllerUpcallClient::put_timeline_import_status`] retries
+            // forever internally, so errors returned by it can only be due to cancellation.
+            info!("failed to notify storcon about permanent import error");
+        }
+
+        // Will be logged by [`Tenant::create_timeline_import_pgdata_task`]
+        error
+    } else {
+        anyhow::anyhow!("Import task cancelled")
+    }
+}
diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs
index 5b9c8ec5b5..c8c3bdcdfb 100644
--- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs
@@ -29,10 +29,11 @@
 //! - version-specific CheckPointData (=> pgv abstraction, already exists for regular walingest)
 
 use std::collections::HashSet;
+use std::hash::{Hash, Hasher};
 use std::ops::Range;
 use std::sync::Arc;
 
-use anyhow::{bail, ensure};
+use anyhow::ensure;
 use bytes::Bytes;
 use futures::stream::FuturesOrdered;
 use itertools::Itertools;
@@ -43,6 +44,7 @@ use pageserver_api::key::{
     slru_segment_size_to_key,
 };
 use pageserver_api::keyspace::{contiguous_range_len, is_contiguous_range, singleton_range};
+use pageserver_api::models::{ShardImportProgress, ShardImportStatus};
 use pageserver_api::reltag::{RelTag, SlruKind};
 use pageserver_api::shard::ShardIdentity;
 use postgres_ffi::relfile_utils::parse_relfilename;
@@ -59,16 +61,18 @@ use super::Timeline;
 use super::importbucket_client::{ControlFile, RemoteStorageWrapper};
 use crate::assert_u64_eq_usize::UsizeIsU64;
 use crate::context::{DownloadBehavior, RequestContext};
+use crate::controller_upcall_client::{StorageControllerUpcallApi, StorageControllerUpcallClient};
 use crate::pgdatadir_mapping::{
     DbDirectory, RelDirectory, SlruSegmentDirectory, TwoPhaseDirectory,
 };
 use crate::task_mgr::TaskKind;
-use crate::tenant::storage_layer::{ImageLayerWriter, Layer};
+use crate::tenant::storage_layer::{AsLayerDesc, ImageLayerWriter, Layer};
 
 pub async fn run(
     timeline: Arc<Timeline>,
     control_file: ControlFile,
     storage: RemoteStorageWrapper,
+    import_progress: Option<ShardImportProgress>,
     ctx: &RequestContext,
 ) -> anyhow::Result<()> {
     let planner = Planner {
@@ -81,9 +85,31 @@ pub async fn run(
     let import_config = &timeline.conf.timeline_import_config;
     let plan = planner.plan(import_config).await?;
 
+    // Hash the plan and compare with the hash of the plan we got back from the storage controller.
+    // If the two match, it means that the planning stage had the same output.
+    //
+    // This is not intended to be a cryptographically secure hash.
+    const SEED: u64 = 42;
+    let mut hasher = twox_hash::XxHash64::with_seed(SEED);
+    plan.hash(&mut hasher);
+    let plan_hash = hasher.finish();
+
+    if let Some(progress) = &import_progress {
+        if plan_hash != progress.import_plan_hash {
+            anyhow::bail!("Import plan does not match storcon metadata");
+        }
+
+        // Handle collisions on jobs of unequal length
+        if progress.jobs != plan.jobs.len() {
+            anyhow::bail!("Import plan job length does not match storcon metadata")
+        }
+    }
+
     pausable_failpoint!("import-timeline-pre-execute-pausable");
 
-    plan.execute(timeline, import_config, ctx).await
+    let start_from_job_idx = import_progress.map(|progress| progress.completed);
+    plan.execute(timeline, start_from_job_idx, plan_hash, import_config, ctx)
+        .await
 }
 
 struct Planner {
@@ -93,8 +119,11 @@ struct Planner {
     tasks: Vec<AnyImportTask>,
 }
 
+#[derive(Hash)]
 struct Plan {
     jobs: Vec<ChunkProcessingJob>,
+    // Included here such that it ends up in the hash for the plan
+    shard: ShardIdentity,
 }
 
 impl Planner {
@@ -198,7 +227,10 @@ impl Planner {
             pgdata_lsn,
         ));
 
-        Ok(Plan { jobs })
+        Ok(Plan {
+            jobs,
+            shard: self.shard,
+        })
     }
 
     #[instrument(level = tracing::Level::DEBUG, skip_all, fields(dboid=%db.dboid, tablespace=%db.spcnode, path=%db.path))]
@@ -327,25 +359,45 @@ impl Plan {
     async fn execute(
         self,
         timeline: Arc<Timeline>,
+        start_after_job_idx: Option<usize>,
+        import_plan_hash: u64,
         import_config: &TimelineImportConfig,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
+        let storcon_client = StorageControllerUpcallClient::new(timeline.conf, &timeline.cancel);
+
         let mut work = FuturesOrdered::new();
         let semaphore = Arc::new(Semaphore::new(import_config.import_job_concurrency.into()));
 
         let jobs_in_plan = self.jobs.len();
 
-        let mut jobs = self.jobs.into_iter().enumerate().peekable();
-        let mut results = Vec::new();
+        let mut jobs = self
+            .jobs
+            .into_iter()
+            .enumerate()
+            .map(|(idx, job)| (idx + 1, job))
+            .filter(|(idx, _job)| {
+                // Filter out any jobs that have been done already
+                if let Some(start_after) = start_after_job_idx {
+                    *idx > start_after
+                } else {
+                    true
+                }
+            })
+            .peekable();
+
+        let mut last_completed_job_idx = start_after_job_idx.unwrap_or(0);
+        let checkpoint_every: usize = import_config.import_job_checkpoint_threshold.into();
 
         // Run import jobs concurrently up to the limit specified by the pageserver configuration.
         // Note that we process completed futures in the oreder of insertion. This will be the
         // building block for resuming imports across pageserver restarts or tenant migrations.
-        while results.len() < jobs_in_plan {
+        while last_completed_job_idx < jobs_in_plan {
             tokio::select! {
                 permit = semaphore.clone().acquire_owned(), if jobs.peek().is_some() => {
                     let permit = permit.expect("never closed");
                     let (job_idx, job) = jobs.next().expect("we peeked");
+
                     let job_timeline = timeline.clone();
                     let ctx = ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Error);
 
@@ -357,13 +409,33 @@ impl Plan {
                 },
                 maybe_complete_job_idx = work.next() => {
                     match maybe_complete_job_idx {
-                        Some(Ok((_job_idx, res))) => {
-                            results.push(res);
+                        Some(Ok((job_idx, res))) => {
+                            assert!(last_completed_job_idx.checked_add(1).unwrap() == job_idx);
+
+                            res?;
+                            last_completed_job_idx = job_idx;
+
+                            if last_completed_job_idx % checkpoint_every == 0 {
+                                storcon_client.put_timeline_import_status(
+                                    timeline.tenant_shard_id,
+                                    timeline.timeline_id,
+                                    timeline.generation,
+                                    ShardImportStatus::InProgress(Some(ShardImportProgress {
+                                        jobs: jobs_in_plan,
+                                        completed: last_completed_job_idx,
+                                        import_plan_hash,
+                                    }))
+                                )
+                                .await
+                                .map_err(|_err| {
+                                    anyhow::anyhow!("Shut down while putting timeline import status")
+                                })?;
+                            }
                         },
                         Some(Err(_)) => {
-                            results.push(Err(anyhow::anyhow!(
-                                "parallel job panicked or cancelled, check pageserver logs"
-                            )));
+                            anyhow::bail!(
+                                "import job panicked or cancelled"
+                            );
                         }
                         None => {}
                     }
@@ -371,17 +443,7 @@ impl Plan {
             }
         }
 
-        if results.iter().all(|r| r.is_ok()) {
-            Ok(())
-        } else {
-            let mut msg = String::new();
-            for result in results {
-                if let Err(err) = result {
-                    msg.push_str(&format!("{err:?}\n\n"));
-                }
-            }
-            bail!("Some parallel jobs failed:\n\n{msg}");
-        }
+        Ok(())
     }
 }
 
@@ -553,6 +615,15 @@ struct ImportSingleKeyTask {
     buf: Bytes,
 }
 
+impl Hash for ImportSingleKeyTask {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        let ImportSingleKeyTask { key, buf } = self;
+
+        key.hash(state);
+        buf.hash(state);
+    }
+}
+
 impl ImportSingleKeyTask {
     fn new(key: Key, buf: Bytes) -> Self {
         ImportSingleKeyTask { key, buf }
@@ -581,6 +652,20 @@ struct ImportRelBlocksTask {
     storage: RemoteStorageWrapper,
 }
 
+impl Hash for ImportRelBlocksTask {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        let ImportRelBlocksTask {
+            shard_identity: _,
+            key_range,
+            path,
+            storage: _,
+        } = self;
+
+        key_range.hash(state);
+        path.hash(state);
+    }
+}
+
 impl ImportRelBlocksTask {
     fn new(
         shard_identity: ShardIdentity,
@@ -665,6 +750,19 @@ struct ImportSlruBlocksTask {
     storage: RemoteStorageWrapper,
 }
 
+impl Hash for ImportSlruBlocksTask {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        let ImportSlruBlocksTask {
+            key_range,
+            path,
+            storage: _,
+        } = self;
+
+        key_range.hash(state);
+        path.hash(state);
+    }
+}
+
 impl ImportSlruBlocksTask {
     fn new(key_range: Range<Key>, path: &RemotePath, storage: RemoteStorageWrapper) -> Self {
         ImportSlruBlocksTask {
@@ -707,6 +805,7 @@ impl ImportTask for ImportSlruBlocksTask {
     }
 }
 
+#[derive(Hash)]
 enum AnyImportTask {
     SingleKey(ImportSingleKeyTask),
     RelBlocks(ImportRelBlocksTask),
@@ -753,6 +852,7 @@ impl From<ImportSlruBlocksTask> for AnyImportTask {
     }
 }
 
+#[derive(Hash)]
 struct ChunkProcessingJob {
     range: Range<Key>,
     tasks: Vec<AnyImportTask>,
@@ -790,17 +890,51 @@ impl ChunkProcessingJob {
 
         let resident_layer = if nimages > 0 {
             let (desc, path) = writer.finish(ctx).await?;
+            // Refuse to replace a layer that was already written in the current generation.
+            {
+                let guard = timeline.layers.read().await;
+                let existing_layer = guard.try_get_from_key(&desc.key());
+                if let Some(layer) = existing_layer {
+                    if layer.metadata().generation == timeline.generation {
+                        return Err(anyhow::anyhow!(
+                            "Import attempted to rewrite layer file in the same generation: {}",
+                            layer.local_path()
+                        ));
+                    }
+                }
+            }
+
             Layer::finish_creating(timeline.conf, &timeline, desc, &path)?
         } else {
             // dropping the writer cleans up
             return Ok(());
         };
 
-        // this is sharing the same code as create_image_layers
+        // The same import job might run multiple times since not each job is checkpointed.
+        // Hence, we must support the cases where the layer already exists. We cannot be
+        // certain that the existing layer is identical to the new one, so in that case
+        // we replace the old layer with the one we just generated.
+
         let mut guard = timeline.layers.write().await;
-        guard
-            .open_mut()?
-            .track_new_image_layers(&[resident_layer.clone()], &timeline.metrics);
+
+        let existing_layer = guard
+            .try_get_from_key(&resident_layer.layer_desc().key())
+            .cloned();
+        match existing_layer {
+            Some(existing) => {
+                guard.open_mut()?.rewrite_layers(
+                    &[(existing.clone(), resident_layer.clone())],
+                    &[],
+                    &timeline.metrics,
+                );
+            }
+            None => {
+                guard
+                    .open_mut()?
+                    .track_new_image_layers(&[resident_layer.clone()], &timeline.metrics);
+            }
+        }
+
         crate::tenant::timeline::drop_wlock(guard);
 
         timeline
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 852005639a..7e4bb627af 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -4082,7 +4082,7 @@ impl Service {
     /// imports are stored in the database).
     #[instrument(skip_all, fields(
         tenant_id=%import.tenant_id,
-        shard_id=%import.timeline_id,
+        timeline_id=%import.timeline_id,
     ))]
     async fn finalize_timeline_import(
         self: &Arc<Self>,
diff --git a/storage_controller/src/timeline_import.rs b/storage_controller/src/timeline_import.rs
index 5d9d633932..909e8e2899 100644
--- a/storage_controller/src/timeline_import.rs
+++ b/storage_controller/src/timeline_import.rs
@@ -5,7 +5,7 @@ use http_utils::error::ApiError;
 use reqwest::Method;
 use serde::{Deserialize, Serialize};
 
-use pageserver_api::models::ShardImportStatus;
+use pageserver_api::models::{ShardImportProgress, ShardImportStatus};
 use tokio_util::sync::CancellationToken;
 use utils::{
     id::{TenantId, TimelineId},
@@ -28,7 +28,12 @@ impl ShardImportStatuses {
         ShardImportStatuses(
             shards
                 .into_iter()
-                .map(|ts_id| (ts_id, ShardImportStatus::InProgress))
+                .map(|ts_id| {
+                    (
+                        ts_id,
+                        ShardImportStatus::InProgress(None::<ShardImportProgress>),
+                    )
+                })
                 .collect(),
         )
     }
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 2801a0e867..9d86fd027c 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1255,6 +1255,12 @@ class NeonEnv:
                 "no_sync": True,
                 # Look for gaps in WAL received from safekeepeers
                 "validate_wal_contiguity": True,
+                # TODO(vlad): make these configurable through the builder
+                "timeline_import_config": {
+                    "import_job_concurrency": 4,
+                    "import_job_soft_size_limit": 512 * 1024,
+                    "import_job_checkpoint_threshold": 4,
+                },
             }
 
             # Batching (https://github.com/neondatabase/neon/issues/9377):

From 31026d5a3c246956dda9ba4925efdc72ded42de0 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Thu, 15 May 2025 17:13:15 +0100
Subject: [PATCH 49/65] pageserver: support import schema evolution (#11935)

## Problem

Imports don't support schema evolution nicely. If we want to change the
stuff we keep in storcon,
we'd have to carry the old cruft around.

## Summary of changes

Version import progress. Note that the import progress version
determines the version of the import
job split and execution. This means that we can also use it as a
mechanism for deploying new import
implementations in the future.
---
 libs/pageserver_api/src/models.rs             |  7 ++-
 pageserver/src/controller_upcall_client.rs    | 49 ++++++-------------
 pageserver/src/deletion_queue.rs              |  2 +-
 .../src/tenant/timeline/import_pgdata.rs      |  2 +-
 .../src/tenant/timeline/import_pgdata/flow.rs | 32 +++++++++---
 5 files changed, 50 insertions(+), 42 deletions(-)

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 58b8d80c0a..e9b37c8ca6 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -342,7 +342,12 @@ pub enum ShardImportStatus {
 }
 
 #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
-pub struct ShardImportProgress {
+pub enum ShardImportProgress {
+    V1(ShardImportProgressV1),
+}
+
+#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
+pub struct ShardImportProgressV1 {
     /// Total number of jobs in the import plan
     pub jobs: usize,
     /// Number of jobs completed
diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs
index 779ef3e37d..dc38ea616c 100644
--- a/pageserver/src/controller_upcall_client.rs
+++ b/pageserver/src/controller_upcall_client.rs
@@ -59,7 +59,7 @@ pub trait StorageControllerUpcallApi {
         tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
         generation: Generation,
-    ) -> impl Future<Output = Result<Option<ShardImportStatus>, RetryForeverError>> + Send;
+    ) -> impl Future<Output = Result<ShardImportStatus, RetryForeverError>> + Send;
 }
 
 impl StorageControllerUpcallClient {
@@ -104,6 +104,7 @@ impl StorageControllerUpcallClient {
         &self,
         url: &url::Url,
         request: R,
+        method: reqwest::Method,
     ) -> Result<T, RetryForeverError>
     where
         R: Serialize,
@@ -113,7 +114,7 @@ impl StorageControllerUpcallClient {
             || async {
                 let response = self
                     .http_client
-                    .post(url.clone())
+                    .request(method.clone(), url.clone())
                     .json(&request)
                     .send()
                     .await?;
@@ -222,7 +223,9 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
             register: register.clone(),
         };
 
-        let response: ReAttachResponse = self.retry_http_forever(&url, request).await?;
+        let response: ReAttachResponse = self
+            .retry_http_forever(&url, request, reqwest::Method::POST)
+            .await?;
         tracing::info!(
             "Received re-attach response with {} tenants (node {}, register: {:?})",
             response.tenants.len(),
@@ -275,7 +278,9 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
                 return Err(RetryForeverError::ShuttingDown);
             }
 
-            let response: ValidateResponse = self.retry_http_forever(&url, request).await?;
+            let response: ValidateResponse = self
+                .retry_http_forever(&url, request, reqwest::Method::POST)
+                .await?;
             for rt in response.tenants {
                 result.insert(rt.id, rt.valid);
             }
@@ -309,7 +314,8 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
             status,
         };
 
-        self.retry_http_forever(&url, request).await
+        self.retry_http_forever(&url, request, reqwest::Method::POST)
+            .await
     }
 
     #[tracing::instrument(skip_all)] // so that warning logs from retry_http_forever have context
@@ -318,7 +324,7 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
         tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
         generation: Generation,
-    ) -> Result<Option<ShardImportStatus>, RetryForeverError> {
+    ) -> Result<ShardImportStatus, RetryForeverError> {
         let url = self
             .base_url
             .join("timeline_import_status")
@@ -330,32 +336,9 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
             generation,
         };
 
-        Ok(backoff::retry(
-            || async {
-                let response = self
-                    .http_client
-                    .get(url.clone())
-                    .json(&request)
-                    .send()
-                    .await?;
-
-                if let Err(err) = response.error_for_status_ref() {
-                    if matches!(err.status(), Some(reqwest::StatusCode::NOT_FOUND)) {
-                        return Ok(None);
-                    } else {
-                        return Err(err);
-                    }
-                }
-                response.json::<ShardImportStatus>().await.map(Some)
-            },
-            |_| false,
-            3,
-            u32::MAX,
-            "storage controller upcall",
-            &self.cancel,
-        )
-        .await
-        .ok_or(RetryForeverError::ShuttingDown)?
-        .expect("We retry forever, this should never be reached"))
+        let response: ShardImportStatus = self
+            .retry_http_forever(&url, request, reqwest::Method::GET)
+            .await?;
+        Ok(response)
     }
 }
diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs
index 0bbad87c09..7854fd9e36 100644
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -804,7 +804,7 @@ mod test {
             _tenant_shard_id: TenantShardId,
             _timeline_id: TimelineId,
             _generation: Generation,
-        ) -> Result<Option<ShardImportStatus>, RetryForeverError> {
+        ) -> Result<ShardImportStatus, RetryForeverError> {
             unimplemented!()
         }
     }
diff --git a/pageserver/src/tenant/timeline/import_pgdata.rs b/pageserver/src/tenant/timeline/import_pgdata.rs
index 602b20df97..658d867c18 100644
--- a/pageserver/src/tenant/timeline/import_pgdata.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata.rs
@@ -58,7 +58,7 @@ pub async fn doit(
         .map_err(|_err| anyhow::anyhow!("Shut down while getting timeline import status"))?;
 
     info!(?shard_status, "peeking shard status");
-    match shard_status.unwrap_or(ShardImportStatus::InProgress(None)) {
+    match shard_status {
         ShardImportStatus::InProgress(maybe_progress) => {
             let storage =
                 importbucket_client::new(timeline.conf, &location, cancel.clone()).await?;
diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs
index c8c3bdcdfb..3e10a4e6d6 100644
--- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs
@@ -44,7 +44,7 @@ use pageserver_api::key::{
     slru_segment_size_to_key,
 };
 use pageserver_api::keyspace::{contiguous_range_len, is_contiguous_range, singleton_range};
-use pageserver_api::models::{ShardImportProgress, ShardImportStatus};
+use pageserver_api::models::{ShardImportProgress, ShardImportProgressV1, ShardImportStatus};
 use pageserver_api::reltag::{RelTag, SlruKind};
 use pageserver_api::shard::ShardIdentity;
 use postgres_ffi::relfile_utils::parse_relfilename;
@@ -74,6 +74,24 @@ pub async fn run(
     storage: RemoteStorageWrapper,
     import_progress: Option<ShardImportProgress>,
     ctx: &RequestContext,
+) -> anyhow::Result<()> {
+    // Match how we run the import based on the progress version.
+    // If there's no import progress, it means that this is a new import
+    // and we can use whichever version we want.
+    match import_progress {
+        Some(ShardImportProgress::V1(progress)) => {
+            run_v1(timeline, control_file, storage, Some(progress), ctx).await
+        }
+        None => run_v1(timeline, control_file, storage, None, ctx).await,
+    }
+}
+
+async fn run_v1(
+    timeline: Arc<Timeline>,
+    control_file: ControlFile,
+    storage: RemoteStorageWrapper,
+    import_progress: Option<ShardImportProgressV1>,
+    ctx: &RequestContext,
 ) -> anyhow::Result<()> {
     let planner = Planner {
         control_file,
@@ -416,15 +434,17 @@ impl Plan {
                             last_completed_job_idx = job_idx;
 
                             if last_completed_job_idx % checkpoint_every == 0 {
+                                let progress = ShardImportProgressV1 {
+                                    jobs: jobs_in_plan,
+                                    completed: last_completed_job_idx,
+                                    import_plan_hash,
+                                };
+
                                 storcon_client.put_timeline_import_status(
                                     timeline.tenant_shard_id,
                                     timeline.timeline_id,
                                     timeline.generation,
-                                    ShardImportStatus::InProgress(Some(ShardImportProgress {
-                                        jobs: jobs_in_plan,
-                                        completed: last_completed_job_idx,
-                                        import_plan_hash,
-                                    }))
+                                    ShardImportStatus::InProgress(Some(ShardImportProgress::V1(progress)))
                                 )
                                 .await
                                 .map_err(|_err| {

From a7ce323949d277fa720a612d710b810903c1b1ff Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 15 May 2025 19:48:13 +0200
Subject: [PATCH 50/65] benchmarking: extend `test_page_service_batching.py` to
 cover concurrent IO + batching under random reads (#10466)

This PR commits the benchmarks I ran to qualify concurrent IO before we
released it.

Changes:
- Add `l0stack` fixture; a reusable abstraction for creating a stack of
L0 deltas
  each of which has 1 Value::Delta per page.
- Such a stack of L0 deltas is a good and understandable demo for
concurrent IO
because to reconstruct any page, `$layer_stack_height` Values need to be
read.
  Before concurrent IO, the reads were sequential.
  With concurrent IO, they are executed concurrently.
- So, switch `test_latency` to use the l0stack.
- Teach `pagebench`, which is used by `test_latency`, to limit itself to
the blocks of the relation created by the l0stack abstraction.
- Additional parametrization of `test_latency` over dimensions
`ps_io_concurrency,l0_stack_height,queue_depth`
- Use better names for the tests to reflect what they do, leave
interpretation of the (now quite high-dimensional) results to the reader
  - `test_{throughput => postgres_seqscan}`
  - `test_{latency => random_reads}`
- Cut down on permutations to those we use in production. Runtime is
about 2min.

Refs
- concurrent IO epic https://github.com/neondatabase/neon/issues/9378
- batching task: fixes https://github.com/neondatabase/neon/issues/9837

---------

Co-authored-by: Peter Bendel <peterbendel@neon.tech>
---
 libs/pageserver_api/src/key.rs                |   5 +
 .../pagebench/src/cmd/getpage_latest_lsn.rs   |  10 +-
 .../bin/neon_local_create_deep_l0_stack.py    |  59 +++++++
 test_runner/fixtures/neon_fixtures.py         |  11 +-
 .../pageserver/makelayers/__init__.py         |   0
 .../fixtures/pageserver/makelayers/l0stack.py | 148 ++++++++++++++++
 test_runner/performance/README.md             |   3 +-
 test_runner/performance/out_dir_to_csv.py     |  57 ++++++
 .../pageserver/test_page_service_batching.py  | 167 ++++++++++--------
 9 files changed, 387 insertions(+), 73 deletions(-)
 create mode 100644 test_runner/bin/neon_local_create_deep_l0_stack.py
 create mode 100644 test_runner/fixtures/pageserver/makelayers/__init__.py
 create mode 100644 test_runner/fixtures/pageserver/makelayers/l0stack.py
 create mode 100644 test_runner/performance/out_dir_to_csv.py

diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs
index 0c4d7fd4cb..c14975167b 100644
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -910,6 +910,11 @@ impl Key {
         self.field1 == 0x00 && self.field4 != 0 && self.field6 != 0xffffffff
     }
 
+    #[inline(always)]
+    pub fn is_rel_block_of_rel(&self, rel: Oid) -> bool {
+        self.is_rel_block_key() && self.field4 == rel
+    }
+
     #[inline(always)]
     pub fn is_rel_dir_key(&self) -> bool {
         self.field1 == 0x00
diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
index 771a7cbe5b..50419ec338 100644
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -65,6 +65,9 @@ pub(crate) struct Args {
     #[clap(long, default_value = "1")]
     queue_depth: NonZeroUsize,
 
+    #[clap(long)]
+    only_relnode: Option<u32>,
+
     targets: Option<Vec<TenantTimelineId>>,
 }
 
@@ -206,7 +209,12 @@ async fn main_impl(
                     for r in partitioning.keys.ranges.iter() {
                         let mut i = r.start;
                         while i != r.end {
-                            if i.is_rel_block_key() {
+                            let mut include = true;
+                            include &= i.is_rel_block_key();
+                            if let Some(only_relnode) = args.only_relnode {
+                                include &= i.is_rel_block_of_rel(only_relnode);
+                            }
+                            if include {
                                 filtered.add_key(i);
                             }
                             i = i.next();
diff --git a/test_runner/bin/neon_local_create_deep_l0_stack.py b/test_runner/bin/neon_local_create_deep_l0_stack.py
new file mode 100644
index 0000000000..ebe11f7308
--- /dev/null
+++ b/test_runner/bin/neon_local_create_deep_l0_stack.py
@@ -0,0 +1,59 @@
+"""
+Script that creates a stack of L0 deltas each of which should have 1 Value::Delta per page in `data`,
+in your running neon_local setup.
+
+Use this bash setup to reset your neon_local environment.
+The last line of this bash snippet will run this file here.
+```
+ export NEON_REPO_DIR=$PWD/.neon
+ export NEON_BIN_DIR=$PWD/target/release
+ $NEON_BIN_DIR/neon_local stop
+ rm -rf $NEON_REPO_DIR
+ $NEON_BIN_DIR/neon_local init
+ cat >>  $NEON_REPO_DIR/pageserver_1/pageserver.toml <<"EOF"
+ # customizations
+ virtual_file_io_mode = "direct-rw"
+ page_service_pipelining={mode="pipelined", max_batch_size=32, execution="concurrent-futures"}
+ get_vectored_concurrent_io={mode="sidecar-task"}
+EOF
+ $NEON_BIN_DIR/neon_local start
+
+ psql 'postgresql://localhost:1235/storage_controller' -c 'DELETE FROM tenant_shards'
+ sed 's/.*get_vectored_concurrent_io.*/get_vectored_concurrent_io={mode="sidecar-task"}/' -i $NEON_REPO_DIR/pageserver_1/pageserver.toml
+ $NEON_BIN_DIR/neon_local pageserver restart
+ sleep 2
+ $NEON_BIN_DIR/neon_local tenant create --set-default
+ ./target/debug/neon_local endpoint stop foo
+ rm -rf  $NEON_REPO_DIR/endpoints/foo
+ ./target/debug/neon_local endpoint create foo
+ echo 'full_page_writes=off' >>  $NEON_REPO_DIR/endpoints/foo/postgresql.conf
+ ./target/debug/neon_local endpoint start foo
+
+  pushd test_runner; poetry run python3 -m bin.neon_local_create_deep_l0_stack 10; popd
+```
+"""
+
+import sys
+
+import psycopg2
+from fixtures.common_types import TenantShardId, TimelineId
+from fixtures.pageserver.http import PageserverHttpClient
+from fixtures.pageserver.makelayers.l0stack import L0StackShape, make_l0_stack_standalone
+
+ps_http = PageserverHttpClient(port=9898, is_testing_enabled_or_skip=lambda: None)
+vps_http = PageserverHttpClient(port=1234, is_testing_enabled_or_skip=lambda: None)
+
+tenants = ps_http.tenant_list()
+assert len(tenants) == 1
+tenant_shard_id = TenantShardId.parse(tenants[0]["id"])
+
+timlines = ps_http.timeline_list(tenant_shard_id)
+assert len(timlines) == 1
+timeline_id = TimelineId(timlines[0]["timeline_id"])
+
+connstr = "postgresql://cloud_admin@localhost:55432/postgres"
+conn = psycopg2.connect(connstr)
+
+shape = L0StackShape(logical_table_size_mib=50, delta_stack_height=int(sys.argv[1]))
+
+make_l0_stack_standalone(vps_http, ps_http, tenant_shard_id, timeline_id, conn, shape)
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 9d86fd027c..e413b3c6d2 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1377,7 +1377,11 @@ class NeonEnv:
             force=config.config_init_force,
         )
 
-    def start(self, timeout_in_seconds: int | None = None):
+    def start(
+        self,
+        timeout_in_seconds: int | None = None,
+        extra_ps_env_vars: dict[str, str] | None = None,
+    ):
         # Storage controller starts first, so that pageserver /re-attach calls don't
         # bounce through retries on startup
         self.storage_controller.start(timeout_in_seconds=timeout_in_seconds)
@@ -1396,7 +1400,10 @@ class NeonEnv:
             for pageserver in self.pageservers:
                 futs.append(
                     executor.submit(
-                        lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds)  # type: ignore[misc]
+                        lambda ps=pageserver: ps.start(  # type: ignore[misc]
+                            extra_env_vars=extra_ps_env_vars or {},
+                            timeout_in_seconds=timeout_in_seconds,
+                        ),
                     )
                 )
 
diff --git a/test_runner/fixtures/pageserver/makelayers/__init__.py b/test_runner/fixtures/pageserver/makelayers/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/test_runner/fixtures/pageserver/makelayers/l0stack.py b/test_runner/fixtures/pageserver/makelayers/l0stack.py
new file mode 100644
index 0000000000..408ba1254f
--- /dev/null
+++ b/test_runner/fixtures/pageserver/makelayers/l0stack.py
@@ -0,0 +1,148 @@
+from dataclasses import dataclass
+
+from psycopg2.extensions import connection as PgConnection
+
+from fixtures.common_types import Lsn, TenantShardId, TimelineId
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import Endpoint
+from fixtures.pageserver.http import PageserverHttpClient
+from fixtures.pageserver.utils import wait_for_last_record_lsn
+
+
+@dataclass
+class L0StackShape:
+    logical_table_size_mib: int = 50
+    delta_stack_height: int = 20
+
+
+def make_l0_stack(endpoint: Endpoint, shape: L0StackShape):
+    """
+    Creates stack of L0 deltas each of which should have 1 Value::Delta per page in table `data`.
+    """
+    env = endpoint.env
+
+    # TODO: wait for storcon to finish any reconciles before jumping to action here?
+    description = env.storage_controller.tenant_describe(endpoint.tenant_id)
+    shards = description["shards"]
+    assert len(shards) == 1, "does not support sharding"
+    tenant_shard_id = TenantShardId.parse(shards[0]["tenant_shard_id"])
+
+    endpoint.config(["full_page_writes=off"])
+    endpoint.reconfigure()
+
+    ps = env.get_pageserver(shards[0]["node_attached"])
+
+    timeline_id = endpoint.show_timeline_id()
+
+    vps_http = env.storage_controller.pageserver_api()
+    ps_http = ps.http_client()
+    endpoint_conn = endpoint.connect()
+    make_l0_stack_standalone(vps_http, ps_http, tenant_shard_id, timeline_id, endpoint_conn, shape)
+
+
+def make_l0_stack_standalone(
+    vps_http: PageserverHttpClient,
+    ps_http: PageserverHttpClient,
+    tenant_shard_id: TenantShardId,
+    timeline_id: TimelineId,
+    endpoint_conn: PgConnection,
+    shape: L0StackShape,
+):
+    """
+    See make_l0_stack for details.
+
+    This function is a standalone version of make_l0_stack, usable from non-test code.
+    """
+
+    assert not tenant_shard_id.shard_index.is_sharded, (
+        "the current implementation only supports unsharded tenants"
+    )
+
+    tenant_id = tenant_shard_id.tenant_id
+    conn = endpoint_conn
+    desired_size = shape.logical_table_size_mib * 1024 * 1024
+
+    config = {
+        "gc_period": "0s",  # disable periodic gc
+        "checkpoint_timeout": "10 years",
+        "compaction_period": "1h",  # doesn't matter, but 0 value will kill walredo every 10s
+        "compaction_threshold": 100000,  # we just want L0s
+        "compaction_target_size": 134217728,
+        "checkpoint_distance": 268435456,
+        "image_creation_threshold": 100000,  # we just want L0s
+    }
+
+    vps_http.set_tenant_config(tenant_id, config)
+
+    conn.autocommit = True
+    cur = conn.cursor()
+
+    # Ensure full_page_writes are disabled so that all Value::Delta in
+    # pageserver are !will_init, and therefore a getpage needs to read
+    # the entire delta stack.
+    cur.execute("SHOW full_page_writes")
+    assert cur.fetchall()[0][0] == "off", "full_page_writes should be off"
+
+    # each tuple is 23 (header) + 100 bytes = 123 bytes
+    # page header is 24 bytes
+    # 8k page size
+    # (8k-24bytes) / 123 bytes = 63 tuples per page
+    # set fillfactor to 10 to have 6 tuples per page
+    cur.execute("DROP TABLE IF EXISTS data")
+    cur.execute("CREATE TABLE data(id bigint, row char(92)) with (fillfactor=10)")
+    need_pages = desired_size // 8192
+    need_rows = need_pages * 6
+    log.info(f"Need {need_pages} pages, {need_rows} rows")
+    cur.execute(f"INSERT INTO data SELECT i,'row'||i FROM generate_series(1, {need_rows}) as i")
+    # Raise fillfactor to 100% so that all updates are HOT updates.
+    # We assert they're hot updates by checking fetch_id_to_page_mapping remains the same.
+    cur.execute("ALTER TABLE data SET (fillfactor=100)")
+
+    def settle_and_flush():
+        cur.execute("SELECT pg_current_wal_flush_lsn()")
+        flush_lsn = Lsn(cur.fetchall()[0][0])
+        wait_for_last_record_lsn(ps_http, tenant_shard_id, timeline_id, flush_lsn)
+        ps_http.timeline_checkpoint(tenant_id, timeline_id)
+
+    # create an L0 for the initial data we just inserted
+    settle_and_flush()
+
+    # assert we wrote what we think we wrote
+    cur.execute("""
+        with ntuples_per_page as (
+            select (ctid::text::point)[0]::bigint pageno,count(*) ntuples from data group by pageno
+        )
+        select ntuples, count(*) npages from ntuples_per_page group by ntuples order by ntuples;
+    """)
+    rows = cur.fetchall()
+    log.info(f"initial table layout: {rows}")
+    assert len(rows) == 1
+    assert rows[0][0] == 6, f"expected 6 tuples per page, got {rows[0][0]}"
+    assert rows[0][1] == need_pages, f"expected {need_pages} pages, got {rows[0][1]}"
+
+    def fetch_id_to_page_mapping():
+        cur.execute("""
+            SELECT id,(ctid::text::point)[0]::bigint pageno FROM data ORDER BY id
+        """)
+        return cur.fetchall()
+
+    initial_mapping = fetch_id_to_page_mapping()
+
+    # every iteration updates one tuple in each page
+    delta_stack_height = shape.delta_stack_height
+    for i in range(0, delta_stack_height):
+        log.info(i)
+        cur.execute(f"UPDATE data set row = row||',u' where id % 6 = {i % 6}")
+        log.info(f"modified rows: {cur.rowcount}")
+        assert cur.rowcount == need_pages
+        settle_and_flush()
+        post_update_mapping = fetch_id_to_page_mapping()
+        assert initial_mapping == post_update_mapping, "Postgres should be doing HOT updates"
+
+    # Assert the layer count is what we expect it is
+    layer_map = vps_http.layer_map_info(tenant_id, timeline_id)
+    assert (
+        len(layer_map.delta_l0_layers()) == delta_stack_height + 1 + 1
+    )  # +1 for the initdb layer + 1 for the table creation & fill
+    assert len(layer_map.delta_l0_layers()) == len(layer_map.delta_layers())  # it's all L0s
+    assert len(layer_map.image_layers()) == 0  # no images
diff --git a/test_runner/performance/README.md b/test_runner/performance/README.md
index 3b25a60e9b..21844648d1 100644
--- a/test_runner/performance/README.md
+++ b/test_runner/performance/README.md
@@ -15,7 +15,8 @@ Some handy pytest flags for local development:
 - `-k` selects a test to run
 - `--timeout=0` disables our default timeout of 300s (see `setup.cfg`)
 - `--preserve-database-files` to skip cleanup
-- `--out-dir` to produce a JSON with the recorded test metrics
+- `--out-dir` to produce a JSON with the recorded test metrics.
+  There is a post-processing tool at `test_runner/performance/out_dir_to_csv.py`.
 
 # What performance tests do we have and how we run them
 
diff --git a/test_runner/performance/out_dir_to_csv.py b/test_runner/performance/out_dir_to_csv.py
new file mode 100644
index 0000000000..8647ad4acc
--- /dev/null
+++ b/test_runner/performance/out_dir_to_csv.py
@@ -0,0 +1,57 @@
+# Tool to convert the JSON output from running a perf test with `--out-dir` to a CSV that
+# can be easily pasted into a spreadsheet for quick viz & analysis.
+# Check the `./README.md` in this directory for `--out-dir`.
+#
+# TODO: add the pytest.mark.parametrize to the json and make them columns here
+# https://github.com/neondatabase/neon/issues/11878
+
+import csv
+import json
+import os
+import sys
+
+
+def json_to_csv(json_file):
+    with open(json_file) as f:
+        data = json.load(f)
+
+    # Collect all possible metric names to form headers
+    all_metrics = set()
+    for result in data.get("result", []):
+        for metric in result.get("data", []):
+            all_metrics.add(metric["name"])
+
+    # Sort metrics for consistent output
+    metrics = sorted(list(all_metrics))
+
+    # Create headers
+    headers = ["suit"] + metrics
+
+    # Prepare rows
+    rows = []
+    for result in data.get("result", []):
+        row = {"suit": result["suit"]}
+
+        # Initialize all metrics to empty
+        for metric in metrics:
+            row[metric] = ""
+
+        # Fill in available metrics
+        for item in result.get("data", []):
+            row[item["name"]] = item["value"]
+
+        rows.append(row)
+
+    # Write to stdout as CSV
+    writer = csv.DictWriter(sys.stdout, fieldnames=headers)
+    writer.writeheader()
+    writer.writerows(rows)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        print(f"Usage: python {os.path.basename(__file__)} <json_file>")
+        sys.exit(1)
+
+    json_file = sys.argv[1]
+    json_to_csv(json_file)
diff --git a/test_runner/performance/pageserver/test_page_service_batching.py b/test_runner/performance/pageserver/test_page_service_batching.py
index b17ca772c9..9e2312311a 100644
--- a/test_runner/performance/pageserver/test_page_service_batching.py
+++ b/test_runner/performance/pageserver/test_page_service_batching.py
@@ -10,7 +10,8 @@ from typing import Any
 import pytest
 from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn
+from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin
+from fixtures.pageserver.makelayers import l0stack
 from fixtures.utils import humantime_to_ms
 
 TARGET_RUNTIME = 30
@@ -34,28 +35,18 @@ class PageServicePipeliningConfigPipelined(PageServicePipeliningConfig):
     mode: str = "pipelined"
 
 
-EXECUTION = ["concurrent-futures"]
-BATCHING = ["uniform-lsn", "scattered-lsn"]
-
-NON_BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()]
-for max_batch_size in [1, 32]:
-    for execution in EXECUTION:
-        for batching in BATCHING:
-            NON_BATCHABLE.append(
-                PageServicePipeliningConfigPipelined(max_batch_size, execution, batching)
-            )
-
-BATCHABLE: list[PageServicePipeliningConfig] = []
+PS_IO_CONCURRENCY = ["sidecar-task"]
+PIPELINING_CONFIGS: list[PageServicePipeliningConfig] = []
 for max_batch_size in [32]:
-    for execution in EXECUTION:
-        for batching in BATCHING:
-            BATCHABLE.append(
+    for execution in ["concurrent-futures"]:
+        for batching in ["scattered-lsn"]:
+            PIPELINING_CONFIGS.append(
                 PageServicePipeliningConfigPipelined(max_batch_size, execution, batching)
             )
 
 
 @pytest.mark.parametrize(
-    "tablesize_mib, pipelining_config, target_runtime, effective_io_concurrency, readhead_buffer_size, name",
+    "tablesize_mib, pipelining_config, target_runtime, ps_io_concurrency, effective_io_concurrency, readhead_buffer_size, name",
     [
         # batchable workloads should show throughput and CPU efficiency improvements
         *[
@@ -63,20 +54,23 @@ for max_batch_size in [32]:
                 50,
                 config,
                 TARGET_RUNTIME,
+                ps_io_concurrency,
                 100,
                 128,
                 f"batchable {dataclasses.asdict(config)}",
             )
-            for config in BATCHABLE
+            for config in PIPELINING_CONFIGS
+            for ps_io_concurrency in PS_IO_CONCURRENCY
         ],
     ],
 )
-def test_throughput(
+def test_postgres_seqscan(
     neon_env_builder: NeonEnvBuilder,
     zenbenchmark: NeonBenchmarker,
     tablesize_mib: int,
     pipelining_config: PageServicePipeliningConfig,
     target_runtime: int,
+    ps_io_concurrency: str,
     effective_io_concurrency: int,
     readhead_buffer_size: int,
     name: str,
@@ -97,6 +91,10 @@ def test_throughput(
     If the compute provides pipeline depth (effective_io_concurrency=100), then
     pipelining configs, especially with max_batch_size>1 should yield dramatic improvements
     in all performance metrics.
+
+    We advance the LSN from a disruptor thread to simulate the effect of a workload with concurrent writes
+    in another table. The `scattered-lsn` batching mode handles this well whereas the
+    initial implementation (`uniform-lsn`) would break the batch.
     """
 
     #
@@ -114,7 +112,19 @@ def test_throughput(
         }
     )
     # For storing configuration as a metric, insert a fake 0 with labels with actual data
-    params.update({"pipelining_config": (0, {"labels": dataclasses.asdict(pipelining_config)})})
+    params.update(
+        {
+            "config": (
+                0,
+                {
+                    "labels": {
+                        "pipelining_config": dataclasses.asdict(pipelining_config),
+                        "ps_io_concurrency": ps_io_concurrency,
+                    }
+                },
+            )
+        }
+    )
 
     log.info("params: %s", params)
 
@@ -266,7 +276,10 @@ def test_throughput(
         return iters
 
     env.pageserver.patch_config_toml_nonrecursive(
-        {"page_service_pipelining": dataclasses.asdict(pipelining_config)}
+        {
+            "page_service_pipelining": dataclasses.asdict(pipelining_config),
+            "get_vectored_concurrent_io": {"mode": ps_io_concurrency},
+        }
     )
 
     # set trace for log analysis below
@@ -318,77 +331,63 @@ def test_throughput(
     )
 
 
-PRECISION_CONFIGS: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()]
-for max_batch_size in [1, 32]:
-    for execution in EXECUTION:
-        for batching in BATCHING:
-            PRECISION_CONFIGS.append(
-                PageServicePipeliningConfigPipelined(max_batch_size, execution, batching)
-            )
-
-
 @pytest.mark.parametrize(
-    "pipelining_config,name",
-    [(config, f"{dataclasses.asdict(config)}") for config in PRECISION_CONFIGS],
+    "pipelining_config,ps_io_concurrency,l0_stack_height,queue_depth,name",
+    [
+        (config, ps_io_concurrency, l0_stack_height, queue_depth, f"{dataclasses.asdict(config)}")
+        for config in PIPELINING_CONFIGS
+        for ps_io_concurrency in PS_IO_CONCURRENCY
+        for queue_depth in [1, 2, 32]
+        for l0_stack_height in [0, 20]
+    ],
 )
-def test_latency(
+def test_random_reads(
     neon_env_builder: NeonEnvBuilder,
     zenbenchmark: NeonBenchmarker,
     pg_bin: PgBin,
     pipelining_config: PageServicePipeliningConfig,
+    ps_io_concurrency: str,
+    l0_stack_height: int,
+    queue_depth: int,
     name: str,
 ):
     """
-    Measure the latency impact of pipelining in an un-batchable workloads.
-
-    An ideal implementation should not increase average or tail latencies for such workloads.
-
-    We don't have support in pagebench to create queue depth yet.
-    => https://github.com/neondatabase/neon/issues/9837
+    Throw pagebench random getpage at latest lsn workload from a single client against pageserver.
     """
 
     #
     # Setup
     #
 
+    def build_snapshot_cb(neon_env_builder: NeonEnvBuilder) -> NeonEnv:
+        env = neon_env_builder.init_start()
+        endpoint = env.endpoints.create_start("main")
+        l0stack.make_l0_stack(
+            endpoint,
+            l0stack.L0StackShape(logical_table_size_mib=50, delta_stack_height=l0_stack_height),
+        )
+        return env
+
+    env = neon_env_builder.build_and_use_snapshot(
+        f"test_page_service_batching--test_pagebench-{l0_stack_height}", build_snapshot_cb
+    )
+
     def patch_ps_config(ps_config):
-        if pipelining_config is not None:
-            ps_config["page_service_pipelining"] = dataclasses.asdict(pipelining_config)
+        ps_config["page_service_pipelining"] = dataclasses.asdict(pipelining_config)
+        ps_config["get_vectored_concurrent_io"] = {"mode": ps_io_concurrency}
 
-    neon_env_builder.pageserver_config_override = patch_ps_config
+    env.pageserver.edit_config_toml(patch_ps_config)
 
-    env = neon_env_builder.init_start()
-    endpoint = env.endpoints.create_start("main")
-    conn = endpoint.connect()
-    cur = conn.cursor()
+    env.start()
 
-    cur.execute("SET max_parallel_workers_per_gather=0")  # disable parallel backends
-    cur.execute("SET effective_io_concurrency=1")
-
-    cur.execute("CREATE EXTENSION IF NOT EXISTS neon;")
-    cur.execute("CREATE EXTENSION IF NOT EXISTS neon_test_utils;")
-
-    log.info("Filling the table")
-    cur.execute("CREATE TABLE t (data char(1000)) with (fillfactor=10)")
-    tablesize = 50 * 1024 * 1024
-    npages = tablesize // (8 * 1024)
-    cur.execute("INSERT INTO t SELECT generate_series(1, %s)", (npages,))
-    # TODO: can we force postgres to do sequential scans?
-
-    cur.close()
-    conn.close()
-
-    wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
-
-    endpoint.stop()
+    lsn = env.safekeepers[0].get_commit_lsn(env.initial_tenant, env.initial_timeline)
+    ep = env.endpoints.create_start("main", lsn=lsn)
+    data_table_relnode_oid = ep.safe_psql_scalar("SELECT 'data'::regclass::oid")
+    ep.stop_and_destroy()
 
     for sk in env.safekeepers:
         sk.stop()
 
-    #
-    # Run single-threaded pagebench (TODO: dedup with other benchmark code)
-    #
-
     env.pageserver.allowed_errors.append(
         # https://github.com/neondatabase/neon/issues/6925
         r".*query handler for.*pagestream.*failed: unexpected message: CopyFail during COPY.*"
@@ -396,6 +395,8 @@ def test_latency(
 
     ps_http = env.pageserver.http_client()
 
+    metrics_before = ps_http.get_metrics()
+
     cmd = [
         str(env.neon_binpath / "pagebench"),
         "get-page-latest-lsn",
@@ -405,6 +406,10 @@ def test_latency(
         env.pageserver.connstr(password=None),
         "--num-clients",
         "1",
+        "--queue-depth",
+        str(queue_depth),
+        "--only-relnode",
+        str(data_table_relnode_oid),
         "--runtime",
         "10s",
     ]
@@ -413,12 +418,22 @@ def test_latency(
     results_path = Path(basepath + ".stdout")
     log.info(f"Benchmark results at: {results_path}")
 
+    metrics_after = ps_http.get_metrics()
+
     with open(results_path) as f:
         results = json.load(f)
     log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}")
 
     total = results["total"]
 
+    metric = "request_count"
+    zenbenchmark.record(
+        metric,
+        metric_value=total[metric],
+        unit="",
+        report=MetricReport.HIGHER_IS_BETTER,
+    )
+
     metric = "latency_mean"
     zenbenchmark.record(
         metric,
@@ -435,3 +450,17 @@ def test_latency(
             unit="ms",
             report=MetricReport.LOWER_IS_BETTER,
         )
+
+    reads_before = metrics_before.query_one(
+        "pageserver_io_operations_seconds_count", filter={"operation": "read"}
+    )
+    reads_after = metrics_after.query_one(
+        "pageserver_io_operations_seconds_count", filter={"operation": "read"}
+    )
+
+    zenbenchmark.record(
+        "virtual_file_reads",
+        metric_value=reads_after.value - reads_before.value,
+        unit="",
+        report=MetricReport.LOWER_IS_BETTER,
+    )

From 2d247375b3b10d80b1f235aa0e12bd41d626d54a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Fri, 16 May 2025 14:21:24 +0200
Subject: [PATCH 51/65] Update rust to 1.87.0 (#11938)

We keep the practice of keeping the compiler up to date, pointing to the
latest release. This is done by many other projects in the Rust
ecosystem as well.

The 1.87.0 release marks 10 years of Rust.

[Announcement blog
post](https://blog.rust-lang.org/2025/05/15/Rust-1.87.0/)

Prior update was in #11431
---
 build-tools.Dockerfile                   | 2 +-
 pageserver/src/virtual_file/io_engine.rs | 4 +---
 proxy/src/binary/pg_sni_router.rs        | 1 +
 proxy/src/binary/proxy.rs                | 2 +-
 rust-toolchain.toml                      | 2 +-
 storage_controller/src/scheduler.rs      | 6 +-----
 6 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile
index f63d844afd..1933fd19d8 100644
--- a/build-tools.Dockerfile
+++ b/build-tools.Dockerfile
@@ -292,7 +292,7 @@ WORKDIR /home/nonroot
 
 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.86.0
+ENV RUSTC_VERSION=1.87.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 ARG RUSTFILT_VERSION=0.2.1
diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs
index 7827682498..3cde34eda7 100644
--- a/pageserver/src/virtual_file/io_engine.rs
+++ b/pageserver/src/virtual_file/io_engine.rs
@@ -124,9 +124,7 @@ pub(super) fn epoll_uring_error_to_std(
 ) -> std::io::Error {
     match e {
         tokio_epoll_uring::Error::Op(e) => e,
-        tokio_epoll_uring::Error::System(system) => {
-            std::io::Error::new(std::io::ErrorKind::Other, system)
-        }
+        tokio_epoll_uring::Error::System(system) => std::io::Error::other(system),
     }
 }
 
diff --git a/proxy/src/binary/pg_sni_router.rs b/proxy/src/binary/pg_sni_router.rs
index 2239d064b2..3e87538ae7 100644
--- a/proxy/src/binary/pg_sni_router.rs
+++ b/proxy/src/binary/pg_sni_router.rs
@@ -394,6 +394,7 @@ async fn handle_client(
     }
 }
 
+#[allow(clippy::large_enum_variant)]
 enum Connection {
     Raw(tokio::net::TcpStream),
     Tls(tokio_rustls::client::TlsStream<tokio::net::TcpStream>),
diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs
index fe0d551f7f..4cb5ddc335 100644
--- a/proxy/src/binary/proxy.rs
+++ b/proxy/src/binary/proxy.rs
@@ -862,7 +862,7 @@ async fn configure_redis(
         ("irsa", _) => match (&args.redis_host, args.redis_port) {
             (Some(host), Some(port)) => Some(
                 ConnectionWithCredentialsProvider::new_with_credentials_provider(
-                    host.to_string(),
+                    host.clone(),
                     port,
                     elasticache::CredentialsProvider::new(
                         args.aws_region.clone(),
diff --git a/rust-toolchain.toml b/rust-toolchain.toml
index a0d5970bd5..c48def3483 100644
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -1,5 +1,5 @@
 [toolchain]
-channel = "1.86.0"
+channel = "1.87.0"
 profile = "default"
 # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
 # https://rust-lang.github.io/rustup/concepts/profiles.html
diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs
index 3d5f36fb98..773373391e 100644
--- a/storage_controller/src/scheduler.rs
+++ b/storage_controller/src/scheduler.rs
@@ -628,11 +628,7 @@ impl Scheduler {
             tracing::trace!(%node_id, "attached_shard_count={} shard_count={} expected={}", node.attached_shard_count, node.shard_count, expected_attached_shards_per_node);
         }
 
-        if node.attached_shard_count < expected_attached_shards_per_node {
-            expected_attached_shards_per_node - node.attached_shard_count
-        } else {
-            0
-        }
+        expected_attached_shards_per_node.saturating_sub(node.attached_shard_count)
     }
 
     pub(crate) fn expected_attached_shard_count(&self) -> usize {

From aa22572d8c7602c1e6b26c0afde2df3a4e90f36d Mon Sep 17 00:00:00 2001
From: Evan Fleming <evan.gordon.fleming@gmail.com>
Date: Fri, 16 May 2025 05:41:10 -0700
Subject: [PATCH 52/65] safekeeper: refactor static remote storage usage to use
 Arc (#10179)

Greetings! Please add `w=1` to github url when viewing diff
(specifically `wal_backup.rs`)

## Problem

This PR is aimed at addressing the remaining work of #8200. Namely,
removing static usage of remote storage in favour of arc. I did not opt
to pass `Arc<RemoteStorage>` directly since it is actually
`Optional<RemoteStorage>` as it is not necessarily always configured. I
wanted to avoid having to pass `Arc<Optional<RemoteStorage>>` everywhere
with individual consuming functions likely needing to handle unwrapping.

Instead I've added a `WalBackup` struct that holds
`Optional<RemoteStorage>` and handles initialization/unwrapping
RemoteStorage internally. wal_backup functions now take self and
`Arc<WalBackup>` is passed as a dependency through the various consumers
that need it.

## Summary of changes
- Add `WalBackup` that holds `Optional<RemoteStorage>` and handles
initialization and unwrapping
- Modify wal_backup functions to take `WalBackup` as self (Add `w=1` to
github url when viewing diff here)
- Initialize `WalBackup` in safekeeper root
- Store `Arc<WalBackup>` in `GlobalTimelineMap` and pass and store in
each Timeline as loaded
- use `WalBackup` through Timeline as needed

## Refs

- task to remove global variables
https://github.com/neondatabase/neon/issues/8200
- drive-by fixes https://github.com/neondatabase/neon/issues/11501
by turning the panic reported there into an error `remote storage not
configured`

---------

Co-authored-by: Christian Schwarz <christian@neon.tech>
---
 safekeeper/src/bin/safekeeper.rs       |   9 +-
 safekeeper/src/copy_timeline.rs        |   3 +
 safekeeper/src/http/routes.rs          |  10 ++-
 safekeeper/src/lib.rs                  |   6 --
 safekeeper/src/pull_timeline.rs        |  43 +++++++--
 safekeeper/src/test_utils.rs           |   6 +-
 safekeeper/src/timeline.rs             |  28 ++++--
 safekeeper/src/timeline_eviction.rs    |  47 +++++++---
 safekeeper/src/timeline_manager.rs     |  26 ++++--
 safekeeper/src/timelines_global_map.rs |  41 +++++++--
 safekeeper/src/wal_backup.rs           | 115 ++++++++++++++-----------
 safekeeper/src/wal_backup_partial.rs   |  21 +++--
 safekeeper/src/wal_storage.rs          |  13 +--
 13 files changed, 255 insertions(+), 113 deletions(-)

diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index c267a55cb6..8d31ada24f 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -22,9 +22,10 @@ use safekeeper::defaults::{
     DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, DEFAULT_SSL_CERT_FILE,
     DEFAULT_SSL_CERT_RELOAD_PERIOD, DEFAULT_SSL_KEY_FILE,
 };
+use safekeeper::wal_backup::WalBackup;
 use safekeeper::{
     BACKGROUND_RUNTIME, BROKER_RUNTIME, GlobalTimelines, HTTP_RUNTIME, SafeKeeperConf,
-    WAL_SERVICE_RUNTIME, broker, control_file, http, wal_backup, wal_service,
+    WAL_SERVICE_RUNTIME, broker, control_file, http, wal_service,
 };
 use sd_notify::NotifyState;
 use storage_broker::{DEFAULT_ENDPOINT, Uri};
@@ -484,15 +485,15 @@ async fn start_safekeeper(conf: Arc<SafeKeeperConf>) -> Result<()> {
         None => None,
     };
 
-    let global_timelines = Arc::new(GlobalTimelines::new(conf.clone()));
+    let wal_backup = Arc::new(WalBackup::new(&conf).await?);
+
+    let global_timelines = Arc::new(GlobalTimelines::new(conf.clone(), wal_backup.clone()));
 
     // Register metrics collector for active timelines. It's important to do this
     // after daemonizing, otherwise process collector will be upset.
     let timeline_collector = safekeeper::metrics::TimelineCollector::new(global_timelines.clone());
     metrics::register_internal(Box::new(timeline_collector))?;
 
-    wal_backup::init_remote_storage(&conf).await;
-
     // Keep handles to main tasks to die if any of them disappears.
     let mut tasks_handles: FuturesUnordered<BoxFuture<(String, JoinTaskRes)>> =
         FuturesUnordered::new();
diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs
index 11daff22cb..7984c2e2b9 100644
--- a/safekeeper/src/copy_timeline.rs
+++ b/safekeeper/src/copy_timeline.rs
@@ -3,6 +3,7 @@ use std::sync::Arc;
 use anyhow::{Result, bail};
 use camino::Utf8PathBuf;
 use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE};
+use remote_storage::GenericRemoteStorage;
 use safekeeper_api::membership::Configuration;
 use tokio::fs::OpenOptions;
 use tokio::io::{AsyncSeekExt, AsyncWriteExt};
@@ -30,6 +31,7 @@ pub struct Request {
 pub async fn handle_request(
     request: Request,
     global_timelines: Arc<GlobalTimelines>,
+    storage: Arc<GenericRemoteStorage>,
 ) -> Result<()> {
     // TODO: request.until_lsn MUST be a valid LSN, and we cannot check it :(
     //   if LSN will point to the middle of a WAL record, timeline will be in "broken" state
@@ -127,6 +129,7 @@ pub async fn handle_request(
     assert!(first_ondisk_segment >= first_segment);
 
     copy_s3_segments(
+        &storage,
         wal_seg_size,
         &request.source_ttid,
         &request.destination_ttid,
diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index 1a25b07496..384c582678 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -258,6 +258,7 @@ async fn timeline_snapshot_handler(request: Request<Body>) -> Result<Response<Bo
 
     let global_timelines = get_global_timelines(&request);
     let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
+    let storage = global_timelines.get_wal_backup().get_storage();
 
     // To stream the body use wrap_stream which wants Stream of Result<Bytes>,
     // so create the chan and write to it in another task.
@@ -269,6 +270,7 @@ async fn timeline_snapshot_handler(request: Request<Body>) -> Result<Response<Bo
         conf.my_id,
         destination,
         tx,
+        storage,
     ));
 
     let rx_stream = ReceiverStream::new(rx);
@@ -390,12 +392,18 @@ async fn timeline_copy_handler(mut request: Request<Body>) -> Result<Response<Bo
     );
 
     let global_timelines = get_global_timelines(&request);
+    let wal_backup = global_timelines.get_wal_backup();
+    let storage = wal_backup
+        .get_storage()
+        .ok_or(ApiError::BadRequest(anyhow::anyhow!(
+            "Remote Storage is not configured"
+        )))?;
 
     copy_timeline::handle_request(copy_timeline::Request{
         source_ttid,
         until_lsn: request_data.until_lsn,
         destination_ttid: TenantTimelineId::new(source_ttid.tenant_id, request_data.target_timeline_id),
-    }, global_timelines)
+    }, global_timelines, storage)
         .instrument(info_span!("copy_timeline", from=%source_ttid, to=%request_data.target_timeline_id, until_lsn=%request_data.until_lsn))
         .await
         .map_err(ApiError::InternalServerError)?;
diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs
index ef2608e5d6..b4d9cadd6d 100644
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -125,12 +125,6 @@ pub struct SafeKeeperConf {
     pub enable_tls_wal_service_api: bool,
 }
 
-impl SafeKeeperConf {
-    pub fn is_wal_backup_enabled(&self) -> bool {
-        self.remote_storage.is_some() && self.wal_backup_enabled
-    }
-}
-
 impl SafeKeeperConf {
     pub fn dummy() -> Self {
         SafeKeeperConf {
diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs
index c955e667bd..14aef1ee5e 100644
--- a/safekeeper/src/pull_timeline.rs
+++ b/safekeeper/src/pull_timeline.rs
@@ -9,6 +9,7 @@ use chrono::{DateTime, Utc};
 use futures::{SinkExt, StreamExt, TryStreamExt};
 use http_utils::error::ApiError;
 use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo};
+use remote_storage::GenericRemoteStorage;
 use reqwest::Certificate;
 use safekeeper_api::Term;
 use safekeeper_api::models::{PullTimelineRequest, PullTimelineResponse, TimelineStatus};
@@ -43,6 +44,7 @@ pub async fn stream_snapshot(
     source: NodeId,
     destination: NodeId,
     tx: mpsc::Sender<Result<Bytes>>,
+    storage: Option<Arc<GenericRemoteStorage>>,
 ) {
     match tli.try_wal_residence_guard().await {
         Err(e) => {
@@ -53,10 +55,32 @@ pub async fn stream_snapshot(
         Ok(maybe_resident_tli) => {
             if let Err(e) = match maybe_resident_tli {
                 Some(resident_tli) => {
-                    stream_snapshot_resident_guts(resident_tli, source, destination, tx.clone())
-                        .await
+                    stream_snapshot_resident_guts(
+                        resident_tli,
+                        source,
+                        destination,
+                        tx.clone(),
+                        storage,
+                    )
+                    .await
+                }
+                None => {
+                    if let Some(storage) = storage {
+                        stream_snapshot_offloaded_guts(
+                            tli,
+                            source,
+                            destination,
+                            tx.clone(),
+                            &storage,
+                        )
+                        .await
+                    } else {
+                        tx.send(Err(anyhow!("remote storage not configured")))
+                            .await
+                            .ok();
+                        return;
+                    }
                 }
-                None => stream_snapshot_offloaded_guts(tli, source, destination, tx.clone()).await,
             } {
                 // Error type/contents don't matter as they won't can't reach the client
                 // (hyper likely doesn't do anything with it), but http stream will be
@@ -123,10 +147,12 @@ pub(crate) async fn stream_snapshot_offloaded_guts(
     source: NodeId,
     destination: NodeId,
     tx: mpsc::Sender<Result<Bytes>>,
+    storage: &GenericRemoteStorage,
 ) -> Result<()> {
     let mut ar = prepare_tar_stream(tx);
 
-    tli.snapshot_offloaded(&mut ar, source, destination).await?;
+    tli.snapshot_offloaded(&mut ar, source, destination, storage)
+        .await?;
 
     ar.finish().await?;
 
@@ -139,10 +165,13 @@ pub async fn stream_snapshot_resident_guts(
     source: NodeId,
     destination: NodeId,
     tx: mpsc::Sender<Result<Bytes>>,
+    storage: Option<Arc<GenericRemoteStorage>>,
 ) -> Result<()> {
     let mut ar = prepare_tar_stream(tx);
 
-    let bctx = tli.start_snapshot(&mut ar, source, destination).await?;
+    let bctx = tli
+        .start_snapshot(&mut ar, source, destination, storage)
+        .await?;
     pausable_failpoint!("sk-snapshot-after-list-pausable");
 
     let tli_dir = tli.get_timeline_dir();
@@ -182,6 +211,7 @@ impl Timeline {
         ar: &mut tokio_tar::Builder<W>,
         source: NodeId,
         destination: NodeId,
+        storage: &GenericRemoteStorage,
     ) -> Result<()> {
         // Take initial copy of control file, then release state lock
         let mut control_file = {
@@ -216,6 +246,7 @@ impl Timeline {
         // can fail if the timeline was un-evicted and modified in the background.
         let remote_timeline_path = &self.remote_path;
         wal_backup::copy_partial_segment(
+            storage,
             &replace.previous.remote_path(remote_timeline_path),
             &replace.current.remote_path(remote_timeline_path),
         )
@@ -262,6 +293,7 @@ impl WalResidentTimeline {
         ar: &mut tokio_tar::Builder<W>,
         source: NodeId,
         destination: NodeId,
+        storage: Option<Arc<GenericRemoteStorage>>,
     ) -> Result<SnapshotContext> {
         let mut shared_state = self.write_shared_state().await;
         let wal_seg_size = shared_state.get_wal_seg_size();
@@ -283,6 +315,7 @@ impl WalResidentTimeline {
 
             let remote_timeline_path = &self.tli.remote_path;
             wal_backup::copy_partial_segment(
+                &*storage.context("remote storage not configured")?,
                 &replace.previous.remote_path(remote_timeline_path),
                 &replace.current.remote_path(remote_timeline_path),
             )
diff --git a/safekeeper/src/test_utils.rs b/safekeeper/src/test_utils.rs
index 618e2b59d2..e2817c8337 100644
--- a/safekeeper/src/test_utils.rs
+++ b/safekeeper/src/test_utils.rs
@@ -18,7 +18,7 @@ use crate::send_wal::EndWatch;
 use crate::state::{TimelinePersistentState, TimelineState};
 use crate::timeline::{SharedState, StateSK, Timeline, get_timeline_dir};
 use crate::timelines_set::TimelinesSet;
-use crate::wal_backup::remote_timeline_path;
+use crate::wal_backup::{WalBackup, remote_timeline_path};
 use crate::{SafeKeeperConf, control_file, receive_wal, wal_storage};
 
 /// A Safekeeper testing or benchmarking environment. Uses a tempdir for storage, removed on drop.
@@ -101,18 +101,22 @@ impl Env {
         let safekeeper = self.make_safekeeper(node_id, ttid, start_lsn).await?;
         let shared_state = SharedState::new(StateSK::Loaded(safekeeper));
 
+        let wal_backup = Arc::new(WalBackup::new(&conf).await?);
+
         let timeline = Timeline::new(
             ttid,
             &timeline_dir,
             &remote_path,
             shared_state,
             conf.clone(),
+            wal_backup.clone(),
         );
         timeline.bootstrap(
             &mut timeline.write_shared_state().await,
             &conf,
             Arc::new(TimelinesSet::default()), // ignored for now
             RateLimiter::new(0, 0),
+            wal_backup,
         );
         Ok(timeline)
     }
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs
index b7ba28f435..588bd4f2c9 100644
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -35,7 +35,8 @@ use crate::state::{EvictionState, TimelineMemState, TimelinePersistentState, Tim
 use crate::timeline_guard::ResidenceGuard;
 use crate::timeline_manager::{AtomicStatus, ManagerCtl};
 use crate::timelines_set::TimelinesSet;
-use crate::wal_backup::{self, remote_timeline_path};
+use crate::wal_backup;
+use crate::wal_backup::{WalBackup, remote_timeline_path};
 use crate::wal_backup_partial::PartialRemoteSegment;
 use crate::wal_storage::{Storage as wal_storage_iface, WalReader};
 use crate::{SafeKeeperConf, control_file, debug_dump, timeline_manager, wal_storage};
@@ -452,6 +453,8 @@ pub struct Timeline {
     manager_ctl: ManagerCtl,
     conf: Arc<SafeKeeperConf>,
 
+    pub(crate) wal_backup: Arc<WalBackup>,
+
     remote_deletion: std::sync::Mutex<Option<RemoteDeletionReceiver>>,
 
     /// Hold this gate from code that depends on the Timeline's non-shut-down state.  While holding
@@ -476,6 +479,7 @@ impl Timeline {
         remote_path: &RemotePath,
         shared_state: SharedState,
         conf: Arc<SafeKeeperConf>,
+        wal_backup: Arc<WalBackup>,
     ) -> Arc<Self> {
         let (commit_lsn_watch_tx, commit_lsn_watch_rx) =
             watch::channel(shared_state.sk.state().commit_lsn);
@@ -509,6 +513,7 @@ impl Timeline {
             wal_backup_active: AtomicBool::new(false),
             last_removed_segno: AtomicU64::new(0),
             mgr_status: AtomicStatus::new(),
+            wal_backup,
         })
     }
 
@@ -516,6 +521,7 @@ impl Timeline {
     pub fn load_timeline(
         conf: Arc<SafeKeeperConf>,
         ttid: TenantTimelineId,
+        wal_backup: Arc<WalBackup>,
     ) -> Result<Arc<Timeline>> {
         let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered();
 
@@ -529,6 +535,7 @@ impl Timeline {
             &remote_path,
             shared_state,
             conf,
+            wal_backup,
         ))
     }
 
@@ -539,6 +546,7 @@ impl Timeline {
         conf: &SafeKeeperConf,
         broker_active_set: Arc<TimelinesSet>,
         partial_backup_rate_limiter: RateLimiter,
+        wal_backup: Arc<WalBackup>,
     ) {
         let (tx, rx) = self.manager_ctl.bootstrap_manager();
 
@@ -561,6 +569,7 @@ impl Timeline {
                     tx,
                     rx,
                     partial_backup_rate_limiter,
+                    wal_backup,
                 )
                 .await
             }
@@ -606,9 +615,10 @@ impl Timeline {
         // it is cancelled, so WAL storage won't be opened again.
         shared_state.sk.close_wal_store();
 
-        if !only_local && self.conf.is_wal_backup_enabled() {
+        if !only_local {
             self.remote_delete().await?;
         }
+
         let dir_existed = delete_dir(&self.timeline_dir).await?;
         Ok(dir_existed)
     }
@@ -675,11 +685,20 @@ impl Timeline {
         guard: &mut std::sync::MutexGuard<Option<RemoteDeletionReceiver>>,
     ) -> RemoteDeletionReceiver {
         tracing::info!("starting remote deletion");
+        let storage = self.wal_backup.get_storage().clone();
         let (result_tx, result_rx) = tokio::sync::watch::channel(None);
         let ttid = self.ttid;
         tokio::task::spawn(
             async move {
-                let r = wal_backup::delete_timeline(&ttid).await;
+                let r = if let Some(storage) = storage {
+                    wal_backup::delete_timeline(&storage, &ttid).await
+                } else {
+                    tracing::info!(
+                        "skipping remote deletion because no remote storage is configured; this effectively leaks the objects in remote storage"
+                    );
+                    Ok(())
+                };
+
                 if let Err(e) = &r {
                     // Log error here in case nobody ever listens for our result (e.g. dropped API request)
                     tracing::error!("remote deletion failed: {e}");
@@ -1046,14 +1065,13 @@ impl WalResidentTimeline {
 
     pub async fn get_walreader(&self, start_lsn: Lsn) -> Result<WalReader> {
         let (_, persisted_state) = self.get_state().await;
-        let enable_remote_read = self.conf.is_wal_backup_enabled();
 
         WalReader::new(
             &self.ttid,
             self.timeline_dir.clone(),
             &persisted_state,
             start_lsn,
-            enable_remote_read,
+            self.wal_backup.clone(),
         )
     }
 
diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs
index 84c636daf6..e817dbf6f9 100644
--- a/safekeeper/src/timeline_eviction.rs
+++ b/safekeeper/src/timeline_eviction.rs
@@ -6,7 +6,7 @@
 
 use anyhow::Context;
 use camino::Utf8PathBuf;
-use remote_storage::RemotePath;
+use remote_storage::{GenericRemoteStorage, RemotePath};
 use tokio::fs::File;
 use tokio::io::{AsyncRead, AsyncWriteExt};
 use tracing::{debug, info, instrument, warn};
@@ -68,6 +68,10 @@ impl Manager {
     #[instrument(name = "evict_timeline", skip_all)]
     pub(crate) async fn evict_timeline(&mut self) -> bool {
         assert!(!self.is_offloaded);
+        let Some(storage) = self.wal_backup.get_storage() else {
+            warn!("no remote storage configured, skipping eviction");
+            return false;
+        };
         let partial_backup_uploaded = match &self.partial_backup_uploaded {
             Some(p) => p.clone(),
             None => {
@@ -87,7 +91,7 @@ impl Manager {
                 .inc();
         });
 
-        if let Err(e) = do_eviction(self, &partial_backup_uploaded).await {
+        if let Err(e) = do_eviction(self, &partial_backup_uploaded, &storage).await {
             warn!("failed to evict timeline: {:?}", e);
             return false;
         }
@@ -102,6 +106,10 @@ impl Manager {
     #[instrument(name = "unevict_timeline", skip_all)]
     pub(crate) async fn unevict_timeline(&mut self) {
         assert!(self.is_offloaded);
+        let Some(storage) = self.wal_backup.get_storage() else {
+            warn!("no remote storage configured, skipping uneviction");
+            return;
+        };
         let partial_backup_uploaded = match &self.partial_backup_uploaded {
             Some(p) => p.clone(),
             None => {
@@ -121,7 +129,7 @@ impl Manager {
                 .inc();
         });
 
-        if let Err(e) = do_uneviction(self, &partial_backup_uploaded).await {
+        if let Err(e) = do_uneviction(self, &partial_backup_uploaded, &storage).await {
             warn!("failed to unevict timeline: {:?}", e);
             return;
         }
@@ -137,8 +145,12 @@ impl Manager {
 /// Ensure that content matches the remote partial backup, if local segment exists.
 /// Then change state in control file and in-memory. If `delete_offloaded_wal` is set,
 /// delete the local segment.
-async fn do_eviction(mgr: &mut Manager, partial: &PartialRemoteSegment) -> anyhow::Result<()> {
-    compare_local_segment_with_remote(mgr, partial).await?;
+async fn do_eviction(
+    mgr: &mut Manager,
+    partial: &PartialRemoteSegment,
+    storage: &GenericRemoteStorage,
+) -> anyhow::Result<()> {
+    compare_local_segment_with_remote(mgr, partial, storage).await?;
 
     mgr.tli.switch_to_offloaded(partial).await?;
     // switch manager state as soon as possible
@@ -153,12 +165,16 @@ async fn do_eviction(mgr: &mut Manager, partial: &PartialRemoteSegment) -> anyho
 
 /// Ensure that content matches the remote partial backup, if local segment exists.
 /// Then download segment to local disk and change state in control file and in-memory.
-async fn do_uneviction(mgr: &mut Manager, partial: &PartialRemoteSegment) -> anyhow::Result<()> {
+async fn do_uneviction(
+    mgr: &mut Manager,
+    partial: &PartialRemoteSegment,
+    storage: &GenericRemoteStorage,
+) -> anyhow::Result<()> {
     // if the local segment is present, validate it
-    compare_local_segment_with_remote(mgr, partial).await?;
+    compare_local_segment_with_remote(mgr, partial, storage).await?;
 
     // atomically download the partial segment
-    redownload_partial_segment(mgr, partial).await?;
+    redownload_partial_segment(mgr, partial, storage).await?;
 
     mgr.tli.switch_to_present().await?;
     // switch manager state as soon as possible
@@ -181,6 +197,7 @@ async fn delete_local_segment(mgr: &Manager, partial: &PartialRemoteSegment) ->
 async fn redownload_partial_segment(
     mgr: &Manager,
     partial: &PartialRemoteSegment,
+    storage: &GenericRemoteStorage,
 ) -> anyhow::Result<()> {
     let tmp_file = mgr.tli.timeline_dir().join("remote_partial.tmp");
     let remote_segfile = remote_segment_path(mgr, partial);
@@ -190,7 +207,7 @@ async fn redownload_partial_segment(
         remote_segfile, tmp_file
     );
 
-    let mut reader = wal_backup::read_object(&remote_segfile, 0).await?;
+    let mut reader = wal_backup::read_object(storage, &remote_segfile, 0).await?;
     let mut file = File::create(&tmp_file).await?;
 
     let actual_len = tokio::io::copy(&mut reader, &mut file).await?;
@@ -234,13 +251,16 @@ async fn redownload_partial_segment(
 async fn compare_local_segment_with_remote(
     mgr: &Manager,
     partial: &PartialRemoteSegment,
+    storage: &GenericRemoteStorage,
 ) -> anyhow::Result<()> {
     let local_path = local_segment_path(mgr, partial);
 
     match File::open(&local_path).await {
-        Ok(mut local_file) => do_validation(mgr, &mut local_file, mgr.wal_seg_size, partial)
-            .await
-            .context("validation failed"),
+        Ok(mut local_file) => {
+            do_validation(mgr, &mut local_file, mgr.wal_seg_size, partial, storage)
+                .await
+                .context("validation failed")
+        }
         Err(_) => {
             info!(
                 "local WAL file {} is not present, skipping validation",
@@ -258,6 +278,7 @@ async fn do_validation(
     file: &mut File,
     wal_seg_size: usize,
     partial: &PartialRemoteSegment,
+    storage: &GenericRemoteStorage,
 ) -> anyhow::Result<()> {
     let local_size = file.metadata().await?.len() as usize;
     if local_size != wal_seg_size {
@@ -270,7 +291,7 @@ async fn do_validation(
 
     let remote_segfile = remote_segment_path(mgr, partial);
     let mut remote_reader: std::pin::Pin<Box<dyn AsyncRead + Send + Sync>> =
-        wal_backup::read_object(&remote_segfile, 0).await?;
+        wal_backup::read_object(storage, &remote_segfile, 0).await?;
 
     // remote segment should have bytes excatly up to `flush_lsn`
     let expected_remote_size = partial.flush_lsn.segment_offset(mgr.wal_seg_size);
diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs
index 71e99a4de7..48eda92fed 100644
--- a/safekeeper/src/timeline_manager.rs
+++ b/safekeeper/src/timeline_manager.rs
@@ -35,7 +35,7 @@ use crate::state::TimelineState;
 use crate::timeline::{ManagerTimeline, ReadGuardSharedState, StateSK, WalResidentTimeline};
 use crate::timeline_guard::{AccessService, GuardId, ResidenceGuard};
 use crate::timelines_set::{TimelineSetGuard, TimelinesSet};
-use crate::wal_backup::{self, WalBackupTaskHandle};
+use crate::wal_backup::{self, WalBackup, WalBackupTaskHandle};
 use crate::wal_backup_partial::{self, PartialBackup, PartialRemoteSegment};
 
 pub(crate) struct StateSnapshot {
@@ -200,6 +200,7 @@ pub(crate) struct Manager {
     pub(crate) conf: SafeKeeperConf,
     pub(crate) wal_seg_size: usize,
     pub(crate) walsenders: Arc<WalSenders>,
+    pub(crate) wal_backup: Arc<WalBackup>,
 
     // current state
     pub(crate) state_version_rx: tokio::sync::watch::Receiver<usize>,
@@ -238,6 +239,7 @@ pub async fn main_task(
     manager_tx: tokio::sync::mpsc::UnboundedSender<ManagerCtlMessage>,
     mut manager_rx: tokio::sync::mpsc::UnboundedReceiver<ManagerCtlMessage>,
     global_rate_limiter: RateLimiter,
+    wal_backup: Arc<WalBackup>,
 ) {
     tli.set_status(Status::Started);
 
@@ -256,6 +258,7 @@ pub async fn main_task(
         broker_active_set,
         manager_tx,
         global_rate_limiter,
+        wal_backup,
     )
     .await;
 
@@ -371,7 +374,7 @@ pub async fn main_task(
     mgr.tli_broker_active.set(false);
 
     // shutdown background tasks
-    if mgr.conf.is_wal_backup_enabled() {
+    if let Some(storage) = mgr.wal_backup.get_storage() {
         if let Some(backup_task) = mgr.backup_task.take() {
             // If we fell through here, then the timeline is shutting down. This is important
             // because otherwise joining on the wal_backup handle might hang.
@@ -379,7 +382,7 @@ pub async fn main_task(
 
             backup_task.join().await;
         }
-        wal_backup::update_task(&mut mgr, false, &last_state).await;
+        wal_backup::update_task(&mut mgr, storage, false, &last_state).await;
     }
 
     if let Some(recovery_task) = &mut mgr.recovery_task {
@@ -415,11 +418,13 @@ impl Manager {
         broker_active_set: Arc<TimelinesSet>,
         manager_tx: tokio::sync::mpsc::UnboundedSender<ManagerCtlMessage>,
         global_rate_limiter: RateLimiter,
+        wal_backup: Arc<WalBackup>,
     ) -> Manager {
         let (is_offloaded, partial_backup_uploaded) = tli.bootstrap_mgr().await;
         Manager {
             wal_seg_size: tli.get_wal_seg_size().await,
             walsenders: tli.get_walsenders().clone(),
+            wal_backup,
             state_version_rx: tli.get_state_version_rx(),
             num_computes_rx: tli.get_walreceivers().get_num_rx(),
             tli_broker_active: broker_active_set.guard(tli.clone()),
@@ -477,8 +482,8 @@ impl Manager {
         let is_wal_backup_required =
             wal_backup::is_wal_backup_required(self.wal_seg_size, num_computes, state);
 
-        if self.conf.is_wal_backup_enabled() {
-            wal_backup::update_task(self, is_wal_backup_required, state).await;
+        if let Some(storage) = self.wal_backup.get_storage() {
+            wal_backup::update_task(self, storage, is_wal_backup_required, state).await;
         }
 
         // update the state in Arc<Timeline>
@@ -624,9 +629,9 @@ impl Manager {
     /// Spawns partial WAL backup task if needed.
     async fn update_partial_backup(&mut self, state: &StateSnapshot) {
         // check if WAL backup is enabled and should be started
-        if !self.conf.is_wal_backup_enabled() {
+        let Some(storage) = self.wal_backup.get_storage() else {
             return;
-        }
+        };
 
         if self.partial_backup_task.is_some() {
             // partial backup is already running
@@ -650,6 +655,7 @@ impl Manager {
             self.conf.clone(),
             self.global_rate_limiter.clone(),
             cancel.clone(),
+            storage,
         ));
         self.partial_backup_task = Some((handle, cancel));
     }
@@ -669,6 +675,10 @@ impl Manager {
     /// Reset partial backup state and remove its remote storage data. Since it
     /// might concurrently uploading something, cancel the task first.
     async fn backup_partial_reset(&mut self) -> anyhow::Result<Vec<String>> {
+        let Some(storage) = self.wal_backup.get_storage() else {
+            anyhow::bail!("remote storage is not enabled");
+        };
+
         info!("resetting partial backup state");
         // Force unevict timeline if it is evicted before erasing partial backup
         // state. The intended use of this function is to drop corrupted remote
@@ -689,7 +699,7 @@ impl Manager {
         }
 
         let tli = self.wal_resident_timeline()?;
-        let mut partial_backup = PartialBackup::new(tli, self.conf.clone()).await;
+        let mut partial_backup = PartialBackup::new(tli, self.conf.clone(), storage).await;
         // Reset might fail e.g. when cfile is already reset but s3 removal
         // failed, so set manager state to None beforehand. In any case caller
         // is expected to retry until success.
diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs
index 41abee369e..af33bcbd20 100644
--- a/safekeeper/src/timelines_global_map.rs
+++ b/safekeeper/src/timelines_global_map.rs
@@ -25,6 +25,7 @@ use crate::rate_limit::RateLimiter;
 use crate::state::TimelinePersistentState;
 use crate::timeline::{Timeline, TimelineError, delete_dir, get_tenant_dir, get_timeline_dir};
 use crate::timelines_set::TimelinesSet;
+use crate::wal_backup::WalBackup;
 use crate::wal_storage::Storage;
 use crate::{SafeKeeperConf, control_file, wal_storage};
 
@@ -47,15 +48,24 @@ struct GlobalTimelinesState {
     conf: Arc<SafeKeeperConf>,
     broker_active_set: Arc<TimelinesSet>,
     global_rate_limiter: RateLimiter,
+    wal_backup: Arc<WalBackup>,
 }
 
 impl GlobalTimelinesState {
     /// Get dependencies for a timeline constructor.
-    fn get_dependencies(&self) -> (Arc<SafeKeeperConf>, Arc<TimelinesSet>, RateLimiter) {
+    fn get_dependencies(
+        &self,
+    ) -> (
+        Arc<SafeKeeperConf>,
+        Arc<TimelinesSet>,
+        RateLimiter,
+        Arc<WalBackup>,
+    ) {
         (
             self.conf.clone(),
             self.broker_active_set.clone(),
             self.global_rate_limiter.clone(),
+            self.wal_backup.clone(),
         )
     }
 
@@ -84,7 +94,7 @@ pub struct GlobalTimelines {
 
 impl GlobalTimelines {
     /// Create a new instance of the global timelines map.
-    pub fn new(conf: Arc<SafeKeeperConf>) -> Self {
+    pub fn new(conf: Arc<SafeKeeperConf>, wal_backup: Arc<WalBackup>) -> Self {
         Self {
             state: Mutex::new(GlobalTimelinesState {
                 timelines: HashMap::new(),
@@ -92,6 +102,7 @@ impl GlobalTimelines {
                 conf,
                 broker_active_set: Arc::new(TimelinesSet::default()),
                 global_rate_limiter: RateLimiter::new(1, 1),
+                wal_backup,
             }),
         }
     }
@@ -147,7 +158,7 @@ impl GlobalTimelines {
     /// just lock and unlock it for each timeline -- this function is called
     /// during init when nothing else is running, so this is fine.
     async fn load_tenant_timelines(&self, tenant_id: TenantId) -> Result<()> {
-        let (conf, broker_active_set, partial_backup_rate_limiter) = {
+        let (conf, broker_active_set, partial_backup_rate_limiter, wal_backup) = {
             let state = self.state.lock().unwrap();
             state.get_dependencies()
         };
@@ -162,7 +173,7 @@ impl GlobalTimelines {
                         TimelineId::from_str(timeline_dir_entry.file_name().to_str().unwrap_or(""))
                     {
                         let ttid = TenantTimelineId::new(tenant_id, timeline_id);
-                        match Timeline::load_timeline(conf.clone(), ttid) {
+                        match Timeline::load_timeline(conf.clone(), ttid, wal_backup.clone()) {
                             Ok(tli) => {
                                 let mut shared_state = tli.write_shared_state().await;
                                 self.state
@@ -175,6 +186,7 @@ impl GlobalTimelines {
                                     &conf,
                                     broker_active_set.clone(),
                                     partial_backup_rate_limiter.clone(),
+                                    wal_backup.clone(),
                                 );
                             }
                             // If we can't load a timeline, it's most likely because of a corrupted
@@ -212,6 +224,10 @@ impl GlobalTimelines {
         self.state.lock().unwrap().broker_active_set.clone()
     }
 
+    pub fn get_wal_backup(&self) -> Arc<WalBackup> {
+        self.state.lock().unwrap().wal_backup.clone()
+    }
+
     /// Create a new timeline with the given id. If the timeline already exists, returns
     /// an existing timeline.
     pub(crate) async fn create(
@@ -222,7 +238,7 @@ impl GlobalTimelines {
         start_lsn: Lsn,
         commit_lsn: Lsn,
     ) -> Result<Arc<Timeline>> {
-        let (conf, _, _) = {
+        let (conf, _, _, _) = {
             let state = self.state.lock().unwrap();
             if let Ok(timeline) = state.get(&ttid) {
                 // Timeline already exists, return it.
@@ -267,7 +283,7 @@ impl GlobalTimelines {
         check_tombstone: bool,
     ) -> Result<Arc<Timeline>> {
         // Check for existence and mark that we're creating it.
-        let (conf, broker_active_set, partial_backup_rate_limiter) = {
+        let (conf, broker_active_set, partial_backup_rate_limiter, wal_backup) = {
             let mut state = self.state.lock().unwrap();
             match state.timelines.get(&ttid) {
                 Some(GlobalMapTimeline::CreationInProgress) => {
@@ -296,7 +312,14 @@ impl GlobalTimelines {
         };
 
         // Do the actual move and reflect the result in the map.
-        match GlobalTimelines::install_temp_timeline(ttid, tmp_path, conf.clone()).await {
+        match GlobalTimelines::install_temp_timeline(
+            ttid,
+            tmp_path,
+            conf.clone(),
+            wal_backup.clone(),
+        )
+        .await
+        {
             Ok(timeline) => {
                 let mut timeline_shared_state = timeline.write_shared_state().await;
                 let mut state = self.state.lock().unwrap();
@@ -314,6 +337,7 @@ impl GlobalTimelines {
                     &conf,
                     broker_active_set,
                     partial_backup_rate_limiter,
+                    wal_backup,
                 );
                 drop(timeline_shared_state);
                 Ok(timeline)
@@ -336,6 +360,7 @@ impl GlobalTimelines {
         ttid: TenantTimelineId,
         tmp_path: &Utf8PathBuf,
         conf: Arc<SafeKeeperConf>,
+        wal_backup: Arc<WalBackup>,
     ) -> Result<Arc<Timeline>> {
         let tenant_path = get_tenant_dir(conf.as_ref(), &ttid.tenant_id);
         let timeline_path = get_timeline_dir(conf.as_ref(), &ttid);
@@ -377,7 +402,7 @@ impl GlobalTimelines {
         // Do the move.
         durable_rename(tmp_path, &timeline_path, !conf.no_sync).await?;
 
-        Timeline::load_timeline(conf, ttid)
+        Timeline::load_timeline(conf, ttid, wal_backup)
     }
 
     /// Get a timeline from the global map. If it's not present, it doesn't exist on disk,
diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs
index 56f4a2faf9..0beb272a60 100644
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -2,6 +2,7 @@ use std::cmp::min;
 use std::collections::HashSet;
 use std::num::NonZeroU32;
 use std::pin::Pin;
+use std::sync::Arc;
 use std::time::Duration;
 
 use anyhow::{Context, Result};
@@ -17,7 +18,7 @@ use safekeeper_api::models::PeerInfo;
 use tokio::fs::File;
 use tokio::select;
 use tokio::sync::mpsc::{self, Receiver, Sender};
-use tokio::sync::{OnceCell, watch};
+use tokio::sync::watch;
 use tokio::task::JoinHandle;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
@@ -63,7 +64,12 @@ pub(crate) fn is_wal_backup_required(
 /// Based on peer information determine which safekeeper should offload; if it
 /// is me, run (per timeline) task, if not yet. OTOH, if it is not me and task
 /// is running, kill it.
-pub(crate) async fn update_task(mgr: &mut Manager, need_backup: bool, state: &StateSnapshot) {
+pub(crate) async fn update_task(
+    mgr: &mut Manager,
+    storage: Arc<GenericRemoteStorage>,
+    need_backup: bool,
+    state: &StateSnapshot,
+) {
     let (offloader, election_dbg_str) =
         determine_offloader(&state.peers, state.backup_lsn, mgr.tli.ttid, &mgr.conf);
     let elected_me = Some(mgr.conf.my_id) == offloader;
@@ -82,7 +88,12 @@ pub(crate) async fn update_task(mgr: &mut Manager, need_backup: bool, state: &St
                 return;
             };
 
-            let async_task = backup_task_main(resident, mgr.conf.backup_parallel_jobs, shutdown_rx);
+            let async_task = backup_task_main(
+                resident,
+                storage,
+                mgr.conf.backup_parallel_jobs,
+                shutdown_rx,
+            );
 
             let handle = if mgr.conf.current_thread_runtime {
                 tokio::spawn(async_task)
@@ -169,33 +180,31 @@ fn determine_offloader(
     }
 }
 
-static REMOTE_STORAGE: OnceCell<Option<GenericRemoteStorage>> = OnceCell::const_new();
-
-// Storage must be configured and initialized when this is called.
-fn get_configured_remote_storage() -> &'static GenericRemoteStorage {
-    REMOTE_STORAGE
-        .get()
-        .expect("failed to get remote storage")
-        .as_ref()
-        .unwrap()
+pub struct WalBackup {
+    storage: Option<Arc<GenericRemoteStorage>>,
 }
 
-pub async fn init_remote_storage(conf: &SafeKeeperConf) {
-    // TODO: refactor REMOTE_STORAGE to avoid using global variables, and provide
-    // dependencies to all tasks instead.
-    REMOTE_STORAGE
-        .get_or_init(|| async {
-            if let Some(conf) = conf.remote_storage.as_ref() {
-                Some(
-                    GenericRemoteStorage::from_config(conf)
-                        .await
-                        .expect("failed to create remote storage"),
-                )
-            } else {
-                None
+impl WalBackup {
+    /// Build a WalBackup; storage is `None` if backup is disabled or no remote storage is set.
+    pub async fn new(conf: &SafeKeeperConf) -> Result<Self> {
+        if !conf.wal_backup_enabled {
+            return Ok(Self { storage: None });
+        }
+
+        match conf.remote_storage.as_ref() {
+            Some(config) => {
+                let storage = GenericRemoteStorage::from_config(config).await?;
+                Ok(Self {
+                    storage: Some(Arc::new(storage)),
+                })
             }
-        })
-        .await;
+            None => Ok(Self { storage: None }),
+        }
+    }
+
+    pub fn get_storage(&self) -> Option<Arc<GenericRemoteStorage>> {
+        self.storage.clone()
+    }
 }
 
 struct WalBackupTask {
@@ -204,12 +213,14 @@ struct WalBackupTask {
     wal_seg_size: usize,
     parallel_jobs: usize,
     commit_lsn_watch_rx: watch::Receiver<Lsn>,
+    storage: Arc<GenericRemoteStorage>,
 }
 
 /// Offload single timeline.
 #[instrument(name = "wal_backup", skip_all, fields(ttid = %tli.ttid))]
 async fn backup_task_main(
     tli: WalResidentTimeline,
+    storage: Arc<GenericRemoteStorage>,
     parallel_jobs: usize,
     mut shutdown_rx: Receiver<()>,
 ) {
@@ -223,6 +234,7 @@ async fn backup_task_main(
         timeline_dir: tli.get_timeline_dir(),
         timeline: tli,
         parallel_jobs,
+        storage,
     };
 
     // task is spinned up only when wal_seg_size already initialized
@@ -293,6 +305,7 @@ impl WalBackupTask {
 
             match backup_lsn_range(
                 &self.timeline,
+                self.storage.clone(),
                 &mut backup_lsn,
                 commit_lsn,
                 self.wal_seg_size,
@@ -322,6 +335,7 @@ impl WalBackupTask {
 
 async fn backup_lsn_range(
     timeline: &WalResidentTimeline,
+    storage: Arc<GenericRemoteStorage>,
     backup_lsn: &mut Lsn,
     end_lsn: Lsn,
     wal_seg_size: usize,
@@ -352,7 +366,12 @@ async fn backup_lsn_range(
     loop {
         let added_task = match iter.next() {
             Some(s) => {
-                uploads.push_back(backup_single_segment(s, timeline_dir, remote_timeline_path));
+                uploads.push_back(backup_single_segment(
+                    &storage,
+                    s,
+                    timeline_dir,
+                    remote_timeline_path,
+                ));
                 true
             }
             None => false,
@@ -388,6 +407,7 @@ async fn backup_lsn_range(
 }
 
 async fn backup_single_segment(
+    storage: &GenericRemoteStorage,
     seg: &Segment,
     timeline_dir: &Utf8Path,
     remote_timeline_path: &RemotePath,
@@ -395,7 +415,13 @@ async fn backup_single_segment(
     let segment_file_path = seg.file_path(timeline_dir)?;
     let remote_segment_path = seg.remote_path(remote_timeline_path);
 
-    let res = backup_object(&segment_file_path, &remote_segment_path, seg.size()).await;
+    let res = backup_object(
+        storage,
+        &segment_file_path,
+        &remote_segment_path,
+        seg.size(),
+    )
+    .await;
     if res.is_ok() {
         BACKED_UP_SEGMENTS.inc();
     } else {
@@ -455,12 +481,11 @@ fn get_segments(start: Lsn, end: Lsn, seg_size: usize) -> Vec<Segment> {
 }
 
 async fn backup_object(
+    storage: &GenericRemoteStorage,
     source_file: &Utf8Path,
     target_file: &RemotePath,
     size: usize,
 ) -> Result<()> {
-    let storage = get_configured_remote_storage();
-
     let file = File::open(&source_file)
         .await
         .with_context(|| format!("Failed to open file {source_file:?} for wal backup"))?;
@@ -475,12 +500,11 @@ async fn backup_object(
 }
 
 pub(crate) async fn backup_partial_segment(
+    storage: &GenericRemoteStorage,
     source_file: &Utf8Path,
     target_file: &RemotePath,
     size: usize,
 ) -> Result<()> {
-    let storage = get_configured_remote_storage();
-
     let file = File::open(&source_file)
         .await
         .with_context(|| format!("Failed to open file {source_file:?} for wal backup"))?;
@@ -504,25 +528,20 @@ pub(crate) async fn backup_partial_segment(
 }
 
 pub(crate) async fn copy_partial_segment(
+    storage: &GenericRemoteStorage,
     source: &RemotePath,
     destination: &RemotePath,
 ) -> Result<()> {
-    let storage = get_configured_remote_storage();
     let cancel = CancellationToken::new();
 
     storage.copy_object(source, destination, &cancel).await
 }
 
 pub async fn read_object(
+    storage: &GenericRemoteStorage,
     file_path: &RemotePath,
     offset: u64,
 ) -> anyhow::Result<Pin<Box<dyn tokio::io::AsyncRead + Send + Sync>>> {
-    let storage = REMOTE_STORAGE
-        .get()
-        .context("Failed to get remote storage")?
-        .as_ref()
-        .context("No remote storage configured")?;
-
     info!("segment download about to start from remote path {file_path:?} at offset {offset}");
 
     let cancel = CancellationToken::new();
@@ -547,8 +566,10 @@ pub async fn read_object(
 
 /// Delete WAL files for the given timeline. Remote storage must be configured
 /// when called.
-pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
-    let storage = get_configured_remote_storage();
+pub async fn delete_timeline(
+    storage: &GenericRemoteStorage,
+    ttid: &TenantTimelineId,
+) -> Result<()> {
     let remote_path = remote_timeline_path(ttid)?;
 
     // see DEFAULT_MAX_KEYS_PER_LIST_RESPONSE
@@ -618,14 +639,14 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
 }
 
 /// Used by wal_backup_partial.
-pub async fn delete_objects(paths: &[RemotePath]) -> Result<()> {
+pub async fn delete_objects(storage: &GenericRemoteStorage, paths: &[RemotePath]) -> Result<()> {
     let cancel = CancellationToken::new(); // not really used
-    let storage = get_configured_remote_storage();
     storage.delete_objects(paths, &cancel).await
 }
 
 /// Copy segments from one timeline to another. Used in copy_timeline.
 pub async fn copy_s3_segments(
+    storage: &GenericRemoteStorage,
     wal_seg_size: usize,
     src_ttid: &TenantTimelineId,
     dst_ttid: &TenantTimelineId,
@@ -634,12 +655,6 @@ pub async fn copy_s3_segments(
 ) -> Result<()> {
     const SEGMENTS_PROGRESS_REPORT_INTERVAL: u64 = 1024;
 
-    let storage = REMOTE_STORAGE
-        .get()
-        .expect("failed to get remote storage")
-        .as_ref()
-        .unwrap();
-
     let remote_dst_path = remote_timeline_path(dst_ttid)?;
 
     let cancel = CancellationToken::new();
diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs
index 049852a048..fe0f1b3607 100644
--- a/safekeeper/src/wal_backup_partial.rs
+++ b/safekeeper/src/wal_backup_partial.rs
@@ -19,9 +19,11 @@
 //! file. Code updates state in the control file before doing any S3 operations.
 //! This way control file stores information about all potentially existing
 //! remote partial segments and can clean them up after uploading a newer version.
+use std::sync::Arc;
+
 use camino::Utf8PathBuf;
 use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo};
-use remote_storage::RemotePath;
+use remote_storage::{GenericRemoteStorage, RemotePath};
 use safekeeper_api::Term;
 use serde::{Deserialize, Serialize};
 use tokio_util::sync::CancellationToken;
@@ -154,12 +156,16 @@ pub struct PartialBackup {
     conf: SafeKeeperConf,
     local_prefix: Utf8PathBuf,
     remote_timeline_path: RemotePath,
-
+    storage: Arc<GenericRemoteStorage>,
     state: State,
 }
 
 impl PartialBackup {
-    pub async fn new(tli: WalResidentTimeline, conf: SafeKeeperConf) -> PartialBackup {
+    pub async fn new(
+        tli: WalResidentTimeline,
+        conf: SafeKeeperConf,
+        storage: Arc<GenericRemoteStorage>,
+    ) -> PartialBackup {
         let (_, persistent_state) = tli.get_state().await;
         let wal_seg_size = tli.get_wal_seg_size().await;
 
@@ -173,6 +179,7 @@ impl PartialBackup {
             conf,
             local_prefix,
             remote_timeline_path,
+            storage,
         }
     }
 
@@ -240,7 +247,8 @@ impl PartialBackup {
         let remote_path = prepared.remote_path(&self.remote_timeline_path);
 
         // Upload first `backup_bytes` bytes of the segment to the remote storage.
-        wal_backup::backup_partial_segment(&local_path, &remote_path, backup_bytes).await?;
+        wal_backup::backup_partial_segment(&self.storage, &local_path, &remote_path, backup_bytes)
+            .await?;
         PARTIAL_BACKUP_UPLOADED_BYTES.inc_by(backup_bytes as u64);
 
         // We uploaded the segment, now let's verify that the data is still actual.
@@ -326,7 +334,7 @@ impl PartialBackup {
             let remote_path = self.remote_timeline_path.join(seg);
             objects_to_delete.push(remote_path);
         }
-        wal_backup::delete_objects(&objects_to_delete).await
+        wal_backup::delete_objects(&self.storage, &objects_to_delete).await
     }
 
     /// Delete all non-Uploaded segments from the remote storage. There should be only one
@@ -424,6 +432,7 @@ pub async fn main_task(
     conf: SafeKeeperConf,
     limiter: RateLimiter,
     cancel: CancellationToken,
+    storage: Arc<GenericRemoteStorage>,
 ) -> Option<PartialRemoteSegment> {
     debug!("started");
     let await_duration = conf.partial_backup_timeout;
@@ -432,7 +441,7 @@ pub async fn main_task(
     let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx();
     let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx();
 
-    let mut backup = PartialBackup::new(tli, conf).await;
+    let mut backup = PartialBackup::new(tli, conf, storage).await;
 
     debug!("state: {:?}", backup.state);
 
diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs
index f0bac4b40a..8ba3e7cc47 100644
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -21,6 +21,7 @@ use postgres_ffi::waldecoder::WalStreamDecoder;
 use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo, dispatch_pgversion};
 use pq_proto::SystemId;
 use remote_storage::RemotePath;
+use std::sync::Arc;
 use tokio::fs::{self, File, OpenOptions, remove_file};
 use tokio::io::{AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt};
 use tracing::*;
@@ -32,7 +33,7 @@ use crate::metrics::{
     REMOVED_WAL_SEGMENTS, WAL_STORAGE_OPERATION_SECONDS, WalStorageMetrics, time_io_closure,
 };
 use crate::state::TimelinePersistentState;
-use crate::wal_backup::{read_object, remote_timeline_path};
+use crate::wal_backup::{WalBackup, read_object, remote_timeline_path};
 
 pub trait Storage {
     // Last written LSN.
@@ -645,7 +646,7 @@ pub struct WalReader {
     wal_segment: Option<Pin<Box<dyn AsyncRead + Send + Sync>>>,
 
     // S3 will be used to read WAL if LSN is not available locally
-    enable_remote_read: bool,
+    wal_backup: Arc<WalBackup>,
 
     // We don't have WAL locally if LSN is less than local_start_lsn
     local_start_lsn: Lsn,
@@ -664,7 +665,7 @@ impl WalReader {
         timeline_dir: Utf8PathBuf,
         state: &TimelinePersistentState,
         start_pos: Lsn,
-        enable_remote_read: bool,
+        wal_backup: Arc<WalBackup>,
     ) -> Result<Self> {
         if state.server.wal_seg_size == 0 || state.local_start_lsn == Lsn(0) {
             bail!("state uninitialized, no data to read");
@@ -693,7 +694,7 @@ impl WalReader {
             wal_seg_size: state.server.wal_seg_size as usize,
             pos: start_pos,
             wal_segment: None,
-            enable_remote_read,
+            wal_backup,
             local_start_lsn: state.local_start_lsn,
             timeline_start_lsn: state.timeline_start_lsn,
             pg_version: state.server.pg_version / 10000,
@@ -812,9 +813,9 @@ impl WalReader {
         }
 
         // Try to open remote file, if remote reads are enabled
-        if self.enable_remote_read {
+        if let Some(storage) = self.wal_backup.get_storage() {
             let remote_wal_file_path = self.remote_path.join(&wal_file_name);
-            return read_object(&remote_wal_file_path, xlogoff as u64).await;
+            return read_object(&storage, &remote_wal_file_path, xlogoff as u64).await;
         }
 
         bail!("WAL segment is not found")

From baafcc5d4108b1be38edf428c3f3dd87cc0c9508 Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Fri, 16 May 2025 14:12:39 +0000
Subject: [PATCH 53/65] proxy: Fix misspelled flag value alias, swap names and
 aliases (#11949)

## Problem

There's a misspelled flag value alias that's not really used anywhere.

## Summary of changes

Fix the alias and make aliases the official flag values and keep old
values as aliases.
Also rename enum variant. No need for it to carry the version now.
---
 proxy/src/binary/proxy.rs | 9 +++++----
 proxy/src/context/mod.rs  | 2 +-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs
index 4cb5ddc335..51713902bc 100644
--- a/proxy/src/binary/proxy.rs
+++ b/proxy/src/binary/proxy.rs
@@ -43,11 +43,12 @@ project_build_tag!(BUILD_TAG);
 use clap::{Parser, ValueEnum};
 
 #[derive(Clone, Debug, ValueEnum)]
+#[clap(rename_all = "kebab-case")]
 enum AuthBackendType {
-    #[value(name("cplane-v1"), alias("control-plane"))]
-    ControlPlaneV1,
+    #[clap(alias("cplane-v1"))]
+    ControlPlane,
 
-    #[value(name("link"), alias("control-redirect"))]
+    #[clap(alias("link"))]
     ConsoleRedirect,
 
     #[cfg(any(test, feature = "testing"))]
@@ -707,7 +708,7 @@ fn build_auth_backend(
     args: &ProxyCliArgs,
 ) -> anyhow::Result<Either<&'static auth::Backend<'static, ()>, &'static ConsoleRedirectBackend>> {
     match &args.auth_backend {
-        AuthBackendType::ControlPlaneV1 => {
+        AuthBackendType::ControlPlane => {
             let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
             let project_info_cache_config: ProjectInfoCacheOptions =
                 args.project_info_cache.parse()?;
diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs
index 5f649d2b21..79aaf22990 100644
--- a/proxy/src/context/mod.rs
+++ b/proxy/src/context/mod.rs
@@ -78,7 +78,7 @@ struct RequestContextInner {
 
 #[derive(Clone, Debug)]
 pub(crate) enum AuthMethod {
-    // aka passwordless, fka link
+    // aka link
     ConsoleRedirect,
     ScramSha256,
     ScramSha256Plus,

From 55f91cf10b30c3c648ac1301b95cd049bd7f0e21 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Fri, 16 May 2025 17:45:08 +0300
Subject: [PATCH 54/65] Update 'nix' package (#11948)

There were some incompatible changes. Most churn was from switching from
the now-deprecated fcntl::flock() function to
fcntl::Flock::lock(). The new function returns a guard object, while
with the old function, the lock was associated directly with the file
descriptor.

It's good to stay up-to-date in general, but the impetus to do this now
is that in https://github.com/neondatabase/neon/pull/11929, I want to
use some functions that were added only in the latest version of 'nix',
and it's nice to not have to build multiple versions. (Although,
different versions of 'nix' are still pulled in as indirect dependencies
from other packages)
---
 Cargo.lock                                    | 25 +++++---
 Cargo.toml                                    |  2 +-
 control_plane/src/background_process.rs       |  4 +-
 control_plane/src/bin/neon_local.rs           | 13 ++--
 libs/utils/src/crashsafe.rs                   |  6 +-
 libs/utils/src/fs_ext/rename_noreplace.rs     |  4 +-
 libs/utils/src/lock_file.rs                   | 63 ++++++++++---------
 pageserver/src/tenant/secondary/downloader.rs |  4 +-
 pageserver/src/virtual_file.rs                |  2 +-
 9 files changed, 66 insertions(+), 57 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index f075b45e49..1edd20105d 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1112,6 +1112,12 @@ version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
 
+[[package]]
+name = "cfg_aliases"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"
+
 [[package]]
 name = "cgroups-rs"
 version = "0.3.3"
@@ -1306,7 +1312,7 @@ dependencies = [
  "itertools 0.10.5",
  "jsonwebtoken",
  "metrics",
- "nix 0.27.1",
+ "nix 0.30.1",
  "notify",
  "num_cpus",
  "once_cell",
@@ -1429,7 +1435,7 @@ dependencies = [
  "humantime-serde",
  "hyper 0.14.30",
  "jsonwebtoken",
- "nix 0.27.1",
+ "nix 0.30.1",
  "once_cell",
  "pageserver_api",
  "pageserver_client",
@@ -3512,9 +3518,9 @@ dependencies = [
 
 [[package]]
 name = "libc"
-version = "0.2.169"
+version = "0.2.172"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"
+checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa"
 
 [[package]]
 name = "libloading"
@@ -3821,12 +3827,13 @@ dependencies = [
 
 [[package]]
 name = "nix"
-version = "0.27.1"
+version = "0.30.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053"
+checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6"
 dependencies = [
  "bitflags 2.8.0",
  "cfg-if",
+ "cfg_aliases",
  "libc",
  "memoffset 0.9.0",
 ]
@@ -4280,7 +4287,7 @@ dependencies = [
  "jsonwebtoken",
  "md5",
  "metrics",
- "nix 0.27.1",
+ "nix 0.30.1",
  "num-traits",
  "num_cpus",
  "once_cell",
@@ -4356,7 +4363,7 @@ dependencies = [
  "humantime",
  "humantime-serde",
  "itertools 0.10.5",
- "nix 0.27.1",
+ "nix 0.30.1",
  "once_cell",
  "postgres_backend",
  "postgres_ffi",
@@ -7899,7 +7906,7 @@ dependencies = [
  "humantime",
  "jsonwebtoken",
  "metrics",
- "nix 0.27.1",
+ "nix 0.30.1",
  "once_cell",
  "pem",
  "pin-project-lite",
diff --git a/Cargo.toml b/Cargo.toml
index 6b87ce549d..d6fffe7768 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -127,7 +127,7 @@ md5 = "0.7.0"
 measured = { version = "0.0.22", features=["lasso"] }
 measured-process = { version = "0.0.22" }
 memoffset = "0.9"
-nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] }
+nix = { version = "0.30.1", features = ["dir", "fs", "process", "socket", "signal", "poll"] }
 # Do not update to >= 7.0.0, at least. The update will have a significant impact
 # on compute startup metrics (start_postgres_ms), >= 25% degradation.
 notify = "6.0.0"
diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs
index 1eac4f7ff0..4f0934e411 100644
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -14,7 +14,7 @@
 
 use std::ffi::OsStr;
 use std::io::Write;
-use std::os::unix::prelude::AsRawFd;
+use std::os::fd::AsFd;
 use std::os::unix::process::CommandExt;
 use std::path::Path;
 use std::process::Command;
@@ -356,7 +356,7 @@ where
             let file = pid_file::claim_for_current_process(&path).expect("claim pid file");
             // Remove the FD_CLOEXEC flag on the pidfile descriptor so that the pidfile
             // remains locked after exec.
-            nix::fcntl::fcntl(file.as_raw_fd(), FcntlArg::F_SETFD(FdFlag::empty()))
+            nix::fcntl::fcntl(file.as_fd(), FcntlArg::F_SETFD(FdFlag::empty()))
                 .expect("remove FD_CLOEXEC");
             // Don't run drop(file), it would close the file before we actually exec.
             std::mem::forget(file);
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 191a22f1de..98ab6e5657 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -8,7 +8,6 @@
 use std::borrow::Cow;
 use std::collections::{BTreeSet, HashMap};
 use std::fs::File;
-use std::os::fd::AsRawFd;
 use std::path::PathBuf;
 use std::process::exit;
 use std::str::FromStr;
@@ -31,7 +30,7 @@ use control_plane::safekeeper::SafekeeperNode;
 use control_plane::storage_controller::{
     NeonStorageControllerStartArgs, NeonStorageControllerStopArgs, StorageController,
 };
-use nix::fcntl::{FlockArg, flock};
+use nix::fcntl::{Flock, FlockArg};
 use pageserver_api::config::{
     DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
     DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
@@ -749,16 +748,16 @@ struct TimelineTreeEl {
 
 /// A flock-based guard over the neon_local repository directory
 struct RepoLock {
-    _file: File,
+    _file: Flock<File>,
 }
 
 impl RepoLock {
     fn new() -> Result<Self> {
         let repo_dir = File::open(local_env::base_path())?;
-        let repo_dir_fd = repo_dir.as_raw_fd();
-        flock(repo_dir_fd, FlockArg::LockExclusive)?;
-
-        Ok(Self { _file: repo_dir })
+        match Flock::lock(repo_dir, FlockArg::LockExclusive) {
+            Ok(f) => Ok(Self { _file: f }),
+            Err((_, e)) => Err(e).context("flock error"),
+        }
     }
 }
 
diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs
index 215fa36df4..45acaf682f 100644
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -1,7 +1,7 @@
 use std::borrow::Cow;
 use std::fs::{self, File};
 use std::io::{self, Write};
-use std::os::fd::AsRawFd;
+use std::os::fd::AsFd;
 
 use camino::{Utf8Path, Utf8PathBuf};
 
@@ -210,13 +210,13 @@ pub fn overwrite(
 
 /// Syncs the filesystem for the given file descriptor.
 #[cfg_attr(target_os = "macos", allow(unused_variables))]
-pub fn syncfs(fd: impl AsRawFd) -> anyhow::Result<()> {
+pub fn syncfs(fd: impl AsFd) -> anyhow::Result<()> {
     // Linux guarantees durability for syncfs.
     // POSIX doesn't have syncfs, and further does not actually guarantee durability of sync().
     #[cfg(target_os = "linux")]
     {
         use anyhow::Context;
-        nix::unistd::syncfs(fd.as_raw_fd()).context("syncfs")?;
+        nix::unistd::syncfs(fd).context("syncfs")?;
     }
     #[cfg(target_os = "macos")]
     {
diff --git a/libs/utils/src/fs_ext/rename_noreplace.rs b/libs/utils/src/fs_ext/rename_noreplace.rs
index d0c07353d0..c945ecadf0 100644
--- a/libs/utils/src/fs_ext/rename_noreplace.rs
+++ b/libs/utils/src/fs_ext/rename_noreplace.rs
@@ -11,9 +11,9 @@ pub fn rename_noreplace<P1: ?Sized + NixPath, P2: ?Sized + NixPath>(
         #[cfg(all(target_os = "linux", target_env = "gnu"))]
         {
             nix::fcntl::renameat2(
-                None,
+                nix::fcntl::AT_FDCWD,
                 src,
-                None,
+                nix::fcntl::AT_FDCWD,
                 dst,
                 nix::fcntl::RenameFlags::RENAME_NOREPLACE,
             )
diff --git a/libs/utils/src/lock_file.rs b/libs/utils/src/lock_file.rs
index 6aeeeca021..b3c8d74d7d 100644
--- a/libs/utils/src/lock_file.rs
+++ b/libs/utils/src/lock_file.rs
@@ -1,6 +1,6 @@
 //! A module to create and read lock files.
 //!
-//! File locking is done using [`fcntl::flock`] exclusive locks.
+//! File locking is done using [`nix::fcntl::Flock`] exclusive locks.
 //! The only consumer of this module is currently
 //! [`pid_file`](crate::pid_file). See the module-level comment
 //! there for potential pitfalls with lock files that are used
@@ -9,26 +9,25 @@
 use std::fs;
 use std::io::{Read, Write};
 use std::ops::Deref;
-use std::os::unix::prelude::AsRawFd;
 
 use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};
 use nix::errno::Errno::EAGAIN;
-use nix::fcntl;
+use nix::fcntl::{Flock, FlockArg};
 
 use crate::crashsafe;
 
-/// A handle to an open and unlocked, but not-yet-written lock file.
+/// A handle to an open and flocked, but not-yet-written lock file.
 /// Returned by [`create_exclusive`].
 #[must_use]
 pub struct UnwrittenLockFile {
     path: Utf8PathBuf,
-    file: fs::File,
+    file: Flock<fs::File>,
 }
 
 /// Returned by [`UnwrittenLockFile::write_content`].
 #[must_use]
-pub struct LockFileGuard(fs::File);
+pub struct LockFileGuard(Flock<fs::File>);
 
 impl Deref for LockFileGuard {
     type Target = fs::File;
@@ -67,17 +66,14 @@ pub fn create_exclusive(lock_file_path: &Utf8Path) -> anyhow::Result<UnwrittenLo
         .open(lock_file_path)
         .context("open lock file")?;
 
-    let res = fcntl::flock(
-        lock_file.as_raw_fd(),
-        fcntl::FlockArg::LockExclusiveNonblock,
-    );
+    let res = Flock::lock(lock_file, FlockArg::LockExclusiveNonblock);
     match res {
-        Ok(()) => Ok(UnwrittenLockFile {
+        Ok(lock_file) => Ok(UnwrittenLockFile {
             path: lock_file_path.to_owned(),
             file: lock_file,
         }),
-        Err(EAGAIN) => anyhow::bail!("file is already locked"),
-        Err(e) => Err(e).context("flock error"),
+        Err((_, EAGAIN)) => anyhow::bail!("file is already locked"),
+        Err((_, e)) => Err(e).context("flock error"),
     }
 }
 
@@ -105,32 +101,37 @@ pub enum LockFileRead {
 /// Check the [`LockFileRead`] variants for details.
 pub fn read_and_hold_lock_file(path: &Utf8Path) -> anyhow::Result<LockFileRead> {
     let res = fs::OpenOptions::new().read(true).open(path);
-    let mut lock_file = match res {
+    let lock_file = match res {
         Ok(f) => f,
         Err(e) => match e.kind() {
             std::io::ErrorKind::NotFound => return Ok(LockFileRead::NotExist),
             _ => return Err(e).context("open lock file"),
         },
     };
-    let res = fcntl::flock(
-        lock_file.as_raw_fd(),
-        fcntl::FlockArg::LockExclusiveNonblock,
-    );
+    let res = Flock::lock(lock_file, FlockArg::LockExclusiveNonblock);
     // We need the content regardless of lock success / failure.
     // But, read it after flock so that, if it succeeded, the content is consistent.
-    let mut content = String::new();
-    lock_file
-        .read_to_string(&mut content)
-        .context("read lock file")?;
     match res {
-        Ok(()) => Ok(LockFileRead::NotHeldByAnyProcess(
-            LockFileGuard(lock_file),
-            content,
-        )),
-        Err(EAGAIN) => Ok(LockFileRead::LockedByOtherProcess {
-            not_locked_file: lock_file,
-            content,
-        }),
-        Err(e) => Err(e).context("flock error"),
+        Ok(mut locked_file) => {
+            let mut content = String::new();
+            locked_file
+                .read_to_string(&mut content)
+                .context("read lock file")?;
+            Ok(LockFileRead::NotHeldByAnyProcess(
+                LockFileGuard(locked_file),
+                content,
+            ))
+        }
+        Err((mut not_locked_file, EAGAIN)) => {
+            let mut content = String::new();
+            not_locked_file
+                .read_to_string(&mut content)
+                .context("read lock file")?;
+            Ok(LockFileRead::LockedByOtherProcess {
+                not_locked_file,
+                content,
+            })
+        }
+        Err((_, e)) => Err(e).context("flock error"),
     }
 }
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index c26b7626ef..dd49c843f3 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -668,7 +668,9 @@ impl From<DownloadError> for UpdateError {
 
 impl From<std::io::Error> for UpdateError {
     fn from(value: std::io::Error) -> Self {
-        if let Some(nix::errno::Errno::ENOSPC) = value.raw_os_error().map(nix::errno::from_i32) {
+        if let Some(nix::errno::Errno::ENOSPC) =
+            value.raw_os_error().map(nix::errno::Errno::from_raw)
+        {
             UpdateError::NoSpace
         } else if value
             .get_ref()
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index c707d35114..45b6e44c54 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -408,7 +408,7 @@ impl OpenFiles {
 /// error types may be elegible for retry.
 pub(crate) fn is_fatal_io_error(e: &std::io::Error) -> bool {
     use nix::errno::Errno::*;
-    match e.raw_os_error().map(nix::errno::from_i32) {
+    match e.raw_os_error().map(nix::errno::Errno::from_raw) {
         Some(EIO) => {
             // Terminate on EIO because we no longer trust the device to store
             // data safely, or to uphold persistence guarantees on fsync.

From 532d9b646e4eaab6e0d94da8a6f890a9c834647c Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Sat, 17 May 2025 00:22:36 +0300
Subject: [PATCH 55/65] Add simple facility for an extendable shared memory
 area (#11929)

You still need to provide a max size up-front, but memory is only
allocated for the portion that is in use.

The module is currently unused, but will be used by the new compute
communicator project, in the neon Postgres extension. See
https://github.com/neondatabase/neon/issues/11729

---------

Co-authored-by: Erik Grinaker <erik@neon.tech>
---
 Cargo.lock                 |  11 +
 Cargo.toml                 |   3 +-
 libs/neon-shmem/Cargo.toml |  13 ++
 libs/neon-shmem/src/lib.rs | 418 +++++++++++++++++++++++++++++++++++++
 workspace_hack/Cargo.toml  |   3 +-
 5 files changed, 446 insertions(+), 2 deletions(-)
 create mode 100644 libs/neon-shmem/Cargo.toml
 create mode 100644 libs/neon-shmem/src/lib.rs

diff --git a/Cargo.lock b/Cargo.lock
index 1edd20105d..8ca65b58ce 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3794,6 +3794,16 @@ version = "0.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"
 
+[[package]]
+name = "neon-shmem"
+version = "0.1.0"
+dependencies = [
+ "nix 0.30.1",
+ "tempfile",
+ "thiserror 1.0.69",
+ "workspace_hack",
+]
+
 [[package]]
 name = "never-say-never"
 version = "6.6.666"
@@ -8482,6 +8492,7 @@ dependencies = [
  "log",
  "memchr",
  "nix 0.26.4",
+ "nix 0.30.1",
  "nom",
  "num",
  "num-bigint",
diff --git a/Cargo.toml b/Cargo.toml
index d6fffe7768..74b281f88f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -23,6 +23,7 @@ members = [
     "libs/postgres_ffi",
     "libs/safekeeper_api",
     "libs/desim",
+    "libs/neon-shmem",
     "libs/utils",
     "libs/consumption_metrics",
     "libs/postgres_backend",
@@ -127,7 +128,7 @@ md5 = "0.7.0"
 measured = { version = "0.0.22", features=["lasso"] }
 measured-process = { version = "0.0.22" }
 memoffset = "0.9"
-nix = { version = "0.30.1", features = ["dir", "fs", "process", "socket", "signal", "poll"] }
+nix = { version = "0.30.1", features = ["dir", "fs", "mman", "process", "socket", "signal", "poll"] }
 # Do not update to >= 7.0.0, at least. The update will have a significant impact
 # on compute startup metrics (start_postgres_ms), >= 25% degradation.
 notify = "6.0.0"
diff --git a/libs/neon-shmem/Cargo.toml b/libs/neon-shmem/Cargo.toml
new file mode 100644
index 0000000000..2a636bec40
--- /dev/null
+++ b/libs/neon-shmem/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "neon-shmem"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+thiserror.workspace = true
+nix.workspace=true
+workspace_hack = { version = "0.1", path = "../../workspace_hack" }
+
+[target.'cfg(target_os = "macos")'.dependencies]
+tempfile = "3.14.0"
diff --git a/libs/neon-shmem/src/lib.rs b/libs/neon-shmem/src/lib.rs
new file mode 100644
index 0000000000..e1b14b1371
--- /dev/null
+++ b/libs/neon-shmem/src/lib.rs
@@ -0,0 +1,418 @@
+//! Shared memory utilities for neon communicator
+
+use std::num::NonZeroUsize;
+use std::os::fd::{AsFd, BorrowedFd, OwnedFd};
+use std::ptr::NonNull;
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+use nix::errno::Errno;
+use nix::sys::mman::MapFlags;
+use nix::sys::mman::ProtFlags;
+use nix::sys::mman::mmap as nix_mmap;
+use nix::sys::mman::munmap as nix_munmap;
+use nix::unistd::ftruncate as nix_ftruncate;
+
+/// ShmemHandle represents a shared memory area that can be shared by processes over fork().
+/// Unlike shared memory allocated by Postgres, this area is resizable, up to 'max_size' that's
+/// specified at creation.
+///
+/// The area is backed by an anonymous file created with memfd_create(). The full address space for
+/// 'max_size' is reserved up-front with mmap(), but whenever you call [`ShmemHandle::set_size`],
+/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
+/// will cause the file to be expanded, but we might use mprotect() etc. to enforce that in the
+/// future.
+pub struct ShmemHandle {
+    /// memfd file descriptor
+    fd: OwnedFd,
+
+    max_size: usize,
+
+    // Pointer to the beginning of the shared memory area. The header is stored there.
+    shared_ptr: NonNull<SharedStruct>,
+
+    // Pointer to the beginning of the user data
+    pub data_ptr: NonNull<u8>,
+}
+
+/// This is stored at the beginning in the shared memory area.
+struct SharedStruct {
+    max_size: usize,
+
+    /// Current size of the backing file. The high-order bit is used for the RESIZE_IN_PROGRESS flag
+    current_size: AtomicUsize,
+}
+
+const RESIZE_IN_PROGRESS: usize = 1 << 63;
+
+const HEADER_SIZE: usize = std::mem::size_of::<SharedStruct>();
+
+/// Error type returned by the ShmemHandle functions.
+#[derive(thiserror::Error, Debug)]
+#[error("{msg}: {errno}")]
+pub struct Error {
+    pub msg: String,
+    pub errno: Errno,
+}
+
+impl Error {
+    fn new(msg: &str, errno: Errno) -> Error {
+        Error {
+            msg: msg.to_string(),
+            errno,
+        }
+    }
+}
+
+impl ShmemHandle {
+    /// Create a new shared memory area. To communicate between processes, the processes need to be
+    /// fork()'d after calling this, so that the ShmemHandle is inherited by all processes.
+    ///
+    /// If the ShmemHandle is dropped, the memory is unmapped from the current process. Other
+    /// processes can continue using it, however.
+    pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result<ShmemHandle, Error> {
+        // create the backing anonymous file.
+        let fd = create_backing_file(name)?;
+
+        Self::new_with_fd(fd, initial_size, max_size)
+    }
+
+    fn new_with_fd(
+        fd: OwnedFd,
+        initial_size: usize,
+        max_size: usize,
+    ) -> Result<ShmemHandle, Error> {
+        // We reserve the high-order bit for the RESIZE_IN_PROGRESS flag, and the actual size
+        // is a little larger than this because of the SharedStruct header. Make the upper limit
+        // somewhat smaller than that, because with anything close to that, you'll run out of
+        // memory anyway.
+        if max_size >= 1 << 48 {
+            panic!("max size {} too large", max_size);
+        }
+        if initial_size > max_size {
+            panic!("initial size {initial_size} larger than max size {max_size}");
+        }
+
+        // The actual initial / max size is the one given by the caller, plus the size of
+        // 'SharedStruct'.
+        let initial_size = HEADER_SIZE + initial_size;
+        let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap();
+
+        // Reserve address space for the full 'max_size' (header included) with mmap
+        //
+        // TODO: Use MAP_HUGETLB if possible
+        let start_ptr = unsafe {
+            nix_mmap(
+                None,
+                max_size,
+                ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
+                MapFlags::MAP_SHARED,
+                &fd,
+                0,
+            )
+        }
+        .map_err(|e| Error::new("mmap failed", e))?;
+
+        // Reserve space for the initial size
+        enlarge_file(fd.as_fd(), initial_size as u64)?;
+
+        // Initialize the header; the file now covers at least HEADER_SIZE bytes of the mapping
+        let shared: NonNull<SharedStruct> = start_ptr.cast();
+        unsafe {
+            shared.write(SharedStruct {
+                max_size: max_size.into(),
+                current_size: AtomicUsize::new(initial_size),
+            })
+        };
+
+        // The user data begins after the header
+        let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) };
+
+        Ok(ShmemHandle {
+            fd,
+            max_size: max_size.into(),
+            shared_ptr: shared,
+            data_ptr,
+        })
+    }
+
+    // return reference to the header
+    fn shared(&self) -> &SharedStruct {
+        unsafe { self.shared_ptr.as_ref() }
+    }
+
+    /// Resize the shared memory area. 'new_size' must not be larger than the 'max_size' specified
+    /// when creating the area (this function panics if it is).
+    ///
+    /// Only one process/thread may resize at a time. A concurrent resize is detected via the
+    /// RESIZE_IN_PROGRESS bit in the shared header, and an Error is returned in that case.
+    pub fn set_size(&self, new_size: usize) -> Result<(), Error> {
+        let new_size = new_size + HEADER_SIZE;
+        let shared = self.shared();
+
+        if new_size > self.max_size {
+            panic!(
+                "new size ({}) is greater than max size ({})",
+                new_size, self.max_size
+            );
+        }
+        assert_eq!(self.max_size, shared.max_size);
+
+        // Lock the area by setting the bit in 'current_size'
+        //
+        // Ordering::Relaxed would probably be sufficient here, as we don't access any other memory
+        // and the posix_fallocate/ftruncate call is surely a synchronization point anyway. But
+        // since this is not performance-critical, better safe than sorry.
+        let mut old_size = shared.current_size.load(Ordering::Acquire);
+        loop {
+            if (old_size & RESIZE_IN_PROGRESS) != 0 {
+                return Err(Error::new(
+                    "concurrent resize detected",
+                    Errno::UnknownErrno,
+                ));
+            }
+            match shared.current_size.compare_exchange(
+                old_size,
+                old_size | RESIZE_IN_PROGRESS,
+                Ordering::Acquire,
+                Ordering::Relaxed,
+            ) {
+                Ok(_) => break,
+                Err(x) => old_size = x,
+            }
+        }
+
+        // Ok, we got the lock. 'old_size' is the size before the resize, without the flag bit.
+        //
+        // NB: If anything goes wrong, we *must* clear the bit!
+        let result = {
+            use std::cmp::Ordering::{Equal, Greater, Less};
+            match new_size.cmp(&old_size) {
+                Less => nix_ftruncate(&self.fd, new_size as i64).map_err(|e| {
+                    Error::new("could not shrink shmem segment, ftruncate failed", e)
+                }),
+                Equal => Ok(()),
+                Greater => enlarge_file(self.fd.as_fd(), new_size as u64),
+            }
+        };
+
+        // Unlock: store the resulting size (new on success, old on failure) with the bit cleared
+        shared.current_size.store(
+            if result.is_ok() { new_size } else { old_size },
+            Ordering::Release,
+        );
+
+        result
+    }
+
+    /// Returns the current user-visible size of the shared memory segment.
+    ///
+    /// NOTE: a concurrent set_size() call can change the size at any time. It is the caller's
+    /// responsibility not to access the area beyond the current size.
+    pub fn current_size(&self) -> usize {
+        let total_current_size =
+            self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS;
+        total_current_size - HEADER_SIZE
+    }
+}
+
+impl Drop for ShmemHandle {
+    fn drop(&mut self) {
+        // SAFETY: The pointer was obtained from mmap() with the given size.
+        // We unmap the entire region.
+        let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) };
+        // The fd is dropped automatically by OwnedFd.
+    }
+}
+
+/// Create a "backing file" for the shared memory area. On Linux, use memfd_create(), to create an
+/// anonymous in-memory file. On macos, fall back to a regular file. That's good enough for
+/// development and testing, but in production we want the file to stay in memory.
+///
+/// disable 'unused_variables' warnings, because in the macos path, 'name' is unused.
+#[allow(unused_variables)]
+fn create_backing_file(name: &str) -> Result<OwnedFd, Error> {
+    #[cfg(not(target_os = "macos"))]
+    {
+        nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
+            .map_err(|e| Error::new("memfd_create failed", e))
+    }
+    #[cfg(target_os = "macos")]
+    {
+        let file = tempfile::tempfile().map_err(|e| {
+            Error::new(
+                "could not create temporary file to back shmem area",
+                nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
+            )
+        })?;
+        Ok(OwnedFd::from(file))
+    }
+}
+
+fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
+    // Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
+    // we don't get a segfault later when trying to actually use it.
+    #[cfg(not(target_os = "macos"))]
+    {
+        nix::fcntl::posix_fallocate(fd, 0, size as i64).map_err(|e| {
+            Error::new(
+                "could not grow shmem segment, posix_fallocate failed: {e}",
+                e,
+            )
+        })
+    }
+    // As a fallback on macos, which doesn't have posix_fallocate, use plain 'ftruncate'
+    #[cfg(target_os = "macos")]
+    {
+        nix::unistd::ftruncate(fd, size as i64)
+            .map_err(|e| Error::new("could not grow shmem segment, ftruncate failed: {e}", e))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use nix::unistd::ForkResult;
+    use std::ops::Range;
+
+    /// check that all bytes in given range have the expected value.
+    fn assert_range(ptr: *const u8, expected: u8, range: Range<usize>) {
+        for i in range {
+            let b = unsafe { *(ptr.add(i)) };
+            assert_eq!(expected, b, "unexpected byte at offset {}", i);
+        }
+    }
+
+    /// Write 'b' to all bytes in the given range
+    fn write_range(ptr: *mut u8, b: u8, range: Range<usize>) {
+        unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) };
+    }
+
+    // simple single-process test of growing and shrinking
+    #[test]
+    fn test_shmem_resize() -> Result<(), Error> {
+        let max_size = 1024 * 1024;
+        let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?;
+
+        assert_eq!(init_struct.current_size(), 0);
+
+        // Initial grow
+        let size1 = 10000;
+        init_struct.set_size(size1).unwrap();
+        assert_eq!(init_struct.current_size(), size1);
+
+        // Write some data
+        let data_ptr = init_struct.data_ptr.as_ptr();
+        write_range(data_ptr, 0xAA, 0..size1);
+        assert_range(data_ptr, 0xAA, 0..size1);
+
+        // Shrink
+        let size2 = 5000;
+        init_struct.set_size(size2).unwrap();
+        assert_eq!(init_struct.current_size(), size2);
+
+        // Grow again
+        let size3 = 20000;
+        init_struct.set_size(size3).unwrap();
+        assert_eq!(init_struct.current_size(), size3);
+
+        // Try to read it. The area that was shrunk and grown again should read as all zeros now
+        assert_range(data_ptr, 0xAA, 0..5000);
+        assert_range(data_ptr, 0, 5000..size1);
+
+        // Try to grow beyond max_size
+        //let size4 = max_size + 1;
+        //assert!(init_struct.set_size(size4).is_err());
+
+        // Dropping init_struct should unmap the memory
+        drop(init_struct);
+
+        Ok(())
+    }
+
+    /// This is used in tests to coordinate between test processes. It's like std::sync::Barrier,
+    /// but is stored in the shared memory area and works across processes. It's implemented by
+    /// polling, because e.g. standard rust mutexes are not guaranteed to work across processes.
+    struct SimpleBarrier {
+        num_procs: usize,
+        count: AtomicUsize,
+    }
+
+    impl SimpleBarrier {
+        unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) {
+            unsafe {
+                *ptr = SimpleBarrier {
+                    num_procs,
+                    count: AtomicUsize::new(0),
+                }
+            }
+        }
+
+        pub fn wait(&self) {
+            let old = self.count.fetch_add(1, Ordering::Relaxed);
+
+            let generation = old / self.num_procs;
+
+            let mut current = old + 1;
+            while current < (generation + 1) * self.num_procs {
+                std::thread::sleep(std::time::Duration::from_millis(10));
+                current = self.count.load(Ordering::Relaxed);
+            }
+        }
+    }
+
+    #[test]
+    fn test_multi_process() {
+        // Initialize
+        let max_size = 1_000_000_000_000;
+        let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap();
+        let ptr = init_struct.data_ptr.as_ptr();
+
+        // Store the SimpleBarrier in the first 1k of the area.
+        init_struct.set_size(10000).unwrap();
+        let barrier_ptr: *mut SimpleBarrier = unsafe {
+            ptr.add(ptr.align_offset(std::mem::align_of::<SimpleBarrier>()))
+                .cast()
+        };
+        unsafe { SimpleBarrier::init(barrier_ptr, 2) };
+        let barrier = unsafe { barrier_ptr.as_ref().unwrap() };
+
+        // Fork another test process. The code after this runs in both processes concurrently.
+        let fork_result = unsafe { nix::unistd::fork().unwrap() };
+
+        // In the parent, fill bytes between 1000..2000. In the child, between 2000..3000
+        if fork_result.is_parent() {
+            write_range(ptr, 0xAA, 1000..2000);
+        } else {
+            write_range(ptr, 0xBB, 2000..3000);
+        }
+        barrier.wait();
+        // Verify the contents. (in both processes)
+        assert_range(ptr, 0xAA, 1000..2000);
+        assert_range(ptr, 0xBB, 2000..3000);
+
+        // Grow, from the child this time
+        let size = 10_000_000;
+        if !fork_result.is_parent() {
+            init_struct.set_size(size).unwrap();
+        }
+        barrier.wait();
+
+        // make some writes at the end
+        if fork_result.is_parent() {
+            write_range(ptr, 0xAA, (size - 10)..size);
+        } else {
+            write_range(ptr, 0xBB, (size - 20)..(size - 10));
+        }
+        barrier.wait();
+
+        // Verify the contents. (This runs in both processes)
+        assert_range(ptr, 0, (size - 1000)..(size - 20));
+        assert_range(ptr, 0xBB, (size - 20)..(size - 10));
+        assert_range(ptr, 0xAA, (size - 10)..size);
+
+        if let ForkResult::Parent { child } = fork_result {
+            nix::sys::wait::waitpid(child, None).unwrap();
+        }
+    }
+}
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index fecf62f756..69d44b82ea 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -60,7 +60,8 @@ lazy_static = { version = "1", default-features = false, features = ["spin_no_st
 libc = { version = "0.2", features = ["extra_traits", "use_std"] }
 log = { version = "0.4", default-features = false, features = ["std"] }
 memchr = { version = "2" }
-nix = { version = "0.26" }
+nix-2f80eeee3b1b6c7e = { package = "nix", version = "0.26" }
+nix-fa1f6196edfd7249 = { package = "nix", version = "0.30", features = ["dir", "ioctl", "mman", "poll", "signal", "socket"] }
 nom = { version = "7" }
 num = { version = "0.4" }
 num-bigint = { version = "0.4" }

From deed46015dd5eaa2dcc48f5f17f3e923a13e6711 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Sat, 17 May 2025 08:34:54 +0200
Subject: [PATCH 56/65] CI(test-images): increase timeout from 20m to 60m
 (#11955)

## Problem

For some reason (not yet known), the 20m timeout is not enough for the
`test-images` job on arm runners.
Ref:
https://github.com/neondatabase/neon/actions/runs/15075321681/job/42387530399?pr=11953

## Summary of changes
- Increase the timeout from 20m to 1h
---
 .github/workflows/build_and_test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 6b19f6ef01..a887db2ab1 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -963,7 +963,7 @@ jobs:
           fi
 
       - name: Verify docker-compose example and test extensions
-        timeout-minutes: 20
+        timeout-minutes: 60
         env:
           TAG: >-
             ${{

From 8e05639dbf6def383da7b138e28cf930ac506647 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Sat, 17 May 2025 22:06:59 +0300
Subject: [PATCH 57/65] Invalidate LFC after unlogged build (#11951)

## Problem


See https://neondb.slack.com/archives/C04DGM6SMTM/p1747391617951239

LFC is not always properly updated during unlogged build so it can
contain stale content.

## Summary of changes

Invalidate LFC content at the end of unlogged build

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 pgxn/neon/file_cache.c     | 38 ++++++++++++++++++++++++++++++++++++++
 pgxn/neon/file_cache.h     |  1 +
 pgxn/neon/pagestore_smgr.c | 19 ++-----------------
 3 files changed, 41 insertions(+), 17 deletions(-)

diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c
index ecc55bb540..176fd9643f 100644
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -936,6 +936,44 @@ lfc_prewarm_main(Datum main_arg)
 	lfc_ctl->prewarm_workers[worker_id].completed = GetCurrentTimestamp();
 }
 
+void
+lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks)
+{
+	BufferTag	tag;
+	FileCacheEntry *entry;
+	uint32		hash;
+
+	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
+		return;
+
+	CopyNRelFileInfoToBufTag(tag, rinfo);
+	tag.forkNum = forkNum;
+
+	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
+
+	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+	if (LFC_ENABLED())
+	{
+		for (BlockNumber blkno = 0; blkno < nblocks; blkno += lfc_blocks_per_chunk)
+		{
+			tag.blockNum = blkno;
+			hash = get_hash_value(lfc_hash, &tag);
+			entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
+			if (entry != NULL)
+			{
+				for (int i = 0; i < lfc_blocks_per_chunk; i++)
+				{
+					if (GET_STATE(entry, i) == AVAILABLE)
+					{
+						lfc_ctl->used_pages -= 1;
+						SET_STATE(entry, i, UNAVAILABLE);
+					}
+				}
+			}
+		}
+	}
+	LWLockRelease(lfc_lock);
+}
 
 /*
  * Check if page is present in the cache.
diff --git a/pgxn/neon/file_cache.h b/pgxn/neon/file_cache.h
index c7b6b09f72..d5ac55d5ba 100644
--- a/pgxn/neon/file_cache.h
+++ b/pgxn/neon/file_cache.h
@@ -28,6 +28,7 @@ typedef struct FileCacheState
 extern bool lfc_store_prefetch_result;
 
 /* functions for local file cache */
+extern void lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks);
 extern void lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum,
 					   BlockNumber blkno, const void *const *buffers,
 					   BlockNumber nblocks);
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 31e47db7d7..5558a903e2 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -919,9 +919,6 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 		case RELPERSISTENCE_TEMP:
 		case RELPERSISTENCE_UNLOGGED:
 			mdextend(reln, forkNum, blkno, buffer, skipFsync);
-			/* Update LFC in case of unlogged index build */
-			if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2)
-				lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer);
 			return;
 
 		default:
@@ -1010,14 +1007,6 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
 		case RELPERSISTENCE_TEMP:
 		case RELPERSISTENCE_UNLOGGED:
 			mdzeroextend(reln, forkNum, blocknum, nblocks, skipFsync);
-			/* Update LFC in case of unlogged index build */
-			if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2)
-			{
-				for (int i = 0; i < nblocks; i++)
-				{
-					lfc_write(InfoFromSMgrRel(reln), forkNum, blocknum + i, buffer.data);
-				}
-			}
 			return;
 
 		default:
@@ -1617,9 +1606,6 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
 			#else
 			mdwrite(reln, forknum, blocknum, buffer, skipFsync);
 			#endif
-			/* Update LFC in case of unlogged index build */
-			if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2)
-				lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);
 			return;
 		default:
 			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
@@ -1685,9 +1671,6 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 		case RELPERSISTENCE_TEMP:
 		case RELPERSISTENCE_UNLOGGED:
 			mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync);
-			/* Update LFC in case of unlogged index build */
-			if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2)
-				lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);
 			return;
 		default:
 			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
@@ -2083,6 +2066,8 @@ neon_end_unlogged_build(SMgrRelation reln)
 				 forknum);
 
 			forget_cached_relsize(InfoFromNInfoB(rinfob), forknum);
+			lfc_invalidate(InfoFromNInfoB(rinfob), forknum, nblocks);
+
 			mdclose(reln, forknum);
 #ifndef DEBUG_COMPARE_LOCAL
 			/* use isRedo == true, so that we drop it immediately */

From 81c6a5a796d1a4278b320d241c2dcab95982a7c6 Mon Sep 17 00:00:00 2001
From: Emmanuel Ferdman <emmanuelferdman@gmail.com>
Date: Sun, 18 May 2025 00:12:01 +0300
Subject: [PATCH 58/65] Migrate to correct logger interface (#11956)

## Problem
Currently the `logger` library throws annoying deprecation warnings:
```python
DeprecationWarning: The 'warn' method is deprecated, use 'warning' instead
```

## Summary of changes
This small PR resolves the annoying deprecation warnings by migrating to
`.warning` as suggested.

Signed-off-by: Emmanuel Ferdman <emmanuelferdman@gmail.com>
---
 test_runner/fixtures/neon_cli.py                 | 2 +-
 test_runner/regress/test_pageserver_secondary.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py
index 4eaa4b7d99..bb07e2b6d1 100644
--- a/test_runner/fixtures/neon_cli.py
+++ b/test_runner/fixtures/neon_cli.py
@@ -103,7 +103,7 @@ class AbstractNeonCli:
             else:
                 stdout = ""
 
-            log.warn(f"CLI timeout: stderr={stderr}, stdout={stdout}")
+            log.warning(f"CLI timeout: stderr={stderr}, stdout={stdout}")
             raise
 
         indent = "  "
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index 3aa0c63979..f2523ec9b5 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -510,7 +510,7 @@ def list_elegible_layers(
         except KeyError:
             # Unexpected: tests should call this when pageservers are in a quiet state such that the layer map
             # matches what's on disk.
-            log.warn(f"Lookup {layer_file_name} from {list(visible_map.keys())}")
+            log.warning(f"Lookup {layer_file_name} from {list(visible_map.keys())}")
             raise
 
     return list(c for c in candidates if is_visible(c))
@@ -636,7 +636,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
     except:
         # On assertion failures, log some details to help with debugging
         heatmap = env.pageserver_remote_storage.heatmap_content(tenant_id)
-        log.warn(f"heatmap contents: {json.dumps(heatmap, indent=2)}")
+        log.warning(f"heatmap contents: {json.dumps(heatmap, indent=2)}")
         raise
 
     # Scrub the remote storage

From 4f0a9fc5698dfcc1a59ce6d32ca2b1e8ebb5de77 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Sun, 18 May 2025 00:06:32 +0200
Subject: [PATCH 59/65] chore(deps): bump flask-cors from 5.0.0 to 6.0.0 in the
 pip group across 1 directory (#11960)

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 poetry.lock | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 1a772d3415..e6440761be 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand.
+# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand.
 
 [[package]]
 name = "aiohappyeyeballs"
@@ -1145,18 +1145,19 @@ dotenv = ["python-dotenv"]
 
 [[package]]
 name = "flask-cors"
-version = "5.0.0"
-description = "A Flask extension adding a decorator for CORS support"
+version = "6.0.0"
+description = "A Flask extension simplifying CORS support"
 optional = false
-python-versions = "*"
+python-versions = "<4.0,>=3.9"
 groups = ["main"]
 files = [
-    {file = "Flask_Cors-5.0.0-py2.py3-none-any.whl", hash = "sha256:b9e307d082a9261c100d8fb0ba909eec6a228ed1b60a8315fd85f783d61910bc"},
-    {file = "flask_cors-5.0.0.tar.gz", hash = "sha256:5aadb4b950c4e93745034594d9f3ea6591f734bb3662e16e255ffbf5e89c88ef"},
+    {file = "flask_cors-6.0.0-py3-none-any.whl", hash = "sha256:6332073356452343a8ccddbfec7befdc3fdd040141fe776ec9b94c262f058657"},
+    {file = "flask_cors-6.0.0.tar.gz", hash = "sha256:4592c1570246bf7beee96b74bc0adbbfcb1b0318f6ba05c412e8909eceec3393"},
 ]
 
 [package.dependencies]
-Flask = ">=0.9"
+flask = ">=0.9"
+Werkzeug = ">=0.7"
 
 [[package]]
 name = "frozenlist"

From e9631296784799269f079af6a3c5b2fe65e3c057 Mon Sep 17 00:00:00 2001
From: Trung Dinh <dinhanhtrung@gmail.com>
Date: Sat, 17 May 2025 15:30:29 -0700
Subject: [PATCH 60/65] pagesteam_handle_batched_message ->
 pagestream_handle_batched_message (#11916)

## Problem
Found a typo in code.

## Summary of changes
Rename `pagesteam_handle_batched_message` to `pagestream_handle_batched_message`.

Co-authored-by: Trung Dinh <tdinh@roblox.com>
Co-authored-by: Erik Grinaker <erik@neon.tech>
---
 pageserver/src/page_service.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index bca1cb5b49..101e312ec3 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -1278,7 +1278,7 @@ impl PageServerHandler {
     }
 
     #[instrument(level = tracing::Level::DEBUG, skip_all)]
-    async fn pagesteam_handle_batched_message<IO>(
+    async fn pagestream_handle_batched_message<IO>(
         &mut self,
         pgb_writer: &mut PostgresBackend<IO>,
         batch: BatchedFeMessage,
@@ -1733,7 +1733,7 @@ impl PageServerHandler {
             };
 
             let result = self
-                .pagesteam_handle_batched_message(
+                .pagestream_handle_batched_message(
                     pgb_writer,
                     msg,
                     io_concurrency.clone(),
@@ -1909,7 +1909,7 @@ impl PageServerHandler {
                             return Err(e);
                         }
                     };
-                    self.pagesteam_handle_batched_message(
+                    self.pagestream_handle_batched_message(
                         pgb_writer,
                         batch,
                         io_concurrency.clone(),

From 81c557d87e2381d653deb0b0b9decbbdfc76f30f Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Sun, 18 May 2025 08:02:47 +0300
Subject: [PATCH 61/65] Unlogged build get smgr (#11954)

## Problem

See https://github.com/neondatabase/neon/issues/11910
and https://neondb.slack.com/archives/C04DGM6SMTM/p1747314649059129

## Summary of changes

Do not change persistence in `start_unlogged_build`

Postgres PRs:
https://github.com/neondatabase/postgres/pull/642
https://github.com/neondatabase/postgres/pull/641
https://github.com/neondatabase/postgres/pull/640
https://github.com/neondatabase/postgres/pull/639

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 compute/patches/rum.patch        |  6 +--
 pgxn/neon/neon_pgversioncompat.h |  8 +++-
 pgxn/neon/pagestore_smgr.c       | 78 ++++++++++++++++++++++++--------
 vendor/postgres-v14              |  2 +-
 vendor/postgres-v15              |  2 +-
 vendor/postgres-v16              |  2 +-
 vendor/postgres-v17              |  2 +-
 vendor/revisions.json            |  8 ++--
 8 files changed, 76 insertions(+), 32 deletions(-)

diff --git a/compute/patches/rum.patch b/compute/patches/rum.patch
index b45afe2874..aed1badc13 100644
--- a/compute/patches/rum.patch
+++ b/compute/patches/rum.patch
@@ -7,7 +7,7 @@ index 255e616..1c6edb7 100644
  			 RelationGetRelationName(index));
  
 +#ifdef NEON_SMGR
-+	smgr_start_unlogged_build(index->rd_smgr);
++	smgr_start_unlogged_build(RelationGetSmgr(index));
 +#endif
 +
  	initRumState(&buildstate.rumstate, index);
@@ -18,7 +18,7 @@ index 255e616..1c6edb7 100644
  	rumUpdateStats(index, &buildstate.buildStats, buildstate.rumstate.isBuild);
  
 +#ifdef NEON_SMGR
-+	smgr_finish_unlogged_build_phase_1(index->rd_smgr);
++	smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index));
 +#endif
 +
  	/*
@@ -29,7 +29,7 @@ index 255e616..1c6edb7 100644
  	}
  
 +#ifdef NEON_SMGR
-+	smgr_end_unlogged_build(index->rd_smgr);
++	smgr_end_unlogged_build(RelationGetSmgr(index));
 +#endif
 +
  	/*
diff --git a/pgxn/neon/neon_pgversioncompat.h b/pgxn/neon/neon_pgversioncompat.h
index b3ed0c04e8..bf91a02b45 100644
--- a/pgxn/neon/neon_pgversioncompat.h
+++ b/pgxn/neon/neon_pgversioncompat.h
@@ -86,7 +86,7 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode,
 
 #define InvalidRelFileNumber InvalidOid
 
-#define SMgrRelGetRelInfo(reln) \
+#define SMgrRelGetRelInfo(reln)				\
 	(reln->smgr_rnode.node)
 
 #define DropRelationAllLocalBuffers DropRelFileNodeAllLocalBuffers
@@ -148,6 +148,12 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode,
 #define DropRelationAllLocalBuffers DropRelationAllLocalBuffers
 #endif
 
+#define NRelFileInfoInvalidate(rinfo) do { \
+		NInfoGetSpcOid(rinfo) = InvalidOid; \
+		NInfoGetDbOid(rinfo) = InvalidOid; \
+		NInfoGetRelNumber(rinfo) = InvalidRelFileNumber; \
+	} while (0)
+
 #if PG_MAJORVERSION_NUM < 17
 #define ProcNumber BackendId
 #define INVALID_PROC_NUMBER InvalidBackendId
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 5558a903e2..43fd715bbb 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -108,7 +108,7 @@ typedef enum
 	UNLOGGED_BUILD_NOT_PERMANENT
 } UnloggedBuildPhase;
 
-static SMgrRelation unlogged_build_rel = NULL;
+static NRelFileInfo unlogged_build_rel_info;
 static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
 
 static bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
@@ -912,8 +912,14 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 	{
 		case 0:
 			neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence");
+			break;
 
 		case RELPERSISTENCE_PERMANENT:
+			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
+			{
+				mdextend(reln, forkNum, blkno, buffer, skipFsync);
+				return;
+			}
 			break;
 
 		case RELPERSISTENCE_TEMP:
@@ -1000,8 +1006,14 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
 	{
 		case 0:
 			neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence");
+			break;
 
 		case RELPERSISTENCE_PERMANENT:
+			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
+			{
+				mdzeroextend(reln, forkNum, blocknum, nblocks, skipFsync);
+				return;
+			}
 			break;
 
 		case RELPERSISTENCE_TEMP:
@@ -1376,8 +1388,14 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 	{
 		case 0:
 			neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence");
+			break;
 
 		case RELPERSISTENCE_PERMANENT:
+			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
+			{
+				mdread(reln, forkNum, blkno, buffer);
+				return;
+			}
 			break;
 
 		case RELPERSISTENCE_TEMP:
@@ -1463,8 +1481,14 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	{
 		case 0:
 			neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence");
+			break;
 
 		case RELPERSISTENCE_PERMANENT:
+			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
+			{
+				mdreadv(reln, forknum, blocknum, buffers, nblocks);
+				return;
+			}
 			break;
 
 		case RELPERSISTENCE_TEMP:
@@ -1597,6 +1621,15 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
 			break;
 
 		case RELPERSISTENCE_PERMANENT:
+			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
+			{
+#if PG_MAJORVERSION_NUM >= 17
+				mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync);
+#else
+				mdwrite(reln, forknum, blocknum, buffer, skipFsync);
+#endif
+				return;
+			}
 			break;
 
 		case RELPERSISTENCE_TEMP:
@@ -1666,6 +1699,11 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 			break;
 
 		case RELPERSISTENCE_PERMANENT:
+			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
+			{
+				mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync);
+				return;
+			}
 			break;
 
 		case RELPERSISTENCE_TEMP:
@@ -1706,6 +1744,10 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 			break;
 
 		case RELPERSISTENCE_PERMANENT:
+			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
+			{
+				return mdnblocks(reln, forknum);
+			}
 			break;
 
 		case RELPERSISTENCE_TEMP:
@@ -1775,6 +1817,11 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, Blo
 			break;
 
 		case RELPERSISTENCE_PERMANENT:
+			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
+			{
+				mdtruncate(reln, forknum, old_blocks, nblocks);
+				return;
+			}
 			break;
 
 		case RELPERSISTENCE_TEMP:
@@ -1913,7 +1960,6 @@ neon_start_unlogged_build(SMgrRelation reln)
 	 */
 	if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
 		neon_log(ERROR, "unlogged relation build is already in progress");
-	Assert(unlogged_build_rel == NULL);
 
 	ereport(SmgrTrace,
 			(errmsg(NEON_TAG "starting unlogged build of relation %u/%u/%u",
@@ -1930,7 +1976,7 @@ neon_start_unlogged_build(SMgrRelation reln)
 
 		case RELPERSISTENCE_TEMP:
 		case RELPERSISTENCE_UNLOGGED:
-			unlogged_build_rel = reln;
+			unlogged_build_rel_info = InfoFromSMgrRel(reln);
 			unlogged_build_phase = UNLOGGED_BUILD_NOT_PERMANENT;
 #ifdef DEBUG_COMPARE_LOCAL
 			if (!IsParallelWorker())
@@ -1951,12 +1997,9 @@ neon_start_unlogged_build(SMgrRelation reln)
 		neon_log(ERROR, "cannot perform unlogged index build, index is not empty ");
 #endif
 
-	unlogged_build_rel = reln;
+	unlogged_build_rel_info = InfoFromSMgrRel(reln);
 	unlogged_build_phase = UNLOGGED_BUILD_PHASE_1;
 
-	/* Make the relation look like it's unlogged */
-	reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED;
-
 	/*
 	 * Create the local file. In a parallel build, the leader is expected to
 	 * call this first and do it.
@@ -1983,17 +2026,16 @@ neon_start_unlogged_build(SMgrRelation reln)
 static void
 neon_finish_unlogged_build_phase_1(SMgrRelation reln)
 {
-	Assert(unlogged_build_rel == reln);
+	Assert(RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)));
 
 	ereport(SmgrTrace,
 			(errmsg(NEON_TAG "finishing phase 1 of unlogged build of relation %u/%u/%u",
-					RelFileInfoFmt(InfoFromSMgrRel(reln)))));
+					RelFileInfoFmt((unlogged_build_rel_info)))));
 
 	if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT)
 		return;
 
 	Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1);
-	Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);
 
 	/*
 	 * In a parallel build, (only) the leader process performs the 2nd
@@ -2001,7 +2043,7 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln)
 	 */
 	if (IsParallelWorker())
 	{
-		unlogged_build_rel = NULL;
+		NRelFileInfoInvalidate(unlogged_build_rel_info);
 		unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
 	}
 	else
@@ -2022,11 +2064,11 @@ neon_end_unlogged_build(SMgrRelation reln)
 {
 	NRelFileInfoBackend rinfob = InfoBFromSMgrRel(reln);
 
-	Assert(unlogged_build_rel == reln);
+	Assert(RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)));
 
 	ereport(SmgrTrace,
 			(errmsg(NEON_TAG "ending unlogged build of relation %u/%u/%u",
-					RelFileInfoFmt(InfoFromNInfoB(rinfob)))));
+					RelFileInfoFmt(unlogged_build_rel_info))));
 
 	if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT)
 	{
@@ -2034,7 +2076,6 @@ neon_end_unlogged_build(SMgrRelation reln)
 		BlockNumber nblocks;
 
 		Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_2);
-		Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);
 
 		/*
 		 * Update the last-written LSN cache.
@@ -2055,9 +2096,6 @@ neon_end_unlogged_build(SMgrRelation reln)
 								InfoFromNInfoB(rinfob),
 								MAIN_FORKNUM);
 
-		/* Make the relation look permanent again */
-		reln->smgr_relpersistence = RELPERSISTENCE_PERMANENT;
-
 		/* Remove local copy */
 		for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
 		{
@@ -2078,7 +2116,7 @@ neon_end_unlogged_build(SMgrRelation reln)
 		mdunlink(rinfob, INIT_FORKNUM, true);
 #endif
 	}
-	unlogged_build_rel = NULL;
+	NRelFileInfoInvalidate(unlogged_build_rel_info);
 	unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
 }
 
@@ -2151,7 +2189,7 @@ AtEOXact_neon(XactEvent event, void *arg)
 			 * Forget about any build we might have had in progress. The local
 			 * file will be unlinked by smgrDoPendingDeletes()
 			 */
-			unlogged_build_rel = NULL;
+			NRelFileInfoInvalidate(unlogged_build_rel_info);
 			unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
 			break;
 
@@ -2163,7 +2201,7 @@ AtEOXact_neon(XactEvent event, void *arg)
 		case XACT_EVENT_PRE_PREPARE:
 			if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
 			{
-				unlogged_build_rel = NULL;
+				NRelFileInfoInvalidate(unlogged_build_rel_info);
 				unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
 				ereport(ERROR,
 						(errcode(ERRCODE_INTERNAL_ERROR),
diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index 4cca6f8083..55c0d45abe 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit 4cca6f8083483dda9e12eae292cf788d45bd561f
+Subproject commit 55c0d45abe6467c02084c2192bca117eda6ce1e7
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index daa81cffcf..de7640f55d 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit daa81cffcf063c54b29a9aabdb6604625f675ad0
+Subproject commit de7640f55da07512834d5cc40c4b3fb376b5f04f
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index 15710a76b7..0bf96bd6d7 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit 15710a76b7d07912110fcbbaf0c8ad6d7e5a9fbc
+Subproject commit 0bf96bd6d70301a0b43b0b3457bb3cf8fb43c198
diff --git a/vendor/postgres-v17 b/vendor/postgres-v17
index e5374b7299..8be779fd3a 160000
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
@@ -1 +1 @@
-Subproject commit e5374b72997b0afc8374137674e873f7a558120a
+Subproject commit 8be779fd3ab9e87206da96a7e4842ef1abf04f44
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 0fc2d3996d..3e999760f4 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,18 +1,18 @@
 {
   "v17": [
     "17.5",
-    "e5374b72997b0afc8374137674e873f7a558120a"
+    "8be779fd3ab9e87206da96a7e4842ef1abf04f44"
   ],
   "v16": [
     "16.9",
-    "15710a76b7d07912110fcbbaf0c8ad6d7e5a9fbc"
+    "0bf96bd6d70301a0b43b0b3457bb3cf8fb43c198"
   ],
   "v15": [
     "15.13",
-    "daa81cffcf063c54b29a9aabdb6604625f675ad0"
+    "de7640f55da07512834d5cc40c4b3fb376b5f04f"
   ],
   "v14": [
     "14.18",
-    "4cca6f8083483dda9e12eae292cf788d45bd561f"
+    "55c0d45abe6467c02084c2192bca117eda6ce1e7"
   ]
 }

From cdb6479c8abd87df7c0c535ced25aeef5991a983 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Mon, 19 May 2025 11:03:06 +0200
Subject: [PATCH 62/65] pageserver: add gRPC page service schema (#11815)

## Problem

For the [communicator
project](https://github.com/neondatabase/company_projects/issues/352),
we want to move to gRPC for the page service protocol.

Touches #11728.

## Summary of changes

This patch adds an experimental gRPC Protobuf schema for the page
service. It is equivalent to the current page service, but with several
improvements, e.g.:

* Connection multiplexing.
* Reduced head-of-line blocking.
* Client-side batching.
* Explicit tenant shard routing.
* GetPage request classification (normal vs. prefetch).
* Explicit rate limiting ("slow down" response status).

The API is exposed as a new `pageserver/page_api` package. This is
separate from the `pageserver_api` package to reduce the dependency
footprint for the communicator. The longer-term plan is to also split
out e.g. the WAL ingestion service to a separate gRPC package, e.g.
`pageserver/wal_api`.

Subsequent PRs will: add Rust domain types for the Protobuf types,
expose a gRPC server, and implement the page service.

Preliminary prototype benchmarks of this gRPC API are within 10% of
baseline libpq performance. We'll do further benchmarking and
optimization as the implementation lands in `main` and is deployed to
staging.
---
 Cargo.lock                                   |  10 +
 Cargo.toml                                   |   2 +
 pageserver/page_api/Cargo.toml               |  13 ++
 pageserver/page_api/build.rs                 |   7 +
 pageserver/page_api/proto/page_service.proto | 220 +++++++++++++++++++
 pageserver/page_api/src/lib.rs               |  14 ++
 6 files changed, 266 insertions(+)
 create mode 100644 pageserver/page_api/Cargo.toml
 create mode 100644 pageserver/page_api/build.rs
 create mode 100644 pageserver/page_api/proto/page_service.proto
 create mode 100644 pageserver/page_api/src/lib.rs

diff --git a/Cargo.lock b/Cargo.lock
index 8ca65b58ce..d919537818 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4434,6 +4434,16 @@ dependencies = [
  "workspace_hack",
 ]
 
+[[package]]
+name = "pageserver_page_api"
+version = "0.1.0"
+dependencies = [
+ "prost 0.13.3",
+ "tonic",
+ "tonic-build",
+ "workspace_hack",
+]
+
 [[package]]
 name = "papaya"
 version = "0.2.1"
diff --git a/Cargo.toml b/Cargo.toml
index 74b281f88f..a280c446b9 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,6 +9,7 @@ members = [
     "pageserver/ctl",
     "pageserver/client",
     "pageserver/pagebench",
+    "pageserver/page_api",
     "proxy",
     "safekeeper",
     "safekeeper/client",
@@ -252,6 +253,7 @@ pageserver = { path = "./pageserver" }
 pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
 pageserver_client = { path = "./pageserver/client" }
 pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
+pageserver_page_api = { path = "./pageserver/page_api" }
 postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
 postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
 postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
diff --git a/pageserver/page_api/Cargo.toml b/pageserver/page_api/Cargo.toml
new file mode 100644
index 0000000000..c237949226
--- /dev/null
+++ b/pageserver/page_api/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "pageserver_page_api"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+prost.workspace = true
+tonic.workspace = true
+workspace_hack.workspace = true
+
+[build-dependencies]
+tonic-build.workspace = true
diff --git a/pageserver/page_api/build.rs b/pageserver/page_api/build.rs
new file mode 100644
index 0000000000..ce3c49ed82
--- /dev/null
+++ b/pageserver/page_api/build.rs
@@ -0,0 +1,7 @@
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    // Generates Rust code from .proto Protobuf schemas.
+    tonic_build::configure()
+        .bytes(["."])
+        .compile_protos(&["proto/page_service.proto"], &["proto"])
+        .map_err(|err| err.into())
+}
diff --git a/pageserver/page_api/proto/page_service.proto b/pageserver/page_api/proto/page_service.proto
new file mode 100644
index 0000000000..12e4d2f9db
--- /dev/null
+++ b/pageserver/page_api/proto/page_service.proto
@@ -0,0 +1,220 @@
+// Page service, presented by pageservers for computes.
+//
+// This is the compute read path. It primarily serves page versions at given
+// LSNs, but also base backups, SLRU segments, and relation metadata.
+//
+// EXPERIMENTAL: this is still under development and subject to change.
+//
+// Request metadata headers:
+// - authorization: JWT token ("Bearer <token>"), if auth is enabled
+// - neon-tenant-id: tenant ID ("7c4a1f9e3bd6470c8f3e21a65bd2e980")
+// - neon-shard-id: shard ID, as <number><count> in hex ("0b10" = shard 11 of 16, 0-based)
+// - neon-timeline-id: timeline ID ("f08c4e9a2d5f76b1e3a7c2d8910f4b3e")
+//
+// TODO: consider adding neon-compute-mode ("primary", "static", "replica").
+// However, this will require reconnecting when changing modes.
+//
+// TODO: write implementation guidance on
+// - Health checks
+// - Tracing, OpenTelemetry
+// - Compression
+
+syntax = "proto3";
+package page_service;
+
+service PageService {
+  // Returns whether a relation exists.
+  rpc CheckRelExists(CheckRelExistsRequest) returns (CheckRelExistsResponse);
+
+  // Fetches a base backup.
+  rpc GetBaseBackup (GetBaseBackupRequest) returns (stream GetBaseBackupResponseChunk);
+
+  // Returns the total size of a database, as # of bytes.
+  rpc GetDbSize (GetDbSizeRequest) returns (GetDbSizeResponse);
+
+  // Fetches pages.
+  //
+  // This is implemented as a bidirectional streaming RPC for performance. Unary
+  // requests incur costs for e.g. HTTP/2 stream setup, header parsing,
+  // authentication, and so on -- with streaming, we only pay these costs during
+  // the initial stream setup. This ~doubles throughput in benchmarks. Other
+  // RPCs use regular unary requests, since they are not as frequent and
+  // performance-critical, and this simplifies implementation.
+  //
+  // NB: a status response (e.g. errors) will terminate the stream. The stream
+  // may be shared by e.g. multiple Postgres backends, so we should avoid this.
+  // Most errors are therefore sent as GetPageResponse.status instead.
+  rpc GetPages (stream GetPageRequest) returns (stream GetPageResponse);
+
+  // Returns the size of a relation, as # of blocks.
+  rpc GetRelSize (GetRelSizeRequest) returns (GetRelSizeResponse);
+
+  // Fetches an SLRU segment.
+  rpc GetSlruSegment (GetSlruSegmentRequest) returns (GetSlruSegmentResponse);
+}
+
+// The LSN a request should read at.
+message ReadLsn {
+  // The request's read LSN. Required.
+  uint64 request_lsn = 1;
+  // If given, the caller guarantees that the page has not been modified since
+  // this LSN. Must be smaller than or equal to request_lsn. This allows the
+  // Pageserver to serve an old page without waiting for the request LSN to
+  // arrive. Valid for all request types.
+  //
+  // It is undefined behaviour to make a request such that the page was, in
+  // fact, modified between request_lsn and not_modified_since_lsn. The
+  // Pageserver might detect it and return an error, or it might return the old
+  // page version or the new page version. Setting not_modified_since_lsn equal
+  // to request_lsn is always safe, but can lead to unnecessary waiting.
+  uint64 not_modified_since_lsn = 2;
+}
+
+// A relation identifier.
+message RelTag {
+    uint32 spc_oid = 1;
+    uint32 db_oid = 2;
+    uint32 rel_number = 3;
+    uint32 fork_number = 4;
+}
+
+// Checks whether a relation exists, at the given LSN. Only valid on shard 0,
+// other shards will error.
+message CheckRelExistsRequest {
+  ReadLsn read_lsn = 1;
+  RelTag rel = 2;
+}
+
+message CheckRelExistsResponse {
+  bool exists = 1;
+}
+
+// Requests a base backup at a given LSN.
+message GetBaseBackupRequest {
+  // The LSN to fetch a base backup at.
+  ReadLsn read_lsn = 1;
+  // If true, logical replication slots will not be created.
+  bool replica = 2;
+}
+
+// Base backup response chunk, returned as an ordered stream.
+message GetBaseBackupResponseChunk {
+  // A basebackup data chunk. The size is undefined, but bounded by the 4 MB
+  // gRPC message size limit.
+  bytes chunk = 1;
+}
+
+// Requests the size of a database, as # of bytes. Only valid on shard 0, other
+// shards will error.
+message GetDbSizeRequest {
+  ReadLsn read_lsn = 1;
+  uint32 db_oid = 2;
+}
+
+message GetDbSizeResponse {
+  uint64 num_bytes = 1;
+}
+
+// Requests one or more pages.
+message GetPageRequest {
+  // A request ID. Will be included in the response. Should be unique for
+  // in-flight requests on the stream.
+  uint64 request_id = 1;
+  // The request class.
+  GetPageClass request_class = 2;
+  // The LSN to read at.
+  ReadLsn read_lsn = 3;
+  // The relation to read from.
+  RelTag rel = 4;
+  // Page numbers to read. Must belong to the remote shard.
+  //
+  // Multiple pages will be executed as a single batch by the Pageserver,
+  // amortizing layer access costs and parallelizing them. This may increase the
+  // latency of any individual request, but improves the overall latency and
+  // throughput of the batch as a whole.
+  //
+  // TODO: this causes an allocation in the common single-block case. The sender
+  // can use a SmallVec to stack-allocate it, but Prost will always deserialize
+  // into a heap-allocated Vec. Consider optimizing this.
+  //
+  // TODO: we might be able to avoid a sort or something if we mandate that these
+  // are always in order. But we can't currently rely on this on the server, because
+  // of compatibility with the libpq protocol handler.
+  repeated uint32 block_number = 5;
+}
+
+// A GetPageRequest class. Primarily intended for observability, but may also be
+// used for prioritization in the future.
+enum GetPageClass {
+  // Unknown class. For forwards compatibility: used when the client sends a
+  // class that the server doesn't know about.
+  GET_PAGE_CLASS_UNKNOWN = 0;
+  // A normal request. This is the default.
+  GET_PAGE_CLASS_NORMAL = 1;
+  // A prefetch request. NB: can only be classified on pg < 18.
+  GET_PAGE_CLASS_PREFETCH = 2;
+  // A background request (e.g. vacuum).
+  GET_PAGE_CLASS_BACKGROUND = 3;
+}
+
+// A GetPage response.
+//
+// A batch response will contain all of the requested pages. We could eagerly
+// emit individual pages as soon as they are ready, but on a readv() Postgres
+// holds buffer pool locks on all pages in the batch and we'll only return once
+// the entire batch is ready, so no one can make use of the individual pages.
+message GetPageResponse {
+  // The original request's ID.
+  uint64 request_id = 1;
+  // The response status code.
+  GetPageStatus status = 2;
+  // A string describing the status, if any.
+  string reason = 3;
+  // The 8KB page images, in the same order as the request. Empty if status != OK.
+  repeated bytes page_image = 4;
+}
+
+// A GetPageResponse status code. Since we use a bidirectional stream, we don't
+// want to send errors as gRPC statuses, since this would terminate the stream.
+enum GetPageStatus {
+  // Unknown status. For forwards compatibility: used when the server sends a
+  // status code that the client doesn't know about.
+  GET_PAGE_STATUS_UNKNOWN = 0;
+  // The request was successful.
+  GET_PAGE_STATUS_OK = 1;
+  // The page did not exist. The tenant/timeline/shard has already been
+  // validated during stream setup.
+  GET_PAGE_STATUS_NOT_FOUND = 2;
+  // The request was invalid.
+  GET_PAGE_STATUS_INVALID = 3;
+  // The tenant is rate limited. Slow down and retry later.
+  GET_PAGE_STATUS_SLOW_DOWN = 4;
+  // TODO: consider adding a GET_PAGE_STATUS_LAYER_DOWNLOAD in the case of a
+  // layer download. This could free up the server task to process other
+  // requests while the layer download is in progress.
+}
+
+// Fetches the size of a relation at a given LSN, as # of blocks. Only valid on
+// shard 0, other shards will error.
+message GetRelSizeRequest {
+  ReadLsn read_lsn = 1;
+  RelTag rel = 2;
+}
+
+message GetRelSizeResponse {
+  uint32 num_blocks = 1;
+}
+
+// Requests an SLRU segment. Only valid on shard 0, other shards will error.
+message GetSlruSegmentRequest {
+  ReadLsn read_lsn = 1;
+  uint32 kind = 2;
+  uint32 segno = 3;
+}
+
+// Returns an SLRU segment.
+//
+// These are up to 32 pages (256 KB), so we can send them as a single response.
+message GetSlruSegmentResponse {
+  bytes segment = 1;
+}
diff --git a/pageserver/page_api/src/lib.rs b/pageserver/page_api/src/lib.rs
new file mode 100644
index 0000000000..0226d594cb
--- /dev/null
+++ b/pageserver/page_api/src/lib.rs
@@ -0,0 +1,14 @@
+//! This crate provides the Pageserver's page API. It contains:
+//!
+//! * proto/page_service.proto: the Protobuf schema for the page API.
+//! * proto: auto-generated Protobuf types for gRPC.
+//!
+//! This crate is used by both the client and the server. Try to keep it slim.
+
+// Code generated by protobuf.
+pub mod proto {
+    tonic::include_proto!("page_service");
+
+    pub use page_service_client::PageServiceClient;
+    pub use page_service_server::{PageService, PageServiceServer};
+}

From 76a7d37f7e266a946a0de91dae89f7ded66ef09f Mon Sep 17 00:00:00 2001
From: Ivan Efremov <ivan@neon.tech>
Date: Mon, 19 May 2025 13:10:55 +0300
Subject: [PATCH 63/65] proxy: Drop cancellation ops if they don't fit into the
 queue (#11950)

Add a redis ops batch size argument for proxy and remove timeouts by
using try_send()
---
 proxy/src/binary/proxy.rs           | 12 ++++++++++--
 proxy/src/cancellation.rs           | 20 +++++++++-----------
 proxy/src/console_redirect_proxy.rs |  4 +---
 proxy/src/proxy/mod.rs              |  4 +---
 proxy/src/proxy/passthrough.rs      |  2 +-
 5 files changed, 22 insertions(+), 20 deletions(-)

diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs
index 51713902bc..f40d5041c1 100644
--- a/proxy/src/binary/proxy.rs
+++ b/proxy/src/binary/proxy.rs
@@ -161,8 +161,11 @@ struct ProxyCliArgs {
     #[clap(long, default_values_t = RateBucketInfo::DEFAULT_REDIS_SET)]
     redis_rps_limit: Vec<RateBucketInfo>,
     /// Cancellation channel size (max queue size for redis kv client)
-    #[clap(long, default_value = "1024")]
+    #[clap(long, default_value_t = 1024)]
     cancellation_ch_size: usize,
+    /// Cancellation ops batch size for redis
+    #[clap(long, default_value_t = 8)]
+    cancellation_batch_size: usize,
     /// cache for `allowed_ips` (use `size=0` to disable)
     #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
     allowed_ips_cache: String,
@@ -542,7 +545,12 @@ pub async fn run() -> anyhow::Result<()> {
             if let Some(mut redis_kv_client) = redis_kv_client {
                 maintenance_tasks.spawn(async move {
                     redis_kv_client.try_connect().await?;
-                    handle_cancel_messages(&mut redis_kv_client, rx_cancel).await?;
+                    handle_cancel_messages(
+                        &mut redis_kv_client,
+                        rx_cancel,
+                        args.cancellation_batch_size,
+                    )
+                    .await?;
 
                     drop(redis_kv_client);
 
diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs
index f34fb747ca..a6e7bf85a0 100644
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -30,8 +30,6 @@ use crate::tls::postgres_rustls::MakeRustlsConnect;
 type IpSubnetKey = IpNet;
 
 const CANCEL_KEY_TTL: i64 = 1_209_600; // 2 weeks cancellation key expire time
-const REDIS_SEND_TIMEOUT: std::time::Duration = std::time::Duration::from_millis(10);
-const BATCH_SIZE: usize = 8;
 
 // Message types for sending through mpsc channel
 pub enum CancelKeyOp {
@@ -231,12 +229,13 @@ impl CancelReplyOp {
 pub async fn handle_cancel_messages(
     client: &mut RedisKVClient,
     mut rx: mpsc::Receiver<CancelKeyOp>,
+    batch_size: usize,
 ) -> anyhow::Result<()> {
-    let mut batch = Vec::with_capacity(BATCH_SIZE);
-    let mut pipeline = Pipeline::with_capacity(BATCH_SIZE);
+    let mut batch = Vec::with_capacity(batch_size);
+    let mut pipeline = Pipeline::with_capacity(batch_size);
 
     loop {
-        if rx.recv_many(&mut batch, BATCH_SIZE).await == 0 {
+        if rx.recv_many(&mut batch, batch_size).await == 0 {
             warn!("shutting down cancellation queue");
             break Ok(());
         }
@@ -367,8 +366,7 @@ impl CancellationHandler {
             return Err(CancelError::InternalError);
         };
 
-        tx.send_timeout(op, REDIS_SEND_TIMEOUT)
-            .await
+        tx.try_send(op)
             .map_err(|e| {
                 tracing::warn!("failed to send GetCancelData for {key}: {e}");
             })
@@ -570,7 +568,7 @@ impl Session {
     }
 
     // Send the store key op to the cancellation handler and set TTL for the key
-    pub(crate) async fn write_cancel_key(
+    pub(crate) fn write_cancel_key(
         &self,
         cancel_closure: CancelClosure,
     ) -> Result<(), CancelError> {
@@ -596,14 +594,14 @@ impl Session {
             expire: CANCEL_KEY_TTL,
         };
 
-        let _ = tx.send_timeout(op, REDIS_SEND_TIMEOUT).await.map_err(|e| {
+        let _ = tx.try_send(op).map_err(|e| {
             let key = self.key;
             tracing::warn!("failed to send StoreCancelKey for {key}: {e}");
         });
         Ok(())
     }
 
-    pub(crate) async fn remove_cancel_key(&self) -> Result<(), CancelError> {
+    pub(crate) fn remove_cancel_key(&self) -> Result<(), CancelError> {
         let Some(tx) = &self.cancellation_handler.tx else {
             tracing::warn!("cancellation handler is not available");
             return Err(CancelError::InternalError);
@@ -619,7 +617,7 @@ impl Session {
                 .guard(RedisMsgKind::HDel),
         };
 
-        let _ = tx.send_timeout(op, REDIS_SEND_TIMEOUT).await.map_err(|e| {
+        let _ = tx.try_send(op).map_err(|e| {
             let key = self.key;
             tracing::warn!("failed to send RemoveCancelKey for {key}: {e}");
         });
diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs
index 0f2c3def0d..e3184e20d1 100644
--- a/proxy/src/console_redirect_proxy.rs
+++ b/proxy/src/console_redirect_proxy.rs
@@ -244,9 +244,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     let cancellation_handler_clone = Arc::clone(&cancellation_handler);
     let session = cancellation_handler_clone.get_key();
 
-    session
-        .write_cancel_key(node.cancel_closure.clone())
-        .await?;
+    session.write_cancel_key(node.cancel_closure.clone())?;
 
     prepare_client_connection(&node, *session.key(), &mut stream).await?;
 
diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs
index cf331b8bc0..0a86022e78 100644
--- a/proxy/src/proxy/mod.rs
+++ b/proxy/src/proxy/mod.rs
@@ -383,9 +383,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     let cancellation_handler_clone = Arc::clone(&cancellation_handler);
     let session = cancellation_handler_clone.get_key();
 
-    session
-        .write_cancel_key(node.cancel_closure.clone())
-        .await?;
+    session.write_cancel_key(node.cancel_closure.clone())?;
 
     prepare_client_connection(&node, *session.key(), &mut stream).await?;
 
diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs
index c100b8d716..8f9bd2de2d 100644
--- a/proxy/src/proxy/passthrough.rs
+++ b/proxy/src/proxy/passthrough.rs
@@ -94,7 +94,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> ProxyPassthrough<S> {
             tracing::warn!(session_id = ?self.session_id, ?err, "could not cancel the query in the database");
         }
 
-        drop(self.cancel.remove_cancel_key().await); // we don't need a result. If the queue is full, we just log the error
+        drop(self.cancel.remove_cancel_key()); // we don't need a result. If the queue is full, we just log the error
 
         res
     }

From 3685ad606d11de706b9d0eb5841b7801d6ae8a7d Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Mon, 19 May 2025 10:56:03 +0000
Subject: [PATCH 64/65] endpoint_storage: Fix metrics test by excluding
 assertion on macos (#11952)

---
 endpoint_storage/src/app.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/endpoint_storage/src/app.rs b/endpoint_storage/src/app.rs
index 0bd7fe5f28..f44efe6d7a 100644
--- a/endpoint_storage/src/app.rs
+++ b/endpoint_storage/src/app.rs
@@ -462,6 +462,8 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
         if var(REAL_S3_ENV).is_ok() {
             assert!(body.contains("remote_storage_s3_deleted_objects_total"));
         }
+
+        #[cfg(target_os = "linux")]
         assert!(body.contains("process_threads"));
     }
 

From 38dbc5f67f3dfbf501fb289f12f193bdec54ff6d Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Mon, 19 May 2025 13:17:45 +0200
Subject: [PATCH 65/65] pageserver/page_api: add binary Protobuf descriptor
 (#11968)

## Problem

A binary Protobuf schema descriptor can be used to expose an API
reflection service, which in turn allows convenient usage of e.g.
`grpcurl` against the gRPC server.

Touches #11728.

## Summary of changes

* Generate a binary schema descriptor as
`pageserver_page_api::proto::FILE_DESCRIPTOR_SET`.
* Opportunistically rename the Protobuf package from `page_service` to
`page_api`.
---
 pageserver/page_api/build.rs                 |  8 +++++++-
 pageserver/page_api/proto/page_service.proto | 15 ++++++++++++++-
 pageserver/page_api/src/lib.rs               |  7 ++++++-
 3 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/pageserver/page_api/build.rs b/pageserver/page_api/build.rs
index ce3c49ed82..e96297f10e 100644
--- a/pageserver/page_api/build.rs
+++ b/pageserver/page_api/build.rs
@@ -1,7 +1,13 @@
+use std::env;
+use std::path::PathBuf;
+
+/// Generates Rust code from .proto Protobuf schemas, along with a binary file
+/// descriptor set for Protobuf schema reflection.
 fn main() -> Result<(), Box<dyn std::error::Error>> {
-    // Generates Rust code from .proto Protobuf schemas.
+    let out_dir = PathBuf::from(env::var("OUT_DIR")?);
     tonic_build::configure()
         .bytes(["."])
+        .file_descriptor_set_path(out_dir.join("page_api_descriptor.bin"))
         .compile_protos(&["proto/page_service.proto"], &["proto"])
         .map_err(|err| err.into())
 }
diff --git a/pageserver/page_api/proto/page_service.proto b/pageserver/page_api/proto/page_service.proto
index 12e4d2f9db..f6acb3eeeb 100644
--- a/pageserver/page_api/proto/page_service.proto
+++ b/pageserver/page_api/proto/page_service.proto
@@ -11,6 +11,19 @@
 // - neon-shard-id: shard ID, as <number><count> in hex ("0b10" = shard 11 of 16, 0-based)
 // - neon-timeline-id: timeline ID ("f08c4e9a2d5f76b1e3a7c2d8910f4b3e")
 //
+// The service can be accessed via e.g. grpcurl:
+//
+//    ```
+//    grpcurl \
+//      -plaintext \
+//      -H "neon-tenant-id: 7c4a1f9e3bd6470c8f3e21a65bd2e980" \
+//      -H "neon-shard-id: 0b10" \
+//      -H "neon-timeline-id: f08c4e9a2d5f76b1e3a7c2d8910f4b3e" \
+//      -H "authorization: Bearer $JWT" \
+//      -d '{"read_lsn": {"request_lsn": 1234567890}, "rel": {"spc_oid": 1663, "db_oid": 1234, "rel_number": 5678, "fork_number": 0}}' \
+//      localhost:51051 page_api.PageService/CheckRelExists
+//    ```
+//
 // TODO: consider adding neon-compute-mode ("primary", "static", "replica").
 // However, this will require reconnecting when changing modes.
 //
@@ -20,7 +33,7 @@
 // - Compression
 
 syntax = "proto3";
-package page_service;
+package page_api;
 
 service PageService {
   // Returns whether a relation exists.
diff --git a/pageserver/page_api/src/lib.rs b/pageserver/page_api/src/lib.rs
index 0226d594cb..0b68d03aaa 100644
--- a/pageserver/page_api/src/lib.rs
+++ b/pageserver/page_api/src/lib.rs
@@ -7,7 +7,12 @@
 
 // Code generated by protobuf.
 pub mod proto {
-    tonic::include_proto!("page_service");
+    tonic::include_proto!("page_api");
+
+    /// File descriptor set for Protobuf schema reflection. This allows using
+    /// e.g. grpcurl with the API.
+    pub const FILE_DESCRIPTOR_SET: &[u8] =
+        tonic::include_file_descriptor_set!("page_api_descriptor");
 
     pub use page_service_client::PageServiceClient;
     pub use page_service_server::{PageService, PageServiceServer};