WIP

WIP advertisement sending
WIP
2026-05-21 07:00:38 +00:00 · 2025-06-12 06:41:07 -07:00 · 2025-06-08 20:10:30 -07:00 · 2025-06-06 19:03:23 -07:00 · 2025-06-06 18:36:16 -07:00 · 2025-06-06 17:40:59 -07:00
92 changed files with 3899 additions and 480 deletions
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -963,7 +963,7 @@ jobs:
          fi

      - name: Verify docker-compose example and test extensions
-        timeout-minutes: 20
+        timeout-minutes: 60
        env:
          TAG: >-
            ${{
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1112,6 +1112,12 @@ version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"

+[[package]]
+name = "cfg_aliases"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"
+
 [[package]]
 name = "cgroups-rs"
 version = "0.3.3"
@@ -1306,7 +1312,7 @@ dependencies = [
 "itertools 0.10.5",
 "jsonwebtoken",
 "metrics",
- "nix 0.27.1",
+ "nix 0.30.1",
 "notify",
 "num_cpus",
 "once_cell",
@@ -1429,7 +1435,7 @@ dependencies = [
 "humantime-serde",
 "hyper 0.14.30",
 "jsonwebtoken",
- "nix 0.27.1",
+ "nix 0.30.1",
 "once_cell",
 "pageserver_api",
 "pageserver_client",
@@ -3512,9 +3518,9 @@ dependencies = [

 [[package]]
 name = "libc"
-version = "0.2.169"
+version = "0.2.172"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"
+checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa"

 [[package]]
 name = "libloading"
@@ -3788,6 +3794,16 @@ version = "0.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"

+[[package]]
+name = "neon-shmem"
+version = "0.1.0"
+dependencies = [
+ "nix 0.30.1",
+ "tempfile",
+ "thiserror 1.0.69",
+ "workspace_hack",
+]
+
 [[package]]
 name = "never-say-never"
 version = "6.6.666"
@@ -3821,12 +3837,13 @@ dependencies = [

 [[package]]
 name = "nix"
-version = "0.27.1"
+version = "0.30.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053"
+checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6"
 dependencies = [
 "bitflags 2.8.0",
 "cfg-if",
+ "cfg_aliases",
 "libc",
 "memoffset 0.9.0",
 ]
@@ -4280,7 +4297,7 @@ dependencies = [
 "jsonwebtoken",
 "md5",
 "metrics",
- "nix 0.27.1",
+ "nix 0.30.1",
 "num-traits",
 "num_cpus",
 "once_cell",
@@ -4356,7 +4373,7 @@ dependencies = [
 "humantime",
 "humantime-serde",
 "itertools 0.10.5",
- "nix 0.27.1",
+ "nix 0.30.1",
 "once_cell",
 "postgres_backend",
 "postgres_ffi",
@@ -4417,6 +4434,16 @@ dependencies = [
 "workspace_hack",
 ]

+[[package]]
+name = "pageserver_page_api"
+version = "0.1.0"
+dependencies = [
+ "prost 0.13.3",
+ "tonic",
+ "tonic-build",
+ "workspace_hack",
+]
+
 [[package]]
 name = "papaya"
 version = "0.2.1"
@@ -6037,8 +6064,10 @@ dependencies = [
 "serde",
 "serde_json",
 "sha2",
+ "sk_ps_discovery",
 "smallvec",
 "storage_broker",
+ "storage_controller_client",
 "strum",
 "strum_macros",
 "thiserror 1.0.69",
@@ -6050,6 +6079,7 @@ dependencies = [
 "tokio-stream",
 "tokio-tar",
 "tokio-util",
+ "tonic",
 "tracing",
 "tracing-subscriber",
 "url",
@@ -6545,6 +6575,76 @@ version = "0.3.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de"

+[[package]]
+name = "sk_ps_discovery"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "async-stream",
+ "byteorder",
+ "bytes",
+ "camino",
+ "camino-tempfile",
+ "chrono",
+ "clap",
+ "crc32c",
+ "criterion",
+ "desim",
+ "env_logger",
+ "fail",
+ "futures",
+ "hex",
+ "http 1.1.0",
+ "http-utils",
+ "humantime",
+ "hyper 0.14.30",
+ "itertools 0.10.5",
+ "jsonwebtoken",
+ "metrics",
+ "once_cell",
+ "pageserver_api",
+ "parking_lot 0.12.1",
+ "pem",
+ "postgres-protocol",
+ "postgres_backend",
+ "postgres_ffi",
+ "pprof",
+ "pq_proto",
+ "rand 0.8.5",
+ "regex",
+ "remote_storage",
+ "reqwest",
+ "rustls 0.23.18",
+ "safekeeper_api",
+ "safekeeper_client",
+ "scopeguard",
+ "sd-notify",
+ "serde",
+ "serde_json",
+ "sha2",
+ "smallvec",
+ "storage_broker",
+ "strum",
+ "strum_macros",
+ "thiserror 1.0.69",
+ "tikv-jemallocator",
+ "tokio",
+ "tokio-io-timeout",
+ "tokio-postgres",
+ "tokio-rustls 0.26.0",
+ "tokio-stream",
+ "tokio-tar",
+ "tokio-util",
+ "tonic",
+ "tracing",
+ "tracing-subscriber",
+ "url",
+ "utils",
+ "wal_decoder",
+ "walproposer",
+ "workspace_hack",
+]
+
 [[package]]
 name = "slab"
 version = "0.4.8"
@@ -6651,6 +6751,7 @@ dependencies = [
 "rustls 0.23.18",
 "tokio",
 "tokio-rustls 0.26.0",
+ "tokio-util",
 "tonic",
 "tonic-build",
 "tracing",
@@ -6663,6 +6764,7 @@ name = "storage_controller"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "async-stream",
 "bytes",
 "camino",
 "chrono",
@@ -7899,7 +8001,7 @@ dependencies = [
 "humantime",
 "jsonwebtoken",
 "metrics",
- "nix 0.27.1",
+ "nix 0.30.1",
 "once_cell",
 "pem",
 "pin-project-lite",
@@ -8475,6 +8577,7 @@ dependencies = [
 "log",
 "memchr",
 "nix 0.26.4",
+ "nix 0.30.1",
 "nom",
 "num",
 "num-bigint",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,6 +9,7 @@ members = [
    "pageserver/ctl",
    "pageserver/client",
    "pageserver/pagebench",
+    "pageserver/page_api",
    "proxy",
    "safekeeper",
    "safekeeper/client",
@@ -23,6 +24,7 @@ members = [
    "libs/postgres_ffi",
    "libs/safekeeper_api",
    "libs/desim",
+    "libs/neon-shmem",
    "libs/utils",
    "libs/consumption_metrics",
    "libs/postgres_backend",
@@ -41,7 +43,7 @@ members = [
    "libs/proxy/postgres-protocol2",
    "libs/proxy/postgres-types2",
    "libs/proxy/tokio-postgres2",
-    "endpoint_storage",
+    "endpoint_storage", "libs/sk_ps_discovery",
 ]

 [workspace.package]
@@ -127,7 +129,7 @@ md5 = "0.7.0"
 measured = { version = "0.0.22", features=["lasso"] }
 measured-process = { version = "0.0.22" }
 memoffset = "0.9"
-nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] }
+nix = { version = "0.30.1", features = ["dir", "fs", "mman", "process", "socket", "signal", "poll"] }
 # Do not update to >= 7.0.0, at least. The update will have a significant impact
 # on compute startup metrics (start_postgres_ms), >= 25% degradation.
 notify = "6.0.0"
@@ -251,6 +253,7 @@ pageserver = { path = "./pageserver" }
 pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
 pageserver_client = { path = "./pageserver/client" }
 pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
+pageserver_page_api = { path = "./pageserver/page_api" }
 postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
 postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
 postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
@@ -259,6 +262,7 @@ pq_proto = { version = "0.1", path = "./libs/pq_proto/" }
 remote_storage = { version = "0.1", path = "./libs/remote_storage/" }
 safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" }
 safekeeper_client = { path = "./safekeeper/client" }
+sk_ps_discovery = { path = "./libs/sk_ps_discovery" }
 desim = { version = "0.1", path = "./libs/desim" }
 storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy.
 storage_controller_client = { path = "./storage_controller/client" }
--- a/build-tools.Dockerfile
+++ b/build-tools.Dockerfile
@@ -292,7 +292,7 @@ WORKDIR /home/nonroot

 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.86.0
+ENV RUSTC_VERSION=1.87.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 ARG RUSTFILT_VERSION=0.2.1
--- a/compute/patches/rum.patch
+++ b/compute/patches/rum.patch
@@ -7,7 +7,7 @@ index 255e616..1c6edb7 100644
 			 RelationGetRelationName(index));
 
 +#ifdef NEON_SMGR
-+	smgr_start_unlogged_build(index->rd_smgr);
++	smgr_start_unlogged_build(RelationGetSmgr(index));
 +#endif
 +
 	initRumState(&buildstate.rumstate, index);
@@ -18,7 +18,7 @@ index 255e616..1c6edb7 100644
 	rumUpdateStats(index, &buildstate.buildStats, buildstate.rumstate.isBuild);
 
 +#ifdef NEON_SMGR
-+	smgr_finish_unlogged_build_phase_1(index->rd_smgr);
++	smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index));
 +#endif
 +
 	/*
@@ -29,7 +29,7 @@ index 255e616..1c6edb7 100644
 	}
 
 +#ifdef NEON_SMGR
-+	smgr_end_unlogged_build(index->rd_smgr);
++	smgr_end_unlogged_build(RelationGetSmgr(index));
 +#endif
 +
 	/*
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -14,7 +14,7 @@

 use std::ffi::OsStr;
 use std::io::Write;
-use std::os::unix::prelude::AsRawFd;
+use std::os::fd::AsFd;
 use std::os::unix::process::CommandExt;
 use std::path::Path;
 use std::process::Command;
@@ -356,7 +356,7 @@ where
            let file = pid_file::claim_for_current_process(&path).expect("claim pid file");
            // Remove the FD_CLOEXEC flag on the pidfile descriptor so that the pidfile
            // remains locked after exec.
-            nix::fcntl::fcntl(file.as_raw_fd(), FcntlArg::F_SETFD(FdFlag::empty()))
+            nix::fcntl::fcntl(file.as_fd(), FcntlArg::F_SETFD(FdFlag::empty()))
                .expect("remove FD_CLOEXEC");
            // Don't run drop(file), it would close the file before we actually exec.
            std::mem::forget(file);
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -8,7 +8,6 @@
 use std::borrow::Cow;
 use std::collections::{BTreeSet, HashMap};
 use std::fs::File;
-use std::os::fd::AsRawFd;
 use std::path::PathBuf;
 use std::process::exit;
 use std::str::FromStr;
@@ -31,7 +30,7 @@ use control_plane::safekeeper::SafekeeperNode;
 use control_plane::storage_controller::{
    NeonStorageControllerStartArgs, NeonStorageControllerStopArgs, StorageController,
 };
-use nix::fcntl::{FlockArg, flock};
+use nix::fcntl::{Flock, FlockArg};
 use pageserver_api::config::{
    DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
    DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
@@ -749,16 +748,16 @@ struct TimelineTreeEl {

 /// A flock-based guard over the neon_local repository directory
 struct RepoLock {
-    _file: File,
+    _file: Flock<File>,
 }

 impl RepoLock {
    fn new() -> Result<Self> {
        let repo_dir = File::open(local_env::base_path())?;
-        let repo_dir_fd = repo_dir.as_raw_fd();
-        flock(repo_dir_fd, FlockArg::LockExclusive)?;
-
-        Ok(Self { _file: repo_dir })
+        match Flock::lock(repo_dir, FlockArg::LockExclusive) {
+            Ok(f) => Ok(Self { _file: f }),
+            Err((_, e)) => Err(e).context("flock error"),
+        }
    }
 }

--- a/libs/neon-shmem/Cargo.toml
+++ b/libs/neon-shmem/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "neon-shmem"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+thiserror.workspace = true
+nix.workspace=true
+workspace_hack = { version = "0.1", path = "../../workspace_hack" }
+
+[target.'cfg(target_os = "macos")'.dependencies]
+tempfile = "3.14.0"
--- a/libs/neon-shmem/src/lib.rs
+++ b/libs/neon-shmem/src/lib.rs
@@ -0,0 +1,418 @@
+//! Shared memory utilities for neon communicator
+
+use std::num::NonZeroUsize;
+use std::os::fd::{AsFd, BorrowedFd, OwnedFd};
+use std::ptr::NonNull;
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+use nix::errno::Errno;
+use nix::sys::mman::MapFlags;
+use nix::sys::mman::ProtFlags;
+use nix::sys::mman::mmap as nix_mmap;
+use nix::sys::mman::munmap as nix_munmap;
+use nix::unistd::ftruncate as nix_ftruncate;
+
+/// ShmemHandle represents a shared memory area that can be shared by processes over fork().
+/// Unlike shared memory allocated by Postgres, this area is resizable, up to 'max_size' that's
+/// specified at creation.
+///
+/// The area is backed by an anonymous file created with memfd_create(). The full address space for
+/// 'max_size' is reserved up-front with mmap(), but whenever you call [`ShmemHandle::set_size`],
+/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
+/// will cause the file to be expanded, but we might use mprotect() etc. to enforce that in the
+/// future.
+pub struct ShmemHandle {
+    /// memfd file descriptor
+    fd: OwnedFd,
+
+    max_size: usize,
+
+    // Pointer to the beginning of the shared memory area. The header is stored there.
+    shared_ptr: NonNull<SharedStruct>,
+
+    // Pointer to the beginning of the user data
+    pub data_ptr: NonNull<u8>,
+}
+
+/// This is stored at the beginning in the shared memory area.
+struct SharedStruct {
+    max_size: usize,
+
+    /// Current size of the backing file. The high-order bit is used for the RESIZE_IN_PROGRESS flag
+    current_size: AtomicUsize,
+}
+
+const RESIZE_IN_PROGRESS: usize = 1 << 63;
+
+const HEADER_SIZE: usize = std::mem::size_of::<SharedStruct>();
+
+/// Error type returned by the ShmemHandle functions.
+#[derive(thiserror::Error, Debug)]
+#[error("{msg}: {errno}")]
+pub struct Error {
+    pub msg: String,
+    pub errno: Errno,
+}
+
+impl Error {
+    fn new(msg: &str, errno: Errno) -> Error {
+        Error {
+            msg: msg.to_string(),
+            errno,
+        }
+    }
+}
+
+impl ShmemHandle {
+    /// Create a new shared memory area. To communicate between processes, the processes need to be
+    /// fork()'d after calling this, so that the ShmemHandle is inherited by all processes.
+    ///
+    /// If the ShmemHandle is dropped, the memory is unmapped from the current process. Other
+    /// processes can continue using it, however.
+    pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result<ShmemHandle, Error> {
+        // create the backing anonymous file.
+        let fd = create_backing_file(name)?;
+
+        Self::new_with_fd(fd, initial_size, max_size)
+    }
+
+    /// Map the backing file 'fd' and initialize the header. Panics if 'max_size' is >= 2^48 or
+    /// 'initial_size' > 'max_size'; resize and mmap failures are returned as an Error.
+    fn new_with_fd(
+        fd: OwnedFd,
+        initial_size: usize,
+        max_size: usize,
+    ) -> Result<ShmemHandle, Error> {
+        // We reserve the high-order bit for the RESIZE_IN_PROGRESS flag, and the actual size
+        // is a little larger than this because of the SharedStruct header. Make the upper limit
+        // somewhat smaller than that, because with anything close to that, you'll run out of
+        // memory anyway.
+        if max_size >= 1 << 48 {
+            panic!("max size {} too large", max_size);
+        }
+        if initial_size > max_size {
+            panic!("initial size {initial_size} larger than max size {max_size}");
+        }
+
+        // The actual sizes are the ones given by the caller plus the 'SharedStruct' header.
+        let initial_size = HEADER_SIZE + initial_size;
+        let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap();
+
+        // Reserve space for the initial size. This is done before mmap() so that an error here
+        // cannot leak a mapping that no ShmemHandle would ever unmap.
+        enlarge_file(fd.as_fd(), initial_size as u64)?;
+
+        // Reserve address space for 'max_size' with mmap. TODO: Use MAP_HUGETLB if possible
+        let start_ptr = unsafe {
+            nix_mmap(
+                None,
+                max_size,
+                ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
+                MapFlags::MAP_SHARED,
+                &fd,
+                0,
+            )
+        }
+        .map_err(|e| Error::new("mmap failed", e))?;
+
+        // Initialize the header
+        let shared: NonNull<SharedStruct> = start_ptr.cast();
+        unsafe {
+            shared.write(SharedStruct {
+                max_size: max_size.into(),
+                current_size: AtomicUsize::new(initial_size),
+            })
+        };
+
+        // The user data begins after the header
+        let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) };
+
+        Ok(ShmemHandle {
+            fd,
+            max_size: max_size.into(),
+            shared_ptr: shared,
+            data_ptr,
+        })
+    }
+
+    // return reference to the header
+    fn shared(&self) -> &SharedStruct {
+        unsafe { self.shared_ptr.as_ref() }
+    }
+
+    /// Resize the shared memory area to 'new_size' bytes of user data. Panics if 'new_size' is
+    /// larger than the 'max_size' specified when creating the area.
+    ///
+    /// Only one process/thread may resize at a time. A concurrent resize is detected through the
+    /// RESIZE_IN_PROGRESS bit in the shared header and reported as an Error.
+    pub fn set_size(&self, new_size: usize) -> Result<(), Error> {
+        let new_size = new_size + HEADER_SIZE;
+        let shared = self.shared();
+
+        if new_size > self.max_size {
+            panic!(
+                "new size ({}) is greater than max size ({})",
+                new_size, self.max_size
+            );
+        }
+        assert_eq!(self.max_size, shared.max_size);
+
+        // Lock the area by setting the RESIZE_IN_PROGRESS bit in 'current_size'. The size bits
+        // keep their old value while the lock is held, so current_size() stays meaningful.
+        // Ordering::Relaxed would probably be sufficient here, as we don't access any other memory
+        // and the posix_fallocate/ftruncate call is surely a synchronization point anyway. But
+        // since this is not performance-critical, better safe than sorry.
+        let mut old_size = shared.current_size.load(Ordering::Acquire);
+        loop {
+            if (old_size & RESIZE_IN_PROGRESS) != 0 {
+                return Err(Error::new(
+                    "concurrent resize detected",
+                    Errno::UnknownErrno,
+                ));
+            }
+            match shared.current_size.compare_exchange(
+                old_size,
+                old_size | RESIZE_IN_PROGRESS,
+                Ordering::Acquire,
+                Ordering::Relaxed,
+            ) {
+                Ok(_) => break,
+                Err(x) => old_size = x,
+            }
+        }
+
+        // Ok, we got the lock.
+        //
+        // NB: If anything goes wrong, we *must* clear the bit!
+        let result = {
+            use std::cmp::Ordering::{Equal, Greater, Less};
+            match new_size.cmp(&old_size) {
+                Less => nix_ftruncate(&self.fd, new_size as i64).map_err(|e| {
+                    Error::new("could not shrink shmem segment, ftruncate failed", e)
+                }),
+                Equal => Ok(()),
+                Greater => enlarge_file(self.fd.as_fd(), new_size as u64),
+            }
+        };
+
+        // Unlock: storing a plain size (new on success, old on failure) also clears the bit
+        shared.current_size.store(
+            if result.is_ok() { new_size } else { old_size },
+            Ordering::Release,
+        );
+
+        result
+    }
+
+    /// Returns the current user-visible size of the shared memory segment.
+    ///
+    /// NOTE: a concurrent set_size() call can change the size at any time. It is the caller's
+    /// responsibility not to access the area beyond the current size.
+    pub fn current_size(&self) -> usize {
+        let total_current_size =
+            self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS;
+        total_current_size - HEADER_SIZE
+    }
+}
+
+impl Drop for ShmemHandle {
+    fn drop(&mut self) {
+        // SAFETY: The pointer was obtained from mmap() with the given size.
+        // We unmap the entire region.
+        let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) };
+        // The fd is dropped automatically by OwnedFd.
+    }
+}
+
+/// Create a "backing file" for the shared memory area. On Linux, use memfd_create(), to create an
+/// anonymous in-memory file. On macos, fall back to a regular file. That's good enough for
+/// development and testing, but in production we want the file to stay in memory.
+///
+/// disable 'unused_variables' warnings, because in the macos path, 'name' is unused.
+#[allow(unused_variables)]
+fn create_backing_file(name: &str) -> Result<OwnedFd, Error> {
+    #[cfg(not(target_os = "macos"))]
+    {
+        nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
+            .map_err(|e| Error::new("memfd_create failed", e))
+    }
+    #[cfg(target_os = "macos")]
+    {
+        let file = tempfile::tempfile().map_err(|e| {
+            Error::new(
+                "could not create temporary file to back shmem area",
+                nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
+            )
+        })?;
+        Ok(OwnedFd::from(file))
+    }
+}
+
+/// Grow the backing file behind 'fd' to 'size' bytes.
+///
+/// Returns an Error carrying the errno if the underlying posix_fallocate() (ftruncate() on
+/// macos) call fails.
+fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
+    // Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
+    // we don't get a segfault later when trying to actually use it.
+    #[cfg(not(target_os = "macos"))]
+    {
+        nix::fcntl::posix_fallocate(fd, 0, size as i64)
+            .map_err(|e| Error::new("could not grow shmem segment, posix_fallocate failed", e))
+    }
+    // As a fallback on macos, which doesn't have posix_fallocate, use plain 'ftruncate'
+    #[cfg(target_os = "macos")]
+    {
+        nix::unistd::ftruncate(fd, size as i64)
+            .map_err(|e| Error::new("could not grow shmem segment, ftruncate failed", e))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use nix::unistd::ForkResult;
+    use std::ops::Range;
+
+    /// check that all bytes in given range have the expected value.
+    fn assert_range(ptr: *const u8, expected: u8, range: Range<usize>) {
+        for i in range {
+            let b = unsafe { *(ptr.add(i)) };
+            assert_eq!(expected, b, "unexpected byte at offset {}", i);
+        }
+    }
+
+    /// Write 'b' to all bytes in the given range
+    fn write_range(ptr: *mut u8, b: u8, range: Range<usize>) {
+        unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) };
+    }
+
+    // simple single-process test of growing and shrinking
+    #[test]
+    fn test_shmem_resize() -> Result<(), Error> {
+        let max_size = 1024 * 1024;
+        let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?;
+
+        assert_eq!(init_struct.current_size(), 0);
+
+        // Initial grow
+        let size1 = 10000;
+        init_struct.set_size(size1).unwrap();
+        assert_eq!(init_struct.current_size(), size1);
+
+        // Write some data
+        let data_ptr = init_struct.data_ptr.as_ptr();
+        write_range(data_ptr, 0xAA, 0..size1);
+        assert_range(data_ptr, 0xAA, 0..size1);
+
+        // Shrink
+        let size2 = 5000;
+        init_struct.set_size(size2).unwrap();
+        assert_eq!(init_struct.current_size(), size2);
+
+        // Grow again
+        let size3 = 20000;
+        init_struct.set_size(size3).unwrap();
+        assert_eq!(init_struct.current_size(), size3);
+
+        // Try to read it. The area that was shrunk and grown again should read as all zeros now
+        assert_range(data_ptr, 0xAA, 0..5000);
+        assert_range(data_ptr, 0, 5000..size1);
+
+        // Try to grow beyond max_size
+        //let size4 = max_size + 1;
+        //assert!(init_struct.set_size(size4).is_err());
+
+        // Dropping init_struct should unmap the memory
+        drop(init_struct);
+
+        Ok(())
+    }
+
+    /// This is used in tests to coordinate between test processes. It's like std::sync::Barrier,
+    /// but is stored in the shared memory area and works across processes. It's implemented by
+    /// polling, because e.g. standard rust mutexes are not guaranteed to work across processes.
+    struct SimpleBarrier {
+        num_procs: usize,
+        count: AtomicUsize,
+    }
+
+    impl SimpleBarrier {
+        unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) {
+            unsafe {
+                *ptr = SimpleBarrier {
+                    num_procs,
+                    count: AtomicUsize::new(0),
+                }
+            }
+        }
+
+        pub fn wait(&self) {
+            let old = self.count.fetch_add(1, Ordering::Relaxed);
+
+            let generation = old / self.num_procs;
+
+            let mut current = old + 1;
+            while current < (generation + 1) * self.num_procs {
+                std::thread::sleep(std::time::Duration::from_millis(10));
+                current = self.count.load(Ordering::Relaxed);
+            }
+        }
+    }
+
+    #[test]
+    fn test_multi_process() {
+        // Initialize
+        let max_size = 1_000_000_000_000;
+        let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap();
+        let ptr = init_struct.data_ptr.as_ptr();
+
+        // Store the SimpleBarrier in the first 1k of the area.
+        init_struct.set_size(10000).unwrap();
+        let barrier_ptr: *mut SimpleBarrier = unsafe {
+            ptr.add(ptr.align_offset(std::mem::align_of::<SimpleBarrier>()))
+                .cast()
+        };
+        unsafe { SimpleBarrier::init(barrier_ptr, 2) };
+        let barrier = unsafe { barrier_ptr.as_ref().unwrap() };
+
+        // Fork another test process. The code after this runs in both processes concurrently.
+        let fork_result = unsafe { nix::unistd::fork().unwrap() };
+
+        // In the parent, fill bytes between 1000..2000. In the child, between 2000..3000
+        if fork_result.is_parent() {
+            write_range(ptr, 0xAA, 1000..2000);
+        } else {
+            write_range(ptr, 0xBB, 2000..3000);
+        }
+        barrier.wait();
+        // Verify the contents. (in both processes)
+        assert_range(ptr, 0xAA, 1000..2000);
+        assert_range(ptr, 0xBB, 2000..3000);
+
+        // Grow, from the child this time
+        let size = 10_000_000;
+        if !fork_result.is_parent() {
+            init_struct.set_size(size).unwrap();
+        }
+        barrier.wait();
+
+        // make some writes at the end
+        if fork_result.is_parent() {
+            write_range(ptr, 0xAA, (size - 10)..size);
+        } else {
+            write_range(ptr, 0xBB, (size - 20)..(size - 10));
+        }
+        barrier.wait();
+
+        // Verify the contents. (This runs in both processes)
+        assert_range(ptr, 0, (size - 1000)..(size - 20));
+        assert_range(ptr, 0xBB, (size - 20)..(size - 10));
+        assert_range(ptr, 0xAA, (size - 10)..size);
+
+        if let ForkResult::Parent { child } = fork_result {
+            nix::sys::wait::waitpid(child, None).unwrap();
+        }
+    }
+}
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -6,9 +6,11 @@ use pageserver_api::shard::ShardIdentity;
 use postgres_ffi::TimestampTz;
 use serde::{Deserialize, Serialize};
 use tokio::time::Instant;
+use utils::generation::Generation;
 use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};
 use utils::lsn::Lsn;
 use utils::pageserver_feedback::PageserverFeedback;
+use utils::shard::ShardIndex;

 use crate::membership::Configuration;
 use crate::{ServerInfo, Term};
@@ -309,3 +311,29 @@ pub struct PullTimelineResponse {
    pub safekeeper_host: Option<String>,
    // TODO: add more fields?
 }
+
+#[derive(Clone, Debug, Serialize, Deserialize)]
+#[serde(tag = "action")]
+pub enum TenantShardPageserverAttachmentChange {
+    Attach(TenantShardPageserverAttachment),
+    Detach(TenantShardPageserverAttachment),
+}
+
+impl TenantShardPageserverAttachmentChange {
+    pub fn attachment(&self) -> &TenantShardPageserverAttachment {
+        match self {
+            TenantShardPageserverAttachmentChange::Attach(a) => a,
+            TenantShardPageserverAttachmentChange::Detach(a) => a,
+        }
+    }
+}
+
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct TenantShardPageserverAttachment {
+    pub shard_id: ShardIndex,
+    pub generation: Generation,
+    pub ps_id: NodeId,
+    // TODO: avoid transmitting this with every request.
+    // How nice things could be if there were simple DNS records for ps-$node_id.$cell.$region.$cloud.neon.tech
+    pub ps_hostname: String, // TODO: some type safety
+}
--- a/libs/sk_ps_discovery/Cargo.toml
+++ b/libs/sk_ps_discovery/Cargo.toml
@@ -0,0 +1,81 @@
+[package]
+name = "sk_ps_discovery"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+workspace_hack.workspace = true
+
+async-stream.workspace = true
+anyhow.workspace = true
+byteorder.workspace = true
+bytes.workspace = true
+camino.workspace = true
+camino-tempfile.workspace = true
+chrono.workspace = true
+clap = { workspace = true, features = ["derive"] }
+crc32c.workspace = true
+fail.workspace = true
+hex.workspace = true
+humantime.workspace = true
+http.workspace = true
+hyper0.workspace = true
+itertools.workspace = true
+jsonwebtoken.workspace = true
+futures.workspace = true
+once_cell.workspace = true
+parking_lot.workspace = true
+pageserver_api.workspace = true
+postgres-protocol.workspace = true
+pprof.workspace = true
+rand.workspace = true
+regex.workspace = true
+reqwest = { workspace = true, features = ["json"] }
+rustls.workspace = true
+scopeguard.workspace = true
+serde.workspace = true
+serde_json.workspace = true
+smallvec.workspace = true
+strum.workspace = true
+strum_macros.workspace = true
+thiserror.workspace = true
+tikv-jemallocator.workspace = true
+tokio = { workspace = true, features = ["fs"] }
+tokio-io-timeout.workspace = true
+tokio-postgres.workspace = true
+tokio-rustls.workspace = true
+tokio-tar.workspace = true
+tokio-util = { workspace = true }
+tonic = { workspace = true }
+tracing.workspace = true
+url.workspace = true
+metrics.workspace = true
+pem.workspace = true
+postgres_backend.workspace = true
+postgres_ffi.workspace = true
+pq_proto.workspace = true
+remote_storage.workspace = true
+safekeeper_api.workspace = true
+safekeeper_client.workspace = true
+sha2.workspace = true
+sd-notify.workspace = true
+storage_broker.workspace = true
+tokio-stream.workspace = true
+http-utils.workspace = true
+utils.workspace = true
+wal_decoder.workspace = true
+env_logger.workspace = true
+
+[dev-dependencies]
+criterion.workspace = true
+itertools.workspace = true
+walproposer.workspace = true
+rand.workspace = true
+desim.workspace = true
+tracing.workspace = true
+tracing-subscriber = { workspace = true, features = ["json"] }
+
+[[bench]]
+name = "bench"
+harness = false
--- a/libs/sk_ps_discovery/benches/bench.rs
+++ b/libs/sk_ps_discovery/benches/bench.rs
@@ -0,0 +1,97 @@
+//! Benchmarks for the `sk_ps_discovery::World` bookkeeping (commit_lsn advertisement computation).
+
+use std::time::Instant;
+
+use criterion::{Criterion, criterion_group, criterion_main};
+use pprof::criterion::{Output, PProfProfiler};
+use sk_ps_discovery::{
+    AttachmentUpdate, RemoteConsistentLsnAdv, TenantShardAttachmentId, TimelineAttachmentId,
+};
+use utils::{
+    generation::Generation,
+    id::{NodeId, TenantId, TenantTimelineId, TimelineId},
+    lsn::Lsn,
+    shard::ShardIndex,
+};
+
+/// Use jemalloc and enable profiling, to mirror bin/safekeeper.rs.
+#[global_allocator]
+static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
+
+// jemalloc option string, exported under the unmangled symbol name `malloc_conf`.
+// NOTE(review): presumably jemalloc reads this symbol at startup (see the jemalloc
+// TUNING documentation); the trailing NUL suggests it is consumed as a C string.
+#[allow(non_upper_case_globals)]
+#[unsafe(export_name = "malloc_conf")]
+pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0";
+
+// Register benchmarks with Criterion.
+criterion_group!(
+    name = benches;
+    config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
+    targets = bench_simple,
+);
+criterion_main!(benches);
+
+/// Benchmarks `World::get_commit_lsn_advertisements` on a pre-populated `World`.
+///
+/// Setup: 20 pageservers, each with 2000 unsharded tenants that have a single timeline.
+/// Every tenant is attached in two generations (10 and 11) and every timeline has
+/// commit_lsn 42.
+fn bench_simple(c: &mut Criterion) {
+    let mut g = c.benchmark_group("simple");
+
+    // setup
+    let mut world = sk_ps_discovery::World::default();
+
+    // Simplified view: lots of unsharded tenants with one timeline each
+    let n_pageservers = 20;
+    let n_tenant_shards_per_pageserver = 2000;
+    for ps_id in 1..=n_pageservers {
+        // `..n` is a `RangeTo`, which is not an iterator; the loop needs an explicit start.
+        for _ in 0..n_tenant_shards_per_pageserver {
+            let tenant_id = TenantId::generate();
+            let timeline_id = TimelineId::generate();
+            for generation in 10..=11 {
+                let tenant_shard_attachment_id = TenantShardAttachmentId {
+                    tenant_id,
+                    shard_id: ShardIndex::unsharded(),
+                    generation: Generation::Valid(generation),
+                };
+                let timeline_attachment = TimelineAttachmentId {
+                    tenant_timeline_id: TenantTimelineId {
+                        tenant_id,
+                        timeline_id,
+                    },
+                    shard_id: ShardIndex::unsharded(),
+                    generation: Generation::Valid(generation),
+                };
+                world.update_attachment(AttachmentUpdate {
+                    tenant_shard_attachment_id,
+                    action: sk_ps_discovery::AttachmentUpdateAction::Attach {
+                        ps_id: NodeId(ps_id),
+                    },
+                });
+                // NOTE: this arrives before the timeline's first commit_lsn, so `World`
+                // ignores it ("timeline is not known"); all attachments start out lagging.
+                world.handle_remote_consistent_lsn_advertisement(RemoteConsistentLsnAdv {
+                    remote_consistent_lsn: Lsn(23),
+                    attachment: timeline_attachment,
+                });
+            }
+            world.handle_commit_lsn_advancement(
+                TenantTimelineId {
+                    tenant_id,
+                    timeline_id,
+                },
+                Lsn(42),
+            );
+        }
+    }
+
+    // setup done; freeze the world so the measured closure can only read it
+    let world = world;
+    g.bench_function("get_commit_lsn_advertisements", |bencher| {
+        bencher.iter_custom(|iters| {
+            let started = Instant::now();
+
+            for _ in 0..iters {
+                criterion::black_box(world.get_commit_lsn_advertisements());
+            }
+
+            started.elapsed()
+        });
+    });
+
+    g.finish();
+}
--- a/libs/sk_ps_discovery/src/lib.rs
+++ b/libs/sk_ps_discovery/src/lib.rs
@@ -0,0 +1,515 @@
+#[cfg(test)]
+mod tests;
+
+use std::{
+    collections::{BTreeMap, BTreeSet, HashMap, HashSet, btree_map, hash_map},
+    ops::RangeInclusive,
+};
+
+use tracing::{info, warn};
+use utils::{
+    generation::Generation,
+    id::{NodeId, TenantId, TenantTimelineId, TimelineId},
+    lsn::Lsn,
+    merge_join,
+    shard::ShardIndex,
+};
+
+/// In-memory bookkeeping of tenant shard attachments, per-timeline commit_lsns and
+/// per-attachment remote_consistent_lsns, used to compute commit_lsn advertisements.
+#[derive(Debug, Default)]
+pub struct World {
+    // Known attachments and the pageserver node each one is attached to.
+    attachments: BTreeMap<TenantShardAttachmentId, NodeId>,
+    // Number of `attachments` entries per tenant (maintained by `update_attachment`).
+    attachment_count: HashMap<TenantId, u16>,
+    nodes_timelines: HashMap<NodeId, HashMap<TenantTimelineId, u16>>, // u16 is a refcount from each timeline attachment id
+    // Continuously maintained aggregate for efficient decision-making on quiescing:
+    // the number of attachments whose remote_consistent_lsn equals the timeline's commit_lsn.
+    // Quiesced timelines are always caught up; `quiesce_timeline` removes their entry here.
+    // A timeline can quiesce once this == attachment_count (TODO: this requires enforcing foreign key relationship between attachments and remote_consistent_lsn)
+    caught_up_count: HashMap<TenantTimelineId, u16>,
+
+    // BEGIN quiescing/active split
+    // Quiesced timelines, mapped to the commit_lsn at which they were quiesced.
+    quiesced_timelines: BTreeMap<TenantTimelineId, Lsn>,
+    // ^
+    // either a timeline is in quiesced_timelines
+    // or it is below
+    // v
+    commit_lsns: BTreeMap<TenantTimelineId, Lsn>,
+    remote_consistent_lsns: BTreeMap<TimelineAttachmentId, Lsn>,
+    // END quiescing/active split
+
+    // other fields
+}
+
+/// Identifies one attachment of a tenant shard: (tenant, shard, generation).
+///
+/// Key of `World::attachments`. The derived `Ord` compares `tenant_id` first, which
+/// `tenant_range` relies on to select all attachments of a tenant.
+#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, PartialOrd, Ord)]
+pub struct TenantShardAttachmentId {
+    pub tenant_id: TenantId,
+    pub shard_id: ShardIndex,
+    pub generation: Generation,
+}
+
+/// A `TenantShardAttachmentId` scoped to a single timeline of the tenant.
+///
+/// Key of `World::remote_consistent_lsns`. The derived `Ord` compares
+/// `tenant_timeline_id` first, which `timeline_range` relies on to select all
+/// attachments of a timeline.
+#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, PartialOrd, Ord)]
+pub struct TimelineAttachmentId {
+    pub tenant_timeline_id: TenantTimelineId,
+    pub shard_id: ShardIndex,
+    pub generation: Generation,
+}
+
+/// Input of `World::update_attachment`: which attachment changed and how.
+pub struct AttachmentUpdate {
+    pub tenant_shard_attachment_id: TenantShardAttachmentId,
+    pub action: AttachmentUpdateAction,
+}
+
+/// The change carried by an `AttachmentUpdate`.
+pub enum AttachmentUpdateAction {
+    /// The attachment exists on pageserver `ps_id`.
+    Attach { ps_id: NodeId },
+    /// The attachment no longer exists.
+    Detach,
+}
+
+/// Input of `World::handle_remote_consistent_lsn_advertisement`: the
+/// remote_consistent_lsn reported for one timeline attachment.
+pub struct RemoteConsistentLsnAdv {
+    pub attachment: TimelineAttachmentId,
+    pub remote_consistent_lsn: Lsn,
+}
+
+impl World {
+    /// Debug-build consistency check of the aggregates maintained by `World`.
+    ///
+    /// Does nothing unless `debug_assertions` are enabled. Panics if
+    /// - `caught_up_count` disagrees with what `commit_lsns` and
+    ///   `remote_consistent_lsns` imply,
+    /// - a timeline is tracked as quiesced and as active at the same time, or
+    /// - `nodes_timelines` differs from the refcounts recomputed from `attachments`.
+    fn check_invariants(&self) {
+        if !cfg!(debug_assertions) {
+            return;
+        }
+
+        // caught_up_count maintenance
+        {
+            for (tenant_timeline_id, caught_up_count) in
+                self.caught_up_count.iter().map(|(k, v)| (*k, *v))
+            {
+                // A timeline can become known before any attachment of its tenant does;
+                // treat the missing entry as zero attachments instead of panicking.
+                let attachment_count = self
+                    .attachment_count
+                    .get(&tenant_timeline_id.tenant_id)
+                    .copied()
+                    .unwrap_or(0);
+                assert!(caught_up_count <= attachment_count);
+                if caught_up_count == attachment_count {
+                    // Everything is caught up; the timeline is about to be quiesced (or has
+                    // no attachments). We cannot assert membership in `quiesced_timelines`
+                    // here: `quiesce_timeline` removes the `caught_up_count` entry, so an
+                    // entry that is still present always belongs to an active timeline.
+                    // Disjointness of quiesced and active state is checked by
+                    // "quiescing XOR ..." below.
+                    continue;
+                }
+                let commit_lsn = self.commit_lsns[&tenant_timeline_id];
+                let mut validate_caught_up = 0;
+                let mut validate_not_caught_up = 0;
+                for (_, r_c_lsn) in self
+                    .remote_consistent_lsns
+                    .range(TimelineAttachmentId::timeline_range(tenant_timeline_id))
+                    .map(|(k, v)| (*k, *v))
+                {
+                    if r_c_lsn == commit_lsn {
+                        validate_caught_up += 1;
+                    } else {
+                        assert!(r_c_lsn < commit_lsn);
+                        validate_not_caught_up += 1;
+                    }
+                }
+                assert_eq!(validate_caught_up, caught_up_count);
+                assert_eq!(
+                    validate_caught_up + validate_not_caught_up,
+                    attachment_count
+                );
+            }
+        }
+
+        // quiescing XOR ...
+        {
+            let quiesced_timelines: HashSet<TenantTimelineId> =
+                self.quiesced_timelines.keys().cloned().collect();
+            let commit_lsn_timelines: HashSet<TenantTimelineId> =
+                self.commit_lsns.keys().cloned().collect();
+            let remote_consistent_lsn_timelines: HashSet<TenantTimelineId> = self
+                .remote_consistent_lsns
+                .keys()
+                .map(|tlaid: &TimelineAttachmentId| tlaid.tenant_timeline_id)
+                .collect();
+            #[rustfmt::skip]
+            assert_eq!(0, quiesced_timelines.intersection(&commit_lsn_timelines).count());
+            #[rustfmt::skip]
+            assert_eq!(0, quiesced_timelines.intersection(&remote_consistent_lsn_timelines).count());
+        }
+
+        // nodes_timelines maintenance: recompute the refcounts as
+        // (known timelines of a tenant) x (attachments of that tenant) and compare.
+        {
+            let mut expect: HashMap<NodeId, HashMap<TenantTimelineId, u16>> = HashMap::new();
+            let all_ttids: BTreeSet<TenantTimelineId> = self
+                .quiesced_timelines
+                .keys()
+                .cloned()
+                .chain(
+                    self.remote_consistent_lsns
+                        .keys()
+                        .cloned()
+                        .map(|tlaid| tlaid.tenant_timeline_id),
+                )
+                .collect();
+            for ttid in all_ttids {
+                for (_, node_id) in self
+                    .attachments
+                    .range(TenantShardAttachmentId::tenant_range(ttid.tenant_id))
+                    .map(|(k, v)| (*k, *v))
+                {
+                    let expect = expect.entry(node_id).or_default();
+                    let refcount = expect.entry(ttid).or_default();
+                    *refcount += 1;
+                }
+            }
+            assert_eq!(expect, self.nodes_timelines);
+        }
+    }
+    /// Applies an attach/detach notification for one tenant shard attachment.
+    ///
+    /// - `Attach` of an unknown attachment records it, bumps the tenant's
+    ///   `attachment_count` and the node's refcounts for the tenant's active timelines,
+    ///   and re-activates all quiesced timelines of the tenant.
+    /// - `Attach` of a known attachment is a no-op; moving it to a different pageserver
+    ///   is ignored with a warning.
+    /// - `Detach` of a known attachment forgets it and releases its refcounts.
+    /// - `Detach` of an unknown attachment is a no-op.
+    ///
+    /// Panics on refcount/counter overflow or underflow.
+    pub fn update_attachment(&mut self, upd: AttachmentUpdate) {
+        self.check_invariants();
+        use AttachmentUpdateAction::*;
+        use btree_map::Entry::*;
+        let AttachmentUpdate {
+            tenant_shard_attachment_id,
+            action,
+        } = upd;
+        match (action, self.attachments.entry(tenant_shard_attachment_id)) {
+            (Attach { ps_id }, Occupied(e)) if *e.get() == ps_id => {
+                info!("attachment is already known")
+            }
+            (Attach { ps_id }, Occupied(e)) => {
+                warn!(current_node=%e.get(), proposed_node=%ps_id, "ignoring update that moves attachment to a different pageserver");
+            }
+            (Attach { ps_id }, Vacant(e)) => {
+                e.insert(ps_id);
+                // Keep attachment_count up to date.
+                // (Plain assignment: `+=` here would add the already incremented value.)
+                let attachment_count = self
+                    .attachment_count
+                    .entry(tenant_shard_attachment_id.tenant_id)
+                    .or_default();
+                *attachment_count = attachment_count.checked_add(1).unwrap();
+                // Keep nodes_timelines up to date
+                let nodes_timelines = self.nodes_timelines.entry(ps_id).or_default();
+                for (ttid, _) in self.commit_lsns.range(TenantTimelineId::tenant_range(
+                    tenant_shard_attachment_id.tenant_id,
+                )) {
+                    let refcount = nodes_timelines.entry(*ttid).or_default();
+                    *refcount = refcount.checked_add(1).unwrap();
+                }
+                // Don't keep an empty per-node map around.
+                if nodes_timelines.is_empty() {
+                    self.nodes_timelines.remove(&ps_id);
+                }
+                // TODO(review): active timelines of this tenant get no remote_consistent_lsns
+                // entry for the new attachment here, and quiesced timelines get no refcount.
+                // New shards may start at an older LSN than where we quiesced => activate all quiesced timelines.
+                let activate_range =
+                    TenantTimelineId::tenant_range(tenant_shard_attachment_id.tenant_id);
+                let activate: HashSet<TenantTimelineId> = self
+                    .quiesced_timelines
+                    .range(activate_range)
+                    .map(|(ttid, _quiesced_lsn)| *ttid)
+                    .collect();
+                for tenant_timeline_id in activate {
+                    self.activate_timeline(tenant_timeline_id);
+                }
+            }
+            (Detach, Occupied(e)) => {
+                let ps_id = e.remove();
+                // Keep attachment count up to date
+                let attachment_count = self
+                    .attachment_count
+                    .get_mut(&tenant_shard_attachment_id.tenant_id)
+                    .expect("attachment action initializes the hasmap entry");
+                *attachment_count = attachment_count.checked_sub(1).unwrap();
+                // Keep nodes_timelines up to date. The per-node map may legitimately be
+                // absent: Attach removes it again if the tenant had no active timelines.
+                if let hash_map::Entry::Occupied(mut node_entry) = self.nodes_timelines.entry(ps_id)
+                {
+                    let nodes_timelines = node_entry.get_mut();
+                    for (ttid, _) in self.commit_lsns.range(TenantTimelineId::tenant_range(
+                        tenant_shard_attachment_id.tenant_id,
+                    )) {
+                        if let hash_map::Entry::Occupied(mut timeline_entry) =
+                            nodes_timelines.entry(*ttid)
+                        {
+                            let refcount = timeline_entry.get_mut();
+                            *refcount = refcount.checked_sub(1).unwrap();
+                            // check_invariants expects released refcounts to disappear.
+                            if *refcount == 0 {
+                                timeline_entry.remove();
+                            }
+                        }
+                    }
+                    if nodes_timelines.is_empty() {
+                        node_entry.remove();
+                    }
+                }
+                // TODO(review): the detached attachment's remote_consistent_lsns entries
+                // (and their contribution to caught_up_count) are not cleaned up yet.
+            }
+            (Detach, Vacant(_)) => {
+                info!("detachment is already known");
+            }
+        }
+        self.check_invariants();
+    }
+    /// Processes a remote_consistent_lsn advertisement for one timeline attachment.
+    ///
+    /// For an active timeline the stored value only ever moves forward and never past
+    /// the timeline's commit_lsn. When the attachment reaches commit_lsn it is counted
+    /// as caught up, and the timeline is quiesced once all attachments of the tenant
+    /// are caught up. For a quiesced timeline, an advertisement that differs from the
+    /// quiesced LSN re-activates the timeline and is then re-processed. Advertisements
+    /// for unknown timelines are ignored.
+    pub fn handle_remote_consistent_lsn_advertisement(&mut self, adv: RemoteConsistentLsnAdv) {
+        self.check_invariants();
+        let RemoteConsistentLsnAdv {
+            attachment,
+            remote_consistent_lsn,
+        } = adv;
+        let ttid = attachment.tenant_timeline_id;
+
+        match self.remote_consistent_lsns.entry(attachment) {
+            btree_map::Entry::Occupied(mut occupied_entry) => {
+                // remote_consistent_lsns entries only exist for active timelines, and
+                // those always have a commit_lsn.
+                let commit_lsn = *self
+                    .commit_lsns
+                    .get(&ttid)
+                    .expect("active timeline must have a commit_lsn");
+                let current = occupied_entry.get_mut();
+                use std::cmp::Ordering::*;
+                match (*current).cmp(&remote_consistent_lsn) {
+                    Less if remote_consistent_lsn > commit_lsn => {
+                        // We never allow remote_consistent_lsn to be ahead of commit_lsn.
+                        warn!(
+                            "ignoring advertisement because remote_consistent_lsn is ahead of commit_lsn"
+                        );
+                    }
+                    Less => {
+                        *current = remote_consistent_lsn;
+                        // Only an attachment that reached commit_lsn counts as caught up;
+                        // it was strictly behind before, so it is counted exactly once.
+                        if remote_consistent_lsn == commit_lsn {
+                            let caught_up_count = self.caught_up_count.get_mut(&ttid).unwrap();
+                            *caught_up_count = caught_up_count.checked_add(1).unwrap();
+                            if *caught_up_count == self.attachment_count[&ttid.tenant_id] {
+                                self.quiesce_timeline(ttid);
+                            }
+                        }
+                    }
+                    Equal => {
+                        info!("ignoring no-op update, likely duplicate delivery");
+                    }
+                    Greater => {
+                        warn!(
+                            "ignoring advertisement because remote_consistent_lsn is moving backwards"
+                        );
+                    }
+                }
+            }
+            btree_map::Entry::Vacant(_) => {
+                match self.quiesced_timelines.get(&ttid).cloned() {
+                    Some(quiesced_lsn) if quiesced_lsn == remote_consistent_lsn => {
+                        info!("ignoring no-op update for quiesced timeline");
+                    }
+                    Some(_) => {
+                        self.activate_timeline(ttid);
+                        // recurse one level, guaranteed to hit `Occupied` case above
+                        self.handle_remote_consistent_lsn_advertisement(RemoteConsistentLsnAdv {
+                            attachment,
+                            remote_consistent_lsn,
+                        });
+                    }
+                    None => {
+                        info!("ignoring advertisement because timeline is not known");
+                    }
+                }
+            }
+        }
+        self.check_invariants();
+    }
+    /// Records that the commit_lsn of timeline `ttid` is now `update`.
+    ///
+    /// - Active timeline: the commit_lsn moves forward and `caught_up_count` is reset
+    ///   to 0; an equal value is ignored with a warning; a smaller value panics.
+    /// - Quiesced timeline: an `update` equal to the quiesced LSN is ignored; any other
+    ///   value re-activates the timeline and is then re-processed.
+    /// - Unknown timeline: it becomes known, with a remote_consistent_lsn of `Lsn(0)`
+    ///   and a `nodes_timelines` refcount for every current attachment of the tenant.
+    pub fn handle_commit_lsn_advancement(&mut self, ttid: TenantTimelineId, update: Lsn) {
+        self.check_invariants();
+        match self.commit_lsns.entry(ttid) {
+            btree_map::Entry::Occupied(mut entry) => {
+                let current = entry.get_mut();
+                use std::cmp::Ordering::*;
+                match (*current).cmp(&update) {
+                    Less => {
+                        *current = update;
+                        // We never allow remote_consistent_lsn to be ahead of commit_lsn.
+                        // Therefore, it is safe to say nothing is caught up anymore.
+                        let caught_up_count = self.caught_up_count.get_mut(&ttid).unwrap();
+                        *caught_up_count = 0;
+                    }
+                    Equal => {
+                        // This code runs in safekeeper impl, no reason why there would be duplicate delivery.
+                        warn!("ignoring no-op update; why is this happening?");
+                    }
+                    Greater => {
+                        panic!(
+                            "proposed commit_lsn would move it backwards: current={} update={}",
+                            current, update
+                        );
+                    }
+                }
+            }
+
+            btree_map::Entry::Vacant(entry) => {
+                match self.quiesced_timelines.get(&ttid).cloned() {
+                    Some(quiesced_lsn) if quiesced_lsn == update => {
+                        info!("ignoring no-op update for quiesced timeline");
+                    }
+                    Some(_) => {
+                        self.activate_timeline(ttid);
+                        // recurse one level, guaranteed to hit `Occupied` case above
+                        self.handle_commit_lsn_advancement(ttid, update);
+                    }
+                    None => {
+                        info!("first time hearing about this timeline, initializing");
+                        entry.insert(update);
+                        let replaced = self.caught_up_count.insert(ttid, 0);
+                        // only commit_lsn advancement makes timelines known to World
+                        assert_eq!(None, replaced);
+                        // Every current attachment of the tenant starts out at Lsn(0),
+                        // i.e. not caught up.
+                        for (attachment, node_id) in self
+                            .attachments
+                            .range(TenantShardAttachmentId::tenant_range(ttid.tenant_id))
+                        {
+                            let replaced = self.remote_consistent_lsns.insert(
+                                attachment.timeline_attachment_id(ttid.timeline_id),
+                                Lsn(0),
+                            );
+                            // only commit_lsn advancement makes timelines known to World
+                            assert_eq!(None, replaced);
+
+                            let nodes_timelines = self.nodes_timelines.entry(*node_id).or_default();
+                            let refcount = nodes_timelines.entry(ttid).or_default();
+                            *refcount = refcount.checked_add(1).unwrap();
+                        }
+                    }
+                }
+            }
+        }
+        self.check_invariants();
+    }
+
+    /// Computes, per pageserver node, the commit_lsns that should be advertised to it.
+    ///
+    /// Only active timelines (those in `commit_lsns`) are considered. A timeline is
+    /// advertised to a node unless the joined remote_consistent_lsn is already at or
+    /// beyond the timeline's commit_lsn.
+    ///
+    /// Panics if a node that receives an advertisement has no entry in
+    /// `nodes_timelines` (map indexing below), or if two joined rows disagree on the
+    /// commit_lsn of the same timeline.
+    pub fn get_commit_lsn_advertisements(&self) -> HashMap<NodeId, HashMap<TenantTimelineId, Lsn>> {
+        let mut commit_lsn_advertisements_by_node: HashMap<NodeId, HashMap<TenantTimelineId, Lsn>> =
+            HashMap::with_capacity(self.nodes_timelines.len());
+        let commit_lsns_iter = self.commit_lsns.iter().map(|(k, v)| (*k, *v));
+        let attachments_iter = self.attachments.iter().map(|(k, v)| (*k, *v));
+        let remote_consistent_lsns_iter = self.remote_consistent_lsns.iter().map(|(k, v)| (*k, *v));
+
+        // commit_lsns JOIN attachments ON tenant_id
+        let join = merge_join::inner_equi_join_with_merge_strategy(
+            commit_lsns_iter,
+            attachments_iter,
+            |(tenant_timeline_id, _)| tenant_timeline_id.tenant_id,
+            |(shard_attachment_id, _)| shard_attachment_id.tenant_id,
+        );
+        // ... LEFT JOIN remote_consistent_lsns ON tenant_id
+        // NOTE(review): the join key is the tenant_id only, not the full
+        // (timeline, shard, generation); whether a row can be paired with the
+        // remote_consistent_lsn of a different timeline/attachment of the same tenant
+        // depends on merge_join semantics not visible here -- confirm.
+        let join = merge_join::left_equi_join_with_merge_strategy(
+            join,
+            remote_consistent_lsns_iter,
+            |((ttid, _), _)| ttid.tenant_id,
+            |(tlaid, _)| tlaid.tenant_timeline_id.tenant_id,
+        );
+        for ((c, a), r) in join {
+            let (tenant_timeline_id, commit_lsn): (TenantTimelineId, Lsn) = c;
+            let (_, node_id): (TenantShardAttachmentId, NodeId) = a;
+            match r {
+                // TODO: can > ever happen?
+                Some((_, remote_consistent_lsn)) if remote_consistent_lsn >= commit_lsn => {
+                    // this timeline shard attachment is already caught up
+                    continue;
+                }
+                Some(_) | None => {
+                    // need to advertise
+                    // -> fallthrough
+                }
+            };
+            // DISTINCT node_id, array_agg(DISTINCT tenant_shard_id )
+            // The inner map is pre-sized with the number of timelines refcounted for the node.
+            let for_node = commit_lsn_advertisements_by_node
+                .entry(node_id)
+                .or_insert_with(|| HashMap::with_capacity(self.nodes_timelines[&node_id].len()));
+            match for_node.entry(tenant_timeline_id) {
+                hash_map::Entry::Vacant(vacant_entry) => {
+                    vacant_entry.insert(commit_lsn);
+                }
+                hash_map::Entry::Occupied(occupied_entry) => {
+                    // Several attachments on the same node: the commit_lsn must agree.
+                    assert_eq!(*occupied_entry.get(), commit_lsn);
+                }
+            }
+        }
+        commit_lsn_advertisements_by_node
+    }
+
+    /// Moves a quiesced timeline back into the active set; inverse of `quiesce_timeline`.
+    ///
+    /// The quiesced LSN becomes the timeline's commit_lsn and the remote_consistent_lsn
+    /// of every current attachment of the tenant. All those attachments are therefore
+    /// caught up, and `caught_up_count` is restored accordingly.
+    ///
+    /// Panics if the timeline is not quiesced.
+    fn activate_timeline(&mut self, tenant_timeline_id: TenantTimelineId) {
+        let quiesced_lsn = self
+            .quiesced_timelines
+            .remove(&tenant_timeline_id)
+            .expect("must call this function only on quiesced tenant_timeline_id");
+        let replaced = self.commit_lsns.insert(tenant_timeline_id, quiesced_lsn);
+        assert_eq!(None, replaced);
+        let reconstruct_remote_consistent_lsn_entries = self
+            .attachments
+            .range(TenantShardAttachmentId::tenant_range(
+                tenant_timeline_id.tenant_id,
+            ))
+            .map(|(k, _)| *k)
+            .map(|tenant_shard_attachment_id| {
+                (
+                    tenant_shard_attachment_id
+                        .timeline_attachment_id(tenant_timeline_id.timeline_id),
+                    quiesced_lsn,
+                )
+            });
+        let mut caught_up_count: u16 = 0;
+        for (key, value) in reconstruct_remote_consistent_lsn_entries {
+            let replaced = self.remote_consistent_lsns.insert(key, value);
+            assert_eq!(None, replaced);
+            caught_up_count = caught_up_count.checked_add(1).unwrap();
+        }
+        // quiesce_timeline removed the caught_up_count entry; active timelines need one,
+        // otherwise the next commit_lsn advancement panics when it looks the entry up.
+        let replaced = self
+            .caught_up_count
+            .insert(tenant_timeline_id, caught_up_count);
+        assert_eq!(None, replaced);
+    }
+
+    /// Moves an active timeline whose attachments are all caught up into
+    /// `quiesced_timelines`, dropping its `commit_lsns`, `caught_up_count` and
+    /// `remote_consistent_lsns` entries.
+    ///
+    /// Panics if the timeline is already quiesced, is not active, or if any of its
+    /// attachments is not exactly at the commit_lsn.
+    fn quiesce_timeline(&mut self, tenant_timeline_id: TenantTimelineId) {
+        self.check_invariants();
+        assert!(
+            !self.quiesced_timelines.contains_key(&tenant_timeline_id),
+            "only call this function on active timelines"
+        );
+        let quiesced_lsn = self
+            .commit_lsns
+            .remove(&tenant_timeline_id)
+            .expect("inconsistent: we checked it's not in quiesced_timelines, so, must be active");
+        let caught_up_count = self
+            .caught_up_count
+            .remove(&tenant_timeline_id)
+            .expect("inconsistent: we checked it's not in quiesced_timleines, so, must be active");
+        // Collect the keys first; we cannot remove from the map while ranging over it.
+        let attachment_keys: Vec<TimelineAttachmentId> = self
+            .remote_consistent_lsns
+            .range(TimelineAttachmentId::timeline_range(tenant_timeline_id))
+            .map(|(key, remote_consistent_lsn)| {
+                assert_eq!(*remote_consistent_lsn, quiesced_lsn);
+                *key
+            })
+            .collect();
+        assert_eq!(
+            caught_up_count,
+            u16::try_from(attachment_keys.len()).unwrap()
+        );
+        for key in attachment_keys {
+            assert!(
+                self.remote_consistent_lsns.remove(&key).is_some(),
+                "we just added"
+            );
+        }
+        let previous = self
+            .quiesced_timelines
+            .insert(tenant_timeline_id, quiesced_lsn);
+        assert_eq!(None, previous); // we checked at function entry
+        self.check_invariants();
+    }
+}
+
+impl TimelineAttachmentId {
+    /// Inclusive key range covering every (shard, generation) attachment of `ttid`.
+    pub fn timeline_range(ttid: TenantTimelineId) -> RangeInclusive<Self> {
+        let shards: RangeInclusive<_> = ShardIndex::RANGE;
+        let generations: RangeInclusive<_> = Generation::RANGE;
+        let lowest = Self {
+            tenant_timeline_id: ttid,
+            shard_id: *shards.start(),
+            generation: *generations.start(),
+        };
+        let highest = Self {
+            tenant_timeline_id: ttid,
+            shard_id: *shards.end(),
+            generation: *generations.end(),
+        };
+        lowest..=highest
+    }
+    /// Drops the timeline component, keeping tenant, shard and generation.
+    pub fn tenant_shard_attachment_id(self) -> TenantShardAttachmentId {
+        let Self {
+            tenant_timeline_id,
+            shard_id,
+            generation,
+        } = self;
+        TenantShardAttachmentId {
+            tenant_id: tenant_timeline_id.tenant_id,
+            shard_id,
+            generation,
+        }
+    }
+}
+
+impl TenantShardAttachmentId {
+    /// Scopes this attachment to timeline `timeline_id` of the same tenant.
+    pub fn timeline_attachment_id(self, timeline_id: TimelineId) -> TimelineAttachmentId {
+        let Self {
+            tenant_id,
+            shard_id,
+            generation,
+        } = self;
+        TimelineAttachmentId {
+            tenant_timeline_id: TenantTimelineId {
+                tenant_id,
+                timeline_id,
+            },
+            shard_id,
+            generation,
+        }
+    }
+    /// Inclusive key range covering every (shard, generation) attachment of `tenant_id`.
+    pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
+        let shards: RangeInclusive<_> = ShardIndex::RANGE;
+        let generations: RangeInclusive<_> = Generation::RANGE;
+        let lowest = Self {
+            tenant_id,
+            shard_id: *shards.start(),
+            generation: *generations.start(),
+        };
+        let highest = Self {
+            tenant_id,
+            shard_id: *shards.end(),
+            generation: *generations.end(),
+        };
+        lowest..=highest
+    }
+}
--- a/libs/sk_ps_discovery/src/tests.rs
+++ b/libs/sk_ps_discovery/src/tests.rs
@@ -0,0 +1,224 @@
+use utils::{id::TenantId, logging};
+
+use super::*;
+use crate::World;
+
+/// Asserts that `actual` equals `expect`, where `expect` is given as nested vectors
+/// for easier construction in tests.
+#[track_caller]
+fn validate_advertisements(
+    actual: HashMap<NodeId, HashMap<TenantTimelineId, Lsn>>,
+    expect: Vec<(NodeId, Vec<(TenantTimelineId, Lsn)>)>,
+) {
+    let mut expected_by_node: HashMap<NodeId, HashMap<TenantTimelineId, Lsn>> = HashMap::new();
+    for (node_id, timelines) in expect {
+        expected_by_node.insert(node_id, timelines.into_iter().collect());
+    }
+    assert_eq!(actual, expected_by_node);
+}
+
+/// Out-of-order delivery first (remote_consistent_lsn before anything else is known),
+/// then attachment and commit_lsn information arrive and advertisements appear.
+#[test]
+fn basic() {
+    let mut world = World::default();
+
+    let tenant_id = TenantId::from_array([0xff; 16]);
+    let timeline_id = TimelineId::from_array([1; 16]);
+    let timeline2 = TimelineId::from_array([2; 16]);
+
+    let attachment1 = TenantShardAttachmentId {
+        tenant_id,
+        shard_id: ShardIndex::unsharded(),
+        generation: Generation::Valid(2),
+    };
+    let attachment2 = TenantShardAttachmentId {
+        tenant_id,
+        shard_id: ShardIndex::unsharded(),
+        generation: Generation::Valid(3),
+    };
+
+    let ps1 = NodeId(0x100);
+
+    // Out of order; in happy path, commit_lsn advances first, but let's test the
+    // case where safekeeper doesn't know about the attachments yet first, before
+    // we extend the case to the happy path.
+
+    world.handle_remote_consistent_lsn_advertisement(RemoteConsistentLsnAdv {
+        attachment: attachment1.timeline_attachment_id(timeline_id),
+        remote_consistent_lsn: Lsn(0x23),
+    });
+    world.handle_remote_consistent_lsn_advertisement(RemoteConsistentLsnAdv {
+        attachment: attachment2.timeline_attachment_id(timeline_id),
+        remote_consistent_lsn: Lsn(0x42),
+    });
+    // SK authoritative info on which advertisements ought exist is still empty
+    assert_eq!(world.get_commit_lsn_advertisements(), HashMap::default());
+    world.update_attachment(AttachmentUpdate {
+        tenant_shard_attachment_id: attachment1,
+        action: AttachmentUpdateAction::Attach { ps_id: ps1 },
+    });
+    // We have not inserted any commit_lsn info yet, so, still no advs expected
+    assert_eq!(world.get_commit_lsn_advertisements(), HashMap::default());
+    // insert commit_lsn info for different timeline
+    world.handle_commit_lsn_advancement(
+        TenantTimelineId {
+            tenant_id,
+            timeline_id: timeline2,
+        },
+        Lsn(0x66),
+    );
+    // attachment1 belongs to the same tenant and has not caught up on timeline2,
+    // so timeline2 is advertised to ps1.
+    validate_advertisements(
+        world.get_commit_lsn_advertisements(),
+        vec![(
+            ps1,
+            vec![(
+                TenantTimelineId {
+                    tenant_id,
+                    timeline_id: timeline2,
+                },
+                Lsn(0x66),
+            )],
+        )],
+    );
+
+    // Ok, out of order part tested. Now Safekeeper learns the commit_lsn of the
+    // timeline that the (ignored) remote_consistent_lsn advertisements were for.
+
+    // insert commit_lsn info for the timeline we have remote_consistent_lsn info for
+    world.handle_commit_lsn_advancement(
+        TenantTimelineId {
+            tenant_id,
+            timeline_id,
+        },
+        Lsn(0x55),
+    );
+    dbg!(&world);
+    // Now both timelines are advertised to attachment1's pageserver: nothing has caught
+    // up on either of them. attachment2 is still not known, so, no advertisements to it.
+    validate_advertisements(
+        world.get_commit_lsn_advertisements(),
+        vec![(
+            ps1,
+            vec![
+                (
+                    TenantTimelineId {
+                        tenant_id,
+                        timeline_id,
+                    },
+                    Lsn(0x55),
+                ),
+                (
+                    TenantTimelineId {
+                        tenant_id,
+                        timeline_id: timeline2,
+                    },
+                    Lsn(0x66),
+                ),
+            ],
+        )],
+    );
+}
+
+/// A timeline that becomes known after its tenant was attached is advertised to the
+/// attachment's pageserver right away.
+#[test]
+fn advertisement_for_new_timeline() {
+    let mut world = World::default();
+
+    let tenant_id = TenantId::generate();
+    let timeline_id = TimelineId::generate();
+    let ttid = TenantTimelineId {
+        tenant_id,
+        timeline_id,
+    };
+    let ps_id = NodeId(0x100);
+
+    world.update_attachment(AttachmentUpdate {
+        tenant_shard_attachment_id: TenantShardAttachmentId {
+            tenant_id,
+            shard_id: ShardIndex::unsharded(),
+            generation: Generation::Valid(2),
+        },
+        action: AttachmentUpdateAction::Attach { ps_id },
+    });
+    world.handle_commit_lsn_advancement(ttid, Lsn(23));
+
+    let expected = vec![(ps_id, vec![(ttid, Lsn(23))])];
+    validate_advertisements(world.get_commit_lsn_advertisements(), expected);
+}
+
+/// Once the only attachment reports a remote_consistent_lsn equal to the commit_lsn,
+/// the timeline is quiesced.
+#[test]
+fn quiescing_timeline_catchup() {
+    let _guard = logging::init(
+        logging::LogFormat::Test,
+        logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
+        logging::Output::Stdout,
+    )
+    .unwrap();
+
+    let mut world = World::default();
+
+    let tenant_id = TenantId::generate();
+    let timeline_id = TimelineId::generate();
+    let ttid = TenantTimelineId {
+        tenant_id,
+        timeline_id,
+    };
+    let attachment_id = TenantShardAttachmentId {
+        tenant_id,
+        shard_id: ShardIndex::unsharded(),
+        generation: Generation::Valid(2),
+    };
+    let ps_id = NodeId(0x100);
+
+    world.update_attachment(AttachmentUpdate {
+        tenant_shard_attachment_id: attachment_id,
+        action: AttachmentUpdateAction::Attach { ps_id },
+    });
+    world.handle_commit_lsn_advancement(ttid, Lsn(23));
+
+    // Still active: the attachment has not reported anything yet.
+    assert!(world.quiesced_timelines.is_empty());
+
+    let caught_up = RemoteConsistentLsnAdv {
+        attachment: attachment_id.timeline_attachment_id(timeline_id),
+        remote_consistent_lsn: Lsn(23),
+    };
+    world.handle_remote_consistent_lsn_advertisement(caught_up);
+
+    assert!(world.quiesced_timelines.contains_key(&ttid));
+}
+
+/// `nodes_timelines` has no entry for a node until one of its tenants' timelines
+/// becomes known; an advertisement for an unknown timeline must not disturb it.
+#[test]
+fn nodes_timelines() {
+    let mut world = World::default();
+
+    let tenant_id = TenantId::generate();
+    let ttid = TenantTimelineId {
+        tenant_id,
+        timeline_id: TimelineId::from_array([0x1; 16]),
+    };
+    let attachment_id = TenantShardAttachmentId {
+        tenant_id,
+        shard_id: ShardIndex::unsharded(),
+        generation: Generation::Valid(2),
+    };
+    let ps_id = NodeId(0x100);
+
+    world.update_attachment(AttachmentUpdate {
+        tenant_shard_attachment_id: attachment_id,
+        action: AttachmentUpdateAction::Attach { ps_id },
+    });
+
+    // No timeline known yet => no per-node entry.
+    assert!(!world.nodes_timelines.contains_key(&ps_id));
+
+    world.handle_commit_lsn_advancement(ttid, Lsn(0x23));
+
+    assert_eq!(world.nodes_timelines[&ps_id].len(), 1);
+
+    // Advertisement for a timeline World has never heard of; there is no assertion
+    // afterwards, the call just has to get through World's internal invariant checks.
+    let unknown_timeline = TenantTimelineId {
+        tenant_id,
+        timeline_id: TimelineId::from_array([0x2; 16]),
+    };
+    world.handle_remote_consistent_lsn_advertisement(RemoteConsistentLsnAdv {
+        attachment: TimelineAttachmentId {
+            tenant_timeline_id: unknown_timeline,
+            shard_id: ShardIndex::unsharded(),
+            generation: Generation::Valid(2),
+        },
+        remote_consistent_lsn: Lsn(0x42),
+    });
+}
+
+// TODO: need more tests, esp for the removal path
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -1,7 +1,7 @@
 use std::borrow::Cow;
 use std::fs::{self, File};
 use std::io::{self, Write};
-use std::os::fd::AsRawFd;
+use std::os::fd::AsFd;

 use camino::{Utf8Path, Utf8PathBuf};

@@ -210,13 +210,13 @@ pub fn overwrite(

 /// Syncs the filesystem for the given file descriptor.
 #[cfg_attr(target_os = "macos", allow(unused_variables))]
-pub fn syncfs(fd: impl AsRawFd) -> anyhow::Result<()> {
+pub fn syncfs(fd: impl AsFd) -> anyhow::Result<()> {
    // Linux guarantees durability for syncfs.
    // POSIX doesn't have syncfs, and further does not actually guarantee durability of sync().
    #[cfg(target_os = "linux")]
    {
        use anyhow::Context;
-        nix::unistd::syncfs(fd.as_raw_fd()).context("syncfs")?;
+        nix::unistd::syncfs(fd).context("syncfs")?;
    }
    #[cfg(target_os = "macos")]
    {
--- a/libs/utils/src/fs_ext/rename_noreplace.rs
+++ b/libs/utils/src/fs_ext/rename_noreplace.rs
@@ -11,9 +11,9 @@ pub fn rename_noreplace<P1: ?Sized + NixPath, P2: ?Sized + NixPath>(
        #[cfg(all(target_os = "linux", target_env = "gnu"))]
        {
            nix::fcntl::renameat2(
-                None,
+                nix::fcntl::AT_FDCWD,
                src,
-                None,
+                nix::fcntl::AT_FDCWD,
                dst,
                nix::fcntl::RenameFlags::RENAME_NOREPLACE,
            )
--- a/libs/utils/src/generation.rs
+++ b/libs/utils/src/generation.rs
@@ -1,4 +1,4 @@
-use std::fmt::Debug;
+use std::{fmt::Debug, ops::RangeInclusive};

 use serde::{Deserialize, Serialize};

@@ -25,7 +25,9 @@ pub enum Generation {
 /// scenarios where pageservers might otherwise issue conflicting writes to
 /// remote storage
 impl Generation {
+    pub const MIN: Self = Self::None;
    pub const MAX: Self = Self::Valid(u32::MAX);
+    pub const RANGE: RangeInclusive<Self> = RangeInclusive::new(Self::MIN, Self::MAX);

    /// Create a new Generation that represents a legacy key format with
    /// no generation suffix
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -1,5 +1,6 @@
 use std::fmt;
 use std::num::ParseIntError;
+use std::ops::RangeInclusive;
 use std::str::FromStr;

 use anyhow::Context;
@@ -320,6 +321,19 @@ impl TenantTimelineId {
    pub fn empty() -> Self {
        Self::new(TenantId::from([0u8; 16]), TimelineId::from([0u8; 16]))
    }
+
+    /// Returns the inclusive range spanning every possible timeline ID of `tenant_id`:
+    /// from the all-`0x00` to the all-`0xff` [`TimelineId`].
+    ///
+    /// NOTE(review): presumably meant for range queries over ordered collections keyed by
+    /// `TenantTimelineId`; confirm the type's `Ord` sorts by `tenant_id` first.
+    pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
+        let bound = |fill: u8| Self {
+            tenant_id,
+            timeline_id: TimelineId::from_array([fill; 16]),
+        };
+        RangeInclusive::new(bound(u8::MIN), bound(u8::MAX))
+    }
 }

 impl fmt::Display for TenantTimelineId {
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -95,6 +95,9 @@ pub mod guard_arc_swap;

 pub mod elapsed_accum;

+pub mod merge_join;
+
+
 #[cfg(target_os = "linux")]
 pub mod linux_socket_ioctl;

--- a/libs/utils/src/lock_file.rs
+++ b/libs/utils/src/lock_file.rs
@@ -1,6 +1,6 @@
 //! A module to create and read lock files.
 //!
-//! File locking is done using [`fcntl::flock`] exclusive locks.
+//! File locking is done using [`nix::fcntl::Flock`] exclusive locks.
 //! The only consumer of this module is currently
 //! [`pid_file`](crate::pid_file). See the module-level comment
 //! there for potential pitfalls with lock files that are used
@@ -9,26 +9,25 @@
 use std::fs;
 use std::io::{Read, Write};
 use std::ops::Deref;
-use std::os::unix::prelude::AsRawFd;

 use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};
 use nix::errno::Errno::EAGAIN;
-use nix::fcntl;
+use nix::fcntl::{Flock, FlockArg};

 use crate::crashsafe;

-/// A handle to an open and unlocked, but not-yet-written lock file.
+/// A handle to an open and flocked, but not-yet-written lock file.
 /// Returned by [`create_exclusive`].
 #[must_use]
 pub struct UnwrittenLockFile {
    path: Utf8PathBuf,
-    file: fs::File,
+    file: Flock<fs::File>,
 }

 /// Returned by [`UnwrittenLockFile::write_content`].
 #[must_use]
-pub struct LockFileGuard(fs::File);
+pub struct LockFileGuard(Flock<fs::File>);

 impl Deref for LockFileGuard {
    type Target = fs::File;
@@ -67,17 +66,14 @@ pub fn create_exclusive(lock_file_path: &Utf8Path) -> anyhow::Result<UnwrittenLo
        .open(lock_file_path)
        .context("open lock file")?;

-    let res = fcntl::flock(
-        lock_file.as_raw_fd(),
-        fcntl::FlockArg::LockExclusiveNonblock,
-    );
+    let res = Flock::lock(lock_file, FlockArg::LockExclusiveNonblock);
    match res {
-        Ok(()) => Ok(UnwrittenLockFile {
+        Ok(lock_file) => Ok(UnwrittenLockFile {
            path: lock_file_path.to_owned(),
            file: lock_file,
        }),
-        Err(EAGAIN) => anyhow::bail!("file is already locked"),
-        Err(e) => Err(e).context("flock error"),
+        Err((_, EAGAIN)) => anyhow::bail!("file is already locked"),
+        Err((_, e)) => Err(e).context("flock error"),
    }
 }

@@ -105,32 +101,37 @@ pub enum LockFileRead {
 /// Check the [`LockFileRead`] variants for details.
 pub fn read_and_hold_lock_file(path: &Utf8Path) -> anyhow::Result<LockFileRead> {
    let res = fs::OpenOptions::new().read(true).open(path);
-    let mut lock_file = match res {
+    let lock_file = match res {
        Ok(f) => f,
        Err(e) => match e.kind() {
            std::io::ErrorKind::NotFound => return Ok(LockFileRead::NotExist),
            _ => return Err(e).context("open lock file"),
        },
    };
-    let res = fcntl::flock(
-        lock_file.as_raw_fd(),
-        fcntl::FlockArg::LockExclusiveNonblock,
-    );
+    let res = Flock::lock(lock_file, FlockArg::LockExclusiveNonblock);
    // We need the content regardless of lock success / failure.
    // But, read it after flock so that, if it succeeded, the content is consistent.
-    let mut content = String::new();
-    lock_file
-        .read_to_string(&mut content)
-        .context("read lock file")?;
    match res {
-        Ok(()) => Ok(LockFileRead::NotHeldByAnyProcess(
-            LockFileGuard(lock_file),
-            content,
-        )),
-        Err(EAGAIN) => Ok(LockFileRead::LockedByOtherProcess {
-            not_locked_file: lock_file,
-            content,
-        }),
-        Err(e) => Err(e).context("flock error"),
+        Ok(mut locked_file) => {
+            let mut content = String::new();
+            locked_file
+                .read_to_string(&mut content)
+                .context("read lock file")?;
+            Ok(LockFileRead::NotHeldByAnyProcess(
+                LockFileGuard(locked_file),
+                content,
+            ))
+        }
+        Err((mut not_locked_file, EAGAIN)) => {
+            let mut content = String::new();
+            not_locked_file
+                .read_to_string(&mut content)
+                .context("read lock file")?;
+            Ok(LockFileRead::LockedByOtherProcess {
+                not_locked_file,
+                content,
+            })
+        }
+        Err((_, e)) => Err(e).context("flock error"),
    }
 }
--- a/libs/utils/src/merge_join.rs
+++ b/libs/utils/src/merge_join.rs
@@ -0,0 +1,164 @@
+/// Inner equi-join of two iterators that are both sorted ascending by their join key.
+/// Only `r` advances on a match, so each `l` key is expected to be unique.
+pub fn inner_equi_join_with_merge_strategy<L, LI, R, RI, K, FL, FR>(
+    l: L,
+    r: R,
+    key_l: FL,
+    key_r: FR,
+) -> impl Iterator<Item = (LI, RI)>
+where
+    L: Iterator<Item = LI>, // + Sorted
+    R: Iterator<Item = RI>, // + Sorted
+    FL: 'static + Fn(&LI) -> K,
+    FR: 'static + Fn(&RI) -> K,
+    LI: Copy,
+    RI: Copy,
+    K: PartialEq + Eq + Ord,
+{
+    use std::cmp::Ordering;
+    let mut l = l.map(move |i| (i, key_l(&i))).peekable();
+    let mut r = r.map(move |i| (i, key_r(&i))).peekable();
+    std::iter::from_fn(move || {
+        loop {
+            let (Some((lv, lk)), Some((_, rk))) = (l.peek(), r.peek()) else {
+                return None; // either side is exhausted: no further pairs are possible
+            };
+            match lk.cmp(rk) {
+                // The side with the smaller key has no partner for it; skip that item.
+                Ordering::Less => drop(l.next()),
+                Ordering::Greater => drop(r.next()),
+                Ordering::Equal => {
+                    // Keep `l` in place: following `r` items may carry the same key.
+                    let (rv, _) = r.next().unwrap();
+                    return Some((*lv, rv));
+                }
+            }
+        }
+    })
+}
+
+/// Left equi-join of two iterators that are both sorted ascending by their join key.
+/// `key_l` / `key_r` extract the join key from an item of the respective side.
+///
+/// Every `l` item is yielded at least once: paired with each `r` item of equal key, or with
+/// `None` if there is no such `r` item. `r` items without a matching `l` item are dropped.
+///
+/// Only `r` advances on a match, so each `l` key is expected to be unique: a repeated `l` key
+/// finds its `r` partners already consumed and is yielded with `None`.
+pub fn left_equi_join_with_merge_strategy<L, LI, R, RI, K, FL, FR>(
+    l: L,
+    r: R,
+    key_l: FL,
+    key_r: FR,
+) -> impl Iterator<Item = (LI, Option<RI>)>
+where
+    L: Iterator<Item = LI>, // + Sorted
+    R: Iterator<Item = RI>, // + Sorted
+    FL: 'static + Fn(&LI) -> K,
+    FR: 'static + Fn(&RI) -> K,
+    LI: Copy,
+    RI: Copy,
+    K: PartialEq + Eq + Ord,
+{
+    use std::cmp::Ordering;
+    let mut l = l.map(move |i| (i, key_l(&i))).peekable();
+    let mut r = r.map(move |i| (i, key_r(&i))).peekable();
+    // Whether the `l` item currently at the front has already been yielded with a partner.
+    let mut l_had_match = false;
+    std::iter::from_fn(move || {
+        loop {
+            let (l_head, r_head) = (l.peek(), r.peek());
+            let (lv, lk) = l_head?; // `l` exhausted: the join is complete
+            match r_head.map(|(_, rk)| lk.cmp(rk)) {
+                // `r` is behind `l`: this right item has no left partner, skip it.
+                Some(Ordering::Greater) => drop(r.next()),
+                Some(Ordering::Equal) => {
+                    // Keep `l` in place: following `r` items may carry the same key.
+                    l_had_match = true;
+                    let (rv, _) = r.next().unwrap();
+                    return Some((*lv, Some(rv)));
+                }
+                // `r` is past this key or exhausted: the front `l` item is finished.
+                Some(Ordering::Less) | None => {
+                    let (lv, _) = l.next().unwrap();
+                    // Yield it with `None` only if it never got a partner; `take` also
+                    // resets the flag for the next `l` item.
+                    if !std::mem::take(&mut l_had_match) {
+                        return Some((lv, None));
+                    }
+                }
+            }
+        }
+    })
+}
+#[cfg(test)]
+mod tests {
+
+    #[test]
+    fn inner_equi_basic() {
+        let l = vec![b"a", b"c"];
+        let r = vec![b"aa", b"ad", b"ba", b"bb", b"ca", b"cb", b"cd", b"dd"];
+
+        let res: Vec<_> = super::inner_equi_join_with_merge_strategy(
+            l.into_iter(),
+            r.into_iter(),
+            |l| &l[0..1], // join key: the item's first byte
+            |r| &r[0..1], // join key: the item's first byte
+        )
+        .collect();
+
+        assert_eq!(
+            res,
+            vec![
+                (b"a", b"aa"),
+                (b"a", b"ad"),
+                (b"c", b"ca"),
+                (b"c", b"cb"),
+                (b"c", b"cd"),
+            ]
+        );
+    }
+
+    #[test]
+    fn left_equi_basic() {
+        /*
+        create table aleft (id text, aleft text);
+        create table aright (id text, aright text);
+        insert into aleft values ('a', 'a'), ('b', 'b');
+        insert into aright values ('a', 'aa'), ('a', 'ab'), ('c', 'cd');
+        select * from aleft left join aright using ("id");
+        */
+
+        let l = vec![b"a", b"b"];
+        let r = vec![b"aa", b"ab", b"cd"];
+
+        let res: Vec<_> = super::left_equi_join_with_merge_strategy(
+            l.into_iter(),
+            r.into_iter(),
+            |l| &l[0..1], // join key: the item's first byte
+            |r| &r[0..1], // join key: the item's first byte
+        )
+        .collect();
+
+        assert_eq!(
+            res,
+            vec![(b"a", Some(b"aa")), (b"a", Some(b"ab")), (b"b", None)]
+        );
+    }
+
+    #[test]
+    fn left_equi_basic_2() {
+        let l = vec![b"b"];
+        let r = vec![b"aa", b"ab", b"bb"]; // the leading "a*" items have no left partner
+
+        let res: Vec<_> = super::left_equi_join_with_merge_strategy(
+            l.into_iter(),
+            r.into_iter(),
+            |l| &l[0..1],
+            |r| &r[0..1],
+        )
+        .collect();
+
+        assert_eq!(res, vec![(b"b", Some(b"bb"))])
+    }
+}
--- a/libs/utils/src/shard.rs
+++ b/libs/utils/src/shard.rs
@@ -52,6 +52,7 @@ pub struct TenantShardId {
 impl ShardCount {
    pub const MAX: Self = Self(u8::MAX);
    pub const MIN: Self = Self(0);
+    pub const RANGE: RangeInclusive<Self> = RangeInclusive::new(Self::MIN, Self::MAX);

    /// The internal value of a ShardCount may be zero, which means "1 shard, but use
    /// legacy format for TenantShardId that excludes the shard suffix", also known
@@ -85,7 +86,9 @@ impl ShardCount {
 }

 impl ShardNumber {
+    pub const MIN: Self = Self(0);
    pub const MAX: Self = Self(u8::MAX);
+    pub const RANGE: RangeInclusive<Self> = RangeInclusive::new(Self::MIN, Self::MAX);
 }

 impl TenantShardId {
@@ -100,16 +103,17 @@ impl TenantShardId {
    /// The range of all TenantShardId that belong to a particular TenantId.  This is useful when
    /// you have a BTreeMap of TenantShardId, and are querying by TenantId.
    pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
+        let shard_index_range: RangeInclusive<_> = ShardIndex::RANGE;
        RangeInclusive::new(
            Self {
                tenant_id,
-                shard_number: ShardNumber(0),
-                shard_count: ShardCount(0),
+                shard_number: shard_index_range.start().shard_number,
+                shard_count: shard_index_range.start().shard_count,
            },
            Self {
                tenant_id,
-                shard_number: ShardNumber::MAX,
-                shard_count: ShardCount::MAX,
+                shard_number: shard_index_range.end().shard_number,
+                shard_count: shard_index_range.end().shard_count,
            },
        )
    }
@@ -241,6 +245,16 @@ impl From<[u8; 18]> for TenantShardId {
 }

 impl ShardIndex {
+    pub const MIN: Self = ShardIndex {
+        shard_number: ShardNumber::MIN,
+        shard_count: ShardCount::MIN,
+    };
+    pub const MAX: Self = ShardIndex {
+        shard_number: ShardNumber::MAX,
+        shard_count: ShardCount::MAX,
+    };
+    pub const RANGE: RangeInclusive<Self> = RangeInclusive::new(Self::MIN, Self::MAX);
+
    pub fn new(number: ShardNumber, count: ShardCount) -> Self {
        Self {
            shard_number: number,
--- a/libs/utils/src/sync.rs
+++ b/libs/utils/src/sync.rs
@@ -4,3 +4,5 @@ pub mod duplex;
 pub mod gate;

 pub mod spsc_fold;
+
+pub mod spsc_watch;
--- a/libs/utils/src/sync/spsc_fold.rs
+++ b/libs/utils/src/sync/spsc_fold.rs
@@ -56,7 +56,7 @@ impl<T: Send> Sender<T> {
    /// # Panics
    ///
    /// If `try_fold` panics,  any subsequent call to `send` panic.
-    pub async fn send<F>(&mut self, value: T, try_fold: F) -> Result<(), SendError>
+    pub async fn send<F>(&mut self, value: T, try_fold: F) -> Result<(), (T, SendError)>
    where
        F: Fn(&mut T, T) -> Result<(), T>,
    {
@@ -104,7 +104,9 @@ impl<T: Send> Sender<T> {
                    }
                    Poll::Pending
                }
-                State::ReceiverGone => Poll::Ready(Err(SendError::ReceiverGone)),
+                State::ReceiverGone => {
+                    Poll::Ready(Err((value.take().unwrap(), SendError::ReceiverGone)))
+                }
                State::SenderGone(_)
                | State::AllGone
                | State::SenderDropping
--- a/libs/utils/src/sync/spsc_watch.rs
+++ b/libs/utils/src/sync/spsc_watch.rs
@@ -0,0 +1,55 @@
+//! "watch" is probably not the right word, because `recv` takes the value out of the channel.
+
+use tokio_util::sync::CancellationToken;
+
+use crate::sync::spsc_fold;
+
+pub fn channel<T: Send>() -> (Sender<T>, Receiver<T>) {
+    let (tx, rx) = spsc_fold::channel();
+    let cancel = CancellationToken::new();
+    (
+        Sender {
+            tx,
+            _cancel: cancel.clone().drop_guard(), // cancels `cancel` when the Sender is dropped
+        },
+        Receiver { rx, cancel },
+    )
+}
+
+pub struct Sender<T> {
+    tx: spsc_fold::Sender<T>,
+    _cancel: tokio_util::sync::DropGuard, // held only for its effect on drop
+}
+
+pub struct Receiver<T> {
+    rx: spsc_fold::Receiver<T>,
+    cancel: CancellationToken, // awaited by `cancelled()`
+}
+
+impl<T: Send> Sender<T> {
+    pub fn send_replace(&mut self, value: T) -> Result<(), (T, spsc_fold::SendError)> {
+        poll_ready(self.tx.send(value, |old, new| { // polled once; must complete immediately
+            *old = new; // the fold step overwrites the stored value with the new one
+            Ok(()) // the fold never rejects the new value
+        }))
+    }
+}
+
+impl<T: Send> Receiver<T> {
+    pub async fn recv(&mut self) -> Result<T, spsc_fold::RecvError> {
+        self.rx.recv().await // plain delegation to the underlying spsc_fold receiver
+    }
+    pub async fn cancelled(&mut self) {
+        self.cancel.cancelled().await // the token is cancelled by the Sender's drop guard
+    }
+}
+
+/// Polls `f` exactly once and returns its output; panics if `f` is not ready immediately.
+///
+/// `now_or_never` performs that single poll directly, whereas `futures::executor::block_on`
+/// panics when it is invoked from within another `futures` executor.
+fn poll_ready<F: Future<Output = O>, O>(f: F) -> O {
+    use futures::FutureExt;
+    f.now_or_never()
+        .unwrap_or_else(|| unreachable!("expecting future to always return Ready"))
+}
--- a/pageserver/page_api/Cargo.toml
+++ b/pageserver/page_api/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "pageserver_page_api"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+prost.workspace = true
+tonic.workspace = true
+workspace_hack.workspace = true
+
+[build-dependencies]
+tonic-build.workspace = true
--- a/pageserver/page_api/build.rs
+++ b/pageserver/page_api/build.rs
@@ -0,0 +1,7 @@
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    // Compile the page service Protobuf schema into Rust code (messages plus the gRPC
+    // client/server stubs); `bytes(["."])` applies the `bytes` setting to every path.
+    let builder = tonic_build::configure().bytes(["."]);
+    builder.compile_protos(&["proto/page_service.proto"], &["proto"])?;
+    Ok(())
+}
--- a/pageserver/page_api/proto/page_service.proto
+++ b/pageserver/page_api/proto/page_service.proto
@@ -0,0 +1,220 @@
+// Page service, presented by pageservers for computes.
+//
+// This is the compute read path. It primarily serves page versions at given
+// LSNs, but also base backups, SLRU segments, and relation metadata.
+//
+// EXPERIMENTAL: this is still under development and subject to change.
+//
+// Request metadata headers:
+// - authorization: JWT token ("Bearer <token>"), if auth is enabled
+// - neon-tenant-id: tenant ID ("7c4a1f9e3bd6470c8f3e21a65bd2e980")
+// - neon-shard-id: shard ID, as <number><count> in hex ("0b10" = shard 11 of 16, 0-based)
+// - neon-timeline-id: timeline ID ("f08c4e9a2d5f76b1e3a7c2d8910f4b3e")
+//
+// TODO: consider adding neon-compute-mode ("primary", "static", "replica").
+// However, this will require reconnecting when changing modes.
+//
+// TODO: write implementation guidance on
+// - Health checks
+// - Tracing, OpenTelemetry
+// - Compression
+
+syntax = "proto3";
+package page_service;
+
+service PageService {
+  // Returns whether a relation exists.
+  rpc CheckRelExists(CheckRelExistsRequest) returns (CheckRelExistsResponse);
+
+  // Fetches a base backup.
+  rpc GetBaseBackup (GetBaseBackupRequest) returns (stream GetBaseBackupResponseChunk);
+
+  // Returns the total size of a database, as # of bytes.
+  rpc GetDbSize (GetDbSizeRequest) returns (GetDbSizeResponse);
+
+  // Fetches pages.
+  //
+  // This is implemented as a bidirectional streaming RPC for performance. Unary
+  // requests incur costs for e.g. HTTP/2 stream setup, header parsing,
+  // authentication, and so on -- with streaming, we only pay these costs during
+  // the initial stream setup. This ~doubles throughput in benchmarks. Other
+  // RPCs use regular unary requests, since they are not as frequent and
+  // performance-critical, and this simplifies implementation.
+  //
+  // NB: a status response (e.g. errors) will terminate the stream. The stream
+  // may be shared by e.g. multiple Postgres backends, so we should avoid this.
+  // Most errors are therefore sent as GetPageResponse.status instead.
+  rpc GetPages (stream GetPageRequest) returns (stream GetPageResponse);
+
+  // Returns the size of a relation, as # of blocks.
+  rpc GetRelSize (GetRelSizeRequest) returns (GetRelSizeResponse);
+
+  // Fetches an SLRU segment.
+  rpc GetSlruSegment (GetSlruSegmentRequest) returns (GetSlruSegmentResponse);
+}
+
+// The LSN a request should read at.
+message ReadLsn {
+  // The request's read LSN. Required.
+  uint64 request_lsn = 1;
+  // If given, the caller guarantees that the page has not been modified since
+  // this LSN. Must be smaller than or equal to request_lsn. This allows the
+  // Pageserver to serve an old page without waiting for the request LSN to
+  // arrive. Valid for all request types.
+  //
+  // It is undefined behaviour to make a request such that the page was, in
+  // fact, modified between request_lsn and not_modified_since_lsn. The
+  // Pageserver might detect it and return an error, or it might return the old
+  // page version or the new page version. Setting not_modified_since_lsn equal
+  // to request_lsn is always safe, but can lead to unnecessary waiting.
+  uint64 not_modified_since_lsn = 2;
+}
+
+// A relation identifier.
+message RelTag {
+    uint32 spc_oid = 1;
+    uint32 db_oid = 2;
+    uint32 rel_number = 3;
+    uint32 fork_number = 4;
+}
+
+// Checks whether a relation exists, at the given LSN. Only valid on shard 0,
+// other shards will error.
+message CheckRelExistsRequest {
+  ReadLsn read_lsn = 1;
+  RelTag rel = 2;
+}
+
+message CheckRelExistsResponse {
+  bool exists = 1;
+}
+
+// Requests a base backup at a given LSN.
+message GetBaseBackupRequest {
+  // The LSN to fetch a base backup at.
+  ReadLsn read_lsn = 1;
+  // If true, logical replication slots will not be created.
+  bool replica = 2;
+}
+
+// Base backup response chunk, returned as an ordered stream.
+message GetBaseBackupResponseChunk {
+  // A basebackup data chunk. The size is undefined, but bounded by the 4 MB
+  // gRPC message size limit.
+  bytes chunk = 1;
+}
+
+// Requests the size of a database, as # of bytes. Only valid on shard 0, other
+// shards will error.
+message GetDbSizeRequest {
+  ReadLsn read_lsn = 1;
+  uint32 db_oid = 2;
+}
+
+message GetDbSizeResponse {
+  uint64 num_bytes = 1;
+}
+
+// Requests one or more pages.
+message GetPageRequest {
+  // A request ID. Will be included in the response. Should be unique for
+  // in-flight requests on the stream.
+  uint64 request_id = 1;
+  // The request class.
+  GetPageClass request_class = 2;
+  // The LSN to read at.
+  ReadLsn read_lsn = 3;
+  // The relation to read from.
+  RelTag rel = 4;
+  // Page numbers to read. Must belong to the remote shard.
+  //
+  // Multiple pages will be executed as a single batch by the Pageserver,
+  // amortizing layer access costs and parallelizing them. This may increase the
+  // latency of any individual request, but improves the overall latency and
+  // throughput of the batch as a whole.
+  //
+  // TODO: this causes an allocation in the common single-block case. The sender
+  // can use a SmallVec to stack-allocate it, but Prost will always deserialize
+  // into a heap-allocated Vec. Consider optimizing this.
+  //
+  // TODO: we might be able to avoid a sort or something if we mandate that these
+  // are always in order. But we can't currently rely on this on the server, because
+  // of compatibility with the libpq protocol handler.
+  repeated uint32 block_number = 5;
+}
+
+// A GetPageRequest class. Primarily intended for observability, but may also be
+// used for prioritization in the future.
+enum GetPageClass {
+  // Unknown class. For forwards compatibility: used when the client sends a
+  // class that the server doesn't know about.
+  GET_PAGE_CLASS_UNKNOWN = 0;
+  // A normal request. This is the default.
+  GET_PAGE_CLASS_NORMAL = 1;
+  // A prefetch request. NB: can only be classified on pg < 18.
+  GET_PAGE_CLASS_PREFETCH = 2;
+  // A background request (e.g. vacuum).
+  GET_PAGE_CLASS_BACKGROUND = 3;
+}
+
+// A GetPage response.
+//
+// A batch response will contain all of the requested pages. We could eagerly
+// emit individual pages as soon as they are ready, but on a readv() Postgres
+// holds buffer pool locks on all pages in the batch and we'll only return once
+// the entire batch is ready, so no one can make use of the individual pages.
+message GetPageResponse {
+  // The original request's ID.
+  uint64 request_id = 1;
+  // The response status code.
+  GetPageStatus status = 2;
+  // A string describing the status, if any.
+  string reason = 3;
+  // The 8KB page images, in the same order as the request. Empty if status != OK.
+  repeated bytes page_image = 4;
+}
+
+// A GetPageResponse status code. Since we use a bidirectional stream, we don't
+// want to send errors as gRPC statuses, since this would terminate the stream.
+enum GetPageStatus {
+  // Unknown status. For forwards compatibility: used when the server sends a
+  // status code that the client doesn't know about.
+  GET_PAGE_STATUS_UNKNOWN = 0;
+  // The request was successful.
+  GET_PAGE_STATUS_OK = 1;
+  // The page did not exist. The tenant/timeline/shard has already been
+  // validated during stream setup.
+  GET_PAGE_STATUS_NOT_FOUND = 2;
+  // The request was invalid.
+  GET_PAGE_STATUS_INVALID = 3;
+  // The tenant is rate limited. Slow down and retry later.
+  GET_PAGE_STATUS_SLOW_DOWN = 4;
+  // TODO: consider adding a GET_PAGE_STATUS_LAYER_DOWNLOAD in the case of a
+  // layer download. This could free up the server task to process other
+  // requests while the layer download is in progress.
+}
+
+// Fetches the size of a relation at a given LSN, as # of blocks. Only valid on
+// shard 0, other shards will error.
+message GetRelSizeRequest {
+  ReadLsn read_lsn = 1;
+  RelTag rel = 2;
+}
+
+message GetRelSizeResponse {
+  uint32 num_blocks = 1;
+}
+
+// Requests an SLRU segment. Only valid on shard 0, other shards will error.
+message GetSlruSegmentRequest {
+  ReadLsn read_lsn = 1;
+  uint32 kind = 2;
+  uint32 segno = 3;
+}
+
+// Returns an SLRU segment.
+//
+// These are up to 32 pages (256 KB), so we can send them as a single response.
+message GetSlruSegmentResponse {
+  bytes segment = 1;
+}
--- a/pageserver/page_api/src/lib.rs
+++ b/pageserver/page_api/src/lib.rs
@@ -0,0 +1,14 @@
+//! This crate provides the Pageserver's page API. It contains:
+//!
+//! * proto/page_service.proto: the Protobuf schema for the page API.
+//! * proto: auto-generated Protobuf types for gRPC.
+//!
+//! This crate is used by both the client and the server. Try to keep it slim.
+
+// Code generated at build time by tonic-build from proto/page_service.proto (see build.rs).
+pub mod proto {
+    tonic::include_proto!("page_service"); // matches `package page_service;` in the .proto
+
+    pub use page_service_client::PageServiceClient;
+    pub use page_service_server::{PageService, PageServiceServer};
+}
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -423,11 +423,14 @@ fn start_pageserver(
                    .map(storage_broker::Certificate::from_pem),
            );
            // Note: we do not attempt connecting here (but validate endpoints sanity).
-            storage_broker::connect(
+            let service_client = storage_broker::connect(
                conf.broker_endpoint.clone(),
                conf.broker_keepalive_interval,
                tls_config,
-            )
+            )?;
+            anyhow::Ok(storage_broker::TimelineUpdatesSubscriber::new(
+                service_client,
+            ))
        })
        .with_context(|| {
            format!(
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -100,7 +100,7 @@ pub struct State {
    auth: Option<Arc<SwappableJwtAuth>>,
    allowlist_routes: &'static [&'static str],
    remote_storage: GenericRemoteStorage,
-    broker_client: storage_broker::BrokerClientChannel,
+    broker_client: storage_broker::TimelineUpdatesSubscriber,
    disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
    deletion_queue_client: DeletionQueueClient,
    secondary_controller: SecondaryController,
@@ -114,7 +114,7 @@ impl State {
        tenant_manager: Arc<TenantManager>,
        auth: Option<Arc<SwappableJwtAuth>>,
        remote_storage: GenericRemoteStorage,
-        broker_client: storage_broker::BrokerClientChannel,
+        broker_client: storage_broker::TimelineUpdatesSubscriber,
        disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
        deletion_queue_client: DeletionQueueClient,
        secondary_controller: SecondaryController,
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -1278,7 +1278,7 @@ impl PageServerHandler {
    }

    #[instrument(level = tracing::Level::DEBUG, skip_all)]
-    async fn pagesteam_handle_batched_message<IO>(
+    async fn pagestream_handle_batched_message<IO>(
        &mut self,
        pgb_writer: &mut PostgresBackend<IO>,
        batch: BatchedFeMessage,
@@ -1733,7 +1733,7 @@ impl PageServerHandler {
            };

            let result = self
-                .pagesteam_handle_batched_message(
+                .pagestream_handle_batched_message(
                    pgb_writer,
                    msg,
                    io_concurrency.clone(),
@@ -1909,7 +1909,7 @@ impl PageServerHandler {
                            return Err(e);
                        }
                    };
-                    self.pagesteam_handle_batched_message(
+                    self.pagestream_handle_batched_message(
                        pgb_writer,
                        batch,
                        io_concurrency.clone(),
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -48,7 +48,6 @@ use remote_timeline_client::{
    download_tenant_manifest,
 };
 use secondary::heatmap::{HeatMapTenant, HeatMapTimeline};
-use storage_broker::BrokerClientChannel;
 use timeline::compaction::{CompactionOutcome, GcCompactionQueue};
 use timeline::import_pgdata::ImportingTimeline;
 use timeline::offload::{OffloadError, offload_timeline};
@@ -153,7 +152,7 @@ pub const TIMELINES_SEGMENT_NAME: &str = "timelines";
 /// as the shared remote storage client and process initialization state.
 #[derive(Clone)]
 pub struct TenantSharedResources {
-    pub broker_client: storage_broker::BrokerClientChannel,
+    pub broker_client: storage_broker::TimelineUpdatesSubscriber,
    pub remote_storage: GenericRemoteStorage,
    pub deletion_queue_client: DeletionQueueClient,
    pub l0_flush_global_state: L0FlushGlobalState,
@@ -2107,7 +2106,7 @@ impl TenantShard {
    async fn unoffload_timeline(
        self: &Arc<Self>,
        timeline_id: TimelineId,
-        broker_client: storage_broker::BrokerClientChannel,
+        broker_client: storage_broker::TimelineUpdatesSubscriber,
        ctx: RequestContext,
    ) -> Result<Arc<Timeline>, TimelineArchivalError> {
        info!("unoffloading timeline");
@@ -2242,7 +2241,7 @@ impl TenantShard {
        self: &Arc<Self>,
        timeline_id: TimelineId,
        new_state: TimelineArchivalState,
-        broker_client: storage_broker::BrokerClientChannel,
+        broker_client: storage_broker::TimelineUpdatesSubscriber,
        ctx: RequestContext,
    ) -> Result<(), TimelineArchivalError> {
        info!("setting timeline archival config");
@@ -2571,7 +2570,7 @@ impl TenantShard {
    pub(crate) async fn create_timeline(
        self: &Arc<TenantShard>,
        params: CreateTimelineParams,
-        broker_client: storage_broker::BrokerClientChannel,
+        broker_client: storage_broker::TimelineUpdatesSubscriber,
        ctx: &RequestContext,
    ) -> Result<Arc<Timeline>, CreateTimelineError> {
        if !self.is_active() {
@@ -3299,7 +3298,7 @@ impl TenantShard {
    /// to delay background jobs. Background jobs can be started right away when None is given.
    fn activate(
        self: &Arc<Self>,
-        broker_client: BrokerClientChannel,
+        broker_client: storage_broker::TimelineUpdatesSubscriber,
        background_jobs_can_start: Option<&completion::Barrier>,
        ctx: &RequestContext,
    ) {
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -668,7 +668,9 @@ impl From<DownloadError> for UpdateError {

 impl From<std::io::Error> for UpdateError {
    fn from(value: std::io::Error) -> Self {
-        if let Some(nix::errno::Errno::ENOSPC) = value.raw_os_error().map(nix::errno::from_i32) {
+        if let Some(nix::errno::Errno::ENOSPC) =
+            value.raw_os_error().map(nix::errno::Errno::from_raw)
+        {
            UpdateError::NoSpace
        } else if value
            .get_ref()
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -61,7 +61,6 @@ use postgres_ffi::{WAL_SEGMENT_SIZE, to_pg_timestamp};
 use rand::Rng;
 use remote_storage::DownloadError;
 use serde_with::serde_as;
-use storage_broker::BrokerClientChannel;
 use tokio::runtime::Handle;
 use tokio::sync::mpsc::Sender;
 use tokio::sync::{Notify, oneshot, watch};
@@ -2080,7 +2079,7 @@ impl Timeline {
    pub(crate) fn activate(
        self: &Arc<Self>,
        parent: Arc<crate::tenant::TenantShard>,
-        broker_client: BrokerClientChannel,
+        broker_client: storage_broker::TimelineUpdatesSubscriber,
        background_jobs_can_start: Option<&completion::Barrier>,
        ctx: &RequestContext,
    ) {
@@ -3114,7 +3113,7 @@ impl Timeline {
    fn launch_wal_receiver(
        self: &Arc<Self>,
        ctx: &RequestContext,
-        broker_client: BrokerClientChannel,
+        broker_client: storage_broker::TimelineUpdatesSubscriber,
    ) {
        info!(
            "launching WAL receiver for timeline {} of tenant {}",
--- a/pageserver/src/tenant/timeline/uninit.rs
+++ b/pageserver/src/tenant/timeline/uninit.rs
@@ -161,7 +161,7 @@ impl<'t> UninitializedTimeline<'t> {
        tenant: Arc<TenantShard>,
        copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin),
        base_lsn: Lsn,
-        broker_client: storage_broker::BrokerClientChannel,
+        broker_client: storage_broker::TimelineUpdatesSubscriber,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<Timeline>> {
        self.write(|raw_timeline| async move {
--- a/pageserver/src/tenant/timeline/walreceiver.rs
+++ b/pageserver/src/tenant/timeline/walreceiver.rs
@@ -28,7 +28,6 @@ use std::num::NonZeroU64;
 use std::sync::Arc;
 use std::time::Duration;

-use storage_broker::BrokerClientChannel;
 use tokio::sync::watch;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
@@ -70,7 +69,7 @@ impl WalReceiver {
    pub fn start(
        timeline: Arc<Timeline>,
        conf: WalReceiverConf,
-        mut broker_client: BrokerClientChannel,
+        mut broker_client: storage_broker::TimelineUpdatesSubscriber,
        ctx: &RequestContext,
    ) -> Self {
        let tenant_shard_id = timeline.tenant_shard_id;
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -17,19 +17,12 @@ use std::time::Duration;

 use anyhow::Context;
 use chrono::{NaiveDateTime, Utc};
+use futures::StreamExt;
 use pageserver_api::models::TimelineState;
 use postgres_connection::PgConnectionConfig;
-use storage_broker::proto::{
-    FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse,
-    SubscribeByFilterRequest, TenantTimelineId as ProtoTenantTimelineId, TypeSubscription,
-    TypedMessage,
-};
-use storage_broker::{BrokerClientChannel, Code, Streaming};
+use storage_broker::proto::SafekeeperDiscoveryResponse;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
-use utils::backoff::{
-    DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, exponential_backoff,
-};
 use utils::id::{NodeId, TenantTimelineId};
 use utils::lsn::Lsn;
 use utils::postgres_client::{
@@ -56,7 +49,7 @@ pub(crate) struct Cancelled;
 ///
 /// Not cancellation-safe. Use `cancel` token to request cancellation.
 pub(super) async fn connection_manager_loop_step(
-    broker_client: &mut BrokerClientChannel,
+    broker_client: &mut storage_broker::TimelineUpdatesSubscriber,
    connection_manager_state: &mut ConnectionManagerState,
    ctx: &RequestContext,
    cancel: &CancellationToken,
@@ -81,11 +74,6 @@ pub(super) async fn connection_manager_loop_step(
        WALRECEIVER_ACTIVE_MANAGERS.dec();
    }

-    let id = TenantTimelineId {
-        tenant_id: connection_manager_state.timeline.tenant_shard_id.tenant_id,
-        timeline_id: connection_manager_state.timeline.timeline_id,
-    };
-
    let mut timeline_state_updates = connection_manager_state
        .timeline
        .subscribe_for_state_updates();
@@ -101,7 +89,12 @@ pub(super) async fn connection_manager_loop_step(
    // Subscribe to the broker updates. Stream shares underlying TCP connection
    // with other streams on this client (other connection managers). When
    // object goes out of scope, stream finishes in drop() automatically.
-    let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?;
+    let (timeline_updates, mut discovery_requester) = broker_client.subscribe(
+        connection_manager_state.timeline.tenant_shard_id,
+        connection_manager_state.timeline.timeline_id,
+        cancel,
+    );
+    let mut timeline_updates = Box::pin(timeline_updates);
    debug!("Subscribed for broker timeline updates");

    loop {
@@ -155,29 +148,10 @@ pub(super) async fn connection_manager_loop_step(
                }
            },

-            // Got a new update from the broker
-            broker_update = broker_subscription.message() /* TODO: review cancellation-safety */ => {
-                match broker_update {
-                    Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update),
-                    Err(status) => {
-                        match status.code() {
-                            Code::Unknown if status.message().contains("stream closed because of a broken pipe") || status.message().contains("connection reset") || status.message().contains("error reading a body from connection") => {
-                                // tonic's error handling doesn't provide a clear code for disconnections: we get
-                                // "h2 protocol error: error reading a body from connection: stream closed because of a broken pipe"
-                                // => https://github.com/neondatabase/neon/issues/9562
-                                info!("broker disconnected: {status}");
-                            },
-                            _ => {
-                                warn!("broker subscription failed: {status}");
-                            }
-                        }
-                        return Ok(());
-                    }
-                    Ok(None) => {
-                        error!("broker subscription stream ended"); // can't happen
-                        return Ok(());
-                    }
-                }
+            // Got a new update from the broker.
+            // The stream ends with None if and only if `cancel` is cancelled.
+            Some(timeline_update) = timeline_updates.next() => {
+                connection_manager_state.register_timeline_update(timeline_update)
            },

            new_event = async {
@@ -258,32 +232,11 @@ pub(super) async fn connection_manager_loop_step(
                    tokio::time::sleep(next_discovery_ts - now).await;
                }

-                let tenant_timeline_id = Some(ProtoTenantTimelineId {
-                    tenant_id: id.tenant_id.as_ref().to_owned(),
-                    timeline_id: id.timeline_id.as_ref().to_owned(),
-                });
-                let request = SafekeeperDiscoveryRequest { tenant_timeline_id };
-                let msg = TypedMessage {
-                    r#type: MessageType::SafekeeperDiscoveryRequest as i32,
-                    safekeeper_timeline_info: None,
-                    safekeeper_discovery_request: Some(request),
-                    safekeeper_discovery_response: None,
-                    };
+                info!("No active connection and no candidates, sending discovery request to the broker");
+                discovery_requester.request().await;

                last_discovery_ts = Some(std::time::Instant::now());
-                info!("No active connection and no candidates, sending discovery request to the broker");

-                // Cancellation safety: we want to send a message to the broker, but publish_one()
-                // function can get cancelled by the other select! arm. This is absolutely fine, because
-                // we just want to receive broker updates and discovery is not important if we already
-                // receive updates.
-                //
-                // It is possible that `last_discovery_ts` will be updated, but the message will not be sent.
-                // This is totally fine because of the reason above.
-
-                // This is a fire-and-forget request, we don't care about the response
-                let _ = broker_client.publish_one(msg).await;
-                debug!("Discovery request sent to the broker");
                None
            } => {}
        }
@@ -298,63 +251,6 @@ pub(super) async fn connection_manager_loop_step(
    }
 }

-/// Endlessly try to subscribe for broker updates for a given timeline.
-async fn subscribe_for_timeline_updates(
-    broker_client: &mut BrokerClientChannel,
-    id: TenantTimelineId,
-    cancel: &CancellationToken,
-) -> Result<Streaming<TypedMessage>, Cancelled> {
-    let mut attempt = 0;
-    loop {
-        exponential_backoff(
-            attempt,
-            DEFAULT_BASE_BACKOFF_SECONDS,
-            DEFAULT_MAX_BACKOFF_SECONDS,
-            cancel,
-        )
-        .await;
-        attempt += 1;
-
-        // subscribe to the specific timeline
-        let request = SubscribeByFilterRequest {
-            types: vec![
-                TypeSubscription {
-                    r#type: MessageType::SafekeeperTimelineInfo as i32,
-                },
-                TypeSubscription {
-                    r#type: MessageType::SafekeeperDiscoveryResponse as i32,
-                },
-            ],
-            tenant_timeline_id: Some(FilterTenantTimelineId {
-                enabled: true,
-                tenant_timeline_id: Some(ProtoTenantTimelineId {
-                    tenant_id: id.tenant_id.as_ref().to_owned(),
-                    timeline_id: id.timeline_id.as_ref().to_owned(),
-                }),
-            }),
-        };
-
-        match {
-            tokio::select! {
-                r = broker_client.subscribe_by_filter(request) => { r }
-                _ = cancel.cancelled() => { return Err(Cancelled); }
-            }
-        } {
-            Ok(resp) => {
-                return Ok(resp.into_inner());
-            }
-            Err(e) => {
-                // Safekeeper nodes can stop pushing timeline updates to the broker, when no new writes happen and
-                // entire WAL is streamed. Keep this noticeable with logging, but do not warn/error.
-                info!(
-                    "Attempt #{attempt}, failed to subscribe for timeline {id} updates in broker: {e:#}"
-                );
-                continue;
-            }
-        }
-    }
-}
-
 const WALCONNECTION_RETRY_MIN_BACKOFF_SECONDS: f64 = 0.1;
 const WALCONNECTION_RETRY_MAX_BACKOFF_SECONDS: f64 = 15.0;
 const WALCONNECTION_RETRY_BACKOFF_MULTIPLIER: f64 = 1.5;
@@ -695,44 +591,14 @@ impl ConnectionManagerState {
    }

    /// Adds another broker timeline into the state, if its more recent than the one already added there for the same key.
-    fn register_timeline_update(&mut self, typed_msg: TypedMessage) {
-        let mut is_discovery = false;
-        let timeline_update = match typed_msg.r#type() {
-            MessageType::SafekeeperTimelineInfo => {
-                let info = match typed_msg.safekeeper_timeline_info {
-                    Some(info) => info,
-                    None => {
-                        warn!("bad proto message from broker: no safekeeper_timeline_info");
-                        return;
-                    }
-                };
-                SafekeeperDiscoveryResponse {
-                    safekeeper_id: info.safekeeper_id,
-                    tenant_timeline_id: info.tenant_timeline_id,
-                    commit_lsn: info.commit_lsn,
-                    safekeeper_connstr: info.safekeeper_connstr,
-                    availability_zone: info.availability_zone,
-                    standby_horizon: info.standby_horizon,
-                }
-            }
-            MessageType::SafekeeperDiscoveryResponse => {
-                is_discovery = true;
-                match typed_msg.safekeeper_discovery_response {
-                    Some(response) => response,
-                    None => {
-                        warn!("bad proto message from broker: no safekeeper_discovery_response");
-                        return;
-                    }
-                }
-            }
-            _ => {
-                // unexpected message
-                return;
-            }
-        };
-
+    fn register_timeline_update(&mut self, timeline_update: storage_broker::TimelineShardUpdate) {
        WALRECEIVER_BROKER_UPDATES.inc();

+        let storage_broker::TimelineShardUpdate {
+            is_discovery,
+            inner: timeline_update,
+        } = timeline_update;
+
        trace!(
            "safekeeper info update: standby_horizon(cutoff)={}",
            timeline_update.standby_horizon
@@ -1013,7 +879,7 @@ impl ConnectionManagerState {
                    shard_stripe_size,
                    listen_pg_addr_str: info.safekeeper_connstr.as_ref(),
                    auth_token: self.conf.auth_token.as_ref().map(|t| t.as_str()),
-                    availability_zone: self.conf.availability_zone.as_deref()
+                    availability_zone: self.conf.availability_zone.as_deref(),
                };

                match wal_stream_connection_config(connection_conf_args) {
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -408,7 +408,7 @@ impl OpenFiles {
 /// error types may be elegible for retry.
 pub(crate) fn is_fatal_io_error(e: &std::io::Error) -> bool {
    use nix::errno::Errno::*;
-    match e.raw_os_error().map(nix::errno::from_i32) {
+    match e.raw_os_error().map(nix::errno::Errno::from_raw) {
        Some(EIO) => {
            // Terminate on EIO because we no longer trust the device to store
            // data safely, or to uphold persistence guarantees on fsync.
--- a/pageserver/src/virtual_file/io_engine.rs
+++ b/pageserver/src/virtual_file/io_engine.rs
@@ -124,9 +124,7 @@ pub(super) fn epoll_uring_error_to_std(
 ) -> std::io::Error {
    match e {
        tokio_epoll_uring::Error::Op(e) => e,
-        tokio_epoll_uring::Error::System(system) => {
-            std::io::Error::new(std::io::ErrorKind::Other, system)
-        }
+        tokio_epoll_uring::Error::System(system) => std::io::Error::other(system),
    }
 }

--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -936,6 +936,44 @@ lfc_prewarm_main(Datum main_arg)
 	lfc_ctl->prewarm_workers[worker_id].completed = GetCurrentTimestamp();
 }

+/*
+ * Mark the cached pages of the first `nblocks` blocks of the given relation
+ * fork as UNAVAILABLE in the local file cache.
+ *
+ * The cache is visited one chunk at a time, and every page of a found chunk
+ * is processed, so pages beyond `nblocks` that share the last chunk are
+ * invalidated as well. Hash entries are only looked up (HASH_FIND), never
+ * removed here; only the per-page state and used_pages are updated.
+ */
+void
+lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks)
+{
+	BufferTag	chunk_tag;
+
+	/* fast exit if file cache is disabled */
+	if (lfc_maybe_disabled())
+		return;
+
+	CopyNRelFileInfoToBufTag(chunk_tag, rinfo);
+	chunk_tag.forkNum = forkNum;
+
+	CriticalAssert(BufTagGetRelNumber(&chunk_tag) != InvalidRelFileNumber);
+
+	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+
+	/* The enabled state is checked again while holding lfc_lock. */
+	if (!LFC_ENABLED())
+	{
+		LWLockRelease(lfc_lock);
+		return;
+	}
+
+	/* One hash lookup per chunk, keyed by the chunk's first block number. */
+	for (BlockNumber chunk_start = 0; chunk_start < nblocks; chunk_start += lfc_blocks_per_chunk)
+	{
+		FileCacheEntry *chunk;
+
+		chunk_tag.blockNum = chunk_start;
+		chunk = hash_search_with_hash_value(lfc_hash, &chunk_tag,
+											get_hash_value(lfc_hash, &chunk_tag),
+											HASH_FIND, NULL);
+		if (chunk == NULL)
+			continue;
+
+		for (int slot = 0; slot < lfc_blocks_per_chunk; slot++)
+		{
+			/* Only pages currently counted as used need to be touched. */
+			if (GET_STATE(chunk, slot) != AVAILABLE)
+				continue;
+
+			lfc_ctl->used_pages--;
+			SET_STATE(chunk, slot, UNAVAILABLE);
+		}
+	}
+
+	LWLockRelease(lfc_lock);
+}

 /*
 * Check if page is present in the cache.
--- a/pgxn/neon/file_cache.h
+++ b/pgxn/neon/file_cache.h
@@ -28,6 +28,7 @@ typedef struct FileCacheState
 extern bool lfc_store_prefetch_result;

 /* functions for local file cache */
+extern void lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks);
 extern void lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum,
 					   BlockNumber blkno, const void *const *buffers,
 					   BlockNumber nblocks);
--- a/pgxn/neon/neon_pgversioncompat.h
+++ b/pgxn/neon/neon_pgversioncompat.h
@@ -86,7 +86,7 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode,

 #define InvalidRelFileNumber InvalidOid

-#define SMgrRelGetRelInfo(reln) \
+#define SMgrRelGetRelInfo(reln)				\
 	(reln->smgr_rnode.node)

 #define DropRelationAllLocalBuffers DropRelFileNodeAllLocalBuffers
@@ -148,6 +148,12 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode,
 #define DropRelationAllLocalBuffers DropRelationAllLocalBuffers
 #endif

+/*
+ * Reset an NRelFileInfo to an "invalid" value: the spcOid and dbOid
+ * components are set to InvalidOid and the relation number to
+ * InvalidRelFileNumber.
+ *
+ * `rinfo` is expanded three times and is assigned through the NInfoGet*
+ * accessors, so pass a plain variable without side effects.
+ */
+#define NRelFileInfoInvalidate(rinfo) do { \
+		NInfoGetSpcOid(rinfo) = InvalidOid; \
+		NInfoGetDbOid(rinfo) = InvalidOid; \
+		NInfoGetRelNumber(rinfo) = InvalidRelFileNumber; \
+	} while (0)
+
 #if PG_MAJORVERSION_NUM < 17
 #define ProcNumber BackendId
 #define INVALID_PROC_NUMBER InvalidBackendId
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -108,7 +108,7 @@ typedef enum
 	UNLOGGED_BUILD_NOT_PERMANENT
 } UnloggedBuildPhase;

-static SMgrRelation unlogged_build_rel = NULL;
+static NRelFileInfo unlogged_build_rel_info;
 static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;

 static bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
@@ -912,16 +912,19 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 	{
 		case 0:
 			neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence");
+			break;

 		case RELPERSISTENCE_PERMANENT:
+			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
+			{
+				mdextend(reln, forkNum, blkno, buffer, skipFsync);
+				return;
+			}
 			break;

 		case RELPERSISTENCE_TEMP:
 		case RELPERSISTENCE_UNLOGGED:
 			mdextend(reln, forkNum, blkno, buffer, skipFsync);
-			/* Update LFC in case of unlogged index build */
-			if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2)
-				lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer);
 			return;

 		default:
@@ -1003,21 +1006,19 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
 	{
 		case 0:
 			neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence");
+			break;

 		case RELPERSISTENCE_PERMANENT:
+			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
+			{
+				mdzeroextend(reln, forkNum, blocknum, nblocks, skipFsync);
+				return;
+			}
 			break;

 		case RELPERSISTENCE_TEMP:
 		case RELPERSISTENCE_UNLOGGED:
 			mdzeroextend(reln, forkNum, blocknum, nblocks, skipFsync);
-			/* Update LFC in case of unlogged index build */
-			if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2)
-			{
-				for (int i = 0; i < nblocks; i++)
-				{
-					lfc_write(InfoFromSMgrRel(reln), forkNum, blocknum + i, buffer.data);
-				}
-			}
 			return;

 		default:
@@ -1387,8 +1388,14 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 	{
 		case 0:
 			neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence");
+			break;

 		case RELPERSISTENCE_PERMANENT:
+			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
+			{
+				mdread(reln, forkNum, blkno, buffer);
+				return;
+			}
 			break;

 		case RELPERSISTENCE_TEMP:
@@ -1474,8 +1481,14 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	{
 		case 0:
 			neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence");
+			break;

 		case RELPERSISTENCE_PERMANENT:
+			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
+			{
+				mdreadv(reln, forknum, blocknum, buffers, nblocks);
+				return;
+			}
 			break;

 		case RELPERSISTENCE_TEMP:
@@ -1608,6 +1621,15 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
 			break;

 		case RELPERSISTENCE_PERMANENT:
+			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
+			{
+#if PG_MAJORVERSION_NUM >= 17
+				mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync);
+#else
+				mdwrite(reln, forknum, blocknum, buffer, skipFsync);
+#endif
+				return;
+			}
 			break;

 		case RELPERSISTENCE_TEMP:
@@ -1617,9 +1639,6 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
 			#else
 			mdwrite(reln, forknum, blocknum, buffer, skipFsync);
 			#endif
-			/* Update LFC in case of unlogged index build */
-			if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2)
-				lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);
 			return;
 		default:
 			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
@@ -1680,14 +1699,16 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 			break;

 		case RELPERSISTENCE_PERMANENT:
+			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
+			{
+				mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync);
+				return;
+			}
 			break;

 		case RELPERSISTENCE_TEMP:
 		case RELPERSISTENCE_UNLOGGED:
 			mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync);
-			/* Update LFC in case of unlogged index build */
-			if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2)
-				lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);
 			return;
 		default:
 			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
@@ -1723,6 +1744,10 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 			break;

 		case RELPERSISTENCE_PERMANENT:
+			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
+			{
+				return mdnblocks(reln, forknum);
+			}
 			break;

 		case RELPERSISTENCE_TEMP:
@@ -1792,6 +1817,11 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, Blo
 			break;

 		case RELPERSISTENCE_PERMANENT:
+			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
+			{
+				mdtruncate(reln, forknum, old_blocks, nblocks);
+				return;
+			}
 			break;

 		case RELPERSISTENCE_TEMP:
@@ -1930,7 +1960,6 @@ neon_start_unlogged_build(SMgrRelation reln)
 	 */
 	if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
 		neon_log(ERROR, "unlogged relation build is already in progress");
-	Assert(unlogged_build_rel == NULL);

 	ereport(SmgrTrace,
 			(errmsg(NEON_TAG "starting unlogged build of relation %u/%u/%u",
@@ -1947,7 +1976,7 @@ neon_start_unlogged_build(SMgrRelation reln)

 		case RELPERSISTENCE_TEMP:
 		case RELPERSISTENCE_UNLOGGED:
-			unlogged_build_rel = reln;
+			unlogged_build_rel_info = InfoFromSMgrRel(reln);
 			unlogged_build_phase = UNLOGGED_BUILD_NOT_PERMANENT;
 #ifdef DEBUG_COMPARE_LOCAL
 			if (!IsParallelWorker())
@@ -1968,12 +1997,9 @@ neon_start_unlogged_build(SMgrRelation reln)
 		neon_log(ERROR, "cannot perform unlogged index build, index is not empty ");
 #endif

-	unlogged_build_rel = reln;
+	unlogged_build_rel_info = InfoFromSMgrRel(reln);
 	unlogged_build_phase = UNLOGGED_BUILD_PHASE_1;

-	/* Make the relation look like it's unlogged */
-	reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED;
-
 	/*
 	 * Create the local file. In a parallel build, the leader is expected to
 	 * call this first and do it.
@@ -2000,17 +2026,16 @@ neon_start_unlogged_build(SMgrRelation reln)
 static void
 neon_finish_unlogged_build_phase_1(SMgrRelation reln)
 {
-	Assert(unlogged_build_rel == reln);
+	Assert(RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)));

 	ereport(SmgrTrace,
 			(errmsg(NEON_TAG "finishing phase 1 of unlogged build of relation %u/%u/%u",
-					RelFileInfoFmt(InfoFromSMgrRel(reln)))));
+					RelFileInfoFmt((unlogged_build_rel_info)))));

 	if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT)
 		return;

 	Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1);
-	Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);

 	/*
 	 * In a parallel build, (only) the leader process performs the 2nd
@@ -2018,7 +2043,7 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln)
 	 */
 	if (IsParallelWorker())
 	{
-		unlogged_build_rel = NULL;
+		NRelFileInfoInvalidate(unlogged_build_rel_info);
 		unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
 	}
 	else
@@ -2039,11 +2064,11 @@ neon_end_unlogged_build(SMgrRelation reln)
 {
 	NRelFileInfoBackend rinfob = InfoBFromSMgrRel(reln);

-	Assert(unlogged_build_rel == reln);
+	Assert(RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)));

 	ereport(SmgrTrace,
 			(errmsg(NEON_TAG "ending unlogged build of relation %u/%u/%u",
-					RelFileInfoFmt(InfoFromNInfoB(rinfob)))));
+					RelFileInfoFmt(unlogged_build_rel_info))));

 	if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT)
 	{
@@ -2051,7 +2076,6 @@ neon_end_unlogged_build(SMgrRelation reln)
 		BlockNumber nblocks;

 		Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_2);
-		Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);

 		/*
 		 * Update the last-written LSN cache.
@@ -2072,9 +2096,6 @@ neon_end_unlogged_build(SMgrRelation reln)
 								InfoFromNInfoB(rinfob),
 								MAIN_FORKNUM);

-		/* Make the relation look permanent again */
-		reln->smgr_relpersistence = RELPERSISTENCE_PERMANENT;
-
 		/* Remove local copy */
 		for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
 		{
@@ -2083,6 +2104,8 @@ neon_end_unlogged_build(SMgrRelation reln)
 				 forknum);

 			forget_cached_relsize(InfoFromNInfoB(rinfob), forknum);
+			lfc_invalidate(InfoFromNInfoB(rinfob), forknum, nblocks);
+
 			mdclose(reln, forknum);
 #ifndef DEBUG_COMPARE_LOCAL
 			/* use isRedo == true, so that we drop it immediately */
@@ -2093,7 +2116,7 @@ neon_end_unlogged_build(SMgrRelation reln)
 		mdunlink(rinfob, INIT_FORKNUM, true);
 #endif
 	}
-	unlogged_build_rel = NULL;
+	NRelFileInfoInvalidate(unlogged_build_rel_info);
 	unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
 }

@@ -2166,7 +2189,7 @@ AtEOXact_neon(XactEvent event, void *arg)
 			 * Forget about any build we might have had in progress. The local
 			 * file will be unlinked by smgrDoPendingDeletes()
 			 */
-			unlogged_build_rel = NULL;
+			NRelFileInfoInvalidate(unlogged_build_rel_info);
 			unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
 			break;

@@ -2178,7 +2201,7 @@ AtEOXact_neon(XactEvent event, void *arg)
 		case XACT_EVENT_PRE_PREPARE:
 			if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
 			{
-				unlogged_build_rel = NULL;
+				NRelFileInfoInvalidate(unlogged_build_rel_info);
 				unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
 				ereport(ERROR,
 						(errcode(ERRCODE_INTERNAL_ERROR),
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand.
+# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand.

 [[package]]
 name = "aiohappyeyeballs"
@@ -1145,18 +1145,19 @@ dotenv = ["python-dotenv"]

 [[package]]
 name = "flask-cors"
-version = "5.0.0"
-description = "A Flask extension adding a decorator for CORS support"
+version = "6.0.0"
+description = "A Flask extension simplifying CORS support"
 optional = false
-python-versions = "*"
+python-versions = "<4.0,>=3.9"
 groups = ["main"]
 files = [
-    {file = "Flask_Cors-5.0.0-py2.py3-none-any.whl", hash = "sha256:b9e307d082a9261c100d8fb0ba909eec6a228ed1b60a8315fd85f783d61910bc"},
-    {file = "flask_cors-5.0.0.tar.gz", hash = "sha256:5aadb4b950c4e93745034594d9f3ea6591f734bb3662e16e255ffbf5e89c88ef"},
+    {file = "flask_cors-6.0.0-py3-none-any.whl", hash = "sha256:6332073356452343a8ccddbfec7befdc3fdd040141fe776ec9b94c262f058657"},
+    {file = "flask_cors-6.0.0.tar.gz", hash = "sha256:4592c1570246bf7beee96b74bc0adbbfcb1b0318f6ba05c412e8909eceec3393"},
 ]

 [package.dependencies]
-Flask = ">=0.9"
+flask = ">=0.9"
+Werkzeug = ">=0.7"

 [[package]]
 name = "frozenlist"
--- a/proxy/src/binary/pg_sni_router.rs
+++ b/proxy/src/binary/pg_sni_router.rs
@@ -394,6 +394,7 @@ async fn handle_client(
    }
 }

+#[allow(clippy::large_enum_variant)]
 enum Connection {
    Raw(tokio::net::TcpStream),
    Tls(tokio_rustls::client::TlsStream<tokio::net::TcpStream>),
--- a/proxy/src/binary/proxy.rs
+++ b/proxy/src/binary/proxy.rs
@@ -43,11 +43,12 @@ project_build_tag!(BUILD_TAG);
 use clap::{Parser, ValueEnum};

 #[derive(Clone, Debug, ValueEnum)]
+#[clap(rename_all = "kebab-case")]
 enum AuthBackendType {
-    #[value(name("cplane-v1"), alias("control-plane"))]
-    ControlPlaneV1,
+    #[clap(alias("cplane-v1"))]
+    ControlPlane,

-    #[value(name("link"), alias("control-redirect"))]
+    #[clap(alias("link"))]
    ConsoleRedirect,

    #[cfg(any(test, feature = "testing"))]
@@ -707,7 +708,7 @@ fn build_auth_backend(
    args: &ProxyCliArgs,
 ) -> anyhow::Result<Either<&'static auth::Backend<'static, ()>, &'static ConsoleRedirectBackend>> {
    match &args.auth_backend {
-        AuthBackendType::ControlPlaneV1 => {
+        AuthBackendType::ControlPlane => {
            let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
            let project_info_cache_config: ProjectInfoCacheOptions =
                args.project_info_cache.parse()?;
@@ -862,7 +863,7 @@ async fn configure_redis(
        ("irsa", _) => match (&args.redis_host, args.redis_port) {
            (Some(host), Some(port)) => Some(
                ConnectionWithCredentialsProvider::new_with_credentials_provider(
-                    host.to_string(),
+                    host.clone(),
                    port,
                    elasticache::CredentialsProvider::new(
                        args.aws_region.clone(),
--- a/proxy/src/context/mod.rs
+++ b/proxy/src/context/mod.rs
@@ -78,7 +78,7 @@ struct RequestContextInner {

 #[derive(Clone, Debug)]
 pub(crate) enum AuthMethod {
-    // aka passwordless, fka link
+    // aka link
    ConsoleRedirect,
    ScramSha256,
    ScramSha256Plus,
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -1,5 +1,5 @@
 [toolchain]
-channel = "1.86.0"
+channel = "1.87.0"
 profile = "default"
 # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
 # https://rust-lang.github.io/rustup/concepts/profiles.html
--- a/safekeeper/Cargo.toml
+++ b/safekeeper/Cargo.toml
@@ -52,6 +52,7 @@ tokio-postgres.workspace = true
 tokio-rustls.workspace = true
 tokio-tar.workspace = true
 tokio-util = { workspace = true }
+tonic = { workspace = true }
 tracing.workspace = true
 url.workspace = true
 metrics.workspace = true
@@ -62,9 +63,11 @@ pq_proto.workspace = true
 remote_storage.workspace = true
 safekeeper_api.workspace = true
 safekeeper_client.workspace = true
+sk_ps_discovery.workspace = true
 sha2.workspace = true
 sd-notify.workspace = true
 storage_broker.workspace = true
+storage_controller_client.workspace = true
 tokio-stream.workspace = true
 http-utils.workspace = true
 utils.workspace = true
--- a/safekeeper/client/src/mgmt_api.rs
+++ b/safekeeper/client/src/mgmt_api.rs
@@ -8,11 +8,12 @@ use std::error::Error as _;
 use http_utils::error::HttpErrorBody;
 use reqwest::{IntoUrl, Method, StatusCode};
 use safekeeper_api::models::{
-    self, PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest,
-    TimelineStatus,
+    self, PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization,
+    TenantShardPageserverAttachmentChange, TimelineCreateRequest, TimelineStatus,
 };
 use utils::id::{NodeId, TenantId, TimelineId};
 use utils::logging::SecretString;
+use utils::shard::TenantShardId;

 #[derive(Debug, Clone)]
 pub struct Client {
@@ -189,6 +190,20 @@ impl Client {
        resp.json().await.map_err(Error::ReceiveBody)
    }

+    /// POSTs an attachment change to `/v1/tenant/{tenant_id}/pageserver_attachments`.
+    pub async fn post_tenant_shard_pageserver_attachments(
+        &self,
+        tenant_shard_id: TenantShardId,
+        body: TenantShardPageserverAttachmentChange,
+    ) -> Result<()> {
+        let uri = format!(
+            "{}/v1/tenant/{}/pageserver_attachments",
+            self.mgmt_api_endpoint, tenant_shard_id.tenant_id
+        );
+        let resp = self.post(uri, body).await?;
+        resp.json().await.map_err(Error::ReceiveBody)
+    }
+
    async fn post<B: serde::Serialize, U: IntoUrl>(
        &self,
        uri: U,
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -22,9 +22,10 @@ use safekeeper::defaults::{
    DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, DEFAULT_SSL_CERT_FILE,
    DEFAULT_SSL_CERT_RELOAD_PERIOD, DEFAULT_SSL_KEY_FILE,
 };
+use safekeeper::wal_backup::WalBackup;
 use safekeeper::{
    BACKGROUND_RUNTIME, BROKER_RUNTIME, GlobalTimelines, HTTP_RUNTIME, SafeKeeperConf,
-    WAL_SERVICE_RUNTIME, broker, control_file, http, wal_backup, wal_service,
+    WAL_ADVERTISER_RUNTIME, WAL_SERVICE_RUNTIME, broker, control_file, http, wal_service,
 };
 use sd_notify::NotifyState;
 use storage_broker::{DEFAULT_ENDPOINT, Uri};
@@ -484,15 +485,15 @@ async fn start_safekeeper(conf: Arc<SafeKeeperConf>) -> Result<()> {
        None => None,
    };

-    let global_timelines = Arc::new(GlobalTimelines::new(conf.clone()));
+    let wal_backup = Arc::new(WalBackup::new(&conf).await?);
+
+    let global_timelines = Arc::new(GlobalTimelines::new(conf.clone(), wal_backup.clone()));

    // Register metrics collector for active timelines. It's important to do this
    // after daemonizing, otherwise process collector will be upset.
    let timeline_collector = safekeeper::metrics::TimelineCollector::new(global_timelines.clone());
    metrics::register_internal(Box::new(timeline_collector))?;

-    wal_backup::init_remote_storage(&conf).await;
-
    // Keep handles to main tasks to die if any of them disappears.
    let mut tasks_handles: FuturesUnordered<BoxFuture<(String, JoinTaskRes)>> =
        FuturesUnordered::new();
@@ -625,6 +626,30 @@ async fn start_safekeeper(conf: Arc<SafeKeeperConf>) -> Result<()> {
        .map(|res| ("broker main".to_owned(), res));
    tasks_handles.push(Box::pin(broker_task_handle));

+    let ps_connectivity_handle = current_thread_rt
+        .as_ref()
+        .unwrap_or_else(|| HTTP_RUNTIME.handle())
+        .spawn(
+            global_timelines
+                .get_pageserver_connectivity()
+                .task_main()
+                .instrument(info_span!("pageserver_connectivity")),
+        )
+        .map(|res| ("pageserver connectivity".to_owned(), res));
+    tasks_handles.push(Box::pin(ps_connectivity_handle));
+
+    let wal_advertiser_task_handle = current_thread_rt
+        .as_ref()
+        .unwrap_or_else(|| WAL_ADVERTISER_RUNTIME.handle())
+        .spawn(
+            global_timelines
+                .get_wal_advertiser()
+                .task_main()
+                .instrument(info_span!("wal_advertiser_main")),
+        )
+        .map(|res| ("wal advertiser task handle".to_owned(), res));
+    tasks_handles.push(Box::pin(wal_advertiser_task_handle));
+
    set_build_info_metric(GIT_VERSION, BUILD_TAG);

    // TODO: update tokio-stream, convert to real async Stream with
--- a/safekeeper/src/broker.rs
+++ b/safekeeper/src/broker.rs
@@ -50,7 +50,8 @@ async fn push_loop(
        conf.broker_endpoint.clone(),
        conf.broker_keepalive_interval,
        make_tls_config(&conf),
-    )?;
+    )?
+    .into_raw_grpc_client();
    let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC);

    let outbound = async_stream::stream! {
@@ -97,7 +98,8 @@ async fn pull_loop(
        conf.broker_endpoint.clone(),
        conf.broker_keepalive_interval,
        make_tls_config(&conf),
-    )?;
+    )?
+    .into_raw_grpc_client();

    // TODO: subscribe only to local timelines instead of all
    let request = SubscribeSafekeeperInfoRequest {
@@ -153,7 +155,8 @@ async fn discover_loop(
        conf.broker_endpoint.clone(),
        conf.broker_keepalive_interval,
        make_tls_config(&conf),
-    )?;
+    )?
+    .into_raw_grpc_client();

    let request = SubscribeByFilterRequest {
        types: vec![TypeSubscription {
--- a/safekeeper/src/copy_timeline.rs
+++ b/safekeeper/src/copy_timeline.rs
@@ -3,6 +3,7 @@ use std::sync::Arc;
 use anyhow::{Result, bail};
 use camino::Utf8PathBuf;
 use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE};
+use remote_storage::GenericRemoteStorage;
 use safekeeper_api::membership::Configuration;
 use tokio::fs::OpenOptions;
 use tokio::io::{AsyncSeekExt, AsyncWriteExt};
@@ -30,6 +31,7 @@ pub struct Request {
 pub async fn handle_request(
    request: Request,
    global_timelines: Arc<GlobalTimelines>,
+    storage: Arc<GenericRemoteStorage>,
 ) -> Result<()> {
    // TODO: request.until_lsn MUST be a valid LSN, and we cannot check it :(
    //   if LSN will point to the middle of a WAL record, timeline will be in "broken" state
@@ -127,6 +129,7 @@ pub async fn handle_request(
    assert!(first_ondisk_segment >= first_segment);

    copy_s3_segments(
+        &storage,
        wal_seg_size,
        &request.source_ttid,
        &request.destination_ttid,
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -67,6 +67,19 @@ fn check_permission(request: &Request<Body>, tenant_id: Option<TenantId>) -> Res
    })
 }

+/// Forwards a tenant's pageserver attachment change from the request body to the WAL advertiser.
+async fn post_tenant_pageserver_attachments(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let tenant_id = parse_request_param(&request, "tenant_id")?;
+    check_permission(&request, Some(tenant_id))?;
+    let change: models::TenantShardPageserverAttachmentChange = json_request(&mut request).await?;
+    get_global_timelines(&request)
+        .get_wal_advertiser()
+        .update_pageserver_attachments(tenant_id, change)
+        .map_err(ApiError::InternalServerError)?;
+
+    json_response(StatusCode::OK, ())
+}
+
 /// Deactivates all timelines for the tenant and removes its data directory.
 /// See `timeline_delete_handler`.
 async fn tenant_delete_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -258,6 +271,7 @@ async fn timeline_snapshot_handler(request: Request<Body>) -> Result<Response<Bo

    let global_timelines = get_global_timelines(&request);
    let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
+    let storage = global_timelines.get_wal_backup().get_storage();

    // To stream the body use wrap_stream which wants Stream of Result<Bytes>,
    // so create the chan and write to it in another task.
@@ -269,6 +283,7 @@ async fn timeline_snapshot_handler(request: Request<Body>) -> Result<Response<Bo
        conf.my_id,
        destination,
        tx,
+        storage,
    ));

    let rx_stream = ReceiverStream::new(rx);
@@ -390,12 +405,18 @@ async fn timeline_copy_handler(mut request: Request<Body>) -> Result<Response<Bo
    );

    let global_timelines = get_global_timelines(&request);
+    let wal_backup = global_timelines.get_wal_backup();
+    let storage = wal_backup
+        .get_storage()
+        .ok_or(ApiError::BadRequest(anyhow::anyhow!(
+            "Remote Storage is not configured"
+        )))?;

    copy_timeline::handle_request(copy_timeline::Request{
        source_ttid,
        until_lsn: request_data.until_lsn,
        destination_ttid: TenantTimelineId::new(source_ttid.tenant_id, request_data.target_timeline_id),
-    }, global_timelines)
+    }, global_timelines, storage)
        .instrument(info_span!("copy_timeline", from=%source_ttid, to=%request_data.target_timeline_id, until_lsn=%request_data.until_lsn))
        .await
        .map_err(ApiError::InternalServerError)?;
@@ -710,6 +731,9 @@ pub fn make_router(
            })
        })
        .get("/v1/utilization", |r| request_span(r, utilization_handler))
+        .post("/v1/tenant/:tenant_id/pageserver_attachments", |r| {
+            request_span(r, post_tenant_pageserver_attachments)
+        })
        .delete("/v1/tenant/:tenant_id", |r| {
            request_span(r, tenant_delete_handler)
        })
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -38,11 +38,13 @@ pub mod timeline_eviction;
 pub mod timeline_guard;
 pub mod timeline_manager;
 pub mod timelines_set;
+pub mod wal_advertiser;
 pub mod wal_backup;
 pub mod wal_backup_partial;
 pub mod wal_reader_stream;
 pub mod wal_service;
 pub mod wal_storage;
+pub(crate) mod pageserver_connectivity;

 #[cfg(any(test, feature = "benchmarking"))]
 pub mod test_utils;
@@ -123,12 +125,7 @@ pub struct SafeKeeperConf {
    pub ssl_ca_certs: Vec<Pem>,
    pub use_https_safekeeper_api: bool,
    pub enable_tls_wal_service_api: bool,
-}
-
-impl SafeKeeperConf {
-    pub fn is_wal_backup_enabled(&self) -> bool {
-        self.remote_storage.is_some() && self.wal_backup_enabled
-    }
+    pub storage_controller_api: Option<Uri>,
 }

 impl SafeKeeperConf {
@@ -174,6 +171,7 @@ impl SafeKeeperConf {
            ssl_ca_certs: Vec::new(),
            use_https_safekeeper_api: false,
            enable_tls_wal_service_api: false,
+            storage_controller_api: None,
        }
    }
 }
@@ -204,6 +202,14 @@ pub static BROKER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
        .expect("Failed to create broker runtime")
 });

+pub static WAL_ADVERTISER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
+    tokio::runtime::Builder::new_multi_thread()
+        .thread_name("wal advertiser worker")
+        .enable_all()
+        .build()
+        .expect("Failed to create wal advertiser runtime")
+});
+
 pub static WAL_BACKUP_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
    tokio::runtime::Builder::new_multi_thread()
        .thread_name("WAL backup worker")
--- a/safekeeper/src/pageserver_connectivity.rs
+++ b/safekeeper/src/pageserver_connectivity.rs
@@ -0,0 +1,117 @@
+use desim::world::Node;
+use hyper::Uri;
+use pageserver_api::controller_api;
+use utils::id::TenantId;
+
+use crate::timeline::Timeline;
+
+use std::{
+    collections::{HashMap, hash_map},
+    sync::{Arc, Mutex},
+    time::{Duration, Instant},
+};
+
+use anyhow::Context;
+use tracing::{Instrument, error, info, info_span, warn};
+use utils::{
+    id::{NodeId, TenantTimelineId},
+    lsn::Lsn,
+    sync::{spsc_fold, spsc_watch},
+};
+
+use crate::{GlobalTimelines, SafeKeeperConf};
+
+type Advs = HashMap<TenantTimelineId, Lsn>;
+
+#[derive(Default)]
+pub struct GlobalState {
+    inner: once_cell::sync::OnceCell<tokio::sync::mpsc::Sender<Message>>,
+}
+
+enum Message {
+    Resolve {
+        ps_id: NodeId,
+        reply: tokio::sync::oneshot::Sender<tokio::sync::watch::Receiver<hyper::Uri>>,
+    },
+}
+
+#[derive(thiserror::Error, Debug)]
+pub enum Error {
+    #[error("cancelled")]
+    Cancelled,
+}
+
+impl GlobalState {
+    pub fn task_main(&self) -> impl 'static + Future<Output = anyhow::Result<()>> + Send {
+        let mut main_fut = None;
+        self.inner.get_or_init(|| {
+            let (sender, fut) = MainTask::prepare_run();
+            main_fut = Some(fut);
+            sender
+        });
+        main_fut.expect("must only call this method once") // still `None` if `inner` was already set
+    }
+}
+
+struct MainTask {
+    rx: tokio::sync::mpsc::Receiver<Message>,
+}
+
+impl MainTask {
+    /// Creates the request channel and returns its sender plus the not-yet-polled task future.
+    fn prepare_run() -> (
+        tokio::sync::mpsc::Sender<Message>,
+        impl Future<Output = anyhow::Result<()>> + Send,
+    ) {
+        let (tx, rx) = tokio::sync::mpsc::channel(100 /* TODO think */);
+        let task = MainTask { rx };
+        (tx, task.task())
+    }
+    /// Serves `Message`s until every sender is dropped, then returns `Ok(())`.
+    async fn task(mut self) -> anyhow::Result<()> {
+        // TODO: persistence
+        let storcon_client = todo!();
+        let mut resolution: HashMap<NodeId, tokio::sync::watch::Sender<hyper::Uri>> =
+            HashMap::new();
+        while let Some(rx) = self.rx.recv().await {
+            match rx {
+                Message::Resolve { ps_id, reply } => match resolution.entry(ps_id) {
+                    // NOTE(review): `reply` is never answered and no watch sender is inserted yet.
+                    hash_map::Entry::Occupied(e) => {}
+                    hash_map::Entry::Vacant(e) => {
+                        // NOTE(review): `storcon_client` is moved here on every iteration.
+                        tokio::spawn(ResolutionTask { ps_id, storcon_client }.run());
+                    }
+                },
+            }
+        }
+        Ok(())
+    }
+}
+
+struct ResolutionTask {
+    ps_id: NodeId,
+    storcon_client: storage_controller_client::control_api::Client,
+}
+
+impl ResolutionTask {
+    /// Repeatedly asks the storage controller to describe pageserver `ps_id`.
+    /// TODO(WIP): extract the `Uri` from `node` and return it; as written the loop never exits.
+    pub async fn run(self) -> Result<Uri, Error> {
+        loop {
+            // XXX: well-defined upcall API?
+            let res = self
+                .storcon_client
+                .dispatch(
+                    reqwest::Method::GET,
+                    format!("control/v1/node/{}", self.ps_id),
+                    None::<()>,
+                )
+                .await;
+            let Ok(node): Result<controller_api::NodeDescribeResponse, _> = res else {
+                warn!("storcon upcall failed");
+                continue; // retry the upcall (NOTE(review): no backoff yet, so this spins)
+            };
+        }
+    }
+}
--- a/safekeeper/src/pull_timeline.rs
+++ b/safekeeper/src/pull_timeline.rs
@@ -9,6 +9,7 @@ use chrono::{DateTime, Utc};
 use futures::{SinkExt, StreamExt, TryStreamExt};
 use http_utils::error::ApiError;
 use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo};
+use remote_storage::GenericRemoteStorage;
 use reqwest::Certificate;
 use safekeeper_api::Term;
 use safekeeper_api::models::{PullTimelineRequest, PullTimelineResponse, TimelineStatus};
@@ -43,6 +44,7 @@ pub async fn stream_snapshot(
    source: NodeId,
    destination: NodeId,
    tx: mpsc::Sender<Result<Bytes>>,
+    storage: Option<Arc<GenericRemoteStorage>>,
 ) {
    match tli.try_wal_residence_guard().await {
        Err(e) => {
@@ -53,10 +55,32 @@ pub async fn stream_snapshot(
        Ok(maybe_resident_tli) => {
            if let Err(e) = match maybe_resident_tli {
                Some(resident_tli) => {
-                    stream_snapshot_resident_guts(resident_tli, source, destination, tx.clone())
-                        .await
+                    stream_snapshot_resident_guts(
+                        resident_tli,
+                        source,
+                        destination,
+                        tx.clone(),
+                        storage,
+                    )
+                    .await
+                }
+                None => {
+                    if let Some(storage) = storage {
+                        stream_snapshot_offloaded_guts(
+                            tli,
+                            source,
+                            destination,
+                            tx.clone(),
+                            &storage,
+                        )
+                        .await
+                    } else {
+                        tx.send(Err(anyhow!("remote storage not configured")))
+                            .await
+                            .ok();
+                        return;
+                    }
                }
-                None => stream_snapshot_offloaded_guts(tli, source, destination, tx.clone()).await,
            } {
                // Error type/contents don't matter as they won't can't reach the client
                // (hyper likely doesn't do anything with it), but http stream will be
@@ -123,10 +147,12 @@ pub(crate) async fn stream_snapshot_offloaded_guts(
    source: NodeId,
    destination: NodeId,
    tx: mpsc::Sender<Result<Bytes>>,
+    storage: &GenericRemoteStorage,
 ) -> Result<()> {
    let mut ar = prepare_tar_stream(tx);

-    tli.snapshot_offloaded(&mut ar, source, destination).await?;
+    tli.snapshot_offloaded(&mut ar, source, destination, storage)
+        .await?;

    ar.finish().await?;

@@ -139,10 +165,13 @@ pub async fn stream_snapshot_resident_guts(
    source: NodeId,
    destination: NodeId,
    tx: mpsc::Sender<Result<Bytes>>,
+    storage: Option<Arc<GenericRemoteStorage>>,
 ) -> Result<()> {
    let mut ar = prepare_tar_stream(tx);

-    let bctx = tli.start_snapshot(&mut ar, source, destination).await?;
+    let bctx = tli
+        .start_snapshot(&mut ar, source, destination, storage)
+        .await?;
    pausable_failpoint!("sk-snapshot-after-list-pausable");

    let tli_dir = tli.get_timeline_dir();
@@ -182,6 +211,7 @@ impl Timeline {
        ar: &mut tokio_tar::Builder<W>,
        source: NodeId,
        destination: NodeId,
+        storage: &GenericRemoteStorage,
    ) -> Result<()> {
        // Take initial copy of control file, then release state lock
        let mut control_file = {
@@ -216,6 +246,7 @@ impl Timeline {
        // can fail if the timeline was un-evicted and modified in the background.
        let remote_timeline_path = &self.remote_path;
        wal_backup::copy_partial_segment(
+            storage,
            &replace.previous.remote_path(remote_timeline_path),
            &replace.current.remote_path(remote_timeline_path),
        )
@@ -262,6 +293,7 @@ impl WalResidentTimeline {
        ar: &mut tokio_tar::Builder<W>,
        source: NodeId,
        destination: NodeId,
+        storage: Option<Arc<GenericRemoteStorage>>,
    ) -> Result<SnapshotContext> {
        let mut shared_state = self.write_shared_state().await;
        let wal_seg_size = shared_state.get_wal_seg_size();
@@ -283,6 +315,7 @@ impl WalResidentTimeline {

            let remote_timeline_path = &self.tli.remote_path;
            wal_backup::copy_partial_segment(
+                &*storage.context("remote storage not configured")?,
                &replace.previous.remote_path(remote_timeline_path),
                &replace.current.remote_path(remote_timeline_path),
            )
--- a/safekeeper/src/test_utils.rs
+++ b/safekeeper/src/test_utils.rs
@@ -18,7 +18,7 @@ use crate::send_wal::EndWatch;
 use crate::state::{TimelinePersistentState, TimelineState};
 use crate::timeline::{SharedState, StateSK, Timeline, get_timeline_dir};
 use crate::timelines_set::TimelinesSet;
-use crate::wal_backup::remote_timeline_path;
+use crate::wal_backup::{WalBackup, remote_timeline_path};
 use crate::{SafeKeeperConf, control_file, receive_wal, wal_storage};

 /// A Safekeeper testing or benchmarking environment. Uses a tempdir for storage, removed on drop.
@@ -101,18 +101,23 @@ impl Env {
        let safekeeper = self.make_safekeeper(node_id, ttid, start_lsn).await?;
        let shared_state = SharedState::new(StateSK::Loaded(safekeeper));

+        let wal_backup = Arc::new(WalBackup::new(&conf).await?);
+
        let timeline = Timeline::new(
            ttid,
            &timeline_dir,
            &remote_path,
            shared_state,
            conf.clone(),
+            wal_backup.clone(),
        );
        timeline.bootstrap(
            &mut timeline.write_shared_state().await,
            &conf,
            Arc::new(TimelinesSet::default()), // ignored for now
            RateLimiter::new(0, 0),
+            wal_backup,
+            todo!(),
        );
        Ok(timeline)
    }
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -35,10 +35,13 @@ use crate::state::{EvictionState, TimelineMemState, TimelinePersistentState, Tim
 use crate::timeline_guard::ResidenceGuard;
 use crate::timeline_manager::{AtomicStatus, ManagerCtl};
 use crate::timelines_set::TimelinesSet;
-use crate::wal_backup::{self, remote_timeline_path};
+use crate::wal_backup;
+use crate::wal_backup::{WalBackup, remote_timeline_path};
 use crate::wal_backup_partial::PartialRemoteSegment;
 use crate::wal_storage::{Storage as wal_storage_iface, WalReader};
-use crate::{SafeKeeperConf, control_file, debug_dump, timeline_manager, wal_storage};
+use crate::{
+    SafeKeeperConf, control_file, debug_dump, timeline_manager, wal_advertiser, wal_storage,
+};

 fn peer_info_from_sk_info(sk_info: &SafekeeperTimelineInfo, ts: Instant) -> PeerInfo {
    PeerInfo {
@@ -452,6 +455,8 @@ pub struct Timeline {
    manager_ctl: ManagerCtl,
    conf: Arc<SafeKeeperConf>,

+    pub(crate) wal_backup: Arc<WalBackup>,
+
    remote_deletion: std::sync::Mutex<Option<RemoteDeletionReceiver>>,

    /// Hold this gate from code that depends on the Timeline's non-shut-down state.  While holding
@@ -476,6 +481,7 @@ impl Timeline {
        remote_path: &RemotePath,
        shared_state: SharedState,
        conf: Arc<SafeKeeperConf>,
+        wal_backup: Arc<WalBackup>,
    ) -> Arc<Self> {
        let (commit_lsn_watch_tx, commit_lsn_watch_rx) =
            watch::channel(shared_state.sk.state().commit_lsn);
@@ -509,6 +515,7 @@ impl Timeline {
            wal_backup_active: AtomicBool::new(false),
            last_removed_segno: AtomicU64::new(0),
            mgr_status: AtomicStatus::new(),
+            wal_backup,
        })
    }

@@ -516,6 +523,7 @@ impl Timeline {
    pub fn load_timeline(
        conf: Arc<SafeKeeperConf>,
        ttid: TenantTimelineId,
+        wal_backup: Arc<WalBackup>,
    ) -> Result<Arc<Timeline>> {
        let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered();

@@ -529,6 +537,7 @@ impl Timeline {
            &remote_path,
            shared_state,
            conf,
+            wal_backup,
        ))
    }

@@ -539,6 +548,8 @@ impl Timeline {
        conf: &SafeKeeperConf,
        broker_active_set: Arc<TimelinesSet>,
        partial_backup_rate_limiter: RateLimiter,
+        wal_backup: Arc<WalBackup>,
+        wal_advertiser: Arc<wal_advertiser::GlobalState>,
    ) {
        let (tx, rx) = self.manager_ctl.bootstrap_manager();

@@ -561,6 +572,8 @@ impl Timeline {
                    tx,
                    rx,
                    partial_backup_rate_limiter,
+                    wal_backup,
+                    wal_advertiser,
                )
                .await
            }
@@ -606,9 +619,10 @@ impl Timeline {
        // it is cancelled, so WAL storage won't be opened again.
        shared_state.sk.close_wal_store();

-        if !only_local && self.conf.is_wal_backup_enabled() {
+        if !only_local {
            self.remote_delete().await?;
        }
+
        let dir_existed = delete_dir(&self.timeline_dir).await?;
        Ok(dir_existed)
    }
@@ -675,11 +689,20 @@ impl Timeline {
        guard: &mut std::sync::MutexGuard<Option<RemoteDeletionReceiver>>,
    ) -> RemoteDeletionReceiver {
        tracing::info!("starting remote deletion");
+        let storage = self.wal_backup.get_storage().clone();
        let (result_tx, result_rx) = tokio::sync::watch::channel(None);
        let ttid = self.ttid;
        tokio::task::spawn(
            async move {
-                let r = wal_backup::delete_timeline(&ttid).await;
+                let r = if let Some(storage) = storage {
+                    wal_backup::delete_timeline(&storage, &ttid).await
+                } else {
+                    tracing::info!(
+                        "skipping remote deletion because no remote storage is configured; this effectively leaks the objects in remote storage"
+                    );
+                    Ok(())
+                };
+
                if let Err(e) = &r {
                    // Log error here in case nobody ever listens for our result (e.g. dropped API request)
                    tracing::error!("remote deletion failed: {e}");
@@ -1046,14 +1069,13 @@ impl WalResidentTimeline {

    pub async fn get_walreader(&self, start_lsn: Lsn) -> Result<WalReader> {
        let (_, persisted_state) = self.get_state().await;
-        let enable_remote_read = self.conf.is_wal_backup_enabled();

        WalReader::new(
            &self.ttid,
            self.timeline_dir.clone(),
            &persisted_state,
            start_lsn,
-            enable_remote_read,
+            self.wal_backup.clone(),
        )
    }

--- a/safekeeper/src/timeline_eviction.rs
+++ b/safekeeper/src/timeline_eviction.rs
@@ -6,7 +6,7 @@

 use anyhow::Context;
 use camino::Utf8PathBuf;
-use remote_storage::RemotePath;
+use remote_storage::{GenericRemoteStorage, RemotePath};
 use tokio::fs::File;
 use tokio::io::{AsyncRead, AsyncWriteExt};
 use tracing::{debug, info, instrument, warn};
@@ -42,6 +42,7 @@ impl Manager {
            && next_event.is_none()
            && self.access_service.is_empty()
            && !self.tli_broker_active.get()
+            && self.wal_advertiser.ready_for_eviction()
            // Partial segment of current flush_lsn is uploaded up to this flush_lsn.
            && !wal_backup_partial::needs_uploading(state, &self.partial_backup_uploaded)
            // And it is the next one after the last removed. Given that local
@@ -68,6 +69,10 @@ impl Manager {
    #[instrument(name = "evict_timeline", skip_all)]
    pub(crate) async fn evict_timeline(&mut self) -> bool {
        assert!(!self.is_offloaded);
+        let Some(storage) = self.wal_backup.get_storage() else {
+            warn!("no remote storage configured, skipping eviction");
+            return false;
+        };
        let partial_backup_uploaded = match &self.partial_backup_uploaded {
            Some(p) => p.clone(),
            None => {
@@ -87,7 +92,7 @@ impl Manager {
                .inc();
        });

-        if let Err(e) = do_eviction(self, &partial_backup_uploaded).await {
+        if let Err(e) = do_eviction(self, &partial_backup_uploaded, &storage).await {
            warn!("failed to evict timeline: {:?}", e);
            return false;
        }
@@ -102,6 +107,10 @@ impl Manager {
    #[instrument(name = "unevict_timeline", skip_all)]
    pub(crate) async fn unevict_timeline(&mut self) {
        assert!(self.is_offloaded);
+        let Some(storage) = self.wal_backup.get_storage() else {
+            warn!("no remote storage configured, skipping uneviction");
+            return;
+        };
        let partial_backup_uploaded = match &self.partial_backup_uploaded {
            Some(p) => p.clone(),
            None => {
@@ -121,7 +130,7 @@ impl Manager {
                .inc();
        });

-        if let Err(e) = do_uneviction(self, &partial_backup_uploaded).await {
+        if let Err(e) = do_uneviction(self, &partial_backup_uploaded, &storage).await {
            warn!("failed to unevict timeline: {:?}", e);
            return;
        }
@@ -137,8 +146,12 @@ impl Manager {
 /// Ensure that content matches the remote partial backup, if local segment exists.
 /// Then change state in control file and in-memory. If `delete_offloaded_wal` is set,
 /// delete the local segment.
-async fn do_eviction(mgr: &mut Manager, partial: &PartialRemoteSegment) -> anyhow::Result<()> {
-    compare_local_segment_with_remote(mgr, partial).await?;
+async fn do_eviction(
+    mgr: &mut Manager,
+    partial: &PartialRemoteSegment,
+    storage: &GenericRemoteStorage,
+) -> anyhow::Result<()> {
+    compare_local_segment_with_remote(mgr, partial, storage).await?;

    mgr.tli.switch_to_offloaded(partial).await?;
    // switch manager state as soon as possible
@@ -153,12 +166,16 @@ async fn do_eviction(mgr: &mut Manager, partial: &PartialRemoteSegment) -> anyho

 /// Ensure that content matches the remote partial backup, if local segment exists.
 /// Then download segment to local disk and change state in control file and in-memory.
-async fn do_uneviction(mgr: &mut Manager, partial: &PartialRemoteSegment) -> anyhow::Result<()> {
+async fn do_uneviction(
+    mgr: &mut Manager,
+    partial: &PartialRemoteSegment,
+    storage: &GenericRemoteStorage,
+) -> anyhow::Result<()> {
    // if the local segment is present, validate it
-    compare_local_segment_with_remote(mgr, partial).await?;
+    compare_local_segment_with_remote(mgr, partial, storage).await?;

    // atomically download the partial segment
-    redownload_partial_segment(mgr, partial).await?;
+    redownload_partial_segment(mgr, partial, storage).await?;

    mgr.tli.switch_to_present().await?;
    // switch manager state as soon as possible
@@ -181,6 +198,7 @@ async fn delete_local_segment(mgr: &Manager, partial: &PartialRemoteSegment) ->
 async fn redownload_partial_segment(
    mgr: &Manager,
    partial: &PartialRemoteSegment,
+    storage: &GenericRemoteStorage,
 ) -> anyhow::Result<()> {
    let tmp_file = mgr.tli.timeline_dir().join("remote_partial.tmp");
    let remote_segfile = remote_segment_path(mgr, partial);
@@ -190,7 +208,7 @@ async fn redownload_partial_segment(
        remote_segfile, tmp_file
    );

-    let mut reader = wal_backup::read_object(&remote_segfile, 0).await?;
+    let mut reader = wal_backup::read_object(storage, &remote_segfile, 0).await?;
    let mut file = File::create(&tmp_file).await?;

    let actual_len = tokio::io::copy(&mut reader, &mut file).await?;
@@ -234,13 +252,16 @@ async fn redownload_partial_segment(
 async fn compare_local_segment_with_remote(
    mgr: &Manager,
    partial: &PartialRemoteSegment,
+    storage: &GenericRemoteStorage,
 ) -> anyhow::Result<()> {
    let local_path = local_segment_path(mgr, partial);

    match File::open(&local_path).await {
-        Ok(mut local_file) => do_validation(mgr, &mut local_file, mgr.wal_seg_size, partial)
-            .await
-            .context("validation failed"),
+        Ok(mut local_file) => {
+            do_validation(mgr, &mut local_file, mgr.wal_seg_size, partial, storage)
+                .await
+                .context("validation failed")
+        }
        Err(_) => {
            info!(
                "local WAL file {} is not present, skipping validation",
@@ -258,6 +279,7 @@ async fn do_validation(
    file: &mut File,
    wal_seg_size: usize,
    partial: &PartialRemoteSegment,
+    storage: &GenericRemoteStorage,
 ) -> anyhow::Result<()> {
    let local_size = file.metadata().await?.len() as usize;
    if local_size != wal_seg_size {
@@ -270,7 +292,7 @@ async fn do_validation(

    let remote_segfile = remote_segment_path(mgr, partial);
    let mut remote_reader: std::pin::Pin<Box<dyn AsyncRead + Send + Sync>> =
-        wal_backup::read_object(&remote_segfile, 0).await?;
+        wal_backup::read_object(storage, &remote_segfile, 0).await?;

    // remote segment should have bytes excatly up to `flush_lsn`
    let expected_remote_size = partial.flush_lsn.segment_offset(mgr.wal_seg_size);
--- a/safekeeper/src/timeline_manager.rs
+++ b/safekeeper/src/timeline_manager.rs
@@ -22,7 +22,6 @@ use tokio_util::sync::CancellationToken;
 use tracing::{Instrument, debug, info, info_span, instrument, warn};
 use utils::lsn::Lsn;

-use crate::SafeKeeperConf;
 use crate::control_file::{FileStorage, Storage};
 use crate::metrics::{
    MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS, NUM_EVICTED_TIMELINES,
@@ -35,8 +34,9 @@ use crate::state::TimelineState;
 use crate::timeline::{ManagerTimeline, ReadGuardSharedState, StateSK, WalResidentTimeline};
 use crate::timeline_guard::{AccessService, GuardId, ResidenceGuard};
 use crate::timelines_set::{TimelineSetGuard, TimelinesSet};
-use crate::wal_backup::{self, WalBackupTaskHandle};
+use crate::wal_backup::{self, WalBackup, WalBackupTaskHandle};
 use crate::wal_backup_partial::{self, PartialBackup, PartialRemoteSegment};
+use crate::{SafeKeeperConf, wal_advertiser};

 pub(crate) struct StateSnapshot {
    // inmem values
@@ -200,6 +200,8 @@ pub(crate) struct Manager {
    pub(crate) conf: SafeKeeperConf,
    pub(crate) wal_seg_size: usize,
    pub(crate) walsenders: Arc<WalSenders>,
+    pub(crate) wal_backup: Arc<WalBackup>,
+    pub(crate) wal_advertiser: wal_advertiser::SafekeeperTimelineHandle,

    // current state
    pub(crate) state_version_rx: tokio::sync::watch::Receiver<usize>,
@@ -238,6 +240,8 @@ pub async fn main_task(
    manager_tx: tokio::sync::mpsc::UnboundedSender<ManagerCtlMessage>,
    mut manager_rx: tokio::sync::mpsc::UnboundedReceiver<ManagerCtlMessage>,
    global_rate_limiter: RateLimiter,
+    wal_backup: Arc<WalBackup>,
+    wal_advertiser: Arc<wal_advertiser::GlobalState>,
 ) {
    tli.set_status(Status::Started);

@@ -256,6 +260,8 @@ pub async fn main_task(
        broker_active_set,
        manager_tx,
        global_rate_limiter,
+        wal_backup,
+        wal_advertiser,
    )
    .await;

@@ -284,7 +290,8 @@ pub async fn main_task(

            mgr.set_status(Status::UpdateBackup);
            let is_wal_backup_required = mgr.update_backup(num_computes, &state_snapshot).await;
-            mgr.update_is_active(is_wal_backup_required, num_computes, &state_snapshot);
+
+            mgr.update_broker_active(is_wal_backup_required, num_computes, &state_snapshot);

            mgr.set_status(Status::UpdateControlFile);
            mgr.update_control_file_save(&state_snapshot, &mut next_event)
@@ -371,7 +378,7 @@ pub async fn main_task(
    mgr.tli_broker_active.set(false);

    // shutdown background tasks
-    if mgr.conf.is_wal_backup_enabled() {
+    if let Some(storage) = mgr.wal_backup.get_storage() {
        if let Some(backup_task) = mgr.backup_task.take() {
            // If we fell through here, then the timeline is shutting down. This is important
            // because otherwise joining on the wal_backup handle might hang.
@@ -379,7 +386,7 @@ pub async fn main_task(

            backup_task.join().await;
        }
-        wal_backup::update_task(&mut mgr, false, &last_state).await;
+        wal_backup::update_task(&mut mgr, storage, false, &last_state).await;
    }

    if let Some(recovery_task) = &mut mgr.recovery_task {
@@ -415,14 +422,18 @@ impl Manager {
        broker_active_set: Arc<TimelinesSet>,
        manager_tx: tokio::sync::mpsc::UnboundedSender<ManagerCtlMessage>,
        global_rate_limiter: RateLimiter,
+        wal_backup: Arc<WalBackup>,
+        wal_advertiser: Arc<wal_advertiser::GlobalState>,
    ) -> Manager {
        let (is_offloaded, partial_backup_uploaded) = tli.bootstrap_mgr().await;
        Manager {
            wal_seg_size: tli.get_wal_seg_size().await,
            walsenders: tli.get_walsenders().clone(),
+            wal_backup,
            state_version_rx: tli.get_state_version_rx(),
            num_computes_rx: tli.get_walreceivers().get_num_rx(),
            tli_broker_active: broker_active_set.guard(tli.clone()),
+            wal_advertiser: wal_advertiser.new_timeline(tli.clone()).await.unwrap(),
            last_removed_segno: 0,
            is_offloaded,
            backup_task: None,
@@ -477,8 +488,8 @@ impl Manager {
        let is_wal_backup_required =
            wal_backup::is_wal_backup_required(self.wal_seg_size, num_computes, state);

-        if self.conf.is_wal_backup_enabled() {
-            wal_backup::update_task(self, is_wal_backup_required, state).await;
+        if let Some(storage) = self.wal_backup.get_storage() {
+            wal_backup::update_task(self, storage, is_wal_backup_required, state).await;
        }

        // update the state in Arc<Timeline>
@@ -489,8 +500,8 @@ impl Manager {
        is_wal_backup_required
    }

-    /// Update is_active flag and returns its value.
-    fn update_is_active(
+    /// Update broker is_active flag and returns its value.
+    fn update_broker_active(
        &mut self,
        is_wal_backup_required: bool,
        num_computes: usize,
@@ -500,6 +511,7 @@ impl Manager {
            || num_computes > 0
            || state.remote_consistent_lsn < state.commit_lsn;

+
        // update the broker timeline set
        if self.tli_broker_active.set(is_active) {
            // write log if state has changed
@@ -624,9 +636,9 @@ impl Manager {
    /// Spawns partial WAL backup task if needed.
    async fn update_partial_backup(&mut self, state: &StateSnapshot) {
        // check if WAL backup is enabled and should be started
-        if !self.conf.is_wal_backup_enabled() {
+        let Some(storage) = self.wal_backup.get_storage() else {
            return;
-        }
+        };

        if self.partial_backup_task.is_some() {
            // partial backup is already running
@@ -650,6 +662,7 @@ impl Manager {
            self.conf.clone(),
            self.global_rate_limiter.clone(),
            cancel.clone(),
+            storage,
        ));
        self.partial_backup_task = Some((handle, cancel));
    }
@@ -669,6 +682,10 @@ impl Manager {
    /// Reset partial backup state and remove its remote storage data. Since it
    /// might concurrently uploading something, cancel the task first.
    async fn backup_partial_reset(&mut self) -> anyhow::Result<Vec<String>> {
+        let Some(storage) = self.wal_backup.get_storage() else {
+            anyhow::bail!("remote storage is not enabled");
+        };
+
        info!("resetting partial backup state");
        // Force unevict timeline if it is evicted before erasing partial backup
        // state. The intended use of this function is to drop corrupted remote
@@ -689,7 +706,7 @@ impl Manager {
        }

        let tli = self.wal_resident_timeline()?;
-        let mut partial_backup = PartialBackup::new(tli, self.conf.clone()).await;
+        let mut partial_backup = PartialBackup::new(tli, self.conf.clone(), storage).await;
        // Reset might fail e.g. when cfile is already reset but s3 removal
        // failed, so set manager state to None beforehand. In any case caller
        // is expected to retry until success.
--- a/safekeeper/src/timelines_global_map.rs
+++ b/safekeeper/src/timelines_global_map.rs
@@ -25,8 +25,9 @@ use crate::rate_limit::RateLimiter;
 use crate::state::TimelinePersistentState;
 use crate::timeline::{Timeline, TimelineError, delete_dir, get_tenant_dir, get_timeline_dir};
 use crate::timelines_set::TimelinesSet;
+use crate::wal_backup::WalBackup;
 use crate::wal_storage::Storage;
-use crate::{SafeKeeperConf, control_file, wal_storage};
+use crate::{SafeKeeperConf, control_file, pageserver_connectivity, wal_advertiser, wal_storage};

 // Timeline entry in the global map: either a ready timeline, or mark that it is
 // being created.
@@ -46,16 +47,29 @@ struct GlobalTimelinesState {

    conf: Arc<SafeKeeperConf>,
    broker_active_set: Arc<TimelinesSet>,
+    wal_advertisement: Arc<wal_advertiser::GlobalState>,
+    pageserver_connectivity: Arc<pageserver_connectivity::GlobalState>,
    global_rate_limiter: RateLimiter,
+    wal_backup: Arc<WalBackup>,
 }

 impl GlobalTimelinesState {
    /// Get dependencies for a timeline constructor.
-    fn get_dependencies(&self) -> (Arc<SafeKeeperConf>, Arc<TimelinesSet>, RateLimiter) {
+    fn get_dependencies(
+        &self,
+    ) -> (
+        Arc<SafeKeeperConf>,
+        Arc<TimelinesSet>,
+        RateLimiter,
+        Arc<WalBackup>,
+        Arc<wal_advertiser::GlobalState>,
+    ) {
        (
            self.conf.clone(),
            self.broker_active_set.clone(),
            self.global_rate_limiter.clone(),
+            self.wal_backup.clone(),
+            self.wal_advertisement.clone(),
        )
    }

@@ -84,14 +98,17 @@ pub struct GlobalTimelines {

 impl GlobalTimelines {
    /// Create a new instance of the global timelines map.
-    pub fn new(conf: Arc<SafeKeeperConf>) -> Self {
+    pub fn new(conf: Arc<SafeKeeperConf>, wal_backup: Arc<WalBackup>) -> Self {
        Self {
            state: Mutex::new(GlobalTimelinesState {
                timelines: HashMap::new(),
                tombstones: HashMap::new(),
                conf,
                broker_active_set: Arc::new(TimelinesSet::default()),
+                wal_advertisement: Arc::new(wal_advertiser::GlobalState::default()),
+                pageserver_connectivity: Arc::new(pageserver_connectivity::GlobalState::default()),
                global_rate_limiter: RateLimiter::new(1, 1),
+                wal_backup,
            }),
        }
    }
@@ -147,12 +164,13 @@ impl GlobalTimelines {
    /// just lock and unlock it for each timeline -- this function is called
    /// during init when nothing else is running, so this is fine.
    async fn load_tenant_timelines(&self, tenant_id: TenantId) -> Result<()> {
-        let (conf, broker_active_set, partial_backup_rate_limiter) = {
+        let (conf, broker_active_set, partial_backup_rate_limiter, wal_backup, wal_advertiser) = {
            let state = self.state.lock().unwrap();
            state.get_dependencies()
        };

        let timelines_dir = get_tenant_dir(&conf, &tenant_id);
+
        for timelines_dir_entry in std::fs::read_dir(&timelines_dir)
            .with_context(|| format!("failed to list timelines dir {}", timelines_dir))?
        {
@@ -162,7 +180,7 @@ impl GlobalTimelines {
                        TimelineId::from_str(timeline_dir_entry.file_name().to_str().unwrap_or(""))
                    {
                        let ttid = TenantTimelineId::new(tenant_id, timeline_id);
-                        match Timeline::load_timeline(conf.clone(), ttid) {
+                        match Timeline::load_timeline(conf.clone(), ttid, wal_backup.clone()) {
                            Ok(tli) => {
                                let mut shared_state = tli.write_shared_state().await;
                                self.state
@@ -175,6 +193,8 @@ impl GlobalTimelines {
                                    &conf,
                                    broker_active_set.clone(),
                                    partial_backup_rate_limiter.clone(),
+                                    wal_backup.clone(),
+                                    wal_advertiser.clone(),
                                );
                            }
                            // If we can't load a timeline, it's most likely because of a corrupted
@@ -212,6 +232,10 @@ impl GlobalTimelines {
        self.state.lock().unwrap().broker_active_set.clone()
    }

+    pub fn get_wal_backup(&self) -> Arc<WalBackup> {
+        self.state.lock().unwrap().wal_backup.clone()
+    }
+
    /// Create a new timeline with the given id. If the timeline already exists, returns
    /// an existing timeline.
    pub(crate) async fn create(
@@ -222,7 +246,7 @@ impl GlobalTimelines {
        start_lsn: Lsn,
        commit_lsn: Lsn,
    ) -> Result<Arc<Timeline>> {
-        let (conf, _, _) = {
+        let (conf, _, _, _, _) = {
            let state = self.state.lock().unwrap();
            if let Ok(timeline) = state.get(&ttid) {
                // Timeline already exists, return it.
@@ -267,7 +291,7 @@ impl GlobalTimelines {
        check_tombstone: bool,
    ) -> Result<Arc<Timeline>> {
        // Check for existence and mark that we're creating it.
-        let (conf, broker_active_set, partial_backup_rate_limiter) = {
+        let (conf, broker_active_set, partial_backup_rate_limiter, wal_backup, wal_advertiser) = {
            let mut state = self.state.lock().unwrap();
            match state.timelines.get(&ttid) {
                Some(GlobalMapTimeline::CreationInProgress) => {
@@ -296,7 +320,14 @@ impl GlobalTimelines {
        };

        // Do the actual move and reflect the result in the map.
-        match GlobalTimelines::install_temp_timeline(ttid, tmp_path, conf.clone()).await {
+        match GlobalTimelines::install_temp_timeline(
+            ttid,
+            tmp_path,
+            conf.clone(),
+            wal_backup.clone(),
+        )
+        .await
+        {
            Ok(timeline) => {
                let mut timeline_shared_state = timeline.write_shared_state().await;
                let mut state = self.state.lock().unwrap();
@@ -314,6 +345,8 @@ impl GlobalTimelines {
                    &conf,
                    broker_active_set,
                    partial_backup_rate_limiter,
+                    wal_backup,
+                    wal_advertiser.clone(),
                );
                drop(timeline_shared_state);
                Ok(timeline)
@@ -336,6 +369,7 @@ impl GlobalTimelines {
        ttid: TenantTimelineId,
        tmp_path: &Utf8PathBuf,
        conf: Arc<SafeKeeperConf>,
+        wal_backup: Arc<WalBackup>,
    ) -> Result<Arc<Timeline>> {
        let tenant_path = get_tenant_dir(conf.as_ref(), &ttid.tenant_id);
        let timeline_path = get_timeline_dir(conf.as_ref(), &ttid);
@@ -377,7 +411,7 @@ impl GlobalTimelines {
        // Do the move.
        durable_rename(tmp_path, &timeline_path, !conf.no_sync).await?;

-        Timeline::load_timeline(conf, ttid)
+        Timeline::load_timeline(conf, ttid, wal_backup)
    }

    /// Get a timeline from the global map. If it's not present, it doesn't exist on disk,
@@ -565,6 +599,14 @@ impl GlobalTimelines {
        Ok(deleted)
    }

+    pub fn get_wal_advertiser(&self) -> Arc<wal_advertiser::GlobalState> {
+        self.state.lock().unwrap().wal_advertisement.clone()
+    }
+
+    pub fn get_pageserver_connectivity(&self) -> Arc<pageserver_connectivity::GlobalState> {
+        self.state.lock().unwrap().pageserver_connectivity.clone()
+    }
+
    pub fn housekeeping(&self, tombstone_ttl: &Duration) {
        let mut state = self.state.lock().unwrap();

--- a/safekeeper/src/wal_advertiser.rs
+++ b/safekeeper/src/wal_advertiser.rs
@@ -0,0 +1,201 @@
+mod persistence;
+mod pageserver_connectivity;
+
+use utils::id::TenantId;
+
+use crate::timeline::Timeline;
+
+use std::{
+    collections::HashMap,
+    sync::{Arc, Mutex},
+    time::{Duration, Instant},
+};
+
+use anyhow::Context;
+use tracing::{Instrument, error, info, info_span, warn};
+use utils::{
+    id::{NodeId, TenantTimelineId},
+    lsn::Lsn,
+    sync::{spsc_fold, spsc_watch},
+};
+
+use crate::{GlobalTimelines, SafeKeeperConf};
+
+type Advs = HashMap<TenantTimelineId, Lsn>;
+
+#[derive(Default)]
+pub struct GlobalState {
+    inner: once_cell::sync::OnceCell<tokio::sync::mpsc::Sender<Message>>, // set by `task_main`
+}
+
+pub struct SafekeeperTimelineHandle {
+    tx: tokio::sync::mpsc::Sender<Message>, // clone of the sender held by `GlobalState`
+}
+
+enum Message {
+    NewTimeline { // sent by `GlobalState::new_timeline`
+        reply: tokio::sync::oneshot::Sender<Result<(), Error>>, // `new_timeline` awaits the other end
+    },
+}
+
+#[derive(thiserror::Error, Debug)]
+pub enum Error {
+    #[error("cancelled")]
+    Cancelled, // returned by `new_timeline` when sending the message or awaiting the reply fails
+}
+
+impl GlobalState {
+    /// Initializes the main task's channel and returns the task future; panics if called twice.
+    pub fn task_main(&self) -> impl 'static + Future<Output = anyhow::Result<()>> + Send {
+        let mut ret = None;
+        self.inner.get_or_init(|| {
+            let (tx, task_fut) = MainTask::prepare_run();
+            ret = Some(task_fut);
+            tx
+        });
+        ret.expect("must only call this method once")
+    }
+
+    /// Registers a timeline with the main task and waits for its reply; panics before `task_main`.
+    pub async fn new_timeline(
+        &self,
+        tli: Arc<Timeline>,
+    ) -> Result<SafekeeperTimelineHandle, Error> {
+        let tx = self.inner.get().expect("task_main not called").clone();
+        let handle = SafekeeperTimelineHandle { tx };
+        let (reply, rx) = tokio::sync::oneshot::channel();
+        let sent = handle.tx.send(Message::NewTimeline { reply }).await;
+        sent.map_err(|_| Error::Cancelled)?;
+        // Propagate the main task's verdict instead of silently discarding it.
+        rx.await.map_err(|_| Error::Cancelled)??;
+        Ok(handle)
+    }
+    pub fn update_pageserver_attachments(
+        &self,
+        tenant_id: TenantId,
+        update: safekeeper_api::models::TenantShardPageserverAttachmentChange,
+    ) -> anyhow::Result<()> {
+        todo!()
+    }
+}
+impl SafekeeperTimelineHandle {
+    pub fn ready_for_eviction(&self) -> bool {
+        todo!() // not implemented yet: panics when called
+    }
+}
+
+struct MainTask {
+    rx: tokio::sync::mpsc::Receiver<Message>, // paired with the sender stored in `GlobalState`
+    world: sk_ps_discovery::World, // source of the per-pageserver advertisements
+    senders: HashMap<utils::id::NodeId, spsc_watch::Sender<Advs>>, // one per spawned `PageserverTask`
+}
+
+impl MainTask {
+    fn prepare_run() -> (
+        tokio::sync::mpsc::Sender<Message>,
+        impl Future<Output = anyhow::Result<()>> + Send,
+    ) {
+        let (tx, rx) = tokio::sync::mpsc::channel(100 /* TODO think */);
+        let task = MainTask {
+            rx,
+            world: sk_ps_discovery::World::default(),
+            senders: Default::default(),
+        };
+        (tx, task.task())
+    }
+    async fn task(mut self) -> anyhow::Result<()> {
+        let mut adv_frequency = tokio::time::interval(Duration::from_secs(1));
+        loop {
+            tokio::select! {
+                _ = adv_frequency.tick() => {
+                    let start = Instant::now();
+                    self.advertisements_iteration();
+                    let elapsed = start.elapsed();
+                    if elapsed > Duration::from_millis(10) {
+                        warn!(?elapsed, "advertisements iteration is slow");
+                    }
+                },
+                message = self.rx.recv() => {
+                    match message {
+                        None => anyhow::bail!("last main task sender dropped, shouldn't happen, exiting"),
+                        Some(_) => todo!(),
+                    }
+                },
+            }
+        }
+    }
+
+    /// Hands the current advertisements to each pageserver's task, exactly one pass per call.
+    /// If `send_replace` fails, the stale sender is dropped and a fresh task is spawned.
+    fn advertisements_iteration(&mut self) {
+        let advertisements = self.world.get_commit_lsn_advertisements();
+        for (node_id, mut advs) in advertisements {
+            'inner: loop {
+                let tx = self.senders.entry(node_id).or_insert_with(|| {
+                    let (tx, rx) = spsc_watch::channel();
+                    tokio::spawn(
+                        PageserverTask {
+                            ps_id: node_id,
+                            endpoint: todo!(),
+                            advs: rx,
+                        }
+                        .run()
+                        .instrument(info_span!("wal_advertiser", ps_id=%node_id)),
+                    );
+                    tx
+                });
+                if let Err((failed, _err)) = tx.send_replace(advs) {
+                    self.senders.remove(&node_id);
+                    advs = failed;
+                } else {
+                    break 'inner;
+                }
+            }
+        }
+    }
+}
+/// Per-pageserver task: receives advertisement batches and publishes them via `PageserverClient`.
+struct PageserverTask {
+    ps_id: NodeId,
+    // NOTE(review): was missing; `String` assumed to satisfy `PageserverClient::connect` — confirm.
+    endpoint: String,
+    advs: spsc_watch::Receiver<Advs>,
+}
+
+impl PageserverTask {
+    /// Runs until `advs.recv()` fails, which is logged as the main task being gone.
+    async fn run(mut self) {
+        loop {
+            let Ok(advs) = self.advs.recv().await else {
+                info!("main task gone, exiting");
+                return;
+            };
+            if let Err(err) = self.run0(advs).await {
+                error!(?err, "error sending advertisements");
+                // TODO: proper backoff?
+                tokio::time::sleep(Duration::from_secs(5)).await;
+            }
+        }
+    }
+    /// Connects to `self.endpoint` and publishes one batch of commit LSN advertisements.
+    async fn run0(&mut self, advs: HashMap<TenantTimelineId, Lsn>) -> anyhow::Result<()> {
+        use storage_broker::wal_advertisement as proto;
+        use storage_broker::wal_advertisement::pageserver_client::PageserverClient;
+        let stream = async_stream::stream! {
+            for (tenant_timeline_id, commit_lsn) in advs {
+                yield proto::CommitLsnAdvertisement {tenant_timeline_id: Some(proto::TenantTimelineId {
+                    tenant_id: tenant_timeline_id.tenant_id.as_ref().to_owned(),
+                    timeline_id: tenant_timeline_id.timeline_id.as_ref().to_owned(),
+                }), commit_lsn: commit_lsn.0 };
+            }
+        };
+        let mut client: PageserverClient<_> = PageserverClient::connect(self.endpoint.clone())
+            .await
+            .context("connect")?;
+        let _response = client
+            .publish_commit_lsn_advertisements(stream)
+            .await
+            .context("publish stream")?;
+        Ok(())
+    }
+}
--- a/safekeeper/src/wal_advertiser/persistence.rs
+++ b/safekeeper/src/wal_advertiser/persistence.rs
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -2,6 +2,7 @@ use std::cmp::min;
 use std::collections::HashSet;
 use std::num::NonZeroU32;
 use std::pin::Pin;
+use std::sync::Arc;
 use std::time::Duration;

 use anyhow::{Context, Result};
@@ -17,7 +18,7 @@ use safekeeper_api::models::PeerInfo;
 use tokio::fs::File;
 use tokio::select;
 use tokio::sync::mpsc::{self, Receiver, Sender};
-use tokio::sync::{OnceCell, watch};
+use tokio::sync::watch;
 use tokio::task::JoinHandle;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
@@ -63,7 +64,12 @@ pub(crate) fn is_wal_backup_required(
 /// Based on peer information determine which safekeeper should offload; if it
 /// is me, run (per timeline) task, if not yet. OTOH, if it is not me and task
 /// is running, kill it.
-pub(crate) async fn update_task(mgr: &mut Manager, need_backup: bool, state: &StateSnapshot) {
+pub(crate) async fn update_task(
+    mgr: &mut Manager,
+    storage: Arc<GenericRemoteStorage>,
+    need_backup: bool,
+    state: &StateSnapshot,
+) {
    let (offloader, election_dbg_str) =
        determine_offloader(&state.peers, state.backup_lsn, mgr.tli.ttid, &mgr.conf);
    let elected_me = Some(mgr.conf.my_id) == offloader;
@@ -82,7 +88,12 @@ pub(crate) async fn update_task(mgr: &mut Manager, need_backup: bool, state: &St
                return;
            };

-            let async_task = backup_task_main(resident, mgr.conf.backup_parallel_jobs, shutdown_rx);
+            let async_task = backup_task_main(
+                resident,
+                storage,
+                mgr.conf.backup_parallel_jobs,
+                shutdown_rx,
+            );

            let handle = if mgr.conf.current_thread_runtime {
                tokio::spawn(async_task)
@@ -169,33 +180,31 @@ fn determine_offloader(
    }
 }

-static REMOTE_STORAGE: OnceCell<Option<GenericRemoteStorage>> = OnceCell::const_new();
-
-// Storage must be configured and initialized when this is called.
-fn get_configured_remote_storage() -> &'static GenericRemoteStorage {
-    REMOTE_STORAGE
-        .get()
-        .expect("failed to get remote storage")
-        .as_ref()
-        .unwrap()
+pub struct WalBackup {
+    storage: Option<Arc<GenericRemoteStorage>>,
 }

-pub async fn init_remote_storage(conf: &SafeKeeperConf) {
-    // TODO: refactor REMOTE_STORAGE to avoid using global variables, and provide
-    // dependencies to all tasks instead.
-    REMOTE_STORAGE
-        .get_or_init(|| async {
-            if let Some(conf) = conf.remote_storage.as_ref() {
-                Some(
-                    GenericRemoteStorage::from_config(conf)
-                        .await
-                        .expect("failed to create remote storage"),
-                )
-            } else {
-                None
+impl WalBackup {
+    /// Create a new WalBackup instance.
+    pub async fn new(conf: &SafeKeeperConf) -> Result<Self> {
+        if !conf.wal_backup_enabled {
+            return Ok(Self { storage: None });
+        }
+
+        match conf.remote_storage.as_ref() {
+            Some(config) => {
+                let storage = GenericRemoteStorage::from_config(config).await?;
+                Ok(Self {
+                    storage: Some(Arc::new(storage)),
+                })
            }
-        })
-        .await;
+            None => Ok(Self { storage: None }),
+        }
+    }
+
+    pub fn get_storage(&self) -> Option<Arc<GenericRemoteStorage>> {
+        self.storage.clone()
+    }
 }

 struct WalBackupTask {
@@ -204,12 +213,14 @@ struct WalBackupTask {
    wal_seg_size: usize,
    parallel_jobs: usize,
    commit_lsn_watch_rx: watch::Receiver<Lsn>,
+    storage: Arc<GenericRemoteStorage>,
 }

 /// Offload single timeline.
 #[instrument(name = "wal_backup", skip_all, fields(ttid = %tli.ttid))]
 async fn backup_task_main(
    tli: WalResidentTimeline,
+    storage: Arc<GenericRemoteStorage>,
    parallel_jobs: usize,
    mut shutdown_rx: Receiver<()>,
 ) {
@@ -223,6 +234,7 @@ async fn backup_task_main(
        timeline_dir: tli.get_timeline_dir(),
        timeline: tli,
        parallel_jobs,
+        storage,
    };

    // task is spinned up only when wal_seg_size already initialized
@@ -293,6 +305,7 @@ impl WalBackupTask {

            match backup_lsn_range(
                &self.timeline,
+                self.storage.clone(),
                &mut backup_lsn,
                commit_lsn,
                self.wal_seg_size,
@@ -322,6 +335,7 @@ impl WalBackupTask {

 async fn backup_lsn_range(
    timeline: &WalResidentTimeline,
+    storage: Arc<GenericRemoteStorage>,
    backup_lsn: &mut Lsn,
    end_lsn: Lsn,
    wal_seg_size: usize,
@@ -352,7 +366,12 @@ async fn backup_lsn_range(
    loop {
        let added_task = match iter.next() {
            Some(s) => {
-                uploads.push_back(backup_single_segment(s, timeline_dir, remote_timeline_path));
+                uploads.push_back(backup_single_segment(
+                    &storage,
+                    s,
+                    timeline_dir,
+                    remote_timeline_path,
+                ));
                true
            }
            None => false,
@@ -388,6 +407,7 @@ async fn backup_lsn_range(
 }

 async fn backup_single_segment(
+    storage: &GenericRemoteStorage,
    seg: &Segment,
    timeline_dir: &Utf8Path,
    remote_timeline_path: &RemotePath,
@@ -395,7 +415,13 @@ async fn backup_single_segment(
    let segment_file_path = seg.file_path(timeline_dir)?;
    let remote_segment_path = seg.remote_path(remote_timeline_path);

-    let res = backup_object(&segment_file_path, &remote_segment_path, seg.size()).await;
+    let res = backup_object(
+        storage,
+        &segment_file_path,
+        &remote_segment_path,
+        seg.size(),
+    )
+    .await;
    if res.is_ok() {
        BACKED_UP_SEGMENTS.inc();
    } else {
@@ -455,12 +481,11 @@ fn get_segments(start: Lsn, end: Lsn, seg_size: usize) -> Vec<Segment> {
 }

 async fn backup_object(
+    storage: &GenericRemoteStorage,
    source_file: &Utf8Path,
    target_file: &RemotePath,
    size: usize,
 ) -> Result<()> {
-    let storage = get_configured_remote_storage();
-
    let file = File::open(&source_file)
        .await
        .with_context(|| format!("Failed to open file {source_file:?} for wal backup"))?;
@@ -475,12 +500,11 @@ async fn backup_object(
 }

 pub(crate) async fn backup_partial_segment(
+    storage: &GenericRemoteStorage,
    source_file: &Utf8Path,
    target_file: &RemotePath,
    size: usize,
 ) -> Result<()> {
-    let storage = get_configured_remote_storage();
-
    let file = File::open(&source_file)
        .await
        .with_context(|| format!("Failed to open file {source_file:?} for wal backup"))?;
@@ -504,25 +528,20 @@ pub(crate) async fn backup_partial_segment(
 }

 pub(crate) async fn copy_partial_segment(
+    storage: &GenericRemoteStorage,
    source: &RemotePath,
    destination: &RemotePath,
 ) -> Result<()> {
-    let storage = get_configured_remote_storage();
    let cancel = CancellationToken::new();

    storage.copy_object(source, destination, &cancel).await
 }

 pub async fn read_object(
+    storage: &GenericRemoteStorage,
    file_path: &RemotePath,
    offset: u64,
 ) -> anyhow::Result<Pin<Box<dyn tokio::io::AsyncRead + Send + Sync>>> {
-    let storage = REMOTE_STORAGE
-        .get()
-        .context("Failed to get remote storage")?
-        .as_ref()
-        .context("No remote storage configured")?;
-
    info!("segment download about to start from remote path {file_path:?} at offset {offset}");

    let cancel = CancellationToken::new();
@@ -547,8 +566,10 @@ pub async fn read_object(

 /// Delete WAL files for the given timeline. Remote storage must be configured
 /// when called.
-pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
-    let storage = get_configured_remote_storage();
+pub async fn delete_timeline(
+    storage: &GenericRemoteStorage,
+    ttid: &TenantTimelineId,
+) -> Result<()> {
    let remote_path = remote_timeline_path(ttid)?;

    // see DEFAULT_MAX_KEYS_PER_LIST_RESPONSE
@@ -618,14 +639,14 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
 }

 /// Used by wal_backup_partial.
-pub async fn delete_objects(paths: &[RemotePath]) -> Result<()> {
+pub async fn delete_objects(storage: &GenericRemoteStorage, paths: &[RemotePath]) -> Result<()> {
    let cancel = CancellationToken::new(); // not really used
-    let storage = get_configured_remote_storage();
    storage.delete_objects(paths, &cancel).await
 }

 /// Copy segments from one timeline to another. Used in copy_timeline.
 pub async fn copy_s3_segments(
+    storage: &GenericRemoteStorage,
    wal_seg_size: usize,
    src_ttid: &TenantTimelineId,
    dst_ttid: &TenantTimelineId,
@@ -634,12 +655,6 @@ pub async fn copy_s3_segments(
 ) -> Result<()> {
    const SEGMENTS_PROGRESS_REPORT_INTERVAL: u64 = 1024;

-    let storage = REMOTE_STORAGE
-        .get()
-        .expect("failed to get remote storage")
-        .as_ref()
-        .unwrap();
-
    let remote_dst_path = remote_timeline_path(dst_ttid)?;

    let cancel = CancellationToken::new();
--- a/safekeeper/src/wal_backup_partial.rs
+++ b/safekeeper/src/wal_backup_partial.rs
@@ -19,9 +19,11 @@
 //! file. Code updates state in the control file before doing any S3 operations.
 //! This way control file stores information about all potentially existing
 //! remote partial segments and can clean them up after uploading a newer version.
+use std::sync::Arc;
+
 use camino::Utf8PathBuf;
 use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo};
-use remote_storage::RemotePath;
+use remote_storage::{GenericRemoteStorage, RemotePath};
 use safekeeper_api::Term;
 use serde::{Deserialize, Serialize};
 use tokio_util::sync::CancellationToken;
@@ -154,12 +156,16 @@ pub struct PartialBackup {
    conf: SafeKeeperConf,
    local_prefix: Utf8PathBuf,
    remote_timeline_path: RemotePath,
-
+    storage: Arc<GenericRemoteStorage>,
    state: State,
 }

 impl PartialBackup {
-    pub async fn new(tli: WalResidentTimeline, conf: SafeKeeperConf) -> PartialBackup {
+    pub async fn new(
+        tli: WalResidentTimeline,
+        conf: SafeKeeperConf,
+        storage: Arc<GenericRemoteStorage>,
+    ) -> PartialBackup {
        let (_, persistent_state) = tli.get_state().await;
        let wal_seg_size = tli.get_wal_seg_size().await;

@@ -173,6 +179,7 @@ impl PartialBackup {
            conf,
            local_prefix,
            remote_timeline_path,
+            storage,
        }
    }

@@ -240,7 +247,8 @@ impl PartialBackup {
        let remote_path = prepared.remote_path(&self.remote_timeline_path);

        // Upload first `backup_bytes` bytes of the segment to the remote storage.
-        wal_backup::backup_partial_segment(&local_path, &remote_path, backup_bytes).await?;
+        wal_backup::backup_partial_segment(&self.storage, &local_path, &remote_path, backup_bytes)
+            .await?;
        PARTIAL_BACKUP_UPLOADED_BYTES.inc_by(backup_bytes as u64);

        // We uploaded the segment, now let's verify that the data is still actual.
@@ -326,7 +334,7 @@ impl PartialBackup {
            let remote_path = self.remote_timeline_path.join(seg);
            objects_to_delete.push(remote_path);
        }
-        wal_backup::delete_objects(&objects_to_delete).await
+        wal_backup::delete_objects(&self.storage, &objects_to_delete).await
    }

    /// Delete all non-Uploaded segments from the remote storage. There should be only one
@@ -424,6 +432,7 @@ pub async fn main_task(
    conf: SafeKeeperConf,
    limiter: RateLimiter,
    cancel: CancellationToken,
+    storage: Arc<GenericRemoteStorage>,
 ) -> Option<PartialRemoteSegment> {
    debug!("started");
    let await_duration = conf.partial_backup_timeout;
@@ -432,7 +441,7 @@ pub async fn main_task(
    let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx();
    let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx();

-    let mut backup = PartialBackup::new(tli, conf).await;
+    let mut backup = PartialBackup::new(tli, conf, storage).await;

    debug!("state: {:?}", backup.state);

--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -21,6 +21,7 @@ use postgres_ffi::waldecoder::WalStreamDecoder;
 use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo, dispatch_pgversion};
 use pq_proto::SystemId;
 use remote_storage::RemotePath;
+use std::sync::Arc;
 use tokio::fs::{self, File, OpenOptions, remove_file};
 use tokio::io::{AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt};
 use tracing::*;
@@ -32,7 +33,7 @@ use crate::metrics::{
    REMOVED_WAL_SEGMENTS, WAL_STORAGE_OPERATION_SECONDS, WalStorageMetrics, time_io_closure,
 };
 use crate::state::TimelinePersistentState;
-use crate::wal_backup::{read_object, remote_timeline_path};
+use crate::wal_backup::{WalBackup, read_object, remote_timeline_path};

 pub trait Storage {
    // Last written LSN.
@@ -645,7 +646,7 @@ pub struct WalReader {
    wal_segment: Option<Pin<Box<dyn AsyncRead + Send + Sync>>>,

    // S3 will be used to read WAL if LSN is not available locally
-    enable_remote_read: bool,
+    wal_backup: Arc<WalBackup>,

    // We don't have WAL locally if LSN is less than local_start_lsn
    local_start_lsn: Lsn,
@@ -664,7 +665,7 @@ impl WalReader {
        timeline_dir: Utf8PathBuf,
        state: &TimelinePersistentState,
        start_pos: Lsn,
-        enable_remote_read: bool,
+        wal_backup: Arc<WalBackup>,
    ) -> Result<Self> {
        if state.server.wal_seg_size == 0 || state.local_start_lsn == Lsn(0) {
            bail!("state uninitialized, no data to read");
@@ -693,7 +694,7 @@ impl WalReader {
            wal_seg_size: state.server.wal_seg_size as usize,
            pos: start_pos,
            wal_segment: None,
-            enable_remote_read,
+            wal_backup,
            local_start_lsn: state.local_start_lsn,
            timeline_start_lsn: state.timeline_start_lsn,
            pg_version: state.server.pg_version / 10000,
@@ -812,9 +813,9 @@ impl WalReader {
        }

        // Try to open remote file, if remote reads are enabled
-        if self.enable_remote_read {
+        if let Some(storage) = self.wal_backup.get_storage() {
            let remote_wal_file_path = self.remote_path.join(&wal_file_name);
-            return read_object(&remote_wal_file_path, xlogoff as u64).await;
+            return read_object(&storage, &remote_wal_file_path, xlogoff as u64).await;
        }

        bail!("WAL segment is not found")
--- a/storage_broker/Cargo.toml
+++ b/storage_broker/Cargo.toml
@@ -27,6 +27,7 @@ parking_lot.workspace = true
 prost.workspace = true
 tonic.workspace = true
 tokio = { workspace = true, features = ["rt-multi-thread"] }
+tokio-util.workspace = true
 tokio-rustls.workspace = true
 tracing.workspace = true
 metrics.workspace = true
--- a/storage_broker/build.rs
+++ b/storage_broker/build.rs
@@ -5,7 +5,13 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
    // easy location, but apparently interference with cachepot sometimes fails
    // the build then. Anyway, per cargo docs build script shouldn't output to
    // anywhere but $OUT_DIR.
-    tonic_build::compile_protos("proto/broker.proto")
-        .unwrap_or_else(|e| panic!("failed to compile protos {:?}", e));
+    let protos = [
+        "proto/broker.proto",
+        "proto/wal_advertisement.proto",
+    ];
+    for proto in protos {
+        tonic_build::compile_protos(proto)
+            .unwrap_or_else(|e| panic!("failed to compile protos {:?}", e));
+    }
    Ok(())
 }
--- a/storage_broker/proto/broker.proto
+++ b/storage_broker/proto/broker.proto
@@ -35,14 +35,14 @@ message SafekeeperTimelineInfo {
    // LSN of the last record.
    uint64 flush_lsn = 4;
    // Up to which LSN safekeeper regards its WAL as committed.
-    uint64 commit_lsn = 5;
+    uint64 commit_lsn = 5; // yes
    // LSN up to which safekeeper has backed WAL.
    uint64 backup_lsn = 6;
    // LSN of last checkpoint uploaded by pageserver.
    uint64 remote_consistent_lsn = 7;
    uint64 peer_horizon_lsn = 8;
    uint64 local_start_lsn = 9;
-    uint64 standby_horizon = 14;
+    uint64 standby_horizon = 14; // yes
    // A connection string to use for WAL receiving.
    string safekeeper_connstr = 10;
    // HTTP endpoint connection string.
--- a/storage_broker/proto/wal_advertisement.proto
+++ b/storage_broker/proto/wal_advertisement.proto
@@ -0,0 +1,29 @@
+syntax = "proto3";
+
+import "google/protobuf/empty.proto";
+
+package wal_advertisement;
+
+service Pageserver {
+    rpc PublishCommitLsnAdvertisements(stream CommitLsnAdvertisement) returns (google.protobuf.Empty) {};
+    rpc SubscribeRemoteConsistentLsnAdvertisements(google.protobuf.Empty) returns (stream RemoteConsistentLsnAdvertisement) {};
+}
+
+message CommitLsnAdvertisement {
+    TenantTimelineId tenant_timeline_id = 1;
+    uint64 commit_lsn = 2;
+}
+
+message RemoteConsistentLsnAdvertisement {
+    bytes tenant_id = 1;
+    uint32 shard_id = 2;
+    bytes timeline_id = 3;
+    uint64 generation = 4;
+    uint64 remote_consistent_lsn = 5;
+}
+
+message TenantTimelineId {
+    bytes tenant_id = 1;
+    bytes timeline_id = 2;
+}
+
--- a/storage_broker/src/lib.rs
+++ b/storage_broker/src/lib.rs
@@ -1,10 +1,14 @@
 use std::time::Duration;

+use futures::Stream;
 use proto::TenantTimelineId as ProtoTenantTimelineId;
-use proto::broker_service_client::BrokerServiceClient;
+use tokio_util::sync::CancellationToken;
 use tonic::Status;
-use tonic::codegen::StdError;
-use tonic::transport::{Channel, Endpoint};
+use tonic::transport::Endpoint;
+use tracing::{debug, error, info, warn};
+use utils::backoff::{
+    DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, exponential_backoff,
+};
 use utils::id::{TenantId, TenantTimelineId, TimelineId};

 // Code generated by protobuf.
@@ -16,12 +20,18 @@ pub mod proto {
    tonic::include_proto!("storage_broker");
 }

+pub mod wal_advertisement {
+    #![allow(clippy::derive_partial_eq_without_eq)]
+    tonic::include_proto!("wal_advertisement");
+}
+
 pub mod metrics;

 // Re-exports to avoid direct tonic dependency in user crates.
 pub use hyper::Uri;
 pub use tonic::transport::{Certificate, ClientTlsConfig};
 pub use tonic::{Code, Request, Streaming};
+use utils::shard::TenantShardId;

 pub const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:50051";
 pub const DEFAULT_ENDPOINT: &str = const_format::formatcp!("http://{DEFAULT_LISTEN_ADDR}");
@@ -29,9 +39,199 @@ pub const DEFAULT_ENDPOINT: &str = const_format::formatcp!("http://{DEFAULT_LIST
 pub const DEFAULT_KEEPALIVE_INTERVAL: &str = "5000 ms";
 pub const DEFAULT_CONNECT_TIMEOUT: Duration = Duration::from_millis(5000);

-// BrokerServiceClient charged with tonic provided Channel transport; helps to
-// avoid depending on tonic directly in user crates.
-pub type BrokerClientChannel = BrokerServiceClient<Channel>;
+#[derive(Clone)]
+pub struct TimelineUpdatesSubscriber {
+    client: proto::broker_service_client::BrokerServiceClient<tonic::transport::Channel>,
+}
+
+/// Wrapper type to weed out all places in the codebase that interact directly with the gRPC generated code.
+pub struct BrokerClientChannel {
+    client: proto::broker_service_client::BrokerServiceClient<tonic::transport::Channel>,
+}
+
+impl BrokerClientChannel {
+    pub fn into_raw_grpc_client(
+        self,
+    ) -> proto::broker_service_client::BrokerServiceClient<tonic::transport::Channel> {
+        self.client
+    }
+}
+
+pub struct TimelineShardUpdate {
+    pub is_discovery: bool,
+    pub inner: proto::SafekeeperDiscoveryResponse,
+}
+
+pub struct DiscoveryRequester {
+    id: ProtoTenantTimelineId,
+    client: proto::broker_service_client::BrokerServiceClient<tonic::transport::Channel>,
+}
+
+impl TimelineUpdatesSubscriber {
+    pub fn new(service_client: BrokerClientChannel) -> Self {
+        Self {
+            client: service_client.client.clone(),
+        }
+    }
+    pub fn subscribe(
+        &mut self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        cancel: &CancellationToken,
+    ) -> (impl Stream<Item = TimelineShardUpdate>, DiscoveryRequester) {
+        let id = ProtoTenantTimelineId {
+            tenant_id: tenant_shard_id.tenant_id.as_ref().to_owned(),
+            timeline_id: timeline_id.as_ref().to_owned(),
+        };
+        let discovery_requester = DiscoveryRequester {
+            id: id.clone(),
+            client: self.client.clone(),
+        };
+        let stream = async_stream::stream! {
+            let mut attempt = 0;
+            'resubscribe: loop {
+                exponential_backoff(
+                    attempt,
+                    DEFAULT_BASE_BACKOFF_SECONDS,
+                    DEFAULT_MAX_BACKOFF_SECONDS,
+                    cancel,
+                )
+                .await;
+                attempt += 1;
+
+                use proto::*;
+                // subscribe to the specific timeline
+                let request = SubscribeByFilterRequest {
+                    types: vec![
+                        TypeSubscription {
+                            r#type: MessageType::SafekeeperTimelineInfo as i32,
+                        },
+                        TypeSubscription {
+                            r#type: MessageType::SafekeeperDiscoveryResponse as i32,
+                        },
+                    ],
+                    tenant_timeline_id: Some(FilterTenantTimelineId {
+                        enabled: true,
+                        tenant_timeline_id: Some(id.clone()),
+                    }),
+                };
+
+                let res = tokio::select! {
+                    r = self.client.subscribe_by_filter(request) => { r }
+                    _ = cancel.cancelled() => { return; }
+                };
+                let mut update_stream = match res
+                 {
+                    Ok(resp) => {
+                        resp.into_inner()
+                    }
+                    Err(e) => {
+                        // Safekeeper nodes can stop pushing timeline updates to the broker, when no new writes happen and
+                        // entire WAL is streamed. Keep this noticeable with logging, but do not warn/error.
+                        info!(
+                            attempt, "failed to subscribe: {e:#}"
+                        );
+                        continue 'resubscribe;
+                    }
+                };
+                loop {
+                    let broker_update = tokio::select!{
+                        _ = cancel.cancelled() => {
+                            return;
+                        }
+                        update = update_stream.message() => { update }
+                    };
+                    match broker_update {
+                        Ok(Some(typed_msg)) => {
+                            let mut is_discovery = false;
+                            let timeline_update = match typed_msg.r#type() {
+                                MessageType::SafekeeperTimelineInfo => {
+                                    let info = match typed_msg.safekeeper_timeline_info {
+                                        Some(info) => info,
+                                        None => {
+                                            warn!("bad proto message from broker: no safekeeper_timeline_info");
+                                            continue 'resubscribe;
+                                        }
+                                    };
+                                    SafekeeperDiscoveryResponse {
+                                        safekeeper_id: info.safekeeper_id,
+                                        tenant_timeline_id: info.tenant_timeline_id,
+                                        commit_lsn: info.commit_lsn,
+                                        safekeeper_connstr: info.safekeeper_connstr,
+                                        availability_zone: info.availability_zone,
+                                        standby_horizon: info.standby_horizon,
+                                    }
+                                }
+                                MessageType::SafekeeperDiscoveryResponse => {
+                                    is_discovery = true;
+                                    match typed_msg.safekeeper_discovery_response {
+                                        Some(response) => response,
+                                        None => {
+                                            warn!("bad proto message from broker: no safekeeper_discovery_response");
+                                            continue 'resubscribe;
+                                        }
+                                    }
+                                }
+                                _ => {
+                                    // unexpected message
+                                    warn!("unexpected message from broker: {typed_msg:?}");
+                                    continue 'resubscribe;
+                                }
+                            };
+                            attempt = 0; // reset backoff iff we received a valid update
+                            yield TimelineShardUpdate{is_discovery, inner: timeline_update };
+                        },
+                        Err(status) => {
+                            match status.code() {
+                                Code::Unknown if status.message().contains("stream closed because of a broken pipe") || status.message().contains("connection reset") || status.message().contains("error reading a body from connection") => {
+                                    // tonic's error handling doesn't provide a clear code for disconnections: we get
+                                    // "h2 protocol error: error reading a body from connection: stream closed because of a broken pipe"
+                                    // => https://github.com/neondatabase/neon/issues/9562
+                                    info!("broker disconnected: {status}");
+                                },
+                                _ => {
+                                    warn!("broker subscription failed: {status}");
+                                }
+                            }
+                            continue 'resubscribe;
+                        }
+                        Ok(None) => {
+                            error!("broker subscription stream ended"); // can't happen
+                            continue 'resubscribe;
+                        }
+                    }
+                }
+            }
+        };
+        (stream, discovery_requester)
+    }
+}
+
+impl DiscoveryRequester {
+    /// Publish a `SafekeeperDiscoveryRequest` for the id held by this requester to the broker.
+    ///
+    /// Fire-and-forget: the response is not used and a failed publish is only logged.
+    /// Cancellation: a caller may drop this future (e.g. from a losing `select!` arm) before
+    /// `publish_one()` completes. That is fine: discovery is not important if updates are already
+    /// being received. A caller that timestamps its discovery requests may therefore record one
+    /// that was never actually sent.
+    pub async fn request(&mut self) {
+        let request = proto::SafekeeperDiscoveryRequest {
+            tenant_timeline_id: Some(self.id.clone()),
+        };
+        let msg = proto::TypedMessage {
+            r#type: proto::MessageType::SafekeeperDiscoveryRequest as i32,
+            safekeeper_timeline_info: None,
+            safekeeper_discovery_request: Some(request),
+            safekeeper_discovery_response: None,
+        };
+
+        match self.client.publish_one(msg).await {
+            Ok(_) => debug!("Discovery request sent to the broker"),
+            Err(status) => debug!("Discovery request to the broker failed: {status}"),
+        }
+    }
+}

 // Create connection object configured to run TLS if schema starts with https://
 // and plain text otherwise. Connection is lazy, only endpoint sanity is
@@ -67,19 +267,9 @@ where
        .connect_timeout(DEFAULT_CONNECT_TIMEOUT);
    //  keep_alive_timeout is 20s by default on both client and server side
    let channel = tonic_endpoint.connect_lazy();
-    Ok(BrokerClientChannel::new(channel))
-}
-
-impl BrokerClientChannel {
-    /// Create a new client to the given endpoint, but don't actually connect until the first request.
-    pub async fn connect_lazy<D>(dst: D) -> Result<Self, tonic::transport::Error>
-    where
-        D: std::convert::TryInto<tonic::transport::Endpoint>,
-        D::Error: Into<StdError>,
-    {
-        let conn = tonic::transport::Endpoint::new(dst)?.connect_lazy();
-        Ok(Self::new(conn))
-    }
+    Ok(BrokerClientChannel {
+        client: proto::broker_service_client::BrokerServiceClient::new(channel),
+    })
 }

 // parse variable length bytes from protobuf
--- a/storage_controller/Cargo.toml
+++ b/storage_controller/Cargo.toml
@@ -15,6 +15,7 @@ testing = []

 [dependencies]
 anyhow.workspace = true
+async-stream.workspace = true
 bytes.workspace = true
 camino.workspace = true
 chrono.workspace = true
@@ -69,4 +70,4 @@ http-utils = { path = "../libs/http-utils/" }
 utils = { path = "../libs/utils/" }
 metrics = { path = "../libs/metrics/" }
 control_plane = { path = "../control_plane" }
-workspace_hack = { version = "0.1", path = "../workspace_hack" }
+workspace_hack = { version = "0.1", path = "../workspace_hack" }
--- a/storage_controller/migrations/2025-05-26-105843_sk_ps_discovery/down.sql
+++ b/storage_controller/migrations/2025-05-26-105843_sk_ps_discovery/down.sql
@@ -0,0 +1,16 @@
+DROP TRIGGER on_timelines_UPDATE_enqueue_sk_ps_discovery on "timelines";
+DROP FUNCTION on_timelines_UPDATE_enqueue_sk_ps_discovery_triggerfn;
+DROP TRIGGER on_timelines_DELETE_enqueue_sk_ps_discovery on "timelines";
+DROP FUNCTION on_timelines_DELETE_enqueue_sk_ps_discovery_triggerfn;
+DROP TRIGGER on_timelines_INSERT_enqueue_sk_ps_discovery on "timelines";
+DROP FUNCTION on_timelines_INSERT_enqueue_sk_ps_discovery_triggerfn;
+DROP TRIGGER on_ps_tenant_shard_UPDATE_enqueue_sk_ps_discovery on "tenant_shards";
+DROP FUNCTION on_ps_tenant_shard_UPDATE_enqueue_sk_ps_discovery_triggerfn;
+DROP TRIGGER on_ps_tenant_shard_DELETE_enqueue_sk_ps_discovery on "tenant_shards";
+DROP FUNCTION on_ps_tenant_shard_DELETE_enqueue_sk_ps_discovery_triggerfn;
+DROP TRIGGER on_ps_tenant_shard_INSERT_enqueue_sk_ps_discovery on "tenant_shards";
+DROP FUNCTION on_ps_tenant_shard_INSERT_enqueue_sk_ps_discovery_triggerfn;
+DROP FUNCTION IF EXISTS sk_ps_discovery_enqueue_attachment_create;
+DROP TABLE "sk_ps_discovery";
+
+
--- a/storage_controller/migrations/2025-05-26-105843_sk_ps_discovery/up.sql
+++ b/storage_controller/migrations/2025-05-26-105843_sk_ps_discovery/up.sql
@@ -0,0 +1,122 @@
+CREATE TABLE "sk_ps_discovery"(
+	"tenant_id" VARCHAR NOT NULL,
+	"shard_number" INT4 NOT NULL,
+	"shard_count" INT4 NOT NULL,
+	"ps_generation" INT4 NOT NULL,
+	"sk_id" INT8 NOT NULL REFERENCES "safekeepers"("id") ON DELETE CASCADE, -- rows go away with their safekeeper; more efficient than a trigger on "safekeepers"
+	"intent_state" VARCHAR NOT NULL, -- attached,detached
+	"ps_id" INT8 NOT NULL REFERENCES "nodes"("node_id") ON DELETE CASCADE, -- rows go away with their node; more efficient than a trigger on "nodes"
+	"created_at" TIMESTAMPTZ NOT NULL,
+	"retries" INT4 NOT NULL DEFAULT 0,
+	"last_retry_at" TIMESTAMPTZ,
+	"acknowledged_at" TIMESTAMPTZ,
+	PRIMARY KEY("tenant_id", "shard_number", "shard_count", "ps_generation", "sk_id")
+);
+
+-- Enqueue 'attached' work rows for each (shard, safekeeper) pair of ARG_TENANT_ID's non-deleted timelines, then NOTIFY 'sk_ps_discovery'.
+CREATE OR REPLACE FUNCTION sk_ps_discovery_enqueue_attachment_create(ARG_TENANT_ID VARCHAR)
+RETURNS VOID AS $$
+BEGIN
+	INSERT INTO sk_ps_discovery (tenant_id, shard_number, shard_count, ps_generation, sk_id, intent_state, ps_id, created_at)
+		WITH intent_attachments AS (
+			SELECT DISTINCT tenant_id,unnest(array_cat(sk_set, new_sk_set)) as sk_id FROM timelines
+			WHERE tenant_id = ARG_TENANT_ID AND timelines.deleted_at IS NULL
+		)
+		SELECT tenant_shards.tenant_id, tenant_shards.shard_number, tenant_shards.shard_count,
+			   tenant_shards.generation, intent_attachments.sk_id, 'attached', tenant_shards.generation_pageserver, NOW()
+		FROM tenant_shards
+		INNER JOIN intent_attachments ON tenant_shards.tenant_id = intent_attachments.tenant_id
+		-- ps_generation and ps_id are NOT NULL: skip shards without a generation / pageserver (assumes these tenant_shards columns are nullable; TODO confirm)
+		WHERE tenant_shards.generation IS NOT NULL AND tenant_shards.generation_pageserver IS NOT NULL
+	ON CONFLICT DO NOTHING; -- the first trigger creates the attachment, all others are identical because tenant shard generations are monotonic
+
+	PERFORM pg_notify('sk_ps_discovery', json_build_object(
+		'tenant_id', ARG_TENANT_ID
+	)::text);
+END;
+$$ LANGUAGE plpgsql;
+
+-- Trigger on tenant_shards table
+
+CREATE OR REPLACE FUNCTION on_ps_tenant_shard_INSERT_enqueue_sk_ps_discovery_triggerfn()
+RETURNS TRIGGER AS $$
+BEGIN
+	PERFORM sk_ps_discovery_enqueue_attachment_create(NEW.tenant_id);
+	RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
+CREATE OR REPLACE TRIGGER on_ps_tenant_shard_INSERT_enqueue_sk_ps_discovery
+AFTER INSERT
+ON "tenant_shards"
+FOR EACH ROW
+EXECUTE FUNCTION on_ps_tenant_shard_INSERT_enqueue_sk_ps_discovery_triggerfn();
+
+
+CREATE OR REPLACE FUNCTION on_ps_tenant_shard_DELETE_enqueue_sk_ps_discovery_triggerfn()
+RETURNS TRIGGER AS $$
+BEGIN
+	PERFORM sk_ps_discovery_enqueue_attachment_create(OLD.tenant_id);
+	RETURN OLD;
+END;
+$$ LANGUAGE plpgsql;
+CREATE OR REPLACE TRIGGER on_ps_tenant_shard_DELETE_enqueue_sk_ps_discovery
+AFTER DELETE
+ON "tenant_shards"
+FOR EACH ROW
+EXECUTE FUNCTION on_ps_tenant_shard_DELETE_enqueue_sk_ps_discovery_triggerfn();
+
+
+CREATE OR REPLACE FUNCTION on_ps_tenant_shard_UPDATE_enqueue_sk_ps_discovery_triggerfn()
+RETURNS TRIGGER AS $$
+BEGIN
+	PERFORM sk_ps_discovery_enqueue_attachment_create(NEW.tenant_id);
+	RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
+CREATE OR REPLACE TRIGGER on_ps_tenant_shard_UPDATE_enqueue_sk_ps_discovery
+AFTER UPDATE
+ON "tenant_shards"
+FOR EACH ROW
+EXECUTE FUNCTION on_ps_tenant_shard_UPDATE_enqueue_sk_ps_discovery_triggerfn();
+
+--  Trigger on timelines table
+
+CREATE OR REPLACE FUNCTION on_timelines_INSERT_enqueue_sk_ps_discovery_triggerfn()
+RETURNS TRIGGER AS $$
+BEGIN
+	PERFORM sk_ps_discovery_enqueue_attachment_create(NEW.tenant_id);
+	RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
+CREATE OR REPLACE TRIGGER on_timelines_INSERT_enqueue_sk_ps_discovery
+AFTER INSERT
+ON "timelines"
+FOR EACH ROW
+EXECUTE FUNCTION on_timelines_INSERT_enqueue_sk_ps_discovery_triggerfn();
+
+CREATE OR REPLACE FUNCTION on_timelines_DELETE_enqueue_sk_ps_discovery_triggerfn()
+RETURNS TRIGGER AS $$
+BEGIN
+	PERFORM sk_ps_discovery_enqueue_attachment_create(OLD.tenant_id);
+	RETURN OLD;
+END;
+$$ LANGUAGE plpgsql;
+CREATE OR REPLACE TRIGGER on_timelines_DELETE_enqueue_sk_ps_discovery
+AFTER DELETE
+ON "timelines"
+FOR EACH ROW
+EXECUTE FUNCTION on_timelines_DELETE_enqueue_sk_ps_discovery_triggerfn();
+
+CREATE OR REPLACE FUNCTION on_timelines_UPDATE_enqueue_sk_ps_discovery_triggerfn()
+RETURNS TRIGGER AS $$
+BEGIN
+	PERFORM sk_ps_discovery_enqueue_attachment_create(NEW.tenant_id);
+	RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
+CREATE OR REPLACE TRIGGER on_timelines_UPDATE_enqueue_sk_ps_discovery
+AFTER UPDATE
+ON "timelines"
+FOR EACH ROW
+EXECUTE FUNCTION on_timelines_UPDATE_enqueue_sk_ps_discovery_triggerfn();
+
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -436,7 +436,8 @@ async fn async_main() -> anyhow::Result<()> {
    };

    // Validate that we can connect to the database
-    Persistence::await_connection(&secrets.database_url, args.db_connect_timeout.into()).await?;
+    Persistence::await_connection(secrets.database_url.clone(), args.db_connect_timeout.into())
+        .await?;

    let persistence = Arc::new(Persistence::new(secrets.database_url).await);

--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -1,6 +1,8 @@
 pub(crate) mod split_state;
 use std::collections::HashMap;
 use std::io::Write;
+use std::ops::Add;
+use std::pin::Pin;
 use std::str::FromStr;
 use std::sync::Arc;
 use std::time::{Duration, Instant};
@@ -15,8 +17,8 @@ use diesel_async::pooled_connection::bb8::Pool;
 use diesel_async::pooled_connection::{AsyncDieselConnectionManager, ManagerConfig};
 use diesel_async::{AsyncPgConnection, RunQueryDsl};
 use diesel_migrations::{EmbeddedMigrations, embed_migrations};
-use futures::FutureExt;
 use futures::future::BoxFuture;
+use futures::{FutureExt, StreamExt};
 use itertools::Itertools;
 use pageserver_api::controller_api::{
    AvailabilityZone, MetadataHealthRecord, NodeSchedulingPolicy, PlacementPolicy,
@@ -31,6 +33,7 @@ use rustls::client::danger::{ServerCertVerified, ServerCertVerifier};
 use rustls::crypto::ring;
 use scoped_futures::ScopedBoxFuture;
 use serde::{Deserialize, Serialize};
+use tracing::info;
 use utils::generation::Generation;
 use utils::id::{NodeId, TenantId, TimelineId};
 use utils::lsn::Lsn;
@@ -74,6 +77,8 @@ const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations");
 /// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline.
 pub struct Persistence {
    connection_pool: Pool<AsyncPgConnection>,
+    connect_tokio_postgres:
+        Box<dyn Sync + Send + 'static + Fn() -> BoxFuture<'static, TokioPostgresConnectResult>>,
 }

 /// Legacy format, for use in JSON compat objects in test environment
@@ -135,6 +140,8 @@ pub(crate) enum DatabaseOperation {
    DeleteTimelineImport,
    ListTimelineImports,
    IsTenantImportingTimeline,
+    ListSkPsDiscovery,
+    UpdateSkPsDiscoveryAttempt,
 }

 #[must_use]
@@ -177,10 +184,11 @@ impl Persistence {

    pub async fn new(database_url: String) -> Self {
        let mut mgr_config = ManagerConfig::default();
-        mgr_config.custom_setup = Box::new(establish_connection_rustls);
+        mgr_config.custom_setup =
+            Box::new(|config| establish_connection_rustls_diesel(config.to_owned()));

        let manager = AsyncDieselConnectionManager::<AsyncPgConnection>::new_with_config(
-            database_url,
+            database_url.clone(),
            mgr_config,
        );

@@ -197,20 +205,25 @@ impl Persistence {
            .await
            .expect("Could not build connection pool");

-        Self { connection_pool }
+        Self {
+            connection_pool,
+            connect_tokio_postgres: Box::new(move || {
+                establish_connection_rustls_tokio_postgres(database_url.clone())
+            }),
+        }
    }

    /// A helper for use during startup, where we would like to tolerate concurrent restarts of the
    /// database and the storage controller, therefore the database might not be available right away
    pub async fn await_connection(
-        database_url: &str,
+        database_url: String,
        timeout: Duration,
    ) -> Result<(), diesel::ConnectionError> {
        let started_at = Instant::now();
-        log_postgres_connstr_info(database_url)
+        log_postgres_connstr_info(&database_url)
            .map_err(|e| diesel::ConnectionError::InvalidConnectionUrl(e.to_string()))?;
        loop {
-            match establish_connection_rustls(database_url).await {
+            match establish_connection_rustls_diesel(database_url.clone()).await {
                Ok(_) => {
                    tracing::info!("Connected to database.");
                    return Ok(());
@@ -1821,6 +1834,151 @@ impl Persistence {
        })
        .await
    }
+
+    /// Opens a dedicated connection, issues `LISTEN sk_ps_discovery`, and returns a stream that
+    /// yields the `tenant_id` parsed from each notification payload (or the JSON parse error).
+    /// A spawned task drives the connection and forwards matching notifications to the stream.
+    pub(crate) async fn listen_sk_ps_discovery(
+        &self,
+    ) -> DatabaseResult<
+        Pin<Box<dyn Send + 'static + futures::Stream<Item = Result<TenantId, serde_json::Error>>>>,
+    > {
+        let (client, mut conn) = (&self.connect_tokio_postgres)().await?;
+        let (tx, mut rx) = tokio::sync::mpsc::channel(1);
+        tokio::spawn(async move {
+            let mut stream = futures::stream::poll_fn(move |cx| conn.poll_message(cx));
+            while let Some(msg) = stream.next().await {
+                info!(?msg, "async message");
+                match msg {
+                    Ok(tokio_postgres::AsyncMessage::Notification(notification))
+                        if notification.channel() == "sk_ps_discovery" =>
+                    {
+                        let Ok(()) = tx.send(notification).await else {
+                            tracing::info!(
+                                "sk_ps_discovery notification rx dropped, stopping async notification processing"
+                            );
+                            break;
+                        };
+                    }
+                    Ok(_) => {}
+                    Err(err) => {
+                        tracing::error!(?err, "tokio_postgres poll_message error");
+                        break;
+                    }
+                }
+            }
+            tracing::info!("sk_ps_discovery notification stream returned None, exiting");
+        });
+
+        client
+            .batch_execute("LISTEN sk_ps_discovery;")
+            .await
+            .map_err(|e| diesel::ConnectionError::BadConnection(e.to_string()))?;
+
+        #[derive(serde::Deserialize)]
+        struct Notification {
+            tenant_id: TenantId,
+        }
+        Ok(Box::pin(async_stream::stream! {
+            while let Some(msg) = rx.recv().await {
+                yield serde_json::from_str::<Notification>(msg.payload()).map(|n| n.tenant_id);
+            }
+            tracing::info!("sk_ps_discovery channel closed, stopping stream");
+            // keep client alive inside the returned stream object, otherwise `conn` ends as soon as we return from this function
+            drop(client);
+        }))
+    }
+
+    pub(crate) async fn get_all_sk_ps_discovery_work(
+        &self,
+    ) -> DatabaseResult<Vec<SkPsDiscoveryPersistence>> {
+        use crate::schema::sk_ps_discovery::dsl;
+        self.with_measured_conn(DatabaseOperation::ListSkPsDiscovery, move |conn| {
+            Box::pin(async move {
+                let vec: Vec<SkPsDiscoveryPersistence> = dsl::sk_ps_discovery.load(conn).await?;
+                Ok(vec)
+            })
+        })
+        .await
+    }
+
+    /// Record the outcome of one discovery attempt for the work item identified by `pk`.
+    ///
+    /// * `Ok(())`: the row is deleted.
+    /// * `Err(())`: `retries` is incremented and `last_retry_at` is set to `diesel::dsl::now`.
+    ///
+    /// The row is matched on its primary key columns plus `intent_state`: intent_state could have
+    /// changed beneath us (split brain or concurrent state gc).
+    /// TODO: maybe use a globally monotonic sequence number instead; easier to reason about?
+    ///
+    /// # Errors
+    ///
+    /// Returns `DatabaseError::Logical` unless exactly one row was affected; errors from executing
+    /// the statement are propagated.
+    pub(crate) async fn update_sk_ps_discovery_attempt(
+        &self,
+        pk: SkPsDiscoveryPersistencePk,
+        intent_state: String,
+        update: Result<(), ()>,
+    ) -> DatabaseResult<()> {
+        use crate::schema::sk_ps_discovery::dsl;
+
+        self.with_measured_conn(DatabaseOperation::UpdateSkPsDiscoveryAttempt, move |conn| {
+            // Clone per invocation so the closure keeps its own `pk` / `intent_state`.
+            let SkPsDiscoveryPersistencePk {
+                tenant_id,
+                shard_number,
+                shard_count,
+                ps_generation,
+                sk_id,
+            } = pk.clone();
+            let intent_state = intent_state.clone();
+            Box::pin(async move {
+                let (what, nrows) = match update {
+                    Ok(()) => {
+                        let nrows = diesel::delete(dsl::sk_ps_discovery)
+                            // primary key
+                            .filter(dsl::tenant_id.eq(tenant_id))
+                            .filter(dsl::shard_number.eq(shard_number))
+                            .filter(dsl::shard_count.eq(shard_count))
+                            .filter(dsl::ps_generation.eq(ps_generation))
+                            .filter(dsl::sk_id.eq(sk_id))
+                            // guard against a concurrently changed intent
+                            .filter(dsl::intent_state.eq(intent_state))
+                            .execute(conn)
+                            .await?;
+                        ("deletes", nrows)
+                    }
+                    Err(()) => {
+                        let nrows = diesel::update(dsl::sk_ps_discovery)
+                            // primary key
+                            .filter(dsl::tenant_id.eq(tenant_id))
+                            .filter(dsl::shard_number.eq(shard_number))
+                            .filter(dsl::shard_count.eq(shard_count))
+                            .filter(dsl::ps_generation.eq(ps_generation))
+                            .filter(dsl::sk_id.eq(sk_id))
+                            // guard against a concurrently changed intent
+                            .filter(dsl::intent_state.eq(intent_state))
+                            // action:
+                            .set((
+                                dsl::retries.eq(dsl::retries.add(1)), // XXX: in split-brain situation we would bump twice...
+                                dsl::last_retry_at.eq(diesel::dsl::now),
+                            ))
+                            .execute(conn)
+                            .await?;
+                        ("updates", nrows)
+                    }
+                };
+                if nrows != 1 {
+                    return Err(DatabaseError::Logical(format!(
+                        "unexpected number of {what}: {nrows}"
+                    )));
+                }
+                Ok(())
+            })
+        })
+        .await
+    }
 }

 pub(crate) fn load_certs() -> anyhow::Result<Arc<rustls::RootCertStore>> {
@@ -1909,21 +2067,40 @@ fn client_config_with_root_certs() -> anyhow::Result<rustls::ClientConfig> {
    })
 }

-fn establish_connection_rustls(config: &str) -> BoxFuture<ConnectionResult<AsyncPgConnection>> {
-    let fut = async {
+type TokioPostgresConnectResult = ConnectionResult<(
+    tokio_postgres::Client,
+    tokio_postgres::Connection<
+        tokio_postgres::Socket,
+        tokio_postgres_rustls::RustlsStream<tokio_postgres::Socket>,
+    >,
+)>;
+
+fn establish_connection_rustls_tokio_postgres(
+    config: String,
+) -> BoxFuture<'static, TokioPostgresConnectResult> {
+    let fut = async move {
        // We first set up the way we want rustls to work.
        let rustls_config = client_config_with_root_certs()
            .map_err(|err| ConnectionError::BadConnection(format!("{err:?}")))?;
        let tls = tokio_postgres_rustls::MakeRustlsConnect::new(rustls_config);
-        let (client, conn) = tokio_postgres::connect(config, tls)
+        let (client, conn) = tokio_postgres::connect(&config, tls)
            .await
            .map_err(|e| ConnectionError::BadConnection(e.to_string()))?;
-
-        AsyncPgConnection::try_from_client_and_connection(client, conn).await
+        Ok((client, conn))
    };
    fut.boxed()
 }

+fn establish_connection_rustls_diesel( // rustls connect, then wrap the result in an AsyncPgConnection
+    config: String,
+) -> BoxFuture<'static, ConnectionResult<AsyncPgConnection>> {
+    async { // `config` is consumed by value below, so the block takes ownership of it
+        let (client, conn) = establish_connection_rustls_tokio_postgres(config).await?;
+        AsyncPgConnection::try_from_client_and_connection(client, conn).await
+    }
+    .boxed()
+}
+
 #[cfg_attr(test, test)]
 fn test_config_debug_censors_password() {
    let has_pw =
@@ -2386,3 +2563,61 @@ pub(crate) struct TimelineImportPersistence {
    pub(crate) timeline_id: String,
    pub(crate) shard_statuses: serde_json::Value,
 }
+
+#[derive(Insertable, AsChangeset, Selectable, Clone, PartialEq, Eq, Hash, Debug)] // Hash + Eq: the discovery actor uses this as a HashMap key
+#[diesel(table_name = crate::schema::sk_ps_discovery)]
+pub(crate) struct SkPsDiscoveryPersistencePk { // the five primary-key columns of `sk_ps_discovery`
+    pub(crate) tenant_id: String,
+    pub(crate) shard_number: i32,
+    pub(crate) shard_count: i32,
+    pub(crate) ps_generation: i32,
+    pub(crate) sk_id: i64,
+}
+
+#[derive(Queryable, Selectable, Clone, PartialEq, Eq)] // PartialEq: the actor compares whole rows to detect changes
+#[diesel(table_name = crate::schema::sk_ps_discovery)]
+pub(crate) struct SkPsDiscoveryPersistence { // one full row of `sk_ps_discovery`; fields mirror the table columns
+    pub(crate) tenant_id: String,
+    pub(crate) shard_number: i32,
+    pub(crate) shard_count: i32,
+    pub(crate) ps_generation: i32,
+    pub(crate) sk_id: i64,
+    pub(crate) intent_state: String, // the delivery code understands "attached" and "detached"
+    pub(crate) ps_id: i64,
+    pub(crate) created_at: chrono::DateTime<chrono::Utc>,
+    pub(crate) retries: i32,
+    pub(crate) last_retry_at: Option<chrono::DateTime<chrono::Utc>>,
+    pub(crate) acknowledged_at: Option<chrono::DateTime<chrono::Utc>>,
+}
+
+impl SkPsDiscoveryPersistence {
+    pub(crate) fn tenant_shard_id(&self) -> Result<TenantShardId, hex::FromHexError> { // Err only when `tenant_id` fails to parse
+        Ok(TenantShardId {
+            tenant_id: TenantId::from_str(self.tenant_id.as_str())?,
+            shard_number: ShardNumber(self.shard_number as u8), // NOTE: `as u8` truncates silently if the stored i32 is outside 0..=255
+            shard_count: ShardCount::new(self.shard_count as u8), // same silent truncation applies here
+        })
+    }
+    pub(crate) fn primary_key(&self) -> SkPsDiscoveryPersistencePk { // copies the primary-key columns out of the row
+        let SkPsDiscoveryPersistence { // exhaustive pattern (no `..`): a new struct field will not compile until it is handled here
+            tenant_id,
+            shard_number,
+            shard_count,
+            ps_generation,
+            sk_id,
+            intent_state: _,
+            ps_id: _,
+            created_at: _,
+            retries: _,
+            last_retry_at: _,
+            acknowledged_at: _,
+        } = self;
+        SkPsDiscoveryPersistencePk {
+            tenant_id: tenant_id.clone(),
+            shard_number: *shard_number,
+            shard_count: *shard_count,
+            ps_generation: *ps_generation,
+            sk_id: *sk_id,
+        }
+    }
+}
--- a/storage_controller/src/safekeeper_client.rs
+++ b/storage_controller/src/safekeeper_client.rs
@@ -1,9 +1,11 @@
 use safekeeper_api::models::{
-    self, PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest,
+    self, PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization,
+    TenantShardPageserverAttachmentChange, TimelineCreateRequest,
 };
 use safekeeper_client::mgmt_api::{Client, Result};
 use utils::id::{NodeId, TenantId, TimelineId};
 use utils::logging::SecretString;
+use utils::shard::TenantShardId;

 use crate::metrics::PageserverRequestLabelGroup;

@@ -164,4 +166,19 @@ impl SafekeeperClient {
            self.inner.utilization().await
        )
    }
+
+    pub async fn post_tenant_shard_pageserver_attachments( // passes the call to `self.inner` through `measured_request!`
+        &self,
+        tenant_shard_id: TenantShardId,
+        body: TenantShardPageserverAttachmentChange,
+    ) -> Result<()> {
+        measured_request!(
+            "post_tenant_shard_pageserver_attachments", // request name handed to the macro
+            crate::metrics::Method::Post,
+            &self.node_id_label,
+            self.inner
+                .post_tenant_shard_pageserver_attachments(tenant_shard_id, body)
+                .await
+        )
+    }
 }
--- a/storage_controller/src/scheduler.rs
+++ b/storage_controller/src/scheduler.rs
@@ -628,11 +628,7 @@ impl Scheduler {
            tracing::trace!(%node_id, "attached_shard_count={} shard_count={} expected={}", node.attached_shard_count, node.shard_count, expected_attached_shards_per_node);
        }

-        if node.attached_shard_count < expected_attached_shards_per_node {
-            expected_attached_shards_per_node - node.attached_shard_count
-        } else {
-            0
-        }
+        expected_attached_shards_per_node.saturating_sub(node.attached_shard_count)
    }

    pub(crate) fn expected_attached_shard_count(&self) -> usize {
--- a/storage_controller/src/schema.rs
+++ b/storage_controller/src/schema.rs
@@ -60,6 +60,22 @@ diesel::table! {
    }
 }

+diesel::table! {
+    sk_ps_discovery (tenant_id, shard_number, shard_count, ps_generation, sk_id) {
+        tenant_id -> Varchar,
+        shard_number -> Int4,
+        shard_count -> Int4,
+        ps_generation -> Int4,
+        sk_id -> Int8,
+        intent_state -> Varchar,
+        ps_id -> Int8,
+        created_at -> Timestamptz,
+        retries -> Int4,
+        last_retry_at -> Nullable<Timestamptz>,
+        acknowledged_at -> Nullable<Timestamptz>,
+    }
+}
+
 diesel::table! {
    tenant_shards (tenant_id, shard_number, shard_count) {
        tenant_id -> Varchar,
@@ -100,12 +116,16 @@ diesel::table! {
    }
 }

+diesel::joinable!(sk_ps_discovery -> nodes (ps_id));
+diesel::joinable!(sk_ps_discovery -> safekeepers (sk_id));
+
 diesel::allow_tables_to_appear_in_same_query!(
    controllers,
    metadata_health,
    nodes,
    safekeeper_timeline_pending_ops,
    safekeepers,
+    sk_ps_discovery,
    tenant_shards,
    timeline_imports,
    timelines,
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -2,6 +2,7 @@ pub mod chaos_injector;
 mod context_iterator;
 pub(crate) mod safekeeper_reconciler;
 mod safekeeper_service;
+mod sk_ps_discovery;

 use std::borrow::Cow;
 use std::cmp::Ordering;
@@ -1192,6 +1193,16 @@ impl Service {
            }
        }
    }
+
+    #[instrument(skip_all)]
+    async fn run_sk_ps_discovery(self: &Arc<Self>) {
+        self.startup_complete.clone().wait().await; // NOTE(review): the task that spawns this already awaits startup_complete first
+        sk_ps_discovery::run(
+            self.clone(),
+            self.http_client.clone(), /* TODO this client is configured to open a fresh TCP connection each time, very inefficient */
+        ).await;
+    }
+
    /// Heartbeat all storage nodes once in a while.
    #[instrument(skip_all)]
    async fn spawn_heartbeat_driver(&self) {
@@ -1797,7 +1808,7 @@ impl Service {
            reconcilers_gate: Gate::default(),
            tenant_op_locks: Default::default(),
            node_op_locks: Default::default(),
-            http_client,
+            http_client: http_client.clone(),
            step_down_barrier: Default::default(),
        });

@@ -1865,6 +1876,15 @@ impl Service {
            }
        });

+        tokio::task::spawn({
+            let this = this.clone();
+            let startup_complete = startup_complete.clone();
+            async move {
+                startup_complete.wait().await;
+                this.run_sk_ps_discovery().await
+            }
+        });
+
        tokio::task::spawn({
            let this = this.clone();
            let startup_complete = startup_complete.clone();
--- a/storage_controller/src/service/safekeeper_service.rs
+++ b/storage_controller/src/service/safekeeper_service.rs
@@ -647,6 +647,11 @@ impl Service {
        sk.describe_response()
    }

+    pub(crate) fn get_safekeeper_object(&self, node_id: i64) -> Option<Safekeeper> { // None when `inner.safekeepers` has no entry for this id
+        let locked = self.inner.read().unwrap();
+        locked.safekeepers.get(&NodeId(node_id as u64)).cloned() // hands back a clone; NOTE: `as u64` wraps a negative id instead of rejecting it
+    }
+
    pub(crate) async fn upsert_safekeeper(
        self: &Arc<Service>,
        record: crate::persistence::SafekeeperUpsert,
--- a/storage_controller/src/service/sk_ps_discovery.rs
+++ b/storage_controller/src/service/sk_ps_discovery.rs
@@ -0,0 +1,266 @@
+use std::{
+    collections::{HashMap, hash_map},
+    sync::Arc,
+    time::Duration,
+};
+
+use anyhow::Context;
+use futures::{StreamExt, stream::FuturesUnordered};
+use safekeeper_api::models::{
+    TenantShardPageserverAttachment, TenantShardPageserverAttachmentChange,
+};
+use tokio::task::JoinHandle;
+use tokio_util::sync::CancellationToken;
+use tracing::{Instrument, Span, error, info, info_span};
+use utils::{
+    generation::Generation,
+    id::{NodeId, TenantId},
+    logging::SecretString,
+    shard::ShardIndex,
+};
+
+use crate::{
+    heartbeater::SafekeeperState,
+    persistence::{Persistence, SkPsDiscoveryPersistence},
+};
+
+use super::Service;
+
+struct Actor { // state of the loop that keeps one delivery task per row returned by get_all_sk_ps_discovery_work
+    service: Arc<Service>,
+    persistence: Arc<Persistence>,
+    http_client: reqwest::Client,
+}
+
+pub async fn run(service: Arc<Service>, http_client: reqwest::Client) { // entry point: builds the Actor and drives it until `Actor::run` returns
+    let actor = Actor {
+        persistence: service.persistence.clone(), // cloned before `service` is moved into the struct on the next line
+        service,
+        http_client, // XXX: build our own client instead of getting Service's client; we probably want idle conn to each sk
+    };
+    actor.run().await;
+}
+
+impl Actor {
+    async fn run(mut self) { // supervises run0: after every failure, pause 1s and start it again
+        loop {
+            match self.run0().await {
+                Ok(()) => { // NOTE(review): run0 as written never returns Ok; it only leaves its loop via `?` / bail!
+                    info!("sk_ps_discovery actor exiting after shutdown signal observed");
+                    return;
+                }
+                Err(err) => {
+                    error!(
+                        ?err,
+                        "sk_ps_discovery actor encountered an error, restarting after backoff"
+                    );
+                    // TODO: proper backoff
+                    tokio::time::sleep(Duration::from_secs(1)).await;
+                }
+            }
+        }
+    }
+
+    async fn run0(&mut self) -> anyhow::Result<()> { // subscribe, then on every wakeup reconcile running tasks against the rows in the database
+        let mut subscription = self
+            .persistence
+            .listen_sk_ps_discovery()
+            .await
+            .context("listen to sk_ps_discovery")?;
+
+        let mut sync_full_ticker = tokio::time::interval(Duration::from_secs(5));
+
+        struct Task {
+            work: SkPsDiscoveryPersistence,
+            cancel: CancellationToken,
+            join_handle: Option<JoinHandle<()>>, // None until the task has been spawned
+        }
+        let mut tasks = HashMap::new(); // NOTE(review): an Err return drops this map without cancelling or awaiting the spawned tasks
+
+        loop {
+            tokio::select! {
+                biased; // arms are polled in order: the periodic full tick first, then the subscription.
+                _ = sync_full_ticker.tick() => {
+                    info!("rebuild");
+                }
+                maybe_res = subscription.next() => {
+                    match maybe_res {
+                        None => {
+                            anyhow::bail!("subscription should never end");
+                        }
+                        Some(Ok(tenant_id)) => {
+                            let tenant_id: TenantId = tenant_id;
+                            info!(?tenant_id, "notify for tenant_id");
+                            // for now, just also rebuild everything
+                        }
+                        Some(Err(err)) => {
+                            let err: serde_json::Error = err;
+                            anyhow::bail!("incorrect notification format: {err:?}"); // FIXME repeat message in error so it can be debugged ?
+                        }
+                    }
+                }
+            }
+
+            // get list of tasks from database
+            let mut new_tasks = self
+                .persistence
+                .get_all_sk_ps_discovery_work()
+                .await
+                .context("get_all_sk_ps_discovery_work")?
+                .into_iter()
+                .map(|work: SkPsDiscoveryPersistence| {
+                    (
+                        work.primary_key(),
+                        Task {
+                            work,
+                            cancel: CancellationToken::new(),
+                            join_handle: None,
+                        },
+                    )
+                })
+                .collect::<HashMap<_, _>>();
+
+            // Carry over ongoing tasks
+            let mut cancelled_wait = FuturesUnordered::new();
+            for (
+                task_key,
+                Task {
+                    work: ongoing_persistence,
+                    cancel,
+                    join_handle,
+                },
+            ) in tasks.drain()
+            {
+                match new_tasks.entry(task_key) {
+                    hash_map::Entry::Occupied(mut planned) => {
+                        let Task {
+                            work: planned_persistence,
+                            cancel: planned_cancel,
+                            join_handle: planned_jh,
+                        } = planned.get_mut();
+                        assert!(planned_jh.is_none()); // entries were built just above with join_handle: None
+                        if *planned_persistence == ongoing_persistence { // row unchanged: keep the existing task and its token
+                            *planned_jh = join_handle;
+                            *planned_cancel = cancel;
+                            continue;
+                        }
+                    }
+                    hash_map::Entry::Vacant(_) => (),
+                }
+                cancel.cancel(); // row changed or no longer returned: stop the old task
+                cancelled_wait.push(async move {
+                    if let Some(jh) = join_handle {
+                        let _ = jh.await;
+                    }
+                });
+            }
+            while cancelled_wait.next().await.is_some() {} // wait for every cancelled task before swapping in the new map
+            tasks = new_tasks;
+
+            // Kick off new tasks
+            for (key, task) in tasks.iter_mut() {
+                if task.join_handle.is_none() {
+                    task.join_handle = Some(tokio::spawn(
+                        DeliveryAttempt {
+                            cancel: task.cancel.clone(),
+                            persistence: self.persistence.clone(),
+                            service: self.service.clone(),
+                            http_client: self.http_client.clone(),
+                            work: task.work.clone(),
+                        }
+                        .run()
+                        .instrument({
+                            let span = info_span!(parent: None, "sk_ps_discovery_delivery", ?key);
+                            span.follows_from(Span::current());
+                            span
+                        }),
+                    ))
+                }
+            }
+        }
+    }
+}
+
+struct DeliveryAttempt { // everything one spawned task needs to deliver a single `work` row to its safekeeper
+    cancel: CancellationToken,
+    persistence: Arc<Persistence>,
+    service: Arc<super::Service>,
+    http_client: reqwest::Client,
+    work: SkPsDiscoveryPersistence,
+}
+
+impl DeliveryAttempt {
+    pub async fn run(self) { // runs run0 once, then records success/failure via update_sk_ps_discovery_attempt
+        let res = self.run0().await;
+        if self.cancel.is_cancelled() { // a cancelled attempt records nothing
+            return;
+        }
+        if let Err(ref err) = res {
+            error!(?err, "attempt failed");
+        }
+        let res = self
+            .persistence
+            .update_sk_ps_discovery_attempt(
+                self.work.primary_key(),
+                self.work.intent_state.clone(),
+                res.map_err(|_| ()), // only Ok/Err is persisted; the error itself was logged above
+            )
+            .await;
+        if let Err(ref err) = res {
+            error!(?err, "persistence of attempt result failed");
+        }
+    }
+    async fn run0(&self) -> anyhow::Result<()> { // Err if the safekeeper is unknown or offline, the row is malformed, or the request fails
+        let Some(sk) = self.service.get_safekeeper_object(self.work.sk_id) else {
+            anyhow::bail!("safekeeper object does not exist");
+        };
+
+        match sk.availability() {
+            SafekeeperState::Available { .. } => (),
+            SafekeeperState::Offline => {
+                anyhow::bail!("safekeeper is offline");
+            }
+        }
+
+        let body = {
+            let val = TenantShardPageserverAttachment {
+                shard_id: ShardIndex {
+                    shard_number: utils::shard::ShardNumber(self.work.shard_number as u8), // NOTE: `as` casts truncate/wrap silently on out-of-range values
+                    shard_count: utils::shard::ShardCount(self.work.shard_count as u8),
+                },
+                ps_id: NodeId(self.work.ps_id as u64),
+                generation: Generation::new(self.work.ps_generation as u32),
+            };
+            match self.work.intent_state.as_str() {
+                "attached" => TenantShardPageserverAttachmentChange::Attach { field1: val },
+                "detached" => TenantShardPageserverAttachmentChange::Detach(val),
+                x => anyhow::bail!("unknown intent state {x:?}"),
+            }
+        };
+        let tenant_shard_id = self.work.tenant_shard_id()?;
+        sk.with_client_retries(
+            |client| {
+                let body = body.clone(); // the closure builds a fresh future per call, so each one gets its own copy
+                async move {
+                    client
+                        .post_tenant_shard_pageserver_attachments(tenant_shard_id, body)
+                        .await
+                }
+            },
+            &self.http_client,
+            &self
+                .service
+                .config
+                .safekeeper_jwt_token
+                .clone()
+                .map(SecretString::from),
+            1, // NOTE(review): positional knob of with_client_retries; confirm its meaning against the signature
+            3, // NOTE(review): as above
+            Duration::from_secs(1), // NOTE(review): as above
+            &self.cancel,
+        )
+        .await?;
+
+        Ok(())
+    }
+}
--- a/test_runner/fixtures/neon_cli.py
+++ b/test_runner/fixtures/neon_cli.py
@@ -103,7 +103,7 @@ class AbstractNeonCli:
            else:
                stdout = ""

-            log.warn(f"CLI timeout: stderr={stderr}, stdout={stdout}")
+            log.warning(f"CLI timeout: stderr={stderr}, stdout={stdout}")
            raise

        indent = "  "
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -510,7 +510,7 @@ def list_elegible_layers(
        except KeyError:
            # Unexpected: tests should call this when pageservers are in a quiet state such that the layer map
            # matches what's on disk.
-            log.warn(f"Lookup {layer_file_name} from {list(visible_map.keys())}")
+            log.warning(f"Lookup {layer_file_name} from {list(visible_map.keys())}")
            raise

    return list(c for c in candidates if is_visible(c))
@@ -636,7 +636,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
    except:
        # On assertion failures, log some details to help with debugging
        heatmap = env.pageserver_remote_storage.heatmap_content(tenant_id)
-        log.warn(f"heatmap contents: {json.dumps(heatmap, indent=2)}")
+        log.warning(f"heatmap contents: {json.dumps(heatmap, indent=2)}")
        raise

    # Scrub the remote storage
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,18 +1,18 @@
 {
  "v17": [
    "17.5",
-    "e5374b72997b0afc8374137674e873f7a558120a"
+    "8be779fd3ab9e87206da96a7e4842ef1abf04f44"
  ],
  "v16": [
    "16.9",
-    "15710a76b7d07912110fcbbaf0c8ad6d7e5a9fbc"
+    "0bf96bd6d70301a0b43b0b3457bb3cf8fb43c198"
  ],
  "v15": [
    "15.13",
-    "daa81cffcf063c54b29a9aabdb6604625f675ad0"
+    "de7640f55da07512834d5cc40c4b3fb376b5f04f"
  ],
  "v14": [
    "14.18",
-    "4cca6f8083483dda9e12eae292cf788d45bd561f"
+    "55c0d45abe6467c02084c2192bca117eda6ce1e7"
  ]
 }
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -60,7 +60,8 @@ lazy_static = { version = "1", default-features = false, features = ["spin_no_st
 libc = { version = "0.2", features = ["extra_traits", "use_std"] }
 log = { version = "0.4", default-features = false, features = ["std"] }
 memchr = { version = "2" }
-nix = { version = "0.26" }
+nix-2f80eeee3b1b6c7e = { package = "nix", version = "0.26" }
+nix-fa1f6196edfd7249 = { package = "nix", version = "0.30", features = ["dir", "ioctl", "mman", "poll", "signal", "socket"] }
 nom = { version = "7" }
 num = { version = "0.4" }
 num-bigint = { version = "0.4" }
Author	SHA1	Message	Date
Christian Schwarz	a80790e0c4	WIP	2025-06-12 06:41:07 -07:00
Christian Schwarz	906a963351	WIP advertisement sending	2025-06-08 20:10:30 -07:00
Christian Schwarz	9e7556bef2	WIP	2025-06-06 19:03:23 -07:00
Christian Schwarz	4f4214eea3	WIP integrate	2025-06-06 18:36:16 -07:00
Christian Schwarz	3dffbda428	rip out the old wal_advertiser::advmap impl, stubbing out with todo!()s	2025-06-06 17:40:59 -07:00
Christian Schwarz	04fb256e0f	leave some TODOs on the lib	2025-06-06 16:51:04 -07:00
Christian Schwarz	02fe35831c	fix compile warning in benchmark	2025-06-06 16:50:14 -07:00
Christian Schwarz	5f7bc3ce60	left equi join for remote_consistent_lsn retrieval	2025-06-06 14:28:20 -07:00
Christian Schwarz	e40b1c79fa	cardinality estimation for collected hashmap; maintain `nodes` and `nodes_timelines`; allocations not showing in flamegraph anymore, but replaced with btreemap::get	2025-06-06 13:50:03 -07:00
Christian Schwarz	2f416267dc	finish implementing auto-quiescing; needs more tests	2025-06-05 02:41:56 +02:00
Christian Schwarz	d52d560f16	log int est	2025-06-05 02:08:13 +02:00
Christian Schwarz	de908a7b0d	WIP auto-quiesce	2025-06-05 01:56:55 +02:00
Christian Schwarz	687fb25d41	WIP	2025-06-05 01:28:55 +02:00
Christian Schwarz	eaa91291ae	add test case for initial advertisement and fix everything by switching to btrees and proper merge equi join	2025-06-05 01:09:33 +02:00
Christian Schwarz	fc9f38dd2d	continue	2025-06-04 22:48:21 +02:00
Christian Schwarz	c689110ad6	continue	2025-06-04 22:34:01 +02:00
Christian Schwarz	ba6abe203d	WIP promising quiescing mechanism	2025-06-04 22:07:48 +02:00
Christian Schwarz	d16e024d49	treat storage more like an actor itself; dead end also	2025-06-04 20:46:48 +02:00
Christian Schwarz	a4b9335b73	dabble around with effect-style system	2025-06-04 20:38:55 +02:00
Christian Schwarz	2cea7a7838	sketch offloading	2025-06-04 19:19:20 +02:00
Christian Schwarz	e1c1aa74fe	get benchmark to work	2025-06-04 18:28:53 +02:00
Christian Schwarz	ee775a24a0	WIP	2025-06-04 16:46:43 +02:00
Christian Schwarz	428f532f08	naive implementation of advertisement generator	2025-06-04 15:29:37 +02:00
Christian Schwarz	5efb0d8072	WIP	2025-06-03 19:45:19 +02:00
Christian Schwarz	36ba2b8e44	WIP proto	2025-06-03 19:14:48 +02:00
Christian Schwarz	1de0f41403	WIP	2025-06-03 18:42:54 +02:00
Christian Schwarz	4d2f27a33f	WIP	2025-06-03 14:55:08 +02:00
Christian Schwarz	88a3c9e7fd	WIP	2025-06-03 14:00:56 +02:00
Christian Schwarz	df36b9aa62	WIP(ctd): plumbing to feed commit_lsn to wal_advertiser	2025-06-02 12:39:42 +02:00
Christian Schwarz	18a43eeab3	undo the remote_consistent_lsn feedback channel brought in by the PoC merge (includes undo of funneling pageserver_connection field via connection options)	2025-06-02 12:04:35 +02:00
Christian Schwarz	39039d1be7	WIP: plumbing to feed commit_lsn to wal_advertiser	2025-06-02 12:03:30 +02:00
Christian Schwarz	9ee75ceee6	merge fixups; storcon and safekeeper compile again	2025-06-02 11:26:14 +02:00
Christian Schwarz	f5210a367d	git merge --squash problame/broker-spof/poc Squashed commit of the following: commit `4a1b52c12e` Author: Christian Schwarz <christian@neon.tech> Date: Mon May 5 12:45:45 2025 +0200 WIP commit `257693e4f2` Author: Christian Schwarz <christian@neon.tech> Date: Mon May 5 10:59:45 2025 +0200 WIP commit `7aa9beaefd` Author: Christian Schwarz <christian@neon.tech> Date: Sun May 4 17:06:46 2025 +0200 make sk compile commit `35dbbbaf60` Author: Christian Schwarz <christian@neon.tech> Date: Sun May 4 16:50:23 2025 +0200 move discovery request mechanism into that type as well Can't move the policy when we send disovery mechanism because that's tied to connection_manager loop state. commit `6380c9674c` Author: Christian Schwarz <christian@neon.tech> Date: Sun May 4 16:22:46 2025 +0200 move subscription code into new client struct commit `1f53688189` Author: Christian Schwarz <christian@neon.tech> Date: Sun May 4 14:40:44 2025 +0200 Revert "rip out broker binary target & launch of it in cplane & mention of it in docs" This reverts commit `8f201b1580`. commit `8f201b1580` Author: Christian Schwarz <christian@neon.tech> Date: Sun May 4 14:38:52 2025 +0200 rip out broker binary target & launch of it in cplane & mention of it in docs	2025-06-02 11:19:37 +02:00
Christian Schwarz	f36520eb94	stub api impl	2025-06-02 11:19:28 +02:00
Christian Schwarz	afa35eea87	trigger now only does insertions; app loop will do cleanup; prepare API for cleanup	2025-05-30 20:36:50 +02:00
Christian Schwarz	8eb853b731	finish the stub implementation of storcon side, it now PUTs to SKs and gets back 404s	2025-05-28 19:29:32 +02:00
Christian Schwarz	a95015d967	triggers for `timelines` table and ps/sk row deletion	2025-05-28 13:14:37 +02:00
Christian Schwarz	3836ee8539	finish prototyping event changes via triggers	2025-05-27 18:28:22 +02:00
Christian Schwarz	a6bd4a3be6	Revert "abandoned prototype how it would be if we do what triggers do but in the app" This reverts commit `24d96e4372`.	2025-05-27 14:13:10 +02:00
Christian Schwarz	24d96e4372	abandoned prototype how it would be if we do what triggers do but in the app	2025-05-27 14:12:48 +02:00
Christian Schwarz	29ea89b61d	trigger-based thing	2025-05-27 14:12:34 +02:00
Christian Schwarz	322e742e4c	schema	2025-05-27 13:44:25 +02:00
Erik Grinaker	cdb6479c8a	pageserver: add gRPC page service schema (#11815 ) ## Problem For the [communicator project](https://github.com/neondatabase/company_projects/issues/352), we want to move to gRPC for the page service protocol. Touches #11728. ## Summary of changes This patch adds an experimental gRPC Protobuf schema for the page service. It is equivalent to the current page service, but with several improvements, e.g.: * Connection multiplexing. * Reduced head-of-line blocking. * Client-side batching. * Explicit tenant shard routing. * GetPage request classification (normal vs. prefetch). * Explicit rate limiting ("slow down" response status). The API is exposed as a new `pageserver/page_api` package. This is separate from the `pageserver_api` package to reduce the dependency footprint for the communicator. The longer-term plan is to also split out e.g. the WAL ingestion service to a separate gRPC package, e.g. `pageserver/wal_api`. Subsequent PRs will: add Rust domain types for the Protobuf types, expose a gRPC server, and implement the page service. Preliminary prototype benchmarks of this gRPC API is within 10% of baseline libpq performance. We'll do further benchmarking and optimization as the implementation lands in `main` and is deployed to staging.	2025-05-19 09:03:06 +00:00
Konstantin Knizhnik	81c557d87e	Unlogged build get smgr (#11954 ) ## Problem See https://github.com/neondatabase/neon/issues/11910 and https://neondb.slack.com/archives/C04DGM6SMTM/p1747314649059129 ## Summary of changes Do not change persistence in `start_unlogged_build` Postgres PRs: https://github.com/neondatabase/postgres/pull/642 https://github.com/neondatabase/postgres/pull/641 https://github.com/neondatabase/postgres/pull/640 https://github.com/neondatabase/postgres/pull/639 --------- Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>	2025-05-18 05:02:47 +00:00
Trung Dinh	e963129678	pagesteam_handle_batched_message -> pagestream_handle_batched_message (#11916 ) ## Problem Found a typo in code. ## Summary of changes Co-authored-by: Trung Dinh <tdinh@roblox.com> Co-authored-by: Erik Grinaker <erik@neon.tech>	2025-05-17 22:30:29 +00:00
dependabot[bot]	4f0a9fc569	chore(deps): bump flask-cors from 5.0.0 to 6.0.0 in the pip group across 1 directory (#11960 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2025-05-17 22:06:32 +00:00
Emmanuel Ferdman	81c6a5a796	Migrate to correct logger interface (#11956 ) ## Problem Currently the `logger` library throws annoying deprecation warnings: ```python DeprecationWarning: The 'warn' method is deprecated, use 'warning' instead ``` ## Summary of changes This small PR resolves the annoying deprecation warnings by migrating to `.warning` as suggested. Signed-off-by: Emmanuel Ferdman <emmanuelferdman@gmail.com>	2025-05-17 21:12:01 +00:00
Konstantin Knizhnik	8e05639dbf	Invalidate LFC after unlogged build (#11951 ) ## Problem See https://neondb.slack.com/archives/C04DGM6SMTM/p1747391617951239 LFC is not always properly updated during unlogged build so it can contain stale content. ## Summary of changes Invalidate LFC content at the end of unlogged build Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>	2025-05-17 19:06:59 +00:00
Alexander Bayandin	deed46015d	CI(test-images): increase timeout from 20m to 60m (#11955 ) ## Problem For some reason (unknown yet) 20m timeout is not enough for `test-images` job on arm runners. Ref: https://github.com/neondatabase/neon/actions/runs/15075321681/job/42387530399?pr=11953 ## Summary of changes - Increase the timeout from 20m to 1h	2025-05-17 06:34:54 +00:00
Heikki Linnakangas	532d9b646e	Add simple facility for an extendable shared memory area (#11929 ) You still need to provide a max size up-front, but memory is only allocated for the portion that is in use. The module is currently unused, but will be used by the new compute communicator project, in the neon Postgres extension. See https://github.com/neondatabase/neon/issues/11729 --------- Co-authored-by: Erik Grinaker <erik@neon.tech>	2025-05-16 21:22:36 +00:00
Heikki Linnakangas	55f91cf10b	Update 'nix' package (#11948 ) There were some incompatible changes. Most churn was from switching from the now-deprecated fcntl:flock() function to fcntl::Flock::lock(). The new function returns a guard object, while with the old function, the lock was associated directly with the file descriptor. It's good to stay up-to-date in general, but the impetus to do this now is that in https://github.com/neondatabase/neon/pull/11929, I want to use some functions that were added only in the latest version of 'nix', and it's nice to not have to build multiple versions. (Although, different versions of 'nix' are still pulled in as indirect dependencies from other packages)	2025-05-16 14:45:08 +00:00
Folke Behrens	baafcc5d41	proxy: Fix misspelled flag value alias, swap names and aliases (#11949 ) ## Problem There's a misspelled flag value alias that's not really used anywhere. ## Summary of changes Fix the alias and make aliases the official flag values and keep old values as aliases. Also rename enum variant. No need for it to carry the version now.	2025-05-16 14:12:39 +00:00
Evan Fleming	aa22572d8c	safekeeper: refactor static remote storage usage to use Arc (#10179 ) Greetings! Please add `w=1` to github url when viewing diff (sepcifically `wal_backup.rs`) ## Problem This PR is aimed at addressing the remaining work of #8200. Namely, removing static usage of remote storage in favour of arc. I did not opt to pass `Arc<RemoteStorage>` directly since it is actually `Optional<RemoteStorage>` as it is not necessarily always configured. I wanted to avoid having to pass `Arc<Optional<RemoteStorage>>` everywhere with individual consuming functions likely needing to handle unwrapping. Instead I've added a `WalBackup` struct that holds `Optional<RemoteStorage>` and handles initialization/unwrapping RemoteStorage internally. wal_backup functions now take self and `Arc<WalBackup>` is passed as a dependency through the various consumers that need it. ## Summary of changes - Add `WalBackup` that holds `Optional<RemoteStorage>` and handles initialization and unwrapping - Modify wal_backup functions to take `WalBackup` as self (Add `w=1` to github url when viewing diff here) - Initialize `WalBackup` in safekeeper root - Store `Arc<WalBackup>` in `GlobalTimelineMap` and pass and store in each Timeline as loaded - use `WalBackup` through Timeline as needed ## Refs - task to remove global variables https://github.com/neondatabase/neon/issues/8200 - drive-by fixes https://github.com/neondatabase/neon/issues/11501 by turning the panic reported there into an error `remote storage not configured` --------- Co-authored-by: Christian Schwarz <christian@neon.tech>	2025-05-16 12:41:10 +00:00
Arpad Müller	2d247375b3	Update rust to 1.87.0 (#11938 ) We keep the practice of keeping the compiler up to date, pointing to the latest release. This is done by many other projects in the Rust ecosystem as well. The 1.87.0 release marks 10 years of Rust. [Announcement blog post](https://blog.rust-lang.org/2025/05/15/Rust-1.87.0/) Prior update was in #11431	2025-05-16 12:21:24 +00:00