add comment

only update in apply_batch_postgres
fix(walredo): walredo process that causes errors is never killed
2026-05-17 21:20:37 +00:00 · 2024-02-01 17:26:39 +00:00 · 2024-02-01 16:17:21 +00:00 · 2024-02-01 14:38:53 +01:00 · 2024-01-31 17:37:25 +02:00 · 2024-01-31 15:30:19 +01:00
73 changed files with 3396 additions and 1000 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,27 +1,28 @@
 *

-!rust-toolchain.toml
-!Cargo.toml
+# Files
 !Cargo.lock
+!Cargo.toml
 !Makefile
+!rust-toolchain.toml
+!scripts/combine_control_files.py
+!scripts/ninstall.sh
+!vm-cgconfig.conf

+# Directories
 !.cargo/
 !.config/
-!control_plane/
 !compute_tools/
+!control_plane/
 !libs/
+!neon_local/
 !pageserver/
+!patches/
 !pgxn/
 !proxy/
-!safekeeper/
 !s3_scrubber/
+!safekeeper/
 !storage_broker/
 !trace/
-!vendor/postgres-v14/
-!vendor/postgres-v15/
-!vendor/postgres-v16/
+!vendor/postgres-*/
 !workspace_hack/
-!neon_local/
-!scripts/ninstall.sh
-!scripts/combine_control_files.py
-!vm-cgconfig.conf
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -508,7 +508,7 @@ jobs:
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
-          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
+          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
      # XXX: no coverage data handling here, since benchmarks are run on release builds,
      # while coverage is currently collected for the debug ones

--- a/Cargo.lock
+++ b/Cargo.lock
@@ -285,7 +285,6 @@ dependencies = [
 "metrics",
 "pageserver_api",
 "pageserver_client",
- "postgres_backend",
 "postgres_connection",
 "serde",
 "serde_json",
@@ -2736,6 +2735,12 @@ dependencies = [
 "winapi",
 ]

+[[package]]
+name = "libm"
+version = "0.2.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
+
 [[package]]
 name = "linux-raw-sys"
 version = "0.1.4"
@@ -2832,6 +2837,9 @@ dependencies = [
 "libc",
 "once_cell",
 "prometheus",
+ "rand 0.8.5",
+ "rand_distr",
+ "twox-hash",
 "workspace_hack",
 ]

@@ -3057,6 +3065,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd"
 dependencies = [
 "autocfg",
+ "libm",
 ]

 [[package]]
@@ -4071,6 +4080,8 @@ dependencies = [
 "sync_wrapper",
 "task-local-extensions",
 "thiserror",
+ "tikv-jemalloc-ctl",
+ "tikv-jemallocator",
 "tls-listener",
 "tokio",
 "tokio-postgres",
@@ -4171,6 +4182,16 @@ dependencies = [
 "getrandom 0.2.11",
 ]

+[[package]]
+name = "rand_distr"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31"
+dependencies = [
+ "num-traits",
+ "rand 0.8.5",
+]
+
 [[package]]
 name = "rand_hc"
 version = "0.2.0"
@@ -5511,6 +5532,37 @@ dependencies = [
 "ordered-float 2.10.1",
 ]

+[[package]]
+name = "tikv-jemalloc-ctl"
+version = "0.5.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "619bfed27d807b54f7f776b9430d4f8060e66ee138a28632ca898584d462c31c"
+dependencies = [
+ "libc",
+ "paste",
+ "tikv-jemalloc-sys",
+]
+
+[[package]]
+name = "tikv-jemalloc-sys"
+version = "0.5.4+5.3.0-patched"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9402443cb8fd499b6f327e40565234ff34dbda27460c5b47db0db77443dd85d1"
+dependencies = [
+ "cc",
+ "libc",
+]
+
+[[package]]
+name = "tikv-jemallocator"
+version = "0.5.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "965fe0c26be5c56c94e38ba547249074803efd52adfb66de62107d95aab3eaca"
+dependencies = [
+ "libc",
+ "tikv-jemalloc-sys",
+]
+
 [[package]]
 name = "time"
 version = "0.3.21"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -149,6 +149,8 @@ tar = "0.4"
 task-local-extensions = "0.1.4"
 test-context = "0.1"
 thiserror = "1.0"
+tikv-jemallocator = "0.5"
+tikv-jemalloc-ctl = "0.5"
 tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] }
 tokio = { version = "1.17", features = ["macros"] }
 tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
@@ -165,6 +167,7 @@ tracing = "0.1"
 tracing-error = "0.2.0"
 tracing-opentelemetry = "0.20.0"
 tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
+twox-hash = { version = "1.6.3", default-features = false }
 url = "2.2"
 uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
 walkdir = "2.3.2"
--- a/Dockerfile
+++ b/Dockerfile
@@ -53,6 +53,7 @@ RUN set -e \
      --bin pagectl  \
      --bin safekeeper  \
      --bin storage_broker  \
+      --bin attachment_service  \
      --bin proxy  \
      --bin neon_local \
      --locked --release \
@@ -80,6 +81,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl             /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper          /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker      /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/attachment_service  /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy               /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local          /usr/local/bin

--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -241,9 +241,12 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
 FROM build-deps AS vector-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \
-    echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \
+COPY patches/pgvector.patch /pgvector.patch
+
+RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.6.0.tar.gz -O pgvector.tar.gz && \
+    echo "b0cf4ba1ab016335ac8fb1cada0d2106235889a194fffeece217c5bda90b2f19 pgvector.tar.gz" | sha256sum --check && \
    mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
+    patch -p1 < /pgvector.patch && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
@@ -520,8 +523,7 @@ RUN apt-get update && \
        libboost-regex1.74-dev \
        libboost-serialization1.74-dev \
        libboost-system1.74-dev \
-        libeigen3-dev \
-        libfreetype6-dev
+        libeigen3-dev

 ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
 RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \
@@ -547,6 +549,7 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.
        -D PostgreSQL_LIBRARY_DIR=`pg_config --libdir` \
        -D RDK_INSTALL_INTREE=OFF \
        -D RDK_INSTALL_COMIC_FONTS=OFF \
+        -D RDK_BUILD_FREETYPE_SUPPORT=OFF \
        -D CMAKE_BUILD_TYPE=Release \
        . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -901,7 +904,7 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
 # libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
 # libxml2, libxslt1.1 for xml2
 # libzstd1 for zstd
-# libboost*, libfreetype6, and zlib1g for rdkit
+# libboost* for rdkit
 # ca-certificates for communicating with s3 by compute_ctl
 RUN apt update &&  \
    apt install --no-install-recommends -y \
@@ -914,7 +917,6 @@ RUN apt update &&  \
        libboost-serialization1.74.0 \
        libboost-system1.74.0 \
        libossp-uuid16 \
-        libfreetype6 \
        libgeos-c1v5 \
        libgdal28 \
        libproj19 \
@@ -926,7 +928,6 @@ RUN apt update &&  \
        libcurl4-openssl-dev \
        locales \
        procps \
-        zlib1g \
        ca-certificates && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -758,6 +758,14 @@ BEGIN
    END LOOP;
 END $$;
 "#,
+        r#"
+DO $$
+BEGIN
+    IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
+        EXECUTE 'GRANT pg_create_subscription TO neon_superuser';
+    END IF;
+END
+$$;"#,
    ];

    let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
--- a/control_plane/attachment_service/Cargo.toml
+++ b/control_plane/attachment_service/Cargo.toml
@@ -21,10 +21,6 @@ tokio.workspace = true
 tokio-util.workspace = true
 tracing.workspace = true

-# TODO: remove this after DB persistence is added, it is only used for
-# a parsing function when loading pageservers from neon_local LocalEnv
-postgres_backend.workspace = true
-
 diesel = { version = "2.1.4", features = ["serde_json", "postgres"] }

 utils = { path = "../../libs/utils/" }
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -2,13 +2,17 @@ use crate::reconciler::ReconcileError;
 use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
 use hyper::{Body, Request, Response};
 use hyper::{StatusCode, Uri};
-use pageserver_api::models::{TenantCreateRequest, TimelineCreateRequest};
+use pageserver_api::models::{
+    TenantCreateRequest, TenantLocationConfigRequest, TimelineCreateRequest,
+};
 use pageserver_api::shard::TenantShardId;
+use pageserver_client::mgmt_api;
 use std::sync::Arc;
+use std::time::{Duration, Instant};
 use utils::auth::SwappableJwtAuth;
 use utils::http::endpoint::{auth_middleware, request_span};
 use utils::http::request::parse_request_param;
-use utils::id::TenantId;
+use utils::id::{TenantId, TimelineId};

 use utils::{
    http::{
@@ -112,6 +116,78 @@ async fn handle_tenant_create(
    json_response(StatusCode::OK, service.tenant_create(create_req).await?)
 }

+// For tenant and timeline deletions, which both implement an "initially return 202, then 404 once
+// we're done" semantic, we wrap with a retry loop to expose a simpler API upstream.  This avoids
+// needing to track a "deleting" state for tenants.
+async fn deletion_wrapper<R, F>(service: Arc<Service>, f: F) -> Result<Response<Body>, ApiError>
+where
+    R: std::future::Future<Output = Result<StatusCode, ApiError>> + Send + 'static,
+    F: Fn(Arc<Service>) -> R + Send + Sync + 'static,
+{
+    let started_at = Instant::now();
+    // To keep deletion reasonably snappy for small tenants, initially check after 1 second if deletion
+    // completed.
+    let mut retry_period = Duration::from_secs(1);
+    // On subsequent retries, wait longer.
+    let max_retry_period = Duration::from_secs(5);
+    // Enable callers with a 30 second request timeout to reliably get a response
+    let max_wait = Duration::from_secs(25);
+
+    loop {
+        let status = f(service.clone()).await?;
+        match status {
+            StatusCode::ACCEPTED => {
+                tracing::info!("Deletion accepted, waiting to try again...");
+                tokio::time::sleep(retry_period).await;
+                retry_period = max_retry_period;
+            }
+            StatusCode::NOT_FOUND => {
+                tracing::info!("Deletion complete");
+                return json_response(StatusCode::OK, ());
+            }
+            _ => {
+                tracing::warn!("Unexpected status {status}");
+                return json_response(status, ());
+            }
+        }
+
+        let now = Instant::now();
+        if now + retry_period > started_at + max_wait {
+            tracing::info!("Deletion timed out waiting for 404");
+            // REQUEST_TIMEOUT would be more appropriate, but CONFLICT is already part of
+            // the pageserver's swagger definition for this endpoint, and has the same desired
+            // effect of causing the control plane to retry later.
+            return json_response(StatusCode::CONFLICT, ());
+        }
+    }
+}
+
+async fn handle_tenant_location_config(
+    service: Arc<Service>,
+    mut req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    let config_req = json_request::<TenantLocationConfigRequest>(&mut req).await?;
+    json_response(
+        StatusCode::OK,
+        service
+            .tenant_location_config(tenant_id, config_req)
+            .await?,
+    )
+}
+
+async fn handle_tenant_delete(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+
+    deletion_wrapper(service, move |service| async move {
+        service.tenant_delete(tenant_id).await
+    })
+    .await
+}
+
 async fn handle_tenant_timeline_create(
    service: Arc<Service>,
    mut req: Request<Body>,
@@ -126,6 +202,63 @@ async fn handle_tenant_timeline_create(
    )
 }

+async fn handle_tenant_timeline_delete(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+
+    deletion_wrapper(service, move |service| async move {
+        service.tenant_timeline_delete(tenant_id, timeline_id).await
+    })
+    .await
+}
+
+async fn handle_tenant_timeline_passthrough(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+
+    let Some(path) = req.uri().path_and_query() else {
+        // This should never happen, our request router only calls us if there is a path
+        return Err(ApiError::BadRequest(anyhow::anyhow!("Missing path")));
+    };
+
+    tracing::info!("Proxying request for tenant {} ({})", tenant_id, path);
+
+    // Find the node that holds shard zero
+    let (base_url, tenant_shard_id) = service.tenant_shard0_baseurl(tenant_id)?;
+
+    // Callers will always pass an unsharded tenant ID.  Before proxying, we must
+    // rewrite this to a shard-aware shard zero ID.
+    let path = format!("{}", path);
+    let tenant_str = tenant_id.to_string();
+    let tenant_shard_str = format!("{}", tenant_shard_id);
+    let path = path.replace(&tenant_str, &tenant_shard_str);
+
+    let client = mgmt_api::Client::new(base_url, service.get_config().jwt_token.as_deref());
+    let resp = client.get_raw(path).await.map_err(|_e|
+        // FIXME: give ApiError a proper Unavailable variant.  We return 503 here because
+        // if we can't successfully send a request to the pageserver, we aren't available.
+        ApiError::ShuttingDown)?;
+
+    // We have a reqwest::Response, would like a http::Response
+    let mut builder = hyper::Response::builder()
+        .status(resp.status())
+        .version(resp.version());
+    for (k, v) in resp.headers() {
+        builder = builder.header(k, v);
+    }
+
+    let response = builder
+        .body(Body::wrap_stream(resp.bytes_stream()))
+        .map_err(|e| ApiError::InternalServerError(e.into()))?;
+
+    Ok(response)
+}
+
 async fn handle_tenant_locate(
    service: Arc<Service>,
    req: Request<Body>,
@@ -141,6 +274,11 @@ async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>,
    json_response(StatusCode::OK, ())
 }

+async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let state = get_state(&req);
+    json_response(StatusCode::OK, state.service.node_list().await?)
+}
+
 async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let node_id: NodeId = parse_request_param(&req, "node_id")?;
    let config_req = json_request::<NodeConfigureRequest>(&mut req).await?;
@@ -226,26 +364,64 @@ pub fn make_router(

    router
        .data(Arc::new(HttpState::new(service, auth)))
+        // Non-prefixed generic endpoints (status, metrics)
        .get("/status", |r| request_span(r, handle_status))
-        .post("/re-attach", |r| request_span(r, handle_re_attach))
-        .post("/validate", |r| request_span(r, handle_validate))
-        .post("/attach-hook", |r| request_span(r, handle_attach_hook))
-        .post("/inspect", |r| request_span(r, handle_inspect))
-        .post("/node", |r| request_span(r, handle_node_register))
-        .put("/node/:node_id/config", |r| {
+        // Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix
+        .post("/upcall/v1/re-attach", |r| {
+            request_span(r, handle_re_attach)
+        })
+        .post("/upcall/v1/validate", |r| request_span(r, handle_validate))
+        // Test/dev/debug endpoints
+        .post("/debug/v1/attach-hook", |r| {
+            request_span(r, handle_attach_hook)
+        })
+        .post("/debug/v1/inspect", |r| request_span(r, handle_inspect))
+        .get("/control/v1/tenant/:tenant_id/locate", |r| {
+            tenant_service_handler(r, handle_tenant_locate)
+        })
+        // Node operations
+        .post("/control/v1/node", |r| {
+            request_span(r, handle_node_register)
+        })
+        .get("/control/v1/node", |r| request_span(r, handle_node_list))
+        .put("/control/v1/node/:node_id/config", |r| {
            request_span(r, handle_node_configure)
        })
+        // Tenant Shard operations
+        .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
+            tenant_service_handler(r, handle_tenant_shard_migrate)
+        })
+        // Tenant operations
+        // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
+        // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
        .post("/v1/tenant", |r| {
            tenant_service_handler(r, handle_tenant_create)
        })
+        .delete("/v1/tenant/:tenant_id", |r| {
+            tenant_service_handler(r, handle_tenant_delete)
+        })
+        .put("/v1/tenant/:tenant_id/location_config", |r| {
+            tenant_service_handler(r, handle_tenant_location_config)
+        })
+        // Tenant Shard operations (low level/maintenance)
+        .put("/tenant/:tenant_shard_id/migrate", |r| {
+            tenant_service_handler(r, handle_tenant_shard_migrate)
+        })
+        // Timeline operations
+        .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
+            tenant_service_handler(r, handle_tenant_timeline_delete)
+        })
        .post("/v1/tenant/:tenant_id/timeline", |r| {
            tenant_service_handler(r, handle_tenant_timeline_create)
        })
-        .get("/tenant/:tenant_id/locate", |r| {
-            tenant_service_handler(r, handle_tenant_locate)
+        // Tenant detail GET passthrough to shard zero
+        .get("/v1/tenant/:tenant_id*", |r| {
+            tenant_service_handler(r, handle_tenant_timeline_passthrough)
        })
-        .put("/tenant/:tenant_shard_id/migrate", |r| {
-            tenant_service_handler(r, handle_tenant_shard_migrate)
+        // Timeline GET passthrough to shard zero.  Note that the `*` in the URL is a wildcard: any future
+        // timeline GET APIs will be implicitly included.
+        .get("/v1/tenant/:tenant_id/timeline*", |r| {
+            tenant_service_handler(r, handle_tenant_timeline_passthrough)
        })
        // Path aliases for tests_forward_compatibility
        // TODO: remove these in future PR
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -9,7 +9,6 @@ use diesel::prelude::*;
 use diesel::Connection;
 use pageserver_api::models::TenantConfig;
 use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
-use postgres_connection::parse_host_port;
 use serde::{Deserialize, Serialize};
 use utils::generation::Generation;
 use utils::id::{NodeId, TenantId};
@@ -129,51 +128,11 @@ impl Persistence {
            })
            .await?;

-        if nodes.is_empty() {
-            return self.list_nodes_local_env().await;
-        }
-
        tracing::info!("list_nodes: loaded {} nodes", nodes.len());

        Ok(nodes)
    }

-    /// Shim for automated compatibility tests: load nodes from LocalEnv instead of database
-    pub(crate) async fn list_nodes_local_env(&self) -> DatabaseResult<Vec<Node>> {
-        // Enable test_backward_compatibility to work by populating our list of
-        // nodes from LocalEnv when it is not present in persistent storage.  Otherwise at
-        // first startup in the compat test, we may have shards but no nodes.
-        use control_plane::local_env::LocalEnv;
-        let env = LocalEnv::load_config().map_err(|e| DatabaseError::Logical(format!("{e}")))?;
-        tracing::info!(
-            "Loading {} pageserver nodes from LocalEnv",
-            env.pageservers.len()
-        );
-        let mut nodes = Vec::new();
-        for ps_conf in env.pageservers {
-            let (pg_host, pg_port) =
-                parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
-            let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr)
-                .expect("Unable to parse listen_http_addr");
-            let node = Node {
-                id: ps_conf.id,
-                listen_pg_addr: pg_host.to_string(),
-                listen_pg_port: pg_port.unwrap_or(5432),
-                listen_http_addr: http_host.to_string(),
-                listen_http_port: http_port.unwrap_or(80),
-                availability: NodeAvailability::Active,
-                scheduling: NodeSchedulingPolicy::Active,
-            };
-
-            // Synchronize database with what we learn from LocalEnv
-            self.insert_node(&node).await?;
-
-            nodes.push(node);
-        }
-
-        Ok(nodes)
-    }
-
    /// At startup, load the high level state for shards, such as their config + policy.  This will
    /// be enriched at runtime with state discovered on pageservers.
    pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -21,6 +21,7 @@ use pageserver_api::{
    models,
    models::{
        LocationConfig, LocationConfigMode, ShardParameters, TenantConfig, TenantCreateRequest,
+        TenantLocationConfigRequest, TenantLocationConfigResponse, TenantShardLocation,
        TimelineCreateRequest, TimelineInfo,
    },
    shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId},
@@ -30,14 +31,14 @@ use utils::{
    completion::Barrier,
    generation::Generation,
    http::error::ApiError,
-    id::{NodeId, TenantId},
+    id::{NodeId, TenantId, TimelineId},
    seqwait::SeqWait,
 };

 use crate::{
    compute_hook::ComputeHook,
    node::Node,
-    persistence::{DatabaseError, Persistence, TenantShardPersistence},
+    persistence::{DatabaseError, NodePersistence, Persistence, TenantShardPersistence},
    scheduler::Scheduler,
    tenant_state::{
        IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError,
@@ -635,7 +636,7 @@ impl Service {
                shard_number: tenant_shard_id.shard_number.0 as i32,
                shard_count: tenant_shard_id.shard_count.0 as i32,
                shard_stripe_size: create_req.shard_parameters.stripe_size.0 as i32,
-                generation: 0,
+                generation: create_req.generation.map(|g| g as i32).unwrap_or(0),
                generation_pageserver: i64::MAX,
                placement_policy: serde_json::to_string(&placement_policy).unwrap(),
                config: serde_json::to_string(&create_req.config).unwrap(),
@@ -677,6 +678,7 @@ impl Service {
                        })?;

                        response_shards.push(TenantCreateResponseShard {
+                            shard_id: tenant_shard_id,
                            node_id: entry
                                .get()
                                .intent
@@ -709,6 +711,7 @@ impl Service {
                        })?;

                        response_shards.push(TenantCreateResponseShard {
+                            shard_id: tenant_shard_id,
                            node_id: state
                                .intent
                                .attached
@@ -742,14 +745,257 @@ impl Service {
            (waiters, response_shards)
        };

-        let deadline = Instant::now().checked_add(Duration::from_secs(5)).unwrap();
+        self.await_waiters(waiters).await?;
+
+        Ok(TenantCreateResponse {
+            shards: response_shards,
+        })
+    }
+
+    /// Helper for functions that reconcile a number of shards, and would like to do a timeout-bounded
+    /// wait for reconciliation to complete before responding.
+    async fn await_waiters(
+        &self,
+        waiters: Vec<ReconcilerWaiter>,
+    ) -> Result<(), ReconcileWaitError> {
+        let deadline = Instant::now().checked_add(Duration::from_secs(30)).unwrap();
        for waiter in waiters {
            let timeout = deadline.duration_since(Instant::now());
            waiter.wait_timeout(timeout).await?;
        }
-        Ok(TenantCreateResponse {
-            shards: response_shards,
-        })
+
+        Ok(())
+    }
+
+    /// This API is used by the cloud control plane to do coarse-grained control of tenants:
+    /// - Call with mode Attached* to upsert the tenant.
+    /// - Call with mode Detached to switch to PlacementPolicy::Detached
+    ///
+    /// In future, calling with mode Secondary may switch to a detach-lite mode in which a tenant only has
+    /// secondary locations.
+    pub(crate) async fn tenant_location_config(
+        &self,
+        tenant_id: TenantId,
+        req: TenantLocationConfigRequest,
+    ) -> Result<TenantLocationConfigResponse, ApiError> {
+        if req.tenant_id.shard_count.0 > 1 {
+            return Err(ApiError::BadRequest(anyhow::anyhow!(
+                "This API is for importing single-sharded or unsharded tenants"
+            )));
+        }
+
+        let mut waiters = Vec::new();
+        let mut result = TenantLocationConfigResponse { shards: Vec::new() };
+        let maybe_create = {
+            let mut locked = self.inner.write().unwrap();
+            let result_tx = locked.result_tx.clone();
+            let compute_hook = locked.compute_hook.clone();
+            let pageservers = locked.nodes.clone();
+
+            let mut scheduler = Scheduler::new(&locked.tenants, &locked.nodes);
+
+            // Maybe we have existing shards
+            let mut create = true;
+            for (shard_id, shard) in locked
+                .tenants
+                .range_mut(TenantShardId::tenant_range(tenant_id))
+            {
+                // Saw an existing shard: this is not a creation
+                create = false;
+
+                // Note that for existing tenants we do _not_ respect the generation in the request: this is likely
+                // to be stale.  Once a tenant is created in this service, our view of generation is authoritative, and
+                // callers' generations may be ignored.  This represents a one-way migration of tenants from the outer
+                // cloud control plane into this service.
+
+                // Use location config mode as an indicator of policy: if they ask for
+                // attached we go to default HA attached mode.  If they ask for secondary
+                // we go to secondary-only mode.  If they ask for detached we detach.
+                match req.config.mode {
+                    LocationConfigMode::Detached => {
+                        shard.policy = PlacementPolicy::Detached;
+                    }
+                    LocationConfigMode::Secondary => {
+                        // TODO: implement secondary-only mode.
+                        todo!();
+                    }
+                    LocationConfigMode::AttachedMulti
+                    | LocationConfigMode::AttachedSingle
+                    | LocationConfigMode::AttachedStale => {
+                        // TODO: persistence for changes in policy
+                        if pageservers.len() > 1 {
+                            shard.policy = PlacementPolicy::Double(1)
+                        } else {
+                            // Convenience for dev/test: if we just have one pageserver, import
+                            // tenants into Single mode so that scheduling will succeed.
+                            shard.policy = PlacementPolicy::Single
+                        }
+                    }
+                }
+
+                shard.schedule(&mut scheduler)?;
+
+                let maybe_waiter = shard.maybe_reconcile(
+                    result_tx.clone(),
+                    &pageservers,
+                    &compute_hook,
+                    &self.config,
+                    &self.persistence,
+                );
+                if let Some(waiter) = maybe_waiter {
+                    waiters.push(waiter);
+                }
+
+                if let Some(node_id) = shard.intent.attached {
+                    result.shards.push(TenantShardLocation {
+                        shard_id: *shard_id,
+                        node_id,
+                    })
+                }
+            }
+
+            if create {
+                // Validate request mode
+                match req.config.mode {
+                    LocationConfigMode::Detached | LocationConfigMode::Secondary => {
+                        // When using this API to onboard an existing tenant to this service, it must start in
+                        // an attached state, because we need the request to come with a generation
+                        return Err(ApiError::BadRequest(anyhow::anyhow!(
+                            "Imported tenant must be in attached mode"
+                        )));
+                    }
+
+                    LocationConfigMode::AttachedMulti
+                    | LocationConfigMode::AttachedSingle
+                    | LocationConfigMode::AttachedStale => {
+                        // Pass
+                    }
+                }
+
+                // Validate request generation
+                let Some(generation) = req.config.generation else {
+                    // We can only import attached tenants, because we need the request to come with a generation
+                    return Err(ApiError::BadRequest(anyhow::anyhow!(
+                        "Generation is mandatory when importing tenant"
+                    )));
+                };
+
+                // Synthesize a creation request
+                Some(TenantCreateRequest {
+                    new_tenant_id: TenantShardId::unsharded(tenant_id),
+                    generation: Some(generation),
+                    shard_parameters: ShardParameters {
+                        // Must preserve the incoming shard_count to distinguish unsharded (0)
+                        // from single-sharded (1): this distinction appears in the S3 keys of the tenant.
+                        count: req.tenant_id.shard_count,
+                        // We only import un-sharded or single-sharded tenants, so stripe
+                        // size can be made up arbitrarily here.
+                        stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE,
+                    },
+                    config: req.config.tenant_conf,
+                })
+            } else {
+                None
+            }
+        };
+
+        if let Some(create_req) = maybe_create {
+            let create_resp = self.tenant_create(create_req).await?;
+            result.shards = create_resp
+                .shards
+                .into_iter()
+                .map(|s| TenantShardLocation {
+                    node_id: s.node_id,
+                    shard_id: s.shard_id,
+                })
+                .collect();
+        } else {
+            // This was an update, wait for reconciliation
+            self.await_waiters(waiters).await?;
+        }
+
+        Ok(result)
+    }
+
+    /// Delete a tenant: ask the node each shard is scheduled on to delete it, then forget it here.
+    ///
+    /// Returns `ACCEPTED` while any node reports the deletion as pending, `NOT_FOUND` once complete.
+    pub(crate) async fn tenant_delete(&self, tenant_id: TenantId) -> Result<StatusCode, ApiError> {
+        // TODO: refactor into helper
+        let shard_nodes = {
+            let state = self.inner.read().unwrap();
+            let mut shard_nodes = Vec::new();
+
+            for (tenant_shard_id, shard) in
+                state.tenants.range(TenantShardId::tenant_range(tenant_id))
+            {
+                let node_id = shard.intent.attached.ok_or_else(|| {
+                    ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled"))
+                })?;
+                let node = state
+                    .nodes
+                    .get(&node_id)
+                    .expect("Pageservers may not be deleted while referenced");
+
+                shard_nodes.push((*tenant_shard_id, node.clone()));
+            }
+            shard_nodes
+        };
+
+        // TODO: error out if the tenant is not attached anywhere.
+
+        // Phase 1: delete on the pageservers
+        let mut pending = false;
+        for (tenant_shard_id, node) in shard_nodes {
+            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
+            // TODO: this, like many other places, requires proper retry handling for 503, timeout: those should not
+            // surface immediately as an error to our caller.
+            let delete_status = client.tenant_delete(tenant_shard_id).await.map_err(|e| {
+                ApiError::InternalServerError(anyhow::anyhow!(
+                    "Error deleting shard {tenant_shard_id} on node {}: {e}",
+                    node.id
+                ))
+            })?;
+            tracing::info!(
+                "Shard {tenant_shard_id} on node {}, delete returned {}",
+                node.id,
+                delete_status
+            );
+            pending |= delete_status == StatusCode::ACCEPTED;
+        }
+
+        if pending {
+            // Caller should call us again later.  When we eventually see 404s from
+            // all the shards, we may proceed to delete our records of the tenant.
+            tracing::info!(
+                "Tenant {} has some shards pending deletion, returning 202",
+                tenant_id
+            );
+            return Ok(StatusCode::ACCEPTED);
+        }
+
+        // Fall through: deletion of the tenant on pageservers is complete, we may proceed to drop
+        // our in-memory state and database state.
+
+        // Ordering: we delete persistent state first: if we then
+        // crash, we will drop the in-memory state.
+        // Drop persistent state.
+        self.persistence.delete_tenant(tenant_id).await?;
+
+        // Drop in-memory state
+        {
+            let mut state = self.inner.write().unwrap();
+            state
+                .tenants
+                .retain(|shard_id, _| shard_id.tenant_id != tenant_id);
+            tracing::info!(
+                "Deleted tenant {tenant_id}, now have {} tenants",
+                state.tenants.len()
+            );
+        };
+
+        // Success is represented as 404, to imitate the existing pageserver deletion API
+        Ok(StatusCode::NOT_FOUND)
    }

    pub(crate) async fn tenant_timeline_create(
@@ -759,25 +1005,15 @@ impl Service {
    ) -> Result<TimelineInfo, ApiError> {
        let mut timeline_info = None;

-        let ensure_waiters = {
-            let locked = self.inner.write().unwrap();
-            tracing::info!(
-                "Creating timeline {}/{}, have {} pageservers",
-                tenant_id,
-                create_req.new_timeline_id,
-                locked.nodes.len()
-            );
+        tracing::info!(
+            "Creating timeline {}/{}",
+            tenant_id,
+            create_req.new_timeline_id,
+        );

-            self.ensure_attached(locked, tenant_id)
-                .map_err(ApiError::InternalServerError)?
-        };
-
-        let deadline = Instant::now().checked_add(Duration::from_secs(5)).unwrap();
-        for waiter in ensure_waiters {
-            let timeout = deadline.duration_since(Instant::now());
-            waiter.wait_timeout(timeout).await?;
-        }
+        self.ensure_attached_wait(tenant_id).await?;

+        // TODO: refuse to do this if shard splitting is in progress
        let targets = {
            let locked = self.inner.read().unwrap();
            let mut targets = Vec::new();
@@ -848,6 +1084,111 @@ impl Service {
        Ok(timeline_info.expect("targets cannot be empty"))
    }

+    /// Delete `timeline_id` on every shard of `tenant_id`, calling each shard's node in turn.
+    ///
+    /// Returns `ACCEPTED` if any shard answered `ACCEPTED`, otherwise `NOT_FOUND`.
+    pub(crate) async fn tenant_timeline_delete(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> Result<StatusCode, ApiError> {
+        tracing::info!("Deleting timeline {}/{}", tenant_id, timeline_id,);
+
+        self.ensure_attached_wait(tenant_id).await?;
+
+        // TODO: refuse to do this if shard splitting is in progress
+        let shard_nodes = {
+            let state = self.inner.read().unwrap();
+            let mut shard_nodes = Vec::new();
+
+            for (tenant_shard_id, shard) in
+                state.tenants.range(TenantShardId::tenant_range(tenant_id))
+            {
+                let node_id = shard.intent.attached.ok_or_else(|| {
+                    ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled"))
+                })?;
+                let node = state
+                    .nodes
+                    .get(&node_id)
+                    .expect("Pageservers may not be deleted while referenced");
+
+                shard_nodes.push((*tenant_shard_id, node.clone()));
+            }
+            shard_nodes
+        };
+
+        if shard_nodes.is_empty() {
+            return Err(ApiError::NotFound(
+                anyhow::anyhow!("Tenant not found").into(),
+            ));
+        }
+
+        // TODO: call into shards concurrently
+        let mut pending = false;
+        for (tenant_shard_id, node) in shard_nodes {
+            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
+
+            tracing::info!(
+                "Deleting timeline on shard {}/{}, attached to node {}",
+                tenant_shard_id,
+                timeline_id,
+                node.id
+            );
+
+            let delete_status = client
+                .timeline_delete(tenant_shard_id, timeline_id)
+                .await
+                .map_err(|e| {
+                    ApiError::InternalServerError(anyhow::anyhow!(
+                    "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {}: {e}",
+                    node.id
+                ))
+                })?;
+            pending |= delete_status == StatusCode::ACCEPTED;
+        }
+
+        Ok(if pending {
+            StatusCode::ACCEPTED
+        } else {
+            StatusCode::NOT_FOUND
+        })
+    }
+
+    /// When you need to send an HTTP request to the pageserver that holds shard0 of a tenant, this
+    /// function looks it up and returns the node's base url together with the shard's id.
+    ///
+    /// # Errors
+    ///
+    /// - `ApiError::NotFound` if no shard of the tenant is known,
+    /// - `ApiError::Conflict` if the shard's intent has no attached node,
+    /// - `ApiError::InternalServerError` if the intent names a node we do not know.
+    pub(crate) fn tenant_shard0_baseurl(
+        &self,
+        tenant_id: TenantId,
+    ) -> Result<(String, TenantShardId), ApiError> {
+        let state = self.inner.read().unwrap();
+        let (tenant_shard_id, shard) = state
+            .tenants
+            .range(TenantShardId::tenant_range(tenant_id))
+            .next()
+            .ok_or_else(|| {
+                ApiError::NotFound(anyhow::anyhow!("Tenant {tenant_id} not found").into())
+            })?;
+
+        // TODO: should use the ID last published to compute_hook, rather than the intent: the intent might
+        // point to somewhere we haven't attached yet.
+        let node_id = shard.intent.attached.ok_or_else(|| {
+            ApiError::Conflict("Cannot call timeline API on non-attached tenant".to_string())
+        })?;
+
+        // This should never happen
+        let node = state.nodes.get(&node_id).ok_or_else(|| {
+            ApiError::InternalServerError(anyhow::anyhow!("Shard refers to nonexistent node"))
+        })?;
+
+        Ok((node.base_url(), *tenant_shard_id))
+    }
+
    pub(crate) fn tenant_locate(
        &self,
        tenant_id: TenantId,
@@ -993,6 +1334,20 @@ impl Service {
        Ok(TenantShardMigrateResponse {})
    }

+    /// List all nodes, in their persistent form.
+    ///
+    /// Reads from `self.persistence` rather than from the in-memory state behind `self.inner`.
+    pub(crate) async fn node_list(&self) -> Result<Vec<NodePersistence>, ApiError> {
+        // It is convenient to avoid taking the big lock and converting Node to a serializable
+        // structure, by fetching from storage instead of reading in-memory state.
+        let stored = self.persistence.list_nodes().await?;
+        let mut persistent = Vec::new();
+        for node in stored {
+            persistent.push(node.to_persistent());
+        }
+        Ok(persistent)
+    }
+
    pub(crate) async fn node_register(
        &self,
        register_req: NodeRegisterRequest,
@@ -1166,7 +1521,7 @@ impl Service {
    /// Helper for methods that will try and call pageserver APIs for
    /// a tenant, such as timeline CRUD: they cannot proceed unless the tenant
    /// is attached somewhere.
-    fn ensure_attached(
+    fn ensure_attached_schedule(
        &self,
        mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>,
        tenant_id: TenantId,
@@ -1196,6 +1551,23 @@ impl Service {
        Ok(waiters)
    }

+    /// Run [`Self::ensure_attached_schedule`] for the tenant, then wait for the waiters it returns.
+    /// All waiters share one deadline, 5 seconds after scheduling returns.
+    async fn ensure_attached_wait(&self, tenant_id: TenantId) -> Result<(), ApiError> {
+        // The write guard is moved into the scheduling call rather than held across the awaits below.
+        let waiters = self
+            .ensure_attached_schedule(self.inner.write().unwrap(), tenant_id)
+            .map_err(ApiError::InternalServerError)?;
+
+        let deadline = Instant::now().checked_add(Duration::from_secs(5)).unwrap();
+        for waiter in waiters {
+            let remaining = deadline.duration_since(Instant::now());
+            waiter.wait_timeout(remaining).await?;
+        }
+
+        Ok(())
+    }
+
    /// Check all tenants for pending reconciliation work, and reconcile those in need
    ///
    /// Returns how many reconciliation tasks were started
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -17,6 +17,7 @@ use serde::{de::DeserializeOwned, Deserialize, Serialize};
 use std::{env, str::FromStr};
 use tokio::process::Command;
 use tracing::instrument;
+use url::Url;
 use utils::{
    auth::{Claims, Scope},
    id::{NodeId, TenantId},
@@ -59,6 +60,7 @@ pub struct InspectResponse {

 #[derive(Serialize, Deserialize)]
 pub struct TenantCreateResponseShard {
+    pub shard_id: TenantShardId,
    pub node_id: NodeId,
    pub generation: u32,
 }
@@ -523,13 +525,15 @@ impl AttachmentService {
        RQ: Serialize + Sized,
        RS: DeserializeOwned + Sized,
    {
-        let url = self
-            .env
-            .control_plane_api
-            .clone()
-            .unwrap()
-            .join(&path)
-            .unwrap();
+        // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
+        // for general purpose API access.
+        let listen_url = self.env.control_plane_api.clone().unwrap();
+        let url = Url::from_str(&format!(
+            "http://{}:{}/{path}",
+            listen_url.host_str().unwrap(),
+            listen_url.port().unwrap()
+        ))
+        .unwrap();

        let mut builder = self.client.request(method, url);
        if let Some(body) = body {
@@ -566,7 +570,7 @@ impl AttachmentService {
        let response = self
            .dispatch::<_, AttachHookResponse>(
                Method::POST,
-                "attach-hook".to_string(),
+                "debug/v1/attach-hook".to_string(),
                Some(request),
            )
            .await?;
@@ -582,7 +586,11 @@ impl AttachmentService {
        let request = InspectRequest { tenant_shard_id };

        let response = self
-            .dispatch::<_, InspectResponse>(Method::POST, "inspect".to_string(), Some(request))
+            .dispatch::<_, InspectResponse>(
+                Method::POST,
+                "debug/v1/inspect".to_string(),
+                Some(request),
+            )
            .await?;

        Ok(response.attachment)
@@ -599,8 +607,12 @@ impl AttachmentService {

    #[instrument(skip(self))]
    pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
-        self.dispatch::<(), _>(Method::GET, format!("tenant/{tenant_id}/locate"), None)
-            .await
+        self.dispatch::<(), _>(
+            Method::GET,
+            format!("control/v1/tenant/{tenant_id}/locate"),
+            None,
+        )
+        .await
    }

    #[instrument(skip(self))]
@@ -622,7 +634,7 @@ impl AttachmentService {

    #[instrument(skip_all, fields(node_id=%req.node_id))]
    pub async fn node_register(&self, req: NodeRegisterRequest) -> anyhow::Result<()> {
-        self.dispatch::<_, ()>(Method::POST, "node".to_string(), Some(req))
+        self.dispatch::<_, ()>(Method::POST, "control/v1/node".to_string(), Some(req))
            .await
    }

@@ -630,7 +642,7 @@ impl AttachmentService {
    pub async fn node_configure(&self, req: NodeConfigureRequest) -> anyhow::Result<()> {
        self.dispatch::<_, ()>(
            Method::PUT,
-            format!("node/{}/config", req.node_id),
+            format!("control/v1/node/{}/config", req.node_id),
            Some(req),
        )
        .await
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -51,7 +51,7 @@ project_git_version!(GIT_VERSION);

 const DEFAULT_PG_VERSION: &str = "15";

-const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/";
+const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";

 fn default_conf(num_pageservers: u16) -> String {
    let mut template = format!(
--- a/libs/metrics/Cargo.toml
+++ b/libs/metrics/Cargo.toml
@@ -9,5 +9,10 @@ prometheus.workspace = true
 libc.workspace = true
 once_cell.workspace = true
 chrono.workspace = true
+twox-hash.workspace = true

 workspace_hack.workspace = true
+
+[dev-dependencies]
+rand = "0.8"
+rand_distr = "0.4.3"
--- a/libs/metrics/src/hll.rs
+++ b/libs/metrics/src/hll.rs
@@ -0,0 +1,523 @@
+//! HyperLogLog is an algorithm for the count-distinct problem,
+//! approximating the number of distinct elements in a multiset.
+//! Calculating the exact cardinality of the distinct elements
+//! of a multiset requires an amount of memory proportional to
+//! the cardinality, which is impractical for very large data sets.
+//! Probabilistic cardinality estimators, such as the HyperLogLog algorithm,
+//! use significantly less memory than this, but can only approximate the cardinality.
+
+use std::{
+    collections::HashMap,
+    hash::{BuildHasher, BuildHasherDefault, Hash, Hasher},
+    sync::{atomic::AtomicU8, Arc, RwLock},
+};
+
+use prometheus::{
+    core::{self, Describer},
+    proto, Opts,
+};
+use twox_hash::xxh3;
+
+/// Creates a [`HyperLogLogVec`] with `$N` shards and registers it with the default registry.
+#[macro_export(local_inner_macros)]
+macro_rules! register_hll_vec {
+    ($N:literal, $OPTS:expr, $LABELS_NAMES:expr $(,)?) => {{
+        let hll_vec = $crate::HyperLogLogVec::<$N>::new($OPTS, $LABELS_NAMES).unwrap();
+        $crate::register(Box::new(hll_vec.clone())).map(|_| hll_vec)
+    }};
+
+    ($N:literal, $NAME:expr, $HELP:expr, $LABELS_NAMES:expr $(,)?) => {{
+        $crate::register_hll_vec!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES)
+    }};
+}
+
+/// Creates a [`HyperLogLog`] with `$N` shards and registers it with the default registry.
+#[macro_export(local_inner_macros)]
+macro_rules! register_hll {
+    ($N:literal, $OPTS:expr $(,)?) => {{
+        let hll = $crate::HyperLogLog::<$N>::with_opts($OPTS).unwrap();
+        $crate::register(Box::new(hll.clone())).map(|_| hll)
+    }};
+
+    ($N:literal, $NAME:expr, $HELP:expr $(,)?) => {{
+        $crate::register_hll!($N, $crate::opts!($NAME, $HELP))
+    }};
+}
+
+/// A vector of HLL probabilistic cardinality measures, one per combination of label values.
+///
+/// How to use this time-series for a metric name `my_metrics_total_hll`:
+///
+/// ```promql
+/// # harmonic mean
+/// 1 / (
+///     sum (
+///         2 ^ -(
+///             # HLL merge operation
+///             max (my_metrics_total_hll{}) by (hll_shard, other_labels...)
+///         )
+///     ) without (hll_shard)
+/// )
+/// * alpha
+/// * shards_count
+/// * shards_count
+/// ```
+///
+/// If you want an estimate over time, you can use the following query:
+///
+/// ```promql
+/// # harmonic mean
+/// 1 / (
+///     sum (
+///         2 ^ -(
+///             # HLL merge operation
+///             max (
+///                 max_over_time(my_metrics_total_hll{}[$__rate_interval])
+///             ) by (hll_shard, other_labels...)
+///         )
+///     ) without (hll_shard)
+/// )
+/// * alpha
+/// * shards_count
+/// * shards_count
+/// ```
+///
+/// In the case of low cardinality, you might want to use the linear counting approximation:
+///
+/// ```promql
+/// # LinearCounting(m, V) = m log (m / V)
+/// shards_count * ln(shards_count /
+///     # calculate V = how many shards contain a 0
+///     count(max (my_metrics_total_hll{}) by (hll_shard, other_labels...) == 0) without (hll_shard)
+/// )
+/// ```
+///
+/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
+#[derive(Clone)]
+pub struct HyperLogLogVec<const N: usize> {
+    core: Arc<HyperLogLogVecCore<N>>,
+}
+
+struct HyperLogLogVecCore<const N: usize> {
+    pub children: RwLock<HashMap<u64, HyperLogLog<N>, BuildHasherDefault<xxh3::Hash64>>>,
+    pub desc: core::Desc, // descriptor returned by `Collector::desc`; supplies the family name/help
+    pub opts: Opts, // options every child `HyperLogLog` is created from
+}
+
+impl<const N: usize> core::Collector for HyperLogLogVec<N> {
+    fn desc(&self) -> Vec<&core::Desc> {
+        vec![&self.core.desc]
+    }
+
+    /// Exports the shards of every child as gauges of a single metric family.
+    fn collect(&self) -> Vec<proto::MetricFamily> {
+        let desc = &self.core.desc;
+        let mut samples = Vec::new();
+        let children = self.core.children.read().unwrap();
+        children.values().for_each(|child| child.core.collect_into(&mut samples));
+        drop(children);
+        let mut family = proto::MetricFamily::default();
+        family.set_name(desc.fq_name.clone());
+        family.set_help(desc.help.clone());
+        family.set_field_type(proto::MetricType::GAUGE);
+        family.set_metric(samples);
+        vec![family]
+    }
+}
+
+impl<const N: usize> HyperLogLogVec<N> {
+    /// Create a new [`HyperLogLogVec`] based on the provided [`Opts`] and partitioned by the
+    /// given label names (the list is not checked for emptiness here). Panics if `N` is not a
+    /// power of two; returns an error if `opts` cannot be described.
+    pub fn new(opts: Opts, label_names: &[&str]) -> prometheus::Result<Self> {
+        assert!(N.is_power_of_two());
+        let variable_names = label_names.iter().map(|s| (*s).to_owned()).collect();
+        let opts = opts.variable_labels(variable_names);
+
+        let desc = opts.describe()?;
+        let v = HyperLogLogVecCore {
+            children: RwLock::new(HashMap::default()),
+            desc,
+            opts,
+        };
+
+        Ok(Self { core: Arc::new(v) })
+    }
+
+    /// `get_metric_with_label_values` returns the [`HyperLogLog<N>`] for the given slice
+    /// of label values (same order as the VariableLabels in Desc). If that combination of
+    /// label values is accessed for the first time, a new [`HyperLogLog<N>`] is created.
+    ///
+    /// An error is returned if the number of label values is not the same as the
+    /// number of VariableLabels in Desc.
+    pub fn get_metric_with_label_values(
+        &self,
+        vals: &[&str],
+    ) -> prometheus::Result<HyperLogLog<N>> {
+        self.core.get_metric_with_label_values(vals)
+    }
+
+    /// `with_label_values` works as `get_metric_with_label_values`, but panics if an error
+    /// occurs.
+    pub fn with_label_values(&self, vals: &[&str]) -> HyperLogLog<N> {
+        self.get_metric_with_label_values(vals).unwrap()
+    }
+}
+
+impl<const N: usize> HyperLogLogVecCore<N> {
+    /// Returns the child for `vals`, creating it on first use (read lock first, write lock on miss).
+    pub fn get_metric_with_label_values(
+        &self,
+        vals: &[&str],
+    ) -> prometheus::Result<HyperLogLog<N>> {
+        let h = self.hash_label_values(vals)?;
+        if let Some(metric) = self.children.read().unwrap().get(&h).cloned() {
+            return Ok(metric);
+        }
+        self.get_or_create_metric(h, vals)
+    }
+
+    /// Hashes `vals` into the key used by `children`; errors if the number of values is wrong.
+    pub(crate) fn hash_label_values(&self, vals: &[&str]) -> prometheus::Result<u64> {
+        if vals.len() != self.desc.variable_labels.len() {
+            return Err(prometheus::Error::InconsistentCardinality {
+                expect: self.desc.variable_labels.len(),
+                got: vals.len(),
+            });
+        }
+
+        let mut h = xxh3::Hash64::default();
+        for val in vals {
+            // `str::hash` also writes a terminator, so ["ab", "c"] and ["a", "bc"] hash differently.
+            val.hash(&mut h);
+        }
+        Ok(h.finish())
+    }
+
+    fn get_or_create_metric(
+        &self,
+        hash: u64,
+        label_values: &[&str],
+    ) -> prometheus::Result<HyperLogLog<N>> {
+        let mut children = self.children.write().unwrap();
+        // Re-check under the write lock: another thread may have created the child since our read.
+        if let Some(metric) = children.get(&hash).cloned() {
+            return Ok(metric);
+        }
+
+        let metric = HyperLogLog::with_opts_and_label_values(&self.opts, label_values)?;
+        children.insert(hash, metric.clone());
+        Ok(metric)
+    }
+}
+
+/// HLL is a probabilistic cardinality measure.
+///
+/// How to use this time-series for a metric name `my_metrics_total_hll`:
+///
+/// ```promql
+/// # harmonic mean
+/// 1 / (
+///     sum (
+///         2 ^ -(
+///             # HLL merge operation
+///             max (my_metrics_total_hll{}) by (hll_shard, other_labels...)
+///         )
+///     ) without (hll_shard)
+/// )
+/// * alpha
+/// * shards_count
+/// * shards_count
+/// ```
+///
+/// If you want an estimate over time, you can use the following query:
+///
+/// ```promql
+/// # harmonic mean
+/// 1 / (
+///     sum (
+///         2 ^ -(
+///             # HLL merge operation
+///             max (
+///                 max_over_time(my_metrics_total_hll{}[$__rate_interval])
+///             ) by (hll_shard, other_labels...)
+///         )
+///     ) without (hll_shard)
+/// )
+/// * alpha
+/// * shards_count
+/// * shards_count
+/// ```
+///
+/// In the case of low cardinality, you might want to use the linear counting approximation:
+///
+/// ```promql
+/// # LinearCounting(m, V) = m log (m / V)
+/// shards_count * ln(shards_count /
+///     # calculate V = how many shards contain a 0
+///     count(max (my_metrics_total_hll{}) by (hll_shard, other_labels...) == 0) without (hll_shard)
+/// )
+/// ```
+///
+/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
+#[derive(Clone)]
+pub struct HyperLogLog<const N: usize> {
+    core: Arc<HyperLogLogCore<N>>,
+}
+
+impl<const N: usize> HyperLogLog<N> {
+    /// Create a [`HyperLogLog`] with the `name` and `help` arguments. Panics unless `N` is a power of two.
+    pub fn new<S1: Into<String>, S2: Into<String>>(name: S1, help: S2) -> prometheus::Result<Self> {
+        Self::with_opts(Opts::new(name, help))
+    }
+
+    /// Create a [`HyperLogLog`] with the `opts` options. Panics unless `N` is a power of two.
+    pub fn with_opts(opts: Opts) -> prometheus::Result<Self> {
+        Self::with_opts_and_label_values(&opts, &[])
+    }
+
+    /// Shared constructor; checks `N` here so that no public constructor can skip the check.
+    fn with_opts_and_label_values(opts: &Opts, label_values: &[&str]) -> prometheus::Result<Self> {
+        assert!(N.is_power_of_two()); // `record` masks the hash with `N - 1`
+        let desc = opts.describe()?;
+        let labels = make_label_pairs(&desc, label_values)?;
+        let v = HyperLogLogCore {
+            shards: [0; N].map(AtomicU8::new),
+            desc,
+            labels,
+        };
+        Ok(Self { core: Arc::new(v) })
+    }
+
+    /// Hashes `item` and records the hash in the sketch.
+    pub fn measure(&self, item: &impl Hash) {
+        // changing the hasher will break compatibility with previous measurements.
+        self.record(BuildHasherDefault::<xxh3::Hash64>::default().hash_one(item));
+    }
+
+    fn record(&self, hash: u64) {
+        let p = N.ilog2() as u8; // number of low hash bits that select the shard
+        let j = hash & (N as u64 - 1); // shard index
+        let rho = (hash >> p).leading_zeros() as u8 + 1 - p; // 1 + leading zeros of the other bits
+        self.core.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed);
+    }
+}
+
+struct HyperLogLogCore<const N: usize> {
+    shards: [AtomicU8; N], // per shard: the largest rho recorded since the last `collect_into`
+    desc: core::Desc,
+    labels: Vec<proto::LabelPair>, // copied onto every exported shard metric
+}
+
+impl<const N: usize> core::Collector for HyperLogLog<N> {
+    fn desc(&self) -> Vec<&core::Desc> {
+        vec![&self.core.desc]
+    }
+
+    /// Exports this sketch's shards as gauges of a single metric family.
+    fn collect(&self) -> Vec<proto::MetricFamily> {
+        let inner = &self.core;
+        let mut samples = Vec::new();
+        inner.collect_into(&mut samples);
+        let mut family = proto::MetricFamily::default();
+        family.set_name(inner.desc.fq_name.clone());
+        family.set_help(inner.desc.help.clone());
+        family.set_field_type(proto::MetricType::GAUGE);
+        family.set_metric(samples);
+        vec![family]
+    }
+}
+
+impl<const N: usize> HyperLogLogCore<N> {
+    /// Appends one gauge per shard to `metrics`, labelled `hll_shard`, resetting each shard to 0.
+    fn collect_into(&self, metrics: &mut Vec<proto::Metric>) {
+        for (index, shard) in self.shards.iter().enumerate() {
+            // We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus.
+
+            // This seems like it would be a race condition,
+            // but HLL is not impacted by a write in one shard happening in between.
+            // This is because in PromQL we will be implementing a harmonic mean of all buckets.
+            // we will also merge samples in a time series using `max by (hll_shard)`.
+
+            // TODO: maybe we shouldn't reset this on every collect, instead, only after a time window.
+            // this would mean that a dev port-forwarding the metrics url won't break the sampling.
+            let value = shard.swap(0, std::sync::atomic::Ordering::Relaxed);
+            let mut gauge = proto::Gauge::default();
+            gauge.set_value(value as f64);
+
+            let mut shard_label = proto::LabelPair::default();
+            shard_label.set_name("hll_shard".to_owned());
+            shard_label.set_value(index.to_string());
+
+            let mut labels = Vec::with_capacity(self.labels.len() + 1);
+            labels.extend_from_slice(&self.labels);
+            labels.push(shard_label);
+
+            let mut metric = proto::Metric::default();
+            metric.set_gauge(gauge);
+            metric.set_label(labels);
+            metrics.push(metric);
+        }
+    }
+}
+
+/// Builds the label pairs for one child metric.
+///
+/// Pairs each variable label name in `desc` with the value at the same position in `label_values`,
+/// appends the constant label pairs and sorts the result. Errors if the two lengths differ.
+fn make_label_pairs(
+    desc: &core::Desc,
+    label_values: &[&str],
+) -> prometheus::Result<Vec<proto::LabelPair>> {
+    let names = &desc.variable_labels;
+    if names.len() != label_values.len() {
+        return Err(prometheus::Error::InconsistentCardinality {
+            expect: names.len(),
+            got: label_values.len(),
+        });
+    }
+
+    // With no variable labels, the constant pairs (possibly none) are returned as stored.
+    if names.is_empty() {
+        return Ok(desc.const_label_pairs.clone());
+    }
+
+    let mut pairs = Vec::with_capacity(names.len() + desc.const_label_pairs.len());
+    pairs.extend(names.iter().zip(label_values).map(|(name, value)| {
+        let mut pair = proto::LabelPair::default();
+        pair.set_name(name.clone());
+        pair.set_value((*value).to_owned());
+        pair
+    }));
+
+    pairs.extend(desc.const_label_pairs.iter().cloned());
+
+    pairs.sort();
+    Ok(pairs)
+}
+
+#[cfg(test)]
+mod tests {
+    use std::collections::HashSet;
+
+    use prometheus::{proto, Opts};
+    use rand::{rngs::StdRng, Rng, SeedableRng};
+    use rand_distr::{Distribution, Zipf};
+
+    use crate::HyperLogLogVec;
+
+    fn collect(hll: &HyperLogLogVec<32>) -> Vec<proto::Metric> {
+        let mut metrics = vec![];
+        hll.core
+            .children
+            .read()
+            .unwrap()
+            .values()
+            .for_each(|c| c.core.collect_into(&mut metrics));
+        metrics
+    }
+    fn get_cardinality(metrics: &[proto::Metric], filter: impl Fn(&proto::Metric) -> bool) -> f64 {
+        let mut buckets = [0.0; 32];
+        for metric in metrics.chunks_exact(32) {
+            if filter(&metric[0]) {
+                for (i, m) in metric.iter().enumerate() {
+                    buckets[i] = f64::max(buckets[i], m.get_gauge().get_value());
+                }
+            }
+        }
+        // HyperLogLog estimate: alpha * m^2 / sum(2^-register), with m = 32 and alpha = 0.697.
+        buckets
+            .into_iter()
+            .map(|f| 2.0f64.powf(-f))
+            .sum::<f64>()
+            .recip()
+            * 0.697
+            * 32.0
+            * 32.0
+    }
+
+    fn test_cardinality(n: usize, dist: impl Distribution<f64>) -> ([usize; 3], [f64; 3]) {
+        let hll = HyperLogLogVec::<32>::new(Opts::new("foo", "bar"), &["x"]).unwrap();
+
+        let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist);
+        let mut set_a = HashSet::new();
+        let mut set_b = HashSet::new();
+        // Labels "a" and "b" are fed consecutive runs of `n` samples from the same seeded stream.
+        for x in iter.by_ref().take(n) {
+            set_a.insert(x.to_bits());
+            hll.with_label_values(&["a"]).measure(&x.to_bits());
+        }
+        for x in iter.by_ref().take(n) {
+            set_b.insert(x.to_bits());
+            hll.with_label_values(&["b"]).measure(&x.to_bits());
+        }
+        let merge = &set_a | &set_b;
+
+        let metrics = collect(&hll);
+        let len = get_cardinality(&metrics, |_| true);
+        let len_a = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "a");
+        let len_b = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "b");
+
+        ([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b])
+    }
+
+    #[test]
+    fn test_cardinality_small() {
+        let (actual, estimate) = test_cardinality(100, Zipf::new(100, 1.2f64).unwrap());
+
+        assert_eq!(actual, [46, 30, 32]);
+        assert!(51.3 < estimate[0] && estimate[0] < 51.4);
+        assert!(44.0 < estimate[1] && estimate[1] < 44.1);
+        assert!(39.0 < estimate[2] && estimate[2] < 39.1);
+    }
+
+    #[test]
+    fn test_cardinality_medium() {
+        let (actual, estimate) = test_cardinality(10000, Zipf::new(10000, 1.2f64).unwrap());
+
+        assert_eq!(actual, [2529, 1618, 1629]);
+        assert!(2309.1 < estimate[0] && estimate[0] < 2309.2);
+        assert!(1566.6 < estimate[1] && estimate[1] < 1566.7);
+        assert!(1629.5 < estimate[2] && estimate[2] < 1629.6);
+    }
+
+    #[test]
+    fn test_cardinality_large() {
+        let (actual, estimate) = test_cardinality(1_000_000, Zipf::new(1_000_000, 1.2f64).unwrap());
+
+        assert_eq!(actual, [129077, 79579, 79630]);
+        assert!(126067.2 < estimate[0] && estimate[0] < 126067.3);
+        assert!(83076.8 < estimate[1] && estimate[1] < 83076.9);
+        assert!(64251.2 < estimate[2] && estimate[2] < 64251.3);
+    }
+
+    #[test]
+    fn test_cardinality_small2() {
+        let (actual, estimate) = test_cardinality(100, Zipf::new(200, 0.8f64).unwrap());
+
+        assert_eq!(actual, [92, 58, 60]);
+        assert!(116.1 < estimate[0] && estimate[0] < 116.2);
+        assert!(81.7 < estimate[1] && estimate[1] < 81.8);
+        assert!(69.3 < estimate[2] && estimate[2] < 69.4);
+    }
+
+    #[test]
+    fn test_cardinality_medium2() {
+        let (actual, estimate) = test_cardinality(10000, Zipf::new(20000, 0.8f64).unwrap());
+
+        assert_eq!(actual, [8201, 5131, 5051]);
+        assert!(6846.4 < estimate[0] && estimate[0] < 6846.5);
+        assert!(5239.1 < estimate[1] && estimate[1] < 5239.2);
+        assert!(4292.8 < estimate[2] && estimate[2] < 4292.9);
+    }
+
+    #[test]
+    fn test_cardinality_large2() {
+        let (actual, estimate) = test_cardinality(1_000_000, Zipf::new(2_000_000, 0.8f64).unwrap());
+
+        assert_eq!(actual, [777847, 482069, 482246]);
+        assert!(699437.4 < estimate[0] && estimate[0] < 699437.5);
+        assert!(374948.9 < estimate[1] && estimate[1] < 374949.0);
+        assert!(434609.7 < estimate[2] && estimate[2] < 434609.8);
+    }
+}
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -28,7 +28,9 @@ use prometheus::{Registry, Result};
 pub mod launch_timestamp;
 mod wrappers;
 pub use wrappers::{CountedReader, CountedWriter};
+mod hll;
 pub mod metric_vec_duration;
+pub use hll::{HyperLogLog, HyperLogLogVec};

 pub type UIntGauge = GenericGauge<AtomicU64>;
 pub type UIntGaugeVec = GenericGaugeVec<AtomicU64>;
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -364,6 +364,19 @@ pub struct TenantLocationConfigRequest {
    pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
 }

+#[derive(Serialize, Deserialize, Debug)]
+#[serde(deny_unknown_fields)]
+pub struct TenantShardLocation {
+    pub shard_id: TenantShardId,
+    pub node_id: NodeId,
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+#[serde(deny_unknown_fields)]
+pub struct TenantLocationConfigResponse {
+    pub shards: Vec<TenantShardLocation>,
+}
+
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantConfigRequest {
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -207,10 +207,16 @@ pub fn find_end_of_wal(
                let seg_offs = curr_lsn.segment_offset(wal_seg_size);
                segment.seek(SeekFrom::Start(seg_offs as u64))?;
                // loop inside segment
-                loop {
+                while curr_lsn.segment_number(wal_seg_size) == segno {
                    let bytes_read = segment.read(&mut buf)?;
                    if bytes_read == 0 {
-                        break; // EOF
+                        debug!(
+                            "find_end_of_wal reached end at {:?}, EOF in segment {:?} at offset {}",
+                            result,
+                            seg_file_path,
+                            curr_lsn.segment_offset(wal_seg_size)
+                        );
+                        return Ok(result);
                    }
                    curr_lsn += bytes_read as u64;
                    decoder.feed_bytes(&buf[0..bytes_read]);
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -646,7 +646,7 @@ impl RemoteStorage for S3Bucket {
        let timestamp = DateTime::from(timestamp);
        let done_if_after = DateTime::from(done_if_after);

-        tracing::trace!("Target time: {timestamp:?}, done_if_after {done_if_after:?}");
+        tracing::info!("Target time: {timestamp:?}, done_if_after {done_if_after:?}");

        // get the passed prefix or if it is not set use prefix_in_bucket value
        let prefix = prefix
@@ -657,75 +657,108 @@ impl RemoteStorage for S3Bucket {
        let max_retries = 10;
        let is_permanent = |_e: &_| false;

-        let list = backoff::retry(
-            || async {
-                Ok(self
-                    .client
-                    .list_object_versions()
-                    .bucket(self.bucket_name.clone())
-                    .set_prefix(prefix.clone())
-                    .send()
-                    .await?)
-            },
-            is_permanent,
-            warn_threshold,
-            max_retries,
-            "listing object versions for time_travel_recover",
-            backoff::Cancel::new(cancel.clone(), || anyhow!("Cancelled")),
-        )
-        .await?;
+        let mut key_marker = None;
+        let mut version_id_marker = None;
+        let mut versions_and_deletes = Vec::new();

-        if list.is_truncated().unwrap_or_default() {
-            anyhow::bail!("Received truncated ListObjectVersions response for prefix={prefix:?}");
+        loop {
+            let response = backoff::retry(
+                || async {
+                    Ok(self
+                        .client
+                        .list_object_versions()
+                        .bucket(self.bucket_name.clone())
+                        .set_prefix(prefix.clone())
+                        .set_key_marker(key_marker.clone())
+                        .set_version_id_marker(version_id_marker.clone())
+                        .send()
+                        .await?)
+                },
+                is_permanent,
+                warn_threshold,
+                max_retries,
+                "listing object versions for time_travel_recover",
+                backoff::Cancel::new(cancel.clone(), || anyhow!("Cancelled")),
+            )
+            .await?;
+
+            tracing::trace!(
+                "  Got List response version_id_marker={:?}, key_marker={:?}",
+                response.version_id_marker,
+                response.key_marker
+            );
+            let versions = response
+                .versions
+                .unwrap_or_default()
+                .into_iter()
+                .map(VerOrDelete::from_version);
+            let deletes = response
+                .delete_markers
+                .unwrap_or_default()
+                .into_iter()
+                .map(VerOrDelete::from_delete_marker);
+            itertools::process_results(versions.chain(deletes), |n_vds| {
+                versions_and_deletes.extend(n_vds)
+            })?;
+            fn none_if_empty(v: Option<String>) -> Option<String> {
+                v.filter(|v| !v.is_empty())
+            }
+            version_id_marker = none_if_empty(response.next_version_id_marker);
+            key_marker = none_if_empty(response.next_key_marker);
+            if version_id_marker.is_none() {
+                // The final response is not supposed to be truncated
+                if response.is_truncated.unwrap_or_default() {
+                    anyhow::bail!(
+                        "Received truncated ListObjectVersions response for prefix={prefix:?}"
+                    );
+                }
+                break;
+            }
+            // Limit the number of versions/deletions, mostly so that we don't
+            // keep requesting forever if the list is too long, as we'd put the
+            // list in RAM.
+            // Building a list of 100k entries that reaches the limit roughly takes
+            // 40 seconds, and roughly corresponds to tenants of 2 TiB physical size.
+            const COMPLEXITY_LIMIT: usize = 100_000;
+            if versions_and_deletes.len() >= COMPLEXITY_LIMIT {
+                anyhow::bail!(
+                    "Limit for number of versions/deletions exceeded for prefix={prefix:?}"
+                );
+            }
        }

-        let mut versions_deletes = list
-            .versions()
-            .iter()
-            .map(VerOrDelete::Version)
-            .chain(list.delete_markers().iter().map(VerOrDelete::DeleteMarker))
-            .collect::<Vec<_>>();
+        // Work on the list of references instead of the objects directly,
+        // otherwise we get lifetime errors in the sort_by_key call below.
+        let mut versions_and_deletes = versions_and_deletes.iter().collect::<Vec<_>>();

-        versions_deletes.sort_by_key(|vd| (vd.key(), vd.last_modified()));
+        versions_and_deletes.sort_by_key(|vd| (&vd.key, &vd.last_modified));

        let mut vds_for_key = HashMap::<_, Vec<_>>::new();

-        for vd in versions_deletes {
-            let last_modified = vd.last_modified();
-            let version_id = vd.version_id();
-            let key = vd.key();
-            let (Some(last_modified), Some(version_id), Some(key)) =
-                (last_modified, version_id, key)
-            else {
-                anyhow::bail!(
-                    "One (or more) of last_modified, key, and id is None. \
-                    Is versioning enabled in the bucket? last_modified={:?} key={:?} version_id={:?}",
-                    last_modified, key, version_id,
-                );
-            };
+        for vd in &versions_and_deletes {
+            let VerOrDelete {
+                version_id, key, ..
+            } = &vd;
            if version_id == "null" {
                anyhow::bail!("Received ListVersions response for key={key} with version_id='null', \
                    indicating either disabled versioning, or legacy objects with null version id values");
            }
            tracing::trace!(
-                "Parsing version key={key} version_id={version_id} is_delete={}",
-                matches!(vd, VerOrDelete::DeleteMarker(_))
+                "Parsing version key={key} version_id={version_id} kind={:?}",
+                vd.kind
            );

-            vds_for_key
-                .entry(key)
-                .or_default()
-                .push((vd, last_modified, version_id));
+            vds_for_key.entry(key).or_default().push(vd);
        }
        for (key, versions) in vds_for_key {
-            let (last_vd, last_last_modified, _version_id) = versions.last().unwrap();
-            if last_last_modified > &&done_if_after {
+            let last_vd = versions.last().unwrap();
+            if last_vd.last_modified > done_if_after {
                tracing::trace!("Key {key} has version later than done_if_after, skipping");
                continue;
            }
            // the version we want to restore to.
            let version_to_restore_to =
-                match versions.binary_search_by_key(&timestamp, |tpl| *tpl.1) {
+                match versions.binary_search_by_key(&timestamp, |tpl| tpl.last_modified) {
                    Ok(v) => v,
                    Err(e) => e,
                };
@@ -743,7 +776,11 @@ impl RemoteStorage for S3Bucket {
                do_delete = true;
            } else {
                match &versions[version_to_restore_to - 1] {
-                    (VerOrDelete::Version(_), _last_modified, version_id) => {
+                    VerOrDelete {
+                        kind: VerOrDeleteKind::Version,
+                        version_id,
+                        ..
+                    } => {
                        tracing::trace!("Copying old version {version_id} for {key}...");
                        // Restore the state to the last version by copying
                        let source_id =
@@ -768,13 +805,16 @@ impl RemoteStorage for S3Bucket {
                        )
                        .await?;
                    }
-                    (VerOrDelete::DeleteMarker(_), _last_modified, _version_id) => {
+                    VerOrDelete {
+                        kind: VerOrDeleteKind::DeleteMarker,
+                        ..
+                    } => {
                        do_delete = true;
                    }
                }
            };
            if do_delete {
-                if matches!(last_vd, VerOrDelete::DeleteMarker(_)) {
+                if matches!(last_vd.kind, VerOrDeleteKind::DeleteMarker) {
                    // Key has since been deleted (but there was some history), no need to do anything
                    tracing::trace!("Key {key} already deleted, skipping.");
                } else {
@@ -811,29 +851,59 @@ fn start_measuring_requests(
    })
 }

-enum VerOrDelete<'a> {
-    Version(&'a ObjectVersion),
-    DeleteMarker(&'a DeleteMarkerEntry),
+// Save RAM and only store the needed data instead of the entire ObjectVersion/DeleteMarkerEntry
+struct VerOrDelete {
+    kind: VerOrDeleteKind,
+    last_modified: DateTime,
+    version_id: String,
+    key: String,
 }

-impl<'a> VerOrDelete<'a> {
-    fn last_modified(&self) -> Option<&'a DateTime> {
-        match self {
-            VerOrDelete::Version(v) => v.last_modified(),
-            VerOrDelete::DeleteMarker(v) => v.last_modified(),
-        }
+#[derive(Debug)]
+enum VerOrDeleteKind {
+    Version,
+    DeleteMarker,
+}
+
+impl VerOrDelete {
+    fn with_kind(
+        kind: VerOrDeleteKind,
+        last_modified: Option<DateTime>,
+        version_id: Option<String>,
+        key: Option<String>,
+    ) -> anyhow::Result<Self> {
+        let lvk = (last_modified, version_id, key);
+        let (Some(last_modified), Some(version_id), Some(key)) = lvk else {
+            anyhow::bail!(
+                "One (or more) of last_modified, key, and id is None. \
+            Is versioning enabled in the bucket? last_modified={:?}, version_id={:?}, key={:?}",
+                lvk.0,
+                lvk.1,
+                lvk.2,
+            );
+        };
+        Ok(Self {
+            kind,
+            last_modified,
+            version_id,
+            key,
+        })
    }
-    fn version_id(&self) -> Option<&'a str> {
-        match self {
-            VerOrDelete::Version(v) => v.version_id(),
-            VerOrDelete::DeleteMarker(v) => v.version_id(),
-        }
+    fn from_version(v: ObjectVersion) -> anyhow::Result<Self> {
+        Self::with_kind(
+            VerOrDeleteKind::Version,
+            v.last_modified,
+            v.version_id,
+            v.key,
+        )
    }
-    fn key(&self) -> Option<&'a str> {
-        match self {
-            VerOrDelete::Version(v) => v.key(),
-            VerOrDelete::DeleteMarker(v) => v.key(),
-        }
+    fn from_delete_marker(v: DeleteMarkerEntry) -> anyhow::Result<Self> {
+        Self::with_kind(
+            VerOrDeleteKind::DeleteMarker,
+            v.last_modified,
+            v.version_id,
+            v.key,
+        )
    }
 }

--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -112,6 +112,55 @@ pub async fn fsync_async(path: impl AsRef<Utf8Path>) -> Result<(), std::io::Erro
    tokio::fs::File::open(path.as_ref()).await?.sync_all().await
 }

+pub async fn fsync_async_opt(
+    path: impl AsRef<Utf8Path>,
+    do_fsync: bool,
+) -> Result<(), std::io::Error> {
+    if do_fsync {
+        fsync_async(path.as_ref()).await?;
+    }
+    Ok(())
+}
+
+/// Like postgres' durable_rename, renames file issuing fsyncs to make it
+/// durable. After return, file and rename are guaranteed to be persisted.
+///
+/// Unlike postgres, it only does fsyncs to 1) file to be renamed to make
+/// contents durable; 2) its directory entry to make rename durable 3) again to
+/// already renamed file, which is not required by standards but postgres does
+/// it, let's stick to that. Postgres additionally fsyncs newpath *before*
+/// rename if it exists to ensure that at least one of the files survives, but
+/// current callers don't need that.
+///
+/// virtual_file.rs has similar code, but it doesn't use vfs.
+///
+/// Useful links: <https://lwn.net/Articles/457667/>
+/// <https://www.postgresql.org/message-id/flat/56583BDD.9060302%402ndquadrant.com>
+/// <https://thunk.org/tytso/blog/2009/03/15/dont-fear-the-fsync/>
+pub async fn durable_rename(
+    old_path: impl AsRef<Utf8Path>,
+    new_path: impl AsRef<Utf8Path>,
+    do_fsync: bool,
+) -> io::Result<()> {
+    // first fsync the file
+    fsync_async_opt(old_path.as_ref(), do_fsync).await?;
+
+    // Time to do the real deal.
+    tokio::fs::rename(old_path.as_ref(), new_path.as_ref()).await?;
+
+    // Postgres'ish fsync of renamed file.
+    fsync_async_opt(new_path.as_ref(), do_fsync).await?;
+
+    // Now fsync the parent
+    let parent = match new_path.as_ref().parent() {
+        Some(p) => p,
+        None => Utf8Path::new("./"), // assume current dir if there is no parent
+    };
+    fsync_async_opt(parent, do_fsync).await?;
+
+    Ok(())
+}
+
 #[cfg(test)]
 mod tests {

--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -69,6 +69,25 @@ impl Client {
        resp.json().await.map_err(Error::ReceiveBody)
    }

+    /// Get an arbitrary path, returning a streaming Response.  This function is suitable
+    /// for pass-through/proxy use cases where we don't care what the response content looks
+    /// like.
+    ///
+    /// Use/add one of the properly typed methods below if you aren't proxying, and
+    /// know what kind of response you expect.
+    pub async fn get_raw(&self, path: String) -> Result<reqwest::Response> {
+        debug_assert!(path.starts_with('/'));
+        let uri = format!("{}{}", self.mgmt_api_endpoint, path);
+
+        let req = self.client.request(Method::GET, uri);
+        let req = if let Some(value) = &self.authorization_header {
+            req.header(reqwest::header::AUTHORIZATION, value)
+        } else {
+            req
+        };
+        req.send().await.map_err(Error::ReceiveBody)
+    }
+
    pub async fn tenant_details(
        &self,
        tenant_shard_id: TenantShardId,
@@ -171,6 +190,25 @@ impl Client {
            .map_err(Error::ReceiveBody)
    }

+    /// The tenant deletion API can return 202 if deletion is incomplete, or
+    /// 404 if it is complete.  Callers are responsible for checking the status
+    /// code and retrying.  Error codes other than 404 will return Err().
+    pub async fn tenant_delete(&self, tenant_shard_id: TenantShardId) -> Result<StatusCode> {
+        let uri = format!("{}/v1/tenant/{tenant_shard_id}", self.mgmt_api_endpoint);
+
+        match self.request(Method::DELETE, &uri, ()).await {
+            Err(Error::ApiError(status_code, msg)) => {
+                if status_code == StatusCode::NOT_FOUND {
+                    Ok(StatusCode::NOT_FOUND)
+                } else {
+                    Err(Error::ApiError(status_code, msg))
+                }
+            }
+            Err(e) => Err(e),
+            Ok(response) => Ok(response.status()),
+        }
+    }
+
    pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
        let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
        self.request(Method::PUT, &uri, req).await?;
@@ -234,6 +272,32 @@ impl Client {
            .map_err(Error::ReceiveBody)
    }

+    /// The timeline deletion API can return 201 if deletion is incomplete, or
+    /// 404 if it is complete.  Callers are responsible for checking the status
+    /// code and retrying.  Error codes other than 404 will return Err().
+    pub async fn timeline_delete(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+    ) -> Result<StatusCode> {
+        let uri = format!(
+            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}",
+            self.mgmt_api_endpoint
+        );
+
+        match self.request(Method::DELETE, &uri, ()).await {
+            Err(Error::ApiError(status_code, msg)) => {
+                if status_code == StatusCode::NOT_FOUND {
+                    Ok(StatusCode::NOT_FOUND)
+                } else {
+                    Err(Error::ApiError(status_code, msg))
+                }
+            }
+            Err(e) => Err(e),
+            Ok(response) => Ok(response.status()),
+        }
+    }
+
    pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> {
        let uri = format!(
            "{}/v1/tenant/{}/reset",
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -97,23 +97,86 @@ pub enum EvictionOrder {

    /// Order the layers to be evicted by how recently they have been accessed relatively within
    /// the set of resident layers of a tenant.
-    ///
-    /// This strategy will evict layers more fairly but is untested.
    RelativeAccessed {
-        #[serde(default)]
+        /// Determines if the tenant with most layers should lose first.
+        ///
+        /// Having this enabled is currently the only reasonable option, because the order in which
+        /// we read tenants is deterministic. If we find the need to use this as `false`, we need
+        /// to ensure nondeterminism by adding in a random number to break the
+        /// `relative_last_activity==0.0` ties.
+        #[serde(default = "default_highest_layer_count_loses_first")]
        highest_layer_count_loses_first: bool,
    },
 }

+fn default_highest_layer_count_loses_first() -> bool {
+    true
+}
+
 impl EvictionOrder {
-    /// Return true, if with [`Self::RelativeAccessed`] order the tenants with the highest layer
-    /// counts should be the first ones to have their layers evicted.
-    fn highest_layer_count_loses_first(&self) -> bool {
+    fn sort(&self, candidates: &mut [(MinResidentSizePartition, EvictionCandidate)]) {
+        use EvictionOrder::*;
+
        match self {
-            EvictionOrder::AbsoluteAccessed => false,
-            EvictionOrder::RelativeAccessed {
+            AbsoluteAccessed => {
+                candidates.sort_unstable_by_key(|(partition, candidate)| {
+                    (*partition, candidate.last_activity_ts)
+                });
+            }
+            RelativeAccessed { .. } => candidates.sort_unstable_by_key(|(partition, candidate)| {
+                (*partition, candidate.relative_last_activity)
+            }),
+        }
+    }
+
+    /// Called to fill in the [`EvictionCandidate::relative_last_activity`] while iterating a tenant's
+    /// layers in **most** recently used order.
+    fn relative_last_activity(&self, total: usize, index: usize) -> finite_f32::FiniteF32 {
+        use EvictionOrder::*;
+
+        match self {
+            AbsoluteAccessed => finite_f32::FiniteF32::ZERO,
+            RelativeAccessed {
                highest_layer_count_loses_first,
-            } => *highest_layer_count_loses_first,
+            } => {
+                // keeping the -1 or not decides if every tenant should lose their least recently accessed
+                // layer OR if this should happen in the order of having highest layer count:
+                let fudge = if *highest_layer_count_loses_first {
+                    // relative_last_activity vs. tenant layer count:
+                    // - 0.1..=1.0 (10 layers)
+                    // - 0.01..=1.0 (100 layers)
+                    // - 0.001..=1.0 (1000 layers)
+                    //
+                    // leading to evicting less of the smallest tenants.
+                    0
+                } else {
+                    // use full 0.0..=1.0 range, which means even the smallest tenants could always lose a
+                    // layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could
+                    // be that less than 10k layer evictions is enough, so we would not need to evict from
+                    // all tenants.
+                    //
+                    // as the tenant ordering is now deterministic this could hit the same tenants
+                    // disproportionately on multiple invocations. alternative could be to remember how many
+                    // layers did we evict last time from this tenant, and inject that as an additional
+                    // fudge here.
+                    1
+                };
+
+                let total = total.checked_sub(fudge).filter(|&x| x > 1).unwrap_or(1);
+                let divider = total as f32;
+
+                // most recently used is always (total - 0) / divider == 1.0
+                // least recently used (index == n - 1, n being the `total` argument) depends on the fudge:
+                // - fudge == 1: ((n - 1) - (n - 1)) / (n - 1) => 0.0
+                // - fudge == 0:         (n - (n - 1)) / n     => 1 / n
+                let distance = (total - index) as f32;
+
+                finite_f32::FiniteF32::try_from_normalized(distance / divider)
+                    .unwrap_or_else(|val| {
+                        tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={index}, total={total}: {val}");
+                        finite_f32::FiniteF32::ZERO
+                    })
+            }
        }
    }
 }
@@ -389,52 +452,6 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(

    let selection = select_victims(&candidates, usage_pre);

-    let mut candidates = candidates;
-
-    let selection = if matches!(eviction_order, EvictionOrder::RelativeAccessed { .. }) {
-        // we currently have the layers ordered by AbsoluteAccessed so that we can get the summary
-        // for comparison here. this is a temporary measure to develop alternatives.
-        use std::fmt::Write;
-
-        let mut summary_buf = String::with_capacity(256);
-
-        {
-            let absolute_summary = candidates
-                .iter()
-                .take(selection.amount)
-                .map(|(_, candidate)| candidate)
-                .collect::<summary::EvictionSummary>();
-
-            write!(summary_buf, "{absolute_summary}").expect("string grows");
-
-            info!("absolute accessed selection summary: {summary_buf}");
-        }
-
-        candidates.sort_unstable_by_key(|(partition, candidate)| {
-            (*partition, candidate.relative_last_activity)
-        });
-
-        let selection = select_victims(&candidates, usage_pre);
-
-        {
-            summary_buf.clear();
-
-            let relative_summary = candidates
-                .iter()
-                .take(selection.amount)
-                .map(|(_, candidate)| candidate)
-                .collect::<summary::EvictionSummary>();
-
-            write!(summary_buf, "{relative_summary}").expect("string grows");
-
-            info!("relative accessed selection summary: {summary_buf}");
-        }
-
-        selection
-    } else {
-        selection
-    };
-
    let (evicted_amount, usage_planned) = selection.into_amount_and_planned();

    // phase2: evict layers
@@ -835,54 +852,12 @@ async fn collect_eviction_candidates(
            .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
        let mut cumsum: i128 = 0;

-        // keeping the -1 or not decides if every tenant should lose their least recently accessed
-        // layer OR if this should happen in the order of having highest layer count:
-        let fudge = if eviction_order.highest_layer_count_loses_first() {
-            // relative_age vs. tenant layer count:
-            // - 0.1..=1.0 (10 layers)
-            // - 0.01..=1.0 (100 layers)
-            // - 0.001..=1.0 (1000 layers)
-            //
-            // leading to evicting less of the smallest tenants.
-            0
-        } else {
-            // use full 0.0..=1.0 range, which means even the smallest tenants could always lose a
-            // layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could
-            // be that less than 10k layer evictions is enough, so we would not need to evict from
-            // all tenants.
-            //
-            // as the tenant ordering is now deterministic this could hit the same tenants
-            // disproportionetly on multiple invocations. alternative could be to remember how many
-            // layers did we evict last time from this tenant, and inject that as an additional
-            // fudge here.
-            1
-        };
-
-        let total = tenant_candidates
-            .len()
-            .checked_sub(fudge)
-            .filter(|&x| x > 0)
-            // support 0 or 1 resident layer tenants as well
-            .unwrap_or(1);
-        let divider = total as f32;
+        let total = tenant_candidates.len();

        for (i, mut candidate) in tenant_candidates.into_iter().enumerate() {
            // as we iterate this reverse sorted list, the most recently accessed layer will always
            // be 1.0; this is for us to evict it last.
-            candidate.relative_last_activity = if matches!(
-                eviction_order,
-                EvictionOrder::RelativeAccessed { .. }
-            ) {
-                // another possibility: use buckets, like (256.0 * relative_last_activity) as u8 or
-                // similarly for u16. unsure how it would help.
-                finite_f32::FiniteF32::try_from_normalized((total - i) as f32 / divider)
-                    .unwrap_or_else(|val| {
-                        tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={i}, total={total}: {val}");
-                        finite_f32::FiniteF32::ZERO
-                    })
-            } else {
-                finite_f32::FiniteF32::ZERO
-            };
+            candidate.relative_last_activity = eviction_order.relative_last_activity(total, i);

            let partition = if cumsum > min_resident_size as i128 {
                MinResidentSizePartition::Above
@@ -927,10 +902,7 @@ async fn collect_eviction_candidates(
    debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
        "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");

-    // always behave as if AbsoluteAccessed was selected. if RelativeAccessed is in use, we
-    // will sort later by candidate.relative_last_activity to get compare evictions.
-    candidates
-        .sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts));
+    eviction_order.sort(&mut candidates);

    Ok(EvictionCandidates::Finished(candidates))
 }
@@ -1070,6 +1042,12 @@ pub(crate) mod finite_f32 {
        }
    }

+    impl From<FiniteF32> for f32 {
+        fn from(value: FiniteF32) -> f32 {
+            value.0
+        }
+    }
+
    impl FiniteF32 {
        pub const ZERO: FiniteF32 = FiniteF32(0.0);

@@ -1082,136 +1060,9 @@ pub(crate) mod finite_f32 {
                Err(value)
            }
        }
-    }
-}

-mod summary {
-    use super::finite_f32::FiniteF32;
-    use super::{EvictionCandidate, LayerCount};
-    use pageserver_api::shard::TenantShardId;
-    use std::collections::{BTreeMap, HashMap};
-    use std::time::SystemTime;
-
-    #[derive(Debug, Default)]
-    pub(super) struct EvictionSummary {
-        evicted_per_tenant: HashMap<TenantShardId, LayerCount>,
-        total: LayerCount,
-
-        last_absolute: Option<SystemTime>,
-        last_relative: Option<FiniteF32>,
-    }
-
-    impl<'a> FromIterator<&'a EvictionCandidate> for EvictionSummary {
-        fn from_iter<T: IntoIterator<Item = &'a EvictionCandidate>>(iter: T) -> Self {
-            let mut summary = EvictionSummary::default();
-            for item in iter {
-                let counts = summary
-                    .evicted_per_tenant
-                    .entry(*item.layer.get_tenant_shard_id())
-                    .or_default();
-
-                let sz = item.layer.get_file_size();
-
-                counts.file_sizes += sz;
-                counts.count += 1;
-
-                summary.total.file_sizes += sz;
-                summary.total.count += 1;
-
-                summary.last_absolute = Some(item.last_activity_ts);
-                summary.last_relative = Some(item.relative_last_activity);
-            }
-
-            summary
-        }
-    }
-
-    struct SiBytesAmount(u64);
-
-    impl std::fmt::Display for SiBytesAmount {
-        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-            if self.0 < 1024 {
-                return write!(f, "{}B", self.0);
-            }
-
-            let mut tmp = self.0;
-            let mut ch = 0;
-            let suffixes = b"KMGTPE";
-
-            while tmp > 1024 * 1024 && ch < suffixes.len() - 1 {
-                tmp /= 1024;
-                ch += 1;
-            }
-
-            let ch = suffixes[ch] as char;
-
-            write!(f, "{:.1}{ch}iB", tmp as f64 / 1024.0)
-        }
-    }
-
-    impl std::fmt::Display for EvictionSummary {
-        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-            // wasteful, but it's for testing
-
-            let mut sorted: BTreeMap<usize, Vec<(TenantShardId, u64)>> = BTreeMap::new();
-
-            for (tenant_shard_id, count) in &self.evicted_per_tenant {
-                sorted
-                    .entry(count.count)
-                    .or_default()
-                    .push((*tenant_shard_id, count.file_sizes));
-            }
-
-            let total_file_sizes = SiBytesAmount(self.total.file_sizes);
-
-            writeln!(
-                f,
-                "selected {} layers of {total_file_sizes} up to ({:?}, {:.2?}):",
-                self.total.count, self.last_absolute, self.last_relative,
-            )?;
-
-            for (count, per_tenant) in sorted.iter().rev().take(10) {
-                write!(f, "- {count} layers: ")?;
-
-                if per_tenant.len() < 3 {
-                    for (i, (tenant_shard_id, bytes)) in per_tenant.iter().enumerate() {
-                        if i > 0 {
-                            write!(f, ", ")?;
-                        }
-                        let bytes = SiBytesAmount(*bytes);
-                        write!(f, "{tenant_shard_id} ({bytes})")?;
-                    }
-                } else {
-                    let num_tenants = per_tenant.len();
-                    let total_bytes = per_tenant.iter().map(|(_id, bytes)| bytes).sum::<u64>();
-                    let total_bytes = SiBytesAmount(total_bytes);
-                    let layers = num_tenants * count;
-
-                    write!(
-                        f,
-                        "{num_tenants} tenants {total_bytes} in total {layers} layers",
-                    )?;
-                }
-
-                writeln!(f)?;
-            }
-
-            if sorted.len() > 10 {
-                let (rem_count, rem_bytes) = sorted
-                    .iter()
-                    .rev()
-                    .map(|(count, per_tenant)| {
-                        (
-                            count,
-                            per_tenant.iter().map(|(_id, bytes)| bytes).sum::<u64>(),
-                        )
-                    })
-                    .fold((0, 0), |acc, next| (acc.0 + next.0, acc.1 + next.1));
-                let rem_bytes = SiBytesAmount(rem_bytes);
-                writeln!(f, "- rest of tenants ({}) not shown ({rem_count} layers or {:.1}%, {rem_bytes} or {:.1}% bytes)", sorted.len() - 10, 100.0 * rem_count as f64 / self.total.count as f64, 100.0 * rem_bytes.0 as f64 / self.total.file_sizes as f64)?;
-            }
-
-            Ok(())
+        pub fn into_inner(self) -> f32 {
+            self.into()
        }
    }
 }
@@ -1336,3 +1187,40 @@ mod filesystem_level_usage {
        assert!(!usage.has_pressure());
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn relative_equal_bounds() {
+        let order = EvictionOrder::RelativeAccessed {
+            highest_layer_count_loses_first: false,
+        };
+
+        let len = 10;
+        let v = (0..len)
+            .map(|i| order.relative_last_activity(len, i).into_inner())
+            .collect::<Vec<_>>();
+
+        assert_eq!(v.first(), Some(&1.0));
+        assert_eq!(v.last(), Some(&0.0));
+        assert!(v.windows(2).all(|slice| slice[0] > slice[1]));
+    }
+
+    #[test]
+    fn relative_spare_bounds() {
+        let order = EvictionOrder::RelativeAccessed {
+            highest_layer_count_loses_first: true,
+        };
+
+        let len = 10;
+        let v = (0..len)
+            .map(|i| order.relative_last_activity(len, i).into_inner())
+            .collect::<Vec<_>>();
+
+        assert_eq!(v.first(), Some(&1.0));
+        assert_eq!(v.last(), Some(&0.1));
+        assert!(v.windows(2).all(|slice| slice[0] > slice[1]));
+    }
+}
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -419,12 +419,6 @@ paths:
            type: string
            format: date-time
          description: A timestamp to get the LSN
-        - name: version
-          in: query
-          required: false
-          schema:
-            type: integer
-          description: The version of the endpoint to use
      responses:
        "200":
          description: OK
@@ -674,6 +668,10 @@ paths:
      responses:
        "200":
          description: Tenant is now in requested state
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/TenantLocationConfigResponse"
        "503":
          description: Tenant's state cannot be changed right now.  Wait a few seconds and retry.
          content:
@@ -1426,6 +1424,28 @@ components:
          $ref: '#/components/schemas/SecondaryConfig'
        tenant_conf:
          $ref: '#/components/schemas/TenantConfig'
+    TenantLocationConfigResponse:
+      type: object
+      required:
+        - shards
+      properties:
+        shards:
+          description: Pageservers where this tenant's shards are attached.  Not populated for secondary locations.
+          type: array
+          items:
+            $ref: "#/components/schemas/TenantShardLocation"
+    TenantShardLocation:
+      type: object
+      required:
+        - node_id
+        - shard_id
+      properties:
+        node_id:
+          description: Pageserver node ID where this shard is attached
+          type: integer
+        shard_id:
+          description: Tenant shard ID of the shard
+          type: string
    SecondaryConfig:
      type: object
      properties:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -17,6 +17,8 @@ use metrics::launch_timestamp::LaunchTimestamp;
 use pageserver_api::models::LocationConfigListResponse;
 use pageserver_api::models::ShardParameters;
 use pageserver_api::models::TenantDetails;
+use pageserver_api::models::TenantLocationConfigResponse;
+use pageserver_api::models::TenantShardLocation;
 use pageserver_api::models::TenantState;
 use pageserver_api::models::{
    DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
@@ -1356,7 +1358,7 @@ async fn put_tenant_location_config_handler(
    let location_conf =
        LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?;

-    state
+    let attached = state
        .tenant_manager
        .upsert_location(
            tenant_shard_id,
@@ -1365,7 +1367,8 @@ async fn put_tenant_location_config_handler(
            tenant::SpawnMode::Normal,
            &ctx,
        )
-        .await?;
+        .await?
+        .is_some();

    if let Some(_flush_ms) = flush {
        match state
@@ -1384,7 +1387,18 @@ async fn put_tenant_location_config_handler(
        tracing::info!("No flush requested when configuring");
    }

-    json_response(StatusCode::OK, ())
+    // This API returns a vector of pageservers where the tenant is attached: this is
+    // primarily for use in the sharding service.  For compatibility, we also return this
+    // when called directly on a pageserver, but the payload is always zero or one shards.
+    let mut response = TenantLocationConfigResponse { shards: Vec::new() };
+    if attached {
+        response.shards.push(TenantShardLocation {
+            shard_id: tenant_shard_id,
+            node_id: state.conf.id,
+        })
+    }
+
+    json_response(StatusCode::OK, response)
 }

 async fn list_location_config_handler(
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -368,6 +368,16 @@ impl From<WaitLsnError> for PageStreamError {
    }
 }

+impl From<WaitLsnError> for QueryError {
+    fn from(value: WaitLsnError) -> Self {
+        match value {
+            e @ WaitLsnError::Timeout(_) => Self::Other(anyhow::Error::new(e)),
+            WaitLsnError::Shutdown => Self::Shutdown,
+            WaitLsnError::BadState => Self::Reconnect,
+        }
+    }
+}
+
 impl PageServerHandler {
    pub fn new(
        conf: &'static PageServerConf,
@@ -1139,7 +1149,7 @@ impl PageServerHandler {
        full_backup: bool,
        gzip: bool,
        ctx: RequestContext,
-    ) -> anyhow::Result<()>
+    ) -> Result<(), QueryError>
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
    {
@@ -1404,7 +1414,7 @@ where
                    )
                    .await?;
                    pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
-                    anyhow::Ok(())
+                    Result::<(), QueryError>::Ok(())
                },
            )
            .await?;
@@ -1678,6 +1688,7 @@ impl From<GetActiveTenantError> for QueryError {
            | GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => {
                QueryError::Shutdown
            }
+            e @ GetActiveTenantError::NotFound(_) => QueryError::NotFound(format!("{e}").into()),
            e => QueryError::Other(anyhow::anyhow!(e)),
        }
    }
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3778,6 +3778,11 @@ async fn run_initdb(
        .env_clear()
        .env("LD_LIBRARY_PATH", &initdb_lib_dir)
        .env("DYLD_LIBRARY_PATH", &initdb_lib_dir)
+        .stdin(std::process::Stdio::null())
+        // stdout is the same on every invocation, so we don't need it
+        .stdout(std::process::Stdio::null())
+        // we would be interested in the stderr output, if there was any
+        .stderr(std::process::Stdio::piped())
        .spawn()?;

    // Ideally we'd select here with the cancellation token, but the problem is that
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -51,7 +51,10 @@ use crate::keyspace::KeyPartitioning;
 use crate::repository::Key;
 use crate::tenant::storage_layer::InMemoryLayer;
 use anyhow::Result;
-use std::collections::VecDeque;
+use pageserver_api::keyspace::KeySpaceAccum;
+use std::cmp::Ordering;
+use std::collections::{BTreeMap, VecDeque};
+use std::iter::Peekable;
 use std::ops::Range;
 use std::sync::Arc;
 use utils::lsn::Lsn;
@@ -144,11 +147,221 @@ impl Drop for BatchedUpdates<'_> {
 }

 /// Return value of LayerMap::search
+#[derive(Eq, PartialEq, Debug)]
 pub struct SearchResult {
    pub layer: Arc<PersistentLayerDesc>,
    pub lsn_floor: Lsn,
 }

+pub struct OrderedSearchResult(SearchResult);
+
+impl Ord for OrderedSearchResult {
+    fn cmp(&self, other: &Self) -> Ordering {
+        self.0.lsn_floor.cmp(&other.0.lsn_floor)
+    }
+}
+
+impl PartialOrd for OrderedSearchResult {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl PartialEq for OrderedSearchResult {
+    fn eq(&self, other: &Self) -> bool {
+        self.0.lsn_floor == other.0.lsn_floor
+    }
+}
+
+impl Eq for OrderedSearchResult {}
+
+pub struct RangeSearchResult {
+    pub found: BTreeMap<OrderedSearchResult, KeySpaceAccum>,
+    pub not_found: KeySpaceAccum,
+}
+
+impl RangeSearchResult {
+    fn new() -> Self {
+        Self {
+            found: BTreeMap::new(),
+            not_found: KeySpaceAccum::new(),
+        }
+    }
+}
+
+/// Collector for results of range search queries on the LayerMap.
+/// It should be provided with two iterators for the delta and image coverage
+/// that contain all the changes for layers which intersect the range.
+struct RangeSearchCollector<Iter>
+where
+    Iter: Iterator<Item = (i128, Option<Arc<PersistentLayerDesc>>)>,
+{
+    delta_coverage: Peekable<Iter>,
+    image_coverage: Peekable<Iter>,
+    key_range: Range<Key>,
+    end_lsn: Lsn,
+
+    current_delta: Option<Arc<PersistentLayerDesc>>,
+    current_image: Option<Arc<PersistentLayerDesc>>,
+
+    result: RangeSearchResult,
+}
+
+#[derive(Debug)]
+enum NextLayerType {
+    Delta(i128),
+    Image(i128),
+    Both(i128),
+}
+
+impl NextLayerType {
+    /// Key at which the coverage change described by this variant takes place.
+    fn next_change_at_key(&self) -> Key {
+        let at = match self {
+            NextLayerType::Delta(at) | NextLayerType::Image(at) | NextLayerType::Both(at) => at,
+        };
+        Key::from_i128(*at)
+    }
+}
+
+impl<Iter> RangeSearchCollector<Iter>
+where
+    Iter: Iterator<Item = (i128, Option<Arc<PersistentLayerDesc>>)>,
+{
+    fn new(
+        key_range: Range<Key>,
+        end_lsn: Lsn,
+        delta_coverage: Iter,
+        image_coverage: Iter,
+    ) -> Self {
+        Self {
+            delta_coverage: delta_coverage.peekable(),
+            image_coverage: image_coverage.peekable(),
+            key_range,
+            end_lsn,
+            current_delta: None,
+            current_image: None,
+            result: RangeSearchResult::new(),
+        }
+    }
+
+    /// Run the collector. Collection is implemented via a two pointer algorithm.
+    /// One pointer tracks the start of the current range and the other tracks
+    /// the beginning of the next range which will overlap with the next change
+    /// in coverage across both image and delta.
+    fn collect(mut self) -> RangeSearchResult {
+        let next_layer_type = self.choose_next_layer_type();
+        let mut current_range_start = match next_layer_type {
+            None => {
+                // No changes for the range
+                self.pad_range(self.key_range.clone());
+                return self.result;
+            }
+            Some(layer_type) if self.key_range.end <= layer_type.next_change_at_key() => {
+                // Changes only after the end of the range
+                self.pad_range(self.key_range.clone());
+                return self.result;
+            }
+            Some(layer_type) => {
+                // Changes for the range exist. Record anything before the first
+                // coverage change as not found.
+                let coverage_start = layer_type.next_change_at_key();
+                let range_before = self.key_range.start..coverage_start;
+                self.pad_range(range_before);
+
+                self.advance(&layer_type);
+                coverage_start
+            }
+        };
+
+        while current_range_start < self.key_range.end {
+            let next_layer_type = self.choose_next_layer_type();
+            match next_layer_type {
+                Some(t) => {
+                    let current_range_end = t.next_change_at_key();
+                    self.add_range(current_range_start..current_range_end);
+                    current_range_start = current_range_end;
+
+                    self.advance(&t);
+                }
+                None => {
+                    self.add_range(current_range_start..self.key_range.end);
+                    current_range_start = self.key_range.end;
+                }
+            }
+        }
+
+        self.result
+    }
+
+    /// Mark a range as not found (i.e. no layers intersect it)
+    fn pad_range(&mut self, key_range: Range<Key>) {
+        if !key_range.is_empty() {
+            self.result.not_found.add_range(key_range);
+        }
+    }
+
+    /// Select the appropriate layer for the given range and update
+    /// the collector.
+    fn add_range(&mut self, covered_range: Range<Key>) {
+        let delta = self.current_delta.clone();
+        let image = self.current_image.clone();
+
+        // Nothing selected means that no layer covers the range: record it as not found.
+        let Some(search_result) = LayerMap::select_layer(delta, image, self.end_lsn) else {
+            self.pad_range(covered_range);
+            return;
+        };
+
+        // Accumulate the range under the entry of the selected search result.
+        self.result
+            .found
+            .entry(OrderedSearchResult(search_result))
+            .or_default()
+            .add_range(covered_range);
+    }
+
+    /// Move to the next coverage change.
+    /// Panics if an iterator selected by `layer_type` is already exhausted.
+    fn advance(&mut self, layer_type: &NextLayerType) {
+        // `Both` consumes one change from each coverage iterator.
+        let (take_image, take_delta) = match layer_type {
+            NextLayerType::Delta(_) => (false, true),
+            NextLayerType::Image(_) => (true, false),
+            NextLayerType::Both(_) => (true, true),
+        };
+
+        if take_image {
+            let (_, image_layer) = self.image_coverage.next().unwrap();
+            self.current_image = image_layer;
+        }
+
+        if take_delta {
+            let (_, delta_layer) = self.delta_coverage.next().unwrap();
+            self.current_delta = delta_layer;
+        }
+    }
+
+    /// Pick the next coverage change: the one at the lesser key or both if they're aligned.
+    /// Returns `None` once both coverage iterators have no further changes.
+    fn choose_next_layer_type(&mut self) -> Option<NextLayerType> {
+        let delta_at = self.delta_coverage.peek().map(|(at, _)| *at);
+        let image_at = self.image_coverage.peek().map(|(at, _)| *at);
+
+        match (delta_at, image_at) {
+            (None, None) => None,
+            (Some(delta_at), None) => Some(NextLayerType::Delta(delta_at)),
+            (None, Some(image_at)) => Some(NextLayerType::Image(image_at)),
+            (Some(delta_at), Some(image_at)) => Some(match delta_at.cmp(&image_at) {
+                // Same key: both coverages change together.
+                Ordering::Equal => NextLayerType::Both(delta_at),
+                Ordering::Less => NextLayerType::Delta(delta_at),
+                Ordering::Greater => NextLayerType::Image(image_at),
+            }),
+        }
+    }
+}
+
 impl LayerMap {
    ///
    /// Find the latest layer (by lsn.end) that covers the given
@@ -186,7 +399,18 @@ impl LayerMap {
        let latest_delta = version.delta_coverage.query(key.to_i128());
        let latest_image = version.image_coverage.query(key.to_i128());

-        match (latest_delta, latest_image) {
+        Self::select_layer(latest_delta, latest_image, end_lsn)
+    }
+
+    fn select_layer(
+        delta_layer: Option<Arc<PersistentLayerDesc>>,
+        image_layer: Option<Arc<PersistentLayerDesc>>,
+        end_lsn: Lsn,
+    ) -> Option<SearchResult> {
+        assert!(delta_layer.as_ref().map_or(true, |l| l.is_delta()));
+        assert!(image_layer.as_ref().map_or(true, |l| !l.is_delta()));
+
+        match (delta_layer, image_layer) {
            (None, None) => None,
            (None, Some(image)) => {
                let lsn_floor = image.get_lsn_range().start;
@@ -223,6 +447,17 @@ impl LayerMap {
        }
    }

+    /// Range counterpart of [`LayerMap::search`]: resolves the layers for all of `key_range`.
+    /// Returns `None` when `get_version` yields nothing for `end_lsn - 1`.
+    pub fn range_search(&self, key_range: Range<Key>, end_lsn: Lsn) -> Option<RangeSearchResult> {
+        let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?;
+        let raw = key_range.start.to_i128()..key_range.end.to_i128();
+        let deltas = version.delta_coverage.range_overlaps(&raw);
+        let images = version.image_coverage.range_overlaps(&raw);
+
+        Some(RangeSearchCollector::new(key_range, end_lsn, deltas, images).collect())
+    }
+
    /// Start a batch of updates, applied on drop
    pub fn batch_update(&mut self) -> BatchedUpdates<'_> {
        BatchedUpdates { layer_map: self }
@@ -631,3 +866,126 @@ impl LayerMap {
        Ok(())
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[derive(Clone)]
+    struct LayerDesc {
+        key_range: Range<Key>,
+        lsn_range: Range<Lsn>,
+        is_delta: bool,
+    }
+
+    fn create_layer_map(layers: Vec<LayerDesc>) -> LayerMap {
+        let mut layer_map = LayerMap::default();
+
+        for layer in layers {
+            layer_map.insert_historic_noflush(PersistentLayerDesc::new_test(
+                layer.key_range,
+                layer.lsn_range,
+                layer.is_delta,
+            ));
+        }
+
+        layer_map.flush_updates();
+        layer_map
+    }
+
+    fn assert_range_search_result_eq(lhs: RangeSearchResult, rhs: RangeSearchResult) {
+        assert_eq!(lhs.not_found.to_keyspace(), rhs.not_found.to_keyspace());
+        let lhs: Vec<_> = lhs
+            .found
+            .into_iter()
+            .map(|(search_result, accum)| (search_result.0, accum.to_keyspace()))
+            .collect();
+        let rhs: Vec<_> = rhs
+            .found
+            .into_iter()
+            .map(|(search_result, accum)| (search_result.0, accum.to_keyspace()))
+            .collect();
+
+        assert_eq!(lhs, rhs);
+    }
+
+    fn brute_force_range_search(
+        layer_map: &LayerMap,
+        key_range: Range<Key>,
+        end_lsn: Lsn,
+    ) -> RangeSearchResult {
+        let mut range_search_result = RangeSearchResult::new();
+
+        let mut key = key_range.start;
+        while key != key_range.end {
+            let res = layer_map.search(key, end_lsn);
+            match res {
+                Some(res) => {
+                    range_search_result
+                        .found
+                        .entry(OrderedSearchResult(res))
+                        .or_default()
+                        .add_key(key);
+                }
+                None => {
+                    range_search_result.not_found.add_key(key);
+                }
+            }
+
+            key = key.next();
+        }
+
+        range_search_result
+    }
+
+    #[test]
+    fn ranged_search_on_empty_layer_map() {
+        let layer_map = LayerMap::default();
+        let range = Key::from_i128(100)..Key::from_i128(200);
+
+        let res = layer_map.range_search(range, Lsn(100));
+        assert!(res.is_none());
+    }
+
+    #[test]
+    fn ranged_search() {
+        let layers = vec![
+            LayerDesc {
+                key_range: Key::from_i128(15)..Key::from_i128(50),
+                lsn_range: Lsn(0)..Lsn(5),
+                is_delta: false,
+            },
+            LayerDesc {
+                key_range: Key::from_i128(10)..Key::from_i128(20),
+                lsn_range: Lsn(5)..Lsn(20),
+                is_delta: true,
+            },
+            LayerDesc {
+                key_range: Key::from_i128(15)..Key::from_i128(25),
+                lsn_range: Lsn(20)..Lsn(30),
+                is_delta: true,
+            },
+            LayerDesc {
+                key_range: Key::from_i128(35)..Key::from_i128(40),
+                lsn_range: Lsn(25)..Lsn(35),
+                is_delta: true,
+            },
+            LayerDesc {
+                key_range: Key::from_i128(35)..Key::from_i128(40),
+                lsn_range: Lsn(35)..Lsn(40),
+                is_delta: false,
+            },
+        ];
+
+        let layer_map = create_layer_map(layers.clone());
+        for start in 0..60 {
+            for end in (start + 1)..60 {
+                let range = Key::from_i128(start)..Key::from_i128(end);
+                let result = layer_map.range_search(range.clone(), Lsn(100)).unwrap();
+                let expected = brute_force_range_search(&layer_map, range, Lsn(100));
+
+                assert_range_search_result_eq(result, expected);
+            }
+        }
+    }
+}
--- a/pageserver/src/tenant/layer_map/layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/layer_coverage.rs
@@ -129,6 +129,42 @@ impl<Value: Clone> LayerCoverage<Value> {
            .map(|(k, v)| (*k, v.as_ref().map(|x| x.1.clone())))
    }

+    /// Returns an iterator which includes all coverage changes for layers that intersect
+    /// with the provided range.
+    pub fn range_overlaps(
+        &self,
+        key_range: &Range<i128>,
+    ) -> impl Iterator<Item = (i128, Option<Value>)> + '_
+    where
+        Value: Eq,
+    {
+        let first_change = self.query(key_range.start);
+        match first_change {
+            Some(change) => {
+                // If the start of the range is covered, we have to deal with two cases:
+                // 1. Start of the range is aligned with the start of a layer.
+                // In this case the return of `self.range` will contain the layer which aligns with the start of the key range.
+                // We advance said iterator to avoid duplicating the first change.
+                // 2. Start of the range is not aligned with the start of a layer.
+                let range = key_range.start..key_range.end;
+                let mut range_coverage = self.range(range).peekable();
+                if range_coverage
+                    .peek()
+                    .is_some_and(|c| c.1.as_ref() == Some(&change))
+                {
+                    range_coverage.next();
+                }
+                itertools::Either::Left(
+                    std::iter::once((key_range.start, Some(change))).chain(range_coverage),
+                )
+            }
+            None => {
+                let range = key_range.start..key_range.end;
+                let coverage = self.range(range);
+                itertools::Either::Right(coverage)
+            }
+        }
+    }
    /// O(1) clone
    pub fn clone(&self) -> Self {
        Self {
--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -55,13 +55,13 @@ impl PersistentLayerDesc {
    }

    #[cfg(test)]
-    pub fn new_test(key_range: Range<Key>) -> Self {
+    pub fn new_test(key_range: Range<Key>, lsn_range: Range<Lsn>, is_delta: bool) -> Self {
        Self {
            tenant_shard_id: TenantShardId::unsharded(TenantId::generate()),
            timeline_id: TimelineId::generate(),
            key_range,
-            lsn_range: Lsn(0)..Lsn(1),
-            is_delta: false,
+            lsn_range,
+            is_delta,
            file_size: 0,
        }
    }
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -1363,16 +1363,22 @@ impl WalIngest {
            self.checkpoint.nextMultiOffset = xlrec.moff + xlrec.nmembers;
            self.checkpoint_modified = true;
        }
-        let max_mbr_xid = xlrec.members.iter().fold(0u32, |acc, mbr| {
-            if mbr.xid.wrapping_sub(acc) as i32 > 0 {
-                mbr.xid
+        let max_mbr_xid = xlrec.members.iter().fold(None, |acc, mbr| {
+            if let Some(max_xid) = acc {
+                if mbr.xid.wrapping_sub(max_xid) as i32 > 0 {
+                    Some(mbr.xid)
+                } else {
+                    acc
+                }
            } else {
-                acc
+                Some(mbr.xid)
            }
        });

-        if self.checkpoint.update_next_xid(max_mbr_xid) {
-            self.checkpoint_modified = true;
+        if let Some(max_xid) = max_mbr_xid {
+            if self.checkpoint.update_next_xid(max_xid) {
+                self.checkpoint_modified = true;
+            }
        }
        Ok(())
    }
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -93,7 +93,7 @@ struct ProcessOutput {
 pub struct PostgresRedoManager {
    tenant_shard_id: TenantShardId,
    conf: &'static PageServerConf,
-    last_redo_at: std::sync::Mutex<Option<Instant>>,
+    last_successful_redo_at: std::sync::Mutex<Option<Instant>>,
    redo_process: RwLock<Option<Arc<WalRedoProcess>>>,
 }

@@ -193,7 +193,7 @@ impl PostgresRedoManager {
        PostgresRedoManager {
            tenant_shard_id,
            conf,
-            last_redo_at: std::sync::Mutex::default(),
+            last_successful_redo_at: std::sync::Mutex::default(),
            redo_process: RwLock::new(None),
        }
    }
@@ -202,9 +202,21 @@ impl PostgresRedoManager {
    /// rely on our owner calling this function periodically in its own housekeeping
    /// loops.
    pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) {
-        if let Ok(g) = self.last_redo_at.try_lock() {
-            if let Some(last_redo_at) = *g {
-                if last_redo_at.elapsed() >= idle_timeout {
+        if let Ok(g) = self.last_successful_redo_at.try_lock() {
+            if let Some(last_successful_redo_at) = *g {
+                // Kill the walredo process if
+                // - it has been unused for `idle_timeout`
+                // - it has been used, but, without success.
+                // The former is just good housekeeping.
+                // The latter adds robustness for the case where something is wrong
+                // with the walredo process.
+                //
+                // Note that we don't want to kill the process immediately on each redo failure.
+                // The reason is that the redo failure could be caused by corrupted or malicious data.
+                // We don't want to get into a kill-respawn loop in that case.
+                // So, we piggy-back on the quiescing mechanism,
+                // resulting in a max kill-respawn frequency of `1/idle_timeout`.
+                if last_successful_redo_at.elapsed() >= idle_timeout {
                    drop(g);
                    let mut guard = self.redo_process.write().unwrap();
                    *guard = None;
@@ -227,8 +239,32 @@ impl PostgresRedoManager {
        wal_redo_timeout: Duration,
        pg_version: u32,
    ) -> anyhow::Result<Bytes> {
-        *(self.last_redo_at.lock().unwrap()) = Some(Instant::now());
+        let res = self.apply_batch_postgres0(
+            key,
+            lsn,
+            base_img,
+            base_img_lsn,
+            records,
+            wal_redo_timeout,
+            pg_version,
+        );
+        if res.is_ok() {
+            *self.last_successful_redo_at.lock().unwrap() = Some(Instant::now());
+        }
+        res
+    }

+    #[allow(clippy::too_many_arguments)]
+    fn apply_batch_postgres0(
+        &self,
+        key: Key,
+        lsn: Lsn,
+        base_img: Option<Bytes>,
+        base_img_lsn: Lsn,
+        records: &[(Lsn, NeonWalRecord)],
+        wal_redo_timeout: Duration,
+        pg_version: u32,
+    ) -> anyhow::Result<Bytes> {
        let (rel, blknum) = key_to_rel_block(key).context("invalid record")?;
        const MAX_RETRY_ATTEMPTS: u32 = 1;
        let mut n_attempts = 0u32;
--- a/patches/pgvector.patch
+++ b/patches/pgvector.patch
@@ -0,0 +1,56 @@
+From 5518a806a70e7f40d5054a762ccda7d5e6b0d31c Mon Sep 17 00:00:00 2001
+From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
+Date: Tue, 30 Jan 2024 14:33:00 +0200
+Subject: [PATCH] Make v0.6.0 work with Neon
+
+Now that the WAL-logging happens as a separate step at the end of the
+build, we need a few neon-specific hints to make it work.
+---
+ src/hnswbuild.c | 28 ++++++++++++++++++++++++++++
+ 1 file changed, 28 insertions(+)
+
+diff --git a/src/hnswbuild.c b/src/hnswbuild.c
+index 680789ba9044900eac9321844ee2a808a4a2ed12..41c5b709bcb2367ac8b8c498788ecac4c1148b74 100644
+--- a/src/hnswbuild.c
+++ b/src/hnswbuild.c
+@@ -1089,13 +1089,41 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
+ 	SeedRandom(42);
+ #endif
+
+#ifdef NEON_SMGR
+	smgr_start_unlogged_build(index->rd_smgr);
+#endif
+
+ 	InitBuildState(buildstate, heap, index, indexInfo, forkNum);
+
+ 	BuildGraph(buildstate, forkNum);
+
+#ifdef NEON_SMGR
+	smgr_finish_unlogged_build_phase_1(index->rd_smgr);
+#endif
+
+ 	if (RelationNeedsWAL(index))
+	{
+ 		log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocks(index), true);
+
+#ifdef NEON_SMGR
+		{
+#if PG_VERSION_NUM >= 160000
+			RelFileLocator rlocator = index->rd_smgr->smgr_rlocator.locator;
+#else
+			RelFileNode rlocator = index->rd_smgr->smgr_rnode.node;
+#endif
+
+			SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator,
+										   MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
+			SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM);
+		}
+#endif
+	}
+
+#ifdef NEON_SMGR
+	smgr_end_unlogged_build(index->rd_smgr);
+#endif
+
+ 	FreeBuildState(buildstate);
+ }
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -328,18 +328,14 @@ pageserver_connect(shardno_t shard_no, int elevel)

 	now = GetCurrentTimestamp();
 	us_since_last_connect = now - last_connect_time;
-	if (us_since_last_connect < delay_us)
+	if (us_since_last_connect < MAX_RECONNECT_INTERVAL_USEC)
 	{
-		pg_usleep(delay_us - us_since_last_connect);
+		pg_usleep(delay_us);
 		delay_us *= 2;
-		if (delay_us > MAX_RECONNECT_INTERVAL_USEC)
-			delay_us = MAX_RECONNECT_INTERVAL_USEC;
-		last_connect_time = GetCurrentTimestamp();
 	}
 	else
 	{
 		delay_us = MIN_RECONNECT_INTERVAL_USEC;
-		last_connect_time = now;
 	}

 	/*
@@ -366,6 +362,7 @@ pageserver_connect(shardno_t shard_no, int elevel)
 	values[n] = NULL;
 	n++;
 	conn = PQconnectdbParams(keywords, values, 1);
+	last_connect_time = GetCurrentTimestamp();

 	if (PQstatus(conn) == CONNECTION_BAD)
 	{
--- a/poetry.lock
+++ b/poetry.lock
@@ -2,87 +2,87 @@

 [[package]]
 name = "aiohttp"
-version = "3.9.0"
+version = "3.9.2"
 description = "Async http client/server framework (asyncio)"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "aiohttp-3.9.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:6896b8416be9ada4d22cd359d7cb98955576ce863eadad5596b7cdfbf3e17c6c"},
-    {file = "aiohttp-3.9.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1736d87dad8ef46a8ec9cddd349fa9f7bd3a064c47dd6469c0d6763d3d49a4fc"},
-    {file = "aiohttp-3.9.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8c9e5f4d7208cda1a2bb600e29069eecf857e6980d0ccc922ccf9d1372c16f4b"},
-    {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8488519aa05e636c5997719fe543c8daf19f538f4fa044f3ce94bee608817cff"},
-    {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5ab16c254e2312efeb799bc3c06897f65a133b38b69682bf75d1f1ee1a9c43a9"},
-    {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7a94bde005a8f926d0fa38b88092a03dea4b4875a61fbcd9ac6f4351df1b57cd"},
-    {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b777c9286b6c6a94f50ddb3a6e730deec327e9e2256cb08b5530db0f7d40fd8"},
-    {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:571760ad7736b34d05597a1fd38cbc7d47f7b65deb722cb8e86fd827404d1f6b"},
-    {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:deac0a32aec29608eb25d730f4bc5a261a65b6c48ded1ed861d2a1852577c932"},
-    {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:4ee1b4152bc3190cc40ddd6a14715e3004944263ea208229ab4c297712aa3075"},
-    {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:3607375053df58ed6f23903aa10cf3112b1240e8c799d243bbad0f7be0666986"},
-    {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:65b0a70a25456d329a5e1426702dde67be0fb7a4ead718005ba2ca582d023a94"},
-    {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:5a2eb5311a37fe105aa35f62f75a078537e1a9e4e1d78c86ec9893a3c97d7a30"},
-    {file = "aiohttp-3.9.0-cp310-cp310-win32.whl", hash = "sha256:2cbc14a13fb6b42d344e4f27746a4b03a2cb0c1c3c5b932b0d6ad8881aa390e3"},
-    {file = "aiohttp-3.9.0-cp310-cp310-win_amd64.whl", hash = "sha256:ac9669990e2016d644ba8ae4758688534aabde8dbbc81f9af129c3f5f01ca9cd"},
-    {file = "aiohttp-3.9.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:f8e05f5163528962ce1d1806fce763ab893b1c5b7ace0a3538cd81a90622f844"},
-    {file = "aiohttp-3.9.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4afa8f71dba3a5a2e1e1282a51cba7341ae76585345c43d8f0e624882b622218"},
-    {file = "aiohttp-3.9.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f929f4c9b9a00f3e6cc0587abb95ab9c05681f8b14e0fe1daecfa83ea90f8318"},
-    {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28185e36a78d247c55e9fbea2332d16aefa14c5276a582ce7a896231c6b1c208"},
-    {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a486ddf57ab98b6d19ad36458b9f09e6022de0381674fe00228ca7b741aacb2f"},
-    {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:70e851f596c00f40a2f00a46126c95c2e04e146015af05a9da3e4867cfc55911"},
-    {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c5b7bf8fe4d39886adc34311a233a2e01bc10eb4e842220235ed1de57541a896"},
-    {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c67a51ea415192c2e53e4e048c78bab82d21955b4281d297f517707dc836bf3d"},
-    {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:694df243f394629bcae2d8ed94c589a181e8ba8604159e6e45e7b22e58291113"},
-    {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:3dd8119752dd30dd7bca7d4bc2a92a59be6a003e4e5c2cf7e248b89751b8f4b7"},
-    {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:eb6dfd52063186ac97b4caa25764cdbcdb4b10d97f5c5f66b0fa95052e744eb7"},
-    {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:d97c3e286d0ac9af6223bc132dc4bad6540b37c8d6c0a15fe1e70fb34f9ec411"},
-    {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:816f4db40555026e4cdda604a1088577c1fb957d02f3f1292e0221353403f192"},
-    {file = "aiohttp-3.9.0-cp311-cp311-win32.whl", hash = "sha256:3abf0551874fecf95f93b58f25ef4fc9a250669a2257753f38f8f592db85ddea"},
-    {file = "aiohttp-3.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:e18d92c3e9e22553a73e33784fcb0ed484c9874e9a3e96c16a8d6a1e74a0217b"},
-    {file = "aiohttp-3.9.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:99ae01fb13a618b9942376df77a1f50c20a281390dad3c56a6ec2942e266220d"},
-    {file = "aiohttp-3.9.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:05857848da443c8c12110d99285d499b4e84d59918a21132e45c3f0804876994"},
-    {file = "aiohttp-3.9.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:317719d7f824eba55857fe0729363af58e27c066c731bc62cd97bc9c3d9c7ea4"},
-    {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1e3b3c107ccb0e537f309f719994a55621acd2c8fdf6d5ce5152aed788fb940"},
-    {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:45820ddbb276113ead8d4907a7802adb77548087ff5465d5c554f9aa3928ae7d"},
-    {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:05a183f1978802588711aed0dea31e697d760ce9055292db9dc1604daa9a8ded"},
-    {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51a4cd44788ea0b5e6bb8fa704597af3a30be75503a7ed1098bc5b8ffdf6c982"},
-    {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:673343fbc0c1ac44d0d2640addc56e97a052504beacd7ade0dc5e76d3a4c16e8"},
-    {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7e8a3b79b6d186a9c99761fd4a5e8dd575a48d96021f220ac5b5fa856e5dd029"},
-    {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:6777a390e41e78e7c45dab43a4a0196c55c3b8c30eebe017b152939372a83253"},
-    {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:7ae5f99a32c53731c93ac3075abd3e1e5cfbe72fc3eaac4c27c9dd64ba3b19fe"},
-    {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:f1e4f254e9c35d8965d377e065c4a8a55d396fe87c8e7e8429bcfdeeb229bfb3"},
-    {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:11ca808f9a6b63485059f5f6e164ef7ec826483c1212a44f268b3653c91237d8"},
-    {file = "aiohttp-3.9.0-cp312-cp312-win32.whl", hash = "sha256:de3cc86f4ea8b4c34a6e43a7306c40c1275e52bfa9748d869c6b7d54aa6dad80"},
-    {file = "aiohttp-3.9.0-cp312-cp312-win_amd64.whl", hash = "sha256:ca4fddf84ac7d8a7d0866664936f93318ff01ee33e32381a115b19fb5a4d1202"},
-    {file = "aiohttp-3.9.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:f09960b5bb1017d16c0f9e9f7fc42160a5a49fa1e87a175fd4a2b1a1833ea0af"},
-    {file = "aiohttp-3.9.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8303531e2c17b1a494ffaeba48f2da655fe932c4e9a2626c8718403c83e5dd2b"},
-    {file = "aiohttp-3.9.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4790e44f46a4aa07b64504089def5744d3b6780468c4ec3a1a36eb7f2cae9814"},
-    {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1d7edf74a36de0e5ca50787e83a77cf352f5504eb0ffa3f07000a911ba353fb"},
-    {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:94697c7293199c2a2551e3e3e18438b4cba293e79c6bc2319f5fd652fccb7456"},
-    {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a1b66dbb8a7d5f50e9e2ea3804b01e766308331d0cac76eb30c563ac89c95985"},
-    {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9623cfd9e85b76b83ef88519d98326d4731f8d71869867e47a0b979ffec61c73"},
-    {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f32c86dc967ab8c719fd229ce71917caad13cc1e8356ee997bf02c5b368799bf"},
-    {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:f50b4663c3e0262c3a361faf440761fbef60ccdde5fe8545689a4b3a3c149fb4"},
-    {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:dcf71c55ec853826cd70eadb2b6ac62ec577416442ca1e0a97ad875a1b3a0305"},
-    {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:42fe4fd9f0dfcc7be4248c162d8056f1d51a04c60e53366b0098d1267c4c9da8"},
-    {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:76a86a9989ebf82ee61e06e2bab408aec4ea367dc6da35145c3352b60a112d11"},
-    {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f9e09a1c83521d770d170b3801eea19b89f41ccaa61d53026ed111cb6f088887"},
-    {file = "aiohttp-3.9.0-cp38-cp38-win32.whl", hash = "sha256:a00ce44c21612d185c5275c5cba4bab8d7c1590f248638b667ed8a782fa8cd6f"},
-    {file = "aiohttp-3.9.0-cp38-cp38-win_amd64.whl", hash = "sha256:d5b9345ab92ebe6003ae11d8092ce822a0242146e6fa270889b9ba965457ca40"},
-    {file = "aiohttp-3.9.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:98d21092bf2637c5fa724a428a69e8f5955f2182bff61f8036827cf6ce1157bf"},
-    {file = "aiohttp-3.9.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:35a68cd63ca6aaef5707888f17a70c36efe62b099a4e853d33dc2e9872125be8"},
-    {file = "aiohttp-3.9.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3d7f6235c7475658acfc1769d968e07ab585c79f6ca438ddfecaa9a08006aee2"},
-    {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:db04d1de548f7a62d1dd7e7cdf7c22893ee168e22701895067a28a8ed51b3735"},
-    {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:536b01513d67d10baf6f71c72decdf492fb7433c5f2f133e9a9087379d4b6f31"},
-    {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:87c8b0a6487e8109427ccf638580865b54e2e3db4a6e0e11c02639231b41fc0f"},
-    {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7276fe0017664414fdc3618fca411630405f1aaf0cc3be69def650eb50441787"},
-    {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:23170247ef89ffa842a02bbfdc425028574d9e010611659abeb24d890bc53bb8"},
-    {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b1a2ea8252cacc7fd51df5a56d7a2bb1986ed39be9397b51a08015727dfb69bd"},
-    {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:2d71abc15ff7047412ef26bf812dfc8d0d1020d664617f4913df2df469f26b76"},
-    {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:2d820162c8c2bdbe97d328cd4f417c955ca370027dce593345e437b2e9ffdc4d"},
-    {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:2779f5e7c70f7b421915fd47db332c81de365678180a9f3ab404088f87ba5ff9"},
-    {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:366bc870d7ac61726f32a489fbe3d1d8876e87506870be66b01aeb84389e967e"},
-    {file = "aiohttp-3.9.0-cp39-cp39-win32.whl", hash = "sha256:1df43596b826022b14998f0460926ce261544fedefe0d2f653e1b20f49e96454"},
-    {file = "aiohttp-3.9.0-cp39-cp39-win_amd64.whl", hash = "sha256:9c196b30f1b1aa3363a69dd69079ae9bec96c2965c4707eaa6914ba099fb7d4f"},
-    {file = "aiohttp-3.9.0.tar.gz", hash = "sha256:09f23292d29135025e19e8ff4f0a68df078fe4ee013bca0105b2e803989de92d"},
+    {file = "aiohttp-3.9.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:772fbe371788e61c58d6d3d904268e48a594ba866804d08c995ad71b144f94cb"},
+    {file = "aiohttp-3.9.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:edd4f1af2253f227ae311ab3d403d0c506c9b4410c7fc8d9573dec6d9740369f"},
+    {file = "aiohttp-3.9.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cfee9287778399fdef6f8a11c9e425e1cb13cc9920fd3a3df8f122500978292b"},
+    {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3cc158466f6a980a6095ee55174d1de5730ad7dec251be655d9a6a9dd7ea1ff9"},
+    {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:54ec82f45d57c9a65a1ead3953b51c704f9587440e6682f689da97f3e8defa35"},
+    {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:abeb813a18eb387f0d835ef51f88568540ad0325807a77a6e501fed4610f864e"},
+    {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc91d07280d7d169f3a0f9179d8babd0ee05c79d4d891447629ff0d7d8089ec2"},
+    {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b65e861f4bebfb660f7f0f40fa3eb9f2ab9af10647d05dac824390e7af8f75b7"},
+    {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:04fd8ffd2be73d42bcf55fd78cde7958eeee6d4d8f73c3846b7cba491ecdb570"},
+    {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:3d8d962b439a859b3ded9a1e111a4615357b01620a546bc601f25b0211f2da81"},
+    {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:8ceb658afd12b27552597cf9a65d9807d58aef45adbb58616cdd5ad4c258c39e"},
+    {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:0e4ee4df741670560b1bc393672035418bf9063718fee05e1796bf867e995fad"},
+    {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2dec87a556f300d3211decf018bfd263424f0690fcca00de94a837949fbcea02"},
+    {file = "aiohttp-3.9.2-cp310-cp310-win32.whl", hash = "sha256:3e1a800f988ce7c4917f34096f81585a73dbf65b5c39618b37926b1238cf9bc4"},
+    {file = "aiohttp-3.9.2-cp310-cp310-win_amd64.whl", hash = "sha256:ea510718a41b95c236c992b89fdfc3d04cc7ca60281f93aaada497c2b4e05c46"},
+    {file = "aiohttp-3.9.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6aaa6f99256dd1b5756a50891a20f0d252bd7bdb0854c5d440edab4495c9f973"},
+    {file = "aiohttp-3.9.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a27d8c70ad87bcfce2e97488652075a9bdd5b70093f50b10ae051dfe5e6baf37"},
+    {file = "aiohttp-3.9.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:54287bcb74d21715ac8382e9de146d9442b5f133d9babb7e5d9e453faadd005e"},
+    {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5bb3d05569aa83011fcb346b5266e00b04180105fcacc63743fc2e4a1862a891"},
+    {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c8534e7d69bb8e8d134fe2be9890d1b863518582f30c9874ed7ed12e48abe3c4"},
+    {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4bd9d5b989d57b41e4ff56ab250c5ddf259f32db17159cce630fd543376bd96b"},
+    {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa6904088e6642609981f919ba775838ebf7df7fe64998b1a954fb411ffb4663"},
+    {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bda42eb410be91b349fb4ee3a23a30ee301c391e503996a638d05659d76ea4c2"},
+    {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:193cc1ccd69d819562cc7f345c815a6fc51d223b2ef22f23c1a0f67a88de9a72"},
+    {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:b9f1cb839b621f84a5b006848e336cf1496688059d2408e617af33e3470ba204"},
+    {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:d22a0931848b8c7a023c695fa2057c6aaac19085f257d48baa24455e67df97ec"},
+    {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4112d8ba61fbd0abd5d43a9cb312214565b446d926e282a6d7da3f5a5aa71d36"},
+    {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c4ad4241b52bb2eb7a4d2bde060d31c2b255b8c6597dd8deac2f039168d14fd7"},
+    {file = "aiohttp-3.9.2-cp311-cp311-win32.whl", hash = "sha256:ee2661a3f5b529f4fc8a8ffee9f736ae054adfb353a0d2f78218be90617194b3"},
+    {file = "aiohttp-3.9.2-cp311-cp311-win_amd64.whl", hash = "sha256:4deae2c165a5db1ed97df2868ef31ca3cc999988812e82386d22937d9d6fed52"},
+    {file = "aiohttp-3.9.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:6f4cdba12539215aaecf3c310ce9d067b0081a0795dd8a8805fdb67a65c0572a"},
+    {file = "aiohttp-3.9.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:84e843b33d5460a5c501c05539809ff3aee07436296ff9fbc4d327e32aa3a326"},
+    {file = "aiohttp-3.9.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8008d0f451d66140a5aa1c17e3eedc9d56e14207568cd42072c9d6b92bf19b52"},
+    {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:61c47ab8ef629793c086378b1df93d18438612d3ed60dca76c3422f4fbafa792"},
+    {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bc71f748e12284312f140eaa6599a520389273174b42c345d13c7e07792f4f57"},
+    {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a1c3a4d0ab2f75f22ec80bca62385db2e8810ee12efa8c9e92efea45c1849133"},
+    {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9a87aa0b13bbee025faa59fa58861303c2b064b9855d4c0e45ec70182bbeba1b"},
+    {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e2cc0d04688b9f4a7854c56c18aa7af9e5b0a87a28f934e2e596ba7e14783192"},
+    {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:1956e3ac376b1711c1533266dec4efd485f821d84c13ce1217d53e42c9e65f08"},
+    {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:114da29f39eccd71b93a0fcacff178749a5c3559009b4a4498c2c173a6d74dff"},
+    {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:3f17999ae3927d8a9a823a1283b201344a0627272f92d4f3e3a4efe276972fe8"},
+    {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:f31df6a32217a34ae2f813b152a6f348154f948c83213b690e59d9e84020925c"},
+    {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:7a75307ffe31329928a8d47eae0692192327c599113d41b278d4c12b54e1bd11"},
+    {file = "aiohttp-3.9.2-cp312-cp312-win32.whl", hash = "sha256:972b63d589ff8f305463593050a31b5ce91638918da38139b9d8deaba9e0fed7"},
+    {file = "aiohttp-3.9.2-cp312-cp312-win_amd64.whl", hash = "sha256:200dc0246f0cb5405c80d18ac905c8350179c063ea1587580e3335bfc243ba6a"},
+    {file = "aiohttp-3.9.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:158564d0d1020e0d3fe919a81d97aadad35171e13e7b425b244ad4337fc6793a"},
+    {file = "aiohttp-3.9.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:da1346cd0ccb395f0ed16b113ebb626fa43b7b07fd7344fce33e7a4f04a8897a"},
+    {file = "aiohttp-3.9.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:eaa9256de26ea0334ffa25f1913ae15a51e35c529a1ed9af8e6286dd44312554"},
+    {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1543e7fb00214fb4ccead42e6a7d86f3bb7c34751ec7c605cca7388e525fd0b4"},
+    {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:186e94570433a004e05f31f632726ae0f2c9dee4762a9ce915769ce9c0a23d89"},
+    {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d52d20832ac1560f4510d68e7ba8befbc801a2b77df12bd0cd2bcf3b049e52a4"},
+    {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c45e4e815ac6af3b72ca2bde9b608d2571737bb1e2d42299fc1ffdf60f6f9a1"},
+    {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aa906b9bdfd4a7972dd0628dbbd6413d2062df5b431194486a78f0d2ae87bd55"},
+    {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:68bbee9e17d66f17bb0010aa15a22c6eb28583edcc8b3212e2b8e3f77f3ebe2a"},
+    {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4c189b64bd6d9a403a1a3f86a3ab3acbc3dc41a68f73a268a4f683f89a4dec1f"},
+    {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:8a7876f794523123bca6d44bfecd89c9fec9ec897a25f3dd202ee7fc5c6525b7"},
+    {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:d23fba734e3dd7b1d679b9473129cd52e4ec0e65a4512b488981a56420e708db"},
+    {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b141753be581fab842a25cb319f79536d19c2a51995d7d8b29ee290169868eab"},
+    {file = "aiohttp-3.9.2-cp38-cp38-win32.whl", hash = "sha256:103daf41ff3b53ba6fa09ad410793e2e76c9d0269151812e5aba4b9dd674a7e8"},
+    {file = "aiohttp-3.9.2-cp38-cp38-win_amd64.whl", hash = "sha256:328918a6c2835861ff7afa8c6d2c70c35fdaf996205d5932351bdd952f33fa2f"},
+    {file = "aiohttp-3.9.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5264d7327c9464786f74e4ec9342afbbb6ee70dfbb2ec9e3dfce7a54c8043aa3"},
+    {file = "aiohttp-3.9.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:07205ae0015e05c78b3288c1517afa000823a678a41594b3fdc870878d645305"},
+    {file = "aiohttp-3.9.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ae0a1e638cffc3ec4d4784b8b4fd1cf28968febc4bd2718ffa25b99b96a741bd"},
+    {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d43302a30ba1166325974858e6ef31727a23bdd12db40e725bec0f759abce505"},
+    {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:16a967685907003765855999af11a79b24e70b34dc710f77a38d21cd9fc4f5fe"},
+    {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6fa3ee92cd441d5c2d07ca88d7a9cef50f7ec975f0117cd0c62018022a184308"},
+    {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b500c5ad9c07639d48615a770f49618130e61be36608fc9bc2d9bae31732b8f"},
+    {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c07327b368745b1ce2393ae9e1aafed7073d9199e1dcba14e035cc646c7941bf"},
+    {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:cc7d6502c23a0ec109687bf31909b3fb7b196faf198f8cff68c81b49eb316ea9"},
+    {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:07be2be7071723c3509ab5c08108d3a74f2181d4964e869f2504aaab68f8d3e8"},
+    {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:122468f6fee5fcbe67cb07014a08c195b3d4c41ff71e7b5160a7bcc41d585a5f"},
+    {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:00a9abcea793c81e7f8778ca195a1714a64f6d7436c4c0bb168ad2a212627000"},
+    {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7a9825fdd64ecac5c670234d80bb52bdcaa4139d1f839165f548208b3779c6c6"},
+    {file = "aiohttp-3.9.2-cp39-cp39-win32.whl", hash = "sha256:5422cd9a4a00f24c7244e1b15aa9b87935c85fb6a00c8ac9b2527b38627a9211"},
+    {file = "aiohttp-3.9.2-cp39-cp39-win_amd64.whl", hash = "sha256:7d579dcd5d82a86a46f725458418458fa43686f6a7b252f2966d359033ffc8ab"},
+    {file = "aiohttp-3.9.2.tar.gz", hash = "sha256:b0ad0a5e86ce73f5368a164c10ada10504bf91869c05ab75d982c6048217fbf7"},
 ]

 [package.dependencies]
@@ -2043,6 +2043,7 @@ files = [
    {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
    {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
    {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
+    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
    {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
    {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
@@ -2668,4 +2669,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "9cf2734cafd5b6963165d398f1b24621193d5284d0bc7cc26a720a014f523860"
+content-hash = "e99954cbbfef8dcc5e13cea7103c87657639a192f2372983bdb8c5d624c2e447"
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -62,6 +62,8 @@ socket2.workspace = true
 sync_wrapper.workspace = true
 task-local-extensions.workspace = true
 thiserror.workspace = true
+tikv-jemallocator.workspace = true
+tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] }
 tls-listener.workspace = true
 tokio-postgres.workspace = true
 tokio-rustls.workspace = true
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -190,7 +190,10 @@ async fn auth_quirks(
        Err(info) => {
            let res = hacks::password_hack_no_authentication(info, client, &mut ctx.latency_timer)
                .await?;
-            ctx.set_endpoint_id(Some(res.info.endpoint.clone()));
+
+            ctx.set_endpoint_id(res.info.endpoint.clone());
+            tracing::Span::current().record("ep", &tracing::field::display(&res.info.endpoint));
+
            (res.info, Some(res.keys))
        }
        Ok(info) => (info, None),
@@ -271,19 +274,12 @@ async fn authenticate_with_secret(
    classic::authenticate(info, client, config, &mut ctx.latency_timer, secret).await
 }

-/// Authenticate the user and then wake a compute (or retrieve an existing compute session from cache)
-/// only if authentication was successfuly.
-async fn auth_and_wake_compute(
+/// Wake a compute (or retrieve an existing compute session from cache).
+async fn wake_compute(
    ctx: &mut RequestMonitoring,
    api: &impl console::Api,
-    user_info: ComputeUserInfoMaybeEndpoint,
-    client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
-    allow_cleartext: bool,
-    config: &'static AuthenticationConfig,
+    compute_credentials: ComputeCredentials<ComputeCredentialKeys>,
 ) -> auth::Result<(CachedNodeInfo, ComputeUserInfo)> {
-    let compute_credentials =
-        auth_quirks(ctx, api, user_info, client, allow_cleartext, config).await?;
-
    let mut num_retries = 0;
    let mut node = loop {
        let wake_res = api.wake_compute(ctx, &compute_credentials.info).await;
@@ -358,16 +354,16 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
                    "performing authentication using the console"
                );

-                let (cache_info, user_info) =
-                    auth_and_wake_compute(ctx, &*api, user_info, client, allow_cleartext, config)
-                        .await?;
+                let compute_credentials =
+                    auth_quirks(ctx, &*api, user_info, client, allow_cleartext, config).await?;
+                let (cache_info, user_info) = wake_compute(ctx, &*api, compute_credentials).await?;
                (cache_info, BackendType::Console(api, user_info))
            }
            // NOTE: this auth backend doesn't use client credentials.
            Link(url) => {
                info!("performing link authentication");

-                let node_info = link::authenticate(&url, client).await?;
+                let node_info = link::authenticate(ctx, &url, client).await?;

                (
                    CachedNodeInfo::new_uncached(node_info),
--- a/proxy/src/auth/backend/link.rs
+++ b/proxy/src/auth/backend/link.rs
@@ -1,6 +1,7 @@
 use crate::{
    auth, compute,
    console::{self, provider::NodeInfo},
+    context::RequestMonitoring,
    error::UserFacingError,
    stream::PqStream,
    waiters,
@@ -54,6 +55,7 @@ pub fn new_psql_session_id() -> String {
 }

 pub(super) async fn authenticate(
+    ctx: &mut RequestMonitoring,
    link_uri: &reqwest::Url,
    client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
 ) -> auth::Result<NodeInfo> {
@@ -94,6 +96,10 @@ pub(super) async fn authenticate(
        .dbname(&db_info.dbname)
        .user(&db_info.user);

+    ctx.set_user(db_info.user.into());
+    ctx.set_project(db_info.aux.clone());
+    tracing::Span::current().record("ep", &tracing::field::display(&db_info.aux.endpoint_id));
+
    // Backwards compatibility. pg_sni_proxy uses "--" in domain names
    // while direct connections do not. Once we migrate to pg_sni_proxy
    // everywhere, we can remove this.
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -2,7 +2,8 @@

 use crate::{
    auth::password_hack::parse_endpoint_param, context::RequestMonitoring, error::UserFacingError,
-    metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::NeonOptions, EndpointId, RoleName,
+    metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::NeonOptions, serverless::SERVERLESS_DRIVER_SNI,
+    EndpointId, RoleName,
 };
 use itertools::Itertools;
 use pq_proto::StartupMessageParams;
@@ -54,10 +55,10 @@ impl ComputeUserInfoMaybeEndpoint {
    }
 }

-pub fn endpoint_sni<'a>(
-    sni: &'a str,
+pub fn endpoint_sni(
+    sni: &str,
    common_names: &HashSet<String>,
-) -> Result<&'a str, ComputeUserInfoParseError> {
+) -> Result<Option<EndpointId>, ComputeUserInfoParseError> {
    let Some((subdomain, common_name)) = sni.split_once('.') else {
        return Err(ComputeUserInfoParseError::UnknownCommonName { cn: sni.into() });
    };
@@ -66,7 +67,10 @@ pub fn endpoint_sni<'a>(
            cn: common_name.into(),
        });
    }
-    Ok(subdomain)
+    if subdomain == SERVERLESS_DRIVER_SNI {
+        return Ok(None);
+    }
+    Ok(Some(EndpointId::from(subdomain)))
 }

 impl ComputeUserInfoMaybeEndpoint {
@@ -85,7 +89,6 @@ impl ComputeUserInfoMaybeEndpoint {
        // record the values if we have them
        ctx.set_application(params.get("application_name").map(SmolStr::from));
        ctx.set_user(user.clone());
-        ctx.set_endpoint_id(sni.map(EndpointId::from));

        // Project name might be passed via PG's command-line options.
        let endpoint_option = params
@@ -103,7 +106,7 @@ impl ComputeUserInfoMaybeEndpoint {

        let endpoint_from_domain = if let Some(sni_str) = sni {
            if let Some(cn) = common_names {
-                Some(EndpointId::from(endpoint_sni(sni_str, cn)?))
+                endpoint_sni(sni_str, cn)?
            } else {
                None
            }
@@ -117,13 +120,18 @@ impl ComputeUserInfoMaybeEndpoint {
                Some(Err(InconsistentProjectNames { domain, option }))
            }
            // Invariant: project name may not contain certain characters.
-            (a, b) => a.or(b).map(|name| match project_name_valid(&name) {
+            (a, b) => a.or(b).map(|name| match project_name_valid(name.as_ref()) {
                false => Err(MalformedProjectName(name)),
                true => Ok(name),
            }),
        }
        .transpose()?;

+        if let Some(ep) = &endpoint {
+            ctx.set_endpoint_id(ep.clone());
+            tracing::Span::current().record("ep", &tracing::field::display(ep));
+        }
+
        info!(%user, project = endpoint.as_deref(), "credentials");
        if sni.is_some() {
            info!("Connection with sni");
@@ -146,7 +154,7 @@ impl ComputeUserInfoMaybeEndpoint {

        Ok(Self {
            user,
-            endpoint_id: endpoint.map(EndpointId::from),
+            endpoint_id: endpoint,
            options,
        })
    }
--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -272,5 +272,5 @@ async fn handle_client(
    let client = tokio::net::TcpStream::connect(destination).await?;

    let metrics_aux: MetricsAuxInfo = Default::default();
-    proxy::proxy::proxy_pass(ctx, tls_stream, client, metrics_aux).await
+    proxy::proxy::passthrough::proxy_pass(ctx, tls_stream, client, metrics_aux).await
 }
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -32,6 +32,9 @@ project_build_tag!(BUILD_TAG);

 use clap::{Parser, ValueEnum};

+#[global_allocator]
+static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
+
 #[derive(Clone, Debug, ValueEnum)]
 enum AuthBackend {
    Console,
@@ -187,6 +190,13 @@ async fn main() -> anyhow::Result<()> {
    info!("Build_tag: {BUILD_TAG}");
    ::metrics::set_build_info_metric(GIT_VERSION, BUILD_TAG);

+    match proxy::jemalloc::MetricRecorder::new(prometheus::default_registry()) {
+        Ok(t) => {
+            t.start();
+        }
+        Err(e) => tracing::error!(error = ?e, "could not start jemalloc metrics loop"),
+    }
+
    let args = ProxyCliArgs::parse();
    let config = build_config(&args)?;

--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -1,7 +1,7 @@
-use anyhow::{bail, Context};
+use anyhow::Context;
 use dashmap::DashMap;
 use pq_proto::CancelKeyData;
-use std::net::SocketAddr;
+use std::{net::SocketAddr, sync::Arc};
 use tokio::net::TcpStream;
 use tokio_postgres::{CancelToken, NoTls};
 use tracing::info;
@@ -25,39 +25,31 @@ impl CancelMap {
    }

    /// Run async action within an ephemeral session identified by [`CancelKeyData`].
-    pub async fn with_session<'a, F, R, V>(&'a self, f: F) -> anyhow::Result<V>
-    where
-        F: FnOnce(Session<'a>) -> R,
-        R: std::future::Future<Output = anyhow::Result<V>>,
-    {
+    pub fn get_session(self: Arc<Self>) -> Session {
        // HACK: We'd rather get the real backend_pid but tokio_postgres doesn't
        // expose it and we don't want to do another roundtrip to query
        // for it. The client will be able to notice that this is not the
        // actual backend_pid, but backend_pid is not used for anything
        // so it doesn't matter.
-        let key = rand::random();
+        let key = loop {
+            let key = rand::random();

-        // Random key collisions are unlikely to happen here, but they're still possible,
-        // which is why we have to take care not to rewrite an existing key.
-        match self.0.entry(key) {
-            dashmap::mapref::entry::Entry::Occupied(_) => {
-                bail!("query cancellation key already exists: {key}")
+            // Random key collisions are unlikely to happen here, but they're still possible,
+            // which is why we have to take care not to rewrite an existing key.
+            match self.0.entry(key) {
+                dashmap::mapref::entry::Entry::Occupied(_) => continue,
+                dashmap::mapref::entry::Entry::Vacant(e) => {
+                    e.insert(None);
+                }
            }
-            dashmap::mapref::entry::Entry::Vacant(e) => {
-                e.insert(None);
-            }
-        }
-
-        // This will guarantee that the session gets dropped
-        // as soon as the future is finished.
-        scopeguard::defer! {
-            self.0.remove(&key);
-            info!("dropped query cancellation key {key}");
-        }
+            break key;
+        };

        info!("registered new query cancellation key {key}");
-        let session = Session::new(key, self);
-        f(session).await
+        Session {
+            key,
+            cancel_map: self,
+        }
    }

    #[cfg(test)]
@@ -98,23 +90,17 @@ impl CancelClosure {
 }

 /// Helper for registering query cancellation tokens.
-pub struct Session<'a> {
+pub struct Session {
    /// The user-facing key identifying this session.
    key: CancelKeyData,
    /// The [`CancelMap`] this session belongs to.
-    cancel_map: &'a CancelMap,
+    cancel_map: Arc<CancelMap>,
 }

-impl<'a> Session<'a> {
-    fn new(key: CancelKeyData, cancel_map: &'a CancelMap) -> Self {
-        Self { key, cancel_map }
-    }
-}
-
-impl Session<'_> {
+impl Session {
    /// Store the cancel token for the given session.
    /// This enables query cancellation in `crate::proxy::prepare_client_connection`.
-    pub fn enable_query_cancellation(self, cancel_closure: CancelClosure) -> CancelKeyData {
+    pub fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData {
        info!("enabling query cancellation for this session");
        self.cancel_map.0.insert(self.key, Some(cancel_closure));

@@ -122,37 +108,26 @@ impl Session<'_> {
    }
 }

+impl Drop for Session {
+    fn drop(&mut self) {
+        self.cancel_map.0.remove(&self.key);
+        info!("dropped query cancellation key {}", &self.key);
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
-    use once_cell::sync::Lazy;

    #[tokio::test]
    async fn check_session_drop() -> anyhow::Result<()> {
-        static CANCEL_MAP: Lazy<CancelMap> = Lazy::new(Default::default);
-
-        let (tx, rx) = tokio::sync::oneshot::channel();
-        let task = tokio::spawn(CANCEL_MAP.with_session(|session| async move {
-            assert!(CANCEL_MAP.contains(&session));
-
-            tx.send(()).expect("failed to send");
-            futures::future::pending::<()>().await; // sleep forever
-
-            Ok(())
-        }));
-
-        // Wait until the task has been spawned.
-        rx.await.context("failed to hear from the task")?;
-
-        // Drop the session's entry by cancelling the task.
-        task.abort();
-        let error = task.await.expect_err("task should have failed");
-        if !error.is_cancelled() {
-            anyhow::bail!(error);
-        }
+        let cancel_map: Arc<CancelMap> = Default::default();

+        let session = cancel_map.clone().get_session();
+        assert!(cancel_map.contains(&session));
+        drop(session);
        // Check that the session has been dropped.
-        assert!(CANCEL_MAP.is_empty());
+        assert!(cancel_map.is_empty());

        Ok(())
    }
--- a/proxy/src/context.rs
+++ b/proxy/src/context.rs
@@ -89,8 +89,11 @@ impl RequestMonitoring {
        self.project = Some(x.project_id);
    }

-    pub fn set_endpoint_id(&mut self, endpoint_id: Option<EndpointId>) {
-        self.endpoint_id = endpoint_id.or_else(|| self.endpoint_id.clone());
+    pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) {
+        crate::metrics::CONNECTING_ENDPOINTS
+            .with_label_values(&[self.protocol])
+            .measure(&endpoint_id);
+        self.endpoint_id = Some(endpoint_id);
    }

    pub fn set_application(&mut self, app: Option<SmolStr>) {
--- a/proxy/src/jemalloc.rs
+++ b/proxy/src/jemalloc.rs
@@ -0,0 +1,100 @@
+use std::time::Duration;
+
+use metrics::IntGauge;
+use prometheus::{register_int_gauge_with_registry, Registry};
+use tikv_jemalloc_ctl::{config, epoch, epoch_mib, stats, version};
+
+pub struct MetricRecorder {
+    epoch: epoch_mib,
+    active: stats::active_mib,
+    active_gauge: IntGauge,
+    allocated: stats::allocated_mib,
+    allocated_gauge: IntGauge,
+    mapped: stats::mapped_mib,
+    mapped_gauge: IntGauge,
+    metadata: stats::metadata_mib,
+    metadata_gauge: IntGauge,
+    resident: stats::resident_mib,
+    resident_gauge: IntGauge,
+    retained: stats::retained_mib,
+    retained_gauge: IntGauge,
+}
+
+impl MetricRecorder {
+    pub fn new(registry: &Registry) -> Result<Self, anyhow::Error> {
+        tracing::info!(
+            config = config::malloc_conf::read()?,
+            version = version::read()?,
+            "starting jemalloc recorder"
+        );
+
+        Ok(Self {
+            epoch: epoch::mib()?,
+            active: stats::active::mib()?,
+            active_gauge: register_int_gauge_with_registry!(
+                "jemalloc_active_bytes",
+                "Total number of bytes in active pages allocated by the process",
+                registry
+            )?,
+            allocated: stats::allocated::mib()?,
+            allocated_gauge: register_int_gauge_with_registry!(
+                "jemalloc_allocated_bytes",
+                "Total number of bytes allocated by the process",
+                registry
+            )?,
+            mapped: stats::mapped::mib()?,
+            mapped_gauge: register_int_gauge_with_registry!(
+                "jemalloc_mapped_bytes",
+                "Total number of bytes in active extents mapped by the allocator",
+                registry
+            )?,
+            metadata: stats::metadata::mib()?,
+            metadata_gauge: register_int_gauge_with_registry!(
+                "jemalloc_metadata_bytes",
+                "Total number of bytes dedicated to jemalloc metadata",
+                registry
+            )?,
+            resident: stats::resident::mib()?,
+            resident_gauge: register_int_gauge_with_registry!(
+                "jemalloc_resident_bytes",
+                "Total number of bytes in physically resident data pages mapped by the allocator",
+                registry
+            )?,
+            retained: stats::retained::mib()?,
+            retained_gauge: register_int_gauge_with_registry!(
+                "jemalloc_retained_bytes",
+                "Total number of bytes in virtual memory mappings that were retained rather than being returned to the operating system",
+                registry
+            )?,
+        })
+    }
+
+    fn _poll(&self) -> Result<(), anyhow::Error> {
+        self.epoch.advance()?;
+        self.active_gauge.set(self.active.read()? as i64);
+        self.allocated_gauge.set(self.allocated.read()? as i64);
+        self.mapped_gauge.set(self.mapped.read()? as i64);
+        self.metadata_gauge.set(self.metadata.read()? as i64);
+        self.resident_gauge.set(self.resident.read()? as i64);
+        self.retained_gauge.set(self.retained.read()? as i64);
+        Ok(())
+    }
+
+    #[inline]
+    pub fn poll(&self) {
+        if let Err(error) = self._poll() {
+            tracing::warn!(%error, "Failed to poll jemalloc stats");
+        }
+    }
+
+    pub fn start(self) -> tokio::task::JoinHandle<()> {
+        tokio::task::spawn(async move {
+            let mut interval = tokio::time::interval(Duration::from_secs(15));
+            interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
+            loop {
+                self.poll();
+                interval.tick().await;
+            }
+        })
+    }
+}
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -16,6 +16,7 @@ pub mod console;
 pub mod context;
 pub mod error;
 pub mod http;
+pub mod jemalloc;
 pub mod logging;
 pub mod metrics;
 pub mod parse;
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -1,10 +1,7 @@
 use ::metrics::{
-    exponential_buckets, register_int_counter_pair_vec, register_int_counter_vec,
-    IntCounterPairVec, IntCounterVec,
-};
-use prometheus::{
-    register_histogram, register_histogram_vec, register_int_gauge_vec, Histogram, HistogramVec,
-    IntGaugeVec,
+    exponential_buckets, register_histogram, register_histogram_vec, register_hll_vec,
+    register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge_vec, Histogram,
+    HistogramVec, HyperLogLogVec, IntCounterPairVec, IntCounterVec, IntGaugeVec,
 };

 use once_cell::sync::Lazy;
@@ -236,3 +233,13 @@ pub const fn bool_to_str(x: bool) -> &'static str {
        "false"
    }
 }
+
+pub static CONNECTING_ENDPOINTS: Lazy<HyperLogLogVec<32>> = Lazy::new(|| {
+    register_hll_vec!(
+        32,
+        "proxy_connecting_endpoints",
+        "HLL approximate cardinality of endpoints that are connecting",
+        &["protocol"],
+    )
+    .unwrap()
+});
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -2,37 +2,34 @@
 mod tests;

 pub mod connect_compute;
+pub mod handshake;
+pub mod passthrough;
 pub mod retry;

 use crate::{
    auth,
    cancellation::{self, CancelMap},
    compute,
-    config::{AuthenticationConfig, ProxyConfig, TlsConfig},
-    console::messages::MetricsAuxInfo,
+    config::{ProxyConfig, TlsConfig},
    context::RequestMonitoring,
-    metrics::{
-        NUM_BYTES_PROXIED_COUNTER, NUM_BYTES_PROXIED_PER_CLIENT_COUNTER,
-        NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE,
-    },
+    metrics::{NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE},
    protocol2::WithClientIp,
+    proxy::{handshake::handshake, passthrough::proxy_pass},
    rate_limiter::EndpointRateLimiter,
    stream::{PqStream, Stream},
-    usage_metrics::{Ids, USAGE_METRICS},
    EndpointCacheKey,
 };
 use anyhow::{bail, Context};
 use futures::TryFutureExt;
 use itertools::Itertools;
 use once_cell::sync::OnceCell;
-use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
+use pq_proto::{BeMessage as Be, StartupMessageParams};
 use regex::Regex;
 use smol_str::{format_smolstr, SmolStr};
 use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
 use tokio_util::sync::CancellationToken;
 use tracing::{error, info, info_span, Instrument};
-use utils::measured_stream::MeasuredStream;

 use self::connect_compute::{connect_to_compute, TcpMechanism};

@@ -80,6 +77,13 @@ pub async fn task_main(
        let cancel_map = Arc::clone(&cancel_map);
        let endpoint_rate_limiter = endpoint_rate_limiter.clone();

+        let session_span = info_span!(
+            "handle_client",
+            ?session_id,
+            peer_addr = tracing::field::Empty,
+            ep = tracing::field::Empty,
+        );
+
        connections.spawn(
            async move {
                info!("accepted postgres client connection");
@@ -103,22 +107,18 @@ pub async fn task_main(
                handle_client(
                    config,
                    &mut ctx,
-                    &cancel_map,
+                    cancel_map,
                    socket,
                    ClientMode::Tcp,
                    endpoint_rate_limiter,
                )
                .await
            }
-            .instrument(info_span!(
-                "handle_client",
-                ?session_id,
-                peer_addr = tracing::field::Empty
-            ))
            .unwrap_or_else(move |e| {
                // Acknowledge that the task has finished with an error.
-                error!(?session_id, "per-client task finished with an error: {e:#}");
-            }),
+                error!("per-client task finished with an error: {e:#}");
+            })
+            .instrument(session_span),
        );
    }

@@ -171,7 +171,7 @@ impl ClientMode {
 pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
    config: &'static ProxyConfig,
    ctx: &mut RequestMonitoring,
-    cancel_map: &CancelMap,
+    cancel_map: Arc<CancelMap>,
    stream: S,
    mode: ClientMode,
    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
@@ -192,138 +192,88 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
    let tls = config.tls_config.as_ref();

    let pause = ctx.latency_timer.pause();
-    let do_handshake = handshake(stream, mode.handshake_tls(tls), cancel_map);
+    let do_handshake = handshake(stream, mode.handshake_tls(tls), &cancel_map);
    let (mut stream, params) = match do_handshake.await? {
        Some(x) => x,
        None => return Ok(()), // it's a cancellation request
    };
    drop(pause);

+    let hostname = mode.hostname(stream.get_ref());
+
+    let common_names = tls.map(|tls| &tls.common_names);
+
    // Extract credentials which we're going to use for auth.
-    let user_info = {
-        let hostname = mode.hostname(stream.get_ref());
+    let result = config
+        .auth_backend
+        .as_ref()
+        .map(|_| auth::ComputeUserInfoMaybeEndpoint::parse(ctx, &params, hostname, common_names))
+        .transpose();

-        let common_names = tls.map(|tls| &tls.common_names);
-        let result = config
-            .auth_backend
-            .as_ref()
-            .map(|_| {
-                auth::ComputeUserInfoMaybeEndpoint::parse(ctx, &params, hostname, common_names)
-            })
-            .transpose();
+    let user_info = match result {
+        Ok(user_info) => user_info,
+        Err(e) => stream.throw_error(e).await?,
+    };

-        match result {
-            Ok(user_info) => user_info,
-            Err(e) => stream.throw_error(e).await?,
+    // check rate limit
+    if let Some(ep) = user_info.get_endpoint() {
+        if !endpoint_rate_limiter.check(ep) {
+            return stream
+                .throw_error(auth::AuthError::too_many_connections())
+                .await;
+        }
+    }
+
+    let user = user_info.get_user().to_owned();
+    let (mut node_info, user_info) = match user_info
+        .authenticate(
+            ctx,
+            &mut stream,
+            mode.allow_cleartext(),
+            &config.authentication_config,
+        )
+        .await
+    {
+        Ok(auth_result) => auth_result,
+        Err(e) => {
+            let db = params.get("database");
+            let app = params.get("application_name");
+            let params_span = tracing::info_span!("", ?user, ?db, ?app);
+
+            return stream.throw_error(e).instrument(params_span).await;
        }
    };

-    ctx.set_endpoint_id(user_info.get_endpoint());
+    node_info.allow_self_signed_compute = mode.allow_self_signed_compute(config);

-    let client = Client::new(
-        stream,
-        user_info,
-        &params,
-        mode.allow_self_signed_compute(config),
-        endpoint_rate_limiter,
-    );
-    cancel_map
-        .with_session(|session| {
-            client.connect_to_db(ctx, session, mode, &config.authentication_config)
-        })
-        .await
-}
+    let aux = node_info.aux.clone();
+    let mut node = connect_to_compute(
+        ctx,
+        &TcpMechanism { params: &params },
+        node_info,
+        &user_info,
+    )
+    .or_else(|e| stream.throw_error(e))
+    .await?;

-/// Establish a (most probably, secure) connection with the client.
-/// For better testing experience, `stream` can be any object satisfying the traits.
-/// It's easier to work with owned `stream` here as we need to upgrade it to TLS;
-/// we also take an extra care of propagating only the select handshake errors to client.
-#[tracing::instrument(skip_all)]
-async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
-    stream: S,
-    mut tls: Option<&TlsConfig>,
-    cancel_map: &CancelMap,
-) -> anyhow::Result<Option<(PqStream<Stream<S>>, StartupMessageParams)>> {
-    // Client may try upgrading to each protocol only once
-    let (mut tried_ssl, mut tried_gss) = (false, false);
+    let session = cancel_map.get_session();
+    prepare_client_connection(&node, &session, &mut stream).await?;

-    let mut stream = PqStream::new(Stream::from_raw(stream));
-    loop {
-        let msg = stream.read_startup_packet().await?;
-        info!("received {msg:?}");
+    // Before proxy passing, forward to compute whatever data is left in the
+    // PqStream input buffer. Normally there is none, but our serverless npm
+    // driver in pipeline mode sends startup, password and first query
+    // immediately after opening the connection.
+    let (stream, read_buf) = stream.into_inner();
+    node.stream.write_all(&read_buf).await?;

-        use FeStartupPacket::*;
-        match msg {
-            SslRequest => match stream.get_ref() {
-                Stream::Raw { .. } if !tried_ssl => {
-                    tried_ssl = true;
-
-                    // We can't perform TLS handshake without a config
-                    let enc = tls.is_some();
-                    stream.write_message(&Be::EncryptionResponse(enc)).await?;
-                    if let Some(tls) = tls.take() {
-                        // Upgrade raw stream into a secure TLS-backed stream.
-                        // NOTE: We've consumed `tls`; this fact will be used later.
-
-                        let (raw, read_buf) = stream.into_inner();
-                        // TODO: Normally, client doesn't send any data before
-                        // server says TLS handshake is ok and read_buf is empy.
-                        // However, you could imagine pipelining of postgres
-                        // SSLRequest + TLS ClientHello in one hunk similar to
-                        // pipelining in our node js driver. We should probably
-                        // support that by chaining read_buf with the stream.
-                        if !read_buf.is_empty() {
-                            bail!("data is sent before server replied with EncryptionResponse");
-                        }
-                        let tls_stream = raw.upgrade(tls.to_server_config()).await?;
-
-                        let (_, tls_server_end_point) = tls
-                            .cert_resolver
-                            .resolve(tls_stream.get_ref().1.server_name())
-                            .context("missing certificate")?;
-
-                        stream = PqStream::new(Stream::Tls {
-                            tls: Box::new(tls_stream),
-                            tls_server_end_point,
-                        });
-                    }
-                }
-                _ => bail!(ERR_PROTO_VIOLATION),
-            },
-            GssEncRequest => match stream.get_ref() {
-                Stream::Raw { .. } if !tried_gss => {
-                    tried_gss = true;
-
-                    // Currently, we don't support GSSAPI
-                    stream.write_message(&Be::EncryptionResponse(false)).await?;
-                }
-                _ => bail!(ERR_PROTO_VIOLATION),
-            },
-            StartupMessage { params, .. } => {
-                // Check that the config has been consumed during upgrade
-                // OR we didn't provide it at all (for dev purposes).
-                if tls.is_some() {
-                    stream.throw_error_str(ERR_INSECURE_CONNECTION).await?;
-                }
-
-                info!(session_type = "normal", "successful handshake");
-                break Ok(Some((stream, params)));
-            }
-            CancelRequest(cancel_key_data) => {
-                cancel_map.cancel_session(cancel_key_data).await?;
-
-                info!(session_type = "cancellation", "successful handshake");
-                break Ok(None);
-            }
-        }
-    }
+    proxy_pass(ctx, stream, node.stream, aux).await
 }

 /// Finish client connection initialization: confirm auth success, send params, etc.
 #[tracing::instrument(skip_all)]
 async fn prepare_client_connection(
    node: &compute::PostgresConnection,
-    session: cancellation::Session<'_>,
+    session: &cancellation::Session,
    stream: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
 ) -> anyhow::Result<()> {
    // Register compute's query cancellation token and produce a new, unique one.
@@ -349,151 +299,6 @@ async fn prepare_client_connection(
    Ok(())
 }

-/// Forward bytes in both directions (client <-> compute).
-#[tracing::instrument(skip_all)]
-pub async fn proxy_pass(
-    ctx: &mut RequestMonitoring,
-    client: impl AsyncRead + AsyncWrite + Unpin,
-    compute: impl AsyncRead + AsyncWrite + Unpin,
-    aux: MetricsAuxInfo,
-) -> anyhow::Result<()> {
-    ctx.set_success();
-    ctx.log();
-
-    let usage = USAGE_METRICS.register(Ids {
-        endpoint_id: aux.endpoint_id.clone(),
-        branch_id: aux.branch_id.clone(),
-    });
-
-    let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]);
-    let m_sent2 = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("tx"));
-    let mut client = MeasuredStream::new(
-        client,
-        |_| {},
-        |cnt| {
-            // Number of bytes we sent to the client (outbound).
-            m_sent.inc_by(cnt as u64);
-            m_sent2.inc_by(cnt as u64);
-            usage.record_egress(cnt as u64);
-        },
-    );
-
-    let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx"]);
-    let m_recv2 = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("rx"));
-    let mut compute = MeasuredStream::new(
-        compute,
-        |_| {},
-        |cnt| {
-            // Number of bytes the client sent to the compute node (inbound).
-            m_recv.inc_by(cnt as u64);
-            m_recv2.inc_by(cnt as u64);
-        },
-    );
-
-    // Starting from here we only proxy the client's traffic.
-    info!("performing the proxy pass...");
-    let _ = tokio::io::copy_bidirectional(&mut client, &mut compute).await?;
-
-    Ok(())
-}
-
-/// Thin connection context.
-struct Client<'a, S> {
-    /// The underlying libpq protocol stream.
-    stream: PqStream<Stream<S>>,
-    /// Client credentials that we care about.
-    user_info: auth::BackendType<'a, auth::ComputeUserInfoMaybeEndpoint>,
-    /// KV-dictionary with PostgreSQL connection params.
-    params: &'a StartupMessageParams,
-    /// Allow self-signed certificates (for testing).
-    allow_self_signed_compute: bool,
-    /// Rate limiter for endpoints
-    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
-}
-
-impl<'a, S> Client<'a, S> {
-    /// Construct a new connection context.
-    fn new(
-        stream: PqStream<Stream<S>>,
-        user_info: auth::BackendType<'a, auth::ComputeUserInfoMaybeEndpoint>,
-        params: &'a StartupMessageParams,
-        allow_self_signed_compute: bool,
-        endpoint_rate_limiter: Arc<EndpointRateLimiter>,
-    ) -> Self {
-        Self {
-            stream,
-            user_info,
-            params,
-            allow_self_signed_compute,
-            endpoint_rate_limiter,
-        }
-    }
-}
-
-impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
-    /// Let the client authenticate and connect to the designated compute node.
-    // Instrumentation logs endpoint name everywhere. Doesn't work for link
-    // auth; strictly speaking we don't know endpoint name in its case.
-    #[tracing::instrument(name = "", fields(ep = %self.user_info.get_endpoint().unwrap_or_default()), skip_all)]
-    async fn connect_to_db(
-        self,
-        ctx: &mut RequestMonitoring,
-        session: cancellation::Session<'_>,
-        mode: ClientMode,
-        config: &'static AuthenticationConfig,
-    ) -> anyhow::Result<()> {
-        let Self {
-            mut stream,
-            user_info,
-            params,
-            allow_self_signed_compute,
-            endpoint_rate_limiter,
-        } = self;
-
-        // check rate limit
-        if let Some(ep) = user_info.get_endpoint() {
-            if !endpoint_rate_limiter.check(ep) {
-                return stream
-                    .throw_error(auth::AuthError::too_many_connections())
-                    .await;
-            }
-        }
-
-        let user = user_info.get_user().to_owned();
-        let auth_result = match user_info
-            .authenticate(ctx, &mut stream, mode.allow_cleartext(), config)
-            .await
-        {
-            Ok(auth_result) => auth_result,
-            Err(e) => {
-                let db = params.get("database");
-                let app = params.get("application_name");
-                let params_span = tracing::info_span!("", ?user, ?db, ?app);
-
-                return stream.throw_error(e).instrument(params_span).await;
-            }
-        };
-
-        let (mut node_info, user_info) = auth_result;
-
-        node_info.allow_self_signed_compute = allow_self_signed_compute;
-
-        let aux = node_info.aux.clone();
-        let mut node = connect_to_compute(ctx, &TcpMechanism { params }, node_info, &user_info)
-            .or_else(|e| stream.throw_error(e))
-            .await?;
-
-        prepare_client_connection(&node, session, &mut stream).await?;
-        // Before proxy passing, forward to compute whatever data is left in the
-        // PqStream input buffer. Normally there is none, but our serverless npm
-        // driver in pipeline mode sends startup, password and first query
-        // immediately after opening the connection.
-        let (stream, read_buf) = stream.into_inner();
-        node.stream.write_all(&read_buf).await?;
-        proxy_pass(ctx, stream, node.stream, aux).await
-    }
-}
-
 #[derive(Debug, Clone, PartialEq, Eq, Default)]
 pub struct NeonOptions(Vec<(SmolStr, SmolStr)>);

--- a/proxy/src/proxy/handshake.rs
+++ b/proxy/src/proxy/handshake.rs
@@ -0,0 +1,96 @@
+use anyhow::{bail, Context};
+use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
+use tokio::io::{AsyncRead, AsyncWrite};
+use tracing::info;
+
+use crate::{
+    cancellation::CancelMap,
+    config::TlsConfig,
+    proxy::{ERR_INSECURE_CONNECTION, ERR_PROTO_VIOLATION},
+    stream::{PqStream, Stream},
+};
+
+/// Establish a (most probably, secure) connection with the client.
+/// For better testing experience, `stream` can be any object satisfying the traits.
+/// It's easier to work with owned `stream` here as we need to upgrade it to TLS;
+/// we also take extra care to propagate only select handshake errors to the client.
+#[tracing::instrument(skip_all)]
+pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
+    stream: S,
+    mut tls: Option<&TlsConfig>,
+    cancel_map: &CancelMap,
+) -> anyhow::Result<Option<(PqStream<Stream<S>>, StartupMessageParams)>> {
+    // Client may try upgrading to each protocol only once
+    let (mut tried_ssl, mut tried_gss) = (false, false);
+
+    let mut stream = PqStream::new(Stream::from_raw(stream));
+    loop {
+        let msg = stream.read_startup_packet().await?;
+        info!("received {msg:?}");
+
+        use FeStartupPacket::*;
+        match msg {
+            SslRequest => match stream.get_ref() {
+                Stream::Raw { .. } if !tried_ssl => {
+                    tried_ssl = true;
+
+                    // We can't perform TLS handshake without a config
+                    let enc = tls.is_some();
+                    stream.write_message(&Be::EncryptionResponse(enc)).await?;
+                    if let Some(tls) = tls.take() {
+                        // Upgrade raw stream into a secure TLS-backed stream.
+                        // NOTE: We've consumed `tls`; this fact will be used later.
+
+                        let (raw, read_buf) = stream.into_inner();
+                        // TODO: Normally, client doesn't send any data before
+                        // server says TLS handshake is ok and read_buf is empty.
+                        // However, you could imagine pipelining of postgres
+                        // SSLRequest + TLS ClientHello in one hunk similar to
+                        // pipelining in our node js driver. We should probably
+                        // support that by chaining read_buf with the stream.
+                        if !read_buf.is_empty() {
+                            bail!("data is sent before server replied with EncryptionResponse");
+                        }
+                        let tls_stream = raw.upgrade(tls.to_server_config()).await?;
+
+                        let (_, tls_server_end_point) = tls
+                            .cert_resolver
+                            .resolve(tls_stream.get_ref().1.server_name())
+                            .context("missing certificate")?;
+
+                        stream = PqStream::new(Stream::Tls {
+                            tls: Box::new(tls_stream),
+                            tls_server_end_point,
+                        });
+                    }
+                }
+                _ => bail!(ERR_PROTO_VIOLATION),
+            },
+            GssEncRequest => match stream.get_ref() {
+                Stream::Raw { .. } if !tried_gss => {
+                    tried_gss = true;
+
+                    // Currently, we don't support GSSAPI
+                    stream.write_message(&Be::EncryptionResponse(false)).await?;
+                }
+                _ => bail!(ERR_PROTO_VIOLATION),
+            },
+            StartupMessage { params, .. } => {
+                // Check that the config has been consumed during upgrade
+                // OR we didn't provide it at all (for dev purposes).
+                if tls.is_some() {
+                    stream.throw_error_str(ERR_INSECURE_CONNECTION).await?;
+                }
+
+                info!(session_type = "normal", "successful handshake");
+                break Ok(Some((stream, params)));
+            }
+            CancelRequest(cancel_key_data) => {
+                cancel_map.cancel_session(cancel_key_data).await?;
+
+                info!(session_type = "cancellation", "successful handshake");
+                break Ok(None);
+            }
+        }
+    }
+}
--- a/proxy/src/proxy/passthrough.rs
+++ b/proxy/src/proxy/passthrough.rs
@@ -0,0 +1,57 @@
+use crate::{
+    console::messages::MetricsAuxInfo,
+    context::RequestMonitoring,
+    metrics::{NUM_BYTES_PROXIED_COUNTER, NUM_BYTES_PROXIED_PER_CLIENT_COUNTER},
+    usage_metrics::{Ids, USAGE_METRICS},
+};
+use tokio::io::{AsyncRead, AsyncWrite};
+use tracing::info;
+use utils::measured_stream::MeasuredStream;
+
+/// Relay traffic between `client` and `compute`, updating the proxied-bytes counters.
+#[tracing::instrument(skip_all)]
+pub async fn proxy_pass(
+    ctx: &mut RequestMonitoring,
+    client: impl AsyncRead + AsyncWrite + Unpin,
+    compute: impl AsyncRead + AsyncWrite + Unpin,
+    aux: MetricsAuxInfo,
+) -> anyhow::Result<()> {
+    ctx.set_success();
+    ctx.log();
+    let usage = USAGE_METRICS.register(Ids {
+        endpoint_id: aux.endpoint_id.clone(),
+        branch_id: aux.branch_id.clone(),
+    });
+
+    // Outbound: bytes we sent to the client; these are also recorded as egress usage.
+    let tx_sum = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]);
+    let tx_cli = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("tx"));
+    let mut client = MeasuredStream::new(
+        client,
+        |_| {},
+        |cnt| {
+            let sent = cnt as u64;
+            tx_sum.inc_by(sent);
+            tx_cli.inc_by(sent);
+            usage.record_egress(sent);
+        },
+    );
+
+    // Inbound: bytes the client sent to the compute node.
+    let rx_sum = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx"]);
+    let rx_cli = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("rx"));
+    let mut compute = MeasuredStream::new(
+        compute,
+        |_| {},
+        |cnt| {
+            let received = cnt as u64;
+            rx_sum.inc_by(received);
+            rx_cli.inc_by(received);
+        },
+    );
+
+    // Starting from here we only proxy the client's traffic.
+    info!("performing the proxy pass...");
+    let (_up, _down) = tokio::io::copy_bidirectional(&mut client, &mut compute).await?;
+    Ok(())
+}
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -41,6 +41,8 @@ use tokio_util::sync::CancellationToken;
 use tracing::{error, info, info_span, warn, Instrument};
 use utils::http::{error::ApiError, json::json_response};

+pub const SERVERLESS_DRIVER_SNI: &str = "api";
+
 pub async fn task_main(
    config: &'static ProxyConfig,
    ws_listener: TcpListener,
@@ -228,7 +230,7 @@ async fn request_handler(
                    config,
                    &mut ctx,
                    websocket,
-                    &cancel_map,
+                    cancel_map,
                    host,
                    endpoint_rate_limiter,
                )
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -1,6 +1,7 @@
 use std::sync::Arc;

 use anyhow::bail;
+use anyhow::Context;
 use futures::pin_mut;
 use futures::StreamExt;
 use hyper::body::HttpBody;
@@ -35,11 +36,11 @@ use crate::config::TlsConfig;
 use crate::context::RequestMonitoring;
 use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE;
 use crate::proxy::NeonOptions;
-use crate::EndpointId;
 use crate::RoleName;

 use super::conn_pool::ConnInfo;
 use super::conn_pool::GlobalConnPool;
+use super::SERVERLESS_DRIVER_SNI;

 #[derive(serde::Deserialize)]
 struct QueryData {
@@ -61,7 +62,6 @@ enum Payload {

 const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MiB
 const MAX_REQUEST_SIZE: u64 = 10 * 1024 * 1024; // 10 MiB
-const SERVERLESS_DRIVER_SNI_HOSTNAME_FIRST_PART: &str = "api";

 static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
 static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
@@ -188,10 +188,8 @@ fn get_conn_info(
        }
    }

-    let endpoint = endpoint_sni(hostname, &tls.common_names)?;
-
-    let endpoint: EndpointId = endpoint.into();
-    ctx.set_endpoint_id(Some(endpoint.clone()));
+    let endpoint = endpoint_sni(hostname, &tls.common_names)?.context("malformed endpoint")?;
+    ctx.set_endpoint_id(endpoint.clone());

    let pairs = connection_url.query_pairs();

@@ -227,8 +225,7 @@ fn check_matches(sni_hostname: &str, hostname: &str) -> Result<bool, anyhow::Err
    let (_, hostname_rest) = hostname
        .split_once('.')
        .ok_or_else(|| anyhow::anyhow!("Unexpected hostname format."))?;
-    Ok(sni_hostname_rest == hostname_rest
-        && sni_hostname_first == SERVERLESS_DRIVER_SNI_HOSTNAME_FIRST_PART)
+    Ok(sni_hostname_rest == hostname_rest && sni_hostname_first == SERVERLESS_DRIVER_SNI)
 }

 // TODO: return different http error codes
--- a/proxy/src/serverless/websocket.rs
+++ b/proxy/src/serverless/websocket.rs
@@ -133,7 +133,7 @@ pub async fn serve_websocket(
    config: &'static ProxyConfig,
    ctx: &mut RequestMonitoring,
    websocket: HyperWebsocket,
-    cancel_map: &CancelMap,
+    cancel_map: Arc<CancelMap>,
    hostname: Option<String>,
    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
 ) -> anyhow::Result<()> {
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -33,7 +33,7 @@ psutil = "^5.9.4"
 types-psutil = "^5.9.5.12"
 types-toml = "^0.10.8.6"
 pytest-httpserver = "^1.0.8"
-aiohttp = "3.9.0"
+aiohttp = "3.9.2"
 pytest-rerunfailures = "^13.0"
 types-pytest-lazy-fixture = "^0.6.3.3"
 pytest-split = "^0.8.1"
--- a/safekeeper/src/control_file.rs
+++ b/safekeeper/src/control_file.rs
@@ -3,8 +3,9 @@
 use anyhow::{bail, ensure, Context, Result};
 use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
 use camino::Utf8PathBuf;
-use tokio::fs::{self, File};
+use tokio::fs::File;
 use tokio::io::AsyncWriteExt;
+use utils::crashsafe::durable_rename;

 use std::io::Read;
 use std::ops::Deref;
@@ -203,35 +204,8 @@ impl Storage for FileStorage {
            )
        })?;

-        // fsync the file
-        if !self.conf.no_sync {
-            control_partial.sync_all().await.with_context(|| {
-                format!(
-                    "failed to sync partial control file at {}",
-                    control_partial_path
-                )
-            })?;
-        }
-
        let control_path = self.timeline_dir.join(CONTROL_FILE_NAME);
-
-        // rename should be atomic
-        fs::rename(&control_partial_path, &control_path).await?;
-        // this sync is not required by any standard but postgres does this (see durable_rename)
-        if !self.conf.no_sync {
-            let new_f = File::open(&control_path).await?;
-            new_f
-                .sync_all()
-                .await
-                .with_context(|| format!("failed to sync control file at: {}", &control_path))?;
-
-            // fsync the directory (linux specific)
-            let tli_dir = File::open(&self.timeline_dir).await?;
-            tli_dir
-                .sync_all()
-                .await
-                .context("failed to sync control file directory")?;
-        }
+        durable_rename(&control_partial_path, &control_path, !self.conf.no_sync).await?;

        // update internal state
        self.state = s.clone();
@@ -249,6 +223,7 @@ mod test {
    use super::*;
    use crate::SafeKeeperConf;
    use anyhow::Result;
+    use tokio::fs;
    use utils::{id::TenantTimelineId, lsn::Lsn};

    fn stub_conf() -> SafeKeeperConf {
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -28,7 +28,7 @@ use crate::safekeeper::Term;
 use crate::safekeeper::{ServerInfo, TermLsn};
 use crate::send_wal::WalSenderState;
 use crate::timeline::PeerInfo;
-use crate::{copy_timeline, debug_dump, pull_timeline};
+use crate::{copy_timeline, debug_dump, patch_control_file, pull_timeline};

 use crate::timelines_global_map::TimelineDeleteForceResult;
 use crate::GlobalTimelines;
@@ -465,6 +465,26 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
    Ok(response)
 }

+/// HTTP handler: patch selected control file fields of one timeline.
+async fn patch_control_file_handler(
+    mut request: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    check_permission(&request, None)?;
+
+    // Resolve the timeline from the `tenant_id` / `timeline_id` request params.
+    let tenant_id = parse_request_param(&request, "tenant_id")?;
+    let timeline_id = parse_request_param(&request, "timeline_id")?;
+    let ttid = TenantTimelineId::new(tenant_id, timeline_id);
+    let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
+
+    // The JSON body is read only after the timeline lookup succeeded.
+    let patch: patch_control_file::Request = json_request(&mut request).await?;
+    match patch_control_file::handle_request(tli, patch).await {
+        Ok(outcome) => json_response(StatusCode::OK, outcome),
+        Err(e) => Err(ApiError::InternalServerError(e)),
+    }
+}
+
 /// Safekeeper http router.
 pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError> {
    let mut router = endpoint::make_router();
@@ -526,6 +546,10 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
            "/v1/tenant/:tenant_id/timeline/:source_timeline_id/copy",
            |r| request_span(r, timeline_copy_handler),
        )
+        .patch(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/control_file",
+            |r| request_span(r, patch_control_file_handler),
+        )
        // for tests
        .post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| {
            request_span(r, record_safekeeper_info)
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -22,6 +22,7 @@ pub mod handler;
 pub mod http;
 pub mod json_ctrl;
 pub mod metrics;
+pub mod patch_control_file;
 pub mod pull_timeline;
 pub mod receive_wal;
 pub mod recovery;
--- a/safekeeper/src/patch_control_file.rs
+++ b/safekeeper/src/patch_control_file.rs
@@ -0,0 +1,85 @@
+use std::sync::Arc;
+
+use serde::{Deserialize, Serialize};
+use serde_json::Value;
+use tracing::info;
+
+use crate::{state::TimelinePersistentState, timeline::Timeline};
+
+#[derive(Deserialize, Debug, Clone)]
+pub struct Request {
+    /// JSON object with fields to update
+    pub updates: serde_json::Value,
+    /// List of fields to apply
+    pub apply_fields: Vec<String>,
+}
+
+#[derive(Serialize)]
+pub struct Response {
+    pub old_control_file: TimelinePersistentState,
+    pub new_control_file: TimelinePersistentState,
+}
+
+/// Apply `request` to the timeline's control file and persist the result.
+///
+/// Returns both the previous and the patched state.
+pub async fn handle_request(tli: Arc<Timeline>, request: Request) -> anyhow::Result<Response> {
+    let outcome = tli.map_control_file(|state| {
+        let before = state.clone();
+        let after = state_apply_diff(&before, &request)?;
+
+        info!(
+            "patching control file, old: {:?}, new: {:?}, patch: {:?}",
+            before, after, request
+        );
+
+        // Install the patched state; `map_control_file` persists it after we return Ok.
+        *state = after.clone();
+        Ok(Response {
+            old_control_file: before,
+            new_control_file: after,
+        })
+    });
+
+    Ok(outcome.await?)
+}
+
+/// Build a patched copy of `state` by round-tripping it through JSON and
+/// overwriting the requested fields; `state` itself is left untouched.
+fn state_apply_diff(
+    state: &TimelinePersistentState,
+    request: &Request,
+) -> anyhow::Result<TimelinePersistentState> {
+    let mut serialized = serde_json::to_value(state)?;
+
+    let fields = match &mut serialized {
+        Value::Object(map) => map,
+        _ => anyhow::bail!("TimelinePersistentState is not a json object"),
+    };
+    match &request.updates {
+        Value::Object(updates) => json_apply_diff(fields, updates, &request.apply_fields)?,
+        _ => anyhow::bail!("request.updates is not a json object"),
+    }
+
+    Ok(serde_json::from_value(serialized)?)
+}
+
+/// Copy `updates[key]` into `object[key]` for every key in `apply_keys`.
+/// Fails if a key is missing from either map; `object` may then be half-patched.
+fn json_apply_diff(
+    object: &mut serde_json::Map<String, Value>,
+    updates: &serde_json::Map<String, Value>,
+    apply_keys: &[String],
+) -> anyhow::Result<()> {
+    for key in apply_keys {
+        let new_value = updates
+            .get(key)
+            .ok_or_else(|| anyhow::anyhow!("key not found in request.updates: {}", key))?;
+        let slot = object
+            .get_mut(key)
+            .ok_or_else(|| anyhow::anyhow!("key not found in original object: {}", key))?;
+        *slot = new_value.clone();
+    }
+
+    Ok(())
+}
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -901,6 +901,20 @@ impl Timeline {
            file_open,
        }
    }
+
+    /// Apply `f` to the control file state obtained from `start_change()` and
+    /// persist it with `finish_change()`; an error from either aborts the change.
+    pub async fn map_control_file<T>(
+        &self,
+        f: impl FnOnce(&mut TimelinePersistentState) -> Result<T>,
+    ) -> Result<T> {
+        let mut shared = self.write_shared_state().await;
+        let mut pending = shared.sk.state.start_change();
+        // `?` returns early when `f` fails, before anything is persisted.
+        let output = f(&mut pending)?;
+        shared.sk.state.finish_change(&pending).await?;
+        Ok(output)
+    }
 }

 /// Deletes directory and it's contents. Returns false if directory does not exist.
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -21,6 +21,7 @@ use tokio::fs::{self, remove_file, File, OpenOptions};
 use tokio::io::{AsyncRead, AsyncWriteExt};
 use tokio::io::{AsyncReadExt, AsyncSeekExt};
 use tracing::*;
+use utils::crashsafe::durable_rename;

 use crate::metrics::{time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS};
 use crate::state::TimelinePersistentState;
@@ -196,15 +197,6 @@ impl PhysicalStorage {
        Ok(())
    }

-    /// Call fsync if config requires so.
-    async fn fsync_file(&mut self, file: &File) -> Result<()> {
-        if !self.conf.no_sync {
-            self.metrics
-                .observe_flush_seconds(time_io_closure(file.sync_all()).await?);
-        }
-        Ok(())
-    }
-
    /// Open or create WAL segment file. Caller must call seek to the wanted position.
    /// Returns `file` and `is_partial`.
    async fn open_or_create(&mut self, segno: XLogSegNo) -> Result<(File, bool)> {
@@ -223,15 +215,33 @@ impl PhysicalStorage {
            Ok((file, true))
        } else {
            // Create and fill new partial file
+            //
+            // We're using fdatasync during WAL writing, so file size must not
+            // change; to this end it is filled with zeros here. To avoid using
+            // half initialized segment, first bake it under tmp filename and
+            // then rename.
+            let tmp_path = self.timeline_dir.join("waltmp");
            let mut file = OpenOptions::new()
                .create(true)
                .write(true)
-                .open(&wal_file_partial_path)
+                .open(&tmp_path)
                .await
-                .with_context(|| format!("Failed to open log file {:?}", &wal_file_path))?;
+                .with_context(|| format!("Failed to open tmp wal file {:?}", &tmp_path))?;

            write_zeroes(&mut file, self.wal_seg_size).await?;
-            self.fsync_file(&file).await?;
+
+            // Note: this doesn't get into observe_flush_seconds metric. But
+            // segment init should be separate metric, if any.
+            if let Err(e) =
+                durable_rename(&tmp_path, &wal_file_partial_path, !self.conf.no_sync).await
+            {
+                // Probably rename succeeded, but fsync of it failed. Remove
+                // the file then to avoid using it.
+                remove_file(wal_file_partial_path)
+                    .await
+                    .or_else(utils::fs_ext::ignore_not_found)?;
+                return Err(e.into());
+            }
            Ok((file, true))
        }
    }
@@ -718,6 +728,11 @@ const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ];

 /// Helper for filling file with zeroes.
 async fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> {
+    fail::fail_point!("sk-write-zeroes", |_| {
+        info!("write_zeroes hit failpoint");
+        Err(anyhow::anyhow!("failpoint: sk-write-zeroes"))
+    });
+
    while count >= XLOG_BLCKSZ {
        file.write_all(ZERO_BLOCK).await?;
        count -= XLOG_BLCKSZ;
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -993,13 +993,20 @@ class NeonEnv:
        self.initial_tenant = config.initial_tenant
        self.initial_timeline = config.initial_timeline

-        attachment_service_port = self.port_distributor.get_port()
-        # Reserve the next port after attachment service for use by its postgres: this
-        # will assert out if the next port wasn't free.
-        attachment_service_pg_port = self.port_distributor.get_port()
-        assert attachment_service_pg_port == attachment_service_port + 1
+        # Find two adjacent ports for attachment service and its postgres DB.  This
+        # loop would eventually throw from get_port() if we run out of ports (extremely
+        # unlikely): usually we find two adjacent free ports on the first iteration.
+        while True:
+            self.attachment_service_port = self.port_distributor.get_port()
+            attachment_service_pg_port = self.port_distributor.get_port()
+            if attachment_service_pg_port == self.attachment_service_port + 1:
+                break
+
+        # The URL for the pageserver to use as its control_plane_api config
+        self.control_plane_api: str = f"http://127.0.0.1:{self.attachment_service_port}/upcall/v1"
+        # The base URL of the attachment service
+        self.attachment_service_api: str = f"http://127.0.0.1:{self.attachment_service_port}"

-        self.control_plane_api: str = f"http://127.0.0.1:{attachment_service_port}"
        self.attachment_service: NeonAttachmentService = NeonAttachmentService(
            self, config.auth_enabled
        )
@@ -1914,6 +1921,14 @@ class NeonAttachmentService:
            self.running = False
        return self

+    def pageserver_api(self) -> PageserverHttpClient:
+        """
+        The attachment service implements a subset of the pageserver REST API, for mapping
+        per-tenant actions into per-shard actions (e.g. timeline creation).  Tests should invoke those
+        functions via the HttpClient, as an implicit check that these APIs remain compatible.
+        """
+        return PageserverHttpClient(self.env.attachment_service_port, lambda: True)
+
    def request(self, method, *args, **kwargs) -> requests.Response:
        kwargs["headers"] = self.headers()
        return requests.request(method, *args, **kwargs)
@@ -1931,7 +1946,7 @@ class NeonAttachmentService:
    ) -> int:
        response = self.request(
            "POST",
-            f"{self.env.control_plane_api}/attach-hook",
+            f"{self.env.attachment_service_api}/debug/v1/attach-hook",
            json={"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id},
            headers=self.headers(),
        )
@@ -1943,7 +1958,7 @@ class NeonAttachmentService:
    def attach_hook_drop(self, tenant_shard_id: Union[TenantId, TenantShardId]):
        response = self.request(
            "POST",
-            f"{self.env.control_plane_api}/attach-hook",
+            f"{self.env.attachment_service_api}/debug/v1/attach-hook",
            json={"tenant_shard_id": str(tenant_shard_id), "node_id": None},
            headers=self.headers(),
        )
@@ -1955,7 +1970,7 @@ class NeonAttachmentService:
        """
        response = self.request(
            "POST",
-            f"{self.env.control_plane_api}/inspect",
+            f"{self.env.attachment_service_api}/debug/v1/inspect",
            json={"tenant_shard_id": str(tenant_shard_id)},
            headers=self.headers(),
        )
@@ -1976,7 +1991,27 @@ class NeonAttachmentService:
        }
        log.info(f"node_register({body})")
        self.request(
-            "POST", f"{self.env.control_plane_api}/node", json=body, headers=self.headers()
+            "POST",
+            f"{self.env.attachment_service_api}/control/v1/node",
+            json=body,
+            headers=self.headers(),
+        ).raise_for_status()
+
+    def node_list(self):
+        """GET the attachment service's node listing and return the decoded JSON body."""
+        url = f"{self.env.attachment_service_api}/control/v1/node"
+        resp = self.request("GET", url, headers=self.headers())
+        resp.raise_for_status()
+        return resp.json()
+
+    def node_configure(self, node_id, body: dict[str, Any]):
+        """PUT `body` plus `node_id` as the node's config; the caller's dict is not modified."""
+        log.info(f"node_configure({node_id}, {body})")
+        self.request(
+            "PUT",
+            f"{self.env.attachment_service_api}/control/v1/node/{node_id}/config",
+            json={**body, "node_id": node_id},
+            headers=self.headers(),
         ).raise_for_status()

    def tenant_create(
@@ -1986,6 +2021,9 @@ class NeonAttachmentService:
        shard_stripe_size: Optional[int] = None,
        tenant_config: Optional[Dict[Any, Any]] = None,
    ):
+        """
+        Use this rather than pageserver_api() when you need to include shard parameters
+        """
        body: Dict[str, Any] = {"new_tenant_id": str(tenant_id)}

        if shard_count is not None:
@@ -1999,21 +2037,17 @@ class NeonAttachmentService:
            for k, v in tenant_config.items():
                body[k] = v

-        response = self.request("POST", f"{self.env.control_plane_api}/tenant", json=body)
+        response = self.request("POST", f"{self.env.attachment_service_api}/v1/tenant", json=body)
        response.raise_for_status()
        log.info(f"tenant_create success: {response.json()}")

-    def tenant_timeline_create(self, tenant_id: TenantId, timeline_id: TimelineId):
-        body: Dict[str, Any] = {"new_timeline_id": str(timeline_id)}
-
-        response = self.request(
-            "POST", f"{self.env.control_plane_api}/tenant/{tenant_id}/timeline", json=body
-        )
-        response.raise_for_status()
-        log.info(f"tenant_timeline_create success: {response.json()}")
-
    def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]:
-        response = self.request("GET", f"{self.env.control_plane_api}/tenant/{tenant_id}/locate")
+        """
+        :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr": str, "listen_http_port": int}
+        """
+        response = self.request(
+            "GET", f"{self.env.attachment_service_api}/control/v1/tenant/{tenant_id}/locate"
+        )
        response.raise_for_status()
        body = response.json()
        shards: list[dict[str, Any]] = body["shards"]
@@ -2022,7 +2056,7 @@ class NeonAttachmentService:
    def tenant_shard_split(self, tenant_id: TenantId, shard_count: int) -> list[TenantShardId]:
        response = self.request(
            "PUT",
-            f"{self.env.control_plane_api}/tenant/{tenant_id}/shard_split",
+            f"{self.env.attachment_service_api}/control/v1/tenant/{tenant_id}/shard_split",
            json={"new_shard_count": shard_count},
        )
        response.raise_for_status()
@@ -2034,7 +2068,7 @@ class NeonAttachmentService:
    def tenant_shard_migrate(self, tenant_shard_id: TenantShardId, dest_ps_id: int):
        response = self.request(
            "PUT",
-            f"{self.env.control_plane_api}/tenant/{tenant_shard_id}/migrate",
+            f"{self.env.attachment_service_api}/control/v1/tenant/{tenant_shard_id}/migrate",
            json={"tenant_shard_id": str(tenant_shard_id), "node_id": dest_ps_id},
        )
        response.raise_for_status()
@@ -3062,6 +3096,17 @@ class Endpoint(PgProtocol):

        return self

+    def edit_hba(self, hba: List[str]):
+        """Insert `hba` lines at the top of pg_hba.conf; reload config if the endpoint runs."""
+        hba_path = os.path.join(self.pg_data_dir_path(), "pg_hba.conf")
+        with open(hba_path, "r+") as f:
+            previous = f.read()
+            f.seek(0)
+            f.write("\n".join(hba) + "\n" + previous)
+
+        if self.running:
+            self.safe_psql("SELECT pg_reload_conf()")
+
    def reconfigure(self, pageserver_id: Optional[int] = None):
        assert self.endpoint_id is not None
        self.env.neon_cli.endpoint_reconfigure(self.endpoint_id, self.tenant_id, pageserver_id)
@@ -3443,6 +3488,24 @@ class SafekeeperHttpClient(requests.Session):
        assert isinstance(res_json, dict)
        return res_json

+    def patch_control_file(
+        self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        patch: Dict[str, Any],
+    ) -> Dict[str, Any]:
+        """
+        PATCH the timeline's control file, applying every key of `patch`.
+        Returns the decoded JSON response, which must be an object.
+        """
+        base = f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}"
+        payload = {"updates": patch, "apply_fields": list(patch)}
+        res = self.patch(f"{base}/control_file", json=payload)
+        res.raise_for_status()
+        body = res.json()
+        assert isinstance(body, dict)
+        return body
+
    def pull_timeline(self, body: Dict[str, Any]) -> Dict[str, Any]:
        res = self.post(f"http://localhost:{self.port}/v1/pull_timeline", json=body)
        res.raise_for_status()
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -549,17 +549,12 @@ class PageserverHttpClient(requests.Session):
        tenant_id: Union[TenantId, TenantShardId],
        timeline_id: TimelineId,
        timestamp,
-        version: Optional[int] = None,
    ):
        log.info(
            f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}"
        )
-        if version is None:
-            version_str = ""
-        else:
-            version_str = f"&version={version}"
        res = self.get(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}{version_str}",
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}",
        )
        self.verbose_error(res)
        res_json = res.json()
--- a/test_runner/fixtures/pg_version.py
+++ b/test_runner/fixtures/pg_version.py
@@ -52,7 +52,7 @@ class PgVersion(str, enum.Enum):
        return None


-DEFAULT_VERSION: PgVersion = PgVersion.V14
+DEFAULT_VERSION: PgVersion = PgVersion.V15


 def skip_on_postgres(version: PgVersion, reason: str):
@@ -78,6 +78,13 @@ def pytest_addoption(parser: Parser):
    )


+def run_only_on_default_postgres(reason: str):
+    return pytest.mark.skipif(
+        PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION)) is not DEFAULT_VERSION,
+        reason=reason,
+    )
+
+
 def pytest_configure(config: Config):
    if config.getoption("--pg-version"):
        raise Exception("--pg-version is deprecated, use DEFAULT_PG_VERSION env var instead")
--- a/test_runner/regress/test_logging.py
+++ b/test_runner/regress/test_logging.py
@@ -3,10 +3,12 @@ import uuid
 import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder
+from fixtures.pg_version import run_only_on_default_postgres
 from fixtures.utils import wait_until


@pytest.mark.parametrize("level", ["trace", "debug", "info", "warn", "error"])
+@run_only_on_default_postgres("it does not use any postgres functionality")
 def test_logging_event_count(neon_env_builder: NeonEnvBuilder, level: str):
    # self-test: make sure the event is logged (i.e., our testing endpoint works)
    log_expected = {
--- a/test_runner/regress/test_lsn_mapping.py
+++ b/test_runner/regress/test_lsn_mapping.py
@@ -109,7 +109,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
        # Timestamp is in the unreachable past
        probe_timestamp = tbl[0][1] - timedelta(hours=10)
        result = client.timeline_get_lsn_by_timestamp(
-            tenant_id, timeline_id_child, f"{probe_timestamp.isoformat()}Z", 2
+            tenant_id, timeline_id_child, f"{probe_timestamp.isoformat()}Z"
        )
        assert result["kind"] == "past"
        # make sure that we return the minimum lsn here at the start of the range
--- a/test_runner/regress/test_migrations.py
+++ b/test_runner/regress/test_migrations.py
@@ -18,11 +18,11 @@ def test_migrations(neon_simple_env: NeonEnv):
    with endpoint.cursor() as cur:
        cur.execute("SELECT id FROM neon_migration.migration_id")
        migration_id = cur.fetchall()
-        assert migration_id[0][0] == 2
+        assert migration_id[0][0] == 3

    with open(log_path, "r") as log_file:
        logs = log_file.read()
-        assert "INFO handle_migrations: Ran 2 migrations" in logs
+        assert "INFO handle_migrations: Ran 3 migrations" in logs

    endpoint.stop()
    endpoint.start()
@@ -30,7 +30,7 @@ def test_migrations(neon_simple_env: NeonEnv):
    with endpoint.cursor() as cur:
        cur.execute("SELECT id FROM neon_migration.migration_id")
        migration_id = cur.fetchall()
-        assert migration_id[0][0] == 2
+        assert migration_id[0][0] == 3

    with open(log_path, "r") as log_file:
        logs = log_file.read()
--- a/test_runner/regress/test_neon_superuser.py
+++ b/test_runner/regress/test_neon_superuser.py
@@ -1,26 +1,44 @@
 import time

+from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnv
 from fixtures.pg_version import PgVersion


 def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion):
    env = neon_simple_env
-    env.neon_cli.create_branch("test_neon_superuser", "empty")
-    endpoint = env.endpoints.create("test_neon_superuser")
-    endpoint.respec(skip_pg_catalog_updates=False, features=["migrations"])
-    endpoint.start()
+    env.neon_cli.create_branch("test_neon_superuser_publisher", "empty")
+    pub = env.endpoints.create("test_neon_superuser_publisher")
+
+    env.neon_cli.create_branch("test_neon_superuser_subscriber")
+    sub = env.endpoints.create("test_neon_superuser_subscriber")
+
+    pub.respec(skip_pg_catalog_updates=False, features=["migrations"])
+    pub.start()
+
+    sub.respec(skip_pg_catalog_updates=False, features=["migrations"])
+    sub.start()

    time.sleep(1)  # Sleep to let migrations run

-    with endpoint.cursor() as cur:
+    with pub.cursor() as cur:
        cur.execute(
            "CREATE ROLE mr_whiskers WITH PASSWORD 'cat' LOGIN INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser"
        )
        cur.execute("CREATE DATABASE neondb WITH OWNER mr_whiskers")
        cur.execute("GRANT ALL PRIVILEGES ON DATABASE neondb TO neon_superuser")

-    with endpoint.cursor(dbname="neondb", user="mr_whiskers", password="cat") as cur:
+        # If we don't do this, creating the subscription will fail later on PG16
+        pub.edit_hba(["host all mr_whiskers 0.0.0.0/0 md5"])
+
+    with sub.cursor() as cur:
+        cur.execute(
+            "CREATE ROLE mr_whiskers WITH PASSWORD 'cat' LOGIN INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser"
+        )
+        cur.execute("CREATE DATABASE neondb WITH OWNER mr_whiskers")
+        cur.execute("GRANT ALL PRIVILEGES ON DATABASE neondb TO neon_superuser")
+
+    with pub.cursor(dbname="neondb", user="mr_whiskers", password="cat") as cur:
        cur.execute("SELECT pg_has_role('mr_whiskers', 'neon_superuser', 'member')")
        assert cur.fetchall()[0][0]
        cur.execute("SELECT pg_has_role('mr_whiskers', 'neon_superuser', 'usage')")
@@ -32,3 +50,28 @@ def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion):

        cur.execute("CREATE PUBLICATION pub FOR ALL TABLES")
        cur.execute("CREATE ROLE definitely_not_a_superuser WITH PASSWORD 'nope'")
+        cur.execute("CREATE DATABASE definitely_a_database")
+        cur.execute("CREATE TABLE t (a int)")
+        cur.execute("INSERT INTO t VALUES (10), (20)")
+        cur.execute("SELECT * from t")
+        res = cur.fetchall()
+        assert [r[0] for r in res] == [10, 20]
+
+    with sub.cursor(dbname="neondb", user="mr_whiskers", password="cat") as cur:
+        cur.execute("CREATE TABLE t (a int)")
+
+        pub_conn = f"host=localhost port={pub.pg_port} dbname=neondb user=mr_whiskers password=cat"
+        query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub"
+        log.info(f"Creating subscription: {query}")
+        cur.execute(query)
+
+        with pub.cursor(dbname="neondb", user="mr_whiskers", password="cat") as pcur:
+            pcur.execute("INSERT INTO t VALUES (30), (40)")
+
+        time.sleep(1)  # Give the change time to propagate
+
+        cur.execute("SELECT * FROM t")
+        res = cur.fetchall()
+        log.info(res)
+        assert len(res) == 4
+        assert [r[0] for r in res] == [10, 20, 30, 40]
--- a/test_runner/regress/test_next_xid.py
+++ b/test_runner/regress/test_next_xid.py
@@ -203,6 +203,16 @@ def test_import_at_2bil(
        $$;
        """
    )
+
+    # Also create a multi-XID with members past the 2 billion mark
+    conn2 = endpoint.connect()
+    cur2 = conn2.cursor()
+    cur.execute("INSERT INTO t VALUES ('x')")
+    cur.execute("BEGIN; select * from t WHERE t = 'x' FOR SHARE;")
+    cur2.execute("BEGIN; select * from t WHERE t = 'x' FOR SHARE;")
+    cur.execute("COMMIT")
+    cur2.execute("COMMIT")
+
    # A checkpoint writes a WAL record with xl_xid=0. Many other WAL
    # records would have the same effect.
    cur.execute("checkpoint")
@@ -217,4 +227,4 @@ def test_import_at_2bil(
    conn = endpoint.connect()
    cur = conn.cursor()
    cur.execute("SELECT count(*) from t")
-    assert cur.fetchone() == (10000 + 1,)
+    assert cur.fetchone() == (10000 + 1 + 1,)
--- a/test_runner/regress/test_sharding_service.py
+++ b/test_runner/regress/test_sharding_service.py
@@ -0,0 +1,272 @@
+import time
+from collections import defaultdict
+
+from fixtures.neon_fixtures import (
+    NeonEnvBuilder,
+)
+from fixtures.pageserver.http import PageserverHttpClient
+from fixtures.pageserver.utils import tenant_delete_wait_completed, timeline_delete_wait_completed
+from fixtures.pg_version import PgVersion
+from fixtures.types import TenantId, TimelineId
+from fixtures.utils import wait_until
+
+
+def test_sharding_service_smoke(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    Test the basic lifecycle of a sharding service:
+    - Restarting
+    - Restarting a pageserver
+    - Creating and deleting tenants and timelines
+    - Marking a pageserver offline
+    """
+
+    neon_env_builder.num_pageservers = 3
+    env = neon_env_builder.init_configs()
+
+    # Start services by hand so that we can skip a pageserver (it will be started and registered later)
+    env.broker.try_start()
+    env.attachment_service.start()
+    env.pageservers[0].start()
+    env.pageservers[1].start()
+    for sk in env.safekeepers:
+        sk.start()
+
+    # The pageservers we started should have registered with the sharding service on startup
+    nodes = env.attachment_service.node_list()
+    assert len(nodes) == 2
+    assert set(n["node_id"] for n in nodes) == {env.pageservers[0].id, env.pageservers[1].id}
+
+    # Starting an additional pageserver should register successfully
+    env.pageservers[2].start()
+    nodes = env.attachment_service.node_list()
+    assert len(nodes) == 3
+    assert set(n["node_id"] for n in nodes) == {ps.id for ps in env.pageservers}
+
+    # Use a multiple of the pageserver count to get an even number of shards on each one
+    tenant_shard_count = len(env.pageservers) * 4
+    tenant_count = len(env.pageservers) * 2
+    shards_per_tenant = tenant_shard_count // tenant_count
+    tenant_ids = set(TenantId.generate() for i in range(0, tenant_count))
+
+    # Creating several tenants should spread out across the pageservers
+    for tid in tenant_ids:
+        env.neon_cli.create_tenant(tid, shard_count=shards_per_tenant)
+
+    def get_node_shard_counts():
+        counts: defaultdict[str, int] = defaultdict(int)
+        for tid in tenant_ids:
+            for shard in env.attachment_service.locate(tid):
+                counts[shard["node_id"]] += 1
+        return counts
+
+    for node_id, count in get_node_shard_counts().items():
+        # we used a multiple of pageservers for the total shard count,
+        # so expect equal number on all pageservers
+        assert count == tenant_shard_count / len(
+            env.pageservers
+        ), f"Node {node_id} has bad count {count}"
+
+    # Creating and deleting timelines should work, using identical API to pageserver
+    timeline_crud_tenant = next(iter(tenant_ids))
+    timeline_id = TimelineId.generate()
+    env.attachment_service.pageserver_api().timeline_create(
+        pg_version=PgVersion.NOT_SET, tenant_id=timeline_crud_tenant, new_timeline_id=timeline_id
+    )
+    timelines = env.attachment_service.pageserver_api().timeline_list(timeline_crud_tenant)
+    assert len(timelines) == 2
+    assert timeline_id in set(TimelineId(t["timeline_id"]) for t in timelines)
+    #    virtual_ps_http.timeline_delete(tenant_id=timeline_crud_tenant, timeline_id=timeline_id)
+    timeline_delete_wait_completed(
+        env.attachment_service.pageserver_api(), timeline_crud_tenant, timeline_id
+    )
+    timelines = env.attachment_service.pageserver_api().timeline_list(timeline_crud_tenant)
+    assert len(timelines) == 1
+    assert timeline_id not in set(TimelineId(t["timeline_id"]) for t in timelines)
+
+    # Marking a pageserver offline should migrate tenants away from it.
+    env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Offline"})
+
+    def node_evacuated(node_id: int):
+        counts = get_node_shard_counts()
+        assert counts[node_id] == 0
+
+    wait_until(10, 1, lambda: node_evacuated(env.pageservers[0].id))
+
+    # Marking pageserver active should not migrate anything to it
+    # immediately
+    env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Active"})
+    time.sleep(1)
+    assert get_node_shard_counts()[env.pageservers[0].id] == 0
+
+    # Delete all the tenants
+    for tid in tenant_ids:
+        tenant_delete_wait_completed(env.attachment_service.pageserver_api(), tid, 10)
+
+    # Set a scheduling policy on one node, create all the tenants, observe
+    # that the scheduling policy is respected.
+    env.attachment_service.node_configure(env.pageservers[1].id, {"scheduling": "Draining"})
+
+    # Create some fresh tenants
+    tenant_ids = set(TenantId.generate() for i in range(0, tenant_count))
+    for tid in tenant_ids:
+        env.neon_cli.create_tenant(tid, shard_count=shards_per_tenant)
+
+    counts = get_node_shard_counts()
+    # Nothing should have been scheduled on the node in Draining
+    assert counts[env.pageservers[1].id] == 0
+    assert counts[env.pageservers[0].id] == tenant_shard_count // 2
+    assert counts[env.pageservers[2].id] == tenant_shard_count // 2
+
+
+def test_sharding_service_passthrough(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    For simple timeline/tenant GET APIs that don't require coordination across
+    shards, the sharding service implements a proxy to shard zero.  This test
+    calls those APIs.
+    """
+    neon_env_builder.num_pageservers = 2
+    env = neon_env_builder.init_start()
+
+    # We will talk to attachment service as if it was a pageserver, using the pageserver
+    # HTTP client
+    client = PageserverHttpClient(env.attachment_service_port, lambda: True)
+    timelines = client.timeline_list(tenant_id=env.initial_tenant)
+    assert len(timelines) == 1
+
+
+def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_start()
+    tenant_a = env.initial_tenant
+    tenant_b = TenantId.generate()
+    env.attachment_service.tenant_create(tenant_b)
+    env.pageserver.tenant_detach(tenant_a)
+
+    # TODO: extend this test to use multiple pageservers, and check that locations don't move around
+    # on restart.
+
+    # Attachment service restart
+    env.attachment_service.stop()
+    env.attachment_service.start()
+
+    observed = set(TenantId(tenant["id"]) for tenant in env.pageserver.http_client().tenant_list())
+
+    # Tenant A was detached before the restart and should remain detached
+    assert tenant_a not in observed
+
+    # Tenant B was created via the attachment service and should still be attached
+    assert tenant_b in observed
+
+    # Pageserver restart
+    env.pageserver.stop()
+    env.pageserver.start()
+
+    # Same assertions as above: restarting either service should not perturb things
+    observed = set(TenantId(tenant["id"]) for tenant in env.pageserver.http_client().tenant_list())
+    assert tenant_a not in observed
+    assert tenant_b in observed
+
+
+def test_sharding_service_onboarding(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    We onboard tenants to the sharding service by treating it as a 'virtual pageserver'
+    which provides the /location_config API.  This is similar to creating a tenant,
+    but imports the generation number.
+    """
+
+    neon_env_builder.num_pageservers = 2
+
+    # Start services by hand so that we can skip registration on one of the pageservers
+    env = neon_env_builder.init_configs()
+    env.broker.try_start()
+    env.attachment_service.start()
+
+    # This is the pageserver where we'll initially create the tenant
+    env.pageservers[0].start(register=False)
+    origin_ps = env.pageservers[0]
+
+    # This is the pageserver managed by the sharding service, where the tenant
+    # will be attached after onboarding
+    env.pageservers[1].start(register=True)
+    dest_ps = env.pageservers[1]
+    virtual_ps_http = PageserverHttpClient(env.attachment_service_port, lambda: True)
+
+    for sk in env.safekeepers:
+        sk.start()
+
+    # Create a tenant directly via pageserver HTTP API, skipping the attachment service
+    tenant_id = TenantId.generate()
+    generation = 123
+    origin_ps.http_client().tenant_create(tenant_id, generation=generation)
+
+    # As if doing a live migration, first configure origin into stale mode
+    origin_ps.http_client().tenant_location_conf(
+        tenant_id,
+        {
+            "mode": "AttachedStale",
+            "secondary_conf": None,
+            "tenant_conf": {},
+            "generation": generation,
+        },
+    )
+
+    # Call into attachment service to onboard the tenant
+    generation += 1
+    virtual_ps_http.tenant_location_conf(
+        tenant_id,
+        {
+            "mode": "AttachedMulti",
+            "secondary_conf": None,
+            "tenant_conf": {},
+            "generation": generation,
+        },
+    )
+
+    # As if doing a live migration, detach the original pageserver
+    origin_ps.http_client().tenant_location_conf(
+        tenant_id,
+        {
+            "mode": "Detached",
+            "secondary_conf": None,
+            "tenant_conf": {},
+            "generation": None,
+        },
+    )
+
+    # As if doing a live migration, call into the attachment service to
+    # set it to AttachedSingle: this is a no-op, but we test it because the
+    # cloud control plane may call this for symmetry with live migration to
+    # an individual pageserver
+    virtual_ps_http.tenant_location_conf(
+        tenant_id,
+        {
+            "mode": "AttachedSingle",
+            "secondary_conf": None,
+            "tenant_conf": {},
+            "generation": generation,
+        },
+    )
+
+    # We should see the tenant is now attached to the pageserver managed
+    # by the sharding service
+    origin_tenants = origin_ps.http_client().tenant_list()
+    assert len(origin_tenants) == 0
+    dest_tenants = dest_ps.http_client().tenant_list()
+    assert len(dest_tenants) == 1
+    assert TenantId(dest_tenants[0]["id"]) == tenant_id
+
+    # sharding service advances generation by 1 when it first attaches
+    assert dest_tenants[0]["generation"] == generation + 1
+
+    # The onboarded tenant should survive a restart of sharding service
+    env.attachment_service.stop()
+    env.attachment_service.start()
+
+    # The onboarded tenant should survive a restart of pageserver
+    dest_ps.stop()
+    dest_ps.start()
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -1946,3 +1946,51 @@ def test_timeline_copy(neon_env_builder: NeonEnvBuilder, insert_rows: int):
            assert orig_digest == new_digest

    # TODO: test timelines can start after copy
+
+
+def test_patch_control_file(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.num_safekeepers = 1
+    env = neon_env_builder.init_start()
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    endpoint = env.endpoints.create_start("main")
+    # initialize safekeeper
+    endpoint.safe_psql("create table t(key int, value text)")
+
+    # update control file
+    res = (
+        env.safekeepers[0]
+        .http_client()
+        .patch_control_file(
+            tenant_id,
+            timeline_id,
+            {
+                "timeline_start_lsn": "0/1",
+            },
+        )
+    )
+
+    timeline_start_lsn_before = res["old_control_file"]["timeline_start_lsn"]
+    timeline_start_lsn_after = res["new_control_file"]["timeline_start_lsn"]
+
+    log.info(f"patch_control_file response: {res}")
+    log.info(
+        f"updated control file timeline_start_lsn, before {timeline_start_lsn_before}, after {timeline_start_lsn_after}"
+    )
+
+    assert timeline_start_lsn_after == "0/1"
+    env.safekeepers[0].stop().start()
+
+    # wait/check that safekeeper is alive
+    endpoint.safe_psql("insert into t values (1, 'payload')")
+
+    # check that timeline_start_lsn is updated
+    res = (
+        env.safekeepers[0]
+        .http_client()
+        .debug_dump({"dump_control_file": "true", "timeline_id": str(timeline_id)})
+    )
+    log.info(f"dump_control_file response: {res}")
+    assert res["timelines"][0]["control_file"]["timeline_start_lsn"] == "0/1"
--- a/test_runner/regress/test_wal_acceptor_async.py
+++ b/test_runner/regress/test_wal_acceptor_async.py
@@ -515,6 +515,42 @@ def test_recovery_uncommitted(neon_env_builder: NeonEnvBuilder):
    asyncio.run(run_recovery_uncommitted(env))


+async def run_segment_init_failure(env: NeonEnv):
+    env.neon_cli.create_branch("test_segment_init_failure")
+    ep = env.endpoints.create_start("test_segment_init_failure")
+    ep.safe_psql("create table t(key int, value text)")
+    ep.safe_psql("insert into t select generate_series(1, 100), 'payload'")
+
+    sk = env.safekeepers[0]
+    sk_http = sk.http_client()
+    sk_http.configure_failpoints([("sk-write-zeroes", "return")])
+    conn = await ep.connect_async()
+    ep.safe_psql("select pg_switch_wal()")  # jump to the segment boundary
+    # next insertion should hang until failpoint is disabled.
+    asyncio.create_task(conn.execute("insert into t select generate_series(1,1), 'payload'"))
+    sleep_sec = 2
+    await asyncio.sleep(sleep_sec)
+    # also restart ep at segment boundary to make test more interesting
+    ep.stop()
+    # it must still be not finished
+    # assert not bg_query.done()
+    # Without segment rename during init (#6402) previous statement created
+    # partially initialized 16MB segment, so sk restart also triggers #6401.
+    sk.stop().start()
+    ep = env.endpoints.create_start("test_segment_init_failure")
+    ep.safe_psql("insert into t select generate_series(1,1), 'payload'")  # should be ok now
+
+
+# Test (injected) failure during WAL segment init.
+# https://github.com/neondatabase/neon/issues/6401
+# https://github.com/neondatabase/neon/issues/6402
+def test_segment_init_failure(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.num_safekeepers = 1
+    env = neon_env_builder.init_start()
+
+    asyncio.run(run_segment_init_failure(env))
+
+
@dataclass
 class RaceConditionTest:
    iteration: int
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,5 +1,5 @@
 {
    "postgres-v16": "cf302768b2890569956641e0e5ba112ae1445351",
-    "postgres-v15": "731b4d1609d6db1c953755810a41e0e67ea3db7b",
-    "postgres-v14": "11e970fe2be56804f0a786ec5fc8141ffefa4ca7"
+    "postgres-v15": "b089a8a02c9f6f4379883fddb33cf10a3aa0b14f",
+    "postgres-v14": "3de48ce3d9c1f4fac1cdc7029487f8db9e537eac"
 }
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -45,13 +45,13 @@ hmac = { version = "0.12", default-features = false, features = ["reset"] }
 hyper = { version = "0.14", features = ["full"] }
 indexmap = { version = "1", default-features = false, features = ["std"] }
 itertools = { version = "0.10" }
-libc = { version = "0.2", features = ["extra_traits"] }
+libc = { version = "0.2", features = ["extra_traits", "use_std"] }
 log = { version = "0.4", default-features = false, features = ["std"] }
 memchr = { version = "2" }
 nom = { version = "7" }
 num-bigint = { version = "0.4" }
 num-integer = { version = "0.1", features = ["i128"] }
-num-traits = { version = "0.2", features = ["i128"] }
+num-traits = { version = "0.2", features = ["i128", "libm"] }
 once_cell = { version = "1" }
 parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs", default-features = false, features = ["zstd"] }
 prost = { version = "0.11" }
@@ -94,13 +94,13 @@ getrandom = { version = "0.2", default-features = false, features = ["std"] }
 hashbrown = { version = "0.14", default-features = false, features = ["raw"] }
 indexmap = { version = "1", default-features = false, features = ["std"] }
 itertools = { version = "0.10" }
-libc = { version = "0.2", features = ["extra_traits"] }
+libc = { version = "0.2", features = ["extra_traits", "use_std"] }
 log = { version = "0.4", default-features = false, features = ["std"] }
 memchr = { version = "2" }
 nom = { version = "7" }
 num-bigint = { version = "0.4" }
 num-integer = { version = "0.1", features = ["i128"] }
-num-traits = { version = "0.2", features = ["i128"] }
+num-traits = { version = "0.2", features = ["i128", "libm"] }
 once_cell = { version = "1" }
 parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs", default-features = false, features = ["zstd"] }
 prost = { version = "0.11" }
Author	SHA1	Message	Date
Christian Schwarz	91cec9ba48	add comment	2024-02-01 17:26:39 +00:00
Christian Schwarz	d9f89f828d	only update in apply_batch_postgres	2024-02-01 16:17:21 +00:00
Christian Schwarz	74df4a7b76	fix(walredo): walredo process that causes errors is never killed Before this PR, if walredo failed, we would still update `last_redo_at`. This means that `maybe_quiesce()` would never kill that process, although clearly something is wrong. However, we don't want to kill the process immediately on each redo failure, because the redo failure could be caused by corrupted or malicious data. So, change `maybe_quiesce()` to determine inactivity based on last successful `last_redo_at`. The result is that a transient redo failure won't cause a kill. If there are only redo failures, we'll spawn & kill walredo process at frequency 1/(10*compaction_period).	2024-02-01 14:38:53 +01:00
Joonas Koivunen	799db161d3	tests: support for running on single pg version, use in one place (#6525 ) Some tests which are unit test alike do not need to run on different pg versions. Logging test is one of them which I found for unrelated reasons. Co-authored-by: Alexander Bayandin <alexander@neon.tech>	2024-01-31 17:37:25 +02:00
Arpad Müller	47380be12d	Remove version param from get_lsn_by_timestamp (#6551 ) This removes the last remnants of the version param added by #5608 , concluding the transition plan laid out in https://github.com/neondatabase/cloud/pull/7553#discussion_r1370473911 . It follows PR https://github.com/neondatabase/cloud/pull/9202, which we now assume has been deployed to all environments. Full history: * https://github.com/neondatabase/neon/pull/5608 * https://github.com/neondatabase/cloud/pull/7553 * https://github.com/neondatabase/neon/pull/6178 * https://github.com/neondatabase/cloud/pull/9202	2024-01-31 15:30:19 +01:00
Conrad Ludgate	c7b02ce8ec	proxy: use jemalloc (#6531 ) ## Summary of changes Experiment with jemalloc in proxy	2024-01-31 14:51:11 +01:00
John Spray	4010adf653	control_plane/attachment_service: complete APIs (#6394 ) Depends on: https://github.com/neondatabase/neon/pull/6468 ## Problem The sharding service will be used as a "virtual pageserver" by the control plane -- so it needs the set of pageserver APIs that the control plane uses, and to present them under identical URLs, including prefix (/v1). ## Summary of changes - Add missing APIs: - Tenant deletion - Timeline deletion - Node list (used in test now, later in tools) - `/location_config` API (for migrating tenants into the sharding service) - Rework attachment service URLs: - `/v1` prefix is used for pageserver-compatible APIs - `/upcall/v1` prefix is used for APIs that are called by the pageserver (re-attach and validate) - `/debug/v1` prefix is used for endpoints that are for testing - `/control/v1` prefix is used for new sharding service APIs that do not mimic a pageserver API, such as registering and configuring nodes. - Add test_sharding_service. The sharding service already had some collateral coverage from its use in general tests, but this is the first dedicated testing for it.	2024-01-31 12:23:06 +00:00
Konstantin Knizhnik	e10a7ee391	Prevent too frequent reconnects in case of race condition errors returned by PS (tenant not found) (#6522 ) ## Problem See https://neondb.slack.com/archives/C04DGM6SMTM/p1706531433057289 ## Summary of changes 1. Do not decrease reconnect timeout until maximal interval value (1 second) is reached 2. Compute reconnect time after connection attempt is taken to exclude connect time itself from the interval measurement. So now backend should not perform more than 4 reconnect attempts per second. But please note that backoff is performed locally in each backend and so if there are many active backends, then connection (and so error) rate may be much higher. ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>	2024-01-31 09:17:32 +02:00
Sasha Krassovsky	e8c9a51273	Allow creating subscriptions as neon_superuser (#6484 ) ## Problem We currently can't create subscriptions in PG14 and PG15 because only superusers can, and PG16 requires adding roles to pg_create_subscription. ## Summary of changes I added changes to PG14 and PG15 that allow neon_superuser to bypass the superuser requirement. For PG16, I didn't do that but added a migration that adds neon_superuser to pg_create_subscription. Also added a test to make sure it works.	2024-01-30 22:32:33 -08:00
Alexander Bayandin	3c3ee8f3e8	Compute: add compatibility patch for pgvector (#6527 ) ## Problem `pgvector` requires a patch to work well with Neon (a patch created by @hlinnaka) ## Summary of changes - Apply the patch to `pgvector`	2024-01-30 17:33:24 +00:00
Arpad Müller	6928a34f59	S3 DR: Large prefix improvements (#6515 ) ## Problem PR #6500 has removed the limiting by number of versions/deletions for time travel calls. We never get informed about how many versions there are, and thus the call would just hang without any indication of progress. ## Summary of changes We improve the pageserver's behaviour with large prefixes, i.e. those with many keys, removed or currently still available. * Add a hard limit of 100k versions/deletions. For the reasoning see https://github.com/neondatabase/cloud/issues/8233#issuecomment-1915021625 , but TLDR it will roughly support tenants of 2 TiB size, of course depending on general write activity and duration of the s3 retention window. The goal is to have a limit at all so that the process doesn't accumulate increasing numbers of versions until an eventual crash. * Lower the RAM footprint for the `VerOrDelete` datastructure. This means we now don't cache a lot of redundant metadata in RAM like the owner ID. The top level datastructure's footprint goes down from 264 bytes to 80 (but it contains strings that are not counted in there). Follow-up of #6500, part of https://github.com/neondatabase/cloud/issues/8233 --------- Co-authored-by: Joonas Koivunen <joonas@neon.tech>	2024-01-30 15:57:27 +00:00
Arseny Sher	bc684e9d3b	Make WAL segment init atomic. Since fdatasync is used for flushing WAL, changing file size is unsafe. Make segment creation atomic by using tmp file + rename to avoid using partially initialized segments. fixes https://github.com/neondatabase/neon/issues/6402	2024-01-30 18:05:22 +04:00
Arseny Sher	08532231ee	Fix find_end_of_wal busy loop. It hung if the file size was less than that of a normal segment. Normally that doesn't happen, but it might in case of crash during segment init. We're going to fix that half-initialized segment by durably renaming it after cooking, so this fix won't be needed, but better avoid busy loop anyway. fixes https://github.com/neondatabase/neon/issues/6401	2024-01-30 18:05:22 +04:00
Christian Schwarz	79137a089f	fix(#6366 ): pageserver: incorrect log level for Tenant not found during basebackup (#6400 ) Before this patch, when requesting basebackup for a not-found tenant or timeline, we'd emit an ERROR-level log entry with a huge stack trace. See #6366 "Details" section for an example With this patch, we log at INFO level and only a single line. Example: ``` 2024-01-19T14:16:11.479800Z INFO page_service_conn_main{peer_addr=127.0.0.1:43448}: query handler for 'basebackup d69a536d529a68fcf85bc070030cdf4b 035484e9c28d8d0138a492caadd03ffd 0/2204340 --gzip' entity not found: Tenant d69a536d529a68fcf85bc070030cdf4b not found 2024-01-19T14:19:35.807819Z INFO page_service_conn_main{peer_addr=127.0.0.1:48862}: query handler for 'basebackup d69a536d529a68fcf85bc070030cdf4a 035484e9c28d8d0138a492caadd03ffd 0/2204340 --gzip' entity not found: Timeline d69a536d529a68fcf85bc070030cdf4a/035484e9c28d8d0138a492caadd03ffd was not found ``` fixes https://github.com/neondatabase/neon/issues/6366 Changes ------- - Change `handle_basebackup_request` to return a `QueryError` - The new `impl From<WaitLsnError> for QueryError` is needed so the `?` at `wait_lsn()` call in `handle_basebackup_request` works again. It's duplicating `impl From<WaitLsnError> for PageStreamError`. - Remove hard-to-spot conversion of `handle_basebackup_request` return value to anyhow::Result (the place where I replaced `anyhow::Ok` with `Result::<(), QueryError>::Ok(())` - Add forgotten distinguished handling for "Tenant not found" case in `impl From<GetActiveTenantError> for QueryError` This was not at all pleasant, and I find it very hard to follow the various error conversions. It took me a while to spot the hard-to-spot `anyhow::Ok` thing above. It would have been caught by the compiler if we weren't auto-converting `anyhow::Error` into `QueryError::Other`. 
We should move away from that, in my opinion, instead forcing each `.context()` site to become `.context().map_err(QueryError::Other)`. But that's for a future PR.	2024-01-30 13:10:48 +00:00
Joonas Koivunen	e3cb715e8a	fix: capture initdb stderr, discard others (#6524 ) When using spawn + wait_with_output instead of std::process::Command::output or tokio::process::Command::output we must configure the redirection. Fixes: #6523 by discarding the stdout completely, we only care about stderr if any.	2024-01-30 14:07:58 +01:00
dependabot[bot]	c70bf9150f	build(deps): bump aiohttp from 3.9.0 to 3.9.2 (#6518 )	2024-01-30 10:46:49 +00:00
Alexander Bayandin	8e4da52069	Compute: pgvector 0.6.0 (#6517 ) Update pgvector extension from 0.5.1 to 0.6.0	2024-01-30 09:29:45 +00:00
Arthur Petukhovsky	2ff1a5cecd	Patch safekeeper control file on HTTP request (#6455 ) Closes #6397	2024-01-29 18:20:57 +00:00
Conrad Ludgate	ec8dcc2231	flatten proxy flow (#6447 ) ## Problem Taking my ideas from https://github.com/neondatabase/neon/pull/6283 and doing a bit less radical changes. smaller commits. Proxy flow was quite deeply nested, which makes adding more interesting error handling quite tricky. ## Summary of changes I recommend reviewing commit by commit. 1. move handshake logic into a separate file 2. move passthrough logic into a separate file 3. no longer accept a closure in CancelMap session logic 4. Remove connect_to_db, copy logic into handle_client 5. flatten auth_and_wake_compute in authenticate 6. record info for link auth	2024-01-29 17:38:03 +00:00
Arpad Müller	b844c6f0c7	Do pagination in list_object_versions call (#6500 ) ## Problem The tenants we want to recover might have tens of thousands of keys, or more. At that point, the AWS API returns a paginated response. ## Summary of changes Support paginated responses for `list_object_versions` requests. Follow-up of #6155, part of https://github.com/neondatabase/cloud/issues/8233	2024-01-29 17:59:26 +01:00
Alexander Bayandin	6a85a06e1b	Compute: build rdkit without freetype support (#6495 ) ## Problem `rdkit` extension is built with `RDK_BUILD_FREETYPE_SUPPORT=ON` (by default), which requires a bunch of additional dependencies, but the support of freetype fonts isn't required for Postgres. With `RDK_BUILD_FREETYPE_SUPPORT=ON`: ``` ldd /usr/local/pgsql/lib/rdkit.so linux-vdso.so.1 (0x0000ffff82ea8000) libfreetype.so.6 => /usr/lib/aarch64-linux-gnu/libfreetype.so.6 (0x0000ffff825e5000) libboost_serialization.so.1.74.0 => /usr/lib/aarch64-linux-gnu/libboost_serialization.so.1.74.0 (0x0000ffff82590000) libpthread.so.0 => /lib/aarch64-linux-gnu/libpthread.so.0 (0x0000ffff8255f000) libstdc++.so.6 => /usr/lib/aarch64-linux-gnu/libstdc++.so.6 (0x0000ffff82387000) libm.so.6 => /lib/aarch64-linux-gnu/libm.so.6 (0x0000ffff822dc000) libgcc_s.so.1 => /lib/aarch64-linux-gnu/libgcc_s.so.1 (0x0000ffff822b8000) libc.so.6 => /lib/aarch64-linux-gnu/libc.so.6 (0x0000ffff82144000) libpng16.so.16 => /usr/lib/aarch64-linux-gnu/libpng16.so.16 (0x0000ffff820fd000) libz.so.1 => /lib/aarch64-linux-gnu/libz.so.1 (0x0000ffff820d3000) libbrotlidec.so.1 => /usr/lib/aarch64-linux-gnu/libbrotlidec.so.1 (0x0000ffff820b8000) /lib/ld-linux-aarch64.so.1 (0x0000ffff82e78000) libbrotlicommon.so.1 => /usr/lib/aarch64-linux-gnu/libbrotlicommon.so.1 (0x0000ffff82087000) ``` With `RDK_BUILD_FREETYPE_SUPPORT=OFF`: ``` ldd /usr/local/pgsql/lib/rdkit.so linux-vdso.so.1 (0x0000ffffbba75000) libboost_serialization.so.1.74.0 => /usr/lib/aarch64-linux-gnu/libboost_serialization.so.1.74.0 (0x0000ffffbb259000) libpthread.so.0 => /lib/aarch64-linux-gnu/libpthread.so.0 (0x0000ffffbb228000) libstdc++.so.6 => /usr/lib/aarch64-linux-gnu/libstdc++.so.6 (0x0000ffffbb050000) libm.so.6 => /lib/aarch64-linux-gnu/libm.so.6 (0x0000ffffbafa5000) libgcc_s.so.1 => /lib/aarch64-linux-gnu/libgcc_s.so.1 (0x0000ffffbaf81000) libc.so.6 => /lib/aarch64-linux-gnu/libc.so.6 (0x0000ffffbae0d000) /lib/ld-linux-aarch64.so.1 (0x0000ffffbba45000) 
``` ## Summary of changes - Build `rdkit` with `RDK_BUILD_FREETYPE_SUPPORT=OFF` - Remove extra dependencies from the Compute image	2024-01-29 16:16:37 +00:00
John Spray	b04a6acd6c	docker: add attachment_service binary (#6506 ) ## Problem Creating sharded tenants will require an instance of the sharding service -- the initial goal is to deploy one of these in a staging region (https://github.com/neondatabase/cloud/issues/9718). It will run as a kubernetes container, similar to the storage broker, so needs to be built into the container image. ## Summary of changes Add `attachment_service` binary to container image	2024-01-29 13:31:56 +00:00
Vlad Lazar	0c7b89235c	pageserver: add range layer map search implementation (#6469 ) ## Problem There's no efficient way of querying the layer map for a range. ## Summary of changes Introduce a range query for the layer map (`LayerMap::range_search`). There are two broad steps to it: 1. Find all coverage changes for layers that intersect the queried range (see `LayerCoverage::range_overlaps`). The slightly tricky part is dealing with the start of the range. We can either be aligned with a layer or not and we need to treat these cases differently. 2. Iterate over the coverage changes and collect the result. For this we use a two pointer approach: the trailing pointer tracks the start of the current range (current location in the key space) and the forward pointer tracks the next coverage change. Plugging the range search into the read path is deferred to a future PR. ## Performance I adapted the layer map benchmarks on a local branch. Range searches are between 2x and 2.5x slower than point searches. That's in line with what I expected since we query the layer map twice. Since `Timeline::get` will proxy to `Timeline::get_vectored` we can special case the one element layer map range search at that point.	2024-01-29 09:47:12 +00:00
Joonas Koivunen	1e9a50bca8	disk_usage_eviction_task: cleanup summaries (#6490 ) This is the "partial revert" of #6384. The summaries turned out to be expensive due to naive vec usage, but also inconclusive because of the additional context required. In addition to removing summary traces, small refactoring is done.	2024-01-29 10:38:40 +02:00
Conrad Ludgate	511e730cc0	hll experiment (#6312 ) ## Problem Measuring cardinality using logs is expensive and slow. ## Summary of changes Implement a pre-aggregated HyperLogLog-based cardinality estimate. HyperLogLog estimates the cardinality of a set by using the probability that the uniform hash of a value will have a run of n 0s at the end is `1/2^n`, therefore, having observed a run of `n` 0s suggests we have measured `2^n` distinct values. By using multiple shards, we can use the harmonic mean to get a more accurate estimate. We record this into a Prometheus time-series. HyperLogLog counts can be merged by taking the `max` of each shard. We can apply a `max_over_time` in order to find the estimate of cardinality of distinct values over time	2024-01-29 07:26:20 +00:00
Konstantin Knizhnik	c1148dc9ac	Fix calculation of maximal multixact in ingest_multixact_create_record (#6502 ) ## Problem See https://neondb.slack.com/archives/C06F5UJH601/p1706373716661439 ## Summary of changes Use None instead of 0 as initial accumulator value for calculating maximal multixact XID. ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech> Co-authored-by: Heikki Linnakangas <heikki@neon.tech>	2024-01-29 07:39:16 +02:00
Anna Khanova	8253cf1931	proxy: Relax endpoint check (#6503 ) ## Problem http-over-sql allows the host to be in the format api.aws..., but that is not the case for the websocket flow. ## Summary of changes Relax endpoint check for the ws serverless connections.	2024-01-28 21:27:14 +00:00
Christian Schwarz	3a82430432	fixup(#6492 ): also switch the benchmarks that runs on merge-to-main back to std-fs (#6501 )	2024-01-28 00:15:11 +01:00