DO_NOT_MERGE: custom pg_session_jwt version without audit

It causes too much noise.
2026-05-23 16:10:37 +00:00 · 2025-04-29 14:20:40 +02:00
34 changed files with 121 additions and 332 deletions
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -824,7 +824,7 @@ jobs:
          - pg: v17
            debian: bookworm
    env:
-      VM_BUILDER_VERSION: v0.46.0
+      VM_BUILDER_VERSION: v0.42.2

    steps:
      - name: Harden the runner (Audit all outbound calls)
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2491,18 +2491,6 @@ dependencies = [
 "wasm-bindgen",
 ]

-[[package]]
-name = "getrandom"
-version = "0.3.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "73fea8450eea4bac3940448fb7ae50d91f034f941199fcd9d909a5a07aa455f0"
-dependencies = [
- "cfg-if",
- "libc",
- "r-efi",
- "wasi 0.14.2+wasi-0.2.4",
-]
-
 [[package]]
 name = "gettid"
 version = "0.1.3"
@@ -5271,12 +5259,6 @@ dependencies = [
 "proc-macro2",
 ]

-[[package]]
-name = "r-efi"
-version = "5.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5"
-
 [[package]]
 name = "rand"
 version = "0.7.3"
@@ -5301,16 +5283,6 @@ dependencies = [
 "rand_core 0.6.4",
 ]

-[[package]]
-name = "rand"
-version = "0.9.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97"
-dependencies = [
- "rand_chacha 0.9.0",
- "rand_core 0.9.3",
-]
-
 [[package]]
 name = "rand_chacha"
 version = "0.2.2"
@@ -5331,16 +5303,6 @@ dependencies = [
 "rand_core 0.6.4",
 ]

-[[package]]
-name = "rand_chacha"
-version = "0.9.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
-dependencies = [
- "ppv-lite86",
- "rand_core 0.9.3",
-]
-
 [[package]]
 name = "rand_core"
 version = "0.5.1"
@@ -5359,15 +5321,6 @@ dependencies = [
 "getrandom 0.2.11",
 ]

-[[package]]
-name = "rand_core"
-version = "0.9.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38"
-dependencies = [
- "getrandom 0.3.2",
-]
-
 [[package]]
 name = "rand_distr"
 version = "0.4.3"
@@ -7244,11 +7197,11 @@ dependencies = [
 [[package]]
 name = "tokio-epoll-uring"
 version = "0.1.0"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#781989bb540a1408b0b93daa1e9d1fa452195497"
 dependencies = [
 "futures",
 "nix 0.26.4",
 "once_cell",
- "rand 0.9.1",
 "scopeguard",
 "thiserror 1.0.69",
 "tokio",
@@ -7855,6 +7808,7 @@ dependencies = [
 [[package]]
 name = "uring-common"
 version = "0.1.0"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#781989bb540a1408b0b93daa1e9d1fa452195497"
 dependencies = [
 "bytes",
 "io-uring",
@@ -8096,15 +8050,6 @@ version = "0.11.0+wasi-snapshot-preview1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"

-[[package]]
-name = "wasi"
-version = "0.14.2+wasi-0.2.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3"
-dependencies = [
- "wit-bindgen-rt",
-]
-
 [[package]]
 name = "wasite"
 version = "0.1.0"
@@ -8462,15 +8407,6 @@ dependencies = [
 "windows-sys 0.48.0",
 ]

-[[package]]
-name = "wit-bindgen-rt"
-version = "0.39.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1"
-dependencies = [
- "bitflags 2.8.0",
-]
-
 [[package]]
 name = "workspace_hack"
 version = "0.1.0"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -187,8 +187,7 @@ thiserror = "1.0"
 tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] }
 tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] }
 tokio = { version = "1.43.1", features = ["macros"] }
-#tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
-tokio-epoll-uring = { path = "../tokio-epoll-uring/tokio-epoll-uring" }
+tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
 tokio-io-timeout = "1.2.0"
 tokio-postgres-rustls = "0.12.0"
 tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]}
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -1084,18 +1084,7 @@ RUN cargo install --locked --version 0.12.9 cargo-pgrx && \
    /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'

 USER root
-#########################################################################################
-#
-# Layer "rust extensions pgrx14"
-#
-#########################################################################################
-FROM pg-build-nonroot-with-cargo AS rust-extensions-build-pgrx14
-ARG PG_VERSION

-RUN cargo install --locked --version 0.14.1 cargo-pgrx && \
-    /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
-
-USER root
 #########################################################################################
 #
 # Layers "pg-onnx-build" and "pgrag-build"
@@ -1111,11 +1100,11 @@ RUN wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar.
    mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \
    echo "#nothing to test here" > neon-test.sh

-RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.1.1.tar.gz -O pgrag.tar.gz &&  \
-    echo "087b2ecd11ba307dc968042ef2e9e43dc04d9ba60e8306e882c407bbe1350a50 pgrag.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.0.0.tar.gz -O pgrag.tar.gz &&  \
+    echo "2cbe394c1e74fc8bcad9b52d5fbbfb783aef834ca3ce44626cfd770573700bb4 pgrag.tar.gz" | sha256sum --check && \
    mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C .

-FROM rust-extensions-build-pgrx14 AS pgrag-build
+FROM rust-extensions-build-pgrx12 AS pgrag-build
 COPY --from=pgrag-src /ext-src/ /ext-src/

 # Install build-time dependencies
@@ -1135,19 +1124,19 @@ RUN . venv/bin/activate && \

 WORKDIR /ext-src/pgrag-src
 RUN cd exts/rag && \
-    sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+    sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
    cargo pgrx install --release && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/rag.control

 RUN cd exts/rag_bge_small_en_v15 && \
-    sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+    sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
    ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \
        REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/bge_small_en_v15.onnx \
        cargo pgrx install --release --features remote_onnx && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control

 RUN cd exts/rag_jina_reranker_v1_tiny_en && \
-    sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+    sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
    ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \
        REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/jina_reranker_v1_tiny_en.onnx \
        cargo pgrx install --release --features remote_onnx && \
@@ -1316,8 +1305,8 @@ ARG PG_VERSION
 # Do not update without approve from proxy team
 # Make sure the version is reflected in proxy/src/serverless/local_conn_pool.rs
 WORKDIR /ext-src
-RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.3.0.tar.gz -O pg_session_jwt.tar.gz && \
-    echo "19be2dc0b3834d643706ed430af998bb4c2cdf24b3c45e7b102bb3a550e8660c pg_session_jwt.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/heads/alexk/tmp-disable-audit.tar.gz -O pg_session_jwt.tar.gz && \
+    echo "b868227950973df5186af54dfa03617536d21e96dddcb7b25e7ce199b4956bdb pg_session_jwt.tar.gz" | sha256sum --check && \
    mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \
    sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
    sed -i 's/version = "0.12.6"/version = "0.12.9"/g' pgrx-tests/Cargo.toml && \
--- a/compute/etc/neon_collector.jsonnet
+++ b/compute/etc/neon_collector.jsonnet
@@ -23,8 +23,6 @@
    import 'sql_exporter/getpage_prefetch_requests_total.libsonnet',
    import 'sql_exporter/getpage_prefetches_buffered.libsonnet',
    import 'sql_exporter/getpage_sync_requests_total.libsonnet',
-    import 'sql_exporter/compute_getpage_stuck_requests_total.libsonnet',
-    import 'sql_exporter/compute_getpage_max_inflight_stuck_time_ms.libsonnet',
    import 'sql_exporter/getpage_wait_seconds_bucket.libsonnet',
    import 'sql_exporter/getpage_wait_seconds_count.libsonnet',
    import 'sql_exporter/getpage_wait_seconds_sum.libsonnet',
--- a/compute/etc/sql_exporter/compute_getpage_max_inflight_stuck_time_ms.libsonnet
+++ b/compute/etc/sql_exporter/compute_getpage_max_inflight_stuck_time_ms.libsonnet
@@ -1,9 +0,0 @@
-{
-  metric_name: 'compute_getpage_max_inflight_stuck_time_ms',
-  type: 'gauge',
-  help: 'Max wait time for stuck requests among all backends. Includes only active stuck requests, terminated or disconnected ones are not accounted for',
-  values: [
-    'compute_getpage_max_inflight_stuck_time_ms',
-  ],
-  query_ref: 'neon_perf_counters',
-}
--- a/compute/etc/sql_exporter/compute_getpage_stuck_requests_total.libsonnet
+++ b/compute/etc/sql_exporter/compute_getpage_stuck_requests_total.libsonnet
@@ -1,9 +0,0 @@
-{
-  metric_name: 'compute_getpage_stuck_requests_total',
-  type: 'counter',
-  help: 'Total number of Getpage requests left without an answer for more than pageserver_response_log_timeout but less than pageserver_response_disconnect_timeout',
-  values: [
-    'compute_getpage_stuck_requests_total',
-  ],
-  query_ref: 'neon_perf_counters',
-}
--- a/compute/etc/sql_exporter/neon_perf_counters.sql
+++ b/compute/etc/sql_exporter/neon_perf_counters.sql
@@ -9,8 +9,6 @@ SELECT d.* FROM pg_catalog.jsonb_to_record((SELECT jb FROM c)) AS d(
  getpage_wait_seconds_sum numeric,
  getpage_prefetch_requests_total numeric,
  getpage_sync_requests_total numeric,
-  compute_getpage_stuck_requests_total numeric,
-  compute_getpage_max_inflight_stuck_time_ms numeric,
  getpage_prefetch_misses_total numeric,
  getpage_prefetch_discards_total numeric,
  getpage_prefetches_buffered numeric,
--- a/docs/consumption_metrics.md
+++ b/docs/consumption_metrics.md
@@ -38,6 +38,11 @@ Currently, the following metrics are collected:
 Amount of WAL produced , by a timeline, i.e. last_record_lsn
 This is an absolute, per-timeline metric.

+- `resident_size`
+
+Size of all the layer files in the tenant's directory on disk on the pageserver.
+This is an absolute, per-tenant metric.
+
 - `remote_storage_size`

 Size of the remote storage (S3) directory.
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -841,10 +841,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {

        let expected_end = match &end {
            ServerInitiated(_) | CopyDone | CopyFail | Terminate | EOF | Cancelled => true,
-            // The timeline doesn't exist and we have been requested to not auto-create it.
-            // Compute requests for timelines that haven't been created yet
-            // might reach us before the storcon request to create those timelines.
-            TimelineNoCreate => true,
            CopyStreamHandlerEnd::Disconnected(ConnectionError::Io(io_error))
                if is_expected_io_error(io_error) =>
            {
@@ -1063,8 +1059,6 @@ pub enum CopyStreamHandlerEnd {
    Terminate,
    #[error("EOF on COPY stream")]
    EOF,
-    #[error("timeline not found, and allow_timeline_creation is false")]
-    TimelineNoCreate,
    /// The connection was lost
    #[error("connection error: {0}")]
    Disconnected(#[from] ConnectionError),
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -303,8 +303,7 @@ pub struct PullTimelineRequest {

 #[derive(Debug, Serialize, Deserialize)]
 pub struct PullTimelineResponse {
-    /// Donor safekeeper host.
-    /// None if no pull happened because the timeline already exists.
-    pub safekeeper_host: Option<String>,
+    // Donor safekeeper host
+    pub safekeeper_host: String,
    // TODO: add more fields?
 }
--- a/libs/utils/src/signals.rs
+++ b/libs/utils/src/signals.rs
@@ -46,7 +46,6 @@ pub async fn signal_handler(token: tokio_util::sync::CancellationToken) {
    let mut sigint = signal(SignalKind::interrupt()).unwrap();
    let mut sigterm = signal(SignalKind::terminate()).unwrap();
    let mut sigquit = signal(SignalKind::quit()).unwrap();
-    let mut sigusr1 = signal(SignalKind::user_defined1()).unwrap();

    loop {
        let signal = tokio::select! {
@@ -56,17 +55,13 @@ pub async fn signal_handler(token: tokio_util::sync::CancellationToken) {
            }
            _ = sigint.recv() => "SIGINT",
            _ = sigterm.recv() => "SIGTERM",
-            _ = sigusr1.recv() => {
-                info!("Got signal SIGUSR1");
-                continue;
-            }
        };

        if !token.is_cancelled() {
-            info!(thread_id=?std::thread::current().id(), "Got signal {signal}. Terminating gracefully in fast shutdown mode.");
+            info!("Got signal {signal}. Terminating gracefully in fast shutdown mode.");
            token.cancel();
        } else {
-            info!(thread_id=?std::thread::current().id(), "Got signal {signal}. Already shutting down.");
+            info!("Got signal {signal}. Already shutting down.");
        }
    }
 }
--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -30,6 +30,9 @@ pub(super) enum Name {
    /// Tenant remote size
    #[serde(rename = "remote_storage_size")]
    RemoteSize,
+    /// Tenant resident size
+    #[serde(rename = "resident_size")]
+    ResidentSize,
    /// Tenant synthetic size
    #[serde(rename = "synthetic_storage_size")]
    SyntheticSize,
@@ -184,6 +187,18 @@ impl MetricsKey {
        .absolute_values()
    }

+    /// Sum of [`Timeline::resident_physical_size`] for each `Tenant`.
+    ///
+    /// [`Timeline::resident_physical_size`]: crate::tenant::Timeline::resident_physical_size
+    const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: None,
+            metric: Name::ResidentSize,
+        }
+        .absolute_values()
+    }
+
    /// [`TenantShard::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`].
    ///
    /// [`TenantShard::cached_synthetic_size`]: crate::tenant::TenantShard::cached_synthetic_size
@@ -246,7 +261,10 @@ where
    let mut tenants = std::pin::pin!(tenants);

    while let Some((tenant_id, tenant)) = tenants.next().await {
+        let mut tenant_resident_size = 0;
+
        let timelines = tenant.list_timelines();
+        let timelines_len = timelines.len();
        for timeline in timelines {
            let timeline_id = timeline.timeline_id;

@@ -269,9 +287,16 @@ where
                    continue;
                }
            }
+
+            tenant_resident_size += timeline.resident_physical_size();
        }

-        let snap = TenantSnapshot::collect(&tenant);
+        if timelines_len == 0 {
+            // Force set it to 1 byte to avoid not being reported -- all timelines are offloaded.
+            tenant_resident_size = 1;
+        }
+
+        let snap = TenantSnapshot::collect(&tenant, tenant_resident_size);
        snap.to_metrics(tenant_id, Utc::now(), cache, &mut current_metrics);
    }

@@ -280,14 +305,19 @@ where

 /// In-between abstraction to allow testing metrics without actual Tenants.
 struct TenantSnapshot {
+    resident_size: u64,
    remote_size: u64,
    synthetic_size: u64,
 }

 impl TenantSnapshot {
    /// Collect tenant status to have metrics created out of it.
-    fn collect(t: &Arc<crate::tenant::TenantShard>) -> Self {
+    ///
+    /// `resident_size` is calculated of the timelines we had access to for other metrics, so we
+    /// cannot just list timelines here.
+    fn collect(t: &Arc<crate::tenant::TenantShard>, resident_size: u64) -> Self {
        TenantSnapshot {
+            resident_size,
            remote_size: t.remote_size(),
            // Note that this metric is calculated in a separate bgworker
            // Here we only use cached value, which may lag behind the real latest one
@@ -304,6 +334,8 @@ impl TenantSnapshot {
    ) {
        let remote_size = MetricsKey::remote_storage_size(tenant_id).at(now, self.remote_size);

+        let resident_size = MetricsKey::resident_size(tenant_id).at(now, self.resident_size);
+
        let synthetic_size = {
            let factory = MetricsKey::synthetic_size(tenant_id);
            let mut synthetic_size = self.synthetic_size;
@@ -323,7 +355,11 @@ impl TenantSnapshot {
            }
        };

-        metrics.extend([Some(remote_size), synthetic_size].into_iter().flatten());
+        metrics.extend(
+            [Some(remote_size), Some(resident_size), synthetic_size]
+                .into_iter()
+                .flatten(),
+        );
    }
 }

--- a/pageserver/src/consumption_metrics/metrics/tests.rs
+++ b/pageserver/src/consumption_metrics/metrics/tests.rs
@@ -224,6 +224,7 @@ fn post_restart_synthetic_size_uses_cached_if_available() {
    let tenant_id = TenantId::generate();

    let ts = TenantSnapshot {
+        resident_size: 1000,
        remote_size: 1000,
        // not yet calculated
        synthetic_size: 0,
@@ -244,6 +245,7 @@ fn post_restart_synthetic_size_uses_cached_if_available() {
        metrics,
        &[
            MetricsKey::remote_storage_size(tenant_id).at(now, 1000),
+            MetricsKey::resident_size(tenant_id).at(now, 1000),
            MetricsKey::synthetic_size(tenant_id).at(now, 1000),
        ]
    );
@@ -254,6 +256,7 @@ fn post_restart_synthetic_size_is_not_sent_when_not_cached() {
    let tenant_id = TenantId::generate();

    let ts = TenantSnapshot {
+        resident_size: 1000,
        remote_size: 1000,
        // not yet calculated
        synthetic_size: 0,
@@ -271,6 +274,7 @@ fn post_restart_synthetic_size_is_not_sent_when_not_cached() {
        metrics,
        &[
            MetricsKey::remote_storage_size(tenant_id).at(now, 1000),
+            MetricsKey::resident_size(tenant_id).at(now, 1000),
            // no synthetic size here
        ]
    );
@@ -291,13 +295,14 @@ pub(crate) const fn metric_examples_old(
    timeline_id: TimelineId,
    now: DateTime<Utc>,
    before: DateTime<Utc>,
-) -> [RawMetric; 5] {
+) -> [RawMetric; 6] {
    [
        MetricsKey::written_size(tenant_id, timeline_id).at_old_format(now, 0),
        MetricsKey::written_size_delta(tenant_id, timeline_id)
            .from_until_old_format(before, now, 0),
        MetricsKey::timeline_logical_size(tenant_id, timeline_id).at_old_format(now, 0),
        MetricsKey::remote_storage_size(tenant_id).at_old_format(now, 0),
+        MetricsKey::resident_size(tenant_id).at_old_format(now, 0),
        MetricsKey::synthetic_size(tenant_id).at_old_format(now, 1),
    ]
 }
@@ -307,12 +312,13 @@ pub(crate) const fn metric_examples(
    timeline_id: TimelineId,
    now: DateTime<Utc>,
    before: DateTime<Utc>,
-) -> [NewRawMetric; 5] {
+) -> [NewRawMetric; 6] {
    [
        MetricsKey::written_size(tenant_id, timeline_id).at(now, 0),
        MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0),
        MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0),
        MetricsKey::remote_storage_size(tenant_id).at(now, 0),
+        MetricsKey::resident_size(tenant_id).at(now, 0),
        MetricsKey::synthetic_size(tenant_id).at(now, 1),
    ]
 }
--- a/pageserver/src/consumption_metrics/upload.rs
+++ b/pageserver/src/consumption_metrics/upload.rs
@@ -521,6 +521,10 @@ mod tests {
                line!(),
                r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"remote_storage_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000"}"#,
            ),
+            (
+                line!(),
+                r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"resident_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000"}"#,
+            ),
            (
                line!(),
                r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"synthetic_storage_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":1,"tenant_id":"00000000000000000000000000000000"}"#,
@@ -560,7 +564,7 @@ mod tests {
        assert_eq!(upgraded_samples, new_samples);
    }

-    fn metric_samples_old() -> [RawMetric; 5] {
+    fn metric_samples_old() -> [RawMetric; 6] {
        let tenant_id = TenantId::from_array([0; 16]);
        let timeline_id = TimelineId::from_array([0xff; 16]);

@@ -572,7 +576,7 @@ mod tests {
        super::super::metrics::metric_examples_old(tenant_id, timeline_id, now, before)
    }

-    fn metric_samples() -> [NewRawMetric; 5] {
+    fn metric_samples() -> [NewRawMetric; 6] {
        let tenant_id = TenantId::from_array([0; 16]);
        let timeline_id = TimelineId::from_array([0xff; 16]);

--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -497,24 +497,6 @@ pub(crate) static WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS: Lazy<IntCounter> = Lazy::n
    .expect("failed to define a metric")
 });

-pub(crate) static ONDEMAND_DOWNLOAD_BYTES: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "pageserver_ondemand_download_bytes_total",
-        "Total bytes of layers on-demand downloaded",
-        &["task_kind"]
-    )
-    .expect("failed to define a metric")
-});
-
-pub(crate) static ONDEMAND_DOWNLOAD_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "pageserver_ondemand_download_count",
-        "Total count of layers on-demand downloaded",
-        &["task_kind"]
-    )
-    .expect("failed to define a metric")
-});
-
 pub(crate) mod wait_ondemand_download_time {
    use super::*;
    const WAIT_ONDEMAND_DOWNLOAD_TIME_BUCKETS: &[f64] = &[
@@ -2198,10 +2180,6 @@ impl BasebackupQueryTimeOngoingRecording<'_> {
        // If you want to change categorize of a specific error, also change it in `log_query_error`.
        let metric = match res {
            Ok(_) => &self.parent.ok,
-            Err(QueryError::Shutdown) => {
-                // Do not observe ok/err for shutdown
-                return;
-            }
            Err(QueryError::Disconnected(ConnectionError::Io(io_error)))
                if is_expected_io_error(io_error) =>
            {
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -1035,25 +1035,10 @@ impl PageServerHandler {
                // avoid a somewhat costly Span::record() by constructing the entire span in one go.
                macro_rules! mkspan {
                    (before shard routing) => {{
-                        tracing::info_span!(
-                            parent: &parent_span,
-                            "handle_get_page_request",
-                            rel = %req.rel,
-                            blkno = %req.blkno,
-                            req_lsn = %req.hdr.request_lsn,
-                            not_modified_since_lsn = %req.hdr.not_modified_since
-                        )
+                        tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn)
                    }};
                    ($shard_id:expr) => {{
-                        tracing::info_span!(
-                            parent: &parent_span,
-                            "handle_get_page_request",
-                            rel = %req.rel,
-                            blkno = %req.blkno,
-                            req_lsn = %req.hdr.request_lsn,
-                            not_modified_since_lsn = %req.hdr.not_modified_since,
-                            shard_id = %$shard_id
-                        )
+                        tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn, shard_id = %$shard_id)
                    }};
                }

@@ -1117,7 +1102,6 @@ impl PageServerHandler {
                            shard_id = %shard.get_shard_identity().shard_slug(),
                            timeline_id = %timeline_id,
                            lsn = %req.hdr.request_lsn,
-                            not_modified_since_lsn = %req.hdr.not_modified_since,
                            request_id = %req.hdr.reqid,
                            key = %key,
                            )
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -4,7 +4,6 @@ use std::sync::{Arc, Weak};
 use std::time::{Duration, SystemTime};

 use crate::PERF_TRACE_TARGET;
-use crate::metrics::{ONDEMAND_DOWNLOAD_BYTES, ONDEMAND_DOWNLOAD_COUNT};
 use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};
 use pageserver_api::keyspace::KeySpace;
@@ -1256,14 +1255,6 @@ impl LayerInner {

                self.access_stats.record_residence_event();

-                let task_kind: &'static str = ctx.task_kind().into();
-                ONDEMAND_DOWNLOAD_BYTES
-                    .with_label_values(&[task_kind])
-                    .inc_by(self.desc.file_size);
-                ONDEMAND_DOWNLOAD_COUNT
-                    .with_label_values(&[task_kind])
-                    .inc();
-
                Ok(self.initialize_after_layer_is_on_disk(permit))
            }
            Err(e) => {
--- a/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs
+++ b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs
@@ -76,13 +76,12 @@ pub async fn thread_local_system() -> Handle {
                    )
                    .await;
                    let per_system_metrics = metrics::THREAD_LOCAL_METRICS_STORAGE.register_system(inner.thread_local_state_id);
-                    info!(thread_id=?std::thread::current().id(), thread_name=?std::thread::current().name(), "launching system");
-                    let res = System::launch_with_metrics(per_system_metrics, usize::try_from(inner.thread_local_state_id).unwrap())
+                    let res = System::launch_with_metrics(per_system_metrics)
                    // this might move us to another executor thread => loop outside the get_or_try_init, not inside it
                    .await;
                    match res {
                        Ok(system) => {
-                            info!(thread_id=?std::thread::current().id(), thread_name=?std::thread::current().name(), "successfully launched system");
+                            info!("successfully launched system");
                            metrics::THREAD_LOCAL_LAUNCH_SUCCESSES.inc();
                            Ok(system)
                        }
--- a/pgxn/neon/communicator.c
+++ b/pgxn/neon/communicator.c
@@ -687,14 +687,8 @@ prefetch_wait_for(uint64 ring_index)
 		END_PREFETCH_RECEIVE_WORK();
 		CHECK_FOR_INTERRUPTS();
 	}
-	if (result)
-	{
-		/* Check that slot is actually received (srver can be disconnected in prefetch_pump_state called from CHECK_FOR_INTERRUPTS */
-		PrefetchRequest *slot = GetPrfSlot(ring_index);
-		return slot->status == PRFS_RECEIVED;
-	}
-	return false;
-;
+
+	return result;
 }

 /*
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -877,7 +877,6 @@ retry:
 			int			port;
 			int			sndbuf;
 			int			recvbuf;
-			uint64*		max_wait;

 			get_local_port(PQsocket(pageserver_conn), &port);
 			get_socket_stats(PQsocket(pageserver_conn), &sndbuf, &recvbuf);
@@ -888,10 +887,7 @@ retry:
 						   shard->nrequests_sent, shard->nresponses_received, port, sndbuf, recvbuf,
 				           pageserver_conn->inStart, pageserver_conn->inEnd);
 			shard->receive_last_log_time = now;
-			MyNeonCounters->compute_getpage_stuck_requests_total += !shard->receive_logged;
 			shard->receive_logged = true;
-			max_wait = &MyNeonCounters->compute_getpage_max_inflight_stuck_time_ms;
-			*max_wait = Max(*max_wait, INSTR_TIME_GET_MILLISEC(since_start));
 		}

 		/*
@@ -914,7 +910,6 @@ retry:
 			get_local_port(PQsocket(pageserver_conn), &port);
 			neon_shard_log(shard_no, LOG, "no response from pageserver for %0.3f s, disconnecting (socket port=%d)",
 					   INSTR_TIME_GET_DOUBLE(since_start), port);
-			MyNeonCounters->compute_getpage_max_inflight_stuck_time_ms = 0;
 			pageserver_disconnect(shard_no);
 			return -1;
 		}
@@ -938,7 +933,6 @@ retry:
 	INSTR_TIME_SET_ZERO(shard->receive_start_time);
 	INSTR_TIME_SET_ZERO(shard->receive_last_log_time);
 	shard->receive_logged = false;
-	MyNeonCounters->compute_getpage_max_inflight_stuck_time_ms = 0;

 	return ret;
 }
--- a/pgxn/neon/neon_perf_counters.c
+++ b/pgxn/neon/neon_perf_counters.c
@@ -148,7 +148,7 @@ histogram_to_metrics(IOHistogram histogram,
 static metric_t *
 neon_perf_counters_to_metrics(neon_per_backend_counters *counters)
 {
-#define NUM_METRICS ((2 + NUM_IO_WAIT_BUCKETS) * 3 + 12)
+#define NUM_METRICS ((2 + NUM_IO_WAIT_BUCKETS) * 3 + 10)
 	metric_t   *metrics = palloc((NUM_METRICS + 1) * sizeof(metric_t));
 	int			i = 0;

@@ -166,8 +166,6 @@ neon_perf_counters_to_metrics(neon_per_backend_counters *counters)

 	APPEND_METRIC(getpage_prefetch_requests_total);
 	APPEND_METRIC(getpage_sync_requests_total);
-	APPEND_METRIC(compute_getpage_stuck_requests_total);
-	APPEND_METRIC(compute_getpage_max_inflight_stuck_time_ms);
 	APPEND_METRIC(getpage_prefetch_misses_total);
 	APPEND_METRIC(getpage_prefetch_discards_total);
 	APPEND_METRIC(pageserver_requests_sent_total);
@@ -296,11 +294,6 @@ neon_get_perf_counters(PG_FUNCTION_ARGS)
 		totals.file_cache_hits_total += counters->file_cache_hits_total;
 		histogram_merge_into(&totals.file_cache_read_hist, &counters->file_cache_read_hist);
 		histogram_merge_into(&totals.file_cache_write_hist, &counters->file_cache_write_hist);
-
-		totals.compute_getpage_stuck_requests_total += counters->compute_getpage_stuck_requests_total;
-		totals.compute_getpage_max_inflight_stuck_time_ms = Max(
-			totals.compute_getpage_max_inflight_stuck_time_ms,
-			counters->compute_getpage_max_inflight_stuck_time_ms);
 	}

 	metrics = neon_perf_counters_to_metrics(&totals);
--- a/pgxn/neon/neon_perf_counters.h
+++ b/pgxn/neon/neon_perf_counters.h
@@ -57,18 +57,6 @@ typedef struct
 	uint64		getpage_prefetch_requests_total;
 	uint64		getpage_sync_requests_total;

-	/* 
-	 * Total number of Getpage requests left without an answer for more than
-	 * pageserver_response_log_timeout but less than pageserver_response_disconnect_timeout
-	 */
-	uint64 compute_getpage_stuck_requests_total;
-
-	/* 
-	 * Longest waiting time for active stuck requests. If a stuck request gets a
-	 * response or disconnects, this metric is updated
-	 */
-	uint64 compute_getpage_max_inflight_stuck_time_ms;
-
 	/*
 	 * Total number of readahead misses; consisting of either prefetches that
 	 * don't satisfy the LSN bounds, or cases where no readahead was issued
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -836,7 +836,7 @@ TermsCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInf
 {
 	uint32		n_greeted = 0;

-	for (uint32 i = 0; i < mset->len; i++)
+	for (uint32 i = 0; i < wp->mconf.members.len; i++)
 	{
 		Safekeeper *sk = msk[i];

@@ -1106,7 +1106,7 @@ VotesCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInf
 {
 	uint32		n_votes = 0;

-	for (uint32 i = 0; i < mset->len; i++)
+	for (uint32 i = 0; i < wp->mconf.members.len; i++)
 	{
 		Safekeeper *sk = msk[i];

--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -63,7 +63,7 @@
 char	   *wal_acceptors_list = "";
 int			wal_acceptor_reconnect_timeout = 1000;
 int			wal_acceptor_connection_timeout = 10000;
-int			safekeeper_proto_version = 3;
+int			safekeeper_proto_version = 2;

 /* Set to true in the walproposer bgw. */
 static bool am_walproposer;
@@ -228,7 +228,7 @@ nwp_register_gucs(void)
 							"Version of compute <-> safekeeper protocol.",
 							"Used while migrating from 2 to 3.",
 							&safekeeper_proto_version,
-							3, 0, INT_MAX,
+							2, 0, INT_MAX,
 							PGC_POSTMASTER,
 							0,
 							NULL, NULL, NULL);
--- a/proxy/README.md
+++ b/proxy/README.md
@@ -32,7 +32,7 @@ To play with it locally one may start proxy over a local postgres installation
 (see end of this page on how to generate certs with openssl):

 ```
-LOGFMT=text ./target/debug/proxy -c server.crt -k server.key --auth-backend=postgres --auth-endpoint=postgres://stas@127.0.0.1:5432/stas --wss 0.0.0.0:4444
+./target/debug/proxy -c server.crt -k server.key --auth-backend=postgres --auth-endpoint=postgres://stas@127.0.0.1:5432/stas --wss 0.0.0.0:4444
 ```

 If both postgres and proxy are running you may send a SQL query:
@@ -130,7 +130,7 @@ openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key

 Then we need to build proxy with 'testing' feature and run, e.g.:
 ```sh
-RUST_LOG=proxy LOGFMT=text cargo run -p proxy --bin proxy --features testing -- --auth-backend postgres --auth-endpoint 'postgresql://postgres:proxy-postgres@127.0.0.1:5432/postgres' -c server.crt -k server.key
+RUST_LOG=proxy cargo run -p proxy --bin proxy --features testing -- --auth-backend postgres --auth-endpoint 'postgresql://postgres:proxy-postgres@127.0.0.1:5432/postgres' -c server.crt -k server.key
 ```

 Now from client you can start a new session:
--- a/proxy/src/logging.rs
+++ b/proxy/src/logging.rs
@@ -132,10 +132,11 @@ impl Drop for LoggingGuard {
    }
 }

+// TODO: make JSON the default
 #[derive(Copy, Clone, PartialEq, Eq, Default, Debug)]
 enum LogFormat {
-    Text,
    #[default]
+    Text = 1,
    Json,
 }

--- a/safekeeper/src/pull_timeline.rs
+++ b/safekeeper/src/pull_timeline.rs
@@ -401,10 +401,7 @@ pub async fn handle_request(
        request.timeline_id,
    ));
    if existing_tli.is_ok() {
-        info!("Timeline {} already exists", request.timeline_id);
-        return Ok(PullTimelineResponse {
-            safekeeper_host: None,
-        });
+        bail!("Timeline {} already exists", request.timeline_id);
    }

    let mut http_client = reqwest::Client::builder();
@@ -428,25 +425,8 @@ pub async fn handle_request(

    let mut statuses = Vec::new();
    for (i, response) in responses.into_iter().enumerate() {
-        match response {
-            Ok(status) => {
-                statuses.push((status, i));
-            }
-            Err(e) => {
-                info!("error fetching status from {}: {e}", http_hosts[i]);
-            }
-        }
-    }
-
-    // Allow missing responses from up to one safekeeper (say due to downtime)
-    // e.g. if we created a timeline on PS A and B, with C being offline. Then B goes
-    // offline and C comes online. Then we want a pull on C with A and B as hosts to work.
-    let min_required_successful = (http_hosts.len() - 1).max(1);
-    if statuses.len() < min_required_successful {
-        bail!(
-            "only got {} successful status responses. required: {min_required_successful}",
-            statuses.len()
-        )
+        let status = response.context(format!("fetching status from {}", http_hosts[i]))?;
+        statuses.push((status, i));
    }

    // Find the most advanced safekeeper
@@ -556,6 +536,6 @@ async fn pull_timeline(
        .await?;

    Ok(PullTimelineResponse {
-        safekeeper_host: Some(host),
+        safekeeper_host: host,
    })
 }
--- a/safekeeper/src/receive_wal.rs
+++ b/safekeeper/src/receive_wal.rs
@@ -32,7 +32,7 @@ use crate::metrics::{
    WAL_RECEIVERS,
 };
 use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage};
-use crate::timeline::{TimelineError, WalResidentTimeline};
+use crate::timeline::WalResidentTimeline;

 const DEFAULT_FEEDBACK_CAPACITY: usize = 8;

@@ -357,14 +357,9 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'_, IO> {
                        .await
                        .context("create timeline")?
                } else {
-                    let timeline_res = self.global_timelines.get(self.ttid);
-                    match timeline_res {
-                        Ok(tl) => tl,
-                        Err(TimelineError::NotFound(_)) => {
-                            return Err(CopyStreamHandlerEnd::TimelineNoCreate);
-                        }
-                        other => other.context("get_timeline")?,
-                    }
+                    self.global_timelines
+                        .get(self.ttid)
+                        .context("get timeline")?
                };
                tli.wal_residence_guard().await?
            }
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -19,8 +19,7 @@ use storage_controller::service::chaos_injector::ChaosInjector;
 use storage_controller::service::{
    Config, HEARTBEAT_INTERVAL_DEFAULT, LONG_RECONCILE_THRESHOLD_DEFAULT,
    MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT,
-    PRIORITY_RECONCILER_CONCURRENCY_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT,
-    SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT, Service,
+    PRIORITY_RECONCILER_CONCURRENCY_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, Service,
 };
 use tokio::signal::unix::SignalKind;
 use tokio_util::sync::CancellationToken;
@@ -133,10 +132,6 @@ struct Cli {
    #[arg(long)]
    priority_reconciler_concurrency: Option<usize>,

-    /// Maximum number of safekeeper reconciliations that may run in parallel (per safekeeper)
-    #[arg(long)]
-    safekeeper_reconciler_concurrency: Option<usize>,
-
    /// Tenant API rate limit, as requests per second per tenant.
    #[arg(long, default_value = "10")]
    tenant_rate_limit: NonZeroU32,
@@ -408,9 +403,6 @@ async fn async_main() -> anyhow::Result<()> {
        priority_reconciler_concurrency: args
            .priority_reconciler_concurrency
            .unwrap_or(PRIORITY_RECONCILER_CONCURRENCY_DEFAULT),
-        safekeeper_reconciler_concurrency: args
-            .safekeeper_reconciler_concurrency
-            .unwrap_or(SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT),
        tenant_rate_limit: args.tenant_rate_limit,
        split_threshold: args.split_threshold,
        max_split_shards: args.max_split_shards,
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -194,7 +194,6 @@ pub(crate) enum LeadershipStatus {

 pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128;
 pub const PRIORITY_RECONCILER_CONCURRENCY_DEFAULT: usize = 256;
-pub const SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT: usize = 32;

 // Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately.
 // This channel is finite-size to avoid using excessive memory if we get into a state where reconciles are finishing more slowly
@@ -383,9 +382,6 @@ pub struct Config {
    /// How many high-priority Reconcilers may be spawned concurrently
    pub priority_reconciler_concurrency: usize,

-    /// How many safekeeper reconciles may happen concurrently (per safekeeper)
-    pub safekeeper_reconciler_concurrency: usize,
-
    /// How many API requests per second to allow per tenant, across all
    /// tenant-scoped API endpoints. Further API requests queue until ready.
    pub tenant_rate_limit: NonZeroU32,
@@ -3663,7 +3659,7 @@ impl Service {
                locations: ShardMutationLocations,
                http_client: reqwest::Client,
                jwt: Option<String>,
-                mut create_req: TimelineCreateRequest,
+                create_req: TimelineCreateRequest,
            ) -> Result<TimelineInfo, ApiError> {
                let latest = locations.latest.node;

@@ -3682,15 +3678,6 @@ impl Service {
                    .await
                    .map_err(|e| passthrough_api_error(&latest, e))?;

-                // If we are going to create the timeline on some stale locations for shard 0, then ask them to re-use
-                // the initdb generated by the latest location, rather than generating their own.  This avoids racing uploads
-                // of initdb to S3 which might not be binary-identical if different pageservers have different postgres binaries.
-                if tenant_shard_id.is_shard_zero() {
-                    if let models::TimelineCreateRequestMode::Bootstrap { existing_initdb_timeline_id, .. } = &mut create_req.mode {
-                        *existing_initdb_timeline_id = Some(create_req.new_timeline_id);
-                    }
-                }
-
                // We propagate timeline creations to all attached locations such that a compute
                // for the new timeline is able to start regardless of the current state of the
                // tenant shard reconciliation.
--- a/storage_controller/src/service/safekeeper_reconciler.rs
+++ b/storage_controller/src/service/safekeeper_reconciler.rs
@@ -3,10 +3,7 @@ use std::{collections::HashMap, str::FromStr, sync::Arc, time::Duration};
 use clashmap::{ClashMap, Entry};
 use safekeeper_api::models::PullTimelineRequest;
 use safekeeper_client::mgmt_api;
-use tokio::sync::{
-    Semaphore,
-    mpsc::{self, UnboundedReceiver, UnboundedSender},
-};
+use tokio::sync::mpsc::{self, UnboundedReceiver, UnboundedSender};
 use tokio_util::sync::CancellationToken;
 use tracing::Instrument;
 use utils::{
@@ -209,27 +206,18 @@ impl ReconcilerHandle {
 }

 pub(crate) struct SafekeeperReconciler {
-    inner: SafekeeperReconcilerInner,
-    concurrency_limiter: Arc<Semaphore>,
+    service: Arc<Service>,
    rx: UnboundedReceiver<(ScheduleRequest, CancellationToken)>,
    cancel: CancellationToken,
 }

-/// Thin wrapper over `Service` to not clutter its inherent functions
-#[derive(Clone)]
-struct SafekeeperReconcilerInner {
-    service: Arc<Service>,
-}
-
 impl SafekeeperReconciler {
    fn spawn(cancel: CancellationToken, service: Arc<Service>) -> ReconcilerHandle {
        // We hold the ServiceInner lock so we don't want to make sending to the reconciler channel to be blocking.
        let (tx, rx) = mpsc::unbounded_channel();
-        let concurrency = service.config.safekeeper_reconciler_concurrency;
        let mut reconciler = SafekeeperReconciler {
-            inner: SafekeeperReconcilerInner { service },
+            service,
            rx,
-            concurrency_limiter: Arc::new(Semaphore::new(concurrency)),
            cancel: cancel.clone(),
        };
        let handle = ReconcilerHandle {
@@ -242,44 +230,31 @@ impl SafekeeperReconciler {
    }
    async fn run(&mut self) {
        loop {
+            // TODO add parallelism with semaphore here
            let req = tokio::select! {
                req = self.rx.recv() => req,
                _ = self.cancel.cancelled() => break,
            };
            let Some((req, req_cancel)) = req else { break };
-
-            let permit_res = tokio::select! {
-                req = self.concurrency_limiter.clone().acquire_owned() => req,
-                _ = self.cancel.cancelled() => break,
-            };
-            let Ok(_permit) = permit_res else { return };
-
-            let inner = self.inner.clone();
            if req_cancel.is_cancelled() {
                continue;
            }

-            tokio::task::spawn(async move {
-                let kind = req.kind;
-                let tenant_id = req.tenant_id;
-                let timeline_id = req.timeline_id;
-                let node_id = req.safekeeper.skp.id;
-                inner
-                    .reconcile_one(req, req_cancel)
-                    .instrument(tracing::info_span!(
-                        "reconcile_one",
-                        ?kind,
-                        %tenant_id,
-                        ?timeline_id,
-                        %node_id,
-                    ))
-                    .await;
-            });
+            let kind = req.kind;
+            let tenant_id = req.tenant_id;
+            let timeline_id = req.timeline_id;
+            let node_id = req.safekeeper.skp.id;
+            self.reconcile_one(req, req_cancel)
+                .instrument(tracing::info_span!(
+                    "reconcile_one",
+                    ?kind,
+                    %tenant_id,
+                    ?timeline_id,
+                    %node_id,
+                ))
+                .await;
        }
    }
-}
-
-impl SafekeeperReconcilerInner {
    async fn reconcile_one(&self, req: ScheduleRequest, req_cancel: CancellationToken) {
        let req_host = req.safekeeper.skp.host.clone();
        match req.kind {
@@ -306,11 +281,10 @@ impl SafekeeperReconcilerInner {
                    req,
                    async |client| client.pull_timeline(&pull_req).await,
                    |resp| {
-                        if let Some(host) = resp.safekeeper_host {
-                            tracing::info!("pulled timeline from {host} onto {req_host}");
-                        } else {
-                            tracing::info!("timeline already present on safekeeper on {req_host}");
-                        }
+                        tracing::info!(
+                            "pulled timeline from {} onto {req_host}",
+                            resp.safekeeper_host,
+                        );
                    },
                    req_cancel,
                )
--- a/test_runner/regress/test_pageserver_catchup.py
+++ b/test_runner/regress/test_pageserver_catchup.py
@@ -2,8 +2,6 @@ from __future__ import annotations

 from typing import TYPE_CHECKING

-import pytest
-
 if TYPE_CHECKING:
    from fixtures.neon_fixtures import NeonEnvBuilder

@@ -12,7 +10,6 @@ if TYPE_CHECKING:
 # while initial compute node is down and pageserver is lagging behind safekeepers.
 # Ensure that basebackup after restart of all components is correct
 # and new compute node contains all data.
-@pytest.mark.repeat(1000)
 def test_pageserver_catchup_while_compute_down(neon_env_builder: NeonEnvBuilder):
    neon_env_builder.num_safekeepers = 3
    env = neon_env_builder.init_start()
--- a/test_runner/regress/test_pageserver_metric_collection.py
+++ b/test_runner/regress/test_pageserver_metric_collection.py
@@ -506,6 +506,7 @@ class SyntheticSizeVerifier:

 PER_METRIC_VERIFIERS = {
    "remote_storage_size": CannotVerifyAnything,
+    "resident_size": CannotVerifyAnything,
    "written_size": WrittenDataVerifier,
    "written_data_bytes_delta": WrittenDataDeltaVerifier,
    "timeline_logical_size": CannotVerifyAnything,