Mirror of https://github.com/neondatabase/neon.git, synced 2026-05-17 21:20:37 +00:00.

Compare commits: skyzh/leas...test-proxy (2 commits)

| Author | SHA1 | Date |
|---|---|---|
| | d3e1024770 | |
| | 2c317dad14 | |

.github/workflows/proxy-benchmark.yml (vendored), 69 changes
@@ -13,6 +13,12 @@ on:
  # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
  # - cron: '0 5 * * *' # Runs at 5 UTC once a day
  workflow_dispatch: # adds an ability to run this manually
    inputs:
      commit_hash:
        type: string
        description: 'The long neon repo commit hash for the system under test (proxy) to be tested.'
        required: false
        default: ''

defaults:
  run:
@@ -33,20 +39,14 @@ jobs:
      contents: write
      pull-requests: write
    runs-on: [ self-hosted, unit-perf-aws-arm ]
    timeout-minutes: 60 # 1h timeout
    container:
      image: ghcr.io/neondatabase/build-tools:pinned-bookworm
      credentials:
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}
      options: --init
    timeout-minutes: 60 # 1h timeout
    steps:
      - name: Checkout proxy-bench Repo
        uses: actions/checkout@v4
        with:
          repository: neondatabase/proxy-bench
          path: proxy-bench

      - name: Set up the environment which depends on $RUNNER_TEMP on nvme drive
        id: set-env
        shell: bash -euxo pipefail {0}
@@ -54,19 +54,64 @@ jobs:
          PROXY_BENCH_PATH=$(realpath ./proxy-bench)
          {
            echo "PROXY_BENCH_PATH=$PROXY_BENCH_PATH"
            echo "NEON_DIR=${RUNNER_TEMP}/neon"
            echo "NEON_PROXY_PATH=${RUNNER_TEMP}/neon/bin/proxy"
            echo "NEON_DIR=${GITHUB_WORKSPACE}/"
            echo "NEON_PROXY_PATH=${GITHUB_WORKSPACE}/bin/proxy"
            echo "TEST_OUTPUT=${PROXY_BENCH_PATH}/test_output"
            echo "DOCKER_COMPOSE_FILE=${PROXY_BENCH_PATH}/docker-compose.yml"
            echo ""
          } >> "$GITHUB_ENV"

      - name: Determine commit hash
        id: commit_hash
        shell: bash -euxo pipefail {0}
        env:
          INPUT_COMMIT_HASH: ${{ github.event.inputs.commit_hash }}
        run: |
          if [[ -z "${INPUT_COMMIT_HASH}" ]]; then
            COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')
            echo "COMMIT_HASH=$COMMIT_HASH" >> $GITHUB_ENV
            echo "commit_hash=$COMMIT_HASH" >> "$GITHUB_OUTPUT"
            echo "COMMIT_HASH_TYPE=latest" >> $GITHUB_ENV
          else
            COMMIT_HASH="${INPUT_COMMIT_HASH}"
            echo "COMMIT_HASH=$COMMIT_HASH" >> $GITHUB_ENV
            echo "commit_hash=$COMMIT_HASH" >> "$GITHUB_OUTPUT"
            echo "COMMIT_HASH_TYPE=manual" >> $GITHUB_ENV
          fi
      - name: Checkout the neon repository at given commit hash
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          ref: ${{ steps.commit_hash.outputs.commit_hash }}

      - name: Print GITHUB_WORKSPACE
        run: echo "$GITHUB_WORKSPACE"

      - name: List parent dir of workspace
        run: ls -la /__w/neon

      - name: List all env vars
        run: env

      - name: Checkout proxy-bench Repo
        uses: actions/checkout@v4
        with:
          repository: neondatabase/proxy-bench
          path: proxy-bench

      - name: Cache poetry deps
        uses: actions/cache@v4
        with:
          path: ~/.cache/pypoetry/virtualenvs
          key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }}

      - name: DEBUG List files for debugging
        working-directory: ${{ env.NEON_DIR }}
        run: |
          pwd
          ls -la

      - name: Install Python deps
        working-directory: ${{ env.NEON_DIR }}
        shell: bash -euxo pipefail {0}
        run: ./scripts/pysync

@@ -77,14 +122,14 @@ jobs:

      - name: Run proxy-bench
        working-directory: ${{ env.PROXY_BENCH_PATH }}
        run: ./run.sh --with-grafana --bare-metal
        run: ./run.sh --bare-metal

      - name: Ingest Bench Results
        if: always()
        working-directory: ${{ env.NEON_DIR }}
        run: |
          mkdir -p $TEST_OUTPUT
          python $NEON_DIR/scripts/proxy_bench_results_ingest.py --out $TEST_OUTPUT
          python3 ./scripts/proxy_bench_results_ingest.py --out $TEST_OUTPUT

      - name: Push Metrics to Proxy perf database
        shell: bash -euxo pipefail {0}
@@ -109,4 +154,4 @@ jobs:
          fi
          if [[ -d "${PROXY_BENCH_PATH}/test_output" ]]; then
            rm -rf ${PROXY_BENCH_PATH}/test_output
          fi
          fi

@@ -2648,11 +2648,7 @@ LIMIT 100",
|
||||
/// the pageserver connection strings has changed.
|
||||
///
|
||||
/// The operation will time out after a specified duration.
|
||||
pub fn wait_timeout_while_pageserver_connstr_unchanged(
|
||||
&self,
|
||||
duration: Duration,
|
||||
request_pageserver_conninfo: &PageserverConnectionInfo,
|
||||
) {
|
||||
pub fn wait_timeout_while_pageserver_connstr_unchanged(&self, duration: Duration) {
|
||||
let state = self.state.lock().unwrap();
|
||||
let old_pageserver_conninfo = state
|
||||
.pspec
|
||||
@@ -2660,10 +2656,6 @@ LIMIT 100",
|
||||
.expect("spec must be set")
|
||||
.pageserver_conninfo
|
||||
.clone();
|
||||
if request_pageserver_conninfo != &old_pageserver_conninfo {
|
||||
info!("Pageserver config changed during the previous request");
|
||||
return;
|
||||
}
|
||||
let mut unchanged = true;
|
||||
let _ = self
|
||||
.state_changed
|
||||
|
||||
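For context, the compute-side wait in this hunk is essentially a timed wait on a condition variable while a predicate keeps holding. A minimal sketch of that pattern, assuming a `state` mutex paired with a `state_changed` condvar as the snippet above suggests (the `Waiter` type and `unchanged` closure are illustrative, not repository code):

```rust
use std::sync::{Condvar, Mutex};
use std::time::Duration;

struct Waiter<T> {
    state: Mutex<T>,
    state_changed: Condvar,
}

impl<T> Waiter<T> {
    /// Block for at most `duration`, waking up early only once `unchanged`
    /// stops holding for the current state.
    fn wait_timeout_while(&self, duration: Duration, mut unchanged: impl FnMut(&T) -> bool) {
        let guard = self.state.lock().unwrap();
        // `wait_timeout_while` re-checks the predicate on every wakeup and
        // returns once it is false or the timeout elapses.
        let _ = self
            .state_changed
            .wait_timeout_while(guard, duration, |state| unchanged(state))
            .unwrap();
    }
}
```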
@@ -1,13 +1,13 @@
use std::str::FromStr;
use std::sync::Arc;
use std::thread;
use std::time::{Duration, SystemTime};

use anyhow::{Result, bail};
use compute_api::spec::{ComputeMode, PageserverConnectionInfo, PageserverProtocol};
use futures::StreamExt;
use pageserver_page_api as page_api;
use postgres::{NoTls, SimpleQueryMessage};
use tracing::{Instrument, info, warn};
use tracing::{info, warn};
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
use utils::shard::TenantShardId;
@@ -15,7 +15,7 @@ use utils::shard::TenantShardId;
use crate::compute::ComputeNode;

/// Spawns a background thread to periodically renew LSN leases for static compute.
/// Do nothing if the compute is not in static mode. MUST run this within a tokio runtime.
/// Do nothing if the compute is not in static mode.
pub fn launch_lsn_lease_bg_task_for_static(compute: &Arc<ComputeNode>) {
    let (tenant_id, timeline_id, lsn) = {
        let state = compute.state.lock().unwrap();
@@ -28,27 +28,24 @@ pub fn launch_lsn_lease_bg_task_for_static(compute: &Arc<ComputeNode>) {
    let compute = compute.clone();

    let span = tracing::info_span!("lsn_lease_bg_task", %tenant_id, %timeline_id, %lsn);
    tokio::spawn(
        async move {
            if let Err(e) = lsn_lease_bg_task(compute, tenant_id, timeline_id, lsn).await {
                // TODO: might need stronger error feedback than logging an warning.
                warn!("Exited with error: {e}");
            }
    thread::spawn(move || {
        let _entered = span.entered();
        if let Err(e) = lsn_lease_bg_task(compute, tenant_id, timeline_id, lsn) {
            // TODO: might need stronger error feedback than logging an warning.
            warn!("Exited with error: {e}");
        }
        .instrument(span),
    );
    });
}
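The two variants in this hunk differ only in how the background work is scheduled: one drives an async task on the tokio runtime and attaches the span with `.instrument(span)`, the other runs a dedicated OS thread and enters the span manually. A minimal, self-contained sketch of the thread-based variant, with `do_work` standing in for the renewal loop (illustrative only, not repository code):

```rust
use std::thread;
use tracing::{info_span, warn};

fn spawn_bg_worker() {
    let span = info_span!("lsn_lease_bg_task");
    thread::spawn(move || {
        // Entering the span makes every event emitted on this thread carry its fields.
        let _entered = span.entered();
        if let Err(e) = do_work() {
            warn!("Exited with error: {e}");
        }
    });
}

fn do_work() -> anyhow::Result<()> {
    // Placeholder for the periodic lease-renewal loop.
    Ok(())
}
```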
/// Renews lsn lease periodically so static compute are not affected by GC.
async fn lsn_lease_bg_task(
fn lsn_lease_bg_task(
    compute: Arc<ComputeNode>,
    tenant_id: TenantId,
    timeline_id: TimelineId,
    lsn: Lsn,
) -> Result<()> {
    loop {
        let (valid_until, last_conninfo) =
            acquire_lsn_lease_with_retry(&compute, tenant_id, timeline_id, lsn).await?;
        let valid_until = acquire_lsn_lease_with_retry(&compute, tenant_id, timeline_id, lsn)?;
        let valid_duration = valid_until
            .duration_since(SystemTime::now())
            .unwrap_or(Duration::ZERO);
@@ -62,22 +59,18 @@ async fn lsn_lease_bg_task(
            "Request succeeded, sleeping for {} seconds",
            sleep_duration.as_secs()
        );
        let compute = compute.clone();
        tokio::task::spawn_blocking(move || {
            compute.wait_timeout_while_pageserver_connstr_unchanged(sleep_duration, &last_conninfo);
        })
        .await?;
        compute.wait_timeout_while_pageserver_connstr_unchanged(sleep_duration);
    }
}

/// Acquires lsn lease in a retry loop. Returns the expiration time if a lease is granted.
/// Returns an error if a lease is explicitly not granted. Otherwise, we keep sending requests.
async fn acquire_lsn_lease_with_retry(
fn acquire_lsn_lease_with_retry(
    compute: &Arc<ComputeNode>,
    tenant_id: TenantId,
    timeline_id: TimelineId,
    lsn: Lsn,
) -> Result<(SystemTime, PageserverConnectionInfo)> {
) -> Result<SystemTime> {
    let mut attempts = 0usize;
    let mut retry_period_ms: f64 = 500.0;
    const MAX_RETRY_PERIOD_MS: f64 = 60.0 * 1000.0;
@@ -93,17 +86,10 @@ async fn acquire_lsn_lease_with_retry(
        )
    };

    let result = try_acquire_lsn_lease(
        conninfo.clone(),
        auth.as_deref(),
        tenant_id,
        timeline_id,
        lsn,
    )
    .await;
    let result = try_acquire_lsn_lease(conninfo, auth.as_deref(), tenant_id, timeline_id, lsn);
    match result {
        Ok(Some(res)) => {
            return Ok((res, conninfo));
            return Ok(res);
        }
        Ok(None) => {
            bail!("Permanent error: lease could not be obtained, LSN is behind the GC cutoff");
@@ -111,15 +97,9 @@ async fn acquire_lsn_lease_with_retry(
        Err(e) => {
            warn!("Failed to acquire lsn lease: {e} (attempt {attempts})");

            let compute = compute.clone();
            tokio::task::spawn_blocking(move || {
                compute.wait_timeout_while_pageserver_connstr_unchanged(
                    Duration::from_millis(retry_period_ms as u64),
                    &conninfo,
                );
            })
            .await?;

            compute.wait_timeout_while_pageserver_connstr_unchanged(Duration::from_millis(
                retry_period_ms as u64,
            ));
            retry_period_ms *= 1.5;
            retry_period_ms = retry_period_ms.min(MAX_RETRY_PERIOD_MS);
        }
@@ -129,16 +109,15 @@ async fn acquire_lsn_lease_with_retry(
    }
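The retry loop above backs off geometrically: it starts around 500 ms, grows by 1.5x per failed attempt, and is capped at one minute, per the constants shown in the hunk. A self-contained sketch of that schedule (illustrative helper, not repository code):

```rust
use std::time::Duration;

/// Yields the successive wait periods used between lease-acquisition retries:
/// 500 ms, 750 ms, 1125 ms, ... capped at 60 s.
fn backoff_schedule(attempts: usize) -> Vec<Duration> {
    const MAX_RETRY_PERIOD_MS: f64 = 60.0 * 1000.0;
    let mut retry_period_ms: f64 = 500.0;
    let mut periods = Vec::with_capacity(attempts);
    for _ in 0..attempts {
        periods.push(Duration::from_millis(retry_period_ms as u64));
        retry_period_ms = (retry_period_ms * 1.5).min(MAX_RETRY_PERIOD_MS);
    }
    periods
}

fn main() {
    // The cap is reached after roughly a dozen failed attempts.
    for (i, d) in backoff_schedule(15).into_iter().enumerate() {
        println!("attempt {i}: wait {d:?}");
    }
}
```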
/// Tries to acquire LSN leases on all Pageserver shards.
async fn try_acquire_lsn_lease(
fn try_acquire_lsn_lease(
    conninfo: PageserverConnectionInfo,
    auth: Option<&str>,
    tenant_id: TenantId,
    timeline_id: TimelineId,
    lsn: Lsn,
) -> Result<Option<SystemTime>> {
    const MAX_CONCURRENT_LEASE_CONNECTIONS: usize = 8;
    let mut leases = Vec::new();

    let mut jobs = Vec::new();
    for (shard_index, shard) in conninfo.shards.into_iter() {
        let tenant_shard_id = TenantShardId {
            tenant_id,
@@ -150,68 +129,46 @@ async fn try_acquire_lsn_lease(
        // leas on all of them? Currently, that's what we assume, but this is hypothetical
        // as of this writing, as we never pass the info for more than one pageserver per
        // shard.

        for shard in shard.pageservers {
            let shard = shard.clone();
            jobs.push(async move {
                match conninfo.prefer_protocol {
                    PageserverProtocol::Grpc => {
                        acquire_lsn_lease_grpc(
                            &shard.grpc_url.unwrap(),
                            auth,
                            tenant_shard_id,
                            timeline_id,
                            lsn,
                        )
                        .await
                    }
                    PageserverProtocol::Libpq => {
                        acquire_lsn_lease_libpq(
                            &shard.libpq_url.unwrap(),
                            auth,
                            tenant_shard_id,
                            timeline_id,
                            lsn,
                        )
                        .await
                    }
                }
            });
        for pageserver in shard.pageservers {
            let lease = match conninfo.prefer_protocol {
                PageserverProtocol::Grpc => acquire_lsn_lease_grpc(
                    &pageserver.grpc_url.unwrap(),
                    auth,
                    tenant_shard_id,
                    timeline_id,
                    lsn,
                )?,
                PageserverProtocol::Libpq => acquire_lsn_lease_libpq(
                    &pageserver.libpq_url.unwrap(),
                    auth,
                    tenant_shard_id,
                    timeline_id,
                    lsn,
                )?,
            };
            leases.push(lease);
        }
    }

    let mut stream = futures::stream::iter(jobs).buffer_unordered(MAX_CONCURRENT_LEASE_CONNECTIONS);
    let mut leases = Vec::new();
    while let Some(res) = stream.next().await {
        let lease = res?;
        leases.push(lease);
    }
    Ok(leases.into_iter().flatten().min())
    Ok(leases.into_iter().min().flatten())
}
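Both return expressions shown above reduce a `Vec<Option<SystemTime>>` to a single `Option<SystemTime>`, but they are not equivalent: `.flatten().min()` ignores shards that returned `None`, while `.min().flatten()` yields `None` as soon as any shard returned `None`, because `None` orders before every `Some`. A small, self-contained illustration of the difference:

```rust
use std::time::{Duration, SystemTime, UNIX_EPOCH};

fn main() {
    let earlier = UNIX_EPOCH + Duration::from_secs(100);
    let later = UNIX_EPOCH + Duration::from_secs(200);
    let leases: Vec<Option<SystemTime>> = vec![Some(later), None, Some(earlier)];

    // Drops the None and takes the minimum of the remaining expirations.
    let a = leases.clone().into_iter().flatten().min();
    // `None < Some(_)`, so the minimum is None, and flatten keeps it None.
    let b = leases.into_iter().min().flatten();

    assert_eq!(a, Some(earlier));
    assert_eq!(b, None);
}
```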
/// Acquires an LSN lease on a single shard, using the libpq API. The connstring must use a
/// postgresql:// scheme.
async fn acquire_lsn_lease_libpq(
fn acquire_lsn_lease_libpq(
    connstring: &str,
    auth: Option<&str>,
    tenant_shard_id: TenantShardId,
    timeline_id: TimelineId,
    lsn: Lsn,
) -> Result<Option<SystemTime>> {
    let mut config = tokio_postgres::Config::from_str(connstring)?;
    let mut config = postgres::Config::from_str(connstring)?;
    if let Some(auth) = auth {
        config.password(auth);
    }
    let (client, connection) = config.connect(NoTls).await?;

    tokio::spawn(async move {
        if let Err(e) = connection.await {
            tracing::warn!("lease lsn connection error: {}", e);
        }
    });

    let mut client = config.connect(NoTls)?;
    let cmd = format!("lease lsn {tenant_shard_id} {timeline_id} {lsn} ");
    let res = client.simple_query(&cmd).await?;
    let res = client.simple_query(&cmd)?;
    let msg = match res.first() {
        Some(msg) => msg,
        None => bail!("empty response"),
@@ -229,34 +186,35 @@ async fn acquire_lsn_lease_libpq(
        .checked_add(Duration::from_millis(u128::from_str(s).unwrap() as u64))
        .expect("Time larger than max SystemTime could handle")
    });

    Ok(valid_until)
}
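With the blocking `postgres` crate, the lease round-trip reduces to parsing a `Config`, connecting, and issuing the `lease lsn` command as a simple query. A minimal sketch under the assumption (suggested by the parsing above) that the response carries the expiration as milliseconds since the Unix epoch; the function name and response handling are illustrative, not the repository's exact code:

```rust
use std::str::FromStr;
use std::time::{Duration, SystemTime, UNIX_EPOCH};

use anyhow::{Result, bail};
use postgres::{NoTls, SimpleQueryMessage};

fn lease_lsn_blocking(connstring: &str, cmd: &str) -> Result<Option<SystemTime>> {
    let config = postgres::Config::from_str(connstring)?;
    let mut client = config.connect(NoTls)?;

    // e.g. cmd = "lease lsn <tenant_shard_id> <timeline_id> <lsn> "
    let rows = client.simple_query(cmd)?;
    let row = match rows.first() {
        Some(SimpleQueryMessage::Row(row)) => row,
        _ => bail!("unexpected response"),
    };

    // Interpret the first column as "valid until", in milliseconds since the epoch.
    Ok(row.get(0).map(|ms| {
        UNIX_EPOCH + Duration::from_millis(u64::from_str(ms).expect("not a number"))
    }))
}
```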
/// Acquires an LSN lease on a single shard, using the gRPC API. The connstring must use a
/// grpc:// scheme.
async fn acquire_lsn_lease_grpc(
fn acquire_lsn_lease_grpc(
    connstring: &str,
    auth: Option<&str>,
    tenant_shard_id: TenantShardId,
    timeline_id: TimelineId,
    lsn: Lsn,
) -> Result<Option<SystemTime>> {
    let mut client = page_api::Client::connect(
        connstring.to_string(),
        tenant_shard_id.tenant_id,
        timeline_id,
        tenant_shard_id.to_index(),
        auth.map(String::from),
        None,
    )
    .await?;
    tokio::runtime::Handle::current().block_on(async move {
        let mut client = page_api::Client::connect(
            connstring.to_string(),
            tenant_shard_id.tenant_id,
            timeline_id,
            tenant_shard_id.to_index(),
            auth.map(String::from),
            None,
        )
        .await?;

    let req = page_api::LeaseLsnRequest { lsn };
    match client.lease_lsn(req).await {
        Ok(expires) => Ok(Some(expires)),
        // Lease couldn't be acquired because the LSN has been garbage collected.
        Err(err) if err.code() == tonic::Code::FailedPrecondition => Ok(None),
        Err(err) => Err(err.into()),
    }
        let req = page_api::LeaseLsnRequest { lsn };
        match client.lease_lsn(req).await {
            Ok(expires) => Ok(Some(expires)),
            // Lease couldn't be acquired because the LSN has been garbage collected.
            Err(err) if err.code() == tonic::Code::FailedPrecondition => Ok(None),
            Err(err) => Err(err.into()),
        }
    })
}
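One side of this hunk drives the async gRPC client from a synchronous function through `tokio::runtime::Handle::current().block_on(...)`. That only works if the calling thread is inside a runtime context, so a common alternative is to capture the handle before spawning the worker thread and `block_on` through it. A minimal, illustrative sketch of that pattern (not the repository's exact wiring; `fetch_value` is a placeholder):

```rust
use std::thread;
use tokio::runtime::Handle;

fn spawn_worker_with_runtime_access() {
    // Capture the handle while still inside the runtime (e.g. from async setup code);
    // Handle::current() panics on a thread with no runtime context.
    let handle = Handle::current();
    thread::spawn(move || {
        // On this plain OS thread, drive async code through the captured handle.
        let value = handle.block_on(async { fetch_value().await });
        println!("got {value}");
    });
}

async fn fetch_value() -> u64 {
    42
}
```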
@@ -341,34 +341,6 @@ extern "C-unwind" fn log_internal(
    }
}

/* BEGIN_HADRON */
extern "C" fn reset_safekeeper_statuses_for_metrics(wp: *mut WalProposer, num_safekeepers: u32) {
    unsafe {
        let callback_data = (*(*wp).config).callback_data;
        let api = callback_data as *mut Box<dyn ApiImpl>;
        if api.is_null() {
            return;
        }
        (*api).reset_safekeeper_statuses_for_metrics(&mut (*wp), num_safekeepers);
    }
}

extern "C" fn update_safekeeper_status_for_metrics(
    wp: *mut WalProposer,
    sk_index: u32,
    status: u8,
) {
    unsafe {
        let callback_data = (*(*wp).config).callback_data;
        let api = callback_data as *mut Box<dyn ApiImpl>;
        if api.is_null() {
            return;
        }
        (*api).update_safekeeper_status_for_metrics(&mut (*wp), sk_index, status);
    }
}
/* END_HADRON */

#[derive(Debug, PartialEq)]
pub enum Level {
    Debug5,
@@ -442,10 +414,6 @@ pub(crate) fn create_api() -> walproposer_api {
        finish_sync_safekeepers: Some(finish_sync_safekeepers),
        process_safekeeper_feedback: Some(process_safekeeper_feedback),
        log_internal: Some(log_internal),
        /* BEGIN_HADRON */
        reset_safekeeper_statuses_for_metrics: Some(reset_safekeeper_statuses_for_metrics),
        update_safekeeper_status_for_metrics: Some(update_safekeeper_status_for_metrics),
        /* END_HADRON */
    }
}

@@ -483,8 +451,6 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState {
        replica_promote: false,
        min_ps_feedback: empty_feedback,
        wal_rate_limiter: empty_wal_rate_limiter,
        num_safekeepers: 0,
        safekeeper_status: [0; 32],
    }
}
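The HADRON callbacks above follow the usual FFI trampoline pattern: the C side stores an opaque `callback_data` pointer, and each `extern "C"` function casts it back to the boxed trait object before dispatching. A stripped-down sketch of the pattern, with hypothetical names (`Config`, `ApiImpl::on_event`) standing in for the real bindings:

```rust
trait ApiImpl {
    fn on_event(&self, value: u32);
}

#[repr(C)]
struct Config {
    // Opaque pointer handed to the C code; assumed to have been created from
    // `Box::into_raw(Box::new(Box::new(impl_) as Box<dyn ApiImpl>))`.
    callback_data: *mut std::ffi::c_void,
}

extern "C" fn on_event_trampoline(config: *mut Config, value: u32) {
    unsafe {
        // Recover the boxed trait object and dispatch, guarding against a null pointer.
        let api = (*config).callback_data as *mut Box<dyn ApiImpl>;
        if api.is_null() {
            return;
        }
        (*api).on_event(value);
    }
}
```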
@@ -159,21 +159,6 @@ pub trait ApiImpl {
    fn after_election(&self, _wp: &mut WalProposer) {
        todo!()
    }

    /* BEGIN_HADRON */
    fn reset_safekeeper_statuses_for_metrics(&self, _wp: &mut WalProposer, _num_safekeepers: u32) {
        // Do nothing for testing purposes.
    }

    fn update_safekeeper_status_for_metrics(
        &self,
        _wp: &mut WalProposer,
        _sk_index: u32,
        _status: u8,
    ) {
        // Do nothing for testing purposes.
    }
    /* END_HADRON */
}

#[derive(Debug)]

@@ -391,12 +391,6 @@ neon_get_perf_counters(PG_FUNCTION_ARGS)
    neon_per_backend_counters totals = {0};
    metric_t *metrics;

    /* BEGIN_HADRON */
    WalproposerShmemState *wp_shmem;
    uint32 num_safekeepers;
    uint32 num_active_safekeepers;
    /* END_HADRON */

    /* We put all the tuples into a tuplestore in one go. */
    InitMaterializedSRF(fcinfo, 0);

@@ -443,32 +437,11 @@ neon_get_perf_counters(PG_FUNCTION_ARGS)
    // Not ideal but piggyback our databricks counters into the neon perf counters view
    // so that we don't need to introduce neon--1.x+1.sql to add a new view.
    {
        // Keeping this code in its own block to work around the C90 "don't mix declarations and code" rule when we define
        // the `databricks_metrics` array in the next block. Yes, we are seriously dealing with C90 rules in 2025.

        // Read safekeeper status from wal proposer shared memory first.
        // Note that we are taking a mutex when reading from walproposer shared memory so that the total safekeeper count is
        // consistent with the active wal acceptors count. Assuming that we don't query this view too often the mutex should
        // not be a huge deal.
        wp_shmem = GetWalpropShmemState();
        SpinLockAcquire(&wp_shmem->mutex);
        num_safekeepers = wp_shmem->num_safekeepers;
        num_active_safekeepers = 0;
        for (int i = 0; i < num_safekeepers; i++) {
            if (wp_shmem->safekeeper_status[i] == 1) {
                num_active_safekeepers++;
            }
        }
        SpinLockRelease(&wp_shmem->mutex);
    }
    {
        metric_t databricks_metrics[] = {
            {"sql_index_corruption_count", false, 0, (double) pg_atomic_read_u32(&databricks_metrics_shared->index_corruption_count)},
            {"sql_data_corruption_count", false, 0, (double) pg_atomic_read_u32(&databricks_metrics_shared->data_corruption_count)},
            {"sql_internal_error_count", false, 0, (double) pg_atomic_read_u32(&databricks_metrics_shared->internal_error_count)},
            {"ps_corruption_detected", false, 0, (double) pg_atomic_read_u32(&databricks_metrics_shared->ps_corruption_detected)},
            {"num_active_safekeepers", false, 0.0, (double) num_active_safekeepers},
            {"num_configured_safekeepers", false, 0.0, (double) num_safekeepers},
            {NULL, false, 0, 0},
        };
        for (int i = 0; databricks_metrics[i].name != NULL; i++)
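The comment above spells out why both counters are read under a single spinlock acquisition: the configured count and the active count must come from the same snapshot. As an illustrative analogue in Rust (used here only because the rest of this change set is Rust-heavy; this is not repository code), the same consistency argument looks like this:

```rust
use std::sync::Mutex;

struct SafekeeperStatus {
    // true = active, false = inactive, one entry per configured safekeeper.
    status: Mutex<Vec<bool>>,
}

impl SafekeeperStatus {
    /// Returns (configured, active) from one locked snapshot, so the two
    /// numbers can never disagree about which safekeepers they describe.
    fn counts(&self) -> (usize, usize) {
        let status = self.status.lock().unwrap();
        let configured = status.len();
        let active = status.iter().filter(|&&s| s).count();
        (configured, active)
    }
}
```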
@@ -154,9 +154,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
    wp->safekeeper[wp->n_safekeepers].state = SS_OFFLINE;
    wp->safekeeper[wp->n_safekeepers].active_state = SS_ACTIVE_SEND;
    wp->safekeeper[wp->n_safekeepers].wp = wp;
    /* BEGIN_HADRON */
    wp->safekeeper[wp->n_safekeepers].index = wp->n_safekeepers;
    /* END_HADRON */

    {
        Safekeeper *sk = &wp->safekeeper[wp->n_safekeepers];
        int written = 0;
@@ -185,10 +183,6 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
    if (wp->safekeepers_generation > INVALID_GENERATION && wp->config->proto_version < 3)
        wp_log(FATAL, "enabling generations requires protocol version 3");
    wp_log(LOG, "using safekeeper protocol version %d", wp->config->proto_version);

    /* BEGIN_HADRON */
    wp->api.reset_safekeeper_statuses_for_metrics(wp, wp->n_safekeepers);
    /* END_HADRON */

    /* Fill the greeting package */
    wp->greetRequest.pam.tag = 'g';
@@ -361,10 +355,6 @@ ShutdownConnection(Safekeeper *sk)
    sk->state = SS_OFFLINE;
    sk->streamingAt = InvalidXLogRecPtr;

    /* BEGIN_HADRON */
    sk->wp->api.update_safekeeper_status_for_metrics(sk->wp, sk->index, 0);
    /* END_HADRON */

    MembershipConfigurationFree(&sk->greetResponse.mconf);
    if (sk->voteResponse.termHistory.entries)
        pfree(sk->voteResponse.termHistory.entries);
@@ -1540,10 +1530,6 @@ StartStreaming(Safekeeper *sk)
    sk->active_state = SS_ACTIVE_SEND;
    sk->streamingAt = sk->startStreamingAt;

    /* BEGIN_HADRON */
    sk->wp->api.update_safekeeper_status_for_metrics(sk->wp, sk->index, 1);
    /* END_HADRON */

    /*
     * Donors can only be in SS_ACTIVE state, so we potentially update the
     * donor when we switch one to SS_ACTIVE.

@@ -432,10 +432,6 @@ typedef struct WalproposerShmemState
    /* BEGIN_HADRON */
    /* The WAL rate limiter */
    WalRateLimiter wal_rate_limiter;
    /* Number of safekeepers in the config */
    uint32 num_safekeepers;
    /* Per-safekeeper status flags: 0=inactive, 1=active */
    uint8 safekeeper_status[MAX_SAFEKEEPERS];
    /* END_HADRON */
} WalproposerShmemState;

@@ -487,11 +483,6 @@ typedef struct Safekeeper
    char const *host;
    char const *port;

    /* BEGIN_HADRON */
    /* index of this safekeeper in the WalProposer array */
    uint32 index;
    /* END_HADRON */

    /*
     * connection string for connecting/reconnecting.
     *
@@ -740,23 +731,6 @@ typedef struct walproposer_api
     * handled by elog().
     */
    void (*log_internal) (WalProposer *wp, int level, const char *line);

    /*
     * BEGIN_HADRON
     * APIs manipulating shared memory state used for Safekeeper quorum health metrics.
     */

    /*
     * Reset the safekeeper statuses in shared memory for metric purposes.
     */
    void (*reset_safekeeper_statuses_for_metrics) (WalProposer *wp, uint32 num_safekeepers);

    /*
     * Update the safekeeper status in shared memory for metric purposes.
     */
    void (*update_safekeeper_status_for_metrics) (WalProposer *wp, uint32 sk_index, uint8 status);

    /* END_HADRON */
} walproposer_api;

/*

@@ -2261,27 +2261,6 @@ GetNeonCurrentClusterSize(void)
}
uint64 GetNeonCurrentClusterSize(void);

/* BEGIN_HADRON */
static void
walprop_pg_reset_safekeeper_statuses_for_metrics(WalProposer *wp, uint32 num_safekeepers)
{
    WalproposerShmemState* shmem = wp->api.get_shmem_state(wp);
    SpinLockAcquire(&shmem->mutex);
    shmem->num_safekeepers = num_safekeepers;
    memset(shmem->safekeeper_status, 0, sizeof(shmem->safekeeper_status));
    SpinLockRelease(&shmem->mutex);
}

static void
walprop_pg_update_safekeeper_status_for_metrics(WalProposer *wp, uint32 sk_index, uint8 status)
{
    WalproposerShmemState* shmem = wp->api.get_shmem_state(wp);
    Assert(sk_index < MAX_SAFEKEEPERS);
    SpinLockAcquire(&shmem->mutex);
    shmem->safekeeper_status[sk_index] = status;
    SpinLockRelease(&shmem->mutex);
}
/* END_HADRON */

static const walproposer_api walprop_pg = {
    .get_shmem_state = walprop_pg_get_shmem_state,
@@ -2315,6 +2294,4 @@ static const walproposer_api walprop_pg = {
    .finish_sync_safekeepers = walprop_pg_finish_sync_safekeepers,
    .process_safekeeper_feedback = walprop_pg_process_safekeeper_feedback,
    .log_internal = walprop_pg_log_internal,
    .reset_safekeeper_statuses_for_metrics = walprop_pg_reset_safekeeper_statuses_for_metrics,
    .update_safekeeper_status_for_metrics = walprop_pg_update_safekeeper_status_for_metrics,
};

@@ -233,6 +233,15 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
        log.info(f"`SELECT` query succeed after GC, {ctx=}")
        return offset

    # It's not reliable to let the compute renew the lease in this test case as we have a very tight
    # lease timeout. Therefore, the test case itself will renew the lease.
    #
    # This is a workaround to make the test case more deterministic.
    def renew_lease(env: NeonEnv, lease_lsn: Lsn):
        env.storage_controller.pageserver_api().timeline_lsn_lease(
            env.initial_tenant, env.initial_timeline, lease_lsn
        )

    # Insert some records on main branch
    with env.endpoints.create_start("main", config_lines=["shared_buffers=1MB"]) as ep_main:
        with ep_main.cursor() as cur:
@@ -245,6 +254,9 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
            XLOG_BLCKSZ = 8192
            lsn = Lsn((int(lsn) // XLOG_BLCKSZ) * XLOG_BLCKSZ)

            # We need to mock the way cplane works: it gets a lease for a branch before starting the compute.
            renew_lease(env, lsn)

            with env.endpoints.create_start(
                branch_name="main",
                endpoint_id="static",
@@ -265,6 +277,8 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
                ps.stop()
                ps.start()

                renew_lease(env, lsn)

                trigger_gc_and_select(
                    env,
                    ep_static,

@@ -2742,7 +2742,6 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde
    wait_until(unevicted)


@pytest.mark.skip(reason="Lakebase mode")
def test_timeline_disk_usage_limit(neon_env_builder: NeonEnvBuilder):
    """
    Test that the timeline disk usage circuit breaker works as expected. We test that:
@@ -2758,32 +2757,18 @@ def test_timeline_disk_usage_limit(neon_env_builder: NeonEnvBuilder):
    remote_storage_kind = s3_storage()
    neon_env_builder.enable_safekeeper_remote_storage(remote_storage_kind)

    # Set a very small disk usage limit (1KB)
    neon_env_builder.safekeeper_extra_opts = ["--max-timeline-disk-usage-bytes=1024"]

    env = neon_env_builder.init_start()

    # Create a timeline and endpoint
    env.create_branch("test_timeline_disk_usage_limit")
    endpoint = env.endpoints.create_start("test_timeline_disk_usage_limit")

    # Install the neon extension in the test database. We need it to query perf counter metrics.
    with closing(endpoint.connect()) as conn:
        with conn.cursor() as cur:
            cur.execute("CREATE EXTENSION IF NOT EXISTS neon")
            # Sanity-check safekeeper connection status in neon_perf_counters in the happy case.
            cur.execute(
                "SELECT value FROM neon_perf_counters WHERE metric = 'num_active_safekeepers'"
            )
            assert cur.fetchone() == (1,), "Expected 1 active safekeeper"
            cur.execute(
                "SELECT value FROM neon_perf_counters WHERE metric = 'num_configured_safekeepers'"
            )
            assert cur.fetchone() == (1,), "Expected 1 configured safekeeper"

    # Get the safekeeper
    sk = env.safekeepers[0]

    # Restart the safekeeper with a very small disk usage limit (1KB)
    sk.stop().start(["--max-timeline-disk-usage-bytes=1024"])

    # Inject a failpoint to stop WAL backup
    with sk.http_client() as http_cli:
        http_cli.configure_failpoints([("backup-lsn-range-pausable", "pause")])
@@ -2809,18 +2794,6 @@ def test_timeline_disk_usage_limit(neon_env_builder: NeonEnvBuilder):
        wait_until(error_logged)
        log.info("Found expected error message in compute log, resuming.")

        with closing(endpoint.connect()) as conn:
            with conn.cursor() as cur:
                # Confirm that neon_perf_counters also indicates that there are no active safekeepers
                cur.execute(
                    "SELECT value FROM neon_perf_counters WHERE metric = 'num_active_safekeepers'"
                )
                assert cur.fetchone() == (0,), "Expected 0 active safekeepers"
                cur.execute(
                    "SELECT value FROM neon_perf_counters WHERE metric = 'num_configured_safekeepers'"
                )
                assert cur.fetchone() == (1,), "Expected 1 configured safekeeper"

        # Sanity check that the hanging insert is indeed still hanging. Otherwise means the circuit breaker we
        # implemented didn't work as expected.
        time.sleep(2)