Merge remote-tracking branch 'origin/main' into vlad/hadron-jwt

2026-05-25 09:00:37 +00:00 · 2025-07-31 11:29:07 +01:00
parent 94dc55f405 8fe7596120
commit a1cc1f33dc
139 changed files with 3324 additions and 1217 deletions
--- a/test_runner/fixtures/endpoint/http.py
+++ b/test_runner/fixtures/endpoint/http.py
@@ -78,20 +78,26 @@ class EndpointHttpClient(requests.Session):
        json: dict[str, str] = res.json()
        return json

-    def prewarm_lfc(self, from_endpoint_id: str | None = None):
+    def prewarm_lfc(self, from_endpoint_id: str | None = None) -> dict[str, str]:
        """
        Prewarm LFC cache from given endpoint and wait till it finishes or errors
        """
        params = {"from_endpoint": from_endpoint_id} if from_endpoint_id else dict()
        self.post(self.prewarm_url, params=params).raise_for_status()
-        self.prewarm_lfc_wait()
+        return self.prewarm_lfc_wait()

-    def prewarm_lfc_wait(self):
+    def cancel_prewarm_lfc(self):
+        """
+        Cancel LFC prewarm if any is ongoing
+        """
+        self.delete(self.prewarm_url).raise_for_status()
+
+    def prewarm_lfc_wait(self) -> dict[str, str]:
        """
        Wait till LFC prewarm returns with error or success.
        If prewarm was not requested before calling this function, it will error
        """
-        statuses = "failed", "completed", "skipped"
+        statuses = "failed", "completed", "skipped", "cancelled"

        def prewarmed():
            json = self.prewarm_lfc_status()
@@ -101,6 +107,7 @@ class EndpointHttpClient(requests.Session):
        wait_until(prewarmed, timeout=60)
        res = self.prewarm_lfc_status()
        assert res["status"] != "failed", res
+        return res

    def offload_lfc_status(self) -> dict[str, str]:
        res = self.get(self.offload_url)
@@ -108,29 +115,31 @@ class EndpointHttpClient(requests.Session):
        json: dict[str, str] = res.json()
        return json

-    def offload_lfc(self):
+    def offload_lfc(self) -> dict[str, str]:
        """
        Offload LFC cache to endpoint storage and wait till offload finishes or errors
        """
        self.post(self.offload_url).raise_for_status()
-        self.offload_lfc_wait()
+        return self.offload_lfc_wait()

-    def offload_lfc_wait(self):
+    def offload_lfc_wait(self) -> dict[str, str]:
        """
        Wait till LFC offload returns with error or success.
        If offload was not requested before calling this function, it will error
        """
+        statuses = "failed", "completed", "skipped"

        def offloaded():
            json = self.offload_lfc_status()
            status, err = json["status"], json.get("error")
-            assert status in ["failed", "completed"], f"{status}, {err=}"
+            assert status in statuses, f"{status}, {err=}"

        wait_until(offloaded, timeout=60)
        res = self.offload_lfc_status()
        assert res["status"] != "failed", res
+        return res

-    def promote(self, promote_spec: dict[str, Any], disconnect: bool = False):
+    def promote(self, promote_spec: dict[str, Any], disconnect: bool = False) -> dict[str, str]:
        url = f"http://localhost:{self.external_port}/promote"
        if disconnect:
            try:  # send first request to start promote and disconnect
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -266,7 +266,6 @@ class PgProtocol:
        # pooler does not support statement_timeout
        # Check if the hostname contains the string 'pooler'
        hostname = result.get("host", "")
-        log.info(f"Hostname: {hostname}")
        options = result.get("options", "")
        if "statement_timeout" not in options and "pooler" not in hostname:
            options = f"-cstatement_timeout=120s {options}"
@@ -2370,6 +2369,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
        timeline_id: TimelineId,
        new_sk_set: list[int],
    ):
+        log.info(f"migrate_safekeepers({tenant_id}, {timeline_id}, {new_sk_set})")
        response = self.request(
            "POST",
            f"{self.api}/v1/tenant/{tenant_id}/timeline/{timeline_id}/safekeeper_migrate",
@@ -5336,16 +5336,32 @@ class EndpointFactory:
        )

    def stop_all(self, fail_on_error=True) -> Self:
-        exception = None
-        for ep in self.endpoints:
+        """
+        Stop all the endpoints in parallel.
+        """
+
+        # Note: raising an exception from a task in a task group cancels
+        # all the other tasks. We don't want that, hence the 'stop_one'
+        # function catches exceptions and puts them on the 'exceptions'
+        # list for later processing.
+        exceptions = []
+
+        async def stop_one(ep):
            try:
-                ep.stop()
+                await asyncio.to_thread(ep.stop)
            except Exception as e:
                log.error(f"Failed to stop endpoint {ep.endpoint_id}: {e}")
-                exception = e
+                exceptions.append(e)

-        if fail_on_error and exception is not None:
-            raise exception
+        async def async_stop_all():
+            async with asyncio.TaskGroup() as tg:
+                for ep in self.endpoints:
+                    tg.create_task(stop_one(ep))
+
+        asyncio.run(async_stop_all())
+
+        if fail_on_error and exceptions:
+            raise ExceptionGroup("stopping an endpoint failed", exceptions)

        return self

--- a/test_runner/regress/test_basebackup.py
+++ b/test_runner/regress/test_basebackup.py
@@ -2,13 +2,15 @@ from __future__ import annotations

 from typing import TYPE_CHECKING

+import pytest
 from fixtures.utils import wait_until

 if TYPE_CHECKING:
    from fixtures.neon_fixtures import NeonEnvBuilder


-def test_basebackup_cache(neon_env_builder: NeonEnvBuilder):
+@pytest.mark.parametrize("grpc", [True, False])
+def test_basebackup_cache(neon_env_builder: NeonEnvBuilder, grpc: bool):
    """
    Simple test for basebackup cache.
    1. Check that we always hit the cache after compute restart.
@@ -22,7 +24,7 @@ def test_basebackup_cache(neon_env_builder: NeonEnvBuilder):
    """

    env = neon_env_builder.init_start()
-    ep = env.endpoints.create("main")
+    ep = env.endpoints.create("main", grpc=grpc)
    ps = env.pageserver
    ps_http = ps.http_client()

--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -863,6 +863,89 @@ def test_pageserver_compaction_circuit_breaker(neon_env_builder: NeonEnvBuilder)
    assert not env.pageserver.log_contains(".*Circuit breaker failure ended.*")


+def test_ps_corruption_detection_feedback(neon_env_builder: NeonEnvBuilder):
+    """
+    Test that when the pageserver detects corruption during image layer creation,
+    it sends corruption feedback to the safekeeper which gets recorded in its
+    safekeeper_ps_corruption_detected metric.
+    """
+    # Configure tenant with aggressive compaction settings to easily trigger compaction
+    TENANT_CONF = {
+        # Small checkpoint distance to create many layers
+        "checkpoint_distance": 1024 * 128,
+        # Compact small layers
+        "compaction_target_size": 1024 * 128,
+        # Create image layers eagerly
+        "image_creation_threshold": 1,
+        "image_layer_creation_check_threshold": 0,
+        # Force frequent compaction
+        "compaction_period": "1s",
+    }
+
+    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
+    # We are simulating compaction failures so we should allow these error messages.
+    env.pageserver.allowed_errors.append(".*Compaction failed.*")
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    pageserver_http = env.pageserver.http_client()
+    workload = Workload(
+        env, tenant_id, timeline_id, endpoint_opts={"config_lines": ["neon.lakebase_mode=true"]}
+    )
+    workload.init()
+
+    # Enable the failpoint that will cause image layer creation to fail due to a (simulated) detected
+    # corruption.
+    pageserver_http.configure_failpoints(("create-image-layer-fail-simulated-corruption", "return"))
+
+    # Write some data to trigger compaction and image layer creation
+    log.info("Writing data to trigger compaction...")
+    workload.write_rows(1024 * 64, upload=False)
+    workload.write_rows(1024 * 64, upload=False)
+
+    # Returns True if the corruption signal from PS is propagated to the SK according to the "safekeeper_ps_corruption_detected" metric.
+    # Raises an exception otherwise.
+    def check_corruption_signal_propagated_to_sk():
+        # Get metrics from all safekeepers
+        for sk in env.safekeepers:
+            sk_metrics = sk.http_client().get_metrics()
+            # Look for our corruption detected metric with the right tenant and timeline
+            corruption_metrics = sk_metrics.query_all("safekeeper_ps_corruption_detected")
+
+            for metric in corruption_metrics:
+                # Check if there's a metric for our tenant and timeline that has value 1
+                if (
+                    metric.labels.get("tenant_id") == str(tenant_id)
+                    and metric.labels.get("timeline_id") == str(timeline_id)
+                    and metric.value == 1
+                ):
+                    log.info(f"Corruption detected by safekeeper {sk.id}: {metric}")
+                    return True
+        raise Exception("Corruption detection feedback not found in any safekeeper metrics")
+
+    # Returns True if the corruption signal from PS is propagated to the PG according to the "ps_corruption_detected" metric
+    # in "neon_perf_counters".
+    # Raises an exception otherwise.
+    def check_corruption_signal_propagated_to_pg():
+        endpoint = workload.endpoint()
+        results = endpoint.safe_psql("CREATE EXTENSION IF NOT EXISTS neon")
+        results = endpoint.safe_psql(
+            "SELECT value FROM neon_perf_counters WHERE metric = 'ps_corruption_detected'"
+        )
+        log.info("Query corruption detection metric, results: %s", results)
+        if results[0][0] == 1:
+            log.info("Corruption detection signal is raised on Postgres")
+            return True
+        raise Exception("Corruption detection signal is not raise on Postgres")
+
+    # Confirm that the corruption signal propagates to both the safekeeper and Postgres
+    wait_until(check_corruption_signal_propagated_to_sk, timeout=10, interval=0.1)
+    wait_until(check_corruption_signal_propagated_to_pg, timeout=10, interval=0.1)
+
+    # Cleanup the failpoint
+    pageserver_http.configure_failpoints(("create-image-layer-fail-simulated-corruption", "off"))
+
+
@pytest.mark.parametrize("enabled", [True, False])
 def test_image_layer_compression(neon_env_builder: NeonEnvBuilder, enabled: bool):
    tenant_conf = {
--- a/test_runner/regress/test_lfc_prewarm.py
+++ b/test_runner/regress/test_lfc_prewarm.py
@@ -1,6 +1,6 @@
 import random
-import threading
 from enum import StrEnum
+from threading import Thread
 from time import sleep
 from typing import Any

@@ -47,19 +47,23 @@ def offload_lfc(method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor)
        # With autoprewarm, we need to be sure LFC was offloaded after all writes
        # finish, so we sleep. Otherwise we'll have less prewarmed pages than we want
        sleep(AUTOOFFLOAD_INTERVAL_SECS)
-        client.offload_lfc_wait()
-        return
+        offload_res = client.offload_lfc_wait()
+        log.info(offload_res)
+        return offload_res

    if method == PrewarmMethod.COMPUTE_CTL:
        status = client.prewarm_lfc_status()
        assert status["status"] == "not_prewarmed"
        assert "error" not in status
-        client.offload_lfc()
+        offload_res = client.offload_lfc()
+        log.info(offload_res)
        assert client.prewarm_lfc_status()["status"] == "not_prewarmed"
+
        parsed = prom_parse(client)
        desired = {OFFLOAD_LABEL: 1, PREWARM_LABEL: 0, OFFLOAD_ERR_LABEL: 0, PREWARM_ERR_LABEL: 0}
        assert parsed == desired, f"{parsed=} != {desired=}"
-        return
+
+        return offload_res

    raise AssertionError(f"{method} not in PrewarmMethod")

@@ -68,21 +72,30 @@ def prewarm_endpoint(
    method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor, lfc_state: str | None
 ):
    if method == PrewarmMethod.AUTOPREWARM:
-        client.prewarm_lfc_wait()
+        prewarm_res = client.prewarm_lfc_wait()
+        log.info(prewarm_res)
    elif method == PrewarmMethod.COMPUTE_CTL:
-        client.prewarm_lfc()
+        prewarm_res = client.prewarm_lfc()
+        log.info(prewarm_res)
+        return prewarm_res
    elif method == PrewarmMethod.POSTGRES:
        cur.execute("select neon.prewarm_local_cache(%s)", (lfc_state,))


-def check_prewarmed(
+def check_prewarmed_contains(
    method: PrewarmMethod, client: EndpointHttpClient, desired_status: dict[str, str | int]
 ):
    if method == PrewarmMethod.AUTOPREWARM:
-        assert client.prewarm_lfc_status() == desired_status
+        prewarm_status = client.prewarm_lfc_status()
+        for k in desired_status:
+            assert desired_status[k] == prewarm_status[k]
+
        assert prom_parse(client)[PREWARM_LABEL] == 1
    elif method == PrewarmMethod.COMPUTE_CTL:
-        assert client.prewarm_lfc_status() == desired_status
+        prewarm_status = client.prewarm_lfc_status()
+        for k in desired_status:
+            assert desired_status[k] == prewarm_status[k]
+
        desired = {OFFLOAD_LABEL: 0, PREWARM_LABEL: 1, PREWARM_ERR_LABEL: 0, OFFLOAD_ERR_LABEL: 0}
        assert prom_parse(client) == desired

@@ -149,9 +162,6 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod):
    log.info(f"Used LFC size: {lfc_used_pages}")
    pg_cur.execute("select * from neon.get_prewarm_info()")
    total, prewarmed, skipped, _ = pg_cur.fetchall()[0]
-    log.info(f"Prewarm info: {total=} {prewarmed=} {skipped=}")
-    progress = (prewarmed + skipped) * 100 // total
-    log.info(f"Prewarm progress: {progress}%")
    assert lfc_used_pages > 10000
    assert total > 0
    assert prewarmed > 0
@@ -161,7 +171,54 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod):
    assert lfc_cur.fetchall()[0][0] == n_records * (n_records + 1) / 2

    desired = {"status": "completed", "total": total, "prewarmed": prewarmed, "skipped": skipped}
-    check_prewarmed(method, client, desired)
+    check_prewarmed_contains(method, client, desired)
+
+
+@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
+def test_lfc_prewarm_cancel(neon_simple_env: NeonEnv):
+    """
+    Test we can cancel LFC prewarm and prewarm successfully after
+    """
+    env = neon_simple_env
+    n_records = 1000000
+    cfg = [
+        "autovacuum = off",
+        "shared_buffers=1MB",
+        "neon.max_file_cache_size=1GB",
+        "neon.file_cache_size_limit=1GB",
+        "neon.file_cache_prewarm_limit=1000",
+    ]
+    endpoint = env.endpoints.create_start(branch_name="main", config_lines=cfg)
+
+    pg_conn = endpoint.connect()
+    pg_cur = pg_conn.cursor()
+    pg_cur.execute("create schema neon; create extension neon with schema neon")
+    pg_cur.execute("create database lfc")
+
+    lfc_conn = endpoint.connect(dbname="lfc")
+    lfc_cur = lfc_conn.cursor()
+    log.info(f"Inserting {n_records} rows")
+    lfc_cur.execute("create table t(pk integer primary key, payload text default repeat('?', 128))")
+    lfc_cur.execute(f"insert into t (pk) values (generate_series(1,{n_records}))")
+    log.info(f"Inserted {n_records} rows")
+
+    client = endpoint.http_client()
+    method = PrewarmMethod.COMPUTE_CTL
+    offload_lfc(method, client, pg_cur)
+
+    endpoint.stop()
+    endpoint.start()
+
+    thread = Thread(target=lambda: prewarm_endpoint(method, client, pg_cur, None))
+    thread.start()
+    # wait 2 seconds to ensure we cancel prewarm SQL query
+    sleep(2)
+    client.cancel_prewarm_lfc()
+    thread.join()
+    assert client.prewarm_lfc_status()["status"] == "cancelled"
+
+    prewarm_endpoint(method, client, pg_cur, None)
+    assert client.prewarm_lfc_status()["status"] == "completed"


@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
@@ -178,9 +235,8 @@ def test_lfc_prewarm_empty(neon_simple_env: NeonEnv):
    cur = conn.cursor()
    cur.execute("create schema neon; create extension neon with schema neon")
    method = PrewarmMethod.COMPUTE_CTL
-    offload_lfc(method, client, cur)
-    prewarm_endpoint(method, client, cur, None)
-    assert client.prewarm_lfc_status()["status"] == "skipped"
+    assert offload_lfc(method, client, cur)["status"] == "skipped"
+    assert prewarm_endpoint(method, client, cur, None)["status"] == "skipped"


 # autoprewarm isn't needed as we prewarm manually
@@ -251,11 +307,11 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMet

    workload_threads = []
    for _ in range(n_threads):
-        t = threading.Thread(target=workload)
+        t = Thread(target=workload)
        workload_threads.append(t)
        t.start()

-    prewarm_thread = threading.Thread(target=prewarm)
+    prewarm_thread = Thread(target=prewarm)
    prewarm_thread.start()

    def prewarmed():
--- a/test_runner/regress/test_readonly_node.py
+++ b/test_runner/regress/test_readonly_node.py
@@ -129,7 +129,10 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
    Test static endpoint is protected from GC by acquiring and renewing lsn leases.
    """

-    LSN_LEASE_LENGTH = 8
+    LSN_LEASE_LENGTH = (
+        14  # This value needs to be large enough for compute_ctl to send two lease requests.
+    )
+
    neon_env_builder.num_pageservers = 2
    # GC is manual triggered.
    env = neon_env_builder.init_start(
@@ -230,6 +233,15 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
        log.info(f"`SELECT` query succeed after GC, {ctx=}")
        return offset

+    # It's not reliable to let the compute renew the lease in this test case as we have a very tight
+    # lease timeout. Therefore, the test case itself will renew the lease.
+    #
+    # This is a workaround to make the test case more deterministic.
+    def renew_lease(env: NeonEnv, lease_lsn: Lsn):
+        env.storage_controller.pageserver_api().timeline_lsn_lease(
+            env.initial_tenant, env.initial_timeline, lease_lsn
+        )
+
    # Insert some records on main branch
    with env.endpoints.create_start("main", config_lines=["shared_buffers=1MB"]) as ep_main:
        with ep_main.cursor() as cur:
@@ -242,6 +254,9 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
        XLOG_BLCKSZ = 8192
        lsn = Lsn((int(lsn) // XLOG_BLCKSZ) * XLOG_BLCKSZ)

+        # We need to mock the way cplane works: it gets a lease for a branch before starting the compute.
+        renew_lease(env, lsn)
+
        with env.endpoints.create_start(
            branch_name="main",
            endpoint_id="static",
@@ -251,9 +266,6 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
                cur.execute("SELECT count(*) FROM t0")
                assert cur.fetchone() == (ROW_COUNT,)

-            # Wait for static compute to renew lease at least once.
-            time.sleep(LSN_LEASE_LENGTH / 2)
-
            generate_updates_on_main(env, ep_main, 3, end=100)

            offset = trigger_gc_and_select(
@@ -263,10 +275,10 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
            # Trigger Pageserver restarts
            for ps in env.pageservers:
                ps.stop()
-                # Static compute should have at least one lease request failure due to connection.
-                time.sleep(LSN_LEASE_LENGTH / 2)
                ps.start()

+            renew_lease(env, lsn)
+
            trigger_gc_and_select(
                env,
                ep_static,
@@ -282,6 +294,9 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
            )
            env.storage_controller.reconcile_until_idle()

+            # Wait for static compute to renew lease on the new pageserver.
+            time.sleep(LSN_LEASE_LENGTH + 3)
+
            trigger_gc_and_select(
                env,
                ep_static,
@@ -292,7 +307,6 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):

        # Do some update so we can increment gc_cutoff
        generate_updates_on_main(env, ep_main, i, end=100)
-
    # Wait for the existing lease to expire.
    time.sleep(LSN_LEASE_LENGTH + 1)
    # Now trigger GC again, layers should be removed.
--- a/test_runner/regress/test_safekeeper_migration.py
+++ b/test_runner/regress/test_safekeeper_migration.py
@@ -286,3 +286,177 @@ def test_sk_generation_aware_tombstones(neon_env_builder: NeonEnvBuilder):
    assert re.match(r".*Timeline .* deleted.*", exc.value.response.text)
    # The timeline should remain deleted.
    expect_deleted(second_sk)
+
+
+def test_safekeeper_migration_stale_timeline(neon_env_builder: NeonEnvBuilder):
+    """
+    Test that safekeeper migration handles stale timeline correctly by migrating to
+    a safekeeper with a stale timeline.
+    1. Check that we are waiting for the stale timeline to catch up with the commit lsn.
+       The migration might fail if there is no compute to advance the WAL.
+    2. Check that we rely on last_log_term (and not the current term) when waiting for the
+       sync_position on step 7.
+    3. Check that migration succeeds if the compute is running.
+    """
+    neon_env_builder.num_safekeepers = 2
+    neon_env_builder.storage_controller_config = {
+        "timelines_onto_safekeepers": True,
+        "timeline_safekeeper_count": 1,
+    }
+    env = neon_env_builder.init_start()
+    env.pageserver.allowed_errors.extend(PAGESERVER_ALLOWED_ERRORS)
+    env.storage_controller.allowed_errors.append(".*not enough successful .* to reach quorum.*")
+
+    mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
+
+    active_sk = env.get_safekeeper(mconf["sk_set"][0])
+    other_sk = [sk for sk in env.safekeepers if sk.id != active_sk.id][0]
+
+    ep = env.endpoints.create("main", tenant_id=env.initial_tenant)
+    ep.start(safekeeper_generation=1, safekeepers=[active_sk.id])
+    ep.safe_psql("CREATE TABLE t(a int)")
+    ep.safe_psql("INSERT INTO t VALUES (0)")
+
+    # Pull the timeline to other_sk, so other_sk now has a "stale" timeline on it.
+    other_sk.pull_timeline([active_sk], env.initial_tenant, env.initial_timeline)
+
+    # Advance the WAL on active_sk.
+    ep.safe_psql("INSERT INTO t VALUES (1)")
+
+    # The test is more tricky if we have the same last_log_term but different term/flush_lsn.
+    # Stop the active_sk during the endpoint shutdown because otherwise compute_ctl runs
+    # sync_safekeepers and advances last_log_term on active_sk.
+    active_sk.stop()
+    ep.stop(mode="immediate")
+    active_sk.start()
+
+    active_sk_status = active_sk.http_client().timeline_status(
+        env.initial_tenant, env.initial_timeline
+    )
+    other_sk_status = other_sk.http_client().timeline_status(
+        env.initial_tenant, env.initial_timeline
+    )
+
+    # other_sk should have the same last_log_term, but a stale flush_lsn.
+    assert active_sk_status.last_log_term == other_sk_status.last_log_term
+    assert active_sk_status.flush_lsn > other_sk_status.flush_lsn
+
+    commit_lsn = active_sk_status.flush_lsn
+
+    # Bump the term on other_sk to make it higher than active_sk.
+    # This is to make sure we don't use current term instead of last_log_term in the algorithm.
+    other_sk.http_client().term_bump(
+        env.initial_tenant, env.initial_timeline, active_sk_status.term + 100
+    )
+
+    # TODO(diko): now it fails because the timeline on other_sk is stale and there is no compute
+    # to catch up it with active_sk. It might be fixed in https://databricks.atlassian.net/browse/LKB-946
+    # if we delete stale timelines before starting the migration.
+    # But the rest of the test is still valid: we should not lose committed WAL after the migration.
+    with pytest.raises(
+        StorageControllerApiException, match="not enough successful .* to reach quorum"
+    ):
+        env.storage_controller.migrate_safekeepers(
+            env.initial_tenant, env.initial_timeline, [other_sk.id]
+        )
+
+    mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
+    assert mconf["new_sk_set"] == [other_sk.id]
+    assert mconf["sk_set"] == [active_sk.id]
+    assert mconf["generation"] == 2
+
+    # Start the endpoint, so it advances the WAL on other_sk.
+    ep.start(safekeeper_generation=2, safekeepers=[active_sk.id, other_sk.id])
+    # Now the migration should succeed.
+    env.storage_controller.migrate_safekeepers(
+        env.initial_tenant, env.initial_timeline, [other_sk.id]
+    )
+
+    # Check that we didn't lose committed WAL.
+    assert (
+        other_sk.http_client().timeline_status(env.initial_tenant, env.initial_timeline).flush_lsn
+        >= commit_lsn
+    )
+    assert ep.safe_psql("SELECT * FROM t") == [(0,), (1,)]
+
+
+def test_pull_from_most_advanced_sk(neon_env_builder: NeonEnvBuilder):
+    """
+    Test that we pull the timeline from the most advanced safekeeper during the
+    migration and do not lose committed WAL.
+    """
+    neon_env_builder.num_safekeepers = 4
+    neon_env_builder.storage_controller_config = {
+        "timelines_onto_safekeepers": True,
+        "timeline_safekeeper_count": 3,
+    }
+    env = neon_env_builder.init_start()
+    env.pageserver.allowed_errors.extend(PAGESERVER_ALLOWED_ERRORS)
+
+    mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
+
+    sk_set = mconf["sk_set"]
+    assert len(sk_set) == 3
+
+    other_sk = [sk.id for sk in env.safekeepers if sk.id not in sk_set][0]
+
+    ep = env.endpoints.create("main", tenant_id=env.initial_tenant)
+    ep.start(safekeeper_generation=1, safekeepers=sk_set)
+    ep.safe_psql("CREATE TABLE t(a int)")
+    ep.safe_psql("INSERT INTO t VALUES (0)")
+
+    # Stop one sk, so we have a lagging WAL on it.
+    env.get_safekeeper(sk_set[0]).stop()
+    # Advance the WAL on the other sks.
+    ep.safe_psql("INSERT INTO t VALUES (1)")
+
+    # Stop other sks to make sure compute_ctl doesn't advance the last_log_term on them during shutdown.
+    for sk_id in sk_set[1:]:
+        env.get_safekeeper(sk_id).stop()
+    ep.stop(mode="immediate")
+    for sk_id in sk_set:
+        env.get_safekeeper(sk_id).start()
+
+    # Bump the term on the lagging sk to make sure we don't use it to choose the most advanced sk.
+    env.get_safekeeper(sk_set[0]).http_client().term_bump(
+        env.initial_tenant, env.initial_timeline, 100
+    )
+
+    def get_commit_lsn(sk_set: list[int]):
+        flush_lsns = []
+        last_log_terms = []
+        for sk_id in sk_set:
+            sk = env.get_safekeeper(sk_id)
+            status = sk.http_client().timeline_status(env.initial_tenant, env.initial_timeline)
+            flush_lsns.append(status.flush_lsn)
+            last_log_terms.append(status.last_log_term)
+
+        # In this test we assume that all sks have the same last_log_term.
+        assert len(set(last_log_terms)) == 1
+
+        flush_lsns.sort(reverse=True)
+        commit_lsn = flush_lsns[len(sk_set) // 2]
+
+        log.info(f"sk_set: {sk_set}, flush_lsns: {flush_lsns}, commit_lsn: {commit_lsn}")
+        return commit_lsn
+
+    commit_lsn_before_migration = get_commit_lsn(sk_set)
+
+    # Make two migrations, so the lagging sk stays in the sk_set, but other sks are replaced.
+    new_sk_set1 = [sk_set[0], sk_set[1], other_sk]  # remove sk_set[2], add other_sk
+    new_sk_set2 = [sk_set[0], other_sk, sk_set[2]]  # remove sk_set[1], add sk_set[2] back
+    env.storage_controller.migrate_safekeepers(
+        env.initial_tenant, env.initial_timeline, new_sk_set1
+    )
+    env.storage_controller.migrate_safekeepers(
+        env.initial_tenant, env.initial_timeline, new_sk_set2
+    )
+
+    commit_lsn_after_migration = get_commit_lsn(new_sk_set2)
+
+    # We should not lose committed WAL.
+    # If we have choosen the lagging sk to pull the timeline from, this might fail.
+    assert commit_lsn_before_migration <= commit_lsn_after_migration
+
+    ep.start(safekeeper_generation=5, safekeepers=new_sk_set2)
+    assert ep.safe_psql("SELECT * FROM t") == [(0,), (1,)]
--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -1,5 +1,6 @@
 from __future__ import annotations

+import os
 from concurrent.futures import ThreadPoolExecutor
 from typing import TYPE_CHECKING

@@ -768,6 +769,14 @@ def test_lsn_lease_storcon(neon_env_builder: NeonEnvBuilder):
        "compaction_period": "0s",
    }
    env = neon_env_builder.init_start(initial_tenant_conf=conf)
+    # ShardSplit is slow in debug builds, so ignore the warning
+    if os.getenv("BUILD_TYPE", "debug") == "debug":
+        env.storage_controller.allowed_errors.extend(
+            [
+                ".*Exclusive lock by ShardSplit was held.*",
+            ]
+        )
+
    with env.endpoints.create_start(
        "main",
    ) as ep:
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -2757,18 +2757,37 @@ def test_timeline_disk_usage_limit(neon_env_builder: NeonEnvBuilder):
    remote_storage_kind = s3_storage()
    neon_env_builder.enable_safekeeper_remote_storage(remote_storage_kind)

-    # Set a very small disk usage limit (1KB)
-    neon_env_builder.safekeeper_extra_opts = ["--max-timeline-disk-usage-bytes=1024"]
-
    env = neon_env_builder.init_start()

    # Create a timeline and endpoint
    env.create_branch("test_timeline_disk_usage_limit")
-    endpoint = env.endpoints.create_start("test_timeline_disk_usage_limit")
+    endpoint = env.endpoints.create_start(
+        "test_timeline_disk_usage_limit",
+        config_lines=[
+            "neon.lakebase_mode=true",
+        ],
+    )
+
+    # Install the neon extension in the test database. We need it to query perf counter metrics.
+    with closing(endpoint.connect()) as conn:
+        with conn.cursor() as cur:
+            cur.execute("CREATE EXTENSION IF NOT EXISTS neon")
+            # Sanity-check safekeeper connection status in neon_perf_counters in the happy case.
+            cur.execute(
+                "SELECT value FROM neon_perf_counters WHERE metric = 'num_active_safekeepers'"
+            )
+            assert cur.fetchone() == (1,), "Expected 1 active safekeeper"
+            cur.execute(
+                "SELECT value FROM neon_perf_counters WHERE metric = 'num_configured_safekeepers'"
+            )
+            assert cur.fetchone() == (1,), "Expected 1 configured safekeeper"

    # Get the safekeeper
    sk = env.safekeepers[0]

+    # Restart the safekeeper with a very small disk usage limit (1KB)
+    sk.stop().start(["--max-timeline-disk-usage-bytes=1024"])
+
    # Inject a failpoint to stop WAL backup
    with sk.http_client() as http_cli:
        http_cli.configure_failpoints([("backup-lsn-range-pausable", "pause")])
@@ -2794,6 +2813,18 @@ def test_timeline_disk_usage_limit(neon_env_builder: NeonEnvBuilder):
    wait_until(error_logged)
    log.info("Found expected error message in compute log, resuming.")

+    with closing(endpoint.connect()) as conn:
+        with conn.cursor() as cur:
+            # Confirm that neon_perf_counters also indicates that there are no active safekeepers
+            cur.execute(
+                "SELECT value FROM neon_perf_counters WHERE metric = 'num_active_safekeepers'"
+            )
+            assert cur.fetchone() == (0,), "Expected 0 active safekeepers"
+            cur.execute(
+                "SELECT value FROM neon_perf_counters WHERE metric = 'num_configured_safekeepers'"
+            )
+            assert cur.fetchone() == (1,), "Expected 1 configured safekeeper"
+
    # Sanity check that the hanging insert is indeed still hanging. Otherwise means the circuit breaker we
    # implemented didn't work as expected.
    time.sleep(2)