Improve walreceiver logic (#2253)

This patch makes the walreceiver logic more complicated, but it should work better in most cases. It adds `test_wal_lagging` to test scenarios where alive safekeepers can lag behind other alive safekeepers.
- There was a bug where the filter `etcd_info.timeline.commit_lsn > Some(self.local_timeline.get_last_record_lsn())` rejected all safekeepers in some strange cases. I removed this filter; it should probably help with #2237.
- walreceiver_connection now reports its status, including commit_lsn. This allows keeping the safekeeper connection even when etcd is down.
- The safekeeper connection now fails if the pageserver doesn't receive safekeeper messages for some time. Usually a safekeeper sends messages at least once per second.
- The `LaggingWal` check now uses `commit_lsn` directly from the safekeeper. This fixes the issue with frequent reconnects when compute generates WAL really fast.
- `NoWalTimeout` is rewritten to trigger only when we know about new WAL and the connected safekeeper doesn't stream any WAL. This allows setting a small `lagging_wal_timeout`, because it will trigger only when we observe that the connected safekeeper is stuck.
2026-05-26 09:30:37 +00:00 · 2022-08-15 13:31:26 +03:00
parent 431393e361
commit 116ecdf87a
5 changed files with 380 additions and 174 deletions
--- a/test_runner/batch_others/test_wal_acceptor.py
+++ b/test_runner/batch_others/test_wal_acceptor.py
@@ -1090,11 +1090,9 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):

    # Remove initial tenant fully (two branches are active)
    response = sk_http.tenant_delete_force(tenant_id)
-    assert response == {
-        timeline_id_3: {
-            "dir_existed": True,
-            "was_active": True,
-        }
+    assert response[timeline_id_3] == {
+        "dir_existed": True,
+        "was_active": True,
    }
    assert not (sk_data_dir / tenant_id).exists()
    assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir()
--- a/test_runner/batch_others/test_wal_acceptor_async.py
+++ b/test_runner/batch_others/test_wal_acceptor_async.py
@@ -520,3 +520,68 @@ def test_race_conditions(neon_env_builder: NeonEnvBuilder):
    pg = env.postgres.create_start('test_safekeepers_race_conditions')

    asyncio.run(run_race_conditions(env, pg))
+
+
+# Check that pageserver can select safekeeper with largest commit_lsn
+# and switch if LSN is not updated for some time (NoWalTimeout).
+async def run_wal_lagging(env: NeonEnv, pg: Postgres):
+    """Insert rows across restarts with random safekeeper quorums, then check the sum."""
+    def safekeepers_guc(env: NeonEnv, active_sk: List[bool]) -> str:
+        # use ports 10, 11 and 12 to simulate unavailable safekeepers
+        return ','.join([
+            f'localhost:{sk.port.pg if active else 10 + i}'
+            for i, (sk, active) in enumerate(zip(env.safekeepers, active_sk))
+        ])
+
+    conn = await pg.connect_async()
+    await conn.execute('CREATE TABLE t(key int primary key, value text)')
+    await conn.close()
+    pg.stop()
+
+    n_iterations = 20
+    n_txes = 10000
+    expected_sum = 0
+    i = 1
+    quorum = len(env.safekeepers) // 2 + 1
+
+    for it in range(n_iterations):
+        # Redraw in place until a quorum is active: a `for` over range() cannot
+        # repeat an iteration, since the loop variable is rebound on every pass.
+        active_sk = [random.random() >= 0.5 for _ in env.safekeepers]
+        while sum(active_sk) < quorum:
+            active_sk = [random.random() >= 0.5 for _ in env.safekeepers]
+
+        pg.adjust_for_safekeepers(safekeepers_guc(env, active_sk))
+        log.info(f'Iteration {it}: {active_sk}')
+
+        pg.start()
+        conn = await pg.connect_async()
+
+        for _ in range(n_txes):
+            await conn.execute(f"INSERT INTO t values ({i}, 'payload')")
+            expected_sum += i
+            i += 1
+
+        await conn.close()
+        pg.stop()
+
+    pg.adjust_for_safekeepers(safekeepers_guc(env, [True] * len(env.safekeepers)))
+    pg.start()
+    conn = await pg.connect_async()
+
+    log.info(f'Executed {i-1} queries')
+    res = await conn.fetchval('SELECT sum(key) FROM t')
+    await conn.close()
+    assert res == expected_sum
+
+
+# do inserts while restarting postgres and messing with safekeeper addresses
+def test_wal_lagging(neon_env_builder: NeonEnvBuilder):
+    """Run the lagging-WAL scenario against a three-safekeeper environment."""
+    branch_name = 'test_wal_lagging'
+    neon_env_builder.num_safekeepers = 3
+    neon_env = neon_env_builder.init_start()
+
+    neon_env.neon_cli.create_branch(branch_name)
+    endpoint = neon_env.postgres.create_start(branch_name)
+    asyncio.run(run_wal_lagging(neon_env, endpoint))