mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-26 09:30:37 +00:00
Improve walreceiver logic (#2253)
This patch makes the walreceiver logic more complicated, but it should work better in most cases. Added `test_wal_lagging` to test scenarios where alive safekeepers can lag behind other alive safekeepers. - There was a bug: it looks like the filter `etcd_info.timeline.commit_lsn > Some(self.local_timeline.get_last_record_lsn())` dropped all safekeepers in some strange cases. I removed this filter; it should probably help with #2237. - Now walreceiver_connection reports status, including commit_lsn. This allows keeping the safekeeper connection even when etcd is down. - The safekeeper connection now fails if the pageserver doesn't receive safekeeper messages for some time. Usually a safekeeper sends messages at least once per second. - The `LaggingWal` check now uses `commit_lsn` directly from the safekeeper. This fixes the issue with frequent reconnects when compute generates WAL really fast. - `NoWalTimeout` is rewritten to trigger only when we know about new WAL and the connected safekeeper doesn't stream any WAL. This allows setting a small `lagging_wal_timeout`, because it will trigger only when we observe that the connected safekeeper is stuck.
This commit is contained in:
committed by
GitHub
parent
431393e361
commit
116ecdf87a
@@ -1090,11 +1090,9 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
|
||||
|
||||
# Remove initial tenant fully (two branches are active)
|
||||
response = sk_http.tenant_delete_force(tenant_id)
|
||||
assert response == {
|
||||
timeline_id_3: {
|
||||
"dir_existed": True,
|
||||
"was_active": True,
|
||||
}
|
||||
assert response[timeline_id_3] == {
|
||||
"dir_existed": True,
|
||||
"was_active": True,
|
||||
}
|
||||
assert not (sk_data_dir / tenant_id).exists()
|
||||
assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir()
|
||||
|
||||
@@ -520,3 +520,68 @@ def test_race_conditions(neon_env_builder: NeonEnvBuilder):
|
||||
pg = env.postgres.create_start('test_safekeepers_race_conditions')
|
||||
|
||||
asyncio.run(run_race_conditions(env, pg))
|
||||
|
||||
|
||||
# Check that pageserver can select safekeeper with largest commit_lsn
|
||||
# and switch if LSN is not updated for some time (NoWalTimeout).
|
||||
async def run_wal_lagging(env: NeonEnv, pg: Postgres):
    """Insert rows while only a random quorum of safekeepers is reachable.

    Each round points the compute node at a randomly chosen subset of
    safekeepers (unreachable ones are replaced by dummy ports), restarts
    postgres, and inserts ``n_txes`` rows.  After all rounds postgres is
    restarted with every safekeeper configured and the sum of the inserted
    keys is compared with the expected value.

    Args:
        env: environment whose ``safekeepers`` list is sampled each round.
        pg: compute node that is stopped, reconfigured and restarted.

    Raises:
        ValueError: if ``env`` has no safekeepers (no quorum can be formed).
        AssertionError: if the final ``sum(key)`` does not match.
    """
    def safekeepers_guc(env: NeonEnv, active_sk: List[bool]) -> str:
        # use ports 10, 11 and 12 to simulate unavailable safekeepers
        return ','.join([
            f'localhost:{sk.port.pg if active else 10 + i}'
            for i, (sk, active) in enumerate(zip(env.safekeepers, active_sk))
        ])

    if not env.safekeepers:
        # Without this guard the quorum resampling below would never finish.
        raise ValueError('run_wal_lagging requires at least one safekeeper')

    conn = await pg.connect_async()
    await conn.execute('CREATE TABLE t(key int primary key, value text)')
    await conn.close()
    pg.stop()

    n_iterations = 20
    n_txes = 10000
    expected_sum = 0
    i = 1
    quorum = len(env.safekeepers) // 2 + 1

    for it in range(n_iterations):
        # Resample until a quorum is active.  The previous `it -= 1; continue`
        # did not retry: `for` rebinds the loop variable on every pass, so
        # rounds without a quorum were silently skipped.
        while True:
            active_sk = [random.random() >= 0.5 for _ in env.safekeepers]
            if sum(active_sk) >= quorum:
                break

        pg.adjust_for_safekeepers(safekeepers_guc(env, active_sk))
        log.info(f'Iteration {it}: {active_sk}')

        pg.start()
        conn = await pg.connect_async()

        for _ in range(n_txes):
            await conn.execute(f"INSERT INTO t values ({i}, 'payload')")
            expected_sum += i
            i += 1

        await conn.close()
        pg.stop()

    # Final check runs with every safekeeper configured.
    pg.adjust_for_safekeepers(safekeepers_guc(env, [True] * len(env.safekeepers)))
    pg.start()
    conn = await pg.connect_async()

    try:
        log.info(f'Executed {i-1} queries')

        res = await conn.fetchval('SELECT sum(key) FROM t')
        assert res == expected_sum
    finally:
        # The verification connection used to be leaked.
        await conn.close()
|
||||
|
||||
|
||||
# do inserts while restarting postgres and messing with safekeeper addresses
|
||||
def test_wal_lagging(neon_env_builder: NeonEnvBuilder):
    """Run the WAL-lagging scenario against a three-safekeeper environment.

    Starts an environment with three safekeepers, creates a dedicated branch
    with a compute node on it, and drives ``run_wal_lagging`` to completion.
    """
    branch_name = 'test_wal_lagging'

    neon_env_builder.num_safekeepers = 3
    neon_env = neon_env_builder.init_start()

    neon_env.neon_cli.create_branch(branch_name)
    compute = neon_env.postgres.create_start(branch_name)

    scenario = run_wal_lagging(neon_env, compute)
    asyncio.run(scenario)
|
||||
|
||||
Reference in New Issue
Block a user