neon/test_runner/regress/test_hot_standby.py
Christian Schwarz · 643448b1a2 · 2025-07-24 14:00:22 +00:00
test_hot_standby_gc: work around standby_horizon-related flakiness/raciness uncovered by #12431 (#12704)

PR #12431 set the initial lease deadline to 0s for tests. This made
test_hot_standby_gc flaky because GC now actually runs: the test started
failing with `tried to request a page version that was garbage collected`,
because the replica reads below the applied GC cutoff.

The leading theory is that we run timeline_gc() before the first
standby_horizon push arrives at the pageserver. That is definitely something
that can happen with the current standby_horizon mechanism, and it is now
tracked as such in https://databricks.atlassian.net/browse/LKB-2499.

We don't have logs to confirm this theory, but regardless, this PR tries the
fix and we will see whether it stabilizes things.

Refs
- flaky test issue: https://databricks.atlassian.net/browse/LKB-2465
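
The workaround is, roughly: before running timeline_gc, wait until the
pageserver's `pageserver_standby_horizon` metric for the timeline becomes
non-zero, i.e. until at least one standby_horizon push has arrived from the
safekeeper. A minimal sketch of that idea, reusing the pageserver HTTP client
and IDs exactly as they appear in test_hot_standby_gc below (an excerpt, not a
standalone script):

```python
# Only run GC once the pageserver has received a standby_horizon value;
# until then, GC could trim history that the replica still needs to read.
while True:
    val = client.get_metric_value(
        "pageserver_standby_horizon",
        {
            "tenant_id": str(tenant_shard_id.tenant_id),
            "shard_id": str(tenant_shard_id.shard_index),
            "timeline_id": str(timeline_id),
        },
    )
    if val is not None and val != 0:
        break
    time.sleep(0.1)
client.timeline_gc(tenant_shard_id, timeline_id, 0)
```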


from __future__ import annotations

import asyncio
import os
import threading
import time
from functools import partial

import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
    NeonEnv,
    NeonEnvBuilder,
    PgBin,
    log_replica_lag,
    tenant_get_shards,
    wait_replica_caughtup,
)
from fixtures.utils import wait_until


def test_hot_standby(neon_simple_env: NeonEnv):
    env = neon_simple_env

    # We've had a bug caused by WAL records split across multiple XLogData
    # messages resulting in corrupted-WAL complaints on the standby. It
    # reproduced only when sending from the safekeeper is slow enough to batch
    # full MAX_SEND_SIZE messages. So insert a sleep through a failpoint, but
    # only in one configuration to keep test time down.
    slow_down_send = "[debug-pg16]" in os.environ.get("PYTEST_CURRENT_TEST", "")
    if slow_down_send:
        sk_http = env.safekeepers[0].http_client()
        sk_http.configure_failpoints([("sk-send-wal-replica-sleep", "return(100)")])

    with env.endpoints.create_start(
        branch_name="main",
        endpoint_id="primary",
    ) as primary:
        time.sleep(1)
        with env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") as secondary:
            queries = [
                "SHOW neon.timeline_id",
                "SHOW neon.tenant_id",
                "SELECT relname FROM pg_class WHERE relnamespace = current_schema()::regnamespace::oid",
                "SELECT COUNT(*), SUM(i) FROM test",
            ]
            responses = dict()

            with primary.connect() as p_con:
                with p_con.cursor() as p_cur:
                    p_cur.execute("CREATE TABLE test AS SELECT generate_series(1, 100) AS i")

                for query in queries:
                    with p_con.cursor() as p_cur:
                        p_cur.execute(query)
                        res = p_cur.fetchone()
                        assert res is not None
                        response = res
                        responses[query] = response

                # insert more data to make the safekeeper send MAX_SEND_SIZE messages
                if slow_down_send:
                    primary.safe_psql("create table t(key int, value text)")
                    primary.safe_psql("insert into t select generate_series(1, 100000), 'payload'")

            wait_replica_caughtup(primary, secondary)

            with secondary.connect() as s_con:
                with s_con.cursor() as s_cur:
                    s_cur.execute("SELECT 1 WHERE pg_is_in_recovery()")
                    res = s_cur.fetchone()
                    assert res is not None

                for query in queries:
                    with s_con.cursor() as secondary_cursor:
                        secondary_cursor.execute(query)
                        res = secondary_cursor.fetchone()
                        assert res is not None
                        response = res
                        assert response == responses[query]

            # Check for corrupted WAL messages which might otherwise go unnoticed if
            # reconnection fixes this.
            assert not secondary.log_contains(
                "incorrect resource manager data|record with incorrect|invalid magic number|unexpected pageaddr"
            )

    # clean up
    if slow_down_send:
        sk_http.configure_failpoints(("sk-send-wal-replica-sleep", "off"))


def test_2_replicas_start(neon_simple_env: NeonEnv):
    env = neon_simple_env

    with env.endpoints.create_start(
        branch_name="main",
        endpoint_id="primary",
    ) as primary:
        time.sleep(1)
        with env.endpoints.new_replica_start(
            origin=primary, endpoint_id="secondary1"
        ) as secondary1:
            with env.endpoints.new_replica_start(
                origin=primary, endpoint_id="secondary2"
            ) as secondary2:
                wait_replica_caughtup(primary, secondary1)
                wait_replica_caughtup(primary, secondary2)


# Test two different scenarios related to GC of data needed by a hot standby.
#
# When pause_apply is False, the standby is mostly caught up with the primary.
# However, in compute <-> pageserver protocol version 1 only one LSN was sent to
# the pageserver in a page request, and to avoid waits in the pageserver it was
# the last-written LSN cache value. If a page hadn't been updated for a long
# time, that resulted in an error from the pageserver: "Bad request: tried to
# request a page version that was garbage collected". For the primary this
# wasn't a problem because the pageserver always bumped the LSN to the newest
# one; for a standby that would be incorrect since we might get a page fresher
# than the apply LSN. Hence, protocol version 2 introduced two LSNs: the main
# request_lsn (the apply LSN in the case of a standby) and not_modified_since,
# which can be used as an optimization to avoid waiting.
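#
# For illustration (LSN values made up): a standby whose apply LSN is 0/20000000
# can request a page as (request_lsn=0/20000000, not_modified_since=0/10000000)
# if that page was last written at 0/10000000. The GC-cutoff check applies to
# request_lsn, so the request succeeds even after GC has advanced past
# not_modified_since; with protocol version 1 the single LSN played the
# not_modified_since role here, so the same request failed. This is what the
# test below exercises.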
#
# https://github.com/neondatabase/neon/issues/6211
#
# When pause_apply is True we model a standby lagging behind the primary (e.g.
# due to a high max_standby_streaming_delay). To prevent the pageserver from
# removing data still needed by the standby, the apply LSN is propagated along
# the standby -> safekeepers -> broker -> pageserver flow so that the pageserver
# can hold off GC for it.
@pytest.mark.parametrize("pause_apply", [False, True])
def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
    tenant_conf = {
        # set the PITR interval to be small, so we can do GC
        "pitr_interval": "0 s",
        # we want to control gc and checkpoint frequency precisely
        "gc_period": "0s",
        "compaction_period": "0s",
    }
    env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf)
    timeline_id = env.initial_timeline
    tenant_id = env.initial_tenant

    with env.endpoints.create_start(
        branch_name="main",
        endpoint_id="primary",
    ) as primary:
        with env.endpoints.new_replica_start(
            origin=primary,
            endpoint_id="secondary",
            # Protocol version 2 was introduced to fix the issue
            # that this test exercises. With protocol version 1 it
            # fails.
            config_lines=["neon.protocol_version=2"],
        ) as secondary:
            p_cur = primary.connect().cursor()
            p_cur.execute("CREATE EXTENSION neon_test_utils")
            p_cur.execute("CREATE TABLE test (id int primary key) WITH (autovacuum_enabled=false)")
            p_cur.execute("INSERT INTO test SELECT generate_series(1, 10000) AS g")

            wait_replica_caughtup(primary, secondary)

            s_cur = secondary.connect().cursor()
            s_cur.execute("SELECT 1 WHERE pg_is_in_recovery()")
            res = s_cur.fetchone()
            assert res is not None

            s_cur.execute("SELECT COUNT(*) FROM test")
            res = s_cur.fetchone()
            assert res == (10000,)

            # Clear the cache in the standby, so that when we
            # re-execute the query, it will make GetPage
            # requests. This does not clear the last-written LSN cache
            # so we still remember the LSNs of the pages.
            secondary.clear_buffers(cursor=s_cur)

            if pause_apply:
                s_cur.execute("SELECT pg_wal_replay_pause()")

            # Do other stuff on the primary, to advance the WAL
            p_cur.execute("CREATE TABLE test2 AS SELECT generate_series(1, 1000000) AS g")

            # Run GC. The PITR interval is very small, so this advances the GC cutoff LSN
            # very close to the primary's current insert LSN.
            shards = tenant_get_shards(env, tenant_id, None)
            for tenant_shard_id, pageserver in shards:
                client = pageserver.http_client()
                client.timeline_checkpoint(tenant_shard_id, timeline_id)
                client.timeline_compact(tenant_shard_id, timeline_id)

                # Wait for the standby horizon to get propagated.
                # This shouldn't be necessary, but the current mechanism for
                # standby_horizon propagation is imperfect. Detailed
                # description in https://databricks.atlassian.net/browse/LKB-2499
                while True:
                    val = client.get_metric_value(
                        "pageserver_standby_horizon",
                        {
                            "tenant_id": str(tenant_shard_id.tenant_id),
                            "shard_id": str(tenant_shard_id.shard_index),
                            "timeline_id": str(timeline_id),
                        },
                    )
                    log.info(f"waiting for next standby_horizon push from safekeeper, {val=}")
                    if val != 0:
                        break
                    time.sleep(0.1)

                client.timeline_gc(tenant_shard_id, timeline_id, 0)

            # Re-execute the query. The GetPage requests that this
            # generates use old not_modified_since LSNs, older than
            # the GC cutoff, but new request LSNs. (In protocol
            # version 1 there was only one LSN, and this failed.)
            log_replica_lag(primary, secondary)
            s_cur.execute("SELECT COUNT(*) FROM test")
            log_replica_lag(primary, secondary)
            res = s_cur.fetchone()
            assert res == (10000,)


def run_pgbench(connstr: str, pg_bin: PgBin):
    log.info(f"Start a pgbench workload on pg {connstr}")
    pg_bin.run_capture(["pgbench", "-T60", connstr])


# assert that pgbench_accounts and its index are created.
def pgbench_accounts_initialized(ep):
    ep.safe_psql_scalar("select 'pgbench_accounts_pkey'::regclass")


# Test that hot_standby_feedback works in neon (it is forwarded through the
# safekeepers). That is, ensure queries on the standby don't fail during load on
# the primary under the following conditions:
# - pgbench bombards the primary with updates.
# - On the secondary we run a long SELECT over the updated table.
# - Set a small max_standby_streaming_delay: hs feedback should prevent
#   conflicts so apply doesn't need to wait.
# - Do aggressive vacuum on the primary, which still shouldn't create conflicts.
#   Actually this appears to be redundant because of microvacuum.
#
# Without hs feedback enabled we'd see 'User query might have needed to see row
# versions that must be removed.' errors.
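#
# Concretely (this is what the test below checks): with feedback enabled, the
# standby's xmin is reported back through the safekeepers and shows up on the
# primary as a non-NULL xmin on the walproposer's replication slot:
#   select xmin from pg_replication_slots where slot_name = 'wal_proposer_slot'
# and it goes back to NULL once the standby disconnects.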
def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"})
    agressive_vacuum_conf = [
        "log_autovacuum_min_duration = 0",
        "autovacuum_naptime = 10s",
        "autovacuum_vacuum_threshold = 25",
        "autovacuum_vacuum_scale_factor = 0.1",
        "autovacuum_vacuum_cost_delay = -1",
    ]
    with env.endpoints.create_start(
        branch_name="main", endpoint_id="primary", config_lines=agressive_vacuum_conf
    ) as primary:
        # It would be great to use the stricter max_standby_streaming_delay=0s here,
        # but then the test sometimes fails with
        # 'User was holding shared buffer pin for too long.'.
        with env.endpoints.new_replica_start(
            origin=primary,
            endpoint_id="secondary",
            config_lines=[
                "max_standby_streaming_delay=2s",
                "neon.protocol_version=2",
                "hot_standby_feedback=true",
            ],
        ) as secondary:
            log.info(
                f"primary connstr is {primary.connstr()}, secondary connstr {secondary.connstr()}"
            )

            # s10 is about 150MB of data. In debug mode init takes about 15s on SSD.
            pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10", primary.connstr()])
            log.info("pgbench init done in primary")

            t = threading.Thread(target=run_pgbench, args=(primary.connstr(), pg_bin))
            t.start()

            # Wait until we see that pgbench_accounts is created and filled on the
            # replica *and* the index is created. Otherwise index creation would
            # conflict with read queries and hs feedback won't save us.
            wait_until(partial(pgbench_accounts_initialized, secondary), timeout=60)

            # The test should fail if hs feedback is disabled anyway, but cross-check
            # that the walproposer sets some xmin.
            def xmin_is_not_null():
                slot_xmin = primary.safe_psql_scalar(
                    "select xmin from pg_replication_slots where slot_name = 'wal_proposer_slot'",
                    log_query=False,
                )
                log.info(f"xmin is {slot_xmin}")
                assert int(slot_xmin) > 0

            wait_until(xmin_is_not_null)

            for _ in range(1, 5):
                # in debug mode takes about 5-7s
                balance = secondary.safe_psql_scalar("select sum(abalance) from pgbench_accounts")
                log.info(f"balance={balance}")
                log_replica_lag(primary, secondary)

            t.join()

        # check that xmin is reset when the standby is gone
        def xmin_is_null():
            slot_xmin = primary.safe_psql_scalar(
                "select xmin from pg_replication_slots where slot_name = 'wal_proposer_slot'",
                log_query=False,
            )
            log.info(f"xmin is {slot_xmin}")
            assert slot_xmin is None

        wait_until(xmin_is_null)


# Test a race condition between WAL replay and backends performing queries
# https://github.com/neondatabase/neon/issues/7791
def test_replica_query_race(neon_simple_env: NeonEnv):
    env = neon_simple_env

    primary_ep = env.endpoints.create_start(
        branch_name="main",
        endpoint_id="primary",
    )

    with primary_ep.connect() as p_con:
        with p_con.cursor() as p_cur:
            p_cur.execute("CREATE EXTENSION neon_test_utils")
            p_cur.execute("CREATE TABLE test AS SELECT 0 AS counter")

    standby_ep = env.endpoints.new_replica_start(origin=primary_ep, endpoint_id="standby")
    wait_replica_caughtup(primary_ep, standby_ep)

    # In the primary, run a lot of UPDATEs on a single page.
    finished = False
    writecounter = 1

    async def primary_workload():
        nonlocal writecounter, finished
        conn = await primary_ep.connect_async()
        while writecounter < 10000:
            writecounter += 1
            await conn.execute(f"UPDATE test SET counter = {writecounter}")
        finished = True

    # In the standby, at the same time, run queries against it, and repeatedly
    # drop its caches.
    async def standby_workload():
        nonlocal writecounter, finished
        conn = await standby_ep.connect_async()
        reads = 0
        while not finished:
            readcounter = await conn.fetchval("SELECT counter FROM test")

            # Check that the replica is keeping up with the primary. In local
            # testing, the lag between primary and standby is much smaller, in
            # the ballpark of 2-3 counter values. But be generous in case there's
            # some hiccup.
            # assert(writecounter - readcounter < 1000)
            assert readcounter <= writecounter

            if reads % 100 == 0:
                log.info(f"read {reads}: counter {readcounter}, last update {writecounter}")
            reads += 1

            # FIXME: what about LFC clearing?
            await conn.execute("SELECT clear_buffer_cache()")

    async def both():
        await asyncio.gather(
            primary_workload(),
            standby_workload(),
        )

    asyncio.run(both())