Make test_pageserver_lsn_wait_error_safekeeper_stop less aggressive. (#9517)

Previously the test inserted ~150MiB of WAL while expecting page fetching
to work within 1s (wait_lsn_timeout=1s), which made it fail in CI on debug
builds. Instead, wait directly for the wanted condition, i.e. that the
needed safekeepers are reported in the pageserver's "timed out waiting for
WAL" error message. Also set NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES to 1
in this test and the neighbouring one, which reduces execution time from
~2.5m to ~10s.
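
For context: the rewritten test leans on the wait_until helper from
fixtures.utils, which retries a callback until it stops raising. A minimal
sketch of the assumed semantics (the real helper may differ in details):

    import time

    def wait_until(number: int, interval: float, func):
        # Call func() up to `number` times, sleeping `interval` seconds
        # between attempts. Return the first successful result; re-raise
        # the last exception if func never succeeds.
        last_exc = None
        for _ in range(number):
            try:
                return func()
            except Exception as e:
                last_exc = e
                time.sleep(interval)
        raise last_exc

With wait_until(60, 0.5, check) as used in the diff below, a flaky
condition gets up to ~30 seconds to hold instead of a single shot against
a short wait_lsn_timeout.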
Arseny Sher, 2024-10-25 16:13:46 +03:00 (committed via GitHub)
parent e0c7f1ce15
commit c6cf5e7c0f


@@ -1,11 +1,12 @@
 from __future__ import annotations
 
-import time
+import os
 from typing import TYPE_CHECKING
 
 from fixtures.common_types import Lsn, TenantId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder
+from fixtures.utils import wait_until
 
 if TYPE_CHECKING:
     from typing import Any
@@ -19,6 +20,10 @@ def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
     env.pageserver.http_client()
 
+    # In this test we force 'Timed out while waiting for WAL record error' while
+    # fetching basebackup and don't want any retries.
+    os.environ["NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES"] = "1"
+
     tenant_id, timeline_id = env.create_tenant()
     expected_timeout_error = f"Timed out while waiting for WAL record at LSN {future_lsn} to arrive"
     env.pageserver.allowed_errors.append(f".*{expected_timeout_error}.*")
@@ -49,11 +54,14 @@ def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder):
 def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuilder):
     # Trigger WAL wait timeout faster
     def customize_pageserver_toml(ps_cfg: dict[str, Any]):
-        ps_cfg["wait_lsn_timeout"] = "1s"
+        ps_cfg["wait_lsn_timeout"] = "2s"
         tenant_config = ps_cfg.setdefault("tenant_config", {})
         tenant_config["walreceiver_connect_timeout"] = "2s"
         tenant_config["lagging_wal_timeout"] = "2s"
 
+    # In this test we force 'Timed out while waiting for WAL record error' while
+    # fetching basebackup and don't want any retries.
+    os.environ["NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES"] = "1"
+
     neon_env_builder.pageserver_config_override = customize_pageserver_toml
 
     # Have notable SK ids to ensure we check logs for their presence, not some other random numbers
@@ -64,7 +72,6 @@ def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuilder):
     tenant_id, timeline_id = env.create_tenant()
 
-    elements_to_insert = 1_000_000
     expected_timeout_error = f"Timed out while waiting for WAL record at LSN {future_lsn} to arrive"
     env.pageserver.allowed_errors.append(f".*{expected_timeout_error}.*")
 
     # we configure wait_lsn_timeout to a shorter value than the lagging_wal_timeout / walreceiver_connect_timeout
@@ -74,45 +81,50 @@ def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuilder):
         ".*ingesting record with timestamp lagging more than wait_lsn_timeout.*"
     )
 
-    insert_test_elements(env, tenant_id, start=0, count=elements_to_insert)
+    insert_test_elements(env, tenant_id, start=0, count=1)
 
-    try:
-        trigger_wait_lsn_timeout(env, tenant_id)
-    except Exception as e:
-        exception_string = str(e)
-        assert expected_timeout_error in exception_string, "Should time out during waiting for WAL"
-
-        for safekeeper in env.safekeepers:
-            assert (
-                str(safekeeper.id) in exception_string
-            ), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after WAL wait timeout"
+    def all_sks_in_wareceiver_state():
+        try:
+            trigger_wait_lsn_timeout(env, tenant_id)
+        except Exception as e:
+            exception_string = str(e)
+            assert (
+                expected_timeout_error in exception_string
+            ), "Should time out during waiting for WAL"
+
+            for safekeeper in env.safekeepers:
+                assert (
+                    str(safekeeper.id) in exception_string
+                ), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after WAL wait timeout"
+
+    wait_until(60, 0.5, all_sks_in_wareceiver_state)
 
     stopped_safekeeper = env.safekeepers[-1]
    stopped_safekeeper_id = stopped_safekeeper.id
     log.info(f"Stopping safekeeper {stopped_safekeeper.id}")
     stopped_safekeeper.stop()
 
-    # sleep until stopped safekeeper is removed from candidates
-    time.sleep(2)
-
-    # Spend some more time inserting, to ensure SKs report updated statuses and walreceiver in PS have time to update its connection stats.
-    insert_test_elements(env, tenant_id, start=elements_to_insert + 1, count=elements_to_insert)
-
-    try:
-        trigger_wait_lsn_timeout(env, tenant_id)
-    except Exception as e:
-        # Strip out the part before stdout, as it contains full command with the list of all safekeepers
-        exception_string = str(e).split("stdout", 1)[-1]
-        assert expected_timeout_error in exception_string, "Should time out during waiting for WAL"
-
-        for safekeeper in env.safekeepers:
-            if safekeeper.id == stopped_safekeeper_id:
-                assert (
-                    str(safekeeper.id) not in exception_string
-                ), f"Should not have stopped safekeeper {safekeeper.id} printed in walreceiver state after 2nd WAL wait timeout"
-            else:
-                assert (
-                    str(safekeeper.id) in exception_string
-                ), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after 2nd WAL wait timeout"
+    def all_but_stopped_sks_in_wareceiver_state():
+        try:
+            trigger_wait_lsn_timeout(env, tenant_id)
+        except Exception as e:
+            # Strip out the part before stdout, as it contains full command with the list of all safekeepers
+            exception_string = str(e).split("stdout", 1)[-1]
+            assert (
+                expected_timeout_error in exception_string
+            ), "Should time out during waiting for WAL"
+
+            for safekeeper in env.safekeepers:
+                if safekeeper.id == stopped_safekeeper_id:
+                    assert (
+                        str(safekeeper.id) not in exception_string
+                    ), f"Should not have stopped safekeeper {safekeeper.id} printed in walreceiver state after 2nd WAL wait timeout"
+                else:
+                    assert (
+                        str(safekeeper.id) in exception_string
+                    ), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after 2nd WAL wait timeout"
+
+    wait_until(60, 0.5, all_but_stopped_sks_in_wareceiver_state)
 
 
 def insert_test_elements(env: NeonEnv, tenant_id: TenantId, start: int, count: int):