test_runner: improve wait_until (#9936)

Improves `wait_until` by:

* Use `timeout` instead of `iterations`. This allows changing the
timeout/interval parameters independently.
* Make `timeout` and `interval` optional (default 20s and 0.5s). Most
callers don't care.
* Only output status every 1s by default, and add optional
`status_interval` parameter.
* Remove `show_intermediate_error`, this was always emitted anyway.

Most callers have been updated to use the defaults, except where they
had good reason otherwise.
This commit is contained in:
Erik Grinaker
2024-12-02 12:26:15 +02:00
committed by GitHub
parent 45658ccccb
commit 5330122049
46 changed files with 234 additions and 326 deletions

View File

@@ -1736,7 +1736,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
def storage_controller_ready(): def storage_controller_ready():
assert self.ready() is True assert self.ready() is True
wait_until(30, 1, storage_controller_ready) wait_until(storage_controller_ready)
return time.time() - t1 return time.time() - t1
def attach_hook_issue( def attach_hook_issue(
@@ -2574,7 +2574,7 @@ class NeonPageserver(PgProtocol, LogUtils):
log.info(f"any_unstable={any_unstable}") log.info(f"any_unstable={any_unstable}")
assert not any_unstable assert not any_unstable
wait_until(20, 0.5, complete) wait_until(complete)
def __enter__(self) -> Self: def __enter__(self) -> Self:
return self return self
@@ -3973,7 +3973,7 @@ class Endpoint(PgProtocol, LogUtils):
migration_id: int = cur.fetchall()[0][0] migration_id: int = cur.fetchall()[0][0]
assert migration_id >= num_migrations assert migration_id >= num_migrations
wait_until(20, 0.5, check_migrations_done) wait_until(check_migrations_done)
# Mock the extension part of spec passed from control plane for local testing # Mock the extension part of spec passed from control plane for local testing
# endpooint.rs adds content of this file as a part of the spec.json # endpooint.rs adds content of this file as a part of the spec.json
@@ -4489,12 +4489,10 @@ class Safekeeper(LogUtils):
) )
assert stat.remote_consistent_lsn >= lsn and stat.backup_lsn >= lsn.segment_lsn() assert stat.remote_consistent_lsn >= lsn and stat.backup_lsn >= lsn.segment_lsn()
# xxx: max wait is long because we might be waiting for reconnection from wait_until(are_lsns_advanced)
# pageserver to this safekeeper
wait_until(30, 1, are_lsns_advanced)
client.checkpoint(tenant_id, timeline_id) client.checkpoint(tenant_id, timeline_id)
if wait_wal_removal: if wait_wal_removal:
wait_until(30, 1, are_segments_removed) wait_until(are_segments_removed)
def wait_until_paused(self, failpoint: str): def wait_until_paused(self, failpoint: str):
msg = f"at failpoint {failpoint}" msg = f"at failpoint {failpoint}"
@@ -4503,7 +4501,7 @@ class Safekeeper(LogUtils):
log.info(f"waiting for hitting failpoint {failpoint}") log.info(f"waiting for hitting failpoint {failpoint}")
self.assert_log_contains(msg) self.assert_log_contains(msg)
wait_until(20, 0.5, paused) wait_until(paused)
class NeonBroker(LogUtils): class NeonBroker(LogUtils):

View File

@@ -13,7 +13,7 @@ from mypy_boto3_s3.type_defs import (
from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
from fixtures.log_helper import log from fixtures.log_helper import log
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
from fixtures.remote_storage import RemoteStorage, RemoteStorageKind, S3Storage from fixtures.remote_storage import RemoteStorage, S3Storage
from fixtures.utils import wait_until from fixtures.utils import wait_until
if TYPE_CHECKING: if TYPE_CHECKING:
@@ -269,12 +269,7 @@ def wait_timeline_detail_404(
pageserver_http: PageserverHttpClient, pageserver_http: PageserverHttpClient,
tenant_id: TenantId | TenantShardId, tenant_id: TenantId | TenantShardId,
timeline_id: TimelineId, timeline_id: TimelineId,
iterations: int,
interval: float | None = None,
): ):
if interval is None:
interval = 0.25
def timeline_is_missing(): def timeline_is_missing():
data = {} data = {}
try: try:
@@ -287,19 +282,17 @@ def wait_timeline_detail_404(
raise RuntimeError(f"Timeline exists state {data.get('state')}") raise RuntimeError(f"Timeline exists state {data.get('state')}")
wait_until(iterations, interval, func=timeline_is_missing) wait_until(timeline_is_missing)
def timeline_delete_wait_completed( def timeline_delete_wait_completed(
pageserver_http: PageserverHttpClient, pageserver_http: PageserverHttpClient,
tenant_id: TenantId | TenantShardId, tenant_id: TenantId | TenantShardId,
timeline_id: TimelineId, timeline_id: TimelineId,
iterations: int = 20,
interval: float | None = None,
**delete_args, **delete_args,
) -> None: ) -> None:
pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args) pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args)
wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations, interval) wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id)
# remote_storage must not be None, but that's easier for callers to make mypy happy # remote_storage must not be None, but that's easier for callers to make mypy happy
@@ -453,7 +446,3 @@ def many_small_layers_tenant_config() -> dict[str, Any]:
"checkpoint_distance": 1024**2, "checkpoint_distance": 1024**2,
"image_creation_threshold": 100, "image_creation_threshold": 100,
} }
def poll_for_remote_storage_iterations(remote_storage_kind: RemoteStorageKind) -> int:
return 40 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 15

View File

@@ -175,7 +175,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
assert s > Lsn(0) assert s > Lsn(0)
return s return s
return wait_until(30, 1, timeline_start_lsn_non_zero) return wait_until(timeline_start_lsn_non_zero)
def get_commit_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn: def get_commit_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn:
return self.timeline_status(tenant_id, timeline_id).commit_lsn return self.timeline_status(tenant_id, timeline_id).commit_lsn

View File

@@ -19,4 +19,4 @@ def wait_walreceivers_absent(
log.info(f"waiting for walreceivers to be gone, currently {status.walreceivers}") log.info(f"waiting for walreceivers to be gone, currently {status.walreceivers}")
assert len(status.walreceivers) == 0 assert len(status.walreceivers) == 0
wait_until(30, 0.5, walreceivers_absent) wait_until(walreceivers_absent)

View File

@@ -9,6 +9,7 @@ import tarfile
import threading import threading
import time import time
from collections.abc import Callable, Iterable from collections.abc import Callable, Iterable
from datetime import datetime, timedelta
from hashlib import sha256 from hashlib import sha256
from pathlib import Path from pathlib import Path
from typing import TYPE_CHECKING, Any, TypeVar from typing import TYPE_CHECKING, Any, TypeVar
@@ -380,15 +381,10 @@ def start_in_background(
if return_code is not None: if return_code is not None:
error = f"expected subprocess to run but it exited with code {return_code}" error = f"expected subprocess to run but it exited with code {return_code}"
else: else:
attempts = 10
try: try:
wait_until( wait_until(is_started, timeout=10)
number_of_iterations=attempts,
interval=1,
func=is_started,
)
except Exception: except Exception:
error = f"Failed to get correct status from subprocess in {attempts} attempts" error = "Failed to get correct status from subprocess"
except Exception as e: except Exception as e:
error = f"expected subprocess to start but it failed with exception: {e}" error = f"expected subprocess to start but it failed with exception: {e}"
@@ -402,28 +398,31 @@ def start_in_background(
def wait_until( def wait_until(
number_of_iterations: int,
interval: float,
func: Callable[[], WaitUntilRet], func: Callable[[], WaitUntilRet],
show_intermediate_error: bool = False, name: str | None = None,
timeout: float = 20.0, # seconds
interval: float = 0.5, # seconds
status_interval: float = 1.0, # seconds
) -> WaitUntilRet: ) -> WaitUntilRet:
""" """
Wait until 'func' returns successfully, without exception. Returns the Wait until 'func' returns successfully, without exception. Returns the
last return value from the function. last return value from the function.
""" """
if name is None:
name = getattr(func, "__name__", repr(func))
deadline = datetime.now() + timedelta(seconds=timeout)
next_status = datetime.now()
last_exception = None last_exception = None
for i in range(number_of_iterations): while datetime.now() <= deadline:
try: try:
res = func() return func()
except Exception as e: except Exception as e:
log.info("waiting for %s iteration %s failed: %s", func, i + 1, e) if datetime.now() >= next_status:
log.info("waiting for %s: %s", name, e)
next_status = datetime.now() + timedelta(seconds=status_interval)
last_exception = e last_exception = e
if show_intermediate_error:
log.info(e)
time.sleep(interval) time.sleep(interval)
continue raise Exception(f"timed out while waiting for {name}") from last_exception
return res
raise Exception(f"timed out while waiting for {func}") from last_exception
def assert_eq(a, b) -> None: def assert_eq(a, b) -> None:

View File

@@ -60,24 +60,22 @@ def test_clickhouse(remote_pg: RemotePostgres):
"SETTINGS materialized_postgresql_tables_list = 'table1';" "SETTINGS materialized_postgresql_tables_list = 'table1';"
) )
wait_until( wait_until(
120,
0.5,
lambda: query_clickhouse( lambda: query_clickhouse(
client, client,
"select * from db1_postgres.table1 order by 1", "select * from db1_postgres.table1 order by 1",
"ee600d8f7cd05bd0b169fa81f44300a9dd10085a", "ee600d8f7cd05bd0b169fa81f44300a9dd10085a",
), ),
timeout=60,
) )
cur.execute("INSERT INTO table1 (id, column1) VALUES (3, 'ghi'), (4, 'jkl');") cur.execute("INSERT INTO table1 (id, column1) VALUES (3, 'ghi'), (4, 'jkl');")
conn.commit() conn.commit()
wait_until( wait_until(
120,
0.5,
lambda: query_clickhouse( lambda: query_clickhouse(
client, client,
"select * from db1_postgres.table1 order by 1", "select * from db1_postgres.table1 order by 1",
"9eba2daaf7e4d7d27ac849525f68b562ab53947d", "9eba2daaf7e4d7d27ac849525f68b562ab53947d",
), ),
timeout=60,
) )
log.debug("Sleeping before final checking if Neon is still alive") log.debug("Sleeping before final checking if Neon is still alive")
time.sleep(3) time.sleep(3)

View File

@@ -148,14 +148,12 @@ def test_debezium(debezium):
) )
conn.commit() conn.commit()
wait_until( wait_until(
100,
0.5,
lambda: get_kafka_msg( lambda: get_kafka_msg(
consumer, consumer,
ts_ms, ts_ms,
after={"first_name": "John", "last_name": "Dow", "email": "johndow@example.com"}, after={"first_name": "John", "last_name": "Dow", "email": "johndow@example.com"},
), ),
show_intermediate_error=True, timeout=60,
) )
ts_ms = time.time() * 1000 ts_ms = time.time() * 1000
log.info("Insert 2 ts_ms: %s", ts_ms) log.info("Insert 2 ts_ms: %s", ts_ms)
@@ -165,28 +163,24 @@ def test_debezium(debezium):
) )
conn.commit() conn.commit()
wait_until( wait_until(
100,
0.5,
lambda: get_kafka_msg( lambda: get_kafka_msg(
consumer, consumer,
ts_ms, ts_ms,
after={"first_name": "Alex", "last_name": "Row", "email": "alexrow@example.com"}, after={"first_name": "Alex", "last_name": "Row", "email": "alexrow@example.com"},
), ),
show_intermediate_error=True, timeout=60,
) )
ts_ms = time.time() * 1000 ts_ms = time.time() * 1000
log.info("Update ts_ms: %s", ts_ms) log.info("Update ts_ms: %s", ts_ms)
cur.execute("update inventory.customers set first_name = 'Alexander' where id = 2") cur.execute("update inventory.customers set first_name = 'Alexander' where id = 2")
conn.commit() conn.commit()
wait_until( wait_until(
100,
0.5,
lambda: get_kafka_msg( lambda: get_kafka_msg(
consumer, consumer,
ts_ms, ts_ms,
after={"first_name": "Alexander"}, after={"first_name": "Alexander"},
), ),
show_intermediate_error=True, timeout=60,
) )
time.sleep(3) time.sleep(3)
cur.execute("select 1") cur.execute("select 1")

View File

@@ -137,7 +137,7 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape:
startup_line = "INFO version: git(-env)?:" startup_line = "INFO version: git(-env)?:"
# find the first line of the log file so we can find the next start later # find the first line of the log file so we can find the next start later
_, first_start = wait_until(5, 1, lambda: env.pageserver.assert_log_contains(startup_line)) _, first_start = wait_until(lambda: env.pageserver.assert_log_contains(startup_line))
# start without gc so we can time compaction with less noise; use shorter # start without gc so we can time compaction with less noise; use shorter
# period for compaction so it starts earlier # period for compaction so it starts earlier
@@ -156,7 +156,7 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape:
) )
_, second_start = wait_until( _, second_start = wait_until(
5, 1, lambda: env.pageserver.assert_log_contains(startup_line, first_start) lambda: env.pageserver.assert_log_contains(startup_line, first_start),
) )
env.pageserver.quiesce_tenants() env.pageserver.quiesce_tenants()
@@ -164,8 +164,6 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape:
# wait for compaction to complete, which most likely has already done so multiple times # wait for compaction to complete, which most likely has already done so multiple times
msg, _ = wait_until( msg, _ = wait_until(
30,
1,
lambda: env.pageserver.assert_log_contains( lambda: env.pageserver.assert_log_contains(
f".*tenant_id={env.initial_tenant}.*: compaction iteration complete.*", second_start f".*tenant_id={env.initial_tenant}.*: compaction iteration complete.*", second_start
), ),
@@ -205,7 +203,7 @@ def wait_and_record_startup_metrics(
assert len(matching) == len(expected_labels) assert len(matching) == len(expected_labels)
return matching return matching
samples = wait_until(10, 1, metrics_are_filled) samples = wait_until(metrics_are_filled)
for sample in samples: for sample in samples:
phase = sample.labels["phase"] phase = sample.labels["phase"]

View File

@@ -64,8 +64,6 @@ def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, N
) )
wait_until( wait_until(
50,
0.1,
lambda: env.pageserver.assert_log_contains(".*Error processing HTTP request: Bad request"), lambda: env.pageserver.assert_log_contains(".*Error processing HTTP request: Bad request"),
) )

View File

@@ -385,7 +385,7 @@ def test_pageserver_compaction_circuit_breaker(neon_env_builder: NeonEnvBuilder)
# Wait for enough failures to break the circuit breaker # Wait for enough failures to break the circuit breaker
# This wait is fairly long because we back off on compaction failures, so 5 retries takes ~30s # This wait is fairly long because we back off on compaction failures, so 5 retries takes ~30s
wait_until(60, 1, assert_broken) wait_until(assert_broken, timeout=60)
# Sleep for a while, during which time we expect that compaction will _not_ be retried # Sleep for a while, during which time we expect that compaction will _not_ be retried
time.sleep(10) time.sleep(10)

View File

@@ -211,7 +211,7 @@ class EvictionEnv:
pageserver.assert_log_contains(".*running mocked statvfs.*") pageserver.assert_log_contains(".*running mocked statvfs.*")
# we most likely have already completed multiple runs # we most likely have already completed multiple runs
wait_until(10, 1, statvfs_called) wait_until(statvfs_called)
def count_layers_per_tenant( def count_layers_per_tenant(
@@ -772,14 +772,14 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv):
) )
wait_until( wait_until(
10, 1, lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved") lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved")
) )
def less_than_max_usage_pct(): def less_than_max_usage_pct():
post_eviction_total_size, _, _ = env.timelines_du(env.pageserver) post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)
assert post_eviction_total_size < 0.33 * total_size, "we requested max 33% usage" assert post_eviction_total_size < 0.33 * total_size, "we requested max 33% usage"
wait_until(2, 2, less_than_max_usage_pct) wait_until(less_than_max_usage_pct, timeout=5)
# Disk usage candidate collection only takes into account active tenants. # Disk usage candidate collection only takes into account active tenants.
# However, the statvfs call takes into account the entire tenants directory, # However, the statvfs call takes into account the entire tenants directory,
@@ -825,7 +825,7 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv):
) )
wait_until( wait_until(
10, 1, lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved") lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved"),
) )
def more_than_min_avail_bytes_freed(): def more_than_min_avail_bytes_freed():
@@ -834,7 +834,7 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv):
total_size - post_eviction_total_size >= min_avail_bytes total_size - post_eviction_total_size >= min_avail_bytes
), f"we requested at least {min_avail_bytes} worth of free space" ), f"we requested at least {min_avail_bytes} worth of free space"
wait_until(2, 2, more_than_min_avail_bytes_freed) wait_until(more_than_min_avail_bytes_freed, timeout=5)
def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv): def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv):

View File

@@ -257,7 +257,7 @@ def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
# Wait until we see that the pgbench_accounts is created + filled on replica *and* # Wait until we see that the pgbench_accounts is created + filled on replica *and*
# index is created. Otherwise index creation would conflict with # index is created. Otherwise index creation would conflict with
# read queries and hs feedback won't save us. # read queries and hs feedback won't save us.
wait_until(60, 1.0, partial(pgbench_accounts_initialized, secondary)) wait_until(partial(pgbench_accounts_initialized, secondary), timeout=60)
# Test should fail if hs feedback is disabled anyway, but cross # Test should fail if hs feedback is disabled anyway, but cross
# check that walproposer sets some xmin. # check that walproposer sets some xmin.
@@ -269,7 +269,7 @@ def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
log.info(f"xmin is {slot_xmin}") log.info(f"xmin is {slot_xmin}")
assert int(slot_xmin) > 0 assert int(slot_xmin) > 0
wait_until(10, 1.0, xmin_is_not_null) wait_until(xmin_is_not_null)
for _ in range(1, 5): for _ in range(1, 5):
# in debug mode takes about 5-7s # in debug mode takes about 5-7s
balance = secondary.safe_psql_scalar("select sum(abalance) from pgbench_accounts") balance = secondary.safe_psql_scalar("select sum(abalance) from pgbench_accounts")
@@ -286,7 +286,7 @@ def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
log.info(f"xmin is {slot_xmin}") log.info(f"xmin is {slot_xmin}")
assert slot_xmin is None assert slot_xmin is None
wait_until(10, 1.0, xmin_is_null) wait_until(xmin_is_null)
# Test race condition between WAL replay and backends performing queries # Test race condition between WAL replay and backends performing queries

View File

@@ -206,7 +206,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder, attach_mode: str):
future_layers = set(get_future_layers()) future_layers = set(get_future_layers())
assert future_layer not in future_layers assert future_layer not in future_layers
wait_until(10, 0.5, future_layer_is_gone_from_index_part) wait_until(future_layer_is_gone_from_index_part)
# We already make deletion stuck here, but we don't necessarily hit the failpoint # We already make deletion stuck here, but we don't necessarily hit the failpoint
# because deletions are batched. # because deletions are batched.

View File

@@ -37,7 +37,7 @@ def test_logging_event_count(neon_env_builder: NeonEnvBuilder, level: str):
return return
env.pageserver.assert_log_contains(f".*{msg_id}.*") env.pageserver.assert_log_contains(f".*{msg_id}.*")
wait_until(10, 0.5, assert_logged) wait_until(assert_logged)
# make sure it's counted # make sure it's counted
def assert_metric_value(): def assert_metric_value():
@@ -49,4 +49,4 @@ def test_logging_event_count(neon_env_builder: NeonEnvBuilder, level: str):
log.info("libmetrics_tracing_event_count: %s", val) log.info("libmetrics_tracing_event_count: %s", val)
assert val > (before or 0.0) assert val > (before or 0.0)
wait_until(10, 1, assert_metric_value) wait_until(assert_metric_value)

View File

@@ -207,7 +207,7 @@ def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgre
log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}") log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}")
vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1")
wait_until(number_of_iterations=10, interval=2, func=partial(slot_removed, endpoint)) wait_until(partial(slot_removed, endpoint))
def test_ondemand_wal_download_in_replication_slot_funcs(neon_env_builder: NeonEnvBuilder): def test_ondemand_wal_download_in_replication_slot_funcs(neon_env_builder: NeonEnvBuilder):
@@ -519,7 +519,7 @@ def test_replication_shutdown(neon_simple_env: NeonEnv):
assert len(res) == 4 assert len(res) == 4
assert [r[0] for r in res] == [10, 20, 30, 40] assert [r[0] for r in res] == [10, 20, 30, 40]
wait_until(10, 0.5, check_that_changes_propagated) wait_until(check_that_changes_propagated)
def logical_replication_wait_flush_lsn_sync(publisher: PgProtocol) -> Lsn: def logical_replication_wait_flush_lsn_sync(publisher: PgProtocol) -> Lsn:
@@ -549,7 +549,7 @@ select sent_lsn, flush_lsn, pg_current_wal_flush_lsn() from pg_stat_replication
) )
assert flush_lsn >= publisher_flush_lsn assert flush_lsn >= publisher_flush_lsn
wait_until(30, 0.5, check_caughtup) wait_until(check_caughtup)
return publisher_flush_lsn return publisher_flush_lsn

View File

@@ -169,7 +169,7 @@ def test_get_lsn_by_timestamp_cancelled(neon_env_builder: NeonEnvBuilder):
) )
_, offset = wait_until( _, offset = wait_until(
20, 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}")
) )
with pytest.raises(ReadTimeout): with pytest.raises(ReadTimeout):
@@ -178,8 +178,6 @@ def test_get_lsn_by_timestamp_cancelled(neon_env_builder: NeonEnvBuilder):
client.configure_failpoints((failpoint, "off")) client.configure_failpoints((failpoint, "off"))
_, offset = wait_until( _, offset = wait_until(
20,
0.5,
lambda: env.pageserver.assert_log_contains( lambda: env.pageserver.assert_log_contains(
"Cancelled request finished with an error: Cancelled$", offset "Cancelled request finished with an error: Cancelled$", offset
), ),

View File

@@ -77,7 +77,7 @@ def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion):
assert len(res) == 4 assert len(res) == 4
assert [r[0] for r in res] == [10, 20, 30, 40] assert [r[0] for r in res] == [10, 20, 30, 40]
wait_until(10, 0.5, check_that_changes_propagated) wait_until(check_that_changes_propagated)
# Test that pg_monitor is working for neon_superuser role # Test that pg_monitor is working for neon_superuser role
cur.execute("SELECT query from pg_stat_activity LIMIT 1") cur.execute("SELECT query from pg_stat_activity LIMIT 1")

View File

@@ -256,7 +256,7 @@ def test_ondemand_download_timetravel(neon_env_builder: NeonEnvBuilder):
##### Second start, restore the data and ensure it's the same ##### Second start, restore the data and ensure it's the same
env.pageserver.start() env.pageserver.start()
wait_until(10, 0.2, lambda: assert_tenant_state(client, tenant_id, "Active")) wait_until(lambda: assert_tenant_state(client, tenant_id, "Active"))
# The current_physical_size reports the sum of layers loaded in the layer # The current_physical_size reports the sum of layers loaded in the layer
# map, regardless of where the layer files are located. So even though we # map, regardless of where the layer files are located. So even though we
@@ -413,7 +413,7 @@ def test_download_remote_layers_api(
] ]
) )
wait_until(10, 0.2, lambda: assert_tenant_state(client, tenant_id, "Active")) wait_until(lambda: assert_tenant_state(client, tenant_id, "Active"))
###### Phase 1: exercise download error code path ###### Phase 1: exercise download error code path
@@ -705,7 +705,7 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu
) )
_, offset = wait_until( _, offset = wait_until(
20, 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}")
) )
location_conf = {"mode": "Detached", "tenant_conf": {}} location_conf = {"mode": "Detached", "tenant_conf": {}}
@@ -713,8 +713,6 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu
detach = exec.submit(client.tenant_location_conf, env.initial_tenant, location_conf) detach = exec.submit(client.tenant_location_conf, env.initial_tenant, location_conf)
_, offset = wait_until( _, offset = wait_until(
20,
0.5,
lambda: env.pageserver.assert_log_contains( lambda: env.pageserver.assert_log_contains(
"closing is taking longer than expected", offset "closing is taking longer than expected", offset
), ),
@@ -734,8 +732,6 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu
client.configure_failpoints((failpoint, "pause")) client.configure_failpoints((failpoint, "pause"))
_, offset = wait_until( _, offset = wait_until(
20,
0.5,
lambda: env.pageserver.assert_log_contains(f"cfg failpoint: {failpoint} pause", offset), lambda: env.pageserver.assert_log_contains(f"cfg failpoint: {failpoint} pause", offset),
) )
@@ -750,8 +746,6 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu
warmup = exec.submit(client.tenant_secondary_download, env.initial_tenant, wait_ms=30000) warmup = exec.submit(client.tenant_secondary_download, env.initial_tenant, wait_ms=30000)
_, offset = wait_until( _, offset = wait_until(
20,
0.5,
lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}", offset), lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}", offset),
) )
@@ -805,7 +799,7 @@ def test_layer_download_timeouted(neon_env_builder: NeonEnvBuilder):
) )
_, offset = wait_until( _, offset = wait_until(
20, 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}")
) )
# ensure enough time while paused to trip the timeout # ensure enough time while paused to trip the timeout
time.sleep(2) time.sleep(2)
@@ -824,8 +818,6 @@ def test_layer_download_timeouted(neon_env_builder: NeonEnvBuilder):
# capture the next offset for a new synchronization with the failpoint # capture the next offset for a new synchronization with the failpoint
_, offset = wait_until( _, offset = wait_until(
20,
0.5,
lambda: env.pageserver.assert_log_contains(f"cfg failpoint: {failpoint} pause", offset), lambda: env.pageserver.assert_log_contains(f"cfg failpoint: {failpoint} pause", offset),
) )

View File

@@ -117,19 +117,11 @@ def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv):
# We need to wait here because it's possible that we don't have access to # We need to wait here because it's possible that we don't have access to
# the latest WAL yet, when the `timeline_detail` API is first called. # the latest WAL yet, when the `timeline_detail` API is first called.
# See: https://github.com/neondatabase/neon/issues/1768. # See: https://github.com/neondatabase/neon/issues/1768.
lsn = wait_until( lsn = wait_until(lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, None))
number_of_iterations=5,
interval=1,
func=lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, None),
)
# Make a DB modification then expect getting a new WAL receiver's data. # Make a DB modification then expect getting a new WAL receiver's data.
endpoint.safe_psql("INSERT INTO t VALUES (1, 'hey')") endpoint.safe_psql("INSERT INTO t VALUES (1, 'hey')")
wait_until( wait_until(lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, lsn))
number_of_iterations=5,
interval=1,
func=lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, lsn),
)
def test_pageserver_http_api_client(neon_simple_env: NeonEnv): def test_pageserver_http_api_client(neon_simple_env: NeonEnv):

View File

@@ -352,7 +352,7 @@ def test_deletion_queue_recovery(
def assert_some_validations(): def assert_some_validations():
assert get_deletion_queue_validated(ps_http) > 0 assert get_deletion_queue_validated(ps_http) > 0
wait_until(20, 1, assert_some_validations) wait_until(assert_some_validations)
# The validatated keys statistic advances before the header is written, so we # The validatated keys statistic advances before the header is written, so we
# also wait to see the header hit the disk: this seems paranoid but the race # also wait to see the header hit the disk: this seems paranoid but the race
@@ -360,7 +360,7 @@ def test_deletion_queue_recovery(
def assert_header_written(): def assert_header_written():
assert (main_pageserver.workdir / "deletion" / "header-01").exists() assert (main_pageserver.workdir / "deletion" / "header-01").exists()
wait_until(20, 1, assert_header_written) wait_until(assert_header_written)
# If we will lose attachment, then our expectation on restart is that only the ones # If we will lose attachment, then our expectation on restart is that only the ones
# we already validated will execute. Act like only those were present in the queue. # we already validated will execute. Act like only those were present in the queue.
@@ -382,11 +382,11 @@ def test_deletion_queue_recovery(
# After restart, issue a flush to kick the deletion frontend to do recovery. # After restart, issue a flush to kick the deletion frontend to do recovery.
# It should recover all the operations we submitted before the restart. # It should recover all the operations we submitted before the restart.
ps_http.deletion_queue_flush(execute=False) ps_http.deletion_queue_flush(execute=False)
wait_until(20, 0.25, lambda: assert_deletions_submitted(before_restart_depth)) wait_until(lambda: assert_deletions_submitted(before_restart_depth))
# The queue should drain through completely if we flush it # The queue should drain through completely if we flush it
ps_http.deletion_queue_flush(execute=True) ps_http.deletion_queue_flush(execute=True)
wait_until(10, 1, lambda: assert_deletion_queue(ps_http, lambda n: n == 0)) wait_until(lambda: assert_deletion_queue(ps_http, lambda n: n == 0))
if keep_attachment == KeepAttachment.KEEP: if keep_attachment == KeepAttachment.KEEP:
# - If we kept the attachment, then our pre-restart deletions should execute # - If we kept the attachment, then our pre-restart deletions should execute
@@ -564,7 +564,7 @@ def test_multi_attach(
) )
# Initially, the tenant will be attached to the first pageserver (first is default in our test harness) # Initially, the tenant will be attached to the first pageserver (first is default in our test harness)
wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[0], tenant_id, "Active")) wait_until(lambda: assert_tenant_state(http_clients[0], tenant_id, "Active"))
_detail = http_clients[0].timeline_detail(tenant_id, timeline_id) _detail = http_clients[0].timeline_detail(tenant_id, timeline_id)
with pytest.raises(PageserverApiException): with pytest.raises(PageserverApiException):
http_clients[1].timeline_detail(tenant_id, timeline_id) http_clients[1].timeline_detail(tenant_id, timeline_id)
@@ -579,8 +579,8 @@ def test_multi_attach(
pageservers[1].tenant_attach(env.initial_tenant) pageservers[1].tenant_attach(env.initial_tenant)
pageservers[2].tenant_attach(env.initial_tenant) pageservers[2].tenant_attach(env.initial_tenant)
wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[1], tenant_id, "Active")) wait_until(lambda: assert_tenant_state(http_clients[1], tenant_id, "Active"))
wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[2], tenant_id, "Active")) wait_until(lambda: assert_tenant_state(http_clients[2], tenant_id, "Active"))
# Now they all have it attached # Now they all have it attached
_details = list([c.timeline_detail(tenant_id, timeline_id) for c in http_clients]) _details = list([c.timeline_detail(tenant_id, timeline_id) for c in http_clients])

View File

@@ -81,9 +81,7 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
marker = uuid.uuid4().hex marker = uuid.uuid4().hex
ps_http.post_tracing_event("info", marker) ps_http.post_tracing_event("info", marker)
_, marker_offset = wait_until( _, marker_offset = wait_until(lambda: env.pageserver.assert_log_contains(marker, offset=None))
10, 0.5, lambda: env.pageserver.assert_log_contains(marker, offset=None)
)
log.info("run pagebench") log.info("run pagebench")
duration_secs = 10 duration_secs = 10
@@ -103,12 +101,11 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
log.info("validate that we logged the throttling") log.info("validate that we logged the throttling")
wait_until( wait_until(
10,
compaction_period / 10,
lambda: env.pageserver.assert_log_contains( lambda: env.pageserver.assert_log_contains(
f".*{tenant_id}.*shard was throttled in the last n_seconds.*", f".*{tenant_id}.*shard was throttled in the last n_seconds.*",
offset=marker_offset, offset=marker_offset,
), ),
timeout=compaction_period,
) )
log.info("validate that the metric doesn't include throttle wait time") log.info("validate that the metric doesn't include throttle wait time")

View File

@@ -84,7 +84,7 @@ def wait_for_wal_ingest_metric(pageserver_http: PageserverHttpClient) -> float:
# The metric gets initialised on the first update. # The metric gets initialised on the first update.
# Retry a few times, but return 0 if it's stable. # Retry a few times, but return 0 if it's stable.
try: try:
return float(wait_until(3, 0.5, query)) return float(wait_until(query, timeout=2, interval=0.5))
except Exception: except Exception:
return 0 return 0
@@ -131,7 +131,7 @@ def test_pageserver_small_inmemory_layers(
wait_until_pageserver_is_caught_up(env, last_flush_lsns) wait_until_pageserver_is_caught_up(env, last_flush_lsns)
# We didn't write enough data to trigger a size-based checkpoint: we should see dirty data. # We didn't write enough data to trigger a size-based checkpoint: we should see dirty data.
wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) wait_until(lambda: assert_dirty_bytes_nonzero(env))
ps_http_client = env.pageserver.http_client() ps_http_client = env.pageserver.http_client()
total_wal_ingested_before_restart = wait_for_wal_ingest_metric(ps_http_client) total_wal_ingested_before_restart = wait_for_wal_ingest_metric(ps_http_client)
@@ -139,7 +139,7 @@ def test_pageserver_small_inmemory_layers(
# Within ~ the checkpoint interval, all the ephemeral layers should be frozen and flushed, # Within ~ the checkpoint interval, all the ephemeral layers should be frozen and flushed,
# such that there are zero bytes of ephemeral layer left on the pageserver # such that there are zero bytes of ephemeral layer left on the pageserver
log.info("Waiting for background checkpoints...") log.info("Waiting for background checkpoints...")
wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) wait_until(lambda: assert_dirty_bytes(env, 0), timeout=2 * CHECKPOINT_TIMEOUT_SECONDS)
# Zero ephemeral layer bytes does not imply that all the frozen layers were uploaded: they # Zero ephemeral layer bytes does not imply that all the frozen layers were uploaded: they
# must be uploaded to remain visible to the pageserver after restart. # must be uploaded to remain visible to the pageserver after restart.
@@ -180,7 +180,7 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder):
wait_until_pageserver_is_caught_up(env, last_flush_lsns) wait_until_pageserver_is_caught_up(env, last_flush_lsns)
# We didn't write enough data to trigger a size-based checkpoint: we should see dirty data. # We didn't write enough data to trigger a size-based checkpoint: we should see dirty data.
wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) wait_until(lambda: assert_dirty_bytes_nonzero(env))
# Stop the safekeepers, so that we cannot have any more WAL receiver connections # Stop the safekeepers, so that we cannot have any more WAL receiver connections
for sk in env.safekeepers: for sk in env.safekeepers:
@@ -193,7 +193,7 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder):
# Within ~ the checkpoint interval, all the ephemeral layers should be frozen and flushed, # Within ~ the checkpoint interval, all the ephemeral layers should be frozen and flushed,
# such that there are zero bytes of ephemeral layer left on the pageserver # such that there are zero bytes of ephemeral layer left on the pageserver
log.info("Waiting for background checkpoints...") log.info("Waiting for background checkpoints...")
wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) wait_until(lambda: assert_dirty_bytes(env, 0), timeout=2 * CHECKPOINT_TIMEOUT_SECONDS)
# The code below verifies that we do not flush on the first write # The code below verifies that we do not flush on the first write
# after an idle period longer than the checkpoint timeout. # after an idle period longer than the checkpoint timeout.
@@ -210,7 +210,7 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder):
run_worker_for_tenant(env, 5, tenant_with_extra_writes, offset=ENTRIES_PER_TIMELINE) run_worker_for_tenant(env, 5, tenant_with_extra_writes, offset=ENTRIES_PER_TIMELINE)
) )
dirty_after_write = wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) dirty_after_write = wait_until(lambda: assert_dirty_bytes_nonzero(env))
# We shouldn't flush since we've just opened a new layer # We shouldn't flush since we've just opened a new layer
waited_for = 0 waited_for = 0
@@ -305,11 +305,11 @@ def test_total_size_limit(neon_env_builder: NeonEnvBuilder):
# Wait until enough layers have rolled that the amount of dirty data is under the threshold. # Wait until enough layers have rolled that the amount of dirty data is under the threshold.
# We do this indirectly via layer maps, rather than the dirty bytes metric, to avoid false-passing # We do this indirectly via layer maps, rather than the dirty bytes metric, to avoid false-passing
# if that metric isn't updated quickly enough to reflect the dirty bytes exceeding the limit. # if that metric isn't updated quickly enough to reflect the dirty bytes exceeding the limit.
wait_until(compaction_period_s * 2, 1, assert_bytes_rolled) wait_until(assert_bytes_rolled, timeout=2 * compaction_period_s)
# The end state should also have the reported metric under the limit # The end state should also have the reported metric under the limit
def assert_dirty_data_limited(): def assert_dirty_data_limited():
dirty_bytes = get_dirty_bytes(env) dirty_bytes = get_dirty_bytes(env)
assert dirty_bytes < max_dirty_data assert dirty_bytes < max_dirty_data
wait_until(compaction_period_s * 2, 1, lambda: assert_dirty_data_limited()) wait_until(lambda: assert_dirty_data_limited(), timeout=2 * compaction_period_s)

View File

@@ -103,7 +103,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
raise AssertionError("No 'complete' metric yet") raise AssertionError("No 'complete' metric yet")
wait_until(30, 1.0, assert_complete) wait_until(assert_complete)
# Expectation callbacks: arg t is sample value, arg p is the previous phase's sample value # Expectation callbacks: arg t is sample value, arg p is the previous phase's sample value
expectations = [ expectations = [

View File

@@ -356,7 +356,7 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder):
) )
assert destination_lsn >= origin_lsn assert destination_lsn >= origin_lsn
wait_until(100, 0.1, caught_up) wait_until(caught_up)
# The destination should accept writes # The destination should accept writes
workload.churn_rows(64, pageserver_b.id) workload.churn_rows(64, pageserver_b.id)
@@ -411,7 +411,7 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder):
assert submitted is not None assert submitted is not None
assert submitted > 0 assert submitted > 0
wait_until(10, 0.1, blocked_deletions_drained) wait_until(blocked_deletions_drained)
workload.churn_rows(64, pageserver_b.id) workload.churn_rows(64, pageserver_b.id)
workload.validate(pageserver_b.id) workload.validate(pageserver_b.id)
@@ -702,7 +702,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
else: else:
timeout = int(deadline - now) + 1 timeout = int(deadline - now) + 1
try: try:
wait_until(timeout, 1, lambda: pageserver.assert_log_contains(expression)) wait_until(lambda: pageserver.assert_log_contains(expression), timeout=timeout)
except: except:
log.error(f"Timed out waiting for '{expression}'") log.error(f"Timed out waiting for '{expression}'")
raise raise

View File

@@ -215,8 +215,6 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
# wait for lease renewal before running query. # wait for lease renewal before running query.
_, offset = wait_until( _, offset = wait_until(
20,
0.5,
lambda: ep_static.assert_log_contains( lambda: ep_static.assert_log_contains(
"lsn_lease_bg_task.*Request succeeded", offset=offset "lsn_lease_bg_task.*Request succeeded", offset=offset
), ),

View File

@@ -300,9 +300,9 @@ def test_remote_storage_upload_queue_retries(
print_gc_result(gc_result) print_gc_result(gc_result)
assert gc_result["layers_removed"] > 0 assert gc_result["layers_removed"] > 0
wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0)) wait_until(lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0))
wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0)) wait_until(lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0))
wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0)) wait_until(lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0))
# let all future operations queue up # let all future operations queue up
configure_storage_sync_failpoints("return") configure_storage_sync_failpoints("return")
@@ -333,16 +333,28 @@ def test_remote_storage_upload_queue_retries(
# wait for churn thread's data to get stuck in the upload queue # wait for churn thread's data to get stuck in the upload queue
# Exponential back-off in upload queue, so, gracious timeouts. # Exponential back-off in upload queue, so, gracious timeouts.
wait_until(30, 1, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0)) wait_until(
wait_until(30, 1, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 1)) lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0), timeout=30
wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0)) )
wait_until(
lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 1), timeout=30
)
wait_until(
lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0), timeout=30
)
# unblock churn operations # unblock churn operations
configure_storage_sync_failpoints("off") configure_storage_sync_failpoints("off")
wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0)) wait_until(
wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0)) lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0), timeout=30
wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0)) )
wait_until(
lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0), timeout=30
)
wait_until(
lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0), timeout=30
)
# The churn thread doesn't make progress once it blocks on the first wait_completion() call, # The churn thread doesn't make progress once it blocks on the first wait_completion() call,
# so, give it some time to wrap up. # so, give it some time to wrap up.
@@ -580,7 +592,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
> 0 > 0
) )
wait_until(200, 0.1, assert_compacted_and_uploads_queued) wait_until(assert_compacted_and_uploads_queued)
# Regardless, give checkpoint some time to block for good. # Regardless, give checkpoint some time to block for good.
# Not strictly necessary, but might help uncover failure modes in the future. # Not strictly necessary, but might help uncover failure modes in the future.
@@ -598,9 +610,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
] ]
) )
# Generous timeout, because currently deletions can get blocked waiting for compaction timeline_delete_wait_completed(client, tenant_id, timeline_id)
# This can be reduced when https://github.com/neondatabase/neon/issues/4998 is fixed.
timeline_delete_wait_completed(client, tenant_id, timeline_id, iterations=30, interval=1)
assert not timeline_path.exists() assert not timeline_path.exists()
@@ -826,22 +836,16 @@ def wait_upload_queue_empty(
client: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId client: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId
): ):
wait_until( wait_until(
2,
1,
lambda: assert_eq( lambda: assert_eq(
get_queued_count(client, tenant_id, timeline_id, file_kind="layer", op_kind="upload"), 0 get_queued_count(client, tenant_id, timeline_id, file_kind="layer", op_kind="upload"), 0
), ),
) )
wait_until( wait_until(
2,
1,
lambda: assert_eq( lambda: assert_eq(
get_queued_count(client, tenant_id, timeline_id, file_kind="index", op_kind="upload"), 0 get_queued_count(client, tenant_id, timeline_id, file_kind="index", op_kind="upload"), 0
), ),
) )
wait_until( wait_until(
2,
1,
lambda: assert_eq( lambda: assert_eq(
get_queued_count(client, tenant_id, timeline_id, file_kind="layer", op_kind="delete"), 0 get_queued_count(client, tenant_id, timeline_id, file_kind="layer", op_kind="delete"), 0
), ),

View File

@@ -378,7 +378,7 @@ def test_replica_too_many_known_assigned_xids(neon_simple_env: NeonEnv):
return None return None
raise RuntimeError("connection succeeded") raise RuntimeError("connection succeeded")
wait_until(20, 0.5, check_replica_crashed) wait_until(check_replica_crashed)
assert secondary.log_contains("too many KnownAssignedXids") assert secondary.log_contains("too many KnownAssignedXids")
# Replica is crashed, so ignore stop result # Replica is crashed, so ignore stop result

View File

@@ -836,7 +836,7 @@ def test_sharding_split_stripe_size(
assert len(notifications) == 3 assert len(notifications) == 3
assert notifications[2] == expect_after assert notifications[2] == expect_after
wait_until(10, 1, assert_restart_notification) wait_until(assert_restart_notification)
# The quantity of data isn't huge, but debug can be _very_ slow, and the things we're # The quantity of data isn't huge, but debug can be _very_ slow, and the things we're
@@ -1025,7 +1025,7 @@ def test_sharding_ingest_gaps(
assert Lsn(timeline_detail["disk_consistent_lsn"]) >= expect_lsn assert Lsn(timeline_detail["disk_consistent_lsn"]) >= expect_lsn
# We set a short checkpoint timeout: expect things to get frozen+flushed within that # We set a short checkpoint timeout: expect things to get frozen+flushed within that
wait_until(checkpoint_interval_secs * 3, 1, assert_all_disk_consistent) wait_until(assert_all_disk_consistent, timeout=3 * checkpoint_interval_secs)
def assert_all_remote_consistent(): def assert_all_remote_consistent():
""" """
@@ -1037,7 +1037,7 @@ def test_sharding_ingest_gaps(
assert Lsn(timeline_detail["remote_consistent_lsn"]) >= expect_lsn assert Lsn(timeline_detail["remote_consistent_lsn"]) >= expect_lsn
# We set a short checkpoint timeout: expect things to get frozen+flushed within that # We set a short checkpoint timeout: expect things to get frozen+flushed within that
wait_until(checkpoint_interval_secs * 3, 1, assert_all_remote_consistent) wait_until(assert_all_remote_consistent, timeout=3 * checkpoint_interval_secs)
workload.validate() workload.validate()
@@ -1405,14 +1405,14 @@ def test_sharding_split_failures(
# e.g. while waiting for a storage controller to re-attach a parent shard if we failed # e.g. while waiting for a storage controller to re-attach a parent shard if we failed
# inside the pageserver and the storage controller responds by detaching children and attaching # inside the pageserver and the storage controller responds by detaching children and attaching
# parents concurrently (https://github.com/neondatabase/neon/issues/7148) # parents concurrently (https://github.com/neondatabase/neon/issues/7148)
wait_until(10, 1, lambda: workload.churn_rows(10, upload=False, ingest=False)) wait_until(lambda: workload.churn_rows(10, upload=False, ingest=False))
workload.validate() workload.validate()
if failure.fails_forward(env): if failure.fails_forward(env):
log.info("Fail-forward failure, checking split eventually completes...") log.info("Fail-forward failure, checking split eventually completes...")
# A failure type which results in eventual completion of the split # A failure type which results in eventual completion of the split
wait_until(30, 1, assert_split_done) wait_until(assert_split_done)
elif failure.can_mitigate(): elif failure.can_mitigate():
log.info("Mitigating failure...") log.info("Mitigating failure...")
# Mitigation phase: we expect to be able to proceed with a successful shard split # Mitigation phase: we expect to be able to proceed with a successful shard split
@@ -1420,21 +1420,21 @@ def test_sharding_split_failures(
# The split should appear to be rolled back from the point of view of all pageservers # The split should appear to be rolled back from the point of view of all pageservers
# apart from the one that is offline # apart from the one that is offline
wait_until(30, 1, lambda: assert_rolled_back(exclude_ps_id=failure.pageserver_id)) wait_until(lambda: assert_rolled_back(exclude_ps_id=failure.pageserver_id))
finish_split() finish_split()
wait_until(30, 1, lambda: assert_split_done(exclude_ps_id=failure.pageserver_id)) wait_until(lambda: assert_split_done(exclude_ps_id=failure.pageserver_id))
# Having cleared the failure, everything should converge to a pristine state # Having cleared the failure, everything should converge to a pristine state
failure.clear(env) failure.clear(env)
wait_until(30, 1, assert_split_done) wait_until(assert_split_done)
else: else:
# Once we restore the faulty pageserver's API to good health, rollback should # Once we restore the faulty pageserver's API to good health, rollback should
# eventually complete. # eventually complete.
log.info("Clearing failure...") log.info("Clearing failure...")
failure.clear(env) failure.clear(env)
wait_until(30, 1, assert_rolled_back) wait_until(assert_rolled_back)
# Having rolled back, the tenant should be working # Having rolled back, the tenant should be working
workload.churn_rows(10) workload.churn_rows(10)

View File

@@ -154,7 +154,7 @@ def test_storage_controller_smoke(neon_env_builder: NeonEnvBuilder, combination)
counts = get_node_shard_counts(env, tenant_ids) counts = get_node_shard_counts(env, tenant_ids)
assert counts[node_id] == 0 assert counts[node_id] == 0
wait_until(10, 1, lambda: node_evacuated(env.pageservers[0].id)) wait_until(lambda: node_evacuated(env.pageservers[0].id))
# Let all the reconciliations after marking the node offline complete # Let all the reconciliations after marking the node offline complete
env.storage_controller.reconcile_until_idle() env.storage_controller.reconcile_until_idle()
@@ -222,7 +222,7 @@ def test_node_status_after_restart(
def is_ready(): def is_ready():
assert env.storage_controller.ready() is True assert env.storage_controller.ready() is True
wait_until(30, 1, is_ready) wait_until(is_ready)
# We loaded nodes from database on restart # We loaded nodes from database on restart
nodes = env.storage_controller.node_list() nodes = env.storage_controller.node_list()
@@ -606,7 +606,7 @@ def test_storage_controller_compute_hook(
counts = get_node_shard_counts(env, [env.initial_tenant]) counts = get_node_shard_counts(env, [env.initial_tenant])
assert counts[node_id] == 0 assert counts[node_id] == 0
wait_until(10, 1, lambda: node_evacuated(env.pageservers[0].id)) wait_until(lambda: node_evacuated(env.pageservers[0].id))
# Additional notification from migration # Additional notification from migration
log.info(f"notifications: {notifications}") log.info(f"notifications: {notifications}")
@@ -620,7 +620,7 @@ def test_storage_controller_compute_hook(
assert len(notifications) == 2 assert len(notifications) == 2
assert notifications[1] == expect assert notifications[1] == expect
wait_until(20, 0.25, received_migration_notification) wait_until(received_migration_notification)
# When we restart, we should re-emit notifications for all tenants # When we restart, we should re-emit notifications for all tenants
env.storage_controller.stop() env.storage_controller.stop()
@@ -630,7 +630,7 @@ def test_storage_controller_compute_hook(
assert len(notifications) == 3 assert len(notifications) == 3
assert notifications[2] == expect assert notifications[2] == expect
wait_until(10, 1, received_restart_notification) wait_until(received_restart_notification)
# Splitting a tenant should cause its stripe size to become visible in the compute notification # Splitting a tenant should cause its stripe size to become visible in the compute notification
env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=2) env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=2)
@@ -647,7 +647,7 @@ def test_storage_controller_compute_hook(
assert len(notifications) == 4 assert len(notifications) == 4
assert notifications[3] == expect assert notifications[3] == expect
wait_until(10, 1, received_split_notification) wait_until(received_split_notification)
# If the compute hook is unavailable, that should not block creating a tenant and # If the compute hook is unavailable, that should not block creating a tenant and
# creating a timeline. This simulates a control plane refusing to accept notifications # creating a timeline. This simulates a control plane refusing to accept notifications
@@ -736,7 +736,7 @@ def test_storage_controller_stuck_compute_hook(
def logged_stuck(): def logged_stuck():
env.storage_controller.assert_log_contains(NOTIFY_BLOCKED_LOG) env.storage_controller.assert_log_contains(NOTIFY_BLOCKED_LOG)
wait_until(10, 0.25, logged_stuck) wait_until(logged_stuck)
contains_r = env.storage_controller.log_contains(NOTIFY_BLOCKED_LOG) contains_r = env.storage_controller.log_contains(NOTIFY_BLOCKED_LOG)
assert contains_r is not None # Appease mypy assert contains_r is not None # Appease mypy
(_, log_cursor) = contains_r (_, log_cursor) = contains_r
@@ -764,7 +764,7 @@ def test_storage_controller_stuck_compute_hook(
def logged_stuck_again(): def logged_stuck_again():
env.storage_controller.assert_log_contains(NOTIFY_BLOCKED_LOG, offset=log_cursor) env.storage_controller.assert_log_contains(NOTIFY_BLOCKED_LOG, offset=log_cursor)
wait_until(10, 0.25, logged_stuck_again) wait_until(logged_stuck_again)
assert migrate_fut.running() assert migrate_fut.running()
# This time, the compute hook remains stuck, but we mark the origin node offline: this should # This time, the compute hook remains stuck, but we mark the origin node offline: this should
@@ -865,7 +865,7 @@ def test_storage_controller_compute_hook_revert(
assert latest["shards"] is not None assert latest["shards"] is not None
assert latest["shards"][0]["node_id"] == ps_id assert latest["shards"][0]["node_id"] == ps_id
wait_until(30, 1, lambda: notified_ps(pageserver_a.id)) wait_until(lambda: notified_ps(pageserver_a.id))
env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG) env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG)
env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS) env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS)
@@ -880,7 +880,7 @@ def test_storage_controller_compute_hook_revert(
# Although the migration API failed, the hook should still see pageserver B (it remembers what # Although the migration API failed, the hook should still see pageserver B (it remembers what
# was posted even when returning an error code) # was posted even when returning an error code)
wait_until(30, 1, lambda: notified_ps(pageserver_b.id)) wait_until(lambda: notified_ps(pageserver_b.id))
# Although the migration API failed, the tenant should still have moved to the right pageserver # Although the migration API failed, the tenant should still have moved to the right pageserver
assert len(pageserver_b.http_client().tenant_list()) == 1 assert len(pageserver_b.http_client().tenant_list()) == 1
@@ -898,7 +898,7 @@ def test_storage_controller_compute_hook_revert(
def logged_giving_up(): def logged_giving_up():
env.storage_controller.assert_log_contains(".*Giving up on compute notification.*") env.storage_controller.assert_log_contains(".*Giving up on compute notification.*")
wait_until(30, 1, logged_giving_up) wait_until(logged_giving_up)
pageserver_a.start() pageserver_a.start()
@@ -919,7 +919,7 @@ def test_storage_controller_compute_hook_revert(
handle_params["status"] = 200 handle_params["status"] = 200
env.storage_controller.tenant_shard_migrate(tenant_shard_id, pageserver_a.id) env.storage_controller.tenant_shard_migrate(tenant_shard_id, pageserver_a.id)
wait_until(30, 1, lambda: notified_ps(pageserver_a.id)) wait_until(lambda: notified_ps(pageserver_a.id))
def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder):
@@ -1453,7 +1453,7 @@ def test_storage_controller_heartbeats(
# Check that each node got one tenant # Check that each node got one tenant
assert all(len(ts) == 1 for ts in node_to_tenants.values()) assert all(len(ts) == 1 for ts in node_to_tenants.values())
wait_until(10, 1, tenants_placed) wait_until(tenants_placed)
# ... then we apply the failure # ... then we apply the failure
offline_node_ids = set(failure.nodes()) offline_node_ids = set(failure.nodes())
@@ -1476,7 +1476,7 @@ def test_storage_controller_heartbeats(
assert node["availability"] == "Offline" assert node["availability"] == "Offline"
start = time.time() start = time.time()
wait_until(failure.offline_timeout, 1, nodes_offline) wait_until(nodes_offline, timeout=failure.offline_timeout)
detected_after = time.time() - start detected_after = time.time() - start
log.info(f"Detected node failures after {detected_after}s") log.info(f"Detected node failures after {detected_after}s")
@@ -1497,7 +1497,7 @@ def test_storage_controller_heartbeats(
assert observed_tenants == set(tenant_ids) assert observed_tenants == set(tenant_ids)
wait_until(10, 1, tenant_migrated) wait_until(tenant_migrated)
# ... then we clear the failure # ... then we clear the failure
failure.clear(env) failure.clear(env)
@@ -1509,7 +1509,7 @@ def test_storage_controller_heartbeats(
if node["id"] in online_node_ids: if node["id"] in online_node_ids:
assert node["availability"] == "Active" assert node["availability"] == "Active"
wait_until(10, 1, nodes_online) wait_until(nodes_online)
time.sleep(5) time.sleep(5)
@@ -1562,7 +1562,7 @@ def test_storage_controller_re_attach(neon_env_builder: NeonEnvBuilder):
# We could pre-empty this by configuring the node to Offline, but it's preferable to test # We could pre-empty this by configuring the node to Offline, but it's preferable to test
# the realistic path we would take when a node restarts uncleanly. # the realistic path we would take when a node restarts uncleanly.
# The delay here will be ~NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL in neon_local # The delay here will be ~NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL in neon_local
wait_until(30, 1, failed_over) wait_until(failed_over)
reconciles_before_restart = env.storage_controller.get_metric_value( reconciles_before_restart = env.storage_controller.get_metric_value(
"storage_controller_reconcile_complete_total", filter={"status": "ok"} "storage_controller_reconcile_complete_total", filter={"status": "ok"}
@@ -1640,12 +1640,12 @@ def test_storage_controller_shard_scheduling_policy(neon_env_builder: NeonEnvBui
assert e > n assert e > n
return e return e
errs = wait_until(10, 1, lambda: assert_errors_gt(0)) errs = wait_until(lambda: assert_errors_gt(0))
# Try reconciling again, it should fail again # Try reconciling again, it should fail again
with pytest.raises(StorageControllerApiException): with pytest.raises(StorageControllerApiException):
env.storage_controller.reconcile_all() env.storage_controller.reconcile_all()
errs = wait_until(10, 1, lambda: assert_errors_gt(errs)) errs = wait_until(lambda: assert_errors_gt(errs))
# Configure the tenant to disable reconciles # Configure the tenant to disable reconciles
env.storage_controller.tenant_policy_update( env.storage_controller.tenant_policy_update(
@@ -1674,7 +1674,7 @@ def test_storage_controller_shard_scheduling_policy(neon_env_builder: NeonEnvBui
return o return o
# We should see a successful reconciliation # We should see a successful reconciliation
wait_until(10, 1, lambda: assert_ok_gt(0)) wait_until(lambda: assert_ok_gt(0))
# And indeed the tenant should be attached # And indeed the tenant should be attached
assert len(env.pageserver.http_client().tenant_list_locations()["tenant_shards"]) == 1 assert len(env.pageserver.http_client().tenant_list_locations()["tenant_shards"]) == 1
@@ -2073,7 +2073,7 @@ def test_skip_drain_on_secondary_lag(neon_env_builder: NeonEnvBuilder, pg_bin: P
raise Exception(f"Secondary lag not big enough: {lag}") raise Exception(f"Secondary lag not big enough: {lag}")
log.info(f"Looking for lag to develop on the secondary {secondary}") log.info(f"Looking for lag to develop on the secondary {secondary}")
wait_until(10, 1, secondary_is_lagging) wait_until(secondary_is_lagging)
log.info(f"Starting drain of primary {primary} with laggy secondary {secondary}") log.info(f"Starting drain of primary {primary} with laggy secondary {secondary}")
env.storage_controller.retryable_node_operation( env.storage_controller.retryable_node_operation(
@@ -2107,7 +2107,7 @@ def test_skip_drain_on_secondary_lag(neon_env_builder: NeonEnvBuilder, pg_bin: P
if lag > 1 * 1024 * 1024: if lag > 1 * 1024 * 1024:
raise Exception(f"Secondary lag not big enough: {lag}") raise Exception(f"Secondary lag not big enough: {lag}")
wait_until(10, 1, lag_is_acceptable) wait_until(lag_is_acceptable)
env.storage_controller.node_configure(primary, {"scheduling": "Active"}) env.storage_controller.node_configure(primary, {"scheduling": "Active"})
@@ -2227,7 +2227,7 @@ def test_storage_controller_node_deletion(
log.info(f"Shards on nodes other than on victim: {elsewhere}") log.info(f"Shards on nodes other than on victim: {elsewhere}")
assert elsewhere == tenant_count * shard_count_per_tenant assert elsewhere == tenant_count * shard_count_per_tenant
wait_until(30, 1, assert_shards_migrated) wait_until(assert_shards_migrated)
log.info(f"Deleting pageserver {victim.id}") log.info(f"Deleting pageserver {victim.id}")
env.storage_controller.node_delete(victim.id) env.storage_controller.node_delete(victim.id)
@@ -2240,7 +2240,7 @@ def test_storage_controller_node_deletion(
log.info(f"Shards on node {victim.id}: {count}") log.info(f"Shards on node {victim.id}: {count}")
assert count == 0 assert count == 0
wait_until(30, 1, assert_victim_evacuated) wait_until(assert_victim_evacuated)
# The node should be gone from the list API # The node should be gone from the list API
assert victim.id not in [n["id"] for n in env.storage_controller.node_list()] assert victim.id not in [n["id"] for n in env.storage_controller.node_list()]
@@ -2569,7 +2569,7 @@ def test_storage_controller_leadership_transfer(
== StorageControllerLeadershipStatus.STEPPED_DOWN == StorageControllerLeadershipStatus.STEPPED_DOWN
) )
wait_until(5, 1, previous_stepped_down) wait_until(previous_stepped_down)
storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_2_port}") storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_2_port}")
@@ -2579,7 +2579,7 @@ def test_storage_controller_leadership_transfer(
== StorageControllerLeadershipStatus.LEADER == StorageControllerLeadershipStatus.LEADER
) )
wait_until(15, 1, new_becomes_leader) wait_until(new_becomes_leader)
leader = env.storage_controller.get_leader() leader = env.storage_controller.get_leader()
assert leader["address"] == f"http://127.0.0.1:{storage_controller_2_port}/" assert leader["address"] == f"http://127.0.0.1:{storage_controller_2_port}/"
@@ -2624,7 +2624,7 @@ def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvB
env.storage_controller.configure_failpoints(("sleepy-drain-loop", "return(10000)")) env.storage_controller.configure_failpoints(("sleepy-drain-loop", "return(10000)"))
env.storage_controller.node_drain(attached.id) env.storage_controller.node_drain(attached.id)
wait_until(10, 0.5, attached_is_draining) wait_until(attached_is_draining)
attached.restart() attached.restart()
@@ -2646,7 +2646,7 @@ def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvB
env.storage_controller.node_configure(attached.id, {"scheduling": "Pause"}) env.storage_controller.node_configure(attached.id, {"scheduling": "Pause"})
# allow for small delay between actually having cancelled and being able reconfigure again # allow for small delay between actually having cancelled and being able reconfigure again
wait_until(4, 0.5, reconfigure_node_again) wait_until(reconfigure_node_again)
def test_storage_controller_timeline_crud_race(neon_env_builder: NeonEnvBuilder): def test_storage_controller_timeline_crud_race(neon_env_builder: NeonEnvBuilder):
@@ -2691,7 +2691,7 @@ def test_storage_controller_timeline_crud_race(neon_env_builder: NeonEnvBuilder)
ps.log_contains(f"at failpoint {failpoint}") is not None for ps in env.pageservers ps.log_contains(f"at failpoint {failpoint}") is not None for ps in env.pageservers
) )
wait_until(10, 1, has_hit_failpoint) wait_until(has_hit_failpoint)
# Migrate the tenant while the timeline creation is in progress: this migration will complete once it # Migrate the tenant while the timeline creation is in progress: this migration will complete once it
# can detach from the old pageserver, which will happen once the failpoint completes. # can detach from the old pageserver, which will happen once the failpoint completes.
@@ -2775,7 +2775,7 @@ def test_storage_controller_validate_during_migration(neon_env_builder: NeonEnvB
def has_hit_compaction_failpoint(): def has_hit_compaction_failpoint():
assert origin_pageserver.log_contains(f"at failpoint {compaction_failpoint}") assert origin_pageserver.log_contains(f"at failpoint {compaction_failpoint}")
wait_until(10, 1, has_hit_compaction_failpoint) wait_until(has_hit_compaction_failpoint)
# While the compaction is running, start a live migration which will pause long enough for the compaction to sleep, # While the compaction is running, start a live migration which will pause long enough for the compaction to sleep,
# after incrementing generation and attaching the new location # after incrementing generation and attaching the new location
@@ -2794,7 +2794,7 @@ def test_storage_controller_validate_during_migration(neon_env_builder: NeonEnvB
# before it reaches this point. The timeout is because the AttachedStale transition includes # before it reaches this point. The timeout is because the AttachedStale transition includes
# a flush of remote storage, and if the compaction already enqueued an index upload this cannot # a flush of remote storage, and if the compaction already enqueued an index upload this cannot
# make progress. # make progress.
wait_until(60, 1, has_hit_migration_failpoint) wait_until(has_hit_migration_failpoint, timeout=60)
# Origin pageserver has succeeded with compaction before the migration completed. It has done all the writes it wanted to do in its own (stale) generation # Origin pageserver has succeeded with compaction before the migration completed. It has done all the writes it wanted to do in its own (stale) generation
origin_pageserver.http_client().configure_failpoints((compaction_failpoint, "off")) origin_pageserver.http_client().configure_failpoints((compaction_failpoint, "off"))
@@ -2917,7 +2917,7 @@ def test_storage_controller_proxy_during_migration(
log.info(expr) log.info(expr)
assert env.storage_controller.log_contains(expr) assert env.storage_controller.log_contains(expr)
wait_until(10, 1, has_hit_migration_failpoint) wait_until(has_hit_migration_failpoint)
# This request should be routed to whichever pageserver holds the highest generation # This request should be routed to whichever pageserver holds the highest generation
tenant_info = env.storage_controller.pageserver_api().tenant_status( tenant_info = env.storage_controller.pageserver_api().tenant_status(
@@ -2934,7 +2934,7 @@ def test_storage_controller_proxy_during_migration(
# We expect request to land on the origin # We expect request to land on the origin
assert tenant_info["generation"] == 1 assert tenant_info["generation"] == 1
wait_until(10, 1, long_migration_metric_published) wait_until(long_migration_metric_published)
# Eventually migration completes # Eventually migration completes
env.storage_controller.configure_failpoints((migration_failpoint.value, "off")) env.storage_controller.configure_failpoints((migration_failpoint.value, "off"))
@@ -3113,7 +3113,7 @@ def test_timeline_delete_mid_live_migration(neon_env_builder: NeonEnvBuilder, mi
log.info(expr) log.info(expr)
assert env.storage_controller.log_contains(expr) assert env.storage_controller.log_contains(expr)
wait_until(10, 1, has_hit_migration_failpoint) wait_until(has_hit_migration_failpoint)
env.storage_controller.pageserver_api().timeline_delete( env.storage_controller.pageserver_api().timeline_delete(
tenant_id=tenant_id, timeline_id=timeline_id tenant_id=tenant_id, timeline_id=timeline_id
@@ -3182,7 +3182,7 @@ def test_multi_attached_timeline_creation(neon_env_builder: NeonEnvBuilder, migr
log.info(expr) log.info(expr)
assert env.storage_controller.log_contains(expr) assert env.storage_controller.log_contains(expr)
wait_until(10, 1, has_hit_migration_failpoint) wait_until(has_hit_migration_failpoint)
timeline_id = TimelineId.generate() timeline_id = TimelineId.generate()
env.storage_controller.pageserver_api().timeline_create( env.storage_controller.pageserver_api().timeline_create(

View File

@@ -431,8 +431,6 @@ def test_scrubber_physical_gc_ancestors_split(neon_env_builder: NeonEnvBuilder):
# Let the controller reach the failpoint # Let the controller reach the failpoint
wait_until( wait_until(
10,
1,
lambda: env.storage_controller.assert_log_contains( lambda: env.storage_controller.assert_log_contains(
'failpoint "shard-split-post-remote-sleep": sleeping' 'failpoint "shard-split-post-remote-sleep": sleeping'
), ),

View File

@@ -56,4 +56,4 @@ def test_subscriber_restart(neon_simple_env: NeonEnv):
pcur.execute(f"INSERT into t values ({n_records}, 0)") pcur.execute(f"INSERT into t values ({n_records}, 0)")
n_records += 1 n_records += 1
with sub.cursor() as scur: with sub.cursor() as scur:
wait_until(60, 0.5, check_that_changes_propagated) wait_until(check_that_changes_propagated)

View File

@@ -234,11 +234,7 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder):
assert not config_path.exists(), "detach did not remove config file" assert not config_path.exists(), "detach did not remove config file"
env.pageserver.tenant_attach(tenant_id) env.pageserver.tenant_attach(tenant_id)
wait_until( wait_until(lambda: assert_tenant_state(http_client, tenant_id, "Active"))
number_of_iterations=5,
interval=1,
func=lambda: assert_tenant_state(http_client, tenant_id, "Active"),
)
env.config_tenant(tenant_id, {"gc_horizon": "1000000"}) env.config_tenant(tenant_id, {"gc_horizon": "1000000"})
contents_first = config_path.read_text() contents_first = config_path.read_text()

View File

@@ -185,21 +185,21 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE
deletion = None deletion = None
try: try:
wait_until(10, 1, has_hit_failpoint) wait_until(has_hit_failpoint)
# it should start ok, sync up with the stuck creation, then hang waiting for the timeline # it should start ok, sync up with the stuck creation, then hang waiting for the timeline
# to shut down. # to shut down.
deletion = Thread(target=start_deletion) deletion = Thread(target=start_deletion)
deletion.start() deletion.start()
wait_until(10, 1, deletion_has_started_waiting_for_timelines) wait_until(deletion_has_started_waiting_for_timelines)
pageserver_http.configure_failpoints((failpoint, "off")) pageserver_http.configure_failpoints((failpoint, "off"))
creation.join() creation.join()
deletion.join() deletion.join()
wait_until(10, 1, tenant_is_deleted) wait_until(tenant_is_deleted)
finally: finally:
creation.join() creation.join()
if deletion is not None: if deletion is not None:
@@ -264,7 +264,7 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder)
def hit_initdb_upload_failpoint(): def hit_initdb_upload_failpoint():
env.pageserver.assert_log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}") env.pageserver.assert_log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}")
wait_until(100, 0.1, hit_initdb_upload_failpoint) wait_until(hit_initdb_upload_failpoint)
def creation_connection_timed_out(): def creation_connection_timed_out():
env.pageserver.assert_log_contains( env.pageserver.assert_log_contains(
@@ -273,7 +273,7 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder)
# Wait so that we hit the timeout and the connection is dropped # Wait so that we hit the timeout and the connection is dropped
# (But timeline creation still continues) # (But timeline creation still continues)
wait_until(100, 0.1, creation_connection_timed_out) wait_until(creation_connection_timed_out)
ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "pause")) ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "pause"))
@@ -281,7 +281,7 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder)
def tenant_delete_inner(): def tenant_delete_inner():
ps_http.tenant_delete(tenant_id) ps_http.tenant_delete(tenant_id)
wait_until(100, 0.5, tenant_delete_inner) wait_until(tenant_delete_inner)
Thread(target=tenant_delete).start() Thread(target=tenant_delete).start()
@@ -290,7 +290,7 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder)
f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause" f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause"
) )
wait_until(100, 0.1, deletion_arrived) wait_until(deletion_arrived)
ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "off")) ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "off"))

View File

@@ -212,7 +212,7 @@ def test_tenant_reattach_while_busy(
nonlocal updates_started, updates_finished, updates_to_perform nonlocal updates_started, updates_finished, updates_to_perform
# Wait until we have performed some updates # Wait until we have performed some updates
wait_until(20, 0.5, lambda: updates_finished > 500) wait_until(lambda: updates_finished > 500)
log.info("Detaching tenant") log.info("Detaching tenant")
pageserver_http.tenant_detach(tenant_id) pageserver_http.tenant_detach(tenant_id)
@@ -512,7 +512,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading(
) )
assert only_int(active) == 0 and only_int(broken) == 1 and only_int(broken_set) == 1 assert only_int(active) == 0 and only_int(broken) == 1 and only_int(broken_set) == 1
wait_until(10, 0.5, found_broken) wait_until(found_broken)
client.tenant_detach(env.initial_tenant) client.tenant_detach(env.initial_tenant)
@@ -524,7 +524,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading(
) )
assert only_int(broken) == 0 and len(broken_set) == 0 assert only_int(broken) == 0 and len(broken_set) == 0
wait_until(10, 0.5, found_cleaned_up) wait_until(found_cleaned_up)
env.pageserver.tenant_attach(env.initial_tenant) env.pageserver.tenant_attach(env.initial_tenant)
@@ -536,4 +536,4 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading(
) )
assert only_int(active) == 1 and len(broken_set) == 0 assert only_int(active) == 1 and len(broken_set) == 0
wait_until(10, 0.5, found_active) wait_until(found_active)

View File

@@ -298,11 +298,7 @@ def test_tenant_relocation(
destination_ps.tenant_attach(tenant_id) destination_ps.tenant_attach(tenant_id)
# wait for tenant to finish attaching # wait for tenant to finish attaching
wait_until( wait_until(lambda: assert_tenant_state(destination_http, tenant_id, "Active"))
number_of_iterations=10,
interval=1,
func=lambda: assert_tenant_state(destination_http, tenant_id, "Active"),
)
check_timeline_attached( check_timeline_attached(
destination_http, destination_http,

View File

@@ -638,7 +638,7 @@ def test_synthetic_size_while_deleting(neon_env_builder: NeonEnvBuilder):
with ThreadPoolExecutor(max_workers=1) as exec: with ThreadPoolExecutor(max_workers=1) as exec:
completion = exec.submit(client.tenant_size, env.initial_tenant) completion = exec.submit(client.tenant_size, env.initial_tenant)
_, last_offset = wait_until( _, last_offset = wait_until(
10, 1.0, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}")
) )
timeline_delete_wait_completed(client, env.initial_tenant, branch_id) timeline_delete_wait_completed(client, env.initial_tenant, branch_id)
@@ -656,8 +656,6 @@ def test_synthetic_size_while_deleting(neon_env_builder: NeonEnvBuilder):
with ThreadPoolExecutor(max_workers=1) as exec: with ThreadPoolExecutor(max_workers=1) as exec:
completion = exec.submit(client.tenant_size, env.initial_tenant) completion = exec.submit(client.tenant_size, env.initial_tenant)
wait_until( wait_until(
10,
1.0,
lambda: env.pageserver.assert_log_contains( lambda: env.pageserver.assert_log_contains(
f"at failpoint {failpoint}", offset=last_offset f"at failpoint {failpoint}", offset=last_offset
), ),

View File

@@ -77,4 +77,4 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder):
assert tasks_started == tasks_ended assert tasks_started == tasks_ended
assert tasks_panicked is None or int(tasks_panicked) == 0 assert tasks_panicked is None or int(tasks_panicked) == 0
wait_until(10, 0.2, assert_tasks_finish) wait_until(assert_tasks_finish)

View File

@@ -330,7 +330,7 @@ def test_pageserver_with_empty_tenants(neon_env_builder: NeonEnvBuilder):
assert len(tenants) == 1 assert len(tenants) == 1
assert all(t["state"]["slug"] != "Attaching" for t in tenants) assert all(t["state"]["slug"] != "Attaching" for t in tenants)
wait_until(10, 0.2, not_attaching) wait_until(not_attaching)
tenants = client.tenant_list() tenants = client.tenant_list()

View File

@@ -178,11 +178,7 @@ def test_tenants_attached_after_download(neon_env_builder: NeonEnvBuilder):
env.pageserver.start() env.pageserver.start()
client = env.pageserver.http_client() client = env.pageserver.http_client()
wait_until( wait_until(lambda: assert_tenant_state(client, tenant_id, "Active"))
number_of_iterations=5,
interval=1,
func=lambda: assert_tenant_state(client, tenant_id, "Active"),
)
restored_timelines = client.timeline_list(tenant_id) restored_timelines = client.timeline_list(tenant_id)
assert ( assert (
@@ -257,11 +253,7 @@ def test_tenant_redownloads_truncated_file_on_startup(
env.pageserver.start() env.pageserver.start()
client = env.pageserver.http_client() client = env.pageserver.http_client()
wait_until( wait_until(lambda: assert_tenant_state(client, tenant_id, "Active"))
number_of_iterations=5,
interval=1,
func=lambda: assert_tenant_state(client, tenant_id, "Active"),
)
restored_timelines = client.timeline_list(tenant_id) restored_timelines = client.timeline_list(tenant_id)
assert ( assert (

View File

@@ -227,8 +227,8 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b
ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=leaf_timeline_id) ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=leaf_timeline_id)
assert timeline_offloaded_logged(leaf_timeline_id) assert timeline_offloaded_logged(leaf_timeline_id)
wait_until(30, 1, leaf_offloaded) wait_until(leaf_offloaded)
wait_until(30, 1, parent_offloaded) wait_until(parent_offloaded)
# Offloaded child timelines should still prevent deletion # Offloaded child timelines should still prevent deletion
with pytest.raises( with pytest.raises(
@@ -331,7 +331,7 @@ def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder, delete_timel
ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=child_timeline_id) ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=child_timeline_id)
assert timeline_offloaded_api(child_timeline_id) assert timeline_offloaded_api(child_timeline_id)
wait_until(30, 1, child_offloaded) wait_until(child_offloaded)
assert timeline_offloaded_api(child_timeline_id) assert timeline_offloaded_api(child_timeline_id)
assert not timeline_offloaded_api(root_timeline_id) assert not timeline_offloaded_api(root_timeline_id)

View File

@@ -21,7 +21,6 @@ from fixtures.pageserver.utils import (
assert_prefix_empty, assert_prefix_empty,
assert_prefix_not_empty, assert_prefix_not_empty,
many_small_layers_tenant_config, many_small_layers_tenant_config,
poll_for_remote_storage_iterations,
timeline_delete_wait_completed, timeline_delete_wait_completed,
wait_for_last_record_lsn, wait_for_last_record_lsn,
wait_for_upload, wait_for_upload,
@@ -94,12 +93,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
assert timeline_path.exists() assert timeline_path.exists()
# retry deletes when compaction or gc is running in pageserver # retry deletes when compaction or gc is running in pageserver
# TODO: review whether this wait_until is actually necessary, we do an await() internally timeline_delete_wait_completed(ps_http, env.initial_tenant, leaf_timeline_id)
wait_until(
number_of_iterations=3,
interval=0.2,
func=lambda: timeline_delete_wait_completed(ps_http, env.initial_tenant, leaf_timeline_id),
)
assert not timeline_path.exists() assert not timeline_path.exists()
@@ -111,13 +105,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id) ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id)
assert exc.value.status_code == 404 assert exc.value.status_code == 404
wait_until( timeline_delete_wait_completed(ps_http, env.initial_tenant, parent_timeline_id)
number_of_iterations=3,
interval=0.2,
func=lambda: timeline_delete_wait_completed(
ps_http, env.initial_tenant, parent_timeline_id
),
)
# Check that we didn't pick up the timeline again after restart. # Check that we didn't pick up the timeline again after restart.
# See https://github.com/neondatabase/neon/issues/3560 # See https://github.com/neondatabase/neon/issues/3560
@@ -226,8 +214,6 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
ps_http.configure_failpoints((failpoint, "return")) ps_http.configure_failpoints((failpoint, "return"))
iterations = poll_for_remote_storage_iterations(remote_storage_kind)
# These failpoints are earlier than background task is spawned. # These failpoints are earlier than background task is spawned.
# so they result in api request failure. # so they result in api request failure.
if failpoint in ( if failpoint in (
@@ -244,7 +230,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
tenant_id=env.initial_tenant, tenant_id=env.initial_tenant,
timeline_id=timeline_id, timeline_id=timeline_id,
expected_state="Broken", expected_state="Broken",
iterations=iterations, iterations=40,
) )
reason = timeline_info["state"]["Broken"]["reason"] reason = timeline_info["state"]["Broken"]["reason"]
@@ -257,25 +243,21 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
env.pageserver.stop() env.pageserver.stop()
env.pageserver.start() env.pageserver.start()
wait_until_tenant_active(ps_http, env.initial_tenant, iterations=iterations) wait_until_tenant_active(ps_http, env.initial_tenant)
if failpoint == "timeline-delete-before-index-deleted-at": if failpoint == "timeline-delete-before-index-deleted-at":
# We crashed before persisting this to remote storage, need to retry delete request # We crashed before persisting this to remote storage, need to retry delete request
timeline_delete_wait_completed(ps_http, env.initial_tenant, timeline_id) timeline_delete_wait_completed(ps_http, env.initial_tenant, timeline_id)
else: else:
# Pageserver should've resumed deletion after restart. # Pageserver should've resumed deletion after restart.
wait_timeline_detail_404( wait_timeline_detail_404(ps_http, env.initial_tenant, timeline_id)
ps_http, env.initial_tenant, timeline_id, iterations=iterations
)
elif check is Check.RETRY_WITHOUT_RESTART: elif check is Check.RETRY_WITHOUT_RESTART:
# this should succeed # this should succeed
# this also checks that delete can be retried even when timeline is in Broken state # this also checks that delete can be retried even when timeline is in Broken state
ps_http.configure_failpoints((failpoint, "off")) ps_http.configure_failpoints((failpoint, "off"))
timeline_delete_wait_completed( timeline_delete_wait_completed(ps_http, env.initial_tenant, timeline_id)
ps_http, env.initial_tenant, timeline_id, iterations=iterations
)
# Check remote is empty # Check remote is empty
if remote_storage_kind is RemoteStorageKind.MOCK_S3: if remote_storage_kind is RemoteStorageKind.MOCK_S3:
@@ -378,7 +360,7 @@ def test_timeline_resurrection_on_attach(
env.pageserver.tenant_attach(tenant_id=tenant_id) env.pageserver.tenant_attach(tenant_id=tenant_id)
wait_until_tenant_active(ps_http, tenant_id=tenant_id, iterations=10, period=0.5) wait_until_tenant_active(ps_http, tenant_id=tenant_id)
timelines = ps_http.timeline_list(tenant_id=tenant_id) timelines = ps_http.timeline_list(tenant_id=tenant_id)
assert {TimelineId(tl["timeline_id"]) for tl in timelines} == { assert {TimelineId(tl["timeline_id"]) for tl in timelines} == {
@@ -439,7 +421,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
# Wait for tenant to finish loading. # Wait for tenant to finish loading.
wait_until_tenant_active(ps_http, tenant_id=env.initial_tenant, iterations=10, period=1) wait_until_tenant_active(ps_http, tenant_id=env.initial_tenant, iterations=10, period=1)
wait_timeline_detail_404(ps_http, env.initial_tenant, leaf_timeline_id, iterations=4) wait_timeline_detail_404(ps_http, env.initial_tenant, leaf_timeline_id)
assert ( assert (
not leaf_timeline_path.exists() not leaf_timeline_path.exists()
@@ -481,11 +463,10 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
) )
# for some reason the check above doesnt immediately take effect for the below. # for some reason the check above doesnt immediately take effect for the below.
# Assume it is mock server incosistency and check twice. # Assume it is mock server incosistency and check a few times.
wait_until( wait_until(
2,
0.5,
lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage), lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage),
timeout=2,
) )
# We deleted our only tenant, and the scrubber fails if it detects nothing # We deleted our only tenant, and the scrubber fails if it detects nothing
@@ -544,7 +525,7 @@ def test_concurrent_timeline_delete_stuck_on(
f".*{child_timeline_id}.*at failpoint {stuck_failpoint}" f".*{child_timeline_id}.*at failpoint {stuck_failpoint}"
) )
wait_until(50, 0.1, first_call_hit_failpoint) wait_until(first_call_hit_failpoint, interval=0.1, status_interval=1.0)
# make the second call and assert behavior # make the second call and assert behavior
log.info("second call start") log.info("second call start")
@@ -613,7 +594,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
def hit_failpoint(): def hit_failpoint():
env.pageserver.assert_log_contains(at_failpoint_log_message) env.pageserver.assert_log_contains(at_failpoint_log_message)
wait_until(50, 0.1, hit_failpoint) wait_until(hit_failpoint, interval=0.1)
# we log this error if a client hangs up # we log this error if a client hangs up
# might as well use it as another indicator that the test works # might as well use it as another indicator that the test works
@@ -623,7 +604,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
def got_hangup_log_message(): def got_hangup_log_message():
env.pageserver.assert_log_contains(hangup_log_message) env.pageserver.assert_log_contains(hangup_log_message)
wait_until(50, 0.1, got_hangup_log_message) wait_until(got_hangup_log_message, interval=0.1)
# check that the timeline is still present # check that the timeline is still present
ps_http.timeline_detail(env.initial_tenant, child_timeline_id) ps_http.timeline_detail(env.initial_tenant, child_timeline_id)
@@ -635,10 +616,10 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
message = f".*DELETE.*{child_timeline_id}.*Cancelled request finished" message = f".*DELETE.*{child_timeline_id}.*Cancelled request finished"
env.pageserver.assert_log_contains(message) env.pageserver.assert_log_contains(message)
wait_until(50, 0.1, first_request_finished) wait_until(first_request_finished, interval=0.1)
# check that the timeline is gone # check that the timeline is gone
wait_timeline_detail_404(ps_http, env.initial_tenant, child_timeline_id, iterations=10) wait_timeline_detail_404(ps_http, env.initial_tenant, child_timeline_id)
def test_timeline_delete_works_for_remote_smoke( def test_timeline_delete_works_for_remote_smoke(
@@ -707,7 +688,7 @@ def test_timeline_delete_works_for_remote_smoke(
# for some reason the check above doesnt immediately take effect for the below. # for some reason the check above doesnt immediately take effect for the below.
# Assume it is mock server inconsistency and check twice. # Assume it is mock server inconsistency and check twice.
wait_until(2, 0.5, lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage)) wait_until(lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage))
# We deleted our only tenant, and the scrubber fails if it detects nothing # We deleted our only tenant, and the scrubber fails if it detects nothing
neon_env_builder.disable_scrub_on_exit() neon_env_builder.disable_scrub_on_exit()
@@ -753,15 +734,13 @@ def test_delete_orphaned_objects(
env.pageserver.allowed_errors.append(f".*failpoint: {failpoint}") env.pageserver.allowed_errors.append(f".*failpoint: {failpoint}")
iterations = poll_for_remote_storage_iterations(remote_storage_kind)
ps_http.timeline_delete(env.initial_tenant, timeline_id) ps_http.timeline_delete(env.initial_tenant, timeline_id)
timeline_info = wait_until_timeline_state( timeline_info = wait_until_timeline_state(
pageserver_http=ps_http, pageserver_http=ps_http,
tenant_id=env.initial_tenant, tenant_id=env.initial_tenant,
timeline_id=timeline_id, timeline_id=timeline_id,
expected_state="Broken", expected_state="Broken",
iterations=iterations, iterations=40,
) )
reason = timeline_info["state"]["Broken"]["reason"] reason = timeline_info["state"]["Broken"]["reason"]
@@ -827,8 +806,6 @@ def test_timeline_delete_resumed_on_attach(
) )
) )
iterations = poll_for_remote_storage_iterations(remote_storage_kind)
ps_http.timeline_delete(tenant_id, timeline_id) ps_http.timeline_delete(tenant_id, timeline_id)
timeline_info = wait_until_timeline_state( timeline_info = wait_until_timeline_state(
@@ -836,7 +813,7 @@ def test_timeline_delete_resumed_on_attach(
tenant_id=env.initial_tenant, tenant_id=env.initial_tenant,
timeline_id=timeline_id, timeline_id=timeline_id,
expected_state="Broken", expected_state="Broken",
iterations=iterations, iterations=40,
) )
reason = timeline_info["state"]["Broken"]["reason"] reason = timeline_info["state"]["Broken"]["reason"]
@@ -871,7 +848,7 @@ def test_timeline_delete_resumed_on_attach(
env.pageserver.tenant_attach(tenant_id=tenant_id) env.pageserver.tenant_attach(tenant_id=tenant_id)
# delete should be resumed # delete should be resumed
wait_timeline_detail_404(ps_http, env.initial_tenant, timeline_id, iterations=iterations) wait_timeline_detail_404(ps_http, env.initial_tenant, timeline_id)
tenant_path = env.pageserver.timeline_dir(tenant_id, timeline_id) tenant_path = env.pageserver.timeline_dir(tenant_id, timeline_id)
assert not tenant_path.exists() assert not tenant_path.exists()

View File

@@ -203,7 +203,7 @@ def test_ancestor_detach_branched_from(
) )
client.timeline_delete(env.initial_tenant, env.initial_timeline) client.timeline_delete(env.initial_tenant, env.initial_timeline)
wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0) wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline)
# because we do the fullbackup from ancestor at the branch_lsn, the zenith.signal is always different # because we do the fullbackup from ancestor at the branch_lsn, the zenith.signal is always different
# as there is always "PREV_LSN: invalid" for "before" # as there is always "PREV_LSN: invalid" for "before"
@@ -336,10 +336,10 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder):
# delete the timelines to confirm detach actually worked # delete the timelines to confirm detach actually worked
client.timeline_delete(env.initial_tenant, after) client.timeline_delete(env.initial_tenant, after)
wait_timeline_detail_404(client, env.initial_tenant, after, 10, 1.0) wait_timeline_detail_404(client, env.initial_tenant, after)
client.timeline_delete(env.initial_tenant, env.initial_timeline) client.timeline_delete(env.initial_tenant, env.initial_timeline)
wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0) wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline)
def test_detached_receives_flushes_while_being_detached(neon_env_builder: NeonEnvBuilder): def test_detached_receives_flushes_while_being_detached(neon_env_builder: NeonEnvBuilder):
@@ -973,17 +973,17 @@ def test_timeline_detach_ancestor_interrupted_by_deletion(
with ThreadPoolExecutor(max_workers=2) as pool: with ThreadPoolExecutor(max_workers=2) as pool:
try: try:
fut = pool.submit(detach_ancestor) fut = pool.submit(detach_ancestor)
offset = wait_until(10, 1.0, at_failpoint) offset = wait_until(at_failpoint)
delete = pool.submit(start_delete) delete = pool.submit(start_delete)
offset = wait_until(10, 1.0, lambda: at_waiting_on_gate_close(offset)) offset = wait_until(lambda: at_waiting_on_gate_close(offset))
victim_http.configure_failpoints((pausepoint, "off")) victim_http.configure_failpoints((pausepoint, "off"))
delete.result() delete.result()
assert wait_until(10, 1.0, is_deleted), f"unimplemented mode {mode}" assert wait_until(is_deleted), f"unimplemented mode {mode}"
# TODO: match the error # TODO: match the error
with pytest.raises(PageserverApiException) as exc: with pytest.raises(PageserverApiException) as exc:
@@ -1115,11 +1115,11 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv
with ThreadPoolExecutor(max_workers=1) as pool: with ThreadPoolExecutor(max_workers=1) as pool:
try: try:
fut = pool.submit(detach_timeline) fut = pool.submit(detach_timeline)
wait_until(10, 1.0, paused_at_failpoint) wait_until(paused_at_failpoint)
# let stuck complete # let stuck complete
stuck_http.configure_failpoints((pausepoint, "off")) stuck_http.configure_failpoints((pausepoint, "off"))
wait_until(10, 1.0, first_completed) wait_until(first_completed)
if mode == "delete_reparentable_timeline": if mode == "delete_reparentable_timeline":
assert first_branch is not None assert first_branch is not None
@@ -1127,7 +1127,7 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv
env.initial_tenant, first_branch env.initial_tenant, first_branch
) )
victim_http.configure_failpoints((pausepoint, "off")) victim_http.configure_failpoints((pausepoint, "off"))
wait_until(10, 1.0, first_branch_gone) wait_until(first_branch_gone)
elif mode == "create_reparentable_timeline": elif mode == "create_reparentable_timeline":
first_branch = create_reparentable_timeline() first_branch = create_reparentable_timeline()
victim_http.configure_failpoints((pausepoint, "off")) victim_http.configure_failpoints((pausepoint, "off"))
@@ -1271,11 +1271,11 @@ def test_retryable_500_hit_through_storcon_during_timeline_detach_ancestor(
with ThreadPoolExecutor(max_workers=1) as pool: with ThreadPoolExecutor(max_workers=1) as pool:
try: try:
fut = pool.submit(detach_timeline) fut = pool.submit(detach_timeline)
wait_until(10, 1.0, paused_at_failpoint) wait_until(paused_at_failpoint)
# let stuck complete # let stuck complete
stuck_http.configure_failpoints((pausepoint, "off")) stuck_http.configure_failpoints((pausepoint, "off"))
wait_until(10, 1.0, first_completed) wait_until(first_completed)
victim_http.configure_failpoints((pausepoint, "off")) victim_http.configure_failpoints((pausepoint, "off"))
@@ -1456,7 +1456,7 @@ def test_retried_detach_ancestor_after_failed_reparenting(neon_env_builder: Neon
# other tests take the "detach? reparent complete", but this only hits # other tests take the "detach? reparent complete", but this only hits
# "complete". # "complete".
http.timeline_delete(env.initial_tenant, env.initial_timeline) http.timeline_delete(env.initial_tenant, env.initial_timeline)
wait_timeline_detail_404(http, env.initial_tenant, env.initial_timeline, 20) wait_timeline_detail_404(http, env.initial_tenant, env.initial_timeline)
http.configure_failpoints(("timeline-detach-ancestor::complete_before_uploading", "off")) http.configure_failpoints(("timeline-detach-ancestor::complete_before_uploading", "off"))
@@ -1518,7 +1518,7 @@ def test_timeline_is_deleted_before_timeline_detach_ancestor_completes(
with ThreadPoolExecutor(max_workers=1) as pool: with ThreadPoolExecutor(max_workers=1) as pool:
detach = pool.submit(detach_and_get_stuck) detach = pool.submit(detach_and_get_stuck)
offset = wait_until(10, 1.0, request_processing_noted_in_log) offset = wait_until(request_processing_noted_in_log)
# make this named fn tor more clear failure test output logging # make this named fn tor more clear failure test output logging
def pausepoint_hit_with_gc_paused() -> LogCursor: def pausepoint_hit_with_gc_paused() -> LogCursor:
@@ -1529,11 +1529,11 @@ def test_timeline_is_deleted_before_timeline_detach_ancestor_completes(
) )
return at return at
offset = wait_until(10, 1.0, pausepoint_hit_with_gc_paused) offset = wait_until(pausepoint_hit_with_gc_paused)
delete_detached() delete_detached()
wait_timeline_detail_404(http, env.initial_tenant, detached, 10, 1.0) wait_timeline_detail_404(http, env.initial_tenant, detached)
http.configure_failpoints((failpoint, "off")) http.configure_failpoints((failpoint, "off"))

View File

@@ -61,7 +61,7 @@ def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder, sharded: bool
# deletion unblocks gc # deletion unblocks gc
http.timeline_delete(env.initial_tenant, foo_branch) http.timeline_delete(env.initial_tenant, foo_branch)
wait_timeline_detail_404(http, env.initial_tenant, foo_branch, 10, 1.0) wait_timeline_detail_404(http, env.initial_tenant, foo_branch)
wait_for_another_gc_round() wait_for_another_gc_round()
pss.assert_log_contains(gc_active_line) pss.assert_log_contains(gc_active_line)

View File

@@ -396,11 +396,7 @@ def test_timeline_physical_size_init(neon_env_builder: NeonEnvBuilder):
# Wait for the tenant to be loaded # Wait for the tenant to be loaded
client = env.pageserver.http_client() client = env.pageserver.http_client()
wait_until( wait_until(lambda: assert_tenant_state(client, env.initial_tenant, "Active"))
number_of_iterations=5,
interval=1,
func=lambda: assert_tenant_state(client, env.initial_tenant, "Active"),
)
assert_physical_size_invariants( assert_physical_size_invariants(
get_physical_size_values(env, env.initial_tenant, new_timeline_id), get_physical_size_values(env, env.initial_tenant, new_timeline_id),
@@ -433,7 +429,7 @@ def test_timeline_physical_size_post_checkpoint(neon_env_builder: NeonEnvBuilder
get_physical_size_values(env, env.initial_tenant, new_timeline_id), get_physical_size_values(env, env.initial_tenant, new_timeline_id),
) )
wait_until(10, 1, check) wait_until(check)
def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder): def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder):
@@ -721,7 +717,7 @@ def wait_for_tenant_startup_completions(client: PageserverHttpClient, count: int
def condition(): def condition():
assert client.get_metric_value("pageserver_tenant_startup_complete_total") == count assert client.get_metric_value("pageserver_tenant_startup_complete_total") == count
wait_until(5, 1.0, condition) wait_until(condition)
def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
@@ -768,7 +764,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
assert "Active" in set(get_tenant_states().values()) assert "Active" in set(get_tenant_states().values())
# One tenant should activate, then get stuck in their logical size calculation # One tenant should activate, then get stuck in their logical size calculation
wait_until(10, 1, at_least_one_active) wait_until(at_least_one_active)
# Wait some walltime to gain confidence that other tenants really are stuck and not proceeding to activate # Wait some walltime to gain confidence that other tenants really are stuck and not proceeding to activate
time.sleep(5) time.sleep(5)
@@ -836,13 +832,13 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
def all_active(): def all_active():
assert all(s == "Active" for s in get_tenant_states().values()) assert all(s == "Active" for s in get_tenant_states().values())
wait_until(10, 1, all_active) wait_until(all_active)
# Final control check: restarting with no failpoints at all results in all tenants coming active # Final control check: restarting with no failpoints at all results in all tenants coming active
# without being prompted by client I/O # without being prompted by client I/O
env.pageserver.stop() env.pageserver.stop()
env.pageserver.start() env.pageserver.start()
wait_until(10, 1, all_active) wait_until(all_active)
assert ( assert (
pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants
@@ -856,7 +852,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
extra_env_vars={"FAILPOINTS": "timeline-calculate-logical-size-pause=pause"} extra_env_vars={"FAILPOINTS": "timeline-calculate-logical-size-pause=pause"}
) )
wait_until(10, 1, at_least_one_active) wait_until(at_least_one_active)
detach_tenant_id = list( detach_tenant_id = list(
[(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"] [(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"]
@@ -881,7 +877,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
# Check that all the stuck tenants proceed to active (apart from the one that deletes, and the one # Check that all the stuck tenants proceed to active (apart from the one that deletes, and the one
# we detached) # we detached)
wait_until(10, 1, all_active) wait_until(all_active)
assert len(get_tenant_states()) == n_tenants - 2 assert len(get_tenant_states()) == n_tenants - 2
@@ -908,7 +904,7 @@ def delete_lazy_activating(
try: try:
# Deletion will get to the point in shutdown where it's waiting for timeline shutdown, then # Deletion will get to the point in shutdown where it's waiting for timeline shutdown, then
# hang because of our failpoint blocking activation. # hang because of our failpoint blocking activation.
wait_until(10, 1, shutting_down) wait_until(shutting_down)
finally: finally:
log.info("Clearing failpoint") log.info("Clearing failpoint")
pageserver_http.configure_failpoints(("timeline-calculate-logical-size-pause", "off")) pageserver_http.configure_failpoints(("timeline-calculate-logical-size-pause", "off"))
@@ -1030,13 +1026,13 @@ def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder):
log.info(f"{states}") log.info(f"{states}")
assert len(states["Active"]) == 1 assert len(states["Active"]) == 1
wait_until(10, 1, one_is_active) wait_until(one_is_active)
def other_is_attaching(): def other_is_attaching():
states = get_tenant_states() states = get_tenant_states()
assert len(states["Attaching"]) == 1 assert len(states["Attaching"]) == 1
wait_until(10, 1, other_is_attaching) wait_until(other_is_attaching)
def eager_tenant_is_active(): def eager_tenant_is_active():
resp = client.tenant_status(eager_tenant) resp = client.tenant_status(eager_tenant)
@@ -1053,7 +1049,7 @@ def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder):
}, },
lazy=False, lazy=False,
) )
wait_until(10, 1, eager_tenant_is_active) wait_until(eager_tenant_is_active)
other_is_attaching() other_is_attaching()
@@ -1096,7 +1092,7 @@ def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_met
resp = client.tenant_status(env.initial_tenant) resp = client.tenant_status(env.initial_tenant)
assert resp["state"]["slug"] == "Active" assert resp["state"]["slug"] == "Active"
wait_until(10, 1, initial_tenant_is_active) wait_until(initial_tenant_is_active)
# even though the initial tenant is now active, because it was startup time # even though the initial tenant is now active, because it was startup time
# attach, it will consume the only permit because logical size calculation # attach, it will consume the only permit because logical size calculation
@@ -1119,7 +1115,7 @@ def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_met
assert resp["state"]["slug"] == "Attaching" assert resp["state"]["slug"] == "Attaching"
# paused logical size calculation of env.initial_tenant is keeping it attaching # paused logical size calculation of env.initial_tenant is keeping it attaching
wait_until(10, 1, lazy_tenant_is_attaching) wait_until(lazy_tenant_is_attaching)
for _ in range(5): for _ in range(5):
lazy_tenant_is_attaching() lazy_tenant_is_attaching()
@@ -1132,10 +1128,10 @@ def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_met
if activation_method == "endpoint": if activation_method == "endpoint":
with env.endpoints.create_start("main", tenant_id=lazy_tenant): with env.endpoints.create_start("main", tenant_id=lazy_tenant):
# starting up the endpoint should make it jump the queue # starting up the endpoint should make it jump the queue
wait_until(10, 1, lazy_tenant_is_active) wait_until(lazy_tenant_is_active)
elif activation_method == "branch": elif activation_method == "branch":
env.create_timeline("second_branch", lazy_tenant) env.create_timeline("second_branch", lazy_tenant)
wait_until(10, 1, lazy_tenant_is_active) wait_until(lazy_tenant_is_active)
elif activation_method == "delete": elif activation_method == "delete":
delete_lazy_activating(lazy_tenant, env.pageserver, expect_attaching=True) delete_lazy_activating(lazy_tenant, env.pageserver, expect_attaching=True)
else: else:

View File

@@ -2136,7 +2136,7 @@ def test_pull_timeline_while_evicted(neon_env_builder: NeonEnvBuilder):
# Check that on source no segment files are present # Check that on source no segment files are present
assert src_sk.list_segments(tenant_id, timeline_id) == [] assert src_sk.list_segments(tenant_id, timeline_id) == []
wait_until(60, 1, evicted_on_source) wait_until(evicted_on_source, timeout=60)
# Invoke pull_timeline: source should serve snapshot request without promoting anything to local disk, # Invoke pull_timeline: source should serve snapshot request without promoting anything to local disk,
# destination should import the control file only & go into evicted mode immediately # destination should import the control file only & go into evicted mode immediately
@@ -2155,7 +2155,7 @@ def test_pull_timeline_while_evicted(neon_env_builder: NeonEnvBuilder):
# This should be fast, it is a wait_until because eviction state is updated # This should be fast, it is a wait_until because eviction state is updated
# in the background wrt pull_timeline. # in the background wrt pull_timeline.
wait_until(10, 0.1, evicted_on_destination) wait_until(evicted_on_destination, timeout=1.0, interval=0.1)
# Delete the timeline on the source, to prove that deletion works on an # Delete the timeline on the source, to prove that deletion works on an
# evicted timeline _and_ that the final compute test is really not using # evicted timeline _and_ that the final compute test is really not using
@@ -2178,7 +2178,7 @@ def test_pull_timeline_while_evicted(neon_env_builder: NeonEnvBuilder):
n_evicted = dst_sk.http_client().get_metric_value("safekeeper_evicted_timelines") n_evicted = dst_sk.http_client().get_metric_value("safekeeper_evicted_timelines")
assert n_evicted == 0 assert n_evicted == 0
wait_until(10, 1, unevicted_on_dest) wait_until(unevicted_on_dest, interval=0.1, timeout=1.0)
# In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries # In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries
@@ -2606,10 +2606,10 @@ def test_s3_eviction(
assert n_evicted # make mypy happy assert n_evicted # make mypy happy
assert int(n_evicted) == n_timelines assert int(n_evicted) == n_timelines
wait_until(60, 0.5, all_evicted) wait_until(all_evicted, timeout=30)
# restart should preserve the metric value # restart should preserve the metric value
sk.stop().start() sk.stop().start()
wait_until(60, 0.5, all_evicted) wait_until(all_evicted)
# and endpoint start should reduce is # and endpoint start should reduce is
endpoints[0].start() endpoints[0].start()
@@ -2618,7 +2618,7 @@ def test_s3_eviction(
assert n_evicted # make mypy happy assert n_evicted # make mypy happy
assert int(n_evicted) < n_timelines assert int(n_evicted) < n_timelines
wait_until(60, 0.5, one_unevicted) wait_until(one_unevicted)
# Test resetting uploaded partial segment state. # Test resetting uploaded partial segment state.
@@ -2666,7 +2666,7 @@ def test_backup_partial_reset(neon_env_builder: NeonEnvBuilder):
if isinstance(eviction_state, str) and eviction_state == "Present": if isinstance(eviction_state, str) and eviction_state == "Present":
raise Exception("eviction didn't happen yet") raise Exception("eviction didn't happen yet")
wait_until(30, 1, evicted) wait_until(evicted)
# it must have uploaded something # it must have uploaded something
uploaded_segs = sk.list_uploaded_segments(tenant_id, timeline_id) uploaded_segs = sk.list_uploaded_segments(tenant_id, timeline_id)
log.info(f"uploaded segments before reset: {uploaded_segs}") log.info(f"uploaded segments before reset: {uploaded_segs}")
@@ -2763,7 +2763,7 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde
raise Exception("Partial segment not uploaded yet") raise Exception("Partial segment not uploaded yet")
source_partial_segment = wait_until(15, 1, source_partial_segment_uploaded) source_partial_segment = wait_until(source_partial_segment_uploaded)
log.info( log.info(
f"Uploaded segments before pull are {src_sk.list_uploaded_segments(tenant_id, timeline_id)}" f"Uploaded segments before pull are {src_sk.list_uploaded_segments(tenant_id, timeline_id)}"
) )
@@ -2787,7 +2787,7 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde
if evictions is None or evictions == 0: if evictions is None or evictions == 0:
raise Exception("Eviction did not happen on source safekeeper yet") raise Exception("Eviction did not happen on source safekeeper yet")
wait_until(30, 1, evicted) wait_until(evicted)
endpoint.start(safekeepers=[2, 3]) endpoint.start(safekeepers=[2, 3])
@@ -2804,7 +2804,7 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde
) )
endpoint.safe_psql("insert into t select generate_series(1, 1000), 'pear'") endpoint.safe_psql("insert into t select generate_series(1, 1000), 'pear'")
wait_until(15, 1, new_partial_segment_uploaded) wait_until(new_partial_segment_uploaded)
log.info( log.info(
f"Uploaded segments after post-pull ingest are {src_sk.list_uploaded_segments(tenant_id, timeline_id)}" f"Uploaded segments after post-pull ingest are {src_sk.list_uploaded_segments(tenant_id, timeline_id)}"
@@ -2833,4 +2833,4 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde
if unevictions is None or unevictions == 0: if unevictions is None or unevictions == 0:
raise Exception("Uneviction did not happen on source safekeeper yet") raise Exception("Uneviction did not happen on source safekeeper yet")
wait_until(10, 1, unevicted) wait_until(unevicted)

View File

@@ -97,7 +97,7 @@ def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuil
str(safekeeper.id) in exception_string str(safekeeper.id) in exception_string
), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after WAL wait timeout" ), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after WAL wait timeout"
wait_until(60, 0.5, all_sks_in_wareceiver_state) wait_until(all_sks_in_wareceiver_state, timeout=30)
stopped_safekeeper = env.safekeepers[-1] stopped_safekeeper = env.safekeepers[-1]
stopped_safekeeper_id = stopped_safekeeper.id stopped_safekeeper_id = stopped_safekeeper.id
@@ -124,7 +124,7 @@ def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuil
str(safekeeper.id) in exception_string str(safekeeper.id) in exception_string
), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after 2nd WAL wait timeout" ), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after 2nd WAL wait timeout"
wait_until(60, 0.5, all_but_stopped_sks_in_wareceiver_state) wait_until(all_but_stopped_sks_in_wareceiver_state, timeout=30)
def insert_test_elements(env: NeonEnv, tenant_id: TenantId, start: int, count: int): def insert_test_elements(env: NeonEnv, tenant_id: TenantId, start: int, count: int):