tests: make Workload more deterministic (#10741)

## Problem

Previously, Workload was reconfiguring the compute before each run of
writes, which was meant to be a no-op when nothing changed, but was
actually writing extra data due to an issue being fixed in
https://github.com/neondatabase/neon/pull/10696.

The row counts in some tests were too low; those tests were only
passing because of the extra writes that shouldn't have been
happening, and moreover were relying on checkpoints occurring.

## Summary of changes

- Only reconfigure compute if the attached pageserver actually changed.
If pageserver is set to None, that means controller is managing
everything, so never reconfigure compute.
- Update tests that wrote too few rows.

---------

Co-authored-by: Alexey Kondratov <kondratov.aleksey@gmail.com>
This commit is contained in:
John Spray
2025-02-12 12:35:29 +00:00
committed by GitHub
parent 9537829ccd
commit 9989d8bfae
7 changed files with 35 additions and 10 deletions

View File

@@ -69,7 +69,10 @@ def compute_reconfigure_listener(make_httpserver: HTTPServer):
# This causes the endpoint to query storage controller for its location, which
# is redundant since we already have it here, but this avoids extending the
# neon_local CLI to take full lists of locations
reconfigure_threads.submit(lambda workload=workload: workload.reconfigure()) # type: ignore[misc]
fut = reconfigure_threads.submit(lambda workload=workload: workload.reconfigure()) # type: ignore[misc]
# To satisfy semantics of notify-attach API, we must wait for the change to be applied before returning 200
fut.result()
return Response(status=200)

View File

@@ -487,6 +487,7 @@ class NeonLocalCli(AbstractNeonCli):
lsn: Lsn | None = None,
pageserver_id: int | None = None,
allow_multiple=False,
update_catalog: bool = False,
) -> subprocess.CompletedProcess[str]:
args = [
"endpoint",
@@ -514,6 +515,8 @@ class NeonLocalCli(AbstractNeonCli):
args.extend(["--pageserver-id", str(pageserver_id)])
if allow_multiple:
args.extend(["--allow-multiple"])
if update_catalog:
args.extend(["--update-catalog"])
res = self.raw_cli(args)
res.check_returncode()

View File

@@ -3849,6 +3849,7 @@ class Endpoint(PgProtocol, LogUtils):
config_lines: list[str] | None = None,
pageserver_id: int | None = None,
allow_multiple: bool = False,
update_catalog: bool = False,
) -> Self:
"""
Create a new Postgres endpoint.
@@ -3874,6 +3875,7 @@ class Endpoint(PgProtocol, LogUtils):
pg_version=self.env.pg_version,
pageserver_id=pageserver_id,
allow_multiple=allow_multiple,
update_catalog=update_catalog,
)
path = Path("endpoints") / self.endpoint_id / "pgdata"
self.pgdata_dir = self.env.repo_dir / path
@@ -4288,6 +4290,7 @@ class EndpointFactory:
hot_standby: bool = False,
config_lines: list[str] | None = None,
pageserver_id: int | None = None,
update_catalog: bool = False,
) -> Endpoint:
ep = Endpoint(
self.env,
@@ -4309,6 +4312,7 @@ class EndpointFactory:
hot_standby=hot_standby,
config_lines=config_lines,
pageserver_id=pageserver_id,
update_catalog=update_catalog,
)
def stop_all(self, fail_on_error=True) -> Self:

View File

@@ -53,6 +53,8 @@ class Workload:
self._endpoint: Endpoint | None = None
self._endpoint_opts = endpoint_opts or {}
self._configured_pageserver: int | None = None
def branch(
self,
timeline_id: TimelineId,
@@ -92,8 +94,12 @@ class Workload:
**self._endpoint_opts,
)
self._endpoint.start(pageserver_id=pageserver_id)
self._configured_pageserver = pageserver_id
else:
self._endpoint.reconfigure(pageserver_id=pageserver_id)
if self._configured_pageserver != pageserver_id:
self._configured_pageserver = pageserver_id
self._endpoint.reconfigure(pageserver_id=pageserver_id)
self._endpoint_config = pageserver_id
connstring = self._endpoint.safe_psql(
"SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'"
@@ -122,6 +128,7 @@ class Workload:
def write_rows(self, n: int, pageserver_id: int | None = None, upload: bool = True):
endpoint = self.endpoint(pageserver_id)
start = self.expect_rows
end = start + n - 1
self.expect_rows += n

View File

@@ -689,9 +689,7 @@ def test_pageserver_compaction_circuit_breaker(neon_env_builder: NeonEnvBuilder)
env.pageserver.http_client().configure_failpoints((FAILPOINT, "return"))
# Write some data to trigger compaction
workload.write_rows(1024, upload=False)
workload.write_rows(1024, upload=False)
workload.write_rows(1024, upload=False)
workload.write_rows(32768, upload=False)
def assert_broken():
env.pageserver.assert_log_contains(BROKEN_LOG)

View File

@@ -91,7 +91,7 @@ def test_sharding_smoke(
workload.init()
sizes_before = get_sizes()
workload.write_rows(256)
workload.write_rows(65536)
# Test that we can read data back from a sharded tenant
workload.validate()
@@ -1368,6 +1368,7 @@ def test_sharding_split_failures(
workload = Workload(env, tenant_id, timeline_id)
workload.init()
workload.write_rows(100)
compute_reconfigure_listener.register_workload(workload)
# Put the environment into a failing state (exact meaning depends on `failure`)
failure.apply(env)
@@ -1546,6 +1547,9 @@ def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder):
# Tip: set to 100MB to make the test fail
"max_replication_write_lag=1MB",
],
# We need `neon` extension for calling backpressure functions,
# this flag instructs `compute_ctl` to pre-install it.
"update_catalog": True,
},
)
workload.init()
@@ -1815,6 +1819,9 @@ def test_sharding_gc(
# This is not okay, but it's not a scrubber bug: it's a pageserver issue that is exposed by
# the specific pattern of aggressive checkpointing+image layer generation + GC that this test does.
# TODO: remove when https://github.com/neondatabase/neon/issues/10720 is fixed
ps.allowed_errors.append(
".*could not find data for key 020000000000000000000000000000000000.*"
ps.allowed_errors.extend(
[
".*could not find data for key 020000000000000000000000000000000000.*",
".*could not ingest record.*",
]
)

View File

@@ -316,8 +316,11 @@ def test_scrubber_physical_gc_ancestors(neon_env_builder: NeonEnvBuilder, shard_
# This is not okay, but it's not a scrubber bug: it's a pageserver issue that is exposed by
# the specific pattern of aggressive checkpointing+image layer generation + GC that this test does.
# TODO: remove when https://github.com/neondatabase/neon/issues/10720 is fixed
ps.allowed_errors.append(
".*could not find data for key 020000000000000000000000000000000000.*"
ps.allowed_errors.extend(
[
".*could not find data for key 020000000000000000000000000000000000.*",
".*could not ingest record.*",
]
)