tests: make Workload more deterministic (#10741)

## Problem

Previously, Workload was reconfiguring the compute before each run of
writes, which was meant to be a no-op when nothing changed, but was
actually writing extra data due to an issue being fixed in
https://github.com/neondatabase/neon/pull/10696.

The row counts in some tests were too low; those tests were only
passing because of the extra writes that shouldn't have been
happening, and moreover were relying on checkpoints occurring.

## Summary of changes

- Only reconfigure compute if the attached pageserver actually changed.
If pageserver is set to None, that means controller is managing
everything, so never reconfigure compute.
- Update tests that wrote too few rows.

---------

Co-authored-by: Alexey Kondratov <kondratov.aleksey@gmail.com>
This commit is contained in:
John Spray
2025-02-12 12:35:29 +00:00
committed by GitHub
parent 9537829ccd
commit 9989d8bfae
7 changed files with 35 additions and 10 deletions

View File

@@ -69,7 +69,10 @@ def compute_reconfigure_listener(make_httpserver: HTTPServer):
# This causes the endpoint to query storage controller for its location, which
# is redundant since we already have it here, but this avoids extending the
# neon_local CLI to take full lists of locations
reconfigure_threads.submit(lambda workload=workload: workload.reconfigure()) # type: ignore[misc]
fut = reconfigure_threads.submit(lambda workload=workload: workload.reconfigure()) # type: ignore[misc]
# To satisfy semantics of notify-attach API, we must wait for the change to be applied before returning 200
fut.result()
return Response(status=200)

View File

@@ -487,6 +487,7 @@ class NeonLocalCli(AbstractNeonCli):
lsn: Lsn | None = None,
pageserver_id: int | None = None,
allow_multiple=False,
update_catalog: bool = False,
) -> subprocess.CompletedProcess[str]:
args = [
"endpoint",
@@ -514,6 +515,8 @@ class NeonLocalCli(AbstractNeonCli):
args.extend(["--pageserver-id", str(pageserver_id)])
if allow_multiple:
args.extend(["--allow-multiple"])
if update_catalog:
args.extend(["--update-catalog"])
res = self.raw_cli(args)
res.check_returncode()

View File

@@ -3849,6 +3849,7 @@ class Endpoint(PgProtocol, LogUtils):
config_lines: list[str] | None = None,
pageserver_id: int | None = None,
allow_multiple: bool = False,
update_catalog: bool = False,
) -> Self:
"""
Create a new Postgres endpoint.
@@ -3874,6 +3875,7 @@ class Endpoint(PgProtocol, LogUtils):
pg_version=self.env.pg_version,
pageserver_id=pageserver_id,
allow_multiple=allow_multiple,
update_catalog=update_catalog,
)
path = Path("endpoints") / self.endpoint_id / "pgdata"
self.pgdata_dir = self.env.repo_dir / path
@@ -4288,6 +4290,7 @@ class EndpointFactory:
hot_standby: bool = False,
config_lines: list[str] | None = None,
pageserver_id: int | None = None,
update_catalog: bool = False,
) -> Endpoint:
ep = Endpoint(
self.env,
@@ -4309,6 +4312,7 @@ class EndpointFactory:
hot_standby=hot_standby,
config_lines=config_lines,
pageserver_id=pageserver_id,
update_catalog=update_catalog,
)
def stop_all(self, fail_on_error=True) -> Self:

View File

@@ -53,6 +53,8 @@ class Workload:
self._endpoint: Endpoint | None = None
self._endpoint_opts = endpoint_opts or {}
self._configured_pageserver: int | None = None
def branch(
self,
timeline_id: TimelineId,
@@ -92,8 +94,12 @@ class Workload:
**self._endpoint_opts,
)
self._endpoint.start(pageserver_id=pageserver_id)
self._configured_pageserver = pageserver_id
else:
self._endpoint.reconfigure(pageserver_id=pageserver_id)
if self._configured_pageserver != pageserver_id:
self._configured_pageserver = pageserver_id
self._endpoint.reconfigure(pageserver_id=pageserver_id)
self._endpoint_config = pageserver_id
connstring = self._endpoint.safe_psql(
"SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'"
@@ -122,6 +128,7 @@ class Workload:
def write_rows(self, n: int, pageserver_id: int | None = None, upload: bool = True):
endpoint = self.endpoint(pageserver_id)
start = self.expect_rows
end = start + n - 1
self.expect_rows += n

View File

@@ -689,9 +689,7 @@ def test_pageserver_compaction_circuit_breaker(neon_env_builder: NeonEnvBuilder)
env.pageserver.http_client().configure_failpoints((FAILPOINT, "return"))
# Write some data to trigger compaction
workload.write_rows(1024, upload=False)
workload.write_rows(1024, upload=False)
workload.write_rows(1024, upload=False)
workload.write_rows(32768, upload=False)
def assert_broken():
env.pageserver.assert_log_contains(BROKEN_LOG)

View File

@@ -91,7 +91,7 @@ def test_sharding_smoke(
workload.init()
sizes_before = get_sizes()
workload.write_rows(256)
workload.write_rows(65536)
# Test that we can read data back from a sharded tenant
workload.validate()
@@ -1368,6 +1368,7 @@ def test_sharding_split_failures(
workload = Workload(env, tenant_id, timeline_id)
workload.init()
workload.write_rows(100)
compute_reconfigure_listener.register_workload(workload)
# Put the environment into a failing state (exact meaning depends on `failure`)
failure.apply(env)
@@ -1546,6 +1547,9 @@ def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder):
# Tip: set to 100MB to make the test fail
"max_replication_write_lag=1MB",
],
# We need `neon` extension for calling backpressure functions,
# this flag instructs `compute_ctl` to pre-install it.
"update_catalog": True,
},
)
workload.init()
@@ -1815,6 +1819,9 @@ def test_sharding_gc(
# This is not okay, but it's not a scrubber bug: it's a pageserver issue that is exposed by
# the specific pattern of aggressive checkpointing+image layer generation + GC that this test does.
# TODO: remove when https://github.com/neondatabase/neon/issues/10720 is fixed
ps.allowed_errors.append(
".*could not find data for key 020000000000000000000000000000000000.*"
ps.allowed_errors.extend(
[
".*could not find data for key 020000000000000000000000000000000000.*",
".*could not ingest record.*",
]
)

View File

@@ -316,8 +316,11 @@ def test_scrubber_physical_gc_ancestors(neon_env_builder: NeonEnvBuilder, shard_
# This is not okay, but it's not a scrubber bug: it's a pageserver issue that is exposed by
# the specific pattern of aggressive checkpointing+image layer generation + GC that this test does.
# TODO: remove when https://github.com/neondatabase/neon/issues/10720 is fixed
ps.allowed_errors.append(
".*could not find data for key 020000000000000000000000000000000000.*"
ps.allowed_errors.extend(
[
".*could not find data for key 020000000000000000000000000000000000.*",
".*could not ingest record.*",
]
)