diff --git a/test_runner/fixtures/compute_reconfigure.py b/test_runner/fixtures/compute_reconfigure.py index 33f01f80fb..425abef935 100644 --- a/test_runner/fixtures/compute_reconfigure.py +++ b/test_runner/fixtures/compute_reconfigure.py @@ -69,7 +69,10 @@ def compute_reconfigure_listener(make_httpserver: HTTPServer): # This causes the endpoint to query storage controller for its location, which # is redundant since we already have it here, but this avoids extending the # neon_local CLI to take full lists of locations - reconfigure_threads.submit(lambda workload=workload: workload.reconfigure()) # type: ignore[misc] + fut = reconfigure_threads.submit(lambda workload=workload: workload.reconfigure()) # type: ignore[misc] + + # To satisfy semantics of notify-attach API, we must wait for the change to be applied before returning 200 + fut.result() return Response(status=200) diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index 6a016d2621..97a5a36814 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -487,6 +487,7 @@ class NeonLocalCli(AbstractNeonCli): lsn: Lsn | None = None, pageserver_id: int | None = None, allow_multiple=False, + update_catalog: bool = False, ) -> subprocess.CompletedProcess[str]: args = [ "endpoint", @@ -514,6 +515,8 @@ class NeonLocalCli(AbstractNeonCli): args.extend(["--pageserver-id", str(pageserver_id)]) if allow_multiple: args.extend(["--allow-multiple"]) + if update_catalog: + args.extend(["--update-catalog"]) res = self.raw_cli(args) res.check_returncode() diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 41e9952b8a..2fa82754ef 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3849,6 +3849,7 @@ class Endpoint(PgProtocol, LogUtils): config_lines: list[str] | None = None, pageserver_id: int | None = None, allow_multiple: bool = False, + update_catalog: bool = False, ) -> Self: """ Create a new Postgres endpoint. @@ -3874,6 +3875,7 @@ class Endpoint(PgProtocol, LogUtils): pg_version=self.env.pg_version, pageserver_id=pageserver_id, allow_multiple=allow_multiple, + update_catalog=update_catalog, ) path = Path("endpoints") / self.endpoint_id / "pgdata" self.pgdata_dir = self.env.repo_dir / path @@ -4288,6 +4290,7 @@ class EndpointFactory: hot_standby: bool = False, config_lines: list[str] | None = None, pageserver_id: int | None = None, + update_catalog: bool = False, ) -> Endpoint: ep = Endpoint( self.env, @@ -4309,6 +4312,7 @@ class EndpointFactory: hot_standby=hot_standby, config_lines=config_lines, pageserver_id=pageserver_id, + update_catalog=update_catalog, ) def stop_all(self, fail_on_error=True) -> Self: diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index eea0ec2b95..1947a9c3fb 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -53,6 +53,8 @@ class Workload: self._endpoint: Endpoint | None = None self._endpoint_opts = endpoint_opts or {} + self._configured_pageserver: int | None = None + def branch( self, timeline_id: TimelineId, @@ -92,8 +94,12 @@ class Workload: **self._endpoint_opts, ) self._endpoint.start(pageserver_id=pageserver_id) + self._configured_pageserver = pageserver_id else: - self._endpoint.reconfigure(pageserver_id=pageserver_id) + if self._configured_pageserver != pageserver_id: + self._configured_pageserver = pageserver_id + self._endpoint.reconfigure(pageserver_id=pageserver_id) + self._endpoint_config = pageserver_id connstring = self._endpoint.safe_psql( "SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'" @@ -122,6 +128,7 @@ class Workload: def write_rows(self, n: int, pageserver_id: int | None = None, upload: bool = True): endpoint = self.endpoint(pageserver_id) + start = self.expect_rows end = start + n - 1 self.expect_rows += n diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index f3347b594e..f10872590c 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -689,9 +689,7 @@ def test_pageserver_compaction_circuit_breaker(neon_env_builder: NeonEnvBuilder) env.pageserver.http_client().configure_failpoints((FAILPOINT, "return")) # Write some data to trigger compaction - workload.write_rows(1024, upload=False) - workload.write_rows(1024, upload=False) - workload.write_rows(1024, upload=False) + workload.write_rows(32768, upload=False) def assert_broken(): env.pageserver.assert_log_contains(BROKEN_LOG) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 6f8070e2ba..8910873690 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -91,7 +91,7 @@ def test_sharding_smoke( workload.init() sizes_before = get_sizes() - workload.write_rows(256) + workload.write_rows(65536) # Test that we can read data back from a sharded tenant workload.validate() @@ -1368,6 +1368,7 @@ def test_sharding_split_failures( workload = Workload(env, tenant_id, timeline_id) workload.init() workload.write_rows(100) + compute_reconfigure_listener.register_workload(workload) # Put the environment into a failing state (exact meaning depends on `failure`) failure.apply(env) @@ -1546,6 +1547,9 @@ def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder): # Tip: set to 100MB to make the test fail "max_replication_write_lag=1MB", ], + # We need `neon` extension for calling backpressure functions, + # this flag instructs `compute_ctl` to pre-install it. + "update_catalog": True, }, ) workload.init() @@ -1815,6 +1819,9 @@ def test_sharding_gc( # This is not okay, but it's not a scrubber bug: it's a pageserver issue that is exposed by # the specific pattern of aggressive checkpointing+image layer generation + GC that this test does. # TODO: remove when https://github.com/neondatabase/neon/issues/10720 is fixed - ps.allowed_errors.append( - ".*could not find data for key 020000000000000000000000000000000000.*" + ps.allowed_errors.extend( + [ + ".*could not find data for key 020000000000000000000000000000000000.*", + ".*could not ingest record.*", + ] ) diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 46038ccbbb..b8253fb125 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -316,8 +316,11 @@ def test_scrubber_physical_gc_ancestors(neon_env_builder: NeonEnvBuilder, shard_ # This is not okay, but it's not a scrubber bug: it's a pageserver issue that is exposed by # the specific pattern of aggressive checkpointing+image layer generation + GC that this test does. # TODO: remove when https://github.com/neondatabase/neon/issues/10720 is fixed - ps.allowed_errors.append( - ".*could not find data for key 020000000000000000000000000000000000.*" + ps.allowed_errors.extend( + [ + ".*could not find data for key 020000000000000000000000000000000000.*", + ".*could not ingest record.*", + ] )