pageserver: run all Rust tests with remote storage enabled (#5164)

For [#5086](https://github.com/neondatabase/neon/pull/5086#issuecomment-1701331777) we will require remote storage to be configured in pageserver. This PR enables `localfs`-based storage for all Rust unit tests. Changes: - In `TenantHarness`, set up localfs remote storage for the tenant. - `create_test_timeline` should mimic what real timeline creation does, and real timeline creation waits for the timeline to reach remote storage. With this PR, `create_test_timeline` now does that as well. - All the places that create the harness tenant twice need to shut down the tenant before the re-create through a second call to `try_load` or `load`. - Without shutting down, upload tasks initiated by/through the first incarnation of the harness tenant might still be ongoing when the second incarnation of the harness tenant is `try_load`/`load`ed. That doesn't make sense in the tests that do that, they generally try to set up a scenario similar to pageserver stop & start. - There was one test that recreates a timeline, not the tenant. For that case, I needed to create a `Timeline::shutdown` method. It's a refactoring of the existing `Tenant::shutdown` method. - The remote_timeline_client tests previously set up their own `GenericRemoteStorage` and `RemoteTimelineClient`. Now they re-use the one that's pre-created by the TenantHarness. Some adjustments to the assertions were needed because the assertions now need to account for the initial image layer that's created by `create_test_timeline` to be present.
2026-07-08 14:40:37 +00:00 · 2023-09-01 18:10:40 +02:00
parent aa22000e67
commit cfc0fb573d
5 changed files with 177 additions and 208 deletions
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -192,7 +192,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
            # allow errors caused by failpoints
            f".*failpoint: {failpoint}",
            # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
-            ".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
+            ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
            # We may leave some upload tasks in the queue. They're likely deletes.
            # For uploads we explicitly wait with `last_flush_lsn_upload` below.
            # So by ignoring these instead of waiting for empty upload queue
@@ -338,7 +338,7 @@ def test_tenant_delete_is_resumed_on_attach(
            # From deletion polling
            f".*NotFound: tenant {env.initial_tenant}.*",
            # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
-            ".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
+            ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
            # error from http response is also logged
            ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*",
            '.*shutdown_pageserver{exit_code=0}: stopping left-over name="remote upload".*',
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -231,7 +231,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
    env.pageserver.allowed_errors.append(f".*{timeline_id}.*failpoint: {failpoint}")
    # It appears when we stopped flush loop during deletion and then pageserver is stopped
    env.pageserver.allowed_errors.append(
-        ".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited"
+        ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
    )
    # This happens when we fail before scheduling background operation.
    # Timeline is left in stopping state and retry tries to stop it again.
@@ -449,7 +449,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
    )
    # this happens, because the stuck timeline is visible to shutdown
    env.pageserver.allowed_errors.append(
-        ".*freeze_and_flush_on_shutdown.+: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited"
+        ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
    )

    ps_http = env.pageserver.http_client()
@@ -881,7 +881,7 @@ def test_timeline_delete_resumed_on_attach(
            # allow errors caused by failpoints
            f".*failpoint: {failpoint}",
            # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
-            ".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
+            ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
            # error from http response is also logged
            ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*",
            # Polling after attach may fail with this