pageserver: run all Rust tests with remote storage enabled (#5164)

For
[#5086](https://github.com/neondatabase/neon/pull/5086#issuecomment-1701331777)
we will require remote storage to be configured in pageserver.

This PR enables `localfs`-based storage for all Rust unit tests.

Changes:

- In `TenantHarness`, set up localfs remote storage for the tenant.
- `create_test_timeline` should mimic what real timeline creation does,
and real timeline creation waits for the timeline to reach remote
storage. With this PR, `create_test_timeline` now does that as well.
- All the places that create the harness tenant twice need to shut down
the tenant before the re-create through a second call to `try_load` or
`load`.
- Without shutting down, upload tasks initiated by/through the first
incarnation of the harness tenant might still be ongoing when the second
incarnation of the harness tenant is `try_load`/`load`ed. That doesn't
make sense in the tests that do that, they generally try to set up a
scenario similar to pageserver stop & start.
- There was one test that recreates a timeline, not the tenant. For that
case, I needed to create a `Timeline::shutdown` method. It's a
refactoring of the existing `Tenant::shutdown` method.
- The remote_timeline_client tests previously set up their own
`GenericRemoteStorage` and `RemoteTimelineClient`. Now they re-use the
one that's pre-created by the TenantHarness. Some adjustments to the
assertions were needed because the assertions now need to account for
the initial image layer that's created by `create_test_timeline` to be
present.
This commit is contained in:
Christian Schwarz
2023-09-01 18:10:40 +02:00
committed by GitHub
parent aa22000e67
commit cfc0fb573d
5 changed files with 177 additions and 208 deletions

View File

@@ -192,7 +192,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
# allow errors caused by failpoints
f".*failpoint: {failpoint}",
# It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
# We may leave some upload tasks in the queue. They're likely deletes.
# For uploads we explicitly wait with `last_flush_lsn_upload` below.
# So by ignoring these instead of waiting for empty upload queue
@@ -338,7 +338,7 @@ def test_tenant_delete_is_resumed_on_attach(
# From deletion polling
f".*NotFound: tenant {env.initial_tenant}.*",
# It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
# error from http response is also logged
".*InternalServerError\\(Tenant is marked as deleted on remote storage.*",
'.*shutdown_pageserver{exit_code=0}: stopping left-over name="remote upload".*',

View File

@@ -231,7 +231,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
env.pageserver.allowed_errors.append(f".*{timeline_id}.*failpoint: {failpoint}")
# It appears when we stopped flush loop during deletion and then pageserver is stopped
env.pageserver.allowed_errors.append(
".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited"
".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
)
# This happens when we fail before scheduling background operation.
# Timeline is left in stopping state and retry tries to stop it again.
@@ -449,7 +449,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
)
# this happens, because the stuck timeline is visible to shutdown
env.pageserver.allowed_errors.append(
".*freeze_and_flush_on_shutdown.+: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited"
".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
)
ps_http = env.pageserver.http_client()
@@ -881,7 +881,7 @@ def test_timeline_delete_resumed_on_attach(
# allow errors caused by failpoints
f".*failpoint: {failpoint}",
# It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
# error from http response is also logged
".*InternalServerError\\(Tenant is marked as deleted on remote storage.*",
# Polling after attach may fail with this