Files
neon/test_runner/regress/test_s3_restore.py
Christian Schwarz 06113e94e6 fix(test_regress): always use storcon virtual pageserver API to set tenant config (#9622)
Problem
-------

Tests that directly call the Pageserver Management API to set tenant
config are flaky if the Pageserver is managed by Storcon because Storcon
is the source of truth and may (theoretically) reconcile a tenant at any
time.

Solution
--------

Switch all users of
`set_tenant_config`/`patch_tenant_config_client_side`
to use the `env.storage_controller.pageserver_api()`

Future Work
-----------

Prevent regressions from creeping in.

And generally clean up tenant configuration.
Maybe we can avoid the Pageserver having a default tenant config at all
and put the default into Storcon instead?

* => https://github.com/neondatabase/neon/issues/9621

Refs
----

fixes https://github.com/neondatabase/neon/issues/9522
2024-11-04 17:42:08 +01:00

131 lines
5.1 KiB
Python

from __future__ import annotations
import time
from datetime import datetime, timezone
from fixtures.common_types import Lsn
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
PgBin,
)
from fixtures.pageserver.utils import (
assert_prefix_empty,
enable_remote_storage_versioning,
many_small_layers_tenant_config,
wait_for_upload,
)
from fixtures.remote_storage import RemoteStorageKind, s3_storage
from fixtures.utils import run_pg_bench_small
def test_tenant_s3_restore(
    neon_env_builder: NeonEnvBuilder,
    pg_bin: PgBin,
):
    """
    Delete a tenant, then bring it back via remote-storage time travel.

    Flow of the test:

    1. Start an environment backed by S3 remote storage (enabling object
       versioning when the mock S3 backend is in use).
    2. Create two branches, "first" (child of "main") and "second" (child of
       "first"), write data into each and wait for the upload to finish.
    3. Record a timestamp, delete the tenant on the pageserver, and check that
       both the local tenant directory and the remote prefix are gone.
    4. Record a second timestamp, ask the pageserver to time-travel the remote
       storage back to the first timestamp, and re-attach the tenant.
    5. Check that the tables created in step 2 are readable again and that
       the WAL flush LSN has not gone backwards.
    """
    remote_storage_kind = s3_storage()
    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)

    # Mock S3 doesn't have versioning enabled by default, enable it
    # (also do it before there are any writes to the bucket)
    if remote_storage_kind == RemoteStorageKind.MOCK_S3:
        remote_storage = neon_env_builder.pageserver_remote_storage
        assert remote_storage, "remote storage not configured"
        enable_remote_storage_versioning(remote_storage)

    # Start without the "checkpoint_distance" override and apply the full
    # many-small-layers config only after initdb (see set_tenant_config below):
    # recovery doesn't work if the two index_part.json uploads happen at the
    # same second or too close to each other.
    initial_tenant_conf = many_small_layers_tenant_config()
    del initial_tenant_conf["checkpoint_distance"]

    env = neon_env_builder.init_start(initial_tenant_conf)
    env.pageserver.allowed_errors.extend(
        [
            # The deletion queue will complain when it encounters simulated S3 errors
            ".*deletion executor: DeleteObjects request failed.*",
            # lucky race with stopping from flushing a layer we fail to schedule any uploads
            ".*layer flush task.+: could not flush frozen layer: update_metadata_file",
        ]
    )

    ps_http = env.pageserver.http_client()
    tenant_id = env.initial_tenant
    slots_metric = "pageserver_tenant_manager_slots"
    attached_filter = {"mode": "attached"}

    # Now let's create the small layers: switch to the full config, going
    # through the storage controller's pageserver API rather than ps_http.
    env.storage_controller.pageserver_api().set_tenant_config(
        tenant_id, many_small_layers_tenant_config()
    )

    # Exactly one attached tenant is expected here (the initial tenant).
    assert ps_http.get_metric_value(slots_metric, attached_filter) == 1

    # Create two timelines, one being the parent of the other, both with
    # non-trivial data.
    timeline_names = ["first", "second"]
    parent = "main"
    last_flush_lsns = []

    for timeline in timeline_names:
        timeline_id = env.create_branch(timeline, ancestor_branch_name=parent, tenant_id=tenant_id)
        with env.endpoints.create_start(timeline, tenant_id=tenant_id) as endpoint:
            run_pg_bench_small(pg_bin, endpoint.connstr())
            endpoint.safe_psql(f"CREATE TABLE created_{timeline}(id integer);")
            last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
            last_flush_lsns.append(last_flush_lsn)
        ps_http.timeline_checkpoint(tenant_id, timeline_id)
        wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn)
        log.info(f"{timeline} timeline {timeline_id} {last_flush_lsn=}")
        parent = timeline

    # These sleeps are important because they fend off differences in clocks between us and S3
    time.sleep(4)
    ts_before_deletion = datetime.now(tz=timezone.utc).replace(tzinfo=None)
    time.sleep(4)

    assert (
        ps_http.get_metric_value(slots_metric, attached_filter) == 1
    ), "tenant removed before deletion was issued"
    ps_http.tenant_delete(tenant_id)
    ps_http.deletion_queue_flush(execute=True)
    assert (
        ps_http.get_metric_value(slots_metric, attached_filter) == 0
    ), "tenant not removed after deletion was issued"
    env.storage_controller.attach_hook_drop(tenant_id)

    # Neither the local tenant directory nor the remote prefix may survive.
    tenant_path = env.pageserver.tenant_dir(tenant_id)
    assert not tenant_path.exists()

    assert_prefix_empty(
        neon_env_builder.pageserver_remote_storage,
        prefix="/".join(
            (
                "tenants",
                str(tenant_id),
            )
        ),
    )

    # Same clock-skew padding as above, this time around the "after" timestamp.
    time.sleep(4)
    ts_after_deletion = datetime.now(tz=timezone.utc).replace(tzinfo=None)
    time.sleep(4)

    ps_http.tenant_time_travel_remote_storage(
        tenant_id, timestamp=ts_before_deletion, done_if_after=ts_after_deletion
    )

    # Re-attach the restored tenant with a freshly issued generation.
    generation = env.storage_controller.attach_hook_issue(tenant_id, env.pageserver.id)
    ps_http.tenant_attach(tenant_id, generation=generation)
    env.pageserver.quiesce_tenants()

    for tline in ps_http.timeline_list(env.initial_tenant):
        log.info(f"timeline detail: {tline}")

    for timeline, expected_last_flush_lsn in zip(timeline_names, last_flush_lsns):
        with env.endpoints.create_start(timeline, tenant_id=tenant_id) as endpoint:
            # The table created before the deletion must be queryable again.
            endpoint.safe_psql(f"SELECT * FROM created_{timeline};")
            last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
            # There might be some activity that advances the lsn so we can't use a strict equality check
            assert last_flush_lsn >= expected_last_flush_lsn, "last_flush_lsn too old"

    assert ps_http.get_metric_value(slots_metric, attached_filter) == 1