mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-24 16:40:38 +00:00
storage controller: miscellaneous improvements (#6800)
- Add some context to logs
- Add tests for pageserver restarts when managed by storage controller
- Make /location_config tolerate compute hook failures on shard creations, not just modifications.
This commit is contained in:
@@ -302,6 +302,15 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
|
||||
)
|
||||
self.verbose_error(res)
|
||||
|
||||
def tenant_list_locations(self):
    """List the tenant locations known to this pageserver.

    Issues a GET to ``/v1/location_config`` on the local pageserver port,
    passes the response through ``verbose_error`` (presumably raising on an
    HTTP error status; confirm against its definition), and returns the
    decoded JSON body.

    The body must contain a ``"tenant_shards"`` list; this is asserted before
    returning. Callers visible alongside this method treat each entry of that
    list as a ``(shard id, location)`` pair.
    """
    url = f"http://localhost:{self.port}/v1/location_config"
    response = self.get(url)
    self.verbose_error(response)

    body = response.json()
    # Sanity-check the response shape so callers can iterate the shards directly.
    assert isinstance(body["tenant_shards"], list)
    return body
|
||||
|
||||
def tenant_delete(self, tenant_id: Union[TenantId, TenantShardId]):
    """Send a DELETE for ``/v1/tenant/{tenant_id}`` to the local pageserver.

    The response is handed to ``verbose_error`` (presumably raising on an
    HTTP error status; confirm against its definition). Nothing is returned.
    """
    url = f"http://localhost:{self.port}/v1/tenant/{tenant_id}"
    response = self.delete(url)
    self.verbose_error(response)
|
||||
|
||||
@@ -235,11 +235,6 @@ def test_sharding_split_smoke(
|
||||
all_shards = tenant_get_shards(env, tenant_id)
|
||||
for tenant_shard_id, pageserver in all_shards:
|
||||
pageserver.http_client().timeline_gc(tenant_shard_id, timeline_id, None)
|
||||
|
||||
# Restart all nodes, to check that the newly created shards are durable
|
||||
for ps in env.pageservers:
|
||||
ps.restart()
|
||||
|
||||
workload.validate()
|
||||
|
||||
migrate_to_pageserver_ids = list(
|
||||
@@ -288,6 +283,32 @@ def test_sharding_split_smoke(
|
||||
|
||||
env.attachment_service.consistency_check()
|
||||
|
||||
# Validate pageserver state: gather the shard IDs that every pageserver
# reports and check that the total matches the expected post-split count.
shards_exist: list[TenantShardId] = []
for pageserver in env.pageservers:
    locations = pageserver.http_client().tenant_list_locations()
    # Each "tenant_shards" entry is indexed as a pair; element 0 is the shard ID string.
    shards_exist.extend(TenantShardId.parse(s[0]) for s in locations["tenant_shards"])

# The f-prefix is required, otherwise the literal text "{shards_exist}" is logged.
log.info(f"Shards after split: {shards_exist}")
assert len(shards_exist) == split_shard_count
|
||||
|
||||
# Ensure post-split pageserver locations survive a restart (i.e. the child shards
# correctly wrote config to disk, and the storage controller responds correctly
# to /re-attach)
for pageserver in env.pageservers:
    pageserver.stop()
    pageserver.start()

# Re-collect the shard IDs from every pageserver now that they have restarted.
shards_exist = []
for pageserver in env.pageservers:
    locations = pageserver.http_client().tenant_list_locations()
    # Each "tenant_shards" entry is indexed as a pair; element 0 is the shard ID string.
    shards_exist.extend(TenantShardId.parse(s[0]) for s in locations["tenant_shards"])

# The f-prefix is required, otherwise the literal text "{shards_exist}" is logged.
log.info(f"Shards after restart: {shards_exist}")
assert len(shards_exist) == split_shard_count
|
||||
|
||||
workload.validate()
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
# The quantity of data isn't huge, but debug can be _very_ slow, and the things we're
|
||||
|
||||
@@ -125,6 +125,20 @@ def test_sharding_service_smoke(
|
||||
time.sleep(1)
|
||||
assert get_node_shard_counts(env, tenant_ids)[env.pageservers[0].id] == 0
|
||||
|
||||
# Restarting a pageserver should not detach any tenants (i.e. /re-attach works)
before_restart = env.pageservers[1].http_client().tenant_list_locations()
env.pageservers[1].stop()
env.pageservers[1].start()
after_restart = env.pageservers[1].http_client().tenant_list_locations()
# Compare the number of shard locations. len() of the response dict itself
# would only count its top-level keys and so could never detect a lost shard.
assert len(after_restart["tenant_shards"]) == len(before_restart["tenant_shards"])

# Locations should be the same before & after restart, apart from generations
for _shard_id, tenant in after_restart["tenant_shards"]:
    del tenant["generation"]
for _shard_id, tenant in before_restart["tenant_shards"]:
    del tenant["generation"]
assert before_restart == after_restart
|
||||
|
||||
# Delete all the tenants
|
||||
for tid in tenant_ids:
|
||||
tenant_delete_wait_completed(env.attachment_service.pageserver_api(), tid, 10)
|
||||
|
||||
Reference in New Issue
Block a user