storage controller: tech debt (#7165)

This is a mixed bag of changes split out for separate review while
working on other things, and batched together to reduce load on CI
runners. Each commits stands alone for review purposes:
- do_tenant_shard_split was a long function and had a synchronous
validation phase at the start that could readily be pulled out into a
separate function. This also avoids the special casing of
ApiError::BadRequest when deciding whether an abort is needed on errors
- Add a 'describe' API (GET on tenant ID) that will enable storcon-cli
to see what's going on with a tenant
- the 'locate' API wasn't really meant for use in the field. It's for
tests: demote it to the /debug/ prefix
- The `Single` placement policy was a redundant duplicate of Double(0),
and Double was a bad name. Rename it Attached.
(https://github.com/neondatabase/neon/issues/7107)
- Some neon_local commands were added for debug/demos, which are now
replaced by commands in storcon-cli (#7114 ). Even though that's not
merged yet, we don't need the neon_local ones any more.

Closes https://github.com/neondatabase/neon/issues/7107

## Backward compat of Single/Double -> `Attached(n)` change

A database migration is used to convert any existing values.
This commit is contained in:
John Spray
2024-03-19 16:08:20 +00:00
committed by GitHub
parent 64c6dfd3e4
commit a5d5c2a6a0
14 changed files with 206 additions and 207 deletions

View File

@@ -1892,19 +1892,6 @@ class NeonCli(AbstractNeonCli):
return self.raw_cli(args, check_return_code=True)
def tenant_migrate(
self, tenant_shard_id: TenantShardId, new_pageserver: int, timeout_secs: Optional[int]
):
args = [
"tenant",
"migrate",
"--tenant-id",
str(tenant_shard_id),
"--id",
str(new_pageserver),
]
return self.raw_cli(args, check_return_code=True, timeout=timeout_secs)
def start(self, check_return_code=True) -> "subprocess.CompletedProcess[str]":
return self.raw_cli(["start"], check_return_code=check_return_code)
@@ -2156,7 +2143,7 @@ class NeonStorageController(MetricsGetter):
"""
response = self.request(
"GET",
f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/locate",
f"{self.env.storage_controller_api}/debug/v1/tenant/{tenant_id}/locate",
headers=self.headers(TokenScope.ADMIN),
)
body = response.json()

View File

@@ -158,6 +158,9 @@ class TenantShardId:
def __str__(self):
return f"{self.tenant_id}-{self.shard_number:02x}{self.shard_count:02x}"
def __repr__(self):
return self.__str__()
def _tuple(self) -> tuple[TenantId, int, int]:
return (self.tenant_id, self.shard_number, self.shard_count)

View File

@@ -576,7 +576,7 @@ def test_slow_secondary_downloads(neon_env_builder: NeonEnvBuilder, via_controll
timeline_id = TimelineId.generate()
env.neon_cli.create_tenant(
tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Double":1}'
tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Attached":1}'
)
attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"]

View File

@@ -264,7 +264,7 @@ def test_sharding_split_smoke(
destination = migrate_to_pageserver_ids.pop()
log.info(f"Migrating shard {migrate_shard} from {ps_id} to {destination}")
env.neon_cli.tenant_migrate(migrate_shard, destination, timeout_secs=10)
env.storage_controller.tenant_shard_migrate(migrate_shard, destination)
workload.validate()
@@ -299,7 +299,7 @@ def test_sharding_split_smoke(
locations = pageserver.http_client().tenant_list_locations()
shards_exist.extend(TenantShardId.parse(s[0]) for s in locations["tenant_shards"])
log.info("Shards after split: {shards_exist}")
log.info(f"Shards after split: {shards_exist}")
assert len(shards_exist) == split_shard_count
# Ensure post-split pageserver locations survive a restart (i.e. the child shards