mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-26 17:40:37 +00:00
storcon: Fix migration for Attached(0) tenants (#12256)
## Problem

`Attached(0)` tenant migrations can get stuck if the heatmap file has not been uploaded.

## Summary of Changes

- Added a test to reproduce the issue.
- Introduced a `kick_secondary_downloads` config flag:
  - Enabled in testing environments.
  - Disabled in production (and in the new test).
- Updated `Attached(0)` locations to consider the number of secondaries in their intent when deciding whether to download the heatmap.
This commit is contained in:
committed by
GitHub
parent
85164422d0
commit
5eecde461d
@@ -453,6 +453,7 @@ class NeonEnvBuilder:
|
||||
pageserver_get_vectored_concurrent_io: str | None = None,
|
||||
pageserver_tracing_config: PageserverTracingConfig | None = None,
|
||||
pageserver_import_config: PageserverImportConfig | None = None,
|
||||
storcon_kick_secondary_downloads: bool | None = None,
|
||||
):
|
||||
self.repo_dir = repo_dir
|
||||
self.rust_log_override = rust_log_override
|
||||
@@ -514,6 +515,8 @@ class NeonEnvBuilder:
|
||||
self.pageserver_tracing_config = pageserver_tracing_config
|
||||
self.pageserver_import_config = pageserver_import_config
|
||||
|
||||
self.storcon_kick_secondary_downloads = storcon_kick_secondary_downloads
|
||||
|
||||
self.pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None = (
|
||||
pageserver_default_tenant_config_compaction_algorithm
|
||||
)
|
||||
@@ -1221,6 +1224,14 @@ class NeonEnv:
|
||||
else:
|
||||
cfg["storage_controller"] = {"use_local_compute_notifications": False}
|
||||
|
||||
if config.storcon_kick_secondary_downloads is not None:
|
||||
# Configure whether storage controller should actively kick off secondary downloads
|
||||
if "storage_controller" not in cfg:
|
||||
cfg["storage_controller"] = {}
|
||||
cfg["storage_controller"]["kick_secondary_downloads"] = (
|
||||
config.storcon_kick_secondary_downloads
|
||||
)
|
||||
|
||||
# Create config for pageserver
|
||||
http_auth_type = "NeonJWT" if config.auth_enabled else "Trust"
|
||||
pg_auth_type = "NeonJWT" if config.auth_enabled else "Trust"
|
||||
|
||||
@@ -4434,6 +4434,53 @@ def test_storage_controller_graceful_migration(neon_env_builder: NeonEnvBuilder,
|
||||
assert initial_ps.http_client().tenant_list_locations()["tenant_shards"] == []
|
||||
|
||||
|
||||
def test_attached_0_graceful_migration(neon_env_builder: NeonEnvBuilder):
    """
    Check that a prewarmed migration of an `Attached(0)` tenant shard completes.

    The environment is started with four pageservers spread over two AZs and with
    the storage controller's `kick_secondary_downloads` setting turned off. The
    shard is then migrated to another pageserver in the same AZ, and the test
    polls until the source no longer lists any location and the destination
    lists exactly one location in `AttachedSingle` mode.
    """
    neon_env_builder.num_pageservers = 4
    neon_env_builder.num_azs = 2

    # Do not let the storage controller actively kick off secondary downloads.
    neon_env_builder.storcon_kick_secondary_downloads = False

    env = neon_env_builder.init_start()
    storcon = env.storage_controller

    # Attached(0) is already the default; set it explicitly so that no secondary
    # locations are requested for the tenant.
    placement_policy = {"placement": {"Attached": 0}}
    storcon.tenant_policy_update(env.initial_tenant, placement_policy)
    storcon.reconcile_until_idle()

    shard_desc = storcon.tenant_describe(env.initial_tenant)["shards"][0]
    src_ps_id = shard_desc["node_attached"]
    src_ps = env.get_pageserver(src_ps_id)
    src_az = shard_desc["preferred_az_id"]

    # With the Attached(0) placement policy there must be no secondary locations.
    assert len(shard_desc["node_secondary"]) == 0

    # Pick a different pageserver that lives in the same AZ as the source.
    same_az_candidates = [
        ps for ps in env.pageservers if ps.id != src_ps_id and ps.az_id == src_az
    ]
    dst_ps = same_az_candidates[0]

    shard_id = TenantShardId(env.initial_tenant, 0, 0)
    storcon.tenant_shard_migrate(
        shard_id,
        dst_ps.id,
        config=StorageControllerMigrationConfig(prewarm=True),
    )

    def tenant_shard_migrated():
        # The source pageserver must have dropped every location for the tenant.
        left_on_src = src_ps.http_client().tenant_list_locations()["tenant_shards"]
        assert len(left_on_src) == 0
        log.info(f"Tenant shard migrated from {src_ps.id}")

        # The destination must hold exactly one location, attached in single mode.
        present_on_dst = dst_ps.http_client().tenant_list_locations()["tenant_shards"]
        assert len(present_on_dst) == 1
        only_location = present_on_dst[0]
        assert only_location[1]["mode"] == "AttachedSingle"
        log.info(f"Tenant shard migrated to {dst_ps.id}")

    # In the end the tenant shard is expected to exist only on the destination node.
    # The timeout is generous because [`DEFAULT_HEATMAP_PERIOD`] and
    # [`DEFAULT_DOWNLOAD_INTERVAL`] are set to 60 seconds by default.
    #
    # TODO: we should consider making these configurable, so the test can run faster.
    wait_until(tenant_shard_migrated, timeout=180, interval=5, status_interval=10)
    log.info("Tenant shard migrated successfully")
|
||||
|
||||
|
||||
@run_only_on_default_postgres("this is like a 'unit test' against storcon db")
|
||||
def test_storage_controller_migrate_with_pageserver_restart(
|
||||
neon_env_builder: NeonEnvBuilder, make_httpserver
|
||||
|
||||
Reference in New Issue
Block a user