mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-29 19:10:38 +00:00
storcon: skip draining shard if its secondary is lagging too much (#8644)
## Problem Migrations of tenant shards with cold secondaries are holding up drains during production deployments. ## Summary of changes If a secondary location is lagging by more than 256MiB (configurable, but that's the default), then skip cutting it over to the secondary as part of the node drain.
This commit is contained in:
@@ -14,6 +14,7 @@ import textwrap
|
||||
import threading
|
||||
import time
|
||||
import uuid
|
||||
from collections import defaultdict
|
||||
from contextlib import closing, contextmanager
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
@@ -2667,6 +2668,69 @@ class NeonStorageController(MetricsGetter, LogUtils):
|
||||
log.info(f"Got failpoints request response code {res.status_code}")
|
||||
res.raise_for_status()
|
||||
|
||||
def get_tenants_placement(self) -> defaultdict[str, Dict[str, Any]]:
    """
    Get the intent and observed placements of all tenants known to the storage controller.

    Returns a defaultdict keyed by tenant shard id. Each value has the shape
    ``{"observed": {"attached": ..., "secondary": [...]},
       "intent": {"attached": ..., "secondary": [...]}}``.
    Observed node ids are converted to ``int``; intent values are copied
    as reported by ``tenant_list()``.

    An entry is only created for a shard once there is something to record
    for it, so shards with no usable observed location and no intent keys
    do not appear in the result.
    """
    # Built once rather than on every loop iteration.
    attached_modes = {"AttachedSingle", "AttachedMulti", "AttachedStale"}

    tenant_placement: defaultdict[str, Dict[str, Any]] = defaultdict(
        lambda: {
            "observed": {"attached": None, "secondary": []},
            "intent": {"attached": None, "secondary": []},
        }
    )

    for t in self.tenant_list():
        tid = t["tenant_shard_id"]

        for node_id, loc_state in t["observed"]["locations"].items():
            # Skip locations without a usable configuration.
            conf = loc_state.get("conf") if loc_state is not None else None
            if conf is None:
                continue

            mode = conf["mode"]
            if mode in attached_modes:
                tenant_placement[tid]["observed"]["attached"] = int(node_id)
            elif mode == "Secondary":
                tenant_placement[tid]["observed"]["secondary"].append(int(node_id))

        intent = t["intent"]
        if "attached" in intent:
            tenant_placement[tid]["intent"]["attached"] = intent["attached"]

        if "secondary" in intent:
            tenant_placement[tid]["intent"]["secondary"] += intent["secondary"]

    return tenant_placement
|
||||
|
||||
def warm_up_all_secondaries(self):
    """
    For every tenant shard reported by `get_tenants_placement`, request a heatmap
    upload on the observed attached pageserver and then a secondary download on
    the observed secondary pageserver.

    Asserts that each shard has an observed attached location and exactly one
    observed secondary location.
    """
    log.info("Warming up all secondary locations")

    for shard_id, locs in self.get_tenants_placement().items():
        observed = locs["observed"]

        assert observed["attached"] is not None
        attached_node = observed["attached"]

        assert len(observed["secondary"]) == 1
        secondary_node = observed["secondary"][0]

        shard = TenantShardId.parse(shard_id)

        # Upload from the attached location first, then ask the secondary to download.
        attached_client = self.env.get_pageserver(attached_node).http_client()
        attached_client.tenant_heatmap_upload(shard)

        secondary_client = self.env.get_pageserver(secondary_node).http_client()
        secondary_client.tenant_secondary_download(shard, wait_ms=250)
|
||||
|
||||
@property
def workdir(self) -> Path:
    """Return the `repo_dir` of `self.env`."""
    repo_dir = self.env.repo_dir
    return repo_dir
|
||||
|
||||
@@ -361,6 +361,12 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
|
||||
self.verbose_error(res)
|
||||
return (res.status_code, res.json())
|
||||
|
||||
def tenant_secondary_status(self, tenant_id: Union[TenantId, TenantShardId]):
    """
    GET `/v1/tenant/{tenant_id}/secondary/status` from the pageserver on
    `self.port` and return the decoded JSON body.

    The response is passed through `self.verbose_error` before decoding.
    """
    response = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/status")
    self.verbose_error(response)
    return response.json()
|
||||
|
||||
def set_tenant_config(self, tenant_id: Union[TenantId, TenantShardId], config: dict[str, Any]):
|
||||
assert "tenant_id" not in config.keys()
|
||||
res = self.put(
|
||||
|
||||
@@ -2,7 +2,6 @@ import concurrent.futures
|
||||
import random
|
||||
import time
|
||||
from collections import defaultdict
|
||||
from typing import Any, Dict
|
||||
|
||||
import pytest
|
||||
from fixtures.common_types import TenantId, TenantShardId, TimelineId
|
||||
@@ -24,51 +23,14 @@ def get_consistent_node_shard_counts(env: NeonEnv, total_shards) -> defaultdict[
|
||||
This function takes into account the intersection of the intent and the observed state.
|
||||
If they do not match, it asserts out.
|
||||
"""
|
||||
tenants = env.storage_controller.tenant_list()
|
||||
|
||||
intent = dict()
|
||||
observed = dict()
|
||||
|
||||
tenant_placement: defaultdict[str, Dict[str, Any]] = defaultdict(
|
||||
lambda: {
|
||||
"observed": {"attached": None, "secondary": []},
|
||||
"intent": {"attached": None, "secondary": []},
|
||||
}
|
||||
)
|
||||
|
||||
for t in tenants:
|
||||
for node_id, loc_state in t["observed"]["locations"].items():
|
||||
if (
|
||||
loc_state is not None
|
||||
and "conf" in loc_state
|
||||
and loc_state["conf"] is not None
|
||||
and loc_state["conf"]["mode"]
|
||||
in set(["AttachedSingle", "AttachedMulti", "AttachedStale"])
|
||||
):
|
||||
observed[t["tenant_shard_id"]] = int(node_id)
|
||||
tenant_placement[t["tenant_shard_id"]]["observed"]["attached"] = int(node_id)
|
||||
|
||||
if (
|
||||
loc_state is not None
|
||||
and "conf" in loc_state
|
||||
and loc_state["conf"] is not None
|
||||
and loc_state["conf"]["mode"] == "Secondary"
|
||||
):
|
||||
tenant_placement[t["tenant_shard_id"]]["observed"]["secondary"].append(int(node_id))
|
||||
|
||||
if "attached" in t["intent"]:
|
||||
intent[t["tenant_shard_id"]] = t["intent"]["attached"]
|
||||
tenant_placement[t["tenant_shard_id"]]["intent"]["attached"] = t["intent"]["attached"]
|
||||
|
||||
if "secondary" in t["intent"]:
|
||||
tenant_placement[t["tenant_shard_id"]]["intent"]["secondary"] += t["intent"][
|
||||
"secondary"
|
||||
]
|
||||
|
||||
tenant_placement = env.storage_controller.get_tenants_placement()
|
||||
log.info(f"{tenant_placement=}")
|
||||
|
||||
matching = {
|
||||
tid: intent[tid] for tid in observed if tid in intent and intent[tid] == observed[tid]
|
||||
tid: tenant_placement[tid]["intent"]["attached"]
|
||||
for tid in tenant_placement
|
||||
if tenant_placement[tid]["intent"]["attached"]
|
||||
== tenant_placement[tid]["observed"]["attached"]
|
||||
}
|
||||
assert len(matching) == total_shards
|
||||
|
||||
|
||||
@@ -17,6 +17,7 @@ from fixtures.neon_fixtures import (
|
||||
PgBin,
|
||||
StorageControllerApiException,
|
||||
TokenScope,
|
||||
last_flush_lsn_upload,
|
||||
)
|
||||
from fixtures.pageserver.http import PageserverHttpClient
|
||||
from fixtures.pageserver.utils import (
|
||||
@@ -1597,6 +1598,8 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
# Perform a graceful rolling restart
|
||||
for ps in env.pageservers:
|
||||
env.storage_controller.warm_up_all_secondaries()
|
||||
|
||||
env.storage_controller.retryable_node_operation(
|
||||
lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2
|
||||
)
|
||||
@@ -1645,6 +1648,115 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder):
|
||||
assert_shard_counts_balanced(env, shard_counts, total_shards)
|
||||
|
||||
|
||||
def test_skip_drain_on_secondary_lag(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    """
    Artificially make a tenant shard's secondary location lag behind the primary
    and check that storage controller driven node drains skip the lagging tenant shard.
    Finally, validate that the tenant shard is migrated when a new drain request comes
    in and it's no longer lagging.
    """
    # Single source of truth for the lag threshold: it is handed to the storage
    # controller config and reused by this test's own lag checks.
    max_secondary_lag_bytes = 1 * 1024 * 1024

    neon_env_builder.num_pageservers = 2
    neon_env_builder.storage_controller_config = {
        "max_secondary_lag_bytes": max_secondary_lag_bytes,
    }

    env = neon_env_builder.init_configs()
    env.start()

    tid, timeline_id = env.neon_cli.create_tenant(placement_policy='{"Attached":1}')

    # Give things a chance to settle.
    env.storage_controller.reconcile_until_idle(timeout_secs=30)

    locations = env.storage_controller.locate(tid)
    assert len(locations) == 1
    primary: int = locations[0]["node_id"]
    not_primary = [ps.id for ps in env.pageservers if ps.id != primary]
    assert len(not_primary) == 1
    secondary = not_primary[0]

    def secondary_lag():
        # Bytes the secondary still has to download, per its own status report.
        resp = env.get_pageserver(secondary).http_client().tenant_secondary_status(tid)
        return resp["bytes_total"] - resp["bytes_downloaded"]

    def drain_primary_and_wait():
        # Kick off a drain of the primary and wait for the node to reach
        # the PAUSE_FOR_RESTART scheduling policy.
        env.storage_controller.retryable_node_operation(
            lambda ps_id: env.storage_controller.node_drain(ps_id),
            primary,
            max_attempts=3,
            backoff=2,
        )
        env.storage_controller.poll_node_status(
            primary,
            PageserverAvailability.ACTIVE,
            PageserverSchedulingPolicy.PAUSE_FOR_RESTART,
            max_attempts=6,
            backoff=5,
        )

    log.info(f"Paused secondary downloads on {secondary}")
    env.get_pageserver(secondary).http_client().configure_failpoints(
        ("secondary-layer-download-pausable", "pause")
    )

    log.info(f"Ingesting some data for {tid}")

    with env.endpoints.create_start("main", tenant_id=tid) as endpoint:
        run_pg_bench_small(pg_bin, endpoint.connstr())
        endpoint.safe_psql("CREATE TABLE created_foo(id integer);")
        last_flush_lsn_upload(env, endpoint, tid, timeline_id)

    log.info(f"Uploading heatmap from {primary} and requesting download from {secondary}")

    env.get_pageserver(primary).http_client().tenant_heatmap_upload(tid)
    env.get_pageserver(secondary).http_client().tenant_secondary_download(tid, wait_ms=100)

    def secondary_is_lagging():
        lag = secondary_lag()

        if lag <= max_secondary_lag_bytes:
            raise Exception(f"Secondary lag not big enough: {lag}")

    log.info(f"Looking for lag to develop on the secondary {secondary}")
    wait_until(10, 1, secondary_is_lagging)

    log.info(f"Starting drain of primary {primary} with laggy secondary {secondary}")
    drain_primary_and_wait()

    # The shard must have been skipped by the drain: still attached to the primary.
    locations = env.storage_controller.locate(tid)
    assert len(locations) == 1
    assert locations[0]["node_id"] == primary

    log.info(f"Unpausing secondary downloads on {secondary}")
    env.get_pageserver(secondary).http_client().configure_failpoints(
        ("secondary-layer-download-pausable", "off")
    )
    env.get_pageserver(secondary).http_client().tenant_secondary_download(tid, wait_ms=100)

    log.info(f"Waiting for lag to reduce on {secondary}")

    def lag_is_acceptable():
        lag = secondary_lag()

        if lag > max_secondary_lag_bytes:
            # Here the lag is still above the threshold, so say so.
            raise Exception(f"Secondary lag still too big: {lag}")

    wait_until(10, 1, lag_is_acceptable)

    # Set the node's scheduling policy back to Active before draining it again.
    env.storage_controller.node_configure(primary, {"scheduling": "Active"})

    log.info(f"Starting drain of primary {primary} with non-laggy secondary {secondary}")
    drain_primary_and_wait()

    # With the secondary caught up, the drain must have migrated the shard.
    locations = env.storage_controller.locate(tid)
    assert len(locations) == 1
    assert locations[0]["node_id"] == secondary
|
||||
|
||||
|
||||
def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder):
|
||||
neon_env_builder.num_pageservers = 2
|
||||
env = neon_env_builder.init_configs()
|
||||
@@ -1671,6 +1783,7 @@ def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
ps_id_to_drain = env.pageservers[0].id
|
||||
|
||||
env.storage_controller.warm_up_all_secondaries()
|
||||
env.storage_controller.retryable_node_operation(
|
||||
lambda ps_id: env.storage_controller.node_drain(ps_id),
|
||||
ps_id_to_drain,
|
||||
|
||||
Reference in New Issue
Block a user