import json
import time
from collections import defaultdict
from datetime import datetime, timezone
from typing import Any, Dict, List, Union

import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
    NeonEnv,
    NeonEnvBuilder,
    PgBin,
    StorageControllerApiException,
    TokenScope,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.utils import (
    MANY_SMALL_LAYERS_TENANT_CONFIG,
    enable_remote_storage_versioning,
    list_prefix,
    remote_storage_delete_key,
    tenant_delete_wait_completed,
    timeline_delete_wait_completed,
)
from fixtures.pg_version import PgVersion
from fixtures.remote_storage import RemoteStorageKind, s3_storage
from fixtures.types import TenantId, TenantShardId, TimelineId
from fixtures.utils import run_pg_bench_small, subprocess_capture, wait_until
from mypy_boto3_s3.type_defs import (
    ObjectTypeDef,
)
from pytest_httpserver import HTTPServer
from werkzeug.wrappers.request import Request
from werkzeug.wrappers.response import Response


def get_node_shard_counts(env: NeonEnv, tenant_ids):
    counts: defaultdict[str, int] = defaultdict(int)
    for tid in tenant_ids:
        for shard in env.storage_controller.locate(tid):
            counts[shard["node_id"]] += 1
    return counts


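# Usage sketch (illustrative only; this helper is an assumption of this edit and is not
# called by any test below): with even placement we expect identical counts on every
# node, which is how test_storage_controller_smoke consumes get_node_shard_counts.
def _example_assert_even_spread(env: NeonEnv, tenant_ids, expected_per_node: int):
    counts = get_node_shard_counts(env, tenant_ids)
    assert all(count == expected_per_node for count in counts.values())

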
def test_storage_controller_smoke(
    neon_env_builder: NeonEnvBuilder,
):
    """
    Test the basic lifecycle of a storage controller:
    - Restarting
    - Restarting a pageserver
    - Creating and deleting tenants and timelines
    - Marking a pageserver offline
    """

    neon_env_builder.num_pageservers = 3
    env = neon_env_builder.init_configs()

    for pageserver in env.pageservers:
        # This test detaches tenants during migration, which can race with deletion queue operations;
        # during detach we only do an advisory flush, we don't wait for it.
        pageserver.allowed_errors.extend([".*Dropped remote consistent LSN updates.*"])

    # Start services by hand so that we can skip a pageserver (this will start + register later)
    env.broker.try_start()
    env.storage_controller.start()
    env.pageservers[0].start()
    env.pageservers[1].start()
    for sk in env.safekeepers:
        sk.start()

    # The pageservers we started should have registered with the sharding service on startup
    nodes = env.storage_controller.node_list()
    assert len(nodes) == 2
    assert set(n["id"] for n in nodes) == {env.pageservers[0].id, env.pageservers[1].id}

    # Starting an additional pageserver should register successfully
    env.pageservers[2].start()
    nodes = env.storage_controller.node_list()
    assert len(nodes) == 3
    assert set(n["id"] for n in nodes) == {ps.id for ps in env.pageservers}

    # Use a multiple of pageservers to get nice even number of shards on each one
    tenant_shard_count = len(env.pageservers) * 4
    tenant_count = len(env.pageservers) * 2
    shards_per_tenant = tenant_shard_count // tenant_count
    tenant_ids = set(TenantId.generate() for i in range(0, tenant_count))

    # Creating several tenants should spread out across the pageservers
    for tid in tenant_ids:
        env.neon_cli.create_tenant(tid, shard_count=shards_per_tenant)

    # Repeating a creation should be idempotent (we are just testing it doesn't return an error)
    env.storage_controller.tenant_create(
        tenant_id=next(iter(tenant_ids)), shard_count=shards_per_tenant
    )

    for node_id, count in get_node_shard_counts(env, tenant_ids).items():
        # we used a multiple of pageservers for the total shard count,
        # so expect equal number on all pageservers
        assert count == tenant_shard_count / len(
            env.pageservers
        ), f"Node {node_id} has bad count {count}"

    # Creating and deleting timelines should work, using identical API to pageserver
    timeline_crud_tenant = next(iter(tenant_ids))
    timeline_id = TimelineId.generate()
    env.storage_controller.pageserver_api().timeline_create(
        pg_version=PgVersion.NOT_SET, tenant_id=timeline_crud_tenant, new_timeline_id=timeline_id
    )
    timelines = env.storage_controller.pageserver_api().timeline_list(timeline_crud_tenant)
    assert len(timelines) == 2
    assert timeline_id in set(TimelineId(t["timeline_id"]) for t in timelines)
    # virtual_ps_http.timeline_delete(tenant_id=timeline_crud_tenant, timeline_id=timeline_id)
    timeline_delete_wait_completed(
        env.storage_controller.pageserver_api(), timeline_crud_tenant, timeline_id
    )
    timelines = env.storage_controller.pageserver_api().timeline_list(timeline_crud_tenant)
    assert len(timelines) == 1
    assert timeline_id not in set(TimelineId(t["timeline_id"]) for t in timelines)

    # Marking a pageserver offline should migrate tenants away from it.
    env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"})

    def node_evacuated(node_id: int) -> None:
        counts = get_node_shard_counts(env, tenant_ids)
        assert counts[node_id] == 0

    wait_until(10, 1, lambda: node_evacuated(env.pageservers[0].id))

    # Marking pageserver active should not migrate anything to it
    # immediately
    env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Active"})
    time.sleep(1)
    assert get_node_shard_counts(env, tenant_ids)[env.pageservers[0].id] == 0

    # Restarting a pageserver should not detach any tenants (i.e. /re-attach works)
    before_restart = env.pageservers[1].http_client().tenant_list_locations()
    env.pageservers[1].stop()
    env.pageservers[1].start()
    after_restart = env.pageservers[1].http_client().tenant_list_locations()
    assert len(after_restart) == len(before_restart)

    # Locations should be the same before & after restart, apart from generations
    for _shard_id, tenant in after_restart["tenant_shards"]:
        del tenant["generation"]
    for _shard_id, tenant in before_restart["tenant_shards"]:
        del tenant["generation"]
    assert before_restart == after_restart

    # Delete all the tenants
    for tid in tenant_ids:
        tenant_delete_wait_completed(env.storage_controller.pageserver_api(), tid, 10)

    env.storage_controller.consistency_check()

    # Set a scheduling policy on one node, create all the tenants, observe
    # that the scheduling policy is respected.
    env.storage_controller.node_configure(env.pageservers[1].id, {"scheduling": "Draining"})

    # Create some fresh tenants
    tenant_ids = set(TenantId.generate() for i in range(0, tenant_count))
    for tid in tenant_ids:
        env.neon_cli.create_tenant(tid, shard_count=shards_per_tenant)

    counts = get_node_shard_counts(env, tenant_ids)
    # Nothing should have been scheduled on the node in Draining
    assert counts[env.pageservers[1].id] == 0
    assert counts[env.pageservers[0].id] == tenant_shard_count // 2
    assert counts[env.pageservers[2].id] == tenant_shard_count // 2

    env.storage_controller.consistency_check()


def test_node_status_after_restart(
    neon_env_builder: NeonEnvBuilder,
):
    neon_env_builder.num_pageservers = 2
    env = neon_env_builder.init_start()

    # Initially we have two online pageservers
    nodes = env.storage_controller.node_list()
    assert len(nodes) == 2

    env.pageservers[1].stop()
    env.storage_controller.allowed_errors.extend([".*Could not scan node"])

    env.storage_controller.stop()
    env.storage_controller.start()

    def is_ready():
        assert env.storage_controller.ready() is True

    wait_until(30, 1, is_ready)

    # We loaded nodes from the database on restart
    nodes = env.storage_controller.node_list()
    assert len(nodes) == 2

    # We should still be able to create a tenant, because the pageserver which is still online
    # should have had its availability state set to Active.
    env.storage_controller.tenant_create(TenantId.generate())

    env.storage_controller.consistency_check()


def test_storage_controller_passthrough(
    neon_env_builder: NeonEnvBuilder,
):
    """
    For simple timeline/tenant GET APIs that don't require coordination across
    shards, the sharding service implements a proxy to shard zero. This test
    calls those APIs.
    """
    neon_env_builder.num_pageservers = 2
    env = neon_env_builder.init_start()

    # We will talk to storage controller as if it was a pageserver, using the pageserver
    # HTTP client
    client = PageserverHttpClient(env.storage_controller_port, lambda: True)
    timelines = client.timeline_list(tenant_id=env.initial_tenant)
    assert len(timelines) == 1

    status = client.tenant_status(env.initial_tenant)
    assert TenantId(status["id"]) == env.initial_tenant
    assert set(TimelineId(t) for t in status["timelines"]) == {
        env.initial_timeline,
    }
    assert status["state"]["slug"] == "Active"

    env.storage_controller.consistency_check()


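# Illustrative sketch (an assumption of this edit, not used by the tests): the
# "virtual pageserver" pattern described in test_storage_controller_passthrough, i.e.
# pointing the ordinary pageserver HTTP client at the storage controller's port and
# letting the controller proxy simple GETs to shard zero.
def _example_virtual_pageserver_client(env: NeonEnv) -> PageserverHttpClient:
    return PageserverHttpClient(env.storage_controller_port, lambda: True)

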
def test_storage_controller_restart(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()
    tenant_a = env.initial_tenant
    tenant_b = TenantId.generate()
    env.storage_controller.tenant_create(tenant_b)
    env.pageserver.tenant_detach(tenant_a)

    # TODO: extend this test to use multiple pageservers, and check that locations don't move around
    # on restart.

    # Storage controller restart
    env.storage_controller.stop()
    env.storage_controller.start()

    observed = set(TenantId(tenant["id"]) for tenant in env.pageserver.http_client().tenant_list())

    # Tenant A should still be detached
    assert tenant_a not in observed

    # Tenant B should remain attached
    assert tenant_b in observed

    # Pageserver restart
    env.pageserver.stop()
    env.pageserver.start()

    # Same assertions as above: restarting either service should not perturb things
    observed = set(TenantId(tenant["id"]) for tenant in env.pageserver.http_client().tenant_list())
    assert tenant_a not in observed
    assert tenant_b in observed

    env.storage_controller.consistency_check()


@pytest.mark.parametrize("warm_up", [True, False])
def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: bool):
    """
    We onboard tenants to the sharding service by treating it as a 'virtual pageserver'
    which provides the /location_config API. This is similar to creating a tenant,
    but imports the generation number.
    """

    neon_env_builder.num_pageservers = 2

    # Start services by hand so that we can skip registration on one of the pageservers
    env = neon_env_builder.init_configs()
    env.broker.try_start()
    env.storage_controller.start()

    # This is the pageserver where we'll initially create the tenant. Run it in emergency
    # mode so that it doesn't talk to storage controller, and do not register it.
    env.pageservers[0].allowed_errors.append(".*Emergency mode!.*")
    env.pageservers[0].start(
        overrides=("--pageserver-config-override=control_plane_emergency_mode=true",),
    )
    origin_ps = env.pageservers[0]

    # This is the pageserver managed by the sharding service, where the tenant
    # will be attached after onboarding
    env.pageservers[1].start()
    dest_ps = env.pageservers[1]
    virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)

    for sk in env.safekeepers:
        sk.start()

    # Create a tenant directly via pageserver HTTP API, skipping the storage controller
    tenant_id = TenantId.generate()
    generation = 123
    origin_ps.http_client().tenant_create(tenant_id, generation=generation)

    # As if doing a live migration, first configure origin into stale mode
    r = origin_ps.http_client().tenant_location_conf(
        tenant_id,
        {
            "mode": "AttachedStale",
            "secondary_conf": None,
            "tenant_conf": {},
            "generation": generation,
        },
    )
    assert len(r["shards"]) == 1

    if warm_up:
        origin_ps.http_client().tenant_heatmap_upload(tenant_id)

        # We expect to be called via live migration code, which may try to configure the tenant into secondary
        # mode before attaching it.
        virtual_ps_http.tenant_location_conf(
            tenant_id,
            {
                "mode": "Secondary",
                "secondary_conf": {"warm": True},
                "tenant_conf": {},
                "generation": None,
            },
        )

        virtual_ps_http.tenant_secondary_download(tenant_id)

    # Call into storage controller to onboard the tenant
    generation += 1
    r = virtual_ps_http.tenant_location_conf(
        tenant_id,
        {
            "mode": "AttachedMulti",
            "secondary_conf": None,
            "tenant_conf": {},
            "generation": generation,
        },
    )
    assert len(r["shards"]) == 1

    # As if doing a live migration, detach the original pageserver
    origin_ps.http_client().tenant_location_conf(
        tenant_id,
        {
            "mode": "Detached",
            "secondary_conf": None,
            "tenant_conf": {},
            "generation": None,
        },
    )

    # As if doing a live migration, call into the storage controller to
    # set it to AttachedSingle: this is a no-op, but we test it because the
    # cloud control plane may call this for symmetry with live migration to
    # an individual pageserver
    r = virtual_ps_http.tenant_location_conf(
        tenant_id,
        {
            "mode": "AttachedSingle",
            "secondary_conf": None,
            "tenant_conf": {},
            "generation": generation,
        },
    )
    assert len(r["shards"]) == 1

    # We should see the tenant is now attached to the pageserver managed
    # by the sharding service
    origin_tenants = origin_ps.http_client().tenant_list()
    assert len(origin_tenants) == 0
    dest_tenants = dest_ps.http_client().tenant_list()
    assert len(dest_tenants) == 1
    assert TenantId(dest_tenants[0]["id"]) == tenant_id

    # The sharding service advances generation by 1 when it first attaches. We started
    # with a nonzero generation so this equality also proves that the generation
    # was properly carried over during onboarding.
    assert dest_tenants[0]["generation"] == generation + 1

    # The onboarded tenant should survive a restart of the sharding service
    env.storage_controller.stop()
    env.storage_controller.start()

    # The onboarded tenant should survive a restart of the pageserver
    dest_ps.stop()
    dest_ps.start()

    # Having onboarded via /location_config, we should also be able to update the
    # TenantConf part of LocationConf, without inadvertently resetting the generation
    modified_tenant_conf = {"max_lsn_wal_lag": 1024 * 1024 * 1024 * 100}
    dest_tenant_before_conf_change = dest_ps.http_client().tenant_status(tenant_id)

    # The generation has moved on since we onboarded
    assert generation != dest_tenant_before_conf_change["generation"]

    r = virtual_ps_http.tenant_location_conf(
        tenant_id,
        {
            "mode": "AttachedSingle",
            "secondary_conf": None,
            "tenant_conf": modified_tenant_conf,
            # This is intentionally a stale generation
            "generation": generation,
        },
    )
    assert len(r["shards"]) == 1
    dest_tenant_after_conf_change = dest_ps.http_client().tenant_status(tenant_id)
    assert (
        dest_tenant_after_conf_change["generation"] == dest_tenant_before_conf_change["generation"]
    )
    dest_tenant_conf_after = dest_ps.http_client().tenant_config(tenant_id)
    assert dest_tenant_conf_after.tenant_specific_overrides == modified_tenant_conf

    env.storage_controller.consistency_check()


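# Illustrative sketch (an assumption of this edit, not used by the tests): the shape of
# the /location_config body that onboards an already-existing tenant, as exercised in
# test_storage_controller_onboarding. Passing the tenant's current generation is what
# distinguishes onboarding from plain tenant creation.
def _example_onboarding_location_conf(current_generation: int) -> Dict[str, Any]:
    return {
        "mode": "AttachedMulti",
        "secondary_conf": None,
        "tenant_conf": {},
        "generation": current_generation,
    }

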
def test_storage_controller_compute_hook(
    httpserver: HTTPServer,
    neon_env_builder: NeonEnvBuilder,
    httpserver_listen_address,
):
    """
    Test that the sharding service calls out to the configured HTTP endpoint on attachment changes
    """

    # We will run two pageservers so that we can migrate between them, and check that the
    # storage controller sends notifications when migrating.
    neon_env_builder.num_pageservers = 2
    (host, port) = httpserver_listen_address
    neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify"

    # Set up fake HTTP notify endpoint
    notifications = []

    handle_params = {"status": 200}

    def handler(request: Request):
        status = handle_params["status"]
        log.info(f"Notify request[{status}]: {request}")
        notifications.append(request.json)
        return Response(status=status)

    httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler)

    # Start running
    env = neon_env_builder.init_start()

    # We will do an unclean migration, which will result in deletion queue warnings
    env.pageservers[0].allowed_errors.append(".*Dropped remote consistent LSN updates for tenant.*")

    # Initial notification from tenant creation
    assert len(notifications) == 1
    expect: Dict[str, Union[List[Dict[str, int]], str, None, int]] = {
        "tenant_id": str(env.initial_tenant),
        "stripe_size": None,
        "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}],
    }
    assert notifications[0] == expect

    env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"})

    def node_evacuated(node_id: int) -> None:
        counts = get_node_shard_counts(env, [env.initial_tenant])
        assert counts[node_id] == 0

    wait_until(10, 1, lambda: node_evacuated(env.pageservers[0].id))

    # Additional notification from migration
    log.info(f"notifications: {notifications}")
    expect = {
        "tenant_id": str(env.initial_tenant),
        "stripe_size": None,
        "shards": [{"node_id": int(env.pageservers[1].id), "shard_number": 0}],
    }

    def received_migration_notification():
        assert len(notifications) == 2
        assert notifications[1] == expect

    wait_until(20, 0.25, received_migration_notification)

    # When we restart, we should re-emit notifications for all tenants
    env.storage_controller.stop()
    env.storage_controller.start()

    def received_restart_notification():
        assert len(notifications) == 3
        assert notifications[2] == expect

    wait_until(10, 1, received_restart_notification)

    # Splitting a tenant should cause its stripe size to become visible in the compute notification
    env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=2)
    expect = {
        "tenant_id": str(env.initial_tenant),
        "stripe_size": 32768,
        "shards": [
            {"node_id": int(env.pageservers[1].id), "shard_number": 0},
            {"node_id": int(env.pageservers[1].id), "shard_number": 1},
        ],
    }

    def received_split_notification():
        assert len(notifications) == 4
        assert notifications[3] == expect

    wait_until(10, 1, received_split_notification)

    # If the compute hook is unavailable, that should not block creating a tenant and
    # creating a timeline. This simulates a control plane refusing to accept notifications
    handle_params["status"] = 423
    degraded_tenant_id = TenantId.generate()
    degraded_timeline_id = TimelineId.generate()
    env.storage_controller.tenant_create(degraded_tenant_id)
    env.storage_controller.pageserver_api().timeline_create(
        PgVersion.NOT_SET, degraded_tenant_id, degraded_timeline_id
    )

    # Ensure we hit the handler error path
    env.storage_controller.allowed_errors.append(
        ".*Failed to notify compute of attached pageserver.*tenant busy.*"
    )
    env.storage_controller.allowed_errors.append(".*Reconcile error.*tenant busy.*")
    assert notifications[-1] is not None
    assert notifications[-1]["tenant_id"] == str(degraded_tenant_id)

    env.storage_controller.consistency_check()


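# Illustrative sketch (an assumption of this edit, not used by the tests): the compute
# hook notification body asserted in test_storage_controller_compute_hook for an
# unsharded tenant; stripe_size only becomes non-None once the tenant is split.
def _example_compute_notification(tenant_id: TenantId, node_id: int) -> Dict[str, Any]:
    return {
        "tenant_id": str(tenant_id),
        "stripe_size": None,
        "shards": [{"node_id": node_id, "shard_number": 0}],
    }

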
def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder):
    """
    Verify that occasional-use debug APIs work as expected. This is a lightweight test
    that just hits the endpoints to check that they don't bitrot.
    """

    neon_env_builder.num_pageservers = 2
    env = neon_env_builder.init_start()

    tenant_id = TenantId.generate()
    env.storage_controller.tenant_create(tenant_id, shard_count=2, shard_stripe_size=8192)

    # Check that the consistency check passes on a freshly setup system
    env.storage_controller.consistency_check()

    # These APIs are intentionally not implemented as methods on NeonStorageController, as
    # they're just for use in unanticipated circumstances.

    # Initial tenant (1 shard) and the one we just created (2 shards) should be visible
    response = env.storage_controller.request(
        "GET",
        f"{env.storage_controller_api}/debug/v1/tenant",
        headers=env.storage_controller.headers(TokenScope.ADMIN),
    )
    assert len(response.json()) == 3

    # Scheduler should report the expected nodes and shard counts
    response = env.storage_controller.request(
        "GET", f"{env.storage_controller_api}/debug/v1/scheduler"
    )
    # Two nodes, in a dict of node_id->node
    assert len(response.json()["nodes"]) == 2
    assert sum(v["shard_count"] for v in response.json()["nodes"].values()) == 3
    assert all(v["may_schedule"] for v in response.json()["nodes"].values())

    response = env.storage_controller.request(
        "POST",
        f"{env.storage_controller_api}/debug/v1/node/{env.pageservers[1].id}/drop",
        headers=env.storage_controller.headers(TokenScope.ADMIN),
    )
    assert len(env.storage_controller.node_list()) == 1

    response = env.storage_controller.request(
        "POST",
        f"{env.storage_controller_api}/debug/v1/tenant/{tenant_id}/drop",
        headers=env.storage_controller.headers(TokenScope.ADMIN),
    )

    # Tenant drop should be reflected in dump output
    response = env.storage_controller.request(
        "GET",
        f"{env.storage_controller_api}/debug/v1/tenant",
        headers=env.storage_controller.headers(TokenScope.ADMIN),
    )
    assert len(response.json()) == 1

    # Check that the 'drop' APIs didn't leave things in a state that would fail a consistency check: they're
    # meant to be unclean wrt the pageserver state, but not leave a broken storage controller behind.
    env.storage_controller.consistency_check()


def test_storage_controller_s3_time_travel_recovery(
    neon_env_builder: NeonEnvBuilder,
    pg_bin: PgBin,
):
    """
    Test for S3 time travel
    """

    remote_storage_kind = s3_storage()
    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)

    # Mock S3 doesn't have versioning enabled by default, enable it
    # (also do it before there are any writes to the bucket)
    if remote_storage_kind == RemoteStorageKind.MOCK_S3:
        remote_storage = neon_env_builder.pageserver_remote_storage
        assert remote_storage, "remote storage not configured"
        enable_remote_storage_versioning(remote_storage)

    neon_env_builder.num_pageservers = 1

    env = neon_env_builder.init_start()
    virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)

    tenant_id = TenantId.generate()
    env.storage_controller.tenant_create(
        tenant_id,
        shard_count=2,
        shard_stripe_size=8192,
        tenant_config=MANY_SMALL_LAYERS_TENANT_CONFIG,
    )

    # Check that the consistency check passes
    env.storage_controller.consistency_check()

    branch_name = "main"
    timeline_id = env.neon_cli.create_timeline(
        branch_name,
        tenant_id=tenant_id,
    )
    # Write some nontrivial amount of data into the endpoint and wait until it is uploaded
    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
        run_pg_bench_small(pg_bin, endpoint.connstr())
        endpoint.safe_psql("CREATE TABLE created_foo(id integer);")
        # last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)

    # Give the data time to be uploaded
    time.sleep(4)

    # Detach the tenant
    virtual_ps_http.tenant_location_conf(
        tenant_id,
        {
            "mode": "Detached",
            "secondary_conf": None,
            "tenant_conf": {},
            "generation": None,
        },
    )

    time.sleep(4)
    ts_before_disaster = datetime.now(tz=timezone.utc).replace(tzinfo=None)
    time.sleep(4)

    # Simulate a "disaster": delete some random files from remote storage for one of the shards
    assert env.pageserver_remote_storage
    shard_id_for_list = "0002"
    objects: List[ObjectTypeDef] = list_prefix(
        env.pageserver_remote_storage,
        f"tenants/{tenant_id}-{shard_id_for_list}/timelines/{timeline_id}/",
    ).get("Contents", [])
    assert len(objects) > 1
    log.info(f"Found {len(objects)} objects in remote storage")
    should_delete = False
    for obj in objects:
        obj_key = obj["Key"]
        should_delete = not should_delete
        if not should_delete:
            log.info(f"Keeping key on remote storage: {obj_key}")
            continue
        log.info(f"Deleting key from remote storage: {obj_key}")
        remote_storage_delete_key(env.pageserver_remote_storage, obj_key)

    time.sleep(4)
    ts_after_disaster = datetime.now(tz=timezone.utc).replace(tzinfo=None)
    time.sleep(4)

    # Do time travel recovery
    virtual_ps_http.tenant_time_travel_remote_storage(
        tenant_id, ts_before_disaster, ts_after_disaster, shard_counts=[2]
    )
    time.sleep(4)

    # Attach the tenant again
    virtual_ps_http.tenant_location_conf(
        tenant_id,
        {
            "mode": "AttachedSingle",
            "secondary_conf": None,
            "tenant_conf": {},
            "generation": 100,
        },
    )

    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
        endpoint.safe_psql("SELECT * FROM created_foo;")

    env.storage_controller.consistency_check()


def test_storage_controller_auth(neon_env_builder: NeonEnvBuilder):
    neon_env_builder.auth_enabled = True
    env = neon_env_builder.init_start()
    svc = env.storage_controller
    api = env.storage_controller_api

    tenant_id = TenantId.generate()
    body: Dict[str, Any] = {"new_tenant_id": str(tenant_id)}

    env.storage_controller.allowed_errors.append(".*Unauthorized.*")
    env.storage_controller.allowed_errors.append(".*Forbidden.*")

    # No token
    with pytest.raises(
        StorageControllerApiException,
        match="Unauthorized: missing authorization header",
    ):
        svc.request("POST", f"{env.storage_controller_api}/v1/tenant", json=body)

    # Token with incorrect scope
    with pytest.raises(
        StorageControllerApiException,
        match="Forbidden: JWT authentication error",
    ):
        svc.request(
            "POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.SAFEKEEPER_DATA)
        )

    # Token with correct scope
    svc.request(
        "POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.PAGE_SERVER_API)
    )

    # Token with admin scope should also be permitted
    svc.request("POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.ADMIN))

    # No token
    with pytest.raises(
        StorageControllerApiException,
        match="Unauthorized: missing authorization header",
    ):
        svc.request("GET", f"{api}/debug/v1/tenant")

    # Token with incorrect scope
    with pytest.raises(
        StorageControllerApiException,
        match="Forbidden: JWT authentication error",
    ):
        svc.request(
            "GET", f"{api}/debug/v1/tenant", headers=svc.headers(TokenScope.GENERATIONS_API)
        )

    # No token
    with pytest.raises(
        StorageControllerApiException,
        match="Unauthorized: missing authorization header",
    ):
        svc.request("POST", f"{api}/upcall/v1/re-attach")

    # Token with incorrect scope
    with pytest.raises(
        StorageControllerApiException,
        match="Forbidden: JWT authentication error",
    ):
        svc.request(
            "POST", f"{api}/upcall/v1/re-attach", headers=svc.headers(TokenScope.PAGE_SERVER_API)
        )


def test_storage_controller_tenant_conf(neon_env_builder: NeonEnvBuilder):
    """
    Validate the pageserver-compatible API endpoints for setting and getting tenant conf, without
    supplying the whole LocationConf.
    """

    env = neon_env_builder.init_start()
    tenant_id = env.initial_tenant

    http = env.storage_controller.pageserver_api()

    default_value = "7days"
    new_value = "1h"
    http.set_tenant_config(tenant_id, {"pitr_interval": new_value})

    # Ensure the change landed on the storage controller
    readback_controller = http.tenant_config(tenant_id)
    assert readback_controller.effective_config["pitr_interval"] == new_value
    assert readback_controller.tenant_specific_overrides["pitr_interval"] == new_value

    # Ensure the change made it down to the pageserver
    readback_ps = env.pageservers[0].http_client().tenant_config(tenant_id)
    assert readback_ps.effective_config["pitr_interval"] == new_value
    assert readback_ps.tenant_specific_overrides["pitr_interval"] == new_value

    # Omitting a value clears it. This looks different in storage controller
    # vs. pageserver API calls, because pageserver has defaults.
    http.set_tenant_config(tenant_id, {})
    readback_controller = http.tenant_config(tenant_id)
    assert readback_controller.effective_config["pitr_interval"] is None
    assert readback_controller.tenant_specific_overrides["pitr_interval"] is None
    readback_ps = env.pageservers[0].http_client().tenant_config(tenant_id)
    assert readback_ps.effective_config["pitr_interval"] == default_value
    assert "pitr_interval" not in readback_ps.tenant_specific_overrides

    env.storage_controller.consistency_check()


class Failure:
    pageserver_id: int

    def apply(self, env: NeonEnv):
        raise NotImplementedError()

    def clear(self, env: NeonEnv):
        raise NotImplementedError()


class NodeStop(Failure):
    def __init__(self, pageserver_id, immediate):
        self.pageserver_id = pageserver_id
        self.immediate = immediate

    def apply(self, env: NeonEnv):
        pageserver = env.get_pageserver(self.pageserver_id)
        pageserver.stop(immediate=self.immediate)

    def clear(self, env: NeonEnv):
        pageserver = env.get_pageserver(self.pageserver_id)
        pageserver.start()


class PageserverFailpoint(Failure):
    def __init__(self, failpoint, pageserver_id):
        self.failpoint = failpoint
        self.pageserver_id = pageserver_id

    def apply(self, env: NeonEnv):
        pageserver = env.get_pageserver(self.pageserver_id)
        pageserver.http_client().configure_failpoints((self.failpoint, "return(1)"))

    def clear(self, env: NeonEnv):
        pageserver = env.get_pageserver(self.pageserver_id)
        pageserver.http_client().configure_failpoints((self.failpoint, "off"))


def build_node_to_tenants_map(env: NeonEnv) -> dict[int, list[TenantId]]:
    # Map each node id to the tenant shards the storage controller has observed as
    # attached (AttachedSingle) on that node.
    tenants = env.storage_controller.tenant_list()

    node_to_tenants: dict[int, list[TenantId]] = {}
    for t in tenants:
        for node_id, loc_state in t["observed"]["locations"].items():
            if (
                loc_state is not None
                and "conf" in loc_state
                and loc_state["conf"] is not None
                and loc_state["conf"]["mode"] == "AttachedSingle"
            ):
                crnt = node_to_tenants.get(int(node_id), [])
                crnt.append(TenantId(t["tenant_shard_id"]))
                node_to_tenants[int(node_id)] = crnt

    return node_to_tenants


@pytest.mark.parametrize(
    "failure",
    [
        NodeStop(pageserver_id=1, immediate=False),
        NodeStop(pageserver_id=1, immediate=True),
        PageserverFailpoint(pageserver_id=1, failpoint="get-utilization-http-handler"),
    ],
)
def test_storage_controller_heartbeats(
    neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, failure: Failure
):
    neon_env_builder.num_pageservers = 2
    env = neon_env_builder.init_configs()
    env.start()

    # Default log allow list permits connection errors, but this test will use error responses on
    # the utilization endpoint.
    env.storage_controller.allowed_errors.append(
        ".*Call to node.*management API.*failed.*failpoint.*"
    )

    # Initially we have two online pageservers
    nodes = env.storage_controller.node_list()
    assert len(nodes) == 2
    assert all([n["availability"] == "Active" for n in nodes])

    # ... then we create two tenants and write some data into them
    def create_tenant(tid: TenantId):
        env.storage_controller.tenant_create(tid)

        branch_name = "main"
        env.neon_cli.create_timeline(
            branch_name,
            tenant_id=tid,
        )

        with env.endpoints.create_start("main", tenant_id=tid) as endpoint:
            run_pg_bench_small(pg_bin, endpoint.connstr())
            endpoint.safe_psql("CREATE TABLE created_foo(id integer);")

    tenant_ids = [TenantId.generate(), TenantId.generate()]
    for tid in tenant_ids:
        create_tenant(tid)

    # ... expecting that each tenant will be placed on a different node
    def tenants_placed():
        node_to_tenants = build_node_to_tenants_map(env)
        log.info(f"{node_to_tenants=}")

        # Check that all the tenants have been attached
        assert sum((len(ts) for ts in node_to_tenants.values())) == len(tenant_ids)
        # Check that each node got one tenant
        assert all((len(ts) == 1 for ts in node_to_tenants.values()))

    wait_until(10, 1, tenants_placed)

    # ... then we apply the failure
    offline_node_id = failure.pageserver_id
    online_node_id = (set(range(1, len(env.pageservers) + 1)) - {offline_node_id}).pop()
    env.get_pageserver(offline_node_id).allowed_errors.append(
        # In the case of the failpoint failure, the impacted pageserver
        # still believes it has the tenant attached since location
        # config calls into it will fail due to being marked offline.
        ".*Dropped remote consistent LSN updates.*",
    )

    failure.apply(env)

    # ... expecting the heartbeats to mark it offline
    def node_offline():
        nodes = env.storage_controller.node_list()
        log.info(f"{nodes=}")
        target = next(n for n in nodes if n["id"] == offline_node_id)
        assert target["availability"] == "Offline"

    # A node is considered offline if the last successful heartbeat
    # was more than 10 seconds ago (hardcoded in the storage controller).
    wait_until(20, 1, node_offline)

    # ... expecting the tenant on the offline node to be migrated
    def tenant_migrated():
        node_to_tenants = build_node_to_tenants_map(env)
        log.info(f"{node_to_tenants=}")
        assert set(node_to_tenants[online_node_id]) == set(tenant_ids)

    wait_until(10, 1, tenant_migrated)

    # ... then we clear the failure
    failure.clear(env)

    # ... expecting the offline node to become active again
    def node_online():
        nodes = env.storage_controller.node_list()
        target = next(n for n in nodes if n["id"] == offline_node_id)
        assert target["availability"] == "Active"

    wait_until(10, 1, node_online)

    time.sleep(5)

    # ... then we create a new tenant
    tid = TenantId.generate()
    env.storage_controller.tenant_create(tid)

    # ... expecting it to be placed on the node that just came back online
    tenants = env.storage_controller.tenant_list()
    newest_tenant = next(t for t in tenants if t["tenant_shard_id"] == str(tid))
    locations = list(newest_tenant["observed"]["locations"].keys())
    locations = [int(node_id) for node_id in locations]
    assert locations == [offline_node_id]

    # ... expecting the storage controller to reach a consistent state
    def storage_controller_consistent():
        env.storage_controller.consistency_check()

    wait_until(10, 1, storage_controller_consistent)


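# Illustrative sketch (an assumption of this edit, not used by the tests): a generic
# availability-polling helper in the style of node_offline()/node_online() above. The
# 20 x 1s budget comfortably covers the ~10 second missed-heartbeat threshold noted in
# test_storage_controller_heartbeats.
def _example_wait_for_availability(env: NeonEnv, node_id: int, want: str):
    def check():
        target = next(n for n in env.storage_controller.node_list() if n["id"] == node_id)
        assert target["availability"] == want

    wait_until(20, 1, check)

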
def test_storage_controller_re_attach(neon_env_builder: NeonEnvBuilder):
    """
    Exercise the behavior of the /re-attach endpoint on pageserver startup when
    pageservers have a mixture of attached and secondary locations
    """

    neon_env_builder.num_pageservers = 2
    env = neon_env_builder.init_configs()
    env.start()

    # We'll have two tenants.
    tenant_a = TenantId.generate()
    env.neon_cli.create_tenant(tenant_a, placement_policy='{"Attached":1}')
    tenant_b = TenantId.generate()
    env.neon_cli.create_tenant(tenant_b, placement_policy='{"Attached":1}')

    # Each pageserver will have one attached and one secondary location
    env.storage_controller.tenant_shard_migrate(
        TenantShardId(tenant_a, 0, 0), env.pageservers[0].id
    )
    env.storage_controller.tenant_shard_migrate(
        TenantShardId(tenant_b, 0, 0), env.pageservers[1].id
    )

    # Hard-fail a pageserver
    victim_ps = env.pageservers[1]
    survivor_ps = env.pageservers[0]
    victim_ps.stop(immediate=True)

    # The heartbeater will notice it's offline, and consequently attachments move to the other pageserver
    def failed_over():
        locations = survivor_ps.http_client().tenant_list_locations()["tenant_shards"]
        log.info(f"locations: {locations}")
        assert len(locations) == 2
        assert all(loc[1]["mode"] == "AttachedSingle" for loc in locations)

    # We could pre-empt this by configuring the node to Offline, but it's preferable to test
    # the realistic path we would take when a node restarts uncleanly.
    # The delay here will be ~NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL in neon_local
    wait_until(30, 1, failed_over)

    reconciles_before_restart = env.storage_controller.get_metric_value(
        "storage_controller_reconcile_complete_total", filter={"status": "ok"}
    )

    # Restart the failed pageserver
    victim_ps.start()

    # We expect that the re-attach call correctly tipped off the pageserver that its locations
    # are all secondaries now.
    locations = victim_ps.http_client().tenant_list_locations()["tenant_shards"]
    assert len(locations) == 2
    assert all(loc[1]["mode"] == "Secondary" for loc in locations)

    # We expect that this situation resulted from the re_attach call, and not any explicit
    # Reconciler runs: assert that the reconciliation count has not gone up since we restarted.
    reconciles_after_restart = env.storage_controller.get_metric_value(
        "storage_controller_reconcile_complete_total", filter={"status": "ok"}
    )
    assert reconciles_after_restart == reconciles_before_restart


def test_storage_controller_shard_scheduling_policy(neon_env_builder: NeonEnvBuilder):
    """
    Check that emergency hooks for disabling rogue tenants' reconcilers work as expected.
    """
    env = neon_env_builder.init_configs()
    env.start()

    tenant_id = TenantId.generate()

    env.storage_controller.allowed_errors.extend(
        [
            # We will intentionally cause reconcile errors
            ".*Reconcile error.*",
            # Message from using a scheduling policy
            ".*Scheduling is disabled by policy.*",
            ".*Skipping reconcile for policy.*",
            # Message from a node being offline
            ".*Call to node .* management API .* failed",
        ]
    )

    # Stop pageserver so that reconcile cannot complete
    env.pageserver.stop()

    env.storage_controller.tenant_create(tenant_id, placement_policy="Detached")

    # Try attaching it: we should see reconciles failing
    env.storage_controller.tenant_policy_update(
        tenant_id,
        {
            "placement": {"Attached": 0},
        },
    )

    def reconcile_errors() -> int:
        return int(
            env.storage_controller.get_metric_value(
                "storage_controller_reconcile_complete_total", filter={"status": "error"}
            )
            or 0
        )

    def reconcile_ok() -> int:
        return int(
            env.storage_controller.get_metric_value(
                "storage_controller_reconcile_complete_total", filter={"status": "ok"}
            )
            or 0
        )

    def assert_errors_gt(n) -> int:
        e = reconcile_errors()
        assert e > n
        return e

    errs = wait_until(10, 1, lambda: assert_errors_gt(0))

    # Try reconciling again, it should fail again
    with pytest.raises(StorageControllerApiException):
        env.storage_controller.reconcile_all()
    errs = wait_until(10, 1, lambda: assert_errors_gt(errs))

    # Configure the tenant to disable reconciles
    env.storage_controller.tenant_policy_update(
        tenant_id,
        {
            "scheduling": "Stop",
        },
    )

    # Try reconciling again, it should not cause an error (silently skip)
    env.storage_controller.reconcile_all()
    assert reconcile_errors() == errs

    # Start the pageserver and re-enable reconciles
    env.pageserver.start()
    env.storage_controller.tenant_policy_update(
        tenant_id,
        {
            "scheduling": "Active",
        },
    )

    def assert_ok_gt(n) -> int:
        o = reconcile_ok()
        assert o > n
        return o

    # We should see a successful reconciliation
    wait_until(10, 1, lambda: assert_ok_gt(0))

    # And indeed the tenant should be attached
    assert len(env.pageserver.http_client().tenant_list_locations()["tenant_shards"]) == 1


def test_storcon_cli(neon_env_builder: NeonEnvBuilder):
    """
    The storage controller command line interface (storcon-cli) is an internal tool. Most tests
    just use the APIs directly: this test exercises some basics of the CLI as a regression test
    that the client remains usable as the server evolves.
    """
    output_dir = neon_env_builder.test_output_dir
    shard_count = 4
    env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
    base_args = [env.neon_binpath / "storcon_cli", "--api", env.storage_controller_api]

    def storcon_cli(args):
        """
        CLI wrapper: returns stdout split into a list of non-empty strings
        """
        (output_path, stdout, status_code) = subprocess_capture(
            output_dir,
            [str(s) for s in base_args + args],
            echo_stderr=True,
            echo_stdout=True,
            env={},
            check=False,
            capture_stdout=True,
            timeout=10,
        )
        if status_code:
            log.warning(f"Command {args} failed")
            log.warning(f"Output at: {output_path}")

            raise RuntimeError("CLI failure (check logs for stderr)")

        assert stdout is not None
        return [line.strip() for line in stdout.split("\n") if line.strip()]

    # List nodes
    node_lines = storcon_cli(["nodes"])
    # Table header, footer, and one line of data
    assert len(node_lines) == 5
    assert "localhost" in node_lines[3]

    # Pause scheduling onto a node
    storcon_cli(["node-configure", "--node-id", "1", "--scheduling", "pause"])
    assert "Pause" in storcon_cli(["nodes"])[3]

    # Make a node offline
    storcon_cli(["node-configure", "--node-id", "1", "--availability", "offline"])
    assert "Offline" in storcon_cli(["nodes"])[3]

    # List tenants
    tenant_lines = storcon_cli(["tenants"])
    assert len(tenant_lines) == 5
    assert str(env.initial_tenant) in tenant_lines[3]

    # Setting scheduling policies intentionally results in warnings; they're for rare use.
    env.storage_controller.allowed_errors.extend(
        [".*Skipping reconcile for policy.*", ".*Scheduling is disabled by policy.*"]
    )

    # Describe a tenant
    tenant_lines = storcon_cli(["tenant-describe", "--tenant-id", str(env.initial_tenant)])
    assert len(tenant_lines) == 3 + shard_count * 2
    assert str(env.initial_tenant) in tenant_lines[3]

    # Pause changes on a tenant
    storcon_cli(["tenant-policy", "--tenant-id", str(env.initial_tenant), "--scheduling", "stop"])
    assert "Stop" in storcon_cli(["tenants"])[3]

    # Change a tenant's placement
    storcon_cli(
        ["tenant-policy", "--tenant-id", str(env.initial_tenant), "--placement", "secondary"]
    )
    assert "Secondary" in storcon_cli(["tenants"])[3]

    # Modify a tenant's config
    storcon_cli(
        [
            "tenant-config",
            "--tenant-id",
            str(env.initial_tenant),
            "--config",
            json.dumps({"pitr_interval": "1m"}),
        ]
    )

    # Quiesce any background reconciliation before doing consistency check
    env.storage_controller.reconcile_until_idle(timeout_secs=10)
    env.storage_controller.consistency_check()