mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-23 16:10:37 +00:00
storcon: implement safekeeper_migrate handler (#11849)
This PR implements a safekeeper migration algorithm from RFC-035 https://github.com/neondatabase/neon/blob/main/docs/rfcs/035-safekeeper-dynamic-membership-change.md#change-algorithm - Closes: https://github.com/neondatabase/neon/issues/11823 It is not production-ready yet, but I think it's good enough to commit and start testing. There are some known issues which will be addressed in later PRs: - https://github.com/neondatabase/neon/issues/12186 - https://github.com/neondatabase/neon/issues/12187 - https://github.com/neondatabase/neon/issues/12188 - https://github.com/neondatabase/neon/issues/12189 - https://github.com/neondatabase/neon/issues/12190 - https://github.com/neondatabase/neon/issues/12191 - https://github.com/neondatabase/neon/issues/12192 ## Summary of changes - Implement `tenant_timeline_safekeeper_migrate` handler to drive the migration - Add possibility to specify number of safekeepers per timeline in tests (`timeline_safekeeper_count`) - Add `term` and `flush_lsn` to `TimelineMembershipSwitchResponse` - Implement compare-and-swap (CAS) operation over timeline in DB for updating membership configuration safely. - Write simple test to verify that migration code works
This commit is contained in:
@@ -1215,6 +1215,13 @@ class NeonEnv:
|
||||
storage_controller_config = storage_controller_config or {}
|
||||
storage_controller_config["use_https_safekeeper_api"] = True
|
||||
|
||||
# TODO(diko): uncomment when timeline_safekeeper_count option is in the release branch,
|
||||
# so the compat tests will not fail because of its presence.
|
||||
# if config.num_safekeepers < 3:
|
||||
# storage_controller_config = storage_controller_config or {}
|
||||
# if "timeline_safekeeper_count" not in storage_controller_config:
|
||||
# storage_controller_config["timeline_safekeeper_count"] = config.num_safekeepers
|
||||
|
||||
if storage_controller_config is not None:
|
||||
cfg["storage_controller"] = storage_controller_config
|
||||
|
||||
@@ -2226,6 +2233,21 @@ class NeonStorageController(MetricsGetter, LogUtils):
|
||||
response.raise_for_status()
|
||||
log.info(f"timeline_create success: {response.json()}")
|
||||
|
||||
def migrate_safekeepers(
    self,
    tenant_id: TenantId,
    timeline_id: TimelineId,
    new_sk_set: list[int],
):
    """
    Request a safekeeper migration for a timeline.

    Sends a POST with `new_sk_set` as the JSON body to the timeline's
    `safekeeper_migrate` endpoint, calls `raise_for_status()` on the
    response, and logs the decoded JSON reply.
    """
    migrate_url = f"{self.api}/v1/tenant/{tenant_id}/timeline/{timeline_id}/safekeeper_migrate"
    payload = {"new_sk_set": new_sk_set}
    auth_headers = self.headers(TokenScope.PAGE_SERVER_API)

    resp = self.request("POST", migrate_url, json=payload, headers=auth_headers)
    resp.raise_for_status()
    log.info(f"migrate_safekeepers success: {resp.json()}")
|
||||
|
||||
def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]:
|
||||
"""
|
||||
:return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr": str, "listen_http_port": int}
|
||||
|
||||
64
test_runner/regress/test_safekeeper_migration.py
Normal file
64
test_runner/regress/test_safekeeper_migration.py
Normal file
@@ -0,0 +1,64 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder
|
||||
|
||||
|
||||
def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder):
    """
    Basic end-to-end check of safekeeper migration.

    The environment runs 3 safekeepers, while the timeline is configured to use only one.
    1. For each safekeeper in turn, migrate the timeline onto it.
    2. Stop the remaining safekeepers and check that an insert still succeeds.
    3. Bring the remaining safekeepers back up, then move on to the next one.
    4. After the loop, check that the table holds every inserted value,
       both before and after an endpoint restart.
    """
    all_sk_ids = [1, 2, 3]
    expected_rows = [(i,) for i in range(1, 4)]

    neon_env_builder.num_safekeepers = 3
    neon_env_builder.storage_controller_config = {
        "timelines_onto_safekeepers": True,
        "timeline_safekeeper_count": 1,
    }
    env = neon_env_builder.init_start()

    # TODO(diko): pageserver spams with various errors during safekeeper migration.
    # Fix the code so it handles the migration better.
    tolerated_pageserver_errors = [
        ".*Timeline .* was cancelled and cannot be used anymore.*",
        ".*Timeline .* has been deleted.*",
        ".*wal receiver task finished with an error.*",
    ]
    env.pageserver.allowed_errors.extend(tolerated_pageserver_errors)

    ep = env.endpoints.create("main", tenant_id=env.initial_tenant)
    # The compute is given every safekeeper and connects to all of them;
    # only the members of the current membership configuration are used.
    # TODO(diko): set only current safekeepers when cplane notify is implemented.
    ep.start(safekeeper_generation=1, safekeepers=list(all_sk_ids))
    ep.safe_psql("CREATE EXTENSION neon_test_utils;")
    ep.safe_psql("CREATE TABLE t(a int)")

    for active_sk in all_sk_ids:
        env.storage_controller.migrate_safekeepers(
            env.initial_tenant, env.initial_timeline, [active_sk]
        )

        # Safekeeper ids are 1-based, env.safekeepers is 0-based.
        idle_safekeepers = [
            env.safekeepers[sk_id - 1] for sk_id in all_sk_ids if sk_id != active_sk
        ]

        for idle in idle_safekeepers:
            idle.stop()

        # Only the freshly migrated-to safekeeper is up at this point.
        ep.safe_psql(f"INSERT INTO t VALUES ({active_sk})")

        for idle in idle_safekeepers:
            idle.start()

    ep.clear_buffers()

    assert ep.safe_psql("SELECT * FROM t") == expected_rows

    # Restart the endpoint and check that the data is still there.
    ep.stop()
    ep.start(safekeeper_generation=1, safekeepers=list(all_sk_ids))

    assert ep.safe_psql("SELECT * FROM t") == expected_rows
|
||||
Reference in New Issue
Block a user