mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-03 19:42:55 +00:00
## Problem We don't validate the validity of the `new_sk_set` before starting the migration. It is validated later, so the migration to an invalid safekeeper set will fail anyway. But at this point we might already commited an invalid `new_sk_set` to the database and there is no `abort` command yet (I ran into this issue in neon_local and ruined the timeline :) - Part of https://github.com/neondatabase/neon/issues/11669 ## Summary of changes - Add safekeeper count and safekeeper duplication checks before starting the migration - Test that we validate the `new_sk_set` before starting the migration - Add `force` option to the `TimelineSafekeeperMigrateRequest` to disable not-mandatory checks
116 lines
4.2 KiB
Python
116 lines
4.2 KiB
Python
from __future__ import annotations
|
|
|
|
from typing import TYPE_CHECKING
|
|
|
|
import pytest
|
|
from fixtures.neon_fixtures import StorageControllerApiException
|
|
|
|
if TYPE_CHECKING:
|
|
from fixtures.neon_fixtures import NeonEnvBuilder
|
|
|
|
|
|
def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder):
|
|
"""
|
|
Simple safekeeper migration test.
|
|
Creates 3 safekeepers. The timeline is configuret to use only one safekeeper.
|
|
1. Go through all safekeepers, migrate the timeline to it.
|
|
2. Stop the other safekeepers. Validate that the insert is successful.
|
|
3. Start the other safekeepers again and go to the next safekeeper.
|
|
4. Validate that the table contains all inserted values.
|
|
"""
|
|
neon_env_builder.num_safekeepers = 3
|
|
neon_env_builder.storage_controller_config = {
|
|
"timelines_onto_safekeepers": True,
|
|
"timeline_safekeeper_count": 1,
|
|
}
|
|
env = neon_env_builder.init_start()
|
|
# TODO(diko): pageserver spams with various errors during safekeeper migration.
|
|
# Fix the code so it handles the migration better.
|
|
env.pageserver.allowed_errors.extend(
|
|
[
|
|
".*Timeline .* was cancelled and cannot be used anymore.*",
|
|
".*Timeline .* has been deleted.*",
|
|
".*Timeline .* was not found in global map.*",
|
|
".*wal receiver task finished with an error.*",
|
|
]
|
|
)
|
|
|
|
ep = env.endpoints.create("main", tenant_id=env.initial_tenant)
|
|
|
|
mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
|
|
assert mconf["new_sk_set"] is None
|
|
assert len(mconf["sk_set"]) == 1
|
|
assert mconf["generation"] == 1
|
|
|
|
ep.start(safekeeper_generation=1, safekeepers=mconf["sk_set"])
|
|
ep.safe_psql("CREATE EXTENSION neon_test_utils;")
|
|
ep.safe_psql("CREATE TABLE t(a int)")
|
|
|
|
for active_sk in range(1, 4):
|
|
env.storage_controller.migrate_safekeepers(
|
|
env.initial_tenant, env.initial_timeline, [active_sk]
|
|
)
|
|
|
|
other_sks = [sk for sk in range(1, 4) if sk != active_sk]
|
|
|
|
for sk in other_sks:
|
|
env.safekeepers[sk - 1].stop()
|
|
|
|
ep.safe_psql(f"INSERT INTO t VALUES ({active_sk})")
|
|
|
|
for sk in other_sks:
|
|
env.safekeepers[sk - 1].start()
|
|
|
|
ep.clear_buffers()
|
|
|
|
assert ep.safe_psql("SELECT * FROM t") == [(i,) for i in range(1, 4)]
|
|
|
|
# 1 initial generation + 2 migrations on each loop iteration.
|
|
expected_gen = 1 + 2 * 3
|
|
|
|
mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
|
|
assert mconf["generation"] == expected_gen
|
|
|
|
assert ep.safe_psql("SHOW neon.safekeepers")[0][0].startswith(f"g#{expected_gen}:")
|
|
|
|
# Restart and check again to make sure data is persistent.
|
|
ep.stop()
|
|
ep.start(safekeeper_generation=1, safekeepers=[3])
|
|
|
|
assert ep.safe_psql("SELECT * FROM t") == [(i,) for i in range(1, 4)]
|
|
|
|
|
|
def test_new_sk_set_validation(neon_env_builder: NeonEnvBuilder):
|
|
"""
|
|
Test that safekeeper_migrate validates the new_sk_set before starting the migration.
|
|
"""
|
|
neon_env_builder.num_safekeepers = 3
|
|
neon_env_builder.storage_controller_config = {
|
|
"timelines_onto_safekeepers": True,
|
|
"timeline_safekeeper_count": 2,
|
|
}
|
|
env = neon_env_builder.init_start()
|
|
|
|
def expect_fail(sk_set: list[int], match: str):
|
|
with pytest.raises(StorageControllerApiException, match=match):
|
|
env.storage_controller.migrate_safekeepers(
|
|
env.initial_tenant, env.initial_timeline, sk_set
|
|
)
|
|
# Check that we failed before commiting to the database.
|
|
mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
|
|
assert mconf["generation"] == 1
|
|
|
|
expect_fail([], "safekeeper set is empty")
|
|
expect_fail([1], "must have at least 2 safekeepers")
|
|
expect_fail([1, 1], "duplicate safekeeper")
|
|
expect_fail([1, 100500], "does not exist")
|
|
|
|
mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
|
|
sk_set = mconf["sk_set"]
|
|
assert len(sk_set) == 2
|
|
|
|
decom_sk = [sk.id for sk in env.safekeepers if sk.id not in sk_set][0]
|
|
env.storage_controller.safekeeper_scheduling_policy(decom_sk, "Decomissioned")
|
|
|
|
expect_fail([sk_set[0], decom_sk], "decomissioned")
|