Add more substantial tests for compute migrations (#9811)

The previous tests really didn't do much. This set should be quite a bit
more encompassing.

Signed-off-by: Tristan Partin <tristan@neon.tech>
This commit is contained in:
Tristan Partin
2025-01-02 12:37:50 -06:00
committed by GitHub
parent 56e6ebfe17
commit 363ea97f69
26 changed files with 327 additions and 47 deletions

View File

@@ -8,6 +8,7 @@ pytest_plugins = (
"fixtures.compute_reconfigure",
"fixtures.storage_controller_proxy",
"fixtures.paths",
"fixtures.compute_migrations",
"fixtures.neon_fixtures",
"fixtures.benchmark_fixture",
"fixtures.pg_stats",

View File

@@ -0,0 +1,34 @@
from __future__ import annotations
import os
from typing import TYPE_CHECKING
import pytest
from fixtures.paths import BASE_DIR
if TYPE_CHECKING:
from collections.abc import Iterator
from pathlib import Path
# Directory holding the compute migration files, plus the "tests" directory
# nested inside it.
COMPUTE_MIGRATIONS_DIR = BASE_DIR.joinpath("compute_tools", "src", "migrations")
COMPUTE_MIGRATIONS_TEST_DIR = COMPUTE_MIGRATIONS_DIR.joinpath("tests")

# The first tuple produced by os.walk() describes the top directory itself;
# element 2 is the list of file names directly inside it (subdirectories are
# listed separately and therefore excluded). The names are kept in sorted
# order so that a migration's 1-based position can be used as its ID.
COMPUTE_MIGRATIONS = list(next(os.walk(COMPUTE_MIGRATIONS_DIR))[2])
COMPUTE_MIGRATIONS.sort()
NUM_COMPUTE_MIGRATIONS = len(COMPUTE_MIGRATIONS)
@pytest.fixture(scope="session")
def compute_migrations_dir() -> Iterator[Path]:
    """Session-scoped fixture that yields COMPUTE_MIGRATIONS_DIR."""
    migrations_dir = COMPUTE_MIGRATIONS_DIR
    yield migrations_dir
@pytest.fixture(scope="session")
def compute_migrations_test_dir() -> Iterator[Path]:
    """Session-scoped fixture that yields COMPUTE_MIGRATIONS_TEST_DIR."""
    migrations_test_dir = COMPUTE_MIGRATIONS_TEST_DIR
    yield migrations_test_dir

View File

@@ -55,3 +55,17 @@ class EndpointHttpClient(requests.Session):
res = self.get(f"http://localhost:{self.port}/metrics")
res.raise_for_status()
return res.text
def configure_failpoints(self, *args: tuple[str, str]) -> None:
    """
    Send a set of failpoint settings to the endpoint.

    Every positional argument is a (name, action) pair. The pairs are
    converted to a JSON list of {"name": ..., "action": ...} objects and
    POSTed in a single request to http://localhost:<port>/failpoints.
    An unsuccessful HTTP status is raised by raise_for_status().
    """
    payload: list[dict[str, str]] = [
        {
            "name": failpoint[0],
            "action": failpoint[1],
        }
        for failpoint in args
    ]
    response = self.post(f"http://localhost:{self.port}/failpoints", json=payload)
    response.raise_for_status()

View File

@@ -522,14 +522,15 @@ class NeonLocalCli(AbstractNeonCli):
safekeepers: list[int] | None = None,
remote_ext_config: str | None = None,
pageserver_id: int | None = None,
allow_multiple=False,
allow_multiple: bool = False,
basebackup_request_tries: int | None = None,
env: dict[str, str] | None = None,
) -> subprocess.CompletedProcess[str]:
args = [
"endpoint",
"start",
]
extra_env_vars = {}
extra_env_vars = env or {}
if basebackup_request_tries is not None:
extra_env_vars["NEON_COMPUTE_TESTING_BASEBACKUP_TRIES"] = str(basebackup_request_tries)
if remote_ext_config is not None:

View File

@@ -54,6 +54,7 @@ from fixtures.common_types import (
TimelineArchivalState,
TimelineId,
)
from fixtures.compute_migrations import NUM_COMPUTE_MIGRATIONS
from fixtures.endpoint.http import EndpointHttpClient
from fixtures.h2server import H2Server
from fixtures.log_helper import log
@@ -3855,6 +3856,7 @@ class Endpoint(PgProtocol, LogUtils):
safekeepers: list[int] | None = None,
allow_multiple: bool = False,
basebackup_request_tries: int | None = None,
env: dict[str, str] | None = None,
) -> Self:
"""
Start the Postgres instance.
@@ -3875,6 +3877,7 @@ class Endpoint(PgProtocol, LogUtils):
pageserver_id=pageserver_id,
allow_multiple=allow_multiple,
basebackup_request_tries=basebackup_request_tries,
env=env,
)
self._running.release(1)
self.log_config_value("shared_buffers")
@@ -3988,14 +3991,17 @@ class Endpoint(PgProtocol, LogUtils):
log.info("Updating compute spec to: %s", json.dumps(data_dict, indent=4))
json.dump(data_dict, file, indent=4)
# Please note: Migrations only run if pg_skip_catalog_updates is false
def wait_for_migrations(self, wait_for: int = NUM_COMPUTE_MIGRATIONS) -> None:
    """
    Wait until at least `wait_for` compute migrations have been run.

    The recorded migration ID is read from neon_migration.migration_id and
    compared against `wait_for`; by default this waits for all
    NUM_COMPUTE_MIGRATIONS migrations. Remember that migrations only run if
    "pg_skip_catalog_updates" is set to false in the compute spec.
    """
    with self.cursor() as cur:

        def check_migrations_done():
            cur.execute("SELECT id FROM neon_migration.migration_id")
            migration_id: int = cur.fetchall()[0][0]
            assert migration_id >= wait_for

        # The check reuses `cur`, so the wait has to happen while the cursor
        # is still open. Presumably wait_until() retries the callable until
        # it stops raising -- confirm against its definition.
        wait_until(check_migrations_done)

View File

@@ -21,8 +21,8 @@ if TYPE_CHECKING:
BASE_DIR = Path(__file__).parents[2]
COMPUTE_CONFIG_DIR = BASE_DIR / "compute" / "etc"
DEFAULT_OUTPUT_DIR: str = "test_output"
COMPUTE_CONFIG_DIR = BASE_DIR / "compute" / "etc"
def get_test_dir(request: FixtureRequest, top_output_dir: Path, prefix: str | None = None) -> Path:

View File

@@ -0,0 +1,90 @@
from __future__ import annotations
from pathlib import Path
from typing import TYPE_CHECKING, cast
import pytest
from fixtures.compute_migrations import COMPUTE_MIGRATIONS, NUM_COMPUTE_MIGRATIONS
if TYPE_CHECKING:
from fixtures.neon_fixtures import NeonEnv
def test_compute_migrations_retry(neon_simple_env: NeonEnv, compute_migrations_dir: Path):
    """
    Test that compute_ctl can recover from migration failures next time it
    starts, and that the persisted migration ID is correct in such cases.

    The endpoint is started once per migration with the "compute-migration"
    failpoint set to that migration's ID, and the test checks that the
    persisted ID stops at the preceding migration. A final start without the
    failpoint must then reach NUM_COMPUTE_MIGRATIONS.
    """
    env = neon_simple_env

    endpoint = env.endpoints.create("main")
    endpoint.respec(skip_pg_catalog_updates=False)

    for i in range(1, NUM_COMPUTE_MIGRATIONS + 1):
        endpoint.start(env={"FAILPOINTS": f"compute-migration=return({i})"})

        # Make sure that the migrations ran
        endpoint.wait_for_migrations(wait_for=i - 1)

        # Confirm that we correctly recorded that in the
        # neon_migration.migration_id table
        with endpoint.cursor() as cur:
            cur.execute("SELECT id FROM neon_migration.migration_id")
            migration_id = cast("int", cur.fetchall()[0][0])
            assert migration_id == i - 1

        endpoint.stop()

    endpoint.start()

    # Now wait for the rest of the migrations
    endpoint.wait_for_migrations()

    with endpoint.cursor() as cur:
        cur.execute("SELECT id FROM neon_migration.migration_id")
        migration_id = cast("int", cur.fetchall()[0][0])
        assert migration_id == NUM_COMPUTE_MIGRATIONS

    for i, m in enumerate(COMPUTE_MIGRATIONS, start=1):
        migration_query = (compute_migrations_dir / m).read_text(encoding="utf-8")
        # A migration file that starts with "-- SKIP" is not executed, so it
        # is the one expected to be logged as skipped; every other migration
        # is expected to be logged as running.
        if migration_query.startswith("-- SKIP"):
            pattern = rf"Skipping migration id={i}"
        else:
            pattern = rf"Running migration id={i}"

        # NOTE(review): the result of log_contains() is discarded, so a
        # missing log line may go unnoticed here -- confirm whether it raises
        # on a miss or whether its return value should be asserted.
        endpoint.log_contains(pattern)
@pytest.mark.parametrize(
    "migration",
    [
        pytest.param((index, filename), id=str(index))
        for index, filename in enumerate(COMPUTE_MIGRATIONS, start=1)
    ],
)
def test_compute_migrations_e2e(
    neon_simple_env: NeonEnv,
    compute_migrations_dir: Path,
    compute_migrations_test_dir: Path,
    migration: tuple[int, str],
):
    """
    Check, one migration at a time, that each migration does what it claims.

    `migration` is a (1-based ID, file name) pair. Migrations whose file
    starts with "-- SKIP" are skipped. Otherwise the endpoint is started with
    a failpoint on the following migration, and the check query stored under
    the same file name in the migrations test directory is executed.
    """
    migration_id, migration_filename = migration

    migration_sql = compute_migrations_dir.joinpath(migration_filename).read_text(
        encoding="utf-8"
    )
    if migration_sql.startswith("-- SKIP"):
        pytest.skip("The migration is marked as SKIP")

    endpoint = neon_simple_env.endpoints.create("main")
    endpoint.respec(skip_pg_catalog_updates=False)

    # Put the failpoint on the migration right after the one under test, so
    # the cluster can be inspected in the state of the given migration ID.
    stop_before = migration_id + 1
    endpoint.start(env={"FAILPOINTS": f"compute-migration=return({stop_before})"})
    endpoint.wait_for_migrations(wait_for=migration_id)

    check_sql = compute_migrations_test_dir.joinpath(migration_filename).read_text(
        encoding="utf-8"
    )
    endpoint.safe_psql(check_sql)

View File

@@ -1,33 +0,0 @@
from __future__ import annotations
import time
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from fixtures.neon_fixtures import NeonEnv
def test_migrations(neon_simple_env: NeonEnv):
    """
    Start an endpoint with catalog updates enabled, wait for the migrations,
    and check that the recorded migration ID is the expected one both right
    after the first start and again after a restart.
    """
    expected_id = 11

    endpoint = neon_simple_env.endpoints.create("main")
    endpoint.respec(skip_pg_catalog_updates=False)

    def assert_recorded_migration_id() -> None:
        # Read the persisted migration ID and compare it with the expectation.
        with endpoint.cursor() as cur:
            cur.execute("SELECT id FROM neon_migration.migration_id")
            rows = cur.fetchall()
            assert rows[0][0] == expected_id

    endpoint.start()
    endpoint.wait_for_migrations(num_migrations=expected_id)
    assert_recorded_migration_id()

    endpoint.stop()
    endpoint.start()
    # We don't have a good way of knowing that the migrations code path finished executing
    # in compute_ctl in the case that no migrations are being run
    time.sleep(1)
    assert_recorded_migration_id()