Mirror of https://github.com/neondatabase/neon.git, synced 2026-01-07 21:42:56 +00:00.
Instead of having many separate fixtures for setting up the pageserver, the compute nodes, the safekeepers, etc., there is now one big ZenithEnv object that encapsulates the whole environment. Every test either uses the shared "zenith_simple_env" fixture, which provides the default setup of a pageserver with no authentication and no safekeepers, or, if it needs safekeepers or authentication, sets up a custom test-specific ZenithEnv through the ZenithEnvBuilder fixture. Gathering information about the whole environment into one object simplifies several things. For example, when a new compute node is created, you no longer need to pass the 'wal_acceptors' connection string as an argument to 'postgres.create_start'; that function now fetches the information directly from the ZenithEnv object.
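As a rough sketch of the two styles this describes (the test names below are made up, and the exact zenith_simple_env signature is an assumption; the builder calls mirror the tests in this file):

from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder


def test_with_shared_env(zenith_simple_env: ZenithEnv):
    # Shared default environment: one pageserver, no authentication, no safekeepers.
    env = zenith_simple_env
    env.zenith_cli(["branch", "test_with_shared_env", "main"])
    pg = env.postgres.create_start('test_with_shared_env')
    assert pg.safe_psql('SELECT 1')[0] == (1, )


def test_with_safekeepers(zenith_env_builder: ZenithEnvBuilder):
    # Test-specific environment: configure the builder, then init() returns a
    # ZenithEnv; create_start() picks the safekeeper connection strings up from
    # the env itself, so no 'wal_acceptors' argument is passed here.
    zenith_env_builder.num_safekeepers = 3
    env = zenith_env_builder.init()
    env.zenith_cli(["branch", "test_with_safekeepers", "main"])
    pg = env.postgres.create_start('test_with_safekeepers')
    assert pg.safe_psql('SELECT 1')[0] == (1, )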
333 lines
11 KiB
Python
import pytest
import random
import time
import os
import subprocess
import uuid

from contextlib import closing
from multiprocessing import Process, Value
from fixtures.zenith_fixtures import PgBin, ZenithEnv, ZenithEnvBuilder
from fixtures.utils import lsn_to_hex, mkdir_if_needed
from fixtures.log_helper import log

pytest_plugins = ("fixtures.zenith_fixtures")


# Basic test: in a setup with wal acceptors, write some data and ensure that
# commits succeed and the data is written.
def test_normal_work(zenith_env_builder: ZenithEnvBuilder):
    zenith_env_builder.num_safekeepers = 3
    env = zenith_env_builder.init()

    env.zenith_cli(["branch", "test_wal_acceptors_normal_work", "main"])

    pg = env.postgres.create_start('test_wal_acceptors_normal_work')

    with closing(pg.connect()) as conn:
        with conn.cursor() as cur:
            # we rely upon autocommit after each statement
            # as waiting for acceptors happens there
            cur.execute('CREATE TABLE t(key int primary key, value text)')
            cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
            cur.execute('SELECT sum(key) FROM t')
            assert cur.fetchone() == (5000050000, )


# Run the pageserver and multiple acceptors, with multiple compute nodes running
# against different timelines.
def test_many_timelines(zenith_env_builder: ZenithEnvBuilder):
    zenith_env_builder.num_safekeepers = 3
    env = zenith_env_builder.init()

    n_timelines = 2

    branches = ["test_wal_acceptors_many_timelines_{}".format(tlin) for tlin in range(n_timelines)]

    # start postgres on each timeline
    pgs = []
    for branch in branches:
        env.zenith_cli(["branch", branch, "main"])
        pgs.append(env.postgres.create_start(branch))

    # Do everything in different loops to have actions on different timelines
    # interleaved.
    # create schema
    for pg in pgs:
        pg.safe_psql("CREATE TABLE t(key int primary key, value text)")

    # Populate data
    for pg in pgs:
        pg.safe_psql("INSERT INTO t SELECT generate_series(1,100000), 'payload'")

    # Check data
    for pg in pgs:
        res = pg.safe_psql("SELECT sum(key) FROM t")
        assert res[0] == (5000050000, )


# Check that a dead minority doesn't prevent commits: execute inserts n_inserts
# times, with a fault_probability chance of taking a wal acceptor down or
# bringing it back up along the way. 2 of 3 are always alive, so the work keeps going.
def test_restarts(zenith_env_builder: ZenithEnvBuilder):
    fault_probability = 0.01
    n_inserts = 1000
    n_acceptors = 3

    zenith_env_builder.num_safekeepers = n_acceptors
    env = zenith_env_builder.init()

    env.zenith_cli(["branch", "test_wal_acceptors_restarts", "main"])
    pg = env.postgres.create_start('test_wal_acceptors_restarts')

    # we rely upon autocommit after each statement
    # as waiting for acceptors happens there
    pg_conn = pg.connect()
    cur = pg_conn.cursor()

    failed_node = None
    cur.execute('CREATE TABLE t(key int primary key, value text)')
    for i in range(n_inserts):
        cur.execute("INSERT INTO t values (%s, 'payload');", (i + 1, ))

        if random.random() <= fault_probability:
            if failed_node is None:
                failed_node = env.safekeepers[random.randrange(0, n_acceptors)]
                failed_node.stop()
            else:
                failed_node.start()
                failed_node = None
    cur.execute('SELECT sum(key) FROM t')
    assert cur.fetchone() == (500500, )


start_delay_sec = 2


# Restart the given wal acceptor after a short delay; used by test_unavailability
# to bring a stopped acceptor back while an INSERT is blocked.
def delayed_wal_acceptor_start(wa):
    time.sleep(start_delay_sec)
    wa.start()


# When a majority of acceptors is offline, commits are expected to be frozen
def test_unavailability(zenith_env_builder: ZenithEnvBuilder):
    zenith_env_builder.num_safekeepers = 2
    env = zenith_env_builder.init()

    env.zenith_cli(["branch", "test_wal_acceptors_unavailability", "main"])
    pg = env.postgres.create_start('test_wal_acceptors_unavailability')

    # we rely upon autocommit after each statement
    # as waiting for acceptors happens there
    pg_conn = pg.connect()
    cur = pg_conn.cursor()

    # check basic work with table
    cur.execute('CREATE TABLE t(key int primary key, value text)')
    cur.execute("INSERT INTO t values (1, 'payload')")

    # shut down one of the two acceptors, which is a majority
    env.safekeepers[0].stop()

    proc = Process(target=delayed_wal_acceptor_start, args=(env.safekeepers[0], ))
    proc.start()

    start = time.time()
    cur.execute("INSERT INTO t values (2, 'payload')")
    # ensure that the query above was hanging while the acceptor was down
    assert (time.time() - start) >= start_delay_sec
    proc.join()

    # for the world's balance, do the same with the second acceptor
    env.safekeepers[1].stop()

    proc = Process(target=delayed_wal_acceptor_start, args=(env.safekeepers[1], ))
    proc.start()

    start = time.time()
    cur.execute("INSERT INTO t values (3, 'payload')")
    # ensure that the query above was hanging while the acceptor was down
    assert (time.time() - start) >= start_delay_sec
    proc.join()

    cur.execute("INSERT INTO t values (4, 'payload')")

    cur.execute('SELECT sum(key) FROM t')
    assert cur.fetchone() == (10, )


# shut down random subset of acceptors, sleep, wake them up, rinse, repeat
def xmas_garland(acceptors, stop):
    while not bool(stop.value):
        victims = []
        for wa in acceptors:
            if random.random() >= 0.5:
                victims.append(wa)
        for v in victims:
            v.stop()
        time.sleep(1)
        for v in victims:
            v.start()
        time.sleep(1)


# shared flag that the fixture sets on teardown, stopping xmas_garland
@pytest.fixture
def stop_value():
    stop = Value('i', 0)
    yield stop
    stop.value = 1


# do inserts while concurrently taking subsets of acceptors down and back up
def test_race_conditions(zenith_env_builder: ZenithEnvBuilder, stop_value):

    zenith_env_builder.num_safekeepers = 3
    env = zenith_env_builder.init()

    env.zenith_cli(["branch", "test_wal_acceptors_race_conditions", "main"])
    pg = env.postgres.create_start('test_wal_acceptors_race_conditions')

    # we rely upon autocommit after each statement
    # as waiting for acceptors happens there
    pg_conn = pg.connect()
    cur = pg_conn.cursor()

    cur.execute('CREATE TABLE t(key int primary key, value text)')

    proc = Process(target=xmas_garland, args=(env.safekeepers, stop_value))
    proc.start()

    for i in range(1000):
        cur.execute("INSERT INTO t values (%s, 'payload');", (i + 1, ))

    cur.execute('SELECT sum(key) FROM t')
    assert cur.fetchone() == (500500, )

    stop_value.value = 1
    proc.join()


class ProposerPostgres:
    """Object for running safekeepers sync with walproposer"""
    def __init__(self, env: ZenithEnv, pgdata_dir: str, pg_bin, timeline_id: str, tenant_id: str):
        self.env = env
        self.pgdata_dir: str = pgdata_dir
        self.pg_bin: PgBin = pg_bin
        self.timeline_id: str = timeline_id
        self.tenant_id: str = tenant_id

    def pg_data_dir_path(self) -> str:
        """ Path to data directory """
        return self.pgdata_dir

    def config_file_path(self) -> str:
        """ Path to postgresql.conf """
        return os.path.join(self.pgdata_dir, 'postgresql.conf')

    def create_dir_config(self, wal_acceptors: str):
        """ Create dir and config for running --sync-safekeepers """

        mkdir_if_needed(self.pg_data_dir_path())
        with open(self.config_file_path(), "w") as f:
            f.writelines([
                "synchronous_standby_names = 'walproposer'\n",
                f"zenith.zenith_timeline = '{self.timeline_id}'\n",
                f"zenith.zenith_tenant = '{self.tenant_id}'\n",
                f"wal_acceptors = '{wal_acceptors}'\n",
            ])

    def sync_safekeepers(self) -> str:
        """
        Run 'postgres --sync-safekeepers'.
        Returns execution result, which is commit_lsn after sync.
        """

        command = ["postgres", "--sync-safekeepers"]
        env = {
            "PGDATA": self.pg_data_dir_path(),
        }

        basepath = self.pg_bin.run_capture(command, env)
        stdout_filename = basepath + '.stdout'

        with open(stdout_filename, 'r') as stdout_f:
            stdout = stdout_f.read()
            return stdout.strip("\n ")


# insert WAL into all safekeepers and run sync on the proposer
def test_sync_safekeepers(zenith_env_builder: ZenithEnvBuilder, pg_bin: PgBin):

    # We don't really need the full environment for this test, just the
    # safekeepers would be enough.
    zenith_env_builder.num_safekeepers = 3
    env = zenith_env_builder.init()

    timeline_id = uuid.uuid4().hex
    tenant_id = uuid.uuid4().hex

    # write config for proposer
    pgdata_dir = os.path.join(env.repo_dir, "proposer_pgdata")
    pg = ProposerPostgres(env, pgdata_dir, pg_bin, timeline_id, tenant_id)
    pg.create_dir_config(env.get_safekeeper_connstrs())

    # a valid LSN, which is neither at the segment start nor in the zero segment
    epoch_start_lsn = 0x16B9188  # 0/16B9188
    begin_lsn = epoch_start_lsn

    # append and commit WAL
    lsn_after_append = []
    for i in range(3):
        res = env.safekeepers[i].append_logical_message(
            tenant_id,
            timeline_id,
            {
                "lm_prefix": "prefix",
                "lm_message": "message",
                "set_commit_lsn": True,
                "term": 2,
                "begin_lsn": begin_lsn,
                "epoch_start_lsn": epoch_start_lsn,
                "truncate_lsn": epoch_start_lsn,
            },
        )
        lsn_hex = lsn_to_hex(res["inserted_wal"]["end_lsn"])
        lsn_after_append.append(lsn_hex)
        log.info(f"safekeeper[{i}] lsn after append: {lsn_hex}")

    # run sync safekeepers
    lsn_after_sync = pg.sync_safekeepers()
    log.info(f"lsn after sync = {lsn_after_sync}")

    assert all(lsn_after_sync == lsn for lsn in lsn_after_append)


def test_timeline_status(zenith_env_builder: ZenithEnvBuilder):

    zenith_env_builder.num_safekeepers = 1
    env = zenith_env_builder.init()

    env.zenith_cli(["branch", "test_timeline_status", "main"])
    pg = env.postgres.create_start('test_timeline_status')

    wa = env.safekeepers[0]
    wa_http_cli = wa.http_client()
    wa_http_cli.check_status()

    # learn zenith timeline from compute
    tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0]
    timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0]

    # fetch something sensible from status
    epoch = wa_http_cli.timeline_status(tenant_id, timeline_id).acceptor_epoch

    pg.safe_psql("create table t(i int)")

    # ensure epoch goes up after reboot
    pg.stop().start()
    pg.safe_psql("insert into t values(10)")

    epoch_after_reboot = wa_http_cli.timeline_status(tenant_id, timeline_id).acceptor_epoch
    assert epoch_after_reboot > epoch