# neon/test_runner/regress/test_pageserver_restart.py

from contextlib import closing

import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder


# Test restarting the page server, while the safekeeper and compute node
# keep running.
def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()
    env.neon_cli.create_branch("test_pageserver_restart")
    pg = env.postgres.create_start("test_pageserver_restart")

    pg_conn = pg.connect()
    cur = pg_conn.cursor()
    # Create a table and insert some rows. Make it big enough that it doesn't fit in
    # shared_buffers, otherwise the SELECT after restart would just return the answer
    # from shared_buffers without hitting the page server, which defeats the point
    # of this test.
    cur.execute("CREATE TABLE foo (t text)")
    cur.execute(
        """
        INSERT INTO foo
        SELECT 'long string to consume some space' || g
        FROM generate_series(1, 100000) g
        """
    )
    # Verify that the table is larger than shared_buffers
    cur.execute(
        """
        select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size
        from pg_settings where name = 'shared_buffers'
        """
    )
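    # (A note on the arithmetic above: pg_settings reports shared_buffers as a
    # number of buffer pages, e.g. setting=16384 with unit='8kB', and
    # pg_size_bytes('8kB') is 8192, so the product is the total size in bytes.)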
    row = cur.fetchone()
    assert row is not None
    log.info(f"shared_buffers is {row[0]}, table size {row[1]}")
    assert int(row[0]) < int(row[1])
    # Stop the pageserver gracefully and restart it.
    env.pageserver.stop()
    env.pageserver.start()

    # Stopping the pageserver breaks the connection from the postgres backend to
    # the page server, and causes the next query on the connection to fail. Start a
    # new postgres connection too, to avoid that error. (Ideally, the compute node
    # would handle that and retry internally, without propagating the error to the
    # user, but currently it doesn't...)
    pg_conn = pg.connect()
    cur = pg_conn.cursor()

    cur.execute("SELECT count(*) FROM foo")
    assert cur.fetchone() == (100000,)
    # Restart the server again, but delay the loading of tenants, and test what the
    # pageserver does if a compute node connects and sends a request for the tenant
    # while it's still in Loading state. (It waits for the loading to finish, and then
    # processes the request.)
    env.pageserver.stop()
    env.pageserver.start(extra_env_vars={"FAILPOINTS": "before-loading-tenant=return(5000)"})
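    # (The FAILPOINTS value uses the Rust `fail` crate's config syntax:
    # "before-loading-tenant" names the failpoint, and "return(5000)" makes it
    # yield the value 5000, which the pageserver interprets here as a 5000 ms
    # delay before loading the tenant, per the comment above.)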
    # Check that it's in Loading state
    client = env.pageserver.http_client()
    tenant_status = client.tenant_status(env.initial_tenant)
    log.info("Tenant status: %s", tenant_status)
    assert tenant_status["state"] == "Loading"
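    # (This assert implicitly relies on the 5000 ms failpoint delay: without it,
    # the tenant could finish loading before the status request arrives, and the
    # state would no longer be "Loading".)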
    # Try to read. This waits until the loading finishes, and then returns normally.
    pg_conn = pg.connect()
    cur = pg_conn.cursor()
    cur.execute("SELECT count(*) FROM foo")
    assert cur.fetchone() == (100000,)


# Test that repeatedly kills and restarts the page server, while the
# safekeeper and compute node keep running.
@pytest.mark.timeout(540)
def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()

    # These warnings are expected, when the pageserver is restarted abruptly
    env.pageserver.allowed_errors.append(".*found future image layer.*")
    env.pageserver.allowed_errors.append(".*found future delta layer.*")
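    # (Presumably these happen because a kill can leave layer files on disk
    # that are ahead of the metadata that survived the crash; on restart the
    # pageserver notices the "future" layers and warns about them.)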

    # Use a tiny checkpoint distance, to create a lot of layers quickly.
    # That allows us to stress the compaction and layer flushing logic more.
    tenant, _ = env.neon_cli.create_tenant(
        conf={
            "checkpoint_distance": "5000000",
        }
    )
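    # (For scale: 5000000 bytes is about 5 MB, far below the pageserver's
    # default checkpoint_distance, so layer files get flushed much more often.)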
    env.neon_cli.create_timeline("test_pageserver_chaos", tenant_id=tenant)
    pg = env.postgres.create_start("test_pageserver_chaos", tenant_id=tenant)
    # Create a table and insert some rows. Make it big enough that it doesn't fit in
    # shared_buffers, otherwise the SELECT after restart would just return the answer
    # from shared_buffers without hitting the page server, which defeats the point
    # of this test.
    with closing(pg.connect()) as conn:
        with conn.cursor() as cur:
            cur.execute("CREATE TABLE foo (id int, t text, updates int)")
            cur.execute("CREATE INDEX ON foo (id)")
            cur.execute(
                """
                INSERT INTO foo
                SELECT g, 'long string to consume some space' || g, 0
                FROM generate_series(1, 100000) g
                """
            )

            # Verify that the table is larger than shared_buffers
            cur.execute(
                """
                select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size
                from pg_settings where name = 'shared_buffers'
                """
            )
            row = cur.fetchone()
            assert row is not None
            log.info(f"shared_buffers is {row[0]}, table size {row[1]}")
            assert int(row[0]) < int(row[1])
    # Update the whole table, then immediately kill and restart the pageserver
    for i in range(1, 15):
        pg.safe_psql("UPDATE foo set updates = updates + 1")

        # This kills the pageserver immediately, to simulate a crash
        env.pageserver.stop(immediate=True)
        env.pageserver.start()

        # Stopping the pageserver breaks any open connection from a postgres
        # backend to the page server, and causes the next query on that
        # connection to fail. pg.safe_psql() opens a fresh connection for each
        # statement, so the queries in this loop are not affected. (Ideally,
        # the compute node would handle a broken page server connection and
        # retry internally, without propagating the error to the user, but
        # currently it doesn't...)

        # Check that all the updates are visible
        num_updates = pg.safe_psql("SELECT sum(updates) FROM foo")[0][0]
        assert num_updates == i * 100000
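

# A minimal sketch of how one might run these tests locally (assuming a plain
# pytest invocation works for this suite; the repository may provide its own
# wrapper script with extra setup):
#
#   pytest test_runner/regress/test_pageserver_restart.py -k test_pageserver_restart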