mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-10 06:52:55 +00:00
Add test that repeatedly kills and restarts the pageserver.
This caught or reproduced several bugs when I originally wrote this test back in May, including #1731, #1740, #1751, and #707. I believe all the issues have been fixed now, but since this was a very fruitful test, let's add it to the test suite. We didn't commit this earlier, because the test was very slow especially with a debug build. We've since changed the build options so that even the debug builds are not quite so slow anymore.
This commit is contained in:
@@ -1,3 +1,6 @@
|
||||
from contextlib import closing
|
||||
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder
|
||||
|
||||
@@ -38,9 +41,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
|
||||
log.info(f"shared_buffers is {row[0]}, table size {row[1]}")
|
||||
assert int(row[0]) < int(row[1])
|
||||
|
||||
# Stop and restart pageserver. This is a more or less graceful shutdown, although
|
||||
# the page server doesn't currently have a shutdown routine so there's no difference
|
||||
# between stopping and crashing.
|
||||
# Stop the pageserver gracefully and restart it.
|
||||
env.pageserver.stop()
|
||||
env.pageserver.start()
|
||||
|
||||
@@ -58,3 +59,68 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
|
||||
# Stop the page server by force, and restart it
|
||||
env.pageserver.stop()
|
||||
env.pageserver.start()
|
||||
|
||||
|
||||
# Test that repeatedly kills and restarts the page server, while the
|
||||
# safekeeper and compute node keep running.
|
||||
@pytest.mark.timeout(540)
|
||||
def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder):
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
# Use a tiny checkpoint distance, to create a lot of layers quickly.
|
||||
# That allows us to stress the compaction and layer flushing logic more.
|
||||
tenant, _ = env.neon_cli.create_tenant(
|
||||
conf={
|
||||
"checkpoint_distance": "5000000",
|
||||
}
|
||||
)
|
||||
env.neon_cli.create_timeline("test_pageserver_chaos", tenant_id=tenant)
|
||||
pg = env.postgres.create_start("test_pageserver_chaos", tenant_id=tenant)
|
||||
|
||||
# Create table, and insert some rows. Make it big enough that it doesn't fit in
|
||||
# shared_buffers, otherwise the SELECT after restart will just return answer
|
||||
# from shared_buffers without hitting the page server, which defeats the point
|
||||
# of this test.
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("CREATE TABLE foo (id int, t text, updates int)")
|
||||
cur.execute("CREATE INDEX ON foo (id)")
|
||||
cur.execute(
|
||||
"""
|
||||
INSERT INTO foo
|
||||
SELECT g, 'long string to consume some space' || g, 0
|
||||
FROM generate_series(1, 100000) g
|
||||
"""
|
||||
)
|
||||
|
||||
# Verify that the table is larger than shared_buffers
|
||||
cur.execute(
|
||||
"""
|
||||
select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_ize
|
||||
from pg_settings where name = 'shared_buffers'
|
||||
"""
|
||||
)
|
||||
row = cur.fetchone()
|
||||
assert row is not None
|
||||
log.info(f"shared_buffers is {row[0]}, table size {row[1]}")
|
||||
assert int(row[0]) < int(row[1])
|
||||
|
||||
# Update the whole table, then immediately kill and restart the pageserver
|
||||
for i in range(1, 15):
|
||||
pg.safe_psql("UPDATE foo set updates = updates + 1")
|
||||
|
||||
# This kills the pageserver immediately, to simulate a crash
|
||||
env.pageserver.stop(immediate=True)
|
||||
env.pageserver.start()
|
||||
|
||||
# Stopping the pageserver breaks the connection from the postgres backend to
|
||||
# the page server, and causes the next query on the connection to fail. Start a new
|
||||
# postgres connection too, to avoid that error. (Ideally, the compute node would
|
||||
# handle that and retry internally, without propagating the error to the user, but
|
||||
# currently it doesn't...)
|
||||
pg_conn = pg.connect()
|
||||
cur = pg_conn.cursor()
|
||||
|
||||
# Check that all the updates are visible
|
||||
num_updates = pg.safe_psql("SELECT sum(updates) FROM foo")[0][0]
|
||||
assert num_updates == i * 100000
|
||||
|
||||
Reference in New Issue
Block a user