Add test that repeatedly kills and restarts the pageserver.

When I originally wrote this test back in May, it caught or reproduced
several bugs, including #1731, #1740, #1751, and #707. I believe all of
those issues have been fixed now, but since this was a very fruitful test,
let's add it to the test suite.

We didn't commit this earlier because the test was very slow, especially
with a debug build. We've since changed the build options so that even
debug builds are not quite so slow anymore.
Author: Heikki Linnakangas
Date:   2022-09-06 13:00:40 +03:00
Commit: cf157ad8e4 (parent: f081419e68)
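
In essence, the new test alternates a full-table UPDATE with an immediate
pageserver kill and restart, and then checks that no committed updates were
lost. A condensed sketch of its core loop, abridged from the diff below:

    for i in range(1, 15):
        pg.safe_psql("UPDATE foo set updates = updates + 1")

        # Kill the pageserver without a graceful shutdown, then bring it back.
        env.pageserver.stop(immediate=True)
        env.pageserver.start()

        # Every committed update must still be visible after the restart.
        assert pg.safe_psql("SELECT sum(updates) FROM foo")[0][0] == i * 100000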

@@ -1,3 +1,6 @@
 from contextlib import closing
+
+import pytest
+
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder
@@ -38,9 +41,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
             log.info(f"shared_buffers is {row[0]}, table size {row[1]}")
             assert int(row[0]) < int(row[1])

-    # Stop and restart pageserver. This is a more or less graceful shutdown, although
-    # the page server doesn't currently have a shutdown routine so there's no difference
-    # between stopping and crashing.
+    # Stop the pageserver gracefully and restart it.
     env.pageserver.stop()
     env.pageserver.start()
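
A note on the two shutdown modes used in this file: plain stop() is the
graceful shutdown referred to in the comment above, while
stop(immediate=True), used by the new chaos test below, kills the process
outright to simulate a crash:

    env.pageserver.stop()                # graceful shutdown
    env.pageserver.stop(immediate=True)  # kill the process, simulating a crash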
@@ -58,3 +59,68 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
     # Stop the page server by force, and restart it
     env.pageserver.stop()
     env.pageserver.start()
+
+
+# Test that repeatedly kills and restarts the page server, while the
+# safekeeper and compute node keep running.
+@pytest.mark.timeout(540)
+def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_start()
+
+    # Use a tiny checkpoint distance, to create a lot of layers quickly.
+    # That allows us to stress the compaction and layer flushing logic more.
+    tenant, _ = env.neon_cli.create_tenant(
+        conf={
+            "checkpoint_distance": "5000000",
+        }
+    )
+    env.neon_cli.create_timeline("test_pageserver_chaos", tenant_id=tenant)
+    pg = env.postgres.create_start("test_pageserver_chaos", tenant_id=tenant)
+
+    # Create table, and insert some rows. Make it big enough that it doesn't fit in
+    # shared_buffers, otherwise the SELECT after restart will just return answer
+    # from shared_buffers without hitting the page server, which defeats the point
+    # of this test.
+    with closing(pg.connect()) as conn:
+        with conn.cursor() as cur:
+            cur.execute("CREATE TABLE foo (id int, t text, updates int)")
+            cur.execute("CREATE INDEX ON foo (id)")
+            cur.execute(
+                """
+                INSERT INTO foo
+                SELECT g, 'long string to consume some space' || g, 0
+                FROM generate_series(1, 100000) g
+                """
+            )
+
+            # Verify that the table is larger than shared_buffers
+            cur.execute(
+                """
+                select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size
+                from pg_settings where name = 'shared_buffers'
+                """
+            )
+            row = cur.fetchone()
+            assert row is not None
+            log.info(f"shared_buffers is {row[0]}, table size {row[1]}")
+            assert int(row[0]) < int(row[1])
+
+    # Update the whole table, then immediately kill and restart the pageserver
+    for i in range(1, 15):
+        pg.safe_psql("UPDATE foo set updates = updates + 1")
+
+        # This kills the pageserver immediately, to simulate a crash
+        env.pageserver.stop(immediate=True)
+        env.pageserver.start()
+
+        # Stopping the pageserver breaks the connection from the postgres backend to
+        # the page server, and causes the next query on the connection to fail. Start a new
+        # postgres connection too, to avoid that error. (Ideally, the compute node would
+        # handle that and retry internally, without propagating the error to the user, but
+        # currently it doesn't...)
+        pg_conn = pg.connect()
+        cur = pg_conn.cursor()
+
+        # Check that all the updates are visible
+        num_updates = pg.safe_psql("SELECT sum(updates) FROM foo")[0][0]
+        assert num_updates == i * 100000
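
As the comment in the loop notes, killing the pageserver breaks the compute
backend's connection to it, so the test simply opens a fresh connection on
each iteration. A client that had to survive such restarts on its own could
wrap queries in a reconnect-and-retry helper; a minimal sketch under that
assumption (query_with_retry is a hypothetical illustration, not part of
this commit):

    import time

    def query_with_retry(connect, sql, attempts=3, delay=0.5):
        # Hypothetical helper (not part of this commit): reopen the
        # connection and retry when a query fails, e.g. because the
        # pageserver was restarted underneath the compute node.
        last_error = None
        for _ in range(attempts):
            try:
                conn = connect()  # e.g. pg.connect in this test's fixtures
                try:
                    with conn.cursor() as cur:
                        cur.execute(sql)
                        return cur.fetchall()
                finally:
                    conn.close()
            except Exception as e:
                last_error = e
                time.sleep(delay)
        raise last_error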