Fix segfault if pageserver connection is lost during backend startup.

It's not OK to return early from within a PG_TRY/PG_CATCH block. The
PG_TRY macro sets the global PG_exception_stack variable, and
PG_END_TRY restores it. If we jump out in between with "return NULL",
PG_exception_stack is left pointing to garbage. (I'm surprised the
comments on PG_TRY/PG_CATCH don't warn about this.)
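
For illustration only (this sketch is not part of the commit, and the
helper functions are hypothetical): PG_TRY() points PG_exception_stack
at a sigjmp_buf that lives in the current stack frame, and only
PG_END_TRY() restores the previous value, so control must fall through
to PG_END_TRY() rather than return:

    static void *
    try_receive_sketch(void)
    {
        void       *result = NULL;

        PG_TRY();
        {
            if (connection_lost())      /* hypothetical helper */
            {
                /*
                 * Returning NULL right here would skip PG_END_TRY(),
                 * leaving the global PG_exception_stack pointing into
                 * this stack frame after it is gone.  Leave result as
                 * NULL and fall through instead.
                 */
            }
            else
                result = read_one_message();    /* hypothetical helper */
        }
        PG_CATCH();
        {
            /* cleanup elided */
            PG_RE_THROW();
        }
        PG_END_TRY();

        return result;      /* PG_exception_stack has been restored */
    }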

Add a test that re-attaches the tenant in the pageserver while Postgres
is running. If the tenant is detached while the compute is connected
and busy running queries, those queries will fail if they try to fetch
any pages. But when the tenant is re-attached, things should start
working again, without disconnecting the client <-> postgres
connections. Without this fix, this test reproduced the segfault.

Fixes issue #3231
Heikki Linnakangas, 2023-01-05 14:45:28 +02:00
committed by Heikki Linnakangas
parent c187de1101
commit 8b710b9753
2 changed files with 169 additions and 16 deletions


@@ -243,29 +243,33 @@ pageserver_receive(void)
 PG_TRY();
 {
     /* read response */
-    resp_buff.len = call_PQgetCopyData(&resp_buff.data);
-    resp_buff.cursor = 0;
+    int rc;
 
-    if (resp_buff.len < 0)
+    rc = call_PQgetCopyData(&resp_buff.data);
+    if (rc >= 0)
     {
-        if (resp_buff.len == -1)
+        resp_buff.len = rc;
+        resp_buff.cursor = 0;
+
+        resp = nm_unpack_response(&resp_buff);
+        PQfreemem(resp_buff.data);
+
+        if (message_level_is_interesting(PageStoreTrace))
         {
-            pageserver_disconnect();
-            return NULL;
+            char *msg = nm_to_string((NeonMessage *) resp);
+            neon_log(PageStoreTrace, "got response: %s", msg);
+            pfree(msg);
         }
-        else if (resp_buff.len == -2)
-            neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn));
     }
-
-    resp = nm_unpack_response(&resp_buff);
-    PQfreemem(resp_buff.data);
-
-    if (message_level_is_interesting(PageStoreTrace))
+    else if (rc == -1)
     {
-        char *msg = nm_to_string((NeonMessage *) resp);
-        neon_log(PageStoreTrace, "got response: %s", msg);
-        pfree(msg);
+        pageserver_disconnect();
+        resp = NULL;
     }
+    else if (rc == -2)
+        neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn));
+    else
+        neon_log(ERROR, "unexpected PQgetCopyData return value: %d", rc);
 }
 PG_CATCH();
 {
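
With this change, a lost connection is reported by falling out of
PG_TRY() normally and returning NULL, so the backend's error-handling
state stays intact and the caller can simply try again once the tenant
is re-attached. A rough caller-side sketch (not taken from this commit;
the response type and the reconnect/resend helpers are assumptions):

    NeonResponse *resp;     /* assumed response type */

    while ((resp = pageserver_receive()) == NULL)
    {
        /* Connection to the pageserver was lost; reconnect and resend. */
        reconnect_to_pageserver();          /* hypothetical helper */
        resend_request(pending_request);    /* hypothetical helper */
    }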

test_runner/regress/test_tenant_detach.py

@@ -1,9 +1,13 @@
+import asyncio
+import random
 import time
 from threading import Thread
 
+import asyncpg
 import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
+    NeonEnv,
     NeonEnvBuilder,
     PageserverApiException,
     PageserverHttpClient,
@@ -12,6 +16,7 @@ from fixtures.neon_fixtures import (
     available_remote_storages,
     wait_for_last_record_lsn,
     wait_for_upload,
+    wait_until,
     wait_until_tenant_state,
 )
 from fixtures.types import Lsn, TenantId, TimelineId
@@ -84,6 +89,150 @@ def test_tenant_reattach(
     assert env.pageserver.log_contains(".*download.*failed, will retry.*")
 
 
+num_connections = 10
+num_rows = 100000
+
+updates_to_perform = 0
+updates_started = 0
+updates_finished = 0
+
+
+# Run random UPDATEs on test table. On failure, try again.
+async def update_table(pg_conn: asyncpg.Connection):
+    global updates_started, updates_finished, updates_to_perform
+
+    while updates_started < updates_to_perform or updates_to_perform == 0:
+        updates_started += 1
+        id = random.randrange(1, num_rows)
+
+        # Loop to retry until the UPDATE succeeds
+        while True:
+            try:
+                await pg_conn.fetchrow(f"UPDATE t SET counter = counter + 1 WHERE id = {id}")
+                updates_finished += 1
+                if updates_finished % 1000 == 0:
+                    log.info(f"update {updates_finished} / {updates_to_perform}")
+                break
+            except asyncpg.PostgresError as e:
+                # Received error from Postgres. Log it, sleep a little, and continue
+                log.info(f"UPDATE error: {e}")
+                await asyncio.sleep(0.1)
+
+
+async def sleep_and_reattach(pageserver_http: PageserverHttpClient, tenant_id: TenantId):
+    global updates_started, updates_finished, updates_to_perform
+
+    # Wait until we have performed some updates
+    wait_until(20, 0.5, lambda: updates_finished > 500)
+
+    log.info("Detaching tenant")
+    pageserver_http.tenant_detach(tenant_id)
+    await asyncio.sleep(1)
+    log.info("Re-attaching tenant")
+    pageserver_http.tenant_attach(tenant_id)
+    log.info("Re-attach finished")
+
+    # Continue with 5000 more updates
+    updates_to_perform = updates_started + 5000
+
+
+# async guts of test_tenant_reattach_while_busy test
+async def reattach_while_busy(
+    env: NeonEnv, pg: Postgres, pageserver_http: PageserverHttpClient, tenant_id: TenantId
+):
+    workers = []
+    for worker_id in range(num_connections):
+        pg_conn = await pg.connect_async()
+        workers.append(asyncio.create_task(update_table(pg_conn)))
+
+    workers.append(asyncio.create_task(sleep_and_reattach(pageserver_http, tenant_id)))
+
+    await asyncio.gather(*workers)
+
+    assert updates_finished == updates_to_perform
+
+
+# Detach and re-attach tenant, while compute is busy running queries.
+#
+# Some of the queries may fail, in the window that the tenant has been
+# detached but not yet re-attached. But Postgres itself should keep
+# running, and when we retry the queries, they should start working
+# after the attach has finished.
+# FIXME:
+#
+# This is pretty unstable at the moment. I've seen it fail with a warning like this:
+#
+# AssertionError: assert not ['2023-01-05T13:09:40.708303Z WARN remote_upload{tenant=c3fc41f6cf29a7626b90316e3518cd4b timeline=7978246f85faa71ab03...1282b/000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001716699-0000000001736681"\n']
+#
+# (https://neon-github-public-dev.s3.amazonaws.com/reports/pr-3232/debug/3846817847/index.html#suites/f9eba3cfdb71aa6e2b54f6466222829b/470fc62b5db7d7d7/)
+# I believe that failure happened because there is a race condition
+# between detach and starting remote upload tasks:
+#
+# 1. detach_timeline calls task_mgr::shutdown_tasks(), sending shutdown
+#    signal to all in-progress tasks associated with the tenant.
+# 2. Just after shutdown_tasks() has collected the list of tasks,
+#    a new remote-upload task is spawned.
+#
+# See https://github.com/neondatabase/neon/issues/3273
+#
+#
+# I also saw this failure:
+#
+# test_runner/regress/test_tenant_detach.py:194: in test_tenant_reattach_while_busy
+#     asyncio.run(reattach_while_busy(env, pg, pageserver_http, tenant_id))
+# /home/nonroot/.pyenv/versions/3.9.2/lib/python3.9/asyncio/runners.py:44: in run
+#     return loop.run_until_complete(main)
+# /home/nonroot/.pyenv/versions/3.9.2/lib/python3.9/asyncio/base_events.py:642: in run_until_complete
+#     return future.result()
+# test_runner/regress/test_tenant_detach.py:151: in reattach_while_busy
+#     assert updates_finished == updates_to_perform
+# E   assert 5010 == 10010
+# E     +5010
+# E     -10010
+#
+# I don't know what's causing that...
+@pytest.mark.skip(reason="fixme")
+@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
+def test_tenant_reattach_while_busy(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_tenant_reattach_while_busy",
+    )
+
+    env = neon_env_builder.init_start()
+
+    # Attempts to connect from compute to pageserver while the tenant is
+    # temporarily detached produce these errors in the pageserver log.
+    env.pageserver.allowed_errors.append(".*Tenant .* not found in the local state.*")
+    env.pageserver.allowed_errors.append(
+        ".*Tenant .* will not become active\\. Current state: Stopping.*"
+    )
+
+    pageserver_http = env.pageserver.http_client()
+
+    # create new tenant
+    tenant_id, timeline_id = env.neon_cli.create_tenant(
+        # Create layers aggressively
+        conf={"checkpoint_distance": "100000"}
+    )
+
+    pg = env.postgres.create_start("main", tenant_id=tenant_id)
+
+    cur = pg.connect().cursor()
+    cur.execute("CREATE TABLE t(id int primary key, counter int)")
+    cur.execute(f"INSERT INTO t SELECT generate_series(1,{num_rows}), 0")
+
+    # Run the test
+    asyncio.run(reattach_while_busy(env, pg, pageserver_http, tenant_id))
+
+    # Verify table contents
+    assert query_scalar(cur, "SELECT count(*) FROM t") == num_rows
+    assert query_scalar(cur, "SELECT sum(counter) FROM t") == updates_to_perform
+
+
 def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
     pageserver_http = env.pageserver.http_client()