SELECT 💣(); (#8270)

## Problem
We want to be able to test how our infrastructure reacts on segfaults in
Postgres (for example, we collect cores, and get some required
logs/metrics, etc)

## Summary of changes
- Add `trigger_segfauls` function to `neon_test_utils` to trigger a
segfault in Postgres
- Add `trigger_panic` function to `neon_test_utils` to trigger SIGABRT
(by using `elog(PANIC, ...))
- Fix cleanup logic in regression tests in endpoint crashed
This commit is contained in:
Alexander Bayandin
2024-07-05 15:12:01 +01:00
committed by Vlad Lazar
parent a020f55a80
commit 94e9811ca4
6 changed files with 80 additions and 6 deletions

View File

@@ -7,7 +7,7 @@ OBJS = \
neontest.o
EXTENSION = neon_test_utils
DATA = neon_test_utils--1.2.sql
DATA = neon_test_utils--1.3.sql
PGFILEDESC = "neon_test_utils - helpers for neon testing and debugging"
PG_CONFIG = pg_config

View File

@@ -45,3 +45,21 @@ CREATE FUNCTION neon_xlogflush(lsn pg_lsn DEFAULT NULL)
RETURNS VOID
AS 'MODULE_PATHNAME', 'neon_xlogflush'
LANGUAGE C PARALLEL UNSAFE;
CREATE FUNCTION trigger_panic()
RETURNS VOID
AS 'MODULE_PATHNAME', 'trigger_panic'
LANGUAGE C PARALLEL UNSAFE;
CREATE FUNCTION trigger_segfault()
RETURNS VOID
AS 'MODULE_PATHNAME', 'trigger_segfault'
LANGUAGE C PARALLEL UNSAFE;
-- Alias for `trigger_segfault`, just because `SELECT 💣()` looks fun
CREATE OR REPLACE FUNCTION 💣() RETURNS void
LANGUAGE plpgsql AS $$
BEGIN
PERFORM trigger_segfault();
END;
$$;

View File

@@ -1,6 +1,6 @@
# neon_test_utils extension
comment = 'helpers for neon testing and debugging'
default_version = '1.2'
default_version = '1.3'
module_pathname = '$libdir/neon_test_utils'
relocatable = true
trusted = true

View File

@@ -42,6 +42,8 @@ PG_FUNCTION_INFO_V1(clear_buffer_cache);
PG_FUNCTION_INFO_V1(get_raw_page_at_lsn);
PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex);
PG_FUNCTION_INFO_V1(neon_xlogflush);
PG_FUNCTION_INFO_V1(trigger_panic);
PG_FUNCTION_INFO_V1(trigger_segfault);
/*
* Linkage to functions in neon module.
@@ -489,3 +491,24 @@ neon_xlogflush(PG_FUNCTION_ARGS)
XLogFlush(lsn);
PG_RETURN_VOID();
}
/*
* Function to trigger panic.
*/
Datum
trigger_panic(PG_FUNCTION_ARGS)
{
elog(PANIC, "neon_test_utils: panic");
PG_RETURN_VOID();
}
/*
* Function to trigger a segfault.
*/
Datum
trigger_segfault(PG_FUNCTION_ARGS)
{
int *ptr = NULL;
*ptr = 42;
PG_RETURN_VOID();
}

View File

@@ -943,6 +943,8 @@ class NeonEnvBuilder:
# if the test threw an exception, don't check for errors
# as a failing assertion would cause the cleanup below to fail
ps_assert_metric_no_errors=(exc_type is None),
# do not fail on endpoint errors to allow the rest of cleanup to proceed
fail_on_endpoint_errors=False,
)
cleanup_error = None
@@ -1214,11 +1216,11 @@ class NeonEnv:
for f in futs:
f.result()
def stop(self, immediate=False, ps_assert_metric_no_errors=False):
def stop(self, immediate=False, ps_assert_metric_no_errors=False, fail_on_endpoint_errors=True):
"""
After this method returns, there should be no child processes running.
"""
self.endpoints.stop_all()
self.endpoints.stop_all(fail_on_endpoint_errors)
# Stop storage controller before pageservers: we don't want it to spuriously
# detect a pageserver "failure" during test teardown
@@ -3900,9 +3902,17 @@ class EndpointFactory:
pageserver_id=pageserver_id,
)
def stop_all(self) -> "EndpointFactory":
def stop_all(self, fail_on_error=True) -> "EndpointFactory":
exception = None
for ep in self.endpoints:
ep.stop()
try:
ep.stop()
except Exception as e:
log.error(f"Failed to stop endpoint {ep.endpoint_id}: {e}")
exception = e
if fail_on_error and exception is not None:
raise exception
return self

View File

@@ -0,0 +1,23 @@
import pytest
from fixtures.neon_fixtures import NeonEnvBuilder
@pytest.mark.parametrize(
"sql_func",
[
"trigger_panic",
"trigger_segfault",
"💣", # calls `trigger_segfault` internally
],
)
def test_endpoint_crash(neon_env_builder: NeonEnvBuilder, sql_func: str):
"""
Test that triggering crash from neon_test_utils crashes the endpoint
"""
env = neon_env_builder.init_start()
env.neon_cli.create_branch("test_endpoint_crash")
endpoint = env.endpoints.create_start("test_endpoint_crash")
endpoint.safe_psql("CREATE EXTENSION neon_test_utils;")
with pytest.raises(Exception, match="This probably means the server terminated abnormally"):
endpoint.safe_psql(f"SELECT {sql_func}();")