Compare commits

...

6 Commits

Author SHA1 Message Date
Matthias van de Meent
f12bb1b957 Fix test, now really 2024-07-01 11:30:08 +02:00
Matthias van de Meent
a37b8c58bf More potential test fixes 2024-06-29 03:13:31 +02:00
Matthias van de Meent
cd4fdd75d8 Make sure we only delete a tenant we've created. 2024-06-29 03:13:31 +02:00
Matthias van de Meent
b237e83c0e Style fixes 2024-06-29 03:13:31 +02:00
Matthias van de Meent
4ab2b8f575 Implement testing, and add a call to cleanup that I'd missed. 2024-06-29 03:13:31 +02:00
Matthias van de Meent
088a743d39 Handle PS error responses cleanly
Before this, we'd assume the PS connection would be good once we got a response.
Now, we also check that the response was a positive one.

This means our exponential backoff policy now also applies to PS connections that
do connect but fail to get a pagestream, where previously it did not.
2024-06-29 03:13:31 +02:00
2 changed files with 103 additions and 0 deletions

View File

@@ -581,6 +581,8 @@ pageserver_connect(shardno_t shard_no, int elevel)
}
case PS_Connecting_PageStream:
{
PGresult *result;
neon_shard_log(shard_no, DEBUG5, "Connection state: Connecting_PageStream");
if (PQstatus(shard->conn) == CONNECTION_BAD)
@@ -620,6 +622,48 @@ pageserver_connect(shardno_t shard_no, int elevel)
}
}
}
result = PQgetResult(shard->conn);
if (!result)
{
CLEANUP_AND_DISCONNECT(shard);
neon_shard_log(shard_no, elevel,
"could not complete handshake: unexpected end of data");
return false;
}
switch (PQresultStatus(result))
{
/*
* We only expect COPY_BOTH, all other responses are indications of
* problems out of our control
*/
case PGRES_COPY_BOTH:
break;
case PGRES_BAD_RESPONSE:
case PGRES_NONFATAL_ERROR:
case PGRES_FATAL_ERROR:
CLEANUP_AND_DISCONNECT(shard);
neon_shard_log(shard_no, elevel,
"could not complete handshake: PageServer returned error: %s",
PQresultErrorMessage(result));
PQclear(result);
return false;
case PGRES_EMPTY_QUERY:
case PGRES_COMMAND_OK:
case PGRES_TUPLES_OK:
case PGRES_COPY_OUT:
case PGRES_COPY_IN:
case PGRES_SINGLE_TUPLE:
case PGRES_PIPELINE_SYNC:
case PGRES_PIPELINE_ABORTED:
CLEANUP_AND_DISCONNECT(shard);
neon_shard_log(shard_no, elevel,
"could not complete handshake: unexpected result type: %d",
PQresultStatus(result));
PQclear(result);
return false;
}
shard->state = PS_Connected;
/* fallthrough */

View File

@@ -0,0 +1,59 @@
from contextlib import closing
import pytest
from psycopg2.errors import QueryCanceled
"""
Test that we can handle broken pageservers correctly
"""
def test_pageserver_breaks_while_running(neon_simple_env):
    """
    Check that a compute reports pageserver handshake errors in its log.

    Steps:
      1. Create a tenant and a ``test_config`` branch, and start an endpoint
         on it with autovacuum disabled.
      2. While the tenant is still attached, create a table through the
         endpoint.
      3. Detach the tenant from the pageserver.
      4. On the same connection, set a 1s ``statement_timeout`` and read a
         relation that has not been touched yet; the query is expected to be
         cancelled (``QueryCanceled``).
      5. Stop the endpoint and require that its log contains the
         "could not complete handshake" error message.
    """
    env = neon_simple_env
    ps = env.pageserver
    ps_http = ps.http_client()
    ps_http.is_testing_enabled_or_skip()

    # Only the tenant id is needed; the timeline id is ignored.
    (tid, _) = env.neon_cli.create_tenant()
    env.neon_cli.create_branch("test_config", tenant_id=tid)
    ps.quiesce_tenants()

    # We don't want to have any racy behaviour with autovacuum IOs
    ep = env.endpoints.create_start(
        "test_config",
        tenant_id=tid,
        config_lines=[
            "autovacuum = off",
            "shared_buffers = 128MB",
        ],
    )

    with closing(ep.connect()) as conn:
        # tenant is still attached, no errors from PS
        with conn.cursor() as cur:
            cur.execute(
                """
                CREATE TABLE test1 AS
                SELECT id, sha256(id::text::bytea) payload
                FROM generate_series(1, 1024::bigint) p(id);
                """
            )

        # From here on the pageserver no longer serves this tenant.
        ps_http.tenant_detach(tid)

        with conn.cursor() as cur:
            # Bound the wait so the failing read ends in QueryCanceled
            # instead of hanging the test.
            cur.execute(
                """
                SET statement_timeout = '1s';
                """
            )

            with pytest.raises(QueryCanceled):
                # definitely uncached relation
                cur.execute(
                    """
                    SELECT count(*) FROM pg_rewrite;
                    """
                )

    ep.stop()

    # Assert on the lookup so a missing message fails the test.
    # NOTE(review): assumes log_contains returns a falsy value (e.g. None)
    # when nothing matches -- confirm against the fixture.
    assert ep.log_contains(
        """could not complete handshake: PageServer returned error: """
    )