mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-04 14:00:38 +00:00
Rework PageStream connection state handling: (#7611)
* Make PS connection startup use async APIs This allows for improved query cancellation when we start connections * Make PS connections have per-shard connection retry state. Previously they shared global backoff state, which is bad for quickly getting all connections started and/or back online. * Make sure we clean up most connection state on failed connections. Previously, we could technically leak some resources that we'd otherwise clean up. Now, the resources are correctly cleaned up. * pagestore_smgr.c now PANICs on unexpected response message types. Unexpected responses are likely a symptom of having a desynchronized view of the connection state. As a desynchronized connection state can cause corruption, we PANIC, as we don't know what data may have been written to buffers: the only solution is to fail fast & hope we didn't write wrong data. * Catch errors in sync pagestream request handling. Previously, if a query was cancelled after a message was sent to the pageserver, but before the data was received, the backend could forget that it sent the synchronous request, and let others deal with the repercussions. This could then lead to incorrect responses, or errors such as "unexpected response from page server with tag 0x68"
This commit is contained in:
@@ -260,6 +260,8 @@ async fn page_service_conn_main(
|
||||
socket.set_timeout(Some(std::time::Duration::from_millis(socket_timeout_ms)));
|
||||
let socket = std::pin::pin!(socket);
|
||||
|
||||
fail::fail_point!("ps::connection-start::pre-login");
|
||||
|
||||
// XXX: pgbackend.run() should take the connection_ctx,
|
||||
// and create a child per-query context when it invokes process_query.
|
||||
// But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
|
||||
@@ -603,6 +605,7 @@ impl PageServerHandler {
|
||||
};
|
||||
|
||||
trace!("query: {copy_data_bytes:?}");
|
||||
fail::fail_point!("ps::handle-pagerequest-message");
|
||||
|
||||
// Trace request if needed
|
||||
if let Some(t) = tracer.as_mut() {
|
||||
@@ -617,6 +620,7 @@ impl PageServerHandler {
|
||||
|
||||
let (response, span) = match neon_fe_msg {
|
||||
PagestreamFeMessage::Exists(req) => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::exists");
|
||||
let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn);
|
||||
(
|
||||
self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx)
|
||||
@@ -626,6 +630,7 @@ impl PageServerHandler {
|
||||
)
|
||||
}
|
||||
PagestreamFeMessage::Nblocks(req) => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::nblocks");
|
||||
let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn);
|
||||
(
|
||||
self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx)
|
||||
@@ -635,6 +640,7 @@ impl PageServerHandler {
|
||||
)
|
||||
}
|
||||
PagestreamFeMessage::GetPage(req) => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::getpage");
|
||||
// shard_id is filled in by the handler
|
||||
let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.request_lsn);
|
||||
(
|
||||
@@ -645,6 +651,7 @@ impl PageServerHandler {
|
||||
)
|
||||
}
|
||||
PagestreamFeMessage::DbSize(req) => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::dbsize");
|
||||
let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn);
|
||||
(
|
||||
self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx)
|
||||
@@ -654,6 +661,7 @@ impl PageServerHandler {
|
||||
)
|
||||
}
|
||||
PagestreamFeMessage::GetSlruSegment(req) => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::slrusegment");
|
||||
let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn);
|
||||
(
|
||||
self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx)
|
||||
@@ -1505,6 +1513,7 @@ where
|
||||
_pgb: &mut PostgresBackend<IO>,
|
||||
_sm: &FeStartupPacket,
|
||||
) -> Result<(), QueryError> {
|
||||
fail::fail_point!("ps::connection-start::startup-packet");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1519,6 +1528,8 @@ where
|
||||
Err(QueryError::SimulatedConnectionError)
|
||||
});
|
||||
|
||||
fail::fail_point!("ps::connection-start::process-query");
|
||||
|
||||
let ctx = self.connection_ctx.attached_child();
|
||||
debug!("process query {query_string:?}");
|
||||
let parts = query_string.split_whitespace().collect::<Vec<_>>();
|
||||
|
||||
@@ -51,7 +51,6 @@ int flush_every_n_requests = 8;
|
||||
|
||||
int neon_protocol_version = 2;
|
||||
|
||||
static int n_reconnect_attempts = 0;
|
||||
static int max_reconnect_attempts = 60;
|
||||
static int stripe_size;
|
||||
|
||||
@@ -95,18 +94,44 @@ static shmem_startup_hook_type prev_shmem_startup_hook;
|
||||
static PagestoreShmemState *pagestore_shared;
|
||||
static uint64 pagestore_local_counter = 0;
|
||||
|
||||
typedef enum PSConnectionState {
|
||||
PS_Disconnected, /* no connection yet */
|
||||
PS_Connecting_Startup, /* connection starting up */
|
||||
PS_Connecting_PageStream, /* negotiating pagestream */
|
||||
PS_Connected, /* connected, pagestream established */
|
||||
} PSConnectionState;
|
||||
|
||||
/* This backend's per-shard connections */
|
||||
typedef struct
|
||||
{
|
||||
PGconn *conn;
|
||||
TimestampTz last_connect_time; /* read-only debug value */
|
||||
TimestampTz last_reconnect_time;
|
||||
uint32 delay_us;
|
||||
int n_reconnect_attempts;
|
||||
|
||||
/*---
|
||||
* WaitEventSet containing:
|
||||
* - WL_SOCKET_READABLE on 'conn'
|
||||
* - WL_LATCH_SET on MyLatch, and
|
||||
* - WL_EXIT_ON_PM_DEATH.
|
||||
* Pageserver connection state, i.e.
|
||||
* disconnected: conn == NULL, wes == NULL;
|
||||
* conn_startup: connection initiated, waiting for connection establishing
|
||||
* conn_ps: PageStream query sent, waiting for confirmation
|
||||
* connected: PageStream established
|
||||
*/
|
||||
WaitEventSet *wes;
|
||||
PSConnectionState state;
|
||||
PGconn *conn;
|
||||
/*---
|
||||
* WaitEventSet containing:
|
||||
* - WL_SOCKET_READABLE on 'conn'
|
||||
* - WL_LATCH_SET on MyLatch, and
|
||||
* - WL_EXIT_ON_PM_DEATH.
|
||||
*/
|
||||
WaitEventSet *wes_read;
|
||||
/*---
|
||||
* WaitEventSet containing:
|
||||
* - WL_SOCKET_WRITABLE on 'conn'
|
||||
* - WL_LATCH_SET on MyLatch, and
|
||||
* - WL_EXIT_ON_PM_DEATH.
|
||||
*/
|
||||
WaitEventSet *wes_write;
|
||||
} PageServer;
|
||||
|
||||
static PageServer page_servers[MAX_SHARDS];
|
||||
@@ -303,119 +328,269 @@ get_shard_number(BufferTag *tag)
|
||||
return hash % n_shards;
|
||||
}
|
||||
|
||||
static inline void
|
||||
CLEANUP_AND_DISCONNECT(PageServer *shard)
|
||||
{
|
||||
if (shard->wes_read)
|
||||
{
|
||||
FreeWaitEventSet(shard->wes_read);
|
||||
shard->wes_read = NULL;
|
||||
}
|
||||
if (shard->wes_write)
|
||||
{
|
||||
FreeWaitEventSet(shard->wes_write);
|
||||
shard->wes_write = NULL;
|
||||
}
|
||||
if (shard->conn)
|
||||
{
|
||||
PQfinish(shard->conn);
|
||||
shard->conn = NULL;
|
||||
}
|
||||
|
||||
shard->state = PS_Disconnected;
|
||||
}
|
||||
|
||||
/*
|
||||
* Connect to a pageserver, or continue to try to connect if we're yet to
|
||||
* complete the connection (e.g. due to receiving an earlier cancellation
|
||||
* during connection start).
|
||||
* Returns true if successfully connected; false if the connection failed.
|
||||
*
|
||||
* Throws errors in unrecoverable situations, or when this backend's query
|
||||
* is canceled.
|
||||
*/
|
||||
static bool
|
||||
pageserver_connect(shardno_t shard_no, int elevel)
|
||||
{
|
||||
char *query;
|
||||
int ret;
|
||||
const char *keywords[3];
|
||||
const char *values[3];
|
||||
int n;
|
||||
PGconn *conn;
|
||||
WaitEventSet *wes;
|
||||
PageServer *shard = &page_servers[shard_no];
|
||||
char connstr[MAX_PAGESERVER_CONNSTRING_SIZE];
|
||||
|
||||
static TimestampTz last_connect_time = 0;
|
||||
static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC;
|
||||
TimestampTz now;
|
||||
uint64_t us_since_last_connect;
|
||||
bool broke_from_loop = false;
|
||||
|
||||
Assert(page_servers[shard_no].conn == NULL);
|
||||
|
||||
/*
|
||||
* Get the connection string for this shard. If the shard map has been
|
||||
* updated since we last looked, this will also disconnect any existing
|
||||
* pageserver connections as a side effect.
|
||||
* Note that connstr is used both during connection start, and when we
|
||||
* log the successful connection.
|
||||
*/
|
||||
load_shard_map(shard_no, connstr, NULL);
|
||||
|
||||
now = GetCurrentTimestamp();
|
||||
us_since_last_connect = now - last_connect_time;
|
||||
if (us_since_last_connect < MAX_RECONNECT_INTERVAL_USEC)
|
||||
switch (shard->state)
|
||||
{
|
||||
pg_usleep(delay_us);
|
||||
delay_us *= 2;
|
||||
}
|
||||
else
|
||||
case PS_Disconnected:
|
||||
{
|
||||
delay_us = MIN_RECONNECT_INTERVAL_USEC;
|
||||
}
|
||||
const char *keywords[3];
|
||||
const char *values[3];
|
||||
int n_pgsql_params;
|
||||
TimestampTz now;
|
||||
int64 us_since_last_attempt;
|
||||
|
||||
/*
|
||||
* Connect using the connection string we got from the
|
||||
* neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
|
||||
* variable was set, use that as the password.
|
||||
*
|
||||
* The connection options are parsed in the order they're given, so when
|
||||
* we set the password before the connection string, the connection string
|
||||
* can override the password from the env variable. Seems useful, although
|
||||
* we don't currently use that capability anywhere.
|
||||
*/
|
||||
n = 0;
|
||||
if (neon_auth_token)
|
||||
{
|
||||
keywords[n] = "password";
|
||||
values[n] = neon_auth_token;
|
||||
n++;
|
||||
/* Make sure we start with a clean slate */
|
||||
CLEANUP_AND_DISCONNECT(shard);
|
||||
|
||||
neon_shard_log(shard_no, DEBUG5, "Connection state: Disconnected");
|
||||
|
||||
now = GetCurrentTimestamp();
|
||||
us_since_last_attempt = (int64) (now - shard->last_reconnect_time);
|
||||
shard->last_reconnect_time = now;
|
||||
|
||||
/*
|
||||
* If we did other tasks between reconnect attempts, then we won't
|
||||
* need to wait as long as a full delay.
|
||||
*/
|
||||
if (us_since_last_attempt < shard->delay_us)
|
||||
{
|
||||
pg_usleep(shard->delay_us - us_since_last_attempt);
|
||||
}
|
||||
|
||||
/* update the delay metric */
|
||||
shard->delay_us = Min(shard->delay_us * 2, MAX_RECONNECT_INTERVAL_USEC);
|
||||
|
||||
/*
|
||||
* Connect using the connection string we got from the
|
||||
* neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
|
||||
* variable was set, use that as the password.
|
||||
*
|
||||
* The connection options are parsed in the order they're given, so when
|
||||
* we set the password before the connection string, the connection string
|
||||
* can override the password from the env variable. Seems useful, although
|
||||
* we don't currently use that capability anywhere.
|
||||
*/
|
||||
keywords[0] = "dbname";
|
||||
values[0] = connstr;
|
||||
n_pgsql_params = 1;
|
||||
|
||||
if (neon_auth_token)
|
||||
{
|
||||
keywords[1] = "password";
|
||||
values[1] = neon_auth_token;
|
||||
n_pgsql_params++;
|
||||
}
|
||||
|
||||
keywords[n_pgsql_params] = NULL;
|
||||
values[n_pgsql_params] = NULL;
|
||||
|
||||
shard->conn = PQconnectStartParams(keywords, values, 1);
|
||||
if (!shard->conn)
|
||||
{
|
||||
neon_shard_log(shard_no, elevel, "Failed to connect to pageserver: out of memory");
|
||||
return false;
|
||||
}
|
||||
|
||||
shard->wes_read = CreateWaitEventSet(TopMemoryContext, 3);
|
||||
AddWaitEventToSet(shard->wes_read, WL_LATCH_SET, PGINVALID_SOCKET,
|
||||
MyLatch, NULL);
|
||||
AddWaitEventToSet(shard->wes_read, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
|
||||
NULL, NULL);
|
||||
AddWaitEventToSet(shard->wes_read, WL_SOCKET_READABLE, PQsocket(shard->conn), NULL, NULL);
|
||||
|
||||
shard->wes_write = CreateWaitEventSet(TopMemoryContext, 3);
|
||||
AddWaitEventToSet(shard->wes_write, WL_LATCH_SET, PGINVALID_SOCKET,
|
||||
MyLatch, NULL);
|
||||
AddWaitEventToSet(shard->wes_write, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
|
||||
NULL, NULL);
|
||||
AddWaitEventToSet(shard->wes_write, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE,
|
||||
PQsocket(shard->conn),
|
||||
NULL, NULL);
|
||||
|
||||
shard->state = PS_Connecting_Startup;
|
||||
/* fallthrough */
|
||||
}
|
||||
keywords[n] = "dbname";
|
||||
values[n] = connstr;
|
||||
n++;
|
||||
keywords[n] = NULL;
|
||||
values[n] = NULL;
|
||||
n++;
|
||||
conn = PQconnectdbParams(keywords, values, 1);
|
||||
last_connect_time = GetCurrentTimestamp();
|
||||
|
||||
if (PQstatus(conn) == CONNECTION_BAD)
|
||||
case PS_Connecting_Startup:
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(conn));
|
||||
char *pagestream_query;
|
||||
int ps_send_query_ret;
|
||||
bool connected = false;
|
||||
|
||||
PQfinish(conn);
|
||||
neon_shard_log(shard_no, DEBUG5, "Connection state: Connecting_Startup");
|
||||
|
||||
ereport(elevel,
|
||||
(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
|
||||
errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
|
||||
errdetail_internal("%s", msg)));
|
||||
pfree(msg);
|
||||
return false;
|
||||
}
|
||||
switch (neon_protocol_version)
|
||||
{
|
||||
do
|
||||
{
|
||||
WaitEvent event;
|
||||
int poll_result = PQconnectPoll(shard->conn);
|
||||
|
||||
switch (poll_result)
|
||||
{
|
||||
default: /* unknown/unused states are handled as a failed connection */
|
||||
case PGRES_POLLING_FAILED:
|
||||
{
|
||||
char *pqerr = PQerrorMessage(shard->conn);
|
||||
char *msg = NULL;
|
||||
neon_shard_log(shard_no, DEBUG5, "POLLING_FAILED");
|
||||
|
||||
if (pqerr)
|
||||
msg = pchomp(pqerr);
|
||||
|
||||
CLEANUP_AND_DISCONNECT(shard);
|
||||
|
||||
if (msg)
|
||||
{
|
||||
neon_shard_log(shard_no, elevel,
|
||||
"could not connect to pageserver: %s",
|
||||
msg);
|
||||
pfree(msg);
|
||||
}
|
||||
else
|
||||
neon_shard_log(shard_no, elevel,
|
||||
"could not connect to pageserver");
|
||||
|
||||
return false;
|
||||
}
|
||||
case PGRES_POLLING_READING:
|
||||
/* Sleep until there's something to do */
|
||||
(void) WaitEventSetWait(shard->wes_read, -1L, &event, 1,
|
||||
PG_WAIT_EXTENSION);
|
||||
ResetLatch(MyLatch);
|
||||
|
||||
/* query cancellation, backend shutdown */
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
|
||||
/* PQconnectPoll() handles the socket polling state updates */
|
||||
|
||||
break;
|
||||
case PGRES_POLLING_WRITING:
|
||||
/* Sleep until there's something to do */
|
||||
(void) WaitEventSetWait(shard->wes_write, -1L, &event, 1,
|
||||
PG_WAIT_EXTENSION);
|
||||
ResetLatch(MyLatch);
|
||||
|
||||
/* query cancellation, backend shutdown */
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
|
||||
/* PQconnectPoll() handles the socket polling state updates */
|
||||
|
||||
break;
|
||||
case PGRES_POLLING_OK:
|
||||
neon_shard_log(shard_no, DEBUG5, "POLLING_OK");
|
||||
connected = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
while (!connected);
|
||||
|
||||
/* No more polling needed; connection succeeded */
|
||||
shard->last_connect_time = GetCurrentTimestamp();
|
||||
|
||||
switch (neon_protocol_version)
|
||||
{
|
||||
case 2:
|
||||
query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline);
|
||||
pagestream_query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline);
|
||||
break;
|
||||
case 1:
|
||||
query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
|
||||
pagestream_query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
|
||||
break;
|
||||
default:
|
||||
elog(ERROR, "unexpected neon_protocol_version %d", neon_protocol_version);
|
||||
}
|
||||
ret = PQsendQuery(conn, query);
|
||||
pfree(query);
|
||||
if (ret != 1)
|
||||
{
|
||||
PQfinish(conn);
|
||||
neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
wes = CreateWaitEventSet(TopMemoryContext, 3);
|
||||
AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET,
|
||||
MyLatch, NULL);
|
||||
AddWaitEventToSet(wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
|
||||
NULL, NULL);
|
||||
AddWaitEventToSet(wes, WL_SOCKET_READABLE, PQsocket(conn), NULL, NULL);
|
||||
if (PQstatus(shard->conn) == CONNECTION_BAD)
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(shard->conn));
|
||||
|
||||
PG_TRY();
|
||||
CLEANUP_AND_DISCONNECT(shard);
|
||||
|
||||
ereport(elevel,
|
||||
(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
|
||||
errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
|
||||
errdetail_internal("%s", msg)));
|
||||
pfree(msg);
|
||||
return false;
|
||||
}
|
||||
|
||||
ps_send_query_ret = PQsendQuery(shard->conn, pagestream_query);
|
||||
pfree(pagestream_query);
|
||||
if (ps_send_query_ret != 1)
|
||||
{
|
||||
CLEANUP_AND_DISCONNECT(shard);
|
||||
|
||||
neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver");
|
||||
return false;
|
||||
}
|
||||
|
||||
shard->state = PS_Connecting_PageStream;
|
||||
/* fallthrough */
|
||||
}
|
||||
case PS_Connecting_PageStream:
|
||||
{
|
||||
while (PQisBusy(conn))
|
||||
neon_shard_log(shard_no, DEBUG5, "Connection state: Connecting_PageStream");
|
||||
|
||||
if (PQstatus(shard->conn) == CONNECTION_BAD)
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(shard->conn));
|
||||
CLEANUP_AND_DISCONNECT(shard);
|
||||
ereport(elevel,
|
||||
(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
|
||||
errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
|
||||
errdetail_internal("%s", msg)));
|
||||
pfree(msg);
|
||||
return false;
|
||||
}
|
||||
|
||||
while (PQisBusy(shard->conn))
|
||||
{
|
||||
WaitEvent event;
|
||||
|
||||
/* Sleep until there's something to do */
|
||||
(void) WaitEventSetWait(wes, -1L, &event, 1, PG_WAIT_EXTENSION);
|
||||
(void) WaitEventSetWait(shard->wes_read, -1L, &event, 1, PG_WAIT_EXTENSION);
|
||||
ResetLatch(MyLatch);
|
||||
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
@@ -423,40 +598,37 @@ pageserver_connect(shardno_t shard_no, int elevel)
|
||||
/* Data available in socket? */
|
||||
if (event.events & WL_SOCKET_READABLE)
|
||||
{
|
||||
if (!PQconsumeInput(conn))
|
||||
if (!PQconsumeInput(shard->conn))
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(conn));
|
||||
|
||||
PQfinish(conn);
|
||||
FreeWaitEventSet(wes);
|
||||
char *msg = pchomp(PQerrorMessage(shard->conn));
|
||||
|
||||
CLEANUP_AND_DISCONNECT(shard);
|
||||
neon_shard_log(shard_no, elevel, "could not complete handshake with pageserver: %s",
|
||||
msg);
|
||||
/* Returning from inside PG_TRY is bad, so we break/return later */
|
||||
broke_from_loop = true;
|
||||
break;
|
||||
pfree(msg);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
PG_CATCH();
|
||||
{
|
||||
PQfinish(conn);
|
||||
FreeWaitEventSet(wes);
|
||||
PG_RE_THROW();
|
||||
}
|
||||
PG_END_TRY();
|
||||
|
||||
if (broke_from_loop)
|
||||
{
|
||||
return false;
|
||||
shard->state = PS_Connected;
|
||||
/* fallthrough */
|
||||
}
|
||||
case PS_Connected:
|
||||
/*
|
||||
* We successfully connected. Future connections to this PageServer
|
||||
* will do fast retries again, with exponential backoff.
|
||||
*/
|
||||
shard->delay_us = MIN_RECONNECT_INTERVAL_USEC;
|
||||
|
||||
neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s' with protocol version %d", connstr, neon_protocol_version);
|
||||
page_servers[shard_no].conn = conn;
|
||||
page_servers[shard_no].wes = wes;
|
||||
|
||||
return true;
|
||||
neon_shard_log(shard_no, DEBUG5, "Connection state: Connected");
|
||||
neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s' with protocol version %d", connstr, neon_protocol_version);
|
||||
return true;
|
||||
default:
|
||||
neon_shard_log(shard_no, ERROR, "libpagestore: invalid connection state %d", shard->state);
|
||||
}
|
||||
/* This shouldn't be hit */
|
||||
Assert(false);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -476,7 +648,7 @@ retry:
|
||||
WaitEvent event;
|
||||
|
||||
/* Sleep until there's something to do */
|
||||
(void) WaitEventSetWait(page_servers[shard_no].wes, -1L, &event, 1, PG_WAIT_EXTENSION);
|
||||
(void) WaitEventSetWait(page_servers[shard_no].wes_read, -1L, &event, 1, PG_WAIT_EXTENSION);
|
||||
ResetLatch(MyLatch);
|
||||
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
@@ -502,7 +674,8 @@ retry:
|
||||
|
||||
/*
|
||||
* Reset prefetch and drop connection to the shard.
|
||||
* It also drops connection to all other shards involved in prefetch.
|
||||
* It also drops connection to all other shards involved in prefetch, through
|
||||
* prefetch_on_ps_disconnect().
|
||||
*/
|
||||
static void
|
||||
pageserver_disconnect(shardno_t shard_no)
|
||||
@@ -512,9 +685,6 @@ pageserver_disconnect(shardno_t shard_no)
|
||||
* whole prefetch queue, even for other pageservers. It should not
|
||||
* cause big problems, because connection loss is supposed to be a
|
||||
* rare event.
|
||||
*
|
||||
* Prefetch state should be reset even if page_servers[shard_no].conn == NULL,
|
||||
* because prefetch request may be registered before connection is established.
|
||||
*/
|
||||
prefetch_on_ps_disconnect();
|
||||
|
||||
@@ -527,37 +697,36 @@ pageserver_disconnect(shardno_t shard_no)
|
||||
static void
|
||||
pageserver_disconnect_shard(shardno_t shard_no)
|
||||
{
|
||||
PageServer *shard = &page_servers[shard_no];
|
||||
/*
|
||||
* If anything goes wrong while we were sending a request, it's not clear
|
||||
* what state the connection is in. For example, if we sent the request
|
||||
* but didn't receive a response yet, we might receive the response some
|
||||
* time later after we have already sent a new unrelated request. Close
|
||||
* the connection to avoid getting confused.
|
||||
* Similarly, even when we're in PS_DISCONNECTED, we may have junk to
|
||||
* clean up: It is possible that we encountered an error allocating any
|
||||
* of the wait event sets or the psql connection, or failed when we tried
|
||||
* to attach wait events to the WaitEventSets.
|
||||
*/
|
||||
if (page_servers[shard_no].conn)
|
||||
{
|
||||
neon_shard_log(shard_no, LOG, "dropping connection to page server due to error");
|
||||
PQfinish(page_servers[shard_no].conn);
|
||||
page_servers[shard_no].conn = NULL;
|
||||
}
|
||||
if (page_servers[shard_no].wes != NULL)
|
||||
{
|
||||
FreeWaitEventSet(page_servers[shard_no].wes);
|
||||
page_servers[shard_no].wes = NULL;
|
||||
}
|
||||
CLEANUP_AND_DISCONNECT(shard);
|
||||
|
||||
shard->state = PS_Disconnected;
|
||||
}
|
||||
|
||||
static bool
|
||||
pageserver_send(shardno_t shard_no, NeonRequest *request)
|
||||
{
|
||||
StringInfoData req_buff;
|
||||
PGconn *pageserver_conn = page_servers[shard_no].conn;
|
||||
PageServer *shard = &page_servers[shard_no];
|
||||
PGconn *pageserver_conn;
|
||||
|
||||
/* If the connection was lost for some reason, reconnect */
|
||||
if (pageserver_conn && PQstatus(pageserver_conn) == CONNECTION_BAD)
|
||||
if (shard->state == PS_Connected && PQstatus(shard->conn) == CONNECTION_BAD)
|
||||
{
|
||||
neon_shard_log(shard_no, LOG, "pageserver_send disconnect bad connection");
|
||||
pageserver_disconnect(shard_no);
|
||||
pageserver_conn = NULL;
|
||||
}
|
||||
|
||||
req_buff = nm_pack_request(request);
|
||||
@@ -571,17 +740,19 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
|
||||
* https://github.com/neondatabase/neon/issues/1138 So try to reestablish
|
||||
* connection in case of failure.
|
||||
*/
|
||||
if (!page_servers[shard_no].conn)
|
||||
if (shard->state != PS_Connected)
|
||||
{
|
||||
while (!pageserver_connect(shard_no, n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
|
||||
while (!pageserver_connect(shard_no, shard->n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
|
||||
{
|
||||
HandleMainLoopInterrupts();
|
||||
n_reconnect_attempts += 1;
|
||||
shard->n_reconnect_attempts += 1;
|
||||
}
|
||||
n_reconnect_attempts = 0;
|
||||
shard->n_reconnect_attempts = 0;
|
||||
} else {
|
||||
Assert(shard->conn != NULL);
|
||||
}
|
||||
|
||||
pageserver_conn = page_servers[shard_no].conn;
|
||||
pageserver_conn = shard->conn;
|
||||
|
||||
/*
|
||||
* Send request.
|
||||
@@ -590,13 +761,17 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
|
||||
* should use async mode and check for interrupts while waiting. In
|
||||
* practice, our requests are small enough to always fit in the output and
|
||||
* TCP buffer.
|
||||
*
|
||||
* Note that this also will fail when the connection is in the
|
||||
* PGRES_POLLING_WRITING state. It's kinda dirty to disconnect at this
|
||||
* point, but on the grand scheme of things it's only a small issue.
|
||||
*/
|
||||
if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
|
||||
pageserver_disconnect(shard_no);
|
||||
neon_shard_log(shard_no, LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
|
||||
neon_shard_log(shard_no, LOG, "pageserver_send disconnected: failed to send page request (try to reconnect): %s", msg);
|
||||
pfree(msg);
|
||||
pfree(req_buff.data);
|
||||
return false;
|
||||
@@ -611,6 +786,7 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
|
||||
neon_shard_log(shard_no, PageStoreTrace, "sent request: %s", msg);
|
||||
pfree(msg);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -619,58 +795,68 @@ pageserver_receive(shardno_t shard_no)
|
||||
{
|
||||
StringInfoData resp_buff;
|
||||
NeonResponse *resp;
|
||||
PGconn *pageserver_conn = page_servers[shard_no].conn;
|
||||
PageServer *shard = &page_servers[shard_no];
|
||||
PGconn *pageserver_conn = shard->conn;
|
||||
/* read response */
|
||||
int rc;
|
||||
|
||||
if (!pageserver_conn)
|
||||
return NULL;
|
||||
|
||||
PG_TRY();
|
||||
if (shard->state != PS_Connected)
|
||||
{
|
||||
/* read response */
|
||||
int rc;
|
||||
neon_shard_log(shard_no, LOG,
|
||||
"pageserver_receive: returning NULL for non-connected pageserver connection: 0x%02x",
|
||||
shard->state);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
rc = call_PQgetCopyData(shard_no, &resp_buff.data);
|
||||
if (rc >= 0)
|
||||
Assert(pageserver_conn);
|
||||
|
||||
rc = call_PQgetCopyData(shard_no, &resp_buff.data);
|
||||
if (rc >= 0)
|
||||
{
|
||||
/* call_PQgetCopyData handles rc == 0 */
|
||||
Assert(rc > 0);
|
||||
|
||||
PG_TRY();
|
||||
{
|
||||
resp_buff.len = rc;
|
||||
resp_buff.cursor = 0;
|
||||
resp = nm_unpack_response(&resp_buff);
|
||||
PQfreemem(resp_buff.data);
|
||||
|
||||
if (message_level_is_interesting(PageStoreTrace))
|
||||
{
|
||||
char *msg = nm_to_string((NeonMessage *) resp);
|
||||
|
||||
neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg);
|
||||
pfree(msg);
|
||||
}
|
||||
}
|
||||
else if (rc == -1)
|
||||
PG_CATCH();
|
||||
{
|
||||
neon_shard_log(shard_no, LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn)));
|
||||
neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due malformatted response");
|
||||
pageserver_disconnect(shard_no);
|
||||
resp = NULL;
|
||||
PG_RE_THROW();
|
||||
}
|
||||
else if (rc == -2)
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
PG_END_TRY();
|
||||
|
||||
pageserver_disconnect(shard_no);
|
||||
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
|
||||
}
|
||||
else
|
||||
if (message_level_is_interesting(PageStoreTrace))
|
||||
{
|
||||
pageserver_disconnect(shard_no);
|
||||
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc);
|
||||
char *msg = nm_to_string((NeonMessage *) resp);
|
||||
|
||||
neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg);
|
||||
pfree(msg);
|
||||
}
|
||||
}
|
||||
PG_CATCH();
|
||||
else if (rc == -1)
|
||||
{
|
||||
neon_shard_log(shard_no, LOG, "pageserver_receive disconnect due to caught exception");
|
||||
neon_shard_log(shard_no, LOG, "pageserver_receive disconnect: psql end of copy data: %s", pchomp(PQerrorMessage(pageserver_conn)));
|
||||
pageserver_disconnect(shard_no);
|
||||
PG_RE_THROW();
|
||||
resp = NULL;
|
||||
}
|
||||
else if (rc == -2)
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
|
||||
pageserver_disconnect(shard_no);
|
||||
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: could not read COPY data: %s", msg);
|
||||
}
|
||||
else
|
||||
{
|
||||
pageserver_disconnect(shard_no);
|
||||
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: unexpected PQgetCopyData return value: %d", rc);
|
||||
}
|
||||
PG_END_TRY();
|
||||
|
||||
return (NeonResponse *) resp;
|
||||
}
|
||||
@@ -681,7 +867,7 @@ pageserver_flush(shardno_t shard_no)
|
||||
{
|
||||
PGconn *pageserver_conn = page_servers[shard_no].conn;
|
||||
|
||||
if (!pageserver_conn)
|
||||
if (page_servers[shard_no].state != PS_Connected)
|
||||
{
|
||||
neon_shard_log(shard_no, WARNING, "Tried to flush while disconnected");
|
||||
}
|
||||
@@ -697,6 +883,7 @@ pageserver_flush(shardno_t shard_no)
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -891,5 +1078,7 @@ pg_init_libpagestore(void)
|
||||
dbsize_hook = neon_dbsize;
|
||||
}
|
||||
|
||||
memset(page_servers, 0, sizeof(page_servers));
|
||||
|
||||
lfc_init();
|
||||
}
|
||||
|
||||
@@ -94,6 +94,10 @@ static char *hexdump_page(char *page);
|
||||
|
||||
const int SmgrTrace = DEBUG5;
|
||||
|
||||
#define NEON_PANIC_CONNECTION_STATE(shard_no, elvl, message, ...) \
|
||||
neon_shard_log(shard_no, elvl, "Broken connection state: " message, \
|
||||
##__VA_ARGS__)
|
||||
|
||||
page_server_api *page_server;
|
||||
|
||||
/* unlogged relation build states */
|
||||
@@ -526,6 +530,8 @@ prefetch_flush_requests(void)
|
||||
*
|
||||
* NOTE: this function may indirectly update MyPState->pfs_hash; which
|
||||
* invalidates any active pointers into the hash table.
|
||||
* NOTE: callers should make sure they can handle query cancellations in this
|
||||
* function's call path.
|
||||
*/
|
||||
static bool
|
||||
prefetch_wait_for(uint64 ring_index)
|
||||
@@ -561,6 +567,8 @@ prefetch_wait_for(uint64 ring_index)
|
||||
*
|
||||
* NOTE: this function may indirectly update MyPState->pfs_hash; which
|
||||
* invalidates any active pointers into the hash table.
|
||||
*
|
||||
* NOTE: this does IO, and can get canceled out-of-line.
|
||||
*/
|
||||
static bool
|
||||
prefetch_read(PrefetchRequest *slot)
|
||||
@@ -572,6 +580,14 @@ prefetch_read(PrefetchRequest *slot)
|
||||
Assert(slot->response == NULL);
|
||||
Assert(slot->my_ring_index == MyPState->ring_receive);
|
||||
|
||||
if (slot->status != PRFS_REQUESTED ||
|
||||
slot->response != NULL ||
|
||||
slot->my_ring_index != MyPState->ring_receive)
|
||||
neon_shard_log(slot->shard_no, ERROR,
|
||||
"Incorrect prefetch read: status=%d response=%llx my=%llu receive=%llu",
|
||||
slot->status, (size_t) (void *) slot->response,
|
||||
slot->my_ring_index, MyPState->ring_receive);
|
||||
|
||||
old = MemoryContextSwitchTo(MyPState->errctx);
|
||||
response = (NeonResponse *) page_server->receive(slot->shard_no);
|
||||
MemoryContextSwitchTo(old);
|
||||
@@ -589,6 +605,11 @@ prefetch_read(PrefetchRequest *slot)
|
||||
}
|
||||
else
|
||||
{
|
||||
neon_shard_log(slot->shard_no, WARNING,
|
||||
"No response from reading prefetch entry %llu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect",
|
||||
slot->my_ring_index,
|
||||
RelFileInfoFmt(BufTagGetNRelFileInfo(slot->buftag)),
|
||||
slot->buftag.forkNum, slot->buftag.blockNum);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -603,6 +624,7 @@ void
|
||||
prefetch_on_ps_disconnect(void)
|
||||
{
|
||||
MyPState->ring_flush = MyPState->ring_unused;
|
||||
|
||||
while (MyPState->ring_receive < MyPState->ring_unused)
|
||||
{
|
||||
PrefetchRequest *slot;
|
||||
@@ -625,6 +647,7 @@ prefetch_on_ps_disconnect(void)
|
||||
slot->status = PRFS_TAG_REMAINS;
|
||||
MyPState->n_requests_inflight -= 1;
|
||||
MyPState->ring_receive += 1;
|
||||
|
||||
prefetch_set_unused(ring_index);
|
||||
}
|
||||
}
|
||||
@@ -691,6 +714,8 @@ static void
|
||||
prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns)
|
||||
{
|
||||
bool found;
|
||||
uint64 mySlotNo = slot->my_ring_index;
|
||||
|
||||
NeonGetPageRequest request = {
|
||||
.req.tag = T_NeonGetPageRequest,
|
||||
/* lsn and not_modified_since are filled in below */
|
||||
@@ -699,6 +724,8 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
|
||||
.blkno = slot->buftag.blockNum,
|
||||
};
|
||||
|
||||
Assert(mySlotNo == MyPState->ring_unused);
|
||||
|
||||
if (force_request_lsns)
|
||||
slot->request_lsns = *force_request_lsns;
|
||||
else
|
||||
@@ -711,7 +738,11 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
|
||||
Assert(slot->response == NULL);
|
||||
Assert(slot->my_ring_index == MyPState->ring_unused);
|
||||
|
||||
while (!page_server->send(slot->shard_no, (NeonRequest *) &request));
|
||||
while (!page_server->send(slot->shard_no, (NeonRequest *) &request))
|
||||
{
|
||||
Assert(mySlotNo == MyPState->ring_unused);
|
||||
/* loop */
|
||||
}
|
||||
|
||||
/* update prefetch state */
|
||||
MyPState->n_requests_inflight += 1;
|
||||
@@ -722,7 +753,6 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
|
||||
|
||||
/* update slot state */
|
||||
slot->status = PRFS_REQUESTED;
|
||||
|
||||
prfh_insert(MyPState->prf_hash, slot, &found);
|
||||
Assert(!found);
|
||||
}
|
||||
@@ -894,6 +924,10 @@ Retry:
|
||||
return ring_index;
|
||||
}
|
||||
|
||||
/*
|
||||
* Note: this function can get canceled and use a long jump to the next catch
|
||||
* context. Take care.
|
||||
*/
|
||||
static NeonResponse *
|
||||
page_server_request(void const *req)
|
||||
{
|
||||
@@ -925,19 +959,38 @@ page_server_request(void const *req)
|
||||
* Current sharding model assumes that all metadata is present only at shard 0.
|
||||
* We still need to call get_shard_no() to check if shard map is up-to-date.
|
||||
*/
|
||||
if (((NeonRequest *) req)->tag != T_NeonGetPageRequest || ((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM)
|
||||
if (((NeonRequest *) req)->tag != T_NeonGetPageRequest ||
|
||||
((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM)
|
||||
{
|
||||
shard_no = 0;
|
||||
}
|
||||
|
||||
do
|
||||
{
|
||||
while (!page_server->send(shard_no, (NeonRequest *) req) || !page_server->flush(shard_no));
|
||||
consume_prefetch_responses();
|
||||
resp = page_server->receive(shard_no);
|
||||
} while (resp == NULL);
|
||||
return resp;
|
||||
PG_TRY();
|
||||
{
|
||||
while (!page_server->send(shard_no, (NeonRequest *) req)
|
||||
|| !page_server->flush(shard_no))
|
||||
{
|
||||
/* do nothing */
|
||||
}
|
||||
consume_prefetch_responses();
|
||||
resp = page_server->receive(shard_no);
|
||||
}
|
||||
PG_CATCH();
|
||||
{
|
||||
/*
|
||||
* Cancellation in this code needs to be handled better at some
|
||||
* point, but this currently seems fine for now.
|
||||
*/
|
||||
page_server->disconnect(shard_no);
|
||||
PG_RE_THROW();
|
||||
}
|
||||
PG_END_TRY();
|
||||
|
||||
} while (resp == NULL);
|
||||
|
||||
return resp;
|
||||
}
|
||||
|
||||
|
||||
@@ -1905,7 +1958,9 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
|
||||
break;
|
||||
|
||||
default:
|
||||
neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_exists", resp->tag);
|
||||
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
|
||||
"Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x",
|
||||
T_NeonExistsResponse, T_NeonErrorResponse, resp->tag);
|
||||
}
|
||||
pfree(resp);
|
||||
return exists;
|
||||
@@ -2357,7 +2412,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
/*
|
||||
* Try to find prefetched page in the list of received pages.
|
||||
*/
|
||||
Retry:
|
||||
Retry:
|
||||
entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag);
|
||||
|
||||
if (entry != NULL)
|
||||
@@ -2443,7 +2498,9 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
((NeonErrorResponse *) resp)->message)));
|
||||
break;
|
||||
default:
|
||||
neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_at_lsn", resp->tag);
|
||||
NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC,
|
||||
"Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x",
|
||||
T_NeonGetPageResponse, T_NeonErrorResponse, resp->tag);
|
||||
}
|
||||
|
||||
/* buffer was used, clean up for later reuse */
|
||||
@@ -2714,7 +2771,9 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
|
||||
break;
|
||||
|
||||
default:
|
||||
neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_nblocks", resp->tag);
|
||||
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
|
||||
"Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x",
|
||||
T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag);
|
||||
}
|
||||
update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
|
||||
|
||||
@@ -2767,7 +2826,9 @@ neon_dbsize(Oid dbNode)
|
||||
break;
|
||||
|
||||
default:
|
||||
neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_dbsize", resp->tag);
|
||||
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
|
||||
"Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x",
|
||||
T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag);
|
||||
}
|
||||
|
||||
neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
|
||||
@@ -3106,7 +3167,9 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
|
||||
break;
|
||||
|
||||
default:
|
||||
neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_slru_segment", resp->tag);
|
||||
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
|
||||
"Expected GetSlruSegment (0x%02x) or Error (0x%02x) response to GetSlruSegmentRequest, but got 0x%02x",
|
||||
T_NeonGetSlruSegmentResponse, T_NeonErrorResponse, resp->tag);
|
||||
}
|
||||
pfree(resp);
|
||||
|
||||
|
||||
282
test_runner/regress/test_pg_query_cancellation.py
Normal file
282
test_runner/regress/test_pg_query_cancellation.py
Normal file
@@ -0,0 +1,282 @@
|
||||
from contextlib import closing
|
||||
from typing import Set
|
||||
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonPageserver
|
||||
from fixtures.pageserver.http import PageserverHttpClient
|
||||
from psycopg2.errors import QueryCanceled
|
||||
|
||||
CRITICAL_PG_PS_WAIT_FAILPOINTS: Set[str] = {
|
||||
"ps::connection-start::pre-login",
|
||||
"ps::connection-start::startup-packet",
|
||||
"ps::connection-start::process-query",
|
||||
"ps::handle-pagerequest-message::exists",
|
||||
"ps::handle-pagerequest-message::nblocks",
|
||||
"ps::handle-pagerequest-message::getpage",
|
||||
"ps::handle-pagerequest-message::dbsize",
|
||||
# We don't yet have a good way to on-demand guarantee the download of an
|
||||
# SLRU segment, so that's disabled for now.
|
||||
# "ps::handle-pagerequest-message::slrusegment",
|
||||
}
|
||||
|
||||
PG_PS_START_FAILPOINTS = {
|
||||
"ps::connection-start::pre-login",
|
||||
"ps::connection-start::startup-packet",
|
||||
"ps::connection-start::process-query",
|
||||
}
|
||||
SMGR_EXISTS = "ps::handle-pagerequest-message::exists"
|
||||
SMGR_NBLOCKS = "ps::handle-pagerequest-message::nblocks"
|
||||
SMGR_GETPAGE = "ps::handle-pagerequest-message::getpage"
|
||||
SMGR_DBSIZE = "ps::handle-pagerequest-message::dbsize"
|
||||
|
||||
"""
|
||||
Test that we can handle connection delays and cancellations at various
|
||||
unfortunate connection startup and request states.
|
||||
"""
|
||||
|
||||
|
||||
def test_cancellations(neon_simple_env: NeonEnv):
|
||||
env = neon_simple_env
|
||||
ps = env.pageserver
|
||||
ps_http = ps.http_client()
|
||||
ps_http.is_testing_enabled_or_skip()
|
||||
|
||||
env.neon_cli.create_branch("test_config", "empty")
|
||||
|
||||
# We don't want to have any racy behaviour with autovacuum IOs
|
||||
ep = env.endpoints.create_start(
|
||||
"test_config",
|
||||
config_lines=[
|
||||
"autovacuum = off",
|
||||
"shared_buffers = 128MB",
|
||||
],
|
||||
)
|
||||
|
||||
with closing(ep.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(
|
||||
"""
|
||||
CREATE TABLE test1 AS
|
||||
SELECT id, sha256(id::text::bytea) payload
|
||||
FROM generate_series(1, 1024::bigint) p(id);
|
||||
"""
|
||||
)
|
||||
cur.execute(
|
||||
"""
|
||||
CREATE TABLE test2 AS
|
||||
SELECT id, sha256(id::text::bytea) payload
|
||||
FROM generate_series(1025, 2048::bigint) p(id);
|
||||
"""
|
||||
)
|
||||
cur.execute(
|
||||
"""
|
||||
VACUUM (ANALYZE, FREEZE) test1, test2;
|
||||
"""
|
||||
)
|
||||
cur.execute(
|
||||
"""
|
||||
CREATE EXTENSION pg_buffercache;
|
||||
"""
|
||||
)
|
||||
cur.execute(
|
||||
"""
|
||||
CREATE EXTENSION pg_prewarm;
|
||||
"""
|
||||
)
|
||||
|
||||
# data preparation is now complete, with 2 disjoint tables that aren't
|
||||
# preloaded into any caches.
|
||||
|
||||
ep.stop()
|
||||
|
||||
for failpoint in CRITICAL_PG_PS_WAIT_FAILPOINTS:
|
||||
connect_works_correctly(failpoint, ep, ps, ps_http)
|
||||
|
||||
|
||||
ENABLED_FAILPOINTS: Set[str] = set()
|
||||
|
||||
|
||||
def connect_works_correctly(
|
||||
failpoint: str, ep: Endpoint, ps: NeonPageserver, ps_http: PageserverHttpClient
|
||||
):
|
||||
log.debug("Starting work on %s", failpoint)
|
||||
# All queries we use should finish (incl. IO) within 500ms,
|
||||
# including all their IO.
|
||||
# This allows us to use `SET statement_timeout` to let the query
|
||||
# timeout system cancel queries, rather than us having to go
|
||||
# through the most annoying effort of manual query cancellation
|
||||
# in psycopg2.
|
||||
options = "-cstatement_timeout=500ms -ceffective_io_concurrency=1"
|
||||
|
||||
ep.start()
|
||||
|
||||
def fp_enable():
|
||||
global ENABLED_FAILPOINTS
|
||||
ps_http.configure_failpoints(
|
||||
[
|
||||
(failpoint, "pause"),
|
||||
]
|
||||
)
|
||||
ENABLED_FAILPOINTS = ENABLED_FAILPOINTS | {failpoint}
|
||||
log.info(
|
||||
'Enabled failpoint "%s", current_active=%s', failpoint, ENABLED_FAILPOINTS, stacklevel=2
|
||||
)
|
||||
|
||||
def fp_disable():
|
||||
global ENABLED_FAILPOINTS
|
||||
ps_http.configure_failpoints(
|
||||
[
|
||||
(failpoint, "off"),
|
||||
]
|
||||
)
|
||||
ENABLED_FAILPOINTS = ENABLED_FAILPOINTS - {failpoint}
|
||||
log.info(
|
||||
'Disabled failpoint "%s", current_active=%s',
|
||||
failpoint,
|
||||
ENABLED_FAILPOINTS,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
def check_buffers(cur):
|
||||
cur.execute(
|
||||
"""
|
||||
SELECT n.nspname AS nspname
|
||||
, c.relname AS relname
|
||||
, count(*) AS count
|
||||
FROM pg_buffercache b
|
||||
JOIN pg_class c
|
||||
ON b.relfilenode = pg_relation_filenode(c.oid) AND
|
||||
b.reldatabase = (SELECT oid FROM pg_database WHERE datname = current_database())
|
||||
JOIN pg_namespace n ON n.oid = c.relnamespace
|
||||
WHERE c.oid IN ('test1'::regclass::oid, 'test2'::regclass::oid)
|
||||
GROUP BY n.nspname, c.relname
|
||||
ORDER BY 3 DESC
|
||||
LIMIT 10
|
||||
"""
|
||||
)
|
||||
return cur.fetchone()
|
||||
|
||||
def exec_may_cancel(query, cursor, result, cancels):
|
||||
if cancels:
|
||||
with pytest.raises(QueryCanceled):
|
||||
cursor.execute(query)
|
||||
assert cursor.fetchone() == result
|
||||
else:
|
||||
cursor.execute(query)
|
||||
assert cursor.fetchone() == result
|
||||
|
||||
fp_disable()
|
||||
|
||||
# Warm caches required for new connections, so that they can run without
|
||||
# requiring catalog reads.
|
||||
with closing(ep.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(
|
||||
"""
|
||||
SELECT 1;
|
||||
"""
|
||||
)
|
||||
assert cur.fetchone() == (1,)
|
||||
|
||||
assert check_buffers(cur) is None
|
||||
# Ensure all caches required for connection start are correctly
|
||||
# filled, so that we don't have any "accidents" in this test run
|
||||
# caused by changes in connection startup plans that require
|
||||
# requests to the PageServer.
|
||||
cur.execute(
|
||||
"""
|
||||
select array_agg(distinct (pg_prewarm(c.oid::regclass, 'buffer') >= 0))
|
||||
from pg_class c
|
||||
where c.oid < 16384 AND c.relkind IN ('i', 'r');
|
||||
"""
|
||||
)
|
||||
assert cur.fetchone() == ([True],)
|
||||
|
||||
# Enable failpoint
|
||||
fp_enable()
|
||||
|
||||
with closing(ep.connect(options=options, autocommit=True)) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("SHOW statement_timeout;")
|
||||
assert cur.fetchone() == ("500ms",)
|
||||
assert check_buffers(cur) is None
|
||||
exec_may_cancel(
|
||||
"""
|
||||
SELECT min(id) FROM test1;
|
||||
""",
|
||||
cur,
|
||||
(1,),
|
||||
failpoint in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_DBSIZE}),
|
||||
)
|
||||
|
||||
fp_disable()
|
||||
|
||||
with closing(ep.connect(options=options, autocommit=True)) as conn:
|
||||
with conn.cursor() as cur:
|
||||
# Do a select on the data, putting some buffers into the prefetch
|
||||
# queue.
|
||||
cur.execute(
|
||||
"""
|
||||
SELECT count(id) FROM (select * from test1 LIMIT 256) a;
|
||||
"""
|
||||
)
|
||||
assert cur.fetchone() == (256,)
|
||||
|
||||
ps.stop()
|
||||
ps.start()
|
||||
fp_enable()
|
||||
|
||||
exec_may_cancel(
|
||||
"""
|
||||
SELECT COUNT(id) FROM test1;
|
||||
""",
|
||||
cur,
|
||||
(1024,),
|
||||
failpoint
|
||||
in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_NBLOCKS, SMGR_DBSIZE}),
|
||||
)
|
||||
|
||||
with closing(ep.connect(options=options, autocommit=True)) as conn:
|
||||
with conn.cursor() as cur:
|
||||
exec_may_cancel(
|
||||
"""
|
||||
SELECT COUNT(id) FROM test2;
|
||||
""",
|
||||
cur,
|
||||
(1024,),
|
||||
failpoint in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_DBSIZE}),
|
||||
)
|
||||
|
||||
fp_disable()
|
||||
fp_enable()
|
||||
|
||||
exec_may_cancel(
|
||||
"""
|
||||
SELECT 0 < pg_database_size(CURRENT_DATABASE());
|
||||
""",
|
||||
cur,
|
||||
(True,),
|
||||
failpoint
|
||||
in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_GETPAGE, SMGR_NBLOCKS}),
|
||||
)
|
||||
|
||||
fp_disable()
|
||||
|
||||
cur.execute(
|
||||
"""
|
||||
SELECT count(id), count(distinct payload), min(id), max(id), sum(id) FROM test2;
|
||||
"""
|
||||
)
|
||||
|
||||
assert cur.fetchone() == (1024, 1024, 1025, 2048, 1573376)
|
||||
|
||||
cur.execute(
|
||||
"""
|
||||
SELECT count(id), count(distinct payload), min(id), max(id), sum(id) FROM test1;
|
||||
"""
|
||||
)
|
||||
|
||||
assert cur.fetchone() == (1024, 1024, 1, 1024, 524800)
|
||||
|
||||
ep.stop()
|
||||
Reference in New Issue
Block a user