diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index d250864fd6..e9651165b1 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -260,6 +260,8 @@ async fn page_service_conn_main( socket.set_timeout(Some(std::time::Duration::from_millis(socket_timeout_ms))); let socket = std::pin::pin!(socket); + fail::fail_point!("ps::connection-start::pre-login"); + // XXX: pgbackend.run() should take the connection_ctx, // and create a child per-query context when it invokes process_query. // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler @@ -603,6 +605,7 @@ impl PageServerHandler { }; trace!("query: {copy_data_bytes:?}"); + fail::fail_point!("ps::handle-pagerequest-message"); // Trace request if needed if let Some(t) = tracer.as_mut() { @@ -617,6 +620,7 @@ impl PageServerHandler { let (response, span) = match neon_fe_msg { PagestreamFeMessage::Exists(req) => { + fail::fail_point!("ps::handle-pagerequest-message::exists"); let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn); ( self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx) @@ -626,6 +630,7 @@ impl PageServerHandler { ) } PagestreamFeMessage::Nblocks(req) => { + fail::fail_point!("ps::handle-pagerequest-message::nblocks"); let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn); ( self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx) @@ -635,6 +640,7 @@ impl PageServerHandler { ) } PagestreamFeMessage::GetPage(req) => { + fail::fail_point!("ps::handle-pagerequest-message::getpage"); // shard_id is filled in by the handler let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.request_lsn); ( @@ -645,6 +651,7 @@ impl PageServerHandler { ) } PagestreamFeMessage::DbSize(req) => { + fail::fail_point!("ps::handle-pagerequest-message::dbsize"); let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn); ( self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx) @@ -654,6 +661,7 @@ impl PageServerHandler { ) } PagestreamFeMessage::GetSlruSegment(req) => { + fail::fail_point!("ps::handle-pagerequest-message::slrusegment"); let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn); ( self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx) @@ -1505,6 +1513,7 @@ where _pgb: &mut PostgresBackend, _sm: &FeStartupPacket, ) -> Result<(), QueryError> { + fail::fail_point!("ps::connection-start::startup-packet"); Ok(()) } @@ -1519,6 +1528,8 @@ where Err(QueryError::SimulatedConnectionError) }); + fail::fail_point!("ps::connection-start::process-query"); + let ctx = self.connection_ctx.attached_child(); debug!("process query {query_string:?}"); let parts = query_string.split_whitespace().collect::>(); diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index f5ce2caff3..a9c8d59c3a 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -51,7 +51,6 @@ int flush_every_n_requests = 8; int neon_protocol_version = 2; -static int n_reconnect_attempts = 0; static int max_reconnect_attempts = 60; static int stripe_size; @@ -95,18 +94,44 @@ static shmem_startup_hook_type prev_shmem_startup_hook; static PagestoreShmemState *pagestore_shared; static uint64 pagestore_local_counter = 0; +typedef enum PSConnectionState { + PS_Disconnected, /* no connection yet */ + PS_Connecting_Startup, /* connection starting up */ + PS_Connecting_PageStream, /* negotiating pagestream */ + PS_Connected, /* connected, pagestream established */ +} PSConnectionState; + /* This backend's per-shard connections */ typedef struct { - PGconn *conn; + TimestampTz last_connect_time; /* read-only debug value */ + TimestampTz last_reconnect_time; + uint32 delay_us; + int n_reconnect_attempts; /*--- - * WaitEventSet containing: - * - WL_SOCKET_READABLE on 'conn' - * - WL_LATCH_SET on MyLatch, and - * - WL_EXIT_ON_PM_DEATH. + * Pageserver connection state, i.e. + * disconnected: conn == NULL, wes == NULL; + * conn_startup: connection initiated, waiting for connection establishing + * conn_ps: PageStream query sent, waiting for confirmation + * connected: PageStream established */ - WaitEventSet *wes; + PSConnectionState state; + PGconn *conn; + /*--- + * WaitEventSet containing: + * - WL_SOCKET_READABLE on 'conn' + * - WL_LATCH_SET on MyLatch, and + * - WL_EXIT_ON_PM_DEATH. + */ + WaitEventSet *wes_read; + /*--- + * WaitEventSet containing: + * - WL_SOCKET_WRITABLE on 'conn' + * - WL_LATCH_SET on MyLatch, and + * - WL_EXIT_ON_PM_DEATH. + */ + WaitEventSet *wes_write; } PageServer; static PageServer page_servers[MAX_SHARDS]; @@ -303,119 +328,269 @@ get_shard_number(BufferTag *tag) return hash % n_shards; } +static inline void +CLEANUP_AND_DISCONNECT(PageServer *shard) +{ + if (shard->wes_read) + { + FreeWaitEventSet(shard->wes_read); + shard->wes_read = NULL; + } + if (shard->wes_write) + { + FreeWaitEventSet(shard->wes_write); + shard->wes_write = NULL; + } + if (shard->conn) + { + PQfinish(shard->conn); + shard->conn = NULL; + } + + shard->state = PS_Disconnected; +} + +/* + * Connect to a pageserver, or continue to try to connect if we're yet to + * complete the connection (e.g. due to receiving an earlier cancellation + * during connection start). + * Returns true if successfully connected; false if the connection failed. + * + * Throws errors in unrecoverable situations, or when this backend's query + * is canceled. + */ static bool pageserver_connect(shardno_t shard_no, int elevel) { - char *query; - int ret; - const char *keywords[3]; - const char *values[3]; - int n; - PGconn *conn; - WaitEventSet *wes; + PageServer *shard = &page_servers[shard_no]; char connstr[MAX_PAGESERVER_CONNSTRING_SIZE]; - static TimestampTz last_connect_time = 0; - static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC; - TimestampTz now; - uint64_t us_since_last_connect; - bool broke_from_loop = false; - - Assert(page_servers[shard_no].conn == NULL); - /* * Get the connection string for this shard. If the shard map has been * updated since we last looked, this will also disconnect any existing * pageserver connections as a side effect. + * Note that connstr is used both during connection start, and when we + * log the successful connection. */ load_shard_map(shard_no, connstr, NULL); - now = GetCurrentTimestamp(); - us_since_last_connect = now - last_connect_time; - if (us_since_last_connect < MAX_RECONNECT_INTERVAL_USEC) + switch (shard->state) { - pg_usleep(delay_us); - delay_us *= 2; - } - else + case PS_Disconnected: { - delay_us = MIN_RECONNECT_INTERVAL_USEC; - } + const char *keywords[3]; + const char *values[3]; + int n_pgsql_params; + TimestampTz now; + int64 us_since_last_attempt; - /* - * Connect using the connection string we got from the - * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment - * variable was set, use that as the password. - * - * The connection options are parsed in the order they're given, so when - * we set the password before the connection string, the connection string - * can override the password from the env variable. Seems useful, although - * we don't currently use that capability anywhere. - */ - n = 0; - if (neon_auth_token) - { - keywords[n] = "password"; - values[n] = neon_auth_token; - n++; + /* Make sure we start with a clean slate */ + CLEANUP_AND_DISCONNECT(shard); + + neon_shard_log(shard_no, DEBUG5, "Connection state: Disconnected"); + + now = GetCurrentTimestamp(); + us_since_last_attempt = (int64) (now - shard->last_reconnect_time); + shard->last_reconnect_time = now; + + /* + * If we did other tasks between reconnect attempts, then we won't + * need to wait as long as a full delay. + */ + if (us_since_last_attempt < shard->delay_us) + { + pg_usleep(shard->delay_us - us_since_last_attempt); + } + + /* update the delay metric */ + shard->delay_us = Min(shard->delay_us * 2, MAX_RECONNECT_INTERVAL_USEC); + + /* + * Connect using the connection string we got from the + * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment + * variable was set, use that as the password. + * + * The connection options are parsed in the order they're given, so when + * we set the password before the connection string, the connection string + * can override the password from the env variable. Seems useful, although + * we don't currently use that capability anywhere. + */ + keywords[0] = "dbname"; + values[0] = connstr; + n_pgsql_params = 1; + + if (neon_auth_token) + { + keywords[1] = "password"; + values[1] = neon_auth_token; + n_pgsql_params++; + } + + keywords[n_pgsql_params] = NULL; + values[n_pgsql_params] = NULL; + + shard->conn = PQconnectStartParams(keywords, values, 1); + if (!shard->conn) + { + neon_shard_log(shard_no, elevel, "Failed to connect to pageserver: out of memory"); + return false; + } + + shard->wes_read = CreateWaitEventSet(TopMemoryContext, 3); + AddWaitEventToSet(shard->wes_read, WL_LATCH_SET, PGINVALID_SOCKET, + MyLatch, NULL); + AddWaitEventToSet(shard->wes_read, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, + NULL, NULL); + AddWaitEventToSet(shard->wes_read, WL_SOCKET_READABLE, PQsocket(shard->conn), NULL, NULL); + + shard->wes_write = CreateWaitEventSet(TopMemoryContext, 3); + AddWaitEventToSet(shard->wes_write, WL_LATCH_SET, PGINVALID_SOCKET, + MyLatch, NULL); + AddWaitEventToSet(shard->wes_write, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, + NULL, NULL); + AddWaitEventToSet(shard->wes_write, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE, + PQsocket(shard->conn), + NULL, NULL); + + shard->state = PS_Connecting_Startup; + /* fallthrough */ } - keywords[n] = "dbname"; - values[n] = connstr; - n++; - keywords[n] = NULL; - values[n] = NULL; - n++; - conn = PQconnectdbParams(keywords, values, 1); - last_connect_time = GetCurrentTimestamp(); - - if (PQstatus(conn) == CONNECTION_BAD) + case PS_Connecting_Startup: { - char *msg = pchomp(PQerrorMessage(conn)); + char *pagestream_query; + int ps_send_query_ret; + bool connected = false; - PQfinish(conn); + neon_shard_log(shard_no, DEBUG5, "Connection state: Connecting_Startup"); - ereport(elevel, - (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), - errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no), - errdetail_internal("%s", msg))); - pfree(msg); - return false; - } - switch (neon_protocol_version) - { + do + { + WaitEvent event; + int poll_result = PQconnectPoll(shard->conn); + + switch (poll_result) + { + default: /* unknown/unused states are handled as a failed connection */ + case PGRES_POLLING_FAILED: + { + char *pqerr = PQerrorMessage(shard->conn); + char *msg = NULL; + neon_shard_log(shard_no, DEBUG5, "POLLING_FAILED"); + + if (pqerr) + msg = pchomp(pqerr); + + CLEANUP_AND_DISCONNECT(shard); + + if (msg) + { + neon_shard_log(shard_no, elevel, + "could not connect to pageserver: %s", + msg); + pfree(msg); + } + else + neon_shard_log(shard_no, elevel, + "could not connect to pageserver"); + + return false; + } + case PGRES_POLLING_READING: + /* Sleep until there's something to do */ + (void) WaitEventSetWait(shard->wes_read, -1L, &event, 1, + PG_WAIT_EXTENSION); + ResetLatch(MyLatch); + + /* query cancellation, backend shutdown */ + CHECK_FOR_INTERRUPTS(); + + /* PQconnectPoll() handles the socket polling state updates */ + + break; + case PGRES_POLLING_WRITING: + /* Sleep until there's something to do */ + (void) WaitEventSetWait(shard->wes_write, -1L, &event, 1, + PG_WAIT_EXTENSION); + ResetLatch(MyLatch); + + /* query cancellation, backend shutdown */ + CHECK_FOR_INTERRUPTS(); + + /* PQconnectPoll() handles the socket polling state updates */ + + break; + case PGRES_POLLING_OK: + neon_shard_log(shard_no, DEBUG5, "POLLING_OK"); + connected = true; + break; + } + } + while (!connected); + + /* No more polling needed; connection succeeded */ + shard->last_connect_time = GetCurrentTimestamp(); + + switch (neon_protocol_version) + { case 2: - query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline); + pagestream_query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline); break; case 1: - query = psprintf("pagestream %s %s", neon_tenant, neon_timeline); + pagestream_query = psprintf("pagestream %s %s", neon_tenant, neon_timeline); break; default: elog(ERROR, "unexpected neon_protocol_version %d", neon_protocol_version); - } - ret = PQsendQuery(conn, query); - pfree(query); - if (ret != 1) - { - PQfinish(conn); - neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver"); - return false; - } + } - wes = CreateWaitEventSet(TopMemoryContext, 3); - AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET, - MyLatch, NULL); - AddWaitEventToSet(wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, - NULL, NULL); - AddWaitEventToSet(wes, WL_SOCKET_READABLE, PQsocket(conn), NULL, NULL); + if (PQstatus(shard->conn) == CONNECTION_BAD) + { + char *msg = pchomp(PQerrorMessage(shard->conn)); - PG_TRY(); + CLEANUP_AND_DISCONNECT(shard); + + ereport(elevel, + (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), + errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no), + errdetail_internal("%s", msg))); + pfree(msg); + return false; + } + + ps_send_query_ret = PQsendQuery(shard->conn, pagestream_query); + pfree(pagestream_query); + if (ps_send_query_ret != 1) + { + CLEANUP_AND_DISCONNECT(shard); + + neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver"); + return false; + } + + shard->state = PS_Connecting_PageStream; + /* fallthrough */ + } + case PS_Connecting_PageStream: { - while (PQisBusy(conn)) + neon_shard_log(shard_no, DEBUG5, "Connection state: Connecting_PageStream"); + + if (PQstatus(shard->conn) == CONNECTION_BAD) + { + char *msg = pchomp(PQerrorMessage(shard->conn)); + CLEANUP_AND_DISCONNECT(shard); + ereport(elevel, + (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), + errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no), + errdetail_internal("%s", msg))); + pfree(msg); + return false; + } + + while (PQisBusy(shard->conn)) { WaitEvent event; /* Sleep until there's something to do */ - (void) WaitEventSetWait(wes, -1L, &event, 1, PG_WAIT_EXTENSION); + (void) WaitEventSetWait(shard->wes_read, -1L, &event, 1, PG_WAIT_EXTENSION); ResetLatch(MyLatch); CHECK_FOR_INTERRUPTS(); @@ -423,40 +598,37 @@ pageserver_connect(shardno_t shard_no, int elevel) /* Data available in socket? */ if (event.events & WL_SOCKET_READABLE) { - if (!PQconsumeInput(conn)) + if (!PQconsumeInput(shard->conn)) { - char *msg = pchomp(PQerrorMessage(conn)); - - PQfinish(conn); - FreeWaitEventSet(wes); + char *msg = pchomp(PQerrorMessage(shard->conn)); + CLEANUP_AND_DISCONNECT(shard); neon_shard_log(shard_no, elevel, "could not complete handshake with pageserver: %s", msg); - /* Returning from inside PG_TRY is bad, so we break/return later */ - broke_from_loop = true; - break; + pfree(msg); + return false; } } } - } - PG_CATCH(); - { - PQfinish(conn); - FreeWaitEventSet(wes); - PG_RE_THROW(); - } - PG_END_TRY(); - if (broke_from_loop) - { - return false; + shard->state = PS_Connected; + /* fallthrough */ } + case PS_Connected: + /* + * We successfully connected. Future connections to this PageServer + * will do fast retries again, with exponential backoff. + */ + shard->delay_us = MIN_RECONNECT_INTERVAL_USEC; - neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s' with protocol version %d", connstr, neon_protocol_version); - page_servers[shard_no].conn = conn; - page_servers[shard_no].wes = wes; - - return true; + neon_shard_log(shard_no, DEBUG5, "Connection state: Connected"); + neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s' with protocol version %d", connstr, neon_protocol_version); + return true; + default: + neon_shard_log(shard_no, ERROR, "libpagestore: invalid connection state %d", shard->state); + } + /* This shouldn't be hit */ + Assert(false); } /* @@ -476,7 +648,7 @@ retry: WaitEvent event; /* Sleep until there's something to do */ - (void) WaitEventSetWait(page_servers[shard_no].wes, -1L, &event, 1, PG_WAIT_EXTENSION); + (void) WaitEventSetWait(page_servers[shard_no].wes_read, -1L, &event, 1, PG_WAIT_EXTENSION); ResetLatch(MyLatch); CHECK_FOR_INTERRUPTS(); @@ -502,7 +674,8 @@ retry: /* * Reset prefetch and drop connection to the shard. - * It also drops connection to all other shards involved in prefetch. + * It also drops connection to all other shards involved in prefetch, through + * prefetch_on_ps_disconnect(). */ static void pageserver_disconnect(shardno_t shard_no) @@ -512,9 +685,6 @@ pageserver_disconnect(shardno_t shard_no) * whole prefetch queue, even for other pageservers. It should not * cause big problems, because connection loss is supposed to be a * rare event. - * - * Prefetch state should be reset even if page_servers[shard_no].conn == NULL, - * because prefetch request may be registered before connection is established. */ prefetch_on_ps_disconnect(); @@ -527,37 +697,36 @@ pageserver_disconnect(shardno_t shard_no) static void pageserver_disconnect_shard(shardno_t shard_no) { + PageServer *shard = &page_servers[shard_no]; /* * If anything goes wrong while we were sending a request, it's not clear * what state the connection is in. For example, if we sent the request * but didn't receive a response yet, we might receive the response some * time later after we have already sent a new unrelated request. Close * the connection to avoid getting confused. + * Similarly, even when we're in PS_DISCONNECTED, we may have junk to + * clean up: It is possible that we encountered an error allocating any + * of the wait event sets or the psql connection, or failed when we tried + * to attach wait events to the WaitEventSets. */ - if (page_servers[shard_no].conn) - { - neon_shard_log(shard_no, LOG, "dropping connection to page server due to error"); - PQfinish(page_servers[shard_no].conn); - page_servers[shard_no].conn = NULL; - } - if (page_servers[shard_no].wes != NULL) - { - FreeWaitEventSet(page_servers[shard_no].wes); - page_servers[shard_no].wes = NULL; - } + CLEANUP_AND_DISCONNECT(shard); + + shard->state = PS_Disconnected; } static bool pageserver_send(shardno_t shard_no, NeonRequest *request) { StringInfoData req_buff; - PGconn *pageserver_conn = page_servers[shard_no].conn; + PageServer *shard = &page_servers[shard_no]; + PGconn *pageserver_conn; /* If the connection was lost for some reason, reconnect */ - if (pageserver_conn && PQstatus(pageserver_conn) == CONNECTION_BAD) + if (shard->state == PS_Connected && PQstatus(shard->conn) == CONNECTION_BAD) { neon_shard_log(shard_no, LOG, "pageserver_send disconnect bad connection"); pageserver_disconnect(shard_no); + pageserver_conn = NULL; } req_buff = nm_pack_request(request); @@ -571,17 +740,19 @@ pageserver_send(shardno_t shard_no, NeonRequest *request) * https://github.com/neondatabase/neon/issues/1138 So try to reestablish * connection in case of failure. */ - if (!page_servers[shard_no].conn) + if (shard->state != PS_Connected) { - while (!pageserver_connect(shard_no, n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR)) + while (!pageserver_connect(shard_no, shard->n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR)) { HandleMainLoopInterrupts(); - n_reconnect_attempts += 1; + shard->n_reconnect_attempts += 1; } - n_reconnect_attempts = 0; + shard->n_reconnect_attempts = 0; + } else { + Assert(shard->conn != NULL); } - pageserver_conn = page_servers[shard_no].conn; + pageserver_conn = shard->conn; /* * Send request. @@ -590,13 +761,17 @@ pageserver_send(shardno_t shard_no, NeonRequest *request) * should use async mode and check for interrupts while waiting. In * practice, our requests are small enough to always fit in the output and * TCP buffer. + * + * Note that this also will fail when the connection is in the + * PGRES_POLLING_WRITING state. It's kinda dirty to disconnect at this + * point, but on the grand scheme of things it's only a small issue. */ if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0) { char *msg = pchomp(PQerrorMessage(pageserver_conn)); pageserver_disconnect(shard_no); - neon_shard_log(shard_no, LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg); + neon_shard_log(shard_no, LOG, "pageserver_send disconnected: failed to send page request (try to reconnect): %s", msg); pfree(msg); pfree(req_buff.data); return false; @@ -611,6 +786,7 @@ pageserver_send(shardno_t shard_no, NeonRequest *request) neon_shard_log(shard_no, PageStoreTrace, "sent request: %s", msg); pfree(msg); } + return true; } @@ -619,58 +795,68 @@ pageserver_receive(shardno_t shard_no) { StringInfoData resp_buff; NeonResponse *resp; - PGconn *pageserver_conn = page_servers[shard_no].conn; + PageServer *shard = &page_servers[shard_no]; + PGconn *pageserver_conn = shard->conn; + /* read response */ + int rc; - if (!pageserver_conn) - return NULL; - - PG_TRY(); + if (shard->state != PS_Connected) { - /* read response */ - int rc; + neon_shard_log(shard_no, LOG, + "pageserver_receive: returning NULL for non-connected pageserver connection: 0x%02x", + shard->state); + return NULL; + } - rc = call_PQgetCopyData(shard_no, &resp_buff.data); - if (rc >= 0) + Assert(pageserver_conn); + + rc = call_PQgetCopyData(shard_no, &resp_buff.data); + if (rc >= 0) + { + /* call_PQgetCopyData handles rc == 0 */ + Assert(rc > 0); + + PG_TRY(); { resp_buff.len = rc; resp_buff.cursor = 0; resp = nm_unpack_response(&resp_buff); PQfreemem(resp_buff.data); - - if (message_level_is_interesting(PageStoreTrace)) - { - char *msg = nm_to_string((NeonMessage *) resp); - - neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg); - pfree(msg); - } } - else if (rc == -1) + PG_CATCH(); { - neon_shard_log(shard_no, LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn))); + neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due malformatted response"); pageserver_disconnect(shard_no); - resp = NULL; + PG_RE_THROW(); } - else if (rc == -2) - { - char *msg = pchomp(PQerrorMessage(pageserver_conn)); + PG_END_TRY(); - pageserver_disconnect(shard_no); - neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg); - } - else + if (message_level_is_interesting(PageStoreTrace)) { - pageserver_disconnect(shard_no); - neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc); + char *msg = nm_to_string((NeonMessage *) resp); + + neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg); + pfree(msg); } } - PG_CATCH(); + else if (rc == -1) { - neon_shard_log(shard_no, LOG, "pageserver_receive disconnect due to caught exception"); + neon_shard_log(shard_no, LOG, "pageserver_receive disconnect: psql end of copy data: %s", pchomp(PQerrorMessage(pageserver_conn))); pageserver_disconnect(shard_no); - PG_RE_THROW(); + resp = NULL; + } + else if (rc == -2) + { + char *msg = pchomp(PQerrorMessage(pageserver_conn)); + + pageserver_disconnect(shard_no); + neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: could not read COPY data: %s", msg); + } + else + { + pageserver_disconnect(shard_no); + neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: unexpected PQgetCopyData return value: %d", rc); } - PG_END_TRY(); return (NeonResponse *) resp; } @@ -681,7 +867,7 @@ pageserver_flush(shardno_t shard_no) { PGconn *pageserver_conn = page_servers[shard_no].conn; - if (!pageserver_conn) + if (page_servers[shard_no].state != PS_Connected) { neon_shard_log(shard_no, WARNING, "Tried to flush while disconnected"); } @@ -697,6 +883,7 @@ pageserver_flush(shardno_t shard_no) return false; } } + return true; } @@ -891,5 +1078,7 @@ pg_init_libpagestore(void) dbsize_hook = neon_dbsize; } + memset(page_servers, 0, sizeof(page_servers)); + lfc_init(); } diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 41546eae85..ac505fe6fb 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -94,6 +94,10 @@ static char *hexdump_page(char *page); const int SmgrTrace = DEBUG5; +#define NEON_PANIC_CONNECTION_STATE(shard_no, elvl, message, ...) \ + neon_shard_log(shard_no, elvl, "Broken connection state: " message, \ + ##__VA_ARGS__) + page_server_api *page_server; /* unlogged relation build states */ @@ -526,6 +530,8 @@ prefetch_flush_requests(void) * * NOTE: this function may indirectly update MyPState->pfs_hash; which * invalidates any active pointers into the hash table. + * NOTE: callers should make sure they can handle query cancellations in this + * function's call path. */ static bool prefetch_wait_for(uint64 ring_index) @@ -561,6 +567,8 @@ prefetch_wait_for(uint64 ring_index) * * NOTE: this function may indirectly update MyPState->pfs_hash; which * invalidates any active pointers into the hash table. + * + * NOTE: this does IO, and can get canceled out-of-line. */ static bool prefetch_read(PrefetchRequest *slot) @@ -572,6 +580,14 @@ prefetch_read(PrefetchRequest *slot) Assert(slot->response == NULL); Assert(slot->my_ring_index == MyPState->ring_receive); + if (slot->status != PRFS_REQUESTED || + slot->response != NULL || + slot->my_ring_index != MyPState->ring_receive) + neon_shard_log(slot->shard_no, ERROR, + "Incorrect prefetch read: status=%d response=%llx my=%llu receive=%llu", + slot->status, (size_t) (void *) slot->response, + slot->my_ring_index, MyPState->ring_receive); + old = MemoryContextSwitchTo(MyPState->errctx); response = (NeonResponse *) page_server->receive(slot->shard_no); MemoryContextSwitchTo(old); @@ -589,6 +605,11 @@ prefetch_read(PrefetchRequest *slot) } else { + neon_shard_log(slot->shard_no, WARNING, + "No response from reading prefetch entry %llu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect", + slot->my_ring_index, + RelFileInfoFmt(BufTagGetNRelFileInfo(slot->buftag)), + slot->buftag.forkNum, slot->buftag.blockNum); return false; } } @@ -603,6 +624,7 @@ void prefetch_on_ps_disconnect(void) { MyPState->ring_flush = MyPState->ring_unused; + while (MyPState->ring_receive < MyPState->ring_unused) { PrefetchRequest *slot; @@ -625,6 +647,7 @@ prefetch_on_ps_disconnect(void) slot->status = PRFS_TAG_REMAINS; MyPState->n_requests_inflight -= 1; MyPState->ring_receive += 1; + prefetch_set_unused(ring_index); } } @@ -691,6 +714,8 @@ static void prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns) { bool found; + uint64 mySlotNo = slot->my_ring_index; + NeonGetPageRequest request = { .req.tag = T_NeonGetPageRequest, /* lsn and not_modified_since are filled in below */ @@ -699,6 +724,8 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns .blkno = slot->buftag.blockNum, }; + Assert(mySlotNo == MyPState->ring_unused); + if (force_request_lsns) slot->request_lsns = *force_request_lsns; else @@ -711,7 +738,11 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns Assert(slot->response == NULL); Assert(slot->my_ring_index == MyPState->ring_unused); - while (!page_server->send(slot->shard_no, (NeonRequest *) &request)); + while (!page_server->send(slot->shard_no, (NeonRequest *) &request)) + { + Assert(mySlotNo == MyPState->ring_unused); + /* loop */ + } /* update prefetch state */ MyPState->n_requests_inflight += 1; @@ -722,7 +753,6 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns /* update slot state */ slot->status = PRFS_REQUESTED; - prfh_insert(MyPState->prf_hash, slot, &found); Assert(!found); } @@ -894,6 +924,10 @@ Retry: return ring_index; } +/* + * Note: this function can get canceled and use a long jump to the next catch + * context. Take care. + */ static NeonResponse * page_server_request(void const *req) { @@ -925,19 +959,38 @@ page_server_request(void const *req) * Current sharding model assumes that all metadata is present only at shard 0. * We still need to call get_shard_no() to check if shard map is up-to-date. */ - if (((NeonRequest *) req)->tag != T_NeonGetPageRequest || ((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM) + if (((NeonRequest *) req)->tag != T_NeonGetPageRequest || + ((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM) { shard_no = 0; } do { - while (!page_server->send(shard_no, (NeonRequest *) req) || !page_server->flush(shard_no)); - consume_prefetch_responses(); - resp = page_server->receive(shard_no); - } while (resp == NULL); - return resp; + PG_TRY(); + { + while (!page_server->send(shard_no, (NeonRequest *) req) + || !page_server->flush(shard_no)) + { + /* do nothing */ + } + consume_prefetch_responses(); + resp = page_server->receive(shard_no); + } + PG_CATCH(); + { + /* + * Cancellation in this code needs to be handled better at some + * point, but this currently seems fine for now. + */ + page_server->disconnect(shard_no); + PG_RE_THROW(); + } + PG_END_TRY(); + } while (resp == NULL); + + return resp; } @@ -1905,7 +1958,9 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) break; default: - neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_exists", resp->tag); + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x", + T_NeonExistsResponse, T_NeonErrorResponse, resp->tag); } pfree(resp); return exists; @@ -2357,7 +2412,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, /* * Try to find prefetched page in the list of received pages. */ - Retry: +Retry: entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag); if (entry != NULL) @@ -2443,7 +2498,9 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, ((NeonErrorResponse *) resp)->message))); break; default: - neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_at_lsn", resp->tag); + NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC, + "Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x", + T_NeonGetPageResponse, T_NeonErrorResponse, resp->tag); } /* buffer was used, clean up for later reuse */ @@ -2714,7 +2771,9 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) break; default: - neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_nblocks", resp->tag); + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x", + T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag); } update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks); @@ -2767,7 +2826,9 @@ neon_dbsize(Oid dbNode) break; default: - neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_dbsize", resp->tag); + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x", + T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag); } neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes", @@ -3106,7 +3167,9 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf break; default: - neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_slru_segment", resp->tag); + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected GetSlruSegment (0x%02x) or Error (0x%02x) response to GetSlruSegmentRequest, but got 0x%02x", + T_NeonGetSlruSegmentResponse, T_NeonErrorResponse, resp->tag); } pfree(resp); diff --git a/test_runner/regress/test_pg_query_cancellation.py b/test_runner/regress/test_pg_query_cancellation.py new file mode 100644 index 0000000000..bad2e5865e --- /dev/null +++ b/test_runner/regress/test_pg_query_cancellation.py @@ -0,0 +1,282 @@ +from contextlib import closing +from typing import Set + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonPageserver +from fixtures.pageserver.http import PageserverHttpClient +from psycopg2.errors import QueryCanceled + +CRITICAL_PG_PS_WAIT_FAILPOINTS: Set[str] = { + "ps::connection-start::pre-login", + "ps::connection-start::startup-packet", + "ps::connection-start::process-query", + "ps::handle-pagerequest-message::exists", + "ps::handle-pagerequest-message::nblocks", + "ps::handle-pagerequest-message::getpage", + "ps::handle-pagerequest-message::dbsize", + # We don't yet have a good way to on-demand guarantee the download of an + # SLRU segment, so that's disabled for now. + # "ps::handle-pagerequest-message::slrusegment", +} + +PG_PS_START_FAILPOINTS = { + "ps::connection-start::pre-login", + "ps::connection-start::startup-packet", + "ps::connection-start::process-query", +} +SMGR_EXISTS = "ps::handle-pagerequest-message::exists" +SMGR_NBLOCKS = "ps::handle-pagerequest-message::nblocks" +SMGR_GETPAGE = "ps::handle-pagerequest-message::getpage" +SMGR_DBSIZE = "ps::handle-pagerequest-message::dbsize" + +""" +Test that we can handle connection delays and cancellations at various +unfortunate connection startup and request states. +""" + + +def test_cancellations(neon_simple_env: NeonEnv): + env = neon_simple_env + ps = env.pageserver + ps_http = ps.http_client() + ps_http.is_testing_enabled_or_skip() + + env.neon_cli.create_branch("test_config", "empty") + + # We don't want to have any racy behaviour with autovacuum IOs + ep = env.endpoints.create_start( + "test_config", + config_lines=[ + "autovacuum = off", + "shared_buffers = 128MB", + ], + ) + + with closing(ep.connect()) as conn: + with conn.cursor() as cur: + cur.execute( + """ + CREATE TABLE test1 AS + SELECT id, sha256(id::text::bytea) payload + FROM generate_series(1, 1024::bigint) p(id); + """ + ) + cur.execute( + """ + CREATE TABLE test2 AS + SELECT id, sha256(id::text::bytea) payload + FROM generate_series(1025, 2048::bigint) p(id); + """ + ) + cur.execute( + """ + VACUUM (ANALYZE, FREEZE) test1, test2; + """ + ) + cur.execute( + """ + CREATE EXTENSION pg_buffercache; + """ + ) + cur.execute( + """ + CREATE EXTENSION pg_prewarm; + """ + ) + + # data preparation is now complete, with 2 disjoint tables that aren't + # preloaded into any caches. + + ep.stop() + + for failpoint in CRITICAL_PG_PS_WAIT_FAILPOINTS: + connect_works_correctly(failpoint, ep, ps, ps_http) + + +ENABLED_FAILPOINTS: Set[str] = set() + + +def connect_works_correctly( + failpoint: str, ep: Endpoint, ps: NeonPageserver, ps_http: PageserverHttpClient +): + log.debug("Starting work on %s", failpoint) + # All queries we use should finish (incl. IO) within 500ms, + # including all their IO. + # This allows us to use `SET statement_timeout` to let the query + # timeout system cancel queries, rather than us having to go + # through the most annoying effort of manual query cancellation + # in psycopg2. + options = "-cstatement_timeout=500ms -ceffective_io_concurrency=1" + + ep.start() + + def fp_enable(): + global ENABLED_FAILPOINTS + ps_http.configure_failpoints( + [ + (failpoint, "pause"), + ] + ) + ENABLED_FAILPOINTS = ENABLED_FAILPOINTS | {failpoint} + log.info( + 'Enabled failpoint "%s", current_active=%s', failpoint, ENABLED_FAILPOINTS, stacklevel=2 + ) + + def fp_disable(): + global ENABLED_FAILPOINTS + ps_http.configure_failpoints( + [ + (failpoint, "off"), + ] + ) + ENABLED_FAILPOINTS = ENABLED_FAILPOINTS - {failpoint} + log.info( + 'Disabled failpoint "%s", current_active=%s', + failpoint, + ENABLED_FAILPOINTS, + stacklevel=2, + ) + + def check_buffers(cur): + cur.execute( + """ + SELECT n.nspname AS nspname + , c.relname AS relname + , count(*) AS count + FROM pg_buffercache b + JOIN pg_class c + ON b.relfilenode = pg_relation_filenode(c.oid) AND + b.reldatabase = (SELECT oid FROM pg_database WHERE datname = current_database()) + JOIN pg_namespace n ON n.oid = c.relnamespace + WHERE c.oid IN ('test1'::regclass::oid, 'test2'::regclass::oid) + GROUP BY n.nspname, c.relname + ORDER BY 3 DESC + LIMIT 10 + """ + ) + return cur.fetchone() + + def exec_may_cancel(query, cursor, result, cancels): + if cancels: + with pytest.raises(QueryCanceled): + cursor.execute(query) + assert cursor.fetchone() == result + else: + cursor.execute(query) + assert cursor.fetchone() == result + + fp_disable() + + # Warm caches required for new connections, so that they can run without + # requiring catalog reads. + with closing(ep.connect()) as conn: + with conn.cursor() as cur: + cur.execute( + """ + SELECT 1; + """ + ) + assert cur.fetchone() == (1,) + + assert check_buffers(cur) is None + # Ensure all caches required for connection start are correctly + # filled, so that we don't have any "accidents" in this test run + # caused by changes in connection startup plans that require + # requests to the PageServer. + cur.execute( + """ + select array_agg(distinct (pg_prewarm(c.oid::regclass, 'buffer') >= 0)) + from pg_class c + where c.oid < 16384 AND c.relkind IN ('i', 'r'); + """ + ) + assert cur.fetchone() == ([True],) + + # Enable failpoint + fp_enable() + + with closing(ep.connect(options=options, autocommit=True)) as conn: + with conn.cursor() as cur: + cur.execute("SHOW statement_timeout;") + assert cur.fetchone() == ("500ms",) + assert check_buffers(cur) is None + exec_may_cancel( + """ + SELECT min(id) FROM test1; + """, + cur, + (1,), + failpoint in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_DBSIZE}), + ) + + fp_disable() + + with closing(ep.connect(options=options, autocommit=True)) as conn: + with conn.cursor() as cur: + # Do a select on the data, putting some buffers into the prefetch + # queue. + cur.execute( + """ + SELECT count(id) FROM (select * from test1 LIMIT 256) a; + """ + ) + assert cur.fetchone() == (256,) + + ps.stop() + ps.start() + fp_enable() + + exec_may_cancel( + """ + SELECT COUNT(id) FROM test1; + """, + cur, + (1024,), + failpoint + in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_NBLOCKS, SMGR_DBSIZE}), + ) + + with closing(ep.connect(options=options, autocommit=True)) as conn: + with conn.cursor() as cur: + exec_may_cancel( + """ + SELECT COUNT(id) FROM test2; + """, + cur, + (1024,), + failpoint in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_DBSIZE}), + ) + + fp_disable() + fp_enable() + + exec_may_cancel( + """ + SELECT 0 < pg_database_size(CURRENT_DATABASE()); + """, + cur, + (True,), + failpoint + in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_GETPAGE, SMGR_NBLOCKS}), + ) + + fp_disable() + + cur.execute( + """ + SELECT count(id), count(distinct payload), min(id), max(id), sum(id) FROM test2; + """ + ) + + assert cur.fetchone() == (1024, 1024, 1025, 2048, 1573376) + + cur.execute( + """ + SELECT count(id), count(distinct payload), min(id), max(id), sum(id) FROM test1; + """ + ) + + assert cur.fetchone() == (1024, 1024, 1, 1024, 524800) + + ep.stop()