From 63b2060aef39da8e9eb00cda72ff1e99eed2a74d Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 28 Mar 2024 08:16:05 +0200 Subject: [PATCH] Drop connections with all shards invoplved in prefetch in case of error (#7249) ## Problem See https://github.com/neondatabase/cloud/issues/11559 If we have multiple shards, we need to reset connections to all shards involved in prefetch (having active prefetch requests) if connection with any of them is lost. ## Summary of changes In `prefetch_on_ps_disconnect` drop connection to all shards with active page requests. ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist Co-authored-by: Konstantin Knizhnik --- pgxn/neon/libpagestore.c | 36 ++++++++++++++++++++++++++---------- pgxn/neon/pagestore_client.h | 1 + pgxn/neon/pagestore_smgr.c | 8 ++++++++ 3 files changed, 35 insertions(+), 10 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index e31de3c6b5..1bc8a2e87c 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -111,6 +111,7 @@ static PageServer page_servers[MAX_SHARDS]; static bool pageserver_flush(shardno_t shard_no); static void pageserver_disconnect(shardno_t shard_no); +static void pageserver_disconnect_shard(shardno_t shard_no); static bool PagestoreShmemIsValid(void) @@ -487,9 +488,31 @@ retry: return ret; } - +/* + * Reset prefetch and drop connection to the shard. + * It also drops connection to all other shards involved in prefetch. + */ static void pageserver_disconnect(shardno_t shard_no) +{ + if (page_servers[shard_no].conn) + { + /* + * If the connection to any pageserver is lost, we throw away the + * whole prefetch queue, even for other pageservers. It should not + * cause big problems, because connection loss is supposed to be a + * rare event. + */ + prefetch_on_ps_disconnect(); + } + pageserver_disconnect_shard(shard_no); +} + +/* + * Disconnect from specified shard + */ +static void +pageserver_disconnect_shard(shardno_t shard_no) { /* * If anything goes wrong while we were sending a request, it's not clear @@ -503,14 +526,6 @@ pageserver_disconnect(shardno_t shard_no) neon_shard_log(shard_no, LOG, "dropping connection to page server due to error"); PQfinish(page_servers[shard_no].conn); page_servers[shard_no].conn = NULL; - - /* - * If the connection to any pageserver is lost, we throw away the - * whole prefetch queue, even for other pageservers. It should not - * cause big problems, because connection loss is supposed to be a - * rare event. - */ - prefetch_on_ps_disconnect(); } if (page_servers[shard_no].wes != NULL) { @@ -676,7 +691,8 @@ page_server_api api = { .send = pageserver_send, .flush = pageserver_flush, - .receive = pageserver_receive + .receive = pageserver_receive, + .disconnect = pageserver_disconnect_shard }; static bool diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 2889ffacae..44ae766f76 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -180,6 +180,7 @@ typedef struct bool (*send) (shardno_t shard_no, NeonRequest * request); NeonResponse *(*receive) (shardno_t shard_no); bool (*flush) (shardno_t shard_no); + void (*disconnect) (shardno_t shard_no); } page_server_api; extern void prefetch_on_ps_disconnect(void); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 2d222e3c7c..ecc8ddb384 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -613,6 +613,14 @@ prefetch_on_ps_disconnect(void) Assert(slot->status == PRFS_REQUESTED); Assert(slot->my_ring_index == ring_index); + /* + * Drop connection to all shards which have prefetch requests. + * It is not a problem to call disconnect multiple times on the same connection + * because disconnect implementation in libpagestore.c will check if connection + * is alive and do nothing of connection was already dropped. + */ + page_server->disconnect(slot->shard_no); + /* clean up the request */ slot->status = PRFS_TAG_REMAINS; MyPState->n_requests_inflight -= 1;