Drop connections with all shards invoplved in prefetch in case of error (#7249)

## Problem

See https://github.com/neondatabase/cloud/issues/11559

If we have multiple shards, we need to reset connections to all shards
involved in prefetch (having active prefetch requests) if connection
with any of them is lost.

## Summary of changes

In `prefetch_on_ps_disconnect` drop connection to all shards with active
page requests.

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
This commit is contained in:
Konstantin Knizhnik
2024-03-28 08:16:05 +02:00
committed by GitHub
parent 24c5a5ac16
commit 63b2060aef
3 changed files with 35 additions and 10 deletions

View File

@@ -111,6 +111,7 @@ static PageServer page_servers[MAX_SHARDS];
static bool pageserver_flush(shardno_t shard_no);
static void pageserver_disconnect(shardno_t shard_no);
static void pageserver_disconnect_shard(shardno_t shard_no);
static bool
PagestoreShmemIsValid(void)
@@ -487,9 +488,31 @@ retry:
return ret;
}
/*
* Reset prefetch and drop connection to the shard.
* It also drops connection to all other shards involved in prefetch.
*/
static void
pageserver_disconnect(shardno_t shard_no)
{
if (page_servers[shard_no].conn)
{
/*
* If the connection to any pageserver is lost, we throw away the
* whole prefetch queue, even for other pageservers. It should not
* cause big problems, because connection loss is supposed to be a
* rare event.
*/
prefetch_on_ps_disconnect();
}
pageserver_disconnect_shard(shard_no);
}
/*
* Disconnect from specified shard
*/
static void
pageserver_disconnect_shard(shardno_t shard_no)
{
/*
* If anything goes wrong while we were sending a request, it's not clear
@@ -503,14 +526,6 @@ pageserver_disconnect(shardno_t shard_no)
neon_shard_log(shard_no, LOG, "dropping connection to page server due to error");
PQfinish(page_servers[shard_no].conn);
page_servers[shard_no].conn = NULL;
/*
* If the connection to any pageserver is lost, we throw away the
* whole prefetch queue, even for other pageservers. It should not
* cause big problems, because connection loss is supposed to be a
* rare event.
*/
prefetch_on_ps_disconnect();
}
if (page_servers[shard_no].wes != NULL)
{
@@ -676,7 +691,8 @@ page_server_api api =
{
.send = pageserver_send,
.flush = pageserver_flush,
.receive = pageserver_receive
.receive = pageserver_receive,
.disconnect = pageserver_disconnect_shard
};
static bool

View File

@@ -180,6 +180,7 @@ typedef struct
bool (*send) (shardno_t shard_no, NeonRequest * request);
NeonResponse *(*receive) (shardno_t shard_no);
bool (*flush) (shardno_t shard_no);
void (*disconnect) (shardno_t shard_no);
} page_server_api;
extern void prefetch_on_ps_disconnect(void);

View File

@@ -613,6 +613,14 @@ prefetch_on_ps_disconnect(void)
Assert(slot->status == PRFS_REQUESTED);
Assert(slot->my_ring_index == ring_index);
/*
* Drop connection to all shards which have prefetch requests.
* It is not a problem to call disconnect multiple times on the same connection
* because disconnect implementation in libpagestore.c will check if connection
* is alive and do nothing of connection was already dropped.
*/
page_server->disconnect(slot->shard_no);
/* clean up the request */
slot->status = PRFS_TAG_REMAINS;
MyPState->n_requests_inflight -= 1;