From 77f9e74d86d62bca524571aa50416680afb1f5ec Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 6 Feb 2025 02:14:29 +0100 Subject: [PATCH] pgxn: include socket send & recv queue size in slow response logs (#10673) # Problem When we see an apparent slow request, one possible cause is that the client is failing to consume responses, but we don't have a clear way to see that. # Solution - Log the socket queue depths on slow/stuck connections, so that we have an indication of whether the compute is keeping up with processing the connection's responses. refs - slack https://neondb.slack.com/archives/C036U0GRMRB/p1738652644396329 - refs https://github.com/neondatabase/cloud/issues/23515 - refs https://github.com/neondatabase/cloud/issues/23486 --- pgxn/neon/libpagestore.c | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 4460e3b40c..22aeb2e2d6 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -36,6 +36,11 @@ #include "pagestore_client.h" #include "walproposer.h" +#ifdef __linux__ +#include <sys/ioctl.h> +#include <linux/sockios.h> +#endif + #define PageStoreTrace DEBUG5 #define MIN_RECONNECT_INTERVAL_USEC 1000 @@ -728,11 +733,36 @@ retry: INSTR_TIME_SUBTRACT(since_last_log, last_log_ts); if (INSTR_TIME_GET_MILLISEC(since_last_log) >= LOG_INTERVAL_MS) { + int sndbuf = -1; + int recvbuf = -1; +#ifdef __linux__ + int socketfd; +#endif + since_start = now; INSTR_TIME_SUBTRACT(since_start, start_ts); - neon_shard_log(shard_no, LOG, "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses)", + +#ifdef __linux__ + /* + * get kernel's send and recv queue size via ioctl + * https://elixir.bootlin.com/linux/v6.1.128/source/include/uapi/linux/sockios.h#L25-L27 + */ + socketfd = PQsocket(pageserver_conn); + if (socketfd != -1) { + int ioctl_err; + ioctl_err = ioctl(socketfd, SIOCOUTQ, &sndbuf); 
if (ioctl_err!= 0) { + sndbuf = -errno; + } + ioctl_err = ioctl(socketfd, FIONREAD, &recvbuf); + if (ioctl_err != 0) { + recvbuf = -errno; + } + } +#endif + neon_shard_log(shard_no, LOG, "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses) (socket sndbuf=%d recvbuf=%d)", INSTR_TIME_GET_DOUBLE(since_start), - shard->nrequests_sent, shard->nresponses_received); + shard->nrequests_sent, shard->nresponses_received, sndbuf, recvbuf); last_log_ts = now; logged = true; }