From d2825e72ad0c3b925dda69f8988b4da14ed12922 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 9 Apr 2025 14:17:45 +0200 Subject: [PATCH 1/2] Add is_stopping check around critical macro in walreceiver (#11496) The timeline stopping state is set much earlier than the cancellation token is fired, so by checking for the stopping state, we can prevent races with timeline shutdown where we issue a cancellation error but the cancellation token hasn't been fired yet. Fix #11427. --- .../src/tenant/timeline/walreceiver/walreceiver_connection.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index f41a9cfe82..6bf05a0f86 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -445,7 +445,7 @@ pub(super) async fn handle_walreceiver_connection( .inspect_err(|err| { // TODO: we can't differentiate cancellation errors with // anyhow::Error, so just ignore it if we're cancelled. - if !cancellation.is_cancelled() { + if !cancellation.is_cancelled() && !timeline.is_stopping() { critical!("{err:?}") } })?; @@ -577,7 +577,7 @@ pub(super) async fn handle_walreceiver_connection( .inspect_err(|err| { // TODO: we can't differentiate cancellation errors with // anyhow::Error, so just ignore it if we're cancelled. - if !cancellation.is_cancelled() { + if !cancellation.is_cancelled() && !timeline.is_stopping() { critical!("{err:?}") } })?; From ef8101a9be3ce80d104943238a7d608561432189 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 9 Apr 2025 15:28:59 +0300 Subject: [PATCH 2/2] refactor: Split "communicator" routines to a separate source file (#11459) pagestore_smgr.c had grown pretty large. Split into two parts, such that the smgr routines that PostgreSQL code calls stays in pagestore_smgr.c, and all the prefetching logic and other lower-level routines related to communicating with the pageserver are moved to a new source file, "communicator.c". There are plans to replace communicator parts with a new implementation. See https://github.com/neondatabase/neon/pull/10799. This commit doesn't implement any of the new things yet, but it is good preparation for it. I'm imagining that the new implementation will approximately replace the current "communicator.c" code, exposing roughly the same functions to pagestore_smgr.c. This commit doesn't change any functionality or behavior, or make any other changes to the existing code: It just moves existing code around. --- pgxn/neon/Makefile | 1 + pgxn/neon/communicator.c | 2504 ++++++++++++++++++++++++++++++++++ pgxn/neon/communicator.h | 48 + pgxn/neon/neon.c | 3 +- pgxn/neon/neon.h | 1 - pgxn/neon/pagestore_client.h | 5 + pgxn/neon/pagestore_smgr.c | 2459 +-------------------------------- 7 files changed, 2610 insertions(+), 2411 deletions(-) create mode 100644 pgxn/neon/communicator.c create mode 100644 pgxn/neon/communicator.h diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index 8259d24359..426b176af9 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -4,6 +4,7 @@ MODULE_big = neon OBJS = \ $(WIN32RES) \ + communicator.o \ extension_server.o \ file_cache.o \ hll.o \ diff --git a/pgxn/neon/communicator.c b/pgxn/neon/communicator.c new file mode 100644 index 0000000000..932034e22e --- /dev/null +++ b/pgxn/neon/communicator.c @@ -0,0 +1,2504 @@ +/*------------------------------------------------------------------------- + * + * communicator.c + * Functions for communicating with remote pageservers. + * + * This is the so-called "legacy" communicator. It consists of functions that + * are called from the smgr implementation, in pagestore_smgr.c. There are + * plans to replace this with a different implementation, see RFC. + * + * The communicator is a collection of functions that are called in each + * backend, when the backend needs to read a page or other information. It + * does not spawn background threads or anything like that. To process + * responses to prefetch requests in a timely fashion, however, it registers + * a ProcessInterrupts hook that gets called periodically from any + * CHECK_FOR_INTERRUPTS() point in the backend. + * + * By the time the functions in this file are called, the caller has already + * established that a request to the pageserver is necessary. The functions + * are only called for permanent relations (i.e. not temp or unlogged tables). + * Before making a call to the communicator, the caller has already checked + * the relation size or local file cache. + * + * However, when processing responses to getpage requests, the communicator + * writes pages directly to the LFC. + * + * The communicator functions take request LSNs as arguments; the caller is + * responsible for determining the correct LSNs to use. There's one exception + * to that, in prefetch_do_request(); it sometimes calls back to + * neon_get_request_lsns(). That's because sometimes a suitable response is + * found in the prefetch buffer and the request LSns are not needed, and the + * caller doesn't know whether it's needed or not. + * + * The main interface consists of the following "synchronous" calls: + * + * communicator_exists - Returns true if a relation file exists + * communicator_nblocks - Returns a relation's size + * communicator_dbsize - Returns a databases's total size + * communicator_read_at_lsnv - Read contents of one relation block + * communicator_read_slru_segment - Read contents of one SLRU segment + * + * In addition, there functions related to prefetching: + * communicator_prefetch_register_bufferv - Start prefetching a page + * communicator_prefetch_lookupv - Check if a page is already in prefetch queue + * + * Misc other functions: + * - communicator_init - Initialize the module at startup + * - communicator_prefetch_pump_state - Called periodically to advance the state + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/xlog.h" +#include "access/xlogdefs.h" +#include "access/xlog_internal.h" +#include "access/xlogutils.h" +#include "common/hashfn.h" +#include "executor/instrument.h" +#include "libpq/pqformat.h" +#include "miscadmin.h" +#include "port/pg_iovec.h" +#include "postmaster/interrupt.h" +#include "replication/walsender.h" +#include "utils/timeout.h" + +#include "bitmap.h" +#include "communicator.h" +#include "file_cache.h" +#include "neon.h" +#include "neon_perf_counters.h" +#include "pagestore_client.h" + +#if PG_VERSION_NUM >= 150000 +#include "access/xlogrecovery.h" +#endif + +#if PG_VERSION_NUM < 160000 +typedef PGAlignedBlock PGIOAlignedBlock; +#endif + +#define NEON_PANIC_CONNECTION_STATE(shard_no, elvl, message, ...) \ + neon_shard_log(shard_no, elvl, "Broken connection state: " message, \ + ##__VA_ARGS__) + +page_server_api *page_server; + +static uint32 local_request_counter; +#define GENERATE_REQUEST_ID() (((NeonRequestId)MyProcPid << 32) | ++local_request_counter) + +/* + * Various settings related to prompt (fast) handling of PageStream responses + * at any CHECK_FOR_INTERRUPTS point. + */ +int readahead_getpage_pull_timeout_ms = 0; +static int PS_TIMEOUT_ID = 0; +static bool timeout_set = false; +static bool timeout_signaled = false; + +/* + * We have a CHECK_FOR_INTERRUPTS in page_server->receive(), and we don't want + * that to handle any getpage responses if we're already working on the + * backlog of those, as we'd hit issues with determining which prefetch slot + * we just got a response for. + * + * To protect against that, we have this variable that's set whenever we start + * receiving data for prefetch slots, so that we don't get confused. + * + * Note that in certain error cases during readpage we may leak r_r_g=true, + * which results in a failure to pick up further responses until we first + * actively try to receive new getpage responses. + */ +static bool readpage_reentrant_guard = false; + +static void pagestore_timeout_handler(void); + +#define START_PREFETCH_RECEIVE_WORK() \ + do { \ + readpage_reentrant_guard = true; \ + } while (false) + +#define END_PREFETCH_RECEIVE_WORK() \ + do { \ + readpage_reentrant_guard = false; \ + if (unlikely(timeout_signaled && !InterruptPending)) \ + InterruptPending = true; \ + } while (false) + +/* + * Prefetch implementation: + * + * Prefetch is performed locally by each backend. + * + * There can be up to readahead_buffer_size active IO requests registered at + * any time. Requests using smgr_prefetch are sent to the pageserver, but we + * don't wait on the response. Requests using smgr_read are either read from + * the buffer, or (if that's not possible) we wait on the response to arrive - + * this also will allow us to receive other prefetched pages. + * Each request is immediately written to the output buffer of the pageserver + * connection, but may not be flushed if smgr_prefetch is used: pageserver + * flushes sent requests on manual flush, or every neon.flush_output_after + * unflushed requests; which is not necessarily always and all the time. + * + * Once we have received a response, this value will be stored in the response + * buffer, indexed in a hash table. This allows us to retain our buffered + * prefetch responses even when we have cache misses. + * + * Reading of prefetch responses is delayed until them are actually needed + * (smgr_read). In case of prefetch miss or any other SMGR request other than + * smgr_read, all prefetch responses in the pipeline will need to be read from + * the connection; the responses are stored for later use. + * + * NOTE: The current implementation of the prefetch system implements a ring + * buffer of up to readahead_buffer_size requests. If there are more _read and + * _prefetch requests between the initial _prefetch and the _read of a buffer, + * the prefetch request will have been dropped from this prefetch buffer, and + * your prefetch was wasted. + */ + +/* + * State machine: + * + * not in hash : in hash + * : + * UNUSED ------> REQUESTED --> RECEIVED + * ^ : | | + * | : v | + * | : TAG_REMAINS | + * | : | | + * +----------------+------------+ + * : + */ +typedef enum PrefetchStatus +{ + PRFS_UNUSED = 0, /* unused slot */ + PRFS_REQUESTED, /* request was written to the sendbuffer to + * PS, but not necessarily flushed. all fields + * except response valid */ + PRFS_RECEIVED, /* all fields valid */ + PRFS_TAG_REMAINS, /* only buftag and my_ring_index are still + * valid */ +} PrefetchStatus; + +/* must fit in uint8; bits 0x1 are used */ +typedef enum { + PRFSF_NONE = 0x0, + PRFSF_LFC = 0x1 /* received prefetch result is stored in LFC */ +} PrefetchRequestFlags; + +typedef struct PrefetchRequest +{ + BufferTag buftag; /* must be first entry in the struct */ + shardno_t shard_no; + uint8 status; /* see PrefetchStatus for valid values */ + uint8 flags; /* see PrefetchRequestFlags */ + neon_request_lsns request_lsns; + NeonRequestId reqid; + NeonResponse *response; /* may be null */ + uint64 my_ring_index; +} PrefetchRequest; + +/* prefetch buffer lookup hash table */ + +typedef struct PrfHashEntry +{ + PrefetchRequest *slot; + uint32 status; + uint32 hash; +} PrfHashEntry; + +#define SH_PREFIX prfh +#define SH_ELEMENT_TYPE PrfHashEntry +#define SH_KEY_TYPE PrefetchRequest * +#define SH_KEY slot +#define SH_STORE_HASH +#define SH_GET_HASH(tb, a) ((a)->hash) +#define SH_HASH_KEY(tb, key) hash_bytes( \ + ((const unsigned char *) &(key)->buftag), \ + sizeof(BufferTag) \ +) + +#define SH_EQUAL(tb, a, b) (BufferTagsEqual(&(a)->buftag, &(b)->buftag)) +#define SH_SCOPE static inline +#define SH_DEFINE +#define SH_DECLARE +#include "lib/simplehash.h" + +/* + * PrefetchState maintains the state of (prefetch) getPage@LSN requests. + * It maintains a (ring) buffer of in-flight requests and responses. + * + * We maintain several indexes into the ring buffer: + * ring_unused >= ring_flush >= ring_receive >= ring_last >= 0 + * + * ring_unused points to the first unused slot of the buffer + * ring_receive is the next request that is to be received + * ring_last is the oldest received entry in the buffer + * + * Apart from being an entry in the ring buffer of prefetch requests, each + * PrefetchRequest that is not UNUSED is indexed in prf_hash by buftag. + */ +typedef struct PrefetchState +{ + MemoryContext bufctx; /* context for prf_buffer[].response + * allocations */ + MemoryContext errctx; /* context for prf_buffer[].response + * allocations */ + MemoryContext hashctx; /* context for prf_buffer */ + + /* buffer indexes */ + uint64 ring_unused; /* first unused slot */ + uint64 ring_flush; /* next request to flush */ + uint64 ring_receive; /* next slot that is to receive a response */ + uint64 ring_last; /* min slot with a response value */ + + /* metrics / statistics */ + int n_responses_buffered; /* count of PS responses not yet in + * buffers */ + int n_requests_inflight; /* count of PS requests considered in + * flight */ + int n_unused; /* count of buffers < unused, > last, that are + * also unused */ + + /* the buffers */ + prfh_hash *prf_hash; + int max_shard_no; + /* Mark shards involved in prefetch */ + uint8 shard_bitmap[(MAX_SHARDS + 7)/8]; + PrefetchRequest prf_buffer[]; /* prefetch buffers */ +} PrefetchState; + +static PrefetchState *MyPState; + +#define GetPrfSlotNoCheck(ring_index) ( \ + &MyPState->prf_buffer[((ring_index) % readahead_buffer_size)] \ +) + +#define GetPrfSlot(ring_index) ( \ + ( \ + AssertMacro((ring_index) < MyPState->ring_unused && \ + (ring_index) >= MyPState->ring_last), \ + GetPrfSlotNoCheck(ring_index) \ + ) \ +) + +#define ReceiveBufferNeedsCompaction() (\ + (MyPState->n_responses_buffered / 8) < ( \ + MyPState->ring_receive - \ + MyPState->ring_last - \ + MyPState->n_responses_buffered \ + ) \ +) + +static process_interrupts_callback_t prev_interrupt_cb; + +static bool compact_prefetch_buffers(void); +static void consume_prefetch_responses(void); +static uint64 prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, + BlockNumber nblocks, const bits8 *mask, + bool is_prefetch); +static bool prefetch_read(PrefetchRequest *slot); +static void prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns); +static bool prefetch_wait_for(uint64 ring_index); +static void prefetch_cleanup_trailing_unused(void); +static inline void prefetch_set_unused(uint64 ring_index); + +static bool neon_prefetch_response_usable(neon_request_lsns *request_lsns, + PrefetchRequest *slot); +static bool communicator_processinterrupts(void); + +void +pg_init_communicator(void) +{ + prev_interrupt_cb = ProcessInterruptsCallback; + ProcessInterruptsCallback = communicator_processinterrupts; +} + +static bool +compact_prefetch_buffers(void) +{ + uint64 empty_ring_index = MyPState->ring_last; + uint64 search_ring_index = MyPState->ring_receive; + int n_moved = 0; + + if (MyPState->ring_receive == MyPState->ring_last) + return false; + + while (search_ring_index > MyPState->ring_last) + { + search_ring_index--; + if (GetPrfSlot(search_ring_index)->status == PRFS_UNUSED) + { + empty_ring_index = search_ring_index; + break; + } + } + + /* + * Here we have established: slots < search_ring_index have an unknown + * state (not scanned) slots >= search_ring_index and <= empty_ring_index + * are unused slots > empty_ring_index are in use, or outside our buffer's + * range. ... unless search_ring_index <= ring_last + * + * Therefore, there is a gap of at least one unused items between + * search_ring_index and empty_ring_index (both inclusive), which grows as + * we hit more unused items while moving backwards through the array. + */ + + while (search_ring_index > MyPState->ring_last) + { + PrefetchRequest *source_slot; + PrefetchRequest *target_slot; + bool found; + + /* update search index to an unprocessed entry */ + search_ring_index--; + + source_slot = GetPrfSlot(search_ring_index); + + if (source_slot->status == PRFS_UNUSED) + continue; + + /* slot is used -- start moving slot */ + target_slot = GetPrfSlot(empty_ring_index); + + Assert(source_slot->status == PRFS_RECEIVED); + Assert(target_slot->status == PRFS_UNUSED); + + target_slot->buftag = source_slot->buftag; + target_slot->shard_no = source_slot->shard_no; + target_slot->status = source_slot->status; + target_slot->flags = source_slot->flags; + target_slot->response = source_slot->response; + target_slot->reqid = source_slot->reqid; + target_slot->request_lsns = source_slot->request_lsns; + target_slot->my_ring_index = empty_ring_index; + + prfh_delete(MyPState->prf_hash, source_slot); + prfh_insert(MyPState->prf_hash, target_slot, &found); + + Assert(!found); + + /* Adjust the location of our known-empty slot */ + empty_ring_index--; + + /* empty the moved slot */ + source_slot->status = PRFS_UNUSED; + source_slot->buftag = (BufferTag) + { + 0 + }; + source_slot->response = NULL; + source_slot->my_ring_index = 0; + source_slot->request_lsns = (neon_request_lsns) { + InvalidXLogRecPtr, InvalidXLogRecPtr, InvalidXLogRecPtr + }; + + /* update bookkeeping */ + n_moved++; + } + + /* + * Only when we've moved slots we can expect trailing unused slots, so + * only then we clean up trailing unused slots. + */ + if (n_moved > 0) + { + prefetch_cleanup_trailing_unused(); + return true; + } + + return false; +} + +/* + * If there might be responses still in the TCP buffer, then we should try to + * use those, to reduce any TCP backpressure on the OS/PS side. + * + * This procedure handles that. + * + * Note that this works because we don't pipeline non-getPage requests. + * + * NOTE: This procedure is not allowed to throw errors that should be handled + * by SMGR-related code, as this can be called from every CHECK_FOR_INTERRUPTS + * point inside and outside PostgreSQL. + * + * This still does throw errors when it receives malformed responses from PS. + * + * When we're not called from CHECK_FOR_INTERRUPTS (indicated by + * IsHandlingInterrupts) we also report we've ended prefetch receive work, + * just in case state tracking was lost due to an error in the sync getPage + * response code. + */ +void +communicator_prefetch_pump_state(bool IsHandlingInterrupts) +{ + while (MyPState->ring_receive != MyPState->ring_flush) + { + NeonResponse *response; + PrefetchRequest *slot; + MemoryContext old; + + slot = GetPrfSlot(MyPState->ring_receive); + + old = MemoryContextSwitchTo(MyPState->errctx); + response = page_server->try_receive(slot->shard_no); + MemoryContextSwitchTo(old); + + if (response == NULL) + break; + + /* The slot should still be valid */ + if (slot->status != PRFS_REQUESTED || + slot->response != NULL || + slot->my_ring_index != MyPState->ring_receive) + neon_shard_log(slot->shard_no, ERROR, + "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu", + slot->status, slot->response, + (long) slot->my_ring_index, (long) MyPState->ring_receive); + + /* update prefetch state */ + MyPState->n_responses_buffered += 1; + MyPState->n_requests_inflight -= 1; + MyPState->ring_receive += 1; + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; + + /* update slot state */ + slot->status = PRFS_RECEIVED; + slot->response = response; + + if (response->tag == T_NeonGetPageResponse && !(slot->flags & PRFSF_LFC) && lfc_store_prefetch_result) + { + /* + * Store prefetched result in LFC (please read comments to lfc_prefetch + * explaining why it can be done without holding shared buffer lock + */ + if (lfc_prefetch(BufTagGetNRelFileInfo(slot->buftag), slot->buftag.forkNum, slot->buftag.blockNum, ((NeonGetPageResponse*)response)->page, slot->request_lsns.not_modified_since)) + { + slot->flags |= PRFSF_LFC; + } + } + } + + /* We never pump the prefetch state while handling other pages */ + if (!IsHandlingInterrupts) + END_PREFETCH_RECEIVE_WORK(); + + communicator_reconfigure_timeout_if_needed(); +} + +void +readahead_buffer_resize(int newsize, void *extra) +{ + uint64 end, + nfree = newsize; + PrefetchState *newPState; + Size newprfs_size = offsetof(PrefetchState, prf_buffer) + + (sizeof(PrefetchRequest) * newsize); + + /* don't try to re-initialize if we haven't initialized yet */ + if (MyPState == NULL) + return; + + /* + * Make sure that we don't lose track of active prefetch requests by + * ensuring we have received all but the last n requests (n = newsize). + */ + if (MyPState->n_requests_inflight > newsize) + { + prefetch_wait_for(MyPState->ring_unused - newsize - 1); + Assert(MyPState->n_requests_inflight <= newsize); + } + + /* construct the new PrefetchState, and copy over the memory contexts */ + newPState = MemoryContextAllocZero(TopMemoryContext, newprfs_size); + + newPState->bufctx = MyPState->bufctx; + newPState->errctx = MyPState->errctx; + newPState->hashctx = MyPState->hashctx; + newPState->prf_hash = prfh_create(MyPState->hashctx, newsize, NULL); + newPState->n_unused = newsize; + newPState->n_requests_inflight = 0; + newPState->n_responses_buffered = 0; + newPState->ring_last = newsize; + newPState->ring_unused = newsize; + newPState->ring_receive = newsize; + newPState->max_shard_no = MyPState->max_shard_no; + memcpy(newPState->shard_bitmap, MyPState->shard_bitmap, sizeof(MyPState->shard_bitmap)); + + /* + * Copy over the prefetches. + * + * We populate the prefetch array from the end; to retain the most recent + * prefetches, but this has the benefit of only needing to do one + * iteration on the dataset, and trivial compaction. + */ + for (end = MyPState->ring_unused - 1; + end >= MyPState->ring_last && end != UINT64_MAX && nfree != 0; + end -= 1) + { + PrefetchRequest *slot = GetPrfSlot(end); + PrefetchRequest *newslot; + bool found; + + if (slot->status == PRFS_UNUSED) + continue; + + nfree -= 1; + + newslot = &newPState->prf_buffer[nfree]; + *newslot = *slot; + newslot->my_ring_index = nfree; + + prfh_insert(newPState->prf_hash, newslot, &found); + + Assert(!found); + + switch (newslot->status) + { + case PRFS_UNUSED: + pg_unreachable(); + case PRFS_REQUESTED: + newPState->n_requests_inflight += 1; + newPState->ring_receive -= 1; + newPState->ring_last -= 1; + break; + case PRFS_RECEIVED: + newPState->n_responses_buffered += 1; + newPState->ring_last -= 1; + break; + case PRFS_TAG_REMAINS: + newPState->ring_last -= 1; + break; + } + newPState->n_unused -= 1; + } + newPState->ring_flush = newPState->ring_receive; + + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; + MyNeonCounters->pageserver_open_requests = + MyPState->n_requests_inflight; + + for (; end >= MyPState->ring_last && end != UINT64_MAX; end -= 1) + { + PrefetchRequest *slot = GetPrfSlot(end); + Assert(slot->status != PRFS_REQUESTED); + if (slot->status == PRFS_RECEIVED) + { + pfree(slot->response); + } + } + + prfh_destroy(MyPState->prf_hash); + pfree(MyPState); + MyPState = newPState; +} + + + +/* + * Make sure that there are no responses still in the buffer. + * + * This function may indirectly update MyPState->pfs_hash; which invalidates + * any active pointers into the hash table. + */ +static void +consume_prefetch_responses(void) +{ + if (MyPState->ring_receive < MyPState->ring_unused) + prefetch_wait_for(MyPState->ring_unused - 1); +} + +static void +prefetch_cleanup_trailing_unused(void) +{ + uint64 ring_index; + PrefetchRequest *slot; + + while (MyPState->ring_last < MyPState->ring_receive) + { + ring_index = MyPState->ring_last; + slot = GetPrfSlot(ring_index); + + if (slot->status == PRFS_UNUSED) + MyPState->ring_last += 1; + else + break; + } +} + + +static bool +prefetch_flush_requests(void) +{ + for (shardno_t shard_no = 0; shard_no < MyPState->max_shard_no; shard_no++) + { + if (BITMAP_ISSET(MyPState->shard_bitmap, shard_no)) + { + if (!page_server->flush(shard_no)) + return false; + BITMAP_CLR(MyPState->shard_bitmap, shard_no); + } + } + MyPState->max_shard_no = 0; + return true; +} + +/* + * Wait for slot of ring_index to have received its response. + * The caller is responsible for making sure the request buffer is flushed. + * + * NOTE: this function may indirectly update MyPState->pfs_hash; which + * invalidates any active pointers into the hash table. + * NOTE: callers should make sure they can handle query cancellations in this + * function's call path. + */ +static bool +prefetch_wait_for(uint64 ring_index) +{ + PrefetchRequest *entry; + bool result = true; + + if (MyPState->ring_flush <= ring_index && + MyPState->ring_unused > MyPState->ring_flush) + { + if (!prefetch_flush_requests()) + return false; + MyPState->ring_flush = MyPState->ring_unused; + } + + Assert(MyPState->ring_unused > ring_index); + + while (MyPState->ring_receive <= ring_index) + { + START_PREFETCH_RECEIVE_WORK(); + entry = GetPrfSlot(MyPState->ring_receive); + + Assert(entry->status == PRFS_REQUESTED); + if (!prefetch_read(entry)) + { + result = false; + break; + } + + END_PREFETCH_RECEIVE_WORK(); + CHECK_FOR_INTERRUPTS(); + } + + return result; +} + +/* + * Read the response of a prefetch request into its slot. + * + * The caller is responsible for making sure that the request for this buffer + * was flushed to the PageServer. + * + * NOTE: this function may indirectly update MyPState->pfs_hash; which + * invalidates any active pointers into the hash table. + * + * NOTE: this does IO, and can get canceled out-of-line. + */ +static bool +prefetch_read(PrefetchRequest *slot) +{ + NeonResponse *response; + MemoryContext old; + BufferTag buftag; + shardno_t shard_no; + uint64 my_ring_index; + + Assert(slot->status == PRFS_REQUESTED); + Assert(slot->response == NULL); + Assert(slot->my_ring_index == MyPState->ring_receive); + + if (slot->status != PRFS_REQUESTED || + slot->response != NULL || + slot->my_ring_index != MyPState->ring_receive) + neon_shard_log(slot->shard_no, ERROR, + "Incorrect prefetch read: status=%d response=%p my=%lu receive=%lu", + slot->status, slot->response, + (long)slot->my_ring_index, (long)MyPState->ring_receive); + + /* + * Copy the request info so that if an error happens and the prefetch + * queue is flushed during the receive call, we can print the original + * values in the error message + */ + buftag = slot->buftag; + shard_no = slot->shard_no; + my_ring_index = slot->my_ring_index; + + old = MemoryContextSwitchTo(MyPState->errctx); + response = (NeonResponse *) page_server->receive(shard_no); + MemoryContextSwitchTo(old); + if (response) + { + /* The slot should still be valid */ + if (slot->status != PRFS_REQUESTED || + slot->response != NULL || + slot->my_ring_index != MyPState->ring_receive) + neon_shard_log(shard_no, ERROR, + "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu", + slot->status, slot->response, + (long) slot->my_ring_index, (long) MyPState->ring_receive); + + /* update prefetch state */ + MyPState->n_responses_buffered += 1; + MyPState->n_requests_inflight -= 1; + MyPState->ring_receive += 1; + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; + + /* update slot state */ + slot->status = PRFS_RECEIVED; + slot->response = response; + + if (response->tag == T_NeonGetPageResponse && !(slot->flags & PRFSF_LFC) && lfc_store_prefetch_result) + { + /* + * Store prefetched result in LFC (please read comments to lfc_prefetch + * explaining why it can be done without holding shared buffer lock + */ + if (lfc_prefetch(BufTagGetNRelFileInfo(buftag), buftag.forkNum, buftag.blockNum, ((NeonGetPageResponse*)response)->page, slot->request_lsns.not_modified_since)) + { + slot->flags |= PRFSF_LFC; + } + } + return true; + } + else + { + /* + * Note: The slot might no longer be valid, if the connection was lost + * and the prefetch queue was flushed during the receive call + */ + neon_shard_log(shard_no, LOG, + "No response from reading prefetch entry %lu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect", + (long) my_ring_index, + RelFileInfoFmt(BufTagGetNRelFileInfo(buftag)), + buftag.forkNum, buftag.blockNum); + return false; + } +} + +/* + * Disconnect hook - drop prefetches when the connection drops + * + * If we don't remove the failed prefetches, we'd be serving incorrect + * data to the smgr. + */ +void +prefetch_on_ps_disconnect(void) +{ + MyPState->ring_flush = MyPState->ring_unused; + + while (MyPState->ring_receive < MyPState->ring_unused) + { + PrefetchRequest *slot; + uint64 ring_index = MyPState->ring_receive; + + slot = GetPrfSlot(ring_index); + + Assert(slot->status == PRFS_REQUESTED); + Assert(slot->my_ring_index == ring_index); + + /* + * Drop connection to all shards which have prefetch requests. + * It is not a problem to call disconnect multiple times on the same connection + * because disconnect implementation in libpagestore.c will check if connection + * is alive and do nothing of connection was already dropped. + */ + page_server->disconnect(slot->shard_no); + + /* clean up the request */ + slot->status = PRFS_TAG_REMAINS; + MyPState->n_requests_inflight -= 1; + MyPState->ring_receive += 1; + + prefetch_set_unused(ring_index); + pgBufferUsage.prefetch.expired += 1; + MyNeonCounters->getpage_prefetch_discards_total += 1; + } + + /* + * We can have gone into retry due to network error, so update stats with + * the latest available + */ + MyNeonCounters->pageserver_open_requests = + MyPState->n_requests_inflight; + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; +} + +/* + * prefetch_set_unused() - clear a received prefetch slot + * + * The slot at ring_index must be a current member of the ring buffer, + * and may not be in the PRFS_REQUESTED state. + * + * NOTE: this function will update MyPState->pfs_hash; which invalidates any + * active pointers into the hash table. + */ +static inline void +prefetch_set_unused(uint64 ring_index) +{ + PrefetchRequest *slot; + + if (ring_index < MyPState->ring_last) + return; /* Should already be unused */ + + slot = GetPrfSlot(ring_index); + if (slot->status == PRFS_UNUSED) + return; + + Assert(slot->status == PRFS_RECEIVED || slot->status == PRFS_TAG_REMAINS); + + if (slot->status == PRFS_RECEIVED) + { + pfree(slot->response); + slot->response = NULL; + + MyPState->n_responses_buffered -= 1; + MyPState->n_unused += 1; + + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; + } + else + { + Assert(slot->response == NULL); + } + + prfh_delete(MyPState->prf_hash, slot); + + /* clear all fields */ + MemSet(slot, 0, sizeof(PrefetchRequest)); + slot->status = PRFS_UNUSED; + + /* run cleanup if we're holding back ring_last */ + if (MyPState->ring_last == ring_index) + prefetch_cleanup_trailing_unused(); + + /* + * ... and try to store the buffered responses more compactly if > 12.5% + * of the buffer is gaps + */ + else if (ReceiveBufferNeedsCompaction()) + compact_prefetch_buffers(); +} + +/* + * Send one prefetch request to the pageserver. To wait for the response, call + * prefetch_wait_for(). + */ +static void +prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns) +{ + bool found; + uint64 mySlotNo PG_USED_FOR_ASSERTS_ONLY = slot->my_ring_index; + + NeonGetPageRequest request = { + .hdr.tag = T_NeonGetPageRequest, + .hdr.reqid = GENERATE_REQUEST_ID(), + /* lsn and not_modified_since are filled in below */ + .rinfo = BufTagGetNRelFileInfo(slot->buftag), + .forknum = slot->buftag.forkNum, + .blkno = slot->buftag.blockNum, + }; + + Assert(mySlotNo == MyPState->ring_unused); + + slot->reqid = request.hdr.reqid; + + if (force_request_lsns) + slot->request_lsns = *force_request_lsns; + else + neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag), + slot->buftag.forkNum, slot->buftag.blockNum, + &slot->request_lsns, 1); + request.hdr.lsn = slot->request_lsns.request_lsn; + request.hdr.not_modified_since = slot->request_lsns.not_modified_since; + + Assert(slot->response == NULL); + Assert(slot->my_ring_index == MyPState->ring_unused); + + while (!page_server->send(slot->shard_no, (NeonRequest *) &request)) + { + Assert(mySlotNo == MyPState->ring_unused); + /* loop */ + } + + /* update prefetch state */ + MyPState->n_requests_inflight += 1; + MyPState->n_unused -= 1; + MyPState->ring_unused += 1; + BITMAP_SET(MyPState->shard_bitmap, slot->shard_no); + MyPState->max_shard_no = Max(slot->shard_no+1, MyPState->max_shard_no); + + /* update slot state */ + slot->status = PRFS_REQUESTED; + prfh_insert(MyPState->prf_hash, slot, &found); + Assert(!found); +} + +/* + * Lookup of already received prefetch requests. Only already received responses matching required LSNs are accepted. + * Present pages are marked in "mask" bitmap and total number of such pages is returned. + */ +int +communicator_prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum, + neon_request_lsns *lsns, BlockNumber nblocks, + void **buffers, bits8 *mask) +{ + int hits = 0; + PrefetchRequest hashkey; + + /* + * Use an intermediate PrefetchRequest struct as the hash key to ensure + * correct alignment and that the padding bytes are cleared. + */ + memset(&hashkey.buftag, 0, sizeof(BufferTag)); + CopyNRelFileInfoToBufTag(hashkey.buftag, rinfo); + hashkey.buftag.forkNum = forknum; + + for (int i = 0; i < nblocks; i++) + { + PrfHashEntry *entry; + + hashkey.buftag.blockNum = blocknum + i; + entry = prfh_lookup(MyPState->prf_hash, &hashkey); + + if (entry != NULL) + { + PrefetchRequest *slot = entry->slot; + uint64 ring_index = slot->my_ring_index; + Assert(slot == GetPrfSlot(ring_index)); + + Assert(slot->status != PRFS_UNUSED); + Assert(MyPState->ring_last <= ring_index && + ring_index < MyPState->ring_unused); + Assert(BufferTagsEqual(&slot->buftag, &hashkey.buftag)); + + if (slot->status != PRFS_RECEIVED) + continue; + + /* + * If the caller specified a request LSN to use, only accept + * prefetch responses that satisfy that request. + */ + if (!neon_prefetch_response_usable(&lsns[i], slot)) + continue; + + /* + * Ignore errors + */ + if (slot->response->tag != T_NeonGetPageResponse) + { + if (slot->response->tag != T_NeonErrorResponse) + { + NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC, + "Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x", + T_NeonGetPageResponse, T_NeonErrorResponse, slot->response->tag); + } + continue; + } + memcpy(buffers[i], ((NeonGetPageResponse*)slot->response)->page, BLCKSZ); + + + /* + * With lfc_store_prefetch_result=true prefetch result is stored in LFC in prefetch_pump_state when response is received + * from page server. But if lfc_store_prefetch_result=false then it is not yet stored in LFC and we have to do it here + * under buffer lock. + */ + if (!lfc_store_prefetch_result) + lfc_write(rinfo, forknum, blocknum + i, buffers[i]); + + prefetch_set_unused(ring_index); + BITMAP_SET(mask, i); + + hits += 1; + inc_getpage_wait(0); + } + } + pgBufferUsage.prefetch.hits += hits; + return hits; +} + +/* + * prefetch_register_bufferv() - register and prefetch buffers + * + * Register that we may want the contents of BufferTag in the near future. + * This is used when issuing a speculative prefetch request, but also when + * performing a synchronous request and need the buffer right now. + * + * If force_request_lsns is not NULL, those values are sent to the + * pageserver. If NULL, we utilize the lastWrittenLsn -infrastructure + * to calculate the LSNs to send. + * + * Bits set in *mask (if present) indicate pages already read; i.e. pages we + * can skip in this process. + * + * When performing a prefetch rather than a synchronous request, + * is_prefetch==true. Currently, it only affects how the request is accounted + * in the perf counters. + * + * NOTE: this function may indirectly update MyPState->pfs_hash; which + * invalidates any active pointers into the hash table. + */ +void +communicator_prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, + BlockNumber nblocks, const bits8 *mask) +{ + uint64 ring_index PG_USED_FOR_ASSERTS_ONLY; + + ring_index = prefetch_register_bufferv(tag, frlsns, nblocks, mask, true); + + Assert(ring_index < MyPState->ring_unused && + MyPState->ring_last <= ring_index); +} + +/* internal version. Returns the ring index */ +static uint64 +prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, + BlockNumber nblocks, const bits8 *mask, + bool is_prefetch) +{ + uint64 min_ring_index; + PrefetchRequest hashkey; +#ifdef USE_ASSERT_CHECKING + bool any_hits = false; +#endif + /* We will never read further ahead than our buffer can store. */ + nblocks = Max(1, Min(nblocks, readahead_buffer_size)); + + /* + * Use an intermediate PrefetchRequest struct as the hash key to ensure + * correct alignment and that the padding bytes are cleared. + */ + memset(&hashkey.buftag, 0, sizeof(BufferTag)); + hashkey.buftag = tag; + +Retry: + /* + * We can have gone into retry due to network error, so update stats with + * the latest available + */ + MyNeonCounters->pageserver_open_requests = + MyPState->ring_unused - MyPState->ring_receive; + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; + + min_ring_index = UINT64_MAX; + for (int i = 0; i < nblocks; i++) + { + PrefetchRequest *slot = NULL; + PrfHashEntry *entry = NULL; + uint64 ring_index; + neon_request_lsns *lsns; + + if (PointerIsValid(mask) && BITMAP_ISSET(mask, i)) + continue; + + if (frlsns) + lsns = &frlsns[i]; + else + lsns = NULL; + +#ifdef USE_ASSERT_CHECKING + any_hits = true; +#endif + + slot = NULL; + entry = NULL; + + hashkey.buftag.blockNum = tag.blockNum + i; + entry = prfh_lookup(MyPState->prf_hash, &hashkey); + + if (entry != NULL) + { + slot = entry->slot; + ring_index = slot->my_ring_index; + Assert(slot == GetPrfSlot(ring_index)); + + Assert(slot->status != PRFS_UNUSED); + Assert(MyPState->ring_last <= ring_index && + ring_index < MyPState->ring_unused); + Assert(BufferTagsEqual(&slot->buftag, &hashkey.buftag)); + + /* + * If the caller specified a request LSN to use, only accept + * prefetch responses that satisfy that request. + */ + if (!is_prefetch) + { + if (!neon_prefetch_response_usable(lsns, slot)) + { + /* Wait for the old request to finish and discard it */ + if (!prefetch_wait_for(ring_index)) + goto Retry; + prefetch_set_unused(ring_index); + entry = NULL; + slot = NULL; + pgBufferUsage.prefetch.expired += 1; + MyNeonCounters->getpage_prefetch_discards_total += 1; + } + } + + if (entry != NULL) + { + /* + * We received a prefetch for a page that was recently read + * and removed from the buffers. Remove that request from the + * buffers. + */ + if (slot->status == PRFS_TAG_REMAINS) + { + prefetch_set_unused(ring_index); + entry = NULL; + slot = NULL; + } + else + { + min_ring_index = Min(min_ring_index, ring_index); + /* The buffered request is good enough, return that index */ + if (is_prefetch) + pgBufferUsage.prefetch.duplicates++; + continue; + } + } + } + else if (!is_prefetch) + { + pgBufferUsage.prefetch.misses += 1; + MyNeonCounters->getpage_prefetch_misses_total++; + } + /* + * We can only leave the block above by finding that there's + * no entry that can satisfy this request, either because there + * was no entry, or because the entry was invalid or didn't satisfy + * the LSNs provided. + * + * The code should've made sure to clear up the data. + */ + Assert(entry == NULL); + Assert(slot == NULL); + + /* There should be no buffer overflow */ + Assert(MyPState->ring_last + readahead_buffer_size >= MyPState->ring_unused); + + /* + * If the prefetch queue is full, we need to make room by clearing the + * oldest slot. If the oldest slot holds a buffer that was already + * received, we can just throw it away; we fetched the page + * unnecessarily in that case. If the oldest slot holds a request that + * we haven't received a response for yet, we have to wait for the + * response to that before we can continue. We might not have even + * flushed the request to the pageserver yet, it might be just sitting + * in the output buffer. In that case, we flush it and wait for the + * response. (We could decide not to send it, but it's hard to abort + * when the request is already in the output buffer, and 'not sending' + * a prefetch request kind of goes against the principles of + * prefetching) + */ + if (MyPState->ring_last + readahead_buffer_size == MyPState->ring_unused) + { + uint64 cleanup_index = MyPState->ring_last; + + slot = GetPrfSlot(cleanup_index); + + Assert(slot->status != PRFS_UNUSED); + + /* + * If there is good reason to run compaction on the prefetch buffers, + * try to do that. + */ + if (ReceiveBufferNeedsCompaction() && compact_prefetch_buffers()) + { + Assert(slot->status == PRFS_UNUSED); + } + else + { + /* + * We have the slot for ring_last, so that must still be in + * progress + */ + switch (slot->status) + { + case PRFS_REQUESTED: + Assert(MyPState->ring_receive == cleanup_index); + if (!prefetch_wait_for(cleanup_index)) + goto Retry; + prefetch_set_unused(cleanup_index); + pgBufferUsage.prefetch.expired += 1; + MyNeonCounters->getpage_prefetch_discards_total += 1; + break; + case PRFS_RECEIVED: + case PRFS_TAG_REMAINS: + prefetch_set_unused(cleanup_index); + pgBufferUsage.prefetch.expired += 1; + MyNeonCounters->getpage_prefetch_discards_total += 1; + break; + default: + pg_unreachable(); + } + } + } + + /* + * The next buffer pointed to by `ring_unused` is now definitely empty, so + * we can insert the new request to it. + */ + ring_index = MyPState->ring_unused; + + Assert(MyPState->ring_last <= ring_index && + ring_index <= MyPState->ring_unused); + + slot = GetPrfSlotNoCheck(ring_index); + + Assert(slot->status == PRFS_UNUSED); + + /* + * We must update the slot data before insertion, because the hash + * function reads the buffer tag from the slot. + */ + slot->buftag = hashkey.buftag; + slot->shard_no = get_shard_number(&tag); + slot->my_ring_index = ring_index; + slot->flags = 0; + + min_ring_index = Min(min_ring_index, ring_index); + + if (is_prefetch) + MyNeonCounters->getpage_prefetch_requests_total++; + else + MyNeonCounters->getpage_sync_requests_total++; + + prefetch_do_request(slot, lsns); + } + + MyNeonCounters->pageserver_open_requests = + MyPState->ring_unused - MyPState->ring_receive; + + Assert(any_hits); + + Assert(GetPrfSlot(min_ring_index)->status == PRFS_REQUESTED || + GetPrfSlot(min_ring_index)->status == PRFS_RECEIVED); + Assert(MyPState->ring_last <= min_ring_index && + min_ring_index < MyPState->ring_unused); + + if (flush_every_n_requests > 0 && + MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests) + { + if (!prefetch_flush_requests()) + { + /* + * Prefetch set is reset in case of error, so we should try to + * register our request once again + */ + goto Retry; + } + MyPState->ring_flush = MyPState->ring_unused; + } + + return min_ring_index; +} + +static bool +equal_requests(NeonRequest* a, NeonRequest* b) +{ + return a->reqid == b->reqid && a->lsn == b->lsn && a->not_modified_since == b->not_modified_since; +} + + +/* + * Note: this function can get canceled and use a long jump to the next catch + * context. Take care. + */ +static NeonResponse * +page_server_request(void const *req) +{ + NeonResponse *resp; + BufferTag tag = {0}; + shardno_t shard_no; + + switch (messageTag(req)) + { + case T_NeonExistsRequest: + CopyNRelFileInfoToBufTag(tag, ((NeonExistsRequest *) req)->rinfo); + break; + case T_NeonNblocksRequest: + CopyNRelFileInfoToBufTag(tag, ((NeonNblocksRequest *) req)->rinfo); + break; + case T_NeonDbSizeRequest: + NInfoGetDbOid(BufTagGetNRelFileInfo(tag)) = ((NeonDbSizeRequest *) req)->dbNode; + break; + case T_NeonGetPageRequest: + CopyNRelFileInfoToBufTag(tag, ((NeonGetPageRequest *) req)->rinfo); + tag.blockNum = ((NeonGetPageRequest *) req)->blkno; + break; + default: + neon_log(ERROR, "Unexpected request tag: %d", messageTag(req)); + } + shard_no = get_shard_number(&tag); + + /* + * Current sharding model assumes that all metadata is present only at shard 0. + * We still need to call get_shard_no() to check if shard map is up-to-date. + */ + if (((NeonRequest *) req)->tag != T_NeonGetPageRequest) + { + shard_no = 0; + } + + do + { + PG_TRY(); + { + while (!page_server->send(shard_no, (NeonRequest *) req) + || !page_server->flush(shard_no)) + { + /* do nothing */ + } + MyNeonCounters->pageserver_open_requests++; + consume_prefetch_responses(); + resp = page_server->receive(shard_no); + MyNeonCounters->pageserver_open_requests--; + } + PG_CATCH(); + { + /* + * Cancellation in this code needs to be handled better at some + * point, but this currently seems fine for now. + */ + page_server->disconnect(shard_no); + MyNeonCounters->pageserver_open_requests = 0; + + /* + * We know for sure we're not working on any prefetch pages after + * this. + */ + END_PREFETCH_RECEIVE_WORK(); + + PG_RE_THROW(); + } + PG_END_TRY(); + + } while (resp == NULL); + + return resp; +} + + +StringInfoData +nm_pack_request(NeonRequest *msg) +{ + StringInfoData s; + + initStringInfo(&s); + + pq_sendbyte(&s, msg->tag); + if (neon_protocol_version >= 3) + { + pq_sendint64(&s, msg->reqid); + } + pq_sendint64(&s, msg->lsn); + pq_sendint64(&s, msg->not_modified_since); + + switch (messageTag(msg)) + { + /* pagestore_client -> pagestore */ + case T_NeonExistsRequest: + { + NeonExistsRequest *msg_req = (NeonExistsRequest *) msg; + + pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); + pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); + pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); + pq_sendbyte(&s, msg_req->forknum); + + break; + } + case T_NeonNblocksRequest: + { + NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg; + + pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); + pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); + pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); + pq_sendbyte(&s, msg_req->forknum); + + break; + } + case T_NeonDbSizeRequest: + { + NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg; + + pq_sendint32(&s, msg_req->dbNode); + + break; + } + case T_NeonGetPageRequest: + { + NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg; + + pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); + pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); + pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); + pq_sendbyte(&s, msg_req->forknum); + pq_sendint32(&s, msg_req->blkno); + + break; + } + + case T_NeonGetSlruSegmentRequest: + { + NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg; + + pq_sendbyte(&s, msg_req->kind); + pq_sendint32(&s, msg_req->segno); + + break; + } + + /* pagestore -> pagestore_client. We never need to create these. */ + case T_NeonExistsResponse: + case T_NeonNblocksResponse: + case T_NeonGetPageResponse: + case T_NeonErrorResponse: + case T_NeonDbSizeResponse: + case T_NeonGetSlruSegmentResponse: + default: + neon_log(ERROR, "unexpected neon message tag 0x%02x", msg->tag); + break; + } + return s; +} + +NeonResponse * +nm_unpack_response(StringInfo s) +{ + NeonMessageTag tag = pq_getmsgbyte(s); + NeonResponse resp_hdr = {0}; /* make valgrind happy */ + NeonResponse *resp = NULL; + + resp_hdr.tag = tag; + if (neon_protocol_version >= 3) + { + resp_hdr.reqid = pq_getmsgint64(s); + resp_hdr.lsn = pq_getmsgint64(s); + resp_hdr.not_modified_since = pq_getmsgint64(s); + } + switch (tag) + { + /* pagestore -> pagestore_client */ + case T_NeonExistsResponse: + { + NeonExistsResponse *msg_resp = palloc0(sizeof(NeonExistsResponse)); + + if (neon_protocol_version >= 3) + { + NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + msg_resp->req.forknum = pq_getmsgbyte(s); + } + msg_resp->req.hdr = resp_hdr; + msg_resp->exists = pq_getmsgbyte(s); + pq_getmsgend(s); + + resp = (NeonResponse *) msg_resp; + break; + } + + case T_NeonNblocksResponse: + { + NeonNblocksResponse *msg_resp = palloc0(sizeof(NeonNblocksResponse)); + + if (neon_protocol_version >= 3) + { + NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + msg_resp->req.forknum = pq_getmsgbyte(s); + } + msg_resp->req.hdr = resp_hdr; + msg_resp->n_blocks = pq_getmsgint(s, 4); + pq_getmsgend(s); + + resp = (NeonResponse *) msg_resp; + break; + } + + case T_NeonGetPageResponse: + { + NeonGetPageResponse *msg_resp; + + msg_resp = MemoryContextAllocZero(MyPState->bufctx, PS_GETPAGERESPONSE_SIZE); + if (neon_protocol_version >= 3) + { + NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + msg_resp->req.forknum = pq_getmsgbyte(s); + msg_resp->req.blkno = pq_getmsgint(s, 4); + } + msg_resp->req.hdr = resp_hdr; + /* XXX: should be varlena */ + memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ); + pq_getmsgend(s); + + Assert(msg_resp->req.hdr.tag == T_NeonGetPageResponse); + + resp = (NeonResponse *) msg_resp; + break; + } + + case T_NeonDbSizeResponse: + { + NeonDbSizeResponse *msg_resp = palloc0(sizeof(NeonDbSizeResponse)); + + if (neon_protocol_version >= 3) + { + msg_resp->req.dbNode = pq_getmsgint(s, 4); + } + msg_resp->req.hdr = resp_hdr; + msg_resp->db_size = pq_getmsgint64(s); + pq_getmsgend(s); + + resp = (NeonResponse *) msg_resp; + break; + } + + case T_NeonErrorResponse: + { + NeonErrorResponse *msg_resp; + size_t msglen; + const char *msgtext; + + msgtext = pq_getmsgrawstring(s); + msglen = strlen(msgtext); + + msg_resp = palloc0(sizeof(NeonErrorResponse) + msglen + 1); + msg_resp->req = resp_hdr; + memcpy(msg_resp->message, msgtext, msglen + 1); + pq_getmsgend(s); + + resp = (NeonResponse *) msg_resp; + break; + } + + case T_NeonGetSlruSegmentResponse: + { + NeonGetSlruSegmentResponse *msg_resp; + int n_blocks; + msg_resp = palloc0(sizeof(NeonGetSlruSegmentResponse)); + + if (neon_protocol_version >= 3) + { + msg_resp->req.kind = pq_getmsgbyte(s); + msg_resp->req.segno = pq_getmsgint(s, 4); + } + msg_resp->req.hdr = resp_hdr; + + n_blocks = pq_getmsgint(s, 4); + msg_resp->n_blocks = n_blocks; + memcpy(msg_resp->data, pq_getmsgbytes(s, n_blocks * BLCKSZ), n_blocks * BLCKSZ); + pq_getmsgend(s); + + resp = (NeonResponse *) msg_resp; + break; + } + + /* + * pagestore_client -> pagestore + * + * We create these ourselves, and don't need to decode them. + */ + case T_NeonExistsRequest: + case T_NeonNblocksRequest: + case T_NeonGetPageRequest: + case T_NeonDbSizeRequest: + case T_NeonGetSlruSegmentRequest: + default: + neon_log(ERROR, "unexpected neon message tag 0x%02x", tag); + break; + } + + return resp; +} + +/* dump to json for debugging / error reporting purposes */ +char * +nm_to_string(NeonMessage *msg) +{ + StringInfoData s; + + initStringInfo(&s); + + switch (messageTag(msg)) + { + /* pagestore_client -> pagestore */ + case T_NeonExistsRequest: + { + NeonExistsRequest *msg_req = (NeonExistsRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"NeonExistsRequest\""); + appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); + appendStringInfoChar(&s, '}'); + break; + } + + case T_NeonNblocksRequest: + { + NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"NeonNblocksRequest\""); + appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); + appendStringInfoChar(&s, '}'); + break; + } + + case T_NeonGetPageRequest: + { + NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"NeonGetPageRequest\""); + appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); + appendStringInfoChar(&s, '}'); + break; + } + case T_NeonDbSizeRequest: + { + NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"NeonDbSizeRequest\""); + appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); + appendStringInfoChar(&s, '}'); + break; + } + case T_NeonGetSlruSegmentRequest: + { + NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentRequest\""); + appendStringInfo(&s, ", \"kind\": %u", msg_req->kind); + appendStringInfo(&s, ", \"segno\": %u", msg_req->segno); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); + appendStringInfoChar(&s, '}'); + break; + } + /* pagestore -> pagestore_client */ + case T_NeonExistsResponse: + { + NeonExistsResponse *msg_resp = (NeonExistsResponse *) msg; + + appendStringInfoString(&s, "{\"type\": \"NeonExistsResponse\""); + appendStringInfo(&s, ", \"exists\": %d}", + msg_resp->exists); + appendStringInfoChar(&s, '}'); + + break; + } + case T_NeonNblocksResponse: + { + NeonNblocksResponse *msg_resp = (NeonNblocksResponse *) msg; + + appendStringInfoString(&s, "{\"type\": \"NeonNblocksResponse\""); + appendStringInfo(&s, ", \"n_blocks\": %u}", + msg_resp->n_blocks); + appendStringInfoChar(&s, '}'); + + break; + } + case T_NeonGetPageResponse: + { +#if 0 + NeonGetPageResponse *msg_resp = (NeonGetPageResponse *) msg; +#endif + + appendStringInfoString(&s, "{\"type\": \"NeonGetPageResponse\""); + appendStringInfo(&s, ", \"page\": \"XXX\"}"); + appendStringInfoChar(&s, '}'); + break; + } + case T_NeonErrorResponse: + { + NeonErrorResponse *msg_resp = (NeonErrorResponse *) msg; + + /* FIXME: escape double-quotes in the message */ + appendStringInfoString(&s, "{\"type\": \"NeonErrorResponse\""); + appendStringInfo(&s, ", \"message\": \"%s\"}", msg_resp->message); + appendStringInfoChar(&s, '}'); + break; + } + case T_NeonDbSizeResponse: + { + NeonDbSizeResponse *msg_resp = (NeonDbSizeResponse *) msg; + + appendStringInfoString(&s, "{\"type\": \"NeonDbSizeResponse\""); + appendStringInfo(&s, ", \"db_size\": %ld}", + msg_resp->db_size); + appendStringInfoChar(&s, '}'); + + break; + } + case T_NeonGetSlruSegmentResponse: + { + NeonGetSlruSegmentResponse *msg_resp = (NeonGetSlruSegmentResponse *) msg; + + appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentResponse\""); + appendStringInfo(&s, ", \"n_blocks\": %u}", + msg_resp->n_blocks); + appendStringInfoChar(&s, '}'); + + break; + } + + default: + appendStringInfo(&s, "{\"type\": \"unknown 0x%02x\"", msg->tag); + } + return s.data; +} + +/* + * communicator_init() -- Initialize per-backend private state + */ +void +communicator_init(void) +{ + Size prfs_size; + + if (MyPState != NULL) + return; + + /* + * Sanity check that theperf counters array is sized correctly. We got + * this wrong once, and the formula for max number of backends and aux + * processes might well change in the future, so better safe than sorry. + * This is a very cheap check so we do it even without assertions. On + * v14, this gets called before initializing MyProc, so we cannot perform + * the check here. That's OK, we don't expect the logic to change in old + * releases. + */ +#if PG_VERSION_NUM>=150000 + if (MyNeonCounters >= &neon_per_backend_counters_shared[NUM_NEON_PERF_COUNTER_SLOTS]) + elog(ERROR, "MyNeonCounters points past end of array"); +#endif + + prfs_size = offsetof(PrefetchState, prf_buffer) + + sizeof(PrefetchRequest) * readahead_buffer_size; + + MyPState = MemoryContextAllocZero(TopMemoryContext, prfs_size); + + MyPState->n_unused = readahead_buffer_size; + + MyPState->bufctx = SlabContextCreate(TopMemoryContext, + "NeonSMGR/prefetch", + SLAB_DEFAULT_BLOCK_SIZE * 17, + PS_GETPAGERESPONSE_SIZE); + MyPState->errctx = AllocSetContextCreate(TopMemoryContext, + "NeonSMGR/errors", + ALLOCSET_DEFAULT_SIZES); + MyPState->hashctx = AllocSetContextCreate(TopMemoryContext, + "NeonSMGR/prefetch", + ALLOCSET_DEFAULT_SIZES); + + MyPState->prf_hash = prfh_create(MyPState->hashctx, + readahead_buffer_size, NULL); +} + +/* + * neon_prefetch_response_usable -- Can a new request be satisfied by old one? + * + * This is used to check if the response to a prefetch request can be used to + * satisfy a page read now. + */ +static bool +neon_prefetch_response_usable(neon_request_lsns *request_lsns, + PrefetchRequest *slot) +{ + /* sanity check the LSN's on the old and the new request */ + Assert(request_lsns->request_lsn >= request_lsns->not_modified_since); + Assert(request_lsns->effective_request_lsn >= request_lsns->not_modified_since); + Assert(request_lsns->effective_request_lsn <= request_lsns->request_lsn); + Assert(slot->request_lsns.request_lsn >= slot->request_lsns.not_modified_since); + Assert(slot->request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since); + Assert(slot->request_lsns.effective_request_lsn <= slot->request_lsns.request_lsn); + Assert(slot->status != PRFS_UNUSED); + + /* + * The new request's LSN should never be older than the old one. This + * could be an Assert, except that for testing purposes, we do provide an + * interface in neon_test_utils to fetch pages at arbitary LSNs, which + * violates this. + * + * Similarly, the not_modified_since value calculated for a page should + * never move backwards. This assumption is a bit fragile; if we updated + * the last-written cache when we read in a page, for example, then it + * might. But as the code stands, it should not. + * + * (If two backends issue a request at the same time, they might race and + * calculate LSNs "out of order" with each other, but the prefetch queue + * is backend-private at the moment.) + */ + if (request_lsns->effective_request_lsn < slot->request_lsns.effective_request_lsn || + request_lsns->not_modified_since < slot->request_lsns.not_modified_since) + { + ereport(LOG, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "request with unexpected LSN after prefetch"), + errdetail("Request %X/%X not_modified_since %X/%X, prefetch %X/%X not_modified_since %X/%X)", + LSN_FORMAT_ARGS(request_lsns->effective_request_lsn), + LSN_FORMAT_ARGS(request_lsns->not_modified_since), + LSN_FORMAT_ARGS(slot->request_lsns.effective_request_lsn), + LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since)))); + return false; + } + + /*--- + * Each request to the pageserver has three LSN values associated with it: + * `not_modified_since`, `request_lsn`, and 'effective_request_lsn'. + * `not_modified_since` and `request_lsn` are sent to the pageserver, but + * in the primary node, we always use UINT64_MAX as the `request_lsn`, so + * we remember `effective_request_lsn` separately. In a primary, + * `effective_request_lsn` is the same as `not_modified_since`. + * See comments in neon_get_request_lsns why we can not use last flush WAL position here. + * + * To determine whether a response to a GetPage request issued earlier is + * still valid to satisfy a new page read, we look at the + * (not_modified_since, effective_request_lsn] range of the request. It is + * effectively a claim that the page has not been modified between those + * LSNs. If the range of the old request in the queue overlaps with the + * new request, we know that the page hasn't been modified in the union of + * the ranges. We can use the response to old request to satisfy the new + * request in that case. For example: + * + * 100 500 + * Old request: +--------+ + * + * 400 800 + * New request: +--------+ + * + * The old request claims that the page was not modified between LSNs 100 + * and 500, and the second claims that it was not modified between 400 and + * 800. Together they mean that the page was not modified between 100 and + * 800. Therefore the response to the old request is also valid for the + * new request. + * + * This logic also holds at the boundary case that the old request's LSN + * matches the new request's not_modified_since LSN exactly: + * + * 100 500 + * Old request: +--------+ + * + * 500 900 + * New request: +--------+ + * + * The response to the old request is the page as it was at LSN 500, and + * the page hasn't been changed in the range (500, 900], therefore the + * response is valid also for the new request. + */ + + /* this follows from the checks above */ + Assert(request_lsns->effective_request_lsn >= slot->request_lsns.not_modified_since); + + return request_lsns->not_modified_since <= slot->request_lsns.effective_request_lsn; +} + +/* + * Does the physical file exist? + */ +bool +communicator_exists(NRelFileInfo rinfo, ForkNumber forkNum, neon_request_lsns *request_lsns) +{ + bool exists; + NeonResponse *resp; + + { + NeonExistsRequest request = { + .hdr.tag = T_NeonExistsRequest, + .hdr.reqid = GENERATE_REQUEST_ID(), + .hdr.lsn = request_lsns->request_lsn, + .hdr.not_modified_since = request_lsns->not_modified_since, + .rinfo = rinfo, + .forknum = forkNum + }; + + resp = page_server_request(&request); + + switch (resp->tag) + { + case T_NeonExistsResponse: + { + NeonExistsResponse* exists_resp = (NeonExistsResponse *) resp; + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr) || + !RelFileInfoEquals(exists_resp->req.rinfo, request.rinfo) || + exists_resp->req.forknum != request.forknum) + { + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to exits request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(exists_resp->req.rinfo), exists_resp->req.forknum, + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), request.forknum); + } + } + exists = exists_resp->exists; + break; + } + case T_NeonErrorResponse: + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr)) + { + elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); + } + } + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "[reqid %lx] could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X", + resp->reqid, + RelFileInfoFmt(rinfo), + forkNum, + LSN_FORMAT_ARGS(request_lsns->effective_request_lsn)), + errdetail("page server returned error: %s", + ((NeonErrorResponse *) resp)->message))); + break; + + default: + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x", + T_NeonExistsResponse, T_NeonErrorResponse, resp->tag); + } + pfree(resp); + } + return exists; +} + +/* + * Read N pages at a specific LSN. + * + * *mask is set for pages read at a previous point in time, and which we + * should not touch, nor overwrite. + * New bits should be set in *mask for the pages we'successfully read. + * + * The offsets in request_lsns, buffers, and mask are linked. + */ +void +communicator_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_blockno, + neon_request_lsns *request_lsns, + void **buffers, BlockNumber nblocks, const bits8 *mask) +{ + NeonResponse *resp; + uint64 ring_index; + PrfHashEntry *entry; + PrefetchRequest *slot; + PrefetchRequest hashkey; + + Assert(PointerIsValid(request_lsns)); + Assert(nblocks >= 1); + + /* + * Use an intermediate PrefetchRequest struct as the hash key to ensure + * correct alignment and that the padding bytes are cleared. + */ + memset(&hashkey.buftag, 0, sizeof(BufferTag)); + CopyNRelFileInfoToBufTag(hashkey.buftag, rinfo); + hashkey.buftag.forkNum = forkNum; + hashkey.buftag.blockNum = base_blockno; + + /* + * The redo process does not lock pages that it needs to replay but are + * not in the shared buffers, so a concurrent process may request the page + * after redo has decided it won't redo that page and updated the LwLSN + * for that page. If we're in hot standby we need to take care that we + * don't return until after REDO has finished replaying up to that LwLSN, + * as the page should have been locked up to that point. + * + * See also the description on neon_redo_read_buffer_filter below. + * + * NOTE: It is possible that the WAL redo process will still do IO due to + * concurrent failed read IOs. Those IOs should never have a request_lsn + * that is as large as the WAL record we're currently replaying, if it + * weren't for the behaviour of the LwLsn cache that uses the highest + * value of the LwLsn cache when the entry is not found. + */ + (void) prefetch_register_bufferv(hashkey.buftag, request_lsns, nblocks, mask, false); + + for (int i = 0; i < nblocks; i++) + { + void *buffer = buffers[i]; + BlockNumber blockno = base_blockno + i; + neon_request_lsns *reqlsns = &request_lsns[i]; + TimestampTz start_ts, end_ts; + + if (PointerIsValid(mask) && BITMAP_ISSET(mask, i)) + continue; + + start_ts = GetCurrentTimestamp(); + + if (RecoveryInProgress() && MyBackendType != B_STARTUP) + XLogWaitForReplayOf(reqlsns->request_lsn); + + /* + * Try to find prefetched page in the list of received pages. + */ +Retry: + hashkey.buftag.blockNum = blockno; + entry = prfh_lookup(MyPState->prf_hash, &hashkey); + + if (entry != NULL) + { + slot = entry->slot; + if (neon_prefetch_response_usable(reqlsns, slot)) + { + ring_index = slot->my_ring_index; + } + else + { + /* + * Cannot use this prefetch, discard it + * + * We can't drop cache for not-yet-received requested items. It is + * unlikely this happens, but it can happen if prefetch distance + * is large enough and a backend didn't consume all prefetch + * requests. + */ + if (slot->status == PRFS_REQUESTED) + { + if (!prefetch_wait_for(slot->my_ring_index)) + goto Retry; + } + /* drop caches */ + prefetch_set_unused(slot->my_ring_index); + pgBufferUsage.prefetch.expired += 1; + MyNeonCounters->getpage_prefetch_discards_total++; + /* make it look like a prefetch cache miss */ + entry = NULL; + } + } + + do + { + if (entry == NULL) + { + ring_index = prefetch_register_bufferv(hashkey.buftag, reqlsns, 1, NULL, false); + Assert(ring_index != UINT64_MAX); + slot = GetPrfSlot(ring_index); + } + else + { + /* + * Empty our reference to the prefetch buffer's hash entry. When + * we wait for prefetches, the entry reference is invalidated by + * potential updates to the hash, and when we reconnect to the + * pageserver the prefetch we're waiting for may be dropped, in + * which case we need to retry and take the branch above. + */ + entry = NULL; + } + + Assert(slot->my_ring_index == ring_index); + Assert(MyPState->ring_last <= ring_index && + MyPState->ring_unused > ring_index); + Assert(slot->status != PRFS_UNUSED); + Assert(GetPrfSlot(ring_index) == slot); + + } while (!prefetch_wait_for(ring_index)); + + Assert(slot->status == PRFS_RECEIVED); + Assert(memcmp(&hashkey.buftag, &slot->buftag, sizeof(BufferTag)) == 0); + Assert(hashkey.buftag.blockNum == base_blockno + i); + + resp = slot->response; + + switch (resp->tag) + { + case T_NeonGetPageResponse: + { + NeonGetPageResponse* getpage_resp = (NeonGetPageResponse *) resp; + if (neon_protocol_version >= 3) + { + if (resp->reqid != slot->reqid || + resp->lsn != slot->request_lsns.request_lsn || + resp->not_modified_since != slot->request_lsns.not_modified_since || + !RelFileInfoEquals(getpage_resp->req.rinfo, rinfo) || + getpage_resp->req.forknum != forkNum || + getpage_resp->req.blkno != base_blockno + i) + { + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u} to get page request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(getpage_resp->req.rinfo), getpage_resp->req.forknum, getpage_resp->req.blkno, + slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since), RelFileInfoFmt(rinfo), forkNum, base_blockno + i); + } + } + memcpy(buffer, getpage_resp->page, BLCKSZ); + + /* + * With lfc_store_prefetch_result=true prefetch result is stored in LFC in prefetch_pump_state when response is received + * from page server. But if lfc_store_prefetch_result=false then it is not yet stored in LFC and we have to do it here + * under buffer lock. + */ + if (!lfc_store_prefetch_result) + lfc_write(rinfo, forkNum, blockno, buffer); + break; + } + case T_NeonErrorResponse: + if (neon_protocol_version >= 3) + { + if (resp->reqid != slot->reqid || + resp->lsn != slot->request_lsns.request_lsn || + resp->not_modified_since != slot->request_lsns.not_modified_since) + { + elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), + slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since)); + } + } + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "[shard %d, reqid %lx] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", + slot->shard_no, resp->reqid, blockno, RelFileInfoFmt(rinfo), + forkNum, LSN_FORMAT_ARGS(reqlsns->effective_request_lsn)), + errdetail("page server returned error: %s", + ((NeonErrorResponse *) resp)->message))); + break; + default: + NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC, + "Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x", + T_NeonGetPageResponse, T_NeonErrorResponse, resp->tag); + } + + /* buffer was used, clean up for later reuse */ + prefetch_set_unused(ring_index); + prefetch_cleanup_trailing_unused(); + + end_ts = GetCurrentTimestamp(); + inc_getpage_wait(end_ts >= start_ts ? (end_ts - start_ts) : 0); + } +} + +/* + * neon_nblocks() -- Get the number of blocks stored in a relation. + */ +BlockNumber +communicator_nblocks(NRelFileInfo rinfo, ForkNumber forknum, neon_request_lsns *request_lsns) +{ + NeonResponse *resp; + BlockNumber n_blocks; + + { + NeonNblocksRequest request = { + .hdr.tag = T_NeonNblocksRequest, + .hdr.reqid = GENERATE_REQUEST_ID(), + .hdr.lsn = request_lsns->request_lsn, + .hdr.not_modified_since = request_lsns->not_modified_since, + .rinfo = rinfo, + .forknum = forknum, + }; + + resp = page_server_request(&request); + + switch (resp->tag) + { + case T_NeonNblocksResponse: + { + NeonNblocksResponse * relsize_resp = (NeonNblocksResponse *) resp; + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr) || + !RelFileInfoEquals(relsize_resp->req.rinfo, request.rinfo) || + relsize_resp->req.forknum != forknum) + { + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(relsize_resp->req.rinfo), relsize_resp->req.forknum, + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), forknum); + } + } + n_blocks = relsize_resp->n_blocks; + break; + } + case T_NeonErrorResponse: + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr)) + { + elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); + } + } + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "[reqid %lx] could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X", + resp->reqid, + RelFileInfoFmt(rinfo), + forknum, + LSN_FORMAT_ARGS(request_lsns->effective_request_lsn)), + errdetail("page server returned error: %s", + ((NeonErrorResponse *) resp)->message))); + break; + + default: + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x", + T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag); + } + + pfree(resp); + } + return n_blocks; +} + +/* + * neon_db_size() -- Get the size of the database in bytes. + */ +int64 +communicator_dbsize(Oid dbNode, neon_request_lsns *request_lsns) +{ + NeonResponse *resp; + int64 db_size; + + { + NeonDbSizeRequest request = { + .hdr.tag = T_NeonDbSizeRequest, + .hdr.reqid = GENERATE_REQUEST_ID(), + .hdr.lsn = request_lsns->request_lsn, + .hdr.not_modified_since = request_lsns->not_modified_since, + .dbNode = dbNode, + }; + + resp = page_server_request(&request); + + switch (resp->tag) + { + case T_NeonDbSizeResponse: + { + NeonDbSizeResponse* dbsize_resp = (NeonDbSizeResponse *) resp; + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr) || + dbsize_resp->req.dbNode != dbNode) + { + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u} to get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), dbsize_resp->req.dbNode, + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), dbNode); + } + } + db_size = dbsize_resp->db_size; + break; + } + case T_NeonErrorResponse: + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr)) + { + elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); + } + } + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "[reqid %lx] could not read db size of db %u from page server at lsn %X/%08X", + resp->reqid, + dbNode, LSN_FORMAT_ARGS(request_lsns->effective_request_lsn)), + errdetail("page server returned error: %s", + ((NeonErrorResponse *) resp)->message))); + break; + + default: + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x", + T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag); + } + + pfree(resp); + } + return db_size; +} + +int +communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *request_lsns, + void *buffer) +{ + int n_blocks; + shardno_t shard_no = 0; /* All SLRUs are at shard 0 */ + NeonResponse *resp; + NeonGetSlruSegmentRequest request; + + request = (NeonGetSlruSegmentRequest) { + .hdr.tag = T_NeonGetSlruSegmentRequest, + .hdr.reqid = GENERATE_REQUEST_ID(), + .hdr.lsn = request_lsns->request_lsn, + .hdr.not_modified_since = request_lsns->not_modified_since, + .kind = kind, + .segno = segno + }; + + do + { + while (!page_server->send(shard_no, &request.hdr) || !page_server->flush(shard_no)); + + consume_prefetch_responses(); + + resp = page_server->receive(shard_no); + } while (resp == NULL); + + switch (resp->tag) + { + case T_NeonGetSlruSegmentResponse: + { + NeonGetSlruSegmentResponse* slru_resp = (NeonGetSlruSegmentResponse *) resp; + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr) || + slru_resp->req.kind != kind || + slru_resp->req.segno != segno) + { + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u} to get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%lluu}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), slru_resp->req.kind, slru_resp->req.segno, + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), kind, (unsigned long long) segno); + } + } + n_blocks = slru_resp->n_blocks; + memcpy(buffer, slru_resp->data, n_blocks*BLCKSZ); + break; + } + case T_NeonErrorResponse: + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr)) + { + elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); + } + } + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "[reqid %lx] could not read SLRU %d segment %llu at lsn %X/%08X", + resp->reqid, + kind, + (unsigned long long) segno, + LSN_FORMAT_ARGS(request_lsns->request_lsn)), + errdetail("page server returned error: %s", + ((NeonErrorResponse *) resp)->message))); + break; + + default: + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected GetSlruSegment (0x%02x) or Error (0x%02x) response to GetSlruSegmentRequest, but got 0x%02x", + T_NeonGetSlruSegmentResponse, T_NeonErrorResponse, resp->tag); + } + pfree(resp); + + communicator_reconfigure_timeout_if_needed(); + return n_blocks; +} + +void +communicator_reconfigure_timeout_if_needed(void) +{ + bool needs_set = MyPState->ring_receive != MyPState->ring_unused && + readahead_getpage_pull_timeout_ms > 0; + + if (needs_set != timeout_set) + { + /* The background writer doens't (shouldn't) read any pages */ + Assert(!AmBackgroundWriterProcess()); + /* The checkpointer doens't (shouldn't) read any pages */ + Assert(!AmCheckpointerProcess()); + + if (unlikely(PS_TIMEOUT_ID == 0)) + { + PS_TIMEOUT_ID = RegisterTimeout(USER_TIMEOUT, pagestore_timeout_handler); + } + + if (needs_set) + { +#if PG_MAJORVERSION_NUM <= 14 + enable_timeout_after(PS_TIMEOUT_ID, readahead_getpage_pull_timeout_ms); +#else + enable_timeout_every( + PS_TIMEOUT_ID, + TimestampTzPlusMilliseconds(GetCurrentTimestamp(), + readahead_getpage_pull_timeout_ms), + readahead_getpage_pull_timeout_ms + ); +#endif + timeout_set = true; + } + else + { + Assert(timeout_set); + disable_timeout(PS_TIMEOUT_ID, false); + timeout_set = false; + } + } +} + +static void +pagestore_timeout_handler(void) +{ +#if PG_MAJORVERSION_NUM <= 14 + /* + * PG14: Setting a repeating timeout is not possible, so we signal here + * that the timeout has already been reset, and by telling the system + * that system will re-schedule it later if we need to. + */ + timeout_set = false; +#endif + timeout_signaled = true; + InterruptPending = true; +} + +/* + * Process new data received in our active PageStream sockets. + * + * This relies on the invariant that all pipelined yet-to-be-received requests + * are getPage requests managed by MyPState. This is currently true, any + * modification will probably require some stuff to make it work again. + */ +static bool +communicator_processinterrupts(void) +{ + if (timeout_signaled) + { + if (!readpage_reentrant_guard && readahead_getpage_pull_timeout_ms > 0) + communicator_prefetch_pump_state(true); + + timeout_signaled = false; + communicator_reconfigure_timeout_if_needed(); + } + + if (!prev_interrupt_cb) + return false; + + return prev_interrupt_cb(); +} diff --git a/pgxn/neon/communicator.h b/pgxn/neon/communicator.h new file mode 100644 index 0000000000..72cba526c1 --- /dev/null +++ b/pgxn/neon/communicator.h @@ -0,0 +1,48 @@ +/*------------------------------------------------------------------------- + * + * communicator.h + * internal interface for communicating with remote pageservers + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + *------------------------------------------------------------------------- + */ +#ifndef COMMUNICATOR_h +#define COMMUNICATOR_h + +#include "neon_pgversioncompat.h" + +#include "storage/buf_internals.h" + +#include "pagestore_client.h" + +/* initialization at postmaster startup */ +extern void pg_init_communicator(void); + +/* initialization at backend startup */ +extern void communicator_init(void); + +extern bool communicator_exists(NRelFileInfo rinfo, ForkNumber forkNum, + neon_request_lsns *request_lsns); +extern BlockNumber communicator_nblocks(NRelFileInfo rinfo, ForkNumber forknum, + neon_request_lsns *request_lsns); +extern int64 communicator_dbsize(Oid dbNode, neon_request_lsns *request_lsns); +extern void communicator_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, + BlockNumber base_blockno, neon_request_lsns *request_lsns, + void **buffers, BlockNumber nblocks, const bits8 *mask); +extern int communicator_prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum, + neon_request_lsns *lsns, + BlockNumber nblocks, void **buffers, bits8 *mask); +extern void communicator_prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, + BlockNumber nblocks, const bits8 *mask); +extern int communicator_read_slru_segment(SlruKind kind, int64 segno, + neon_request_lsns *request_lsns, + void *buffer); + +extern void communicator_reconfigure_timeout_if_needed(void); +extern void communicator_prefetch_pump_state(bool IsHandlingInterrupts); + + +#endif diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 69da83f3fb..a6a7021756 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -28,6 +28,7 @@ #include "utils/guc.h" #include "utils/guc_tables.h" +#include "communicator.h" #include "extension_server.h" #include "file_cache.h" #include "neon.h" @@ -439,7 +440,7 @@ _PG_init(void) pg_init_walproposer(); init_lwlsncache(); - pagestore_smgr_init(); + pg_init_communicator(); Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; InitUnstableExtensionsSupport(); diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index a4339c9776..a2e81feb5f 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -59,7 +59,6 @@ extern uint32 WAIT_EVENT_NEON_WAL_DL; extern void pg_init_libpagestore(void); extern void pg_init_walproposer(void); -extern void pagestore_smgr_init(void); extern uint64 BackpressureThrottlingTime(void); extern void SetNeonCurrentClusterSize(uint64 size); diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 6ddad21362..0ab539fe56 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -233,6 +233,7 @@ extern const f_smgr *smgr_neon(ProcNumber backend, NRelFileInfo rinfo); extern void smgr_init_neon(void); extern void readahead_buffer_resize(int newsize, void *extra); + /* * LSN values associated with each request to the pageserver */ @@ -269,6 +270,10 @@ extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, neon_request_lsns request_lsns, void *buffer); extern int64 neon_dbsize(Oid dbNode); +extern void neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, + BlockNumber blkno, neon_request_lsns *output, + BlockNumber nblocks); + /* utils for neon relsize cache */ extern void relsize_hash_init(void); extern bool get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 0a43f3a6a3..ef6bd038bb 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -49,9 +49,6 @@ #include "access/xlog_internal.h" #include "access/xlogutils.h" #include "catalog/pg_class.h" -#include "common/hashfn.h" -#include "executor/instrument.h" -#include "libpq/pqformat.h" #include "pgstat.h" #include "postmaster/autovacuum.h" #include "postmaster/interrupt.h" @@ -62,9 +59,9 @@ #include "storage/fsm_internals.h" #include "storage/md.h" #include "storage/smgr.h" -#include "utils/timeout.h" #include "bitmap.h" +#include "communicator.h" #include "file_cache.h" #include "neon.h" #include "neon_lwlsncache.h" @@ -102,12 +99,6 @@ static char *hexdump_page(char *page); const int SmgrTrace = DEBUG5; -#define NEON_PANIC_CONNECTION_STATE(shard_no, elvl, message, ...) \ - neon_shard_log(shard_no, elvl, "Broken connection state: " message, \ - ##__VA_ARGS__) - -page_server_api *page_server; - /* unlogged relation build states */ typedef enum { @@ -125,1685 +116,6 @@ static bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block static BlockNumber neon_nblocks(SMgrRelation reln, ForkNumber forknum); -static uint32 local_request_counter; -#define GENERATE_REQUEST_ID() (((NeonRequestId)MyProcPid << 32) | ++local_request_counter) - -/* - * Various settings related to prompt (fast) handling of PageStream responses - * at any CHECK_FOR_INTERRUPTS point. - */ -int readahead_getpage_pull_timeout_ms = 0; -static int PS_TIMEOUT_ID = 0; -static bool timeout_set = false; -static bool timeout_signaled = false; - -/* - * We have a CHECK_FOR_INTERRUPTS in page_server->receive(), and we don't want - * that to handle any getpage responses if we're already working on the - * backlog of those, as we'd hit issues with determining which prefetch slot - * we just got a response for. - * - * To protect against that, we have this variable that's set whenever we start - * receiving data for prefetch slots, so that we don't get confused. - * - * Note that in certain error cases during readpage we may leak r_r_g=true, - * which results in a failure to pick up further responses until we first - * actively try to receive new getpage responses. - */ -static bool readpage_reentrant_guard = false; - -static void reconfigure_timeout_if_needed(void); -static void pagestore_timeout_handler(void); - -#define START_PREFETCH_RECEIVE_WORK() \ - do { \ - readpage_reentrant_guard = true; \ - } while (false) - -#define END_PREFETCH_RECEIVE_WORK() \ - do { \ - readpage_reentrant_guard = false; \ - if (unlikely(timeout_signaled && !InterruptPending)) \ - InterruptPending = true; \ - } while (false) - -/* - * Prefetch implementation: - * - * Prefetch is performed locally by each backend. - * - * There can be up to readahead_buffer_size active IO requests registered at - * any time. Requests using smgr_prefetch are sent to the pageserver, but we - * don't wait on the response. Requests using smgr_read are either read from - * the buffer, or (if that's not possible) we wait on the response to arrive - - * this also will allow us to receive other prefetched pages. - * Each request is immediately written to the output buffer of the pageserver - * connection, but may not be flushed if smgr_prefetch is used: pageserver - * flushes sent requests on manual flush, or every neon.flush_output_after - * unflushed requests; which is not necessarily always and all the time. - * - * Once we have received a response, this value will be stored in the response - * buffer, indexed in a hash table. This allows us to retain our buffered - * prefetch responses even when we have cache misses. - * - * Reading of prefetch responses is delayed until them are actually needed - * (smgr_read). In case of prefetch miss or any other SMGR request other than - * smgr_read, all prefetch responses in the pipeline will need to be read from - * the connection; the responses are stored for later use. - * - * NOTE: The current implementation of the prefetch system implements a ring - * buffer of up to readahead_buffer_size requests. If there are more _read and - * _prefetch requests between the initial _prefetch and the _read of a buffer, - * the prefetch request will have been dropped from this prefetch buffer, and - * your prefetch was wasted. - */ - -/* - * State machine: - * - * not in hash : in hash - * : - * UNUSED ------> REQUESTED --> RECEIVED - * ^ : | | - * | : v | - * | : TAG_REMAINS | - * | : | | - * +----------------+------------+ - * : - */ -typedef enum PrefetchStatus -{ - PRFS_UNUSED = 0, /* unused slot */ - PRFS_REQUESTED, /* request was written to the sendbuffer to - * PS, but not necessarily flushed. all fields - * except response valid */ - PRFS_RECEIVED, /* all fields valid */ - PRFS_TAG_REMAINS, /* only buftag and my_ring_index are still - * valid */ -} PrefetchStatus; - -/* must fit in uint8; bits 0x1 are used */ -typedef enum { - PRFSF_NONE = 0x0, - PRFSF_LFC = 0x1 /* received prefetch result is stored in LFC */ -} PrefetchRequestFlags; - -typedef struct PrefetchRequest -{ - BufferTag buftag; /* must be first entry in the struct */ - shardno_t shard_no; - uint8 status; /* see PrefetchStatus for valid values */ - uint8 flags; /* see PrefetchRequestFlags */ - neon_request_lsns request_lsns; - NeonRequestId reqid; - NeonResponse *response; /* may be null */ - uint64 my_ring_index; -} PrefetchRequest; - -/* prefetch buffer lookup hash table */ - -typedef struct PrfHashEntry -{ - PrefetchRequest *slot; - uint32 status; - uint32 hash; -} PrfHashEntry; - -#define SH_PREFIX prfh -#define SH_ELEMENT_TYPE PrfHashEntry -#define SH_KEY_TYPE PrefetchRequest * -#define SH_KEY slot -#define SH_STORE_HASH -#define SH_GET_HASH(tb, a) ((a)->hash) -#define SH_HASH_KEY(tb, key) hash_bytes( \ - ((const unsigned char *) &(key)->buftag), \ - sizeof(BufferTag) \ -) - -#define SH_EQUAL(tb, a, b) (BufferTagsEqual(&(a)->buftag, &(b)->buftag)) -#define SH_SCOPE static inline -#define SH_DEFINE -#define SH_DECLARE -#include "lib/simplehash.h" - -/* - * PrefetchState maintains the state of (prefetch) getPage@LSN requests. - * It maintains a (ring) buffer of in-flight requests and responses. - * - * We maintain several indexes into the ring buffer: - * ring_unused >= ring_flush >= ring_receive >= ring_last >= 0 - * - * ring_unused points to the first unused slot of the buffer - * ring_receive is the next request that is to be received - * ring_last is the oldest received entry in the buffer - * - * Apart from being an entry in the ring buffer of prefetch requests, each - * PrefetchRequest that is not UNUSED is indexed in prf_hash by buftag. - */ -typedef struct PrefetchState -{ - MemoryContext bufctx; /* context for prf_buffer[].response - * allocations */ - MemoryContext errctx; /* context for prf_buffer[].response - * allocations */ - MemoryContext hashctx; /* context for prf_buffer */ - - /* buffer indexes */ - uint64 ring_unused; /* first unused slot */ - uint64 ring_flush; /* next request to flush */ - uint64 ring_receive; /* next slot that is to receive a response */ - uint64 ring_last; /* min slot with a response value */ - - /* metrics / statistics */ - int n_responses_buffered; /* count of PS responses not yet in - * buffers */ - int n_requests_inflight; /* count of PS requests considered in - * flight */ - int n_unused; /* count of buffers < unused, > last, that are - * also unused */ - - /* the buffers */ - prfh_hash *prf_hash; - int max_shard_no; - /* Mark shards involved in prefetch */ - uint8 shard_bitmap[(MAX_SHARDS + 7)/8]; - PrefetchRequest prf_buffer[]; /* prefetch buffers */ -} PrefetchState; - -static PrefetchState *MyPState; - -#define GetPrfSlotNoCheck(ring_index) ( \ - &MyPState->prf_buffer[((ring_index) % readahead_buffer_size)] \ -) - -#define GetPrfSlot(ring_index) ( \ - ( \ - AssertMacro((ring_index) < MyPState->ring_unused && \ - (ring_index) >= MyPState->ring_last), \ - GetPrfSlotNoCheck(ring_index) \ - ) \ -) - -#define ReceiveBufferNeedsCompaction() (\ - (MyPState->n_responses_buffered / 8) < ( \ - MyPState->ring_receive - \ - MyPState->ring_last - \ - MyPState->n_responses_buffered \ - ) \ -) - -static bool compact_prefetch_buffers(void); -static void consume_prefetch_responses(void); -static bool prefetch_read(PrefetchRequest *slot); -static void prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns); -static bool prefetch_wait_for(uint64 ring_index); -static void prefetch_cleanup_trailing_unused(void); -static inline void prefetch_set_unused(uint64 ring_index); - -static void -neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, - BlockNumber blkno, neon_request_lsns *output, - BlockNumber nblocks); -static bool neon_prefetch_response_usable(neon_request_lsns *request_lsns, - PrefetchRequest *slot); - -static bool -compact_prefetch_buffers(void) -{ - uint64 empty_ring_index = MyPState->ring_last; - uint64 search_ring_index = MyPState->ring_receive; - int n_moved = 0; - - if (MyPState->ring_receive == MyPState->ring_last) - return false; - - while (search_ring_index > MyPState->ring_last) - { - search_ring_index--; - if (GetPrfSlot(search_ring_index)->status == PRFS_UNUSED) - { - empty_ring_index = search_ring_index; - break; - } - } - - /* - * Here we have established: slots < search_ring_index have an unknown - * state (not scanned) slots >= search_ring_index and <= empty_ring_index - * are unused slots > empty_ring_index are in use, or outside our buffer's - * range. ... unless search_ring_index <= ring_last - * - * Therefore, there is a gap of at least one unused items between - * search_ring_index and empty_ring_index (both inclusive), which grows as - * we hit more unused items while moving backwards through the array. - */ - - while (search_ring_index > MyPState->ring_last) - { - PrefetchRequest *source_slot; - PrefetchRequest *target_slot; - bool found; - - /* update search index to an unprocessed entry */ - search_ring_index--; - - source_slot = GetPrfSlot(search_ring_index); - - if (source_slot->status == PRFS_UNUSED) - continue; - - /* slot is used -- start moving slot */ - target_slot = GetPrfSlot(empty_ring_index); - - Assert(source_slot->status == PRFS_RECEIVED); - Assert(target_slot->status == PRFS_UNUSED); - - target_slot->buftag = source_slot->buftag; - target_slot->shard_no = source_slot->shard_no; - target_slot->status = source_slot->status; - target_slot->flags = source_slot->flags; - target_slot->response = source_slot->response; - target_slot->reqid = source_slot->reqid; - target_slot->request_lsns = source_slot->request_lsns; - target_slot->my_ring_index = empty_ring_index; - - prfh_delete(MyPState->prf_hash, source_slot); - prfh_insert(MyPState->prf_hash, target_slot, &found); - - Assert(!found); - - /* Adjust the location of our known-empty slot */ - empty_ring_index--; - - /* empty the moved slot */ - source_slot->status = PRFS_UNUSED; - source_slot->buftag = (BufferTag) - { - 0 - }; - source_slot->response = NULL; - source_slot->my_ring_index = 0; - source_slot->request_lsns = (neon_request_lsns) { - InvalidXLogRecPtr, InvalidXLogRecPtr, InvalidXLogRecPtr - }; - - /* update bookkeeping */ - n_moved++; - } - - /* - * Only when we've moved slots we can expect trailing unused slots, so - * only then we clean up trailing unused slots. - */ - if (n_moved > 0) - { - prefetch_cleanup_trailing_unused(); - return true; - } - - return false; -} - -/* - * If there might be responses still in the TCP buffer, then we should try to - * use those, to reduce any TCP backpressure on the OS/PS side. - * - * This procedure handles that. - * - * Note that this works because we don't pipeline non-getPage requests. - * - * NOTE: This procedure is not allowed to throw errors that should be handled - * by SMGR-related code, as this can be called from every CHECK_FOR_INTERRUPTS - * point inside and outside PostgreSQL. - * - * This still does throw errors when it receives malformed responses from PS. - * - * When we're not called from CHECK_FOR_INTERRUPTS (indicated by - * IsHandlingInterrupts) we also report we've ended prefetch receive work, - * just in case state tracking was lost due to an error in the sync getPage - * response code. - */ -static void -prefetch_pump_state(bool IsHandlingInterrupts) -{ - while (MyPState->ring_receive != MyPState->ring_flush) - { - NeonResponse *response; - PrefetchRequest *slot; - MemoryContext old; - - slot = GetPrfSlot(MyPState->ring_receive); - - old = MemoryContextSwitchTo(MyPState->errctx); - response = page_server->try_receive(slot->shard_no); - MemoryContextSwitchTo(old); - - if (response == NULL) - break; - - /* The slot should still be valid */ - if (slot->status != PRFS_REQUESTED || - slot->response != NULL || - slot->my_ring_index != MyPState->ring_receive) - neon_shard_log(slot->shard_no, ERROR, - "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu", - slot->status, slot->response, - (long) slot->my_ring_index, (long) MyPState->ring_receive); - - /* update prefetch state */ - MyPState->n_responses_buffered += 1; - MyPState->n_requests_inflight -= 1; - MyPState->ring_receive += 1; - MyNeonCounters->getpage_prefetches_buffered = - MyPState->n_responses_buffered; - - /* update slot state */ - slot->status = PRFS_RECEIVED; - slot->response = response; - - if (response->tag == T_NeonGetPageResponse && !(slot->flags & PRFSF_LFC) && lfc_store_prefetch_result) - { - /* - * Store prefetched result in LFC (please read comments to lfc_prefetch - * explaining why it can be done without holding shared buffer lock - */ - if (lfc_prefetch(BufTagGetNRelFileInfo(slot->buftag), slot->buftag.forkNum, slot->buftag.blockNum, ((NeonGetPageResponse*)response)->page, slot->request_lsns.not_modified_since)) - { - slot->flags |= PRFSF_LFC; - } - } - } - - /* We never pump the prefetch state while handling other pages */ - if (!IsHandlingInterrupts) - END_PREFETCH_RECEIVE_WORK(); - - reconfigure_timeout_if_needed(); -} - -void -readahead_buffer_resize(int newsize, void *extra) -{ - uint64 end, - nfree = newsize; - PrefetchState *newPState; - Size newprfs_size = offsetof(PrefetchState, prf_buffer) + - (sizeof(PrefetchRequest) * newsize); - - /* don't try to re-initialize if we haven't initialized yet */ - if (MyPState == NULL) - return; - - /* - * Make sure that we don't lose track of active prefetch requests by - * ensuring we have received all but the last n requests (n = newsize). - */ - if (MyPState->n_requests_inflight > newsize) - { - prefetch_wait_for(MyPState->ring_unused - newsize - 1); - Assert(MyPState->n_requests_inflight <= newsize); - } - - /* construct the new PrefetchState, and copy over the memory contexts */ - newPState = MemoryContextAllocZero(TopMemoryContext, newprfs_size); - - newPState->bufctx = MyPState->bufctx; - newPState->errctx = MyPState->errctx; - newPState->hashctx = MyPState->hashctx; - newPState->prf_hash = prfh_create(MyPState->hashctx, newsize, NULL); - newPState->n_unused = newsize; - newPState->n_requests_inflight = 0; - newPState->n_responses_buffered = 0; - newPState->ring_last = newsize; - newPState->ring_unused = newsize; - newPState->ring_receive = newsize; - newPState->max_shard_no = MyPState->max_shard_no; - memcpy(newPState->shard_bitmap, MyPState->shard_bitmap, sizeof(MyPState->shard_bitmap)); - - /* - * Copy over the prefetches. - * - * We populate the prefetch array from the end; to retain the most recent - * prefetches, but this has the benefit of only needing to do one - * iteration on the dataset, and trivial compaction. - */ - for (end = MyPState->ring_unused - 1; - end >= MyPState->ring_last && end != UINT64_MAX && nfree != 0; - end -= 1) - { - PrefetchRequest *slot = GetPrfSlot(end); - PrefetchRequest *newslot; - bool found; - - if (slot->status == PRFS_UNUSED) - continue; - - nfree -= 1; - - newslot = &newPState->prf_buffer[nfree]; - *newslot = *slot; - newslot->my_ring_index = nfree; - - prfh_insert(newPState->prf_hash, newslot, &found); - - Assert(!found); - - switch (newslot->status) - { - case PRFS_UNUSED: - pg_unreachable(); - case PRFS_REQUESTED: - newPState->n_requests_inflight += 1; - newPState->ring_receive -= 1; - newPState->ring_last -= 1; - break; - case PRFS_RECEIVED: - newPState->n_responses_buffered += 1; - newPState->ring_last -= 1; - break; - case PRFS_TAG_REMAINS: - newPState->ring_last -= 1; - break; - } - newPState->n_unused -= 1; - } - newPState->ring_flush = newPState->ring_receive; - - MyNeonCounters->getpage_prefetches_buffered = - MyPState->n_responses_buffered; - MyNeonCounters->pageserver_open_requests = - MyPState->n_requests_inflight; - - for (; end >= MyPState->ring_last && end != UINT64_MAX; end -= 1) - { - PrefetchRequest *slot = GetPrfSlot(end); - Assert(slot->status != PRFS_REQUESTED); - if (slot->status == PRFS_RECEIVED) - { - pfree(slot->response); - } - } - - prfh_destroy(MyPState->prf_hash); - pfree(MyPState); - MyPState = newPState; -} - - - -/* - * Make sure that there are no responses still in the buffer. - * - * This function may indirectly update MyPState->pfs_hash; which invalidates - * any active pointers into the hash table. - */ -static void -consume_prefetch_responses(void) -{ - if (MyPState->ring_receive < MyPState->ring_unused) - prefetch_wait_for(MyPState->ring_unused - 1); -} - -static void -prefetch_cleanup_trailing_unused(void) -{ - uint64 ring_index; - PrefetchRequest *slot; - - while (MyPState->ring_last < MyPState->ring_receive) - { - ring_index = MyPState->ring_last; - slot = GetPrfSlot(ring_index); - - if (slot->status == PRFS_UNUSED) - MyPState->ring_last += 1; - else - break; - } -} - - -static bool -prefetch_flush_requests(void) -{ - for (shardno_t shard_no = 0; shard_no < MyPState->max_shard_no; shard_no++) - { - if (BITMAP_ISSET(MyPState->shard_bitmap, shard_no)) - { - if (!page_server->flush(shard_no)) - return false; - BITMAP_CLR(MyPState->shard_bitmap, shard_no); - } - } - MyPState->max_shard_no = 0; - return true; -} - -/* - * Wait for slot of ring_index to have received its response. - * The caller is responsible for making sure the request buffer is flushed. - * - * NOTE: this function may indirectly update MyPState->pfs_hash; which - * invalidates any active pointers into the hash table. - * NOTE: callers should make sure they can handle query cancellations in this - * function's call path. - */ -static bool -prefetch_wait_for(uint64 ring_index) -{ - PrefetchRequest *entry; - bool result = true; - - if (MyPState->ring_flush <= ring_index && - MyPState->ring_unused > MyPState->ring_flush) - { - if (!prefetch_flush_requests()) - return false; - MyPState->ring_flush = MyPState->ring_unused; - } - - Assert(MyPState->ring_unused > ring_index); - - while (MyPState->ring_receive <= ring_index) - { - START_PREFETCH_RECEIVE_WORK(); - entry = GetPrfSlot(MyPState->ring_receive); - - Assert(entry->status == PRFS_REQUESTED); - if (!prefetch_read(entry)) - { - result = false; - break; - } - - END_PREFETCH_RECEIVE_WORK(); - CHECK_FOR_INTERRUPTS(); - } - - return result; -} - -/* - * Read the response of a prefetch request into its slot. - * - * The caller is responsible for making sure that the request for this buffer - * was flushed to the PageServer. - * - * NOTE: this function may indirectly update MyPState->pfs_hash; which - * invalidates any active pointers into the hash table. - * - * NOTE: this does IO, and can get canceled out-of-line. - */ -static bool -prefetch_read(PrefetchRequest *slot) -{ - NeonResponse *response; - MemoryContext old; - BufferTag buftag; - shardno_t shard_no; - uint64 my_ring_index; - - Assert(slot->status == PRFS_REQUESTED); - Assert(slot->response == NULL); - Assert(slot->my_ring_index == MyPState->ring_receive); - - if (slot->status != PRFS_REQUESTED || - slot->response != NULL || - slot->my_ring_index != MyPState->ring_receive) - neon_shard_log(slot->shard_no, ERROR, - "Incorrect prefetch read: status=%d response=%p my=%lu receive=%lu", - slot->status, slot->response, - (long)slot->my_ring_index, (long)MyPState->ring_receive); - - /* - * Copy the request info so that if an error happens and the prefetch - * queue is flushed during the receive call, we can print the original - * values in the error message - */ - buftag = slot->buftag; - shard_no = slot->shard_no; - my_ring_index = slot->my_ring_index; - - old = MemoryContextSwitchTo(MyPState->errctx); - response = (NeonResponse *) page_server->receive(shard_no); - MemoryContextSwitchTo(old); - if (response) - { - /* The slot should still be valid */ - if (slot->status != PRFS_REQUESTED || - slot->response != NULL || - slot->my_ring_index != MyPState->ring_receive) - neon_shard_log(shard_no, ERROR, - "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu", - slot->status, slot->response, - (long) slot->my_ring_index, (long) MyPState->ring_receive); - - /* update prefetch state */ - MyPState->n_responses_buffered += 1; - MyPState->n_requests_inflight -= 1; - MyPState->ring_receive += 1; - MyNeonCounters->getpage_prefetches_buffered = - MyPState->n_responses_buffered; - - /* update slot state */ - slot->status = PRFS_RECEIVED; - slot->response = response; - - if (response->tag == T_NeonGetPageResponse && !(slot->flags & PRFSF_LFC) && lfc_store_prefetch_result) - { - /* - * Store prefetched result in LFC (please read comments to lfc_prefetch - * explaining why it can be done without holding shared buffer lock - */ - if (lfc_prefetch(BufTagGetNRelFileInfo(buftag), buftag.forkNum, buftag.blockNum, ((NeonGetPageResponse*)response)->page, slot->request_lsns.not_modified_since)) - { - slot->flags |= PRFSF_LFC; - } - } - return true; - } - else - { - /* - * Note: The slot might no longer be valid, if the connection was lost - * and the prefetch queue was flushed during the receive call - */ - neon_shard_log(shard_no, LOG, - "No response from reading prefetch entry %lu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect", - (long) my_ring_index, - RelFileInfoFmt(BufTagGetNRelFileInfo(buftag)), - buftag.forkNum, buftag.blockNum); - return false; - } -} - -/* - * Disconnect hook - drop prefetches when the connection drops - * - * If we don't remove the failed prefetches, we'd be serving incorrect - * data to the smgr. - */ -void -prefetch_on_ps_disconnect(void) -{ - MyPState->ring_flush = MyPState->ring_unused; - - while (MyPState->ring_receive < MyPState->ring_unused) - { - PrefetchRequest *slot; - uint64 ring_index = MyPState->ring_receive; - - slot = GetPrfSlot(ring_index); - - Assert(slot->status == PRFS_REQUESTED); - Assert(slot->my_ring_index == ring_index); - - /* - * Drop connection to all shards which have prefetch requests. - * It is not a problem to call disconnect multiple times on the same connection - * because disconnect implementation in libpagestore.c will check if connection - * is alive and do nothing of connection was already dropped. - */ - page_server->disconnect(slot->shard_no); - - /* clean up the request */ - slot->status = PRFS_TAG_REMAINS; - MyPState->n_requests_inflight -= 1; - MyPState->ring_receive += 1; - - prefetch_set_unused(ring_index); - pgBufferUsage.prefetch.expired += 1; - MyNeonCounters->getpage_prefetch_discards_total += 1; - } - - /* - * We can have gone into retry due to network error, so update stats with - * the latest available - */ - MyNeonCounters->pageserver_open_requests = - MyPState->n_requests_inflight; - MyNeonCounters->getpage_prefetches_buffered = - MyPState->n_responses_buffered; -} - -/* - * prefetch_set_unused() - clear a received prefetch slot - * - * The slot at ring_index must be a current member of the ring buffer, - * and may not be in the PRFS_REQUESTED state. - * - * NOTE: this function will update MyPState->pfs_hash; which invalidates any - * active pointers into the hash table. - */ -static inline void -prefetch_set_unused(uint64 ring_index) -{ - PrefetchRequest *slot; - - if (ring_index < MyPState->ring_last) - return; /* Should already be unused */ - - slot = GetPrfSlot(ring_index); - if (slot->status == PRFS_UNUSED) - return; - - Assert(slot->status == PRFS_RECEIVED || slot->status == PRFS_TAG_REMAINS); - - if (slot->status == PRFS_RECEIVED) - { - pfree(slot->response); - slot->response = NULL; - - MyPState->n_responses_buffered -= 1; - MyPState->n_unused += 1; - - MyNeonCounters->getpage_prefetches_buffered = - MyPState->n_responses_buffered; - } - else - { - Assert(slot->response == NULL); - } - - prfh_delete(MyPState->prf_hash, slot); - - /* clear all fields */ - MemSet(slot, 0, sizeof(PrefetchRequest)); - slot->status = PRFS_UNUSED; - - /* run cleanup if we're holding back ring_last */ - if (MyPState->ring_last == ring_index) - prefetch_cleanup_trailing_unused(); - - /* - * ... and try to store the buffered responses more compactly if > 12.5% - * of the buffer is gaps - */ - else if (ReceiveBufferNeedsCompaction()) - compact_prefetch_buffers(); -} - -/* - * Send one prefetch request to the pageserver. To wait for the response, call - * prefetch_wait_for(). - */ -static void -prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns) -{ - bool found; - uint64 mySlotNo PG_USED_FOR_ASSERTS_ONLY = slot->my_ring_index; - - NeonGetPageRequest request = { - .hdr.tag = T_NeonGetPageRequest, - .hdr.reqid = GENERATE_REQUEST_ID(), - /* lsn and not_modified_since are filled in below */ - .rinfo = BufTagGetNRelFileInfo(slot->buftag), - .forknum = slot->buftag.forkNum, - .blkno = slot->buftag.blockNum, - }; - - Assert(mySlotNo == MyPState->ring_unused); - - slot->reqid = request.hdr.reqid; - - if (force_request_lsns) - slot->request_lsns = *force_request_lsns; - else - neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag), - slot->buftag.forkNum, slot->buftag.blockNum, - &slot->request_lsns, 1); - request.hdr.lsn = slot->request_lsns.request_lsn; - request.hdr.not_modified_since = slot->request_lsns.not_modified_since; - - Assert(slot->response == NULL); - Assert(slot->my_ring_index == MyPState->ring_unused); - - while (!page_server->send(slot->shard_no, (NeonRequest *) &request)) - { - Assert(mySlotNo == MyPState->ring_unused); - /* loop */ - } - - /* update prefetch state */ - MyPState->n_requests_inflight += 1; - MyPState->n_unused -= 1; - MyPState->ring_unused += 1; - BITMAP_SET(MyPState->shard_bitmap, slot->shard_no); - MyPState->max_shard_no = Max(slot->shard_no+1, MyPState->max_shard_no); - - /* update slot state */ - slot->status = PRFS_REQUESTED; - prfh_insert(MyPState->prf_hash, slot, &found); - Assert(!found); -} - -/* - * Lookup of already received prefetch requests. Only already received responses matching required LSNs are accepted. - * Present pages are marked in "mask" bitmap and total number of such pages is returned. - */ -static int -prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum, neon_request_lsns *lsns, - BlockNumber nblocks, void **buffers, bits8 *mask) -{ - int hits = 0; - PrefetchRequest hashkey; - - /* - * Use an intermediate PrefetchRequest struct as the hash key to ensure - * correct alignment and that the padding bytes are cleared. - */ - memset(&hashkey.buftag, 0, sizeof(BufferTag)); - CopyNRelFileInfoToBufTag(hashkey.buftag, rinfo); - hashkey.buftag.forkNum = forknum; - - for (int i = 0; i < nblocks; i++) - { - PrfHashEntry *entry; - - hashkey.buftag.blockNum = blocknum + i; - entry = prfh_lookup(MyPState->prf_hash, &hashkey); - - if (entry != NULL) - { - PrefetchRequest *slot = entry->slot; - uint64 ring_index = slot->my_ring_index; - Assert(slot == GetPrfSlot(ring_index)); - - Assert(slot->status != PRFS_UNUSED); - Assert(MyPState->ring_last <= ring_index && - ring_index < MyPState->ring_unused); - Assert(BufferTagsEqual(&slot->buftag, &hashkey.buftag)); - - if (slot->status != PRFS_RECEIVED) - continue; - - /* - * If the caller specified a request LSN to use, only accept - * prefetch responses that satisfy that request. - */ - if (!neon_prefetch_response_usable(&lsns[i], slot)) - continue; - - /* - * Ignore errors - */ - if (slot->response->tag != T_NeonGetPageResponse) - { - if (slot->response->tag != T_NeonErrorResponse) - { - NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC, - "Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x", - T_NeonGetPageResponse, T_NeonErrorResponse, slot->response->tag); - } - continue; - } - memcpy(buffers[i], ((NeonGetPageResponse*)slot->response)->page, BLCKSZ); - - - /* - * With lfc_store_prefetch_result=true prefetch result is stored in LFC in prefetch_pump_state when response is received - * from page server. But if lfc_store_prefetch_result=false then it is not yet stored in LFC and we have to do it here - * under buffer lock. - */ - if (!lfc_store_prefetch_result) - lfc_write(rinfo, forknum, blocknum + i, buffers[i]); - - prefetch_set_unused(ring_index); - BITMAP_SET(mask, i); - - hits += 1; - inc_getpage_wait(0); - } - } - pgBufferUsage.prefetch.hits += hits; - return hits; -} - -#if PG_MAJORVERSION_NUM < 17 -static bool -prefetch_lookup(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkn, neon_request_lsns *lsns, void *buffer) -{ - bits8 present = 0; - return prefetch_lookupv(rinfo, forkNum, blkn, lsns, 1, &buffer, &present) != 0; -} -#endif - -/* - * prefetch_register_bufferv() - register and prefetch buffers - * - * Register that we may want the contents of BufferTag in the near future. - * This is used when issuing a speculative prefetch request, but also when - * performing a synchronous request and need the buffer right now. - * - * If force_request_lsns is not NULL, those values are sent to the - * pageserver. If NULL, we utilize the lastWrittenLsn -infrastructure - * to calculate the LSNs to send. - * - * Bits set in *mask (if present) indicate pages already read; i.e. pages we - * can skip in this process. - * - * When performing a prefetch rather than a synchronous request, - * is_prefetch==true. Currently, it only affects how the request is accounted - * in the perf counters. - * - * NOTE: this function may indirectly update MyPState->pfs_hash; which - * invalidates any active pointers into the hash table. - */ -static uint64 -prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, - BlockNumber nblocks, const bits8 *mask, - bool is_prefetch) -{ - uint64 min_ring_index; - PrefetchRequest hashkey; -#ifdef USE_ASSERT_CHECKING - bool any_hits = false; -#endif - /* We will never read further ahead than our buffer can store. */ - nblocks = Max(1, Min(nblocks, readahead_buffer_size)); - - /* - * Use an intermediate PrefetchRequest struct as the hash key to ensure - * correct alignment and that the padding bytes are cleared. - */ - memset(&hashkey.buftag, 0, sizeof(BufferTag)); - hashkey.buftag = tag; - -Retry: - /* - * We can have gone into retry due to network error, so update stats with - * the latest available - */ - MyNeonCounters->pageserver_open_requests = - MyPState->ring_unused - MyPState->ring_receive; - MyNeonCounters->getpage_prefetches_buffered = - MyPState->n_responses_buffered; - - min_ring_index = UINT64_MAX; - for (int i = 0; i < nblocks; i++) - { - PrefetchRequest *slot = NULL; - PrfHashEntry *entry = NULL; - uint64 ring_index; - neon_request_lsns *lsns; - - if (PointerIsValid(mask) && BITMAP_ISSET(mask, i)) - continue; - - if (frlsns) - lsns = &frlsns[i]; - else - lsns = NULL; - -#ifdef USE_ASSERT_CHECKING - any_hits = true; -#endif - - slot = NULL; - entry = NULL; - - hashkey.buftag.blockNum = tag.blockNum + i; - entry = prfh_lookup(MyPState->prf_hash, &hashkey); - - if (entry != NULL) - { - slot = entry->slot; - ring_index = slot->my_ring_index; - Assert(slot == GetPrfSlot(ring_index)); - - Assert(slot->status != PRFS_UNUSED); - Assert(MyPState->ring_last <= ring_index && - ring_index < MyPState->ring_unused); - Assert(BufferTagsEqual(&slot->buftag, &hashkey.buftag)); - - /* - * If the caller specified a request LSN to use, only accept - * prefetch responses that satisfy that request. - */ - if (lsns) - { - if (!neon_prefetch_response_usable(lsns, slot)) - { - /* Wait for the old request to finish and discard it */ - if (!prefetch_wait_for(ring_index)) - goto Retry; - prefetch_set_unused(ring_index); - entry = NULL; - slot = NULL; - pgBufferUsage.prefetch.expired += 1; - MyNeonCounters->getpage_prefetch_discards_total += 1; - } - } - - if (entry != NULL) - { - /* - * We received a prefetch for a page that was recently read - * and removed from the buffers. Remove that request from the - * buffers. - */ - if (slot->status == PRFS_TAG_REMAINS) - { - prefetch_set_unused(ring_index); - entry = NULL; - slot = NULL; - } - else - { - min_ring_index = Min(min_ring_index, ring_index); - /* The buffered request is good enough, return that index */ - if (is_prefetch) - pgBufferUsage.prefetch.duplicates++; - continue; - } - } - } - else if (!is_prefetch) - { - pgBufferUsage.prefetch.misses += 1; - MyNeonCounters->getpage_prefetch_misses_total++; - } - /* - * We can only leave the block above by finding that there's - * no entry that can satisfy this request, either because there - * was no entry, or because the entry was invalid or didn't satisfy - * the LSNs provided. - * - * The code should've made sure to clear up the data. - */ - Assert(entry == NULL); - Assert(slot == NULL); - - /* There should be no buffer overflow */ - Assert(MyPState->ring_last + readahead_buffer_size >= MyPState->ring_unused); - - /* - * If the prefetch queue is full, we need to make room by clearing the - * oldest slot. If the oldest slot holds a buffer that was already - * received, we can just throw it away; we fetched the page - * unnecessarily in that case. If the oldest slot holds a request that - * we haven't received a response for yet, we have to wait for the - * response to that before we can continue. We might not have even - * flushed the request to the pageserver yet, it might be just sitting - * in the output buffer. In that case, we flush it and wait for the - * response. (We could decide not to send it, but it's hard to abort - * when the request is already in the output buffer, and 'not sending' - * a prefetch request kind of goes against the principles of - * prefetching) - */ - if (MyPState->ring_last + readahead_buffer_size == MyPState->ring_unused) - { - uint64 cleanup_index = MyPState->ring_last; - - slot = GetPrfSlot(cleanup_index); - - Assert(slot->status != PRFS_UNUSED); - - /* - * If there is good reason to run compaction on the prefetch buffers, - * try to do that. - */ - if (ReceiveBufferNeedsCompaction() && compact_prefetch_buffers()) - { - Assert(slot->status == PRFS_UNUSED); - } - else - { - /* - * We have the slot for ring_last, so that must still be in - * progress - */ - switch (slot->status) - { - case PRFS_REQUESTED: - Assert(MyPState->ring_receive == cleanup_index); - if (!prefetch_wait_for(cleanup_index)) - goto Retry; - prefetch_set_unused(cleanup_index); - pgBufferUsage.prefetch.expired += 1; - MyNeonCounters->getpage_prefetch_discards_total += 1; - break; - case PRFS_RECEIVED: - case PRFS_TAG_REMAINS: - prefetch_set_unused(cleanup_index); - pgBufferUsage.prefetch.expired += 1; - MyNeonCounters->getpage_prefetch_discards_total += 1; - break; - default: - pg_unreachable(); - } - } - } - - /* - * The next buffer pointed to by `ring_unused` is now definitely empty, so - * we can insert the new request to it. - */ - ring_index = MyPState->ring_unused; - - Assert(MyPState->ring_last <= ring_index && - ring_index <= MyPState->ring_unused); - - slot = GetPrfSlotNoCheck(ring_index); - - Assert(slot->status == PRFS_UNUSED); - - /* - * We must update the slot data before insertion, because the hash - * function reads the buffer tag from the slot. - */ - slot->buftag = hashkey.buftag; - slot->shard_no = get_shard_number(&tag); - slot->my_ring_index = ring_index; - slot->flags = 0; - - min_ring_index = Min(min_ring_index, ring_index); - - if (is_prefetch) - MyNeonCounters->getpage_prefetch_requests_total++; - else - MyNeonCounters->getpage_sync_requests_total++; - - prefetch_do_request(slot, lsns); - } - - MyNeonCounters->pageserver_open_requests = - MyPState->ring_unused - MyPState->ring_receive; - - Assert(any_hits); - - Assert(GetPrfSlot(min_ring_index)->status == PRFS_REQUESTED || - GetPrfSlot(min_ring_index)->status == PRFS_RECEIVED); - Assert(MyPState->ring_last <= min_ring_index && - min_ring_index < MyPState->ring_unused); - - if (flush_every_n_requests > 0 && - MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests) - { - if (!prefetch_flush_requests()) - { - /* - * Prefetch set is reset in case of error, so we should try to - * register our request once again - */ - goto Retry; - } - MyPState->ring_flush = MyPState->ring_unused; - } - - return min_ring_index; -} - -static bool -equal_requests(NeonRequest* a, NeonRequest* b) -{ - return a->reqid == b->reqid && a->lsn == b->lsn && a->not_modified_since == b->not_modified_since; -} - - -/* - * Note: this function can get canceled and use a long jump to the next catch - * context. Take care. - */ -static NeonResponse * -page_server_request(void const *req) -{ - NeonResponse *resp; - BufferTag tag = {0}; - shardno_t shard_no; - - switch (messageTag(req)) - { - case T_NeonExistsRequest: - CopyNRelFileInfoToBufTag(tag, ((NeonExistsRequest *) req)->rinfo); - break; - case T_NeonNblocksRequest: - CopyNRelFileInfoToBufTag(tag, ((NeonNblocksRequest *) req)->rinfo); - break; - case T_NeonDbSizeRequest: - NInfoGetDbOid(BufTagGetNRelFileInfo(tag)) = ((NeonDbSizeRequest *) req)->dbNode; - break; - case T_NeonGetPageRequest: - CopyNRelFileInfoToBufTag(tag, ((NeonGetPageRequest *) req)->rinfo); - tag.blockNum = ((NeonGetPageRequest *) req)->blkno; - break; - default: - neon_log(ERROR, "Unexpected request tag: %d", messageTag(req)); - } - shard_no = get_shard_number(&tag); - - /* - * Current sharding model assumes that all metadata is present only at shard 0. - * We still need to call get_shard_no() to check if shard map is up-to-date. - */ - if (((NeonRequest *) req)->tag != T_NeonGetPageRequest) - { - shard_no = 0; - } - - do - { - PG_TRY(); - { - while (!page_server->send(shard_no, (NeonRequest *) req) - || !page_server->flush(shard_no)) - { - /* do nothing */ - } - MyNeonCounters->pageserver_open_requests++; - consume_prefetch_responses(); - resp = page_server->receive(shard_no); - MyNeonCounters->pageserver_open_requests--; - } - PG_CATCH(); - { - /* - * Cancellation in this code needs to be handled better at some - * point, but this currently seems fine for now. - */ - page_server->disconnect(shard_no); - MyNeonCounters->pageserver_open_requests = 0; - - /* - * We know for sure we're not working on any prefetch pages after - * this. - */ - END_PREFETCH_RECEIVE_WORK(); - - PG_RE_THROW(); - } - PG_END_TRY(); - - } while (resp == NULL); - - return resp; -} - - -StringInfoData -nm_pack_request(NeonRequest *msg) -{ - StringInfoData s; - - initStringInfo(&s); - - pq_sendbyte(&s, msg->tag); - if (neon_protocol_version >= 3) - { - pq_sendint64(&s, msg->reqid); - } - pq_sendint64(&s, msg->lsn); - pq_sendint64(&s, msg->not_modified_since); - - switch (messageTag(msg)) - { - /* pagestore_client -> pagestore */ - case T_NeonExistsRequest: - { - NeonExistsRequest *msg_req = (NeonExistsRequest *) msg; - - pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); - pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); - pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); - pq_sendbyte(&s, msg_req->forknum); - - break; - } - case T_NeonNblocksRequest: - { - NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg; - - pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); - pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); - pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); - pq_sendbyte(&s, msg_req->forknum); - - break; - } - case T_NeonDbSizeRequest: - { - NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg; - - pq_sendint32(&s, msg_req->dbNode); - - break; - } - case T_NeonGetPageRequest: - { - NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg; - - pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); - pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); - pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); - pq_sendbyte(&s, msg_req->forknum); - pq_sendint32(&s, msg_req->blkno); - - break; - } - - case T_NeonGetSlruSegmentRequest: - { - NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg; - - pq_sendbyte(&s, msg_req->kind); - pq_sendint32(&s, msg_req->segno); - - break; - } - - /* pagestore -> pagestore_client. We never need to create these. */ - case T_NeonExistsResponse: - case T_NeonNblocksResponse: - case T_NeonGetPageResponse: - case T_NeonErrorResponse: - case T_NeonDbSizeResponse: - case T_NeonGetSlruSegmentResponse: - default: - neon_log(ERROR, "unexpected neon message tag 0x%02x", msg->tag); - break; - } - return s; -} - -NeonResponse * -nm_unpack_response(StringInfo s) -{ - NeonMessageTag tag = pq_getmsgbyte(s); - NeonResponse resp_hdr = {0}; /* make valgrind happy */ - NeonResponse *resp = NULL; - - resp_hdr.tag = tag; - if (neon_protocol_version >= 3) - { - resp_hdr.reqid = pq_getmsgint64(s); - resp_hdr.lsn = pq_getmsgint64(s); - resp_hdr.not_modified_since = pq_getmsgint64(s); - } - switch (tag) - { - /* pagestore -> pagestore_client */ - case T_NeonExistsResponse: - { - NeonExistsResponse *msg_resp = palloc0(sizeof(NeonExistsResponse)); - - if (neon_protocol_version >= 3) - { - NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); - NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); - NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4); - msg_resp->req.forknum = pq_getmsgbyte(s); - } - msg_resp->req.hdr = resp_hdr; - msg_resp->exists = pq_getmsgbyte(s); - pq_getmsgend(s); - - resp = (NeonResponse *) msg_resp; - break; - } - - case T_NeonNblocksResponse: - { - NeonNblocksResponse *msg_resp = palloc0(sizeof(NeonNblocksResponse)); - - if (neon_protocol_version >= 3) - { - NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); - NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); - NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4); - msg_resp->req.forknum = pq_getmsgbyte(s); - } - msg_resp->req.hdr = resp_hdr; - msg_resp->n_blocks = pq_getmsgint(s, 4); - pq_getmsgend(s); - - resp = (NeonResponse *) msg_resp; - break; - } - - case T_NeonGetPageResponse: - { - NeonGetPageResponse *msg_resp; - - msg_resp = MemoryContextAllocZero(MyPState->bufctx, PS_GETPAGERESPONSE_SIZE); - if (neon_protocol_version >= 3) - { - NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); - NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); - NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4); - msg_resp->req.forknum = pq_getmsgbyte(s); - msg_resp->req.blkno = pq_getmsgint(s, 4); - } - msg_resp->req.hdr = resp_hdr; - /* XXX: should be varlena */ - memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ); - pq_getmsgend(s); - - Assert(msg_resp->req.hdr.tag == T_NeonGetPageResponse); - - resp = (NeonResponse *) msg_resp; - break; - } - - case T_NeonDbSizeResponse: - { - NeonDbSizeResponse *msg_resp = palloc0(sizeof(NeonDbSizeResponse)); - - if (neon_protocol_version >= 3) - { - msg_resp->req.dbNode = pq_getmsgint(s, 4); - } - msg_resp->req.hdr = resp_hdr; - msg_resp->db_size = pq_getmsgint64(s); - pq_getmsgend(s); - - resp = (NeonResponse *) msg_resp; - break; - } - - case T_NeonErrorResponse: - { - NeonErrorResponse *msg_resp; - size_t msglen; - const char *msgtext; - - msgtext = pq_getmsgrawstring(s); - msglen = strlen(msgtext); - - msg_resp = palloc0(sizeof(NeonErrorResponse) + msglen + 1); - msg_resp->req = resp_hdr; - memcpy(msg_resp->message, msgtext, msglen + 1); - pq_getmsgend(s); - - resp = (NeonResponse *) msg_resp; - break; - } - - case T_NeonGetSlruSegmentResponse: - { - NeonGetSlruSegmentResponse *msg_resp; - int n_blocks; - msg_resp = palloc0(sizeof(NeonGetSlruSegmentResponse)); - - if (neon_protocol_version >= 3) - { - msg_resp->req.kind = pq_getmsgbyte(s); - msg_resp->req.segno = pq_getmsgint(s, 4); - } - msg_resp->req.hdr = resp_hdr; - - n_blocks = pq_getmsgint(s, 4); - msg_resp->n_blocks = n_blocks; - memcpy(msg_resp->data, pq_getmsgbytes(s, n_blocks * BLCKSZ), n_blocks * BLCKSZ); - pq_getmsgend(s); - - resp = (NeonResponse *) msg_resp; - break; - } - - /* - * pagestore_client -> pagestore - * - * We create these ourselves, and don't need to decode them. - */ - case T_NeonExistsRequest: - case T_NeonNblocksRequest: - case T_NeonGetPageRequest: - case T_NeonDbSizeRequest: - case T_NeonGetSlruSegmentRequest: - default: - neon_log(ERROR, "unexpected neon message tag 0x%02x", tag); - break; - } - - return resp; -} - -/* dump to json for debugging / error reporting purposes */ -char * -nm_to_string(NeonMessage *msg) -{ - StringInfoData s; - - initStringInfo(&s); - - switch (messageTag(msg)) - { - /* pagestore_client -> pagestore */ - case T_NeonExistsRequest: - { - NeonExistsRequest *msg_req = (NeonExistsRequest *) msg; - - appendStringInfoString(&s, "{\"type\": \"NeonExistsRequest\""); - appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); - appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); - appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); - appendStringInfoChar(&s, '}'); - break; - } - - case T_NeonNblocksRequest: - { - NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg; - - appendStringInfoString(&s, "{\"type\": \"NeonNblocksRequest\""); - appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); - appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); - appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); - appendStringInfoChar(&s, '}'); - break; - } - - case T_NeonGetPageRequest: - { - NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg; - - appendStringInfoString(&s, "{\"type\": \"NeonGetPageRequest\""); - appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); - appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); - appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); - appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); - appendStringInfoChar(&s, '}'); - break; - } - case T_NeonDbSizeRequest: - { - NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg; - - appendStringInfoString(&s, "{\"type\": \"NeonDbSizeRequest\""); - appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); - appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); - appendStringInfoChar(&s, '}'); - break; - } - case T_NeonGetSlruSegmentRequest: - { - NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg; - - appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentRequest\""); - appendStringInfo(&s, ", \"kind\": %u", msg_req->kind); - appendStringInfo(&s, ", \"segno\": %u", msg_req->segno); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); - appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); - appendStringInfoChar(&s, '}'); - break; - } - /* pagestore -> pagestore_client */ - case T_NeonExistsResponse: - { - NeonExistsResponse *msg_resp = (NeonExistsResponse *) msg; - - appendStringInfoString(&s, "{\"type\": \"NeonExistsResponse\""); - appendStringInfo(&s, ", \"exists\": %d}", - msg_resp->exists); - appendStringInfoChar(&s, '}'); - - break; - } - case T_NeonNblocksResponse: - { - NeonNblocksResponse *msg_resp = (NeonNblocksResponse *) msg; - - appendStringInfoString(&s, "{\"type\": \"NeonNblocksResponse\""); - appendStringInfo(&s, ", \"n_blocks\": %u}", - msg_resp->n_blocks); - appendStringInfoChar(&s, '}'); - - break; - } - case T_NeonGetPageResponse: - { -#if 0 - NeonGetPageResponse *msg_resp = (NeonGetPageResponse *) msg; -#endif - - appendStringInfoString(&s, "{\"type\": \"NeonGetPageResponse\""); - appendStringInfo(&s, ", \"page\": \"XXX\"}"); - appendStringInfoChar(&s, '}'); - break; - } - case T_NeonErrorResponse: - { - NeonErrorResponse *msg_resp = (NeonErrorResponse *) msg; - - /* FIXME: escape double-quotes in the message */ - appendStringInfoString(&s, "{\"type\": \"NeonErrorResponse\""); - appendStringInfo(&s, ", \"message\": \"%s\"}", msg_resp->message); - appendStringInfoChar(&s, '}'); - break; - } - case T_NeonDbSizeResponse: - { - NeonDbSizeResponse *msg_resp = (NeonDbSizeResponse *) msg; - - appendStringInfoString(&s, "{\"type\": \"NeonDbSizeResponse\""); - appendStringInfo(&s, ", \"db_size\": %ld}", - msg_resp->db_size); - appendStringInfoChar(&s, '}'); - - break; - } - case T_NeonGetSlruSegmentResponse: - { - NeonGetSlruSegmentResponse *msg_resp = (NeonGetSlruSegmentResponse *) msg; - - appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentResponse\""); - appendStringInfo(&s, ", \"n_blocks\": %u}", - msg_resp->n_blocks); - appendStringInfoChar(&s, '}'); - - break; - } - - default: - appendStringInfo(&s, "{\"type\": \"unknown 0x%02x\"", msg->tag); - } - return s.data; -} - /* * Wrapper around log_newpage() that makes a temporary copy of the block and * WAL-logs that. This makes it safe to use while holding only a shared lock @@ -2149,11 +461,6 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co static void neon_init(void) { - Size prfs_size; - - if (MyPState != NULL) - return; - /* * Sanity check that theperf counters array is sized correctly. We got * this wrong once, and the formula for max number of backends and aux @@ -2168,27 +475,6 @@ neon_init(void) elog(ERROR, "MyNeonCounters points past end of array"); #endif - prfs_size = offsetof(PrefetchState, prf_buffer) + - sizeof(PrefetchRequest) * readahead_buffer_size; - - MyPState = MemoryContextAllocZero(TopMemoryContext, prfs_size); - - MyPState->n_unused = readahead_buffer_size; - - MyPState->bufctx = SlabContextCreate(TopMemoryContext, - "NeonSMGR/prefetch", - SLAB_DEFAULT_BLOCK_SIZE * 17, - PS_GETPAGERESPONSE_SIZE); - MyPState->errctx = AllocSetContextCreate(TopMemoryContext, - "NeonSMGR/errors", - ALLOCSET_DEFAULT_SIZES); - MyPState->hashctx = AllocSetContextCreate(TopMemoryContext, - "NeonSMGR/prefetch", - ALLOCSET_DEFAULT_SIZES); - - MyPState->prf_hash = prfh_create(MyPState->hashctx, - readahead_buffer_size, NULL); - old_redo_read_buffer_filter = redo_read_buffer_filter; redo_read_buffer_filter = neon_redo_read_buffer_filter; @@ -2225,8 +511,10 @@ nm_adjust_lsn(XLogRecPtr lsn) /* * Return LSN for requesting pages and number of blocks from page server + * + * XXX: exposed so that prefetch_do_request() can call back here. */ -static void +void neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, neon_request_lsns *output, BlockNumber nblocks) { @@ -2429,112 +717,12 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, } } -/* - * neon_prefetch_response_usable -- Can a new request be satisfied by old one? - * - * This is used to check if the response to a prefetch request can be used to - * satisfy a page read now. - */ -static bool -neon_prefetch_response_usable(neon_request_lsns *request_lsns, - PrefetchRequest *slot) -{ - /* sanity check the LSN's on the old and the new request */ - Assert(request_lsns->request_lsn >= request_lsns->not_modified_since); - Assert(request_lsns->effective_request_lsn >= request_lsns->not_modified_since); - Assert(request_lsns->effective_request_lsn <= request_lsns->request_lsn); - Assert(slot->request_lsns.request_lsn >= slot->request_lsns.not_modified_since); - Assert(slot->request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since); - Assert(slot->request_lsns.effective_request_lsn <= slot->request_lsns.request_lsn); - Assert(slot->status != PRFS_UNUSED); - - /* - * The new request's LSN should never be older than the old one. This - * could be an Assert, except that for testing purposes, we do provide an - * interface in neon_test_utils to fetch pages at arbitary LSNs, which - * violates this. - * - * Similarly, the not_modified_since value calculated for a page should - * never move backwards. This assumption is a bit fragile; if we updated - * the last-written cache when we read in a page, for example, then it - * might. But as the code stands, it should not. - * - * (If two backends issue a request at the same time, they might race and - * calculate LSNs "out of order" with each other, but the prefetch queue - * is backend-private at the moment.) - */ - if (request_lsns->effective_request_lsn < slot->request_lsns.effective_request_lsn || - request_lsns->not_modified_since < slot->request_lsns.not_modified_since) - { - ereport(LOG, - (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "request with unexpected LSN after prefetch"), - errdetail("Request %X/%X not_modified_since %X/%X, prefetch %X/%X not_modified_since %X/%X)", - LSN_FORMAT_ARGS(request_lsns->effective_request_lsn), - LSN_FORMAT_ARGS(request_lsns->not_modified_since), - LSN_FORMAT_ARGS(slot->request_lsns.effective_request_lsn), - LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since)))); - return false; - } - - /*--- - * Each request to the pageserver has three LSN values associated with it: - * `not_modified_since`, `request_lsn`, and 'effective_request_lsn'. - * `not_modified_since` and `request_lsn` are sent to the pageserver, but - * in the primary node, we always use UINT64_MAX as the `request_lsn`, so - * we remember `effective_request_lsn` separately. In a primary, - * `effective_request_lsn` is the same as `not_modified_since`. - * See comments in neon_get_request_lsns why we can not use last flush WAL position here. - * - * To determine whether a response to a GetPage request issued earlier is - * still valid to satisfy a new page read, we look at the - * (not_modified_since, effective_request_lsn] range of the request. It is - * effectively a claim that the page has not been modified between those - * LSNs. If the range of the old request in the queue overlaps with the - * new request, we know that the page hasn't been modified in the union of - * the ranges. We can use the response to old request to satisfy the new - * request in that case. For example: - * - * 100 500 - * Old request: +--------+ - * - * 400 800 - * New request: +--------+ - * - * The old request claims that the page was not modified between LSNs 100 - * and 500, and the second claims that it was not modified between 400 and - * 800. Together they mean that the page was not modified between 100 and - * 800. Therefore the response to the old request is also valid for the - * new request. - * - * This logic also holds at the boundary case that the old request's LSN - * matches the new request's not_modified_since LSN exactly: - * - * 100 500 - * Old request: +--------+ - * - * 500 900 - * New request: +--------+ - * - * The response to the old request is the page as it was at LSN 500, and - * the page hasn't been changed in the range (500, 900], therefore the - * response is valid also for the new request. - */ - - /* this follows from the checks above */ - Assert(request_lsns->effective_request_lsn >= slot->request_lsns.not_modified_since); - - return request_lsns->not_modified_since <= slot->request_lsns.effective_request_lsn; -} - /* * neon_exists() -- Does the physical file exist? */ static bool neon_exists(SMgrRelation reln, ForkNumber forkNum) { - bool exists; - NeonResponse *resp; BlockNumber n_blocks; neon_request_lsns request_lsns; @@ -2593,67 +781,8 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1); - { - NeonExistsRequest request = { - .hdr.tag = T_NeonExistsRequest, - .hdr.reqid = GENERATE_REQUEST_ID(), - .hdr.lsn = request_lsns.request_lsn, - .hdr.not_modified_since = request_lsns.not_modified_since, - .rinfo = InfoFromSMgrRel(reln), - .forknum = forkNum - }; - resp = page_server_request(&request); - - switch (resp->tag) - { - case T_NeonExistsResponse: - { - NeonExistsResponse* exists_resp = (NeonExistsResponse *) resp; - if (neon_protocol_version >= 3) - { - if (!equal_requests(resp, &request.hdr) || - !RelFileInfoEquals(exists_resp->req.rinfo, request.rinfo) || - exists_resp->req.forknum != request.forknum) - { - NEON_PANIC_CONNECTION_STATE(-1, PANIC, - "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to exits request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}", - resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(exists_resp->req.rinfo), exists_resp->req.forknum, - request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), request.forknum); - } - } - exists = exists_resp->exists; - break; - } - case T_NeonErrorResponse: - if (neon_protocol_version >= 3) - { - if (!equal_requests(resp, &request.hdr)) - { - elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", - resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), - request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); - } - } - ereport(ERROR, - (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "[reqid %lx] could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X", - resp->reqid, - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forkNum, - LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), - errdetail("page server returned error: %s", - ((NeonErrorResponse *) resp)->message))); - break; - - default: - NEON_PANIC_CONNECTION_STATE(-1, PANIC, - "Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x", - T_NeonExistsResponse, T_NeonErrorResponse, resp->tag); - } - pfree(resp); - } - return exists; + return communicator_exists(InfoFromSMgrRel(reln), forkNum, &request_lsns); } /* @@ -3002,7 +1131,6 @@ static bool neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks) { - uint64 ring_index PG_USED_FOR_ASSERTS_ONLY; BufferTag tag; switch (reln->smgr_relpersistence) @@ -3039,17 +1167,13 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, tag.blockNum = blocknum; - ring_index = prefetch_register_bufferv(tag, NULL, iterblocks, - lfc_present, true); + communicator_prefetch_register_bufferv(tag, NULL, iterblocks, lfc_present); nblocks -= iterblocks; blocknum += iterblocks; - - Assert(ring_index < MyPState->ring_unused && - MyPState->ring_last <= ring_index); } - prefetch_pump_state(false); + communicator_prefetch_pump_state(false); return false; } @@ -3062,7 +1186,6 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, static bool neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) { - uint64 ring_index PG_USED_FOR_ASSERTS_ONLY; BufferTag tag; switch (reln->smgr_relpersistence) @@ -3087,12 +1210,9 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln)); - ring_index = prefetch_register_bufferv(tag, NULL, 1, NULL, true); + communicator_prefetch_register_bufferv(tag, NULL, 1, NULL); - Assert(ring_index < MyPState->ring_unused && - MyPState->ring_last <= ring_index); - - prefetch_pump_state(false); + communicator_prefetch_pump_state(false); return false; } @@ -3136,7 +1256,7 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, */ neon_log(SmgrTrace, "writeback noop"); - prefetch_pump_state(false); + communicator_prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -3144,208 +1264,6 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, #endif } -/* - * Read N pages at a specific LSN. - * - * *mask is set for pages read at a previous point in time, and which we - * should not touch, nor overwrite. - * New bits should be set in *mask for the pages we'successfully read. - * - * The offsets in request_lsns, buffers, and mask are linked. - */ -static void -neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_blockno, neon_request_lsns *request_lsns, - void **buffers, BlockNumber nblocks, const bits8 *mask) -{ - NeonResponse *resp; - uint64 ring_index; - PrfHashEntry *entry; - PrefetchRequest *slot; - PrefetchRequest hashkey; - - Assert(PointerIsValid(request_lsns)); - Assert(nblocks >= 1); - - /* - * Use an intermediate PrefetchRequest struct as the hash key to ensure - * correct alignment and that the padding bytes are cleared. - */ - memset(&hashkey.buftag, 0, sizeof(BufferTag)); - CopyNRelFileInfoToBufTag(hashkey.buftag, rinfo); - hashkey.buftag.forkNum = forkNum; - hashkey.buftag.blockNum = base_blockno; - - /* - * The redo process does not lock pages that it needs to replay but are - * not in the shared buffers, so a concurrent process may request the page - * after redo has decided it won't redo that page and updated the LwLSN - * for that page. If we're in hot standby we need to take care that we - * don't return until after REDO has finished replaying up to that LwLSN, - * as the page should have been locked up to that point. - * - * See also the description on neon_redo_read_buffer_filter below. - * - * NOTE: It is possible that the WAL redo process will still do IO due to - * concurrent failed read IOs. Those IOs should never have a request_lsn - * that is as large as the WAL record we're currently replaying, if it - * weren't for the behaviour of the LwLsn cache that uses the highest - * value of the LwLsn cache when the entry is not found. - */ - prefetch_register_bufferv(hashkey.buftag, request_lsns, nblocks, mask, false); - - for (int i = 0; i < nblocks; i++) - { - void *buffer = buffers[i]; - BlockNumber blockno = base_blockno + i; - neon_request_lsns *reqlsns = &request_lsns[i]; - TimestampTz start_ts, end_ts; - - if (PointerIsValid(mask) && BITMAP_ISSET(mask, i)) - continue; - - start_ts = GetCurrentTimestamp(); - - if (RecoveryInProgress() && MyBackendType != B_STARTUP) - XLogWaitForReplayOf(reqlsns->request_lsn); - - /* - * Try to find prefetched page in the list of received pages. - */ -Retry: - hashkey.buftag.blockNum = blockno; - entry = prfh_lookup(MyPState->prf_hash, &hashkey); - - if (entry != NULL) - { - slot = entry->slot; - if (neon_prefetch_response_usable(reqlsns, slot)) - { - ring_index = slot->my_ring_index; - } - else - { - /* - * Cannot use this prefetch, discard it - * - * We can't drop cache for not-yet-received requested items. It is - * unlikely this happens, but it can happen if prefetch distance - * is large enough and a backend didn't consume all prefetch - * requests. - */ - if (slot->status == PRFS_REQUESTED) - { - if (!prefetch_wait_for(slot->my_ring_index)) - goto Retry; - } - /* drop caches */ - prefetch_set_unused(slot->my_ring_index); - pgBufferUsage.prefetch.expired += 1; - MyNeonCounters->getpage_prefetch_discards_total++; - /* make it look like a prefetch cache miss */ - entry = NULL; - } - } - - do - { - if (entry == NULL) - { - ring_index = prefetch_register_bufferv(hashkey.buftag, reqlsns, 1, NULL, false); - Assert(ring_index != UINT64_MAX); - slot = GetPrfSlot(ring_index); - } - else - { - /* - * Empty our reference to the prefetch buffer's hash entry. When - * we wait for prefetches, the entry reference is invalidated by - * potential updates to the hash, and when we reconnect to the - * pageserver the prefetch we're waiting for may be dropped, in - * which case we need to retry and take the branch above. - */ - entry = NULL; - } - - Assert(slot->my_ring_index == ring_index); - Assert(MyPState->ring_last <= ring_index && - MyPState->ring_unused > ring_index); - Assert(slot->status != PRFS_UNUSED); - Assert(GetPrfSlot(ring_index) == slot); - - } while (!prefetch_wait_for(ring_index)); - - Assert(slot->status == PRFS_RECEIVED); - Assert(memcmp(&hashkey.buftag, &slot->buftag, sizeof(BufferTag)) == 0); - Assert(hashkey.buftag.blockNum == base_blockno + i); - - resp = slot->response; - - switch (resp->tag) - { - case T_NeonGetPageResponse: - { - NeonGetPageResponse* getpage_resp = (NeonGetPageResponse *) resp; - if (neon_protocol_version >= 3) - { - if (resp->reqid != slot->reqid || - resp->lsn != slot->request_lsns.request_lsn || - resp->not_modified_since != slot->request_lsns.not_modified_since || - !RelFileInfoEquals(getpage_resp->req.rinfo, rinfo) || - getpage_resp->req.forknum != forkNum || - getpage_resp->req.blkno != base_blockno + i) - { - NEON_PANIC_CONNECTION_STATE(-1, PANIC, - "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u} to get page request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u}", - resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(getpage_resp->req.rinfo), getpage_resp->req.forknum, getpage_resp->req.blkno, - slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since), RelFileInfoFmt(rinfo), forkNum, base_blockno + i); - } - } - memcpy(buffer, getpage_resp->page, BLCKSZ); - - /* - * With lfc_store_prefetch_result=true prefetch result is stored in LFC in prefetch_pump_state when response is received - * from page server. But if lfc_store_prefetch_result=false then it is not yet stored in LFC and we have to do it here - * under buffer lock. - */ - if (!lfc_store_prefetch_result) - lfc_write(rinfo, forkNum, blockno, buffer); - break; - } - case T_NeonErrorResponse: - if (neon_protocol_version >= 3) - { - if (resp->reqid != slot->reqid || - resp->lsn != slot->request_lsns.request_lsn || - resp->not_modified_since != slot->request_lsns.not_modified_since) - { - elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", - resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), - slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since)); - } - } - ereport(ERROR, - (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "[shard %d, reqid %lx] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", - slot->shard_no, resp->reqid, blockno, RelFileInfoFmt(rinfo), - forkNum, LSN_FORMAT_ARGS(reqlsns->effective_request_lsn)), - errdetail("page server returned error: %s", - ((NeonErrorResponse *) resp)->message))); - break; - default: - NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC, - "Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x", - T_NeonGetPageResponse, T_NeonErrorResponse, resp->tag); - } - - /* buffer was used, clean up for later reuse */ - prefetch_set_unused(ring_index); - prefetch_cleanup_trailing_unused(); - - end_ts = GetCurrentTimestamp(); - inc_getpage_wait(end_ts >= start_ts ? (end_ts - start_ts) : 0); - } -} - /* * While function is defined in the neon extension it's used within neon_test_utils directly. * To avoid breaking tests in the runtime please keep function signature in sync. @@ -3354,7 +1272,7 @@ void neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, neon_request_lsns request_lsns, void *buffer) { - neon_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL); + communicator_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL); } #if PG_MAJORVERSION_NUM < 17 @@ -3370,6 +1288,8 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer #endif { neon_request_lsns request_lsns; + bits8 present; + void *bufferp; switch (reln->smgr_relpersistence) { @@ -3389,11 +1309,13 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer } /* Try to read PS results if they are available */ - prefetch_pump_state(false); + communicator_prefetch_pump_state(false); neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1); - if (prefetch_lookup(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, buffer)) + present = 0; + bufferp = buffer; + if (communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present)) { /* Prefetch hit */ return; @@ -3411,7 +1333,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer /* * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. */ - prefetch_pump_state(false); + communicator_prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) @@ -3521,16 +1443,16 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, nblocks, PG_IOV_MAX); /* Try to read PS results if they are available */ - prefetch_pump_state(false); + communicator_prefetch_pump_state(false); neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, nblocks); memset(read_pages, 0, sizeof(read_pages)); - prefetch_result = prefetch_lookupv(InfoFromSMgrRel(reln), forknum, - blocknum, request_lsns, nblocks, - buffers, read_pages); + prefetch_result = communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forknum, + blocknum, request_lsns, nblocks, + buffers, read_pages); if (prefetch_result == nblocks) return; @@ -3546,13 +1468,13 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, if (prefetch_result + lfc_result == nblocks) return; - neon_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, - buffers, nblocks, read_pages); + communicator_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, + buffers, nblocks, read_pages); /* * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. */ - prefetch_pump_state(false); + communicator_prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL if (forknum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) @@ -3737,7 +1659,7 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer); - prefetch_pump_state(false); + communicator_prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -3799,7 +1721,7 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks); - prefetch_pump_state(false); + communicator_prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -3815,7 +1737,6 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, static BlockNumber neon_nblocks(SMgrRelation reln, ForkNumber forknum) { - NeonResponse *resp; BlockNumber n_blocks; neon_request_lsns request_lsns; @@ -3847,74 +1768,15 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1); - { - NeonNblocksRequest request = { - .hdr.tag = T_NeonNblocksRequest, - .hdr.reqid = GENERATE_REQUEST_ID(), - .hdr.lsn = request_lsns.request_lsn, - .hdr.not_modified_since = request_lsns.not_modified_since, - .rinfo = InfoFromSMgrRel(reln), - .forknum = forknum, - }; + n_blocks = communicator_nblocks(InfoFromSMgrRel(reln), forknum, &request_lsns); + update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks); - resp = page_server_request(&request); + neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum, + LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), + n_blocks); - switch (resp->tag) - { - case T_NeonNblocksResponse: - { - NeonNblocksResponse * relsize_resp = (NeonNblocksResponse *) resp; - if (neon_protocol_version >= 3) - { - if (!equal_requests(resp, &request.hdr) || - !RelFileInfoEquals(relsize_resp->req.rinfo, request.rinfo) || - relsize_resp->req.forknum != forknum) - { - NEON_PANIC_CONNECTION_STATE(-1, PANIC, - "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}", - resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(relsize_resp->req.rinfo), relsize_resp->req.forknum, - request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), forknum); - } - } - n_blocks = relsize_resp->n_blocks; - break; - } - case T_NeonErrorResponse: - if (neon_protocol_version >= 3) - { - if (!equal_requests(resp, &request.hdr)) - { - elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", - resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), - request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); - } - } - ereport(ERROR, - (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "[reqid %lx] could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X", - resp->reqid, - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, - LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), - errdetail("page server returned error: %s", - ((NeonErrorResponse *) resp)->message))); - break; - - default: - NEON_PANIC_CONNECTION_STATE(-1, PANIC, - "Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x", - T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag); - } - update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks); - - neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, - LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), - n_blocks); - - pfree(resp); - } return n_blocks; } @@ -3924,7 +1786,6 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) int64 neon_dbsize(Oid dbNode) { - NeonResponse *resp; int64 db_size; neon_request_lsns request_lsns; NRelFileInfo dummy_node = {0}; @@ -3932,66 +1793,11 @@ neon_dbsize(Oid dbNode) neon_get_request_lsns(dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1); - { - NeonDbSizeRequest request = { - .hdr.tag = T_NeonDbSizeRequest, - .hdr.reqid = GENERATE_REQUEST_ID(), - .hdr.lsn = request_lsns.request_lsn, - .hdr.not_modified_since = request_lsns.not_modified_since, - .dbNode = dbNode, - }; + db_size = communicator_dbsize(dbNode, &request_lsns); - resp = page_server_request(&request); + neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes", + dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size); - switch (resp->tag) - { - case T_NeonDbSizeResponse: - { - NeonDbSizeResponse* dbsize_resp = (NeonDbSizeResponse *) resp; - if (neon_protocol_version >= 3) - { - if (!equal_requests(resp, &request.hdr) || - dbsize_resp->req.dbNode != dbNode) - { - NEON_PANIC_CONNECTION_STATE(-1, PANIC, - "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u} to get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u}", - resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), dbsize_resp->req.dbNode, - request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), dbNode); - } - } - db_size = dbsize_resp->db_size; - break; - } - case T_NeonErrorResponse: - if (neon_protocol_version >= 3) - { - if (!equal_requests(resp, &request.hdr)) - { - elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", - resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), - request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); - } - } - ereport(ERROR, - (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "[reqid %lx] could not read db size of db %u from page server at lsn %X/%08X", - resp->reqid, - dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), - errdetail("page server returned error: %s", - ((NeonErrorResponse *) resp)->message))); - break; - - default: - NEON_PANIC_CONNECTION_STATE(-1, PANIC, - "Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x", - T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag); - } - - neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes", - dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size); - - pfree(resp); - } return db_size; } @@ -4090,7 +1896,7 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum) neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop"); - prefetch_pump_state(false); + communicator_prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -4291,9 +2097,7 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf not_modified_since; SlruKind kind; int n_blocks; - shardno_t shard_no = 0; /* All SLRUs are at shard 0 */ - NeonResponse *resp; - NeonGetSlruSegmentRequest request; + neon_request_lsns request_lsns; /* * Compute a request LSN to use, similar to neon_get_request_lsns() but the @@ -4332,74 +2136,12 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf else return -1; - request = (NeonGetSlruSegmentRequest) { - .hdr.tag = T_NeonGetSlruSegmentRequest, - .hdr.reqid = GENERATE_REQUEST_ID(), - .hdr.lsn = request_lsn, - .hdr.not_modified_since = not_modified_since, - .kind = kind, - .segno = segno - }; + request_lsns.request_lsn = request_lsn; + request_lsns.not_modified_since = not_modified_since; + request_lsns.effective_request_lsn = request_lsn; - do - { - while (!page_server->send(shard_no, &request.hdr) || !page_server->flush(shard_no)); + n_blocks = communicator_read_slru_segment(kind, segno, &request_lsns, buffer); - consume_prefetch_responses(); - - resp = page_server->receive(shard_no); - } while (resp == NULL); - - switch (resp->tag) - { - case T_NeonGetSlruSegmentResponse: - { - NeonGetSlruSegmentResponse* slru_resp = (NeonGetSlruSegmentResponse *) resp; - if (neon_protocol_version >= 3) - { - if (!equal_requests(resp, &request.hdr) || - slru_resp->req.kind != kind || - slru_resp->req.segno != segno) - { - NEON_PANIC_CONNECTION_STATE(-1, PANIC, - "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u} to get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u}", - resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), slru_resp->req.kind, slru_resp->req.segno, - request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), kind, segno); - } - } - n_blocks = slru_resp->n_blocks; - memcpy(buffer, slru_resp->data, n_blocks*BLCKSZ); - break; - } - case T_NeonErrorResponse: - if (neon_protocol_version >= 3) - { - if (!equal_requests(resp, &request.hdr)) - { - elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", - resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), - request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); - } - } - ereport(ERROR, - (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "[reqid %lx] could not read SLRU %d segment %d at lsn %X/%08X", - resp->reqid, - kind, - segno, - LSN_FORMAT_ARGS(request_lsn)), - errdetail("page server returned error: %s", - ((NeonErrorResponse *) resp)->message))); - break; - - default: - NEON_PANIC_CONNECTION_STATE(-1, PANIC, - "Expected GetSlruSegment (0x%02x) or Error (0x%02x) response to GetSlruSegmentRequest, but got 0x%02x", - T_NeonGetSlruSegmentResponse, T_NeonErrorResponse, resp->tag); - } - pfree(resp); - - reconfigure_timeout_if_needed(); return n_blocks; } @@ -4435,7 +2177,7 @@ AtEOXact_neon(XactEvent event, void *arg) } break; } - reconfigure_timeout_if_needed(); + communicator_reconfigure_timeout_if_needed(); } static const struct f_smgr neon_smgr = @@ -4493,6 +2235,7 @@ smgr_init_neon(void) smgr_init_standard(); neon_init(); + communicator_init(); } @@ -4522,25 +2265,14 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, * This length is later reused when we open the smgr to read the * block, which is fine and expected. */ - NeonResponse *response; - NeonNblocksResponse *nbresponse; - NeonNblocksRequest request = { - .hdr = (NeonRequest) { - .tag = T_NeonNblocksRequest, - .reqid = GENERATE_REQUEST_ID(), - .lsn = end_recptr, - .not_modified_since = end_recptr, - }, - .rinfo = rinfo, - .forknum = forknum, - }; + neon_request_lsns request_lsns; - response = page_server_request(&request); + neon_get_request_lsns(rinfo, forknum, + REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1); - Assert(response->tag == T_NeonNblocksResponse); - nbresponse = (NeonNblocksResponse *) response; + relsize = communicator_nblocks(rinfo, forknum, &request_lsns); - relsize = Max(nbresponse->n_blocks, blkno + 1); + relsize = Max(relsize, blkno + 1); set_cached_relsize(rinfo, forknum, relsize); neon_set_lwlsn_relation(end_recptr, rinfo, forknum); @@ -4692,94 +2424,3 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) } return no_redo_needed; } - -static void -reconfigure_timeout_if_needed(void) -{ - bool needs_set = MyPState->ring_receive != MyPState->ring_unused && - readahead_getpage_pull_timeout_ms > 0; - - if (needs_set != timeout_set) - { - /* The background writer doens't (shouldn't) read any pages */ - Assert(!AmBackgroundWriterProcess()); - /* The checkpointer doens't (shouldn't) read any pages */ - Assert(!AmCheckpointerProcess()); - - if (unlikely(PS_TIMEOUT_ID == 0)) - { - PS_TIMEOUT_ID = RegisterTimeout(USER_TIMEOUT, pagestore_timeout_handler); - } - - if (needs_set) - { -#if PG_MAJORVERSION_NUM <= 14 - enable_timeout_after(PS_TIMEOUT_ID, readahead_getpage_pull_timeout_ms); -#else - enable_timeout_every( - PS_TIMEOUT_ID, - TimestampTzPlusMilliseconds(GetCurrentTimestamp(), - readahead_getpage_pull_timeout_ms), - readahead_getpage_pull_timeout_ms - ); -#endif - timeout_set = true; - } - else - { - Assert(timeout_set); - disable_timeout(PS_TIMEOUT_ID, false); - timeout_set = false; - } - } -} - -static void -pagestore_timeout_handler(void) -{ -#if PG_MAJORVERSION_NUM <= 14 - /* - * PG14: Setting a repeating timeout is not possible, so we signal here - * that the timeout has already been reset, and by telling the system - * that system will re-schedule it later if we need to. - */ - timeout_set = false; -#endif - timeout_signaled = true; - InterruptPending = true; -} - -static process_interrupts_callback_t prev_interrupt_cb; - -/* - * Process new data received in our active PageStream sockets. - * - * This relies on the invariant that all pipelined yet-to-be-received requests - * are getPage requests managed by MyPState. This is currently true, any - * modification will probably require some stuff to make it work again. - */ -static bool -pagestore_smgr_processinterrupts(void) -{ - if (timeout_signaled) - { - if (!readpage_reentrant_guard && readahead_getpage_pull_timeout_ms > 0) - prefetch_pump_state(true); - - timeout_signaled = false; - reconfigure_timeout_if_needed(); - } - - if (!prev_interrupt_cb) - return false; - - return prev_interrupt_cb(); -} - - -void -pagestore_smgr_init(void) -{ - prev_interrupt_cb = ProcessInterruptsCallback; - ProcessInterruptsCallback = pagestore_smgr_processinterrupts; -}