From 883379f936718213098f4286f484b7dba4ab2027 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sun, 8 Jun 2025 18:26:11 +0300 Subject: [PATCH] Add cache for relation kind --- pgxn/neon/Makefile | 1 + pgxn/neon/libpagestore.c | 3 + pgxn/neon/pagestore_client.h | 26 +++ pgxn/neon/pagestore_smgr.c | 86 ++++++--- pgxn/neon/relkind_cache.c | 356 +++++++++++++++++++++++++++++++++++ pgxn/neon/relsize_cache.c | 1 + 6 files changed, 445 insertions(+), 28 deletions(-) create mode 100644 pgxn/neon/relkind_cache.c diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index 958ca5c378..48c38839c6 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -19,6 +19,7 @@ OBJS = \ neon_walreader.o \ pagestore_smgr.o \ relsize_cache.o \ + relkind_cache.o \ unstable_extensions.o \ walproposer.o \ walproposer_pg.o \ diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 1031f185a6..010e5dd966 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -1630,6 +1630,9 @@ pg_init_libpagestore(void) 0, NULL, NULL, NULL); + relsize_hash_init(); + relkind_hash_init(); + if (page_server != NULL) neon_log(ERROR, "libpagestore already loaded"); diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 4470d3a94d..7bc5e0dd5a 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -298,4 +298,30 @@ extern void set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumb extern void update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size); extern void forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum); +enum RelKindEntryFlags +{ + RELKIND_UNLOGGED = 1, /* relation is temp or unlogged */ + RELKIND_UNLOGGED_BUILD = 2, /* unlogged index build */ + RELKIND_RAW = 4 /* relation persistence is not known */ +}; + +/* utils for neon relkind cache */ +typedef struct +{ + NRelFileInfo rel; + uint8 flags; /* See RelKindEntryFlags */ + uint16 access_count; + dlist_node lru_node; /* LRU list node */ +} RelKindEntry; + + +extern void relkind_hash_init(void); +extern RelKindEntry* set_cached_relkind(NRelFileInfo rinfo, uint8 flags); +extern RelKindEntry* get_cached_relkind(NRelFileInfo rinfo, uint8* flags); +extern void store_cached_relkind(RelKindEntry* entry, uint8 flags); +extern void clear_cached_relkind_flags(RelKindEntry* entry, uint8 flags); +extern void unpin_cached_relkind(RelKindEntry* entry); +extern void unlock_cached_relkind(void); +extern void forget_cached_relkind(NRelFileInfo rinfo); + #endif /* PAGESTORE_CLIENT_H */ diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index d3e51ba682..d55d3fa464 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -97,6 +97,7 @@ typedef enum int debug_compare_local; static NRelFileInfo unlogged_build_rel_info; +static RelKindEntry* unlogged_build_rel_entry; static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; static bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id); @@ -877,6 +878,7 @@ neon_unlink(NRelFileInfoBackend rinfo, ForkNumber forkNum, bool isRedo) if (!NRelFileInfoBackendIsTemp(rinfo)) { forget_cached_relsize(InfoFromNInfoB(rinfo), forkNum); + forget_cached_relkind(InfoFromNInfoB(rinfo)); } } @@ -1601,26 +1603,35 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo #endif { XLogRecPtr lsn; + RelKindEntry *entry; + bool unlogged; + uint8 flags; switch (reln->smgr_relpersistence) { case 0: - /* This is a bit tricky. 
Check if the relation exists locally */
-			if (mdexists(reln, debug_compare_local ? INIT_FORKNUM : forknum))
+			entry = get_cached_relkind(InfoFromSMgrRel(reln), &flags);
+			if (entry)
+			{
+				/* Relation persistence is not known yet: determine it with mdexists() */
+				unlogged = mdexists(reln, debug_compare_local ? INIT_FORKNUM : forknum);
+				store_cached_relkind(entry, unlogged ? RELKIND_UNLOGGED : 0);
+			}
+			else
+			{
+				unlogged = (flags & (RELKIND_UNLOGGED_BUILD|RELKIND_UNLOGGED)) != 0;
+			}
+			if (unlogged)
 			{
-				/* It exists locally. Guess it's unlogged then. */
 #if PG_MAJORVERSION_NUM >= 17
 				mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync);
 #else
 				mdwrite(reln, forknum, blocknum, buffer, skipFsync);
 #endif
-				/*
-				 * We could set relpersistence now that we have determined
-				 * that it's local. But we don't dare to do it, because that
-				 * would immediately allow reads as well, which shouldn't
-				 * happen. We could cache it with a different 'relpersistence'
-				 * value, but this isn't performance critical.
-				 */
+				if (flags & RELKIND_UNLOGGED_BUILD)
+				{
+					unlock_cached_relkind();
+				}
 				return;
 			}
 			break;
@@ -1682,22 +1693,32 @@ static void
 neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 			const void **buffers, BlockNumber nblocks, bool skipFsync)
 {
+	RelKindEntry *entry;
+	bool		unlogged;
+	uint8		flags;
+
 	switch (reln->smgr_relpersistence)
 	{
 		case 0:
-			/* This is a bit tricky. Check if the relation exists locally */
-			if (mdexists(reln, debug_compare_local ? INIT_FORKNUM : forknum))
+			entry = get_cached_relkind(InfoFromSMgrRel(reln), &flags);
+			if (entry)
+			{
+				/* Relation persistence is not known yet: determine it with mdexists() */
+				unlogged = mdexists(reln, debug_compare_local ? INIT_FORKNUM : forknum);
+				store_cached_relkind(entry, unlogged ? RELKIND_UNLOGGED : 0);
+			}
+			else
+			{
+				unlogged = (flags & (RELKIND_UNLOGGED_BUILD|RELKIND_UNLOGGED)) != 0;
+			}
+			if (unlogged)
 			{
-				/* It exists locally. Guess it's unlogged then. */
+				/* Unlogged or temp relation, or unlogged build in progress: write locally */
 				mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync);
-
-			/*
-			 * We could set relpersistence now that we have determined
-			 * that it's local. But we don't dare to do it, because that
-			 * would immediately allow reads as well, which shouldn't
-			 * happen. We could cache it with a different 'relpersistence'
-			 * value, but this isn't performance critical. 
- */ + if (flags & RELKIND_UNLOGGED_BUILD) + { + unlock_cached_relkind(); + } return; } break; @@ -1985,6 +2006,7 @@ neon_start_unlogged_build(SMgrRelation reln) case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: unlogged_build_rel_info = InfoFromSMgrRel(reln); + unlogged_build_rel_entry = set_cached_relkind(unlogged_build_rel_info, RELKIND_UNLOGGED|RELKIND_UNLOGGED_BUILD); unlogged_build_phase = UNLOGGED_BUILD_NOT_PERMANENT; if (debug_compare_local) { @@ -2007,6 +2029,7 @@ neon_start_unlogged_build(SMgrRelation reln) #endif unlogged_build_rel_info = InfoFromSMgrRel(reln); + unlogged_build_rel_entry = set_cached_relkind(unlogged_build_rel_info, RELKIND_UNLOGGED_BUILD); unlogged_build_phase = UNLOGGED_BUILD_PHASE_1; /* @@ -2022,6 +2045,15 @@ neon_start_unlogged_build(SMgrRelation reln) } } +static void +unlogged_build_cleanup(void) +{ + NRelFileInfoInvalidate(unlogged_build_rel_info); + unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; + unpin_cached_relkind(unlogged_build_rel_entry); + unlogged_build_rel_entry = NULL; +} + /* * neon_finish_unlogged_build_phase_1() * @@ -2048,8 +2080,7 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln) */ if (IsParallelWorker()) { - NRelFileInfoInvalidate(unlogged_build_rel_info); - unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; + unlogged_build_cleanup(); } else unlogged_build_phase = UNLOGGED_BUILD_PHASE_2; @@ -2101,6 +2132,8 @@ neon_end_unlogged_build(SMgrRelation reln) InfoFromNInfoB(rinfob), MAIN_FORKNUM); + clear_cached_relkind_flags(unlogged_build_rel_entry, RELKIND_UNLOGGED_BUILD); + /* Remove local copy */ for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++) { @@ -2121,8 +2154,7 @@ neon_end_unlogged_build(SMgrRelation reln) if (debug_compare_local) mdunlink(rinfob, INIT_FORKNUM, true); } - NRelFileInfoInvalidate(unlogged_build_rel_info); - unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; + unlogged_build_cleanup(); } #define STRPREFIX(str, prefix) (strncmp(str, prefix, strlen(prefix)) == 0) @@ -2194,8 +2226,7 @@ AtEOXact_neon(XactEvent event, void *arg) * Forget about any build we might have had in progress. 
The local
			 * file will be unlinked by smgrDoPendingDeletes()
			 */
-			NRelFileInfoInvalidate(unlogged_build_rel_info);
-			unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
+			unlogged_build_cleanup();
 			break;

 		case XACT_EVENT_COMMIT:
@@ -2206,8 +2237,7 @@
 		case XACT_EVENT_PRE_PREPARE:
 			if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
 			{
-				NRelFileInfoInvalidate(unlogged_build_rel_info);
-				unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
+				unlogged_build_cleanup();
 				ereport(ERROR,
 						(errcode(ERRCODE_INTERNAL_ERROR),
 						 (errmsg(NEON_TAG "unlogged index build was not properly finished"))));
diff --git a/pgxn/neon/relkind_cache.c b/pgxn/neon/relkind_cache.c
new file mode 100644
index 0000000000..052468b405
--- /dev/null
+++ b/pgxn/neon/relkind_cache.c
@@ -0,0 +1,356 @@
+/*-------------------------------------------------------------------------
+ *
+ * relkind_cache.c
+ *	  Cache for marking unlogged relations
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "neon_pgversioncompat.h"
+
+#include "pagestore_client.h"
+#include RELFILEINFO_HDR
+#include "lib/ilist.h"
+#include "storage/smgr.h"
+#include "storage/lwlock.h"
+#include "storage/spin.h"
+#include "storage/ipc.h"
+#include "storage/shmem.h"
+#include "catalog/pg_tablespace_d.h"
+#include "utils/dynahash.h"
+#include "utils/guc.h"
+
+#if PG_VERSION_NUM >= 150000
+#include "miscadmin.h"
+#endif
+
+/*
+ * The main goal of this cache is to avoid the mdexists() calls in
+ * neon_write() that are needed to distinguish unlogged relations.
+ *
+ * The hash is also used to mark relations during an unlogged build.
+ * It has a limited size and evicts entries with an LRU algorithm.
+ * Relations involved in an unlogged build are pinned in the cache
+ * (assuming that the number of concurrent unlogged builds is small).
+ *
+ * Another task of this hash is to prevent a race condition at unlogged
+ * build termination: one backend may still be writing an evicted page to
+ * local disk while the backend performing the unlogged build completes it
+ * and unlinks the local files. To prevent this, writers hold a shared
+ * lock for the duration of each local write; because the lock is shared,
+ * it does not serialize concurrent writes. The exclusive lock is taken by
+ * neon_end_unlogged_build() (via clear_cached_relkind_flags()) to change
+ * the relation kind.
+ */
+
+typedef struct
+{
+	size_t		size;
+	uint64		hits;
+	uint64		misses;
+	uint64		pinned;
+	slock_t		mutex;
+	dlist_head	lru;			/* doubly linked list for the LRU
+								 * replacement algorithm */
+} RelKindHashControl;
+
+static HTAB *relkind_hash;
+static LWLockId relkind_lock;
+static int	relkind_hash_size;
+static RelKindHashControl* relkind_ctl;
+static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
+#if PG_VERSION_NUM >= 150000
+static shmem_request_hook_type prev_shmem_request_hook = NULL;
+static void relkind_shmem_request(void);
+#endif
+
+#define MAX_CONCURRENTLY_ACCESSED_UNLOGGED_RELS 100 /* MaxBackends? */
+
+/*
+ * Should not be smaller than MAX_CONCURRENTLY_ACCESSED_UNLOGGED_RELS.
+ * A cache entry is 32 bytes, so this default takes about 2 MB,
+ * which seems reasonable.
+ */
+#define DEFAULT_RELKIND_HASH_SIZE (64 * 1024)
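+
+/*
+ * Illustrative sketch (comment only, not compiled): how a writer combines
+ * the functions below. The control flow mirrors neon_write() in
+ * pagestore_smgr.c; rinfo, reln and forknum stand for the caller's values.
+ *
+ *     uint8         flags;
+ *     RelKindEntry *entry = get_cached_relkind(rinfo, &flags);
+ *
+ *     if (entry != NULL)
+ *     {
+ *         // persistence unknown (RELKIND_RAW): probe the local filesystem
+ *         // once, cache the verdict and unpin the entry
+ *         bool unlogged = mdexists(reln, forknum);
+ *         store_cached_relkind(entry, unlogged ? RELKIND_UNLOGGED : 0);
+ *     }
+ *     else if (flags & RELKIND_UNLOGGED_BUILD)
+ *     {
+ *         // get_cached_relkind() returned with relkind_lock held in shared
+ *         // mode: perform the local write, then release the lock
+ *         unlock_cached_relkind();
+ *     }
+ */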
+ */ +#define DEFAULT_RELKIND_HASH_SIZE (64 * 1024) + + + +static void +relkind_cache_startup(void) +{ + static HASHCTL info; + bool found; + + if (prev_shmem_startup_hook) + prev_shmem_startup_hook(); + + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + relkind_ctl = (RelKindHashControl *) ShmemInitStruct("relkind_hash", sizeof(RelKindHashControl), &found); + if (!found) + { + relkind_lock = (LWLockId) GetNamedLWLockTranche("neon_relkind"); + info.keysize = sizeof(NRelFileInfo); + info.entrysize = sizeof(RelKindEntry); + relkind_hash = ShmemInitHash("neon_relkind", + relkind_hash_size, relkind_hash_size, + &info, + HASH_ELEM | HASH_BLOBS); + SpinLockInit(&relkind_ctl->mutex); + relkind_ctl->size = 0; + relkind_ctl->hits = 0; + relkind_ctl->misses = 0; + relkind_ctl->pinned = 0; + dlist_init(&relkind_ctl->lru); + } + LWLockRelease(AddinShmemInitLock); +} + +/* + * Intialize new entry. This function is used by neon_start_unlogged_build to mark relation involved in unlogged build. + * In case of overflow removes least recently used entry. + * Return pinned entry. It will be released by unpin_cached_relkind at the end of unlogged build. + */ +RelKindEntry* +set_cached_relkind(NRelFileInfo rinfo, uint8 flags) +{ + RelKindEntry *entry = NULL; + bool found; + + /* Use spinlock to prevent concurrent hash modifitcation */ + SpinLockAcquire(&relkind_ctl->mutex); + + /* + * This should actually never happen! Below we check if hash is full and delete least recently user item in this case. + * But for further safety we also perform check here. + */ + while ((entry = hash_search(relkind_hash, &rinfo, HASH_ENTER_NULL, &found)) == NULL) + { + RelKindEntry *victim = dlist_container(RelKindEntry, lru_node, dlist_pop_head_node(&relkind_ctl->lru)); + hash_search(relkind_hash, &victim->rel, HASH_REMOVE, NULL); + Assert(relkind_ctl->size > 0); + relkind_ctl->size -= 1; + } + if (!found) + { + if (++relkind_ctl->size == relkind_hash_size) + { + /* + * Remove least recently used elment from the hash. + * Hash size after is becomes `relkind_hash_size-1`. + * But it is not considered to be a problem, because size of this hash is expecrted large enough and +-1 doesn't matter. + */ + RelKindEntry *victim = dlist_container(RelKindEntry, lru_node, dlist_pop_head_node(&relkind_ctl->lru)); + hash_search(relkind_hash, &victim->rel, HASH_REMOVE, NULL); + relkind_ctl->size -= 1; + } + relkind_ctl->pinned += 1; + entry->access_count = 1; + } + else if (entry->access_count++ == 0) + { + dlist_delete(&entry->lru_node); + relkind_ctl->pinned += 1; + } + entry->flags = flags; + SpinLockRelease(&relkind_ctl->mutex); + return entry; +} + +/* + * Lookup entry and create new one if not exists. This function is called by neon_write to detenmine if changes should be written to the local disk. + * In case of overflow removes least recently used entry. + * If entry is found and is not raw, then flags are stord in flags and NULL is returned. + * If entry is not found then new one is created, pinned and returned. Entry should be updated using store_cached_relkind. + * Shared lock is obtained if relation is involved in inlogged build. + */ +RelKindEntry* +get_cached_relkind(NRelFileInfo rinfo, uint8* flags) +{ + RelKindEntry *entry; + bool found; + + SpinLockAcquire(&relkind_ctl->mutex); + /* + * This should actually never happen! Below we check if hash is full and delete least recently user item in this case. + * But for further safety we also perform check here. 
+ */ + while ((entry = hash_search(relkind_hash, &rinfo, HASH_ENTER_NULL, &found)) == NULL) + { + RelKindEntry *victim = dlist_container(RelKindEntry, lru_node, dlist_pop_head_node(&relkind_ctl->lru)); + hash_search(relkind_hash, &victim->rel, HASH_REMOVE, NULL); + Assert(relkind_ctl->size > 0); + relkind_ctl->size -= 1; + } + if (!found) + { + if (++relkind_ctl->size == relkind_hash_size) + { + /* + * Remove least recently used elment from the hash. + * Hash size after is becomes `relkind_hash_size-1`. + * But it is not considered to be a problem, because size of this hash is expecrted large enough and +-1 doesn't matter. + */ + RelKindEntry *victim = dlist_container(RelKindEntry, lru_node, dlist_pop_head_node(&relkind_ctl->lru)); + hash_search(relkind_hash, &victim->rel, HASH_REMOVE, NULL); + relkind_ctl->size -= 1; + } + entry->flags = RELKIND_RAW; /* information about relation kind is not yet available */ + entry->access_count = 1; + relkind_ctl->pinned += 1; + } + else + { + if (entry->access_count++ == 0) /* Entry is not pinned */ + { + /* + * Pin entry by remving it from the LRU list + */ + Assert(!(entry->flags & RELKIND_RAW)); /* unpinned entry can not be raw */ + dlist_delete(&entry->lru_node); + } + /* If entry is not raw, then there is no need to pin it */ + if (!(entry->flags & RELKIND_RAW)) + { + /* Fast path: normal (persistent) relation with kind stored in the cache */ + if (--entry->access_count == 0) + { + dlist_push_tail(&relkind_ctl->lru, &entry->lru_node); + } + } + /* Need to set shared lock in case of unlogged build to prevent race condition on unlogged build end */ + if (entry->flags & RELKIND_UNLOGGED_BUILD) + { + /* Set shared lock to prevent unlinking relation files by backend completed unlogged build. + * This backend will set exclsuive lock before unlinking files. + * Shared locks allows other backends to perform write in parallel. + */ + LWLockAcquire(relkind_lock, LW_SHARED); + /* Recheck flags under lock */ + if (!(entry->flags & RELKIND_UNLOGGED_BUILD)) + { + /* Unlogged build is already completed: release lock - we do not need to do any writes to local disk */ + LWLockRelease(relkind_lock); + } + } + *flags = entry->flags; + if (!(entry->flags & RELKIND_RAW)) + { + /* We do not need this entry any more */ + entry = NULL; + } + } + SpinLockRelease(&relkind_ctl->mutex); + return entry; +} + +/* + * Store relation persistence as a result of mdexists check. + * Unpin entry. + */ +void +store_cached_relkind(RelKindEntry* entry, uint8 flags) +{ + SpinLockAcquire(&relkind_ctl->mutex); + entry->flags = flags; + Assert(entry->access_count != 0); + if (--entry->access_count == 0) + { + Assert(relkind_ctl->pinned != 0); + relkind_ctl->pinned -= 1; + dlist_push_tail(&relkind_ctl->lru, &entry->lru_node); + } + SpinLockRelease(&relkind_ctl->mutex); +} + + +/* + * Change relation persistence. + * This operation obtains exclusiove lock, preventing any concurrent writes. 
+ */ +void +clear_cached_relkind_flags(RelKindEntry* entry, uint8 flags) +{ + LWLockAcquire(relkind_lock, LW_EXCLUSIVE); + entry->flags &= ~flags; + LWLockRelease(relkind_lock); +} + +void +unpin_cached_relkind(RelKindEntry* entry) +{ + if (entry) + { + SpinLockAcquire(&relkind_ctl->mutex); + Assert(entry->access_count != 0); + if (--entry->access_count == 0) + { + Assert(relkind_ctl->pinned != 0); + relkind_ctl->pinned -= 1; + dlist_push_tail(&relkind_ctl->lru, &entry->lru_node); + } + SpinLockRelease(&relkind_ctl->mutex); + } +} + +void +unlock_cached_relkind(void) +{ + LWLockRelease(relkind_lock); +} + +void +forget_cached_relkind(NRelFileInfo rinfo) +{ + RelKindEntry *entry; + SpinLockAcquire(&relkind_ctl->mutex); + entry = hash_search(relkind_hash, &rinfo, HASH_REMOVE, NULL); + if (entry) + { + dlist_delete(&entry->lru_node); + relkind_ctl->size -= 1; + } + SpinLockRelease(&relkind_ctl->mutex); +} + + + + +void +relkind_hash_init(void) +{ + DefineCustomIntVariable("neon.relkind_hash_size", + "Sets the maximum number of cached relation kinds for neon", + NULL, + &relkind_hash_size, + DEFAULT_RELKIND_HASH_SIZE, + MAX_CONCURRENTLY_ACCESSED_UNLOGGED_RELS, + INT_MAX, + PGC_POSTMASTER, + 0, + NULL, NULL, NULL); + +#if PG_VERSION_NUM >= 150000 + prev_shmem_request_hook = shmem_request_hook; + shmem_request_hook = relkind_shmem_request; +#else + RequestAddinShmemSpace(hash_estimate_size(relkind_hash_size, sizeof(RelKindEntry))); + RequestNamedLWLockTranche("neon_relkind", 1); +#endif + + prev_shmem_startup_hook = shmem_startup_hook; + shmem_startup_hook = relkind_cache_startup; +} + +#if PG_VERSION_NUM >= 150000 +/* + * shmem_request hook: request additional shared resources. We'll allocate or + * attach to the shared resources in neon_smgr_shmem_startup(). + */ +static void +relkind_shmem_request(void) +{ + if (prev_shmem_request_hook) + prev_shmem_request_hook(); + + RequestAddinShmemSpace(sizeof(RelKindHashControl) + hash_estimate_size(relkind_hash_size, sizeof(RelKindEntry))); + RequestNamedLWLockTranche("neon_relkind", 1); +} +#endif diff --git a/pgxn/neon/relsize_cache.c b/pgxn/neon/relsize_cache.c index c6b4aeb394..5fe1967acc 100644 --- a/pgxn/neon/relsize_cache.c +++ b/pgxn/neon/relsize_cache.c @@ -80,6 +80,7 @@ RelsizeCacheShmemInit(void) relsize_ctl->writes = 0; dlist_init(&relsize_ctl->lru); } + LWLockRelease(AddinShmemInitLock); } bool