diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h
index 532613ac75..2336800fbe 100644
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -333,12 +333,13 @@ typedef struct
 
 extern void relkind_hash_init(void);
 
-extern RelKindEntry* set_cached_relkind(NRelFileInfo rinfo, RelKind relkind);
-extern RelKindEntry* get_cached_relkind(NRelFileInfo rinfo, RelKind* relkind);
-extern void store_cached_relkind(RelKindEntry* entry, RelKind relkind);
-extern void update_cached_relkind(RelKindEntry* entry, RelKind relkind);
+extern void set_cached_relkind(NRelFileInfo rinfo, RelKind relkind);
+extern RelKind get_cached_relkind(NRelFileInfo rinfo);
+extern RelKindEntry* pin_cached_relkind(NRelFileInfo rinfo, RelKind relkind);
 extern void unpin_cached_relkind(RelKindEntry* entry);
-extern void unlock_cached_relkind(void);
+extern void fs_exclusive_lock(void);
+extern void fs_shared_lock(void);
+extern void fs_unlock(void);
 extern void forget_cached_relkind(NRelFileInfo rinfo);
 
 #endif /* PAGESTORE_CLIENT_H */
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 5a56294b26..99b2766c6f 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -1587,22 +1587,6 @@ hexdump_page(char *page)
 	return result.data;
 }
 
-static RelKind determine_entry_relkind(RelKindEntry *entry, SMgrRelation reln, ForkNumber forknum)
-{
-	RelKind relkind = RELKIND_UNKNOWN;
-	PG_TRY();
-	{
-		relkind = mdexists(reln, forknum) ? RELKIND_UNLOGGED : RELKIND_PERMANENT;
-	}
-	PG_CATCH();
-	{
-		unpin_cached_relkind(entry);
-		PG_RE_THROW();
-	}
-	PG_END_TRY();
-	return relkind;
-}
-
 #if PG_MAJORVERSION_NUM < 17
 /*
  * neon_write() -- Write the supplied block at the appropriate location.
@@ -1619,19 +1603,31 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
 #endif
 {
 	XLogRecPtr lsn;
-	RelKindEntry *entry;
-	RelKind relkind = RELKIND_UNKNOWN;
+	RelKind relkind;
+	bool fs_locked = false;
+	NRelFileInfo rinfo = InfoFromSMgrRel(reln);
 
 	switch (reln->smgr_relpersistence)
 	{
 		case 0:
-			entry = get_cached_relkind(InfoFromSMgrRel(reln), &relkind);
-			if (entry)
+			relkind = get_cached_relkind(rinfo);
+			if (relkind == RELKIND_UNKNOWN)
 			{
-				Assert(relkind == RELKIND_UNKNOWN);
 				/* We do not know relation persistence: let's determine it */
-				relkind = determine_entry_relkind(entry, reln, debug_compare_local ? INIT_FORKNUM : forknum);
-				store_cached_relkind(entry, relkind);
+				relkind = mdexists(reln, debug_compare_local ? INIT_FORKNUM : forknum) ? RELKIND_UNLOGGED : RELKIND_PERMANENT;
+				set_cached_relkind(rinfo, relkind);
+			}
+			if (relkind == RELKIND_UNLOGGED_BUILD)
+			{
+				/* In case of an unlogged build we need to avoid a race condition at the end of the build.
+				 * Obtain a shared lock here to prevent the backend completing the unlogged build from performing cleanup and removing files.
+				 */
+				fs_shared_lock();
+				fs_locked = true;
+				/*
+				 * Recheck relkind under the lock - maybe the unlogged build has already finished.
+				 */
+				relkind = get_cached_relkind(rinfo);
 			}
 			if (relkind == RELKIND_UNLOGGED || relkind == RELKIND_UNLOGGED_BUILD)
 			{
@@ -1640,16 +1636,19 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
 #else
 				mdwrite(reln, forknum, blocknum, buffer, skipFsync);
 #endif
-				if (relkind == RELKIND_UNLOGGED_BUILD)
-				{
-					unlock_cached_relkind();
-				}
+			}
+			if (fs_locked)
+			{
+				fs_unlock();
+			}
+			if (relkind == RELKIND_UNLOGGED || relkind == RELKIND_UNLOGGED_BUILD)
+			{
 				return;
 			}
 			break;
 		case RELPERSISTENCE_PERMANENT:
-			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
+			if (RelFileInfoEquals(unlogged_build_rel_info, rinfo))
 			{
 #if PG_MAJORVERSION_NUM >= 17
 				mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync);
@@ -1680,7 +1679,7 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
 			forknum, blocknum,
 			(uint32) (lsn >> 32), (uint32) lsn);
 
-	lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);
+	lfc_write(rinfo, forknum, blocknum, buffer);
 
 	communicator_prefetch_pump_state();
 
@@ -1705,33 +1704,48 @@ static void
 neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 			const void **buffers, BlockNumber nblocks, bool skipFsync)
 {
-	RelKindEntry *entry;
-	RelKind relkind = RELKIND_UNKNOWN;
+	RelKind relkind;
+	NRelFileInfo rinfo = InfoFromSMgrRel(reln);
+	bool fs_locked = false;
 
 	switch (reln->smgr_relpersistence)
 	{
 		case 0:
-			entry = get_cached_relkind(InfoFromSMgrRel(reln), &relkind);
-			if (entry)
+			relkind = get_cached_relkind(rinfo);
+			if (relkind == RELKIND_UNKNOWN)
 			{
-				Assert(relkind == RELKIND_UNKNOWN);
 				/* We do not know relation persistence: let's determine it */
-				relkind = determine_entry_relkind(entry, reln, debug_compare_local ? INIT_FORKNUM : forknum);
-				store_cached_relkind(entry, relkind);
+				relkind = mdexists(reln, debug_compare_local ? INIT_FORKNUM : forknum) ? RELKIND_UNLOGGED : RELKIND_PERMANENT;
+				set_cached_relkind(rinfo, relkind);
+			}
+			if (relkind == RELKIND_UNLOGGED_BUILD)
+			{
+				/* In case of an unlogged build we need to avoid a race condition at the end of the build.
+				 * Obtain a shared lock here to prevent the backend completing the unlogged build from performing cleanup and removing files.
+				 */
+				fs_shared_lock();
+				fs_locked = true;
+				/*
+				 * Recheck relkind under the lock - maybe the unlogged build has already finished.
+				 */
+				relkind = get_cached_relkind(rinfo);
 			}
 			if (relkind == RELKIND_UNLOGGED || relkind == RELKIND_UNLOGGED_BUILD)
 			{
 				/* It exists locally. Guess it's unlogged then. */
 				mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync);
-				if (relkind == RELKIND_UNLOGGED_BUILD)
-				{
-					unlock_cached_relkind();
-				}
+			}
+			if (fs_locked)
+			{
+				fs_unlock();
+			}
+			if (relkind == RELKIND_UNLOGGED || relkind == RELKIND_UNLOGGED_BUILD)
+			{
 				return;
 			}
 			break;
 		case RELPERSISTENCE_PERMANENT:
-			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
+			if (RelFileInfoEquals(unlogged_build_rel_info, rinfo))
 			{
 				mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync);
 				return;
@@ -1748,7 +1762,7 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 	neon_wallog_pagev(reln, forknum, blkno, nblocks, (const char **) buffers, false);
 
-	lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);
+	lfc_writev(rinfo, forknum, blkno, buffers, nblocks);
 
 	communicator_prefetch_pump_state();
 
@@ -2013,7 +2027,7 @@ neon_start_unlogged_build(SMgrRelation reln)
 		case RELPERSISTENCE_TEMP:
 		case RELPERSISTENCE_UNLOGGED:
 			unlogged_build_rel_info = InfoFromSMgrRel(reln);
-			unlogged_build_rel_entry = set_cached_relkind(unlogged_build_rel_info, RELKIND_UNLOGGED);
+			unlogged_build_rel_entry = pin_cached_relkind(unlogged_build_rel_info, RELKIND_UNLOGGED);
 			unlogged_build_phase = UNLOGGED_BUILD_NOT_PERMANENT;
 			if (debug_compare_local)
 			{
@@ -2036,7 +2050,7 @@ neon_start_unlogged_build(SMgrRelation reln)
 #endif
 
 	unlogged_build_rel_info = InfoFromSMgrRel(reln);
-	unlogged_build_rel_entry = set_cached_relkind(unlogged_build_rel_info, RELKIND_UNLOGGED_BUILD);
+	unlogged_build_rel_entry = pin_cached_relkind(unlogged_build_rel_info, RELKIND_UNLOGGED_BUILD);
 	unlogged_build_phase = UNLOGGED_BUILD_PHASE_1;
 
 	/*
@@ -2106,8 +2120,9 @@ static void
 neon_end_unlogged_build(SMgrRelation reln)
 {
 	NRelFileInfoBackend rinfob = InfoBFromSMgrRel(reln);
+	NRelFileInfo rinfo = InfoFromSMgrRel(reln);
 
-	Assert(RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)));
+	Assert(RelFileInfoEquals(unlogged_build_rel_info, rinfo));
 
 	ereport(SmgrTrace,
 			(errmsg(NEON_TAG "ending unlogged build of relation %u/%u/%u",
@@ -2133,23 +2148,26 @@ neon_end_unlogged_build(SMgrRelation reln)
 		recptr = GetXLogInsertRecPtr();
 		neon_set_lwlsn_block_range(recptr,
-								   InfoFromNInfoB(rinfob),
+								   rinfo,
 								   MAIN_FORKNUM,
 								   0, nblocks);
 		neon_set_lwlsn_relation(recptr,
-								InfoFromNInfoB(rinfob),
+								rinfo,
 								MAIN_FORKNUM);
 
-		update_cached_relkind(unlogged_build_rel_entry, RELKIND_PERMANENT);
+		/* Obtain exclusive lock to prevent concurrent writes to the file while we perform cleanup */
+		fs_exclusive_lock();
+		unlogged_build_rel_entry->relkind = RELKIND_PERMANENT;
+		fs_unlock();
 
 		/* Remove local copy */
 		for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
 		{
 			neon_log(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u",
-					 RelFileInfoFmt(InfoFromNInfoB(rinfob)),
+					 RelFileInfoFmt(rinfo),
 					 forknum);
 
-			forget_cached_relsize(InfoFromNInfoB(rinfob), forknum);
-			lfc_invalidate(InfoFromNInfoB(rinfob), forknum, nblocks);
+			forget_cached_relsize(rinfo, forknum);
+			lfc_invalidate(rinfo, forknum, nblocks);
 			mdclose(reln, forknum);
 			if (!debug_compare_local)
diff --git a/pgxn/neon/relkind_cache.c b/pgxn/neon/relkind_cache.c
index 0484ce9118..2c301273dd 100644
--- a/pgxn/neon/relkind_cache.c
+++ b/pgxn/neon/relkind_cache.c
@@ -10,6 +10,7 @@
  */
 #include "postgres.h"
 
+#include "miscadmin.h"
 #include "neon_pgversioncompat.h"
 
 #include "pagestore_client.h"
@@ -62,10 +63,7 @@ static shmem_request_hook_type prev_shmem_request_hook = NULL;
 static void relkind_shmem_request(void);
 #endif
 
-#define MAX_CONCURRENTLY_ACCESSED_UNLOGGED_RELS 100 /* MaxBackend? */
-
 /*
- * Should not be smaller than MAX_CONCURRENTLY_ACCESSED_UNLOGGED_RELS.
  * Size of a cache entry is 32 bytes. So this default will take about 2 MB,
  * which seems reasonable.
  */
@@ -88,11 +86,17 @@ relkind_cache_startup(void)
 	relkind_ctl = (RelKindHashControl *) ShmemInitStruct("relkind_hash", sizeof(RelKindHashControl), &found);
 	if (!found)
 	{
+		/*
+		 * In the worst case, the hash needs to be large enough for the case that all backends are performing an unlogged index build at the same time.
+		 * Or actually twice that, because while performing an unlogged index build, each backend can also be trying to write out a page
+		 * for another relation and hence hold one more entry in the cache pinned.
+		 */
+		size_t hash_size = Max(2 * MaxBackends, relkind_hash_size);
 		finish_unlogged_build_lock = (LWLockId) GetNamedLWLockTranche("neon_relkind");
 		info.keysize = sizeof(NRelFileInfo);
 		info.entrysize = sizeof(RelKindEntry);
 		relkind_hash = ShmemInitHash("neon_relkind",
-									 relkind_hash_size, relkind_hash_size,
+									 hash_size, hash_size,
 									 &info,
 									 HASH_ELEM | HASH_BLOBS);
 		SpinLockInit(&relkind_ctl->mutex);
@@ -109,37 +113,29 @@ relkind_cache_startup(void)
  * Lookup existed entry or create new one
  */
 static RelKindEntry*
-get_entry(NRelFileInfo rinfo, bool* found)
+get_pinned_entry(NRelFileInfo rinfo)
 {
+	bool found;
 	RelKindEntry* entry;
 
-	/*
-	 * This should actually never happen! Below we check if hash is full and delete least recently used item in this case.
-	 * But for further safety we also perform check here.
-	 */
-	while ((entry = hash_search(relkind_hash, &rinfo, HASH_ENTER_NULL, found)) == NULL)
+	while ((entry = hash_search(relkind_hash, &rinfo, HASH_ENTER_NULL, &found)) == NULL)
 	{
+		/*
+		 * Remove the least recently used element from the hash.
+		 * The hash size afterwards becomes `relkind_hash_size-1`.
+		 * That is not considered a problem, because this hash is expected to be large enough that +-1 doesn't matter.
+		 */
 		RelKindEntry *victim = dlist_container(RelKindEntry, lru_node, dlist_pop_head_node(&relkind_ctl->lru));
 		hash_search(relkind_hash, &victim->rel, HASH_REMOVE, NULL);
 		Assert(relkind_ctl->size > 0);
 		relkind_ctl->size -= 1;
 	}
-	if (!*found)
+	if (!found)
 	{
-		if (++relkind_ctl->size == relkind_hash_size)
-		{
-			/*
-			 * Remove least recently used elment from the hash.
-			 * Hash size after is becomes `relkind_hash_size-1`.
-			 * But it is not considered to be a problem, because size of this hash is expecrted large enough and +-1 doesn't matter.
-			 */
-			RelKindEntry *victim = dlist_container(RelKindEntry, lru_node, dlist_pop_head_node(&relkind_ctl->lru));
-			hash_search(relkind_hash, &victim->rel, HASH_REMOVE, NULL);
-			relkind_ctl->size -= 1;
-		}
 		entry->relkind = RELKIND_UNKNOWN; /* information about relation kind is not yet available */
 		relkind_ctl->pinned += 1;
 		entry->access_count = 1;
+		relkind_ctl->size += 1;
 	}
 	else if (entry->access_count++ == 0)
 	{
@@ -149,20 +145,34 @@
 	return entry;
 }
 
+/*
+ * Unpin entry and place it at the end of the LRU list
+ */
+static void
+unpin_entry(RelKindEntry *entry)
+{
+	Assert(entry->access_count != 0);
+	if (--entry->access_count == 0)
+	{
+		Assert(relkind_ctl->pinned != 0);
+		relkind_ctl->pinned -= 1;
+		dlist_push_tail(&relkind_ctl->lru, &entry->lru_node);
+	}
+}
+
 /*
  * Intialize new entry. This function is used by neon_start_unlogged_build to mark relation involved in unlogged build.
  * In case of overflow removes least recently used entry.
  * Return pinned entry. It will be released by unpin_cached_relkind at the end of unlogged build.
  */
 RelKindEntry*
-set_cached_relkind(NRelFileInfo rinfo, RelKind relkind)
+pin_cached_relkind(NRelFileInfo rinfo, RelKind relkind)
 {
-	RelKindEntry *entry = NULL;
-	bool found;
+	RelKindEntry *entry;
 
 	/* Use spinlock to prevent concurrent hash modifitcation */
 	SpinLockAcquire(&relkind_ctl->mutex);
-	entry = get_entry(rinfo, &found);
+	entry = get_pinned_entry(rinfo);
 	entry->relkind = relkind;
 	SpinLockRelease(&relkind_ctl->mutex);
 	return entry;
@@ -175,79 +185,60 @@
  * If entry is not found then new one is created, pinned and returned. Entry should be updated using store_cached_relkind.
  * Shared lock is obtained if relation is involved in inlogged build.
  */
-RelKindEntry*
-get_cached_relkind(NRelFileInfo rinfo, RelKind* relkind)
+RelKind
+get_cached_relkind(NRelFileInfo rinfo)
 {
 	RelKindEntry *entry;
-	bool found;
+	RelKind relkind = RELKIND_UNKNOWN;
 
 	SpinLockAcquire(&relkind_ctl->mutex);
-	entry = get_entry(rinfo, &found);
-	if (found)
+	entry = hash_search(relkind_hash, &rinfo, HASH_FIND, NULL);
+	if (entry != NULL)
 	{
-		/* If relation persistence is known, then there is no need to pin it */
-		if (entry->relkind != RELKIND_UNKNOWN)
+		if (entry->access_count++ == 0)
 		{
-			/* Fast path: normal (persistent) relation with kind stored in the cache */
-			if (--entry->access_count == 0)
-			{
-				dlist_push_tail(&relkind_ctl->lru, &entry->lru_node);
-			}
-		}
-		/* Need to set shared lock in case of unlogged build to prevent race condition at unlogged build end */
-		if (entry->relkind == RELKIND_UNLOGGED_BUILD)
-		{
-			/* Set shared lock to prevent unlinking relation files by backend completed unlogged build.
-			 * This backend will set exclsuive lock before unlinking files.
-			 * Shared locks allows other backends to perform write in parallel.
-			 */
-			LWLockAcquire(finish_unlogged_build_lock, LW_SHARED);
-			/* Recheck relkind under lock */
-			if (entry->relkind != RELKIND_UNLOGGED_BUILD)
-			{
-				/* Unlogged build is already completed: release lock - we do not need to do any writes to local disk */
-				LWLockRelease(finish_unlogged_build_lock);
-			}
-		}
-		*relkind = entry->relkind;
-		if (entry->relkind != RELKIND_UNKNOWN)
-		{
-			/* We do not need this entry any more */
-			entry = NULL;
+			dlist_delete(&entry->lru_node);
+			relkind_ctl->pinned += 1;
 		}
+		relkind = entry->relkind;
+		unpin_entry(entry);
 	}
 	SpinLockRelease(&relkind_ctl->mutex);
-	return entry;
+	return relkind;
 }
+
 /*
  * Store relation kind as a result of mdexists check. Unpin entry.
  */
 void
-store_cached_relkind(RelKindEntry* entry, RelKind relkind)
+set_cached_relkind(NRelFileInfo rinfo, RelKind relkind)
 {
+	RelKindEntry *entry;
+	SpinLockAcquire(&relkind_ctl->mutex);
+	entry = get_pinned_entry(rinfo);
 	Assert(entry->relkind == RELKIND_UNKNOWN || entry->relkind == relkind);
 	entry->relkind = relkind;
-	Assert(entry->access_count != 0);
-	if (--entry->access_count == 0)
-	{
-		Assert(relkind_ctl->pinned != 0);
-		relkind_ctl->pinned -= 1;
-		dlist_push_tail(&relkind_ctl->lru, &entry->lru_node);
-	}
+	unpin_entry(entry);
 	SpinLockRelease(&relkind_ctl->mutex);
 }
 
-/*
- * Change relation persistence.
- * This operation obtains exclusiove lock, preventing any concurrent writes.
- */
 void
-update_cached_relkind(RelKindEntry* entry, RelKind relkind)
+fs_exclusive_lock(void)
 {
 	LWLockAcquire(finish_unlogged_build_lock, LW_EXCLUSIVE);
-	entry->relkind = relkind;
+}
+
+void
+fs_shared_lock(void)
+{
+	LWLockAcquire(finish_unlogged_build_lock, LW_SHARED);
+}
+
+void
+fs_unlock(void)
+{
 	LWLockRelease(finish_unlogged_build_lock);
 }
 
@@ -257,23 +248,11 @@ unpin_cached_relkind(RelKindEntry* entry)
 	if (entry)
 	{
 		SpinLockAcquire(&relkind_ctl->mutex);
-		Assert(entry->access_count != 0);
-		if (--entry->access_count == 0)
-		{
-			Assert(relkind_ctl->pinned != 0);
-			relkind_ctl->pinned -= 1;
-			dlist_push_tail(&relkind_ctl->lru, &entry->lru_node);
-		}
+		unpin_entry(entry);
 		SpinLockRelease(&relkind_ctl->mutex);
 	}
 }
 
-void
-unlock_cached_relkind(void)
-{
-	LWLockRelease(finish_unlogged_build_lock);
-}
-
 void
 forget_cached_relkind(NRelFileInfo rinfo)
 {
@@ -299,7 +278,7 @@ relkind_hash_init(void)
 							NULL,
 							&relkind_hash_size,
 							DEFAULT_RELKIND_HASH_SIZE,
-							MAX_CONCURRENTLY_ACCESSED_UNLOGGED_RELS,
+							1,
 							INT_MAX,
 							PGC_POSTMASTER,
 							0,
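
Reviewer note (not part of the patch): the locking protocol that the hunks above spread across neon_write(), neon_writev() and neon_end_unlogged_build() is easier to check in one place. The following is a condensed, hypothetical writer-side helper: write_page_sketch() does not exist in the tree, the pre-PG17 mdwrite() call is used for brevity, and the sketch assumes the declarations from pagestore_client.h shown above.

#include "postgres.h"

#include "storage/md.h"
#include "storage/smgr.h"

#include "pagestore_client.h"

/*
 * Write one page of a relation whose persistence is unknown to the smgr
 * (smgr_relpersistence == 0), coordinating with a concurrent backend that
 * may be finishing an unlogged build of the same relation.
 */
static void
write_page_sketch(SMgrRelation reln, NRelFileInfo rinfo, ForkNumber forknum,
				  BlockNumber blkno, const void *buffer, bool skipFsync)
{
	bool		fs_locked = false;
	RelKind		relkind = get_cached_relkind(rinfo);

	if (relkind == RELKIND_UNKNOWN)
	{
		/* Persistence not cached yet: probe local storage once and remember it. */
		relkind = mdexists(reln, forknum) ? RELKIND_UNLOGGED : RELKIND_PERMANENT;
		set_cached_relkind(rinfo, relkind);
	}
	if (relkind == RELKIND_UNLOGGED_BUILD)
	{
		/*
		 * The backend that finishes the build takes fs_exclusive_lock()
		 * before flipping the entry to RELKIND_PERMANENT and unlinking the
		 * local files, so holding the shared lock keeps those files alive
		 * for the duration of our md-layer write.
		 */
		fs_shared_lock();
		fs_locked = true;
		relkind = get_cached_relkind(rinfo);	/* recheck under the lock */
	}
	if (relkind == RELKIND_UNLOGGED || relkind == RELKIND_UNLOGGED_BUILD)
		mdwrite(reln, forknum, blkno, buffer, skipFsync);	/* local write */
	if (fs_locked)
		fs_unlock();			/* release only after the write completed */
}

The ordering is the point: the shared lock is released only after the md write returns, which pairs with neon_end_unlogged_build() performing its relkind flip and file cleanup under fs_exclusive_lock().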