Move lwlock to pagestore_smgr

commit 20dea3aafb
parent ca13e7ad7a
Author:    Konstantin Knizhnik
Committer: Konstantin Knizhnik
Date:      2025-07-13 16:20:26 +03:00

3 changed files with 142 additions and 144 deletions


@@ -333,12 +333,13 @@ typedef struct
extern void relkind_hash_init(void);
extern RelKindEntry* set_cached_relkind(NRelFileInfo rinfo, RelKind relkind);
extern RelKindEntry* get_cached_relkind(NRelFileInfo rinfo, RelKind* relkind);
extern void store_cached_relkind(RelKindEntry* entry, RelKind relkind);
extern void update_cached_relkind(RelKindEntry* entry, RelKind relkind);
extern void set_cached_relkind(NRelFileInfo rinfo, RelKind relkind);
extern RelKind get_cached_relkind(NRelFileInfo rinfo);
extern RelKindEntry* pin_cached_relkind(NRelFileInfo rinfo, RelKind relkind);
extern void unpin_cached_relkind(RelKindEntry* entry);
extern void unlock_cached_relkind(void);
extern void fs_exclusive_lock(void);
extern void fs_shared_lock(void);
extern void fs_unlock(void);
extern void forget_cached_relkind(NRelFileInfo rinfo);
#endif /* PAGESTORE_CLIENT_H */
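
The new header surface replaces the pin/store pair with a plain get/set pair plus explicit pin/unpin and lock primitives. A minimal sketch of the intended write-path calling pattern, assembled only from the declarations above and the call sites in the next file (the wrapper function name is hypothetical, error handling elided):

static void
write_path_sketch(SMgrRelation reln, ForkNumber forknum)
{
	NRelFileInfo rinfo = InfoFromSMgrRel(reln);
	RelKind		relkind = get_cached_relkind(rinfo);	/* RELKIND_UNKNOWN if not cached */
	bool		fs_locked = false;

	if (relkind == RELKIND_UNKNOWN)
	{
		/* Probe local storage once and cache the answer */
		relkind = mdexists(reln, forknum) ? RELKIND_UNLOGGED : RELKIND_PERMANENT;
		set_cached_relkind(rinfo, relkind);
	}
	if (relkind == RELKIND_UNLOGGED_BUILD)
	{
		fs_shared_lock();						/* block concurrent cleanup of build files */
		fs_locked = true;
		relkind = get_cached_relkind(rinfo);	/* recheck under lock */
	}
	/* ... route the write to md or to the pageserver according to relkind ... */
	if (fs_locked)
		fs_unlock();
}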


@@ -1587,22 +1587,6 @@ hexdump_page(char *page)
return result.data;
}
static RelKind determine_entry_relkind(RelKindEntry *entry, SMgrRelation reln, ForkNumber forknum)
{
RelKind relkind = RELKIND_UNKNOWN;
PG_TRY();
{
relkind = mdexists(reln, forknum) ? RELKIND_UNLOGGED : RELKIND_PERMANENT;
}
PG_CATCH();
{
unpin_cached_relkind(entry);
PG_RE_THROW();
}
PG_END_TRY();
return relkind;
}
#if PG_MAJORVERSION_NUM < 17
/*
* neon_write() -- Write the supplied block at the appropriate location.
@@ -1619,19 +1603,31 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
#endif
{
XLogRecPtr lsn;
RelKindEntry *entry;
RelKind relkind = RELKIND_UNKNOWN;
RelKind relkind;
bool fs_locked = false;
NRelFileInfo rinfo = InfoFromSMgrRel(reln);
switch (reln->smgr_relpersistence)
{
case 0:
entry = get_cached_relkind(InfoFromSMgrRel(reln), &relkind);
if (entry)
relkind = get_cached_relkind(rinfo);
if (relkind == RELKIND_UNKNOWN)
{
Assert(relkind == RELKIND_UNKNOWN);
/* We do not know relation persistence: let's determine it */
relkind = determine_entry_relkind(entry, reln, debug_compare_local ? INIT_FORKNUM : forknum);
store_cached_relkind(entry, relkind);
relkind = mdexists(reln, debug_compare_local ? INIT_FORKNUM : forknum) ? RELKIND_UNLOGGED : RELKIND_PERMANENT;
set_cached_relkind(rinfo, relkind);
}
if (relkind == RELKIND_UNLOGGED_BUILD)
{
/* In case of an unlogged build we need to avoid a race condition at unlogged build end.
 * Obtain a shared lock here to prevent the backend completing the unlogged build from performing cleanup and removing files.
 */
fs_shared_lock();
fs_locked = true;
/*
* Recheck relkind under lock: the unlogged build may already have finished
*/
relkind = get_cached_relkind(rinfo);
}
if (relkind == RELKIND_UNLOGGED || relkind == RELKIND_UNLOGGED_BUILD)
{
@@ -1640,16 +1636,19 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
#else
mdwrite(reln, forknum, blocknum, buffer, skipFsync);
#endif
if (relkind == RELKIND_UNLOGGED_BUILD)
{
unlock_cached_relkind();
}
}
if (fs_locked)
{
fs_unlock();
}
if (relkind == RELKIND_UNLOGGED || relkind == RELKIND_UNLOGGED_BUILD)
{
return;
}
break;
case RELPERSISTENCE_PERMANENT:
if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
if (RelFileInfoEquals(unlogged_build_rel_info, rinfo))
{
#if PG_MAJORVERSION_NUM >= 17
mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync);
@@ -1680,7 +1679,7 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
forknum, blocknum,
(uint32) (lsn >> 32), (uint32) lsn);
lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);
lfc_write(rinfo, forknum, blocknum, buffer);
communicator_prefetch_pump_state();
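
The recheck under fs_shared_lock() above is what closes the race window. A sketch of the interleaving it guards against, reconstructed from the comments in this diff (writer W, builder B finishing the unlogged build):

/*
 * Without the recheck:
 *
 *   W: relkind = get_cached_relkind(rinfo);   -> RELKIND_UNLOGGED_BUILD
 *   B: fs_exclusive_lock();
 *   B: entry->relkind = RELKIND_PERMANENT;
 *   B: fs_unlock(); B unlinks the local files;
 *   W: mdwrite(...);                          -> write to a removed file
 *
 * With the shared lock and recheck, either W acquires the lock first and B's
 * exclusive lock waits until W's local write has finished, or W re-reads
 * RELKIND_PERMANENT and falls through to the normal WAL-logged write path.
 */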
@@ -1705,33 +1704,48 @@ static void
neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
const void **buffers, BlockNumber nblocks, bool skipFsync)
{
RelKindEntry *entry;
RelKind relkind = RELKIND_UNKNOWN;
RelKind relkind;
NRelFileInfo rinfo = InfoFromSMgrRel(reln);
bool fs_locked = false;
switch (reln->smgr_relpersistence)
{
case 0:
entry = get_cached_relkind(InfoFromSMgrRel(reln), &relkind);
if (entry)
relkind = get_cached_relkind(rinfo);
if (relkind == RELKIND_UNKNOWN)
{
Assert(relkind == RELKIND_UNKNOWN);
/* We do not know relation persistence: let's determine it */
relkind = determine_entry_relkind(entry, reln, debug_compare_local ? INIT_FORKNUM : forknum);
store_cached_relkind(entry, relkind);
relkind = mdexists(reln, debug_compare_local ? INIT_FORKNUM : forknum) ? RELKIND_UNLOGGED : RELKIND_PERMANENT;
set_cached_relkind(rinfo, relkind);
}
if (relkind == RELKIND_UNLOGGED_BUILD)
{
/* In case of an unlogged build we need to avoid a race condition at unlogged build end.
 * Obtain a shared lock here to prevent the backend completing the unlogged build from performing cleanup and removing files.
 */
fs_shared_lock();
fs_locked = true;
/*
* Recheck relkind under lock: the unlogged build may already have finished
*/
relkind = get_cached_relkind(rinfo);
}
if (relkind == RELKIND_UNLOGGED || relkind == RELKIND_UNLOGGED_BUILD)
{
/* It exists locally. Guess it's unlogged then. */
mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync);
if (relkind == RELKIND_UNLOGGED_BUILD)
{
unlock_cached_relkind();
}
}
if (fs_locked)
{
fs_unlock();
}
if (relkind == RELKIND_UNLOGGED || relkind == RELKIND_UNLOGGED_BUILD)
{
return;
}
break;
case RELPERSISTENCE_PERMANENT:
if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
if (RelFileInfoEquals(unlogged_build_rel_info, rinfo))
{
mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync);
return;
@@ -1748,7 +1762,7 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
neon_wallog_pagev(reln, forknum, blkno, nblocks, (const char **) buffers, false);
lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);
lfc_writev(rinfo, forknum, blkno, buffers, nblocks);
communicator_prefetch_pump_state();
@@ -2013,7 +2027,7 @@ neon_start_unlogged_build(SMgrRelation reln)
case RELPERSISTENCE_TEMP:
case RELPERSISTENCE_UNLOGGED:
unlogged_build_rel_info = InfoFromSMgrRel(reln);
unlogged_build_rel_entry = set_cached_relkind(unlogged_build_rel_info, RELKIND_UNLOGGED);
unlogged_build_rel_entry = pin_cached_relkind(unlogged_build_rel_info, RELKIND_UNLOGGED);
unlogged_build_phase = UNLOGGED_BUILD_NOT_PERMANENT;
if (debug_compare_local)
{
@@ -2036,7 +2050,7 @@ neon_start_unlogged_build(SMgrRelation reln)
#endif
unlogged_build_rel_info = InfoFromSMgrRel(reln);
unlogged_build_rel_entry = set_cached_relkind(unlogged_build_rel_info, RELKIND_UNLOGGED_BUILD);
unlogged_build_rel_entry = pin_cached_relkind(unlogged_build_rel_info, RELKIND_UNLOGGED_BUILD);
unlogged_build_phase = UNLOGGED_BUILD_PHASE_1;
/*
@@ -2106,8 +2120,9 @@ static void
neon_end_unlogged_build(SMgrRelation reln)
{
NRelFileInfoBackend rinfob = InfoBFromSMgrRel(reln);
NRelFileInfo rinfo = InfoFromSMgrRel(reln);
Assert(RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)));
Assert(RelFileInfoEquals(unlogged_build_rel_info, rinfo));
ereport(SmgrTrace,
(errmsg(NEON_TAG "ending unlogged build of relation %u/%u/%u",
@@ -2133,23 +2148,26 @@ neon_end_unlogged_build(SMgrRelation reln)
recptr = GetXLogInsertRecPtr();
neon_set_lwlsn_block_range(recptr,
InfoFromNInfoB(rinfob),
rinfo,
MAIN_FORKNUM, 0, nblocks);
neon_set_lwlsn_relation(recptr,
InfoFromNInfoB(rinfob),
rinfo,
MAIN_FORKNUM);
update_cached_relkind(unlogged_build_rel_entry, RELKIND_PERMANENT);
/* Obtain an exclusive lock to prevent concurrent writes to the file while we are performing cleanup */
fs_exclusive_lock();
unlogged_build_rel_entry->relkind = RELKIND_PERMANENT;
fs_unlock();
/* Remove local copy */
for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
{
neon_log(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u",
RelFileInfoFmt(InfoFromNInfoB(rinfob)),
RelFileInfoFmt(rinfo),
forknum);
forget_cached_relsize(InfoFromNInfoB(rinfob), forknum);
lfc_invalidate(InfoFromNInfoB(rinfob), forknum, nblocks);
forget_cached_relsize(rinfo, forknum);
lfc_invalidate(rinfo, forknum, nblocks);
mdclose(reln, forknum);
if (!debug_compare_local)


@@ -10,6 +10,7 @@
*/
#include "postgres.h"
#include "miscadmin.h"
#include "neon_pgversioncompat.h"
#include "pagestore_client.h"
@@ -62,10 +63,7 @@ static shmem_request_hook_type prev_shmem_request_hook = NULL;
static void relkind_shmem_request(void);
#endif
#define MAX_CONCURRENTLY_ACCESSED_UNLOGGED_RELS 100 /* MaxBackend? */
/*
* Should not be smaller than MAX_CONCURRENTLY_ACCESSED_UNLOGGED_RELS.
* Size of a cache entry is 32 bytes. So this default will take about 2 MB,
* which seems reasonable.
*/
@@ -88,11 +86,17 @@ relkind_cache_startup(void)
relkind_ctl = (RelKindHashControl *) ShmemInitStruct("relkind_hash", sizeof(RelKindHashControl), &found);
if (!found)
{
/*
* In the worst case the hash must be large enough for every backend to be performing an unlogged index build at the same time.
* Or actually twice that, because while performing an unlogged index build each backend can also be trying to write out a page for another
* relation, and hence hold one more entry in the cache pinned.
*/
size_t hash_size = Max(2 * MaxBackends, relkind_hash_size);
finish_unlogged_build_lock = (LWLockId) GetNamedLWLockTranche("neon_relkind");
info.keysize = sizeof(NRelFileInfo);
info.entrysize = sizeof(RelKindEntry);
relkind_hash = ShmemInitHash("neon_relkind",
relkind_hash_size, relkind_hash_size,
hash_size, hash_size,
&info,
HASH_ELEM | HASH_BLOBS);
SpinLockInit(&relkind_ctl->mutex);
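
The Max() above can be made concrete using the 32-byte entry size mentioned earlier. An illustrative calculation (the MaxBackends value is an example, not from the source):

/*
 * With MaxBackends = 1024, the worst case is 2 * 1024 = 2048 pinned
 * entries: every backend runs an unlogged index build and additionally
 * pins one entry while writing out a page of another relation. At
 * 32 bytes each that is only 64 kB, so hash_size is normally dominated
 * by relkind_hash_size; e.g. 65536 entries * 32 B = 2 MB, which matches
 * the "about 2 MB" default described above.
 */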
@@ -109,37 +113,29 @@ relkind_cache_startup(void)
* Look up an existing entry or create a new one
*/
static RelKindEntry*
get_entry(NRelFileInfo rinfo, bool* found)
get_pinned_entry(NRelFileInfo rinfo)
{
bool found;
RelKindEntry* entry;
/*
* This should actually never happen! Below we check if the hash is full and delete the least recently used item in that case.
* But for further safety we also perform the check here.
*/
while ((entry = hash_search(relkind_hash, &rinfo, HASH_ENTER_NULL, found)) == NULL)
while ((entry = hash_search(relkind_hash, &rinfo, HASH_ENTER_NULL, &found)) == NULL)
{
/*
* Remove the least recently used element from the hash.
* The hash size afterwards becomes `relkind_hash_size-1`.
* This is not considered a problem, because the hash is expected to be large enough that +-1 doesn't matter.
*/
RelKindEntry *victim = dlist_container(RelKindEntry, lru_node, dlist_pop_head_node(&relkind_ctl->lru));
hash_search(relkind_hash, &victim->rel, HASH_REMOVE, NULL);
Assert(relkind_ctl->size > 0);
relkind_ctl->size -= 1;
}
if (!*found)
if (!found)
{
if (++relkind_ctl->size == relkind_hash_size)
{
/*
* Remove the least recently used element from the hash.
* The hash size afterwards becomes `relkind_hash_size-1`.
* This is not considered a problem, because the hash is expected to be large enough that +-1 doesn't matter.
*/
RelKindEntry *victim = dlist_container(RelKindEntry, lru_node, dlist_pop_head_node(&relkind_ctl->lru));
hash_search(relkind_hash, &victim->rel, HASH_REMOVE, NULL);
relkind_ctl->size -= 1;
}
entry->relkind = RELKIND_UNKNOWN; /* information about relation kind is not yet available */
relkind_ctl->pinned += 1;
entry->access_count = 1;
relkind_ctl->size += 1;
}
else if (entry->access_count++ == 0)
{
@@ -149,20 +145,34 @@ get_entry(NRelFileInfo rinfo, bool* found)
return entry;
}
/*
* Unpin entry; when the pin count drops to zero, place it at the end of the LRU list
*/
static void
unpin_entry(RelKindEntry *entry)
{
Assert(entry->access_count != 0);
if (--entry->access_count == 0)
{
Assert(relkind_ctl->pinned != 0);
relkind_ctl->pinned -= 1;
dlist_push_tail(&relkind_ctl->lru, &entry->lru_node);
}
}
/*
* Initialize a new entry. This function is used by neon_start_unlogged_build to mark a relation involved in an unlogged build.
* In case of overflow the least recently used entry is removed.
* Returns a pinned entry, which will be released by unpin_cached_relkind at the end of the unlogged build.
*/
RelKindEntry*
set_cached_relkind(NRelFileInfo rinfo, RelKind relkind)
pin_cached_relkind(NRelFileInfo rinfo, RelKind relkind)
{
RelKindEntry *entry = NULL;
bool found;
RelKindEntry *entry;
/* Use spinlock to prevent concurrent hash modification */
SpinLockAcquire(&relkind_ctl->mutex);
entry = get_entry(rinfo, &found);
entry = get_pinned_entry(rinfo);
entry->relkind = relkind;
SpinLockRelease(&relkind_ctl->mutex);
return entry;
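
The pin accounting used throughout keeps one invariant: an entry sits on the LRU list exactly when its access_count is zero, and relkind_ctl->pinned counts entries with a nonzero access_count. A condensed sketch of the two halves, assumed to run under relkind_ctl->mutex (field names come from this diff; the function names and exact struct layout are assumptions, since the real code inlines pinning in get_pinned_entry):

typedef struct RelKindEntry
{
	NRelFileInfo rel;			/* hash key */
	RelKind		relkind;		/* RELKIND_UNKNOWN until probed */
	int			access_count;	/* pin count; on the LRU list iff 0 */
	dlist_node	lru_node;
} RelKindEntry;

static void
pin_entry_sketch(RelKindEntry *entry)
{
	if (entry->access_count++ == 0)
	{
		dlist_delete(&entry->lru_node);		/* pinned entries leave the LRU */
		relkind_ctl->pinned += 1;
	}
}

static void
unpin_entry_sketch(RelKindEntry *entry)
{
	Assert(entry->access_count != 0);
	if (--entry->access_count == 0)
	{
		relkind_ctl->pinned -= 1;
		dlist_push_tail(&relkind_ctl->lru, &entry->lru_node);
	}
}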
@@ -175,79 +185,60 @@ set_cached_relkind(NRelFileInfo rinfo, RelKind relkind)
* If the entry is not found then a new one is created, pinned, and returned. The entry should be updated using store_cached_relkind.
* A shared lock is obtained if the relation is involved in an unlogged build.
*/
RelKindEntry*
get_cached_relkind(NRelFileInfo rinfo, RelKind* relkind)
RelKind
get_cached_relkind(NRelFileInfo rinfo)
{
RelKindEntry *entry;
bool found;
RelKind relkind = RELKIND_UNKNOWN;
SpinLockAcquire(&relkind_ctl->mutex);
entry = get_entry(rinfo, &found);
if (found)
entry = hash_search(relkind_hash, &rinfo, HASH_FIND, NULL);
if (entry != NULL)
{
/* If relation persistence is known, then there is no need to pin it */
if (entry->relkind != RELKIND_UNKNOWN)
if (entry->access_count++ == 0)
{
/* Fast path: normal (persistent) relation with kind stored in the cache */
if (--entry->access_count == 0)
{
dlist_push_tail(&relkind_ctl->lru, &entry->lru_node);
}
}
/* Need to take a shared lock in case of an unlogged build, to prevent a race condition at unlogged build end */
if (entry->relkind == RELKIND_UNLOGGED_BUILD)
{
/* Take a shared lock to prevent the backend that completed the unlogged build from unlinking relation files.
* That backend will take an exclusive lock before unlinking the files.
* A shared lock allows other backends to perform writes in parallel.
*/
LWLockAcquire(finish_unlogged_build_lock, LW_SHARED);
/* Recheck relkind under lock */
if (entry->relkind != RELKIND_UNLOGGED_BUILD)
{
/* Unlogged build is already completed: release the lock; we do not need to do any writes to local disk */
LWLockRelease(finish_unlogged_build_lock);
}
}
*relkind = entry->relkind;
if (entry->relkind != RELKIND_UNKNOWN)
{
/* We do not need this entry any more */
entry = NULL;
dlist_delete(&entry->lru_node);
relkind_ctl->pinned += 1;
}
relkind = entry->relkind;
unpin_entry(entry);
}
SpinLockRelease(&relkind_ctl->mutex);
return entry;
return relkind;
}
/*
* Store the relation kind determined by the mdexists check. Unpin the entry.
*/
void
store_cached_relkind(RelKindEntry* entry, RelKind relkind)
set_cached_relkind(NRelFileInfo rinfo, RelKind relkind)
{
RelKindEntry *entry;
SpinLockAcquire(&relkind_ctl->mutex);
entry = get_pinned_entry(rinfo);
Assert(entry->relkind == RELKIND_UNKNOWN || entry->relkind == relkind);
entry->relkind = relkind;
Assert(entry->access_count != 0);
if (--entry->access_count == 0)
{
Assert(relkind_ctl->pinned != 0);
relkind_ctl->pinned -= 1;
dlist_push_tail(&relkind_ctl->lru, &entry->lru_node);
}
unpin_entry(entry);
SpinLockRelease(&relkind_ctl->mutex);
}
/*
* Change relation persistence.
* This operation obtains an exclusive lock, preventing any concurrent writes.
*/
void
update_cached_relkind(RelKindEntry* entry, RelKind relkind)
fs_exclusive_lock(void)
{
LWLockAcquire(finish_unlogged_build_lock, LW_EXCLUSIVE);
entry->relkind = relkind;
}
void
fs_shared_lock(void)
{
LWLockAcquire(finish_unlogged_build_lock, LW_SHARED);
}
void
fs_unlock(void)
{
LWLockRelease(finish_unlogged_build_lock);
}
@@ -257,23 +248,11 @@ unpin_cached_relkind(RelKindEntry* entry)
if (entry)
{
SpinLockAcquire(&relkind_ctl->mutex);
Assert(entry->access_count != 0);
if (--entry->access_count == 0)
{
Assert(relkind_ctl->pinned != 0);
relkind_ctl->pinned -= 1;
dlist_push_tail(&relkind_ctl->lru, &entry->lru_node);
}
unpin_entry(entry);
SpinLockRelease(&relkind_ctl->mutex);
}
}
void
unlock_cached_relkind(void)
{
LWLockRelease(finish_unlogged_build_lock);
}
void
forget_cached_relkind(NRelFileInfo rinfo)
{
@@ -299,7 +278,7 @@ relkind_hash_init(void)
NULL,
&relkind_hash_size,
DEFAULT_RELKIND_HASH_SIZE,
MAX_CONCURRENTLY_ACCESSED_UNLOGGED_RELS,
1,
INT_MAX,
PGC_POSTMASTER,
0,
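
The hunk above is the tail of a DefineCustomIntVariable() call; only the minimum bound changes. A hedged reconstruction of the whole registration (the GUC name and description strings are assumptions; the remaining arguments are visible in the diff):

DefineCustomIntVariable("neon.relkind_hash_size",			/* assumed name */
						"Size of the shared relation-kind cache.",	/* assumed description */
						NULL,
						&relkind_hash_size,
						DEFAULT_RELKIND_HASH_SIZE,
						1,			/* was MAX_CONCURRENTLY_ACCESSED_UNLOGGED_RELS */
						INT_MAX,
						PGC_POSTMASTER,
						0,
						NULL, NULL, NULL);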