Add cache for relation kind

Konstantin Knizhnik
2025-06-08 18:26:11 +03:00
committed by Konstantin Knizhnik
parent 6be572177c
commit 883379f936
6 changed files with 445 additions and 28 deletions


@@ -19,6 +19,7 @@ OBJS = \
neon_walreader.o \
pagestore_smgr.o \
relsize_cache.o \
relkind_cache.o \
unstable_extensions.o \
walproposer.o \
walproposer_pg.o \


@@ -1630,6 +1630,9 @@ pg_init_libpagestore(void)
0,
NULL, NULL, NULL);
relsize_hash_init();
relkind_hash_init();
if (page_server != NULL)
neon_log(ERROR, "libpagestore already loaded");


@@ -298,4 +298,30 @@ extern void set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumb
extern void update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size);
extern void forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum);
enum RelKindEntryFlags
{
RELKIND_UNLOGGED = 1, /* relation is temp or unlogged */
RELKIND_UNLOGGED_BUILD = 2, /* unlogged index build */
RELKIND_RAW = 4 /* relation persistence is not known */
};
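/*
 * Flag combinations used in this commit (illustrative summary): a temp or
 * unlogged relation is cached as RELKIND_UNLOGGED; an unlogged build of a
 * permanent index sets RELKIND_UNLOGGED_BUILD (combined with RELKIND_UNLOGGED
 * when the source relation is temp/unlogged); a freshly created entry whose
 * persistence has not been determined yet is RELKIND_RAW.
 */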
/* utils for neon relkind cache */
typedef struct
{
NRelFileInfo rel;
uint8 flags; /* See RelKindEntryFlags */
uint16 access_count;
dlist_node lru_node; /* LRU list node */
} RelKindEntry;
extern void relkind_hash_init(void);
extern RelKindEntry* set_cached_relkind(NRelFileInfo rinfo, uint8 flags);
extern RelKindEntry* get_cached_relkind(NRelFileInfo rinfo, uint8* flags);
extern void store_cached_relkind(RelKindEntry* entry, uint8 flags);
extern void clear_cached_relkind_flags(RelKindEntry* entry, uint8 flags);
extern void unpin_cached_relkind(RelKindEntry* entry);
extern void unlock_cached_relkind(void);
extern void forget_cached_relkind(NRelFileInfo rinfo);
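/*
 * Typical write-path call pattern (a sketch, mirroring neon_write() in
 * pagestore_smgr.c as changed by this commit):
 *
 *   uint8 flags;
 *   RelKindEntry *entry = get_cached_relkind(rinfo, &flags);
 *   if (entry)
 *   {
 *       // cache miss: determine the persistence and publish it
 *       bool unlogged = mdexists(reln, forknum);
 *       store_cached_relkind(entry, unlogged ? RELKIND_UNLOGGED : 0);
 *   }
 *   // ... write to local disk if the relation is unlogged ...
 *   if (flags & RELKIND_UNLOGGED_BUILD)
 *       unlock_cached_relkind();  // drop the shared lock taken by the lookup
 */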
#endif /* PAGESTORE_CLIENT_H */


@@ -97,6 +97,7 @@ typedef enum
int debug_compare_local;
static NRelFileInfo unlogged_build_rel_info;
static RelKindEntry* unlogged_build_rel_entry;
static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
static bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
@@ -877,6 +878,7 @@ neon_unlink(NRelFileInfoBackend rinfo, ForkNumber forkNum, bool isRedo)
if (!NRelFileInfoBackendIsTemp(rinfo))
{
forget_cached_relsize(InfoFromNInfoB(rinfo), forkNum);
forget_cached_relkind(InfoFromNInfoB(rinfo));
}
}
@@ -1601,26 +1603,35 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
#endif
{
XLogRecPtr lsn;
RelKindEntry *entry;
bool unlogged;
uint8 flags;
switch (reln->smgr_relpersistence)
{
case 0:
/* This is a bit tricky. Check if the relation exists locally */
if (mdexists(reln, debug_compare_local ? INIT_FORKNUM : forknum))
entry = get_cached_relkind(InfoFromSMgrRel(reln), &flags);
if (entry)
{
/* We do not know relation persistence: let's determine it */
unlogged = mdexists(reln, debug_compare_local ? INIT_FORKNUM : forknum);
store_cached_relkind(entry, unlogged ? RELKIND_UNLOGGED : 0);
}
else
{
unlogged = (flags & (RELKIND_UNLOGGED_BUILD|RELKIND_UNLOGGED)) != 0;
}
if (unlogged)
{
/* It exists locally. Guess it's unlogged then. */
#if PG_MAJORVERSION_NUM >= 17
mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync);
#else
mdwrite(reln, forknum, blocknum, buffer, skipFsync);
#endif
/*
* We could set relpersistence now that we have determined
* that it's local. But we don't dare to do it, because that
* would immediately allow reads as well, which shouldn't
* happen. We could cache it with a different 'relpersistence'
* value, but this isn't performance critical.
*/
if (flags & RELKIND_UNLOGGED_BUILD)
{
unlock_cached_relkind();
}
return;
}
break;
@@ -1682,22 +1693,32 @@ static void
neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
const void **buffers, BlockNumber nblocks, bool skipFsync)
{
RelKindEntry *entry;
bool unlogged;
uint8 flags;
switch (reln->smgr_relpersistence)
{
case 0:
/* This is a bit tricky. Check if the relation exists locally */
if (mdexists(reln, debug_compare_local ? INIT_FORKNUM : forknum))
entry = get_cached_relkind(InfoFromSMgrRel(reln), &flags);
if (entry)
{
/* We do not know relation persistence: let's determine it */
unlogged = mdexists(reln, debug_compare_local ? INIT_FORKNUM : forknum);
store_cached_relkind(entry, unlogged ? RELKIND_UNLOGGED : 0);
}
else
{
unlogged = (flags & (RELKIND_UNLOGGED_BUILD|RELKIND_UNLOGGED)) != 0;
}
if (unlogged)
{
/* It exists locally. Guess it's unlogged then. */
mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync);
/*
* We could set relpersistence now that we have determined
* that it's local. But we don't dare to do it, because that
* would immediately allow reads as well, which shouldn't
* happen. We could cache it with a different 'relpersistence'
* value, but this isn't performance critical.
*/
if (flags & RELKIND_UNLOGGED_BUILD)
{
unlock_cached_relkind();
}
return;
}
break;
@@ -1985,6 +2006,7 @@ neon_start_unlogged_build(SMgrRelation reln)
case RELPERSISTENCE_TEMP:
case RELPERSISTENCE_UNLOGGED:
unlogged_build_rel_info = InfoFromSMgrRel(reln);
unlogged_build_rel_entry = set_cached_relkind(unlogged_build_rel_info, RELKIND_UNLOGGED|RELKIND_UNLOGGED_BUILD);
unlogged_build_phase = UNLOGGED_BUILD_NOT_PERMANENT;
if (debug_compare_local)
{
@@ -2007,6 +2029,7 @@ neon_start_unlogged_build(SMgrRelation reln)
#endif
unlogged_build_rel_info = InfoFromSMgrRel(reln);
unlogged_build_rel_entry = set_cached_relkind(unlogged_build_rel_info, RELKIND_UNLOGGED_BUILD);
unlogged_build_phase = UNLOGGED_BUILD_PHASE_1;
/*
@@ -2022,6 +2045,15 @@ neon_start_unlogged_build(SMgrRelation reln)
}
}
static void
unlogged_build_cleanup(void)
{
NRelFileInfoInvalidate(unlogged_build_rel_info);
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
unpin_cached_relkind(unlogged_build_rel_entry);
unlogged_build_rel_entry = NULL;
}
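/*
 * How the cache participates in an unlogged build (an illustrative summary of
 * the flow in this file): neon_start_unlogged_build() pins the entry via
 * set_cached_relkind(); concurrent writers that see RELKIND_UNLOGGED_BUILD
 * write locally while holding the shared lock; neon_end_unlogged_build()
 * calls clear_cached_relkind_flags(), whose exclusive lock waits out any
 * in-flight writes before the local files are removed; finally
 * unlogged_build_cleanup() unpins the entry.
 */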
/*
* neon_finish_unlogged_build_phase_1()
*
@@ -2048,8 +2080,7 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln)
*/
if (IsParallelWorker())
{
NRelFileInfoInvalidate(unlogged_build_rel_info);
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
unlogged_build_cleanup();
}
else
unlogged_build_phase = UNLOGGED_BUILD_PHASE_2;
@@ -2101,6 +2132,8 @@ neon_end_unlogged_build(SMgrRelation reln)
InfoFromNInfoB(rinfob),
MAIN_FORKNUM);
clear_cached_relkind_flags(unlogged_build_rel_entry, RELKIND_UNLOGGED_BUILD);
/* Remove local copy */
for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
{
@@ -2121,8 +2154,7 @@ neon_end_unlogged_build(SMgrRelation reln)
if (debug_compare_local)
mdunlink(rinfob, INIT_FORKNUM, true);
}
NRelFileInfoInvalidate(unlogged_build_rel_info);
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
unlogged_build_cleanup();
}
#define STRPREFIX(str, prefix) (strncmp(str, prefix, strlen(prefix)) == 0)
@@ -2194,8 +2226,7 @@ AtEOXact_neon(XactEvent event, void *arg)
* Forget about any build we might have had in progress. The local
* file will be unlinked by smgrDoPendingDeletes()
*/
NRelFileInfoInvalidate(unlogged_build_rel_info);
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
unlogged_build_cleanup();
break;
case XACT_EVENT_COMMIT:
@@ -2206,8 +2237,7 @@ AtEOXact_neon(XactEvent event, void *arg)
case XACT_EVENT_PRE_PREPARE:
if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
{
NRelFileInfoInvalidate(unlogged_build_rel_info);
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
unlogged_build_cleanup();
ereport(ERROR,
(errcode(ERRCODE_INTERNAL_ERROR),
(errmsg(NEON_TAG "unlogged index build was not properly finished"))));

pgxn/neon/relkind_cache.c (new file, 356 lines)

@@ -0,0 +1,356 @@
/*-------------------------------------------------------------------------
*
* relkind_cache.c
* Cache for marking unlogged relations
*
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "neon_pgversioncompat.h"
#include "pagestore_client.h"
#include RELFILEINFO_HDR
#include "storage/smgr.h"
#include "storage/lwlock.h"
#include "storage/ipc.h"
#include "storage/shmem.h"
#include "catalog/pg_tablespace_d.h"
#include "utils/dynahash.h"
#include "utils/guc.h"
#if PG_VERSION_NUM >= 150000
#include "miscadmin.h"
#endif
/*
 * The main goal of this cache is to avoid the calls to mdexists() in
 * neon_write() that are needed to distinguish unlogged relations.
 *
 * This hash is also used to mark relations during an unlogged build.
 * It has a limited size, with eviction based on an LRU algorithm.
 * Relations involved in an unlogged build are pinned in the cache
 * (assuming that the number of concurrent unlogged builds is small).
 *
 * Another task of this hash is to prevent a race condition at unlogged build
 * termination: some backend may be about to evict (write) a page while the
 * backend performing the unlogged build completes it and unlinks the local
 * files. We use a shared lock which is held for the duration of each write
 * operation; since the lock is shared, it does not prevent concurrent writes.
 * An exclusive lock is taken by neon_end_unlogged_build() to change the
 * relation kind.
 */
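/*
 * Illustrative timeline of the locking protocol (a sketch; the function names
 * are the ones defined below):
 *
 *   writer backend                        builder backend
 *   --------------                        ---------------
 *   get_cached_relkind():
 *     sees RELKIND_UNLOGGED_BUILD,
 *     acquires relkind_lock (SHARED)
 *   mdwrite()/mdwritev() to local file    neon_end_unlogged_build():
 *                                           clear_cached_relkind_flags()
 *                                           blocks on relkind_lock (EXCLUSIVE)
 *   unlock_cached_relkind():
 *     releases the shared lock    ---->     acquires the lock, clears
 *                                           RELKIND_UNLOGGED_BUILD, releases it;
 *                                           local files can now be unlinked
 */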
typedef struct
{
size_t size;
uint64 hits;
uint64 misses;
uint64 pinned;
slock_t mutex;
dlist_head lru; /* double linked list for LRU replacement
* algorithm */
} RelKindHashControl;
static HTAB *relkind_hash;
static LWLockId relkind_lock;
static int relkind_hash_size;
static RelKindHashControl* relkind_ctl;
static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
#if PG_VERSION_NUM >= 150000
static shmem_request_hook_type prev_shmem_request_hook = NULL;
static void relkind_shmem_request(void);
#endif
#define MAX_CONCURRENTLY_ACCESSED_UNLOGGED_RELS 100 /* MaxBackends? */
/*
 * Should not be smaller than MAX_CONCURRENTLY_ACCESSED_UNLOGGED_RELS.
 * The size of a cache entry is 32 bytes, so this default will take about 2 MB,
 * which seems reasonable.
 */
#define DEFAULT_RELKIND_HASH_SIZE (64 * 1024)
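/* Worked check of the estimate above: 64 * 1024 entries * 32 bytes/entry = 2 MiB. */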
static void
relkind_cache_startup(void)
{
static HASHCTL info;
bool found;
if (prev_shmem_startup_hook)
prev_shmem_startup_hook();
LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
relkind_ctl = (RelKindHashControl *) ShmemInitStruct("relkind_hash", sizeof(RelKindHashControl), &found);
if (!found)
{
relkind_lock = (LWLockId) GetNamedLWLockTranche("neon_relkind");
info.keysize = sizeof(NRelFileInfo);
info.entrysize = sizeof(RelKindEntry);
relkind_hash = ShmemInitHash("neon_relkind",
relkind_hash_size, relkind_hash_size,
&info,
HASH_ELEM | HASH_BLOBS);
SpinLockInit(&relkind_ctl->mutex);
relkind_ctl->size = 0;
relkind_ctl->hits = 0;
relkind_ctl->misses = 0;
relkind_ctl->pinned = 0;
dlist_init(&relkind_ctl->lru);
}
LWLockRelease(AddinShmemInitLock);
}
/*
 * Initialize a new entry. This function is used by neon_start_unlogged_build to mark a relation involved in an unlogged build.
 * On overflow it evicts the least recently used entry.
 * Returns a pinned entry. It will be released by unpin_cached_relkind at the end of the unlogged build.
 */
RelKindEntry*
set_cached_relkind(NRelFileInfo rinfo, uint8 flags)
{
RelKindEntry *entry = NULL;
bool found;
/* Use spinlock to prevent concurrent hash modification */
SpinLockAcquire(&relkind_ctl->mutex);
/*
 * This should never actually happen: below we check whether the hash is full and evict the least recently used item in that case.
 * But for extra safety we also perform the check here.
 */
while ((entry = hash_search(relkind_hash, &rinfo, HASH_ENTER_NULL, &found)) == NULL)
{
RelKindEntry *victim = dlist_container(RelKindEntry, lru_node, dlist_pop_head_node(&relkind_ctl->lru));
hash_search(relkind_hash, &victim->rel, HASH_REMOVE, NULL);
Assert(relkind_ctl->size > 0);
relkind_ctl->size -= 1;
}
if (!found)
{
if (++relkind_ctl->size == relkind_hash_size)
{
/*
 * Remove the least recently used element from the hash.
 * The hash size afterwards becomes `relkind_hash_size-1`, but that is not
 * considered a problem, because this hash is expected to be large enough
 * that +-1 doesn't matter.
 */
RelKindEntry *victim = dlist_container(RelKindEntry, lru_node, dlist_pop_head_node(&relkind_ctl->lru));
hash_search(relkind_hash, &victim->rel, HASH_REMOVE, NULL);
relkind_ctl->size -= 1;
}
relkind_ctl->pinned += 1;
entry->access_count = 1;
}
else if (entry->access_count++ == 0)
{
dlist_delete(&entry->lru_node);
relkind_ctl->pinned += 1;
}
entry->flags = flags;
SpinLockRelease(&relkind_ctl->mutex);
return entry;
}
/*
 * Look up an entry, creating a new one if it does not exist. This function is called by neon_write to determine whether changes should be written to local disk.
 * On overflow it evicts the least recently used entry.
 * If the entry is found and is not raw, its flags are stored in *flags and NULL is returned.
 * If the entry is not found, a new one is created, pinned and returned. The entry should then be updated using store_cached_relkind.
 * A shared lock is obtained if the relation is involved in an unlogged build.
 */
RelKindEntry*
get_cached_relkind(NRelFileInfo rinfo, uint8* flags)
{
RelKindEntry *entry;
bool found;
SpinLockAcquire(&relkind_ctl->mutex);
/*
 * This should never actually happen: below we check whether the hash is full and evict the least recently used item in that case.
 * But for extra safety we also perform the check here.
 */
while ((entry = hash_search(relkind_hash, &rinfo, HASH_ENTER_NULL, &found)) == NULL)
{
RelKindEntry *victim = dlist_container(RelKindEntry, lru_node, dlist_pop_head_node(&relkind_ctl->lru));
hash_search(relkind_hash, &victim->rel, HASH_REMOVE, NULL);
Assert(relkind_ctl->size > 0);
relkind_ctl->size -= 1;
}
if (!found)
{
if (++relkind_ctl->size == relkind_hash_size)
{
/*
 * Remove the least recently used element from the hash.
 * The hash size afterwards becomes `relkind_hash_size-1`, but that is not
 * considered a problem, because this hash is expected to be large enough
 * that +-1 doesn't matter.
 */
RelKindEntry *victim = dlist_container(RelKindEntry, lru_node, dlist_pop_head_node(&relkind_ctl->lru));
hash_search(relkind_hash, &victim->rel, HASH_REMOVE, NULL);
relkind_ctl->size -= 1;
}
entry->flags = RELKIND_RAW; /* information about the relation kind is not yet available */
*flags = entry->flags; /* report RELKIND_RAW so that the caller's flags are never left uninitialized */
entry->access_count = 1;
relkind_ctl->pinned += 1;
}
else
{
if (entry->access_count++ == 0) /* Entry is not pinned */
{
/*
 * Pin the entry by removing it from the LRU list
 */
Assert(!(entry->flags & RELKIND_RAW)); /* an unpinned entry cannot be raw */
dlist_delete(&entry->lru_node);
}
/* If the entry is not raw, there is no need to keep it pinned */
if (!(entry->flags & RELKIND_RAW))
{
/* Fast path: normal (persistent) relation with kind stored in the cache */
if (--entry->access_count == 0)
{
dlist_push_tail(&relkind_ctl->lru, &entry->lru_node);
}
}
/* Take a shared lock during an unlogged build to prevent a race condition at unlogged build end */
if (entry->flags & RELKIND_UNLOGGED_BUILD)
{
/* Take a shared lock to prevent the backend that has completed the unlogged
 * build from unlinking the relation files under us. That backend takes an
 * exclusive lock before unlinking the files.
 * A shared lock allows other backends to perform writes in parallel.
 */
LWLockAcquire(relkind_lock, LW_SHARED);
/* Recheck flags under lock */
if (!(entry->flags & RELKIND_UNLOGGED_BUILD))
{
/* The unlogged build has already completed: release the lock - we do not need to write anything to local disk */
LWLockRelease(relkind_lock);
}
}
*flags = entry->flags;
if (!(entry->flags & RELKIND_RAW))
{
/* We do not need this entry any more */
entry = NULL;
}
}
SpinLockRelease(&relkind_ctl->mutex);
return entry;
}
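/*
 * Entry state transitions performed above (a sketch):
 *   miss                -> entry created as RELKIND_RAW, pinned and returned; the
 *                          caller resolves the persistence and calls
 *                          store_cached_relkind(), which unpins the entry;
 *   hit, not raw        -> flags copied to *flags, entry left unpinned, NULL returned;
 *   hit, raw            -> entry stays pinned and is returned, as on a miss;
 *   hit, UNLOGGED_BUILD -> the shared lock is additionally held until
 *                          unlock_cached_relkind().
 */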
/*
 * Store the relation persistence determined by the mdexists() check.
 * Unpin the entry.
 */
void
store_cached_relkind(RelKindEntry* entry, uint8 flags)
{
SpinLockAcquire(&relkind_ctl->mutex);
entry->flags = flags;
Assert(entry->access_count != 0);
if (--entry->access_count == 0)
{
Assert(relkind_ctl->pinned != 0);
relkind_ctl->pinned -= 1;
dlist_push_tail(&relkind_ctl->lru, &entry->lru_node);
}
SpinLockRelease(&relkind_ctl->mutex);
}
/*
 * Change the relation persistence.
 * This operation takes the exclusive lock, preventing any concurrent writes.
 */
void
clear_cached_relkind_flags(RelKindEntry* entry, uint8 flags)
{
LWLockAcquire(relkind_lock, LW_EXCLUSIVE);
entry->flags &= ~flags;
LWLockRelease(relkind_lock);
}
void
unpin_cached_relkind(RelKindEntry* entry)
{
if (entry)
{
SpinLockAcquire(&relkind_ctl->mutex);
Assert(entry->access_count != 0);
if (--entry->access_count == 0)
{
Assert(relkind_ctl->pinned != 0);
relkind_ctl->pinned -= 1;
dlist_push_tail(&relkind_ctl->lru, &entry->lru_node);
}
SpinLockRelease(&relkind_ctl->mutex);
}
}
void
unlock_cached_relkind(void)
{
LWLockRelease(relkind_lock);
}
void
forget_cached_relkind(NRelFileInfo rinfo)
{
RelKindEntry *entry;
SpinLockAcquire(&relkind_ctl->mutex);
entry = hash_search(relkind_hash, &rinfo, HASH_REMOVE, NULL);
if (entry)
{
dlist_delete(&entry->lru_node);
relkind_ctl->size -= 1;
}
SpinLockRelease(&relkind_ctl->mutex);
}
void
relkind_hash_init(void)
{
DefineCustomIntVariable("neon.relkind_hash_size",
"Sets the maximum number of cached relation kinds for neon",
NULL,
&relkind_hash_size,
DEFAULT_RELKIND_HASH_SIZE,
MAX_CONCURRENTLY_ACCESSED_UNLOGGED_RELS,
INT_MAX,
PGC_POSTMASTER,
0,
NULL, NULL, NULL);
#if PG_VERSION_NUM >= 150000
prev_shmem_request_hook = shmem_request_hook;
shmem_request_hook = relkind_shmem_request;
#else
RequestAddinShmemSpace(hash_estimate_size(relkind_hash_size, sizeof(RelKindEntry)));
RequestNamedLWLockTranche("neon_relkind", 1);
#endif
prev_shmem_startup_hook = shmem_startup_hook;
shmem_startup_hook = relkind_cache_startup;
}
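/*
 * The cache size can only be set at server start (PGC_POSTMASTER), e.g. in
 * postgresql.conf; the value shown equals DEFAULT_RELKIND_HASH_SIZE:
 *
 *   neon.relkind_hash_size = 65536
 */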
#if PG_VERSION_NUM >= 150000
/*
 * shmem_request hook: request additional shared resources. We'll allocate or
 * attach to the shared resources in relkind_cache_startup().
 */
static void
relkind_shmem_request(void)
{
if (prev_shmem_request_hook)
prev_shmem_request_hook();
RequestAddinShmemSpace(sizeof(RelKindHashControl) + hash_estimate_size(relkind_hash_size, sizeof(RelKindEntry)));
RequestNamedLWLockTranche("neon_relkind", 1);
}
#endif


@@ -80,6 +80,7 @@ RelsizeCacheShmemInit(void)
relsize_ctl->writes = 0;
dlist_init(&relsize_ctl->lru);
}
LWLockRelease(AddinShmemInitLock);
}
bool