Mirror of https://github.com/neondatabase/neon.git (synced 2026-01-20 03:42:55 +00:00)
Compare commits: release-pr ... relkind_ca (66 commits)
@@ -19,6 +19,7 @@ OBJS = \
 	neon_walreader.o \
 	pagestore_smgr.o \
 	relsize_cache.o \
+	relperst_cache.o \
 	unstable_extensions.o \
 	walproposer.o \
 	walproposer_pg.o \
@@ -489,6 +489,7 @@ _PG_init(void)
 	/* Stage 1: Define GUCs, and other early initialization */
 	pg_init_libpagestore();
 	relsize_hash_init();
+	relperst_hash_init();
 	lfc_init();
 	pg_init_walproposer();
 	init_lwlsncache();
@@ -722,6 +723,7 @@ neon_shmem_request_hook(void)
 	NeonPerfCountersShmemRequest();
 	PagestoreShmemRequest();
 	RelsizeCacheShmemRequest();
+	RelperstCacheShmemRequest();
 	WalproposerShmemRequest();
 	LwLsnCacheShmemRequest();
 }
@@ -744,6 +746,7 @@ neon_shmem_startup_hook(void)
 	NeonPerfCountersShmemInit();
 	PagestoreShmemInit();
 	RelsizeCacheShmemInit();
+	RelperstCacheShmemInit();
 	WalproposerShmemInit();
 	LwLsnCacheShmemInit();
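For context, the two RelperstCache functions added above are called from the extension's shared-memory hooks; the hook wiring itself is not part of this diff. A minimal sketch of that standard PostgreSQL 15+ pattern, with the prev_* chaining and the omission of the other Shmem* calls being assumptions for illustration only:

/* Sketch only: how RelperstCacheShmemRequest/Init plug into the extension's
 * shmem hooks. The prev_* chaining is the usual PostgreSQL >= 15 pattern and
 * is assumed here; the sibling Shmem* calls shown in the hunks above are
 * omitted for brevity. */
static shmem_request_hook_type prev_shmem_request_hook = NULL;
static shmem_startup_hook_type prev_shmem_startup_hook = NULL;

static void
neon_shmem_request_hook(void)
{
	if (prev_shmem_request_hook)
		prev_shmem_request_hook();
	RelperstCacheShmemRequest();	/* reserve hash table space and LWLocks */
}

static void
neon_shmem_startup_hook(void)
{
	if (prev_shmem_startup_hook)
		prev_shmem_startup_hook();
	RelperstCacheShmemInit();		/* create/attach the shared hash table */
}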
@@ -74,6 +74,7 @@ extern PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg);
 extern void LfcShmemRequest(void);
 extern void PagestoreShmemRequest(void);
 extern void RelsizeCacheShmemRequest(void);
+extern void RelperstCacheShmemRequest(void);
 extern void WalproposerShmemRequest(void);
 extern void LwLsnCacheShmemRequest(void);
 extern void NeonPerfCountersShmemRequest(void);
@@ -81,6 +82,7 @@ extern void NeonPerfCountersShmemRequest(void);
 extern void LfcShmemInit(void);
 extern void PagestoreShmemInit(void);
 extern void RelsizeCacheShmemInit(void);
+extern void RelperstCacheShmemInit(void);
 extern void WalproposerShmemInit(void);
 extern void LwLsnCacheShmemInit(void);
 extern void NeonPerfCountersShmemInit(void);
@@ -298,4 +298,50 @@ extern void set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumb
 extern void update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size);
 extern void forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum);
 
+/*
+ * Relation persistence enum.
+ */
+typedef enum
+{
+	/* The persistence is not known */
+	NEON_RELPERSISTENCE_UNKNOWN,
+
+	/* The relation is a permanent relation that is WAL-logged normally */
+	NEON_RELPERSISTENCE_PERMANENT,
+
+	/* The relation is an unlogged table/index, stored only on local disk */
+	NEON_RELPERSISTENCE_UNLOGGED,
+
+	/*
+	 * The relation is a permanent (index) relation, but it is being built by an in-progress
+	 * transaction. It currently only lives on local disk and hasn't been WAL-logged yet.
+	 * It will turn into a permanent relation later when the index build completes.
+	 * This is currently used for GiST, SP-GiST and GIN indexes, as well as the pgvector
+	 * extension.
+	 */
+	NEON_RELPERSISTENCE_UNLOGGED_BUILD
+} NeonRelPersistence;
+
+/*
+ * Entry type stored in relperst_hash. We have just one entry for the whole relation,
+ * i.e. we don't have separate entries for the individual forks.
+ * It gets a little complicated with unlogged relations: the main fork of an unlogged
+ * relation is considered UNLOGGED, but its init fork is treated as PERMANENT. That
+ * case is checked explicitly in neon_write.
+ */
+typedef struct
+{
+	NRelFileInfo rel;
+	uint8		relperst;		/* See NeonRelPersistence */
+	uint16		access_count;
+	dlist_node	lru_node;		/* LRU list node */
+} NeonRelPersistenceEntry;
+
+extern LWLockId finish_unlogged_build_lock;
+
+extern void relperst_hash_init(void);
+extern void set_cached_relperst(NRelFileInfo rinfo, NeonRelPersistence relperst);
+extern NeonRelPersistence get_cached_relperst(NRelFileInfo rinfo);
+extern NeonRelPersistenceEntry *pin_cached_relperst(NRelFileInfo rinfo, NeonRelPersistence relperst);
+extern void unpin_cached_relperst(NeonRelPersistenceEntry *entry);
+extern void forget_cached_relperst(NRelFileInfo rinfo);
+
 #endif /* PAGESTORE_CLIENT_H */
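A condensed sketch of how the builder side is intended to use these declarations, reconstructed from the neon_start_unlogged_build()/neon_end_unlogged_build() hunks further down; the names come from this diff, while the surrounding code and the helper function names in this sketch are abbreviations, not the literal implementation:

/* Sketch: the builder pins the relation's cache entry for the whole unlogged
 * build and flips it to PERMANENT under the exclusive lock when the build
 * finishes, so writers re-checking under a shared lock see a consistent state. */
static NeonRelPersistenceEntry *build_entry;

static void
start_build_sketch(NRelFileInfo rinfo)
{
	build_entry = pin_cached_relperst(rinfo, NEON_RELPERSISTENCE_UNLOGGED_BUILD);
}

static void
end_build_sketch(void)
{
	LWLockAcquire(finish_unlogged_build_lock, LW_EXCLUSIVE);
	build_entry->relperst = NEON_RELPERSISTENCE_PERMANENT;
	LWLockRelease(finish_unlogged_build_lock);

	/* ... remove the local files here, then drop the pin ... */
	unpin_cached_relperst(build_entry);
	build_entry = NULL;
}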
@@ -97,6 +97,7 @@ typedef enum
 int			debug_compare_local;
 
 static NRelFileInfo unlogged_build_rel_info;
+static NeonRelPersistenceEntry *unlogged_build_rel_entry;
 static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
 
 static bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
@@ -617,7 +618,7 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
 		result->effective_request_lsn = result->request_lsn;
 		Assert(last_written_lsn <= result->request_lsn);
 
-		neon_log(DEBUG1, "neon_get_request_lsns request lsn %X/%X, not_modified_since %X/%X",
+		neon_log(DEBUG2, "neon_get_request_lsns request lsn %X/%X, not_modified_since %X/%X",
 				 LSN_FORMAT_ARGS(result->request_lsn), LSN_FORMAT_ARGS(result->not_modified_since));
 	}
 }
@@ -641,7 +642,7 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
 		 * must still be in the buffer cache, so our request cannot concern
 		 * those.
 		 */
-		neon_log(DEBUG1, "neon_get_request_lsns GetLastWrittenLSN lsn %X/%X",
+		neon_log(DEBUG2, "neon_get_request_lsns GetLastWrittenLSN lsn %X/%X",
 				 LSN_FORMAT_ARGS(last_written_lsn));
 
 		/*
@@ -877,6 +878,12 @@ neon_unlink(NRelFileInfoBackend rinfo, ForkNumber forkNum, bool isRedo)
 	if (!NRelFileInfoBackendIsTemp(rinfo))
 	{
 		forget_cached_relsize(InfoFromNInfoB(rinfo), forkNum);
+		/*
+		 * This removes information about all forks from the relpersistence cache, but that
+		 * is OK because the only relations pinned in this cache are the ones involved in an
+		 * unlogged build, and a relation should not be removed during an unlogged build.
+		 */
+		forget_cached_relperst(InfoFromNInfoB(rinfo));
 	}
 }
@@ -1601,21 +1608,54 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
 #endif
 {
 	XLogRecPtr	lsn;
+	NeonRelPersistence relperst;
+	bool		is_locked = false;
+	NRelFileInfo rinfo = InfoFromSMgrRel(reln);
 
 	switch (reln->smgr_relpersistence)
 	{
 		case 0:
-			/* This is a bit tricky. Check if the relation exists locally */
-			if (mdexists(reln, debug_compare_local ? INIT_FORKNUM : forknum))
+			relperst = get_cached_relperst(rinfo);
+			if (relperst == NEON_RELPERSISTENCE_UNKNOWN)
+			{
+				/* We do not know the relation persistence: let's determine it */
+				relperst = mdexists(reln, debug_compare_local ? INIT_FORKNUM : forknum) ? NEON_RELPERSISTENCE_UNLOGGED : NEON_RELPERSISTENCE_PERMANENT;
+				/*
+				 * No lock is held between get_cached_relperst and set_cached_relperst.
+				 * We assume that if multiple backends perform this check, they all get the same result (there is an assert in set_cached_relperst).
+				 * Furthermore, we assume that when a relation changes from PERMANENT to UNLOGGED_BUILD, it has no buffers in
+				 * the shared buffer cache and therefore no other backend will try to concurrently write its pages. (In fact we require that the relation is completely empty.)
+				 */
+				set_cached_relperst(rinfo, relperst);
+			}
+			if (relperst == NEON_RELPERSISTENCE_UNLOGGED_BUILD)
+			{
+				/*
+				 * A relation going through an unlogged build can complete the unlogged build at any time.
+				 * To make sure that the backend performing the build doesn't complete it and
+				 * remove the underlying local file just when we are about to write to it, acquire the lock.
+				 */
+				LWLockAcquire(finish_unlogged_build_lock, LW_SHARED);
+				is_locked = true;
+				/* Recheck now that we hold the lock - the build might already have finished */
+				relperst = get_cached_relperst(rinfo);
+			}
+			if (relperst == NEON_RELPERSISTENCE_UNLOGGED || relperst == NEON_RELPERSISTENCE_UNLOGGED_BUILD)
 			{
 				/* It exists locally. Guess it's unlogged then. */
 #if PG_MAJORVERSION_NUM >= 17
 				mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync);
 #else
 				mdwrite(reln, forknum, blocknum, buffer, skipFsync);
 #endif
+			}
+			if (is_locked)
+			{
+				LWLockRelease(finish_unlogged_build_lock);
+			}
+			if (relperst == NEON_RELPERSISTENCE_UNLOGGED || relperst == NEON_RELPERSISTENCE_UNLOGGED_BUILD)
+			{
 				/*
-				 * We could set relpersistence now that we have determined
+				 * We could set reln->smgr_relpersistence now that we have determined
 				 * that it's local. But we don't dare to do it, because that
 				 * would immediately allow reads as well, which shouldn't
 				 * happen. We could cache it with a different 'relpersistence'
@@ -1626,7 +1666,7 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
 			break;
 
 		case RELPERSISTENCE_PERMANENT:
-			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
+			if (RelFileInfoEquals(unlogged_build_rel_info, rinfo))
 			{
 #if PG_MAJORVERSION_NUM >= 17
 				mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync);
@@ -1657,7 +1697,7 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
 			 forknum, blocknum,
 			 (uint32) (lsn >> 32), (uint32) lsn);
 
-	lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);
+	lfc_write(rinfo, forknum, blocknum, buffer);
 
 	communicator_prefetch_pump_state();
 
@@ -1682,28 +1722,52 @@ static void
 neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 			const void **buffers, BlockNumber nblocks, bool skipFsync)
 {
+	NeonRelPersistence relperst;
+	NRelFileInfo rinfo = InfoFromSMgrRel(reln);
+	bool		is_locked = false;
+
 	switch (reln->smgr_relpersistence)
 	{
 		case 0:
-			/* This is a bit tricky. Check if the relation exists locally */
-			if (mdexists(reln, debug_compare_local ? INIT_FORKNUM : forknum))
+			if (forknum == INIT_FORKNUM)
+			{
+				break;			/* init fork is always permanent */
+			}
+			relperst = get_cached_relperst(rinfo);
+			if (relperst == NEON_RELPERSISTENCE_UNKNOWN)
+			{
+				/* We do not know the relation persistence: let's determine it */
+				relperst = mdexists(reln, debug_compare_local ? INIT_FORKNUM : forknum) ? NEON_RELPERSISTENCE_UNLOGGED : NEON_RELPERSISTENCE_PERMANENT;
+				set_cached_relperst(rinfo, relperst);
+			}
+			if (relperst == NEON_RELPERSISTENCE_UNLOGGED_BUILD)
+			{
+				/*
+				 * In case of an unlogged build we need to avoid a race condition at the end of the build.
+				 * Obtain a shared lock here to prevent the backend completing the unlogged build from performing cleanup and removing files.
+				 */
+				LWLockAcquire(finish_unlogged_build_lock, LW_SHARED);
+				is_locked = true;
+				/*
+				 * Recheck relperst under the lock - the unlogged build may already have finished
+				 */
+				relperst = get_cached_relperst(rinfo);
+			}
+			if (relperst == NEON_RELPERSISTENCE_UNLOGGED || relperst == NEON_RELPERSISTENCE_UNLOGGED_BUILD)
 			{
 				/* It exists locally. Guess it's unlogged then. */
 				mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync);
 
 				/*
 				 * We could set relpersistence now that we have determined
 				 * that it's local. But we don't dare to do it, because that
 				 * would immediately allow reads as well, which shouldn't
 				 * happen. We could cache it with a different 'relpersistence'
 				 * value, but this isn't performance critical.
 				 */
+			}
+			if (is_locked)
+			{
+				LWLockRelease(finish_unlogged_build_lock);
+			}
+			if (relperst == NEON_RELPERSISTENCE_UNLOGGED || relperst == NEON_RELPERSISTENCE_UNLOGGED_BUILD)
+			{
 				return;
 			}
 			break;
 
 		case RELPERSISTENCE_PERMANENT:
-			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
+			if (RelFileInfoEquals(unlogged_build_rel_info, rinfo))
 			{
 				mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync);
 				return;
@@ -1720,7 +1784,7 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 
 	neon_wallog_pagev(reln, forknum, blkno, nblocks, (const char **) buffers, false);
 
-	lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);
+	lfc_writev(rinfo, forknum, blkno, buffers, nblocks);
 
 	communicator_prefetch_pump_state();
 
@@ -1969,7 +2033,7 @@ neon_start_unlogged_build(SMgrRelation reln)
 	if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
 		neon_log(ERROR, "unlogged relation build is already in progress");
 
-	ereport(SmgrTrace,
+	ereport(DEBUG1,
 			(errmsg(NEON_TAG "starting unlogged build of relation %u/%u/%u",
 					RelFileInfoFmt(InfoFromSMgrRel(reln)))));
 
@@ -1985,6 +2049,7 @@ neon_start_unlogged_build(SMgrRelation reln)
 		case RELPERSISTENCE_TEMP:
 		case RELPERSISTENCE_UNLOGGED:
 			unlogged_build_rel_info = InfoFromSMgrRel(reln);
+			unlogged_build_rel_entry = pin_cached_relperst(unlogged_build_rel_info, NEON_RELPERSISTENCE_UNLOGGED);
 			unlogged_build_phase = UNLOGGED_BUILD_NOT_PERMANENT;
 			if (debug_compare_local)
 			{
@@ -2007,6 +2072,7 @@ neon_start_unlogged_build(SMgrRelation reln)
 #endif
 
 	unlogged_build_rel_info = InfoFromSMgrRel(reln);
+	unlogged_build_rel_entry = pin_cached_relperst(unlogged_build_rel_info, NEON_RELPERSISTENCE_UNLOGGED_BUILD);
 	unlogged_build_phase = UNLOGGED_BUILD_PHASE_1;
 
 	/*
@@ -2022,6 +2088,15 @@ neon_start_unlogged_build(SMgrRelation reln)
 	}
 }
 
+static void
+unlogged_build_cleanup(void)
+{
+	NRelFileInfoInvalidate(unlogged_build_rel_info);
+	unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
+	unpin_cached_relperst(unlogged_build_rel_entry);
+	unlogged_build_rel_entry = NULL;
+}
+
 /*
  * neon_finish_unlogged_build_phase_1()
  *
@@ -2033,7 +2108,7 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln)
 {
 	Assert(RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)));
 
-	ereport(SmgrTrace,
+	ereport(DEBUG1,
 			(errmsg(NEON_TAG "finishing phase 1 of unlogged build of relation %u/%u/%u",
 					RelFileInfoFmt((unlogged_build_rel_info)))));
 
@@ -2048,8 +2123,7 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln)
 	 */
 	if (IsParallelWorker())
 	{
-		NRelFileInfoInvalidate(unlogged_build_rel_info);
-		unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
+		unlogged_build_cleanup();
 	}
 	else
 		unlogged_build_phase = UNLOGGED_BUILD_PHASE_2;
@@ -2068,10 +2142,11 @@ static void
 neon_end_unlogged_build(SMgrRelation reln)
 {
 	NRelFileInfoBackend rinfob = InfoBFromSMgrRel(reln);
+	NRelFileInfo rinfo = InfoFromSMgrRel(reln);
 
-	Assert(RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)));
+	Assert(RelFileInfoEquals(unlogged_build_rel_info, rinfo));
 
-	ereport(SmgrTrace,
+	ereport(DEBUG1,
 			(errmsg(NEON_TAG "ending unlogged build of relation %u/%u/%u",
 					RelFileInfoFmt(unlogged_build_rel_info))));
 
@@ -2095,21 +2170,26 @@ neon_end_unlogged_build(SMgrRelation reln)
 		recptr = GetXLogInsertRecPtr();
 
 		neon_set_lwlsn_block_range(recptr,
-								   InfoFromNInfoB(rinfob),
+								   rinfo,
 								   MAIN_FORKNUM, 0, nblocks);
 		neon_set_lwlsn_relation(recptr,
-								InfoFromNInfoB(rinfob),
+								rinfo,
 								MAIN_FORKNUM);
 
+		/* Obtain exclusive lock to prevent concurrent writes to the file while we perform cleanup */
+		LWLockAcquire(finish_unlogged_build_lock, LW_EXCLUSIVE);
+		unlogged_build_rel_entry->relperst = NEON_RELPERSISTENCE_PERMANENT;
+		LWLockRelease(finish_unlogged_build_lock);
+
 		/* Remove local copy */
 		for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
 		{
 			neon_log(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u",
-					 RelFileInfoFmt(InfoFromNInfoB(rinfob)),
+					 RelFileInfoFmt(rinfo),
 					 forknum);
 
-			forget_cached_relsize(InfoFromNInfoB(rinfob), forknum);
-			lfc_invalidate(InfoFromNInfoB(rinfob), forknum, nblocks);
+			forget_cached_relsize(rinfo, forknum);
+			lfc_invalidate(rinfo, forknum, nblocks);
 
 			mdclose(reln, forknum);
 			if (!debug_compare_local)
@@ -2121,8 +2201,7 @@ neon_end_unlogged_build(SMgrRelation reln)
 		if (debug_compare_local)
 			mdunlink(rinfob, INIT_FORKNUM, true);
 	}
-	NRelFileInfoInvalidate(unlogged_build_rel_info);
-	unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
+	unlogged_build_cleanup();
 }
 
 #define STRPREFIX(str, prefix) (strncmp(str, prefix, strlen(prefix)) == 0)
@@ -2194,8 +2273,7 @@ AtEOXact_neon(XactEvent event, void *arg)
 			 * Forget about any build we might have had in progress. The local
 			 * file will be unlinked by smgrDoPendingDeletes()
 			 */
-			NRelFileInfoInvalidate(unlogged_build_rel_info);
-			unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
+			unlogged_build_cleanup();
 			break;
 
 		case XACT_EVENT_COMMIT:
@@ -2206,8 +2284,7 @@ AtEOXact_neon(XactEvent event, void *arg)
 		case XACT_EVENT_PRE_PREPARE:
 			if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
 			{
-				NRelFileInfoInvalidate(unlogged_build_rel_info);
-				unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
+				unlogged_build_cleanup();
 				ereport(ERROR,
 						(errcode(ERRCODE_INTERNAL_ERROR),
 						 (errmsg(NEON_TAG "unlogged index build was not properly finished"))));
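For readers skimming the hunks above, here is a condensed, illustrative restatement of the write path for a relation whose persistence is unknown (names follow this diff; the function name, the fork-number handling and the debug_compare_local path in this sketch are simplifications, not the literal code):

/* Sketch of the "persistence unknown" (case 0) path in neon_write()/neon_writev(). */
static void
neon_write_case0_sketch(SMgrRelation reln, NRelFileInfo rinfo, ForkNumber forknum)
{
	bool		locked = false;
	NeonRelPersistence p = get_cached_relperst(rinfo);

	if (p == NEON_RELPERSISTENCE_UNKNOWN)
	{
		/* Cache miss: fall back to the mdexists() probe once, then remember the answer. */
		p = mdexists(reln, forknum) ? NEON_RELPERSISTENCE_UNLOGGED
									: NEON_RELPERSISTENCE_PERMANENT;
		set_cached_relperst(rinfo, p);
	}
	if (p == NEON_RELPERSISTENCE_UNLOGGED_BUILD)
	{
		/* Keep the builder from unlinking the local file under us, then recheck. */
		LWLockAcquire(finish_unlogged_build_lock, LW_SHARED);
		locked = true;
		p = get_cached_relperst(rinfo);
	}
	if (p == NEON_RELPERSISTENCE_UNLOGGED || p == NEON_RELPERSISTENCE_UNLOGGED_BUILD)
	{
		/* ... the local mdwrite()/mdwritev() happens here, still under the shared lock ... */
	}
	if (locked)
		LWLockRelease(finish_unlogged_build_lock);
}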
pgxn/neon/relperst_cache.c (new file, 292 lines)
@@ -0,0 +1,292 @@
/*-------------------------------------------------------------------------
 *
 * relperst_cache.c
 *	  Cache to track the relpersistence of relations
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "neon.h"
#include "miscadmin.h"
#include "neon_pgversioncompat.h"

#include "pagestore_client.h"
#include RELFILEINFO_HDR
#include "storage/smgr.h"
#include "storage/lwlock.h"
#include "storage/ipc.h"
#include "storage/shmem.h"
#include "catalog/pg_tablespace_d.h"
#include "utils/dynahash.h"
#include "utils/guc.h"

#include "miscadmin.h"

/*
 * The main goal of this cache is to avoid repeated calls of mdexists in neon_write,
 * which is needed to distinguish unlogged relations.
 * It has a fixed size, implementing eviction with the LRU algorithm.
 *
 * This hash is also used to mark a relation during an unlogged build.
 * Relations involved in an unlogged build are pinned in the cache and never evicted
 * (relying on the fact that the number of concurrent unlogged builds is small). Evicting a page
 * belonging to an unlogged build involves an extra locking step to eliminate a race condition
 * between the unlogged build completing and deleting the local file, at the same time that
 * another backend is evicting a page belonging to it. See how `finish_unlogged_build_lock`
 * is used in `neon_write`.
 */

typedef struct
{
	size_t		size;
	uint64		hits;
	uint64		misses;
	uint64		pinned;
	dlist_head	lru;			/* doubly linked list for LRU replacement
								 * algorithm */
} NeonRelPersistenceHashControl;

/*
 * Size of a cache entry is 32 bytes. So this default will take about 0.5 MB,
 * which seems reasonable.
 */
#define DEFAULT_RELPERST_HASH_SIZE (16 * 1024)
#define MAX_RELPERST_HASH_SIZE (1024 * 1024)


static HTAB *relperst_hash;
static int	relperst_hash_size = DEFAULT_RELPERST_HASH_SIZE;
static NeonRelPersistenceHashControl *relperst_ctl;

/* Protects unlogged build completing while another backend is writing to it */
LWLockId	finish_unlogged_build_lock;
/* Protects 'relperst_hash' */
static LWLockId relperst_hash_lock;

/*
 * Shared memory registration
 */
void
RelperstCacheShmemRequest(void)
{
	RequestAddinShmemSpace(sizeof(NeonRelPersistenceHashControl) + hash_estimate_size(relperst_hash_size, sizeof(NeonRelPersistenceEntry)));
	RequestNamedLWLockTranche("neon_relperst", 2);
}

/*
 * Initialize shared memory
 */
void
RelperstCacheShmemInit(void)
{
	static HASHCTL info;
	bool		found;

	relperst_ctl = (NeonRelPersistenceHashControl *) ShmemInitStruct("relperst_hash", sizeof(NeonRelPersistenceHashControl), &found);
	if (!found)
	{
		/*
		 * In the worst case, the hash needs to be large enough for the case that all backends are performing an unlogged index build at the same time.
		 * Or actually twice that, because while performing an unlogged index build, each backend can also be trying to write out a page for another
		 * relation and hence hold one more entry in the cache pinned. Use MaxConnections instead of MaxBackends because only normal backends can perform an unlogged build.
		 */
		size_t		hash_size = Max(2 * MaxConnections, relperst_hash_size);

		relperst_hash_lock = (LWLockId) GetNamedLWLockTranche("neon_relperst");
		finish_unlogged_build_lock = (LWLockId) (GetNamedLWLockTranche("neon_relperst") + 1);
		info.keysize = sizeof(NRelFileInfo);
		info.entrysize = sizeof(NeonRelPersistenceEntry);
		relperst_hash = ShmemInitHash("neon_relperst",
									  hash_size, hash_size,
									  &info,
									  HASH_ELEM | HASH_BLOBS);
		relperst_ctl->size = 0;
		relperst_ctl->hits = 0;
		relperst_ctl->misses = 0;
		relperst_ctl->pinned = 0;
		dlist_init(&relperst_ctl->lru);
	}
}

/*
 * Lookup existing entry or create a new one
 */
static NeonRelPersistenceEntry *
get_pinned_entry(NRelFileInfo rinfo)
{
	bool		found;
	NeonRelPersistenceEntry *entry = hash_search(relperst_hash, &rinfo, HASH_ENTER_NULL, &found);

	if (entry == NULL)
	{
		if (dlist_is_empty(&relperst_ctl->lru))
		{
			/* Cannot happen, because we size the hash table to be large enough for the worst case */
			neon_log(PANIC, "No unpinned relperst entries");
		}
		else
		{
			/*
			 * Remove the least recently used element from the hash.
			 */
			NeonRelPersistenceEntry *victim = dlist_container(NeonRelPersistenceEntry, lru_node, dlist_pop_head_node(&relperst_ctl->lru));

			Assert(victim->access_count == 0);
			hash_search(relperst_hash, &victim->rel, HASH_REMOVE, &found);
			Assert(found);
			Assert(relperst_ctl->size > 0);
			relperst_ctl->size -= 1;
		}
		entry = hash_search(relperst_hash, &rinfo, HASH_ENTER_NULL, &found);
		Assert(!found);
	}
	if (!found)
	{
		/* the caller will fill this in by calling set_cached_relperst() later */
		entry->relperst = NEON_RELPERSISTENCE_UNKNOWN;
		relperst_ctl->pinned += 1;
		entry->access_count = 1;
		relperst_ctl->size += 1;
	}
	else if (entry->access_count++ == 0)
	{
		dlist_delete(&entry->lru_node);
		relperst_ctl->pinned += 1;
	}
	return entry;
}

/*
 * Unpin entry and place it at the end of the LRU list
 */
static void
unpin_entry(NeonRelPersistenceEntry *entry)
{
	Assert(entry->access_count != 0);
	if (--entry->access_count == 0)
	{
		Assert(relperst_ctl->pinned != 0);
		relperst_ctl->pinned -= 1;
		dlist_push_tail(&relperst_ctl->lru, &entry->lru_node);
	}
}

/*
 * Get an existing entry or initialize a new one. This function is used by neon_start_unlogged_build
 * to mark a relation involved in an unlogged build.
 * In case of overflow it removes the least recently used entry.
 * Returns a pinned entry. It will be released by unpin_cached_relperst at the end of the unlogged build.
 */
NeonRelPersistenceEntry *
pin_cached_relperst(NRelFileInfo rinfo, NeonRelPersistence relperst)
{
	NeonRelPersistenceEntry *entry;

	LWLockAcquire(relperst_hash_lock, LW_EXCLUSIVE);

	entry = get_pinned_entry(rinfo);
	entry->relperst = relperst;

	LWLockRelease(relperst_hash_lock);
	return entry;
}

/*
 * Look up an entry. This function is called by neon_write to determine whether changes
 * should be written to the local disk.
 * If the relation is involved in an unlogged build, the caller should obtain a shared lock
 * on `finish_unlogged_build_lock` and recheck the state under the lock.
 */
NeonRelPersistence
get_cached_relperst(NRelFileInfo rinfo)
{
	NeonRelPersistenceEntry *entry;
	NeonRelPersistence relperst = NEON_RELPERSISTENCE_UNKNOWN;

	/* we don't modify the hash table, but need an exclusive lock to manipulate the LRU list */
	LWLockAcquire(relperst_hash_lock, LW_EXCLUSIVE);

	entry = hash_search(relperst_hash, &rinfo, HASH_FIND, NULL);
	if (entry != NULL)
	{
		/* Pin+unpin the entry to move it to the end of the LRU list */
		if (entry->access_count++ == 0)
		{
			dlist_delete(&entry->lru_node);
			relperst_ctl->pinned += 1;
		}
		relperst = entry->relperst;
		unpin_entry(entry);
	}
	LWLockRelease(relperst_hash_lock);
	return relperst;
}


/*
 * Store the relation kind as determined by the mdexists check.
 */
void
set_cached_relperst(NRelFileInfo rinfo, NeonRelPersistence relperst)
{
	NeonRelPersistenceEntry *entry;

	LWLockAcquire(relperst_hash_lock, LW_EXCLUSIVE);

	/* Pin+unpin the entry to move it to the end of the LRU list */
	entry = get_pinned_entry(rinfo);
	Assert(entry->relperst == NEON_RELPERSISTENCE_UNKNOWN || entry->relperst == relperst);
	entry->relperst = relperst;
	unpin_entry(entry);

	LWLockRelease(relperst_hash_lock);
}

/* Release a pin that was acquired earlier with pin_cached_relperst() */
void
unpin_cached_relperst(NeonRelPersistenceEntry *entry)
{
	if (entry)
	{
		LWLockAcquire(relperst_hash_lock, LW_EXCLUSIVE);
		unpin_entry(entry);
		LWLockRelease(relperst_hash_lock);
	}
}

void
forget_cached_relperst(NRelFileInfo rinfo)
{
	NeonRelPersistenceEntry *entry;

	LWLockAcquire(relperst_hash_lock, LW_EXCLUSIVE);

	entry = hash_search(relperst_hash, &rinfo, HASH_REMOVE, NULL);
	if (entry)
	{
		Assert(entry->access_count == 0);
		dlist_delete(&entry->lru_node);
		relperst_ctl->size -= 1;
	}

	LWLockRelease(relperst_hash_lock);
}


void
relperst_hash_init(void)
{
	DefineCustomIntVariable("neon.relperst_hash_size",
							"Sets the maximum number of cached relation persistence entries for neon",
							NULL,
							&relperst_hash_size,
							DEFAULT_RELPERST_HASH_SIZE,
							1,
							MAX_RELPERST_HASH_SIZE,
							PGC_POSTMASTER,
							0,
							NULL, NULL, NULL);
}
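The comment above DEFAULT_RELPERST_HASH_SIZE quotes the resulting memory use; a small, purely illustrative calculation that reproduces those figures under the ~32-byte entry size stated in the file (dynahash overhead not counted):

#include <stdio.h>

int
main(void)
{
	const long entry_size = 32;				/* approx. sizeof(NeonRelPersistenceEntry) */
	const long def_entries = 16 * 1024;		/* DEFAULT_RELPERST_HASH_SIZE */
	const long max_entries = 1024 * 1024;	/* MAX_RELPERST_HASH_SIZE */

	printf("default: %ld KiB\n", def_entries * entry_size / 1024);				/* 512 KiB, ~0.5 MB */
	printf("maximum: %ld MiB\n", max_entries * entry_size / (1024 * 1024));	/* 32 MiB */
	return 0;
}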
test_runner/performance/test_unlogged.py (new file, 65 lines)
@@ -0,0 +1,65 @@
from __future__ import annotations

import threading
from contextlib import closing
from typing import TYPE_CHECKING

import pytest
from pytest_lazyfixture import lazy_fixture

if TYPE_CHECKING:
    from fixtures.compare_fixtures import PgCompare


#
# This test demonstrates the effect of the relpersistence cache. Postgres doesn't store relation persistence in the shared buffer tag.
# That means that if a page is evicted from shared buffers and the relation is not in the backend's relation cache, then persistence=0 (auto) is used.
# For vanilla Postgres this doesn't matter, because in both cases the changes need to be written to the file.
# In Neon, neon_write does nothing for a permanent relation, while for an unlogged relation it writes the page to the local file.
# Originally Neon always called `mdexists` to check if the local file exists and so determine whether it's an unlogged relation. Now we check the cache first.
# mdexists is not so cheap: it closes and opens the file.
#
# This test tries to recreate the situation where most writes happen with persistence=0.
# We open multiple connections to the database and each fills its own table, so each backend writes only its own table and the other tables'
# descriptors are not cached. At the same time all backends perform eviction from shared buffers. The probability that a backend evicts a page of its own
# relation is 1/N, where N is the number of relations (= number of backends): the more relations, the smaller the probability.
# For a large enough number of relations, most writes happen with unknown persistence.
#
# On Linux, introducing the relpersistence cache shows about a 2x speed improvement in this test.
#
@pytest.mark.timeout(1000)
@pytest.mark.parametrize(
    "env",
    [
        pytest.param(lazy_fixture("neon_compare"), id="neon"),
        pytest.param(lazy_fixture("vanilla_compare"), id="vanilla"),
    ],
)
def test_unlogged(env: PgCompare):
    n_tables = 20
    n_records = 1000
    n_updates = 1000

    with env.record_duration("insert"):
        with closing(env.pg.connect()) as conn:
            with conn.cursor() as cur:
                for i in range(n_tables):
                    cur.execute(
                        f"create unlogged table t{i}(pk integer primary key, sk integer, fillter text default repeat('x', 1000)) with (fillfactor=10)"
                    )
                    cur.execute(f"insert into t{i} values (generate_series(1,{n_records}),0)")

    def do_updates(table_id: int):
        with closing(env.pg.connect()) as conn:
            with conn.cursor() as cur:
                for _ in range(n_updates):
                    cur.execute(f"update t{table_id} set sk=sk+1")

    with env.record_duration("update"):
        threads = [threading.Thread(target=do_updates, args=(i,)) for i in range(n_tables)]

        for thread in threads:
            thread.start()

        for thread in threads:
            thread.join()
test_runner/regress/test_unlogged_build.py (new file, 83 lines)
@@ -0,0 +1,83 @@
from __future__ import annotations

import threading
from typing import TYPE_CHECKING

import pytest
from fixtures.log_helper import log
from fixtures.utils import query_scalar, skip_in_debug_build

if TYPE_CHECKING:
    from fixtures.neon_fixtures import NeonEnvBuilder


@pytest.mark.timeout(600)
@skip_in_debug_build("this test is slow so only run with release build")
def test_unlogged_build(neon_env_builder: NeonEnvBuilder):
    """
    Check for race conditions between the end of an unlogged build and backends
    evicting pages of this index. We used to have a race condition where a
    backend completed the unlogged build and removed local files, while another
    backend was evicting a page belonging to the newly-built index and tried to
    write to the file that had been removed.
    """
    n_connections = 4

    # A small shared_buffers setting forces backends to evict more pages from
    # the buffer cache during the build, which is needed to expose the bug with
    # concurrent eviction.
    shared_buffers = "128MB"

    env = neon_env_builder.init_start()
    endpoint = env.endpoints.create_start(
        "main", config_lines=[f"shared_buffers='{shared_buffers}'"]
    )

    def unlogged_build(i: int):
        con = endpoint.connect()
        cur = con.cursor()
        cur.execute("set statement_timeout=0")
        cur.execute(f"CREATE TABLE ginarray_tbl_{i} (arr text[])")
        cur.execute(
            f"INSERT INTO ginarray_tbl_{i} SELECT array_agg(repeat((x*1000000 + y)::text, 40)) FROM generate_series(1, 100) x, generate_series(1, 200 * 100) y group by y"
        )
        cur.execute("SET log_min_messages='debug1'")
        cur.execute(f"CREATE INDEX ginarray_tbl_idx_{i} ON ginarray_tbl_{i} USING gin(arr)")
        cur.execute("RESET log_min_messages")

        # Check that the index is larger than 1 GB, so that it doesn't fit into
        # a single segment file. Otherwise, the race condition is avoided,
        # because when the backend evicts and writes out the page, it already
        # has the file descriptor opened and the write succeeds even if the
        # underlying file is deleted concurrently.
        assert (
            query_scalar(cur, f"select pg_relation_size('ginarray_tbl_idx_{i}')")
            > 1024 * 1024 * 1024
        )

        # Run a simple query that uses the index as a sanity check. (In the
        # original bug, the CREATE INDEX step failed already. But since we
        # already went through all the effort to build the index, might as well
        # check that it also works.)
        qry = f"SELECT count(*) FROM ginarray_tbl_{i} WHERE ARRAY[repeat('1012345', 40)] <@ arr"
        result = query_scalar(cur, qry)
        assert result == 1

        # Verify that the query actually uses the index
        cur.execute(f"EXPLAIN (COSTS OFF) {qry}")
        rows = cur.fetchall()
        plan = "\n".join(r[0] for r in rows)
        log.debug(plan)
        assert "Bitmap Index Scan on ginarray_tbl_idx" in plan

    threads = [threading.Thread(target=unlogged_build, args=(i,)) for i in range(n_connections)]
    for thread in threads:
        thread.start()

    for thread in threads:
        thread.join()

    # Sanity check that the indexes were built with the "unlogged build"
    # method. GIN always uses that method currently, but if a different, more
    # efficient, method is invented later, that might invalidate this test.
    assert endpoint.log_contains("starting unlogged build of relation")