Compare commits

...

66 Commits

Author SHA1 Message Date
Konstantin Knizhnik
fbd668ee3d Update comment for test_runner/regress/test_unlogged_build.py 2025-07-31 18:25:18 +03:00
Konstantin Knizhnik
9290103e53 Update test_runner/regress/test_unlogged_build.py
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2025-07-31 18:19:27 +03:00
Konstantin Knizhnik
f8b4875c21 Update test_runner/regress/test_unlogged_build.py
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2025-07-31 18:19:13 +03:00
Konstantin Knizhnik
9a1317e41b Update pgxn/neon/relperst_cache.c
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2025-07-31 18:19:04 +03:00
Konstantin Knizhnik
75dafd46b6 Speedup test_unlogged_build.py 2025-07-29 18:04:27 +03:00
Konstantin Knizhnik
d3f91be1d3 Compare unlogged build speed with vanilla 2025-07-29 15:15:30 +03:00
Konstantin Knizhnik
499df1168c Fix bug with using @skip_in_debug_build directive 2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
b71c609297 Move test_unlogged_build.py from p[erformance to regression tests 2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
d161115b9f Update test_runner/performance/test_unlogged_build.py
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2025-07-29 08:04:11 +03:00
Heikki Linnakangas
43b0106abf Improve the added test case
- Make it faster by using GIN instead of SP-GiST

- Make it more robust by checking some of the assumptions, like that
  the index is larger than 1 GB

- Improve comments
2025-07-29 08:04:11 +03:00
Kosntantin Knizhnik
71980e3103 Address review comments 2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
2546b79428 Update test_runner/performance/test_unlogged.py
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
b7dbf4cf56 Update test_runner/performance/test_unlogged.py
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
4c49423246 Update pgxn/neon/relperst_cache.c
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
9030bc8d04 Update pgxn/neon/relperst_cache.c
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
382895d073 Update pgxn/neon/relperst_cache.c
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
963ffdae12 Update pgxn/neon/relperst_cache.c
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
374cd22437 Update pgxn/neon/relperst_cache.c
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
34cf566ac1 Update pgxn/neon/relperst_cache.c
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
74076ee306 Update pgxn/neon/relperst_cache.c
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
d961d39e76 Update pgxn/neon/relperst_cache.c
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
dd1440960a Update pgxn/neon/relperst_cache.c
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
6164f5eaeb Update pgxn/neon/relperst_cache.c
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2025-07-29 08:04:11 +03:00
Kosntantin Knizhnik
0226a67603 Restore timeout for test_unlogged_)build_test 2025-07-29 08:04:11 +03:00
Kosntantin Knizhnik
d6d39f790b Reduce size of shared_buffer in test_unlogged_build to rertproduce the problem at main 2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
b36c02dda5 Update pgxn/neon/pagestore_smgr.c
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
ec89876612 Update test_runner/performance/test_unlogged_build.py
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
adf103a557 Update test_runner/performance/test_unlogged.py
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
388be47144 Update test_runner/performance/test_unlogged.py
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
9955d02a01 Update pgxn/neon/pagestore_smgr.c
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
657c63b9cb Update pgxn/neon/pagestore_smgr.c
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
4885621e55 Update pgxn/neon/pagestore_smgr.c
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
4580391963 Update pgxn/neon/pagestore_smgr.c
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2025-07-29 08:04:11 +03:00
Kosntantin Knizhnik
28ce584d01 Rename relkind to relpersistence 2025-07-29 08:04:11 +03:00
Kosntantin Knizhnik
ae7b92abeb Undo check for INIT_FORKNUM 2025-07-29 08:04:11 +03:00
Kosntantin Knizhnik
3c54a235dd Add test_unlogged_build.py 2025-07-29 08:04:11 +03:00
Kosntantin Knizhnik
de33affb1f Fix merge conflicts 2025-07-29 08:04:11 +03:00
Kosntantin Knizhnik
eabac14080 Fix merge conflicts 2025-07-29 08:04:11 +03:00
Kosntantin Knizhnik
8e150568ec Handle init fork in specialk way 2025-07-29 08:04:11 +03:00
Kosntantin Knizhnik
1ca23b47fd Add comment to the test 2025-07-29 08:04:11 +03:00
Kosntantin Knizhnik
1c0f4d6f97 Replace spinlock with LWLock 2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
67c31b61e8 Fix warning 2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
9d12eea25a Fix merge problems 2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
c1362cbf71 Fix empty list check 2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
902ea0ccd9 Address review comments 2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
fb6d7c4676 Fix merge conflict 2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
5d93a8cc71 Update pgxn/neon/relkind_cache.c
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
c3fdab3886 Update pgxn/neon/pagestore_client.h
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
1e4783f3f9 Update pgxn/neon/pagestore_client.h
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
20dea3aafb Move lwlock to pagestore_smgr 2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
ca13e7ad7a Do not return from TRY/CATCH in determine_entry_relkind 2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
87c9b067c2 Remove obsolete comment 2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
e9df43abda Change return type of determine_entry_relkind to RelKind 2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
840c73e3c4 Rename safe_mdexists to determine_entry_relkind and do unpin instead of unlock in it 2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
a9e940e236 Add assertion to store_cached_relkind 2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
02ecb1ebbf Update pgxn/neon/pagestore_client.h
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
2c0a87af68 Update pgxn/neon/relkind_cache.c
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
a9d4cbe242 Unpin entry in case of mdexists error 2025-07-29 08:04:11 +03:00
Kosntantin Knizhnik
d5d41241fa Fix incorrect unpin condition in get_cached_relkind 2025-07-29 08:04:11 +03:00
Kosntantin Knizhnik
2e34fe03c7 Replace flags with enum 2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
510c891ae5 Add comments 2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
ac233dc9aa Fix access to uninitialized flag 2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
c083765840 Address review comments 2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
8884f55eee Increase number of updates in test_unlogged.py 2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
1f93b664ad Add test_unlogged for measuring effect of relkind cache 2025-07-29 08:04:11 +03:00
Konstantin Knizhnik
883379f936 Add cache for relation kind 2025-07-29 08:04:11 +03:00
8 changed files with 606 additions and 37 deletions

View File

@@ -19,6 +19,7 @@ OBJS = \
neon_walreader.o \
pagestore_smgr.o \
relsize_cache.o \
relperst_cache.o \
unstable_extensions.o \
walproposer.o \
walproposer_pg.o \

View File

@@ -489,6 +489,7 @@ _PG_init(void)
/* Stage 1: Define GUCs, and other early intialization */
pg_init_libpagestore();
relsize_hash_init();
relperst_hash_init();
lfc_init();
pg_init_walproposer();
init_lwlsncache();
@@ -722,6 +723,7 @@ neon_shmem_request_hook(void)
NeonPerfCountersShmemRequest();
PagestoreShmemRequest();
RelsizeCacheShmemRequest();
RelperstCacheShmemRequest();
WalproposerShmemRequest();
LwLsnCacheShmemRequest();
}
@@ -744,6 +746,7 @@ neon_shmem_startup_hook(void)
NeonPerfCountersShmemInit();
PagestoreShmemInit();
RelsizeCacheShmemInit();
RelperstCacheShmemInit();
WalproposerShmemInit();
LwLsnCacheShmemInit();

View File

@@ -74,6 +74,7 @@ extern PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg);
extern void LfcShmemRequest(void);
extern void PagestoreShmemRequest(void);
extern void RelsizeCacheShmemRequest(void);
extern void RelperstCacheShmemRequest(void);
extern void WalproposerShmemRequest(void);
extern void LwLsnCacheShmemRequest(void);
extern void NeonPerfCountersShmemRequest(void);
@@ -81,6 +82,7 @@ extern void NeonPerfCountersShmemRequest(void);
extern void LfcShmemInit(void);
extern void PagestoreShmemInit(void);
extern void RelsizeCacheShmemInit(void);
extern void RelperstCacheShmemInit(void);
extern void WalproposerShmemInit(void);
extern void LwLsnCacheShmemInit(void);
extern void NeonPerfCountersShmemInit(void);

View File

@@ -298,4 +298,50 @@ extern void set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumb
extern void update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size);
extern void forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum);
/*
* Relation persistence enum.
*/
typedef enum
{
/* The persistence is not known */
NEON_RELPERSISTENCE_UNKNOWN,
/* The relation is a permanent relation that is WAL-logged normally */
NEON_RELPERSISTENCE_PERMANENT,
/* The relation is an unlogged table/index, stored only on local disk */
NEON_RELPERSISTENCE_UNLOGGED,
/*
* The relation is a permanent (index) relation, but it is being built by an in-progress
* transaction. It currently only lives on local disk and hasn't been WAL-logged yet.
* It will turn into a permanent relation later when the index build completes.
* This is currently used for GiST, SP-GiST and GIN indexes, as well as the pgvector
* extension.
*/
NEON_RELPERSISTENCE_UNLOGGED_BUILD
} NeonRelPersistence;
/*
* Entry type stored in relperst_hash. We have just one entry for the whole relation, i.e. we don't have separate entries for the individual forks.
* It gets a little complicated with unlogged relations. The main fork of an unlogged relation is considered UNLOGGED, but its init-fork is
* treated as PERMANENT. It is specially checked in neon_write.
*/
typedef struct
{
NRelFileInfo rel;
uint8 relperst; /* See NeonRelPersistence */
uint16 access_count;
dlist_node lru_node; /* LRU list node */
} NeonRelPersistenceEntry;
extern LWLockId finish_unlogged_build_lock;
extern void relperst_hash_init(void);
extern void set_cached_relperst(NRelFileInfo rinfo, NeonRelPersistence relperst);
extern NeonRelPersistence get_cached_relperst(NRelFileInfo rinfo);
extern NeonRelPersistenceEntry* pin_cached_relperst(NRelFileInfo rinfo, NeonRelPersistence relperst);
extern void unpin_cached_relperst(NeonRelPersistenceEntry* entry);
extern void forget_cached_relperst(NRelFileInfo rinfo);
#endif /* PAGESTORE_CLIENT_H */

View File

@@ -97,6 +97,7 @@ typedef enum
int debug_compare_local;
static NRelFileInfo unlogged_build_rel_info;
static NeonRelPersistenceEntry* unlogged_build_rel_entry;
static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
static bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
@@ -617,7 +618,7 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
result->effective_request_lsn = result->request_lsn;
Assert(last_written_lsn <= result->request_lsn);
neon_log(DEBUG1, "neon_get_request_lsns request lsn %X/%X, not_modified_since %X/%X",
neon_log(DEBUG2, "neon_get_request_lsns request lsn %X/%X, not_modified_since %X/%X",
LSN_FORMAT_ARGS(result->request_lsn), LSN_FORMAT_ARGS(result->not_modified_since));
}
}
@@ -641,7 +642,7 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
* must still in the buffer cache, so our request cannot concern
* those.
*/
neon_log(DEBUG1, "neon_get_request_lsns GetLastWrittenLSN lsn %X/%X",
neon_log(DEBUG2, "neon_get_request_lsns GetLastWrittenLSN lsn %X/%X",
LSN_FORMAT_ARGS(last_written_lsn));
/*
@@ -877,6 +878,12 @@ neon_unlink(NRelFileInfoBackend rinfo, ForkNumber forkNum, bool isRedo)
if (!NRelFileInfoBackendIsTemp(rinfo))
{
forget_cached_relsize(InfoFromNInfoB(rinfo), forkNum);
/*
* This removes information about all forks from relpersistence cache, but it is ok because
* the only relations pinned in this cache are one involved in unlogged build.
* And relation should not be removed during unlogged build.
*/
forget_cached_relperst(InfoFromNInfoB(rinfo));
}
}
@@ -1601,21 +1608,54 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
#endif
{
XLogRecPtr lsn;
NeonRelPersistence relperst;
bool is_locked = false;
NRelFileInfo rinfo = InfoFromSMgrRel(reln);
switch (reln->smgr_relpersistence)
{
case 0:
/* This is a bit tricky. Check if the relation exists locally */
if (mdexists(reln, debug_compare_local ? INIT_FORKNUM : forknum))
relperst = get_cached_relperst(rinfo);
if (relperst == NEON_RELPERSISTENCE_UNKNOWN)
{
/* We do not know relation persistence: let's determine it */
relperst = mdexists(reln, debug_compare_local ? INIT_FORKNUM : forknum) ? NEON_RELPERSISTENCE_UNLOGGED : NEON_RELPERSISTENCE_PERMANENT;
/*
* There is no lock hold between get_cached_relperst and set_cached_relperst.
* We assume that if multiple backends perform this check, they all get the same result (there is assert in set_cached_relperst).
* Furthermore we assume that when a relation changes from PERMANENT to UNLOGGED_BUILD, we assume that it has no buffers in
* the shared buffer cache and therefore no other backend will try to concurrently write its pages. (In fact we require that the relation is completely empty.)
*/
set_cached_relperst(rinfo, relperst);
}
if (relperst == NEON_RELPERSISTENCE_UNLOGGED_BUILD)
{
/*
* A relation going through an unlogged build can complete the unlogged build at any time.
* To make sure that the backend performing the build doesn't complete and
* remove the underlying local file just when we are about to write it, acquire the lock.
*/
LWLockAcquire(finish_unlogged_build_lock, LW_SHARED);
is_locked = true;
/* Recheck now that we hold the lock - the build might already have finished */
relperst = get_cached_relperst(rinfo);
}
if (relperst == NEON_RELPERSISTENCE_UNLOGGED || relperst == NEON_RELPERSISTENCE_UNLOGGED_BUILD)
{
/* It exists locally. Guess it's unlogged then. */
#if PG_MAJORVERSION_NUM >= 17
mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync);
#else
mdwrite(reln, forknum, blocknum, buffer, skipFsync);
#endif
}
if (is_locked)
{
LWLockRelease(finish_unlogged_build_lock);
}
if (relperst == NEON_RELPERSISTENCE_UNLOGGED || relperst == NEON_RELPERSISTENCE_UNLOGGED_BUILD)
{
/*
* We could set relpersistence now that we have determined
* We could set reln->smgr_relpersistence now that we have determined
* that it's local. But we don't dare to do it, because that
* would immediately allow reads as well, which shouldn't
* happen. We could cache it with a different 'relpersistence'
@@ -1626,7 +1666,7 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
break;
case RELPERSISTENCE_PERMANENT:
if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
if (RelFileInfoEquals(unlogged_build_rel_info, rinfo))
{
#if PG_MAJORVERSION_NUM >= 17
mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync);
@@ -1657,7 +1697,7 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
forknum, blocknum,
(uint32) (lsn >> 32), (uint32) lsn);
lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);
lfc_write(rinfo, forknum, blocknum, buffer);
communicator_prefetch_pump_state();
@@ -1682,28 +1722,52 @@ static void
neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
const void **buffers, BlockNumber nblocks, bool skipFsync)
{
NeonRelPersistence relperst;
NRelFileInfo rinfo = InfoFromSMgrRel(reln);
bool is_locked = false;
switch (reln->smgr_relpersistence)
{
case 0:
/* This is a bit tricky. Check if the relation exists locally */
if (mdexists(reln, debug_compare_local ? INIT_FORKNUM : forknum))
if (forknum == INIT_FORKNUM)
{
break; /* init fork is always permanent */
}
relperst = get_cached_relperst(rinfo);
if (relperst == NEON_RELPERSISTENCE_UNKNOWN)
{
/* We do not know relation persistence: let's determine it */
relperst = mdexists(reln, debug_compare_local ? INIT_FORKNUM : forknum) ? NEON_RELPERSISTENCE_UNLOGGED : NEON_RELPERSISTENCE_PERMANENT;
set_cached_relperst(rinfo, relperst);
}
if (relperst == NEON_RELPERSISTENCE_UNLOGGED_BUILD)
{
/* In case of unlogged build we need to avoid race condition at unlogged build end.
* Obtain shared lock here to prevent backend completing unlogged build from performing cleanup amnd remvong files.
*/
LWLockAcquire(finish_unlogged_build_lock, LW_SHARED);
is_locked = true;
/*
* Recheck relperst under lock - may be unlogged build is already finished
*/
relperst = get_cached_relperst(rinfo);
}
if (relperst == NEON_RELPERSISTENCE_UNLOGGED || relperst == NEON_RELPERSISTENCE_UNLOGGED_BUILD)
{
/* It exists locally. Guess it's unlogged then. */
mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync);
/*
* We could set relpersistence now that we have determined
* that it's local. But we don't dare to do it, because that
* would immediately allow reads as well, which shouldn't
* happen. We could cache it with a different 'relpersistence'
* value, but this isn't performance critical.
*/
}
if (is_locked)
{
LWLockRelease(finish_unlogged_build_lock);
}
if (relperst == NEON_RELPERSISTENCE_UNLOGGED || relperst == NEON_RELPERSISTENCE_UNLOGGED_BUILD)
{
return;
}
break;
case RELPERSISTENCE_PERMANENT:
if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
if (RelFileInfoEquals(unlogged_build_rel_info, rinfo))
{
mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync);
return;
@@ -1720,7 +1784,7 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
neon_wallog_pagev(reln, forknum, blkno, nblocks, (const char **) buffers, false);
lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);
lfc_writev(rinfo, forknum, blkno, buffers, nblocks);
communicator_prefetch_pump_state();
@@ -1969,7 +2033,7 @@ neon_start_unlogged_build(SMgrRelation reln)
if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
neon_log(ERROR, "unlogged relation build is already in progress");
ereport(SmgrTrace,
ereport(DEBUG1,
(errmsg(NEON_TAG "starting unlogged build of relation %u/%u/%u",
RelFileInfoFmt(InfoFromSMgrRel(reln)))));
@@ -1985,6 +2049,7 @@ neon_start_unlogged_build(SMgrRelation reln)
case RELPERSISTENCE_TEMP:
case RELPERSISTENCE_UNLOGGED:
unlogged_build_rel_info = InfoFromSMgrRel(reln);
unlogged_build_rel_entry = pin_cached_relperst(unlogged_build_rel_info, NEON_RELPERSISTENCE_UNLOGGED);
unlogged_build_phase = UNLOGGED_BUILD_NOT_PERMANENT;
if (debug_compare_local)
{
@@ -2007,6 +2072,7 @@ neon_start_unlogged_build(SMgrRelation reln)
#endif
unlogged_build_rel_info = InfoFromSMgrRel(reln);
unlogged_build_rel_entry = pin_cached_relperst(unlogged_build_rel_info, NEON_RELPERSISTENCE_UNLOGGED_BUILD);
unlogged_build_phase = UNLOGGED_BUILD_PHASE_1;
/*
@@ -2022,6 +2088,15 @@ neon_start_unlogged_build(SMgrRelation reln)
}
}
static void
unlogged_build_cleanup(void)
{
NRelFileInfoInvalidate(unlogged_build_rel_info);
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
unpin_cached_relperst(unlogged_build_rel_entry);
unlogged_build_rel_entry = NULL;
}
/*
* neon_finish_unlogged_build_phase_1()
*
@@ -2033,7 +2108,7 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln)
{
Assert(RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)));
ereport(SmgrTrace,
ereport(DEBUG1,
(errmsg(NEON_TAG "finishing phase 1 of unlogged build of relation %u/%u/%u",
RelFileInfoFmt((unlogged_build_rel_info)))));
@@ -2048,8 +2123,7 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln)
*/
if (IsParallelWorker())
{
NRelFileInfoInvalidate(unlogged_build_rel_info);
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
unlogged_build_cleanup();
}
else
unlogged_build_phase = UNLOGGED_BUILD_PHASE_2;
@@ -2068,10 +2142,11 @@ static void
neon_end_unlogged_build(SMgrRelation reln)
{
NRelFileInfoBackend rinfob = InfoBFromSMgrRel(reln);
NRelFileInfo rinfo = InfoFromSMgrRel(reln);
Assert(RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)));
Assert(RelFileInfoEquals(unlogged_build_rel_info, rinfo));
ereport(SmgrTrace,
ereport(DEBUG1,
(errmsg(NEON_TAG "ending unlogged build of relation %u/%u/%u",
RelFileInfoFmt(unlogged_build_rel_info))));
@@ -2095,21 +2170,26 @@ neon_end_unlogged_build(SMgrRelation reln)
recptr = GetXLogInsertRecPtr();
neon_set_lwlsn_block_range(recptr,
InfoFromNInfoB(rinfob),
rinfo,
MAIN_FORKNUM, 0, nblocks);
neon_set_lwlsn_relation(recptr,
InfoFromNInfoB(rinfob),
rinfo,
MAIN_FORKNUM);
/* Obtain exclusive lock to prevent concurrent writes to the file while we perform cleanup */
LWLockAcquire(finish_unlogged_build_lock, LW_EXCLUSIVE);
unlogged_build_rel_entry->relperst = NEON_RELPERSISTENCE_PERMANENT;
LWLockRelease(finish_unlogged_build_lock);
/* Remove local copy */
for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
{
neon_log(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u",
RelFileInfoFmt(InfoFromNInfoB(rinfob)),
RelFileInfoFmt(rinfo),
forknum);
forget_cached_relsize(InfoFromNInfoB(rinfob), forknum);
lfc_invalidate(InfoFromNInfoB(rinfob), forknum, nblocks);
forget_cached_relsize(rinfo, forknum);
lfc_invalidate(rinfo, forknum, nblocks);
mdclose(reln, forknum);
if (!debug_compare_local)
@@ -2121,8 +2201,7 @@ neon_end_unlogged_build(SMgrRelation reln)
if (debug_compare_local)
mdunlink(rinfob, INIT_FORKNUM, true);
}
NRelFileInfoInvalidate(unlogged_build_rel_info);
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
unlogged_build_cleanup();
}
#define STRPREFIX(str, prefix) (strncmp(str, prefix, strlen(prefix)) == 0)
@@ -2194,8 +2273,7 @@ AtEOXact_neon(XactEvent event, void *arg)
* Forget about any build we might have had in progress. The local
* file will be unlinked by smgrDoPendingDeletes()
*/
NRelFileInfoInvalidate(unlogged_build_rel_info);
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
unlogged_build_cleanup();
break;
case XACT_EVENT_COMMIT:
@@ -2206,8 +2284,7 @@ AtEOXact_neon(XactEvent event, void *arg)
case XACT_EVENT_PRE_PREPARE:
if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
{
NRelFileInfoInvalidate(unlogged_build_rel_info);
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
unlogged_build_cleanup();
ereport(ERROR,
(errcode(ERRCODE_INTERNAL_ERROR),
(errmsg(NEON_TAG "unlogged index build was not properly finished"))));

292
pgxn/neon/relperst_cache.c Normal file
View File

@@ -0,0 +1,292 @@
/*-------------------------------------------------------------------------
*
* relperst_cache.c
* Cache to track the relpersistence of relations
*
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "neon.h"
#include "miscadmin.h"
#include "neon_pgversioncompat.h"
#include "pagestore_client.h"
#include RELFILEINFO_HDR
#include "storage/smgr.h"
#include "storage/lwlock.h"
#include "storage/ipc.h"
#include "storage/shmem.h"
#include "catalog/pg_tablespace_d.h"
#include "utils/dynahash.h"
#include "utils/guc.h"
#include "miscadmin.h"
/*
* The main goal of this cache is to avoid repeated calls of mdexists in neon_write,
* which is needed to distinguish unlogged relations.
* It has a fixed size, implementing eviction with the LRU algorithm.
*
* This hash is also used to mark a relation during an unlogged build.
* Relations involved in unlogged build are pinned in the cache and never evicted. (Relying
* on the fact that the number of concurrent unlogged builds is small). Evicting a page
* belonging to an unlogged build involves an extra locking step to eliminate a race condition
* between unlogged build completing and deleted the local file, at the same time that
* another backend is evicting a page belonging to it. See how `finish_unlogged_build_lock`
* is used in `neon_write`
*/
typedef struct
{
size_t size;
uint64 hits;
uint64 misses;
uint64 pinned;
dlist_head lru; /* double linked list for LRU replacement
* algorithm */
} NeonRelPersistenceHashControl;
/*
* Size of a cache entry is 32 bytes. So this default will take about 0.5 MB,
* which seems reasonable.
*/
#define DEFAULT_RELPERST_HASH_SIZE (16 * 1024)
#define MAX_RELPERST_HASH_SIZE (1024 * 1024)
static HTAB *relperst_hash;
static int relperst_hash_size = DEFAULT_RELPERST_HASH_SIZE;
static NeonRelPersistenceHashControl* relperst_ctl;
/* Protects unlogged build completing while another backend is writing to it */
LWLockId finish_unlogged_build_lock;
/* Protects 'relperst_hash' */
static LWLockId relperst_hash_lock;
/*
* Shared memory registration
*/
void
RelperstCacheShmemRequest(void)
{
RequestAddinShmemSpace(sizeof(NeonRelPersistenceHashControl) + hash_estimate_size(relperst_hash_size, sizeof(NeonRelPersistenceEntry)));
RequestNamedLWLockTranche("neon_relperst", 2);
}
/*
* Initialize shared memory
*/
void
RelperstCacheShmemInit(void)
{
static HASHCTL info;
bool found;
relperst_ctl = (NeonRelPersistenceHashControl *) ShmemInitStruct("relperst_hash", sizeof(NeonRelPersistenceHashControl), &found);
if (!found)
{
/*
* In the worst case, the hash needs to be large enough for the case that all backends are performing an unlogged index build at the same time.
* Or actually twice that, because while performing an unlogged index build, each backend can also be trying to write out a page for another
* relation and hence hold one more entry in the cache pinned. Use MaxConnections instead of MaxBackends because only normal backends can perform unlogged build.
*/
size_t hash_size = Max(2 * MaxConnections, relperst_hash_size);
relperst_hash_lock = (LWLockId) GetNamedLWLockTranche("neon_relperst");
finish_unlogged_build_lock = (LWLockId)(GetNamedLWLockTranche("neon_relperst") + 1);
info.keysize = sizeof(NRelFileInfo);
info.entrysize = sizeof(NeonRelPersistenceEntry);
relperst_hash = ShmemInitHash("neon_relperst",
hash_size, hash_size,
&info,
HASH_ELEM | HASH_BLOBS);
relperst_ctl->size = 0;
relperst_ctl->hits = 0;
relperst_ctl->misses = 0;
relperst_ctl->pinned = 0;
dlist_init(&relperst_ctl->lru);
}
}
/*
* Lookup existing entry or create a new one
*/
static NeonRelPersistenceEntry*
get_pinned_entry(NRelFileInfo rinfo)
{
bool found;
NeonRelPersistenceEntry* entry = hash_search(relperst_hash, &rinfo, HASH_ENTER_NULL, &found);
if (entry == NULL)
{
if (dlist_is_empty(&relperst_ctl->lru))
{
/* Cannot happen, because we size the hash table to be large enough for the worst case */
neon_log(PANIC, "No unpinned relperst entries");
}
else
{
/*
* Remove least recently used element from the hash.
*/
NeonRelPersistenceEntry *victim = dlist_container(NeonRelPersistenceEntry, lru_node, dlist_pop_head_node(&relperst_ctl->lru));
Assert(victim->access_count == 0);
hash_search(relperst_hash, &victim->rel, HASH_REMOVE, &found);
Assert(found);
Assert(relperst_ctl->size > 0);
relperst_ctl->size -= 1;
}
entry = hash_search(relperst_hash, &rinfo, HASH_ENTER_NULL, &found);
Assert(!found);
}
if (!found)
{
/* the caller will fill this in by calling set_cached_relperst() later */
entry->relperst = NEON_RELPERSISTENCE_UNKNOWN;
relperst_ctl->pinned += 1;
entry->access_count = 1;
relperst_ctl->size += 1;
}
else if (entry->access_count++ == 0)
{
dlist_delete(&entry->lru_node);
relperst_ctl->pinned += 1;
}
return entry;
}
/*
* Unpin entry and place it at the end of LRU list
*/
static void
unpin_entry(NeonRelPersistenceEntry *entry)
{
Assert(entry->access_count != 0);
if (--entry->access_count == 0)
{
Assert(relperst_ctl->pinned != 0);
relperst_ctl->pinned -= 1;
dlist_push_tail(&relperst_ctl->lru, &entry->lru_node);
}
}
/*
* Get existed or intialize new entry. This function is used by neon_start_unlogged_build to mark relation involved in unlogged build.
* In case of overflow removes least recently used entry.
* Return pinned entry. It will be released by unpin_cached_relperst at the end of unlogged build.
*/
NeonRelPersistenceEntry*
pin_cached_relperst(NRelFileInfo rinfo, NeonRelPersistence relperst)
{
NeonRelPersistenceEntry *entry;
LWLockAcquire(relperst_hash_lock, LW_EXCLUSIVE);
entry = get_pinned_entry(rinfo);
entry->relperst = relperst;
LWLockRelease(relperst_hash_lock);
return entry;
}
/*
* Lookup entry or create new one if not exists. This function is called by neon_write to detenmine if changes should be written to the local disk.
* In case of overflow removes least recently used entry.
* If relation in involved in unlogged build, the caller should obtain shared lock on `finish_unlogged_build_lock` and recheck
* state under lock.
*/
NeonRelPersistence
get_cached_relperst(NRelFileInfo rinfo)
{
NeonRelPersistenceEntry *entry;
NeonRelPersistence relperst = NEON_RELPERSISTENCE_UNKNOWN;
/* we don't modify the hash table, but need an exclusive lock to manipulate the LRU list */
LWLockAcquire(relperst_hash_lock, LW_EXCLUSIVE);
entry = hash_search(relperst_hash, &rinfo, HASH_FIND, NULL);
if (entry != NULL)
{
/* Do pin+unpin entry to move it to the end of LRU list */
if (entry->access_count++ == 0)
{
dlist_delete(&entry->lru_node);
relperst_ctl->pinned += 1;
}
relperst = entry->relperst;
unpin_entry(entry);
}
LWLockRelease(relperst_hash_lock);
return relperst;
}
/*
* Store relation kind as a result of mdexists check.
*/
void
set_cached_relperst(NRelFileInfo rinfo, NeonRelPersistence relperst)
{
NeonRelPersistenceEntry *entry;
LWLockAcquire(relperst_hash_lock, LW_EXCLUSIVE);
/* Do pin+unpin entry to move it to the end of LRU list */
entry = get_pinned_entry(rinfo);
Assert(entry->relperst == NEON_RELPERSISTENCE_UNKNOWN || entry->relperst == relperst);
entry->relperst = relperst;
unpin_entry(entry);
LWLockRelease(relperst_hash_lock);
}
/* Release a pin that was acquired earlier with pin_cached_relperst() */
void
unpin_cached_relperst(NeonRelPersistenceEntry* entry)
{
if (entry)
{
LWLockAcquire(relperst_hash_lock, LW_EXCLUSIVE);
unpin_entry(entry);
LWLockRelease(relperst_hash_lock);
}
}
void
forget_cached_relperst(NRelFileInfo rinfo)
{
NeonRelPersistenceEntry *entry;
LWLockAcquire(relperst_hash_lock, LW_EXCLUSIVE);
entry = hash_search(relperst_hash, &rinfo, HASH_REMOVE, NULL);
if (entry)
{
Assert(entry->access_count == 0);
dlist_delete(&entry->lru_node);
relperst_ctl->size -= 1;
}
LWLockRelease(relperst_hash_lock);
}
void
relperst_hash_init(void)
{
DefineCustomIntVariable("neon.relperst_hash_size",
"Sets the maximum number of cached relation persistence for neon",
NULL,
&relperst_hash_size,
DEFAULT_RELPERST_HASH_SIZE,
1,
MAX_RELPERST_HASH_SIZE,
PGC_POSTMASTER,
0,
NULL, NULL, NULL);
}

View File

@@ -0,0 +1,65 @@
from __future__ import annotations
import threading
from contextlib import closing
from typing import TYPE_CHECKING
import pytest
from pytest_lazyfixture import lazy_fixture
if TYPE_CHECKING:
from fixtures.compare_fixtures import PgCompare
#
# This test demonstrates effect of relpersistence cache. Postgres doesn't store relation persistence in shared buffer tag.
# It means that if page is evicted from shared buffers and relation is not in the backend's relation cache, then persistence=0 (auto) is used.
# For vanilla Postgres it is not important, because in both cases we need to write changes to the file.
# In Neon, neon_write does nothing nothing for a permanent relation, while for an unlogged relation, it writes the page to the local file.
# Originally Neon always called `mdexists` to check if the local file exists and determine if it's an unlogged relation. Now we check the cache first.
# mdexists is not so cheap: it closes and opens the file.
#
# This test tries to recreate the situation that most of writes are with persistence=0.
# We open multiple connections to the database and in each fill its own table. So each backends writes only its own table and other table's
# descriptors are not cached. At the same time all backends perform eviction from shared buffers. Probability that backends evicts page of its own
# relation is 1/N when N is number of relations=number of backends. The more relations, the smaller probability.
# For large enough number of relations most of writes are with unknown persistence.
#
# On Linux, introducing the relpersistence cache shows about 2x time speed improvement in this test.
#
@pytest.mark.timeout(1000)
@pytest.mark.parametrize(
"env",
[
pytest.param(lazy_fixture("neon_compare"), id="neon"),
pytest.param(lazy_fixture("vanilla_compare"), id="vanilla"),
],
)
def test_unlogged(env: PgCompare):
n_tables = 20
n_records = 1000
n_updates = 1000
with env.record_duration("insert"):
with closing(env.pg.connect()) as conn:
with conn.cursor() as cur:
for i in range(n_tables):
cur.execute(
f"create unlogged table t{i}(pk integer primary key, sk integer, fillter text default repeat('x', 1000)) with (fillfactor=10)"
)
cur.execute(f"insert into t{i} values (generate_series(1,{n_records}),0)")
def do_updates(table_id: int):
with closing(env.pg.connect()) as conn:
with conn.cursor() as cur:
for _ in range(n_updates):
cur.execute(f"update t{table_id} set sk=sk+1")
with env.record_duration("update"):
threads = [threading.Thread(target=do_updates, args=(i,)) for i in range(n_tables)]
for thread in threads:
thread.start()
for thread in threads:
thread.join()

View File

@@ -0,0 +1,83 @@
from __future__ import annotations
import threading
from typing import TYPE_CHECKING
import pytest
from fixtures.log_helper import log
from fixtures.utils import query_scalar, skip_in_debug_build
if TYPE_CHECKING:
from fixtures.neon_fixtures import NeonEnvBuilder
@pytest.mark.timeout(600)
@skip_in_debug_build("this test is slow so only run with release build")
def test_unlogged_build(neon_env_builder: NeonEnvBuilder):
"""
Check for race conditions between end of unlogged build and backends
evicting pages of this index. We used to have a race condition where a
backend completed unlogged build and removed local files, while another
backend is evicting a page belonging to the newly-built index and tries to
write to the file that's removed.
"""
n_connections = 4
# A small shared_buffers setting forces backends to evict more pages from
# the buffer cache during the build, which is needed to expose the bug with
# concurrent eviction.
shared_buffers = "128MB"
env = neon_env_builder.init_start()
endpoint = env.endpoints.create_start(
"main", config_lines=[f"shared_buffers='{shared_buffers}'"]
)
def unlogged_build(i: int):
con = endpoint.connect()
cur = con.cursor()
cur.execute("set statement_timeout=0")
cur.execute(f"CREATE TABLE ginarray_tbl_{i} (arr text[])")
cur.execute(
f"INSERT INTO ginarray_tbl_{i} SELECT array_agg(repeat((x*1000000 + y)::text, 40)) FROM generate_series(1, 100) x, generate_series(1, 200 * 100) y group by y"
)
cur.execute("SET log_min_messages='debug1'")
cur.execute(f"CREATE INDEX ginarray_tbl_idx_{i} ON ginarray_tbl_{i} USING gin(arr)")
cur.execute("RESET log_min_messages")
# Check that the index is larger than 1 GB, so that it doesn't fit into
# a single segment file. Otherwise, the race condition is avoided,
# because when the backend evicts and writes out the page, it already
# has the file descriptor opened and the write succeeds even if the
# underlying file is deleted concurrently.
assert (
query_scalar(cur, f"select pg_relation_size('ginarray_tbl_idx_{i}')")
> 1024 * 1024 * 1024
)
# Run a simple query that uses the index as a sanity check. (In the
# original bug, the CREATE INDEX step failed already. But since we
# already went through all the effort to build the index, might as well
# check that it also works.)
qry = f"SELECT count(*) FROM ginarray_tbl_{i} WHERE ARRAY[repeat('1012345', 40)] <@ arr"
result = query_scalar(cur, qry)
assert result == 1
# Verify that the query actually uses the index
cur.execute(f"EXPLAIN (COSTS OFF) {qry}")
rows = cur.fetchall()
plan = "\n".join(r[0] for r in rows)
log.debug(plan)
assert "Bitmap Index Scan on ginarray_tbl_idx" in plan
threads = [threading.Thread(target=unlogged_build, args=(i,)) for i in range(n_connections)]
for thread in threads:
thread.start()
for thread in threads:
thread.join()
# Sanity check that the indexes were built with the "unlogged build"
# method. GIN always uses that method currently, but if a different, more
# efficient, method is invented later, that might invalidate this test.
assert endpoint.log_contains("starting unlogged build of relation")