diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 6c726c22d9..94389521d2 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -295,12 +295,13 @@ extern void neon_immedsync(SMgrRelation reln, ForkNumber forknum); /* utils for neon relsize cache */ extern void relsize_hash_init(void); extern bool get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size); -extern void set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size); +extern bool set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber new_size, BlockNumber* old_size); extern void update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size); extern void forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum); -extern bool start_unlogged_build(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size); -extern bool is_unlogged_build(NRelFileInfo rinfo, ForkNumber forknum); +extern bool start_unlogged_build(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum, BlockNumber* relsize); +extern bool is_unlogged_build_extend(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum, BlockNumber* relsize); +extern bool is_unlogged_build(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber* relsize); extern bool stop_unlogged_build(NRelFileInfo rinfo, ForkNumber forknum); extern void resume_unlogged_build(void); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index a30250856f..1ec6391666 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -97,6 +97,8 @@ const int SmgrTrace = DEBUG5; page_server_api *page_server; +const PGAlignedBlock zero_buffer; + static bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id); static bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL; @@ -1391,6 +1393,17 @@ PageIsEmptyHeapPage(char *buffer) * A page is being evicted from the shared buffer cache. 
Update the * last-written LSN of the page, and WAL-log it if needed. */ +static void +unlogged_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber old_relsize, BlockNumber new_relsize) +{ +#if PG_MAJORVERSION_NUM < 16 + mdextend(reln, forknum, new_relsize, (char *) zero_buffer.data, true); +#else + mdzeroextend(reln, forknum, old_relsize, new_relsize - old_relsize, true); +#endif +} + + static void #if PG_MAJORVERSION_NUM < 16 neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force) @@ -1398,6 +1411,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const char *buffer, bool force) #endif { + BlockNumber relsize; XLogRecPtr lsn = PageGetLSN((Page) buffer); bool log_page; @@ -1474,12 +1488,16 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co } else if (forknum != FSM_FORKNUM && forknum != VISIBILITYMAP_FORKNUM) { - if (start_unlogged_build(InfoFromSMgrRel(reln), forknum, blocknum+1)) + if (start_unlogged_build(InfoFromSMgrRel(reln), forknum, blocknum, &relsize)) { mdcreate(reln, forknum, true); - resume_unlogged_build(); + } + if (blocknum >= relsize) + { + unlogged_extend(reln, forknum, relsize, blocknum+1); } mdwrite(reln, forknum, blocknum, buffer, true); + resume_unlogged_build(); ereport(SmgrTrace, (errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is saved locally.", @@ -1493,12 +1511,16 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co } else if (lsn < FirstNormalUnloggedLSN) { - if (start_unlogged_build(InfoFromSMgrRel(reln),forknum, blocknum+1)) + if (start_unlogged_build(InfoFromSMgrRel(reln),forknum, blocknum, &relsize)) { mdcreate(reln, forknum, true); - resume_unlogged_build(); + } + if (blocknum >= relsize) + { + unlogged_extend(reln, forknum, relsize, blocknum+1); } mdwrite(reln, forknum, blocknum, buffer, true); + 
resume_unlogged_build(); ereport(SmgrTrace, (errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is saved locally.", @@ -1508,10 +1530,15 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co } else { - if (is_unlogged_build(InfoFromSMgrRel(reln), forknum)) + if (is_unlogged_build_extend(InfoFromSMgrRel(reln), forknum, blocknum, &relsize)) { - resume_unlogged_build(); + if (blocknum >= relsize) + { + unlogged_extend(reln, forknum, relsize, blocknum+1); + } mdwrite(reln, forknum, blocknum, buffer, true); + resume_unlogged_build(); + ereport(SmgrTrace, (errmsg(NEON_TAG "Page %u with LSN=%X/%X of relation %u/%u/%u.%u is saved locally.", blocknum, @@ -2054,7 +2081,7 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) &reln->smgr_cached_nblocks[forkNum]); } else - set_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0); + set_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0, NULL); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -2114,6 +2141,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, #endif { XLogRecPtr lsn; + BlockNumber old_relsize; BlockNumber n_blocks = 0; switch (reln->smgr_relpersistence) @@ -2165,8 +2193,12 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, neon_wallog_page(reln, forkNum, n_blocks++, buffer, true); neon_wallog_page(reln, forkNum, blkno, buffer, false); - set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blkno + 1); + if (set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blkno + 1, &old_relsize)) + { + unlogged_extend(reln, forkNum, old_relsize, blkno + 1); + resume_unlogged_build(); + } lsn = PageGetLSN((Page) buffer); neon_log(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", RelFileInfoFmt(InfoFromSMgrRel(reln)), @@ -2200,8 +2232,8 @@ void neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, int nblocks, bool skipFsync) { - const PGAlignedBlock buffer = {0}; - int remblocks = nblocks; + 
BlockNumber old_relsize; + BlockNumber remblocks = nblocks; XLogRecPtr lsn = 0; switch (reln->smgr_relpersistence) @@ -2251,11 +2283,29 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, if (!XLogInsertAllowed()) return; - set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blocknum + nblocks); + if (set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blocknum + nblocks, &old_relsize)) + { + unlogged_extend(reln, forkNum, old_relsize, blocknum + nblocks); + resume_unlogged_build(); + } + + if (forkNum != MAIN_FORKNUM) /* no need to wal-log zero pages except VM/FSM forks */ + { + /* ensure we have enough xlog buffers to log max-sized records */ + XLogEnsureRecordSpace(Min(remblocks, (XLR_MAX_BLOCK_ID - 1)), 0); + } + else + { + /* + * smgr_extend is often called with an all-zeroes page, so + * lsn==InvalidXLogRecPtr. An smgr_write() call will come for the buffer + * later, after it has been initialized with the real page contents, and + * it is eventually evicted from the buffer cache. But we need a valid LSN + * to the relation metadata update now. + */ + lsn = GetXLogInsertRecPtr(); + } -#if 0 - /* ensure we have enough xlog buffers to log max-sized records */ - XLogEnsureRecordSpace(Min(remblocks, (XLR_MAX_BLOCK_ID - 1)), 0); /* * Iterate over all the pages. 
They are collected into batches of * XLR_MAX_BLOCK_ID pages, and a single WAL-record is written for each @@ -2265,17 +2315,19 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, { int count = Min(remblocks, XLR_MAX_BLOCK_ID); - XLogBeginInsert(); + if (forkNum != MAIN_FORKNUM) /* no need to wal-log zero pages except VM/FSM forks */ + { + XLogBeginInsert(); - for (int i = 0; i < count; i++) - XLogRegisterBlock(i, &InfoFromSMgrRel(reln), forkNum, blocknum + i, - (char *) buffer.data, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); - - lsn = XLogInsert(RM_XLOG_ID, XLOG_FPI); + for (int i = 0; i < count; i++) + XLogRegisterBlock(i, &InfoFromSMgrRel(reln), forkNum, blocknum + i, + (char *) zero_buffer.data, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); + lsn = XLogInsert(RM_XLOG_ID, XLOG_FPI); + } for (int i = 0; i < count; i++) { - lfc_write(InfoFromSMgrRel(reln), forkNum, blocknum + i, buffer.data); + lfc_write(InfoFromSMgrRel(reln), forkNum, blocknum + i, zero_buffer.data); SetLastWrittenLSNForBlock(lsn, InfoFromSMgrRel(reln), forkNum, blocknum + i); } @@ -2287,8 +2339,6 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, Assert(lsn != 0); SetLastWrittenLSNForRelation(lsn, InfoFromSMgrRel(reln), forkNum); - set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blocknum); -#endif } #endif @@ -2555,6 +2605,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer #endif { neon_request_lsns request_lsns; + BlockNumber relsize; switch (reln->smgr_relpersistence) { @@ -2581,9 +2632,9 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno); neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer); - if (is_unlogged_build(InfoFromSMgrRel(reln), forkNum)) + if (is_unlogged_build(InfoFromSMgrRel(reln), forkNum, &relsize)) { - if (blkno >= mdnblocks(reln, forkNum)) + if (blkno >= relsize) { 
elog(SmgrTrace, "Get empty local page %d of relation %u/%u/%u.%u", blkno, RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum); @@ -2711,11 +2762,22 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *bu neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync) #endif { + BlockNumber relsize; XLogRecPtr lsn; switch (reln->smgr_relpersistence) { case 0: + if (is_unlogged_build_extend(InfoFromSMgrRel(reln), forknum, blocknum, &relsize)) + { + if (blocknum >= relsize) + { + unlogged_extend(reln, forknum, relsize, blocknum+1); + } + mdwrite(reln, forknum, blocknum, buffer, skipFsync); + resume_unlogged_build(); + return; + } /* This is a bit tricky. Check if the relation exists locally */ if (mdexists(reln, forknum)) { @@ -2920,7 +2982,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - set_cached_relsize(InfoFromSMgrRel(reln), forknum, nblocks); + set_cached_relsize(InfoFromSMgrRel(reln), forknum, nblocks, NULL); /* * Truncating a relation drops all its buffers from the buffer cache @@ -3176,7 +3238,7 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, relsize = Max(nbresponse->n_blocks, blkno + 1); - set_cached_relsize(rinfo, forknum, relsize); + set_cached_relsize(rinfo, forknum, relsize, NULL); SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum); neon_log(SmgrTrace, "Set length to %d", relsize); diff --git a/pgxn/neon/relsize_cache.c b/pgxn/neon/relsize_cache.c index 1be335927b..47a382d9b1 100644 --- a/pgxn/neon/relsize_cache.c +++ b/pgxn/neon/relsize_cache.c @@ -134,9 +134,15 @@ get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size) return found; } -void -set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size) +/* + * Cache relation size. + * Returns true if it happens during unlogged build. 
+ * In this case the lock is not released. + */ +bool +set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber new_size, BlockNumber* old_size) { + bool unlogged = false; if (relsize_hash_size > 0) { RelTag tag; @@ -164,7 +170,11 @@ set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size) relsize_ctl->size -= 1; } } - entry->size = size; + if (old_size) + { + *old_size = found ? entry->size : 0; + } + entry->size = new_size; if (!found) { entry->unlogged = false; @@ -190,17 +200,27 @@ relsize_ctl->size += 1; } } - else if (!entry->unlogged) /* entries of relation involved in unlogged build are pinned */ + else if (entry->unlogged) /* entries of relation involved in unlogged build are pinned */ { dlist_delete(&entry->lru_node); } + if (!entry->unlogged) /* entries of relation involved in unlogged build are pinned */ { dlist_push_tail(&relsize_ctl->lru, &entry->lru_node); } + else + { + Assert(old_size); + unlogged = true; + } relsize_ctl->writes += 1; - LWLockRelease(relsize_lock); + if (!unlogged) + { + LWLockRelease(relsize_lock); + } } + return unlogged; } void @@ -292,7 +312,7 @@ forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum) * in critical section, for example right now it create relation on the disk using mdcreate */ bool -start_unlogged_build(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size) +start_unlogged_build(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum, BlockNumber* relsize) { bool start = false; if (relsize_hash_size > 0) { @@ -306,7 +326,8 @@ LWLockAcquire(relsize_lock, LW_EXCLUSIVE); entry = hash_search(relsize_hash, &tag, HASH_ENTER, &found); if (!found) { - entry->size = size; + *relsize = 0; + entry->size = blocknum + 1; start = true; if (relsize_ctl->size+1 == relsize_hash_size) @@ -330,8 +351,11 @@ 
start_unlogged_build(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size) { start = !entry->unlogged; - if (entry->size < size) - entry->size = size; + *relsize = entry->size; + if (entry->size <= blocknum) + { + entry->size = blocknum + 1; + } if (start) { @@ -346,12 +370,9 @@ start_unlogged_build(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size) * We are not putting entry in LRU least to prevent it fro eviction until the end of unlogged build */ - if (!start) - LWLockRelease(relsize_lock); + if (start) elog(LOG, "Start unlogged build for %u/%u/%u.%u", RelFileInfoFmt(rinfo), forknum); - } return start; } @@ -363,7 +384,7 @@ start_unlogged_build(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size) * It allows to read page from local file without risk that it is removed by stop_unlogged_build by some other backend. */ bool -is_unlogged_build(NRelFileInfo rinfo, ForkNumber forknum) +is_unlogged_build(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber* relsize) { bool unlogged = false; @@ -379,6 +400,62 @@ is_unlogged_build(NRelFileInfo rinfo, ForkNumber forknum) if (entry != NULL) { unlogged = entry->unlogged; + *relsize = entry->size; + relsize_ctl->hits += 1; + } + else + { + relsize_ctl->misses += 1; + } + if (!unlogged) + LWLockRelease(relsize_lock); + } + return unlogged; +} + +/* + * Check if relation is extended during unlogged build. + * If it is unlogged, true is returned and the lock on the relsize cache is held. + * It should later be released by the caller using resume_unlogged_build(). + * It allows to atomically extend the local file. 
+ */ +bool +is_unlogged_build_extend(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum, BlockNumber* relsize) +{ + bool unlogged = false; + + if (relsize_hash_size > 0) + { + RelTag tag; + RelSizeEntry *entry; + + tag.rinfo = rinfo; + tag.forknum = forknum; + + LWLockAcquire(relsize_lock, LW_SHARED); + entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL); + if (entry != NULL) + { + if (entry->size <= blocknum) + { + /* Very rare case: it can happen only if relation is thrown away from relcache before unlogged build is detected */ + /* Repeat search under exclusive lock */ + LWLockRelease(relsize_lock); + LWLockAcquire(relsize_lock, LW_EXCLUSIVE); + entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL); + if (entry == NULL) + { + relsize_ctl->misses += 1; + LWLockRelease(relsize_lock); + return false; + } + } + unlogged = entry->unlogged; + *relsize = entry->size; + if (entry->size <= blocknum) + { + entry->size = blocknum + 1; + } relsize_ctl->hits += 1; } else @@ -436,7 +513,8 @@ stop_unlogged_build(NRelFileInfo rinfo, ForkNumber forknum) void resume_unlogged_build(void) { - LWLockRelease(relsize_lock); + if (relsize_hash_size > 0) + LWLockRelease(relsize_lock); }