Compare commits

...

8 Commits

Author SHA1 Message Date
Konstantin Knizhnik
83294d771b Fix readahead distance definition 2023-06-27 10:33:24 +03:00
Konstantin Knizhnik
b3ef6c7bf5 Add readahead_distance GUC 2023-06-22 12:35:50 +03:00
Konstantin Knizhnik
f88ff9f3c6 Do not free simple hash entries 2023-06-21 18:23:06 +03:00
Konstantin Knizhnik
593c4244fd Add missed access_stat_init() call 2023-06-21 15:04:47 +03:00
Konstantin Knizhnik
d7aa36c4c0 Fix mistyping in get_relation_access_statistics function name 2023-06-21 14:29:15 +03:00
Konstantin Knizhnik
df127ef209 Fix GUC definition for access statistic 2023-06-21 09:25:25 +03:00
Konstantin Knizhnik
72a73d2c82 Fix update of current block in access statistic 2023-06-21 09:04:45 +03:00
Konstantin Knizhnik
172239c7ee Implement relation access statistic to be able to detect sequential access and use prefetch in this case 2023-06-21 08:58:13 +03:00
6 changed files with 310 additions and 2 deletions

View File

@@ -8,6 +8,7 @@ OBJS = \
libpagestore.o \
libpqwalproposer.o \
neon.o \
access_stat.o \
pagestore_smgr.o \
relsize_cache.o \
walproposer.o \

274
pgxn/neon/access_stat.c Normal file
View File

@@ -0,0 +1,274 @@
/*
* We want this statistic to rpresent current access patern mthis is why when
* (n_seq_accesses + n_rnd_accesses) > MAX_ACCESS_COUNTER then we divide both counters by two,
* so decreasng weight of historical data
*/
#include "postgres.h"
#include "funcapi.h"
#include "miscadmin.h"
#include "common/hashfn.h"
#include "pagestore_client.h"
#include "storage/relfilenode.h"
#include "utils/guc.h"
/* Structure used to predict sequential access */
typedef struct AccessStatEntry {
RelFileNode relnode;
BlockNumber blkno; /* last accessed black number */
uint32 n_seq_accesses; /* number of sequential accesses (when block N+1 is accessed after block N) */
uint32 n_rnd_accesses; /* number of random accesses */
uint32 hash;
uint32 status;
uint64 access_count; /* total number of relation accesses since backend start */
dlist_node lru_node; /* LRU list node */
} AccessStatEntry;
#define SH_PREFIX as
#define SH_ELEMENT_TYPE AccessStatEntry
#define SH_KEY_TYPE RelFileNode
#define SH_KEY relnode
#define SH_STORE_HASH
#define SH_GET_HASH(tb, a) ((a)->hash)
#define SH_HASH_KEY(tb, key) hash_bytes( \
((const unsigned char *) &(key)), \
sizeof(RelFileNode) \
)
#define SH_EQUAL(tb, a, b) RelFileNodeEquals((a), (b))
#define SH_SCOPE static inline
#define SH_DEFINE
#define SH_DECLARE
#include "lib/simplehash.h"
static as_hash *hash;
static dlist_head lru;
static int max_access_stat_size;
static int max_access_stat_count;
static double min_seq_access_ratio;
static int min_seq_access_count;
void access_stat_init(void)
{
MemoryContext memctx = AllocSetContextCreate(TopMemoryContext,
"NeonSMGR/access_stat",
ALLOCSET_DEFAULT_SIZES);
DefineCustomIntVariable("neon.max_access_stat_size",
"Maximal size of Neon relation access statistic hash",
NULL,
&max_access_stat_size,
1024,
0,
INT_MAX,
PGC_USERSET,
0,
NULL,
NULL,
NULL);
DefineCustomIntVariable("neon.max_access_stat_count",
"Maximal value of relation access counter after which counters are divided by 2",
NULL,
&max_access_stat_count,
1024,
0,
INT_MAX,
PGC_USERSET,
0,
NULL,
NULL,
NULL);
DefineCustomRealVariable("neon.min_seq_access_ratio",
"Minimal seq/(rnd+seq) ratio to determine sequential access",
NULL,
&min_seq_access_ratio,
0.9,
0,
INT_MAX,
PGC_USERSET,
0,
NULL,
NULL,
NULL);
DefineCustomIntVariable("neon.min_seq_access_count",
"Minimal access count to determine sequetial access",
NULL,
&min_seq_access_count,
10,
0,
INT_MAX,
PGC_USERSET,
0,
NULL,
NULL,
NULL);
hash = as_create(memctx, max_access_stat_size, NULL);
dlist_init(&lru);
}
bool is_sequential_access(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno)
{
bool is_seq_access = false;
if (forkNum == MAIN_FORKNUM /* prefetch makes sense only for main fork */
&& max_access_stat_size != 0)
{
AccessStatEntry* entry = as_lookup(hash, rnode);
if (entry == NULL)
{
bool found;
/* New item */
while (hash->members >= max_access_stat_size)
{
/* Hash overflow: find candidate for replacement */
AccessStatEntry* victim = dlist_container(AccessStatEntry, lru_node, dlist_pop_head_node(&lru));
as_delete_item(hash, victim);
}
entry = as_insert(hash, rnode, &found);
Assert(!found);
/* Set both counter to zero because we don't know whethr first access is sequential or random */
entry->n_seq_accesses = 0;
entry->n_rnd_accesses = 0;
entry->access_count = 1;
}
else
{
uint32 access_count = entry->n_seq_accesses + entry->n_rnd_accesses;
/*
* We want this function to represent most recent access pattern,
* so when number of accesses exceed threashold value `max_access_stat_count`
* we divide bother coutners by two devaluing old data
*/
if (access_count >= max_access_stat_count)
{
entry->n_seq_accesses >>= 1;
entry->n_rnd_accesses >>= 1;
}
if (entry->blkno+1 == blkno)
entry->n_seq_accesses += 1;
else
entry->n_rnd_accesses += 1;
entry->access_count += 1;
access_count = entry->n_seq_accesses + entry->n_rnd_accesses;
is_seq_access = access_count >= min_seq_access_count
&& (double)entry->n_seq_accesses / access_count >= min_seq_access_ratio;
/* Remove entry from LRU list tobe able to insert it to the end of this list */
dlist_delete(&entry->lru_node);
}
/* Place entry to the tail of LRU list */
dlist_push_tail(&lru, &entry->lru_node);
entry->blkno = blkno;
}
return is_seq_access;
}
/*
* Get relation access pattern
*/
PG_FUNCTION_INFO_V1(get_relation_access_statistics);
typedef struct
{
TupleDesc tupdesc;
dlist_node* curr;
} AccessStatContext;
#define NUM_ACCESS_STAT_COLUMNS 6
Datum
get_relation_access_statistics(PG_FUNCTION_ARGS)
{
FuncCallContext *funcctx;
Datum result;
MemoryContext oldcontext;
AccessStatContext *fctx; /* User function context. */
TupleDesc tupledesc;
TupleDesc expected_tupledesc;
HeapTuple tuple;
if (SRF_IS_FIRSTCALL())
{
funcctx = SRF_FIRSTCALL_INIT();
/* Switch context when allocating stuff to be used in later calls */
oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
/* Create a user function context for cross-call persistence */
fctx = (AccessStatContext *) palloc(sizeof(AccessStatContext));
/*
* To smoothly support upgrades from version 1.0 of this extension
* transparently handle the (non-)existence of the pinning_backends
* column. We unfortunately have to get the result type for that... -
* we can't use the result type determined by the function definition
* without potentially crashing when somebody uses the old (or even
* wrong) function definition though.
*/
if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
elog(ERROR, "return type must be a row type");
if (expected_tupledesc->natts != NUM_ACCESS_STAT_COLUMNS)
elog(ERROR, "incorrect number of output arguments");
/* Construct a tuple descriptor for the result rows. */
tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
TupleDescInitEntry(tupledesc, (AttrNumber) 1, "relfilenode",
OIDOID, -1, 0);
TupleDescInitEntry(tupledesc, (AttrNumber) 2, "reltablespace",
OIDOID, -1, 0);
TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reldatabase",
OIDOID, -1, 0);
TupleDescInitEntry(tupledesc, (AttrNumber) 4, "seqaccess",
INT4OID, -1, 0);
TupleDescInitEntry(tupledesc, (AttrNumber) 5, "rndaccess",
INT4OID, -1, 0);
TupleDescInitEntry(tupledesc, (AttrNumber) 6, "accesscnt",
INT8OID, -1, 0);
fctx->tupdesc = BlessTupleDesc(tupledesc);
fctx->curr = dlist_is_empty(&lru) ? NULL : dlist_tail_node(&lru);
/* Set max calls and remember the user function context. */
funcctx->max_calls = hash->members;
funcctx->user_fctx = fctx;
/* Return to original context when allocating transient memory */
MemoryContextSwitchTo(oldcontext);
}
funcctx = SRF_PERCALL_SETUP();
/* Get the saved state */
fctx = funcctx->user_fctx;
if (fctx->curr)
{
AccessStatEntry* entry = dlist_container(AccessStatEntry, lru_node, fctx->curr);
Datum values[NUM_ACCESS_STAT_COLUMNS];
bool nulls[NUM_ACCESS_STAT_COLUMNS] = {
false, false, false, false, false, false
};
values[0] = ObjectIdGetDatum(entry->relnode.relNode);
values[1] = ObjectIdGetDatum(entry->relnode.spcNode);
values[2] = ObjectIdGetDatum(entry->relnode.dbNode);
values[3] = Int32GetDatum(entry->n_seq_accesses);
values[4] = Int32GetDatum(entry->n_rnd_accesses);
values[5] = Int64GetDatum(entry->access_count);
/* Build and return the tuple. */
tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
result = HeapTupleGetDatum(tuple);
fctx->curr = dlist_has_prev(&lru, fctx->curr) ? dlist_prev_node(&lru, fctx->curr) : NULL;
SRF_RETURN_NEXT(funcctx, result);
}
else
SRF_RETURN_DONE(funcctx);
}

View File

@@ -58,6 +58,7 @@ char *neon_auth_token;
int n_unflushed_requests = 0;
int flush_every_n_requests = 8;
int readahead_buffer_size = 128;
int readahead_distance = 10;
bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
@@ -452,6 +453,18 @@ pg_init_libpagestore(void)
PGC_USERSET,
0, /* no flags required */
NULL, (GucIntAssignHook) &readahead_buffer_resize, NULL);
DefineCustomIntVariable("neon.readahead_distance",
"Number of read-ahead blocks",
NULL,
&readahead_distance,
10,
0,
INT_MAX,
PGC_USERSET,
0,
NULL,
NULL,
NULL);
relsize_hash_init();
@@ -476,4 +489,5 @@ pg_init_libpagestore(void)
redo_read_buffer_filter = neon_redo_read_buffer_filter;
}
lfc_init();
access_stat_init();
}

View File

@@ -27,8 +27,18 @@ RETURNS SETOF RECORD
AS 'MODULE_PATHNAME', 'local_cache_pages'
LANGUAGE C PARALLEL SAFE;
CREATE FUNCTION get_relation_access_statistics()
RETURNS SETOF RECORD
AS 'MODULE_PATHNAME', 'get_relation_access_statistics'
LANGUAGE C PARALLEL SAFE;
-- Create a view for convenient access.
CREATE VIEW local_cache AS
SELECT P.* FROM local_cache_pages() AS P
SELECT relname,P.* FROM local_cache_pages() AS P
(pageoffs int8, relfilenode oid, reltablespace oid, reldatabase oid,
relforknumber int2, relblocknumber int8, accesscount int4);
relforknumber int2, relblocknumber int8, accesscount int4) JOIN pg_class pc ON (P.relfilenode = pc.relfilenode);
CREATE VIEW relation_access_statistics AS
SELECT relname,P.* FROM get_relation_access_statistics() AS P
(relfilenode oid, reltablespace oid, reldatabase oid,
seqaccess int4, rndaccess int4, access_count int8) JOIN pg_class pc ON (P.relfilenode = pc.relfilenode);

View File

@@ -157,6 +157,7 @@ extern page_server_api * page_server;
extern char *page_server_connstring;
extern int flush_every_n_requests;
extern int readahead_buffer_size;
extern int readahead_distance;
extern bool seqscan_prefetch_enabled;
extern int seqscan_prefetch_distance;
extern char *neon_timeline;
@@ -210,5 +211,8 @@ extern bool lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumbe
extern void lfc_evict(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno);
extern void lfc_init(void);
/* Access statistic */
extern void access_stat_init(void);
extern bool is_sequential_access(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno);
#endif

View File

@@ -1881,6 +1881,7 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
if (RecoveryInProgress() && !(MyBackendType == B_STARTUP))
XLogWaitForReplayOf(request_lsn);
/*
* Try to find prefetched page in the list of received pages.
*/
@@ -2003,6 +2004,10 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
/* If it is expected to be sequential access then initiate prefetch of next block */
if (is_sequential_access(reln->smgr_rnode.node, forkNum, blkno))
neon_prefetch(reln, forkNum, blkno + readahead_distance);
/* Try to read from local file cache */
if (lfc_read(reln->smgr_rnode.node, forkNum, blkno, buffer))
{