mirror of
https://github.com/neondatabase/neon.git
synced 2026-02-26 05:50:37 +00:00
Compare commits
10 Commits
conrad/ref
...
heikki/hac
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7b5e3d6d40 | ||
|
|
e086568e21 | ||
|
|
7b818f8d64 | ||
|
|
14fefd261f | ||
|
|
01abd4afc5 | ||
|
|
c8541ad29f | ||
|
|
eaad1db9f0 | ||
|
|
6ddcf68829 | ||
|
|
d701f8285c | ||
|
|
77082a0f63 |
@@ -15,6 +15,10 @@ commands:
|
||||
user: root
|
||||
sysvInitAction: sysinit
|
||||
shell: 'chmod 711 /neonvm/bin/set-disk-quota'
|
||||
- name: chmod-clockcache_dev
|
||||
user: root
|
||||
sysvInitAction: sysinit
|
||||
shell: 'chmod 777 /dev/clockcache_dev' # FIXME: not very secure
|
||||
- name: pgbouncer
|
||||
user: postgres
|
||||
sysvInitAction: respawn
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
#include "postgres.h"
|
||||
|
||||
#include <sys/file.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
@@ -52,6 +53,10 @@
|
||||
#include "pagestore_client.h"
|
||||
#include "communicator.h"
|
||||
|
||||
/* For the kernel module */
|
||||
#include "neon_pagecache.h"
|
||||
#define CLOCKCACHE_DEV_PATH "/dev/clockcache_dev"
|
||||
|
||||
#define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "LFC: assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0)
|
||||
|
||||
/*
|
||||
@@ -159,6 +164,13 @@ typedef struct FileCacheControl
|
||||
uint64 time_write; /* time spent writing (us) */
|
||||
uint64 resizes; /* number of LFC resizes */
|
||||
uint64 evicted_pages; /* number of evicted pages */
|
||||
|
||||
/* FIXME: should make these atomic, they're not protected by any locks */
|
||||
uint64 kernel_module_read_hits; /* success returns from read ioctl */
|
||||
uint64 kernel_module_read_misses; /* ENOENT returns from read ioctl */
|
||||
uint64 kernel_module_write_hits; /* success returns from write ioctl */
|
||||
uint64 kernel_module_write_misses; /* ENOMEM returns from write ioctl */
|
||||
|
||||
dlist_head lru; /* double linked list for LRU replacement
|
||||
* algorithm */
|
||||
dlist_head holes; /* double linked list of punched holes */
|
||||
@@ -183,6 +195,7 @@ typedef struct FileCacheControl
|
||||
static HTAB *lfc_hash;
|
||||
static int lfc_desc = -1;
|
||||
static LWLockId lfc_lock;
|
||||
|
||||
static int lfc_max_size;
|
||||
static int lfc_size_limit;
|
||||
static int lfc_prewarm_limit;
|
||||
@@ -190,6 +203,8 @@ static int lfc_prewarm_batch;
|
||||
static int lfc_chunk_size_log = MAX_BLOCKS_PER_CHUNK_LOG;
|
||||
static int lfc_blocks_per_chunk = MAX_BLOCKS_PER_CHUNK;
|
||||
static char *lfc_path;
|
||||
static bool lfc_use_kernel_module;
|
||||
|
||||
static uint64 lfc_generation;
|
||||
static FileCacheControl *lfc_ctl;
|
||||
static bool lfc_do_prewarm;
|
||||
@@ -203,6 +218,9 @@ bool lfc_prewarm_update_ws_estimation;
|
||||
|
||||
#define LFC_ENABLED() (lfc_ctl->limit != 0)
|
||||
|
||||
static int pread_with_ioctl(void *buffer, uint64 blkno);
|
||||
static int pwrite_with_ioctl(const void *buffer, uint64 blkno);
|
||||
|
||||
/*
|
||||
* Close LFC file if opened.
|
||||
* All backends should close their LFC files once LFC is disabled.
|
||||
@@ -251,14 +269,19 @@ lfc_switch_off(void)
|
||||
/*
|
||||
* We need to use unlink to to avoid races in LFC write, because it is not
|
||||
* protected by lock
|
||||
*
|
||||
* FIXME: how to clean up the kernel module device on trouble?
|
||||
*/
|
||||
unlink(lfc_path);
|
||||
if (!lfc_use_kernel_module)
|
||||
{
|
||||
unlink(lfc_path);
|
||||
|
||||
fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
|
||||
if (fd < 0)
|
||||
elog(WARNING, "LFC: failed to recreate local file cache %s: %m", lfc_path);
|
||||
else
|
||||
close(fd);
|
||||
fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
|
||||
if (fd < 0)
|
||||
elog(WARNING, "LFC: failed to recreate local file cache %s: %m", lfc_path);
|
||||
else
|
||||
close(fd);
|
||||
}
|
||||
|
||||
/* Wakeup waiting backends */
|
||||
for (int i = 0; i < N_COND_VARS; i++)
|
||||
@@ -270,7 +293,8 @@ lfc_switch_off(void)
|
||||
static void
|
||||
lfc_disable(char const *op)
|
||||
{
|
||||
elog(WARNING, "LFC: failed to %s local file cache at %s: %m, disabling local file cache", op, lfc_path);
|
||||
elog(WARNING, "LFC: failed to %s local file cache at %s: %m, disabling local file cache",
|
||||
op, lfc_use_kernel_module ? CLOCKCACHE_DEV_PATH : lfc_path);
|
||||
|
||||
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
|
||||
lfc_switch_off();
|
||||
@@ -301,7 +325,9 @@ lfc_ensure_opened(void)
|
||||
/* Open cache file if not done yet */
|
||||
if (lfc_desc < 0)
|
||||
{
|
||||
lfc_desc = BasicOpenFile(lfc_path, O_RDWR);
|
||||
lfc_desc = BasicOpenFile(
|
||||
lfc_use_kernel_module ? CLOCKCACHE_DEV_PATH : lfc_path,
|
||||
O_RDWR);
|
||||
|
||||
if (lfc_desc < 0)
|
||||
{
|
||||
@@ -351,10 +377,16 @@ lfc_shmem_startup(void)
|
||||
initSHLL(&lfc_ctl->wss_estimation);
|
||||
|
||||
/* Recreate file cache on restart */
|
||||
fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
|
||||
if (lfc_use_kernel_module)
|
||||
fd = BasicOpenFile(CLOCKCACHE_DEV_PATH, O_RDWR);
|
||||
else
|
||||
fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
|
||||
if (fd < 0)
|
||||
{
|
||||
elog(WARNING, "LFC: failed to create local file cache %s: %m", lfc_path);
|
||||
if (lfc_use_kernel_module)
|
||||
elog(WARNING, "LFC: failed to open " CLOCKCACHE_DEV_PATH ": %m");
|
||||
else
|
||||
elog(WARNING, "LFC: failed to create local file cache %s: %m", lfc_path);
|
||||
lfc_ctl->limit = 0;
|
||||
}
|
||||
else
|
||||
@@ -613,6 +645,15 @@ lfc_init(void)
|
||||
NULL,
|
||||
NULL);
|
||||
|
||||
DefineCustomBoolVariable("neon.use_kernel_module",
|
||||
"Use neon_pagecache kernel module instead of a regular file (EXPERIMENTAL)",
|
||||
NULL,
|
||||
&lfc_use_kernel_module,
|
||||
true,
|
||||
PGC_POSTMASTER,
|
||||
0,
|
||||
NULL, NULL, NULL);
|
||||
|
||||
if (lfc_max_size == 0)
|
||||
return;
|
||||
|
||||
@@ -1297,27 +1338,57 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
/* offset of first IOV */
|
||||
first_read_offset += chunk_offs + first_block_in_chunk_read;
|
||||
|
||||
pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_READ);
|
||||
|
||||
/* Read only the blocks we're interested in, limiting */
|
||||
rc = preadv(lfc_desc, &iov[first_block_in_chunk_read],
|
||||
nwrite, first_read_offset * BLCKSZ);
|
||||
pgstat_report_wait_end();
|
||||
|
||||
if (rc != (BLCKSZ * nwrite))
|
||||
if (lfc_use_kernel_module)
|
||||
{
|
||||
lfc_disable("read");
|
||||
return -1;
|
||||
pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_READ);
|
||||
for (int i = first_block_in_chunk_read; i < iov_last_used; i++)
|
||||
{
|
||||
if (!BITMAP_ISSET(chunk_mask, i))
|
||||
continue;
|
||||
Assert(iov[i].iov_len == BLCKSZ);
|
||||
rc = pread_with_ioctl(iov[i].iov_base, first_read_offset + i - first_block_in_chunk_read);
|
||||
if (rc < 0 && errno == ENOENT)
|
||||
{
|
||||
/* The kernel module evicted the page */
|
||||
elog(DEBUG1, "kernel module had evicted block");
|
||||
}
|
||||
else if (rc < 0)
|
||||
{
|
||||
pgstat_report_wait_end();
|
||||
lfc_disable("ioctl read");
|
||||
return -1;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* success! */
|
||||
BITMAP_SET(mask, buf_offset + i);
|
||||
}
|
||||
}
|
||||
pgstat_report_wait_end();
|
||||
}
|
||||
|
||||
/*
|
||||
* We successfully read the pages we know were valid when we
|
||||
* started reading; now mark those pages as read
|
||||
*/
|
||||
for (int i = first_block_in_chunk_read; i < iov_last_used; i++)
|
||||
else
|
||||
{
|
||||
if (BITMAP_ISSET(chunk_mask, i))
|
||||
BITMAP_SET(mask, buf_offset + i);
|
||||
pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_READ);
|
||||
rc = preadv(lfc_desc, &iov[first_block_in_chunk_read],
|
||||
nwrite, first_read_offset * BLCKSZ);
|
||||
pgstat_report_wait_end();
|
||||
|
||||
if (rc != (BLCKSZ * nwrite))
|
||||
{
|
||||
lfc_disable("read");
|
||||
return -1;
|
||||
}
|
||||
|
||||
/*
|
||||
* We successfully read the pages we know were valid when we
|
||||
* started reading; now mark those pages as read
|
||||
*/
|
||||
for (int i = first_block_in_chunk_read; i < iov_last_used; i++)
|
||||
{
|
||||
if (BITMAP_ISSET(chunk_mask, i))
|
||||
BITMAP_SET(mask, buf_offset + i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1364,6 +1435,65 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
return blocks_read;
|
||||
}
|
||||
|
||||
static int
|
||||
pread_with_ioctl(void *buffer, uint64 blkno)
|
||||
{
|
||||
struct neon_rw_args args = {
|
||||
.key = {
|
||||
.key_hi = 0,
|
||||
.key_lo = blkno
|
||||
},
|
||||
.offset = 0,
|
||||
.length = POSTGRES_PAGE_SIZE,
|
||||
.buffer = (__u64)(uintptr_t) buffer
|
||||
};
|
||||
int rc;
|
||||
|
||||
errno = 0;
|
||||
|
||||
elog(LOG, "calling ioctl read for blk %lu with buffer=%p (shared_buffers is at %p-%p)",
|
||||
blkno,
|
||||
buffer,
|
||||
BufferBlocks,
|
||||
BufferBlocks + BLCKSZ * NBuffers);
|
||||
rc = ioctl(lfc_desc, NEON_IOCTL_READ, &args);
|
||||
if (rc >= 0)
|
||||
lfc_ctl->kernel_module_read_hits++;
|
||||
else if (rc < 0 && errno == ENOENT)
|
||||
lfc_ctl->kernel_module_read_misses++;
|
||||
else
|
||||
elog(LOG, "ioctl read failed for blk %lu with buffer=%p: %m", blkno, buffer);
|
||||
return rc;
|
||||
}
|
||||
|
||||
static int
|
||||
pwrite_with_ioctl(const void *buffer, uint64 blkno)
|
||||
{
|
||||
struct neon_rw_args args = {
|
||||
.key = {
|
||||
.key_hi = 0,
|
||||
.key_lo = blkno
|
||||
},
|
||||
.offset = 0,
|
||||
.length = POSTGRES_PAGE_SIZE,
|
||||
.buffer = (__u64)(uintptr_t) buffer
|
||||
};
|
||||
int rc;
|
||||
|
||||
elog(LOG, "calling ioctl write for blk %lu with buffer=%p (shared_buffers is at %p-%p)",
|
||||
blkno,
|
||||
buffer,
|
||||
BufferBlocks,
|
||||
BufferBlocks + BLCKSZ * NBuffers);
|
||||
|
||||
rc = ioctl(lfc_desc, NEON_IOCTL_WRITE, &args);
|
||||
if (rc >= 0)
|
||||
lfc_ctl->kernel_module_write_hits++;
|
||||
else if (rc < 0 && errno == ENOMEM)
|
||||
lfc_ctl->kernel_module_write_misses++;
|
||||
return rc;
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize new LFC hash entry, perform eviction if needed.
|
||||
* Returns false if there are no unpinned entries and chunk can not be added.
|
||||
@@ -1484,7 +1614,6 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
|
||||
{
|
||||
BufferTag tag;
|
||||
FileCacheEntry *entry;
|
||||
ssize_t rc;
|
||||
bool found;
|
||||
uint32 hash;
|
||||
uint64 generation;
|
||||
@@ -1493,6 +1622,7 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
|
||||
ConditionVariable* cv;
|
||||
FileCacheBlockState state;
|
||||
XLogRecPtr lwlsn;
|
||||
bool success;
|
||||
|
||||
int chunk_offs = BLOCK_TO_CHUNK_OFF(blkno);
|
||||
|
||||
@@ -1571,16 +1701,60 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
|
||||
|
||||
LWLockRelease(lfc_lock);
|
||||
|
||||
pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE);
|
||||
INSTR_TIME_SET_CURRENT(io_start);
|
||||
rc = pwrite(lfc_desc, buffer, BLCKSZ,
|
||||
((off_t) entry_offset * lfc_blocks_per_chunk + chunk_offs) * BLCKSZ);
|
||||
INSTR_TIME_SET_CURRENT(io_end);
|
||||
pgstat_report_wait_end();
|
||||
|
||||
if (rc != BLCKSZ)
|
||||
if (lfc_use_kernel_module)
|
||||
{
|
||||
lfc_disable("write");
|
||||
int rc;
|
||||
|
||||
pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE);
|
||||
INSTR_TIME_SET_CURRENT(io_start);
|
||||
rc = pwrite_with_ioctl(buffer,
|
||||
entry_offset * lfc_blocks_per_chunk + chunk_offs);
|
||||
INSTR_TIME_SET_CURRENT(io_end);
|
||||
pgstat_report_wait_end();
|
||||
|
||||
if (rc < 0 && errno == ENOMEM)
|
||||
{
|
||||
/*
|
||||
* Write was wasted.
|
||||
*
|
||||
* FIXME: We could mark the page in the chunk as UNAVAILABLE,
|
||||
* since we know it was not actually present in the kernel
|
||||
* cache. Any subsequent read on it will inevitably fail with
|
||||
* ENOENT. That's not a correctness issue however, assuming that
|
||||
* the call never returns ENOMEM when the old version of the page
|
||||
* is still in the cache.
|
||||
*/
|
||||
success = true;
|
||||
}
|
||||
else if (rc < 0)
|
||||
{
|
||||
success = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* successful write */
|
||||
success = true;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
ssize_t rc;
|
||||
|
||||
pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE);
|
||||
INSTR_TIME_SET_CURRENT(io_start);
|
||||
|
||||
rc = pwrite(lfc_desc, buffer, BLCKSZ,
|
||||
((off_t) entry_offset * lfc_blocks_per_chunk + chunk_offs) * BLCKSZ);
|
||||
|
||||
INSTR_TIME_SET_CURRENT(io_end);
|
||||
pgstat_report_wait_end();
|
||||
|
||||
success = (rc == BLCKSZ);
|
||||
}
|
||||
|
||||
if (!success)
|
||||
{
|
||||
lfc_disable(lfc_use_kernel_module ? "write ioctl" : "write");
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -1756,19 +1930,60 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
}
|
||||
LWLockRelease(lfc_lock);
|
||||
|
||||
pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE);
|
||||
INSTR_TIME_SET_CURRENT(io_start);
|
||||
rc = pwritev(lfc_desc, iov, blocks_in_chunk,
|
||||
((off_t) entry_offset * lfc_blocks_per_chunk + chunk_offs) * BLCKSZ);
|
||||
INSTR_TIME_SET_CURRENT(io_end);
|
||||
pgstat_report_wait_end();
|
||||
|
||||
if (rc != BLCKSZ * blocks_in_chunk)
|
||||
/* Perform the write */
|
||||
if (lfc_use_kernel_module)
|
||||
{
|
||||
lfc_disable("write");
|
||||
return;
|
||||
pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE);
|
||||
INSTR_TIME_SET_CURRENT(io_start);
|
||||
for (int i = 0; i < blocks_in_chunk; i++)
|
||||
{
|
||||
int rc;
|
||||
|
||||
rc = pwrite_with_ioctl(
|
||||
iov[i].iov_base,
|
||||
entry_offset * lfc_blocks_per_chunk + chunk_offs
|
||||
);
|
||||
if (rc < 0 && errno == ENOMEM)
|
||||
{
|
||||
/*
|
||||
* Write was wasted.
|
||||
*
|
||||
* FIXME: We could mark the page in the chunk as UNAVAILABLE,
|
||||
* since we know it was not actually present in the kernel
|
||||
* cache. Any subsequent read on it will inevitably fail with
|
||||
* ENOENT. That's not a correctness issue however, assuming that
|
||||
* the call never returns ENOMEM when the old version of the page
|
||||
* is still in the cache.
|
||||
*/
|
||||
}
|
||||
else if (rc < 0)
|
||||
{
|
||||
/* other error, not expected */
|
||||
INSTR_TIME_SET_CURRENT(io_end);
|
||||
pgstat_report_wait_end();
|
||||
lfc_disable("write ioctl");
|
||||
return;
|
||||
}
|
||||
}
|
||||
INSTR_TIME_SET_CURRENT(io_end);
|
||||
pgstat_report_wait_end();
|
||||
}
|
||||
else
|
||||
{
|
||||
pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE);
|
||||
INSTR_TIME_SET_CURRENT(io_start);
|
||||
rc = pwritev(lfc_desc, iov, blocks_in_chunk,
|
||||
((off_t) entry_offset * lfc_blocks_per_chunk + chunk_offs) * BLCKSZ);
|
||||
INSTR_TIME_SET_CURRENT(io_end);
|
||||
pgstat_report_wait_end();
|
||||
if (rc != BLCKSZ * blocks_in_chunk)
|
||||
{
|
||||
lfc_disable("write");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/* success */
|
||||
{
|
||||
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
|
||||
|
||||
@@ -1922,6 +2137,26 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
|
||||
if (lfc_ctl)
|
||||
value = lfc_ctl->pinned;
|
||||
break;
|
||||
case 10:
|
||||
key = "file_cache_kernel_module_read_hits";
|
||||
if (lfc_ctl)
|
||||
value = lfc_ctl->kernel_module_read_hits;
|
||||
break;
|
||||
case 11:
|
||||
key = "file_cache_kernel_module_read_misses";
|
||||
if (lfc_ctl)
|
||||
value = lfc_ctl->kernel_module_read_misses;
|
||||
break;
|
||||
case 12:
|
||||
key = "file_cache_kernel_module_write_hits";
|
||||
if (lfc_ctl)
|
||||
value = lfc_ctl->kernel_module_write_hits;
|
||||
break;
|
||||
case 13:
|
||||
key = "file_cache_kernel_module_write_misses";
|
||||
if (lfc_ctl)
|
||||
value = lfc_ctl->kernel_module_write_misses;
|
||||
break;
|
||||
default:
|
||||
SRF_RETURN_DONE(funcctx);
|
||||
}
|
||||
|
||||
35
pgxn/neon/neon_pagecache.h
Normal file
35
pgxn/neon/neon_pagecache.h
Normal file
@@ -0,0 +1,35 @@
|
||||
/*
|
||||
* This is for the special ioctl in the neon_pagecache kernel module.
|
||||
*
|
||||
* DO NOT MODIFY! This header must agree with what the kernel module was
|
||||
* compiled with!
|
||||
*/
|
||||
|
||||
#ifndef NEON_PAGECACHE_H
|
||||
#define NEON_PAGECACHE_H
|
||||
|
||||
#include <linux/types.h>
|
||||
|
||||
|
||||
#define POSTGRES_PAGE_SIZE 8192 // 8 KiB
|
||||
|
||||
struct neon_key {
|
||||
__u64 key_hi; // Upper 64 bits of 128-bit key
|
||||
__u64 key_lo; // Lower 64 bits of 128-bit key
|
||||
};
|
||||
|
||||
struct neon_rw_args {
|
||||
struct neon_key key;
|
||||
__u32 offset; // Offset within page (0-8191)
|
||||
__u32 length; // Length to read/write
|
||||
__u64 buffer; // User buffer address
|
||||
};
|
||||
|
||||
#define NEON_IOC_MAGIC 'N'
|
||||
|
||||
#define NEON_IOCTL_READ _IOWR(NEON_IOC_MAGIC, 1, struct neon_rw_args)
|
||||
#define NEON_IOCTL_WRITE _IOWR(NEON_IOC_MAGIC, 2, struct neon_rw_args)
|
||||
|
||||
|
||||
|
||||
#endif /* NEON_PAGECACHE_H */
|
||||
Reference in New Issue
Block a user