Compare commits

...

10 Commits

Author SHA1 Message Date
Christian Schwarz
7b5e3d6d40 Revert "work around copy_to_user failure on reads"
Not necessary as of 271b19bd3e

Reason why that commit fixes the issues is
https://neondb.slack.com/archives/C08SXUSNFBP/p1747837985950309

> implemented a fix in the kernel module, no need for userspace to
> pre-fault mmapped pages in; the reason for the EFAULT was that we were
> holding a spinlock during copy_to_user, which disables preemption, which
> in turn makes the page fault handler give up and cause copy_to_user to
> fail; commit to be used is 271b19bd3e2de7777770ac6b8b1b1c94bb33830b
> (edited)

This reverts commit 7b818f8d64.
2025-05-23 12:11:01 +02:00
Christian Schwarz
e086568e21 demote log levels to avoid flooding the logs 2025-05-23 12:10:53 +02:00
Heikki Linnakangas
7b818f8d64 work around copy_to_user failure on reads 2025-05-21 16:33:52 +03:00
Heikki Linnakangas
14fefd261f add separate counters for ioctl read misses, and some debugging LOG messages 2025-05-21 01:33:58 +03:00
Heikki Linnakangas
01abd4afc5 fix address argument again 2025-05-21 01:33:43 +03:00
Heikki Linnakangas
c8541ad29f fix bogus pointer 2025-05-21 00:20:56 +03:00
Heikki Linnakangas
eaad1db9f0 Add bespoken metrics for kernel module cache misses 2025-05-21 00:20:39 +03:00
Heikki Linnakangas
6ddcf68829 use correct request code for writes 2025-05-20 23:53:56 +03:00
Heikki Linnakangas
d701f8285c hack permissions on /dev/clockcache_dev to allow access to everyone 2025-05-20 23:52:58 +03:00
Heikki Linnakangas
77082a0f63 Implement using the kernel module
Enabled when you set "lfc_use_kernel_module=on" in postgresql.conf

XXX: This compiles, but is 100% untested
2025-05-20 17:45:40 +03:00
3 changed files with 321 additions and 47 deletions

View File

@@ -15,6 +15,10 @@ commands:
user: root
sysvInitAction: sysinit
shell: 'chmod 711 /neonvm/bin/set-disk-quota'
- name: chmod-clockcache_dev
user: root
sysvInitAction: sysinit
shell: 'chmod 777 /dev/clockcache_dev' # FIXME: not very secure
- name: pgbouncer
user: postgres
sysvInitAction: respawn

View File

@@ -12,6 +12,7 @@
#include "postgres.h"
#include <sys/file.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <fcntl.h>
@@ -52,6 +53,10 @@
#include "pagestore_client.h"
#include "communicator.h"
/* For the kernel module */
#include "neon_pagecache.h"
#define CLOCKCACHE_DEV_PATH "/dev/clockcache_dev"
#define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "LFC: assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0)
/*
@@ -159,6 +164,13 @@ typedef struct FileCacheControl
uint64 time_write; /* time spent writing (us) */
uint64 resizes; /* number of LFC resizes */
uint64 evicted_pages; /* number of evicted pages */
/* FIXME: should make these atomic, they're not protected by any locks */
uint64 kernel_module_read_hits; /* success returns from read ioctl */
uint64 kernel_module_read_misses; /* ENOENT returns from read ioctl */
uint64 kernel_module_write_hits; /* success returns from write ioctl */
uint64 kernel_module_write_misses; /* ENOMEM returns from write ioctl */
dlist_head lru; /* double linked list for LRU replacement
* algorithm */
dlist_head holes; /* double linked list of punched holes */
@@ -183,6 +195,7 @@ typedef struct FileCacheControl
static HTAB *lfc_hash;
static int lfc_desc = -1;
static LWLockId lfc_lock;
static int lfc_max_size;
static int lfc_size_limit;
static int lfc_prewarm_limit;
@@ -190,6 +203,8 @@ static int lfc_prewarm_batch;
static int lfc_chunk_size_log = MAX_BLOCKS_PER_CHUNK_LOG;
static int lfc_blocks_per_chunk = MAX_BLOCKS_PER_CHUNK;
static char *lfc_path;
static bool lfc_use_kernel_module;
static uint64 lfc_generation;
static FileCacheControl *lfc_ctl;
static bool lfc_do_prewarm;
@@ -203,6 +218,9 @@ bool lfc_prewarm_update_ws_estimation;
#define LFC_ENABLED() (lfc_ctl->limit != 0)
static int pread_with_ioctl(void *buffer, uint64 blkno);
static int pwrite_with_ioctl(const void *buffer, uint64 blkno);
/*
* Close LFC file if opened.
* All backends should close their LFC files once LFC is disabled.
@@ -251,14 +269,19 @@ lfc_switch_off(void)
/*
We need to use unlink to avoid races in LFC write, because it is not
* protected by lock
*
* FIXME: how to clean up the kernel module device on trouble?
*/
unlink(lfc_path);
if (!lfc_use_kernel_module)
{
unlink(lfc_path);
fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
if (fd < 0)
elog(WARNING, "LFC: failed to recreate local file cache %s: %m", lfc_path);
else
close(fd);
fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
if (fd < 0)
elog(WARNING, "LFC: failed to recreate local file cache %s: %m", lfc_path);
else
close(fd);
}
/* Wakeup waiting backends */
for (int i = 0; i < N_COND_VARS; i++)
@@ -270,7 +293,8 @@ lfc_switch_off(void)
static void
lfc_disable(char const *op)
{
elog(WARNING, "LFC: failed to %s local file cache at %s: %m, disabling local file cache", op, lfc_path);
elog(WARNING, "LFC: failed to %s local file cache at %s: %m, disabling local file cache",
op, lfc_use_kernel_module ? CLOCKCACHE_DEV_PATH : lfc_path);
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
lfc_switch_off();
@@ -301,7 +325,9 @@ lfc_ensure_opened(void)
/* Open cache file if not done yet */
if (lfc_desc < 0)
{
lfc_desc = BasicOpenFile(lfc_path, O_RDWR);
lfc_desc = BasicOpenFile(
lfc_use_kernel_module ? CLOCKCACHE_DEV_PATH : lfc_path,
O_RDWR);
if (lfc_desc < 0)
{
@@ -351,10 +377,16 @@ lfc_shmem_startup(void)
initSHLL(&lfc_ctl->wss_estimation);
/* Recreate file cache on restart */
fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
if (lfc_use_kernel_module)
fd = BasicOpenFile(CLOCKCACHE_DEV_PATH, O_RDWR);
else
fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
if (fd < 0)
{
elog(WARNING, "LFC: failed to create local file cache %s: %m", lfc_path);
if (lfc_use_kernel_module)
elog(WARNING, "LFC: failed to open " CLOCKCACHE_DEV_PATH ": %m");
else
elog(WARNING, "LFC: failed to create local file cache %s: %m", lfc_path);
lfc_ctl->limit = 0;
}
else
@@ -613,6 +645,15 @@ lfc_init(void)
NULL,
NULL);
DefineCustomBoolVariable("neon.use_kernel_module",
"Use neon_pagecache kernel module instead of a regular file (EXPERIMENTAL)",
NULL,
&lfc_use_kernel_module,
true,
PGC_POSTMASTER,
0,
NULL, NULL, NULL);
if (lfc_max_size == 0)
return;
@@ -1297,27 +1338,57 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
/* offset of first IOV */
first_read_offset += chunk_offs + first_block_in_chunk_read;
pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_READ);
/* Read only the blocks we're interested in, limiting */
rc = preadv(lfc_desc, &iov[first_block_in_chunk_read],
nwrite, first_read_offset * BLCKSZ);
pgstat_report_wait_end();
if (rc != (BLCKSZ * nwrite))
if (lfc_use_kernel_module)
{
lfc_disable("read");
return -1;
pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_READ);
for (int i = first_block_in_chunk_read; i < iov_last_used; i++)
{
if (!BITMAP_ISSET(chunk_mask, i))
continue;
Assert(iov[i].iov_len == BLCKSZ);
rc = pread_with_ioctl(iov[i].iov_base, first_read_offset + i - first_block_in_chunk_read);
if (rc < 0 && errno == ENOENT)
{
/* The kernel module evicted the page */
elog(DEBUG1, "kernel module had evicted block");
}
else if (rc < 0)
{
pgstat_report_wait_end();
lfc_disable("ioctl read");
return -1;
}
else
{
/* success! */
BITMAP_SET(mask, buf_offset + i);
}
}
pgstat_report_wait_end();
}
/*
* We successfully read the pages we know were valid when we
* started reading; now mark those pages as read
*/
for (int i = first_block_in_chunk_read; i < iov_last_used; i++)
else
{
if (BITMAP_ISSET(chunk_mask, i))
BITMAP_SET(mask, buf_offset + i);
pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_READ);
rc = preadv(lfc_desc, &iov[first_block_in_chunk_read],
nwrite, first_read_offset * BLCKSZ);
pgstat_report_wait_end();
if (rc != (BLCKSZ * nwrite))
{
lfc_disable("read");
return -1;
}
/*
* We successfully read the pages we know were valid when we
* started reading; now mark those pages as read
*/
for (int i = first_block_in_chunk_read; i < iov_last_used; i++)
{
if (BITMAP_ISSET(chunk_mask, i))
BITMAP_SET(mask, buf_offset + i);
}
}
}
@@ -1364,6 +1435,65 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
return blocks_read;
}
/*
 * Read one POSTGRES_PAGE_SIZE block from the neon_pagecache kernel module
 * via its read ioctl on lfc_desc.
 *
 * 'blkno' is the block offset within the cache, used as the low 64 bits of
 * the module's 128-bit key (key_hi is always 0 here).
 *
 * Returns the ioctl result: >= 0 on success (read-hit counter bumped);
 * < 0 with errno == ENOENT when the module has evicted the page (read-miss
 * counter bumped); any other failure is logged and left for the caller to
 * handle (typically by disabling the LFC).
 */
static int
pread_with_ioctl(void *buffer, uint64 blkno)
{
	struct neon_rw_args args = {
		.key = {
			.key_hi = 0,
			.key_lo = blkno
		},
		.offset = 0,
		.length = POSTGRES_PAGE_SIZE,
		.buffer = (__u64)(uintptr_t) buffer
	};
	int			rc;

	/* make sure the errno checks below reflect this ioctl, not a stale value */
	errno = 0;

	/*
	 * Per-call tracing would flood the logs at LOG level; keep it at DEBUG5.
	 * Use UINT64_FORMAT for uint64 rather than %lu, which is wrong on
	 * platforms where uint64 is not 'unsigned long'.
	 */
	elog(DEBUG5, "calling ioctl read for blk " UINT64_FORMAT " with buffer=%p (shared_buffers is at %p-%p)",
		 blkno,
		 buffer,
		 BufferBlocks,
		 BufferBlocks + BLCKSZ * NBuffers);

	rc = ioctl(lfc_desc, NEON_IOCTL_READ, &args);

	/* FIXME: counters are not atomic and not protected by any lock */
	if (rc >= 0)
		lfc_ctl->kernel_module_read_hits++;
	else if (errno == ENOENT)
		lfc_ctl->kernel_module_read_misses++;
	else
		elog(LOG, "ioctl read failed for blk " UINT64_FORMAT " with buffer=%p: %m",
			 blkno, buffer);
	return rc;
}
/*
 * Write one POSTGRES_PAGE_SIZE block to the neon_pagecache kernel module
 * via its write ioctl on lfc_desc.
 *
 * 'blkno' is the block offset within the cache, used as the low 64 bits of
 * the module's 128-bit key (key_hi is always 0 here).
 *
 * Returns the ioctl result: >= 0 on success (write-hit counter bumped);
 * < 0 with errno == ENOMEM when the module could not make room for the page
 * (write-miss counter bumped); any other failure is logged and left for the
 * caller to handle (typically by disabling the LFC).
 */
static int
pwrite_with_ioctl(const void *buffer, uint64 blkno)
{
	struct neon_rw_args args = {
		.key = {
			.key_hi = 0,
			.key_lo = blkno
		},
		.offset = 0,
		.length = POSTGRES_PAGE_SIZE,
		.buffer = (__u64)(uintptr_t) buffer
	};
	int			rc;

	/*
	 * Reset errno before the call, like pread_with_ioctl does: callers (and
	 * the check below) distinguish ENOMEM from other failures, so a stale
	 * errno from an earlier syscall must not be misread as a cache miss.
	 */
	errno = 0;

	/*
	 * Per-call tracing would flood the logs at LOG level; keep it at DEBUG5.
	 * Use UINT64_FORMAT for uint64 rather than %lu, which is wrong on
	 * platforms where uint64 is not 'unsigned long'.
	 */
	elog(DEBUG5, "calling ioctl write for blk " UINT64_FORMAT " with buffer=%p (shared_buffers is at %p-%p)",
		 blkno,
		 buffer,
		 BufferBlocks,
		 BufferBlocks + BLCKSZ * NBuffers);

	rc = ioctl(lfc_desc, NEON_IOCTL_WRITE, &args);

	/* FIXME: counters are not atomic and not protected by any lock */
	if (rc >= 0)
		lfc_ctl->kernel_module_write_hits++;
	else if (errno == ENOMEM)
		lfc_ctl->kernel_module_write_misses++;
	else
	{
		/* unexpected error: log it, mirroring the read path */
		elog(LOG, "ioctl write failed for blk " UINT64_FORMAT " with buffer=%p: %m",
			 blkno, buffer);
	}
	return rc;
}
/*
* Initialize new LFC hash entry, perform eviction if needed.
* Returns false if there are no unpinned entries and chunk can not be added.
@@ -1484,7 +1614,6 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
{
BufferTag tag;
FileCacheEntry *entry;
ssize_t rc;
bool found;
uint32 hash;
uint64 generation;
@@ -1493,6 +1622,7 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
ConditionVariable* cv;
FileCacheBlockState state;
XLogRecPtr lwlsn;
bool success;
int chunk_offs = BLOCK_TO_CHUNK_OFF(blkno);
@@ -1571,16 +1701,60 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
LWLockRelease(lfc_lock);
pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE);
INSTR_TIME_SET_CURRENT(io_start);
rc = pwrite(lfc_desc, buffer, BLCKSZ,
((off_t) entry_offset * lfc_blocks_per_chunk + chunk_offs) * BLCKSZ);
INSTR_TIME_SET_CURRENT(io_end);
pgstat_report_wait_end();
if (rc != BLCKSZ)
if (lfc_use_kernel_module)
{
lfc_disable("write");
int rc;
pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE);
INSTR_TIME_SET_CURRENT(io_start);
rc = pwrite_with_ioctl(buffer,
entry_offset * lfc_blocks_per_chunk + chunk_offs);
INSTR_TIME_SET_CURRENT(io_end);
pgstat_report_wait_end();
if (rc < 0 && errno == ENOMEM)
{
/*
* Write was wasted.
*
* FIXME: We could mark the page in the chunk as UNAVAILABLE,
* since we know it was not actually present in the kernel
* cache. Any subsequent read on it will inevitably fail with
* ENOENT. That's not a correctness issue however, assuming that
* the call never returns ENOMEM when the old version of the page
* is still in the cache.
*/
success = true;
}
else if (rc < 0)
{
success = false;
}
else
{
/* successful write */
success = true;
}
}
else
{
ssize_t rc;
pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE);
INSTR_TIME_SET_CURRENT(io_start);
rc = pwrite(lfc_desc, buffer, BLCKSZ,
((off_t) entry_offset * lfc_blocks_per_chunk + chunk_offs) * BLCKSZ);
INSTR_TIME_SET_CURRENT(io_end);
pgstat_report_wait_end();
success = (rc == BLCKSZ);
}
if (!success)
{
lfc_disable(lfc_use_kernel_module ? "write ioctl" : "write");
}
else
{
@@ -1756,19 +1930,60 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
}
LWLockRelease(lfc_lock);
pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE);
INSTR_TIME_SET_CURRENT(io_start);
rc = pwritev(lfc_desc, iov, blocks_in_chunk,
((off_t) entry_offset * lfc_blocks_per_chunk + chunk_offs) * BLCKSZ);
INSTR_TIME_SET_CURRENT(io_end);
pgstat_report_wait_end();
if (rc != BLCKSZ * blocks_in_chunk)
/* Perform the write */
if (lfc_use_kernel_module)
{
lfc_disable("write");
return;
pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE);
INSTR_TIME_SET_CURRENT(io_start);
for (int i = 0; i < blocks_in_chunk; i++)
{
int rc;
rc = pwrite_with_ioctl(
iov[i].iov_base,
entry_offset * lfc_blocks_per_chunk + chunk_offs
);
if (rc < 0 && errno == ENOMEM)
{
/*
* Write was wasted.
*
* FIXME: We could mark the page in the chunk as UNAVAILABLE,
* since we know it was not actually present in the kernel
* cache. Any subsequent read on it will inevitably fail with
* ENOENT. That's not a correctness issue however, assuming that
* the call never returns ENOMEM when the old version of the page
* is still in the cache.
*/
}
else if (rc < 0)
{
/* other error, not expected */
INSTR_TIME_SET_CURRENT(io_end);
pgstat_report_wait_end();
lfc_disable("write ioctl");
return;
}
}
INSTR_TIME_SET_CURRENT(io_end);
pgstat_report_wait_end();
}
else
{
pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE);
INSTR_TIME_SET_CURRENT(io_start);
rc = pwritev(lfc_desc, iov, blocks_in_chunk,
((off_t) entry_offset * lfc_blocks_per_chunk + chunk_offs) * BLCKSZ);
INSTR_TIME_SET_CURRENT(io_end);
pgstat_report_wait_end();
if (rc != BLCKSZ * blocks_in_chunk)
{
lfc_disable("write");
return;
}
}
/* success */
{
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
@@ -1922,6 +2137,26 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
if (lfc_ctl)
value = lfc_ctl->pinned;
break;
case 10:
key = "file_cache_kernel_module_read_hits";
if (lfc_ctl)
value = lfc_ctl->kernel_module_read_hits;
break;
case 11:
key = "file_cache_kernel_module_read_misses";
if (lfc_ctl)
value = lfc_ctl->kernel_module_read_misses;
break;
case 12:
key = "file_cache_kernel_module_write_hits";
if (lfc_ctl)
value = lfc_ctl->kernel_module_write_hits;
break;
case 13:
key = "file_cache_kernel_module_write_misses";
if (lfc_ctl)
value = lfc_ctl->kernel_module_write_misses;
break;
default:
SRF_RETURN_DONE(funcctx);
}

View File

@@ -0,0 +1,35 @@
/*
 * This is for the special ioctl in the neon_pagecache kernel module.
 *
 * DO NOT MODIFY! This header must agree with what the kernel module was
 * compiled with!
 */
#ifndef NEON_PAGECACHE_H
#define NEON_PAGECACHE_H
/* __u32/__u64: fixed-width kernel ABI types, keep struct layout stable */
#include <linux/types.h>
/*
 * Callers pass POSTGRES_PAGE_SIZE as the I/O length for BLCKSZ-sized
 * pages; assumes BLCKSZ == 8192 — TODO confirm against the build config.
 */
#define POSTGRES_PAGE_SIZE 8192 // 8 KiB
/* 128-bit lookup key; the LFC code sets key_hi = 0 and key_lo = blkno */
struct neon_key {
__u64 key_hi; // Upper 64 bits of 128-bit key
__u64 key_lo; // Lower 64 bits of 128-bit key
};
/* Argument block for both the read and the write ioctl */
struct neon_rw_args {
struct neon_key key;
__u32 offset; // Offset within page (0-8191)
__u32 length; // Length to read/write
__u64 buffer; // User buffer address (cast from a pointer; __u64 keeps the layout identical on 32- and 64-bit userspace)
};
/* ioctl request codes; both pass neon_rw_args in and out (_IOWR) */
#define NEON_IOC_MAGIC 'N'
#define NEON_IOCTL_READ _IOWR(NEON_IOC_MAGIC, 1, struct neon_rw_args)
#define NEON_IOCTL_WRITE _IOWR(NEON_IOC_MAGIC, 2, struct neon_rw_args)
#endif /* NEON_PAGECACHE_H */