Compare commits

...

5 Commits

Author SHA1 Message Date
Heikki Linnakangas
16d6898e44 git add missing file 2025-06-12 02:37:59 +03:00
Heikki Linnakangas
10b936bf03 Use a custom Rust implementation to replace the LFC hash table
The new implementation lives in a separately allocated shared memory
area, which could be resized. Resizing it isn't actually implemented
yet, though. It would require some co-operation from the LFC code.
2025-06-05 18:31:29 +03:00
Heikki Linnakangas
6145cfd1c2 Move neon-shmem facility to separate module within the crate 2025-06-05 18:13:03 +03:00
Heikki Linnakangas
96b4de1de6 Make LFC chunk size a compile-time constant
A runtime setting is nicer, but the next commit will replace the hash
table with a different implementation that requires the value size to
be a compile-time constant.
2025-06-05 18:08:40 +03:00
Heikki Linnakangas
9fdf5fbb7e Use a separate freelist to track LFC "holes"
When the LFC is shrunk, we punch holes in the underlying file to
release the disk space to the OS. We tracked it in the same hash table
as the in-use entries, because that was convenient. However, I'm
working on being able to shrink the hash table too, and once we do
that, we'll need some other place to track the holes. Implement a
simple scheme of an in-memory array and a chain of on-disk blocks for
that.
2025-06-05 18:08:35 +03:00
18 changed files with 1952 additions and 596 deletions

Cargo.lock generated
View File

@@ -1086,6 +1086,25 @@ version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
[[package]]
name = "cbindgen"
version = "0.28.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eadd868a2ce9ca38de7eeafdcec9c7065ef89b42b32f0839278d55f35c54d1ff"
dependencies = [
"clap",
"heck 0.4.1",
"indexmap 2.9.0",
"log",
"proc-macro2",
"quote",
"serde",
"serde_json",
"syn 2.0.100",
"tempfile",
"toml",
]
[[package]]
name = "cc"
version = "1.2.16"
@@ -1212,7 +1231,7 @@ version = "4.5.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab"
dependencies = [
"heck",
"heck 0.5.0",
"proc-macro2",
"quote",
"syn 2.0.100",
@@ -1270,6 +1289,14 @@ dependencies = [
"unicode-width",
]
[[package]]
name = "communicator"
version = "0.1.0"
dependencies = [
"cbindgen",
"neon-shmem",
]
[[package]]
name = "compute_api"
version = "0.1.0"
@@ -1936,7 +1963,7 @@ checksum = "0892a17df262a24294c382f0d5997571006e7a4348b4327557c4ff1cd4a8bccc"
dependencies = [
"darling",
"either",
"heck",
"heck 0.5.0",
"proc-macro2",
"quote",
"syn 2.0.100",
@@ -2500,6 +2527,18 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "getrandom"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4"
dependencies = [
"cfg-if",
"libc",
"r-efi",
"wasi 0.14.2+wasi-0.2.4",
]
[[package]]
name = "gettid"
version = "0.1.3"
@@ -2712,6 +2751,12 @@ dependencies = [
"http 1.1.0",
]
[[package]]
name = "heck"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
[[package]]
name = "heck"
version = "0.5.0"
@@ -3648,7 +3693,7 @@ version = "0.0.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9e6777fc80a575f9503d908c8b498782a6c3ee88a06cb416dc3941401e43b94"
dependencies = [
"heck",
"heck 0.5.0",
"proc-macro2",
"quote",
"syn 2.0.100",
@@ -3710,7 +3755,7 @@ dependencies = [
"procfs",
"prometheus",
"rand 0.8.5",
"rand_distr",
"rand_distr 0.4.3",
"twox-hash",
]
@@ -3799,6 +3844,8 @@ name = "neon-shmem"
version = "0.1.0"
dependencies = [
"nix 0.30.1",
"rand 0.9.1",
"rand_distr 0.5.1",
"tempfile",
"thiserror 1.0.69",
"workspace_hack",
@@ -5092,7 +5139,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4"
dependencies = [
"bytes",
"heck",
"heck 0.5.0",
"itertools 0.12.1",
"log",
"multimap",
@@ -5113,7 +5160,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15"
dependencies = [
"bytes",
"heck",
"heck 0.5.0",
"itertools 0.12.1",
"log",
"multimap",
@@ -5238,7 +5285,7 @@ dependencies = [
"postgres_backend",
"pq_proto",
"rand 0.8.5",
"rand_distr",
"rand_distr 0.4.3",
"rcgen",
"redis",
"regex",
@@ -5342,6 +5389,12 @@ dependencies = [
"proc-macro2",
]
[[package]]
name = "r-efi"
version = "5.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5"
[[package]]
name = "rand"
version = "0.7.3"
@@ -5366,6 +5419,16 @@ dependencies = [
"rand_core 0.6.4",
]
[[package]]
name = "rand"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97"
dependencies = [
"rand_chacha 0.9.0",
"rand_core 0.9.3",
]
[[package]]
name = "rand_chacha"
version = "0.2.2"
@@ -5386,6 +5449,16 @@ dependencies = [
"rand_core 0.6.4",
]
[[package]]
name = "rand_chacha"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
dependencies = [
"ppv-lite86",
"rand_core 0.9.3",
]
[[package]]
name = "rand_core"
version = "0.5.1"
@@ -5404,6 +5477,15 @@ dependencies = [
"getrandom 0.2.11",
]
[[package]]
name = "rand_core"
version = "0.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38"
dependencies = [
"getrandom 0.3.3",
]
[[package]]
name = "rand_distr"
version = "0.4.3"
@@ -5414,6 +5496,16 @@ dependencies = [
"rand 0.8.5",
]
[[package]]
name = "rand_distr"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463"
dependencies = [
"num-traits",
"rand 0.9.1",
]
[[package]]
name = "rand_hc"
version = "0.2.0"
@@ -6900,7 +6992,7 @@ version = "0.26.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be"
dependencies = [
"heck",
"heck 0.5.0",
"proc-macro2",
"quote",
"rustversion",
@@ -8199,6 +8291,15 @@ version = "0.11.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
[[package]]
name = "wasi"
version = "0.14.2+wasi-0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3"
dependencies = [
"wit-bindgen-rt",
]
[[package]]
name = "wasite"
version = "0.1.0"
@@ -8556,6 +8657,15 @@ dependencies = [
"windows-sys 0.48.0",
]
[[package]]
name = "wit-bindgen-rt"
version = "0.39.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1"
dependencies = [
"bitflags 2.8.0",
]
[[package]]
name = "workspace_hack"
version = "0.1.0"

View File

@@ -44,6 +44,7 @@ members = [
"libs/proxy/postgres-types2",
"libs/proxy/tokio-postgres2",
"endpoint_storage",
"pgxn/neon/communicator",
]
[workspace.package]
@@ -251,6 +252,7 @@ desim = { version = "0.1", path = "./libs/desim" }
endpoint_storage = { version = "0.0.1", path = "./endpoint_storage/" }
http-utils = { version = "0.1", path = "./libs/http-utils/" }
metrics = { version = "0.1", path = "./libs/metrics/" }
neon-shmem = { version = "0.1", path = "./libs/neon-shmem/" }
pageserver = { path = "./pageserver" }
pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
pageserver_client = { path = "./pageserver/client" }
@@ -278,6 +280,7 @@ walproposer = { version = "0.1", path = "./libs/walproposer/" }
workspace_hack = { version = "0.1", path = "./workspace_hack/" }
## Build dependencies
cbindgen = "0.28.0"
criterion = "0.5.1"
rcgen = "0.13"
rstest = "0.18"

View File

@@ -18,10 +18,12 @@ ifeq ($(BUILD_TYPE),release)
PG_LDFLAGS = $(LDFLAGS)
# Unfortunately, `--profile=...` is a nightly feature
CARGO_BUILD_FLAGS += --release
NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/release
else ifeq ($(BUILD_TYPE),debug)
PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
PG_CFLAGS += -O0 -g3 $(CFLAGS)
PG_LDFLAGS = $(LDFLAGS)
NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/debug
else
$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
endif
@@ -180,11 +182,16 @@ postgres-check-%: postgres-%
.PHONY: neon-pg-ext-%
neon-pg-ext-%: postgres-%
+@echo "Compiling communicator $*"
$(CARGO_CMD_PREFIX) cargo build -p communicator $(CARGO_BUILD_FLAGS)
+@echo "Compiling neon $*"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-$*
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
LIBCOMMUNICATOR_PATH=$(NEON_CARGO_ARTIFACT_TARGET_DIR) \
-C $(POSTGRES_INSTALL_DIR)/build/neon-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install
+@echo "Compiling neon_walredo $*"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$*
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \

View File

@@ -6,8 +6,12 @@ license.workspace = true
[dependencies]
thiserror.workspace = true
nix.workspace=true
nix.workspace = true
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
[dev-dependencies]
rand = "0.9.1"
rand_distr = "0.5.1"
[target.'cfg(target_os = "macos")'.dependencies]
tempfile = "3.14.0"

libs/neon-shmem/src/hash.rs Normal file
View File

@@ -0,0 +1,304 @@
//! Hash table implementation on top of 'shmem'
//!
//! Features required in the long run by the communicator project:
//!
//! [X] Accessible from both Postgres processes and rust threads in the communicator process
//! [X] Low latency
//! [ ] Scalable to lots of concurrent accesses (currently relies on caller for locking)
//! [ ] Resizable
use std::fmt::Debug;
use std::hash::{DefaultHasher, Hash, Hasher};
use std::mem::MaybeUninit;
use crate::shmem::ShmemHandle;
mod core;
pub mod entry;
#[cfg(test)]
mod tests;
use core::CoreHashMap;
use entry::{Entry, OccupiedEntry};
#[derive(Debug)]
pub struct OutOfMemoryError();
pub struct HashMapInit<'a, K, V> {
// Hash table can be allocated in a fixed memory area, or in a resizeable ShmemHandle.
shmem_handle: Option<ShmemHandle>,
shared_ptr: *mut HashMapShared<'a, K, V>,
}
pub struct HashMapAccess<'a, K, V> {
shmem_handle: Option<ShmemHandle>,
shared_ptr: *mut HashMapShared<'a, K, V>,
}
unsafe impl<'a, K: Sync, V: Sync> Sync for HashMapAccess<'a, K, V> {}
unsafe impl<'a, K: Send, V: Send> Send for HashMapAccess<'a, K, V> {}
impl<'a, K, V> HashMapInit<'a, K, V> {
pub fn attach_writer(self) -> HashMapAccess<'a, K, V> {
HashMapAccess {
shmem_handle: self.shmem_handle,
shared_ptr: self.shared_ptr,
}
}
pub fn attach_reader(self) -> HashMapAccess<'a, K, V> {
// no difference to attach_writer currently
self.attach_writer()
}
}
/// This is stored in the shared memory area
///
/// NOTE: We carve out the parts from a contiguous chunk. Growing and shrinking the hash table
/// relies on the memory layout! The data structures are laid out in the contiguous shared memory
/// area as follows:
///
/// HashMapShared
/// [buckets]
/// [dictionary]
///
/// In between the above parts, there can be padding bytes to align the parts correctly.
struct HashMapShared<'a, K, V> {
inner: CoreHashMap<'a, K, V>,
}
impl<'a, K, V> HashMapInit<'a, K, V>
where
K: Clone + Hash + Eq,
{
pub fn estimate_size(num_buckets: u32) -> usize {
// add some margin to cover alignment etc.
CoreHashMap::<K, V>::estimate_size(num_buckets) + size_of::<HashMapShared<K, V>>() + 1000
}
pub fn init_in_fixed_area(
num_buckets: u32,
area: &'a mut [MaybeUninit<u8>],
) -> HashMapInit<'a, K, V> {
Self::init_common(num_buckets, None, area.as_mut_ptr().cast(), area.len())
}
/// Initialize a new hash map in the given shared memory area
pub fn init_in_shmem(num_buckets: u32, mut shmem: ShmemHandle) -> HashMapInit<'a, K, V> {
let size = Self::estimate_size(num_buckets);
shmem
.set_size(size)
.expect("could not resize shared memory area");
let ptr = unsafe { shmem.data_ptr.as_mut() };
Self::init_common(num_buckets, Some(shmem), ptr, size)
}
fn init_common(
num_buckets: u32,
shmem_handle: Option<ShmemHandle>,
area_ptr: *mut u8,
area_len: usize,
) -> HashMapInit<'a, K, V> {
// carve out the HashMapShared struct from the area.
let mut ptr: *mut u8 = area_ptr;
let end_ptr: *mut u8 = unsafe { area_ptr.add(area_len) };
ptr = unsafe { ptr.add(ptr.align_offset(align_of::<HashMapShared<K, V>>())) };
let shared_ptr: *mut HashMapShared<K, V> = ptr.cast();
ptr = unsafe { ptr.add(size_of::<HashMapShared<K, V>>()) };
// carve out the buckets
ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::<core::Bucket<K, V>>())) };
let buckets_ptr = ptr;
ptr = unsafe { ptr.add(size_of::<core::Bucket<K, V>>() * num_buckets as usize) };
// use remaining space for the dictionary
ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::<u32>())) };
assert!(ptr.addr() < end_ptr.addr());
let dictionary_ptr = ptr;
let dictionary_size = unsafe { end_ptr.byte_offset_from(ptr) / size_of::<u32>() as isize };
assert!(dictionary_size > 0);
let buckets =
unsafe { std::slice::from_raw_parts_mut(buckets_ptr.cast(), num_buckets as usize) };
let dictionary = unsafe {
std::slice::from_raw_parts_mut(dictionary_ptr.cast(), dictionary_size as usize)
};
let hashmap = CoreHashMap::new(buckets, dictionary);
unsafe {
std::ptr::write(shared_ptr, HashMapShared { inner: hashmap });
}
HashMapInit {
shmem_handle,
shared_ptr,
}
}
}
impl<'a, K, V> HashMapAccess<'a, K, V>
where
K: Clone + Hash + Eq,
{
pub fn get_hash_value(&self, key: &K) -> u64 {
let mut hasher = DefaultHasher::new();
key.hash(&mut hasher);
hasher.finish()
}
pub fn get_with_hash<'e>(&'e self, key: &K, hash: u64) -> Option<&'e V> {
let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
map.inner.get_with_hash(key, hash)
}
pub fn entry_with_hash(&mut self, key: K, hash: u64) -> Entry<'a, '_, K, V> {
let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
map.inner.entry_with_hash(key, hash)
}
pub fn remove_with_hash(&mut self, key: &K, hash: u64) {
let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
match map.inner.entry_with_hash(key.clone(), hash) {
Entry::Occupied(e) => {
e.remove();
}
Entry::Vacant(_) => {}
};
}
pub fn entry_at_bucket(&mut self, pos: usize) -> Option<OccupiedEntry<'a, '_, K, V>> {
let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
map.inner.entry_at_bucket(pos)
}
pub fn get_num_buckets(&self) -> usize {
let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
map.inner.get_num_buckets()
}
/// Return the key and value stored in bucket with given index. This can be used to
/// iterate through the hash map. (An Iterator might be nicer. The communicator's
/// clock algorithm needs to _slowly_ iterate through all buckets with its clock hand,
/// without holding a lock. If we switch to an Iterator, it must not hold the lock.)
pub fn get_at_bucket(&self, pos: usize) -> Option<&(K, V)> {
let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
if pos >= map.inner.buckets.len() {
return None;
}
let bucket = &map.inner.buckets[pos];
bucket.inner.as_ref()
}
pub fn get_bucket_for_value(&self, val_ptr: *const V) -> usize {
let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
let origin = map.inner.buckets.as_ptr();
let idx = (val_ptr as usize - origin as usize) / size_of::<core::Bucket<K, V>>();
assert!(idx < map.inner.buckets.len());
idx
}
// for metrics
pub fn get_num_buckets_in_use(&self) -> usize {
let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
map.inner.buckets_in_use as usize
}
/// Grow
///
/// 1. grow the underlying shared memory area
/// 2. Initialize new buckets. This overwrites the current dictionary
/// 3. Recalculate the dictionary
pub fn grow(&mut self, num_buckets: u32) -> Result<(), crate::shmem::Error> {
let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
let inner = &mut map.inner;
let old_num_buckets = inner.buckets.len() as u32;
if num_buckets < old_num_buckets {
panic!("grow called with a smaller number of buckets");
}
if num_buckets == old_num_buckets {
return Ok(());
}
let shmem_handle = self
.shmem_handle
.as_ref()
.expect("grow called on a fixed-size hash table");
let size_bytes = HashMapInit::<K, V>::estimate_size(num_buckets);
shmem_handle.set_size(size_bytes)?;
let end_ptr: *mut u8 = unsafe { shmem_handle.data_ptr.as_ptr().add(size_bytes) };
// Initialize new buckets. The new buckets are linked to the free list. NB: This overwrites
// the dictionary!
let buckets_ptr = inner.buckets.as_mut_ptr();
unsafe {
for i in old_num_buckets..num_buckets {
let bucket_ptr = buckets_ptr.add(i as usize);
bucket_ptr.write(core::Bucket {
next: if i + 1 < num_buckets {
i as u32 + 1
} else {
inner.free_head
},
inner: None,
});
}
}
// Recalculate the dictionary
let buckets;
let dictionary;
unsafe {
let buckets_end_ptr = buckets_ptr.add(num_buckets as usize);
let dictionary_ptr: *mut u32 = buckets_end_ptr
.byte_add(buckets_end_ptr.align_offset(align_of::<u32>()))
.cast();
let dictionary_size: usize =
end_ptr.byte_offset_from(buckets_end_ptr) as usize / size_of::<u32>();
buckets = std::slice::from_raw_parts_mut(buckets_ptr, num_buckets as usize);
dictionary = std::slice::from_raw_parts_mut(dictionary_ptr, dictionary_size);
}
for i in 0..dictionary.len() {
dictionary[i] = core::INVALID_POS;
}
for i in 0..old_num_buckets as usize {
if buckets[i].inner.is_none() {
continue;
}
let mut hasher = DefaultHasher::new();
buckets[i].inner.as_ref().unwrap().0.hash(&mut hasher);
let hash = hasher.finish();
let pos: usize = (hash % dictionary.len() as u64) as usize;
buckets[i].next = dictionary[pos];
dictionary[pos] = i as u32;
}
// Finally, update the CoreHashMap struct
inner.dictionary = dictionary;
inner.buckets = buckets;
inner.free_head = old_num_buckets;
Ok(())
}
// TODO: Shrinking is a multi-step process that requires co-operation from the caller
//
// 1. The caller must first call begin_shrink(). That forbids allocation of higher-numbered
// buckets.
//
// 2. Next, the caller must evict all entries in higher-numbered buckets.
//
// 3. Finally, call finish_shrink(). This recomputes the dictionary and shrinks the underlying
// shmem area
}
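For illustration, a minimal single-process sketch of how this API fits together. This is not code from this change: the u64 key/value types, the 1024-bucket count, and the plain heap buffer standing in for a ShmemHandle are all arbitrary.

use std::mem::MaybeUninit;

use neon_shmem::hash::{HashMapInit, entry::Entry};

fn main() {
    // Size a plain buffer for 1024 buckets, as estimated by the table itself.
    let size = HashMapInit::<u64, u64>::estimate_size(1024);
    let mut area: Vec<MaybeUninit<u8>> = vec![MaybeUninit::uninit(); size];

    // Initialize the table in the fixed area and attach a writer handle.
    let mut map = HashMapInit::<u64, u64>::init_in_fixed_area(1024, &mut area).attach_writer();

    // All lookups and updates take a caller-computed hash, so the hash can be
    // reused across several operations on the same key.
    let key = 42u64;
    let hash = map.get_hash_value(&key);
    match map.entry_with_hash(key, hash) {
        Entry::Occupied(mut e) => {
            e.insert(123);
        }
        Entry::Vacant(v) => {
            v.insert(123).expect("hash table full");
        }
    }
    assert_eq!(map.get_with_hash(&key, hash), Some(&123));
}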

View File

@@ -0,0 +1,174 @@
//! Simple hash table with chaining
//!
//! # Resizing
//!
use std::hash::Hash;
use std::mem::MaybeUninit;
use crate::hash::entry::{Entry, OccupiedEntry, PrevPos, VacantEntry};
pub(crate) const INVALID_POS: u32 = u32::MAX;
// Bucket
pub(crate) struct Bucket<K, V> {
pub(crate) next: u32,
pub(crate) inner: Option<(K, V)>,
}
pub(crate) struct CoreHashMap<'a, K, V> {
pub(crate) dictionary: &'a mut [u32],
pub(crate) buckets: &'a mut [Bucket<K, V>],
pub(crate) free_head: u32,
pub(crate) _user_list_head: u32,
// metrics
pub(crate) buckets_in_use: u32,
}
#[derive(Debug)]
pub struct FullError();
impl<'a, K: Hash + Eq, V> CoreHashMap<'a, K, V>
where
K: Clone + Hash + Eq,
{
const FILL_FACTOR: f32 = 0.60;
pub fn estimate_size(num_buckets: u32) -> usize {
let mut size = 0;
// buckets
size += size_of::<Bucket<K, V>>() * num_buckets as usize;
// dictionary
size += (f32::ceil((size_of::<u32>() * num_buckets as usize) as f32 / Self::FILL_FACTOR))
as usize;
size
}
pub fn new(
buckets: &'a mut [MaybeUninit<Bucket<K, V>>],
dictionary: &'a mut [MaybeUninit<u32>],
) -> CoreHashMap<'a, K, V> {
// Initialize the buckets
for i in 0..buckets.len() {
buckets[i].write(Bucket {
next: if i < buckets.len() - 1 {
i as u32 + 1
} else {
INVALID_POS
},
inner: None,
});
}
// Initialize the dictionary
for i in 0..dictionary.len() {
dictionary[i].write(INVALID_POS);
}
// TODO: use std::slice::assume_init_mut() once it stabilizes
let buckets =
unsafe { std::slice::from_raw_parts_mut(buckets.as_mut_ptr().cast(), buckets.len()) };
let dictionary = unsafe {
std::slice::from_raw_parts_mut(dictionary.as_mut_ptr().cast(), dictionary.len())
};
CoreHashMap {
dictionary,
buckets,
free_head: 0,
buckets_in_use: 0,
_user_list_head: INVALID_POS,
}
}
pub fn get_with_hash(&self, key: &K, hash: u64) -> Option<&V> {
let mut next = self.dictionary[hash as usize % self.dictionary.len()];
loop {
if next == INVALID_POS {
return None;
}
let bucket = &self.buckets[next as usize];
let (bucket_key, bucket_value) = bucket.inner.as_ref().expect("entry is in use");
if bucket_key == key {
return Some(&bucket_value);
}
next = bucket.next;
}
}
// all updates are done through Entry
pub fn entry_with_hash(&mut self, key: K, hash: u64) -> Entry<'a, '_, K, V> {
let dict_pos = hash as usize % self.dictionary.len();
let first = self.dictionary[dict_pos];
if first == INVALID_POS {
// no existing entry
return Entry::Vacant(VacantEntry {
map: self,
key,
dict_pos: dict_pos as u32,
});
}
let mut prev_pos = PrevPos::First(dict_pos as u32);
let mut next = first;
loop {
let bucket = &mut self.buckets[next as usize];
let (bucket_key, _bucket_value) = bucket.inner.as_mut().expect("entry is in use");
if *bucket_key == key {
// found existing entry
return Entry::Occupied(OccupiedEntry {
map: self,
_key: key,
prev_pos,
bucket_pos: next,
});
}
if bucket.next == INVALID_POS {
// No existing entry
return Entry::Vacant(VacantEntry {
map: self,
key,
dict_pos: dict_pos as u32,
});
}
prev_pos = PrevPos::Chained(next);
next = bucket.next;
}
}
pub fn get_num_buckets(&self) -> usize {
self.buckets.len()
}
pub fn entry_at_bucket(&mut self, pos: usize) -> Option<OccupiedEntry<K, V>> {
if pos >= self.buckets.len() {
return None;
}
todo!()
//self.buckets[pos].inner.as_ref()
}
pub(crate) fn alloc_bucket(&mut self, key: K, value: V) -> Result<u32, FullError> {
let pos = self.free_head;
if pos == INVALID_POS {
return Err(FullError());
}
let bucket = &mut self.buckets[pos as usize];
self.free_head = bucket.next;
self.buckets_in_use += 1;
bucket.next = INVALID_POS;
bucket.inner = Some((key, value));
return Ok(pos);
}
}
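As a rough illustration of how FILL_FACTOR sizes the dictionary relative to the bucket array, here is a standalone sketch of the same arithmetic. It is not code from this change; the function name and the example numbers are arbitrary.

use std::mem::size_of;

// Mirrors the dictionary term in CoreHashMap::estimate_size: the dictionary gets
// ceil(4 * num_buckets / fill_factor) bytes, i.e. u32 slots for num_buckets divided
// by the fill factor, which keeps hash chains short even when every bucket is in use.
fn dictionary_slots(num_buckets: u32, fill_factor: f32) -> usize {
    let bytes =
        f32::ceil((size_of::<u32>() * num_buckets as usize) as f32 / fill_factor) as usize;
    bytes / size_of::<u32>()
}

fn main() {
    // With the 0.60 fill factor used above, 100_000 buckets get roughly 166_000
    // dictionary slots, about 1.67 slots per bucket.
    println!("{}", dictionary_slots(100_000, 0.60));
}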

View File

@@ -0,0 +1,91 @@
//! Like std::collections::hash_map::Entry;
use crate::hash::core::{CoreHashMap, FullError, INVALID_POS};
use std::hash::Hash;
use std::mem;
pub enum Entry<'a, 'b, K, V> {
Occupied(OccupiedEntry<'a, 'b, K, V>),
Vacant(VacantEntry<'a, 'b, K, V>),
}
pub(crate) enum PrevPos {
First(u32),
Chained(u32),
}
pub struct OccupiedEntry<'a, 'b, K, V> {
pub(crate) map: &'b mut CoreHashMap<'a, K, V>,
pub(crate) _key: K, // The key of the occupied entry
pub(crate) prev_pos: PrevPos,
pub(crate) bucket_pos: u32, // The position of the bucket in the CoreHashMap's buckets array
}
impl<'a, 'b, K, V> OccupiedEntry<'a, 'b, K, V> {
pub fn get(&self) -> &V {
&self.map.buckets[self.bucket_pos as usize]
.inner
.as_ref()
.unwrap()
.1
}
pub fn get_mut(&mut self) -> &mut V {
&mut self.map.buckets[self.bucket_pos as usize]
.inner
.as_mut()
.unwrap()
.1
}
pub fn insert(&mut self, value: V) -> V {
let bucket = &mut self.map.buckets[self.bucket_pos as usize];
// This assumes inner is Some, which it must be for an OccupiedEntry
let old_value = mem::replace(&mut bucket.inner.as_mut().unwrap().1, value);
old_value
}
pub fn remove(self) -> V {
// CoreHashMap::remove returns Option<(K, V)>. We know it's Some for an OccupiedEntry.
let bucket = &mut self.map.buckets[self.bucket_pos as usize];
// unlink it from the chain
match self.prev_pos {
PrevPos::First(dict_pos) => self.map.dictionary[dict_pos as usize] = bucket.next,
PrevPos::Chained(bucket_pos) => {
self.map.buckets[bucket_pos as usize].next = bucket.next
}
}
// and add it to the freelist
let bucket = &mut self.map.buckets[self.bucket_pos as usize];
let old_value = bucket.inner.take();
bucket.next = self.map.free_head;
self.map.free_head = self.bucket_pos;
self.map.buckets_in_use -= 1;
return old_value.unwrap().1;
}
}
pub struct VacantEntry<'a, 'b, K, V> {
pub(crate) map: &'b mut CoreHashMap<'a, K, V>,
pub(crate) key: K, // The key to insert
pub(crate) dict_pos: u32,
}
impl<'a, 'b, K: Clone + Hash + Eq, V> VacantEntry<'a, 'b, K, V> {
pub fn insert(self, value: V) -> Result<&'b mut V, FullError> {
let pos = self.map.alloc_bucket(self.key, value)?;
if pos == INVALID_POS {
return Err(FullError());
}
let bucket = &mut self.map.buckets[pos as usize];
bucket.next = self.map.dictionary[self.dict_pos as usize];
self.map.dictionary[self.dict_pos as usize] = pos;
let result = &mut self.map.buckets[pos as usize].inner.as_mut().unwrap().1;
return Ok(result);
}
}
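For illustration, a small sketch of how removal flows through this Entry API from the HashMapAccess side. This is not code from this change; the helper name and the u64 key/value types are arbitrary.

use neon_shmem::hash::{HashMapAccess, entry::Entry};

/// Remove `key` and return its previous value, if any. OccupiedEntry::remove()
/// unlinks the bucket from its hash chain and pushes it back onto the free list.
fn take(map: &mut HashMapAccess<'_, u64, u64>, key: u64) -> Option<u64> {
    let hash = map.get_hash_value(&key);
    match map.entry_with_hash(key, hash) {
        Entry::Occupied(e) => Some(e.remove()),
        Entry::Vacant(_) => None,
    }
}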

View File

@@ -0,0 +1,220 @@
use std::collections::BTreeMap;
use std::collections::HashSet;
use std::fmt::{Debug, Formatter};
use std::sync::atomic::{AtomicUsize, Ordering};
use crate::hash::HashMapAccess;
use crate::hash::HashMapInit;
use crate::hash::UpdateAction;
use crate::shmem::ShmemHandle;
use rand::seq::SliceRandom;
use rand::{Rng, RngCore};
use rand_distr::Zipf;
const TEST_KEY_LEN: usize = 16;
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
struct TestKey([u8; TEST_KEY_LEN]);
impl From<&TestKey> for u128 {
fn from(val: &TestKey) -> u128 {
u128::from_be_bytes(val.0)
}
}
impl From<u128> for TestKey {
fn from(val: u128) -> TestKey {
TestKey(val.to_be_bytes())
}
}
impl<'a> From<&'a [u8]> for TestKey {
fn from(bytes: &'a [u8]) -> TestKey {
TestKey(bytes.try_into().unwrap())
}
}
fn test_inserts<K: Into<TestKey> + Copy>(keys: &[K]) {
const MAX_MEM_SIZE: usize = 10000000;
let shmem = ShmemHandle::new("test_inserts", 0, MAX_MEM_SIZE).unwrap();
let init_struct = HashMapInit::<TestKey, usize>::init_in_shmem(100000, shmem);
let w = init_struct.attach_writer();
for (idx, k) in keys.iter().enumerate() {
let res = w.insert(&(*k).into(), idx);
assert!(res.is_ok());
}
for (idx, k) in keys.iter().enumerate() {
let x = w.get(&(*k).into());
let value = x.as_deref().copied();
assert_eq!(value, Some(idx));
}
//eprintln!("stats: {:?}", tree_writer.get_statistics());
}
#[test]
fn dense() {
// A few small, hand-picked keys
let keys: &[u128] = &[0, 1, 2, 3, 256];
test_inserts(keys);
// Dense keys
let mut keys: Vec<u128> = (0..10000).collect();
test_inserts(&keys);
// Do the same in random orders
for _ in 1..10 {
keys.shuffle(&mut rand::rng());
test_inserts(&keys);
}
}
#[test]
fn sparse() {
// sparse keys
let mut keys: Vec<TestKey> = Vec::new();
let mut used_keys = HashSet::new();
for _ in 0..10000 {
loop {
let key = rand::random::<u128>();
if used_keys.get(&key).is_some() {
continue;
}
used_keys.insert(key);
keys.push(key.into());
break;
}
}
test_inserts(&keys);
}
struct TestValue(AtomicUsize);
impl TestValue {
fn new(val: usize) -> TestValue {
TestValue(AtomicUsize::new(val))
}
fn load(&self) -> usize {
self.0.load(Ordering::Relaxed)
}
}
impl Clone for TestValue {
fn clone(&self) -> TestValue {
TestValue::new(self.load())
}
}
impl Debug for TestValue {
fn fmt(&self, fmt: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
write!(fmt, "{:?}", self.load())
}
}
#[derive(Clone, Debug)]
struct TestOp(TestKey, Option<usize>);
fn apply_op(
op: &TestOp,
sut: &HashMapAccess<TestKey, TestValue>,
shadow: &mut BTreeMap<TestKey, usize>,
) {
eprintln!("applying op: {op:?}");
// apply the change to the shadow tree first
let shadow_existing = if let Some(v) = op.1 {
shadow.insert(op.0, v)
} else {
shadow.remove(&op.0)
};
// then apply it to the hash map under test
sut.update_with_fn(&op.0, |existing| {
assert_eq!(existing.map(TestValue::load), shadow_existing);
match (existing, op.1) {
(None, None) => UpdateAction::Nothing,
(None, Some(new_val)) => UpdateAction::Insert(TestValue::new(new_val)),
(Some(_old_val), None) => UpdateAction::Remove,
(Some(old_val), Some(new_val)) => {
old_val.0.store(new_val, Ordering::Relaxed);
UpdateAction::Nothing
}
}
})
.expect("out of memory");
}
#[test]
fn random_ops() {
const MAX_MEM_SIZE: usize = 10000000;
let shmem = ShmemHandle::new("test_inserts", 0, MAX_MEM_SIZE).unwrap();
let init_struct = HashMapInit::<TestKey, TestValue>::init_in_shmem(100000, shmem);
let writer = init_struct.attach_writer();
let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
let distribution = Zipf::new(u128::MAX as f64, 1.1).unwrap();
let mut rng = rand::rng();
for i in 0..100000 {
let key: TestKey = (rng.sample(distribution) as u128).into();
let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
apply_op(&op, &writer, &mut shadow);
if i % 1000 == 0 {
eprintln!("{i} ops processed");
//eprintln!("stats: {:?}", tree_writer.get_statistics());
//test_iter(&tree_writer, &shadow);
}
}
}
#[test]
fn test_grow() {
const MEM_SIZE: usize = 10000000;
let shmem = ShmemHandle::new("test_grow", 0, MEM_SIZE).unwrap();
let init_struct = HashMapInit::<TestKey, TestValue>::init_in_shmem(1000, shmem);
let writer = init_struct.attach_writer();
let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
let mut rng = rand::rng();
for i in 0..10000 {
let key: TestKey = ((rng.next_u32() % 1000) as u128).into();
let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
apply_op(&op, &writer, &mut shadow);
if i % 1000 == 0 {
eprintln!("{i} ops processed");
//eprintln!("stats: {:?}", tree_writer.get_statistics());
//test_iter(&tree_writer, &shadow);
}
}
writer.grow(1500).unwrap();
for i in 0..10000 {
let key: TestKey = ((rng.next_u32() % 1500) as u128).into();
let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
apply_op(&op, &writer, &mut shadow);
if i % 1000 == 0 {
eprintln!("{i} ops processed");
//eprintln!("stats: {:?}", tree_writer.get_statistics());
//test_iter(&tree_writer, &shadow);
}
}
}

View File

@@ -1,418 +1,4 @@
//! Shared memory utilities for neon communicator
use std::num::NonZeroUsize;
use std::os::fd::{AsFd, BorrowedFd, OwnedFd};
use std::ptr::NonNull;
use std::sync::atomic::{AtomicUsize, Ordering};
use nix::errno::Errno;
use nix::sys::mman::MapFlags;
use nix::sys::mman::ProtFlags;
use nix::sys::mman::mmap as nix_mmap;
use nix::sys::mman::munmap as nix_munmap;
use nix::unistd::ftruncate as nix_ftruncate;
/// ShmemHandle represents a shared memory area that can be shared by processes over fork().
/// Unlike shared memory allocated by Postgres, this area is resizable, up to 'max_size' that's
/// specified at creation.
///
/// The area is backed by an anonymous file created with memfd_create(). The full address space for
/// 'max_size' is reserved up-front with mmap(), but whenever you call [`ShmemHandle::set_size`],
/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
/// will cause the file to be expanded, but we might use mprotect() etc. to enforce that in the
/// future.
pub struct ShmemHandle {
/// memfd file descriptor
fd: OwnedFd,
max_size: usize,
// Pointer to the beginning of the shared memory area. The header is stored there.
shared_ptr: NonNull<SharedStruct>,
// Pointer to the beginning of the user data
pub data_ptr: NonNull<u8>,
}
/// This is stored at the beginning in the shared memory area.
struct SharedStruct {
max_size: usize,
/// Current size of the backing file. The high-order bit is used for the RESIZE_IN_PROGRESS flag
current_size: AtomicUsize,
}
const RESIZE_IN_PROGRESS: usize = 1 << 63;
const HEADER_SIZE: usize = std::mem::size_of::<SharedStruct>();
/// Error type returned by the ShmemHandle functions.
#[derive(thiserror::Error, Debug)]
#[error("{msg}: {errno}")]
pub struct Error {
pub msg: String,
pub errno: Errno,
}
impl Error {
fn new(msg: &str, errno: Errno) -> Error {
Error {
msg: msg.to_string(),
errno,
}
}
}
impl ShmemHandle {
/// Create a new shared memory area. To communicate between processes, the processes need to be
/// fork()'d after calling this, so that the ShmemHandle is inherited by all processes.
///
/// If the ShmemHandle is dropped, the memory is unmapped from the current process. Other
/// processes can continue using it, however.
pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result<ShmemHandle, Error> {
// create the backing anonymous file.
let fd = create_backing_file(name)?;
Self::new_with_fd(fd, initial_size, max_size)
}
fn new_with_fd(
fd: OwnedFd,
initial_size: usize,
max_size: usize,
) -> Result<ShmemHandle, Error> {
// We reserve the high-order bit for the RESIZE_IN_PROGRESS flag, and the actual size
// is a little larger than this because of the SharedStruct header. Make the upper limit
// somewhat smaller than that, because with anything close to that, you'll run out of
// memory anyway.
if max_size >= 1 << 48 {
panic!("max size {} too large", max_size);
}
if initial_size > max_size {
panic!("initial size {initial_size} larger than max size {max_size}");
}
// The actual initial / max size is the one given by the caller, plus the size of
// 'SharedStruct'.
let initial_size = HEADER_SIZE + initial_size;
let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap();
// Reserve address space for it with mmap
//
// TODO: Use MAP_HUGETLB if possible
let start_ptr = unsafe {
nix_mmap(
None,
max_size,
ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
MapFlags::MAP_SHARED,
&fd,
0,
)
}
.map_err(|e| Error::new("mmap failed: {e}", e))?;
// Reserve space for the initial size
enlarge_file(fd.as_fd(), initial_size as u64)?;
// Initialize the header
let shared: NonNull<SharedStruct> = start_ptr.cast();
unsafe {
shared.write(SharedStruct {
max_size: max_size.into(),
current_size: AtomicUsize::new(initial_size),
})
};
// The user data begins after the header
let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) };
Ok(ShmemHandle {
fd,
max_size: max_size.into(),
shared_ptr: shared,
data_ptr,
})
}
// return reference to the header
fn shared(&self) -> &SharedStruct {
unsafe { self.shared_ptr.as_ref() }
}
/// Resize the shared memory area. 'new_size' must not be larger than the 'max_size' specified
/// when creating the area.
///
/// This may only be called from one process/thread concurrently. We detect that case
/// and return an Error.
pub fn set_size(&self, new_size: usize) -> Result<(), Error> {
let new_size = new_size + HEADER_SIZE;
let shared = self.shared();
if new_size > self.max_size {
panic!(
"new size ({} is greater than max size ({})",
new_size, self.max_size
);
}
assert_eq!(self.max_size, shared.max_size);
// Lock the area by setting the bit in 'current_size'
//
// Ordering::Relaxed would probably be sufficient here, as we don't access any other memory
// and the posix_fallocate/ftruncate call is surely a synchronization point anyway. But
// since this is not performance-critical, better safe than sorry.
let mut old_size = shared.current_size.load(Ordering::Acquire);
loop {
if (old_size & RESIZE_IN_PROGRESS) != 0 {
return Err(Error::new(
"concurrent resize detected",
Errno::UnknownErrno,
));
}
match shared.current_size.compare_exchange(
old_size,
new_size,
Ordering::Acquire,
Ordering::Relaxed,
) {
Ok(_) => break,
Err(x) => old_size = x,
}
}
// Ok, we got the lock.
//
// NB: If anything goes wrong, we *must* clear the bit!
let result = {
use std::cmp::Ordering::{Equal, Greater, Less};
match new_size.cmp(&old_size) {
Less => nix_ftruncate(&self.fd, new_size as i64).map_err(|e| {
Error::new("could not shrink shmem segment, ftruncate failed: {e}", e)
}),
Equal => Ok(()),
Greater => enlarge_file(self.fd.as_fd(), new_size as u64),
}
};
// Unlock
shared.current_size.store(
if result.is_ok() { new_size } else { old_size },
Ordering::Release,
);
result
}
/// Returns the current user-visible size of the shared memory segment.
///
/// NOTE: a concurrent set_size() call can change the size at any time. It is the caller's
/// responsibility not to access the area beyond the current size.
pub fn current_size(&self) -> usize {
let total_current_size =
self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS;
total_current_size - HEADER_SIZE
}
}
impl Drop for ShmemHandle {
fn drop(&mut self) {
// SAFETY: The pointer was obtained from mmap() with the given size.
// We unmap the entire region.
let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) };
// The fd is dropped automatically by OwnedFd.
}
}
/// Create a "backing file" for the shared memory area. On Linux, use memfd_create(), to create an
/// anonymous in-memory file. On macOS, fall back to a regular file. That's good enough for
/// development and testing, but in production we want the file to stay in memory.
///
/// disable 'unused_variables' warnings, because in the macos path, 'name' is unused.
#[allow(unused_variables)]
fn create_backing_file(name: &str) -> Result<OwnedFd, Error> {
#[cfg(not(target_os = "macos"))]
{
nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
.map_err(|e| Error::new("memfd_create failed: {e}", e))
}
#[cfg(target_os = "macos")]
{
let file = tempfile::tempfile().map_err(|e| {
Error::new(
"could not create temporary file to back shmem area: {e}",
nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
)
})?;
Ok(OwnedFd::from(file))
}
}
fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
// Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
// we don't get a segfault later when trying to actually use it.
#[cfg(not(target_os = "macos"))]
{
nix::fcntl::posix_fallocate(fd, 0, size as i64).map_err(|e| {
Error::new(
"could not grow shmem segment, posix_fallocate failed: {e}",
e,
)
})
}
// As a fallback on macOS, which doesn't have posix_fallocate, use plain ftruncate()
#[cfg(target_os = "macos")]
{
nix::unistd::ftruncate(fd, size as i64)
.map_err(|e| Error::new("could not grow shmem segment, ftruncate failed: {e}", e))
}
}
#[cfg(test)]
mod tests {
use super::*;
use nix::unistd::ForkResult;
use std::ops::Range;
/// check that all bytes in given range have the expected value.
fn assert_range(ptr: *const u8, expected: u8, range: Range<usize>) {
for i in range {
let b = unsafe { *(ptr.add(i)) };
assert_eq!(expected, b, "unexpected byte at offset {}", i);
}
}
/// Write 'b' to all bytes in the given range
fn write_range(ptr: *mut u8, b: u8, range: Range<usize>) {
unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) };
}
// simple single-process test of growing and shrinking
#[test]
fn test_shmem_resize() -> Result<(), Error> {
let max_size = 1024 * 1024;
let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?;
assert_eq!(init_struct.current_size(), 0);
// Initial grow
let size1 = 10000;
init_struct.set_size(size1).unwrap();
assert_eq!(init_struct.current_size(), size1);
// Write some data
let data_ptr = init_struct.data_ptr.as_ptr();
write_range(data_ptr, 0xAA, 0..size1);
assert_range(data_ptr, 0xAA, 0..size1);
// Shrink
let size2 = 5000;
init_struct.set_size(size2).unwrap();
assert_eq!(init_struct.current_size(), size2);
// Grow again
let size3 = 20000;
init_struct.set_size(size3).unwrap();
assert_eq!(init_struct.current_size(), size3);
// Try to read it. The area that was shrunk and grown again should read as all zeros now
assert_range(data_ptr, 0xAA, 0..5000);
assert_range(data_ptr, 0, 5000..size1);
// Try to grow beyond max_size
//let size4 = max_size + 1;
//assert!(init_struct.set_size(size4).is_err());
// Dropping init_struct should unmap the memory
drop(init_struct);
Ok(())
}
/// This is used in tests to coordinate between test processes. It's like std::sync::Barrier,
/// but is stored in the shared memory area and works across processes. It's implemented by
/// polling, because e.g. standard rust mutexes are not guaranteed to work across processes.
struct SimpleBarrier {
num_procs: usize,
count: AtomicUsize,
}
impl SimpleBarrier {
unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) {
unsafe {
*ptr = SimpleBarrier {
num_procs,
count: AtomicUsize::new(0),
}
}
}
pub fn wait(&self) {
let old = self.count.fetch_add(1, Ordering::Relaxed);
let generation = old / self.num_procs;
let mut current = old + 1;
while current < (generation + 1) * self.num_procs {
std::thread::sleep(std::time::Duration::from_millis(10));
current = self.count.load(Ordering::Relaxed);
}
}
}
#[test]
fn test_multi_process() {
// Initialize
let max_size = 1_000_000_000_000;
let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap();
let ptr = init_struct.data_ptr.as_ptr();
// Store the SimpleBarrier in the first 1k of the area.
init_struct.set_size(10000).unwrap();
let barrier_ptr: *mut SimpleBarrier = unsafe {
ptr.add(ptr.align_offset(std::mem::align_of::<SimpleBarrier>()))
.cast()
};
unsafe { SimpleBarrier::init(barrier_ptr, 2) };
let barrier = unsafe { barrier_ptr.as_ref().unwrap() };
// Fork another test process. The code after this runs in both processes concurrently.
let fork_result = unsafe { nix::unistd::fork().unwrap() };
// In the parent, fill bytes between 1000..2000. In the child, between 2000..3000
if fork_result.is_parent() {
write_range(ptr, 0xAA, 1000..2000);
} else {
write_range(ptr, 0xBB, 2000..3000);
}
barrier.wait();
// Verify the contents. (in both processes)
assert_range(ptr, 0xAA, 1000..2000);
assert_range(ptr, 0xBB, 2000..3000);
// Grow, from the child this time
let size = 10_000_000;
if !fork_result.is_parent() {
init_struct.set_size(size).unwrap();
}
barrier.wait();
// make some writes at the end
if fork_result.is_parent() {
write_range(ptr, 0xAA, (size - 10)..size);
} else {
write_range(ptr, 0xBB, (size - 20)..(size - 10));
}
barrier.wait();
// Verify the contents. (This runs in both processes)
assert_range(ptr, 0, (size - 1000)..(size - 20));
assert_range(ptr, 0xBB, (size - 20)..(size - 10));
assert_range(ptr, 0xAA, (size - 10)..size);
if let ForkResult::Parent { child } = fork_result {
nix::sys::wait::waitpid(child, None).unwrap();
}
}
}
pub mod hash;
pub mod shmem;

View File

@@ -0,0 +1,418 @@
//! Dynamically resizable contiguous chunk of shared memory
use std::num::NonZeroUsize;
use std::os::fd::{AsFd, BorrowedFd, OwnedFd};
use std::ptr::NonNull;
use std::sync::atomic::{AtomicUsize, Ordering};
use nix::errno::Errno;
use nix::sys::mman::MapFlags;
use nix::sys::mman::ProtFlags;
use nix::sys::mman::mmap as nix_mmap;
use nix::sys::mman::munmap as nix_munmap;
use nix::unistd::ftruncate as nix_ftruncate;
/// ShmemHandle represents a shared memory area that can be shared by processes over fork().
/// Unlike shared memory allocated by Postgres, this area is resizable, up to 'max_size' that's
/// specified at creation.
///
/// The area is backed by an anonymous file created with memfd_create(). The full address space for
/// 'max_size' is reserved up-front with mmap(), but whenever you call [`ShmemHandle::set_size`],
/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
/// will cause the file to be expanded, but we might use mprotect() etc. to enforce that in the
/// future.
pub struct ShmemHandle {
/// memfd file descriptor
fd: OwnedFd,
max_size: usize,
// Pointer to the beginning of the shared memory area. The header is stored there.
shared_ptr: NonNull<SharedStruct>,
// Pointer to the beginning of the user data
pub data_ptr: NonNull<u8>,
}
/// This is stored at the beginning in the shared memory area.
struct SharedStruct {
max_size: usize,
/// Current size of the backing file. The high-order bit is used for the RESIZE_IN_PROGRESS flag
current_size: AtomicUsize,
}
const RESIZE_IN_PROGRESS: usize = 1 << 63;
const HEADER_SIZE: usize = std::mem::size_of::<SharedStruct>();
/// Error type returned by the ShmemHandle functions.
#[derive(thiserror::Error, Debug)]
#[error("{msg}: {errno}")]
pub struct Error {
pub msg: String,
pub errno: Errno,
}
impl Error {
fn new(msg: &str, errno: Errno) -> Error {
Error {
msg: msg.to_string(),
errno,
}
}
}
impl ShmemHandle {
/// Create a new shared memory area. To communicate between processes, the processes need to be
/// fork()'d after calling this, so that the ShmemHandle is inherited by all processes.
///
/// If the ShmemHandle is dropped, the memory is unmapped from the current process. Other
/// processes can continue using it, however.
pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result<ShmemHandle, Error> {
// create the backing anonymous file.
let fd = create_backing_file(name)?;
Self::new_with_fd(fd, initial_size, max_size)
}
fn new_with_fd(
fd: OwnedFd,
initial_size: usize,
max_size: usize,
) -> Result<ShmemHandle, Error> {
// We reserve the high-order bit for the RESIZE_IN_PROGRESS flag, and the actual size
// is a little larger than this because of the SharedStruct header. Make the upper limit
// somewhat smaller than that, because with anything close to that, you'll run out of
// memory anyway.
if max_size >= 1 << 48 {
panic!("max size {} too large", max_size);
}
if initial_size > max_size {
panic!("initial size {initial_size} larger than max size {max_size}");
}
// The actual initial / max size is the one given by the caller, plus the size of
// 'SharedStruct'.
let initial_size = HEADER_SIZE + initial_size;
let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap();
// Reserve address space for it with mmap
//
// TODO: Use MAP_HUGETLB if possible
let start_ptr = unsafe {
nix_mmap(
None,
max_size,
ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
MapFlags::MAP_SHARED,
&fd,
0,
)
}
.map_err(|e| Error::new("mmap failed: {e}", e))?;
// Reserve space for the initial size
enlarge_file(fd.as_fd(), initial_size as u64)?;
// Initialize the header
let shared: NonNull<SharedStruct> = start_ptr.cast();
unsafe {
shared.write(SharedStruct {
max_size: max_size.into(),
current_size: AtomicUsize::new(initial_size),
})
};
// The user data begins after the header
let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) };
Ok(ShmemHandle {
fd,
max_size: max_size.into(),
shared_ptr: shared,
data_ptr,
})
}
// return reference to the header
fn shared(&self) -> &SharedStruct {
unsafe { self.shared_ptr.as_ref() }
}
/// Resize the shared memory area. 'new_size' must not be larger than the 'max_size' specified
/// when creating the area.
///
/// This may only be called from one process/thread concurrently. We detect that case
/// and return an Error.
pub fn set_size(&self, new_size: usize) -> Result<(), Error> {
let new_size = new_size + HEADER_SIZE;
let shared = self.shared();
if new_size > self.max_size {
panic!(
"new size ({} is greater than max size ({})",
new_size, self.max_size
);
}
assert_eq!(self.max_size, shared.max_size);
// Lock the area by setting the bit in 'current_size'
//
// Ordering::Relaxed would probably be sufficient here, as we don't access any other memory
// and the posix_fallocate/ftruncate call is surely a synchronization point anyway. But
// since this is not performance-critical, better safe than sorry.
let mut old_size = shared.current_size.load(Ordering::Acquire);
loop {
if (old_size & RESIZE_IN_PROGRESS) != 0 {
return Err(Error::new(
"concurrent resize detected",
Errno::UnknownErrno,
));
}
match shared.current_size.compare_exchange(
old_size,
new_size,
Ordering::Acquire,
Ordering::Relaxed,
) {
Ok(_) => break,
Err(x) => old_size = x,
}
}
// Ok, we got the lock.
//
// NB: If anything goes wrong, we *must* clear the bit!
let result = {
use std::cmp::Ordering::{Equal, Greater, Less};
match new_size.cmp(&old_size) {
Less => nix_ftruncate(&self.fd, new_size as i64).map_err(|e| {
Error::new("could not shrink shmem segment, ftruncate failed: {e}", e)
}),
Equal => Ok(()),
Greater => enlarge_file(self.fd.as_fd(), new_size as u64),
}
};
// Unlock
shared.current_size.store(
if result.is_ok() { new_size } else { old_size },
Ordering::Release,
);
result
}
/// Returns the current user-visible size of the shared memory segment.
///
/// NOTE: a concurrent set_size() call can change the size at any time. It is the caller's
/// responsibility not to access the area beyond the current size.
pub fn current_size(&self) -> usize {
let total_current_size =
self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS;
total_current_size - HEADER_SIZE
}
}
impl Drop for ShmemHandle {
fn drop(&mut self) {
// SAFETY: The pointer was obtained from mmap() with the given size.
// We unmap the entire region.
let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) };
// The fd is dropped automatically by OwnedFd.
}
}
/// Create a "backing file" for the shared memory area. On Linux, use memfd_create(), to create an
/// anonymous in-memory file. On macOS, fall back to a regular file. That's good enough for
/// development and testing, but in production we want the file to stay in memory.
///
/// disable 'unused_variables' warnings, because in the macos path, 'name' is unused.
#[allow(unused_variables)]
fn create_backing_file(name: &str) -> Result<OwnedFd, Error> {
#[cfg(not(target_os = "macos"))]
{
nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
.map_err(|e| Error::new("memfd_create failed: {e}", e))
}
#[cfg(target_os = "macos")]
{
let file = tempfile::tempfile().map_err(|e| {
Error::new(
"could not create temporary file to back shmem area: {e}",
nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
)
})?;
Ok(OwnedFd::from(file))
}
}
fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
// Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
// we don't get a segfault later when trying to actually use it.
#[cfg(not(target_os = "macos"))]
{
nix::fcntl::posix_fallocate(fd, 0, size as i64).map_err(|e| {
Error::new(
"could not grow shmem segment, posix_fallocate failed: {e}",
e,
)
})
}
// As a fallback on macOS, which doesn't have posix_fallocate, use plain ftruncate()
#[cfg(target_os = "macos")]
{
nix::unistd::ftruncate(fd, size as i64)
.map_err(|e| Error::new("could not grow shmem segment, ftruncate failed: {e}", e))
}
}
#[cfg(test)]
mod tests {
use super::*;
use nix::unistd::ForkResult;
use std::ops::Range;
/// check that all bytes in given range have the expected value.
fn assert_range(ptr: *const u8, expected: u8, range: Range<usize>) {
for i in range {
let b = unsafe { *(ptr.add(i)) };
assert_eq!(expected, b, "unexpected byte at offset {}", i);
}
}
/// Write 'b' to all bytes in the given range
fn write_range(ptr: *mut u8, b: u8, range: Range<usize>) {
unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) };
}
// simple single-process test of growing and shrinking
#[test]
fn test_shmem_resize() -> Result<(), Error> {
let max_size = 1024 * 1024;
let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?;
assert_eq!(init_struct.current_size(), 0);
// Initial grow
let size1 = 10000;
init_struct.set_size(size1).unwrap();
assert_eq!(init_struct.current_size(), size1);
// Write some data
let data_ptr = init_struct.data_ptr.as_ptr();
write_range(data_ptr, 0xAA, 0..size1);
assert_range(data_ptr, 0xAA, 0..size1);
// Shrink
let size2 = 5000;
init_struct.set_size(size2).unwrap();
assert_eq!(init_struct.current_size(), size2);
// Grow again
let size3 = 20000;
init_struct.set_size(size3).unwrap();
assert_eq!(init_struct.current_size(), size3);
// Try to read it. The area that was shrunk and grown again should read as all zeros now
assert_range(data_ptr, 0xAA, 0..5000);
assert_range(data_ptr, 0, 5000..size1);
// Try to grow beyond max_size
//let size4 = max_size + 1;
//assert!(init_struct.set_size(size4).is_err());
// Dropping init_struct should unmap the memory
drop(init_struct);
Ok(())
}
/// This is used in tests to coordinate between test processes. It's like std::sync::Barrier,
/// but is stored in the shared memory area and works across processes. It's implemented by
/// polling, because e.g. standard rust mutexes are not guaranteed to work across processes.
struct SimpleBarrier {
num_procs: usize,
count: AtomicUsize,
}
impl SimpleBarrier {
unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) {
unsafe {
*ptr = SimpleBarrier {
num_procs,
count: AtomicUsize::new(0),
}
}
}
pub fn wait(&self) {
let old = self.count.fetch_add(1, Ordering::Relaxed);
let generation = old / self.num_procs;
let mut current = old + 1;
while current < (generation + 1) * self.num_procs {
std::thread::sleep(std::time::Duration::from_millis(10));
current = self.count.load(Ordering::Relaxed);
}
}
}
#[test]
fn test_multi_process() {
// Initialize
let max_size = 1_000_000_000_000;
let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap();
let ptr = init_struct.data_ptr.as_ptr();
// Store the SimpleBarrier in the first 1k of the area.
init_struct.set_size(10000).unwrap();
let barrier_ptr: *mut SimpleBarrier = unsafe {
ptr.add(ptr.align_offset(std::mem::align_of::<SimpleBarrier>()))
.cast()
};
unsafe { SimpleBarrier::init(barrier_ptr, 2) };
let barrier = unsafe { barrier_ptr.as_ref().unwrap() };
// Fork another test process. The code after this runs in both processes concurrently.
let fork_result = unsafe { nix::unistd::fork().unwrap() };
// In the parent, fill bytes between 1000..2000. In the child, between 2000..3000
if fork_result.is_parent() {
write_range(ptr, 0xAA, 1000..2000);
} else {
write_range(ptr, 0xBB, 2000..3000);
}
barrier.wait();
// Verify the contents. (in both processes)
assert_range(ptr, 0xAA, 1000..2000);
assert_range(ptr, 0xBB, 2000..3000);
// Grow, from the child this time
let size = 10_000_000;
if !fork_result.is_parent() {
init_struct.set_size(size).unwrap();
}
barrier.wait();
// make some writes at the end
if fork_result.is_parent() {
write_range(ptr, 0xAA, (size - 10)..size);
} else {
write_range(ptr, 0xBB, (size - 20)..(size - 10));
}
barrier.wait();
// Verify the contents. (This runs in both processes)
assert_range(ptr, 0, (size - 1000)..(size - 20));
assert_range(ptr, 0xBB, (size - 20)..(size - 10));
assert_range(ptr, 0xAA, (size - 10)..size);
if let ForkResult::Parent { child } = fork_result {
nix::sys::wait::waitpid(child, None).unwrap();
}
}
}

View File

@@ -1,6 +1,5 @@
# pgxs/neon/Makefile
MODULE_big = neon
OBJS = \
$(WIN32RES) \
@@ -22,7 +21,8 @@ OBJS = \
walproposer.o \
walproposer_pg.o \
control_plane_connector.o \
walsender_hooks.o
walsender_hooks.o \
$(LIBCOMMUNICATOR_PATH)/libcommunicator.a
PG_CPPFLAGS = -I$(libpq_srcdir)
SHLIB_LINK_INTERNAL = $(libpq)

View File

@@ -0,0 +1,13 @@
[package]
name = "communicator"
version = "0.1.0"
edition = "2024"
[lib]
crate-type = ["staticlib"]
[dependencies]
neon-shmem.workspace = true
[build-dependencies]
cbindgen.workspace = true

View File

@@ -0,0 +1,8 @@
This package will evolve into a "compute-pageserver communicator"
process and machinery. For now, it just provides wrappers around the
neon-shmem Rust crate so that it can be used from the C implementation
of the LFC.
At compilation time, pgxn/neon/communicator/ produces a static
library, libcommunicator.a, which is linked into the neon.so extension
library.

View File

@@ -0,0 +1,22 @@
use std::env;
fn main() -> Result<(), Box<dyn std::error::Error>> {
let crate_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
cbindgen::generate(crate_dir).map_or_else(
|error| match error {
cbindgen::Error::ParseSyntaxError { .. } => {
// This means there was a syntax error in the Rust sources. Don't panic, because
// we want the build to continue and the Rust compiler to hit the error. The
// Rust compiler produces a better error message than cbindgen.
eprintln!("Generating C bindings failed because of a Rust syntax error");
}
e => panic!("Unable to generate C bindings: {:?}", e),
},
|bindings| {
bindings.write_to_file("communicator_bindings.h");
},
);
Ok(())
}

View File

@@ -0,0 +1,4 @@
language = "C"
[enum]
prefix_with_name = true

View File

@@ -0,0 +1,240 @@
//! Glue code to allow using the Rust shmem hash map implementation from C code
//!
//! For convenience of adapting existing code, the interface provided somewhat resembles the dynahash
//! interface.
//!
//! NOTE: The caller is responsible for locking! The caller is expected to hold the PostgreSQL
//! LWLock, 'lfc_lock', while accessing the hash table, in shared or exclusive mode as appropriate.
use std::ffi::c_void;
use std::marker::PhantomData;
use neon_shmem::hash::entry::Entry;
use neon_shmem::hash::{HashMapAccess, HashMapInit};
use neon_shmem::shmem::ShmemHandle;
/// NB: This must match the definition of BufferTag in Postgres C headers. We could use bindgen to
/// generate this from the C headers, but we prefer not to introduce a dependency on bindgen for now.
///
/// Note that there are no padding bytes. If the corresponding C struct has padding bytes, the C
/// code must clear them.
#[derive(Clone, Debug, Hash, Eq, PartialEq)]
#[repr(C)]
pub struct FileCacheKey {
pub _spc_id: u32,
pub _db_id: u32,
pub _rel_number: u32,
pub _fork_num: u32,
pub _block_num: u32,
}
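// Illustrative guard, not part of this change: because FileCacheKey must match the C BufferTag
// layout byte for byte, a compile-time check like this can catch accidental drift. The expected
// 20-byte size and 4-byte alignment follow from the five u32 fields above.
const _: () = assert!(std::mem::size_of::<FileCacheKey>() == 20);
const _: () = assert!(std::mem::align_of::<FileCacheKey>() == 4);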
/// Like with FileCacheKey, this must match the definition of FileCacheEntry in file_cache.c. We
/// don't look at the contents here, though; it's sufficient that the size and alignment match.
#[derive(Clone, Debug, Default)]
#[repr(C)]
pub struct FileCacheEntry {
pub _offset: u32,
pub _access_count: u32,
pub _prev: *mut FileCacheEntry,
pub _next: *mut FileCacheEntry,
pub _state: [u32; 8],
}
/// XXX: This could be just:
///
/// ```ignore
/// type FileCacheHashMapHandle<'a> = HashMapInit<'a, FileCacheKey, FileCacheEntry>;
/// ```
///
/// but with that, cbindgen generates a broken typedef in the C header file which doesn't
/// compile. It apparently gets confused by the generics.
#[repr(transparent)]
pub struct FileCacheHashMapHandle<'a>(
pub *mut c_void,
PhantomData<HashMapInit<'a, FileCacheKey, FileCacheEntry>>,
);
impl<'a> From<Box<HashMapInit<'a, FileCacheKey, FileCacheEntry>>> for FileCacheHashMapHandle<'a> {
fn from(x: Box<HashMapInit<'a, FileCacheKey, FileCacheEntry>>) -> Self {
FileCacheHashMapHandle(Box::into_raw(x) as *mut c_void, PhantomData::default())
}
}
impl<'a> From<FileCacheHashMapHandle<'a>> for Box<HashMapInit<'a, FileCacheKey, FileCacheEntry>> {
fn from(x: FileCacheHashMapHandle) -> Self {
unsafe { Box::from_raw(x.0.cast()) }
}
}
/// XXX: same for this
#[repr(transparent)]
pub struct FileCacheHashMapAccess<'a>(
pub *mut c_void,
PhantomData<HashMapAccess<'a, FileCacheKey, FileCacheEntry>>,
);
impl<'a> From<Box<HashMapAccess<'a, FileCacheKey, FileCacheEntry>>> for FileCacheHashMapAccess<'a> {
fn from(x: Box<HashMapAccess<'a, FileCacheKey, FileCacheEntry>>) -> Self {
// Convert the Box into a raw mutable pointer to the HashMapAccess itself.
// This transfers ownership of the HashMapAccess (and its contained ShmemHandle)
// to the raw pointer. The C caller is now responsible for managing this memory.
FileCacheHashMapAccess(Box::into_raw(x) as *mut c_void, PhantomData::default())
}
}
impl<'a> FileCacheHashMapAccess<'a> {
fn as_ref(self) -> &'a HashMapAccess<'a, FileCacheKey, FileCacheEntry> {
let ptr: *mut HashMapAccess<'_, FileCacheKey, FileCacheEntry> = self.0.cast();
unsafe { ptr.as_ref().unwrap() }
}
fn as_mut(self) -> &'a mut HashMapAccess<'a, FileCacheKey, FileCacheEntry> {
let ptr: *mut HashMapAccess<'_, FileCacheKey, FileCacheEntry> = self.0.cast();
unsafe { ptr.as_mut().unwrap() }
}
}
/// Initialize the shared memory area at postmaster startup. The returned handle is inherited
/// by all the backend processes across fork()
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_file_cache_shmem_init<'a>(
initial_num_buckets: u32,
max_num_buckets: u32,
) -> FileCacheHashMapHandle<'a> {
let max_bytes = HashMapInit::<FileCacheKey, FileCacheEntry>::estimate_size(max_num_buckets);
let shmem_handle =
ShmemHandle::new("lfc mapping", 0, max_bytes).expect("shmem initialization failed");
let handle = HashMapInit::<FileCacheKey, FileCacheEntry>::init_in_shmem(
initial_num_buckets,
shmem_handle,
);
Box::new(handle).into()
}
/// Initialize the access to the shared memory area in a backend process.
///
/// XXX: I'm not sure if this actually gets called in each process, or if the returned struct
/// is also inherited across fork(). It currently works either way, but if this did more
/// initialization that needed to happen after fork(), it would matter.
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_file_cache_shmem_access<'a>(
handle: FileCacheHashMapHandle<'a>,
) -> FileCacheHashMapAccess<'a> {
let handle: Box<HashMapInit<'_, FileCacheKey, FileCacheEntry>> = handle.into();
Box::new(handle.attach_writer()).into()
}
/// Return the current number of buckets in the hash table
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_file_cache_get_num_buckets<'a>(
map: FileCacheHashMapAccess<'static>,
) -> u32 {
let map = map.as_ref();
map.get_num_buckets().try_into().unwrap()
}
/// Look up the entry with given key and hash.
///
/// This is similar to dynahash's hash_search(... , HASH_FIND)
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_file_cache_hash_find<'a>(
map: FileCacheHashMapAccess<'static>,
key: &FileCacheKey,
hash: u64,
) -> Option<&'static FileCacheEntry> {
let map = map.as_ref();
map.get_with_hash(key, hash)
}
/// Look up the entry at given bucket position
///
/// This has no direct equivalent in the dynahash interface, but can be used to
/// iterate through all entries in the hash table.
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_file_cache_hash_get_at_pos<'a>(
map: FileCacheHashMapAccess<'static>,
pos: u32,
) -> Option<&'static FileCacheEntry> {
let map = map.as_ref();
map.get_at_bucket(pos as usize).map(|(_k, v)| v)
}
/// Remove entry, given a pointer to the value.
///
/// This is equivalent to dynahash hash_search(entry->key, HASH_REMOVE), where 'entry'
/// is an entry you have previously looked up
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_file_cache_hash_remove_entry<'a, 'b>(
map: FileCacheHashMapAccess,
entry: *mut FileCacheEntry,
) {
let map = map.as_mut();
let pos = map.get_bucket_for_value(entry);
match map.entry_at_bucket(pos) {
Some(e) => {
e.remove();
}
None => {
// todo: shouldn't happen, panic?
}
}
}
/// Compute the hash for given key
///
/// This is equivalent to dynahash get_hash_value() function. We use Rust's default hasher
/// for calculating the hash though.
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_file_cache_get_hash_value<'a, 'b>(
map: FileCacheHashMapAccess<'static>,
key: &FileCacheKey,
) -> u64 {
map.as_ref().get_hash_value(key)
}
/// Insert a new entry to the hash table
///
/// This is equivalent to dynahash hash_search(..., HASH_ENTER).
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_file_cache_hash_enter<'a, 'b>(
map: FileCacheHashMapAccess,
key: &FileCacheKey,
hash: u64,
found: &mut bool,
) -> *mut FileCacheEntry {
match map.as_mut().entry_with_hash(key.clone(), hash) {
Entry::Occupied(mut e) => {
*found = true;
e.get_mut()
}
Entry::Vacant(e) => {
*found = false;
let initial_value = FileCacheEntry::default();
e.insert(initial_value).expect("TODO: hash table full")
}
}
}
/// Get the key for a given entry, which must be present in the hash table.
///
/// Dynahash requires the key to be part of the "value" struct, so you can always
/// access the key with something like `entry->key`. The Rust implementation however
/// stores the key separately. This function extracts the separately stored key.
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_file_cache_hash_get_key_for_entry<'a, 'b>(
map: FileCacheHashMapAccess,
entry: *const FileCacheEntry,
) -> Option<&FileCacheKey> {
let map = map.as_ref();
let pos = map.get_bucket_for_value(entry);
map.get_at_bucket(pos as usize).map(|(k, _v)| k)
}
/// Remove all entries from the hash table
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_file_cache_hash_reset<'a, 'b>(map: FileCacheHashMapAccess) {
let map = map.as_mut();
let num_buckets = map.get_num_buckets();
for i in 0..num_buckets {
if let Some(e) = map.entry_at_bucket(i) {
e.remove();
}
}
}
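
Putting the pieces together, here is a hedged sketch of how a C caller might drive this dynahash-style flow. The real call sites in file_cache.c below go through thin file_cache_hash_* wrapper names; the bcomm_* symbols and the locking requirement come from this module, while the casts and type names for the generated header are assumptions.

```c
/* Sketch only: lookup-or-insert under 'lfc_lock', mirroring dynahash's
 * get_hash_value() + hash_search(..., HASH_ENTER) pattern. The casts assume
 * the generated header names the key/entry types RustFileCacheKey/Entry. */
static void
example_lookup_or_insert(FileCacheHashMapAccess map, const BufferTag *tag)
{
	uint64		hash = bcomm_file_cache_get_hash_value(map, (const RustFileCacheKey *) tag);
	bool		found;
	RustFileCacheEntry *entry;

	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
	entry = bcomm_file_cache_hash_enter(map, (const RustFileCacheKey *) tag, hash, &found);
	if (!found)
	{
		/* freshly inserted: initialize offset, access_count, state bits, ... */
	}
	/* ... use 'entry' while still holding the lock ... */
	LWLockRelease(lfc_lock);
}
```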

View File

@@ -0,0 +1 @@
pub mod file_cache_hashmap;

View File

@@ -21,7 +21,7 @@
#include "access/xlog.h"
#include "funcapi.h"
#include "miscadmin.h"
#include "common/hashfn.h"
#include "common/file_utils.h"
#include "pgstat.h"
#include "port/pg_iovec.h"
#include "postmaster/bgworker.h"
@@ -36,7 +36,6 @@
#include "storage/procsignal.h"
#include "tcop/tcopprot.h"
#include "utils/builtins.h"
#include "utils/dynahash.h"
#include "utils/guc.h"
#if PG_VERSION_NUM >= 150000
@@ -46,6 +45,7 @@
#include "hll.h"
#include "bitmap.h"
#include "file_cache.h"
#include "file_cache_rust_hash.h"
#include "neon.h"
#include "neon_lwlsncache.h"
#include "neon_perf_counters.h"
@@ -64,7 +64,7 @@
*
* Cache is always reconstructed at node startup, so we do not need to save mapping somewhere and worry about
* its consistency.
*
*
* ## Holes
*
@@ -76,13 +76,15 @@
* fallocate(FALLOC_FL_PUNCH_HOLE) call. The nominal size of the file doesn't
* shrink, but the disk space it uses does.
*
* Each hole is tracked by a dummy FileCacheEntry, which are kept in the
* 'holes' linked list. They are entered into the chunk hash table, with a
* special key where the blockNumber is used to store the 'offset' of the
* hole, and all other fields are zero. Holes are never looked up in the hash
* table, we only enter them there to have a FileCacheEntry that we can keep
* in the linked list. If the soft limit is raised again, we reuse the holes
* before extending the nominal size of the file.
* Each hole is tracked in a freelist. The freelist consists of two parts: a
* fixed-size array in shared memory, and a linked chain of on-disk
* blocks. When the in-memory array fills up, it's flushed to a new on-disk
* chunk. If the soft limit is raised again, we reuse the holes before
* extending the nominal size of the file.
*
* The in-memory freelist array is protected by 'lfc_lock', while the on-disk
* chain is protected by a separate 'lfc_freelist_lock'. Locking rule to
* avoid deadlocks: always acquire lfc_freelist_lock first, then lfc_lock.
*/
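
lfc_change_limit_hook in this diff follows exactly this ordering; as a compact illustration of the rule (a sketch, not new code from the patch):

```c
/* Lock-ordering sketch: take lfc_freelist_lock first, then lfc_lock,
 * and release in the reverse order, so shrink and refill paths can't deadlock. */
LWLockAcquire(lfc_freelist_lock, LW_EXCLUSIVE);
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
/* ... punch holes and push the freed offsets with freelist_push() ... */
LWLockRelease(lfc_lock);
LWLockRelease(lfc_freelist_lock);
```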
/* Local file storage allocation chunk.
@@ -92,13 +94,15 @@
* 1Mb chunks can reduce hash map size to 320Mb.
* 2. Improve access locality, subsequent pages will be allocated together improving seqscan speed
*/
#define MAX_BLOCKS_PER_CHUNK_LOG 7 /* 1Mb chunk */
#define MAX_BLOCKS_PER_CHUNK (1 << MAX_BLOCKS_PER_CHUNK_LOG)
#define BLOCKS_PER_CHUNK_LOG 7 /* 1Mb chunk */
#define BLOCKS_PER_CHUNK (1 << BLOCKS_PER_CHUNK_LOG)
#define MB ((uint64)1024*1024)
#define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ >> lfc_chunk_size_log))
#define BLOCK_TO_CHUNK_OFF(blkno) ((blkno) & (lfc_blocks_per_chunk-1))
#define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ >> BLOCKS_PER_CHUNK_LOG))
#define BLOCK_TO_CHUNK_OFF(blkno) ((blkno) & (BLOCKS_PER_CHUNK-1))
#define INVALID_OFFSET (0xffffffff)
/*
* Blocks are read or written to LFC file outside LFC critical section.
@@ -119,15 +123,18 @@ typedef enum FileCacheBlockState
typedef struct FileCacheEntry
{
BufferTag key;
uint32 hash;
uint32 offset;
uint32 access_count;
dlist_node list_node; /* LRU/holes list node */
uint32 state[FLEXIBLE_ARRAY_MEMBER]; /* two bits per block */
dlist_node list_node; /* LRU list node */
uint32 state[(BLOCKS_PER_CHUNK * 2 + 31) / 32]; /* two bits per block */
} FileCacheEntry;
#define FILE_CACHE_ENRTY_SIZE MAXALIGN(offsetof(FileCacheEntry, state) + (lfc_blocks_per_chunk*2+31)/32*4)
/* Todo: alignment must be the same too */
StaticAssertDecl(sizeof(FileCacheEntry) == sizeof(RustFileCacheEntry),
"Rust and C declarations of FileCacheEntry are incompatible");
StaticAssertDecl(sizeof(BufferTag) == sizeof(RustFileCacheKey),
"Rust and C declarations of FileCacheKey are incompatible");
#define GET_STATE(entry, i) (((entry)->state[(i) / 16] >> ((i) % 16 * 2)) & 3)
#define SET_STATE(entry, i, new_state) (entry)->state[(i) / 16] = ((entry)->state[(i) / 16] & ~(3 << ((i) % 16 * 2))) | ((new_state) << ((i) % 16 * 2))
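
Each uint32 word therefore holds the 2-bit state of 16 blocks; block 20, for example, lives in state[1] at bit offset 8. A tiny sketch of the round trip (the wrapper function is hypothetical):

```c
/* Sketch: block 20 -> word 20/16 == 1, bit offset (20 % 16) * 2 == 8. */
static void
example_mark_block_available(FileCacheEntry *entry)
{
	SET_STATE(entry, 20, AVAILABLE);			/* writes bits 8-9 of entry->state[1] */
	Assert(GET_STATE(entry, 20) == AVAILABLE);	/* reads the same two bits back */
}
```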
@@ -136,6 +143,9 @@ typedef struct FileCacheEntry
#define MAX_PREWARM_WORKERS 8
#define FREELIST_ENTRIES_PER_CHUNK (BLOCKS_PER_CHUNK * BLCKSZ / sizeof(uint32) - 2)
typedef struct PrewarmWorkerState
{
uint32 prewarmed_pages;
@@ -161,7 +171,6 @@ typedef struct FileCacheControl
uint64 evicted_pages; /* number of evicted pages */
dlist_head lru; /* double linked list for LRU replacement
* algorithm */
dlist_head holes; /* double linked list of punched holes */
HyperLogLogState wss_estimation; /* estimation of working set size */
ConditionVariable cv[N_COND_VARS]; /* turnstile of condition variables */
PrewarmWorkerState prewarm_workers[MAX_PREWARM_WORKERS];
@@ -172,23 +181,39 @@ typedef struct FileCacheControl
bool prewarm_active;
bool prewarm_canceled;
dsm_handle prewarm_lfc_state_handle;
/*
* Free list. This is large enough to hold one chunk's worth of entries.
*/
uint32 freelist_size;
uint32 freelist_head;
uint32 num_free_pages;
uint32 free_pages[FREELIST_ENTRIES_PER_CHUNK];
} FileCacheControl;
typedef struct FreeListChunk
{
uint32 next;
uint32 num_free_pages;
uint32 free_pages[FREELIST_ENTRIES_PER_CHUNK];
} FreeListChunk;
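
With the compile-time chunk size above (BLOCKS_PER_CHUNK = 128) and the default BLCKSZ of 8192, one on-disk freelist chunk is exactly 1 MB and holds 128 * 8192 / 4 - 2 = 262,142 offsets; the "- 2" leaves room for the next and num_free_pages header fields. A sketch of the invariant this layout relies on (an assumed check, not part of the patch):

```c
/* Sketch: the on-disk freelist chunk must fit within one LFC chunk. */
StaticAssertDecl(sizeof(FreeListChunk) <= BLOCKS_PER_CHUNK * BLCKSZ,
				 "FreeListChunk must fit in a single LFC chunk");
```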
#define FILE_CACHE_STATE_MAGIC 0xfcfcfcfc
#define FILE_CACHE_STATE_BITMAP(fcs) ((uint8*)&(fcs)->chunks[(fcs)->n_chunks])
#define FILE_CACHE_STATE_SIZE_FOR_CHUNKS(n_chunks) (sizeof(FileCacheState) + (n_chunks)*sizeof(BufferTag) + (((n_chunks) * lfc_blocks_per_chunk)+7)/8)
#define FILE_CACHE_STATE_SIZE_FOR_CHUNKS(n_chunks) (sizeof(FileCacheState) + (n_chunks)*sizeof(BufferTag) + (((n_chunks) * BLOCKS_PER_CHUNK)+7)/8)
#define FILE_CACHE_STATE_SIZE(fcs) (sizeof(FileCacheState) + (fcs->n_chunks)*sizeof(BufferTag) + (((fcs->n_chunks) << fcs->chunk_size_log)+7)/8)
static HTAB *lfc_hash;
static FileCacheHashMapHandle lfc_hash_handle;
static FileCacheHashMapAccess lfc_hash;
static int lfc_desc = -1;
static LWLockId lfc_lock;
static LWLockId lfc_freelist_lock;
static int lfc_max_size;
static int lfc_size_limit;
static int lfc_prewarm_limit;
static int lfc_prewarm_batch;
static int lfc_chunk_size_log = MAX_BLOCKS_PER_CHUNK_LOG;
static int lfc_blocks_per_chunk = MAX_BLOCKS_PER_CHUNK;
static int lfc_blocks_per_chunk_ro = BLOCKS_PER_CHUNK;
static char *lfc_path;
static uint64 lfc_generation;
static FileCacheControl *lfc_ctl;
@@ -205,6 +230,11 @@ bool AmPrewarmWorker;
#define LFC_ENABLED() (lfc_ctl->limit != 0)
static bool freelist_push(uint32 offset);
static bool freelist_prepare_pop(void);
static uint32 freelist_pop(void);
static bool freelist_is_empty(void);
/*
* Close LFC file if opened.
* All backends should close their LFC files once LFC is disabled.
@@ -232,15 +262,9 @@ lfc_switch_off(void)
if (LFC_ENABLED())
{
HASH_SEQ_STATUS status;
FileCacheEntry *entry;
/* Invalidate hash */
hash_seq_init(&status, lfc_hash);
while ((entry = hash_seq_search(&status)) != NULL)
{
hash_search_with_hash_value(lfc_hash, &entry->key, entry->hash, HASH_REMOVE, NULL);
}
file_cache_hash_reset(lfc_hash);
lfc_ctl->generation += 1;
lfc_ctl->size = 0;
lfc_ctl->pinned = 0;
@@ -248,7 +272,9 @@ lfc_switch_off(void)
lfc_ctl->used_pages = 0;
lfc_ctl->limit = 0;
dlist_init(&lfc_ctl->lru);
dlist_init(&lfc_ctl->holes);
lfc_ctl->freelist_head = INVALID_OFFSET;
lfc_ctl->num_free_pages = 0;
/*
* We need to use unlink to avoid races in LFC write, because it is not
@@ -317,8 +343,8 @@ lfc_ensure_opened(void)
static void
lfc_shmem_startup(void)
{
size_t size;
bool found;
static HASHCTL info;
if (prev_shmem_startup_hook)
{
@@ -327,27 +353,29 @@ lfc_shmem_startup(void)
LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
lfc_ctl = (FileCacheControl *) ShmemInitStruct("lfc", sizeof(FileCacheControl), &found);
size = sizeof(FileCacheControl);
lfc_ctl = (FileCacheControl *) ShmemInitStruct("lfc", size, &found);
if (!found)
{
int fd;
uint32 n_chunks = SIZE_MB_TO_CHUNKS(lfc_max_size);
lfc_lock = (LWLockId) GetNamedLWLockTranche("lfc_lock");
info.keysize = sizeof(BufferTag);
info.entrysize = FILE_CACHE_ENRTY_SIZE;
lfc_freelist_lock = (LWLockId) GetNamedLWLockTranche("lfc_freelist_lock");
/*
* n_chunks+1 because we add new element to hash table before eviction
* of victim
*/
lfc_hash = ShmemInitHash("lfc_hash",
n_chunks + 1, n_chunks + 1,
&info,
HASH_ELEM | HASH_BLOBS);
memset(lfc_ctl, 0, sizeof(FileCacheControl));
lfc_hash_handle = file_cache_hash_shmem_init(n_chunks + 1, n_chunks + 1);
memset(lfc_ctl, 0, offsetof(FileCacheControl, free_pages));
dlist_init(&lfc_ctl->lru);
dlist_init(&lfc_ctl->holes);
lfc_ctl->freelist_size = FREELIST_ENTRIES_PER_CHUNK;
lfc_ctl->freelist_head = INVALID_OFFSET;
lfc_ctl->num_free_pages = 0;
/* Initialize hyper-log-log structure for estimating working set size */
initSHLL(&lfc_ctl->wss_estimation);
@@ -371,18 +399,25 @@ lfc_shmem_startup(void)
}
LWLockRelease(AddinShmemInitLock);
lfc_hash = file_cache_hash_shmem_access(lfc_hash_handle);
}
static void
lfc_shmem_request(void)
{
size_t size;
#if PG_VERSION_NUM>=150000
if (prev_shmem_request_hook)
prev_shmem_request_hook();
#endif
RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size) + 1, FILE_CACHE_ENRTY_SIZE));
size = sizeof(FileCacheControl);
RequestAddinShmemSpace(size);
RequestNamedLWLockTranche("lfc_lock", 1);
RequestNamedLWLockTranche("lfc_freelist_lock", 2);
}
static bool
@@ -398,24 +433,6 @@ is_normal_backend(void)
return lfc_ctl && MyProc && UsedShmemSegAddr && !IsParallelWorker();
}
static bool
lfc_check_chunk_size(int *newval, void **extra, GucSource source)
{
if (*newval & (*newval - 1))
{
elog(ERROR, "LFC chunk size should be power of two");
return false;
}
return true;
}
static void
lfc_change_chunk_size(int newval, void* extra)
{
lfc_chunk_size_log = pg_ceil_log2_32(newval);
}
static bool
lfc_check_limit_hook(int *newval, void **extra, GucSource source)
{
@@ -435,12 +452,14 @@ lfc_change_limit_hook(int newval, void *extra)
if (!lfc_ctl || !is_normal_backend())
return;
LWLockAcquire(lfc_freelist_lock, LW_EXCLUSIVE);
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
/* Open LFC file only if LFC was enabled or we are going to reenable it */
if (newval == 0 && !LFC_ENABLED())
{
LWLockRelease(lfc_lock);
LWLockRelease(lfc_freelist_lock);
/* File should be reopened if LFC is reenabled */
lfc_close_file();
return;
@@ -449,6 +468,7 @@ lfc_change_limit_hook(int newval, void *extra)
if (!lfc_ensure_opened())
{
LWLockRelease(lfc_lock);
LWLockRelease(lfc_freelist_lock);
return;
}
@@ -464,35 +484,30 @@ lfc_change_limit_hook(int newval, void *extra)
* returning their space to file system
*/
FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru));
FileCacheEntry *hole;
uint32 offset = victim->offset;
uint32 hash;
bool found;
BufferTag holetag;
CriticalAssert(victim->access_count == 0);
#ifdef FALLOC_FL_PUNCH_HOLE
if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * lfc_blocks_per_chunk * BLCKSZ, lfc_blocks_per_chunk * BLCKSZ) < 0)
if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * BLOCKS_PER_CHUNK * BLCKSZ, BLOCKS_PER_CHUNK * BLCKSZ) < 0)
neon_log(LOG, "Failed to punch hole in file: %m");
#endif
/* We remove the old entry, and re-enter a hole to the hash table */
for (int i = 0; i < lfc_blocks_per_chunk; i++)
/* We remove the entry, and enter a hole to the freelist */
for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
{
bool is_page_cached = GET_STATE(victim, i) == AVAILABLE;
lfc_ctl->used_pages -= is_page_cached;
lfc_ctl->evicted_pages += is_page_cached;
}
hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
file_cache_hash_remove_entry(lfc_hash, victim);
memset(&holetag, 0, sizeof(holetag));
holetag.blockNum = offset;
hash = get_hash_value(lfc_hash, &holetag);
hole = hash_search_with_hash_value(lfc_hash, &holetag, hash, HASH_ENTER, &found);
hole->hash = hash;
hole->offset = offset;
hole->access_count = 0;
CriticalAssert(!found);
dlist_push_tail(&lfc_ctl->holes, &hole->list_node);
if (!freelist_push(offset))
{
/* freelist_push already logged the error */
lfc_switch_off();
LWLockRelease(lfc_lock);
LWLockRelease(lfc_freelist_lock);
return;
}
lfc_ctl->used -= 1;
}
@@ -504,6 +519,7 @@ lfc_change_limit_hook(int newval, void *extra)
neon_log(DEBUG1, "set local file cache limit to %d", new_size);
LWLockRelease(lfc_lock);
LWLockRelease(lfc_freelist_lock);
}
void
@@ -579,14 +595,14 @@ lfc_init(void)
DefineCustomIntVariable("neon.file_cache_chunk_size",
"LFC chunk size in blocks (should be power of two)",
NULL,
&lfc_blocks_per_chunk,
MAX_BLOCKS_PER_CHUNK,
1,
MAX_BLOCKS_PER_CHUNK,
PGC_POSTMASTER,
&lfc_blocks_per_chunk_ro,
BLOCKS_PER_CHUNK,
BLOCKS_PER_CHUNK,
BLOCKS_PER_CHUNK,
PGC_INTERNAL,
GUC_UNIT_BLOCKS,
lfc_check_chunk_size,
lfc_change_chunk_size,
NULL,
NULL,
NULL);
DefineCustomIntVariable("neon.file_cache_prewarm_limit",
@@ -649,19 +665,19 @@ lfc_get_state(size_t max_entries)
fcs = (FileCacheState*)palloc0(state_size);
SET_VARSIZE(fcs, state_size);
fcs->magic = FILE_CACHE_STATE_MAGIC;
fcs->chunk_size_log = lfc_chunk_size_log;
fcs->chunk_size_log = BLOCKS_PER_CHUNK_LOG;
fcs->n_chunks = n_entries;
bitmap = FILE_CACHE_STATE_BITMAP(fcs);
dlist_reverse_foreach(iter, &lfc_ctl->lru)
{
FileCacheEntry *entry = dlist_container(FileCacheEntry, list_node, iter.cur);
fcs->chunks[i] = entry->key;
for (int j = 0; j < lfc_blocks_per_chunk; j++)
fcs->chunks[i] = *file_cache_hash_get_key_for_entry(lfc_hash, entry);
for (int j = 0; j < BLOCKS_PER_CHUNK; j++)
{
if (GET_STATE(entry, j) != UNAVAILABLE)
{
BITMAP_SET(bitmap, i*lfc_blocks_per_chunk + j);
BITMAP_SET(bitmap, i*BLOCKS_PER_CHUNK + j);
n_pages += 1;
}
}
@@ -670,7 +686,7 @@ lfc_get_state(size_t max_entries)
}
Assert(i == n_entries);
fcs->n_pages = n_pages;
Assert(pg_popcount((char*)bitmap, ((n_entries << lfc_chunk_size_log) + 7)/8) == n_pages);
Assert(pg_popcount((char*)bitmap, ((n_entries << BLOCKS_PER_CHUNK_LOG) + 7)/8) == n_pages);
elog(LOG, "LFC: save state of %d chunks %d pages", (int)n_entries, (int)n_pages);
}
@@ -726,7 +742,7 @@ lfc_prewarm(FileCacheState* fcs, uint32 n_workers)
}
fcs_chunk_size_log = fcs->chunk_size_log;
if (fcs_chunk_size_log > MAX_BLOCKS_PER_CHUNK_LOG)
if (fcs_chunk_size_log > BLOCKS_PER_CHUNK_LOG)
{
elog(ERROR, "LFC: Invalid chunk size log: %u", fcs->chunk_size_log);
}
@@ -945,7 +961,7 @@ lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks)
{
BufferTag tag;
FileCacheEntry *entry;
uint32 hash;
uint64 hash;
if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
return;
@@ -958,14 +974,14 @@ lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks)
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
if (LFC_ENABLED())
{
for (BlockNumber blkno = 0; blkno < nblocks; blkno += lfc_blocks_per_chunk)
for (BlockNumber blkno = 0; blkno < nblocks; blkno += BLOCKS_PER_CHUNK)
{
tag.blockNum = blkno;
hash = get_hash_value(lfc_hash, &tag);
entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
hash = file_cache_hash_get_hash_value(lfc_hash, &tag);
entry = file_cache_hash_find(lfc_hash, &tag, hash);
if (entry != NULL)
{
for (int i = 0; i < lfc_blocks_per_chunk; i++)
for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
{
if (GET_STATE(entry, i) == AVAILABLE)
{
@@ -990,7 +1006,7 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
FileCacheEntry *entry;
int chunk_offs = BLOCK_TO_CHUNK_OFF(blkno);
bool found = false;
uint32 hash;
uint64 hash;
if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
return false;
@@ -1000,12 +1016,12 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
tag.blockNum = blkno - chunk_offs;
CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
hash = get_hash_value(lfc_hash, &tag);
hash = file_cache_hash_get_hash_value(lfc_hash, &tag);
LWLockAcquire(lfc_lock, LW_SHARED);
if (LFC_ENABLED())
{
entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
entry = file_cache_hash_find(lfc_hash, &tag, hash);
found = entry != NULL && GET_STATE(entry, chunk_offs) != UNAVAILABLE;
}
LWLockRelease(lfc_lock);
@@ -1024,7 +1040,7 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
FileCacheEntry *entry;
uint32 chunk_offs;
int found = 0;
uint32 hash;
uint64 hash;
int i = 0;
if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
@@ -1037,7 +1053,7 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
chunk_offs = BLOCK_TO_CHUNK_OFF(blkno);
tag.blockNum = blkno - chunk_offs;
hash = get_hash_value(lfc_hash, &tag);
hash = file_cache_hash_get_hash_value(lfc_hash, &tag);
LWLockAcquire(lfc_lock, LW_SHARED);
@@ -1048,12 +1064,12 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
}
while (true)
{
int this_chunk = Min(nblocks - i, lfc_blocks_per_chunk - chunk_offs);
entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
int this_chunk = Min(nblocks - i, BLOCKS_PER_CHUNK - chunk_offs);
entry = file_cache_hash_find(lfc_hash, &tag, hash);
if (entry != NULL)
{
for (; chunk_offs < lfc_blocks_per_chunk && i < nblocks; chunk_offs++, i++)
for (; chunk_offs < BLOCKS_PER_CHUNK && i < nblocks; chunk_offs++, i++)
{
if (GET_STATE(entry, chunk_offs) != UNAVAILABLE)
{
@@ -1079,7 +1095,7 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
*/
chunk_offs = BLOCK_TO_CHUNK_OFF(blkno + i);
tag.blockNum = (blkno + i) - chunk_offs;
hash = get_hash_value(lfc_hash, &tag);
hash = file_cache_hash_get_hash_value(lfc_hash, &tag);
}
LWLockRelease(lfc_lock);
@@ -1128,7 +1144,7 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
BufferTag tag;
FileCacheEntry *entry;
ssize_t rc;
uint32 hash;
uint64 hash;
uint64 generation;
uint32 entry_offset;
int blocks_read = 0;
@@ -1154,9 +1170,9 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
while (nblocks > 0)
{
struct iovec iov[PG_IOV_MAX];
uint8 chunk_mask[MAX_BLOCKS_PER_CHUNK / 8] = {0};
uint8 chunk_mask[BLOCKS_PER_CHUNK / 8] = {0};
int chunk_offs = BLOCK_TO_CHUNK_OFF(blkno);
int blocks_in_chunk = Min(nblocks, lfc_blocks_per_chunk - chunk_offs);
int blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - chunk_offs);
int iteration_hits = 0;
int iteration_misses = 0;
uint64 io_time_us = 0;
@@ -1206,7 +1222,7 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
Assert(iov_last_used - first_block_in_chunk_read >= n_blocks_to_read);
tag.blockNum = blkno - chunk_offs;
hash = get_hash_value(lfc_hash, &tag);
hash = file_cache_hash_get_hash_value(lfc_hash, &tag);
cv = &lfc_ctl->cv[hash % N_COND_VARS];
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
@@ -1219,13 +1235,13 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
return blocks_read;
}
entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
entry = file_cache_hash_find(lfc_hash, &tag, hash);
/* Approximate working set for the blocks assumed in this entry */
for (int i = 0; i < blocks_in_chunk; i++)
{
tag.blockNum = blkno + i;
addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
addSHLL(&lfc_ctl->wss_estimation, file_cache_hash_get_hash_value(lfc_hash, &tag));
}
if (entry == NULL)
@@ -1296,7 +1312,7 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
if (iteration_hits != 0)
{
/* chunk offset (# of pages) into the LFC file */
off_t first_read_offset = (off_t) entry_offset * lfc_blocks_per_chunk;
off_t first_read_offset = (off_t) entry_offset * BLOCKS_PER_CHUNK;
int nwrite = iov_last_used - first_block_in_chunk_read;
/* offset of first IOV */
first_read_offset += chunk_offs + first_block_in_chunk_read;
@@ -1373,14 +1389,14 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
* Returns false if there are no unpinned entries and chunk can not be added.
*/
static bool
lfc_init_new_entry(FileCacheEntry* entry, uint32 hash)
lfc_init_new_entry(FileCacheEntry *entry)
{
/*-----------
* If the chunk wasn't already in the LFC then we have these
* options, in order of preference:
*
* Unless there is no space available, we can:
* 1. Use an entry from the `holes` list, and
* 1. Use an entry from the freelist, and
* 2. Create a new entry.
* We can always, regardless of space in the LFC:
* 3. evict an entry from LRU, and
@@ -1388,17 +1404,10 @@ lfc_init_new_entry(FileCacheEntry* entry, uint32 hash)
*/
if (lfc_ctl->used < lfc_ctl->limit)
{
if (!dlist_is_empty(&lfc_ctl->holes))
if (!freelist_is_empty())
{
/* We can reuse a hole that was left behind when the LFC was shrunk previously */
FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node,
dlist_pop_head_node(&lfc_ctl->holes));
uint32 offset = hole->offset;
bool hole_found;
hash_search_with_hash_value(lfc_hash, &hole->key,
hole->hash, HASH_REMOVE, &hole_found);
CriticalAssert(hole_found);
uint32 offset = freelist_pop();
lfc_ctl->used += 1;
entry->offset = offset; /* reuse the hole */
@@ -1427,7 +1436,7 @@ lfc_init_new_entry(FileCacheEntry* entry, uint32 hash)
FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node,
dlist_pop_head_node(&lfc_ctl->lru));
for (int i = 0; i < lfc_blocks_per_chunk; i++)
for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
{
bool is_page_cached = GET_STATE(victim, i) == AVAILABLE;
lfc_ctl->used_pages -= is_page_cached;
@@ -1436,24 +1445,21 @@ lfc_init_new_entry(FileCacheEntry* entry, uint32 hash)
CriticalAssert(victim->access_count == 0);
entry->offset = victim->offset; /* grab victim's chunk */
hash_search_with_hash_value(lfc_hash, &victim->key,
victim->hash, HASH_REMOVE, NULL);
file_cache_hash_remove_entry(lfc_hash, victim);
neon_log(DEBUG2, "Swap file cache page");
}
else
{
/* Can't add this chunk - we don't have the space for it */
hash_search_with_hash_value(lfc_hash, &entry->key, hash,
HASH_REMOVE, NULL);
file_cache_hash_remove_entry(lfc_hash, entry);
lfc_ctl->prewarm_canceled = true; /* cancel prewarm if LFC limit is reached */
return false;
}
entry->access_count = 1;
entry->hash = hash;
lfc_ctl->pinned += 1;
for (int i = 0; i < lfc_blocks_per_chunk; i++)
for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
SET_STATE(entry, i, UNAVAILABLE);
return true;
@@ -1490,7 +1496,7 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
FileCacheEntry *entry;
ssize_t rc;
bool found;
uint32 hash;
uint64 hash;
uint64 generation;
uint32 entry_offset;
instr_time io_start, io_end;
@@ -1509,9 +1515,10 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
tag.blockNum = blkno - chunk_offs;
hash = get_hash_value(lfc_hash, &tag);
hash = file_cache_hash_get_hash_value(lfc_hash, &tag);
cv = &lfc_ctl->cv[hash % N_COND_VARS];
retry:
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
if (!LFC_ENABLED() || !lfc_ensure_opened())
@@ -1520,6 +1527,9 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
return false;
}
if (!freelist_prepare_pop())
goto retry;
lwlsn = neon_get_lwlsn(rinfo, forknum, blkno);
if (lwlsn > lsn)
@@ -1530,12 +1540,12 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
return false;
}
entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found);
entry = file_cache_hash_enter(lfc_hash, &tag, hash, &found);
if (lfc_prewarm_update_ws_estimation)
{
tag.blockNum = blkno;
addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
addSHLL(&lfc_ctl->wss_estimation, file_cache_hash_get_hash_value(lfc_hash, &tag));
}
if (found)
{
@@ -1557,7 +1567,7 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
}
else
{
if (!lfc_init_new_entry(entry, hash))
if (!lfc_init_new_entry(entry))
{
/*
* We can't process this chunk due to lack of space in LFC,
@@ -1578,7 +1588,7 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE);
INSTR_TIME_SET_CURRENT(io_start);
rc = pwrite(lfc_desc, buffer, BLCKSZ,
((off_t) entry_offset * lfc_blocks_per_chunk + chunk_offs) * BLCKSZ);
((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
INSTR_TIME_SET_CURRENT(io_end);
pgstat_report_wait_end();
@@ -1640,7 +1650,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
FileCacheEntry *entry;
ssize_t rc;
bool found;
uint32 hash;
uint64 hash;
uint64 generation;
uint32 entry_offset;
int buf_offset = 0;
@@ -1653,6 +1663,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
retry:
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
if (!LFC_ENABLED() || !lfc_ensure_opened())
@@ -1662,6 +1673,9 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
}
generation = lfc_ctl->generation;
if (!freelist_prepare_pop())
goto retry;
/*
* For every chunk that has blocks we're interested in, we
* 1. get the chunk header
@@ -1675,7 +1689,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
{
struct iovec iov[PG_IOV_MAX];
int chunk_offs = BLOCK_TO_CHUNK_OFF(blkno);
int blocks_in_chunk = Min(nblocks, lfc_blocks_per_chunk - chunk_offs);
int blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - chunk_offs);
instr_time io_start, io_end;
ConditionVariable* cv;
@@ -1688,16 +1702,16 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
}
tag.blockNum = blkno - chunk_offs;
hash = get_hash_value(lfc_hash, &tag);
hash = file_cache_hash_get_hash_value(lfc_hash, &tag);
cv = &lfc_ctl->cv[hash % N_COND_VARS];
entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found);
entry = file_cache_hash_enter(lfc_hash, &tag, hash, &found);
/* Approximate working set for the blocks assumed in this entry */
for (int i = 0; i < blocks_in_chunk; i++)
{
tag.blockNum = blkno + i;
addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
addSHLL(&lfc_ctl->wss_estimation, file_cache_hash_get_hash_value(lfc_hash, &tag));
}
if (found)
@@ -1714,7 +1728,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
}
else
{
if (!lfc_init_new_entry(entry, hash))
if (!lfc_init_new_entry(entry))
{
/*
* We can't process this chunk due to lack of space in LFC,
@@ -1763,7 +1777,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE);
INSTR_TIME_SET_CURRENT(io_start);
rc = pwritev(lfc_desc, iov, blocks_in_chunk,
((off_t) entry_offset * lfc_blocks_per_chunk + chunk_offs) * BLCKSZ);
((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
INSTR_TIME_SET_CURRENT(io_end);
pgstat_report_wait_end();
@@ -1823,6 +1837,140 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
LWLockRelease(lfc_lock);
}
/**** freelist management ****/
/*
* Refill the in-memory freelist from the on-disk chain, if needed.
*
* Prerequisites:
* - The caller is holding 'lfc_lock' in exclusive mode.
*
* Returns 'true' if 'lfc_lock' was held throughout and the in-memory freelist
* is ready to be used. Returns 'false' if the lock had to be released; the
* caller must then retry from the top.
*/
static bool
freelist_prepare_pop(void)
{
/*
* If the in-memory freelist is empty but the on-disk chain has more entries, load the next chunk.
*
* TODO: if there
*/
if (lfc_ctl->num_free_pages == 0 && lfc_ctl->freelist_head != INVALID_OFFSET)
{
uint32 freelist_head;
FreeListChunk *freelist_chunk;
size_t bytes_read;
LWLockRelease(lfc_lock);
LWLockAcquire(lfc_freelist_lock, LW_EXCLUSIVE);
if (!(lfc_ctl->num_free_pages == 0 && lfc_ctl->freelist_head != INVALID_OFFSET))
{
/* someone else did the work for us while we were not holding the lock */
LWLockRelease(lfc_freelist_lock);
return false;
}
freelist_head = lfc_ctl->freelist_head;
freelist_chunk = palloc(BLOCKS_PER_CHUNK * BLCKSZ);
bytes_read = 0;
while (bytes_read < BLOCKS_PER_CHUNK * BLCKSZ)
{
ssize_t rc;
rc = pread(lfc_desc, freelist_chunk, BLOCKS_PER_CHUNK * BLCKSZ - bytes_read, (off_t) freelist_head * BLOCKS_PER_CHUNK * BLCKSZ + bytes_read);
if (rc < 0)
{
lfc_disable("read freelist page");
return false;
}
bytes_read += rc;
}
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
if (lfc_generation != lfc_ctl->generation)
{
LWLockRelease(lfc_lock);
return false;
}
Assert(lfc_ctl->freelist_head == freelist_head);
Assert(lfc_ctl->num_free_pages == 0);
lfc_ctl->freelist_head = freelist_chunk->next;
lfc_ctl->num_free_pages = freelist_chunk->num_free_pages;
memcpy(lfc_ctl->free_pages, freelist_chunk->free_pages, lfc_ctl->num_free_pages * sizeof(uint32));
pfree(freelist_chunk);
LWLockRelease(lfc_lock);
LWLockRelease(lfc_freelist_lock);
return false;
}
return true;
}
/*
* Prerequisites:
* - The caller is holding 'lfc_lock' and 'lfc_freelist_lock'.
*
* Returns 'false' on error.
*/
static bool
freelist_push(uint32 offset)
{
Assert(lfc_ctl->freelist_size == FREELIST_ENTRIES_PER_CHUNK);
if (lfc_ctl->num_free_pages == lfc_ctl->freelist_size)
{
FreeListChunk *freelist_chunk;
struct iovec iov;
ssize_t rc;
freelist_chunk = palloc(BLOCKS_PER_CHUNK * BLCKSZ);
/* write the existing entries to the chunk on disk */
freelist_chunk->next = lfc_ctl->freelist_head;
freelist_chunk->num_free_pages = lfc_ctl->num_free_pages;
memcpy(freelist_chunk->free_pages, lfc_ctl->free_pages, lfc_ctl->num_free_pages * sizeof(uint32));
/* Use the passed-in offset to hold the freelist chunk itself */
iov.iov_base = freelist_chunk;
iov.iov_len = BLOCKS_PER_CHUNK * BLCKSZ;
rc = pg_pwritev_with_retry(lfc_desc, &iov, 1, (off_t) offset * BLOCKS_PER_CHUNK * BLCKSZ);
pfree(freelist_chunk);
if (rc < 0)
return false;
lfc_ctl->freelist_head = offset;
lfc_ctl->num_free_pages = 0;
}
else
{
lfc_ctl->free_pages[lfc_ctl->num_free_pages] = offset;
lfc_ctl->num_free_pages++;
}
return true;
}
static uint32
freelist_pop(void)
{
uint32 result;
/* The caller should've checked that the list is not empty */
Assert(lfc_ctl->num_free_pages > 0);
result = lfc_ctl->free_pages[lfc_ctl->num_free_pages - 1];
lfc_ctl->num_free_pages--;
return result;
}
static bool
freelist_is_empty(void)
{
return lfc_ctl->num_free_pages == 0;
}
typedef struct
{
TupleDesc tupdesc;
@@ -1919,7 +2067,7 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
break;
case 8:
key = "file_cache_chunk_size_pages";
value = lfc_blocks_per_chunk;
value = BLOCKS_PER_CHUNK;
break;
case 9:
key = "file_cache_chunks_pinned";
@@ -1990,7 +2138,6 @@ local_cache_pages(PG_FUNCTION_ARGS)
if (SRF_IS_FIRSTCALL())
{
HASH_SEQ_STATUS status;
FileCacheEntry *entry;
uint32 n_pages = 0;
@@ -2046,15 +2193,16 @@ local_cache_pages(PG_FUNCTION_ARGS)
if (LFC_ENABLED())
{
hash_seq_init(&status, lfc_hash);
while ((entry = hash_seq_search(&status)) != NULL)
uint32 num_buckets = file_cache_hash_get_num_buckets(lfc_hash);
for (uint32 pos = 0; pos < num_buckets; pos++)
{
/* Skip hole tags */
if (NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)) != 0)
{
for (int i = 0; i < lfc_blocks_per_chunk; i++)
n_pages += GET_STATE(entry, i) == AVAILABLE;
}
entry = file_cache_hash_get_at_pos(lfc_hash, pos);
if (entry == NULL)
continue;
for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
n_pages += GET_STATE(entry, i) == AVAILABLE;
}
}
}
@@ -2076,25 +2224,28 @@ local_cache_pages(PG_FUNCTION_ARGS)
* in the fctx->record structure.
*/
uint32 n = 0;
uint32 num_buckets = file_cache_hash_get_num_buckets(lfc_hash);
hash_seq_init(&status, lfc_hash);
while ((entry = hash_seq_search(&status)) != NULL)
for (uint32 pos = 0; pos < num_buckets; pos++)
{
for (int i = 0; i < lfc_blocks_per_chunk; i++)
entry = file_cache_hash_get_at_pos(lfc_hash, pos);
if (entry == NULL)
continue;
for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
{
if (NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)) != 0)
const BufferTag *key = file_cache_hash_get_key_for_entry(lfc_hash, entry);
if (GET_STATE(entry, i) == AVAILABLE)
{
if (GET_STATE(entry, i) == AVAILABLE)
{
fctx->record[n].pageoffs = entry->offset * lfc_blocks_per_chunk + i;
fctx->record[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key));
fctx->record[n].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(entry->key));
fctx->record[n].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(entry->key));
fctx->record[n].forknum = entry->key.forkNum;
fctx->record[n].blocknum = entry->key.blockNum + i;
fctx->record[n].accesscount = entry->access_count;
n += 1;
}
fctx->record[n].pageoffs = entry->offset * BLOCKS_PER_CHUNK + i;
fctx->record[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(*key));
fctx->record[n].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(*key));
fctx->record[n].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(*key));
fctx->record[n].forknum = key->forkNum;
fctx->record[n].blocknum = key->blockNum + i;
fctx->record[n].accesscount = entry->access_count;
n += 1;
}
}
}