diff --git a/Cargo.lock b/Cargo.lock index 4c464c62b8..98fd2fa2f9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -253,6 +253,17 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8ab6b55fe97976e46f91ddbed8d147d966475dc29b2032757ba47e02376fbc3" +[[package]] +name = "atomic_enum" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99e1aca718ea7b89985790c94aad72d77533063fe00bc497bb79a7c2dae6a661" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.100", +] + [[package]] name = "autocfg" version = "1.1.0" @@ -687,13 +698,40 @@ dependencies = [ "tracing", ] +[[package]] +name = "axum" +version = "0.7.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f" +dependencies = [ + "async-trait", + "axum-core 0.4.5", + "bytes", + "futures-util", + "http 1.1.0", + "http-body 1.0.0", + "http-body-util", + "itoa", + "matchit 0.7.3", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "rustversion", + "serde", + "sync_wrapper 1.0.1", + "tower 0.5.2", + "tower-layer", + "tower-service", +] + [[package]] name = "axum" version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6d6fd624c75e18b3b4c6b9caf42b1afe24437daaee904069137d8bab077be8b8" dependencies = [ - "axum-core", + "axum-core 0.5.0", "base64 0.22.1", "bytes", "form_urlencoded", @@ -704,7 +742,7 @@ dependencies = [ "hyper 1.4.1", "hyper-util", "itoa", - "matchit", + "matchit 0.8.4", "memchr", "mime", "percent-encoding", @@ -724,6 +762,26 @@ dependencies = [ "tracing", ] +[[package]] +name = "axum-core" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199" +dependencies = [ + "async-trait", + "bytes", + "futures-util", + "http 1.1.0", + "http-body 1.0.0", + "http-body-util", + "mime", + "pin-project-lite", + "rustversion", + "sync_wrapper 1.0.1", + "tower-layer", + "tower-service", +] + [[package]] name = "axum-core" version = "0.5.0" @@ -750,8 +808,8 @@ version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "460fc6f625a1f7705c6cf62d0d070794e94668988b1c38111baeec177c715f7b" dependencies = [ - "axum", - "axum-core", + "axum 0.8.1", + "axum-core 0.5.0", "bytes", "futures-util", "headers", @@ -1086,6 +1144,25 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" +[[package]] +name = "cbindgen" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eadd868a2ce9ca38de7eeafdcec9c7065ef89b42b32f0839278d55f35c54d1ff" +dependencies = [ + "clap", + "heck 0.4.1", + "indexmap 2.9.0", + "log", + "proc-macro2", + "quote", + "serde", + "serde_json", + "syn 2.0.100", + "tempfile", + "toml", +] + [[package]] name = "cc" version = "1.2.16" @@ -1206,7 +1283,7 @@ version = "4.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab" dependencies = [ - "heck", + "heck 0.5.0", "proc-macro2", "quote", "syn 2.0.100", @@ -1264,13 +1341,40 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "communicator" +version = "0.1.0" +dependencies = [ + "atomic_enum", + "bytes", + "cbindgen", + "http 1.1.0", + "libc", + "neonart", + "nix 0.27.1", + 
"pageserver_client_grpc", + "pageserver_data_api", + "prost 0.13.3", + "thiserror 1.0.69", + "tokio", + "tokio-epoll-uring", + "tokio-pipe", + "tonic", + "tracing", + "tracing-subscriber", + "uring-common", + "utils", + "zerocopy 0.8.24", + "zerocopy-derive 0.8.24", +] + [[package]] name = "compute_api" version = "0.1.0" dependencies = [ "anyhow", "chrono", - "indexmap 2.0.1", + "indexmap 2.9.0", "jsonwebtoken", "regex", "remote_storage", @@ -1288,7 +1392,7 @@ dependencies = [ "aws-sdk-kms", "aws-sdk-s3", "aws-smithy-types", - "axum", + "axum 0.8.1", "axum-extra", "base64 0.13.1", "bytes", @@ -1301,7 +1405,7 @@ dependencies = [ "flate2", "futures", "http 1.1.0", - "indexmap 2.0.1", + "indexmap 2.9.0", "jsonwebtoken", "metrics", "nix 0.27.1", @@ -1927,7 +2031,7 @@ checksum = "0892a17df262a24294c382f0d5997571006e7a4348b4327557c4ff1cd4a8bccc" dependencies = [ "darling", "either", - "heck", + "heck 0.5.0", "proc-macro2", "quote", "syn 2.0.100", @@ -2041,7 +2145,7 @@ name = "endpoint_storage" version = "0.0.1" dependencies = [ "anyhow", - "axum", + "axum 0.8.1", "axum-extra", "camino", "camino-tempfile", @@ -2588,7 +2692,7 @@ dependencies = [ "futures-sink", "futures-util", "http 0.2.9", - "indexmap 2.0.1", + "indexmap 2.9.0", "slab", "tokio", "tokio-util", @@ -2607,7 +2711,7 @@ dependencies = [ "futures-sink", "futures-util", "http 1.1.0", - "indexmap 2.0.1", + "indexmap 2.9.0", "slab", "tokio", "tokio-util", @@ -2703,6 +2807,12 @@ dependencies = [ "http 1.1.0", ] +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" + [[package]] name = "heck" version = "0.5.0" @@ -3191,12 +3301,12 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.0.1" +version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad227c3af19d4914570ad36d30409928b75967c298feb9ea1969db3a610bb14e" +checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e" dependencies = [ "equivalent", - "hashbrown 0.14.5", + "hashbrown 0.15.2", "serde", ] @@ -3219,7 +3329,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "232929e1d75fe899576a3d5c7416ad0d88dbfbb3c3d6aa00873a7408a50ddb88" dependencies = [ "ahash", - "indexmap 2.0.1", + "indexmap 2.9.0", "is-terminal", "itoa", "log", @@ -3242,7 +3352,7 @@ dependencies = [ "crossbeam-utils", "dashmap 6.1.0", "env_logger", - "indexmap 2.0.1", + "indexmap 2.9.0", "itoa", "log", "num-format", @@ -3594,6 +3704,12 @@ dependencies = [ "regex-automata 0.1.10", ] +[[package]] +name = "matchit" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" + [[package]] name = "matchit" version = "0.8.4" @@ -3639,7 +3755,7 @@ version = "0.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9e6777fc80a575f9503d908c8b498782a6c3ee88a06cb416dc3941401e43b94" dependencies = [ - "heck", + "heck 0.5.0", "proc-macro2", "quote", "syn 2.0.100", @@ -3785,6 +3901,15 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" +[[package]] +name = "neonart" +version = "0.1.0" +dependencies = [ + "rand 0.8.5", + "tracing", + "zerocopy 0.8.24", +] + [[package]] name = "never-say-never" version = "6.6.666" @@ -4208,6 +4333,8 @@ dependencies = [ 
"humantime-serde", "pageserver_api", "pageserver_client", + "pageserver_client_grpc", + "pageserver_data_api", "rand 0.8.5", "reqwest", "serde", @@ -4284,6 +4411,8 @@ dependencies = [ "pageserver_api", "pageserver_client", "pageserver_compaction", + "pageserver_data_api", + "peekable", "pem", "pin-project-lite", "postgres-protocol", @@ -4295,6 +4424,7 @@ dependencies = [ "pprof", "pq_proto", "procfs", + "prost 0.13.3", "rand 0.8.5", "range-set-blaze", "regex", @@ -4326,6 +4456,7 @@ dependencies = [ "tokio-tar", "tokio-util", "toml_edit", + "tonic", "tracing", "tracing-utils", "url", @@ -4390,6 +4521,18 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "pageserver_client_grpc" +version = "0.1.0" +dependencies = [ + "bytes", + "http 1.1.0", + "pageserver_data_api", + "thiserror 1.0.69", + "tonic", + "tracing", +] + [[package]] name = "pageserver_compaction" version = "0.1.0" @@ -4413,6 +4556,17 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "pageserver_data_api" +version = "0.1.0" +dependencies = [ + "prost 0.13.3", + "thiserror 1.0.69", + "tonic", + "tonic-build", + "utils", +] + [[package]] name = "papaya" version = "0.2.1" @@ -4539,6 +4693,15 @@ dependencies = [ "sha2", ] +[[package]] +name = "peekable" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "225f9651e475709164f871dc2f5724956be59cb9edb055372ffeeab01ec2d20b" +dependencies = [ + "smallvec", +] + [[package]] name = "pem" version = "3.0.3" @@ -5010,7 +5173,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4" dependencies = [ "bytes", - "heck", + "heck 0.5.0", "itertools 0.12.1", "log", "multimap", @@ -5031,7 +5194,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15" dependencies = [ "bytes", - "heck", + "heck 0.5.0", "itertools 0.12.1", "log", "multimap", @@ -5134,7 +5297,7 @@ dependencies = [ "hyper 0.14.30", "hyper 1.4.1", "hyper-util", - "indexmap 2.0.1", + "indexmap 2.9.0", "ipnet", "itertools 0.10.5", "itoa", @@ -5645,7 +5808,7 @@ dependencies = [ "async-trait", "getrandom 0.2.11", "http 1.1.0", - "matchit", + "matchit 0.8.4", "opentelemetry", "reqwest", "reqwest-middleware", @@ -6806,7 +6969,7 @@ version = "0.26.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" dependencies = [ - "heck", + "heck 0.5.0", "proc-macro2", "quote", "rustversion", @@ -7231,6 +7394,16 @@ dependencies = [ "syn 2.0.100", ] +[[package]] +name = "tokio-pipe" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f213a84bffbd61b8fa0ba8a044b4bbe35d471d0b518867181e82bd5c15542784" +dependencies = [ + "libc", + "tokio", +] + [[package]] name = "tokio-postgres" version = "0.7.10" @@ -7413,7 +7586,7 @@ version = "0.22.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f21c7aaf97f1bd9ca9d4f9e73b0a6c74bd5afef56f2bc931943a6e1c37e04e38" dependencies = [ - "indexmap 2.0.1", + "indexmap 2.9.0", "serde", "serde_spanned", "toml_datetime", @@ -7426,9 +7599,13 @@ version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52" dependencies = [ + "async-stream", "async-trait", + "axum 0.7.9", "base64 0.22.1", "bytes", + "flate2", + "h2 0.4.4", "http 
1.1.0", "http-body 1.0.0", "http-body-util", @@ -7440,6 +7617,7 @@ dependencies = [ "prost 0.13.3", "rustls-native-certs 0.8.0", "rustls-pemfile 2.1.1", + "socket2", "tokio", "tokio-rustls 0.26.0", "tokio-stream", @@ -7939,7 +8117,7 @@ name = "vm_monitor" version = "0.1.0" dependencies = [ "anyhow", - "axum", + "axum 0.8.1", "cgroups-rs", "clap", "futures", @@ -8449,7 +8627,7 @@ dependencies = [ "hyper 1.4.1", "hyper-util", "indexmap 1.9.3", - "indexmap 2.0.1", + "indexmap 2.9.0", "itertools 0.12.1", "lazy_static", "libc", diff --git a/Cargo.toml b/Cargo.toml index 1c203af9e0..ed0127a13b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,6 +8,7 @@ members = [ "pageserver/compaction", "pageserver/ctl", "pageserver/client", + "pageserver/client_grpc", "pageserver/pagebench", "proxy", "safekeeper", @@ -29,6 +30,7 @@ members = [ "libs/pq_proto", "libs/tenant_size_model", "libs/metrics", + "libs/neonart", "libs/postgres_connection", "libs/remote_storage", "libs/tracing-utils", @@ -41,6 +43,7 @@ members = [ "libs/proxy/postgres-types2", "libs/proxy/tokio-postgres2", "endpoint_storage", + "pgxn/neon/communicator", ] [workspace.package] @@ -142,6 +145,7 @@ parquet = { version = "53", default-features = false, features = ["zstd"] } parquet_derive = "53" pbkdf2 = { version = "0.12.1", features = ["simple", "std"] } pem = "3.0.3" +peekable = "0.3.0" pin-project-lite = "0.2" pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] } procfs = "0.16" @@ -187,7 +191,6 @@ thiserror = "1.0" tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] } tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] } tokio = { version = "1.43.1", features = ["macros"] } -tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" } tokio-io-timeout = "1.2.0" tokio-postgres-rustls = "0.12.0" tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]} @@ -196,7 +199,7 @@ tokio-tar = "0.3" tokio-util = { version = "0.7.10", features = ["io", "rt"] } toml = "0.8" toml_edit = "0.22" -tonic = {version = "0.12.3", default-features = false, features = ["channel", "tls", "tls-roots"]} +tonic = {version = "0.12.3", default-features = false, features = ["channel", "server", "tls", "tls-roots", "gzip"]} tower = { version = "0.5.2", default-features = false } tower-http = { version = "0.6.2", features = ["auth", "request-id", "trace"] } @@ -228,6 +231,9 @@ x509-cert = { version = "0.2.5" } env_logger = "0.11" log = "0.4" +tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" } +uring-common = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" } + ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" } postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" } @@ -245,9 +251,12 @@ compute_api = { version = "0.1", path = "./libs/compute_api/" } consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" } http-utils = { version = "0.1", path = "./libs/http-utils/" } metrics = { version = "0.1", path = "./libs/metrics/" } +neonart = { version = "0.1", path = "./libs/neonart/" } pageserver = { path = "./pageserver" } pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" } pageserver_client = { path = 
"./pageserver/client" } +pageserver_client_grpc = { path = "./pageserver/client_grpc" } +pageserver_data_api = { path = "./pageserver/data_api" } pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" } postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" } postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" } @@ -271,6 +280,7 @@ wal_decoder = { version = "0.1", path = "./libs/wal_decoder" } workspace_hack = { version = "0.1", path = "./workspace_hack/" } ## Build dependencies +cbindgen = "0.28.0" criterion = "0.5.1" rcgen = "0.13" rstest = "0.18" diff --git a/Makefile b/Makefile index 0911465fb8..820f3c20f1 100644 --- a/Makefile +++ b/Makefile @@ -18,10 +18,12 @@ ifeq ($(BUILD_TYPE),release) PG_LDFLAGS = $(LDFLAGS) # Unfortunately, `--profile=...` is a nightly feature CARGO_BUILD_FLAGS += --release + NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/release else ifeq ($(BUILD_TYPE),debug) PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend PG_CFLAGS += -O0 -g3 $(CFLAGS) PG_LDFLAGS = $(LDFLAGS) + NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/debug else $(error Bad build type '$(BUILD_TYPE)', see Makefile for options) endif @@ -180,11 +182,16 @@ postgres-check-%: postgres-% .PHONY: neon-pg-ext-% neon-pg-ext-%: postgres-% + +@echo "Compiling communicator $*" + $(CARGO_CMD_PREFIX) cargo build -p communicator $(CARGO_BUILD_FLAGS) + +@echo "Compiling neon $*" mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-$* $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \ + LIBCOMMUNICATOR_PATH=$(NEON_CARGO_ARTIFACT_TARGET_DIR) \ -C $(POSTGRES_INSTALL_DIR)/build/neon-$* \ -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install + +@echo "Compiling neon_walredo $*" mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \ diff --git a/libs/neonart/Cargo.toml b/libs/neonart/Cargo.toml new file mode 100644 index 0000000000..9581a595b1 --- /dev/null +++ b/libs/neonart/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "neonart" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +tracing.workspace = true + +rand.workspace = true # for tests +zerocopy = "0.8" diff --git a/libs/neonart/src/algorithm.rs b/libs/neonart/src/algorithm.rs new file mode 100644 index 0000000000..c021957827 --- /dev/null +++ b/libs/neonart/src/algorithm.rs @@ -0,0 +1,377 @@ +mod lock_and_version; +mod node_ptr; +mod node_ref; + +use std::vec::Vec; + +use crate::algorithm::lock_and_version::ResultOrRestart; +use crate::algorithm::node_ptr::{MAX_PREFIX_LEN, NodePtr}; +use crate::algorithm::node_ref::ChildOrValue; +use crate::algorithm::node_ref::{NodeRef, ReadLockedNodeRef, WriteLockedNodeRef}; + +use crate::epoch::EpochPin; +use crate::{Allocator, Key, Value}; + +pub(crate) type RootPtr = node_ptr::NodePtr; + +pub fn new_root(allocator: &Allocator) -> RootPtr { + node_ptr::new_root(allocator) +} + +pub(crate) fn search<'e, K: Key, V: Value>( + key: &K, + root: RootPtr, + epoch_pin: &'e EpochPin, +) -> Option { + loop { + let root_ref = NodeRef::from_root_ptr(root); + if let Ok(result) = lookup_recurse(key.as_bytes(), root_ref, None, epoch_pin) { + break result; + } + // retry + } +} + +pub(crate) fn update_fn<'e, K: Key, V: Value, F>( + key: &K, + value_fn: F, + root: RootPtr, + allocator: &Allocator, + epoch_pin: &'e EpochPin, +) where + F: FnOnce(Option<&V>) -> Option, +{ + let value_fn_cell = 
std::cell::Cell::new(Some(value_fn)); + loop { + let root_ref = NodeRef::from_root_ptr(root); + let this_value_fn = |arg: Option<&V>| value_fn_cell.take().unwrap()(arg); + let key_bytes = key.as_bytes(); + if let Ok(()) = update_recurse( + key_bytes, + this_value_fn, + root_ref, + None, + allocator, + epoch_pin, + 0, + key_bytes, + ) { + break; + } + // retry + } +} + +pub(crate) fn dump_tree<'e, V: Value + std::fmt::Debug>(root: RootPtr, epoch_pin: &'e EpochPin) { + let root_ref = NodeRef::from_root_ptr(root); + + let _ = dump_recurse(&[], root_ref, &epoch_pin, 0); +} + +// Error means you must retry. +// +// This corresponds to the 'lookupOpt' function in the paper +fn lookup_recurse<'e, V: Value>( + key: &[u8], + node: NodeRef<'e, V>, + parent: Option>, + epoch_pin: &'e EpochPin, +) -> ResultOrRestart> { + let rnode = node.read_lock_or_restart()?; + if let Some(parent) = parent { + parent.read_unlock_or_restart()?; + } + + // check if prefix matches, may increment level + let prefix_len = if let Some(prefix_len) = rnode.prefix_matches(key) { + prefix_len + } else { + rnode.read_unlock_or_restart()?; + return Ok(None); + }; + let key = &key[prefix_len..]; + + // find child (or leaf value) + let next_node = rnode.find_child_or_value_or_restart(key[0])?; + + match next_node { + None => Ok(None), // key not found + Some(ChildOrValue::Value(vptr)) => { + // safety: It's OK to follow the pointer because we checked the version. + let v = unsafe { (*vptr).clone() }; + Ok(Some(v)) + } + Some(ChildOrValue::Child(v)) => lookup_recurse(&key[1..], v, Some(rnode), epoch_pin), + } +} + +// This corresponds to the 'insertOpt' function in the paper +pub(crate) fn update_recurse<'e, V: Value, F>( + key: &[u8], + value_fn: F, + node: NodeRef<'e, V>, + rparent: Option<(ReadLockedNodeRef, u8)>, + allocator: &Allocator, + epoch_pin: &'e EpochPin, + level: usize, + orig_key: &[u8], +) -> ResultOrRestart<()> +where + F: FnOnce(Option<&V>) -> Option, +{ + let rnode = node.read_lock_or_restart()?; + + let prefix_match_len = rnode.prefix_matches(key); + if prefix_match_len.is_none() { + let (rparent, parent_key) = rparent.expect("direct children of the root have no prefix"); + let mut wparent = rparent.upgrade_to_write_lock_or_restart()?; + let mut wnode = rnode.upgrade_to_write_lock_or_restart()?; + + if let Some(new_value) = value_fn(None) { + insert_split_prefix( + key, + new_value, + &mut wnode, + &mut wparent, + parent_key, + allocator, + ); + } + wnode.write_unlock(); + wparent.write_unlock(); + return Ok(()); + } + let prefix_match_len = prefix_match_len.unwrap(); + let key = &key[prefix_match_len as usize..]; + let level = level + prefix_match_len as usize; + + let next_node = rnode.find_child_or_value_or_restart(key[0])?; + + if next_node.is_none() { + if rnode.is_full() { + let (rparent, parent_key) = rparent.expect("root node cannot become full"); + let mut wparent = rparent.upgrade_to_write_lock_or_restart()?; + let wnode = rnode.upgrade_to_write_lock_or_restart()?; + + if let Some(new_value) = value_fn(None) { + insert_and_grow(key, new_value, &wnode, &mut wparent, parent_key, allocator); + wnode.write_unlock_obsolete(); + wparent.write_unlock(); + } else { + wnode.write_unlock(); + wparent.write_unlock(); + } + } else { + let mut wnode = rnode.upgrade_to_write_lock_or_restart()?; + if let Some((rparent, _)) = rparent { + rparent.read_unlock_or_restart()?; + } + if let Some(new_value) = value_fn(None) { + insert_to_node(&mut wnode, key, new_value, allocator); + } + wnode.write_unlock(); + } + return 
Ok(()); + } else { + let next_node = next_node.unwrap(); // checked above it's not None + if let Some((rparent, _)) = rparent { + rparent.read_unlock_or_restart()?; + } + + match next_node { + ChildOrValue::Value(existing_value_ptr) => { + assert!(key.len() == 1); + let wnode = rnode.upgrade_to_write_lock_or_restart()?; + + // safety: Now that we have acquired the write lock, we have exclusive access to the + // value + let vmut = unsafe { existing_value_ptr.cast_mut().as_mut() }.unwrap(); + if let Some(new_value) = value_fn(Some(vmut)) { + *vmut = new_value; + } else { + // TODO: Treat this as deletion? + } + wnode.write_unlock(); + + Ok(()) + } + ChildOrValue::Child(next_child) => { + // recurse to next level + update_recurse( + &key[1..], + value_fn, + next_child, + Some((rnode, key[0])), + allocator, + epoch_pin, + level + 1, + orig_key, + ) + } + } + } +} + +#[derive(Clone)] +enum PathElement { + Prefix(Vec), + KeyByte(u8), +} + +impl std::fmt::Debug for PathElement { + fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { + match self { + PathElement::Prefix(prefix) => write!(fmt, "{:?}", prefix), + PathElement::KeyByte(key_byte) => write!(fmt, "{}", key_byte), + } + } +} + +fn dump_recurse<'e, V: Value + std::fmt::Debug>( + path: &[PathElement], + node: NodeRef<'e, V>, + epoch_pin: &'e EpochPin, + level: usize, +) -> ResultOrRestart<()> { + let indent = str::repeat(" ", level); + + let rnode = node.read_lock_or_restart()?; + let mut path = Vec::from(path); + let prefix = rnode.get_prefix(); + if prefix.len() != 0 { + path.push(PathElement::Prefix(Vec::from(prefix))); + } + + for key_byte in 0..u8::MAX { + match rnode.find_child_or_value_or_restart(key_byte)? { + None => continue, + Some(ChildOrValue::Child(child_ref)) => { + let rchild = child_ref.read_lock_or_restart()?; + eprintln!( + "{} {:?}, {}: prefix {:?}", + indent, + &path, + key_byte, + rchild.get_prefix() + ); + + let mut child_path = path.clone(); + child_path.push(PathElement::KeyByte(key_byte)); + + dump_recurse(&child_path, child_ref, epoch_pin, level + 1)?; + } + Some(ChildOrValue::Value(val)) => { + eprintln!("{} {:?}, {}: {:?}", indent, path, key_byte, unsafe { + val.as_ref().unwrap() + }); + } + } + } + + Ok(()) +} + +///```text +/// [fooba]r -> value +/// +/// [foo]b -> [a]r -> value +/// e -> [ls]e -> value +///``` +fn insert_split_prefix<'a, V: Value>( + key: &[u8], + value: V, + node: &mut WriteLockedNodeRef, + parent: &mut WriteLockedNodeRef, + parent_key: u8, + allocator: &Allocator, +) { + let old_node = node; + let old_prefix = old_node.get_prefix(); + let common_prefix_len = common_prefix(key, old_prefix); + + // Allocate a node for the new value. 
+ let new_value_node = allocate_node_for_value(&key[common_prefix_len + 1..], value, allocator); + + // Allocate a new internal node with the common prefix + let mut prefix_node = node_ref::new_internal(&key[..common_prefix_len], allocator); + + // Add the old node and the new nodes to the new internal node + prefix_node.insert_child(old_prefix[common_prefix_len], old_node.as_ptr()); + prefix_node.insert_child(key[common_prefix_len], new_value_node); + + // Modify the prefix of the old child in place + old_node.truncate_prefix(old_prefix.len() - common_prefix_len - 1); + + // replace the pointer in the parent + parent.replace_child(parent_key, prefix_node.into_ptr()); +} + +fn insert_to_node( + wnode: &mut WriteLockedNodeRef, + key: &[u8], + value: V, + allocator: &Allocator, +) { + if wnode.is_leaf() { + wnode.insert_value(key[0], value); + } else { + let value_child = allocate_node_for_value(&key[1..], value, allocator); + wnode.insert_child(key[0], value_child); + } +} + +// On entry: 'parent' and 'node' are locked +fn insert_and_grow( + key: &[u8], + value: V, + wnode: &WriteLockedNodeRef, + parent: &mut WriteLockedNodeRef, + parent_key_byte: u8, + allocator: &Allocator, +) { + let mut bigger_node = wnode.grow(allocator); + + if wnode.is_leaf() { + bigger_node.insert_value(key[0], value); + } else { + let value_child = allocate_node_for_value(&key[1..], value, allocator); + bigger_node.insert_child(key[0], value_child); + } + + // Replace the pointer in the parent + parent.replace_child(parent_key_byte, bigger_node.into_ptr()); +} + +// Allocate a new leaf node to hold 'value'. If key is long, we may need to allocate +// new internal nodes to hold it too +fn allocate_node_for_value(key: &[u8], value: V, allocator: &Allocator) -> NodePtr { + let mut prefix_off = key.len().saturating_sub(MAX_PREFIX_LEN + 1); + + let mut leaf_node = node_ref::new_leaf(&key[prefix_off..key.len() - 1], allocator); + leaf_node.insert_value(*key.last().unwrap(), value); + + let mut node = leaf_node; + while prefix_off > 0 { + // Need another internal node + let remain_prefix = &key[0..prefix_off]; + + prefix_off = remain_prefix.len().saturating_sub(MAX_PREFIX_LEN + 1); + let mut internal_node = node_ref::new_internal( + &remain_prefix[prefix_off..remain_prefix.len() - 1], + allocator, + ); + internal_node.insert_child(*remain_prefix.last().unwrap(), node.into_ptr()); + node = internal_node; + } + + node.into_ptr() +} + +fn common_prefix(a: &[u8], b: &[u8]) -> usize { + for i in 0..MAX_PREFIX_LEN { + if a[i] != b[i] { + return i; + } + } + panic!("prefixes are equal"); +} diff --git a/libs/neonart/src/algorithm/lock_and_version.rs b/libs/neonart/src/algorithm/lock_and_version.rs new file mode 100644 index 0000000000..94117cd531 --- /dev/null +++ b/libs/neonart/src/algorithm/lock_and_version.rs @@ -0,0 +1,85 @@ +use std::sync::atomic::{AtomicU64, Ordering}; + +pub(crate) struct AtomicLockAndVersion { + inner: AtomicU64, +} + +impl AtomicLockAndVersion { + pub(crate) fn new() -> AtomicLockAndVersion { + AtomicLockAndVersion { + inner: AtomicU64::new(0), + } + } +} + +pub(crate) type ResultOrRestart = Result; + +const fn restart() -> ResultOrRestart { + Err(()) +} + +impl AtomicLockAndVersion { + pub(crate) fn read_lock_or_restart(&self) -> ResultOrRestart { + let version = self.await_node_unlocked(); + if is_obsolete(version) { + return restart(); + } + Ok(version) + } + + pub(crate) fn check_or_restart(&self, version: u64) -> ResultOrRestart<()> { + self.read_unlock_or_restart(version) + } + + pub(crate) fn 
read_unlock_or_restart(&self, version: u64) -> ResultOrRestart<()> { + if self.inner.load(Ordering::Acquire) != version { + return restart(); + } + Ok(()) + } + + pub(crate) fn upgrade_to_write_lock_or_restart(&self, version: u64) -> ResultOrRestart<()> { + if self + .inner + .compare_exchange( + version, + set_locked_bit(version), + Ordering::Acquire, + Ordering::Relaxed, + ) + .is_err() + { + return restart(); + } + Ok(()) + } + + pub(crate) fn write_unlock(&self) { + // reset locked bit and overflow into version + self.inner.fetch_add(2, Ordering::Release); + } + + pub(crate) fn write_unlock_obsolete(&self) { + // set obsolete, reset locked, overflow into version + self.inner.fetch_add(3, Ordering::Release); + } + + // Helper functions + fn await_node_unlocked(&self) -> u64 { + let mut version = self.inner.load(Ordering::Acquire); + while (version & 2) == 2 { + // spinlock + std::thread::yield_now(); + version = self.inner.load(Ordering::Acquire) + } + version + } +} + +fn set_locked_bit(version: u64) -> u64 { + return version + 2; +} + +fn is_obsolete(version: u64) -> bool { + return (version & 1) == 1; +} diff --git a/libs/neonart/src/algorithm/node_ptr.rs b/libs/neonart/src/algorithm/node_ptr.rs new file mode 100644 index 0000000000..3ae0da693c --- /dev/null +++ b/libs/neonart/src/algorithm/node_ptr.rs @@ -0,0 +1,983 @@ +use std::marker::PhantomData; +use std::ptr::NonNull; + +use super::lock_and_version::AtomicLockAndVersion; + +use crate::Allocator; +use crate::Value; + +pub(crate) const MAX_PREFIX_LEN: usize = 8; + +enum NodeTag { + Internal4, + Internal16, + Internal48, + Internal256, + Leaf4, + Leaf16, + Leaf48, + Leaf256, +} + +#[repr(C)] +struct NodeBase { + tag: NodeTag, + lock_and_version: AtomicLockAndVersion, +} + +pub(crate) struct NodePtr { + ptr: *mut NodeBase, + + phantom_value: PhantomData, +} + +impl std::fmt::Debug for NodePtr { + fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { + write!(fmt, "0x{}", self.ptr.addr()) + } +} + +impl Copy for NodePtr {} +impl Clone for NodePtr { + fn clone(&self) -> NodePtr { + NodePtr { + ptr: self.ptr, + phantom_value: PhantomData, + } + } +} + +enum NodeVariant<'a, V> { + Internal4(&'a NodeInternal4), + Internal16(&'a NodeInternal16), + Internal48(&'a NodeInternal48), + Internal256(&'a NodeInternal256), + Leaf4(&'a NodeLeaf4), + Leaf16(&'a NodeLeaf16), + Leaf48(&'a NodeLeaf48), + Leaf256(&'a NodeLeaf256), +} + +enum NodeVariantMut<'a, V> { + Internal4(&'a mut NodeInternal4), + Internal16(&'a mut NodeInternal16), + Internal48(&'a mut NodeInternal48), + Internal256(&'a mut NodeInternal256), + Leaf4(&'a mut NodeLeaf4), + Leaf16(&'a mut NodeLeaf16), + Leaf48(&'a mut NodeLeaf48), + Leaf256(&'a mut NodeLeaf256), +} + +pub(crate) enum ChildOrValuePtr { + Child(NodePtr), + Value(*const V), +} + +#[repr(C)] +struct NodeInternal4 { + tag: NodeTag, + lock_and_version: AtomicLockAndVersion, + + prefix: [u8; MAX_PREFIX_LEN], + prefix_len: u8, + num_children: u8, + + child_keys: [u8; 4], + child_ptrs: [NodePtr; 4], +} + +#[repr(C)] +struct NodeInternal16 { + tag: NodeTag, + lock_and_version: AtomicLockAndVersion, + + prefix: [u8; MAX_PREFIX_LEN], + prefix_len: u8, + + num_children: u8, + child_keys: [u8; 16], + child_ptrs: [NodePtr; 16], +} + +const INVALID_CHILD_INDEX: u8 = u8::MAX; + +#[repr(C)] +struct NodeInternal48 { + tag: NodeTag, + lock_and_version: AtomicLockAndVersion, + + prefix: [u8; MAX_PREFIX_LEN], + prefix_len: u8, + + num_children: u8, + child_indexes: [u8; 256], + child_ptrs: [NodePtr; 48], +} + 
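+// A fully-expanded internal node: one child-pointer slot for each possible key byte; a null pointer in a slot means there is no child for that byte.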
+#[repr(C)] +pub(crate) struct NodeInternal256 { + tag: NodeTag, + lock_and_version: AtomicLockAndVersion, + + prefix: [u8; MAX_PREFIX_LEN], + prefix_len: u8, + + num_children: u16, + child_ptrs: [NodePtr; 256], +} + +#[repr(C)] +struct NodeLeaf4 { + tag: NodeTag, + lock_and_version: AtomicLockAndVersion, + + prefix: [u8; MAX_PREFIX_LEN], + prefix_len: u8, + + num_values: u8, + child_keys: [u8; 4], + child_values: [Option; 4], +} + +#[repr(C)] +struct NodeLeaf16 { + tag: NodeTag, + lock_and_version: AtomicLockAndVersion, + + prefix: [u8; MAX_PREFIX_LEN], + prefix_len: u8, + + num_values: u8, + child_keys: [u8; 16], + child_values: [Option; 16], +} + +#[repr(C)] +struct NodeLeaf48 { + tag: NodeTag, + lock_and_version: AtomicLockAndVersion, + + prefix: [u8; MAX_PREFIX_LEN], + prefix_len: u8, + + num_values: u8, + child_indexes: [u8; 256], + child_values: [Option; 48], +} + +#[repr(C)] +struct NodeLeaf256 { + tag: NodeTag, + lock_and_version: AtomicLockAndVersion, + + prefix: [u8; MAX_PREFIX_LEN], + prefix_len: u8, + + num_values: u16, + child_values: [Option; 256], +} + +impl NodePtr { + pub(crate) fn is_leaf(&self) -> bool { + match self.variant() { + NodeVariant::Internal4(_) => false, + NodeVariant::Internal16(_) => false, + NodeVariant::Internal48(_) => false, + NodeVariant::Internal256(_) => false, + NodeVariant::Leaf4(_) => true, + NodeVariant::Leaf16(_) => true, + NodeVariant::Leaf48(_) => true, + NodeVariant::Leaf256(_) => true, + } + } + + pub(crate) fn lockword(&self) -> &AtomicLockAndVersion { + match self.variant() { + NodeVariant::Internal4(n) => &n.lock_and_version, + NodeVariant::Internal16(n) => &n.lock_and_version, + NodeVariant::Internal48(n) => &n.lock_and_version, + NodeVariant::Internal256(n) => &n.lock_and_version, + NodeVariant::Leaf4(n) => &n.lock_and_version, + NodeVariant::Leaf16(n) => &n.lock_and_version, + NodeVariant::Leaf48(n) => &n.lock_and_version, + NodeVariant::Leaf256(n) => &n.lock_and_version, + } + } + + pub(crate) fn is_null(&self) -> bool { + self.ptr.is_null() + } + + pub(crate) const fn null() -> NodePtr { + NodePtr { + ptr: std::ptr::null_mut(), + phantom_value: PhantomData, + } + } + + fn variant(&self) -> NodeVariant { + unsafe { + match (*self.ptr).tag { + NodeTag::Internal4 => NodeVariant::Internal4( + NonNull::new_unchecked(self.ptr.cast::>()).as_ref(), + ), + NodeTag::Internal16 => NodeVariant::Internal16( + NonNull::new_unchecked(self.ptr.cast::>()).as_ref(), + ), + NodeTag::Internal48 => NodeVariant::Internal48( + NonNull::new_unchecked(self.ptr.cast::>()).as_ref(), + ), + NodeTag::Internal256 => NodeVariant::Internal256( + NonNull::new_unchecked(self.ptr.cast::>()).as_ref(), + ), + NodeTag::Leaf4 => NodeVariant::Leaf4( + NonNull::new_unchecked(self.ptr.cast::>()).as_ref(), + ), + NodeTag::Leaf16 => NodeVariant::Leaf16( + NonNull::new_unchecked(self.ptr.cast::>()).as_ref(), + ), + NodeTag::Leaf48 => NodeVariant::Leaf48( + NonNull::new_unchecked(self.ptr.cast::>()).as_ref(), + ), + NodeTag::Leaf256 => NodeVariant::Leaf256( + NonNull::new_unchecked(self.ptr.cast::>()).as_ref(), + ), + } + } + } + + fn variant_mut(&mut self) -> NodeVariantMut { + unsafe { + match (*self.ptr).tag { + NodeTag::Internal4 => NodeVariantMut::Internal4( + NonNull::new_unchecked(self.ptr.cast::>()).as_mut(), + ), + NodeTag::Internal16 => NodeVariantMut::Internal16( + NonNull::new_unchecked(self.ptr.cast::>()).as_mut(), + ), + NodeTag::Internal48 => NodeVariantMut::Internal48( + NonNull::new_unchecked(self.ptr.cast::>()).as_mut(), + ), + NodeTag::Internal256 => 
NodeVariantMut::Internal256( + NonNull::new_unchecked(self.ptr.cast::>()).as_mut(), + ), + NodeTag::Leaf4 => NodeVariantMut::Leaf4( + NonNull::new_unchecked(self.ptr.cast::>()).as_mut(), + ), + NodeTag::Leaf16 => NodeVariantMut::Leaf16( + NonNull::new_unchecked(self.ptr.cast::>()).as_mut(), + ), + NodeTag::Leaf48 => NodeVariantMut::Leaf48( + NonNull::new_unchecked(self.ptr.cast::>()).as_mut(), + ), + NodeTag::Leaf256 => NodeVariantMut::Leaf256( + NonNull::new_unchecked(self.ptr.cast::>()).as_mut(), + ), + } + } + } +} + +impl NodePtr { + pub(crate) fn prefix_matches(&self, key: &[u8]) -> Option { + let node_prefix = self.get_prefix(); + assert!(node_prefix.len() <= key.len()); // because we only use fixed-size keys + if &key[0..node_prefix.len()] != node_prefix { + None + } else { + Some(node_prefix.len()) + } + } + + pub(crate) fn get_prefix(&self) -> &[u8] { + match self.variant() { + NodeVariant::Internal4(n) => n.get_prefix(), + NodeVariant::Internal16(n) => n.get_prefix(), + NodeVariant::Internal48(n) => n.get_prefix(), + NodeVariant::Internal256(n) => n.get_prefix(), + NodeVariant::Leaf4(n) => n.get_prefix(), + NodeVariant::Leaf16(n) => n.get_prefix(), + NodeVariant::Leaf48(n) => n.get_prefix(), + NodeVariant::Leaf256(n) => n.get_prefix(), + } + } + + pub(crate) fn is_full(&self) -> bool { + match self.variant() { + NodeVariant::Internal4(n) => n.is_full(), + NodeVariant::Internal16(n) => n.is_full(), + NodeVariant::Internal48(n) => n.is_full(), + NodeVariant::Internal256(n) => n.is_full(), + NodeVariant::Leaf4(n) => n.is_full(), + NodeVariant::Leaf16(n) => n.is_full(), + NodeVariant::Leaf48(n) => n.is_full(), + NodeVariant::Leaf256(n) => n.is_full(), + } + } + + pub(crate) fn find_child_or_value(&self, key_byte: u8) -> Option> { + match self.variant() { + NodeVariant::Internal4(n) => n.find_child(key_byte).map(|c| ChildOrValuePtr::Child(c)), + NodeVariant::Internal16(n) => n.find_child(key_byte).map(|c| ChildOrValuePtr::Child(c)), + NodeVariant::Internal48(n) => n.find_child(key_byte).map(|c| ChildOrValuePtr::Child(c)), + NodeVariant::Internal256(n) => { + n.find_child(key_byte).map(|c| ChildOrValuePtr::Child(c)) + } + NodeVariant::Leaf4(n) => n + .get_leaf_value(key_byte) + .map(|v| ChildOrValuePtr::Value(v)), + NodeVariant::Leaf16(n) => n + .get_leaf_value(key_byte) + .map(|v| ChildOrValuePtr::Value(v)), + NodeVariant::Leaf48(n) => n + .get_leaf_value(key_byte) + .map(|v| ChildOrValuePtr::Value(v)), + NodeVariant::Leaf256(n) => n + .get_leaf_value(key_byte) + .map(|v| ChildOrValuePtr::Value(v)), + } + } + + pub(crate) fn truncate_prefix(&mut self, new_prefix_len: usize) { + match self.variant_mut() { + NodeVariantMut::Internal4(n) => n.truncate_prefix(new_prefix_len), + NodeVariantMut::Internal16(n) => n.truncate_prefix(new_prefix_len), + NodeVariantMut::Internal48(n) => n.truncate_prefix(new_prefix_len), + NodeVariantMut::Internal256(n) => n.truncate_prefix(new_prefix_len), + NodeVariantMut::Leaf4(n) => n.truncate_prefix(new_prefix_len), + NodeVariantMut::Leaf16(n) => n.truncate_prefix(new_prefix_len), + NodeVariantMut::Leaf48(n) => n.truncate_prefix(new_prefix_len), + NodeVariantMut::Leaf256(n) => n.truncate_prefix(new_prefix_len), + } + } + + pub(crate) fn grow(&self, allocator: &Allocator) -> NodePtr { + match self.variant() { + NodeVariant::Internal4(n) => n.grow(allocator), + NodeVariant::Internal16(n) => n.grow(allocator), + NodeVariant::Internal48(n) => n.grow(allocator), + NodeVariant::Internal256(_) => panic!("cannot grow Internal256 node"), + NodeVariant::Leaf4(n) => 
n.grow(allocator), + NodeVariant::Leaf16(n) => n.grow(allocator), + NodeVariant::Leaf48(n) => n.grow(allocator), + NodeVariant::Leaf256(_) => panic!("cannot grow Leaf256 node"), + } + } + + pub(crate) fn insert_child(&mut self, key_byte: u8, child: NodePtr) { + match self.variant_mut() { + NodeVariantMut::Internal4(n) => n.insert_child(key_byte, child), + NodeVariantMut::Internal16(n) => n.insert_child(key_byte, child), + NodeVariantMut::Internal48(n) => n.insert_child(key_byte, child), + NodeVariantMut::Internal256(n) => n.insert_child(key_byte, child), + NodeVariantMut::Leaf4(_) + | NodeVariantMut::Leaf16(_) + | NodeVariantMut::Leaf48(_) + | NodeVariantMut::Leaf256(_) => panic!("insert_child called on leaf node"), + } + } + + pub(crate) fn replace_child(&mut self, key_byte: u8, replacement: NodePtr) { + match self.variant_mut() { + NodeVariantMut::Internal4(n) => n.replace_child(key_byte, replacement), + NodeVariantMut::Internal16(n) => n.replace_child(key_byte, replacement), + NodeVariantMut::Internal48(n) => n.replace_child(key_byte, replacement), + NodeVariantMut::Internal256(n) => n.replace_child(key_byte, replacement), + NodeVariantMut::Leaf4(_) + | NodeVariantMut::Leaf16(_) + | NodeVariantMut::Leaf48(_) + | NodeVariantMut::Leaf256(_) => panic!("replace_child called on leaf node"), + } + } + + pub(crate) fn insert_value(&mut self, key_byte: u8, value: V) { + match self.variant_mut() { + NodeVariantMut::Internal4(_) + | NodeVariantMut::Internal16(_) + | NodeVariantMut::Internal48(_) + | NodeVariantMut::Internal256(_) => panic!("insert_value called on internal node"), + NodeVariantMut::Leaf4(n) => n.insert_value(key_byte, value), + NodeVariantMut::Leaf16(n) => n.insert_value(key_byte, value), + NodeVariantMut::Leaf48(n) => n.insert_value(key_byte, value), + NodeVariantMut::Leaf256(n) => n.insert_value(key_byte, value), + } + } +} + +pub fn new_root(allocator: &Allocator) -> NodePtr { + NodePtr { + ptr: allocator.alloc(NodeInternal256::::new()).as_ptr().cast(), + phantom_value: PhantomData, + } +} + +pub fn new_internal(prefix: &[u8], allocator: &Allocator) -> NodePtr { + let mut node = allocator.alloc(NodeInternal4 { + tag: NodeTag::Internal4, + lock_and_version: AtomicLockAndVersion::new(), + + prefix: [8; MAX_PREFIX_LEN], + prefix_len: prefix.len() as u8, + num_children: 0, + + child_keys: [0; 4], + child_ptrs: [const { NodePtr::null() }; 4], + }); + node.prefix[0..prefix.len()].copy_from_slice(prefix); + + node.as_ptr().into() +} + +pub fn new_leaf(prefix: &[u8], allocator: &Allocator) -> NodePtr { + let mut node = allocator.alloc(NodeLeaf4 { + tag: NodeTag::Leaf4, + lock_and_version: AtomicLockAndVersion::new(), + + prefix: [8; MAX_PREFIX_LEN], + prefix_len: prefix.len() as u8, + num_values: 0, + + child_keys: [0; 4], + child_values: [const { None }; 4], + }); + node.prefix[0..prefix.len()].copy_from_slice(prefix); + + node.as_ptr().into() +} + +impl NodeInternal4 { + fn get_prefix(&self) -> &[u8] { + &self.prefix[0..self.prefix_len as usize] + } + + fn truncate_prefix(&mut self, new_prefix_len: usize) { + assert!(new_prefix_len < self.prefix_len as usize); + let prefix = &mut self.prefix; + let offset = self.prefix_len as usize - new_prefix_len; + for i in 0..new_prefix_len { + prefix[i] = prefix[i + offset]; + } + self.prefix_len = new_prefix_len as u8; + } + + fn find_child(&self, key: u8) -> Option> { + for i in 0..self.num_children as usize { + if self.child_keys[i] == key { + return Some(self.child_ptrs[i]); + } + } + None + } + + fn replace_child(&mut self, key_byte: u8, 
replacement: NodePtr) { + for i in 0..self.num_children as usize { + if self.child_keys[i] == key_byte { + self.child_ptrs[i] = replacement; + return; + } + } + panic!("could not re-find parent with key {}", key_byte); + } + + fn is_full(&self) -> bool { + self.num_children == 4 + } + + fn insert_child(&mut self, key_byte: u8, child: NodePtr) { + assert!(self.num_children < 4); + + let idx = self.num_children as usize; + self.child_keys[idx] = key_byte; + self.child_ptrs[idx] = child; + self.num_children += 1; + } + + fn grow(&self, allocator: &Allocator) -> NodePtr { + let mut node16 = allocator.alloc(NodeInternal16 { + tag: NodeTag::Internal16, + lock_and_version: AtomicLockAndVersion::new(), + + prefix: self.prefix.clone(), + prefix_len: self.prefix_len, + num_children: self.num_children, + + child_keys: [0; 16], + child_ptrs: [const { NodePtr::null() }; 16], + }); + for i in 0..self.num_children as usize { + node16.child_keys[i] = self.child_keys[i]; + node16.child_ptrs[i] = self.child_ptrs[i]; + } + + node16.as_ptr().into() + } +} + +impl NodeInternal16 { + fn get_prefix(&self) -> &[u8] { + &self.prefix[0..self.prefix_len as usize] + } + + fn truncate_prefix(&mut self, new_prefix_len: usize) { + assert!(new_prefix_len < self.prefix_len as usize); + let prefix = &mut self.prefix; + let offset = self.prefix_len as usize - new_prefix_len; + for i in 0..new_prefix_len { + prefix[i] = prefix[i + offset]; + } + self.prefix_len = new_prefix_len as u8; + } + + fn find_child(&self, key_byte: u8) -> Option> { + for i in 0..self.num_children as usize { + if self.child_keys[i] == key_byte { + return Some(self.child_ptrs[i]); + } + } + None + } + + fn replace_child(&mut self, key_byte: u8, replacement: NodePtr) { + for i in 0..self.num_children as usize { + if self.child_keys[i] == key_byte { + self.child_ptrs[i] = replacement; + return; + } + } + panic!("could not re-find parent with key {}", key_byte); + } + + fn is_full(&self) -> bool { + self.num_children == 16 + } + + fn insert_child(&mut self, key_byte: u8, child: NodePtr) { + assert!(self.num_children < 16); + + let idx = self.num_children as usize; + self.child_keys[idx] = key_byte; + self.child_ptrs[idx] = child; + self.num_children += 1; + } + + fn grow(&self, allocator: &Allocator) -> NodePtr { + let mut node48 = allocator.alloc(NodeInternal48 { + tag: NodeTag::Internal48, + lock_and_version: AtomicLockAndVersion::new(), + + prefix: self.prefix.clone(), + prefix_len: self.prefix_len, + num_children: self.num_children, + + child_indexes: [INVALID_CHILD_INDEX; 256], + child_ptrs: [const { NodePtr::null() }; 48], + }); + for i in 0..self.num_children as usize { + let idx = self.child_keys[i] as usize; + node48.child_indexes[idx] = i as u8; + node48.child_ptrs[i] = self.child_ptrs[i]; + } + + node48.as_ptr().into() + } +} + +impl NodeInternal48 { + fn get_prefix(&self) -> &[u8] { + &self.prefix[0..self.prefix_len as usize] + } + + fn truncate_prefix(&mut self, new_prefix_len: usize) { + assert!(new_prefix_len < self.prefix_len as usize); + let prefix = &mut self.prefix; + let offset = self.prefix_len as usize - new_prefix_len; + for i in 0..new_prefix_len { + prefix[i] = prefix[i + offset]; + } + self.prefix_len = new_prefix_len as u8; + } + + fn find_child(&self, key_byte: u8) -> Option> { + let idx = self.child_indexes[key_byte as usize]; + if idx != INVALID_CHILD_INDEX { + Some(self.child_ptrs[idx as usize]) + } else { + None + } + } + + fn replace_child(&mut self, key_byte: u8, replacement: NodePtr) { + let idx = 
self.child_indexes[key_byte as usize]; + if idx != INVALID_CHILD_INDEX { + self.child_ptrs[idx as usize] = replacement + } else { + panic!("could not re-find parent with key {}", key_byte); + } + } + + fn is_full(&self) -> bool { + self.num_children == 48 + } + + fn insert_child(&mut self, key_byte: u8, child: NodePtr) { + assert!(self.num_children < 48); + assert!(self.child_indexes[key_byte as usize] == INVALID_CHILD_INDEX); + let idx = self.num_children; + self.child_indexes[key_byte as usize] = idx; + self.child_ptrs[idx as usize] = child; + self.num_children += 1; + } + + fn grow(&self, allocator: &Allocator) -> NodePtr { + let mut node256 = allocator.alloc(NodeInternal256 { + tag: NodeTag::Internal256, + lock_and_version: AtomicLockAndVersion::new(), + + prefix: self.prefix.clone(), + prefix_len: self.prefix_len, + num_children: self.num_children as u16, + + child_ptrs: [const { NodePtr::null() }; 256], + }); + for i in 0..256 { + let idx = self.child_indexes[i]; + if idx != INVALID_CHILD_INDEX { + node256.child_ptrs[i] = self.child_ptrs[idx as usize]; + } + } + node256.as_ptr().into() + } +} + +impl NodeInternal256 { + fn get_prefix(&self) -> &[u8] { + &self.prefix[0..self.prefix_len as usize] + } + + fn truncate_prefix(&mut self, new_prefix_len: usize) { + assert!(new_prefix_len < self.prefix_len as usize); + let prefix = &mut self.prefix; + let offset = self.prefix_len as usize - new_prefix_len; + for i in 0..new_prefix_len { + prefix[i] = prefix[i + offset]; + } + self.prefix_len = new_prefix_len as u8; + } + + fn find_child(&self, key_byte: u8) -> Option> { + let idx = key_byte as usize; + if !self.child_ptrs[idx].is_null() { + Some(self.child_ptrs[idx]) + } else { + None + } + } + + fn replace_child(&mut self, key_byte: u8, replacement: NodePtr) { + let idx = key_byte as usize; + if !self.child_ptrs[idx].is_null() { + self.child_ptrs[idx] = replacement + } else { + panic!("could not re-find parent with key {}", key_byte); + } + } + + fn is_full(&self) -> bool { + self.num_children == 256 + } + + fn insert_child(&mut self, key_byte: u8, child: NodePtr) { + assert!(self.num_children < 256); + assert!(self.child_ptrs[key_byte as usize].is_null()); + self.child_ptrs[key_byte as usize] = child; + self.num_children += 1; + } +} + +impl NodeLeaf4 { + fn get_prefix(&self) -> &[u8] { + &self.prefix[0..self.prefix_len as usize] + } + + fn truncate_prefix(&mut self, new_prefix_len: usize) { + assert!(new_prefix_len < self.prefix_len as usize); + let prefix = &mut self.prefix; + let offset = self.prefix_len as usize - new_prefix_len; + for i in 0..new_prefix_len { + prefix[i] = prefix[i + offset]; + } + self.prefix_len = new_prefix_len as u8; + } + + fn get_leaf_value<'a: 'b, 'b>(&'a self, key: u8) -> Option<&'b V> { + for i in 0..self.num_values { + if self.child_keys[i as usize] == key { + assert!(self.child_values[i as usize].is_some()); + return self.child_values[i as usize].as_ref(); + } + } + None + } + fn is_full(&self) -> bool { + self.num_values == 4 + } + + fn insert_value(&mut self, key_byte: u8, value: V) { + assert!(self.num_values < 16); + + let idx = self.num_values as usize; + self.child_keys[idx] = key_byte; + self.child_values[idx] = Some(value); + self.num_values += 1; + } + + fn grow(&self, allocator: &Allocator) -> NodePtr { + let mut node16 = allocator.alloc(NodeLeaf16 { + tag: NodeTag::Leaf16, + lock_and_version: AtomicLockAndVersion::new(), + + prefix: self.prefix.clone(), + prefix_len: self.prefix_len, + num_values: self.num_values, + + child_keys: [0; 16], + 
child_values: [const { None }; 16], + }); + for i in 0..self.num_values as usize { + node16.child_keys[i] = self.child_keys[i]; + node16.child_values[i] = self.child_values[i].clone(); + } + node16.as_ptr().into() + } +} + +impl NodeLeaf16 { + fn get_prefix(&self) -> &[u8] { + &self.prefix[0..self.prefix_len as usize] + } + + fn truncate_prefix(&mut self, new_prefix_len: usize) { + assert!(new_prefix_len < self.prefix_len as usize); + let prefix = &mut self.prefix; + let offset = self.prefix_len as usize - new_prefix_len; + for i in 0..new_prefix_len { + prefix[i] = prefix[i + offset]; + } + self.prefix_len = new_prefix_len as u8; + } + + fn get_leaf_value(&self, key: u8) -> Option<&V> { + for i in 0..self.num_values { + if self.child_keys[i as usize] == key { + assert!(self.child_values[i as usize].is_some()); + return self.child_values[i as usize].as_ref(); + } + } + None + } + fn is_full(&self) -> bool { + self.num_values == 16 + } + + fn insert_value(&mut self, key_byte: u8, value: V) { + assert!(self.num_values < 16); + + let idx = self.num_values as usize; + self.child_keys[idx] = key_byte; + self.child_values[idx] = Some(value); + self.num_values += 1; + } + fn grow(&self, allocator: &Allocator) -> NodePtr { + let mut node48 = allocator.alloc(NodeLeaf48 { + tag: NodeTag::Leaf48, + lock_and_version: AtomicLockAndVersion::new(), + + prefix: self.prefix.clone(), + prefix_len: self.prefix_len, + num_values: self.num_values, + + child_indexes: [INVALID_CHILD_INDEX; 256], + child_values: [const { None }; 48], + }); + for i in 0..self.num_values { + let idx = self.child_keys[i as usize]; + node48.child_indexes[idx as usize] = i; + node48.child_values[i as usize] = self.child_values[i as usize].clone(); + } + node48.as_ptr().into() + } +} + +impl NodeLeaf48 { + fn get_prefix(&self) -> &[u8] { + &self.prefix[0..self.prefix_len as usize] + } + + fn truncate_prefix(&mut self, new_prefix_len: usize) { + assert!(new_prefix_len < self.prefix_len as usize); + let prefix = &mut self.prefix; + let offset = self.prefix_len as usize - new_prefix_len; + for i in 0..new_prefix_len { + prefix[i] = prefix[i + offset]; + } + self.prefix_len = new_prefix_len as u8; + } + + fn get_leaf_value(&self, key: u8) -> Option<&V> { + let idx = self.child_indexes[key as usize]; + if idx != INVALID_CHILD_INDEX { + assert!(self.child_values[idx as usize].is_some()); + self.child_values[idx as usize].as_ref() + } else { + None + } + } + fn is_full(&self) -> bool { + self.num_values == 48 + } + + fn insert_value(&mut self, key_byte: u8, value: V) { + assert!(self.num_values < 48); + assert!(self.child_indexes[key_byte as usize] == INVALID_CHILD_INDEX); + let idx = self.num_values; + self.child_indexes[key_byte as usize] = idx; + self.child_values[idx as usize] = Some(value); + self.num_values += 1; + } + fn grow(&self, allocator: &Allocator) -> NodePtr { + let mut node256 = allocator.alloc(NodeLeaf256 { + tag: NodeTag::Leaf256, + lock_and_version: AtomicLockAndVersion::new(), + + prefix: self.prefix.clone(), + prefix_len: self.prefix_len, + num_values: self.num_values as u16, + + child_values: [const { None }; 256], + }); + for i in 0..256 { + let idx = self.child_indexes[i]; + if idx != INVALID_CHILD_INDEX { + node256.child_values[i] = self.child_values[idx as usize].clone(); + } + } + node256.as_ptr().into() + } +} + +impl NodeLeaf256 { + fn get_prefix(&self) -> &[u8] { + &self.prefix[0..self.prefix_len as usize] + } + + fn truncate_prefix(&mut self, new_prefix_len: usize) { + assert!(new_prefix_len < self.prefix_len as 
usize); + let prefix = &mut self.prefix; + let offset = self.prefix_len as usize - new_prefix_len; + for i in 0..new_prefix_len { + prefix[i] = prefix[i + offset]; + } + self.prefix_len = new_prefix_len as u8; + } + + fn get_leaf_value(&self, key: u8) -> Option<&V> { + let idx = key as usize; + self.child_values[idx].as_ref() + } + fn is_full(&self) -> bool { + self.num_values == 256 + } + + fn insert_value(&mut self, key_byte: u8, value: V) { + assert!(self.num_values < 256); + assert!(self.child_values[key_byte as usize].is_none()); + self.child_values[key_byte as usize] = Some(value); + self.num_values += 1; + } +} + +impl NodeInternal256 { + pub(crate) fn new() -> NodeInternal256 { + NodeInternal256 { + tag: NodeTag::Internal256, + lock_and_version: AtomicLockAndVersion::new(), + + prefix: [0; MAX_PREFIX_LEN], + prefix_len: 0, + num_children: 0, + + child_ptrs: [const { NodePtr::null() }; 256], + } + } +} + +impl From<*mut NodeInternal4> for NodePtr { + fn from(val: *mut NodeInternal4) -> NodePtr { + NodePtr { + ptr: val.cast(), + phantom_value: PhantomData, + } + } +} +impl From<*mut NodeInternal16> for NodePtr { + fn from(val: *mut NodeInternal16) -> NodePtr { + NodePtr { + ptr: val.cast(), + phantom_value: PhantomData, + } + } +} + +impl From<*mut NodeInternal48> for NodePtr { + fn from(val: *mut NodeInternal48) -> NodePtr { + NodePtr { + ptr: val.cast(), + phantom_value: PhantomData, + } + } +} + +impl From<*mut NodeInternal256> for NodePtr { + fn from(val: *mut NodeInternal256) -> NodePtr { + NodePtr { + ptr: val.cast(), + phantom_value: PhantomData, + } + } +} + +impl From<*mut NodeLeaf4> for NodePtr { + fn from(val: *mut NodeLeaf4) -> NodePtr { + NodePtr { + ptr: val.cast(), + phantom_value: PhantomData, + } + } +} +impl From<*mut NodeLeaf16> for NodePtr { + fn from(val: *mut NodeLeaf16) -> NodePtr { + NodePtr { + ptr: val.cast(), + phantom_value: PhantomData, + } + } +} + +impl From<*mut NodeLeaf48> for NodePtr { + fn from(val: *mut NodeLeaf48) -> NodePtr { + NodePtr { + ptr: val.cast(), + phantom_value: PhantomData, + } + } +} + +impl From<*mut NodeLeaf256> for NodePtr { + fn from(val: *mut NodeLeaf256) -> NodePtr { + NodePtr { + ptr: val.cast(), + phantom_value: PhantomData, + } + } +} diff --git a/libs/neonart/src/algorithm/node_ref.rs b/libs/neonart/src/algorithm/node_ref.rs new file mode 100644 index 0000000000..c5627b352b --- /dev/null +++ b/libs/neonart/src/algorithm/node_ref.rs @@ -0,0 +1,202 @@ +use std::fmt::Debug; +use std::marker::PhantomData; + +use super::lock_and_version::ResultOrRestart; +use super::node_ptr; +use super::node_ptr::ChildOrValuePtr; +use super::node_ptr::NodePtr; +use crate::EpochPin; +use crate::algorithm::lock_and_version::AtomicLockAndVersion; +use crate::{Allocator, Value}; + +pub struct NodeRef<'e, V> { + ptr: NodePtr, + + phantom: PhantomData<&'e EpochPin>, +} + +impl<'e, V> Debug for NodeRef<'e, V> { + fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { + write!(fmt, "{:?}", self.ptr) + } +} + +impl<'e, V: Value> NodeRef<'e, V> { + pub(crate) fn from_root_ptr(root_ptr: NodePtr) -> NodeRef<'e, V> { + NodeRef { + ptr: root_ptr, + phantom: PhantomData, + } + } + + pub(crate) fn read_lock_or_restart(&self) -> ResultOrRestart> { + let version = self.lockword().read_lock_or_restart()?; + Ok(ReadLockedNodeRef { + ptr: self.ptr, + version, + phantom: self.phantom, + }) + } + + fn lockword(&self) -> &AtomicLockAndVersion { + self.ptr.lockword() + } +} + +/// A reference to a node that has been optimistically 
read-locked. The functions re-check +/// the version after each read. +pub struct ReadLockedNodeRef<'e, V> { + ptr: NodePtr, + version: u64, + + phantom: PhantomData<&'e EpochPin>, +} + +pub(crate) enum ChildOrValue<'e, V> { + Child(NodeRef<'e, V>), + Value(*const V), +} + +impl<'e, V: Value> ReadLockedNodeRef<'e, V> { + pub(crate) fn is_full(&self) -> bool { + self.ptr.is_full() + } + + pub(crate) fn get_prefix(&self) -> &[u8] { + self.ptr.get_prefix() + } + + /// Note: because we're only holding a read lock, the prefix can change concurrently. + /// You must be prepared to restart, if read_unlock() returns error later. + /// + /// Returns the length of the prefix, or None if it's not a match + pub(crate) fn prefix_matches(&self, key: &[u8]) -> Option { + self.ptr.prefix_matches(key) + } + + pub(crate) fn find_child_or_value_or_restart( + &self, + key_byte: u8, + ) -> ResultOrRestart>> { + let child_or_value = self.ptr.find_child_or_value(key_byte); + self.ptr.lockword().check_or_restart(self.version)?; + + match child_or_value { + None => Ok(None), + Some(ChildOrValuePtr::Value(vptr)) => Ok(Some(ChildOrValue::Value(vptr))), + Some(ChildOrValuePtr::Child(child_ptr)) => Ok(Some(ChildOrValue::Child(NodeRef { + ptr: child_ptr, + phantom: self.phantom, + }))), + } + } + + pub(crate) fn upgrade_to_write_lock_or_restart( + self, + ) -> ResultOrRestart> { + self.ptr + .lockword() + .upgrade_to_write_lock_or_restart(self.version)?; + + Ok(WriteLockedNodeRef { + ptr: self.ptr, + phantom: self.phantom, + }) + } + + pub(crate) fn read_unlock_or_restart(self) -> ResultOrRestart<()> { + self.ptr.lockword().check_or_restart(self.version)?; + Ok(()) + } +} + +/// A reference to a node that has been optimistically read-locked. The functions re-check +/// the version after each read. 
+pub struct WriteLockedNodeRef<'e, V> { + ptr: NodePtr, + phantom: PhantomData<&'e EpochPin>, +} + +impl<'e, V: Value> WriteLockedNodeRef<'e, V> { + pub(crate) fn is_leaf(&self) -> bool { + self.ptr.is_leaf() + } + + pub(crate) fn write_unlock(mut self) { + self.ptr.lockword().write_unlock(); + self.ptr = NodePtr::null(); + } + + pub(crate) fn write_unlock_obsolete(mut self) { + self.ptr.lockword().write_unlock_obsolete(); + self.ptr = NodePtr::null(); + } + + pub(crate) fn get_prefix(&self) -> &[u8] { + self.ptr.get_prefix() + } + + pub(crate) fn truncate_prefix(&mut self, new_prefix_len: usize) { + self.ptr.truncate_prefix(new_prefix_len) + } + + pub(crate) fn insert_child(&mut self, key_byte: u8, child: NodePtr) { + self.ptr.insert_child(key_byte, child) + } + + pub(crate) fn insert_value(&mut self, key_byte: u8, value: V) { + self.ptr.insert_value(key_byte, value) + } + + pub(crate) fn grow(&self, allocator: &Allocator) -> NewNodeRef { + let new_node = self.ptr.grow(allocator); + NewNodeRef { ptr: new_node } + } + + pub(crate) fn as_ptr(&self) -> NodePtr { + self.ptr + } + + pub(crate) fn replace_child(&mut self, key_byte: u8, replacement: NodePtr) { + self.ptr.replace_child(key_byte, replacement); + } +} + +impl<'e, V> Drop for WriteLockedNodeRef<'e, V> { + fn drop(&mut self) { + if !self.ptr.is_null() { + self.ptr.lockword().write_unlock(); + } + } +} + +pub(crate) struct NewNodeRef { + ptr: NodePtr, +} + +impl NewNodeRef { + pub(crate) fn insert_child(&mut self, key_byte: u8, child: NodePtr) { + self.ptr.insert_child(key_byte, child) + } + + pub(crate) fn insert_value(&mut self, key_byte: u8, value: V) { + self.ptr.insert_value(key_byte, value) + } + + pub(crate) fn into_ptr(self) -> NodePtr { + let ptr = self.ptr; + ptr + } +} + +pub(crate) fn new_internal(prefix: &[u8], allocator: &Allocator) -> NewNodeRef { + NewNodeRef { + ptr: node_ptr::new_internal(prefix, allocator), + } +} + +pub(crate) fn new_leaf(prefix: &[u8], allocator: &Allocator) -> NewNodeRef { + NewNodeRef { + ptr: node_ptr::new_leaf(prefix, allocator), + } +} diff --git a/libs/neonart/src/allocator.rs b/libs/neonart/src/allocator.rs new file mode 100644 index 0000000000..5e417a5426 --- /dev/null +++ b/libs/neonart/src/allocator.rs @@ -0,0 +1,107 @@ +use std::marker::PhantomData; +use std::mem::MaybeUninit; +use std::ops::{Deref, DerefMut}; +use std::ptr::NonNull; +use std::sync::atomic::{AtomicUsize, Ordering}; + +pub struct Allocator { + area: *mut MaybeUninit, + allocated: AtomicUsize, + size: usize, +} + +// FIXME: I don't know if these are really safe... +unsafe impl Send for Allocator {} +unsafe impl Sync for Allocator {} + +#[repr(transparent)] +pub struct AllocatedBox<'a, T> { + inner: NonNull, + + _phantom: PhantomData<&'a Allocator>, +} + +// FIXME: I don't know if these are really safe... 
+unsafe impl<'a, T> Send for AllocatedBox<'a, T> {} +unsafe impl<'a, T> Sync for AllocatedBox<'a, T> {} + +impl Deref for AllocatedBox<'_, T> { + type Target = T; + + fn deref(&self) -> &T { + unsafe { self.inner.as_ref() } + } +} + +impl DerefMut for AllocatedBox<'_, T> { + fn deref_mut(&mut self) -> &mut T { + unsafe { self.inner.as_mut() } + } +} + +impl AsMut for AllocatedBox<'_, T> { + fn as_mut(&mut self) -> &mut T { + unsafe { self.inner.as_mut() } + } +} + +impl AllocatedBox<'_, T> { + pub fn as_ptr(&self) -> *mut T { + self.inner.as_ptr() + } +} + +const MAXALIGN: usize = std::mem::align_of::(); + +impl Allocator { + pub fn new_uninit(area: &'static mut [MaybeUninit]) -> Allocator { + let ptr = area.as_mut_ptr(); + let size = area.len(); + Self::new_from_ptr(ptr, size) + } + + pub fn new(area: &'static mut [u8]) -> Allocator { + let ptr: *mut MaybeUninit = area.as_mut_ptr().cast(); + let size = area.len(); + Self::new_from_ptr(ptr, size) + } + + pub fn new_from_ptr(ptr: *mut MaybeUninit, size: usize) -> Allocator { + let padding = ptr.align_offset(MAXALIGN); + + Allocator { + area: ptr, + allocated: AtomicUsize::new(padding), + size, + } + } + + pub fn alloc<'a, T: Sized>(&'a self, value: T) -> AllocatedBox<'a, T> { + let sz = std::mem::size_of::(); + + // pad all allocations to MAXALIGN boundaries + assert!(std::mem::align_of::() <= MAXALIGN); + let sz = sz.next_multiple_of(MAXALIGN); + + let offset = self.allocated.fetch_add(sz, Ordering::Relaxed); + + if offset + sz > self.size { + panic!("out of memory"); + } + + let inner = unsafe { + let inner = self.area.offset(offset as isize).cast::(); + *inner = value; + NonNull::new_unchecked(inner) + }; + + AllocatedBox { + inner, + _phantom: PhantomData, + } + } + + pub fn _dealloc_node(&self, _node: AllocatedBox) { + // doesn't free it immediately. + } +} diff --git a/libs/neonart/src/epoch.rs b/libs/neonart/src/epoch.rs new file mode 100644 index 0000000000..00019a3b9a --- /dev/null +++ b/libs/neonart/src/epoch.rs @@ -0,0 +1,23 @@ +//! This is similar to crossbeam_epoch crate, but works in shared memory +//! +//! FIXME: not implemented yet. (We haven't implemented removing any nodes from the ART +//! tree, which is why we get away without this now) + +pub(crate) struct EpochPin {} + +pub(crate) fn pin_epoch() -> EpochPin { + EpochPin {} +} + +/* +struct CollectorGlobal { + epoch: AtomicU64, + + participants: CachePadded, // make it an array +} + + +struct CollectorQueue { + +} +*/ diff --git a/libs/neonart/src/lib.rs b/libs/neonart/src/lib.rs new file mode 100644 index 0000000000..64a08dd45d --- /dev/null +++ b/libs/neonart/src/lib.rs @@ -0,0 +1,301 @@ +//! Adaptive Radix Tree (ART) implementation, with Optimistic Lock Coupling. +//! +//! The data structure is described in these two papers: +//! +//! [1] Leis, V. & Kemper, Alfons & Neumann, Thomas. (2013). +//! The adaptive radix tree: ARTful indexing for main-memory databases. +//! Proceedings - International Conference on Data Engineering. 38-49. 10.1109/ICDE.2013.6544812. +//! https://db.in.tum.de/~leis/papers/ART.pdf +//! +//! [2] Leis, Viktor & Scheibner, Florian & Kemper, Alfons & Neumann, Thomas. (2016). +//! The ART of practical synchronization. +//! 1-8. 10.1145/2933349.2933352. +//! https://db.in.tum.de/~leis/papers/artsync.pdf +//! +//! [1] describes the base data structure, and [2] describes the Optimistic Lock Coupling that we +//! use. +//! +//! The papers mention a few different variants. We have made the following choices in this +//! implementation: +//! +//! 
- All keys have the same length +//! +//! - Multi-value leaves. The values are stored directly in one of the four different leaf node +//! types. +//! +//! - For collapsing inner nodes, we use the Pessimistic approach, where each inner node stores a +//! variable-length "prefix", which stores the keys of all the one-way nodes which have been +//! removed. However, similar to the "hybrid" approach described in the paper, each node only has +//! space for a constant-size prefix of 8 bytes. If a node would have a longer prefix, then we +//! create one-way nodes to store them. (There was no particular reason for this choice, +//! the "hybrid" approach described in the paper might be better.) +//! +//! - For concurrency, we use Optimistic Lock Coupling. The paper [2] also describes another method, +//! ROWEX, which generally performs better when there is contention, but that is not important +//! for us, and Optimistic Lock Coupling is simpler to implement. +//! +//! ## Requirements +//! +//! This data structure is currently used for the integrated LFC, relsize and last-written LSN cache +//! in the compute communicator, part of the 'neon' Postgres extension. We have some unique +//! requirements, which is why we had to write our own. Namely: +//! +//! - The data structure has to live in a fixed-size shared memory segment. That rules out any +//! built-in Rust collections and most crates. (Except possibly with the 'allocator_api' rust +//! feature, which is still nightly-only and experimental as of this writing). +//! +//! - The data structure is accessed from multiple processes. Only one process updates the data +//! structure, but other processes perform reads. That rules out using built-in Rust locking +//! primitives like Mutex and RwLock, and most crates too. +//! +//! - Within the one process with write-access, multiple threads can perform updates concurrently. +//! That rules out using PostgreSQL LWLocks for the locking. +//! +//! The implementation is generic, and doesn't depend on any PostgreSQL specifics, but it has been +//! written with that usage and the above constraints in mind. Some noteworthy assumptions: +//! +//! - Contention is assumed to be rare. In the integrated cache in PostgreSQL, there's higher-level +//! locking in the PostgreSQL buffer manager, which ensures that two backends should not try to +//! read / write the same page at the same time. (Prefetching can conflict with actual reads, +//! however.) +//! +//! - The keys in the integrated cache are 17 bytes long. +//! +//! ## Usage +//! +//! Because this is designed to be used as a Postgres shared memory data structure, initialization +//! happens in three stages: +//! +//! 0. A fixed area of shared memory is allocated at postmaster startup. +//! +//! 1. TreeInitStruct::new() is called to initialize it, still in the postmaster process, before any +//! other process or thread is running. It returns a TreeInitStruct, which is inherited by all +//! the processes through fork(). +//! +//! 2. One process may have write-access to the struct, by calling +//! [TreeInitStruct::attach_writer]. (That process is the communicator process.) +//! +//! 3. Other processes get read-access to the struct, by calling [TreeInitStruct::attach_reader]. +//! +//! "Write access" means that you can insert / update / delete values in the tree. +//! +//! NOTE: The Values stored in the tree are sometimes moved, when a leaf node fills up and a new +//! larger node needs to be allocated. The versioning and epoch-based allocator ensure that the data +//!
structure stays consistent, but if the Value has interior mutability, like atomic fields, +//! updates to such fields might be lost if the leaf node is concurrently moved! If that becomes a +//! problem, the version check could be passed up to the caller, so that the caller could detect the +//! lost updates and retry the operation. +//! +//! ## Implementation +//! +//! node_ptr: Provides low-level implementations of the four different node types (eight actually, +//! since there is an Internal and Leaf variant of each) +//! +//! lock_and_version.rs: Provides an abstraction for the combined lock and version counter on each +//! node. +//! +//! node_ref.rs: The code in node_ptr.rs deals with raw pointers. node_ref.rs provides more type-safe +//! abstractions on top. +//! +//! algorithm.rs: Contains the functions to implement lookups and updates in the tree +//! +//! allocator.rs: Provides a facility to allocate memory for the tree nodes. (We must provide our +//! own abstraction for that because we need the data structure to live in a pre-allocated shared +//! memory segment). +//! +//! epoch.rs: The data structure requires that when a node is removed from the tree, it is not +//! immediately deallocated, but stays around for as long as concurrent readers might still have +//! pointers to them. This is enforced by an epoch system. This is similar to +//! e.g. crossbeam_epoch, but we couldn't use that either because it has to work across processes +//! communicating over the shared memory segment. +//! +//! ## See also +//! +//! There are some existing Rust ART implementations out there, but none of them filled all +//! the requirements: +//! +//! - https://github.com/XiangpengHao/congee +//! - https://github.com/declanvk/blart +//! +//! ## TODO +//! +//! - Removing values has not been implemented + +mod algorithm; +mod allocator; +mod epoch; + +use algorithm::RootPtr; + +use allocator::AllocatedBox; + +use std::fmt::Debug; +use std::marker::PhantomData; +use std::sync::atomic::{AtomicBool, Ordering}; + +use crate::epoch::EpochPin; + +#[cfg(test)] +mod tests; + +pub use allocator::Allocator; + +/// Fixed-length key type. +/// +pub trait Key: Clone + Debug { + const KEY_LEN: usize; + + fn as_bytes(&self) -> &[u8]; +} + +/// Values stored in the tree +/// +/// Values need to be Cloneable, because when a node "grows", the value is copied to a new node and +/// the old sticks around until all readers that might see the old value are gone. +pub trait Value: Clone {} + +struct Tree { + root: RootPtr, + + writer_attached: AtomicBool, + + phantom_key: PhantomData, +} + +/// Struct created at postmaster startup +pub struct TreeInitStruct<'t, K: Key, V: Value> { + tree: AllocatedBox<'t, Tree>, + + allocator: &'t Allocator, +} + +/// The worker process has a reference to this. The write operations are only safe +/// from the worker process +pub struct TreeWriteAccess<'t, K: Key, V: Value> +where + K: Key, + V: Value, +{ + tree: AllocatedBox<'t, Tree>, + + allocator: &'t Allocator, +} + +/// The backends have a reference to this. 
It cannot be used to modify the tree +pub struct TreeReadAccess<'t, K: Key, V: Value> +where + K: Key, + V: Value, +{ + tree: AllocatedBox<'t, Tree>, +} + +impl<'a, 't: 'a, K: Key, V: Value> TreeInitStruct<'t, K, V> { + pub fn new(allocator: &'t Allocator) -> TreeInitStruct<'t, K, V> { + let tree = allocator.alloc(Tree { + root: algorithm::new_root(allocator), + writer_attached: AtomicBool::new(false), + phantom_key: PhantomData, + }); + + TreeInitStruct { tree, allocator } + } + + pub fn attach_writer(self) -> TreeWriteAccess<'t, K, V> { + let previously_attached = self.tree.writer_attached.swap(true, Ordering::Relaxed); + if previously_attached { + panic!("writer already attached"); + } + TreeWriteAccess { + tree: self.tree, + allocator: self.allocator, + } + } + + pub fn attach_reader(self) -> TreeReadAccess<'t, K, V> { + TreeReadAccess { tree: self.tree } + } +} + +impl<'t, K: Key + Clone, V: Value> TreeWriteAccess<'t, K, V> { + pub fn start_write(&'t self) -> TreeWriteGuard<'t, K, V> { + // TODO: grab epoch guard + TreeWriteGuard { + allocator: self.allocator, + tree: &self.tree, + epoch_pin: epoch::pin_epoch(), + } + } + + pub fn start_read(&'t self) -> TreeReadGuard<'t, K, V> { + TreeReadGuard { + tree: &self.tree, + epoch_pin: epoch::pin_epoch(), + } + } +} + +impl<'t, K: Key + Clone, V: Value> TreeReadAccess<'t, K, V> { + pub fn start_read(&'t self) -> TreeReadGuard<'t, K, V> { + TreeReadGuard { + tree: &self.tree, + epoch_pin: epoch::pin_epoch(), + } + } +} + +pub struct TreeReadGuard<'t, K, V> +where + K: Key, + V: Value, +{ + tree: &'t AllocatedBox<'t, Tree>, + + epoch_pin: EpochPin, +} + +impl<'t, K: Key, V: Value> TreeReadGuard<'t, K, V> { + pub fn get(&self, key: &K) -> Option { + algorithm::search(key, self.tree.root, &self.epoch_pin) + } +} + +pub struct TreeWriteGuard<'t, K, V> +where + K: Key, + V: Value, +{ + tree: &'t AllocatedBox<'t, Tree>, + allocator: &'t Allocator, + + epoch_pin: EpochPin, +} + +impl<'t, K: Key, V: Value> TreeWriteGuard<'t, K, V> { + pub fn insert(&mut self, key: &K, value: V) { + self.update_with_fn(key, |_| Some(value)) + } + + pub fn update_with_fn(&mut self, key: &K, value_fn: F) + where + F: FnOnce(Option<&V>) -> Option, + { + algorithm::update_fn( + key, + value_fn, + self.tree.root, + self.allocator, + &self.epoch_pin, + ) + } + + pub fn get(&mut self, key: &K) -> Option { + algorithm::search(key, self.tree.root, &self.epoch_pin) + } +} + +impl<'t, K: Key, V: Value + Debug> TreeWriteGuard<'t, K, V> { + pub fn dump(&mut self) { + algorithm::dump_tree(self.tree.root, &self.epoch_pin) + } +} diff --git a/libs/neonart/src/tests.rs b/libs/neonart/src/tests.rs new file mode 100644 index 0000000000..2a81e7a0a0 --- /dev/null +++ b/libs/neonart/src/tests.rs @@ -0,0 +1,90 @@ +use std::collections::HashSet; + +use crate::Allocator; +use crate::TreeInitStruct; + +use crate::{Key, Value}; + +use rand::seq::SliceRandom; +use rand::thread_rng; + +const TEST_KEY_LEN: usize = 16; + +#[derive(Clone, Copy, Debug)] +struct TestKey([u8; TEST_KEY_LEN]); + +impl Key for TestKey { + const KEY_LEN: usize = TEST_KEY_LEN; + + fn as_bytes(&self) -> &[u8] { + &self.0 + } +} + +impl From for TestKey { + fn from(val: u128) -> TestKey { + TestKey(val.to_be_bytes()) + } +} + +impl Value for usize {} + +fn test_inserts + Copy>(keys: &[K]) { + const MEM_SIZE: usize = 10000000; + let area = Box::leak(Box::new_uninit_slice(MEM_SIZE)); + + let allocator = Box::leak(Box::new(Allocator::new_uninit(area))); + + let init_struct = TreeInitStruct::::new(allocator); + let tree_writer = 
init_struct.attach_writer(); + + for (idx, k) in keys.iter().enumerate() { + let mut w = tree_writer.start_write(); + w.insert(&(*k).into(), idx); + eprintln!("INSERTED {:?}", Into::::into(*k)); + } + + //tree_writer.start_read().dump(); + + for (idx, k) in keys.iter().enumerate() { + let r = tree_writer.start_read(); + let value = r.get(&(*k).into()); + assert_eq!(value, Some(idx)); + } +} + +#[test] +fn dense() { + // This exercises splitting a node with prefix + let keys: &[u128] = &[0, 1, 2, 3, 256]; + test_inserts(keys); + + // Dense keys + let mut keys: Vec = (0..10000).collect(); + test_inserts(&keys); + + // Do the same in random orders + for _ in 1..10 { + keys.shuffle(&mut thread_rng()); + test_inserts(&keys); + } +} + +#[test] +fn sparse() { + // sparse keys + let mut keys: Vec = Vec::new(); + let mut used_keys = HashSet::new(); + for _ in 0..10000 { + loop { + let key = rand::random::(); + if used_keys.get(&key).is_some() { + continue; + } + used_keys.insert(key); + keys.push(key.into()); + break; + } + } + test_inserts(&keys); +} diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 8abd504922..fbd577f3b7 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -42,12 +42,14 @@ nix.workspace = true num_cpus.workspace = true num-traits.workspace = true once_cell.workspace = true +peekable.workspace = true pin-project-lite.workspace = true postgres_backend.workspace = true postgres-protocol.workspace = true postgres-types.workspace = true postgres_initdb.workspace = true pprof.workspace = true +prost.workspace = true rand.workspace = true range-set-blaze = { version = "0.1.16", features = ["alloc"] } regex.workspace = true @@ -60,6 +62,7 @@ serde_path_to_error.workspace = true serde_with.workspace = true sysinfo.workspace = true tokio-tar.workspace = true +tonic.workspace = true thiserror.workspace = true tikv-jemallocator.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] } @@ -76,6 +79,7 @@ url.workspace = true walkdir.workspace = true metrics.workspace = true pageserver_api.workspace = true +pageserver_data_api.workspace = true pageserver_client.workspace = true # for ResponseErrorMessageExt TOOD refactor that pageserver_compaction.workspace = true pem.workspace = true diff --git a/pageserver/client_grpc/Cargo.toml b/pageserver/client_grpc/Cargo.toml new file mode 100644 index 0000000000..3a2e4150b1 --- /dev/null +++ b/pageserver/client_grpc/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "pageserver_client_grpc" +version = "0.1.0" +edition = "2024" + +[dependencies] +bytes.workspace = true +http.workspace = true +thiserror.workspace = true +tonic.workspace = true +tracing.workspace = true + +pageserver_data_api.workspace = true diff --git a/pageserver/client_grpc/src/lib.rs b/pageserver/client_grpc/src/lib.rs new file mode 100644 index 0000000000..3115990331 --- /dev/null +++ b/pageserver/client_grpc/src/lib.rs @@ -0,0 +1,221 @@ +//! Pageserver Data API client +//! +//! - Manage connections to pageserver +//! - Send requests to correct shards +//! 
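(Editorial sketch, not part of the patch: a minimal, hypothetical example of driving the client added in this file. The endpoint URL, OIDs, LSN and block number are made-up placeholders, and it assumes utils::lsn::Lsn can be built from a u64, as the conversions in pageserver_data_api's model.rs do; only the PageserverClient::new and get_page signatures introduced here are exercised.)

use std::collections::HashMap;

use pageserver_client_grpc::{PageserverClient, PageserverClientError};
use pageserver_data_api::model::{GetPageRequest, RelTag, RequestCommon};
use utils::lsn::Lsn;

/// Fetch a single page image from shard 0 of a single-sharded tenant.
async fn fetch_one_page(
    tenant_id: &str,
    timeline_id: &str,
) -> Result<(), PageserverClientError> {
    // Single-shard map: shard number -> connection string (placeholder URL).
    let shard_map = HashMap::from([(0, "http://localhost:51051".to_string())]);

    // No auth token in this sketch.
    let client = PageserverClient::new(tenant_id, timeline_id, &None, shard_map);

    // Read block 0 of a relation; all field values are illustrative.
    let lsn = Lsn::from(0x1234_5678u64);
    let page = client
        .get_page(&GetPageRequest {
            common: RequestCommon {
                request_lsn: lsn,
                not_modified_since_lsn: lsn,
            },
            rel: RelTag {
                spc_oid: 1663,
                db_oid: 5,
                rel_number: 16384,
                fork_number: 0,
            },
            block_number: 0,
        })
        .await?;

    // `page` is a bytes::Bytes holding the raw 8 KiB page image.
    assert!(!page.is_empty());
    Ok(())
}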
+use std::collections::HashMap; +use std::sync::RwLock; + +use bytes::Bytes; +use http; +use thiserror::Error; +use tonic; +use tonic::metadata::AsciiMetadataValue; +use tonic::transport::Channel; + +use pageserver_data_api::model::*; +use pageserver_data_api::proto; + +type Shardno = u16; + +use pageserver_data_api::client::PageServiceClient; + +type MyPageServiceClient = pageserver_data_api::client::PageServiceClient< + tonic::service::interceptor::InterceptedService, +>; + +#[derive(Error, Debug)] +pub enum PageserverClientError { + #[error("could not connect to service: {0}")] + ConnectError(#[from] tonic::transport::Error), + #[error("could not perform request: {0}`")] + RequestError(#[from] tonic::Status), + + #[error("could not perform request: {0}`")] + InvalidUri(#[from] http::uri::InvalidUri), +} + +pub struct PageserverClient { + _tenant_id: String, + _timeline_id: String, + + _auth_token: Option, + + shard_map: HashMap, + + channels: RwLock>, + + auth_interceptor: AuthInterceptor, +} + +impl PageserverClient { + /// TODO: this doesn't currently react to changes in the shard map. + pub fn new( + tenant_id: &str, + timeline_id: &str, + auth_token: &Option, + shard_map: HashMap, + ) -> Self { + Self { + _tenant_id: tenant_id.to_string(), + _timeline_id: timeline_id.to_string(), + _auth_token: auth_token.clone(), + shard_map, + channels: RwLock::new(HashMap::new()), + auth_interceptor: AuthInterceptor::new(tenant_id, timeline_id, auth_token.as_ref()), + } + } + + pub async fn process_rel_exists_request( + &self, + request: &RelExistsRequest, + ) -> Result { + // Current sharding model assumes that all metadata is present only at shard 0. + let shard_no = 0; + + let mut client = self.get_client(shard_no).await?; + + let request = proto::RelExistsRequest::from(request); + let response = client.rel_exists(tonic::Request::new(request)).await?; + + Ok(response.get_ref().exists) + } + + pub async fn process_rel_size_request( + &self, + request: &RelSizeRequest, + ) -> Result { + // Current sharding model assumes that all metadata is present only at shard 0. + let shard_no = 0; + + let mut client = self.get_client(shard_no).await?; + + let request = proto::RelSizeRequest::from(request); + let response = client.rel_size(tonic::Request::new(request)).await?; + + Ok(response.get_ref().num_blocks) + } + + pub async fn get_page(&self, request: &GetPageRequest) -> Result { + // FIXME: calculate the shard number correctly + let shard_no = 0; + + let mut client = self.get_client(shard_no).await?; + + let request = proto::GetPageRequest::from(request); + let response = client.get_page(tonic::Request::new(request)).await?; + + Ok(response.into_inner().page_image) + } + + /// Process a request to get the size of a database. + pub async fn process_dbsize_request( + &self, + request: &DbSizeRequest, + ) -> Result { + // Current sharding model assumes that all metadata is present only at shard 0. + let shard_no = 0; + + let mut client = self.get_client(shard_no).await?; + + let request = proto::DbSizeRequest::from(request); + let response = client.db_size(tonic::Request::new(request)).await?; + + Ok(response.get_ref().num_bytes) + } + + /// Process a request to get the size of a database. + pub async fn get_base_backup( + &self, + request: &GetBaseBackupRequest, + gzip: bool, + ) -> std::result::Result< + tonic::Response>, + PageserverClientError, + > { + // Current sharding model assumes that all metadata is present only at shard 0. 
+ let shard_no = 0; + + let mut client = self.get_client(shard_no).await?; + if gzip { + client = client.accept_compressed(tonic::codec::CompressionEncoding::Gzip); + } + + let request = proto::GetBaseBackupRequest::from(request); + let response = client.get_base_backup(tonic::Request::new(request)).await?; + + Ok(response) + } + + /// Get a client for given shard + /// + /// This implements very basic caching. If we already have a client for the given shard, + /// reuse it. If not, create a new client and put it to the cache. + async fn get_client( + &self, + shard_no: u16, + ) -> Result { + let reused_channel: Option = { + let channels = self.channels.read().unwrap(); + + channels.get(&shard_no).cloned() + }; + + let channel = if let Some(reused_channel) = reused_channel { + reused_channel + } else { + let endpoint: tonic::transport::Endpoint = self + .shard_map + .get(&shard_no) + .expect("no url for shard {shard_no}") + .parse()?; + let channel = endpoint.connect().await?; + + // Insert it to the cache so that it can be reused on subsequent calls. It's possible + // that another thread did the same concurrently, in which case we will overwrite the + // client in the cache. + { + let mut channels = self.channels.write().unwrap(); + channels.insert(shard_no, channel.clone()); + } + channel + }; + + let client = PageServiceClient::with_interceptor(channel, self.auth_interceptor.clone()); + Ok(client) + } +} + +/// Inject tenant_id, timeline_id and authentication token to all pageserver requests. +#[derive(Clone)] +struct AuthInterceptor { + tenant_id: AsciiMetadataValue, + timeline_id: AsciiMetadataValue, + + auth_token: Option, +} + +impl AuthInterceptor { + fn new(tenant_id: &str, timeline_id: &str, auth_token: Option<&String>) -> Self { + Self { + tenant_id: tenant_id.parse().expect("could not parse tenant id"), + timeline_id: timeline_id.parse().expect("could not parse timeline id"), + auth_token: auth_token.map(|x| x.parse().expect("could not parse auth token")), + } + } +} + +impl tonic::service::Interceptor for AuthInterceptor { + fn call(&mut self, mut req: tonic::Request<()>) -> Result, tonic::Status> { + req.metadata_mut() + .insert("neon-tenant-id", self.tenant_id.clone()); + req.metadata_mut() + .insert("neon-timeline-id", self.timeline_id.clone()); + if let Some(auth_token) = &self.auth_token { + req.metadata_mut() + .insert("neon-auth-token", auth_token.clone()); + } + + Ok(req) + } +} diff --git a/pageserver/data_api/Cargo.toml b/pageserver/data_api/Cargo.toml new file mode 100644 index 0000000000..895f6fb2b7 --- /dev/null +++ b/pageserver/data_api/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "pageserver_data_api" +version = "0.1.0" +edition = "2024" + +[dependencies] + +# For Lsn. +# +# TODO: move Lsn to separate crate? This draws in a lot more dependencies +utils.workspace = true + +prost.workspace = true +thiserror.workspace = true +tonic.workspace = true + +[build-dependencies] +tonic-build.workspace = true diff --git a/pageserver/data_api/build.rs b/pageserver/data_api/build.rs new file mode 100644 index 0000000000..8a4dfca836 --- /dev/null +++ b/pageserver/data_api/build.rs @@ -0,0 +1,8 @@ +fn main() -> Result<(), Box> { + // Generate rust code from .proto protobuf. 
+ tonic_build::configure() + .bytes(&["."]) + .compile_protos(&["proto/page_service.proto"], &["proto"]) + .unwrap_or_else(|e| panic!("failed to compile protos {:?}", e)); + Ok(()) +} diff --git a/pageserver/data_api/proto/page_service.proto b/pageserver/data_api/proto/page_service.proto new file mode 100644 index 0000000000..0e9116f39a --- /dev/null +++ b/pageserver/data_api/proto/page_service.proto @@ -0,0 +1,84 @@ +// Page service presented by pageservers, for computes +// +// Each request must come with the following metadata: +// - neon-tenant-id +// - neon-timeline-id +// - neon-auth-token (if auth is enabled) +// +// TODO: what else? Priority? OpenTelemetry tracing? +// + +syntax = "proto3"; +package page_service; + +service PageService { + rpc RelExists(RelExistsRequest) returns (RelExistsResponse); + + // Returns size of a relation, as # of blocks + rpc RelSize (RelSizeRequest) returns (RelSizeResponse); + + rpc GetPage (GetPageRequest) returns (GetPageResponse); + + // Returns total size of a database, as # of bytes + rpc DbSize (DbSizeRequest) returns (DbSizeResponse); + + rpc GetBaseBackup (GetBaseBackupRequest) returns (stream GetBaseBackupResponseChunk); +} + +message RequestCommon { + uint64 request_lsn = 1; + uint64 not_modified_since_lsn = 2; +} + +message RelTag { + uint32 spc_oid = 1; + uint32 db_oid = 2; + uint32 rel_number = 3; + uint32 fork_number = 4; +} + +message RelExistsRequest { + RequestCommon common = 1; + RelTag rel = 2; +} + +message RelExistsResponse { + bool exists = 1; +} + +message RelSizeRequest { + RequestCommon common = 1; + RelTag rel = 2; +} + +message RelSizeResponse { + uint32 num_blocks = 1; +} + +message GetPageRequest { + RequestCommon common = 1; + RelTag rel = 2; + uint32 block_number = 3; +} + +message GetPageResponse { + bytes page_image = 1; +} + +message DbSizeRequest { + RequestCommon common = 1; + uint32 db_oid = 2; +} + +message DbSizeResponse { + uint64 num_bytes = 1; +} + +message GetBaseBackupRequest { + RequestCommon common = 1; + bool replica = 2; +} + +message GetBaseBackupResponseChunk { + bytes chunk = 1; +} diff --git a/pageserver/data_api/src/lib.rs b/pageserver/data_api/src/lib.rs new file mode 100644 index 0000000000..3c0963ae1c --- /dev/null +++ b/pageserver/data_api/src/lib.rs @@ -0,0 +1,17 @@ +//! This crate has two modules related to the Pageserver Data API: +//! +//! proto: code auto-generated from the protobuf definition +//! model: slightly more ergonomic structs representing the same API +//! +//! See protobuf spec under the protos/ subdirectory. +//! +//! This crate is used by both the client and the server. Try to keep it slim. +//! +pub mod model; + +// Code generated by protobuf. +pub mod proto { + tonic::include_proto!("page_service"); +} + +pub use proto::page_service_client as client; diff --git a/pageserver/data_api/src/model.rs b/pageserver/data_api/src/model.rs new file mode 100644 index 0000000000..85faa131e2 --- /dev/null +++ b/pageserver/data_api/src/model.rs @@ -0,0 +1,239 @@ +//! Structs representing the API +//! +//! These mirror the pageserver APIs and the structs automatically generated +//! from the protobuf specification. The differences are: +//! +//! - Types that are in fact required by the API are not Options. The protobuf "required" +//! attribute is deprecated and 'prost' marks a lot of members as optional because of that. +//! (See https://github.com/tokio-rs/prost/issues/800 for a gripe on this) +//! +//! - Use more precise datatypes, e.g. Lsn and uints shorter than 32 bits. 
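(Editorial sketch, not part of the patch: how the From/TryFrom conversions defined below are expected to be used at the gRPC boundary. The field values are made up, and it assumes Lsn converts from a u64, as the conversions below do.)

use pageserver_data_api::model::{ProtocolError, RelSizeRequest, RelTag, RequestCommon};
use pageserver_data_api::proto;
use utils::lsn::Lsn;

fn roundtrip_rel_size_request() -> Result<(), ProtocolError> {
    let req = RelSizeRequest {
        common: RequestCommon {
            request_lsn: Lsn::from(0x1_0000_0000u64),
            not_modified_since_lsn: Lsn::from(0xFFFF_0000u64),
        },
        rel: RelTag {
            spc_oid: 1663,
            db_oid: 5,
            rel_number: 16384,
            fork_number: 0,
        },
    };

    // model -> wire: the fields that the API treats as required become Some(..)
    // on the prost-generated struct.
    let wire = proto::RelSizeRequest::from(&req);
    assert!(wire.common.is_some() && wire.rel.is_some());

    // wire -> model: fails with ProtocolError::Missing if a required submessage is
    // absent, or ProtocolError::InvalidValue if e.g. fork_number does not fit in a u8.
    let back = RelSizeRequest::try_from(&wire)?;
    assert_eq!(back.rel, req.rel);
    Ok(())
}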
+ +use utils::lsn::Lsn; + +use crate::proto; + +#[derive(Clone, Debug)] +pub struct RequestCommon { + pub request_lsn: Lsn, + pub not_modified_since_lsn: Lsn, +} + +#[derive(Clone, Debug, Eq, PartialEq, Hash, PartialOrd, Ord)] +pub struct RelTag { + pub spc_oid: u32, + pub db_oid: u32, + pub rel_number: u32, + pub fork_number: u8, +} + +#[derive(Clone, Debug)] +pub struct RelExistsRequest { + pub common: RequestCommon, + pub rel: RelTag, +} + +#[derive(Clone, Debug)] +pub struct RelSizeRequest { + pub common: RequestCommon, + pub rel: RelTag, +} + +#[derive(Clone, Debug)] +pub struct RelSizeResponse { + pub num_blocks: u32, +} + +#[derive(Clone, Debug)] +pub struct GetPageRequest { + pub common: RequestCommon, + pub rel: RelTag, + pub block_number: u32, +} + +#[derive(Clone, Debug)] +pub struct GetPageResponse { + pub page_image: std::vec::Vec, +} + +#[derive(Clone, Debug)] +pub struct DbSizeRequest { + pub common: RequestCommon, + pub db_oid: u32, +} + +#[derive(Clone, Debug)] +pub struct DbSizeResponse { + pub num_bytes: u64, +} + +#[derive(Clone, Debug)] +pub struct GetBaseBackupRequest { + pub common: RequestCommon, + pub replica: bool, +} + +//--- Conversions to/from the generated proto types + +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum ProtocolError { + #[error("the value for field `{0}` is invalid")] + InvalidValue(&'static str), + #[error("the required field `{0}` is missing ")] + Missing(&'static str), +} + +impl From for tonic::Status { + fn from(e: ProtocolError) -> Self { + match e { + ProtocolError::InvalidValue(_field) => tonic::Status::invalid_argument(e.to_string()), + ProtocolError::Missing(_field) => tonic::Status::invalid_argument(e.to_string()), + } + } +} + +impl From<&RelTag> for proto::RelTag { + fn from(value: &RelTag) -> proto::RelTag { + proto::RelTag { + spc_oid: value.spc_oid, + db_oid: value.db_oid, + rel_number: value.rel_number, + fork_number: value.fork_number as u32, + } + } +} +impl TryFrom<&proto::RelTag> for RelTag { + type Error = ProtocolError; + + fn try_from(value: &proto::RelTag) -> Result { + Ok(RelTag { + spc_oid: value.spc_oid, + db_oid: value.db_oid, + rel_number: value.rel_number, + fork_number: value + .fork_number + .try_into() + .or(Err(ProtocolError::InvalidValue("fork_number")))?, + }) + } +} + +impl From<&RequestCommon> for proto::RequestCommon { + fn from(value: &RequestCommon) -> proto::RequestCommon { + proto::RequestCommon { + request_lsn: value.request_lsn.into(), + not_modified_since_lsn: value.not_modified_since_lsn.into(), + } + } +} +impl From<&proto::RequestCommon> for RequestCommon { + fn from(value: &proto::RequestCommon) -> RequestCommon { + RequestCommon { + request_lsn: value.request_lsn.into(), + not_modified_since_lsn: value.not_modified_since_lsn.into(), + } + } +} + +impl From<&RelExistsRequest> for proto::RelExistsRequest { + fn from(value: &RelExistsRequest) -> proto::RelExistsRequest { + proto::RelExistsRequest { + common: Some((&value.common).into()), + rel: Some((&value.rel).into()), + } + } +} +impl TryFrom<&proto::RelExistsRequest> for RelExistsRequest { + type Error = ProtocolError; + + fn try_from(value: &proto::RelExistsRequest) -> Result { + Ok(RelExistsRequest { + common: (&value.common.ok_or(ProtocolError::Missing("common"))?).into(), + rel: (&value.rel.ok_or(ProtocolError::Missing("rel"))?).try_into()?, + }) + } +} + +impl From<&RelSizeRequest> for proto::RelSizeRequest { + fn from(value: &RelSizeRequest) -> proto::RelSizeRequest { + proto::RelSizeRequest { + common: 
Some((&value.common).into()), + rel: Some((&value.rel).into()), + } + } +} +impl TryFrom<&proto::RelSizeRequest> for RelSizeRequest { + type Error = ProtocolError; + + fn try_from(value: &proto::RelSizeRequest) -> Result { + Ok(RelSizeRequest { + common: (&value.common.ok_or(ProtocolError::Missing("common"))?).into(), + rel: (&value.rel.ok_or(ProtocolError::Missing("rel"))?).try_into()?, + }) + } +} + +impl From<&GetPageRequest> for proto::GetPageRequest { + fn from(value: &GetPageRequest) -> proto::GetPageRequest { + proto::GetPageRequest { + common: Some((&value.common).into()), + rel: Some((&value.rel).into()), + block_number: value.block_number, + } + } +} +impl TryFrom<&proto::GetPageRequest> for GetPageRequest { + type Error = ProtocolError; + + fn try_from(value: &proto::GetPageRequest) -> Result { + Ok(GetPageRequest { + common: (&value.common.ok_or(ProtocolError::Missing("common"))?).into(), + rel: (&value.rel.ok_or(ProtocolError::Missing("rel"))?).try_into()?, + block_number: value.block_number, + }) + } +} + +impl From<&DbSizeRequest> for proto::DbSizeRequest { + fn from(value: &DbSizeRequest) -> proto::DbSizeRequest { + proto::DbSizeRequest { + common: Some((&value.common).into()), + db_oid: value.db_oid, + } + } +} + +impl TryFrom<&proto::DbSizeRequest> for DbSizeRequest { + type Error = ProtocolError; + + fn try_from(value: &proto::DbSizeRequest) -> Result { + Ok(DbSizeRequest { + common: (&value.common.ok_or(ProtocolError::Missing("common"))?).into(), + db_oid: value.db_oid, + }) + } +} + +impl From<&GetBaseBackupRequest> for proto::GetBaseBackupRequest { + fn from(value: &GetBaseBackupRequest) -> proto::GetBaseBackupRequest { + proto::GetBaseBackupRequest { + common: Some((&value.common).into()), + replica: value.replica, + } + } +} + +impl TryFrom<&proto::GetBaseBackupRequest> for GetBaseBackupRequest { + type Error = ProtocolError; + + fn try_from( + value: &proto::GetBaseBackupRequest, + ) -> Result { + Ok(GetBaseBackupRequest { + common: (&value.common.ok_or(ProtocolError::Missing("common"))?).into(), + replica: value.replica, + }) + } +} diff --git a/pageserver/pagebench/Cargo.toml b/pageserver/pagebench/Cargo.toml index 5b5ed09a2b..c41007f3bb 100644 --- a/pageserver/pagebench/Cargo.toml +++ b/pageserver/pagebench/Cargo.toml @@ -23,6 +23,8 @@ tokio.workspace = true tokio-util.workspace = true pageserver_client.workspace = true +pageserver_client_grpc.workspace = true +pageserver_data_api.workspace = true pageserver_api.workspace = true utils = { path = "../../libs/utils/" } workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/pageserver/pagebench/src/cmd/basebackup.rs b/pageserver/pagebench/src/cmd/basebackup.rs index 43ad92980c..bcd7710239 100644 --- a/pageserver/pagebench/src/cmd/basebackup.rs +++ b/pageserver/pagebench/src/cmd/basebackup.rs @@ -9,6 +9,9 @@ use anyhow::Context; use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api::ForceAwaitLogicalSize; use pageserver_client::page_service::BasebackupRequest; +use pageserver_client_grpc; +use pageserver_data_api::model::{GetBaseBackupRequest, RequestCommon}; + use rand::prelude::*; use tokio::sync::Barrier; use tokio::task::JoinSet; @@ -22,6 +25,8 @@ use crate::util::{request_stats, tokio_thread_local_stats}; /// basebackup@LatestLSN #[derive(clap::Parser)] pub(crate) struct Args { + #[clap(long, default_value = "false")] + grpc: bool, #[clap(long, default_value = "http://localhost:9898")] mgmt_api_endpoint: String, #[clap(long, default_value = 
"postgres://postgres@localhost:64000")] @@ -52,7 +57,7 @@ impl LiveStats { struct Target { timeline: TenantTimelineId, - lsn_range: Option>, + lsn_range: Range, } #[derive(serde::Serialize)] @@ -105,7 +110,7 @@ async fn main_impl( anyhow::Ok(Target { timeline, // TODO: support lsn_range != latest LSN - lsn_range: Some(info.last_record_lsn..(info.last_record_lsn + 1)), + lsn_range: info.last_record_lsn..(info.last_record_lsn + 1), }) } }); @@ -149,14 +154,27 @@ async fn main_impl( for tl in &timelines { let (sender, receiver) = tokio::sync::mpsc::channel(1); // TODO: not sure what the implications of this are work_senders.insert(tl, sender); - tasks.push(tokio::spawn(client( - args, - *tl, - Arc::clone(&start_work_barrier), - receiver, - Arc::clone(&all_work_done_barrier), - Arc::clone(&live_stats), - ))); + + let client_task = if args.grpc { + tokio::spawn(client_grpc( + args, + *tl, + Arc::clone(&start_work_barrier), + receiver, + Arc::clone(&all_work_done_barrier), + Arc::clone(&live_stats), + )) + } else { + tokio::spawn(client( + args, + *tl, + Arc::clone(&start_work_barrier), + receiver, + Arc::clone(&all_work_done_barrier), + Arc::clone(&live_stats), + )) + }; + tasks.push(client_task); } let work_sender = async move { @@ -165,7 +183,7 @@ async fn main_impl( let (timeline, work) = { let mut rng = rand::thread_rng(); let target = all_targets.choose(&mut rng).unwrap(); - let lsn = target.lsn_range.clone().map(|r| rng.gen_range(r)); + let lsn = rng.gen_range(target.lsn_range.clone()); ( target.timeline, Work { @@ -215,7 +233,7 @@ async fn main_impl( #[derive(Copy, Clone)] struct Work { - lsn: Option, + lsn: Lsn, gzip: bool, } @@ -240,7 +258,7 @@ async fn client( .basebackup(&BasebackupRequest { tenant_id: timeline.tenant_id, timeline_id: timeline.timeline_id, - lsn, + lsn: Some(lsn), gzip, }) .await @@ -270,3 +288,71 @@ async fn client( all_work_done_barrier.wait().await; } + +#[instrument(skip_all)] +async fn client_grpc( + args: &'static Args, + timeline: TenantTimelineId, + start_work_barrier: Arc, + mut work: tokio::sync::mpsc::Receiver, + all_work_done_barrier: Arc, + live_stats: Arc, +) { + let shard_map = HashMap::from([(0, args.page_service_connstring.clone())]); + let client = pageserver_client_grpc::PageserverClient::new( + &timeline.tenant_id.to_string(), + &timeline.timeline_id.to_string(), + &None, + shard_map, + ); + + start_work_barrier.wait().await; + + while let Some(Work { lsn, gzip }) = work.recv().await { + let start = Instant::now(); + + //tokio::time::sleep(std::time::Duration::from_secs(1)).await; + + info!("starting get_base_backup"); + let mut basebackup_stream = client + .get_base_backup( + &GetBaseBackupRequest { + common: RequestCommon { + request_lsn: lsn, + not_modified_since_lsn: lsn, + }, + replica: false, + }, + gzip, + ) + .await + .with_context(|| format!("start basebackup for {timeline}")) + .unwrap() + .into_inner(); + + info!("starting receive"); + use futures::StreamExt; + let mut size = 0; + let mut nchunks = 0; + while let Some(chunk) = basebackup_stream.next().await { + let chunk = chunk + .with_context(|| format!("error during basebackup")) + .unwrap(); + size += chunk.chunk.len(); + nchunks += 1; + } + + info!( + "basebackup size is {} bytes, avg chunk size {} bytes", + size, + size as f32 / nchunks as f32 + ); + let elapsed = start.elapsed(); + live_stats.inc(); + STATS.with(|stats| { + stats.borrow().lock().unwrap().observe(elapsed).unwrap(); + }); + } + + all_work_done_barrier.wait().await; +} diff --git 
a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index 771a7cbe5b..2b535d8507 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -1,4 +1,4 @@ -use std::collections::{HashSet, VecDeque}; +use std::collections::{HashMap, HashSet, VecDeque}; use std::future::Future; use std::num::NonZeroUsize; use std::pin::Pin; @@ -8,6 +8,8 @@ use std::time::{Duration, Instant}; use anyhow::Context; use camino::Utf8PathBuf; +use futures::StreamExt; +use futures::stream::FuturesOrdered; use pageserver_api::key::Key; use pageserver_api::keyspace::KeySpaceAccum; use pageserver_api::models::{PagestreamGetPageRequest, PagestreamRequest}; @@ -25,6 +27,8 @@ use crate::util::{request_stats, tokio_thread_local_stats}; /// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace. #[derive(clap::Parser)] pub(crate) struct Args { + #[clap(long, default_value = "false")] + grpc: bool, #[clap(long, default_value = "http://localhost:9898")] mgmt_api_endpoint: String, #[clap(long, default_value = "postgres://postgres@localhost:64000")] @@ -295,7 +299,29 @@ async fn main_impl( .unwrap(); Box::pin(async move { - client_libpq(args, worker_id, ss, cancel, rps_period, ranges, weights).await + if args.grpc { + client_grpc( + args, + worker_id, + ss, + cancel, + rps_period, + ranges, + weights, + ) + .await + } else { + client_libpq( + args, + worker_id, + ss, + cancel, + rps_period, + ranges, + weights, + ) + .await + } }) }; @@ -434,3 +460,100 @@ async fn client_libpq( } } } + +async fn client_grpc( + args: &Args, + worker_id: WorkerId, + shared_state: Arc, + cancel: CancellationToken, + rps_period: Option, + ranges: Vec, + weights: rand::distributions::weighted::WeightedIndex, +) { + let shard_map = HashMap::from([(0, args.page_service_connstring.clone())]); + let client = pageserver_client_grpc::PageserverClient::new( + &worker_id.timeline.tenant_id.to_string(), + &worker_id.timeline.timeline_id.to_string(), + &None, + shard_map, + ); + let client = Arc::new(client); + + shared_state.start_work_barrier.wait().await; + let client_start = Instant::now(); + let mut ticks_processed = 0; + let mut inflight = FuturesOrdered::new(); + while !cancel.is_cancelled() { + // Detect if a request took longer than the RPS rate + if let Some(period) = &rps_period { + let periods_passed_until_now = + usize::try_from(client_start.elapsed().as_micros() / period.as_micros()).unwrap(); + + if periods_passed_until_now > ticks_processed { + shared_state + .live_stats + .missed((periods_passed_until_now - ticks_processed) as u64); + } + ticks_processed = periods_passed_until_now; + } + + while inflight.len() < args.queue_depth.get() { + let start = Instant::now(); + let req = { + let mut rng = rand::thread_rng(); + let r = &ranges[weights.sample(&mut rng)]; + let key: i128 = rng.gen_range(r.start..r.end); + let key = Key::from_i128(key); + assert!(key.is_rel_block_key()); + let (rel_tag, block_no) = key + .to_rel_block() + .expect("we filter non-rel-block keys out above"); + pageserver_data_api::model::GetPageRequest { + common: pageserver_data_api::model::RequestCommon { + request_lsn: if rng.gen_bool(args.req_latest_probability) { + Lsn::MAX + } else { + r.timeline_lsn + }, + not_modified_since_lsn: r.timeline_lsn, + }, + rel: pageserver_data_api::model::RelTag { + spc_oid: rel_tag.spcnode, + db_oid: rel_tag.dbnode, + rel_number: rel_tag.relnode, + fork_number: rel_tag.forknum, + }, + block_number: 
block_no, + } + }; + let client_clone = client.clone(); + let getpage_fut = async move { + let result = client_clone.get_page(&req).await; + (start, result) + }; + inflight.push_back(getpage_fut); + } + + let (start, result) = inflight.next().await.unwrap(); + result.expect("getpage request should succeed"); + let end = Instant::now(); + shared_state.live_stats.request_done(); + ticks_processed += 1; + STATS.with(|stats| { + stats + .borrow() + .lock() + .unwrap() + .observe(end.duration_since(start)) + .unwrap(); + }); + + if let Some(period) = &rps_period { + let next_at = client_start + + Duration::from_micros( + (ticks_processed) as u64 * u64::try_from(period.as_micros()).unwrap(), + ); + tokio::time::sleep_until(next_at.into()).await; + } + } +} diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 3510ccb529..58520c5d7a 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -151,10 +151,14 @@ where .map_err(|_| BasebackupError::Shutdown)?, ), }; - basebackup + let res = basebackup .send_tarball() .instrument(info_span!("send_tarball", backup_lsn=%backup_lsn)) - .await + .await; + + info!("basebackup done!"); + + res } /// This is short-living object only for the time of tarball creation, diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 6cfaec955b..9b764b8f83 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -16,6 +16,7 @@ use http_utils::tls_certs::ReloadingCertificateResolver; use metrics::launch_timestamp::{LaunchTimestamp, set_launch_timestamp_metric}; use metrics::set_build_info_metric; use nix::sys::socket::{setsockopt, sockopt}; +use pageserver::compute_service; use pageserver::config::{PageServerConf, PageserverIdentity, ignored_fields}; use pageserver::controller_upcall_client::StorageControllerUpcallClient; use pageserver::deletion_queue::DeletionQueue; @@ -27,7 +28,7 @@ use pageserver::task_mgr::{ use pageserver::tenant::{TenantSharedResources, mgr, secondary}; use pageserver::{ CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, HttpsEndpointListener, http, - page_cache, page_service, task_mgr, virtual_file, + page_cache, task_mgr, virtual_file, }; use postgres_backend::AuthType; use remote_storage::GenericRemoteStorage; @@ -745,7 +746,7 @@ fn start_pageserver( // Spawn a task to listen for libpq connections. It will spawn further tasks // for each connection. We created the listener earlier already. let perf_trace_dispatch = otel_guard.as_ref().map(|g| g.dispatch.clone()); - let page_service = page_service::spawn( + let compute_service = compute_service::spawn( conf, tenant_manager.clone(), pg_auth, @@ -782,7 +783,7 @@ fn start_pageserver( pageserver::shutdown_pageserver( http_endpoint_listener, https_endpoint_listener, - page_service, + compute_service, consumption_metrics_tasks, disk_usage_eviction_task, &tenant_manager, diff --git a/pageserver/src/compute_service.rs b/pageserver/src/compute_service.rs new file mode 100644 index 0000000000..952089bee7 --- /dev/null +++ b/pageserver/src/compute_service.rs @@ -0,0 +1,286 @@ +//! +//! The Compute Service listens for compute connections, and serves requests like +//! the GetPage@LSN requests. +//! +//! We support two protocols: +//! +//! 1. Legacy, connection-oriented libpq based protocol. That's +//! handled by the code in page_service.rs. +//! +//! 2. gRPC based protocol. See compute_service_grpc.rs. +//! +//! 
To make the transition smooth, without having to open up new firewall ports +//! etc, both protocols are served on the same port. When a new TCP connection +//! is accepted, we peek at the first few bytes incoming from the client to +//! determine which protocol it speaks. +//! +//! TODO: This gets easier once we drop the legacy protocol support. Or if we +//! open a separate port for them. + +use std::sync::Arc; + +use anyhow::Context; +use futures::FutureExt; +use pageserver_api::config::PageServicePipeliningConfig; +use postgres_backend::AuthType; +use tokio::task::JoinHandle; +use tokio_util::sync::CancellationToken; +use tracing::*; +use utils::auth::SwappableJwtAuth; +use utils::sync::gate::{Gate, GateGuard}; + +use crate::compute_service_grpc::launch_compute_service_grpc_server; +use crate::config::PageServerConf; +use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder}; +use crate::page_service::libpq_page_service_conn_main; +use crate::task_mgr::{self, COMPUTE_REQUEST_RUNTIME, TaskKind}; +use crate::tenant::mgr::TenantManager; + +/////////////////////////////////////////////////////////////////////////////// + +pub type ConnectionHandlerResult = anyhow::Result<()>; + +pub struct Connections { + cancel: CancellationToken, + tasks: tokio::task::JoinSet, + gate: Gate, +} + +impl Connections { + pub(crate) async fn shutdown(self) { + let Self { + cancel, + mut tasks, + gate, + } = self; + cancel.cancel(); + while let Some(res) = tasks.join_next().await { + Self::handle_connection_completion(res); + } + gate.close().await; + } + + fn handle_connection_completion(res: Result, tokio::task::JoinError>) { + match res { + Ok(Ok(())) => {} + Ok(Err(e)) => error!("error in page_service connection task: {:?}", e), + Err(e) => error!("page_service connection task panicked: {:?}", e), + } + } +} + +pub struct Listener { + cancel: CancellationToken, + /// Cancel the listener task through `listen_cancel` to shut down the listener + /// and get a handle on the existing connections. + task: JoinHandle, +} + +pub fn spawn( + conf: &'static PageServerConf, + tenant_manager: Arc, + pg_auth: Option>, + perf_trace_dispatch: Option, + tcp_listener: tokio::net::TcpListener, + tls_config: Option>, +) -> Listener { + let cancel = CancellationToken::new(); + let libpq_ctx = RequestContext::todo_child( + TaskKind::LibpqEndpointListener, + // listener task shouldn't need to download anything. (We will + // create a separate sub-contexts for each connection, with their + // own download behavior. This context is used only to listen and + // accept connections.) + DownloadBehavior::Error, + ); + + let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( + "compute connection listener", + compute_connection_listener_main( + conf, + tenant_manager, + pg_auth, + perf_trace_dispatch, + tcp_listener, + conf.pg_auth_type, + tls_config, + conf.page_service_pipelining.clone(), + libpq_ctx, + cancel.clone(), + ) + .map(anyhow::Ok), + )); + + Listener { cancel, task } +} + +impl Listener { + pub async fn stop_accepting(self) -> Connections { + self.cancel.cancel(); + self.task + .await + .expect("unreachable: we wrap the listener task in task_mgr::exit_on_panic_or_error") + } +} + +/// Listener loop. Listens for connections, and launches a new handler +/// task for each. +/// +/// Returns Ok(()) upon cancellation via `cancel`, returning the set of +/// open connections. 
+/// +#[allow(clippy::too_many_arguments)] +pub async fn compute_connection_listener_main( + conf: &'static PageServerConf, + tenant_manager: Arc, + auth: Option>, + perf_trace_dispatch: Option, + listener: tokio::net::TcpListener, + auth_type: AuthType, + tls_config: Option>, + pipelining_config: PageServicePipeliningConfig, + listener_ctx: RequestContext, + listener_cancel: CancellationToken, +) -> Connections { + let connections_cancel = CancellationToken::new(); + let connections_gate = Gate::default(); + let mut connection_handler_tasks = tokio::task::JoinSet::default(); + + // The connection handling task passes the gRPC protocol + // connections to this channel. The tonic gRPC server reads the + // channel and takes over the connections from there. + let (grpc_connections_tx, grpc_connections_rx) = tokio::sync::mpsc::channel(1000); + + // Set up the gRPC service + launch_compute_service_grpc_server( + grpc_connections_rx, + conf, + tenant_manager.clone(), + auth.clone(), + auth_type, + connections_cancel.clone(), + &listener_ctx, + ); + + // Main listener loop + loop { + let gate_guard = match connections_gate.enter() { + Ok(guard) => guard, + Err(_) => break, + }; + + let accepted = tokio::select! { + biased; + _ = listener_cancel.cancelled() => break, + next = connection_handler_tasks.join_next(), if !connection_handler_tasks.is_empty() => { + let res = next.expect("we dont poll while empty"); + Connections::handle_connection_completion(res); + continue; + } + accepted = listener.accept() => accepted, + }; + + match accepted { + Ok((socket, peer_addr)) => { + // Connection established. Spawn a new task to handle it. + debug!("accepted connection from {}", peer_addr); + let local_auth = auth.clone(); + let connection_ctx = RequestContextBuilder::from(&listener_ctx) + .task_kind(TaskKind::PageRequestHandler) + .download_behavior(DownloadBehavior::Download) + .perf_span_dispatch(perf_trace_dispatch.clone()) + .detached_child(); + + connection_handler_tasks.spawn(page_service_conn_main( + conf, + tenant_manager.clone(), + local_auth, + socket, + auth_type, + tls_config.clone(), + pipelining_config.clone(), + connection_ctx, + connections_cancel.child_token(), + gate_guard, + grpc_connections_tx.clone(), + )); + } + Err(err) => { + // accept() failed. Log the error, and loop back to retry on next connection. + error!("accept() failed: {:?}", err); + } + } + } + + debug!("page_service listener loop terminated"); + + Connections { + cancel: connections_cancel, + tasks: connection_handler_tasks, + gate: connections_gate, + } +} + +/// Handle a new incoming connection. +/// +/// This peeks at the first few incoming bytes and dispatches the connection +/// to the legacy libpq handler or the new gRPC handler accordingly. +#[instrument(skip_all, fields(peer_addr, application_name, compute_mode))] +#[allow(clippy::too_many_arguments)] +pub async fn page_service_conn_main( + conf: &'static PageServerConf, + tenant_manager: Arc, + auth: Option>, + socket: tokio::net::TcpStream, + auth_type: AuthType, + tls_config: Option>, + pipelining_config: PageServicePipeliningConfig, + connection_ctx: RequestContext, + cancel: CancellationToken, + gate_guard: GateGuard, + grpc_connections_tx: tokio::sync::mpsc::Sender>, +) -> ConnectionHandlerResult { + let mut buf: [u8; 4] = [0; 4]; + + socket + .set_nodelay(true) + .context("could not set TCP_NODELAY")?; + + // Peek + socket.peek(&mut buf).await?; + + let mut grpc = false; + if buf[0] == 0x16 { + // looks like a TLS handshake. Assume gRPC. 
+ // XXX: Starting with v17, PostgreSQL also supports "direct TLS mode". But + // the compute doesn't use it. + grpc = true; + } + + if buf[0] == b'G' || buf[0] == b'P' { + // Looks like 'GET' or 'POST' + // or 'PRI', indicating gRPC over HTTP/2 with prior knowledge + grpc = true; + } + + // Dispatch + if grpc { + grpc_connections_tx.send(Ok(socket)).await?; + info!("connection sent to channel"); + Ok(()) + } else { + libpq_page_service_conn_main( + conf, + tenant_manager, + auth, + socket, + auth_type, + tls_config, + pipelining_config, + connection_ctx, + cancel, + gate_guard, + ) + .await + } +} diff --git a/pageserver/src/compute_service_grpc.rs b/pageserver/src/compute_service_grpc.rs new file mode 100644 index 0000000000..337c249187 --- /dev/null +++ b/pageserver/src/compute_service_grpc.rs @@ -0,0 +1,746 @@ +//! +//! Compute <-> Pageserver API handler. This is for the new gRPC-based protocol +//! +//! TODO: +//! +//! - Many of the API endpoints are still missing +//! +//! - This is very much not optimized. +//! +//! - Much of the code was copy-pasted from page_service.rs. Like the code to get the +//! Timeline object, and the JWT auth. Could refactor and share. +//! +//! + +use std::pin::Pin; +use std::str::FromStr; +use std::sync::Arc; +use std::task::Poll; +use std::time::Duration; +use std::time::Instant; + +use crate::TenantManager; +use crate::auth::check_permission; +use crate::basebackup; +use crate::basebackup::BasebackupError; +use crate::config::PageServerConf; +use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder}; +use crate::task_mgr::TaskKind; +use crate::tenant::Timeline; +use crate::tenant::mgr::ShardResolveResult; +use crate::tenant::mgr::ShardSelector; +use crate::tenant::storage_layer::IoConcurrency; +use crate::tenant::timeline::WaitLsnTimeout; +use tokio::io::{AsyncWriteExt, ReadHalf, SimplexStream}; +use tokio::task::JoinHandle; +use tokio_util::codec::{Decoder, FramedRead}; +use tokio_util::sync::CancellationToken; + +use futures::stream::StreamExt; + +use pageserver_data_api::model; +use pageserver_data_api::proto::page_service_server::PageService; +use pageserver_data_api::proto::page_service_server::PageServiceServer; + +use anyhow::Context; +use bytes::BytesMut; +use jsonwebtoken::TokenData; +use tracing::Instrument; +use tracing::{debug, error}; +use utils::auth::SwappableJwtAuth; + +use utils::id::{TenantId, TenantTimelineId, TimelineId}; +use utils::lsn::Lsn; +use utils::simple_rcu::RcuReadGuard; + +use crate::tenant::PageReconstructError; + +use postgres_ffi::BLCKSZ; + +use tonic; +use tonic::codec::CompressionEncoding; +use tonic::service::interceptor::InterceptedService; + +use pageserver_api::key::rel_block_to_key; + +use crate::pgdatadir_mapping::Version; +use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; + +use postgres_backend::AuthType; + +pub use pageserver_data_api::proto; + +pub(super) fn launch_compute_service_grpc_server( + tcp_connections_rx: tokio::sync::mpsc::Receiver>, + conf: &'static PageServerConf, + tenant_manager: Arc, + auth: Option>, + auth_type: AuthType, + connections_cancel: CancellationToken, + listener_ctx: &RequestContext, +) { + // Set up the gRPC service + let service_ctx = RequestContextBuilder::from(listener_ctx) + .task_kind(TaskKind::PageRequestHandler) + .download_behavior(DownloadBehavior::Download) + .attached_child(); + let service = crate::compute_service_grpc::PageServiceService { + conf, + tenant_mgr: tenant_manager.clone(), + ctx: Arc::new(service_ctx), + }; + let authenticator = 
PageServiceAuthenticator { + auth: auth.clone(), + auth_type, + }; + + let server = InterceptedService::new( + PageServiceServer::new(service).send_compressed(CompressionEncoding::Gzip), + authenticator, + ); + + let cc = connections_cancel.clone(); + tokio::spawn(async move { + tonic::transport::Server::builder() + .add_service(server) + .serve_with_incoming_shutdown( + tokio_stream::wrappers::ReceiverStream::new(tcp_connections_rx), + cc.cancelled(), + ) + .await + }); +} + +struct PageServiceService { + conf: &'static PageServerConf, + tenant_mgr: Arc, + ctx: Arc, +} + +/// An error happened in a get() operation. +impl From for tonic::Status { + fn from(e: PageReconstructError) -> Self { + match e { + PageReconstructError::Other(err) => tonic::Status::unknown(err.to_string()), + PageReconstructError::AncestorLsnTimeout(_) => { + tonic::Status::unavailable(e.to_string()) + } + PageReconstructError::Cancelled => tonic::Status::aborted(e.to_string()), + PageReconstructError::WalRedo(_) => tonic::Status::internal(e.to_string()), + PageReconstructError::MissingKey(_) => tonic::Status::internal(e.to_string()), + } + } +} + +fn convert_reltag(value: &model::RelTag) -> pageserver_api::reltag::RelTag { + pageserver_api::reltag::RelTag { + spcnode: value.spc_oid, + dbnode: value.db_oid, + relnode: value.rel_number, + forknum: value.fork_number, + } +} + +#[tonic::async_trait] +impl PageService for PageServiceService { + type GetBaseBackupStream = GetBaseBackupStream; + + async fn rel_exists( + &self, + request: tonic::Request, + ) -> std::result::Result, tonic::Status> { + let ttid = self.extract_ttid(request.metadata())?; + let req: model::RelExistsRequest = request.get_ref().try_into()?; + + let rel = convert_reltag(&req.rel); + let span = tracing::info_span!("rel_exists", tenant_id = %ttid.tenant_id, timeline_id = %ttid.timeline_id, rel = %rel, req_lsn = %req.common.request_lsn); + + async { + let timeline = self.get_timeline(ttid, ShardSelector::Zero).await?; + let ctx = self.ctx.with_scope_timeline(&timeline); + let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); + let lsn = Self::wait_or_get_last_lsn( + &timeline, + req.common.request_lsn, + req.common.not_modified_since_lsn, + &latest_gc_cutoff_lsn, + &ctx, + ) + .await?; + + let exists = timeline + .get_rel_exists(rel, Version::Lsn(lsn), &ctx) + .await?; + + Ok(tonic::Response::new(proto::RelExistsResponse { exists })) + } + .instrument(span) + .await + } + + /// Returns size of a relation, as # of blocks + async fn rel_size( + &self, + request: tonic::Request, + ) -> std::result::Result, tonic::Status> { + let ttid = self.extract_ttid(request.metadata())?; + let req: model::RelSizeRequest = request.get_ref().try_into()?; + let rel = convert_reltag(&req.rel); + + let span = tracing::info_span!("rel_size", tenant_id = %ttid.tenant_id, timeline_id = %ttid.timeline_id, rel = %rel, req_lsn = %req.common.request_lsn); + + async { + let timeline = self.get_timeline(ttid, ShardSelector::Zero).await?; + let ctx = self.ctx.with_scope_timeline(&timeline); + let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); + let lsn = Self::wait_or_get_last_lsn( + &timeline, + req.common.request_lsn, + req.common.not_modified_since_lsn, + &latest_gc_cutoff_lsn, + &ctx, + ) + .await?; + + let num_blocks = timeline.get_rel_size(rel, Version::Lsn(lsn), &ctx).await?; + + Ok(tonic::Response::new(proto::RelSizeResponse { num_blocks })) + } + .instrument(span) + .await + } + + async fn get_page( + &self, + request: tonic::Request, + ) -> 
std::result::Result, tonic::Status> { + let ttid = self.extract_ttid(request.metadata())?; + let req: model::GetPageRequest = request.get_ref().try_into()?; + + // Calculate shard number. + // + // FIXME: this should probably be part of the data_api crate. + let rel = convert_reltag(&req.rel); + let key = rel_block_to_key(rel, req.block_number); + let timeline = self.get_timeline(ttid, ShardSelector::Page(key)).await?; + + let ctx = self.ctx.with_scope_timeline(&timeline); + let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); + let lsn = Self::wait_or_get_last_lsn( + &timeline, + req.common.request_lsn, + req.common.not_modified_since_lsn, + &latest_gc_cutoff_lsn, + &ctx, + ) + .await?; + + let shard_id = timeline.tenant_shard_id.shard_number; + let span = tracing::info_span!("get_page", tenant_id = %ttid.tenant_id, shard_id = %shard_id, timeline_id = %ttid.timeline_id, rel = %rel, block_number = %req.block_number, req_lsn = %req.common.request_lsn); + + async { + let gate_guard = match timeline.gate.enter() { + Ok(guard) => guard, + Err(_) => { + return Err(tonic::Status::unavailable("timeline is shutting down")); + } + }; + + let io_concurrency = IoConcurrency::spawn_from_conf(self.conf, gate_guard); + + let page_image = timeline + .get_rel_page_at_lsn( + rel, + req.block_number, + Version::Lsn(lsn), + &ctx, + io_concurrency, + ) + .await?; + + Ok(tonic::Response::new(proto::GetPageResponse { + page_image, + })) + } + .instrument(span) + .await + } + + async fn db_size( + &self, + request: tonic::Request, + ) -> Result, tonic::Status> { + let ttid = self.extract_ttid(request.metadata())?; + let req: model::DbSizeRequest = request.get_ref().try_into()?; + + let span = tracing::info_span!("db_size", tenant_id = %ttid.tenant_id, timeline_id = %ttid.timeline_id, db_oid = %req.db_oid, req_lsn = %req.common.request_lsn); + + async { + let timeline = self.get_timeline(ttid, ShardSelector::Zero).await?; + let ctx = self.ctx.with_scope_timeline(&timeline); + let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); + let lsn = Self::wait_or_get_last_lsn( + &timeline, + req.common.request_lsn, + req.common.not_modified_since_lsn, + &latest_gc_cutoff_lsn, + &ctx, + ) + .await?; + + let total_blocks = timeline + .get_db_size(DEFAULTTABLESPACE_OID, req.db_oid, Version::Lsn(lsn), &ctx) + .await?; + + Ok(tonic::Response::new(proto::DbSizeResponse { + num_bytes: total_blocks as u64 * BLCKSZ as u64, + })) + } + .instrument(span) + .await + } + + async fn get_base_backup( + &self, + request: tonic::Request, + ) -> Result, tonic::Status> { + let ttid = self.extract_ttid(request.metadata())?; + let req: model::GetBaseBackupRequest = request.get_ref().try_into()?; + + let timeline = self.get_timeline(ttid, ShardSelector::Zero).await?; + + let ctx = self.ctx.with_scope_timeline(&timeline); + let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); + let lsn = Self::wait_or_get_last_lsn( + &timeline, + req.common.request_lsn, + req.common.not_modified_since_lsn, + &latest_gc_cutoff_lsn, + &ctx, + ) + .await?; + + let span = tracing::info_span!("get_base_backup", tenant_id = %ttid.tenant_id, timeline_id = %ttid.timeline_id, req_lsn = %req.common.request_lsn); + + tracing::info!("starting basebackup"); + + #[allow(dead_code)] + enum TestMode { + /// Create real basebackup, in streaming fashion + Streaming, + /// Create real basebackup, but fully materialize it in the 'simplex' pipe buffer first + Materialize, + /// Create a dummy all-zeros basebackup, in streaming fashion
+ DummyStreaming, + /// Create a dummy all-zeros basebackup, but fully materialize it first + DummyMaterialize, + } + let mode = TestMode::Streaming; + + let buf_size = match mode { + TestMode::Streaming | TestMode::DummyStreaming => 64 * 1024, + TestMode::Materialize | TestMode::DummyMaterialize => 64 * 1024 * 1024, + }; + + let (simplex_read, mut simplex_write) = tokio::io::simplex(buf_size); + + let basebackup_task = match mode { + TestMode::DummyStreaming => { + tokio::spawn( + async move { + // hold onto the guard for as long as the basebackup runs + let _latest_gc_cutoff_lsn = latest_gc_cutoff_lsn; + + let zerosbuf: [u8; 1024] = [0; 1024]; + let nbytes = 16900000; + let mut bytes_written = 0; + while bytes_written < nbytes { + let s = std::cmp::min(1024, nbytes - bytes_written); + let _ = simplex_write.write_all(&zerosbuf[0..s]).await; + bytes_written += s; + } + simplex_write + .shutdown() + .await + .context("shutdown of basebackup pipe")?; + + Ok(()) + } + .instrument(span), + ) + } + TestMode::DummyMaterialize => { + let zerosbuf: [u8; 1024] = [0; 1024]; + let nbytes = 16900000; + let mut bytes_written = 0; + while bytes_written < nbytes { + let s = std::cmp::min(1024, nbytes - bytes_written); + let _ = simplex_write.write_all(&zerosbuf[0..s]).await; + bytes_written += s; + } + simplex_write + .shutdown() + .await + .expect("shutdown of basebackup pipe"); + tracing::info!("basebackup (dummy) materialized"); + let result = Ok(()); + + tokio::spawn(std::future::ready(result)) + } + TestMode::Materialize => { + let result = basebackup::send_basebackup_tarball( + &mut simplex_write, + &timeline, + Some(lsn), + None, + false, + req.replica, + &ctx, + ) + .await; + simplex_write + .shutdown() + .await + .expect("shutdown of basebackup pipe"); + tracing::info!("basebackup materialized"); + + // Launch a task that writes the basebackup tarball to the simplex pipe + tokio::spawn(std::future::ready(result)) + } + TestMode::Streaming => { + tokio::spawn( + async move { + // hold onto the guard for as long as the basebackup runs + let _latest_gc_cutoff_lsn = latest_gc_cutoff_lsn; + + let result = basebackup::send_basebackup_tarball( + &mut simplex_write, + &timeline, + Some(lsn), + None, + false, + req.replica, + &ctx, + ) + .await; + simplex_write + .shutdown() + .await + .context("shutdown of basebackup pipe")?; + result + } + .instrument(span), + ) + } + }; + + let response = new_basebackup_response_stream(simplex_read, basebackup_task); + + Ok(tonic::Response::new(response)) + } +} + +/// NB: this is a different value than [`crate::http::routes::ACTIVE_TENANT_TIMEOUT`]. +/// NB: and also different from page_service::ACTIVE_TENANT_TIMEOUT +const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000); + +impl PageServiceService { + async fn get_timeline( + &self, + ttid: TenantTimelineId, + shard_selector: ShardSelector, + ) -> Result, tonic::Status> { + let timeout = ACTIVE_TENANT_TIMEOUT; + let wait_start = Instant::now(); + let deadline = wait_start + timeout; + + let tenant_shard = loop { + let resolved = self + .tenant_mgr + .resolve_attached_shard(&ttid.tenant_id, shard_selector); + + match resolved { + ShardResolveResult::Found(tenant_shard) => break tenant_shard, + ShardResolveResult::NotFound => { + return Err(tonic::Status::not_found("tenant not found")); + } + ShardResolveResult::InProgress(barrier) => { + // We can't authoritatively answer right now: wait for InProgress state + // to end, then try again + tokio::select! 
{ + _ = barrier.wait() => { + // The barrier completed: proceed around the loop to try looking up again + }, + _ = tokio::time::sleep(deadline.duration_since(Instant::now())) => { + return Err(tonic::Status::unavailable("tenant is in InProgress state")); + } + } + } + } + }; + + tracing::debug!("Waiting for tenant to enter active state..."); + tenant_shard + .wait_to_become_active(deadline.duration_since(Instant::now())) + .await + .map_err(|e| { + tonic::Status::unavailable(format!("tenant is not in active state: {e}")) + })?; + + let timeline = tenant_shard + .get_timeline(ttid.timeline_id, true) + .map_err(|e| tonic::Status::unavailable(format!("could not get timeline: {e}")))?; + + // FIXME: need to do something with the 'gate' here? + + Ok(timeline) + } + + /// Extract TenantTimelineId from the request metadata + /// + /// Note: the interceptor has already authenticated the request + /// + /// TODO: Could we use "binary" metadata for these, for efficiency? gRPC has such a concept + fn extract_ttid( + &self, + metadata: &tonic::metadata::MetadataMap, + ) -> Result { + let tenant_id = metadata + .get("neon-tenant-id") + .ok_or(tonic::Status::invalid_argument( + "neon-tenant-id metadata missing", + ))?; + let tenant_id = tenant_id.to_str().map_err(|_| { + tonic::Status::invalid_argument("invalid UTF-8 characters in neon-tenant-id metadata") + })?; + let tenant_id = TenantId::from_str(tenant_id) + .map_err(|_| tonic::Status::invalid_argument("invalid neon-tenant-id metadata"))?; + + let timeline_id = + metadata + .get("neon-timeline-id") + .ok_or(tonic::Status::invalid_argument( + "neon-timeline-id metadata missing", + ))?; + let timeline_id = timeline_id.to_str().map_err(|_| { + tonic::Status::invalid_argument("invalid UTF-8 characters in neon-timeline-id metadata") + })?; + let timeline_id = TimelineId::from_str(timeline_id) + .map_err(|_| tonic::Status::invalid_argument("invalid neon-timeline-id metadata"))?; + + Ok(TenantTimelineId::new(tenant_id, timeline_id)) + } + + // XXX: copied from PageServerHandler + async fn wait_or_get_last_lsn( + timeline: &Timeline, + request_lsn: Lsn, + not_modified_since: Lsn, + latest_gc_cutoff_lsn: &RcuReadGuard, + ctx: &RequestContext, + ) -> Result { + let last_record_lsn = timeline.get_last_record_lsn(); + + // Sanity check the request + if request_lsn < not_modified_since { + return Err(tonic::Status::invalid_argument(format!( + "invalid request with request LSN {} and not_modified_since {}", + request_lsn, not_modified_since, + ))); + } + + // Check explicitly for INVALID just to get a less scary error message if the request is obviously bogus + if request_lsn == Lsn::INVALID { + return Err(tonic::Status::invalid_argument("invalid LSN(0) in request")); + } + + // Clients should only read from recent LSNs on their timeline, or from locations holding an LSN lease. + // + // We may have older data available, but we make a best effort to detect this case and return an error, + // to distinguish a misbehaving client (asking for old LSN) from a storage issue (data missing at a legitimate LSN). + if request_lsn < **latest_gc_cutoff_lsn && !timeline.is_gc_blocked_by_lsn_lease_deadline() { + let gc_info = &timeline.gc_info.read().unwrap(); + if !gc_info.lsn_covered_by_lease(request_lsn) { + return Err(tonic::Status::not_found(format!( + "tried to request a page version that was garbage collected.
requested at {} gc cutoff {}", + request_lsn, **latest_gc_cutoff_lsn + ))); + } + } + + // Wait for WAL up to 'not_modified_since' to arrive, if necessary + if not_modified_since > last_record_lsn { + timeline + .wait_lsn( + not_modified_since, + crate::tenant::timeline::WaitLsnWaiter::PageService, + WaitLsnTimeout::Default, + ctx, + ) + .await + .map_err(|_| { + tonic::Status::unavailable("not_modified_since LSN not arrived yet") + })?; + // Since we waited for 'not_modified_since' to arrive, that is now the last + // record LSN. (Or close enough for our purposes; the last-record LSN can + // advance immediately after we return anyway) + Ok(not_modified_since) + } else { + // It might be better to use max(not_modified_since, latest_gc_cutoff_lsn) + // here instead. That would give the same result, since we know that there + // haven't been any modifications since 'not_modified_since'. Using an older + // LSN might be faster, because that could allow skipping recent layers when + // finding the page. However, we have historically used 'last_record_lsn', so + // stick to that for now. + Ok(std::cmp::min(last_record_lsn, request_lsn)) + } + } +} + +#[derive(Clone)] +pub struct PageServiceAuthenticator { + pub auth: Option>, + pub auth_type: AuthType, +} + +impl tonic::service::Interceptor for PageServiceAuthenticator { + fn call( + &mut self, + req: tonic::Request<()>, + ) -> std::result::Result, tonic::Status> { + // Check the tenant_id in any case + let tenant_id = + req.metadata() + .get("neon-tenant-id") + .ok_or(tonic::Status::invalid_argument( + "neon-tenant-id metadata missing", + ))?; + let tenant_id = tenant_id.to_str().map_err(|_| { + tonic::Status::invalid_argument("invalid UTF-8 characters in neon-tenant-id metadata") + })?; + let tenant_id = TenantId::from_str(tenant_id) + .map_err(|_| tonic::Status::invalid_argument("invalid neon-tenant-id metadata"))?; + + // when accessing management api supply None as an argument + // when using to authorize tenant pass corresponding tenant id + let auth = if let Some(auth) = &self.auth { + auth + } else { + // auth is set to Trust, nothing to check so just return ok + return Ok(req); + }; + + let jwt = req + .metadata() + .get("neon-auth-token") + .ok_or(tonic::Status::unauthenticated("no neon-auth-token"))?; + let jwt = jwt.to_str().map_err(|_| { + tonic::Status::invalid_argument("invalid UTF-8 characters in neon-auth-token metadata") + })?; + + let jwtdata: TokenData = auth + .decode(jwt) + .map_err(|err| tonic::Status::unauthenticated(format!("invalid JWT token: {}", err)))?; + let claims = jwtdata.claims; + + if matches!(claims.scope, utils::auth::Scope::Tenant) && claims.tenant_id.is_none() { + return Err(tonic::Status::unauthenticated( + "jwt token scope is Tenant, but tenant id is missing", + )); + } + + debug!( + "jwt scope check succeeded for scope: {:#?} by tenant id: {:?}", + claims.scope, claims.tenant_id, + ); + + // The token is valid. Check if it's allowed to access the tenant ID + // given in the request. + + check_permission(&claims, Some(tenant_id)) + .map_err(|err| tonic::Status::permission_denied(err.to_string()))?; + + // All checks out + Ok(req) + } +} + +/// Stream of GetBaseBackupResponseChunk messages. +/// +/// The first part of the Chain chunks the tarball. The second part checks the return value +/// of the send_basebackup_tarball Future that created the tarball. 
+ +type GetBaseBackupStream = futures::stream::Chain; + +fn new_basebackup_response_stream( + simplex_read: ReadHalf, + basebackup_task: JoinHandle>, +) -> GetBaseBackupStream { + let framed = FramedRead::new(simplex_read, GetBaseBackupResponseDecoder {}); + + framed.chain(CheckResultStream { basebackup_task }) +} + +/// Stream that uses GetBaseBackupResponseDecoder +type BasebackupChunkedStream = + tokio_util::codec::FramedRead, GetBaseBackupResponseDecoder>; + +struct GetBaseBackupResponseDecoder; +impl Decoder for GetBaseBackupResponseDecoder { + type Item = proto::GetBaseBackupResponseChunk; + type Error = tonic::Status; + + fn decode(&mut self, src: &mut BytesMut) -> Result, Self::Error> { + if src.len() < 64 * 1024 { + return Ok(None); + } + + let item = proto::GetBaseBackupResponseChunk { + chunk: bytes::Bytes::from(std::mem::take(src)), + }; + + Ok(Some(item)) + } + + fn decode_eof(&mut self, src: &mut BytesMut) -> Result, Self::Error> { + if src.is_empty() { + return Ok(None); + } + + let item = proto::GetBaseBackupResponseChunk { + chunk: bytes::Bytes::from(std::mem::take(src)), + }; + + Ok(Some(item)) + } +} + +struct CheckResultStream { + basebackup_task: tokio::task::JoinHandle>, +} +impl futures::Stream for CheckResultStream { + type Item = Result; + + fn poll_next( + mut self: Pin<&mut Self>, + ctx: &mut std::task::Context<'_>, + ) -> Poll> { + let task = Pin::new(&mut self.basebackup_task); + match task.poll(ctx) { + Poll::Pending => Poll::Pending, + Poll::Ready(Ok(Ok(()))) => Poll::Ready(None), + Poll::Ready(Ok(Err(basebackup_err))) => { + error!(error=%basebackup_err, "error getting basebackup"); + Poll::Ready(Some(Err(tonic::Status::internal( + "could not get basebackup", + )))) + } + Poll::Ready(Err(join_err)) => { + error!(error=%join_err, "JoinError getting basebackup"); + Poll::Ready(Some(Err(tonic::Status::internal( + "could not get basebackup", + )))) + } + } + } +} diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 42454e7356..ea161fc739 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -21,6 +21,8 @@ pub use pageserver_api::keyspace; use tokio_util::sync::CancellationToken; mod assert_u64_eq_usize; pub mod aux_file; +pub mod compute_service; +pub mod compute_service_grpc; pub mod metrics; pub mod page_cache; pub mod page_service; @@ -82,7 +84,7 @@ impl CancellableTask { pub async fn shutdown_pageserver( http_listener: HttpEndpointListener, https_listener: Option, - page_service: page_service::Listener, + compute_service: compute_service::Listener, consumption_metrics_worker: ConsumptionMetricsTasks, disk_usage_eviction_task: Option, tenant_manager: &TenantManager, @@ -167,11 +169,11 @@ pub async fn shutdown_pageserver( } }); - // Shut down the libpq endpoint task. This prevents new connections from + // Shut down the compute service endpoint task. This prevents new connections from // being accepted. 
let remaining_connections = timed( - page_service.stop_accepting(), - "shutdown LibpqEndpointListener", + compute_service.stop_accepting(), + "shutdown compute service listener", Duration::from_secs(1), ) .await; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index d1a210a786..ddea8aab6f 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -13,7 +13,6 @@ use crate::PERF_TRACE_TARGET; use anyhow::{Context, bail}; use async_compression::tokio::write::GzipEncoder; use bytes::Buf; -use futures::FutureExt; use itertools::Itertools; use jsonwebtoken::TokenData; use once_cell::sync::OnceCell; @@ -40,7 +39,6 @@ use pq_proto::framed::ConnectionError; use pq_proto::{BeMessage, FeMessage, FeStartupPacket, RowDescriptor}; use strum_macros::IntoStaticStr; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufWriter}; -use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::*; use utils::auth::{Claims, Scope, SwappableJwtAuth}; @@ -49,15 +47,13 @@ use utils::id::{TenantId, TimelineId}; use utils::logging::log_slow; use utils::lsn::Lsn; use utils::simple_rcu::RcuReadGuard; -use utils::sync::gate::{Gate, GateGuard}; +use utils::sync::gate::GateGuard; use utils::sync::spsc_fold; use crate::auth::check_permission; use crate::basebackup::BasebackupError; use crate::config::PageServerConf; -use crate::context::{ - DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder, -}; +use crate::context::{PerfInstrumentFutureExt, RequestContext, RequestContextBuilder}; use crate::metrics::{ self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS, SmgrOpTimer, TimelineMetrics, @@ -67,7 +63,6 @@ use crate::span::{ debug_assert_current_span_has_tenant_and_timeline_id, debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id, }; -use crate::task_mgr::{self, COMPUTE_REQUEST_RUNTIME, TaskKind}; use crate::tenant::mgr::{ GetActiveTenantError, GetTenantError, ShardResolveResult, ShardSelector, TenantManager, }; @@ -85,171 +80,6 @@ const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000); /// Threshold at which to log slow GetPage requests. const LOG_SLOW_GETPAGE_THRESHOLD: Duration = Duration::from_secs(30); -/////////////////////////////////////////////////////////////////////////////// - -pub struct Listener { - cancel: CancellationToken, - /// Cancel the listener task through `listen_cancel` to shut down the listener - /// and get a handle on the existing connections. - task: JoinHandle, -} - -pub struct Connections { - cancel: CancellationToken, - tasks: tokio::task::JoinSet, - gate: Gate, -} - -pub fn spawn( - conf: &'static PageServerConf, - tenant_manager: Arc, - pg_auth: Option>, - perf_trace_dispatch: Option, - tcp_listener: tokio::net::TcpListener, - tls_config: Option>, -) -> Listener { - let cancel = CancellationToken::new(); - let libpq_ctx = RequestContext::todo_child( - TaskKind::LibpqEndpointListener, - // listener task shouldn't need to download anything. (We will - // create a separate sub-contexts for each connection, with their - // own download behavior. This context is used only to listen and - // accept connections.)
- DownloadBehavior::Error, - ); - let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( - "libpq listener", - libpq_listener_main( - conf, - tenant_manager, - pg_auth, - perf_trace_dispatch, - tcp_listener, - conf.pg_auth_type, - tls_config, - conf.page_service_pipelining.clone(), - libpq_ctx, - cancel.clone(), - ) - .map(anyhow::Ok), - )); - - Listener { cancel, task } -} - -impl Listener { - pub async fn stop_accepting(self) -> Connections { - self.cancel.cancel(); - self.task - .await - .expect("unreachable: we wrap the listener task in task_mgr::exit_on_panic_or_error") - } -} -impl Connections { - pub(crate) async fn shutdown(self) { - let Self { - cancel, - mut tasks, - gate, - } = self; - cancel.cancel(); - while let Some(res) = tasks.join_next().await { - Self::handle_connection_completion(res); - } - gate.close().await; - } - - fn handle_connection_completion(res: Result, tokio::task::JoinError>) { - match res { - Ok(Ok(())) => {} - Ok(Err(e)) => error!("error in page_service connection task: {:?}", e), - Err(e) => error!("page_service connection task panicked: {:?}", e), - } - } -} - -/// -/// Main loop of the page service. -/// -/// Listens for connections, and launches a new handler task for each. -/// -/// Returns Ok(()) upon cancellation via `cancel`, returning the set of -/// open connections. -/// -#[allow(clippy::too_many_arguments)] -pub async fn libpq_listener_main( - conf: &'static PageServerConf, - tenant_manager: Arc, - auth: Option>, - perf_trace_dispatch: Option, - listener: tokio::net::TcpListener, - auth_type: AuthType, - tls_config: Option>, - pipelining_config: PageServicePipeliningConfig, - listener_ctx: RequestContext, - listener_cancel: CancellationToken, -) -> Connections { - let connections_cancel = CancellationToken::new(); - let connections_gate = Gate::default(); - let mut connection_handler_tasks = tokio::task::JoinSet::default(); - - loop { - let gate_guard = match connections_gate.enter() { - Ok(guard) => guard, - Err(_) => break, - }; - - let accepted = tokio::select! { - biased; - _ = listener_cancel.cancelled() => break, - next = connection_handler_tasks.join_next(), if !connection_handler_tasks.is_empty() => { - let res = next.expect("we dont poll while empty"); - Connections::handle_connection_completion(res); - continue; - } - accepted = listener.accept() => accepted, - }; - - match accepted { - Ok((socket, peer_addr)) => { - // Connection established. Spawn a new task to handle it. - debug!("accepted connection from {}", peer_addr); - let local_auth = auth.clone(); - let connection_ctx = RequestContextBuilder::from(&listener_ctx) - .task_kind(TaskKind::PageRequestHandler) - .download_behavior(DownloadBehavior::Download) - .perf_span_dispatch(perf_trace_dispatch.clone()) - .detached_child(); - - connection_handler_tasks.spawn(page_service_conn_main( - conf, - tenant_manager.clone(), - local_auth, - socket, - auth_type, - tls_config.clone(), - pipelining_config.clone(), - connection_ctx, - connections_cancel.child_token(), - gate_guard, - )); - } - Err(err) => { - // accept() failed. Log the error, and loop back to retry on next connection. - error!("accept() failed: {:?}", err); - } - } - } - - debug!("page_service listener loop terminated"); - - Connections { - cancel: connections_cancel, - tasks: connection_handler_tasks, - gate: connections_gate, - } -} - type ConnectionHandlerResult = anyhow::Result<()>; /// Perf root spans start at the per-request level, after shard routing. 
@@ -261,9 +91,10 @@ struct ConnectionPerfSpanFields { compute_mode: Option, } +/// note: the caller has already set TCP_NODELAY on the socket #[instrument(skip_all, fields(peer_addr, application_name, compute_mode))] #[allow(clippy::too_many_arguments)] -async fn page_service_conn_main( +pub async fn libpq_page_service_conn_main( conf: &'static PageServerConf, tenant_manager: Arc, auth: Option>, @@ -279,10 +110,6 @@ async fn page_service_conn_main( .with_label_values(&["page_service"]) .guard(); - socket - .set_nodelay(true) - .context("could not set TCP_NODELAY")?; - let socket_fd = socket.as_raw_fd(); let peer_addr = socket.peer_addr().context("get peer address")?; @@ -393,7 +220,7 @@ struct PageServerHandler { gate_guard: GateGuard, } -struct TimelineHandles { +pub struct TimelineHandles { wrapper: TenantManagerWrapper, /// Note on size: the typical size of this map is 1. The largest size we expect /// to see is the number of shards divided by the number of pageservers (typically < 2), diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index 426b176af9..90bdff32a9 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -1,10 +1,10 @@ # pgxs/neon/Makefile - MODULE_big = neon OBJS = \ $(WIN32RES) \ communicator.o \ + communicator_new.o \ extension_server.o \ file_cache.o \ hll.o \ @@ -22,7 +22,8 @@ OBJS = \ walproposer.o \ walproposer_pg.o \ control_plane_connector.o \ - walsender_hooks.o + walsender_hooks.o \ + $(LIBCOMMUNICATOR_PATH)/libcommunicator.a PG_CPPFLAGS = -I$(libpq_srcdir) SHLIB_LINK_INTERNAL = $(libpq) diff --git a/pgxn/neon/communicator/Cargo.lock b/pgxn/neon/communicator/Cargo.lock new file mode 100644 index 0000000000..2a62aacfba --- /dev/null +++ b/pgxn/neon/communicator/Cargo.lock @@ -0,0 +1,372 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 4 + +[[package]] +name = "addr2line" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" + +[[package]] +name = "backtrace" +version = "0.3.74" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d82cb332cdfaed17ae235a638438ac4d4839913cc2af585c3c6746e8f8bee1a" +dependencies = [ + "addr2line", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", + "windows-targets", +] + +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + +[[package]] +name = "bytes" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "communicator" +version = "0.1.0" +dependencies = [ + "tonic", +] + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "futures-core" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" + +[[package]] +name = "gimli" +version = "0.31.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" + +[[package]] +name = "http" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4a85d31aea989eead29a3aaf9e1115a180df8282431156e533de47660892565" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + +[[package]] +name = "http-body" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" +dependencies = [ + "bytes", + "http", +] + +[[package]] +name = "http-body-util" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" +dependencies = [ + "bytes", + "futures-core", + "http", + "http-body", + "pin-project-lite", +] + +[[package]] +name = "itoa" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" + +[[package]] +name = "libc" +version = "0.2.171" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c19937216e9d3aa9956d9bb8dfc0b0c8beb6058fc4f7a4dc4d850edf86a237d6" + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "miniz_oxide" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ff70ce3e48ae43fa075863cef62e8b43b71a4f2382229920e0df362592919430" +dependencies = [ + "adler2", +] + +[[package]] +name = "object" +version = "0.36.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87" +dependencies = [ + "memchr", +] + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "percent-encoding" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" + +[[package]] +name = "pin-project" +version = "1.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" + +[[package]] +name = "proc-macro2" +version = "1.0.94" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a31971752e70b8b2686d7e46ec17fb38dad4051d94024c88df49b667caea9c84" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" + +[[package]] +name = "syn" +version = "2.0.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tokio" +version = "1.44.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6b88822cbe49de4185e3a4cbf8321dd487cf5fe0c5c65695fef6346371e9c48" +dependencies = [ + "backtrace", + "pin-project-lite", +] + +[[package]] +name = "tokio-stream" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eca58d7bba4a75707817a2c44174253f9236b2d5fbd055602e9d5c07c139a047" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tonic" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85839f0b32fd242bb3209262371d07feda6d780d16ee9d2bc88581b89da1549b" +dependencies = [ + "base64", + "bytes", + "http", + "http-body", + "http-body-util", + "percent-encoding", + "pin-project", + "tokio-stream", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tower-layer" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" + +[[package]] +name = 
"tower-service" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" + +[[package]] +name = "tracing" +version = "0.1.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" +dependencies = [ + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tracing-core" +version = "0.1.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c" +dependencies = [ + "once_cell", +] + +[[package]] +name = "unicode-ident" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" diff --git a/pgxn/neon/communicator/Cargo.toml b/pgxn/neon/communicator/Cargo.toml new file mode 100644 index 0000000000..e8d12024e1 --- /dev/null +++ b/pgxn/neon/communicator/Cargo.toml @@ -0,0 +1,35 @@ +[package] +name = "communicator" +version = "0.1.0" +edition = 
"2024" + +[lib] +crate-type = ["staticlib"] + +[dependencies] +bytes.workspace = true +http.workspace = true +libc.workspace = true +nix.workspace = true +atomic_enum = "0.3.0" +prost.workspace = true +tonic = { version = "0.12.0", default-features = false, features=["codegen", "prost", "transport"] } +tokio = { version = "1.43.1", features = ["macros", "net", "io-util", "rt", "rt-multi-thread"] } +tokio-pipe = { version = "0.2.12" } +thiserror.workspace = true +tracing.workspace = true +tracing-subscriber.workspace = true +zerocopy = "0.8.0" +zerocopy-derive = "0.8.0" + +tokio-epoll-uring.workspace = true +uring-common.workspace = true + +pageserver_client_grpc.workspace = true +pageserver_data_api.workspace = true + +neonart.workspace = true +utils.workspace = true + +[build-dependencies] +cbindgen.workspace = true diff --git a/pgxn/neon/communicator/README.md b/pgxn/neon/communicator/README.md new file mode 100644 index 0000000000..8887a01cbc --- /dev/null +++ b/pgxn/neon/communicator/README.md @@ -0,0 +1,123 @@ +# Communicator + +This package provides the so-called "compute-pageserver communicator", +or just "communicator" in short. It runs in a PostgreSQL server, as +part of the neon extension, and handles the communication with the +pageservers. On the PostgreSQL side, the glue code in pgxn/neon/ uses +the communicator to implement the PostgreSQL Storage Manager (SMGR) +interface. + +## Design criteria + +- Low latency +- Saturate a 10 Gbit / s network interface without becoming a bottleneck + +## Source code view + +pgxn/neon/communicator_new.c + Contains the glue that interact with PostgreSQL code and the Rust + communicator code. + +pgxn/neon/communicator/src/backend_interface.rs + The entry point for calls from each backend. + +pgxn/neon/communicator/src/init.rs + Initialization at server startup + +pgxn/neon/communicator/src/worker_process/ + Worker process main loop and glue code + +At compilation time, pgxn/neon/communicator/ produces a static +library, libcommunicator.a. It is linked to the neon.so extension +library. + +The real networking code, which is independent of PostgreSQL, is in +the pageserver/client_grpc crate. + +## Process view + +The communicator runs in a dedicated background worker process, the +"communicator process". The communicator uses a multi-threaded Tokio +runtime to execute the IO requests. So the communicator process has +multiple threads running. That's unusual for Postgres processes and +care must be taken to make that work. + +### Backend <-> worker communication + +Each backend has a number of I/O request slots in shared memory. The +slots are statically allocated for each backend, and must not be +accessed by other backends. The worker process reads requests from the +shared memory slots, and writes responses back to the slots. + +To submit an IO request, first pick one of your backend's free slots, +and write the details of the IO request in the slot. Finally, update +the 'state' field of the slot to Submitted. That informs the worker +process that it can start processing the request. Once the state has +been set to Submitted, the backend *must not* access the slot anymore, +until the worker process sets its state to 'Completed'. In other +words, each slot is owned by either the backend or the worker process +at all times, and the 'state' field indicates who has ownership at the +moment. + +To inform the worker process that a request slot has a pending IO +request, there's a pipe shared by the worker process and all backend +processes. 
After you have changed the slot's state to Submitted, write +the index of the request slot to the pipe. This wakes up the worker +process. + +(Note that the pipe is just used for wakeups, but the worker process +is free to pick up Submitted IO requests even without receiving the +wakeup. As of this writing, it doesn't do that, but it might be useful +in the future to reduce latency even further, for example.) + +When the worker process has completed processing the request, it +writes the result back in the request slot. A GetPage request can also +contain a pointer to a buffer in the shared buffer cache. In that case, +the worker process writes the resulting page contents directly to the +buffer, and just a result code in the request slot. It then updates +the 'state' field to Completed, which passes the ownership back to +the originating backend. Finally, it signals the process Latch of the +originating backend, waking it up. + +### Differences between PostgreSQL v16, v17 and v18 + +PostgreSQL v18 introduced the new AIO mechanism. The PostgreSQL AIO +mechanism uses a mechanism very similar to the one described in the previous +section, for the communication between AIO worker processes and +backends. With our communicator, the AIO worker processes are not +used, but we use the same PgAioHandle request slots as in upstream. +For Neon-specific IO requests like GetDbSize, a neon request slot is +used. But for the actual IO requests, the request slot merely contains +a pointer to the PgAioHandle slot. The worker process updates the +status of that, calls the IO callbacks upon completion etc., just like +the upstream AIO worker processes do. + +## Sequence diagram + + neon + PostgreSQL extension backend_interface.rs worker_process.rs processor tonic + | . . . . + | smgr_read() . . . . + +-------------> + . . . + . | . . . + . | rcommunicator_ . . . + . | get_page_at_lsn . . . + . +------------------> + . . + | . . + | write request to . . . + | slot . . + | . . + | . . + | submit_request() . . + +-----------------> + . + | | . + | | db_size_request . . + +---------------->. + . TODO + + + +### Compute <-> pageserver protocol + +The protocol between Compute and the pageserver is based on gRPC. See `protos/`. + diff --git a/pgxn/neon/communicator/build.rs b/pgxn/neon/communicator/build.rs new file mode 100644 index 0000000000..851a2d9b37 --- /dev/null +++ b/pgxn/neon/communicator/build.rs @@ -0,0 +1,24 @@ +use cbindgen; + +use std::env; + +fn main() -> Result<(), Box> { + let crate_dir = env::var("CARGO_MANIFEST_DIR").unwrap(); + + cbindgen::generate(crate_dir).map_or_else( + |error| match error { + cbindgen::Error::ParseSyntaxError { .. } => { + // This means there was a syntax error in the Rust sources. Don't panic, because + // we want the build to continue and the Rust compiler to hit the error. The + // Rust compiler produces a better error message than cbindgen.
+ eprintln!("Generating C bindings failed because of a Rust syntax error"); + } + e => panic!("Unable to generate C bindings: {:?}", e), + }, + |bindings| { + bindings.write_to_file("communicator_bindings.h"); + }, + ); + + Ok(()) +} diff --git a/pgxn/neon/communicator/cbindgen.toml b/pgxn/neon/communicator/cbindgen.toml new file mode 100644 index 0000000000..72e0c8174a --- /dev/null +++ b/pgxn/neon/communicator/cbindgen.toml @@ -0,0 +1,4 @@ +language = "C" + +[enum] +prefix_with_name = true diff --git a/pgxn/neon/communicator/src/backend_comms.rs b/pgxn/neon/communicator/src/backend_comms.rs new file mode 100644 index 0000000000..c798dcf30e --- /dev/null +++ b/pgxn/neon/communicator/src/backend_comms.rs @@ -0,0 +1,204 @@ +//! This module implements a request/response "slot" for submitting requests from backends +//! to the communicator process. +//! +//! NB: The "backend" side of this code runs in Postgres backend processes, +//! which means that it is not safe to use the 'tracing' crate for logging, nor +//! to launch threads or use tokio tasks. +use std::cell::UnsafeCell; +use std::sync::atomic::fence; +use std::sync::atomic::{AtomicI32, Ordering}; + +use crate::neon_request::{NeonIORequest, NeonIOResult}; + +use atomic_enum::atomic_enum; + +/// One request/response slot. Each backend has its own set of slots that it uses. +/// +/// This is the moral equivalent of PgAioHandle for Postgres AIO requests +/// Like PgAioHandle, try to keep this small. +/// +/// There is an array of these in shared memory. Therefore, this must be Sized. +/// +/// ## Lifecycle of a request +/// +/// The slot is always owned by either the backend process or the communicator +/// process, depending on the 'state'. Only the owning process is allowed to +/// read or modify the slot, except for reading the 'state' itself to check who +/// owns it. +/// +/// A slot begins in the Idle state, where it is owned by the backend process. +/// To submit a request, the backend process fills the slot with the request +/// data, and changes it to the Submitted state. After changing the state, the +/// slot is owned by the communicator process, and the backend is not allowed +/// to access it until the communicator process marks it as Completed. +/// +/// When the communicator process sees that the slot is in Submitted state, it +/// starts to process the request. After processing the request, it stores the +/// result in the slot, and changes the state to Completed. It is now owned by +/// the backend process again, which may now read the result, and reuse the +/// slot for a new request. +/// +/// For correctness of the above protocol, we really only need two states: +/// "owned by backend" and "owned by communicator process. But to help with +/// debugging, there are a few more states. When the backend starts to fill in +/// the request details in the slot, it first sets the state from Idle to +/// Filling, and when it's done with that, from Filling to Submitted. In the +/// Filling state, the slot is still owned by the backend. Similarly, when the +/// communicator process starts to process a request, it sets it to Processing +/// state first, but the slot is still owned by the communicator process. +/// +/// This struct doesn't handle waking up the communicator process when a request +/// has been submitted or when a response is ready. We only store the 'owner_procno' +/// which can be used for waking up the backend on completion, but the wakeups are +/// performed elsewhere. 
+pub struct NeonIOHandle { + /// similar to PgAioHandleState + state: AtomicNeonIOHandleState, + + /// The owning process's ProcNumber. The worker process uses this to set the process's + /// latch on completion. + /// + /// (This could be calculated from num_neon_request_slots_per_backend and the index of + /// this slot in the overall 'neon_requst_slots array') + owner_procno: AtomicI32, + + /// SAFETY: This is modified by fill_request(), after it has established ownership + /// of the slot by setting state from Idle to Filling + request: UnsafeCell, + + /// valid when state is Completed + /// + /// SAFETY: This is modified by RequestProcessingGuard::complete(). There can be + /// only one RequestProcessingGuard outstanding for a slot at a time, because + /// it is returned by start_processing_request() which checks the state, so + /// RequestProcessingGuard has exclusive access to the slot. + result: UnsafeCell, +} + +// The protocol described in the "Lifecycle of a request" section above ensures +// the safe access to the fields +unsafe impl Send for NeonIOHandle {} +unsafe impl Sync for NeonIOHandle {} + +impl Default for NeonIOHandle { + fn default() -> NeonIOHandle { + NeonIOHandle { + owner_procno: AtomicI32::new(-1), + request: UnsafeCell::new(NeonIORequest::Empty), + result: UnsafeCell::new(NeonIOResult::Empty), + state: AtomicNeonIOHandleState::new(NeonIOHandleState::Idle), + } + } +} + +#[atomic_enum] +#[derive(Eq, PartialEq)] +pub enum NeonIOHandleState { + Idle, + + /// backend is filling in the request + Filling, + + /// Backend has submitted the request to the communicator, but the + /// communicator process has not yet started processing it. + Submitted, + + /// Communicator is processing the request + Processing, + + /// Communicator has completed the request, and the 'result' field is now + /// valid, but the backend has not read the result yet. + Completed, +} + +pub struct RequestProcessingGuard<'a>(&'a NeonIOHandle); + +unsafe impl<'a> Send for RequestProcessingGuard<'a> {} +unsafe impl<'a> Sync for RequestProcessingGuard<'a> {} + +impl<'a> RequestProcessingGuard<'a> { + pub fn get_request(&self) -> &NeonIORequest { + unsafe { &*self.0.request.get() } + } + + pub fn get_owner_procno(&self) -> i32 { + self.0.owner_procno.load(Ordering::Relaxed) + } + + pub fn completed(self, result: NeonIOResult) { + unsafe { + *self.0.result.get() = result; + }; + + // Ok, we have completed the IO. Mark the request as completed. After that, + // we no longer have ownership of the slot, and must not modify it. + let old_state = self + .0 + .state + .swap(NeonIOHandleState::Completed, Ordering::Release); + assert!(old_state == NeonIOHandleState::Processing); + } +} + +impl NeonIOHandle { + pub fn fill_request(&self, request: &NeonIORequest, proc_number: i32) { + // Verify that the slot is in Idle state previously, and start filling it. + // + // XXX: This step isn't strictly necessary. Assuming the caller didn't screw up + // and try to use a slot that's already in use, we could fill the slot and + // switch it directly from Idle to Submitted state. + if let Err(s) = self.state.compare_exchange( + NeonIOHandleState::Idle, + NeonIOHandleState::Filling, + Ordering::Relaxed, + Ordering::Relaxed, + ) { + panic!("unexpected state in request slot: {s:?}"); + } + + // This fence synchronizes-with store/swap in `communicator_process_main_loop`. 
+ fence(Ordering::Acquire); + + self.owner_procno.store(proc_number, Ordering::Relaxed); + unsafe { *self.request.get() = *request } + self.state + .store(NeonIOHandleState::Submitted, Ordering::Release); + } + + pub fn try_get_result(&self) -> Option { + // FIXME: ordering? + let state = self.state.load(Ordering::Relaxed); + if state == NeonIOHandleState::Completed { + // This fence synchronizes-with store/swap in `communicator_process_main_loop`. + fence(Ordering::Acquire); + let result = unsafe { *self.result.get() }; + self.state.store(NeonIOHandleState::Idle, Ordering::Relaxed); + Some(result) + } else { + None + } + } + + pub fn start_processing_request<'a>(&'a self) -> Option> { + // Read the IO request from the slot indicated in the wakeup + // + // XXX: using compare_exchange for this is not strictly necessary, as long as + // the communicator process has _some_ means of tracking which requests it's + // already processing. That could be a flag somewhere in communicator's private + // memory, for example. + if let Err(s) = self.state.compare_exchange( + NeonIOHandleState::Submitted, + NeonIOHandleState::Processing, + Ordering::Relaxed, + Ordering::Relaxed, + ) { + // FIXME surprising state. This is unexpected at the moment, but if we + // started to process requests more aggressively, without waiting for the + // read from the pipe, then this could happen + panic!("unexpected state in request slot: {s:?}"); + } + fence(Ordering::Acquire); + + Some(RequestProcessingGuard(self)) + } +} diff --git a/pgxn/neon/communicator/src/backend_interface.rs b/pgxn/neon/communicator/src/backend_interface.rs new file mode 100644 index 0000000000..a9a06d6225 --- /dev/null +++ b/pgxn/neon/communicator/src/backend_interface.rs @@ -0,0 +1,196 @@ +//! This code runs in each backend process. That means that launching Rust threads, panicking +//! etc. is forbidden! + +use crate::backend_comms::NeonIOHandle; +use crate::init::CommunicatorInitStruct; +use crate::integrated_cache::{BackendCacheReadOp, IntegratedCacheReadAccess}; +use crate::neon_request::CCachedGetPageVResult; +use crate::neon_request::{NeonIORequest, NeonIOResult}; + +pub struct CommunicatorBackendStruct<'t> { + my_proc_number: i32, + + next_neon_request_idx: u32, + + my_start_idx: u32, // First request slot that belongs to this backend + my_end_idx: u32, // end + 1 request slot that belongs to this backend + + neon_request_slots: &'t [NeonIOHandle], + + submission_pipe_write_fd: std::ffi::c_int, + + pending_cache_read_op: Option>, + + integrated_cache: &'t IntegratedCacheReadAccess<'t>, +} + +#[unsafe(no_mangle)] +pub extern "C" fn rcommunicator_backend_init( + cis: Box, + my_proc_number: i32, +) -> &'static mut CommunicatorBackendStruct<'static> { + let start_idx = my_proc_number as u32 * cis.num_neon_request_slots_per_backend; + let end_idx = start_idx + cis.num_neon_request_slots_per_backend; + + let integrated_cache = Box::leak(Box::new(cis.integrated_cache_init_struct.backend_init())); + + let bs: &'static mut CommunicatorBackendStruct = + Box::leak(Box::new(CommunicatorBackendStruct { + my_proc_number, + next_neon_request_idx: start_idx, + my_start_idx: start_idx, + my_end_idx: end_idx, + neon_request_slots: cis.neon_request_slots, + + submission_pipe_write_fd: cis.submission_pipe_write_fd, + pending_cache_read_op: None, + + integrated_cache, + })); + bs +} + +/// Start a request. You can poll for its completion and get the result by +/// calling bcomm_poll_dbsize_request_completion(). 
The communicator will wake +/// us up by setting our process latch, so to wait for the completion, wait on +/// the latch and call bcomm_poll_dbsize_request_completion() every time the +/// latch is set. +/// +/// Safety: The C caller must ensure that the references are valid. +#[unsafe(no_mangle)] +pub extern "C" fn bcomm_start_io_request<'t>( + bs: &'t mut CommunicatorBackendStruct, + request: &NeonIORequest, + immediate_result_ptr: &mut NeonIOResult, +) -> i32 { + assert!(bs.pending_cache_read_op.is_none()); + + // Check if the request can be satisfied from the cache first + if let NeonIORequest::RelSize(req) = request { + if let Some(nblocks) = bs.integrated_cache.get_rel_size(&req.reltag()) { + *immediate_result_ptr = NeonIOResult::RelSize(nblocks); + return -1; + } + } + + // Create neon request and submit it + let request_idx = bs.start_neon_request(request); + + // Tell the communicator about it + bs.submit_request(request_idx); + + return request_idx; +} + +#[unsafe(no_mangle)] +pub extern "C" fn bcomm_start_get_page_v_request<'t>( + bs: &'t mut CommunicatorBackendStruct, + request: &NeonIORequest, + immediate_result_ptr: &mut CCachedGetPageVResult, +) -> i32 { + let NeonIORequest::GetPageV(get_pagev_request) = request else { + panic!("invalid request passed to bcomm_start_get_page_v_request()"); + }; + assert!(matches!(request, NeonIORequest::GetPageV(_))); + assert!(bs.pending_cache_read_op.is_none()); + + // Check if the request can be satisfied from the cache first + let mut all_cached = true; + let read_op = bs.integrated_cache.start_read_op(); + for i in 0..get_pagev_request.nblocks { + if let Some(cache_block) = read_op.get_page( + &get_pagev_request.reltag(), + get_pagev_request.block_number + i as u32, + ) { + (*immediate_result_ptr).cache_block_numbers[i as usize] = cache_block; + } else { + // not found in cache + all_cached = false; + break; + } + } + if all_cached { + bs.pending_cache_read_op = Some(read_op); + return -1; + } + + // Create neon request and submit it + let request_idx = bs.start_neon_request(request); + + // Tell the communicator about it + bs.submit_request(request_idx); + + return request_idx; +} + +/// Check if a request has completed. Returns: +/// +/// -1 if the request is still being processed +/// 0 on success +#[unsafe(no_mangle)] +pub extern "C" fn bcomm_poll_request_completion( + bs: &mut CommunicatorBackendStruct, + request_idx: u32, + result_p: &mut NeonIOResult, +) -> i32 { + match bs.neon_request_slots[request_idx as usize].try_get_result() { + None => -1, // still processing + Some(result) => { + *result_p = result; + 0 + } + } +} + +// LFC functions + +/// Finish a local file cache read +/// +// +#[unsafe(no_mangle)] +pub extern "C" fn bcomm_finish_cache_read(bs: &mut CommunicatorBackendStruct) -> bool { + if let Some(op) = bs.pending_cache_read_op.take() { + op.finish() + } else { + panic!("bcomm_finish_cache_read() called with no cached read pending"); + } +} + +impl<'t> CommunicatorBackendStruct<'t> { + /// Send a wakeup to the communicator process + fn submit_request(self: &CommunicatorBackendStruct<'t>, request_idx: i32) { + // wake up communicator by writing the idx to the submission pipe + // + // This can block, if the pipe is full. That should be very rare, + // because the communicator tries hard to drain the pipe to prevent + // that. Also, there's a natural upper bound on how many wakeups can be + // queued up: there is only a limited number of request slots for each + // backend. 
+ // + // If it does block very briefly, that's not too serious. + let idxbuf = request_idx.to_ne_bytes(); + let _res = nix::unistd::write(self.submission_pipe_write_fd, &idxbuf); + // FIXME: check result, return any errors + } + + /// Note: there's no guarantee on when the communicator might pick it up. You should ring + /// the doorbell. But it might pick it up immediately. + pub(crate) fn start_neon_request(&mut self, request: &NeonIORequest) -> i32 { + let my_proc_number = self.my_proc_number; + + // Grab next free slot + // FIXME: any guarantee that there will be any? + let idx = self.next_neon_request_idx; + + let next_idx = idx + 1; + self.next_neon_request_idx = if next_idx == self.my_end_idx { + self.my_start_idx + } else { + next_idx + }; + + self.neon_request_slots[idx as usize].fill_request(request, my_proc_number); + + return idx as i32; + } +} diff --git a/pgxn/neon/communicator/src/file_cache.rs b/pgxn/neon/communicator/src/file_cache.rs new file mode 100644 index 0000000000..9509c15d25 --- /dev/null +++ b/pgxn/neon/communicator/src/file_cache.rs @@ -0,0 +1,109 @@ +//! Implement the "low-level" parts of the file cache. +//! +//! This module just deals with reading and writing the file, and keeping track +//! which blocks in the cache file are in use and which are free. The "high +//! level" parts of tracking which block in the cache file corresponds to which +//! relation block is handled in 'integrated_cache' instead. +//! +//! This module is only used to access the file from the communicator +//! process. The backend processes *also* read the file (and sometimes also +//! write it? ), but the backends use direct C library calls for that. +use std::fs::File; +use std::path::Path; +use std::sync::Arc; +use std::sync::atomic::{AtomicU64, Ordering}; + +use tokio_epoll_uring; + +use crate::BLCKSZ; + +pub type CacheBlock = u64; + +pub struct FileCache { + uring_system: tokio_epoll_uring::SystemHandle, + + file: Arc, + + // TODO: there's no reclamation mechanism, the cache grows + // indefinitely. This is the next free block, i.e. 
the current + // size of the file + next_free_block: AtomicU64, +} + +impl FileCache { + pub fn new( + file_cache_path: &Path, + uring_system: tokio_epoll_uring::SystemHandle, + ) -> Result { + let file = std::fs::OpenOptions::new() + .read(true) + .write(true) + .truncate(true) + .create(true) + .open(file_cache_path)?; + + tracing::info!("Created cache file {file_cache_path:?}"); + + Ok(FileCache { + file: Arc::new(file), + uring_system, + next_free_block: AtomicU64::new(0), + }) + } + + // File cache management + + pub async fn read_block( + &self, + cache_block: CacheBlock, + dst: impl uring_common::buf::IoBufMut + Send + Sync, + ) -> Result<(), std::io::Error> { + assert!(dst.bytes_total() == BLCKSZ); + let file = self.file.clone(); + + let ((_file, _buf), res) = self + .uring_system + .read(file, cache_block as u64 * BLCKSZ as u64, dst) + .await; + + let res = res.map_err(map_io_uring_error)?; + if res != BLCKSZ { + panic!("unexpected read result"); + } + + Ok(()) + } + + pub async fn write_block( + &self, + cache_block: CacheBlock, + src: impl uring_common::buf::IoBuf + Send + Sync, + ) -> Result<(), std::io::Error> { + assert!(src.bytes_init() == BLCKSZ); + let file = self.file.clone(); + + let ((_file, _buf), res) = self + .uring_system + .write(file, cache_block as u64 * BLCKSZ as u64, src) + .await; + let res = res.map_err(map_io_uring_error)?; + if res != BLCKSZ { + panic!("unexpected read result"); + } + + Ok(()) + } + + pub fn alloc_block(&self) -> CacheBlock { + self.next_free_block.fetch_add(1, Ordering::Relaxed) + } +} + +fn map_io_uring_error(err: tokio_epoll_uring::Error) -> std::io::Error { + match err { + tokio_epoll_uring::Error::Op(err) => err, + tokio_epoll_uring::Error::System(err) => { + std::io::Error::new(std::io::ErrorKind::Other, err) + } + } +} diff --git a/pgxn/neon/communicator/src/init.rs b/pgxn/neon/communicator/src/init.rs new file mode 100644 index 0000000000..b337ac13c4 --- /dev/null +++ b/pgxn/neon/communicator/src/init.rs @@ -0,0 +1,130 @@ +//! Initialization functions. These are executed in the postmaster process, +//! at different stages of server startup. +//! +//! +//! Communicator initialization steps: +//! +//! 1. At postmaster startup, before shared memory is allocated, +//! rcommunicator_shmem_size() is called to get the amount of +//! shared memory that this module needs. +//! +//! 2. Later, after the shared memory has been allocated, +//! rcommunicator_shmem_init() is called to initialize the shmem +//! area. +//! +//! Per process initialization: +//! +//! When a backend process starts up, it calls rcommunicator_backend_init(). +//! In the communicator worker process, other functions are called, see +//! `worker_process` module. 
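A minimal sketch of that call sequence, written here in Rust for readability; in the real server the C postmaster drives it through the cbindgen bindings, and the Vec below merely stands in for the actual shared memory area. max_procs, pipe_read_fd, pipe_write_fd and my_proc_number are assumed inputs from the C side:

    let size = rcommunicator_shmem_size(max_procs) as usize;        // step 1, before shmem allocation
    let mut shmem = vec![0u8; size];                                 // placeholder: real callers pass the postmaster's shmem area
    let cis = rcommunicator_shmem_init(pipe_read_fd, pipe_write_fd,  // step 2, after allocation
                                       max_procs, shmem.as_mut_ptr(), size as u64);
    // per process, after fork():
    //   backends:            rcommunicator_backend_init(cis, my_proc_number)
    //   communicator worker: communicator_worker_process_launch(cis, ...)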
+ +use std::ffi::c_int; +use std::mem; + +use crate::backend_comms::NeonIOHandle; +use crate::integrated_cache::IntegratedCacheInitStruct; + +const NUM_NEON_REQUEST_SLOTS_PER_BACKEND: u32 = 5; + +/// This struct is created in the postmaster process, and inherited to +/// the communicator process and all backend processes through fork() +#[repr(C)] +pub struct CommunicatorInitStruct { + #[allow(dead_code)] + pub max_procs: u32, + + pub submission_pipe_read_fd: std::ffi::c_int, + pub submission_pipe_write_fd: std::ffi::c_int, + + // Shared memory data structures + pub num_neon_request_slots_per_backend: u32, + + pub neon_request_slots: &'static [NeonIOHandle], + + pub integrated_cache_init_struct: IntegratedCacheInitStruct<'static>, +} + +impl std::fmt::Debug for CommunicatorInitStruct { + fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { + fmt.debug_struct("CommunicatorInitStruct") + .field("max_procs", &self.max_procs) + .field("submission_pipe_read_fd", &self.submission_pipe_read_fd) + .field("submission_pipe_write_fd", &self.submission_pipe_write_fd) + .field( + "num_neon_request_slots_per_backend", + &self.num_neon_request_slots_per_backend, + ) + .field("neon_request_slots length", &self.neon_request_slots.len()) + .finish() + } +} + +#[unsafe(no_mangle)] +pub extern "C" fn rcommunicator_shmem_size(max_procs: u32) -> u64 { + let mut size = 0; + + let num_neon_request_slots = max_procs * NUM_NEON_REQUEST_SLOTS_PER_BACKEND; + size += mem::size_of::() * num_neon_request_slots as usize; + + // For integrated_cache's Allocator. TODO: make this adjustable + size += IntegratedCacheInitStruct::shmem_size(max_procs); + + size as u64 +} + +/// Initialize the shared memory segment. Returns a backend-private +/// struct, which will be inherited by backend processes through fork +#[unsafe(no_mangle)] +pub extern "C" fn rcommunicator_shmem_init( + submission_pipe_read_fd: c_int, + submission_pipe_write_fd: c_int, + max_procs: u32, + shmem_area_ptr: *mut u8, + shmem_area_len: u64, +) -> &'static mut CommunicatorInitStruct { + let mut ptr = shmem_area_ptr; + + // Carve out the request slots from the shmem area and initialize them + let num_neon_request_slots_per_backend = NUM_NEON_REQUEST_SLOTS_PER_BACKEND; + let num_neon_request_slots = max_procs * num_neon_request_slots_per_backend; + + let len_used; + let neon_request_slots: &mut [NeonIOHandle] = unsafe { + ptr = ptr.add(ptr.align_offset(std::mem::align_of::())); + let neon_request_slots_ptr: *mut NeonIOHandle = ptr.cast(); + for _i in 0..num_neon_request_slots { + let slot: *mut NeonIOHandle = ptr.cast(); + *slot = NeonIOHandle::default(); + ptr = ptr.byte_add(mem::size_of::()); + } + len_used = ptr.byte_offset_from(shmem_area_ptr) as usize; + assert!(len_used <= shmem_area_len as usize); + + std::slice::from_raw_parts_mut(neon_request_slots_ptr, num_neon_request_slots as usize) + }; + + let remaining_area = + unsafe { std::slice::from_raw_parts_mut(ptr, shmem_area_len as usize - len_used) }; + + // Give the rest of the area to the integrated cache + let integrated_cache_init_struct = + IntegratedCacheInitStruct::shmem_init(max_procs, remaining_area); + + eprintln!( + "PIPE READ {} WRITE {}", + submission_pipe_read_fd, submission_pipe_write_fd + ); + + let cis: &'static mut CommunicatorInitStruct = Box::leak(Box::new(CommunicatorInitStruct { + max_procs, + submission_pipe_read_fd, + submission_pipe_write_fd, + + num_neon_request_slots_per_backend: NUM_NEON_REQUEST_SLOTS_PER_BACKEND, + neon_request_slots, + + 
integrated_cache_init_struct, + })); + + cis +} diff --git a/pgxn/neon/communicator/src/integrated_cache.rs b/pgxn/neon/communicator/src/integrated_cache.rs new file mode 100644 index 0000000000..f290db7ba3 --- /dev/null +++ b/pgxn/neon/communicator/src/integrated_cache.rs @@ -0,0 +1,423 @@ +//! Integrated communicator cache +//! +//! Tracks: +//! - Relation sizes and existence +//! - Last-written LSN +//! - TODO: Block cache (also known as LFC) +//! +//! TODO: limit the size +//! TODO: concurrency +//! +//! Note: This deals with "relations", which is really just one "relation fork" in Postgres +//! terms. RelFileLocator + ForkNumber is the key. + +use utils::lsn::Lsn; + +use crate::file_cache::{CacheBlock, FileCache}; +use pageserver_data_api::model::RelTag; + +use neonart; +use neonart::TreeInitStruct; + +const CACHE_AREA_SIZE: usize = 10 * 1024 * 1024; + +/// This struct is stored in the shared memory segment. +struct IntegratedCacheShmemData { + allocator: neonart::Allocator, +} + +/// This struct is initialized at postmaster startup, and passed to all the processes via fork(). +pub struct IntegratedCacheInitStruct<'t> { + shmem_data: &'t IntegratedCacheShmemData, + handle: TreeInitStruct<'t, TreeKey, TreeEntry>, +} + +/// Represents write-access to the integrated cache. This is used by the communicator process. +pub struct IntegratedCacheWriteAccess<'t> { + cache_tree: neonart::TreeWriteAccess<'t, TreeKey, TreeEntry>, + + global_lw_lsn: Lsn, + + file_cache: Option, +} + +/// Represents read-only access to the integrated cache. Backend processes have this. +pub struct IntegratedCacheReadAccess<'t> { + cache_tree: neonart::TreeReadAccess<'t, TreeKey, TreeEntry>, +} + +impl<'t> IntegratedCacheInitStruct<'t> { + /// Return the desired size in bytes of the shared memory area to reserve for the integrated + /// cache. + pub fn shmem_size(_max_procs: u32) -> usize { + CACHE_AREA_SIZE + } + + /// Initialize the shared memory segment. This runs once in postmaster. Returns a struct which + /// will be inherited by all processes through fork. 
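The two access structs above split along process roles: the communicator worker holds the single write handle, while every backend gets a read-only one. Roughly (a sketch; init_in_worker and init_in_backend stand for each process's own copy of the IntegratedCacheInitStruct inherited through fork, and last_lsn/file_cache come from the worker's startup path):

    // communicator worker: the single writer
    let cache = init_in_worker.worker_process_init(last_lsn, Some(file_cache));
    cache.remember_rel_size(&rel, nblocks);

    // backend process: read-only lookups, no round trip to the worker
    let cache = init_in_backend.backend_init();
    if let Some(nblocks) = cache.get_rel_size(&rel) {
        // served locally
    }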
+ pub fn shmem_init(_max_procs: u32, shmem_area: &'t mut [u8]) -> IntegratedCacheInitStruct<'t> { + assert!(shmem_area.len() > std::mem::size_of::()); + + let mut ptr = shmem_area.as_mut_ptr(); + let shmem_data_ptr; + let len_used; + unsafe { + ptr = ptr.byte_add(ptr.align_offset(align_of::())); + shmem_data_ptr = ptr.cast::(); + ptr = ptr.byte_add(std::mem::size_of::()); + len_used = ptr.byte_offset_from(shmem_area.as_mut_ptr()) as usize; + }; + assert!(len_used < shmem_area.len()); + + let area_ptr = ptr; + let area_size = shmem_area.len() - len_used; + + let cache_area: &mut [u8] = unsafe { std::slice::from_raw_parts_mut(area_ptr, area_size) }; + let allocator = neonart::Allocator::new(cache_area); + + // Initialize the shared memory area + let shmem_data = unsafe { + *shmem_data_ptr = IntegratedCacheShmemData { allocator }; + &*shmem_data_ptr + }; + + let tree_handle = TreeInitStruct::new(&shmem_data.allocator); + + IntegratedCacheInitStruct { + shmem_data, + handle: tree_handle, + } + } + + pub fn worker_process_init( + self, + lsn: Lsn, + file_cache: Option, + ) -> IntegratedCacheWriteAccess<'t> { + let IntegratedCacheInitStruct { + shmem_data: _shmem, + handle, + } = self; + let tree_writer = handle.attach_writer(); + + IntegratedCacheWriteAccess { + cache_tree: tree_writer, + global_lw_lsn: lsn, + file_cache, + } + } + + pub fn backend_init(self) -> IntegratedCacheReadAccess<'t> { + let IntegratedCacheInitStruct { + shmem_data: _shmem, + handle, + } = self; + + let tree_reader = handle.attach_reader(); + + IntegratedCacheReadAccess { + cache_tree: tree_reader, + } + } +} + +#[derive(Clone)] +enum TreeEntry { + Rel(RelEntry), + Block(BlockEntry), +} + +#[derive(Clone)] +struct BlockEntry { + lw_lsn: Lsn, + cache_block: Option, +} + +#[derive(Clone, Default)] +struct RelEntry { + /// cached size of the relation + nblocks: Option, +} + +#[derive( + Clone, + Debug, + PartialEq, + PartialOrd, + Eq, + Ord, + zerocopy_derive::IntoBytes, + zerocopy_derive::Immutable, +)] +#[repr(packed)] +struct TreeKey { + spc_oid: u32, + db_oid: u32, + rel_number: u32, + fork_number: u8, + block_number: u32, +} + +impl From<&RelTag> for TreeKey { + fn from(val: &RelTag) -> TreeKey { + TreeKey { + spc_oid: val.spc_oid, + db_oid: val.db_oid, + rel_number: val.rel_number, + fork_number: val.fork_number, + block_number: u32::MAX, + } + } +} + +impl From<(&RelTag, u32)> for TreeKey { + fn from(val: (&RelTag, u32)) -> TreeKey { + TreeKey { + spc_oid: val.0.spc_oid, + db_oid: val.0.db_oid, + rel_number: val.0.rel_number, + fork_number: val.0.fork_number, + block_number: val.1, + } + } +} + +impl neonart::Key for TreeKey { + const KEY_LEN: usize = 4 + 4 + 4 + 1 + 32; + + fn as_bytes(&self) -> &[u8] { + zerocopy::IntoBytes::as_bytes(self) + } +} + +impl neonart::Value for TreeEntry {} + +/// Return type used in the cache's get_*() functions. 'Found' means that the page, or other +/// information that was enqueried, exists in the cache. ' +pub enum CacheResult { + /// The enqueried page or other information existed in the cache. + Found(V), + + /// The cache doesn't contain the page (or other enqueried information, like relation size). The + /// Lsn is the 'not_modified_since' LSN that should be used in the request to the pageserver to + /// read the page. 
+ NotFound(Lsn), +} + +impl<'t> IntegratedCacheWriteAccess<'t> { + pub fn get_rel_size(&'t self, rel: &RelTag) -> CacheResult { + let r = self.cache_tree.start_read(); + if let Some(nblocks) = get_rel_size(&r, rel) { + CacheResult::Found(nblocks) + } else { + CacheResult::NotFound(self.global_lw_lsn) + } + } + + pub async fn get_page( + &'t self, + rel: &RelTag, + block_number: u32, + dst: impl uring_common::buf::IoBufMut + Send + Sync, + ) -> Result, std::io::Error> { + let r = self.cache_tree.start_read(); + if let Some(block_tree_entry) = r.get(&TreeKey::from((rel, block_number))) { + let block_entry = if let TreeEntry::Block(e) = block_tree_entry { + e + } else { + panic!("unexpected tree entry type for block key"); + }; + + if let Some(cache_block) = block_entry.cache_block { + self.file_cache + .as_ref() + .unwrap() + .read_block(cache_block, dst) + .await?; + Ok(CacheResult::Found(())) + } else { + Ok(CacheResult::NotFound(block_entry.lw_lsn)) + } + } else { + Ok(CacheResult::NotFound(self.global_lw_lsn)) + } + } + + pub async fn page_is_cached( + &'t self, + rel: &RelTag, + block_number: u32, + ) -> Result, std::io::Error> { + let r = self.cache_tree.start_read(); + if let Some(block_tree_entry) = r.get(&TreeKey::from((rel, block_number))) { + let block_entry = if let TreeEntry::Block(e) = block_tree_entry { + e + } else { + panic!("unexpected tree entry type for block key"); + }; + + if let Some(_cache_block) = block_entry.cache_block { + Ok(CacheResult::Found(())) + } else { + Ok(CacheResult::NotFound(block_entry.lw_lsn)) + } + } else { + Ok(CacheResult::NotFound(self.global_lw_lsn)) + } + } + + /// Does the relation exists? CacheResult::NotFound means that the cache doesn't contain that + /// information, i.e. we don't know if the relation exists or not. + pub fn get_rel_exists(&'t self, rel: &RelTag) -> CacheResult { + // we don't currently cache negative entries, so if the relation is in the cache, it exists + let r = self.cache_tree.start_read(); + if let Some(_rel_entry) = r.get(&TreeKey::from(rel)) { + CacheResult::Found(true) + } else { + CacheResult::NotFound(self.global_lw_lsn) + } + } + + pub fn get_db_size(&'t self, _db_oid: u32) -> CacheResult { + // fixme: is this right lsn? + CacheResult::NotFound(self.global_lw_lsn) + } + + pub fn remember_rel_size(&'t self, rel: &RelTag, nblocks: u32) { + let mut w = self.cache_tree.start_write(); + + w.insert( + &TreeKey::from(rel), + TreeEntry::Rel(RelEntry { + nblocks: Some(nblocks), + }), + ); + } + + /// Remember the given page contents in the cache. 
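The intended consumption pattern for CacheResult, as used by the worker's request handlers later in this patch (sketch; the enclosing handler returns a NeonIOResult):

    let not_modified_since = match self.cache.get_rel_size(&rel) {
        CacheResult::Found(nblocks) => return NeonIOResult::RelSize(nblocks),
        CacheResult::NotFound(lsn) => lsn,
    };
    // cache miss: ask the pageserver, passing not_modified_since in the request's RequestCommon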
+ pub async fn remember_page( + &'t self, + rel: &RelTag, + block_number: u32, + src: impl uring_common::buf::IoBuf + Send + Sync, + lw_lsn: Lsn, + ) { + if let Some(file_cache) = self.file_cache.as_ref() { + let mut w = self.cache_tree.start_write(); + + let key = TreeKey::from((rel, block_number)); + + let mut cache_block = None; + + w.update_with_fn(&key, |existing| { + if let Some(existing) = existing { + let mut block_entry = if let TreeEntry::Block(e) = existing.clone() { + e + } else { + panic!("unexpected tree entry type for block key"); + }; + block_entry.lw_lsn = lw_lsn; + if block_entry.cache_block.is_none() { + block_entry.cache_block = Some(file_cache.alloc_block()); + } + cache_block = block_entry.cache_block; + Some(TreeEntry::Block(block_entry)) + } else { + cache_block = Some(file_cache.alloc_block()); + Some(TreeEntry::Block(BlockEntry { + lw_lsn: lw_lsn, + cache_block: cache_block, + })) + } + }); + let cache_block = cache_block.unwrap(); + file_cache + .write_block(cache_block, src) + .await + .expect("error writing to cache"); + } + } + + /// Forget information about given relation in the cache. (For DROP TABLE and such) + pub fn forget_rel(&'t self, rel: &RelTag) { + // FIXME: not implemented properly. smgrexists() would still return true for this + let mut w = self.cache_tree.start_write(); + w.insert( + &TreeKey::from(rel), + TreeEntry::Rel(RelEntry { nblocks: None }), + ); + } +} + +/// Read relation size from the cache. +/// +/// This is in a separate function so that it can be shared by +/// IntegratedCacheReadAccess::get_rel_size() and IntegratedCacheWriteAccess::get_rel_size() +fn get_rel_size<'t>(r: &neonart::TreeReadGuard, rel: &RelTag) -> Option { + if let Some(existing) = r.get(&TreeKey::from(rel)) { + let rel_entry = if let TreeEntry::Rel(e) = existing { + e + } else { + panic!("unexpected tree entry type for rel key"); + }; + + if let Some(nblocks) = rel_entry.nblocks { + Some(nblocks) + } else { + None + } + } else { + None + } +} + +/// Accessor for other backends +/// +/// This allows backends to read pages from the cache directly, on their own, without making a +/// request to the communicator process. +impl<'t> IntegratedCacheReadAccess<'t> { + pub fn get_rel_size(&'t self, rel: &RelTag) -> Option { + get_rel_size(&self.cache_tree.start_read(), rel) + } + + pub fn start_read_op(&'t self) -> BackendCacheReadOp<'t> { + let r = self.cache_tree.start_read(); + BackendCacheReadOp { read_guard: r } + } +} + +pub struct BackendCacheReadOp<'t> { + read_guard: neonart::TreeReadGuard<'t, TreeKey, TreeEntry>, +} + +impl<'e> BackendCacheReadOp<'e> { + /// Initiate a read of the page from the cache. + /// + /// This returns the "cache block number", i.e. the block number within the cache file, where + /// the page's contents is stored. To get the page contents, the caller needs to read that block + /// from the cache file. This returns a guard object that you must hold while it performs the + /// read. It's possible that while you are performing the read, the cache block is invalidated. + /// After you have completed the read, call BackendCacheReadResult::finish() to check if the + /// read was in fact valid or not. If it was concurrently invalidated, you need to retry. 
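Spelled out, the intended backend-side sequence is roughly the following (sketch; in the real backend the block read itself goes through C file APIs rather than Rust):

    let op = backend_cache.start_read_op();
    if let Some(cache_block) = op.get_page(&rel, blkno) {
        // ... read block `cache_block` from the cache file into the caller's buffer ...
        if !op.finish() {
            // concurrently invalidated: retry, or fall back to a pageserver request
        }
    }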
+ pub fn get_page(&self, rel: &RelTag, block_number: u32) -> Option { + if let Some(block_tree_entry) = self.read_guard.get(&TreeKey::from((rel, block_number))) { + let block_entry = if let TreeEntry::Block(e) = block_tree_entry { + e + } else { + panic!("unexpected tree entry type for block key"); + }; + + block_entry.cache_block + } else { + None + } + } + + pub fn finish(self) -> bool { + // TODO: currently, we use a spinlock to protect the in-memory tree, so concurrent + // invalidations are not possible. But the plan is to switch to optimistic locking, + // and once we do that, this would return 'false' if the optimistic locking failed and + // you need to retry. + true + } +} diff --git a/pgxn/neon/communicator/src/lib.rs b/pgxn/neon/communicator/src/lib.rs new file mode 100644 index 0000000000..3e4773983a --- /dev/null +++ b/pgxn/neon/communicator/src/lib.rs @@ -0,0 +1,25 @@ +//! +//! Three main parts: +//! - async tokio communicator core, which receives requests and processes them. +//! - Main loop and requests queues, which routes requests from backends to the core +//! - the per-backend glue code, which submits requests +//! + +mod backend_comms; + +// mark this 'pub', because these functions are called from C code. Otherwise, the compiler +// complains about a bunch of structs and enum variants being unused, because it thinkgs +// the functions that use them are never called. There are some C-callable functions in +// other modules too, but marking this as pub is currently enough to silence the warnings +// +// TODO: perhaps collect *all* the extern "C" functions to one module? +pub mod backend_interface; + +mod file_cache; +mod init; +mod integrated_cache; +mod neon_request; +mod worker_process; + +// FIXME get this from postgres headers somehow +pub const BLCKSZ: usize = 8192; diff --git a/pgxn/neon/communicator/src/neon_request.rs b/pgxn/neon/communicator/src/neon_request.rs new file mode 100644 index 0000000000..0c1293e663 --- /dev/null +++ b/pgxn/neon/communicator/src/neon_request.rs @@ -0,0 +1,346 @@ +type CLsn = u64; +type COid = u32; + +// This conveniently matches PG_IOV_MAX +pub const MAX_GETPAGEV_PAGES: usize = 32; + +use pageserver_data_api::model; + +#[repr(C)] +#[derive(Copy, Clone, Debug)] +pub enum NeonIORequest { + Empty, + + // Read requests. These are C-friendly variants of the corresponding structs in + // pageserver_data_api::model. + RelExists(CRelExistsRequest), + RelSize(CRelSizeRequest), + GetPageV(CGetPageVRequest), + PrefetchV(CPrefetchVRequest), + DbSize(CDbSizeRequest), + + // Write requests. These are needed to keep the relation size cache and LFC up-to-date. + // They are not sent to the pageserver. + WritePage(CWritePageRequest), + RelExtend(CRelExtendRequest), + RelZeroExtend(CRelZeroExtendRequest), + RelCreate(CRelCreateRequest), + RelTruncate(CRelTruncateRequest), + RelUnlink(CRelUnlinkRequest), +} + +#[repr(C)] +#[derive(Copy, Clone, Debug)] +pub enum NeonIOResult { + Empty, + RelExists(bool), + RelSize(u32), + + /// the result pages are written to the shared memory addresses given in the request + GetPageV, + + /// A prefetch request returns as soon as the request has been received by the communicator. + /// It is processed in the background. + PrefetchVLaunched, + + DbSize(u64), + + // FIXME design compact error codes. Can't easily pass a string or other dynamic data. 
+ // currently, this is 'errno' + Error(i32), + + Aborted, + + /// used for all write requests + WriteOK, +} + +#[repr(C)] +#[derive(Copy, Clone, Debug)] +pub struct CCachedGetPageVResult { + pub cache_block_numbers: [u64; MAX_GETPAGEV_PAGES], +} + +/// ShmemBuf represents a buffer in shared memory. +/// +/// SAFETY: The pointer must point to an area in shared memory. The functions allow you to liberally +/// get a mutable pointer to the contents; it is the caller's responsibility to ensure that you +/// don't access a buffer that's you're not allowed to. Inappropriate access to the buffer doesn't +/// violate Rust's safety semantics, but it will mess up and crash Postgres. +/// +#[repr(C)] +#[derive(Copy, Clone, Debug)] +pub struct ShmemBuf { + // These fields define where the result is written. Must point into a buffer in shared memory! + pub ptr: *mut u8, +} + +unsafe impl Send for ShmemBuf {} +unsafe impl Sync for ShmemBuf {} + +unsafe impl uring_common::buf::IoBuf for ShmemBuf { + fn stable_ptr(&self) -> *const u8 { + self.ptr + } + + fn bytes_init(&self) -> usize { + crate::BLCKSZ + } + + fn bytes_total(&self) -> usize { + crate::BLCKSZ + } +} + +unsafe impl uring_common::buf::IoBufMut for ShmemBuf { + fn stable_mut_ptr(&mut self) -> *mut u8 { + self.ptr + } + + unsafe fn set_init(&mut self, pos: usize) { + if pos > crate::BLCKSZ as usize { + panic!( + "set_init called past end of buffer, pos {}, buffer size {}", + pos, + crate::BLCKSZ + ); + } + } +} + +impl ShmemBuf { + pub fn as_mut_ptr(&self) -> *mut u8 { + self.ptr + } +} + +#[repr(C)] +#[derive(Copy, Clone, Debug)] +pub struct CRelExistsRequest { + pub spc_oid: COid, + pub db_oid: COid, + pub rel_number: u32, + pub fork_number: u8, +} + +#[repr(C)] +#[derive(Copy, Clone, Debug)] +pub struct CRelSizeRequest { + pub spc_oid: COid, + pub db_oid: COid, + pub rel_number: u32, + pub fork_number: u8, +} + +#[repr(C)] +#[derive(Copy, Clone, Debug)] +pub struct CGetPageVRequest { + pub spc_oid: COid, + pub db_oid: COid, + pub rel_number: u32, + pub fork_number: u8, + pub block_number: u32, + pub nblocks: u8, + + // These fields define where the result is written. Must point into a buffer in shared memory! + pub dest: [ShmemBuf; MAX_GETPAGEV_PAGES], +} + +#[repr(C)] +#[derive(Copy, Clone, Debug)] +pub struct CPrefetchVRequest { + pub spc_oid: COid, + pub db_oid: COid, + pub rel_number: u32, + pub fork_number: u8, + pub block_number: u32, + pub nblocks: u8, +} + +#[repr(C)] +#[derive(Copy, Clone, Debug)] +pub struct CDbSizeRequest { + pub db_oid: COid, + pub request_lsn: CLsn, +} + +#[repr(C)] +#[derive(Copy, Clone, Debug)] +pub struct CWritePageRequest { + pub spc_oid: COid, + pub db_oid: COid, + pub rel_number: u32, + pub fork_number: u8, + pub block_number: u32, + pub lsn: CLsn, + + // These fields define where the result is written. Must point into a buffer in shared memory! + pub src: ShmemBuf, +} + +#[repr(C)] +#[derive(Copy, Clone, Debug)] +pub struct CRelExtendRequest { + pub spc_oid: COid, + pub db_oid: COid, + pub rel_number: u32, + pub fork_number: u8, + pub block_number: u32, + pub lsn: CLsn, + + // These fields define page contents. Must point into a buffer in shared memory! 
+ pub src_ptr: usize, + pub src_size: u32, +} + +#[repr(C)] +#[derive(Copy, Clone, Debug)] +pub struct CRelZeroExtendRequest { + pub spc_oid: COid, + pub db_oid: COid, + pub rel_number: u32, + pub fork_number: u8, + pub block_number: u32, + pub nblocks: u32, + pub lsn: CLsn, +} + +#[repr(C)] +#[derive(Copy, Clone, Debug)] +pub struct CRelCreateRequest { + pub spc_oid: COid, + pub db_oid: COid, + pub rel_number: u32, + pub fork_number: u8, +} + +#[repr(C)] +#[derive(Copy, Clone, Debug)] +pub struct CRelTruncateRequest { + pub spc_oid: COid, + pub db_oid: COid, + pub rel_number: u32, + pub fork_number: u8, + pub nblocks: u32, +} + +#[repr(C)] +#[derive(Copy, Clone, Debug)] +pub struct CRelUnlinkRequest { + pub spc_oid: COid, + pub db_oid: COid, + pub rel_number: u32, + pub fork_number: u8, + pub block_number: u32, + pub nblocks: u32, +} + +impl CRelExistsRequest { + pub fn reltag(&self) -> model::RelTag { + model::RelTag { + spc_oid: self.spc_oid, + db_oid: self.db_oid, + rel_number: self.rel_number, + fork_number: self.fork_number, + } + } +} + +impl CRelSizeRequest { + pub fn reltag(&self) -> model::RelTag { + model::RelTag { + spc_oid: self.spc_oid, + db_oid: self.db_oid, + rel_number: self.rel_number, + fork_number: self.fork_number, + } + } +} + +impl CGetPageVRequest { + pub fn reltag(&self) -> model::RelTag { + model::RelTag { + spc_oid: self.spc_oid, + db_oid: self.db_oid, + rel_number: self.rel_number, + fork_number: self.fork_number, + } + } +} + +impl CPrefetchVRequest { + pub fn reltag(&self) -> model::RelTag { + model::RelTag { + spc_oid: self.spc_oid, + db_oid: self.db_oid, + rel_number: self.rel_number, + fork_number: self.fork_number, + } + } +} + +impl CWritePageRequest { + pub fn reltag(&self) -> model::RelTag { + model::RelTag { + spc_oid: self.spc_oid, + db_oid: self.db_oid, + rel_number: self.rel_number, + fork_number: self.fork_number, + } + } +} + +impl CRelExtendRequest { + pub fn reltag(&self) -> model::RelTag { + model::RelTag { + spc_oid: self.spc_oid, + db_oid: self.db_oid, + rel_number: self.rel_number, + fork_number: self.fork_number, + } + } +} + +impl CRelZeroExtendRequest { + pub fn reltag(&self) -> model::RelTag { + model::RelTag { + spc_oid: self.spc_oid, + db_oid: self.db_oid, + rel_number: self.rel_number, + fork_number: self.fork_number, + } + } +} + +impl CRelCreateRequest { + pub fn reltag(&self) -> model::RelTag { + model::RelTag { + spc_oid: self.spc_oid, + db_oid: self.db_oid, + rel_number: self.rel_number, + fork_number: self.fork_number, + } + } +} + +impl CRelTruncateRequest { + pub fn reltag(&self) -> model::RelTag { + model::RelTag { + spc_oid: self.spc_oid, + db_oid: self.db_oid, + rel_number: self.rel_number, + fork_number: self.fork_number, + } + } +} + +impl CRelUnlinkRequest { + pub fn reltag(&self) -> model::RelTag { + model::RelTag { + spc_oid: self.spc_oid, + db_oid: self.db_oid, + rel_number: self.rel_number, + fork_number: self.fork_number, + } + } +} diff --git a/pgxn/neon/communicator/src/worker_process/callbacks.rs b/pgxn/neon/communicator/src/worker_process/callbacks.rs new file mode 100644 index 0000000000..c3b3a8e3b5 --- /dev/null +++ b/pgxn/neon/communicator/src/worker_process/callbacks.rs @@ -0,0 +1,28 @@ +//! C callbacks to PostgreSQL facilities that the neon extension needs +//! to provide. These are implemented in `neon/pgxn/communicator_new.c`. +//! The function signatures better match! +//! +//! These are called from the communicator threads! Careful what you do, most +//! 
Postgres functions are not safe to call in that context. + +use utils::lsn::Lsn; + +unsafe extern "C" { + pub fn notify_proc_unsafe(procno: std::ffi::c_int); + pub fn callback_set_my_latch_unsafe(); + pub fn callback_get_request_lsn_unsafe() -> u64; +} + +// safe wrappers + +pub(super) fn notify_proc(procno: std::ffi::c_int) { + unsafe { notify_proc_unsafe(procno) }; +} + +pub(super) fn callback_set_my_latch() { + unsafe { callback_set_my_latch_unsafe() }; +} + +pub(super) fn get_request_lsn() -> Lsn { + Lsn(unsafe { callback_get_request_lsn_unsafe() }) +} diff --git a/pgxn/neon/communicator/src/worker_process/logging.rs b/pgxn/neon/communicator/src/worker_process/logging.rs new file mode 100644 index 0000000000..9eeb4340fa --- /dev/null +++ b/pgxn/neon/communicator/src/worker_process/logging.rs @@ -0,0 +1,229 @@ +//! Glue code to hook up Rust logging, with the `tracing` crate, to the PostgreSQL log +//! +//! In the Rust threads, the log messages are written to a mpsc Channel, and the Postgres +//! process latch is raised. That wakes up the loop in the main thread. It reads the +//! message from the channel and ereport()s it. This ensures that only one thread, the main +//! thread, calls the PostgreSQL logging routines at any time. + +use std::sync::mpsc::sync_channel; +use std::sync::mpsc::{Receiver, SyncSender}; +use std::sync::mpsc::{TryRecvError, TrySendError}; + +use tracing::info; +use tracing::{Event, Level, Metadata, Subscriber}; +use tracing_subscriber::filter::LevelFilter; +use tracing_subscriber::fmt::FmtContext; +use tracing_subscriber::fmt::FormatEvent; +use tracing_subscriber::fmt::FormatFields; +use tracing_subscriber::fmt::FormattedFields; +use tracing_subscriber::fmt::MakeWriter; +use tracing_subscriber::fmt::format::Writer; +use tracing_subscriber::registry::LookupSpan; + +use crate::worker_process::callbacks::callback_set_my_latch; + +pub struct LoggingState { + receiver: Receiver, +} + +/// Called once, at worker process startup. The returned LoggingState is passed back +/// in the subsequent calls to `pump_logging`. It is opaque to the C code. +#[unsafe(no_mangle)] +pub extern "C" fn configure_logging() -> Box { + let (sender, receiver) = sync_channel(1000); + + let maker = Maker { channel: sender }; + + use tracing_subscriber::prelude::*; + let r = tracing_subscriber::registry(); + + let r = r.with( + tracing_subscriber::fmt::layer() + .event_format(SimpleFormatter::new()) + .with_writer(maker) + // TODO: derive this from log_min_messages? + .with_filter(LevelFilter::from_level(Level::INFO)), + ); + r.init(); + + info!("communicator process logging started"); + + let state = LoggingState { receiver }; + + Box::new(state) +} + +/// Read one message from the logging queue. This is essentially a wrapper to Receiver, +/// with a C-friendly signature. +/// +/// The message is copied into *errbuf, which is a caller-supplied buffer of size `errbuf_len`. +/// If the message doesn't fit in the buffer, it is truncated. It is always NULL-terminated. +/// +/// The error level is returned *elevel_p. 
It's one of the PostgreSQL error levels, see elog.h +#[unsafe(no_mangle)] +pub extern "C" fn pump_logging( + state: &mut LoggingState, + errbuf: *mut u8, + errbuf_len: u32, + elevel_p: &mut i32, +) -> i32 { + let msg = match state.receiver.try_recv() { + Err(TryRecvError::Empty) => return 0, + Err(TryRecvError::Disconnected) => return -1, + Ok(msg) => msg, + }; + + let src: &[u8] = &msg.message; + let dst = errbuf; + let len = std::cmp::min(src.len(), errbuf_len as usize - 1); + unsafe { + std::ptr::copy_nonoverlapping(src.as_ptr(), dst, len); + *(errbuf.add(len)) = b'\0'; // NULL terminator + } + + // XXX: these levels are copied from PostgreSQL's elog.h. Introduce another enum + // to hide these? + *elevel_p = match msg.level { + Level::TRACE => 10, // DEBUG5 + Level::DEBUG => 14, // DEBUG1 + Level::INFO => 17, // INFO + Level::WARN => 19, // WARNING + Level::ERROR => 21, // ERROR + }; + 1 +} + +//---- The following functions can be called from any thread ---- + +#[derive(Clone)] +struct FormattedEventWithMeta { + message: Vec, + level: tracing::Level, +} + +impl Default for FormattedEventWithMeta { + fn default() -> Self { + FormattedEventWithMeta { + message: Vec::new(), + level: tracing::Level::DEBUG, + } + } +} + +struct EventBuilder<'a> { + event: FormattedEventWithMeta, + + maker: &'a Maker, +} + +impl<'a> std::io::Write for EventBuilder<'a> { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + self.event.message.write(buf) + } + fn flush(&mut self) -> std::io::Result<()> { + self.maker.send_event(self.event.clone()); + Ok(()) + } +} + +impl<'a> Drop for EventBuilder<'a> { + fn drop(&mut self) { + let maker = self.maker; + let event = std::mem::take(&mut self.event); + + maker.send_event(event); + } +} + +struct Maker { + channel: SyncSender, +} + +impl<'a> MakeWriter<'a> for Maker { + type Writer = EventBuilder<'a>; + + fn make_writer(&'a self) -> Self::Writer { + panic!("not expected to be called when make_writer_for is implemented"); + } + + fn make_writer_for(&'a self, meta: &Metadata<'_>) -> Self::Writer { + EventBuilder { + event: FormattedEventWithMeta { + message: Vec::new(), + level: *meta.level(), + }, + maker: self, + } + } +} + +impl Maker { + fn send_event(&self, e: FormattedEventWithMeta) { + match self.channel.try_send(e) { + Ok(()) => { + // notify the main thread + callback_set_my_latch(); + } + Err(TrySendError::Disconnected(_)) => {} + Err(TrySendError::Full(_)) => { + // TODO: record that some messages were lost + } + } + } +} + +/// Simple formatter implementation for tracing_subscriber, which prints the log +/// spans and message part like the default formatter, but no timestamp or error +/// level. The error level is captured separately by `FormattedEventWithMeta', +/// and when the error is printed by the main thread, with PostgreSQL ereport(), +/// it gets a timestamp at that point. (The timestamp printed will therefore lag +/// behind the timestamp on the event here, if the main thread doesn't process +/// the log message promptly) +struct SimpleFormatter; + +impl FormatEvent for SimpleFormatter +where + S: Subscriber + for<'a> LookupSpan<'a>, + N: for<'a> FormatFields<'a> + 'static, +{ + fn format_event( + &self, + ctx: &FmtContext<'_, S, N>, + mut writer: Writer<'_>, + event: &Event<'_>, + ) -> std::fmt::Result { + // Format all the spans in the event's span context. 
+ if let Some(scope) = ctx.event_scope() { + for span in scope.from_root() { + write!(writer, "{}", span.name())?; + + // `FormattedFields` is a formatted representation of the span's + // fields, which is stored in its extensions by the `fmt` layer's + // `new_span` method. The fields will have been formatted + // by the same field formatter that's provided to the event + // formatter in the `FmtContext`. + let ext = span.extensions(); + let fields = &ext + .get::>() + .expect("will never be `None`"); + + // Skip formatting the fields if the span had no fields. + if !fields.is_empty() { + write!(writer, "{{{}}}", fields)?; + } + write!(writer, ": ")?; + } + } + + // Write fields on the event + ctx.field_format().format_fields(writer.by_ref(), event)?; + + writeln!(writer) + } +} + +impl SimpleFormatter { + fn new() -> Self { + SimpleFormatter {} + } +} diff --git a/pgxn/neon/communicator/src/worker_process/main_loop.rs b/pgxn/neon/communicator/src/worker_process/main_loop.rs new file mode 100644 index 0000000000..284bf25376 --- /dev/null +++ b/pgxn/neon/communicator/src/worker_process/main_loop.rs @@ -0,0 +1,384 @@ +use std::collections::HashMap; +use std::path::PathBuf; + +use crate::backend_comms::NeonIOHandle; +use crate::file_cache::FileCache; +use crate::init::CommunicatorInitStruct; +use crate::integrated_cache::{CacheResult, IntegratedCacheWriteAccess}; +use crate::neon_request::{CGetPageVRequest, CPrefetchVRequest}; +use crate::neon_request::{NeonIORequest, NeonIOResult}; +use pageserver_client_grpc::PageserverClient; +use pageserver_data_api::model; + +use tokio::io::AsyncReadExt; +use tokio_epoll_uring::IoBuf; +use tokio_pipe::PipeRead; + +use super::callbacks::{get_request_lsn, notify_proc}; + +use tracing::{error, info, trace}; + +use utils::lsn::Lsn; + +pub struct CommunicatorWorkerProcessStruct<'a> { + neon_request_slots: &'a [NeonIOHandle], + + pageserver_client: PageserverClient, + + cache: IntegratedCacheWriteAccess<'a>, + + submission_pipe_read_raw_fd: i32, +} + +pub(super) async fn init( + cis: Box, + tenant_id: String, + timeline_id: String, + auth_token: Option, + shard_map: HashMap, + _file_cache_size: u64, + file_cache_path: Option, +) -> CommunicatorWorkerProcessStruct<'static> { + let last_lsn = get_request_lsn(); + + let uring_system = tokio_epoll_uring::System::launch().await.unwrap(); + + let file_cache = if let Some(path) = file_cache_path { + Some(FileCache::new(&path, uring_system).expect("could not create cache file")) + } else { + // FIXME: temporarily for testing, use LFC even if disabled + Some( + FileCache::new(&PathBuf::from("new_filecache"), uring_system) + .expect("could not create cache file"), + ) + }; + + // Initialize subsystems + let cache = cis + .integrated_cache_init_struct + .worker_process_init(last_lsn, file_cache); + + let pageserver_client = PageserverClient::new(&tenant_id, &timeline_id, &auth_token, shard_map); + + let this = CommunicatorWorkerProcessStruct { + neon_request_slots: cis.neon_request_slots, + pageserver_client, + cache, + submission_pipe_read_raw_fd: cis.submission_pipe_read_fd, + }; + + this +} + +impl<'t> CommunicatorWorkerProcessStruct<'t> { + /// Main loop of the worker process. Receive requests from the backends and process them. 
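The doorbell between the backend glue and this loop is deliberately tiny: just the slot index, four bytes in native byte order, written to the submission pipe and read back here (a restatement of the existing behavior, tying the two ends together):

    // backend side (CommunicatorBackendStruct::submit_request above):
    //     nix::unistd::write(submission_pipe_write_fd, &request_idx.to_ne_bytes())
    // worker side (run() below): read exactly 4 bytes, then
    //     let request_idx = u32::from_ne_bytes(idxbuf);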
+ pub(super) async fn run(self: &'static Self) { + let mut idxbuf: [u8; 4] = [0; 4]; + + let mut submission_pipe_read = + PipeRead::from_raw_fd_checked(self.submission_pipe_read_raw_fd) + .expect("invalid pipe fd"); + + loop { + // Wait for a backend to ring the doorbell + + match submission_pipe_read.read(&mut idxbuf).await { + Ok(4) => {} + Ok(nbytes) => panic!("short read ({nbytes} bytes) on communicator pipe"), + Err(e) => panic!("error reading from communicator pipe: {e}"), + } + let request_idx = u32::from_ne_bytes(idxbuf); + + // Read the IO request from the slot indicated in the wakeup + let Some(slot) = + self.neon_request_slots[request_idx as usize].start_processing_request() + else { + // This currently should not happen. But if we have multiple threads picking up + // requests, and without waiting for the notifications, it could. + panic!("no request in slot"); + }; + + // Ok, we have ownership of this request now. We must process + // it now, there's no going back. + + //trace!("processing request {request_idx}: {request:?}"); + + // Spawn a separate task for every request. That's a little excessive for requests that + // can be quickly satisfied from the cache, but we expect that to be rare, because the + // requesting backend would have already checked the cache. + tokio::spawn(async { + let result = self.handle_request(slot.get_request()).await; + let owner_procno = slot.get_owner_procno(); + + // Ok, we have completed the IO. Mark the request as completed. After that, + // we no longer have ownership of the slot, and must not modify it. + slot.completed(result); + + // Notify the backend about the completion. (Note that the backend might see + // the completed status even before this; this is just a wakeup) + notify_proc(owner_procno); + }); + } + } + + fn request_common(&self, not_modified_since_lsn: Lsn) -> model::RequestCommon { + model::RequestCommon { + request_lsn: get_request_lsn(), + not_modified_since_lsn, + } + } + + async fn handle_request<'x>(self: &'static Self, req: &'x NeonIORequest) -> NeonIOResult { + match req { + NeonIORequest::Empty => { + error!("unexpected Empty IO request"); + NeonIOResult::Error(-1) + } + NeonIORequest::RelExists(req) => { + let rel = req.reltag(); + + let not_modified_since = match self.cache.get_rel_exists(&rel) { + CacheResult::Found(exists) => return NeonIOResult::RelExists(exists), + CacheResult::NotFound(lsn) => lsn, + }; + + match self + .pageserver_client + .process_rel_exists_request(&model::RelExistsRequest { + common: self.request_common(not_modified_since), + rel, + }) + .await + { + Ok(exists) => NeonIOResult::RelExists(exists), + Err(err) => { + info!("tonic error: {err:?}"); + NeonIOResult::Error(-1) + } + } + } + + NeonIORequest::RelSize(req) => { + let rel = req.reltag(); + + // Check the cache first + let not_modified_since = match self.cache.get_rel_size(&rel) { + CacheResult::Found(nblocks) => { + tracing::trace!("found relsize for {:?} in cache: {}", rel, nblocks); + return NeonIOResult::RelSize(nblocks); + } + CacheResult::NotFound(lsn) => lsn, + }; + + let common = self.request_common(not_modified_since); + match self + .pageserver_client + .process_rel_size_request(&model::RelSizeRequest { + common: common.clone(), + rel: rel.clone(), + }) + .await + { + Ok(nblocks) => { + // update the cache + tracing::info!("updated relsize for {:?} in cache: {}", rel, nblocks); + self.cache.remember_rel_size(&rel, nblocks); + + NeonIOResult::RelSize(nblocks) + } + Err(err) => { + info!("tonic error: {err:?}"); + 
NeonIOResult::Error(-1) + } + } + } + NeonIORequest::GetPageV(req) => match self.handle_get_pagev_request(req).await { + Ok(()) => NeonIOResult::GetPageV, + Err(errno) => NeonIOResult::Error(errno), + }, + NeonIORequest::PrefetchV(req) => { + let req = req.clone(); + tokio::spawn(async move { self.handle_prefetchv_request(&req).await }); + NeonIOResult::PrefetchVLaunched + } + NeonIORequest::DbSize(req) => { + // Check the cache first + let not_modified_since = match self.cache.get_db_size(req.db_oid) { + CacheResult::Found(db_size) => { + // get_page already copied the block content to the destination + return NeonIOResult::DbSize(db_size); + } + CacheResult::NotFound(lsn) => lsn, + }; + + match self + .pageserver_client + .process_dbsize_request(&model::DbSizeRequest { + common: self.request_common(not_modified_since), + db_oid: req.db_oid, + }) + .await + { + Ok(db_size) => NeonIOResult::DbSize(db_size), + Err(err) => { + info!("tonic error: {err:?}"); + NeonIOResult::Error(-1) + } + } + } + + // Write requests + NeonIORequest::WritePage(req) => { + // Also store it in the LFC while we still have it + let rel = req.reltag(); + self.cache + .remember_page(&rel, req.block_number, req.src, Lsn(req.lsn)) + .await; + NeonIOResult::WriteOK + } + NeonIORequest::RelExtend(req) => { + self.cache + .remember_rel_size(&req.reltag(), req.block_number + 1); + NeonIOResult::WriteOK + } + NeonIORequest::RelZeroExtend(req) => { + self.cache + .remember_rel_size(&req.reltag(), req.block_number + req.nblocks); + NeonIOResult::WriteOK + } + NeonIORequest::RelCreate(req) => { + self.cache.remember_rel_size(&req.reltag(), 0); + NeonIOResult::WriteOK + } + NeonIORequest::RelTruncate(req) => { + self.cache.remember_rel_size(&req.reltag(), req.nblocks); + NeonIOResult::WriteOK + } + NeonIORequest::RelUnlink(req) => { + self.cache.forget_rel(&req.reltag()); + NeonIOResult::WriteOK + } + } + } + + async fn handle_get_pagev_request(&'t self, req: &CGetPageVRequest) -> Result<(), i32> { + let rel = req.reltag(); + + // Check the cache first + let mut cache_misses = Vec::new(); + for i in 0..req.nblocks { + let blkno = req.block_number + i as u32; + let dest = req.dest[i as usize]; + let not_modified_since = match self.cache.get_page(&rel, blkno, dest).await { + Ok(CacheResult::Found(_)) => { + // get_page already copied the block content to the destination + trace!("found blk {} in rel {:?} in LFC ", blkno, rel); + continue; + } + Ok(CacheResult::NotFound(lsn)) => lsn, + Err(_io_error) => return Err(-1), // FIXME errno? + }; + cache_misses.push((blkno, not_modified_since, dest)); + } + if cache_misses.is_empty() { + return Ok(()); + } + let not_modified_since = cache_misses + .iter() + .map(|(_blkno, lsn, _dest)| *lsn) + .max() + .unwrap(); + + // TODO: Use batched protocol + for (blkno, _lsn, dest) in cache_misses.iter() { + match self + .pageserver_client + .get_page(&model::GetPageRequest { + common: self.request_common(not_modified_since), + rel: rel.clone(), + block_number: *blkno, + }) + .await + { + Ok(page_image) => { + // Write the received page image directly to the shared memory location + // that the backend requested. 
+ let src: &[u8] = page_image.as_ref(); + let len = std::cmp::min(src.len(), dest.bytes_total() as usize); + unsafe { + std::ptr::copy_nonoverlapping(src.as_ptr(), dest.as_mut_ptr(), len); + }; + + trace!("remembering blk {} in rel {:?} in LFC", blkno, rel); + + // Also store it in the LFC while we have it + self.cache + .remember_page(&rel, *blkno, page_image, not_modified_since) + .await; + } + Err(err) => { + info!("tonic error: {err:?}"); + return Err(-1); + } + } + } + Ok(()) + } + + async fn handle_prefetchv_request( + self: &'static Self, + req: &CPrefetchVRequest, + ) -> Result<(), i32> { + let rel = req.reltag(); + + // Check the cache first + let mut cache_misses = Vec::new(); + for i in 0..req.nblocks { + let blkno = req.block_number + i as u32; + let not_modified_since = match self.cache.page_is_cached(&rel, blkno).await { + Ok(CacheResult::Found(_)) => { + trace!("found blk {} in rel {:?} in LFC ", req.block_number, rel); + continue; + } + Ok(CacheResult::NotFound(lsn)) => lsn, + Err(_io_error) => return Err(-1), // FIXME errno? + }; + cache_misses.push((req.block_number, not_modified_since)); + } + if cache_misses.is_empty() { + return Ok(()); + } + let not_modified_since = cache_misses.iter().map(|(_blkno, lsn)| *lsn).max().unwrap(); + + // TODO: spawn separate tasks for these. Use the integrated cache to keep track of the + // in-flight requests + + // TODO: Use batched protocol + for (blkno, _lsn) in cache_misses.iter() { + match self + .pageserver_client + .get_page(&model::GetPageRequest { + common: self.request_common(not_modified_since), + rel: rel.clone(), + block_number: *blkno, + }) + .await + { + Ok(page_image) => { + trace!( + "prefetch completed, remembering blk {} in rel {:?} in LFC", + req.block_number, rel + ); + self.cache + .remember_page(&rel, req.block_number, page_image, not_modified_since) + .await; + } + Err(err) => { + info!("tonic error: {err:?}"); + return Err(-1); + } + } + } + Ok(()) + } +} diff --git a/pgxn/neon/communicator/src/worker_process/mod.rs b/pgxn/neon/communicator/src/worker_process/mod.rs new file mode 100644 index 0000000000..edd35bfdcc --- /dev/null +++ b/pgxn/neon/communicator/src/worker_process/mod.rs @@ -0,0 +1,11 @@ +//! This code runs in the communicator worker process. This provides +//! the glue code to: +//! +//! - launch the 'processor', +//! - receive IO requests from backends and pass them to the processor, +//! - write results back to backends. + +mod callbacks; +mod logging; +mod main_loop; +mod worker_interface; diff --git a/pgxn/neon/communicator/src/worker_process/worker_interface.rs b/pgxn/neon/communicator/src/worker_process/worker_interface.rs new file mode 100644 index 0000000000..49c78713b4 --- /dev/null +++ b/pgxn/neon/communicator/src/worker_process/worker_interface.rs @@ -0,0 +1,93 @@ +//! Functions called from the C code in the worker process + +use std::collections::HashMap; +use std::ffi::{CStr, c_char}; +use std::path::PathBuf; + +use tracing::error; + +use crate::init::CommunicatorInitStruct; +use crate::worker_process::main_loop; + +/// Launch the communicator's tokio tasks, which do most of the work. +/// +/// The caller has initialized the process as a regular PostgreSQL +/// background worker process. The shared memory segment used to +/// communicate with the backends has been allocated and initialized +/// earlier, at postmaster startup, in rcommunicator_shmem_init(). 
+#[unsafe(no_mangle)] +pub extern "C" fn communicator_worker_process_launch( + cis: Box, + tenant_id: *const c_char, + timeline_id: *const c_char, + auth_token: *const c_char, + shard_map: *mut *mut c_char, + nshards: u32, + file_cache_path: *const c_char, + file_cache_size: u64, +) { + // Convert the arguments into more convenient Rust types + let tenant_id = unsafe { CStr::from_ptr(tenant_id) }.to_str().unwrap(); + let timeline_id = unsafe { CStr::from_ptr(timeline_id) }.to_str().unwrap(); + let auth_token = { + if auth_token.is_null() { + None + } else { + let c_str = unsafe { CStr::from_ptr(auth_token) }; + Some(c_str.to_str().unwrap().to_string()) + } + }; + let file_cache_path = { + if file_cache_path.is_null() { + None + } else { + let c_str = unsafe { CStr::from_ptr(file_cache_path) }; + Some(PathBuf::from(c_str.to_str().unwrap())) + } + }; + let shard_map = parse_shard_map(nshards, shard_map); + + // start main loop + let runtime = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .thread_name("communicator thread") + .build() + .unwrap(); + + let worker_struct = runtime.block_on(main_loop::init( + cis, + tenant_id.to_string(), + timeline_id.to_string(), + auth_token, + shard_map, + file_cache_size, + file_cache_path, + )); + let worker_struct = Box::leak(Box::new(worker_struct)); + + let main_loop_handle = runtime.spawn(worker_struct.run()); + + runtime.spawn(async { + let err = main_loop_handle.await.unwrap_err(); + error!("error: {err:?}"); + }); + + // keep the runtime running after we exit this function + Box::leak(Box::new(runtime)); +} + +/// Convert the "shard map" from an array of C strings, indexed by shard no to a rust HashMap +fn parse_shard_map(nshards: u32, shard_map: *mut *mut c_char) -> HashMap { + let mut result: HashMap = HashMap::new(); + let mut p = shard_map; + + for i in 0..nshards { + let c_str = unsafe { CStr::from_ptr(*p) }; + + p = unsafe { p.add(1) }; + + let s = c_str.to_str().unwrap(); + result.insert(i as u16, s.into()); + } + result +} diff --git a/pgxn/neon/communicator_new.c b/pgxn/neon/communicator_new.c new file mode 100644 index 0000000000..87b26926eb --- /dev/null +++ b/pgxn/neon/communicator_new.c @@ -0,0 +1,953 @@ +/*------------------------------------------------------------------------- + * + * communicator_new.c + * Functions for communicating with remote pageservers. + * + * This is the "new" communicator. It consists of functions that + * are called from the smgr implementation, in pagestore_smgr.c. 
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/xlog.h" +#include "access/xlogdefs.h" +#if PG_VERSION_NUM >= 150000 +#include "access/xlogrecovery.h" +#endif +#include "access/xlog_internal.h" +#include "access/xlogutils.h" +#include "executor/instrument.h" +#include "miscadmin.h" +#include "postmaster/bgworker.h" +#include "postmaster/interrupt.h" +#include "replication/walsender.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/latch.h" +#include "storage/procarray.h" +#if PG_VERSION_NUM >= 170000 +#include "storage/procnumber.h" +#endif +#include "storage/spin.h" +#include "tcop/tcopprot.h" + +#include "communicator_new.h" +#include "neon.h" +#include "neon_perf_counters.h" +#include "pagestore_client.h" + +/* + * FIXME: these are in file_cache.h, but I don't want to #include that + * here. This code shouldn't be using the C file cache for anything else than + * the GUCs. + */ +extern int lfc_size_limit; +extern char *lfc_path; + + +/* the rust bindings, generated by cbindgen */ +#include "communicator/communicator_bindings.h" + +#define MaxProcs (MaxBackends + NUM_AUXILIARY_PROCS) + +static CommunicatorInitStruct *cis; +static CommunicatorBackendStruct *my_bs; + +static File cache_file = 0; + +typedef struct CommunicatorShmemPerBackendData +{ + /* + * Latch used to notify backend of IO completion. We cannot use the + * standard process latch (MyProc->latch) because we cannot clear that + * latch as part of the IO handling, or we might cause the caller to miss + * some other events. + */ + Latch io_completion_latch; + + /* + * Normally, when reading or writing pages from shared buffer cache, the + * worker process can operate directly on the shared buffer. But when + * working with a local buffer, we use this "bounce buffer" to pass the + * data to the worker process. + * + * TODO: That's slow, because it incurs an extra memory copy, and there's + * currently only one of these per backend, which means you can have only + * one such IO in progress at a time. + */ + PGIOAlignedBlock bounce_buffer; +} CommunicatorShmemPerBackendData; + +typedef struct CommunicatorShmemData +{ + int dummy; + + CommunicatorShmemPerBackendData backends[]; /* MaxProcs */ + + /* rust-managed shmem area follows at next MAXALIGN boundary */ +} CommunicatorShmemData; + +static CommunicatorShmemData *communicator_shmem_ptr; + +#define MyIOCompletionLatch (&communicator_shmem_ptr->backends[MyProcNumber].io_completion_latch) + +static slock_t in_elog; + +#define MAX_INFLIGHT_ASYNC_REQUESTS 5 + +/* request indexes of (prefetch) requests that have been started */ +static int inflight_requests[MAX_INFLIGHT_ASYNC_REQUESTS]; +static int num_inflight_requests = 0; + +static int start_request(NeonIORequest *request, struct NeonIOResult *immediate_result_p); +static void wait_request_completion(int request_idx, struct NeonIOResult *result_p); +static void perform_request(NeonIORequest *request, struct NeonIOResult *result_p); +static void process_inflight_requests(void); + +static bool bounce_needed(void *buffer); +static void *bounce_buf(void); +static void *bounce_write_if_needed(void *buffer); + +PGDLLEXPORT void communicator_new_bgworker_main(Datum main_arg); +static void communicator_new_backend_exit(int code, Datum arg); + +/**** Initialization functions. 
These run in postmaster ****/ + +void +pg_init_communicator_new(void) +{ + BackgroundWorker bgw; + + /* Initialize the background worker process */ + memset(&bgw, 0, sizeof(bgw)); + bgw.bgw_flags = BGWORKER_SHMEM_ACCESS; + bgw.bgw_start_time = BgWorkerStart_PostmasterStart; + snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon"); + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "communicator_new_bgworker_main"); + snprintf(bgw.bgw_name, BGW_MAXLEN, "Storage communicator process"); + snprintf(bgw.bgw_type, BGW_MAXLEN, "Storage communicator process"); + bgw.bgw_restart_time = 5; + bgw.bgw_notify_pid = 0; + bgw.bgw_main_arg = (Datum) 0; + + RegisterBackgroundWorker(&bgw); + + SpinLockInit(&in_elog); +} + +static size_t +communicator_new_shmem_size(void) +{ + size_t size = 0; + + size += MAXALIGN( + offsetof(CommunicatorShmemData, backends) + + MaxProcs * sizeof(CommunicatorShmemPerBackendData) + ); + + /* space needed by the rust code */ + size += rcommunicator_shmem_size(MaxProcs); + + return size; +} + +void +communicator_new_shmem_request(void) +{ + RequestAddinShmemSpace(communicator_new_shmem_size()); +} + +void +communicator_new_shmem_startup(void) +{ + bool found; + int pipefd[2]; + int rc; + size_t communicator_size; + size_t shmem_size; + void *shmem_ptr; + + rc = pipe(pipefd); + if (rc != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg_internal("could not create pipe between neon communicator and backends : %m"))); + if (fcntl(pipefd[0], F_SETFL, O_NONBLOCK) == -1) + elog(FATAL, "fcntl(F_SETFL) failed on read-end of communicator pipe: %m"); + if (fcntl(pipefd[1], F_SETFL, O_NONBLOCK) == -1) + elog(FATAL, "fcntl(F_SETFL) failed on write-end of communicator pipe: %m"); + + shmem_size = communicator_new_shmem_size(); + shmem_ptr = ShmemInitStruct("Communicator shmem state", + shmem_size, + &found); + Assert(!found); + + /* Initialize the C-managed parts */ + communicator_shmem_ptr = (CommunicatorShmemData *) shmem_ptr; + communicator_size = MAXALIGN(offsetof(CommunicatorShmemData, backends) + MaxProcs * sizeof(CommunicatorShmemPerBackendData)); + shmem_ptr = (char *) shmem_ptr + communicator_size; + shmem_size -= communicator_size; + + for (int i = 0; i < MaxProcs; i++) + InitSharedLatch(&communicator_shmem_ptr->backends[i].io_completion_latch); + + /* Initialize the rust-managed parts */ + cis = rcommunicator_shmem_init(pipefd[0], pipefd[1], MaxProcs, shmem_ptr, shmem_size); +} + +/**** Worker process functions. These run in the communicator worker process ****/ + +/* Entry point for the communicator bgworker process */ +void +communicator_new_bgworker_main(Datum main_arg) +{ + char **connstrs; + shardno_t num_shards; + struct LoggingState *logging; + char errbuf[1000]; + int elevel; + + /* Establish signal handlers. 
*/
+	pqsignal(SIGUSR1, procsignal_sigusr1_handler);
+	pqsignal(SIGHUP, SignalHandlerForConfigReload);
+	pqsignal(SIGTERM, die);
+
+	BackgroundWorkerUnblockSignals();
+
+	get_shard_map(&connstrs, &num_shards);
+
+	logging = configure_logging();
+
+	communicator_worker_process_launch(
+		cis,
+		neon_tenant,
+		neon_timeline,
+		neon_auth_token,
+		connstrs,
+		num_shards,
+		lfc_path,
+		lfc_size_limit);
+	cis = NULL;
+
+	elog(LOG, "communicator threads started");
+	for (;;)
+	{
+		int32		rc;
+
+		CHECK_FOR_INTERRUPTS();
+
+		for (;;)
+		{
+			rc = pump_logging(logging, (uint8 *) errbuf, sizeof(errbuf), &elevel);
+			if (rc == 0)
+			{
+				/* nothing to do */
+				break;
+			}
+			else if (rc == 1)
+			{
+				/* Demote the level: we don't want the worker to exit on error */
+				if (elevel == ERROR)
+					elevel = LOG;
+				if (elevel == INFO)
+					elevel = LOG;
+				elog(elevel, "[COMMUNICATOR] %s", errbuf);
+			}
+			else if (rc == -1)
+			{
+				elog(ERROR, "logging channel was closed unexpectedly");
+			}
+		}
+
+		(void) WaitLatch(MyLatch,
+						 WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
+						 0,
+						 PG_WAIT_EXTENSION);
+		ResetLatch(MyLatch);
+	}
+}
+
+/*
+ * Callbacks from the rust code, in the communicator process.
+ *
+ * NOTE: These must be thread safe! Only a very limited set of PostgreSQL
+ * functions can be used here!
+ *
+ * NOTE: the signatures of these must match the Rust definitions!
+ */
+
+void
+notify_proc_unsafe(int procno)
+{
+	SetLatch(&communicator_shmem_ptr->backends[procno].io_completion_latch);
+}
+
+void
+callback_set_my_latch_unsafe(void)
+{
+	SetLatch(MyLatch);
+}
+
+/*
+ * FIXME: The logic from neon_get_request_lsns() needs to go here, except for
+ * the last-written LSN cache stuff, which is managed by the rust code now.
+ */
+uint64
+callback_get_request_lsn_unsafe(void)
+{
+	/*
+	 * NB: be very careful with what you do here! This is called from tokio
+	 * threads, so anything that tries to take LWLocks is unsafe, for example.
+	 *
+	 * RecoveryInProgress() is OK
+	 */
+	if (RecoveryInProgress())
+	{
+		XLogRecPtr	replay_lsn = GetXLogReplayRecPtr(NULL);
+
+		return replay_lsn;
+	}
+	else
+	{
+		XLogRecPtr	flushlsn;
+
+#if PG_VERSION_NUM >= 150000
+		flushlsn = GetFlushRecPtr(NULL);
+#else
+		flushlsn = GetFlushRecPtr();
+#endif
+
+		return flushlsn;
+	}
+}
+
+/**** Backend functions. These run in each backend ****/
+
+/* Initialize per-backend private state */
+void
+communicator_new_init(void)
+{
+	Assert(cis != NULL);
+	Assert(my_bs == NULL);
+
+	if (MyBgworkerEntry && strcmp(MyBgworkerEntry->bgw_function_name, "communicator_new_bgworker_main") == 0)
+		return;
+
+	OwnLatch(MyIOCompletionLatch);
+
+	my_bs = rcommunicator_backend_init(cis, MyProcNumber);
+	cis = NULL;
+
+	/*
+	 * Arrange to clean up at backend exit.
+	 */
+	on_shmem_exit(communicator_new_backend_exit, 0);
+}
+
+static void
+communicator_new_backend_exit(int code, Datum arg)
+{
+	DisownLatch(MyIOCompletionLatch);
+}
+
+/*
+ * prefetch_register_bufferv() - register and prefetch buffers
+ *
+ * Register that we may want the contents of BufferTag in the near future.
+ * This is used when issuing a speculative prefetch request, but also when
+ * performing a synchronous request and we need the buffer right now.
+ *
+ * When performing a prefetch rather than a synchronous request,
+ * is_prefetch==true. Currently, it only affects how the request is accounted
+ * in the perf counters.
+ *
+ * NOTE: this function may indirectly update MyPState->pfs_hash; which
+ * invalidates any active pointers into the hash table.
+ */ +void +communicator_new_prefetch_register_bufferv(NRelFileInfo rinfo, ForkNumber forkNum, + BlockNumber blockno, BlockNumber nblocks) +{ + int request_idx; + NeonIORequest request = { + .tag = NeonIORequest_PrefetchV, + .prefetch_v = { + .spc_oid = NInfoGetSpcOid(rinfo), + .db_oid = NInfoGetDbOid(rinfo), + .rel_number = NInfoGetRelNumber(rinfo), + .fork_number = forkNum, + .block_number = blockno, + .nblocks = nblocks, + } + }; + struct NeonIOResult result; + + elog(LOG, "prefetch called for rel %u/%u/%u.%u block %u (%u blocks)", + RelFileInfoFmt(rinfo), forkNum, blockno, nblocks); + + if (num_inflight_requests >= MAX_INFLIGHT_ASYNC_REQUESTS) + process_inflight_requests(); + + request_idx = bcomm_start_io_request(my_bs, &request, &result); + if (request_idx == -1) + { + /* -1 means the request was satisfied immediately. */ + /* FIXME: check and log errors */ + return; + } + inflight_requests[num_inflight_requests] = request_idx; + num_inflight_requests++; + + elog(LOG, "sent prefetch request with idx %d", request_idx); +} + +static void +process_inflight_requests(void) +{ + struct NeonIOResult result; + + /* FIXME: log errors */ + for (int i = 0; i < num_inflight_requests; i++) + wait_request_completion(inflight_requests[i], &result); + num_inflight_requests = 0; +} + +/* + * Perform an IO request in a synchronous fashion. + * + * Returns a pointer to the result slot. It is valid until the next time a + * request is submitted. + */ +static void +perform_request(NeonIORequest * request, struct NeonIOResult *result_p) +{ + int request_idx; + + process_inflight_requests(); + + request_idx = start_request(request, result_p); + if (request_idx == -1) + { + /* it was completed immediately */ + return; + } + wait_request_completion(request_idx, result_p); +} + +static int +start_request(NeonIORequest * request, struct NeonIOResult *immediate_result_p) +{ + int request_idx; + + request_idx = bcomm_start_io_request(my_bs, request, immediate_result_p); + if (request_idx == -1) + { + /* -1 means the request was satisfied immediately. */ + return -1; + } + elog(DEBUG5, "sent request with idx %d: tag %d", request_idx, request->tag); + return request_idx; +} + +static void +wait_request_completion(int request_idx, struct NeonIOResult *result_p) +{ + int32_t poll_res; + + /* fixme: check 'request_idx' ? */ + + for (;;) + { + ResetLatch(MyIOCompletionLatch); + + poll_res = bcomm_poll_request_completion(my_bs, request_idx, result_p); + if (poll_res == -1) + { + CHECK_FOR_INTERRUPTS(); + + /* + * TODO: wake up periodically for CHECK_FOR_INTERRUPTS(). Because + * we wait on MyIOCompletionLatch rather than MyLatch, we won't be + * woken up for the standard interrupts. + */ + (void) WaitLatch(MyIOCompletionLatch, + WL_EXIT_ON_PM_DEATH | WL_LATCH_SET, + 0, + WAIT_EVENT_NEON_PS_STARTING); + continue; /* still busy */ + } + else if (poll_res == 0) + { + return; + } + else + { + elog(ERROR, "unexpected return code from bcomm_poll_request_completion()"); + } + } +} + +/* + * Does the physical file exist? 
+ */ +bool +communicator_new_rel_exists(NRelFileInfo rinfo, ForkNumber forkNum) +{ + NeonIORequest request = { + .tag = NeonIORequest_RelExists, + .rel_exists = { + .spc_oid = NInfoGetSpcOid(rinfo), + .db_oid = NInfoGetDbOid(rinfo), + .rel_number = NInfoGetRelNumber(rinfo), + .fork_number = forkNum, + } + }; + NeonIOResult result; + + perform_request(&request, &result); + switch (result.tag) + { + case NeonIOResult_RelExists: + return result.rel_exists; + case NeonIOResult_Error: + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not check existence of rel %u/%u/%u.%u: %s", + RelFileInfoFmt(rinfo), forkNum, pg_strerror(result.error)))); + break; + default: + elog(ERROR, "unexpected result for RelExists operation: %d", result.tag); + break; + } +} + +/* + * Read N consecutive pages from a relation + */ +void +communicator_new_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blockno, + void **buffers, BlockNumber nblocks) +{ + NeonIOResult result; + CCachedGetPageVResult cached_result; + void *bounce_buf_used = NULL; + int request_idx; + NeonIORequest request = { + .tag = NeonIORequest_GetPageV, + .get_page_v = { + .spc_oid = NInfoGetSpcOid(rinfo), + .db_oid = NInfoGetDbOid(rinfo), + .rel_number = NInfoGetRelNumber(rinfo), + .fork_number = forkNum, + .block_number = blockno, + .nblocks = nblocks, + } + }; + + elog(LOG, "getpagev called for rel %u/%u/%u.%u block %u (%u blocks)", + RelFileInfoFmt(rinfo), forkNum, blockno, nblocks); + + /* Fill in the destination buffers in the request */ + if (nblocks == 1) + { + if (bounce_needed(buffers[0])) + { + bounce_buf_used = bounce_buf(); + request.get_page_v.dest[0].ptr = bounce_buf_used; + } + else + request.get_page_v.dest[0].ptr = buffers[0]; + } + else + { + for (int i = 0; i < nblocks; i++) + { + if (bounce_needed(buffers[i])) + { + /* Split the vector-request into single page requests */ + for (int j = 0; j < nblocks; j++) + { + communicator_new_read_at_lsnv(rinfo, forkNum, blockno + j, + &buffers[j], 1); + } + return; + } + request.get_page_v.dest[i].ptr = buffers[i]; + } + } + + process_inflight_requests(); + +retry: + request_idx = bcomm_start_get_page_v_request(my_bs, &request, &cached_result); + if (request_idx == -1) + { + bool completed; + + /* + * LFC hit, but we are responsible for completing the I/O on the local + * file + */ + if (cache_file == 0) + cache_file = PathNameOpenFile(lfc_path, O_RDONLY | PG_BINARY); + + for (int i = 0; i < nblocks; i++) + { + uint64_t cached_block = cached_result.cache_block_numbers[i]; + ssize_t bytes_total = 0; + + while (bytes_total < BLCKSZ) + { + ssize_t nbytes; + + nbytes = FileRead(cache_file, ((char *) buffers[i]) + bytes_total, BLCKSZ - bytes_total, cached_block * BLCKSZ + bytes_total, WAIT_EVENT_NEON_LFC_READ); + if (nbytes == -1) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read block %lu in local cache file: %m", + cached_block))); + bytes_total += nbytes; + } + } + completed = bcomm_finish_cache_read(my_bs); + if (!completed) + { + elog(DEBUG1, "read from local cache file was superseded by concurrent update"); + goto retry; + } + return; + } + + wait_request_completion(request_idx, &result); + switch (result.tag) + { + case NeonIOResult_GetPageV: + if (bounce_buf_used) + memcpy(buffers[0], bounce_buf_used, BLCKSZ); + return; + case NeonIOResult_Error: + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read block %u in rel %u/%u/%u.%u: %s", + blockno, RelFileInfoFmt(rinfo), forkNum, pg_strerror(result.error)))); + break; + 
default: + elog(ERROR, "unexpected result for GetPage operation: %d", result.tag); + break; + } +} + +/* + * neon_nblocks() -- Get the number of blocks stored in a relation. + */ +BlockNumber +communicator_new_rel_nblocks(NRelFileInfo rinfo, ForkNumber forkNum) +{ + NeonIORequest request = { + .tag = NeonIORequest_RelSize, + .rel_size = { + .spc_oid = NInfoGetSpcOid(rinfo), + .db_oid = NInfoGetDbOid(rinfo), + .rel_number = NInfoGetRelNumber(rinfo), + .fork_number = forkNum, + } + }; + NeonIOResult result; + + perform_request(&request, &result); + switch (result.tag) + { + case NeonIOResult_RelSize: + return result.rel_size; + case NeonIOResult_Error: + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read size of rel %u/%u/%u.%u: %s", + RelFileInfoFmt(rinfo), forkNum, pg_strerror(result.error)))); + break; + default: + elog(ERROR, "unexpected result for RelSize operation: %d", result.tag); + break; + } +} + +/* + * neon_db_size() -- Get the size of the database in bytes. + */ +int64 +communicator_new_dbsize(Oid dbNode) +{ + NeonIORequest request = { + .tag = NeonIORequest_DbSize, + .db_size = { + .db_oid = dbNode, + } + }; + NeonIOResult result; + + perform_request(&request, &result); + switch (result.tag) + { + case NeonIOResult_DbSize: + return (int64) result.db_size; + case NeonIOResult_Error: + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read database size of database %u: %s", + dbNode, pg_strerror(result.error)))); + break; + default: + elog(ERROR, "unexpected result for DbSize operation: %d", result.tag); + break; + } +} + +int +communicator_new_read_slru_segment(SlruKind kind, int64 segno, void *buffer) +{ + /* TODO */ + elog(ERROR, "not implemented"); +} + +/* Write requests */ +void +communicator_new_write_page(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blockno, + const void *buffer, XLogRecPtr lsn) +{ + void *src = bounce_write_if_needed((void *) buffer); + NeonIORequest request = { + .tag = NeonIORequest_WritePage, + .write_page = { + .spc_oid = NInfoGetSpcOid(rinfo), + .db_oid = NInfoGetDbOid(rinfo), + .rel_number = NInfoGetRelNumber(rinfo), + .fork_number = forkNum, + .block_number = blockno, + .lsn = lsn, + .src.ptr = src, + } + }; + NeonIOResult result; + + perform_request(&request, &result); + switch (result.tag) + { + case NeonIOResult_WriteOK: + return; + case NeonIOResult_Error: + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write block %u in rel %u/%u/%u.%u: %s", + blockno, RelFileInfoFmt(rinfo), forkNum, pg_strerror(result.error)))); + break; + default: + elog(ERROR, "unexpected result for WritePage operation: %d", result.tag); + break; + } +} + +void +communicator_new_rel_extend(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blockno, + const void *buffer, XLogRecPtr lsn) +{ + void *src = bounce_write_if_needed((void *) buffer); + NeonIORequest request = { + .tag = NeonIORequest_RelExtend, + .rel_extend = { + .spc_oid = NInfoGetSpcOid(rinfo), + .db_oid = NInfoGetDbOid(rinfo), + .rel_number = NInfoGetRelNumber(rinfo), + .fork_number = forkNum, + .block_number = blockno, + .lsn = lsn, + .src_ptr = (uintptr_t) src, + .src_size = BLCKSZ, + } + }; + NeonIOResult result; + + perform_request(&request, &result); + switch (result.tag) + { + case NeonIOResult_WriteOK: + return; + case NeonIOResult_Error: + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not extend to block %u in rel %u/%u/%u.%u: %s", + blockno, RelFileInfoFmt(rinfo), forkNum, pg_strerror(result.error)))); + break; + default: + 
elog(ERROR, "unexpected result for Extend operation: %d", result.tag); + break; + } +} + +void +communicator_new_rel_zeroextend(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blockno, + BlockNumber nblocks, XLogRecPtr lsn) +{ + NeonIORequest request = { + .tag = NeonIORequest_RelZeroExtend, + .rel_zero_extend = { + .spc_oid = NInfoGetSpcOid(rinfo), + .db_oid = NInfoGetDbOid(rinfo), + .rel_number = NInfoGetRelNumber(rinfo), + .fork_number = forkNum, + .block_number = blockno, + .nblocks = nblocks, + .lsn = lsn, + } + }; + NeonIOResult result; + + perform_request(&request, &result); + switch (result.tag) + { + case NeonIOResult_WriteOK: + return; + case NeonIOResult_Error: + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not zeroextend to block %u in rel %u/%u/%u.%u: %s", + blockno, RelFileInfoFmt(rinfo), forkNum, pg_strerror(result.error)))); + break; + default: + elog(ERROR, "unexpected result for ZeroExtend operation: %d", result.tag); + break; + } +} + +void +communicator_new_rel_create(NRelFileInfo rinfo, ForkNumber forkNum) +{ + NeonIORequest request = { + .tag = NeonIORequest_RelCreate, + .rel_create = { + .spc_oid = NInfoGetSpcOid(rinfo), + .db_oid = NInfoGetDbOid(rinfo), + .rel_number = NInfoGetRelNumber(rinfo), + .fork_number = forkNum, + } + }; + NeonIOResult result; + + perform_request(&request, &result); + switch (result.tag) + { + case NeonIOResult_WriteOK: + return; + case NeonIOResult_Error: + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create rel %u/%u/%u.%u: %s", + RelFileInfoFmt(rinfo), forkNum, pg_strerror(result.error)))); + break; + default: + elog(ERROR, "unexpected result for Create operation: %d", result.tag); + break; + } +} + +void +communicator_new_rel_truncate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks) +{ + NeonIORequest request = { + .tag = NeonIORequest_RelTruncate, + .rel_truncate = { + .spc_oid = NInfoGetSpcOid(rinfo), + .db_oid = NInfoGetDbOid(rinfo), + .rel_number = NInfoGetRelNumber(rinfo), + .fork_number = forkNum, + .nblocks = nblocks, + } + }; + NeonIOResult result; + + perform_request(&request, &result); + switch (result.tag) + { + case NeonIOResult_WriteOK: + return; + case NeonIOResult_Error: + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not truncate rel %u/%u/%u.%u to %u blocks: %s", + RelFileInfoFmt(rinfo), forkNum, nblocks, pg_strerror(result.error)))); + break; + default: + elog(ERROR, "unexpected result for Truncate operation: %d", result.tag); + break; + } +} + +void +communicator_new_rel_unlink(NRelFileInfo rinfo, ForkNumber forkNum) +{ + NeonIORequest request = { + .tag = NeonIORequest_RelUnlink, + .rel_unlink = { + .spc_oid = NInfoGetSpcOid(rinfo), + .db_oid = NInfoGetDbOid(rinfo), + .rel_number = NInfoGetRelNumber(rinfo), + .fork_number = forkNum, + } + }; + NeonIOResult result; + + perform_request(&request, &result); + switch (result.tag) + { + case NeonIOResult_WriteOK: + return; + case NeonIOResult_Error: + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not unlink rel %u/%u/%u.%u: %s", + RelFileInfoFmt(rinfo), forkNum, pg_strerror(result.error)))); + break; + default: + elog(ERROR, "unexpected result for Unlink operation: %d", result.tag); + break; + } +} + +/* + * The worker process can read / write shared buffers directly. But if smgrread() or + * smgrwrite() is called with a private temporary buffer, we need to copy it to the + * "bounce buffer", to make it available fro the worker process. 
+ */ +static bool +bounce_needed(void *buffer) +{ + if ((uintptr_t) buffer >= (uintptr_t) BufferBlocks && + (uintptr_t) buffer < (uintptr_t) BufferBlocks + NBuffers * BLCKSZ) + { + return false; + } + return true; +} + +static void * +bounce_buf(void) +{ + return &communicator_shmem_ptr->backends[MyProcNumber].bounce_buffer; +} + +static void * +bounce_write_if_needed(void *buffer) +{ + void *p; + + if (!bounce_needed(buffer)) + return buffer; + + p = bounce_buf(); + memcpy(p, buffer, BLCKSZ); + return p; +} diff --git a/pgxn/neon/communicator_new.h b/pgxn/neon/communicator_new.h new file mode 100644 index 0000000000..43dc1ad793 --- /dev/null +++ b/pgxn/neon/communicator_new.h @@ -0,0 +1,54 @@ +/*------------------------------------------------------------------------- + * + * communicator_new.h + * new implementation + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + *------------------------------------------------------------------------- + */ +#ifndef COMMUNICATOR_NEW_H +#define COMMUNICATOR_NEW_H + +#include "neon_pgversioncompat.h" + +#include "storage/buf_internals.h" + +#include "pagestore_client.h" + +/* initialization at postmaster startup */ +extern void pg_init_communicator_new(void); +extern void communicator_new_shmem_request(void); +extern void communicator_new_shmem_startup(void); + +/* initialization at backend startup */ +extern void communicator_new_init(void); + +/* Read requests */ +extern bool communicator_new_rel_exists(NRelFileInfo rinfo, ForkNumber forkNum); +extern BlockNumber communicator_new_rel_nblocks(NRelFileInfo rinfo, ForkNumber forknum); +extern int64 communicator_new_dbsize(Oid dbNode); +extern void communicator_new_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, + BlockNumber base_blockno, + void **buffers, BlockNumber nblocks); +extern void communicator_new_prefetch_register_bufferv(NRelFileInfo rinfo, ForkNumber forkNum, + BlockNumber blockno, + BlockNumber nblocks); +extern int communicator_new_read_slru_segment(SlruKind kind, int64 segno, + void *buffer); + +/* Write requests, to keep the caches up-to-date */ +extern void communicator_new_write_page(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blockno, + const void *buffer, XLogRecPtr lsn); +extern void communicator_new_rel_extend(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blockno, + const void *buffer, XLogRecPtr lsn); +extern void communicator_new_rel_zeroextend(NRelFileInfo rinfo, ForkNumber forkNum, + BlockNumber blockno, BlockNumber nblocks, + XLogRecPtr lsn); +extern void communicator_new_rel_create(NRelFileInfo rinfo, ForkNumber forkNum); +extern void communicator_new_rel_truncate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks); +extern void communicator_new_rel_unlink(NRelFileInfo rinfo, ForkNumber forkNum); + +#endif /* COMMUNICATOR_NEW_H */ diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index e2c1f7682f..9f06fb4da8 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -164,10 +164,10 @@ static HTAB *lfc_hash; static int lfc_desc = -1; static LWLockId lfc_lock; static int lfc_max_size; -static int lfc_size_limit; +int lfc_size_limit; static int lfc_chunk_size_log = MAX_BLOCKS_PER_CHUNK_LOG; static int lfc_blocks_per_chunk = MAX_BLOCKS_PER_CHUNK; -static char *lfc_path; +char *lfc_path; static uint64 lfc_generation; static FileCacheControl *lfc_ctl; static shmem_startup_hook_type prev_shmem_startup_hook; diff --git 
a/pgxn/neon/file_cache.h b/pgxn/neon/file_cache.h index 849558b83d..8c31738484 100644 --- a/pgxn/neon/file_cache.h +++ b/pgxn/neon/file_cache.h @@ -15,6 +15,8 @@ /* GUCs */ extern bool lfc_store_prefetch_result; +extern int lfc_size_limit; +extern char *lfc_path; /* functions for local file cache */ extern void lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index ccb072d6f9..12a5e87e7b 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -279,6 +279,55 @@ AssignPageserverConnstring(const char *newval, void *extra) } } +/* Return a copy of the whole shard map from shared memory */ +void +get_shard_map(char ***connstrs_p, shardno_t *num_shards_p) +{ + uint64 begin_update_counter; + uint64 end_update_counter; + ShardMap *shard_map = &pagestore_shared->shard_map; + shardno_t num_shards; + char *buf; + char **connstrs; + + buf = palloc(MAX_SHARDS*MAX_PAGESERVER_CONNSTRING_SIZE); + connstrs = palloc(sizeof(char *) * MAX_SHARDS); + + /* + * Postmaster can update the shared memory values concurrently, in which + * case we would copy a garbled mix of the old and new values. We will + * detect it because the counter's won't match, and retry. But it's + * important that we don't do anything within the retry-loop that would + * depend on the string having valid contents. + */ + do + { + char *p; + + begin_update_counter = pg_atomic_read_u64(&pagestore_shared->begin_update_counter); + end_update_counter = pg_atomic_read_u64(&pagestore_shared->end_update_counter); + + num_shards = shard_map->num_shards; + + p = buf; + for (int i = 0; i < Min(num_shards, MAX_SHARDS); i++) + { + strlcpy(p, shard_map->connstring[i], MAX_PAGESERVER_CONNSTRING_SIZE); + connstrs[i] = p; + elog(LOG, "XX: connstrs[%d]: %p", i, p); + p += MAX_PAGESERVER_CONNSTRING_SIZE; + } + + pg_memory_barrier(); + } + while (begin_update_counter != end_update_counter + || begin_update_counter != pg_atomic_read_u64(&pagestore_shared->begin_update_counter) + || end_update_counter != pg_atomic_read_u64(&pagestore_shared->end_update_counter)); + + *connstrs_p = connstrs; + *num_shards_p = num_shards; +} + /* * Get the current number of shards, and/or the connection string for a * particular shard from the shard map in shared memory. 
diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index a6a7021756..a6ad45f1ea 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -20,6 +20,7 @@ #include "replication/logicallauncher.h" #include "replication/slot.h" #include "replication/walsender.h" +#include "storage/ipc.h" #include "storage/proc.h" #include "funcapi.h" #include "access/htup_details.h" @@ -29,6 +30,7 @@ #include "utils/guc_tables.h" #include "communicator.h" +#include "communicator_new.h" #include "extension_server.h" #include "file_cache.h" #include "neon.h" @@ -45,13 +47,17 @@ PG_MODULE_MAGIC; void _PG_init(void); +bool neon_enable_new_communicator; static int running_xacts_overflow_policy; -#if PG_MAJORVERSION_NUM >= 16 static shmem_startup_hook_type prev_shmem_startup_hook; - -static void neon_shmem_startup_hook(void); +#if PG_VERSION_NUM>=150000 +static shmem_request_hook_type prev_shmem_request_hook; #endif + +static void neon_shmem_request(void); +static void neon_shmem_startup_hook(void); + #if PG_MAJORVERSION_NUM >= 17 uint32 WAIT_EVENT_NEON_LFC_MAINTENANCE; uint32 WAIT_EVENT_NEON_LFC_READ; @@ -430,17 +436,36 @@ _PG_init(void) */ #if PG_VERSION_NUM >= 160000 load_file("$libdir/neon_rmgr", false); +#endif prev_shmem_startup_hook = shmem_startup_hook; shmem_startup_hook = neon_shmem_startup_hook; +#if PG_VERSION_NUM>=150000 + prev_shmem_request_hook = shmem_request_hook; + shmem_request_hook = neon_shmem_request; +#else + neon_shmem_request(); #endif + DefineCustomBoolVariable( + "neon.enable_new_communicator", + "Enables new communicator implementation", + NULL, + &neon_enable_new_communicator, + true, + PGC_POSTMASTER, + 0, + NULL, NULL, NULL); + pg_init_libpagestore(); lfc_init(); pg_init_walproposer(); init_lwlsncache(); pg_init_communicator(); + if (neon_enable_new_communicator) + pg_init_communicator_new(); + Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; InitUnstableExtensionsSupport(); @@ -559,7 +584,17 @@ backpressure_throttling_time(PG_FUNCTION_ARGS) PG_RETURN_UINT64(BackpressureThrottlingTime()); } -#if PG_MAJORVERSION_NUM >= 16 +static void +neon_shmem_request(void) +{ +#if PG_VERSION_NUM>=150000 + if (prev_shmem_request_hook) + prev_shmem_request_hook(); +#endif + + communicator_new_shmem_request(); +} + static void neon_shmem_startup_hook(void) { @@ -579,5 +614,6 @@ neon_shmem_startup_hook(void) WAIT_EVENT_NEON_PS_READ = WaitEventExtensionNew("Neon/PS_ReadIO"); WAIT_EVENT_NEON_WAL_DL = WaitEventExtensionNew("Neon/WAL_Download"); #endif + + communicator_new_shmem_startup(); } -#endif diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index a2e81feb5f..792e9fa2ff 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -13,6 +13,7 @@ #include "utils/wait_event.h" /* GUCs */ +extern bool neon_enable_new_communicator; extern char *neon_auth_token; extern char *neon_timeline; extern char *neon_tenant; diff --git a/pgxn/neon/neon_pgversioncompat.h b/pgxn/neon/neon_pgversioncompat.h index b3ed0c04e8..f2d6292768 100644 --- a/pgxn/neon/neon_pgversioncompat.h +++ b/pgxn/neon/neon_pgversioncompat.h @@ -9,6 +9,10 @@ #include "fmgr.h" #include "storage/buf_internals.h" +#if PG_MAJORVERSION_NUM < 16 +typedef PGAlignedBlock PGIOAlignedBlock; +#endif + #if PG_MAJORVERSION_NUM < 17 #define NRelFileInfoBackendIsTemp(rinfo) (rinfo.backend != InvalidBackendId) #else @@ -154,6 +158,10 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode, #define AmAutoVacuumWorkerProcess() (IsAutoVacuumWorkerProcess()) #endif +#if PG_MAJORVERSION_NUM < 17 +#define MyProcNumber (MyProc - &ProcGlobal->allProcs[0]) 
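+/* PostgreSQL 17 and later define MyProcNumber natively; derive it from the PGPROC array on older versions */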
+#endif + #if PG_MAJORVERSION_NUM < 15 extern void InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags); #endif diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 9df202290d..e6b8486248 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -228,6 +228,7 @@ extern char *neon_tenant; extern int32 max_cluster_size; extern int neon_protocol_version; +extern void get_shard_map(char ***connstrs_p, shardno_t *num_shards_p); extern shardno_t get_shard_number(BufferTag* tag); extern const f_smgr *smgr_neon(ProcNumber backend, NRelFileInfo rinfo); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 3bf0bedf99..04369d1dbf 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -62,6 +62,7 @@ #include "bitmap.h" #include "communicator.h" +#include "communicator_new.h" #include "file_cache.h" #include "neon.h" #include "neon_lwlsncache.h" @@ -72,10 +73,6 @@ #include "access/xlogrecovery.h" #endif -#if PG_VERSION_NUM < 160000 -typedef PGAlignedBlock PGIOAlignedBlock; -#endif - /* * If DEBUG_COMPARE_LOCAL is defined, we pass through all the SMGR API * calls to md.c, and *also* do the calls to the Page Server. On every @@ -97,7 +94,7 @@ static char *hexdump_page(char *page); NInfoGetRelNumber(InfoFromSMgrRel(reln)) >= FirstNormalObjectId \ ) -const int SmgrTrace = DEBUG5; +const int SmgrTrace = DEBUG1; /* unlogged relation build states */ typedef enum @@ -779,10 +776,15 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) return false; } - neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, - REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1); + if (neon_enable_new_communicator) + return communicator_new_rel_exists(InfoFromSMgrRel(reln), forkNum); + else + { + neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, + REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1); - return communicator_exists(InfoFromSMgrRel(reln), forkNum, &request_lsns); + return communicator_exists(InfoFromSMgrRel(reln), forkNum, &request_lsns); + } } /* @@ -820,33 +822,40 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum); - /* - * Newly created relation is empty, remember that in the relsize cache. - * - * Note that in REDO, this is called to make sure the relation fork - * exists, but it does not truncate the relation. So, we can only update - * the relsize if it didn't exist before. - * - * Also, in redo, we must make sure to update the cached size of the - * relation, as that is the primary source of truth for REDO's file length - * considerations, and as file extension isn't (perfectly) logged, we need - * to take care of that before we hit file size checks. - * - * FIXME: This is currently not just an optimization, but required for - * correctness. Postgres can call smgrnblocks() on the newly-created - * relation. Currently, we don't call SetLastWrittenLSN() when a new - * relation created, so if we didn't remember the size in the relsize - * cache, we might call smgrnblocks() on the newly-created relation before - * the creation WAL record hass been received by the page server. 
- */ - if (isRedo) + if (neon_enable_new_communicator) { - update_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0); - get_cached_relsize(InfoFromSMgrRel(reln), forkNum, - &reln->smgr_cached_nblocks[forkNum]); + communicator_new_rel_create(InfoFromSMgrRel(reln), forkNum); } else - set_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0); + { + /* + * Newly created relation is empty, remember that in the relsize cache. + * + * Note that in REDO, this is called to make sure the relation fork + * exists, but it does not truncate the relation. So, we can only update + * the relsize if it didn't exist before. + * + * Also, in redo, we must make sure to update the cached size of the + * relation, as that is the primary source of truth for REDO's file length + * considerations, and as file extension isn't (perfectly) logged, we need + * to take care of that before we hit file size checks. + * + * FIXME: This is currently not just an optimization, but required for + * correctness. Postgres can call smgrnblocks() on the newly-created + * relation. Currently, we don't call SetLastWrittenLSN() when a new + * relation created, so if we didn't remember the size in the relsize + * cache, we might call smgrnblocks() on the newly-created relation before + * the creation WAL record hass been received by the page server. + */ + if (isRedo) + { + update_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0); + get_cached_relsize(InfoFromSMgrRel(reln), forkNum, + &reln->smgr_cached_nblocks[forkNum]); + } + else + set_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0); + } #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -968,34 +977,43 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, forkNum, blkno, (uint32) (lsn >> 32), (uint32) lsn); - lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer); + if (neon_enable_new_communicator) + { + // FIXME: this can pass lsn == invalid. Is that ok? + communicator_new_rel_extend(InfoFromSMgrRel(reln), forkNum, blkno, (const void *) buffer, lsn); + } + else + { + lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer); #ifdef DEBUG_COMPARE_LOCAL - if (IS_LOCAL_REL(reln)) - mdextend(reln, forkNum, blkno, buffer, skipFsync); + if (IS_LOCAL_REL(reln)) + mdextend(reln, forkNum, blkno, buffer, skipFsync); #endif - /* - * smgr_extend is often called with an all-zeroes page, so - * lsn==InvalidXLogRecPtr. An smgr_write() call will come for the buffer - * later, after it has been initialized with the real page contents, and - * it is eventually evicted from the buffer cache. But we need a valid LSN - * to the relation metadata update now. - */ - if (lsn == InvalidXLogRecPtr) - { - lsn = GetXLogInsertRecPtr(); - neon_set_lwlsn_block(lsn, InfoFromSMgrRel(reln), forkNum, blkno); + /* + * smgr_extend is often called with an all-zeroes page, so + * lsn==InvalidXLogRecPtr. An smgr_write() call will come for the buffer + * later, after it has been initialized with the real page contents, and + * it is eventually evicted from the buffer cache. But we need a valid LSN + * to the relation metadata update now. 
+ */ + if (lsn == InvalidXLogRecPtr) + { + lsn = GetXLogInsertRecPtr(); + neon_set_lwlsn_block(lsn, InfoFromSMgrRel(reln), forkNum, blkno); + } + neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum); } - neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum); } #if PG_MAJORVERSION_NUM >= 16 static void -neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, +neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber start_block, int nblocks, bool skipFsync) { const PGIOAlignedBlock buffer = {0}; + BlockNumber blocknum = start_block; int remblocks = nblocks; XLogRecPtr lsn = 0; @@ -1092,8 +1110,15 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, Assert(lsn != 0); - neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum); - set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blocknum); + if (neon_enable_new_communicator) + { + communicator_new_rel_zeroextend(InfoFromSMgrRel(reln), forkNum, start_block, nblocks, lsn); + } + else + { + neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum); + set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blocknum); + } } #endif @@ -1153,11 +1178,17 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } + if (neon_enable_new_communicator) + { + communicator_new_prefetch_register_bufferv(InfoFromSMgrRel(reln), forknum, blocknum, nblocks); + return false; + } + tag.spcOid = reln->smgr_rlocator.locator.spcOid; tag.dbOid = reln->smgr_rlocator.locator.dbOid; tag.relNumber = reln->smgr_rlocator.locator.relNumber; tag.forkNum = forknum; - + while (nblocks > 0) { int iterblocks = Min(nblocks, PG_IOV_MAX); @@ -1179,7 +1210,8 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, blocknum += iterblocks; } - communicator_prefetch_pump_state(false); + if (!neon_enable_new_communicator) + communicator_prefetch_pump_state(false); return false; } @@ -1216,9 +1248,13 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln)); - communicator_prefetch_register_bufferv(tag, NULL, 1, NULL); + if (neon_enable_new_communicator) + communicator_new_prefetch_register_bufferv(InfoFromSMgrRel(reln), forknum, blocknum, 1); + else + communicator_prefetch_register_bufferv(tag, NULL, 1, NULL); - communicator_prefetch_pump_state(false); + if (!neon_enable_new_communicator) + communicator_prefetch_pump_state(false); return false; } @@ -1262,7 +1298,8 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, */ neon_log(SmgrTrace, "writeback noop"); - communicator_prefetch_pump_state(false); + if (!neon_enable_new_communicator) + communicator_prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -1278,7 +1315,14 @@ void neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, neon_request_lsns request_lsns, void *buffer) { - communicator_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL); + if (neon_enable_new_communicator) + { + // FIXME: request_lsns is ignored. That affects the neon_test_utils callers. + // Add the capability to specify the LSNs explicitly, for the sake of neon_test_utils ? 
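+		// (The new communicator instead determines the request LSN on the Rust
+		// side, using the callback_get_request_lsn_unsafe() callback.)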
+ communicator_new_read_at_lsnv(rinfo, forkNum, blkno, &buffer, 1); + } + else + communicator_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL); } #if PG_MAJORVERSION_NUM < 17 @@ -1296,6 +1340,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer neon_request_lsns request_lsns; bits8 present; void *bufferp; + bool prefetch_hit; switch (reln->smgr_relpersistence) { @@ -1314,33 +1359,62 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - /* Try to read PS results if they are available */ - communicator_prefetch_pump_state(false); - - neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1); - present = 0; bufferp = buffer; - if (communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present)) + + if (neon_enable_new_communicator) { - /* Prefetch hit */ - return; + communicator_new_read_at_lsnv(InfoFromSMgrRel(reln), forkNum, blkno, + (void *) &buffer, 1); } - - /* Try to read from local file cache */ - if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer)) + else { - MyNeonCounters->file_cache_hits_total++; - return; + prefetch_hit = communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present); + if (prefetch_hit) + { + /* Prefetch hit */ + return; + } + + /* Try to read from local file cache */ + if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer)) + { + MyNeonCounters->file_cache_hits_total++; + return; + } + + /* + * Try to receive prefetch results once again just to make sure we + * don't leave the smgr code while the OS might still have buffered + * bytes. + */ + communicator_prefetch_pump_state(false); + + neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1); + + prefetch_hit = communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present); + + if (prefetch_hit) + { + /* Prefetch hit */ + return; + } + + /* Try to read from local file cache */ + if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer)) + { + MyNeonCounters->file_cache_hits_total++; + return; + } + + neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer); + + /* + * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. + */ + communicator_prefetch_pump_state(false); } - neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer); - - /* - * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. 
- */ - communicator_prefetch_pump_state(false); - #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) { @@ -1449,38 +1523,47 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, nblocks, PG_IOV_MAX); /* Try to read PS results if they are available */ - communicator_prefetch_pump_state(false); - - neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum, - request_lsns, nblocks); + if (!neon_enable_new_communicator) + communicator_prefetch_pump_state(false); memset(read_pages, 0, sizeof(read_pages)); - prefetch_result = communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forknum, - blocknum, request_lsns, nblocks, - buffers, read_pages); + if (neon_enable_new_communicator) + { + communicator_new_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, + buffers, nblocks); + } + else + { + neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum, + request_lsns, nblocks); + + prefetch_result = communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forknum, + blocknum, request_lsns, nblocks, + buffers, read_pages); - if (prefetch_result == nblocks) - return; + if (prefetch_result == nblocks) + return; - /* Try to read from local file cache */ - lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers, - nblocks, read_pages); + /* Try to read from local file cache */ + lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers, + nblocks, read_pages); - if (lfc_result > 0) - MyNeonCounters->file_cache_hits_total += lfc_result; + if (lfc_result > 0) + MyNeonCounters->file_cache_hits_total += lfc_result; - /* Read all blocks from LFC, so we're done */ - if (prefetch_result + lfc_result == nblocks) - return; + /* Read all blocks from LFC, so we're done */ + if (prefetch_result + lfc_result == nblocks) + return; - communicator_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, - buffers, nblocks, read_pages); + communicator_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, + buffers, nblocks, read_pages); - /* - * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. - */ - communicator_prefetch_pump_state(false); + /* + * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. 
+ */ + communicator_prefetch_pump_state(false); + } #ifdef DEBUG_COMPARE_LOCAL if (forknum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) @@ -1663,9 +1746,16 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo forknum, blocknum, (uint32) (lsn >> 32), (uint32) lsn); - lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer); + if (neon_enable_new_communicator) + { + communicator_new_write_page(InfoFromSMgrRel(reln), forknum, blocknum, buffer, lsn); + } + else + { + lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer); - communicator_prefetch_pump_state(false); + communicator_prefetch_pump_state(false); + } #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -1725,9 +1815,21 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, neon_wallog_pagev(reln, forknum, blkno, nblocks, (const char **) buffers, false); - lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks); + if (neon_enable_new_communicator) + { + for (int i = 0; i < nblocks; i++) + { + XLogRecPtr lsn = PageGetLSN((Page) buffers[i]); - communicator_prefetch_pump_state(false); + communicator_new_write_page(InfoFromSMgrRel(reln), forknum, blkno + i, buffers[i], lsn); + } + } + else + { + lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks); + + communicator_prefetch_pump_state(false); + } #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -1763,19 +1865,26 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - if (get_cached_relsize(InfoFromSMgrRel(reln), forknum, &n_blocks)) + if (neon_enable_new_communicator) { - neon_log(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks", - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, n_blocks); - return n_blocks; + n_blocks = communicator_new_rel_nblocks(InfoFromSMgrRel(reln), forknum); } + else + { + if (get_cached_relsize(InfoFromSMgrRel(reln), forknum, &n_blocks)) + { + neon_log(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks", + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum, n_blocks); + return n_blocks; + } - neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, - REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1); + neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, + REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1); - n_blocks = communicator_nblocks(InfoFromSMgrRel(reln), forknum, &request_lsns); - update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks); + n_blocks = communicator_nblocks(InfoFromSMgrRel(reln), forknum, &request_lsns); + update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks); + } neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", RelFileInfoFmt(InfoFromSMgrRel(reln)), @@ -1796,10 +1905,17 @@ neon_dbsize(Oid dbNode) neon_request_lsns request_lsns; NRelFileInfo dummy_node = {0}; - neon_get_request_lsns(dummy_node, MAIN_FORKNUM, - REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1); + if (neon_enable_new_communicator) + { + db_size = communicator_new_dbsize(dbNode); + } + else + { + neon_get_request_lsns(dummy_node, MAIN_FORKNUM, + REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1); - db_size = communicator_dbsize(dbNode, &request_lsns); + db_size = communicator_dbsize(dbNode, &request_lsns); + } neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes", dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size); @@ -1813,8 +1929,6 @@ neon_dbsize(Oid dbNode) static void neon_truncate(SMgrRelation reln, ForkNumber forknum, 
BlockNumber old_blocks, BlockNumber nblocks) { - XLogRecPtr lsn; - switch (reln->smgr_relpersistence) { case 0: @@ -1833,34 +1947,43 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, Blo neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - set_cached_relsize(InfoFromSMgrRel(reln), forknum, nblocks); + if (neon_enable_new_communicator) + { + communicator_new_rel_truncate(InfoFromSMgrRel(reln), forknum, nblocks); + } + else + { + XLogRecPtr lsn; - /* - * Truncating a relation drops all its buffers from the buffer cache - * without calling smgrwrite() on them. But we must account for that in - * our tracking of last-written-LSN all the same: any future smgrnblocks() - * request must return the new size after the truncation. We don't know - * what the LSN of the truncation record was, so be conservative and use - * the most recently inserted WAL record's LSN. - */ - lsn = GetXLogInsertRecPtr(); - lsn = nm_adjust_lsn(lsn); + set_cached_relsize(InfoFromSMgrRel(reln), forknum, nblocks); - /* - * Flush it, too. We don't actually care about it here, but let's uphold - * the invariant that last-written LSN <= flush LSN. - */ - XLogFlush(lsn); + /* + * Truncating a relation drops all its buffers from the buffer cache + * without calling smgrwrite() on them. But we must account for that in + * our tracking of last-written-LSN all the same: any future smgrnblocks() + * request must return the new size after the truncation. We don't know + * what the LSN of the truncation record was, so be conservative and use + * the most recently inserted WAL record's LSN. + */ + lsn = GetXLogInsertRecPtr(); + lsn = nm_adjust_lsn(lsn); - /* - * Truncate may affect several chunks of relations. So we should either - * update last written LSN for all of them, or update LSN for "dummy" - * metadata block. Second approach seems more efficient. If the relation - * is extended again later, the extension will update the last-written LSN - * for the extended pages, so there's no harm in leaving behind obsolete - * entries for the truncated chunks. - */ - neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forknum); + /* + * Flush it, too. We don't actually care about it here, but let's uphold + * the invariant that last-written LSN <= flush LSN. + */ + XLogFlush(lsn); + + /* + * Truncate may affect several chunks of relations. So we should either + * update last written LSN for all of them, or update LSN for "dummy" + * metadata block. Second approach seems more efficient. If the relation + * is extended again later, the extension will update the last-written LSN + * for the extended pages, so there's no harm in leaving behind obsolete + * entries for the truncated chunks. 
+ */ + neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forknum); + } #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -1902,7 +2025,8 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum) neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop"); - communicator_prefetch_pump_state(false); + if (!neon_enable_new_communicator) + communicator_prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -2173,7 +2297,10 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf request_lsns.not_modified_since = not_modified_since; request_lsns.effective_request_lsn = request_lsn; - n_blocks = communicator_read_slru_segment(kind, segno, &request_lsns, buffer); + if (neon_enable_new_communicator) + n_blocks = communicator_new_read_slru_segment(kind, segno, buffer); + else + n_blocks = communicator_read_slru_segment(kind, segno, &request_lsns, buffer); return n_blocks; } @@ -2210,7 +2337,8 @@ AtEOXact_neon(XactEvent event, void *arg) } break; } - communicator_reconfigure_timeout_if_needed(); + if (!neon_enable_new_communicator) + communicator_reconfigure_timeout_if_needed(); } static const struct f_smgr neon_smgr = @@ -2268,7 +2396,10 @@ smgr_init_neon(void) smgr_init_standard(); neon_init(); - communicator_init(); + if (neon_enable_new_communicator) + communicator_new_init(); + else + communicator_init(); } @@ -2280,6 +2411,12 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, /* This is only used in WAL replay */ Assert(RecoveryInProgress()); + if (neon_enable_new_communicator) + { + // FIXME: broken, but this is only used in replica + elog(ERROR, "not implemented yet"); + } + /* Extend the relation if we know its size */ if (get_cached_relsize(rinfo, forknum, &relsize)) {