New communicator, with "integrated" cache accessible from all processes

Heikki Linnakangas
2025-04-29 11:52:44 +03:00
parent 11f6044338
commit e58d0fece1
57 changed files with 8418 additions and 385 deletions

Cargo.lock generated

@@ -253,6 +253,17 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8ab6b55fe97976e46f91ddbed8d147d966475dc29b2032757ba47e02376fbc3"
[[package]]
name = "atomic_enum"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "99e1aca718ea7b89985790c94aad72d77533063fe00bc497bb79a7c2dae6a661"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
]
[[package]]
name = "autocfg"
version = "1.1.0"
@@ -687,13 +698,40 @@ dependencies = [
"tracing",
]
[[package]]
name = "axum"
version = "0.7.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f"
dependencies = [
"async-trait",
"axum-core 0.4.5",
"bytes",
"futures-util",
"http 1.1.0",
"http-body 1.0.0",
"http-body-util",
"itoa",
"matchit 0.7.3",
"memchr",
"mime",
"percent-encoding",
"pin-project-lite",
"rustversion",
"serde",
"sync_wrapper 1.0.1",
"tower 0.5.2",
"tower-layer",
"tower-service",
]
[[package]]
name = "axum"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d6fd624c75e18b3b4c6b9caf42b1afe24437daaee904069137d8bab077be8b8"
dependencies = [
"axum-core",
"axum-core 0.5.0",
"base64 0.22.1",
"bytes",
"form_urlencoded",
@@ -704,7 +742,7 @@ dependencies = [
"hyper 1.4.1",
"hyper-util",
"itoa",
"matchit",
"matchit 0.8.4",
"memchr",
"mime",
"percent-encoding",
@@ -724,6 +762,26 @@ dependencies = [
"tracing",
]
[[package]]
name = "axum-core"
version = "0.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199"
dependencies = [
"async-trait",
"bytes",
"futures-util",
"http 1.1.0",
"http-body 1.0.0",
"http-body-util",
"mime",
"pin-project-lite",
"rustversion",
"sync_wrapper 1.0.1",
"tower-layer",
"tower-service",
]
[[package]]
name = "axum-core"
version = "0.5.0"
@@ -750,8 +808,8 @@ version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "460fc6f625a1f7705c6cf62d0d070794e94668988b1c38111baeec177c715f7b"
dependencies = [
"axum",
"axum-core",
"axum 0.8.1",
"axum-core 0.5.0",
"bytes",
"futures-util",
"headers",
@@ -1086,6 +1144,25 @@ version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
[[package]]
name = "cbindgen"
version = "0.28.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eadd868a2ce9ca38de7eeafdcec9c7065ef89b42b32f0839278d55f35c54d1ff"
dependencies = [
"clap",
"heck 0.4.1",
"indexmap 2.9.0",
"log",
"proc-macro2",
"quote",
"serde",
"serde_json",
"syn 2.0.100",
"tempfile",
"toml",
]
[[package]]
name = "cc"
version = "1.2.16"
@@ -1206,7 +1283,7 @@ version = "4.5.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab"
dependencies = [
"heck",
"heck 0.5.0",
"proc-macro2",
"quote",
"syn 2.0.100",
@@ -1264,13 +1341,40 @@ dependencies = [
"unicode-width",
]
[[package]]
name = "communicator"
version = "0.1.0"
dependencies = [
"atomic_enum",
"bytes",
"cbindgen",
"http 1.1.0",
"libc",
"neonart",
"nix 0.27.1",
"pageserver_client_grpc",
"pageserver_data_api",
"prost 0.13.3",
"thiserror 1.0.69",
"tokio",
"tokio-epoll-uring",
"tokio-pipe",
"tonic",
"tracing",
"tracing-subscriber",
"uring-common",
"utils",
"zerocopy 0.8.24",
"zerocopy-derive 0.8.24",
]
[[package]]
name = "compute_api"
version = "0.1.0"
dependencies = [
"anyhow",
"chrono",
"indexmap 2.0.1",
"indexmap 2.9.0",
"jsonwebtoken",
"regex",
"remote_storage",
@@ -1288,7 +1392,7 @@ dependencies = [
"aws-sdk-kms",
"aws-sdk-s3",
"aws-smithy-types",
"axum",
"axum 0.8.1",
"axum-extra",
"base64 0.13.1",
"bytes",
@@ -1301,7 +1405,7 @@ dependencies = [
"flate2",
"futures",
"http 1.1.0",
"indexmap 2.0.1",
"indexmap 2.9.0",
"jsonwebtoken",
"metrics",
"nix 0.27.1",
@@ -1927,7 +2031,7 @@ checksum = "0892a17df262a24294c382f0d5997571006e7a4348b4327557c4ff1cd4a8bccc"
dependencies = [
"darling",
"either",
"heck",
"heck 0.5.0",
"proc-macro2",
"quote",
"syn 2.0.100",
@@ -2041,7 +2145,7 @@ name = "endpoint_storage"
version = "0.0.1"
dependencies = [
"anyhow",
"axum",
"axum 0.8.1",
"axum-extra",
"camino",
"camino-tempfile",
@@ -2588,7 +2692,7 @@ dependencies = [
"futures-sink",
"futures-util",
"http 0.2.9",
"indexmap 2.0.1",
"indexmap 2.9.0",
"slab",
"tokio",
"tokio-util",
@@ -2607,7 +2711,7 @@ dependencies = [
"futures-sink",
"futures-util",
"http 1.1.0",
"indexmap 2.0.1",
"indexmap 2.9.0",
"slab",
"tokio",
"tokio-util",
@@ -2703,6 +2807,12 @@ dependencies = [
"http 1.1.0",
]
[[package]]
name = "heck"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
[[package]]
name = "heck"
version = "0.5.0"
@@ -3191,12 +3301,12 @@ dependencies = [
[[package]]
name = "indexmap"
version = "2.0.1"
version = "2.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ad227c3af19d4914570ad36d30409928b75967c298feb9ea1969db3a610bb14e"
checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e"
dependencies = [
"equivalent",
"hashbrown 0.14.5",
"hashbrown 0.15.2",
"serde",
]
@@ -3219,7 +3329,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "232929e1d75fe899576a3d5c7416ad0d88dbfbb3c3d6aa00873a7408a50ddb88"
dependencies = [
"ahash",
"indexmap 2.0.1",
"indexmap 2.9.0",
"is-terminal",
"itoa",
"log",
@@ -3242,7 +3352,7 @@ dependencies = [
"crossbeam-utils",
"dashmap 6.1.0",
"env_logger",
"indexmap 2.0.1",
"indexmap 2.9.0",
"itoa",
"log",
"num-format",
@@ -3594,6 +3704,12 @@ dependencies = [
"regex-automata 0.1.10",
]
[[package]]
name = "matchit"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94"
[[package]]
name = "matchit"
version = "0.8.4"
@@ -3639,7 +3755,7 @@ version = "0.0.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9e6777fc80a575f9503d908c8b498782a6c3ee88a06cb416dc3941401e43b94"
dependencies = [
"heck",
"heck 0.5.0",
"proc-macro2",
"quote",
"syn 2.0.100",
@@ -3785,6 +3901,15 @@ version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"
[[package]]
name = "neonart"
version = "0.1.0"
dependencies = [
"rand 0.8.5",
"tracing",
"zerocopy 0.8.24",
]
[[package]]
name = "never-say-never"
version = "6.6.666"
@@ -4208,6 +4333,8 @@ dependencies = [
"humantime-serde",
"pageserver_api",
"pageserver_client",
"pageserver_client_grpc",
"pageserver_data_api",
"rand 0.8.5",
"reqwest",
"serde",
@@ -4284,6 +4411,8 @@ dependencies = [
"pageserver_api",
"pageserver_client",
"pageserver_compaction",
"pageserver_data_api",
"peekable",
"pem",
"pin-project-lite",
"postgres-protocol",
@@ -4295,6 +4424,7 @@ dependencies = [
"pprof",
"pq_proto",
"procfs",
"prost 0.13.3",
"rand 0.8.5",
"range-set-blaze",
"regex",
@@ -4326,6 +4456,7 @@ dependencies = [
"tokio-tar",
"tokio-util",
"toml_edit",
"tonic",
"tracing",
"tracing-utils",
"url",
@@ -4390,6 +4521,18 @@ dependencies = [
"workspace_hack",
]
[[package]]
name = "pageserver_client_grpc"
version = "0.1.0"
dependencies = [
"bytes",
"http 1.1.0",
"pageserver_data_api",
"thiserror 1.0.69",
"tonic",
"tracing",
]
[[package]]
name = "pageserver_compaction"
version = "0.1.0"
@@ -4413,6 +4556,17 @@ dependencies = [
"workspace_hack",
]
[[package]]
name = "pageserver_data_api"
version = "0.1.0"
dependencies = [
"prost 0.13.3",
"thiserror 1.0.69",
"tonic",
"tonic-build",
"utils",
]
[[package]]
name = "papaya"
version = "0.2.1"
@@ -4539,6 +4693,15 @@ dependencies = [
"sha2",
]
[[package]]
name = "peekable"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "225f9651e475709164f871dc2f5724956be59cb9edb055372ffeeab01ec2d20b"
dependencies = [
"smallvec",
]
[[package]]
name = "pem"
version = "3.0.3"
@@ -5010,7 +5173,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4"
dependencies = [
"bytes",
"heck",
"heck 0.5.0",
"itertools 0.12.1",
"log",
"multimap",
@@ -5031,7 +5194,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15"
dependencies = [
"bytes",
"heck",
"heck 0.5.0",
"itertools 0.12.1",
"log",
"multimap",
@@ -5134,7 +5297,7 @@ dependencies = [
"hyper 0.14.30",
"hyper 1.4.1",
"hyper-util",
"indexmap 2.0.1",
"indexmap 2.9.0",
"ipnet",
"itertools 0.10.5",
"itoa",
@@ -5645,7 +5808,7 @@ dependencies = [
"async-trait",
"getrandom 0.2.11",
"http 1.1.0",
"matchit",
"matchit 0.8.4",
"opentelemetry",
"reqwest",
"reqwest-middleware",
@@ -6806,7 +6969,7 @@ version = "0.26.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be"
dependencies = [
"heck",
"heck 0.5.0",
"proc-macro2",
"quote",
"rustversion",
@@ -7231,6 +7394,16 @@ dependencies = [
"syn 2.0.100",
]
[[package]]
name = "tokio-pipe"
version = "0.2.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f213a84bffbd61b8fa0ba8a044b4bbe35d471d0b518867181e82bd5c15542784"
dependencies = [
"libc",
"tokio",
]
[[package]]
name = "tokio-postgres"
version = "0.7.10"
@@ -7413,7 +7586,7 @@ version = "0.22.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f21c7aaf97f1bd9ca9d4f9e73b0a6c74bd5afef56f2bc931943a6e1c37e04e38"
dependencies = [
"indexmap 2.0.1",
"indexmap 2.9.0",
"serde",
"serde_spanned",
"toml_datetime",
@@ -7426,9 +7599,13 @@ version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52"
dependencies = [
"async-stream",
"async-trait",
"axum 0.7.9",
"base64 0.22.1",
"bytes",
"flate2",
"h2 0.4.4",
"http 1.1.0",
"http-body 1.0.0",
"http-body-util",
@@ -7440,6 +7617,7 @@ dependencies = [
"prost 0.13.3",
"rustls-native-certs 0.8.0",
"rustls-pemfile 2.1.1",
"socket2",
"tokio",
"tokio-rustls 0.26.0",
"tokio-stream",
@@ -7939,7 +8117,7 @@ name = "vm_monitor"
version = "0.1.0"
dependencies = [
"anyhow",
"axum",
"axum 0.8.1",
"cgroups-rs",
"clap",
"futures",
@@ -8449,7 +8627,7 @@ dependencies = [
"hyper 1.4.1",
"hyper-util",
"indexmap 1.9.3",
"indexmap 2.0.1",
"indexmap 2.9.0",
"itertools 0.12.1",
"lazy_static",
"libc",


@@ -8,6 +8,7 @@ members = [
"pageserver/compaction",
"pageserver/ctl",
"pageserver/client",
"pageserver/client_grpc",
"pageserver/pagebench",
"proxy",
"safekeeper",
@@ -29,6 +30,7 @@ members = [
"libs/pq_proto",
"libs/tenant_size_model",
"libs/metrics",
"libs/neonart",
"libs/postgres_connection",
"libs/remote_storage",
"libs/tracing-utils",
@@ -41,6 +43,7 @@ members = [
"libs/proxy/postgres-types2",
"libs/proxy/tokio-postgres2",
"endpoint_storage",
"pgxn/neon/communicator",
]
[workspace.package]
@@ -142,6 +145,7 @@ parquet = { version = "53", default-features = false, features = ["zstd"] }
parquet_derive = "53"
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
pem = "3.0.3"
peekable = "0.3.0"
pin-project-lite = "0.2"
pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] }
procfs = "0.16"
@@ -187,7 +191,6 @@ thiserror = "1.0"
tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] }
tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] }
tokio = { version = "1.43.1", features = ["macros"] }
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
tokio-io-timeout = "1.2.0"
tokio-postgres-rustls = "0.12.0"
tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]}
@@ -196,7 +199,7 @@ tokio-tar = "0.3"
tokio-util = { version = "0.7.10", features = ["io", "rt"] }
toml = "0.8"
toml_edit = "0.22"
tonic = {version = "0.12.3", default-features = false, features = ["channel", "tls", "tls-roots"]}
tonic = {version = "0.12.3", default-features = false, features = ["channel", "server", "tls", "tls-roots", "gzip"]}
tower = { version = "0.5.2", default-features = false }
tower-http = { version = "0.6.2", features = ["auth", "request-id", "trace"] }
@@ -228,6 +231,9 @@ x509-cert = { version = "0.2.5" }
env_logger = "0.11"
log = "0.4"
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
uring-common = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
@@ -245,9 +251,12 @@ compute_api = { version = "0.1", path = "./libs/compute_api/" }
consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
http-utils = { version = "0.1", path = "./libs/http-utils/" }
metrics = { version = "0.1", path = "./libs/metrics/" }
neonart = { version = "0.1", path = "./libs/neonart/" }
pageserver = { path = "./pageserver" }
pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
pageserver_client = { path = "./pageserver/client" }
pageserver_client_grpc = { path = "./pageserver/client_grpc" }
pageserver_data_api = { path = "./pageserver/data_api" }
pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
@@ -271,6 +280,7 @@ wal_decoder = { version = "0.1", path = "./libs/wal_decoder" }
workspace_hack = { version = "0.1", path = "./workspace_hack/" }
## Build dependencies
cbindgen = "0.28.0"
criterion = "0.5.1"
rcgen = "0.13"
rstest = "0.18"


@@ -18,10 +18,12 @@ ifeq ($(BUILD_TYPE),release)
PG_LDFLAGS = $(LDFLAGS)
# Unfortunately, `--profile=...` is a nightly feature
CARGO_BUILD_FLAGS += --release
NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/release
else ifeq ($(BUILD_TYPE),debug)
PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
PG_CFLAGS += -O0 -g3 $(CFLAGS)
PG_LDFLAGS = $(LDFLAGS)
NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/debug
else
$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
endif
@@ -180,11 +182,16 @@ postgres-check-%: postgres-%
.PHONY: neon-pg-ext-%
neon-pg-ext-%: postgres-%
+@echo "Compiling communicator $*"
$(CARGO_CMD_PREFIX) cargo build -p communicator $(CARGO_BUILD_FLAGS)
+@echo "Compiling neon $*"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-$*
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
LIBCOMMUNICATOR_PATH=$(NEON_CARGO_ARTIFACT_TARGET_DIR) \
-C $(POSTGRES_INSTALL_DIR)/build/neon-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install
+@echo "Compiling neon_walredo $*"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$*
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \

libs/neonart/Cargo.toml Normal file

@@ -0,0 +1,11 @@
[package]
name = "neonart"
version = "0.1.0"
edition.workspace = true
license.workspace = true
[dependencies]
tracing.workspace = true
rand.workspace = true # for tests
zerocopy = "0.8"


@@ -0,0 +1,377 @@
mod lock_and_version;
mod node_ptr;
mod node_ref;
use std::vec::Vec;
use crate::algorithm::lock_and_version::ResultOrRestart;
use crate::algorithm::node_ptr::{MAX_PREFIX_LEN, NodePtr};
use crate::algorithm::node_ref::ChildOrValue;
use crate::algorithm::node_ref::{NodeRef, ReadLockedNodeRef, WriteLockedNodeRef};
use crate::epoch::EpochPin;
use crate::{Allocator, Key, Value};
pub(crate) type RootPtr<V> = node_ptr::NodePtr<V>;
pub fn new_root<V: Value>(allocator: &Allocator) -> RootPtr<V> {
node_ptr::new_root(allocator)
}
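// Look up 'key' in the tree. Retries internally until a lookup attempt completes
// without being disturbed by a concurrent writer.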
pub(crate) fn search<'e, K: Key, V: Value>(
key: &K,
root: RootPtr<V>,
epoch_pin: &'e EpochPin,
) -> Option<V> {
loop {
let root_ref = NodeRef::from_root_ptr(root);
if let Ok(result) = lookup_recurse(key.as_bytes(), root_ref, None, epoch_pin) {
break result;
}
// retry
}
}
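// Insert or update the entry for 'key'. 'value_fn' is called with the existing value (if any);
// if it returns Some, that value is stored, otherwise the tree is left unchanged. Like search(),
// this restarts internally on conflicts with concurrent writers.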
pub(crate) fn update_fn<'e, K: Key, V: Value, F>(
key: &K,
value_fn: F,
root: RootPtr<V>,
allocator: &Allocator,
epoch_pin: &'e EpochPin,
) where
F: FnOnce(Option<&V>) -> Option<V>,
{
let value_fn_cell = std::cell::Cell::new(Some(value_fn));
loop {
let root_ref = NodeRef::from_root_ptr(root);
let this_value_fn = |arg: Option<&V>| value_fn_cell.take().unwrap()(arg);
let key_bytes = key.as_bytes();
if let Ok(()) = update_recurse(
key_bytes,
this_value_fn,
root_ref,
None,
allocator,
epoch_pin,
0,
key_bytes,
) {
break;
}
// retry
}
}
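// Debug helper: prints the tree structure to stderr.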
pub(crate) fn dump_tree<'e, V: Value + std::fmt::Debug>(root: RootPtr<V>, epoch_pin: &'e EpochPin) {
let root_ref = NodeRef::from_root_ptr(root);
let _ = dump_recurse(&[], root_ref, &epoch_pin, 0);
}
// Error means you must retry.
//
// This corresponds to the 'lookupOpt' function in the paper
fn lookup_recurse<'e, V: Value>(
key: &[u8],
node: NodeRef<'e, V>,
parent: Option<ReadLockedNodeRef<V>>,
epoch_pin: &'e EpochPin,
) -> ResultOrRestart<Option<V>> {
let rnode = node.read_lock_or_restart()?;
if let Some(parent) = parent {
parent.read_unlock_or_restart()?;
}
// check if prefix matches, may increment level
let prefix_len = if let Some(prefix_len) = rnode.prefix_matches(key) {
prefix_len
} else {
rnode.read_unlock_or_restart()?;
return Ok(None);
};
let key = &key[prefix_len..];
// find child (or leaf value)
let next_node = rnode.find_child_or_value_or_restart(key[0])?;
match next_node {
None => Ok(None), // key not found
Some(ChildOrValue::Value(vptr)) => {
// safety: It's OK to follow the pointer because we checked the version.
let v = unsafe { (*vptr).clone() };
Ok(Some(v))
}
Some(ChildOrValue::Child(v)) => lookup_recurse(&key[1..], v, Some(rnode), epoch_pin),
}
}
// This corresponds to the 'insertOpt' function in the paper
pub(crate) fn update_recurse<'e, V: Value, F>(
key: &[u8],
value_fn: F,
node: NodeRef<'e, V>,
rparent: Option<(ReadLockedNodeRef<V>, u8)>,
allocator: &Allocator,
epoch_pin: &'e EpochPin,
level: usize,
orig_key: &[u8],
) -> ResultOrRestart<()>
where
F: FnOnce(Option<&V>) -> Option<V>,
{
let rnode = node.read_lock_or_restart()?;
let prefix_match_len = rnode.prefix_matches(key);
if prefix_match_len.is_none() {
let (rparent, parent_key) = rparent.expect("direct children of the root have no prefix");
let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
if let Some(new_value) = value_fn(None) {
insert_split_prefix(
key,
new_value,
&mut wnode,
&mut wparent,
parent_key,
allocator,
);
}
wnode.write_unlock();
wparent.write_unlock();
return Ok(());
}
let prefix_match_len = prefix_match_len.unwrap();
let key = &key[prefix_match_len as usize..];
let level = level + prefix_match_len as usize;
let next_node = rnode.find_child_or_value_or_restart(key[0])?;
if next_node.is_none() {
if rnode.is_full() {
let (rparent, parent_key) = rparent.expect("root node cannot become full");
let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
let wnode = rnode.upgrade_to_write_lock_or_restart()?;
if let Some(new_value) = value_fn(None) {
insert_and_grow(key, new_value, &wnode, &mut wparent, parent_key, allocator);
wnode.write_unlock_obsolete();
wparent.write_unlock();
} else {
wnode.write_unlock();
wparent.write_unlock();
}
} else {
let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
if let Some((rparent, _)) = rparent {
rparent.read_unlock_or_restart()?;
}
if let Some(new_value) = value_fn(None) {
insert_to_node(&mut wnode, key, new_value, allocator);
}
wnode.write_unlock();
}
return Ok(());
} else {
let next_node = next_node.unwrap(); // checked above it's not None
if let Some((rparent, _)) = rparent {
rparent.read_unlock_or_restart()?;
}
match next_node {
ChildOrValue::Value(existing_value_ptr) => {
assert!(key.len() == 1);
let wnode = rnode.upgrade_to_write_lock_or_restart()?;
// safety: Now that we have acquired the write lock, we have exclusive access to the
// value
let vmut = unsafe { existing_value_ptr.cast_mut().as_mut() }.unwrap();
if let Some(new_value) = value_fn(Some(vmut)) {
*vmut = new_value;
} else {
// TODO: Treat this as deletion?
}
wnode.write_unlock();
Ok(())
}
ChildOrValue::Child(next_child) => {
// recurse to next level
update_recurse(
&key[1..],
value_fn,
next_child,
Some((rnode, key[0])),
allocator,
epoch_pin,
level + 1,
orig_key,
)
}
}
}
}
#[derive(Clone)]
enum PathElement {
Prefix(Vec<u8>),
KeyByte(u8),
}
impl std::fmt::Debug for PathElement {
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
match self {
PathElement::Prefix(prefix) => write!(fmt, "{:?}", prefix),
PathElement::KeyByte(key_byte) => write!(fmt, "{}", key_byte),
}
}
}
fn dump_recurse<'e, V: Value + std::fmt::Debug>(
path: &[PathElement],
node: NodeRef<'e, V>,
epoch_pin: &'e EpochPin,
level: usize,
) -> ResultOrRestart<()> {
let indent = str::repeat(" ", level);
let rnode = node.read_lock_or_restart()?;
let mut path = Vec::from(path);
let prefix = rnode.get_prefix();
if prefix.len() != 0 {
path.push(PathElement::Prefix(Vec::from(prefix)));
}
for key_byte in 0..=u8::MAX {
match rnode.find_child_or_value_or_restart(key_byte)? {
None => continue,
Some(ChildOrValue::Child(child_ref)) => {
let rchild = child_ref.read_lock_or_restart()?;
eprintln!(
"{} {:?}, {}: prefix {:?}",
indent,
&path,
key_byte,
rchild.get_prefix()
);
let mut child_path = path.clone();
child_path.push(PathElement::KeyByte(key_byte));
dump_recurse(&child_path, child_ref, epoch_pin, level + 1)?;
}
Some(ChildOrValue::Value(val)) => {
eprintln!("{} {:?}, {}: {:?}", indent, path, key_byte, unsafe {
val.as_ref().unwrap()
});
}
}
}
Ok(())
}
///```text
/// [fooba]r -> value
///
/// [foo]b -> [a]r -> value
/// e -> [ls]e -> value
///```
fn insert_split_prefix<'a, V: Value>(
key: &[u8],
value: V,
node: &mut WriteLockedNodeRef<V>,
parent: &mut WriteLockedNodeRef<V>,
parent_key: u8,
allocator: &Allocator,
) {
let old_node = node;
let old_prefix = old_node.get_prefix();
let common_prefix_len = common_prefix(key, old_prefix);
// Allocate a node for the new value.
let new_value_node = allocate_node_for_value(&key[common_prefix_len + 1..], value, allocator);
// Allocate a new internal node with the common prefix
let mut prefix_node = node_ref::new_internal(&key[..common_prefix_len], allocator);
// Add the old node and the new nodes to the new internal node
prefix_node.insert_child(old_prefix[common_prefix_len], old_node.as_ptr());
prefix_node.insert_child(key[common_prefix_len], new_value_node);
// Modify the prefix of the old child in place
old_node.truncate_prefix(old_prefix.len() - common_prefix_len - 1);
// replace the pointer in the parent
parent.replace_child(parent_key, prefix_node.into_ptr());
}
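// Insert 'value' into 'wnode', which is known to have free space. A leaf stores the value
// directly; an internal node gets a new child node holding the remaining key bytes.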
fn insert_to_node<V: Value>(
wnode: &mut WriteLockedNodeRef<V>,
key: &[u8],
value: V,
allocator: &Allocator,
) {
if wnode.is_leaf() {
wnode.insert_value(key[0], value);
} else {
let value_child = allocate_node_for_value(&key[1..], value, allocator);
wnode.insert_child(key[0], value_child);
}
}
// On entry: 'parent' and 'node' are locked
fn insert_and_grow<V: Value>(
key: &[u8],
value: V,
wnode: &WriteLockedNodeRef<V>,
parent: &mut WriteLockedNodeRef<V>,
parent_key_byte: u8,
allocator: &Allocator,
) {
let mut bigger_node = wnode.grow(allocator);
if wnode.is_leaf() {
bigger_node.insert_value(key[0], value);
} else {
let value_child = allocate_node_for_value(&key[1..], value, allocator);
bigger_node.insert_child(key[0], value_child);
}
// Replace the pointer in the parent
parent.replace_child(parent_key_byte, bigger_node.into_ptr());
}
// Allocate a new leaf node to hold 'value'. If the key is long, we may also need to allocate
// new internal nodes to hold it.
fn allocate_node_for_value<V: Value>(key: &[u8], value: V, allocator: &Allocator) -> NodePtr<V> {
let mut prefix_off = key.len().saturating_sub(MAX_PREFIX_LEN + 1);
let mut leaf_node = node_ref::new_leaf(&key[prefix_off..key.len() - 1], allocator);
leaf_node.insert_value(*key.last().unwrap(), value);
let mut node = leaf_node;
while prefix_off > 0 {
// Need another internal node
let remain_prefix = &key[0..prefix_off];
prefix_off = remain_prefix.len().saturating_sub(MAX_PREFIX_LEN + 1);
let mut internal_node = node_ref::new_internal(
&remain_prefix[prefix_off..remain_prefix.len() - 1],
allocator,
);
internal_node.insert_child(*remain_prefix.last().unwrap(), node.into_ptr());
node = internal_node;
}
node.into_ptr()
}
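// Length of the common prefix of 'a' and 'b'. Callers only invoke this when the two prefixes
// are known to diverge within MAX_PREFIX_LEN bytes, hence the panic if they are equal.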
fn common_prefix(a: &[u8], b: &[u8]) -> usize {
for i in 0..MAX_PREFIX_LEN {
if a[i] != b[i] {
return i;
}
}
panic!("prefixes are equal");
}


@@ -0,0 +1,85 @@
use std::sync::atomic::{AtomicU64, Ordering};
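// Lock word layout, as manipulated by the functions below:
//   bit 0: obsolete flag (the node has been replaced and must not be used anymore)
//   bit 1: write-lock bit
//   bits 2..: version counter, bumped on every write-unlock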
pub(crate) struct AtomicLockAndVersion {
inner: AtomicU64,
}
impl AtomicLockAndVersion {
pub(crate) fn new() -> AtomicLockAndVersion {
AtomicLockAndVersion {
inner: AtomicU64::new(0),
}
}
}
pub(crate) type ResultOrRestart<T> = Result<T, ()>;
const fn restart<T>() -> ResultOrRestart<T> {
Err(())
}
impl AtomicLockAndVersion {
pub(crate) fn read_lock_or_restart(&self) -> ResultOrRestart<u64> {
let version = self.await_node_unlocked();
if is_obsolete(version) {
return restart();
}
Ok(version)
}
pub(crate) fn check_or_restart(&self, version: u64) -> ResultOrRestart<()> {
self.read_unlock_or_restart(version)
}
pub(crate) fn read_unlock_or_restart(&self, version: u64) -> ResultOrRestart<()> {
if self.inner.load(Ordering::Acquire) != version {
return restart();
}
Ok(())
}
pub(crate) fn upgrade_to_write_lock_or_restart(&self, version: u64) -> ResultOrRestart<()> {
if self
.inner
.compare_exchange(
version,
set_locked_bit(version),
Ordering::Acquire,
Ordering::Relaxed,
)
.is_err()
{
return restart();
}
Ok(())
}
pub(crate) fn write_unlock(&self) {
// reset locked bit and overflow into version
self.inner.fetch_add(2, Ordering::Release);
}
pub(crate) fn write_unlock_obsolete(&self) {
// set obsolete, reset locked, overflow into version
self.inner.fetch_add(3, Ordering::Release);
}
// Helper functions
fn await_node_unlocked(&self) -> u64 {
let mut version = self.inner.load(Ordering::Acquire);
while (version & 2) == 2 {
// spinlock
std::thread::yield_now();
version = self.inner.load(Ordering::Acquire)
}
version
}
}
fn set_locked_bit(version: u64) -> u64 {
version + 2
}
fn is_obsolete(version: u64) -> bool {
(version & 1) == 1
}
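For orientation, a minimal sketch (not part of this commit; the name optimistic_read is illustrative) of the optimistic-read protocol that the helpers above implement: snapshot the version word, perform the read, then verify that the word is unchanged. Unlike read_lock_or_restart() above, the sketch simply restarts when the node is locked instead of spinning.
use std::sync::atomic::{AtomicU64, Ordering};
// Illustrative only. Bit 0 = obsolete, bit 1 = locked, as in the module above.
fn optimistic_read<T: Copy>(lock_word: &AtomicU64, data: *const T) -> Result<T, ()> {
    let before = lock_word.load(Ordering::Acquire);
    if before & 0b11 != 0 {
        return Err(()); // locked or obsolete: the caller restarts from the root
    }
    // Safety (in the real tree): the EpochPin keeps the node alive, and the version
    // re-check below detects any concurrent modification.
    let value = unsafe { *data };
    if lock_word.load(Ordering::Acquire) != before {
        return Err(()); // a writer intervened: the caller restarts
    }
    Ok(value)
}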


@@ -0,0 +1,983 @@
use std::marker::PhantomData;
use std::ptr::NonNull;
use super::lock_and_version::AtomicLockAndVersion;
use crate::Allocator;
use crate::Value;
pub(crate) const MAX_PREFIX_LEN: usize = 8;
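// The four node sizes from the ART paper (4, 16, 48 and 256 entries), duplicated as
// internal and leaf variants because values are stored directly in the leaf nodes.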
enum NodeTag {
Internal4,
Internal16,
Internal48,
Internal256,
Leaf4,
Leaf16,
Leaf48,
Leaf256,
}
#[repr(C)]
struct NodeBase {
tag: NodeTag,
lock_and_version: AtomicLockAndVersion,
}
pub(crate) struct NodePtr<V> {
ptr: *mut NodeBase,
phantom_value: PhantomData<V>,
}
impl<V> std::fmt::Debug for NodePtr<V> {
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
write!(fmt, "0x{}", self.ptr.addr())
}
}
impl<V> Copy for NodePtr<V> {}
impl<V> Clone for NodePtr<V> {
fn clone(&self) -> NodePtr<V> {
NodePtr {
ptr: self.ptr,
phantom_value: PhantomData,
}
}
}
enum NodeVariant<'a, V> {
Internal4(&'a NodeInternal4<V>),
Internal16(&'a NodeInternal16<V>),
Internal48(&'a NodeInternal48<V>),
Internal256(&'a NodeInternal256<V>),
Leaf4(&'a NodeLeaf4<V>),
Leaf16(&'a NodeLeaf16<V>),
Leaf48(&'a NodeLeaf48<V>),
Leaf256(&'a NodeLeaf256<V>),
}
enum NodeVariantMut<'a, V> {
Internal4(&'a mut NodeInternal4<V>),
Internal16(&'a mut NodeInternal16<V>),
Internal48(&'a mut NodeInternal48<V>),
Internal256(&'a mut NodeInternal256<V>),
Leaf4(&'a mut NodeLeaf4<V>),
Leaf16(&'a mut NodeLeaf16<V>),
Leaf48(&'a mut NodeLeaf48<V>),
Leaf256(&'a mut NodeLeaf256<V>),
}
pub(crate) enum ChildOrValuePtr<V> {
Child(NodePtr<V>),
Value(*const V),
}
#[repr(C)]
struct NodeInternal4<V> {
tag: NodeTag,
lock_and_version: AtomicLockAndVersion,
prefix: [u8; MAX_PREFIX_LEN],
prefix_len: u8,
num_children: u8,
child_keys: [u8; 4],
child_ptrs: [NodePtr<V>; 4],
}
#[repr(C)]
struct NodeInternal16<V> {
tag: NodeTag,
lock_and_version: AtomicLockAndVersion,
prefix: [u8; MAX_PREFIX_LEN],
prefix_len: u8,
num_children: u8,
child_keys: [u8; 16],
child_ptrs: [NodePtr<V>; 16],
}
const INVALID_CHILD_INDEX: u8 = u8::MAX;
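// In the 48-entry variants, 'child_indexes' maps a key byte to a slot in 'child_ptrs' /
// 'child_values'; INVALID_CHILD_INDEX marks key bytes with no entry.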
#[repr(C)]
struct NodeInternal48<V> {
tag: NodeTag,
lock_and_version: AtomicLockAndVersion,
prefix: [u8; MAX_PREFIX_LEN],
prefix_len: u8,
num_children: u8,
child_indexes: [u8; 256],
child_ptrs: [NodePtr<V>; 48],
}
#[repr(C)]
pub(crate) struct NodeInternal256<V> {
tag: NodeTag,
lock_and_version: AtomicLockAndVersion,
prefix: [u8; MAX_PREFIX_LEN],
prefix_len: u8,
num_children: u16,
child_ptrs: [NodePtr<V>; 256],
}
#[repr(C)]
struct NodeLeaf4<V> {
tag: NodeTag,
lock_and_version: AtomicLockAndVersion,
prefix: [u8; MAX_PREFIX_LEN],
prefix_len: u8,
num_values: u8,
child_keys: [u8; 4],
child_values: [Option<V>; 4],
}
#[repr(C)]
struct NodeLeaf16<V> {
tag: NodeTag,
lock_and_version: AtomicLockAndVersion,
prefix: [u8; MAX_PREFIX_LEN],
prefix_len: u8,
num_values: u8,
child_keys: [u8; 16],
child_values: [Option<V>; 16],
}
#[repr(C)]
struct NodeLeaf48<V> {
tag: NodeTag,
lock_and_version: AtomicLockAndVersion,
prefix: [u8; MAX_PREFIX_LEN],
prefix_len: u8,
num_values: u8,
child_indexes: [u8; 256],
child_values: [Option<V>; 48],
}
#[repr(C)]
struct NodeLeaf256<V> {
tag: NodeTag,
lock_and_version: AtomicLockAndVersion,
prefix: [u8; MAX_PREFIX_LEN],
prefix_len: u8,
num_values: u16,
child_values: [Option<V>; 256],
}
impl<V> NodePtr<V> {
pub(crate) fn is_leaf(&self) -> bool {
match self.variant() {
NodeVariant::Internal4(_) => false,
NodeVariant::Internal16(_) => false,
NodeVariant::Internal48(_) => false,
NodeVariant::Internal256(_) => false,
NodeVariant::Leaf4(_) => true,
NodeVariant::Leaf16(_) => true,
NodeVariant::Leaf48(_) => true,
NodeVariant::Leaf256(_) => true,
}
}
pub(crate) fn lockword(&self) -> &AtomicLockAndVersion {
match self.variant() {
NodeVariant::Internal4(n) => &n.lock_and_version,
NodeVariant::Internal16(n) => &n.lock_and_version,
NodeVariant::Internal48(n) => &n.lock_and_version,
NodeVariant::Internal256(n) => &n.lock_and_version,
NodeVariant::Leaf4(n) => &n.lock_and_version,
NodeVariant::Leaf16(n) => &n.lock_and_version,
NodeVariant::Leaf48(n) => &n.lock_and_version,
NodeVariant::Leaf256(n) => &n.lock_and_version,
}
}
pub(crate) fn is_null(&self) -> bool {
self.ptr.is_null()
}
pub(crate) const fn null() -> NodePtr<V> {
NodePtr {
ptr: std::ptr::null_mut(),
phantom_value: PhantomData,
}
}
fn variant(&self) -> NodeVariant<V> {
unsafe {
match (*self.ptr).tag {
NodeTag::Internal4 => NodeVariant::Internal4(
NonNull::new_unchecked(self.ptr.cast::<NodeInternal4<V>>()).as_ref(),
),
NodeTag::Internal16 => NodeVariant::Internal16(
NonNull::new_unchecked(self.ptr.cast::<NodeInternal16<V>>()).as_ref(),
),
NodeTag::Internal48 => NodeVariant::Internal48(
NonNull::new_unchecked(self.ptr.cast::<NodeInternal48<V>>()).as_ref(),
),
NodeTag::Internal256 => NodeVariant::Internal256(
NonNull::new_unchecked(self.ptr.cast::<NodeInternal256<V>>()).as_ref(),
),
NodeTag::Leaf4 => NodeVariant::Leaf4(
NonNull::new_unchecked(self.ptr.cast::<NodeLeaf4<V>>()).as_ref(),
),
NodeTag::Leaf16 => NodeVariant::Leaf16(
NonNull::new_unchecked(self.ptr.cast::<NodeLeaf16<V>>()).as_ref(),
),
NodeTag::Leaf48 => NodeVariant::Leaf48(
NonNull::new_unchecked(self.ptr.cast::<NodeLeaf48<V>>()).as_ref(),
),
NodeTag::Leaf256 => NodeVariant::Leaf256(
NonNull::new_unchecked(self.ptr.cast::<NodeLeaf256<V>>()).as_ref(),
),
}
}
}
fn variant_mut(&mut self) -> NodeVariantMut<V> {
unsafe {
match (*self.ptr).tag {
NodeTag::Internal4 => NodeVariantMut::Internal4(
NonNull::new_unchecked(self.ptr.cast::<NodeInternal4<V>>()).as_mut(),
),
NodeTag::Internal16 => NodeVariantMut::Internal16(
NonNull::new_unchecked(self.ptr.cast::<NodeInternal16<V>>()).as_mut(),
),
NodeTag::Internal48 => NodeVariantMut::Internal48(
NonNull::new_unchecked(self.ptr.cast::<NodeInternal48<V>>()).as_mut(),
),
NodeTag::Internal256 => NodeVariantMut::Internal256(
NonNull::new_unchecked(self.ptr.cast::<NodeInternal256<V>>()).as_mut(),
),
NodeTag::Leaf4 => NodeVariantMut::Leaf4(
NonNull::new_unchecked(self.ptr.cast::<NodeLeaf4<V>>()).as_mut(),
),
NodeTag::Leaf16 => NodeVariantMut::Leaf16(
NonNull::new_unchecked(self.ptr.cast::<NodeLeaf16<V>>()).as_mut(),
),
NodeTag::Leaf48 => NodeVariantMut::Leaf48(
NonNull::new_unchecked(self.ptr.cast::<NodeLeaf48<V>>()).as_mut(),
),
NodeTag::Leaf256 => NodeVariantMut::Leaf256(
NonNull::new_unchecked(self.ptr.cast::<NodeLeaf256<V>>()).as_mut(),
),
}
}
}
}
impl<V: Value> NodePtr<V> {
pub(crate) fn prefix_matches(&self, key: &[u8]) -> Option<usize> {
let node_prefix = self.get_prefix();
assert!(node_prefix.len() <= key.len()); // because we only use fixed-size keys
if &key[0..node_prefix.len()] != node_prefix {
None
} else {
Some(node_prefix.len())
}
}
pub(crate) fn get_prefix(&self) -> &[u8] {
match self.variant() {
NodeVariant::Internal4(n) => n.get_prefix(),
NodeVariant::Internal16(n) => n.get_prefix(),
NodeVariant::Internal48(n) => n.get_prefix(),
NodeVariant::Internal256(n) => n.get_prefix(),
NodeVariant::Leaf4(n) => n.get_prefix(),
NodeVariant::Leaf16(n) => n.get_prefix(),
NodeVariant::Leaf48(n) => n.get_prefix(),
NodeVariant::Leaf256(n) => n.get_prefix(),
}
}
pub(crate) fn is_full(&self) -> bool {
match self.variant() {
NodeVariant::Internal4(n) => n.is_full(),
NodeVariant::Internal16(n) => n.is_full(),
NodeVariant::Internal48(n) => n.is_full(),
NodeVariant::Internal256(n) => n.is_full(),
NodeVariant::Leaf4(n) => n.is_full(),
NodeVariant::Leaf16(n) => n.is_full(),
NodeVariant::Leaf48(n) => n.is_full(),
NodeVariant::Leaf256(n) => n.is_full(),
}
}
pub(crate) fn find_child_or_value(&self, key_byte: u8) -> Option<ChildOrValuePtr<V>> {
match self.variant() {
NodeVariant::Internal4(n) => n.find_child(key_byte).map(|c| ChildOrValuePtr::Child(c)),
NodeVariant::Internal16(n) => n.find_child(key_byte).map(|c| ChildOrValuePtr::Child(c)),
NodeVariant::Internal48(n) => n.find_child(key_byte).map(|c| ChildOrValuePtr::Child(c)),
NodeVariant::Internal256(n) => {
n.find_child(key_byte).map(|c| ChildOrValuePtr::Child(c))
}
NodeVariant::Leaf4(n) => n
.get_leaf_value(key_byte)
.map(|v| ChildOrValuePtr::Value(v)),
NodeVariant::Leaf16(n) => n
.get_leaf_value(key_byte)
.map(|v| ChildOrValuePtr::Value(v)),
NodeVariant::Leaf48(n) => n
.get_leaf_value(key_byte)
.map(|v| ChildOrValuePtr::Value(v)),
NodeVariant::Leaf256(n) => n
.get_leaf_value(key_byte)
.map(|v| ChildOrValuePtr::Value(v)),
}
}
pub(crate) fn truncate_prefix(&mut self, new_prefix_len: usize) {
match self.variant_mut() {
NodeVariantMut::Internal4(n) => n.truncate_prefix(new_prefix_len),
NodeVariantMut::Internal16(n) => n.truncate_prefix(new_prefix_len),
NodeVariantMut::Internal48(n) => n.truncate_prefix(new_prefix_len),
NodeVariantMut::Internal256(n) => n.truncate_prefix(new_prefix_len),
NodeVariantMut::Leaf4(n) => n.truncate_prefix(new_prefix_len),
NodeVariantMut::Leaf16(n) => n.truncate_prefix(new_prefix_len),
NodeVariantMut::Leaf48(n) => n.truncate_prefix(new_prefix_len),
NodeVariantMut::Leaf256(n) => n.truncate_prefix(new_prefix_len),
}
}
pub(crate) fn grow(&self, allocator: &Allocator) -> NodePtr<V> {
match self.variant() {
NodeVariant::Internal4(n) => n.grow(allocator),
NodeVariant::Internal16(n) => n.grow(allocator),
NodeVariant::Internal48(n) => n.grow(allocator),
NodeVariant::Internal256(_) => panic!("cannot grow Internal256 node"),
NodeVariant::Leaf4(n) => n.grow(allocator),
NodeVariant::Leaf16(n) => n.grow(allocator),
NodeVariant::Leaf48(n) => n.grow(allocator),
NodeVariant::Leaf256(_) => panic!("cannot grow Leaf256 node"),
}
}
pub(crate) fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
match self.variant_mut() {
NodeVariantMut::Internal4(n) => n.insert_child(key_byte, child),
NodeVariantMut::Internal16(n) => n.insert_child(key_byte, child),
NodeVariantMut::Internal48(n) => n.insert_child(key_byte, child),
NodeVariantMut::Internal256(n) => n.insert_child(key_byte, child),
NodeVariantMut::Leaf4(_)
| NodeVariantMut::Leaf16(_)
| NodeVariantMut::Leaf48(_)
| NodeVariantMut::Leaf256(_) => panic!("insert_child called on leaf node"),
}
}
pub(crate) fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
match self.variant_mut() {
NodeVariantMut::Internal4(n) => n.replace_child(key_byte, replacement),
NodeVariantMut::Internal16(n) => n.replace_child(key_byte, replacement),
NodeVariantMut::Internal48(n) => n.replace_child(key_byte, replacement),
NodeVariantMut::Internal256(n) => n.replace_child(key_byte, replacement),
NodeVariantMut::Leaf4(_)
| NodeVariantMut::Leaf16(_)
| NodeVariantMut::Leaf48(_)
| NodeVariantMut::Leaf256(_) => panic!("replace_child called on leaf node"),
}
}
pub(crate) fn insert_value(&mut self, key_byte: u8, value: V) {
match self.variant_mut() {
NodeVariantMut::Internal4(_)
| NodeVariantMut::Internal16(_)
| NodeVariantMut::Internal48(_)
| NodeVariantMut::Internal256(_) => panic!("insert_value called on internal node"),
NodeVariantMut::Leaf4(n) => n.insert_value(key_byte, value),
NodeVariantMut::Leaf16(n) => n.insert_value(key_byte, value),
NodeVariantMut::Leaf48(n) => n.insert_value(key_byte, value),
NodeVariantMut::Leaf256(n) => n.insert_value(key_byte, value),
}
}
}
pub fn new_root<V: Value>(allocator: &Allocator) -> NodePtr<V> {
NodePtr {
ptr: allocator.alloc(NodeInternal256::<V>::new()).as_ptr().cast(),
phantom_value: PhantomData,
}
}
pub fn new_internal<V: Value>(prefix: &[u8], allocator: &Allocator) -> NodePtr<V> {
let mut node = allocator.alloc(NodeInternal4 {
tag: NodeTag::Internal4,
lock_and_version: AtomicLockAndVersion::new(),
prefix: [8; MAX_PREFIX_LEN],
prefix_len: prefix.len() as u8,
num_children: 0,
child_keys: [0; 4],
child_ptrs: [const { NodePtr::null() }; 4],
});
node.prefix[0..prefix.len()].copy_from_slice(prefix);
node.as_ptr().into()
}
pub fn new_leaf<V: Value>(prefix: &[u8], allocator: &Allocator) -> NodePtr<V> {
let mut node = allocator.alloc(NodeLeaf4 {
tag: NodeTag::Leaf4,
lock_and_version: AtomicLockAndVersion::new(),
prefix: [8; MAX_PREFIX_LEN],
prefix_len: prefix.len() as u8,
num_values: 0,
child_keys: [0; 4],
child_values: [const { None }; 4],
});
node.prefix[0..prefix.len()].copy_from_slice(prefix);
node.as_ptr().into()
}
impl<V: Value> NodeInternal4<V> {
fn get_prefix(&self) -> &[u8] {
&self.prefix[0..self.prefix_len as usize]
}
fn truncate_prefix(&mut self, new_prefix_len: usize) {
assert!(new_prefix_len < self.prefix_len as usize);
let prefix = &mut self.prefix;
let offset = self.prefix_len as usize - new_prefix_len;
for i in 0..new_prefix_len {
prefix[i] = prefix[i + offset];
}
self.prefix_len = new_prefix_len as u8;
}
fn find_child(&self, key: u8) -> Option<NodePtr<V>> {
for i in 0..self.num_children as usize {
if self.child_keys[i] == key {
return Some(self.child_ptrs[i]);
}
}
None
}
fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
for i in 0..self.num_children as usize {
if self.child_keys[i] == key_byte {
self.child_ptrs[i] = replacement;
return;
}
}
panic!("could not re-find parent with key {}", key_byte);
}
fn is_full(&self) -> bool {
self.num_children == 4
}
fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
assert!(self.num_children < 4);
let idx = self.num_children as usize;
self.child_keys[idx] = key_byte;
self.child_ptrs[idx] = child;
self.num_children += 1;
}
fn grow(&self, allocator: &Allocator) -> NodePtr<V> {
let mut node16 = allocator.alloc(NodeInternal16 {
tag: NodeTag::Internal16,
lock_and_version: AtomicLockAndVersion::new(),
prefix: self.prefix.clone(),
prefix_len: self.prefix_len,
num_children: self.num_children,
child_keys: [0; 16],
child_ptrs: [const { NodePtr::null() }; 16],
});
for i in 0..self.num_children as usize {
node16.child_keys[i] = self.child_keys[i];
node16.child_ptrs[i] = self.child_ptrs[i];
}
node16.as_ptr().into()
}
}
impl<V: Value> NodeInternal16<V> {
fn get_prefix(&self) -> &[u8] {
&self.prefix[0..self.prefix_len as usize]
}
fn truncate_prefix(&mut self, new_prefix_len: usize) {
assert!(new_prefix_len < self.prefix_len as usize);
let prefix = &mut self.prefix;
let offset = self.prefix_len as usize - new_prefix_len;
for i in 0..new_prefix_len {
prefix[i] = prefix[i + offset];
}
self.prefix_len = new_prefix_len as u8;
}
fn find_child(&self, key_byte: u8) -> Option<NodePtr<V>> {
for i in 0..self.num_children as usize {
if self.child_keys[i] == key_byte {
return Some(self.child_ptrs[i]);
}
}
None
}
fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
for i in 0..self.num_children as usize {
if self.child_keys[i] == key_byte {
self.child_ptrs[i] = replacement;
return;
}
}
panic!("could not re-find parent with key {}", key_byte);
}
fn is_full(&self) -> bool {
self.num_children == 16
}
fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
assert!(self.num_children < 16);
let idx = self.num_children as usize;
self.child_keys[idx] = key_byte;
self.child_ptrs[idx] = child;
self.num_children += 1;
}
fn grow(&self, allocator: &Allocator) -> NodePtr<V> {
let mut node48 = allocator.alloc(NodeInternal48 {
tag: NodeTag::Internal48,
lock_and_version: AtomicLockAndVersion::new(),
prefix: self.prefix.clone(),
prefix_len: self.prefix_len,
num_children: self.num_children,
child_indexes: [INVALID_CHILD_INDEX; 256],
child_ptrs: [const { NodePtr::null() }; 48],
});
for i in 0..self.num_children as usize {
let idx = self.child_keys[i] as usize;
node48.child_indexes[idx] = i as u8;
node48.child_ptrs[i] = self.child_ptrs[i];
}
node48.as_ptr().into()
}
}
impl<V: Value> NodeInternal48<V> {
fn get_prefix(&self) -> &[u8] {
&self.prefix[0..self.prefix_len as usize]
}
fn truncate_prefix(&mut self, new_prefix_len: usize) {
assert!(new_prefix_len < self.prefix_len as usize);
let prefix = &mut self.prefix;
let offset = self.prefix_len as usize - new_prefix_len;
for i in 0..new_prefix_len {
prefix[i] = prefix[i + offset];
}
self.prefix_len = new_prefix_len as u8;
}
fn find_child(&self, key_byte: u8) -> Option<NodePtr<V>> {
let idx = self.child_indexes[key_byte as usize];
if idx != INVALID_CHILD_INDEX {
Some(self.child_ptrs[idx as usize])
} else {
None
}
}
fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
let idx = self.child_indexes[key_byte as usize];
if idx != INVALID_CHILD_INDEX {
self.child_ptrs[idx as usize] = replacement
} else {
panic!("could not re-find parent with key {}", key_byte);
}
}
fn is_full(&self) -> bool {
self.num_children == 48
}
fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
assert!(self.num_children < 48);
assert!(self.child_indexes[key_byte as usize] == INVALID_CHILD_INDEX);
let idx = self.num_children;
self.child_indexes[key_byte as usize] = idx;
self.child_ptrs[idx as usize] = child;
self.num_children += 1;
}
fn grow(&self, allocator: &Allocator) -> NodePtr<V> {
let mut node256 = allocator.alloc(NodeInternal256 {
tag: NodeTag::Internal256,
lock_and_version: AtomicLockAndVersion::new(),
prefix: self.prefix.clone(),
prefix_len: self.prefix_len,
num_children: self.num_children as u16,
child_ptrs: [const { NodePtr::null() }; 256],
});
for i in 0..256 {
let idx = self.child_indexes[i];
if idx != INVALID_CHILD_INDEX {
node256.child_ptrs[i] = self.child_ptrs[idx as usize];
}
}
node256.as_ptr().into()
}
}
impl<V: Value> NodeInternal256<V> {
fn get_prefix(&self) -> &[u8] {
&self.prefix[0..self.prefix_len as usize]
}
fn truncate_prefix(&mut self, new_prefix_len: usize) {
assert!(new_prefix_len < self.prefix_len as usize);
let prefix = &mut self.prefix;
let offset = self.prefix_len as usize - new_prefix_len;
for i in 0..new_prefix_len {
prefix[i] = prefix[i + offset];
}
self.prefix_len = new_prefix_len as u8;
}
fn find_child(&self, key_byte: u8) -> Option<NodePtr<V>> {
let idx = key_byte as usize;
if !self.child_ptrs[idx].is_null() {
Some(self.child_ptrs[idx])
} else {
None
}
}
fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
let idx = key_byte as usize;
if !self.child_ptrs[idx].is_null() {
self.child_ptrs[idx] = replacement
} else {
panic!("could not re-find parent with key {}", key_byte);
}
}
fn is_full(&self) -> bool {
self.num_children == 256
}
fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
assert!(self.num_children < 256);
assert!(self.child_ptrs[key_byte as usize].is_null());
self.child_ptrs[key_byte as usize] = child;
self.num_children += 1;
}
}
impl<V: Value> NodeLeaf4<V> {
fn get_prefix(&self) -> &[u8] {
&self.prefix[0..self.prefix_len as usize]
}
fn truncate_prefix(&mut self, new_prefix_len: usize) {
assert!(new_prefix_len < self.prefix_len as usize);
let prefix = &mut self.prefix;
let offset = self.prefix_len as usize - new_prefix_len;
for i in 0..new_prefix_len {
prefix[i] = prefix[i + offset];
}
self.prefix_len = new_prefix_len as u8;
}
fn get_leaf_value<'a: 'b, 'b>(&'a self, key: u8) -> Option<&'b V> {
for i in 0..self.num_values {
if self.child_keys[i as usize] == key {
assert!(self.child_values[i as usize].is_some());
return self.child_values[i as usize].as_ref();
}
}
None
}
fn is_full(&self) -> bool {
self.num_values == 4
}
fn insert_value(&mut self, key_byte: u8, value: V) {
assert!(self.num_values < 4);
let idx = self.num_values as usize;
self.child_keys[idx] = key_byte;
self.child_values[idx] = Some(value);
self.num_values += 1;
}
fn grow(&self, allocator: &Allocator) -> NodePtr<V> {
let mut node16 = allocator.alloc(NodeLeaf16 {
tag: NodeTag::Leaf16,
lock_and_version: AtomicLockAndVersion::new(),
prefix: self.prefix.clone(),
prefix_len: self.prefix_len,
num_values: self.num_values,
child_keys: [0; 16],
child_values: [const { None }; 16],
});
for i in 0..self.num_values as usize {
node16.child_keys[i] = self.child_keys[i];
node16.child_values[i] = self.child_values[i].clone();
}
node16.as_ptr().into()
}
}
impl<V: Value> NodeLeaf16<V> {
fn get_prefix(&self) -> &[u8] {
&self.prefix[0..self.prefix_len as usize]
}
fn truncate_prefix(&mut self, new_prefix_len: usize) {
assert!(new_prefix_len < self.prefix_len as usize);
let prefix = &mut self.prefix;
let offset = self.prefix_len as usize - new_prefix_len;
for i in 0..new_prefix_len {
prefix[i] = prefix[i + offset];
}
self.prefix_len = new_prefix_len as u8;
}
fn get_leaf_value(&self, key: u8) -> Option<&V> {
for i in 0..self.num_values {
if self.child_keys[i as usize] == key {
assert!(self.child_values[i as usize].is_some());
return self.child_values[i as usize].as_ref();
}
}
None
}
fn is_full(&self) -> bool {
self.num_values == 16
}
fn insert_value(&mut self, key_byte: u8, value: V) {
assert!(self.num_values < 16);
let idx = self.num_values as usize;
self.child_keys[idx] = key_byte;
self.child_values[idx] = Some(value);
self.num_values += 1;
}
fn grow(&self, allocator: &Allocator) -> NodePtr<V> {
let mut node48 = allocator.alloc(NodeLeaf48 {
tag: NodeTag::Leaf48,
lock_and_version: AtomicLockAndVersion::new(),
prefix: self.prefix.clone(),
prefix_len: self.prefix_len,
num_values: self.num_values,
child_indexes: [INVALID_CHILD_INDEX; 256],
child_values: [const { None }; 48],
});
for i in 0..self.num_values {
let idx = self.child_keys[i as usize];
node48.child_indexes[idx as usize] = i;
node48.child_values[i as usize] = self.child_values[i as usize].clone();
}
node48.as_ptr().into()
}
}
impl<V: Value> NodeLeaf48<V> {
fn get_prefix(&self) -> &[u8] {
&self.prefix[0..self.prefix_len as usize]
}
fn truncate_prefix(&mut self, new_prefix_len: usize) {
assert!(new_prefix_len < self.prefix_len as usize);
let prefix = &mut self.prefix;
let offset = self.prefix_len as usize - new_prefix_len;
for i in 0..new_prefix_len {
prefix[i] = prefix[i + offset];
}
self.prefix_len = new_prefix_len as u8;
}
fn get_leaf_value(&self, key: u8) -> Option<&V> {
let idx = self.child_indexes[key as usize];
if idx != INVALID_CHILD_INDEX {
assert!(self.child_values[idx as usize].is_some());
self.child_values[idx as usize].as_ref()
} else {
None
}
}
fn is_full(&self) -> bool {
self.num_values == 48
}
fn insert_value(&mut self, key_byte: u8, value: V) {
assert!(self.num_values < 48);
assert!(self.child_indexes[key_byte as usize] == INVALID_CHILD_INDEX);
let idx = self.num_values;
self.child_indexes[key_byte as usize] = idx;
self.child_values[idx as usize] = Some(value);
self.num_values += 1;
}
fn grow(&self, allocator: &Allocator) -> NodePtr<V> {
let mut node256 = allocator.alloc(NodeLeaf256 {
tag: NodeTag::Leaf256,
lock_and_version: AtomicLockAndVersion::new(),
prefix: self.prefix.clone(),
prefix_len: self.prefix_len,
num_values: self.num_values as u16,
child_values: [const { None }; 256],
});
for i in 0..256 {
let idx = self.child_indexes[i];
if idx != INVALID_CHILD_INDEX {
node256.child_values[i] = self.child_values[idx as usize].clone();
}
}
node256.as_ptr().into()
}
}
impl<V: Value> NodeLeaf256<V> {
fn get_prefix(&self) -> &[u8] {
&self.prefix[0..self.prefix_len as usize]
}
fn truncate_prefix(&mut self, new_prefix_len: usize) {
assert!(new_prefix_len < self.prefix_len as usize);
let prefix = &mut self.prefix;
let offset = self.prefix_len as usize - new_prefix_len;
for i in 0..new_prefix_len {
prefix[i] = prefix[i + offset];
}
self.prefix_len = new_prefix_len as u8;
}
fn get_leaf_value(&self, key: u8) -> Option<&V> {
let idx = key as usize;
self.child_values[idx].as_ref()
}
fn is_full(&self) -> bool {
self.num_values == 256
}
fn insert_value(&mut self, key_byte: u8, value: V) {
assert!(self.num_values < 256);
assert!(self.child_values[key_byte as usize].is_none());
self.child_values[key_byte as usize] = Some(value);
self.num_values += 1;
}
}
impl<V: Value> NodeInternal256<V> {
pub(crate) fn new() -> NodeInternal256<V> {
NodeInternal256 {
tag: NodeTag::Internal256,
lock_and_version: AtomicLockAndVersion::new(),
prefix: [0; MAX_PREFIX_LEN],
prefix_len: 0,
num_children: 0,
child_ptrs: [const { NodePtr::null() }; 256],
}
}
}
impl<V: Value> From<*mut NodeInternal4<V>> for NodePtr<V> {
fn from(val: *mut NodeInternal4<V>) -> NodePtr<V> {
NodePtr {
ptr: val.cast(),
phantom_value: PhantomData,
}
}
}
impl<V: Value> From<*mut NodeInternal16<V>> for NodePtr<V> {
fn from(val: *mut NodeInternal16<V>) -> NodePtr<V> {
NodePtr {
ptr: val.cast(),
phantom_value: PhantomData,
}
}
}
impl<V: Value> From<*mut NodeInternal48<V>> for NodePtr<V> {
fn from(val: *mut NodeInternal48<V>) -> NodePtr<V> {
NodePtr {
ptr: val.cast(),
phantom_value: PhantomData,
}
}
}
impl<V: Value> From<*mut NodeInternal256<V>> for NodePtr<V> {
fn from(val: *mut NodeInternal256<V>) -> NodePtr<V> {
NodePtr {
ptr: val.cast(),
phantom_value: PhantomData,
}
}
}
impl<V: Value> From<*mut NodeLeaf4<V>> for NodePtr<V> {
fn from(val: *mut NodeLeaf4<V>) -> NodePtr<V> {
NodePtr {
ptr: val.cast(),
phantom_value: PhantomData,
}
}
}
impl<V: Value> From<*mut NodeLeaf16<V>> for NodePtr<V> {
fn from(val: *mut NodeLeaf16<V>) -> NodePtr<V> {
NodePtr {
ptr: val.cast(),
phantom_value: PhantomData,
}
}
}
impl<V: Value> From<*mut NodeLeaf48<V>> for NodePtr<V> {
fn from(val: *mut NodeLeaf48<V>) -> NodePtr<V> {
NodePtr {
ptr: val.cast(),
phantom_value: PhantomData,
}
}
}
impl<V: Value> From<*mut NodeLeaf256<V>> for NodePtr<V> {
fn from(val: *mut NodeLeaf256<V>) -> NodePtr<V> {
NodePtr {
ptr: val.cast(),
phantom_value: PhantomData,
}
}
}


@@ -0,0 +1,202 @@
use std::fmt::Debug;
use std::marker::PhantomData;
use super::lock_and_version::ResultOrRestart;
use super::node_ptr;
use super::node_ptr::ChildOrValuePtr;
use super::node_ptr::NodePtr;
use crate::EpochPin;
use crate::algorithm::lock_and_version::AtomicLockAndVersion;
use crate::{Allocator, Value};
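// Wrappers around NodePtr that track the lock state: a plain NodeRef is unlocked,
// read_lock_or_restart() yields a ReadLockedNodeRef (optimistic, version-checked), and
// upgrade_to_write_lock_or_restart() yields a WriteLockedNodeRef (exclusive). The 'e
// lifetime ties all of them to an EpochPin.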
pub struct NodeRef<'e, V> {
ptr: NodePtr<V>,
phantom: PhantomData<&'e EpochPin>,
}
impl<'e, V> Debug for NodeRef<'e, V> {
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
write!(fmt, "{:?}", self.ptr)
}
}
impl<'e, V: Value> NodeRef<'e, V> {
pub(crate) fn from_root_ptr(root_ptr: NodePtr<V>) -> NodeRef<'e, V> {
NodeRef {
ptr: root_ptr,
phantom: PhantomData,
}
}
pub(crate) fn read_lock_or_restart(&self) -> ResultOrRestart<ReadLockedNodeRef<'e, V>> {
let version = self.lockword().read_lock_or_restart()?;
Ok(ReadLockedNodeRef {
ptr: self.ptr,
version,
phantom: self.phantom,
})
}
fn lockword(&self) -> &AtomicLockAndVersion {
self.ptr.lockword()
}
}
/// A reference to a node that has been optimistically read-locked. The functions re-check
/// the version after each read.
pub struct ReadLockedNodeRef<'e, V> {
ptr: NodePtr<V>,
version: u64,
phantom: PhantomData<&'e EpochPin>,
}
pub(crate) enum ChildOrValue<'e, V> {
Child(NodeRef<'e, V>),
Value(*const V),
}
impl<'e, V: Value> ReadLockedNodeRef<'e, V> {
pub(crate) fn is_full(&self) -> bool {
self.ptr.is_full()
}
pub(crate) fn get_prefix(&self) -> &[u8] {
self.ptr.get_prefix()
}
/// Note: because we're only holding a read lock, the prefix can change concurrently.
/// You must be prepared to restart if read_unlock() returns an error later.
///
/// Returns the length of the prefix, or None if it's not a match
pub(crate) fn prefix_matches(&self, key: &[u8]) -> Option<usize> {
self.ptr.prefix_matches(key)
}
pub(crate) fn find_child_or_value_or_restart(
&self,
key_byte: u8,
) -> ResultOrRestart<Option<ChildOrValue<'e, V>>> {
let child_or_value = self.ptr.find_child_or_value(key_byte);
self.ptr.lockword().check_or_restart(self.version)?;
match child_or_value {
None => Ok(None),
Some(ChildOrValuePtr::Value(vptr)) => Ok(Some(ChildOrValue::Value(vptr))),
Some(ChildOrValuePtr::Child(child_ptr)) => Ok(Some(ChildOrValue::Child(NodeRef {
ptr: child_ptr,
phantom: self.phantom,
}))),
}
}
pub(crate) fn upgrade_to_write_lock_or_restart(
self,
) -> ResultOrRestart<WriteLockedNodeRef<'e, V>> {
self.ptr
.lockword()
.upgrade_to_write_lock_or_restart(self.version)?;
Ok(WriteLockedNodeRef {
ptr: self.ptr,
phantom: self.phantom,
})
}
pub(crate) fn read_unlock_or_restart(self) -> ResultOrRestart<()> {
self.ptr.lockword().check_or_restart(self.version)?;
Ok(())
}
}
/// A reference to a node that has been write-locked for exclusive modification. The lock is
/// released on drop unless the reference was explicitly unlocked first.
pub struct WriteLockedNodeRef<'e, V> {
ptr: NodePtr<V>,
phantom: PhantomData<&'e EpochPin>,
}
impl<'e, V: Value> WriteLockedNodeRef<'e, V> {
pub(crate) fn is_leaf(&self) -> bool {
self.ptr.is_leaf()
}
pub(crate) fn write_unlock(mut self) {
self.ptr.lockword().write_unlock();
self.ptr = NodePtr::null();
}
pub(crate) fn write_unlock_obsolete(mut self) {
self.ptr.lockword().write_unlock_obsolete();
self.ptr = NodePtr::null();
}
pub(crate) fn get_prefix(&self) -> &[u8] {
self.ptr.get_prefix()
}
pub(crate) fn truncate_prefix(&mut self, new_prefix_len: usize) {
self.ptr.truncate_prefix(new_prefix_len)
}
pub(crate) fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
self.ptr.insert_child(key_byte, child)
}
pub(crate) fn insert_value(&mut self, key_byte: u8, value: V) {
self.ptr.insert_value(key_byte, value)
}
pub(crate) fn grow(&self, allocator: &Allocator) -> NewNodeRef<V> {
let new_node = self.ptr.grow(allocator);
NewNodeRef { ptr: new_node }
}
pub(crate) fn as_ptr(&self) -> NodePtr<V> {
self.ptr
}
pub(crate) fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
self.ptr.replace_child(key_byte, replacement);
}
}
impl<'e, V> Drop for WriteLockedNodeRef<'e, V> {
fn drop(&mut self) {
if !self.ptr.is_null() {
self.ptr.lockword().write_unlock();
}
}
}
pub(crate) struct NewNodeRef<V> {
ptr: NodePtr<V>,
}
impl<V: Value> NewNodeRef<V> {
pub(crate) fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
self.ptr.insert_child(key_byte, child)
}
pub(crate) fn insert_value(&mut self, key_byte: u8, value: V) {
self.ptr.insert_value(key_byte, value)
}
pub(crate) fn into_ptr(self) -> NodePtr<V> {
let ptr = self.ptr;
ptr
}
}
pub(crate) fn new_internal<V: Value>(prefix: &[u8], allocator: &Allocator) -> NewNodeRef<V> {
NewNodeRef {
ptr: node_ptr::new_internal(prefix, allocator),
}
}
pub(crate) fn new_leaf<V: Value>(prefix: &[u8], allocator: &Allocator) -> NewNodeRef<V> {
NewNodeRef {
ptr: node_ptr::new_leaf(prefix, allocator),
}
}


@@ -0,0 +1,107 @@
use std::marker::PhantomData;
use std::mem::MaybeUninit;
use std::ops::{Deref, DerefMut};
use std::ptr::NonNull;
use std::sync::atomic::{AtomicUsize, Ordering};
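// A very simple bump allocator over a caller-provided, fixed-size memory area (intended for
// a shared memory segment). Allocations are padded to MAXALIGN and are currently never freed.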
pub struct Allocator {
area: *mut MaybeUninit<u8>,
allocated: AtomicUsize,
size: usize,
}
// FIXME: I don't know if these are really safe...
unsafe impl Send for Allocator {}
unsafe impl Sync for Allocator {}
#[repr(transparent)]
pub struct AllocatedBox<'a, T> {
inner: NonNull<T>,
_phantom: PhantomData<&'a Allocator>,
}
// FIXME: I don't know if these are really safe...
unsafe impl<'a, T> Send for AllocatedBox<'a, T> {}
unsafe impl<'a, T> Sync for AllocatedBox<'a, T> {}
impl<T> Deref for AllocatedBox<'_, T> {
type Target = T;
fn deref(&self) -> &T {
unsafe { self.inner.as_ref() }
}
}
impl<T> DerefMut for AllocatedBox<'_, T> {
fn deref_mut(&mut self) -> &mut T {
unsafe { self.inner.as_mut() }
}
}
impl<T> AsMut<T> for AllocatedBox<'_, T> {
fn as_mut(&mut self) -> &mut T {
unsafe { self.inner.as_mut() }
}
}
impl<T> AllocatedBox<'_, T> {
pub fn as_ptr(&self) -> *mut T {
self.inner.as_ptr()
}
}
const MAXALIGN: usize = std::mem::align_of::<usize>();
impl Allocator {
pub fn new_uninit(area: &'static mut [MaybeUninit<u8>]) -> Allocator {
let ptr = area.as_mut_ptr();
let size = area.len();
Self::new_from_ptr(ptr, size)
}
pub fn new(area: &'static mut [u8]) -> Allocator {
let ptr: *mut MaybeUninit<u8> = area.as_mut_ptr().cast();
let size = area.len();
Self::new_from_ptr(ptr, size)
}
pub fn new_from_ptr(ptr: *mut MaybeUninit<u8>, size: usize) -> Allocator {
let padding = ptr.align_offset(MAXALIGN);
Allocator {
area: ptr,
allocated: AtomicUsize::new(padding),
size,
}
}
pub fn alloc<'a, T: Sized>(&'a self, value: T) -> AllocatedBox<'a, T> {
let sz = std::mem::size_of::<T>();
// pad all allocations to MAXALIGN boundaries
assert!(std::mem::align_of::<T>() <= MAXALIGN);
let sz = sz.next_multiple_of(MAXALIGN);
let offset = self.allocated.fetch_add(sz, Ordering::Relaxed);
if offset + sz > self.size {
panic!("out of memory");
}
let inner = unsafe {
let inner = self.area.offset(offset as isize).cast::<T>();
// The memory is uninitialized, so use ptr::write instead of a plain assignment,
// which would drop the (garbage) previous value.
inner.write(value);
NonNull::new_unchecked(inner)
};
AllocatedBox {
inner,
_phantom: PhantomData,
}
}
pub fn _dealloc_node<T>(&self, _node: AllocatedBox<T>) {
// doesn't free it immediately.
}
}
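// Usage sketch (illustrative; in the communicator the backing buffer is a shared
// memory segment mapped at postmaster startup, not a leaked Box):
//
//     let area = Box::leak(Box::new_uninit_slice(1024 * 1024));
//     let allocator = Allocator::new_uninit(area);
//     let value: AllocatedBox<u64> = allocator.alloc(42u64);
//     assert_eq!(*value, 42);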

23
libs/neonart/src/epoch.rs Normal file
View File

@@ -0,0 +1,23 @@
//! This is similar to the crossbeam_epoch crate, but works in shared memory.
//!
//! FIXME: not implemented yet. (We haven't implemented removing any nodes from the ART
//! tree, which is why we can get away without it for now.)
pub(crate) struct EpochPin {}
pub(crate) fn pin_epoch() -> EpochPin {
EpochPin {}
}
/*
struct CollectorGlobal {
epoch: AtomicU64,
participants: CachePadded<AtomicU64>, // make it an array
}
struct CollectorQueue {
}
*/

301
libs/neonart/src/lib.rs Normal file
View File

@@ -0,0 +1,301 @@
//! Adaptive Radix Tree (ART) implementation, with Optimistic Lock Coupling.
//!
//! The data structure is described in these two papers:
//!
//! [1] Leis, V. & Kemper, Alfons & Neumann, Thomas. (2013).
//! The adaptive radix tree: ARTful indexing for main-memory databases.
//! Proceedings - International Conference on Data Engineering. 38-49. 10.1109/ICDE.2013.6544812.
//! https://db.in.tum.de/~leis/papers/ART.pdf
//!
//! [2] Leis, Viktor & Scheibner, Florian & Kemper, Alfons & Neumann, Thomas. (2016).
//! The ART of practical synchronization.
//! 1-8. 10.1145/2933349.2933352.
//! https://db.in.tum.de/~leis/papers/artsync.pdf
//!
//! [1] describes the base data structure, and [2] describes the Optimistic Lock Coupling that we
//! use.
//!
//! The papers mention a few different variants. We have made the following choices in this
//! implementation:
//!
//! - All keys have the same length
//!
//! - Multi-value leaves. The values are stored directly in one of the four different leaf node
//! types.
//!
//! - For collapsing inner nodes, we use the Pessimistic approach, where each inner node stores a
//! variable length "prefix", which stores the keys of all the one-way nodes which have been
//! removed. However, similar to the "hybrid" approach described in the paper, each node only has
//! space for a constant-size prefix of 8 bytes. If a node would have a longer prefix, then we
//! create create one-way nodes to store them. (There was no particular reason for this choice,
//! the "hybrid" approach described in the paper might be better.)
//!
//! - For concurrency, we use Optimistic Lock Coupling. The paper [2] also describes another method,
//! ROWEX, which generally performs better when there is contention, but that is not important
//! for us, and Optimistic Lock Coupling is simpler to implement.
//!
//! ## Requirements
//!
//! This data structure is currently used for the integrated LFC, relsize and last-written LSN cache
//! in the compute communicator, part of the 'neon' Postgres extension. We have some unique
//! requirements, which is why we had to write our own. Namely:
//!
//! - The data structure has to live in fixed-sized shared memory segment. That rules out any
//! built-in Rust collections and most crates. (Except possibly with the 'allocator_api' rust
//! feature, which is still nightly-only and experimental as of this writing).
//!
//! - The data structure is accessed from multiple processes. Only one process updates the data
//! structure, but other processes perform reads. That rules out using built-in Rust locking
//! primitives like Mutex and RwLock, and most crates too.
//!
//! - Within the one process with write-access, multiple threads can perform updates concurrently.
//! That rules out using PostgreSQL LWLocks for the locking.
//!
//! The implementation is generic, and doesn't depend on any PostgreSQL specifics, but it has been
//! written with that usage and the above constraints in mind. Some noteworthy assumptions:
//!
//! - Contention is assumed to be rare. In the integrated cache in PostgreSQL, there's higher level
//! locking in the PostgreSQL buffer manager, which ensures that two backends should not try to
//! read / write the same page at the same time. (Prefetching can conflict with actual reads,
//! however.)
//!
//! - The keys in the integrated cache are 17 bytes long.
//!
//! ## Usage
//!
//! Because this is designed to be used as a Postgres shared memory data structure, initialization
//! happens in three stages:
//!
//! 0. A fixed area of shared memory is allocated at postmaster startup.
//!
//! 1. TreeInitStruct::new() is called to initialize it, still in Postmaster process, before any
//! other process or thread is running. It returns a TreeInitStruct, which is inherited by all
//! the processes through fork().
//!
//! 2. One process may have write-access to the struct, by calling
//! [TreeInitStruct::attach_writer]. (That process is the communicator process.)
//!
//! 3. Other processes get read-access to the struct, by calling [TreeInitStruct::attach_reader]
//!
//! "Write access" means that you can insert / update / delete values in the tree.
//!
//! NOTE: The Values stored in the tree are sometimes moved, when a leaf node fills up and a new
//! larger node needs to be allocated. The versioning and epoch-based allocator ensure that the data
//! structure stays consistent, but if the Value has interior mutability, like atomic fields,
//! updates to such fields might be lost if the leaf node is concurrently moved! If that becomes a
//! problem, the version check could be passed up to the caller, so that the caller could detect the
//! lost updates and retry the operation.
//!
//! ## Implementation
//!
//! node_ptr.rs: Provides low-level implementations of the four different node types (eight actually,
//! since there is an Internal and Leaf variant of each)
//!
//! lock_and_version.rs: Provides an abstraction for the combined lock and version counter on each
//! node.
//!
//! node_ref.rs: The code in node_ptr.rs deals with raw pointers. node_ref.rs provides more type-safe
//! abstractions on top.
//!
//! algorithm.rs: Contains the functions to implement lookups and updates in the tree
//!
//! allocator.rs: Provides a facility to allocate memory for the tree nodes. (We must provide our
//! own abstraction for that because we need the data structure to live in a pre-allocated shared
//! memory segment).
//!
//! epoch.rs: The data structure requires that when a node is removed from the tree, it is not
//! immediately deallocated, but stays around for as long as concurrent readers might still have
//! pointers to it. This is enforced by an epoch system. This is similar to
//! e.g. crossbeam_epoch, but we couldn't use that either because it has to work across processes
//! communicating over the shared memory segment.
//!
//! ## See also
//!
//! There are some existing Rust ART implementations out there, but none of them filled all
//! the requirements:
//!
//! - https://github.com/XiangpengHao/congee
//! - https://github.com/declanvk/blart
//!
//! ## TODO
//!
//! - Removing values has not been implemented
mod algorithm;
mod allocator;
mod epoch;
use algorithm::RootPtr;
use allocator::AllocatedBox;
use std::fmt::Debug;
use std::marker::PhantomData;
use std::sync::atomic::{AtomicBool, Ordering};
use crate::epoch::EpochPin;
#[cfg(test)]
mod tests;
pub use allocator::Allocator;
/// Fixed-length key type.
///
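/// For example, the integrated cache uses 17-byte keys; a sketch of such an
/// implementation (the wrapper type and field layout are illustrative):
///
/// ```ignore
/// #[derive(Clone, Debug)]
/// struct CacheKey([u8; 17]);
///
/// impl Key for CacheKey {
///     const KEY_LEN: usize = 17;
///     fn as_bytes(&self) -> &[u8] {
///         &self.0
///     }
/// }
/// ```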
pub trait Key: Clone + Debug {
const KEY_LEN: usize;
fn as_bytes(&self) -> &[u8];
}
/// Values stored in the tree
///
/// Values need to be Cloneable, because when a node "grows", the value is copied to a new node and
/// the old one sticks around until all readers that might see the old value are gone.
pub trait Value: Clone {}
struct Tree<K: Key, V: Value> {
root: RootPtr<V>,
writer_attached: AtomicBool,
phantom_key: PhantomData<K>,
}
/// Struct created at postmaster startup
pub struct TreeInitStruct<'t, K: Key, V: Value> {
tree: AllocatedBox<'t, Tree<K, V>>,
allocator: &'t Allocator,
}
/// The worker (communicator) process has a reference to this. Write operations are only
/// safe from that process.
pub struct TreeWriteAccess<'t, K: Key, V: Value>
where
K: Key,
V: Value,
{
tree: AllocatedBox<'t, Tree<K, V>>,
allocator: &'t Allocator,
}
/// The backends have a reference to this. It cannot be used to modify the tree.
pub struct TreeReadAccess<'t, K: Key, V: Value>
where
K: Key,
V: Value,
{
tree: AllocatedBox<'t, Tree<K, V>>,
}
impl<'a, 't: 'a, K: Key, V: Value> TreeInitStruct<'t, K, V> {
pub fn new(allocator: &'t Allocator) -> TreeInitStruct<'t, K, V> {
let tree = allocator.alloc(Tree {
root: algorithm::new_root(allocator),
writer_attached: AtomicBool::new(false),
phantom_key: PhantomData,
});
TreeInitStruct { tree, allocator }
}
pub fn attach_writer(self) -> TreeWriteAccess<'t, K, V> {
let previously_attached = self.tree.writer_attached.swap(true, Ordering::Relaxed);
if previously_attached {
panic!("writer already attached");
}
TreeWriteAccess {
tree: self.tree,
allocator: self.allocator,
}
}
pub fn attach_reader(self) -> TreeReadAccess<'t, K, V> {
TreeReadAccess { tree: self.tree }
}
}
impl<'t, K: Key + Clone, V: Value> TreeWriteAccess<'t, K, V> {
pub fn start_write(&'t self) -> TreeWriteGuard<'t, K, V> {
// TODO: grab epoch guard
TreeWriteGuard {
allocator: self.allocator,
tree: &self.tree,
epoch_pin: epoch::pin_epoch(),
}
}
pub fn start_read(&'t self) -> TreeReadGuard<'t, K, V> {
TreeReadGuard {
tree: &self.tree,
epoch_pin: epoch::pin_epoch(),
}
}
}
impl<'t, K: Key + Clone, V: Value> TreeReadAccess<'t, K, V> {
pub fn start_read(&'t self) -> TreeReadGuard<'t, K, V> {
TreeReadGuard {
tree: &self.tree,
epoch_pin: epoch::pin_epoch(),
}
}
}
pub struct TreeReadGuard<'t, K, V>
where
K: Key,
V: Value,
{
tree: &'t AllocatedBox<'t, Tree<K, V>>,
epoch_pin: EpochPin,
}
impl<'t, K: Key, V: Value> TreeReadGuard<'t, K, V> {
pub fn get(&self, key: &K) -> Option<V> {
algorithm::search(key, self.tree.root, &self.epoch_pin)
}
}
pub struct TreeWriteGuard<'t, K, V>
where
K: Key,
V: Value,
{
tree: &'t AllocatedBox<'t, Tree<K, V>>,
allocator: &'t Allocator,
epoch_pin: EpochPin,
}
impl<'t, K: Key, V: Value> TreeWriteGuard<'t, K, V> {
pub fn insert(&mut self, key: &K, value: V) {
self.update_with_fn(key, |_| Some(value))
}
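/// Insert or update the value for `key` using a caller-supplied function.
///
/// `value_fn` is called with the existing value for the key, if any, and returns the
/// value to store. (What a `None` return means is tied to value removal, which is not
/// implemented yet; see the crate-level TODO.)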
pub fn update_with_fn<F>(&mut self, key: &K, value_fn: F)
where
F: FnOnce(Option<&V>) -> Option<V>,
{
algorithm::update_fn(
key,
value_fn,
self.tree.root,
self.allocator,
&self.epoch_pin,
)
}
pub fn get(&mut self, key: &K) -> Option<V> {
algorithm::search(key, self.tree.root, &self.epoch_pin)
}
}
impl<'t, K: Key, V: Value + Debug> TreeWriteGuard<'t, K, V> {
pub fn dump(&mut self) {
algorithm::dump_tree(self.tree.root, &self.epoch_pin)
}
}

90
libs/neonart/src/tests.rs Normal file
View File

@@ -0,0 +1,90 @@
use std::collections::HashSet;
use crate::Allocator;
use crate::TreeInitStruct;
use crate::{Key, Value};
use rand::seq::SliceRandom;
use rand::thread_rng;
const TEST_KEY_LEN: usize = 16;
#[derive(Clone, Copy, Debug)]
struct TestKey([u8; TEST_KEY_LEN]);
impl Key for TestKey {
const KEY_LEN: usize = TEST_KEY_LEN;
fn as_bytes(&self) -> &[u8] {
&self.0
}
}
impl From<u128> for TestKey {
fn from(val: u128) -> TestKey {
TestKey(val.to_be_bytes())
}
}
impl Value for usize {}
fn test_inserts<K: Into<TestKey> + Copy>(keys: &[K]) {
const MEM_SIZE: usize = 10000000;
let area = Box::leak(Box::new_uninit_slice(MEM_SIZE));
let allocator = Box::leak(Box::new(Allocator::new_uninit(area)));
let init_struct = TreeInitStruct::<TestKey, usize>::new(allocator);
let tree_writer = init_struct.attach_writer();
for (idx, k) in keys.iter().enumerate() {
let mut w = tree_writer.start_write();
w.insert(&(*k).into(), idx);
eprintln!("INSERTED {:?}", Into::<TestKey>::into(*k));
}
//tree_writer.start_read().dump();
for (idx, k) in keys.iter().enumerate() {
let r = tree_writer.start_read();
let value = r.get(&(*k).into());
assert_eq!(value, Some(idx));
}
}
#[test]
fn dense() {
// This exercises splitting a node with prefix
let keys: &[u128] = &[0, 1, 2, 3, 256];
test_inserts(keys);
// Dense keys
let mut keys: Vec<u128> = (0..10000).collect();
test_inserts(&keys);
// Do the same in random orders
for _ in 1..10 {
keys.shuffle(&mut thread_rng());
test_inserts(&keys);
}
}
#[test]
fn sparse() {
// sparse keys
let mut keys: Vec<TestKey> = Vec::new();
let mut used_keys = HashSet::new();
for _ in 0..10000 {
loop {
let key = rand::random::<u128>();
if used_keys.get(&key).is_some() {
continue;
}
used_keys.insert(key);
keys.push(key.into());
break;
}
}
test_inserts(&keys);
}

View File

@@ -42,12 +42,14 @@ nix.workspace = true
num_cpus.workspace = true
num-traits.workspace = true
once_cell.workspace = true
peekable.workspace = true
pin-project-lite.workspace = true
postgres_backend.workspace = true
postgres-protocol.workspace = true
postgres-types.workspace = true
postgres_initdb.workspace = true
pprof.workspace = true
prost.workspace = true
rand.workspace = true
range-set-blaze = { version = "0.1.16", features = ["alloc"] }
regex.workspace = true
@@ -60,6 +62,7 @@ serde_path_to_error.workspace = true
serde_with.workspace = true
sysinfo.workspace = true
tokio-tar.workspace = true
tonic.workspace = true
thiserror.workspace = true
tikv-jemallocator.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
@@ -76,6 +79,7 @@ url.workspace = true
walkdir.workspace = true
metrics.workspace = true
pageserver_api.workspace = true
pageserver_data_api.workspace = true
pageserver_client.workspace = true # for ResponseErrorMessageExt, TODO: refactor that
pageserver_compaction.workspace = true
pem.workspace = true

View File

@@ -0,0 +1,13 @@
[package]
name = "pageserver_client_grpc"
version = "0.1.0"
edition = "2024"
[dependencies]
bytes.workspace = true
http.workspace = true
thiserror.workspace = true
tonic.workspace = true
tracing.workspace = true
pageserver_data_api.workspace = true

View File

@@ -0,0 +1,221 @@
//! Pageserver Data API client
//!
//! - Manages connections to the pageserver
//! - Sends requests to the correct shards
//!
use std::collections::HashMap;
use std::sync::RwLock;
use bytes::Bytes;
use http;
use thiserror::Error;
use tonic;
use tonic::metadata::AsciiMetadataValue;
use tonic::transport::Channel;
use pageserver_data_api::model::*;
use pageserver_data_api::proto;
type Shardno = u16;
use pageserver_data_api::client::PageServiceClient;
type MyPageServiceClient = pageserver_data_api::client::PageServiceClient<
tonic::service::interceptor::InterceptedService<tonic::transport::Channel, AuthInterceptor>,
>;
#[derive(Error, Debug)]
pub enum PageserverClientError {
#[error("could not connect to service: {0}")]
ConnectError(#[from] tonic::transport::Error),
#[error("could not perform request: {0}`")]
RequestError(#[from] tonic::Status),
#[error("could not perform request: {0}`")]
InvalidUri(#[from] http::uri::InvalidUri),
}
pub struct PageserverClient {
_tenant_id: String,
_timeline_id: String,
_auth_token: Option<String>,
shard_map: HashMap<Shardno, String>,
channels: RwLock<HashMap<Shardno, Channel>>,
auth_interceptor: AuthInterceptor,
}
impl PageserverClient {
/// TODO: this doesn't currently react to changes in the shard map.
pub fn new(
tenant_id: &str,
timeline_id: &str,
auth_token: &Option<String>,
shard_map: HashMap<Shardno, String>,
) -> Self {
Self {
_tenant_id: tenant_id.to_string(),
_timeline_id: timeline_id.to_string(),
_auth_token: auth_token.clone(),
shard_map,
channels: RwLock::new(HashMap::new()),
auth_interceptor: AuthInterceptor::new(tenant_id, timeline_id, auth_token.as_ref()),
}
}
pub async fn process_rel_exists_request(
&self,
request: &RelExistsRequest,
) -> Result<bool, PageserverClientError> {
// Current sharding model assumes that all metadata is present only at shard 0.
let shard_no = 0;
let mut client = self.get_client(shard_no).await?;
let request = proto::RelExistsRequest::from(request);
let response = client.rel_exists(tonic::Request::new(request)).await?;
Ok(response.get_ref().exists)
}
pub async fn process_rel_size_request(
&self,
request: &RelSizeRequest,
) -> Result<u32, PageserverClientError> {
// Current sharding model assumes that all metadata is present only at shard 0.
let shard_no = 0;
let mut client = self.get_client(shard_no).await?;
let request = proto::RelSizeRequest::from(request);
let response = client.rel_size(tonic::Request::new(request)).await?;
Ok(response.get_ref().num_blocks)
}
pub async fn get_page(&self, request: &GetPageRequest) -> Result<Bytes, PageserverClientError> {
// FIXME: calculate the shard number correctly
let shard_no = 0;
let mut client = self.get_client(shard_no).await?;
let request = proto::GetPageRequest::from(request);
let response = client.get_page(tonic::Request::new(request)).await?;
Ok(response.into_inner().page_image)
}
/// Process a request to get the size of a database.
pub async fn process_dbsize_request(
&self,
request: &DbSizeRequest,
) -> Result<u64, PageserverClientError> {
// Current sharding model assumes that all metadata is present only at shard 0.
let shard_no = 0;
let mut client = self.get_client(shard_no).await?;
let request = proto::DbSizeRequest::from(request);
let response = client.db_size(tonic::Request::new(request)).await?;
Ok(response.get_ref().num_bytes)
}
/// Get a base backup for the timeline, as a stream of tarball chunks.
pub async fn get_base_backup(
&self,
request: &GetBaseBackupRequest,
gzip: bool,
) -> std::result::Result<
tonic::Response<tonic::codec::Streaming<proto::GetBaseBackupResponseChunk>>,
PageserverClientError,
> {
// Current sharding model assumes that all metadata is present only at shard 0.
let shard_no = 0;
let mut client = self.get_client(shard_no).await?;
if gzip {
client = client.accept_compressed(tonic::codec::CompressionEncoding::Gzip);
}
let request = proto::GetBaseBackupRequest::from(request);
let response = client.get_base_backup(tonic::Request::new(request)).await?;
Ok(response)
}
/// Get a client for the given shard.
///
/// This implements very basic caching. If we already have a channel for the given shard,
/// reuse it. If not, establish a new connection and put the channel in the cache.
async fn get_client(
&self,
shard_no: u16,
) -> Result<MyPageServiceClient, PageserverClientError> {
let reused_channel: Option<Channel> = {
let channels = self.channels.read().unwrap();
channels.get(&shard_no).cloned()
};
let channel = if let Some(reused_channel) = reused_channel {
reused_channel
} else {
let endpoint: tonic::transport::Endpoint = self
.shard_map
.get(&shard_no)
.unwrap_or_else(|| panic!("no url for shard {shard_no}"))
.parse()?;
let channel = endpoint.connect().await?;
// Insert it into the cache so that it can be reused on subsequent calls. It's possible
// that another thread did the same concurrently, in which case we overwrite the
// channel it inserted.
{
let mut channels = self.channels.write().unwrap();
channels.insert(shard_no, channel.clone());
}
channel
};
let client = PageServiceClient::with_interceptor(channel, self.auth_interceptor.clone());
Ok(client)
}
}
/// Inject tenant_id, timeline_id and authentication token to all pageserver requests.
#[derive(Clone)]
struct AuthInterceptor {
tenant_id: AsciiMetadataValue,
timeline_id: AsciiMetadataValue,
auth_token: Option<AsciiMetadataValue>,
}
impl AuthInterceptor {
fn new(tenant_id: &str, timeline_id: &str, auth_token: Option<&String>) -> Self {
Self {
tenant_id: tenant_id.parse().expect("could not parse tenant id"),
timeline_id: timeline_id.parse().expect("could not parse timeline id"),
auth_token: auth_token.map(|x| x.parse().expect("could not parse auth token")),
}
}
}
impl tonic::service::Interceptor for AuthInterceptor {
fn call(&mut self, mut req: tonic::Request<()>) -> Result<tonic::Request<()>, tonic::Status> {
req.metadata_mut()
.insert("neon-tenant-id", self.tenant_id.clone());
req.metadata_mut()
.insert("neon-timeline-id", self.timeline_id.clone());
if let Some(auth_token) = &self.auth_token {
req.metadata_mut()
.insert("neon-auth-token", auth_token.clone());
}
Ok(req)
}
}

View File

@@ -0,0 +1,18 @@
[package]
name = "pageserver_data_api"
version = "0.1.0"
edition = "2024"
[dependencies]
# For Lsn.
#
# TODO: move Lsn to a separate crate? This draws in a lot of extra dependencies
utils.workspace = true
prost.workspace = true
thiserror.workspace = true
tonic.workspace = true
[build-dependencies]
tonic-build.workspace = true

View File

@@ -0,0 +1,8 @@
fn main() -> Result<(), Box<dyn std::error::Error>> {
// Generate rust code from .proto protobuf.
tonic_build::configure()
.bytes(&["."])
.compile_protos(&["proto/page_service.proto"], &["proto"])
.unwrap_or_else(|e| panic!("failed to compile protos {:?}", e));
Ok(())
}

View File

@@ -0,0 +1,84 @@
// Page service presented by pageservers, for computes
//
// Each request must come with the following metadata:
// - neon-tenant-id
// - neon-timeline-id
// - neon-auth-token (if auth is enabled)
//
// TODO: what else? Priority? OpenTelemetry tracing?
//
syntax = "proto3";
package page_service;
service PageService {
rpc RelExists(RelExistsRequest) returns (RelExistsResponse);
// Returns size of a relation, as # of blocks
rpc RelSize (RelSizeRequest) returns (RelSizeResponse);
rpc GetPage (GetPageRequest) returns (GetPageResponse);
// Returns total size of a database, as # of bytes
rpc DbSize (DbSizeRequest) returns (DbSizeResponse);
rpc GetBaseBackup (GetBaseBackupRequest) returns (stream GetBaseBackupResponseChunk);
}
message RequestCommon {
uint64 request_lsn = 1;
uint64 not_modified_since_lsn = 2;
}
message RelTag {
uint32 spc_oid = 1;
uint32 db_oid = 2;
uint32 rel_number = 3;
uint32 fork_number = 4;
}
message RelExistsRequest {
RequestCommon common = 1;
RelTag rel = 2;
}
message RelExistsResponse {
bool exists = 1;
}
message RelSizeRequest {
RequestCommon common = 1;
RelTag rel = 2;
}
message RelSizeResponse {
uint32 num_blocks = 1;
}
message GetPageRequest {
RequestCommon common = 1;
RelTag rel = 2;
uint32 block_number = 3;
}
message GetPageResponse {
bytes page_image = 1;
}
message DbSizeRequest {
RequestCommon common = 1;
uint32 db_oid = 2;
}
message DbSizeResponse {
uint64 num_bytes = 1;
}
message GetBaseBackupRequest {
RequestCommon common = 1;
bool replica = 2;
}
message GetBaseBackupResponseChunk {
bytes chunk = 1;
}

View File

@@ -0,0 +1,17 @@
//! This crate has two modules related to the Pageserver Data API:
//!
//! proto: code auto-generated from the protobuf definition
//! model: slightly more ergonomic structs representing the same API
//!
//! See the protobuf spec under the proto/ subdirectory.
//!
//! This crate is used by both the client and the server. Try to keep it slim.
//!
pub mod model;
// Code generated by protobuf.
pub mod proto {
tonic::include_proto!("page_service");
}
pub use proto::page_service_client as client;

View File

@@ -0,0 +1,239 @@
//! Structs representing the API
//!
//! These mirror the pageserver APIs and the structs automatically generated
//! from the protobuf specification. The differences are:
//!
//! - Types that are in fact required by the API are not Options. The protobuf "required"
//! attribute is deprecated and 'prost' marks a lot of members as optional because of that.
//! (See https://github.com/tokio-rs/prost/issues/800 for a gripe on this)
//!
//! - Use more precise datatypes, e.g. Lsn and uints shorter than 32 bits.
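//!
//! The conversion pattern, sketched with the request types defined below:
//!
//! ```ignore
//! let req = RelSizeRequest { common, rel };       // ergonomic model type
//! let wire = proto::RelSizeRequest::from(&req);   // model -> generated proto type
//! let back = RelSizeRequest::try_from(&wire)?;    // proto -> model, validating the Options
//! ```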
use utils::lsn::Lsn;
use crate::proto;
#[derive(Clone, Debug)]
pub struct RequestCommon {
pub request_lsn: Lsn,
pub not_modified_since_lsn: Lsn,
}
#[derive(Clone, Debug, Eq, PartialEq, Hash, PartialOrd, Ord)]
pub struct RelTag {
pub spc_oid: u32,
pub db_oid: u32,
pub rel_number: u32,
pub fork_number: u8,
}
#[derive(Clone, Debug)]
pub struct RelExistsRequest {
pub common: RequestCommon,
pub rel: RelTag,
}
#[derive(Clone, Debug)]
pub struct RelSizeRequest {
pub common: RequestCommon,
pub rel: RelTag,
}
#[derive(Clone, Debug)]
pub struct RelSizeResponse {
pub num_blocks: u32,
}
#[derive(Clone, Debug)]
pub struct GetPageRequest {
pub common: RequestCommon,
pub rel: RelTag,
pub block_number: u32,
}
#[derive(Clone, Debug)]
pub struct GetPageResponse {
pub page_image: std::vec::Vec<u8>,
}
#[derive(Clone, Debug)]
pub struct DbSizeRequest {
pub common: RequestCommon,
pub db_oid: u32,
}
#[derive(Clone, Debug)]
pub struct DbSizeResponse {
pub num_bytes: u64,
}
#[derive(Clone, Debug)]
pub struct GetBaseBackupRequest {
pub common: RequestCommon,
pub replica: bool,
}
//--- Conversions to/from the generated proto types
use thiserror::Error;
#[derive(Error, Debug)]
pub enum ProtocolError {
#[error("the value for field `{0}` is invalid")]
InvalidValue(&'static str),
#[error("the required field `{0}` is missing ")]
Missing(&'static str),
}
impl From<ProtocolError> for tonic::Status {
fn from(e: ProtocolError) -> Self {
match e {
ProtocolError::InvalidValue(_field) => tonic::Status::invalid_argument(e.to_string()),
ProtocolError::Missing(_field) => tonic::Status::invalid_argument(e.to_string()),
}
}
}
impl From<&RelTag> for proto::RelTag {
fn from(value: &RelTag) -> proto::RelTag {
proto::RelTag {
spc_oid: value.spc_oid,
db_oid: value.db_oid,
rel_number: value.rel_number,
fork_number: value.fork_number as u32,
}
}
}
impl TryFrom<&proto::RelTag> for RelTag {
type Error = ProtocolError;
fn try_from(value: &proto::RelTag) -> Result<RelTag, ProtocolError> {
Ok(RelTag {
spc_oid: value.spc_oid,
db_oid: value.db_oid,
rel_number: value.rel_number,
fork_number: value
.fork_number
.try_into()
.or(Err(ProtocolError::InvalidValue("fork_number")))?,
})
}
}
impl From<&RequestCommon> for proto::RequestCommon {
fn from(value: &RequestCommon) -> proto::RequestCommon {
proto::RequestCommon {
request_lsn: value.request_lsn.into(),
not_modified_since_lsn: value.not_modified_since_lsn.into(),
}
}
}
impl From<&proto::RequestCommon> for RequestCommon {
fn from(value: &proto::RequestCommon) -> RequestCommon {
RequestCommon {
request_lsn: value.request_lsn.into(),
not_modified_since_lsn: value.not_modified_since_lsn.into(),
}
}
}
impl From<&RelExistsRequest> for proto::RelExistsRequest {
fn from(value: &RelExistsRequest) -> proto::RelExistsRequest {
proto::RelExistsRequest {
common: Some((&value.common).into()),
rel: Some((&value.rel).into()),
}
}
}
impl TryFrom<&proto::RelExistsRequest> for RelExistsRequest {
type Error = ProtocolError;
fn try_from(value: &proto::RelExistsRequest) -> Result<RelExistsRequest, ProtocolError> {
Ok(RelExistsRequest {
common: (&value.common.ok_or(ProtocolError::Missing("common"))?).into(),
rel: (&value.rel.ok_or(ProtocolError::Missing("rel"))?).try_into()?,
})
}
}
impl From<&RelSizeRequest> for proto::RelSizeRequest {
fn from(value: &RelSizeRequest) -> proto::RelSizeRequest {
proto::RelSizeRequest {
common: Some((&value.common).into()),
rel: Some((&value.rel).into()),
}
}
}
impl TryFrom<&proto::RelSizeRequest> for RelSizeRequest {
type Error = ProtocolError;
fn try_from(value: &proto::RelSizeRequest) -> Result<RelSizeRequest, ProtocolError> {
Ok(RelSizeRequest {
common: (&value.common.ok_or(ProtocolError::Missing("common"))?).into(),
rel: (&value.rel.ok_or(ProtocolError::Missing("rel"))?).try_into()?,
})
}
}
impl From<&GetPageRequest> for proto::GetPageRequest {
fn from(value: &GetPageRequest) -> proto::GetPageRequest {
proto::GetPageRequest {
common: Some((&value.common).into()),
rel: Some((&value.rel).into()),
block_number: value.block_number,
}
}
}
impl TryFrom<&proto::GetPageRequest> for GetPageRequest {
type Error = ProtocolError;
fn try_from(value: &proto::GetPageRequest) -> Result<GetPageRequest, ProtocolError> {
Ok(GetPageRequest {
common: (&value.common.ok_or(ProtocolError::Missing("common"))?).into(),
rel: (&value.rel.ok_or(ProtocolError::Missing("rel"))?).try_into()?,
block_number: value.block_number,
})
}
}
impl From<&DbSizeRequest> for proto::DbSizeRequest {
fn from(value: &DbSizeRequest) -> proto::DbSizeRequest {
proto::DbSizeRequest {
common: Some((&value.common).into()),
db_oid: value.db_oid,
}
}
}
impl TryFrom<&proto::DbSizeRequest> for DbSizeRequest {
type Error = ProtocolError;
fn try_from(value: &proto::DbSizeRequest) -> Result<DbSizeRequest, ProtocolError> {
Ok(DbSizeRequest {
common: (&value.common.ok_or(ProtocolError::Missing("common"))?).into(),
db_oid: value.db_oid,
})
}
}
impl From<&GetBaseBackupRequest> for proto::GetBaseBackupRequest {
fn from(value: &GetBaseBackupRequest) -> proto::GetBaseBackupRequest {
proto::GetBaseBackupRequest {
common: Some((&value.common).into()),
replica: value.replica,
}
}
}
impl TryFrom<&proto::GetBaseBackupRequest> for GetBaseBackupRequest {
type Error = ProtocolError;
fn try_from(
value: &proto::GetBaseBackupRequest,
) -> Result<GetBaseBackupRequest, ProtocolError> {
Ok(GetBaseBackupRequest {
common: (&value.common.ok_or(ProtocolError::Missing("common"))?).into(),
replica: value.replica,
})
}
}

View File

@@ -23,6 +23,8 @@ tokio.workspace = true
tokio-util.workspace = true
pageserver_client.workspace = true
pageserver_client_grpc.workspace = true
pageserver_data_api.workspace = true
pageserver_api.workspace = true
utils = { path = "../../libs/utils/" }
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -9,6 +9,9 @@ use anyhow::Context;
use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api::ForceAwaitLogicalSize;
use pageserver_client::page_service::BasebackupRequest;
use pageserver_client_grpc;
use pageserver_data_api::model::{GetBaseBackupRequest, RequestCommon};
use rand::prelude::*;
use tokio::sync::Barrier;
use tokio::task::JoinSet;
@@ -22,6 +25,8 @@ use crate::util::{request_stats, tokio_thread_local_stats};
/// basebackup@LatestLSN
#[derive(clap::Parser)]
pub(crate) struct Args {
#[clap(long, default_value = "false")]
grpc: bool,
#[clap(long, default_value = "http://localhost:9898")]
mgmt_api_endpoint: String,
#[clap(long, default_value = "postgres://postgres@localhost:64000")]
@@ -52,7 +57,7 @@ impl LiveStats {
struct Target {
timeline: TenantTimelineId,
lsn_range: Option<Range<Lsn>>,
lsn_range: Range<Lsn>,
}
#[derive(serde::Serialize)]
@@ -105,7 +110,7 @@ async fn main_impl(
anyhow::Ok(Target {
timeline,
// TODO: support lsn_range != latest LSN
lsn_range: Some(info.last_record_lsn..(info.last_record_lsn + 1)),
lsn_range: info.last_record_lsn..(info.last_record_lsn + 1),
})
}
});
@@ -149,14 +154,27 @@ async fn main_impl(
for tl in &timelines {
let (sender, receiver) = tokio::sync::mpsc::channel(1); // TODO: not sure what the implications of this are
work_senders.insert(tl, sender);
tasks.push(tokio::spawn(client(
args,
*tl,
Arc::clone(&start_work_barrier),
receiver,
Arc::clone(&all_work_done_barrier),
Arc::clone(&live_stats),
)));
let client_task = if args.grpc {
tokio::spawn(client_grpc(
args,
*tl,
Arc::clone(&start_work_barrier),
receiver,
Arc::clone(&all_work_done_barrier),
Arc::clone(&live_stats),
))
} else {
tokio::spawn(client(
args,
*tl,
Arc::clone(&start_work_barrier),
receiver,
Arc::clone(&all_work_done_barrier),
Arc::clone(&live_stats),
))
};
tasks.push(client_task);
}
let work_sender = async move {
@@ -165,7 +183,7 @@ async fn main_impl(
let (timeline, work) = {
let mut rng = rand::thread_rng();
let target = all_targets.choose(&mut rng).unwrap();
let lsn = target.lsn_range.clone().map(|r| rng.gen_range(r));
let lsn = rng.gen_range(target.lsn_range.clone());
(
target.timeline,
Work {
@@ -215,7 +233,7 @@ async fn main_impl(
#[derive(Copy, Clone)]
struct Work {
lsn: Option<Lsn>,
lsn: Lsn,
gzip: bool,
}
@@ -240,7 +258,7 @@ async fn client(
.basebackup(&BasebackupRequest {
tenant_id: timeline.tenant_id,
timeline_id: timeline.timeline_id,
lsn,
lsn: Some(lsn),
gzip,
})
.await
@@ -270,3 +288,71 @@ async fn client(
all_work_done_barrier.wait().await;
}
#[instrument(skip_all)]
async fn client_grpc(
args: &'static Args,
timeline: TenantTimelineId,
start_work_barrier: Arc<Barrier>,
mut work: tokio::sync::mpsc::Receiver<Work>,
all_work_done_barrier: Arc<Barrier>,
live_stats: Arc<LiveStats>,
) {
let shard_map = HashMap::from([(0, args.page_service_connstring.clone())]);
let client = pageserver_client_grpc::PageserverClient::new(
&timeline.tenant_id.to_string(),
&timeline.timeline_id.to_string(),
&None,
shard_map,
);
start_work_barrier.wait().await;
while let Some(Work { lsn, gzip }) = work.recv().await {
let start = Instant::now();
//tokio::time::sleep(std::time::Duration::from_secs(1)).await;
info!("starting get_base_backup");
let mut basebackup_stream = client
.get_base_backup(
&GetBaseBackupRequest {
common: RequestCommon {
request_lsn: lsn,
not_modified_since_lsn: lsn,
},
replica: false,
},
gzip,
)
.await
.with_context(|| format!("start basebackup for {timeline}"))
.unwrap()
.into_inner();
info!("starting receive");
use futures::StreamExt;
let mut size = 0;
let mut nchunks = 0;
while let Some(chunk) = basebackup_stream.next().await {
let chunk = chunk
.context("error during basebackup")
.unwrap();
size += chunk.chunk.len();
nchunks += 1;
}
info!(
"basebackup size is {} bytes, avg chunk size {} bytes",
size,
size as f32 / nchunks as f32
);
let elapsed = start.elapsed();
live_stats.inc();
STATS.with(|stats| {
stats.borrow().lock().unwrap().observe(elapsed).unwrap();
});
}
all_work_done_barrier.wait().await;
}

View File

@@ -1,4 +1,4 @@
use std::collections::{HashSet, VecDeque};
use std::collections::{HashMap, HashSet, VecDeque};
use std::future::Future;
use std::num::NonZeroUsize;
use std::pin::Pin;
@@ -8,6 +8,8 @@ use std::time::{Duration, Instant};
use anyhow::Context;
use camino::Utf8PathBuf;
use futures::StreamExt;
use futures::stream::FuturesOrdered;
use pageserver_api::key::Key;
use pageserver_api::keyspace::KeySpaceAccum;
use pageserver_api::models::{PagestreamGetPageRequest, PagestreamRequest};
@@ -25,6 +27,8 @@ use crate::util::{request_stats, tokio_thread_local_stats};
/// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace.
#[derive(clap::Parser)]
pub(crate) struct Args {
#[clap(long, default_value = "false")]
grpc: bool,
#[clap(long, default_value = "http://localhost:9898")]
mgmt_api_endpoint: String,
#[clap(long, default_value = "postgres://postgres@localhost:64000")]
@@ -295,7 +299,29 @@ async fn main_impl(
.unwrap();
Box::pin(async move {
client_libpq(args, worker_id, ss, cancel, rps_period, ranges, weights).await
if args.grpc {
client_grpc(
args,
worker_id,
ss,
cancel,
rps_period,
ranges,
weights,
)
.await
} else {
client_libpq(
args,
worker_id,
ss,
cancel,
rps_period,
ranges,
weights,
)
.await
}
})
};
@@ -434,3 +460,100 @@ async fn client_libpq(
}
}
}
async fn client_grpc(
args: &Args,
worker_id: WorkerId,
shared_state: Arc<SharedState>,
cancel: CancellationToken,
rps_period: Option<Duration>,
ranges: Vec<KeyRange>,
weights: rand::distributions::weighted::WeightedIndex<i128>,
) {
let shard_map = HashMap::from([(0, args.page_service_connstring.clone())]);
let client = pageserver_client_grpc::PageserverClient::new(
&worker_id.timeline.tenant_id.to_string(),
&worker_id.timeline.timeline_id.to_string(),
&None,
shard_map,
);
let client = Arc::new(client);
shared_state.start_work_barrier.wait().await;
let client_start = Instant::now();
let mut ticks_processed = 0;
let mut inflight = FuturesOrdered::new();
while !cancel.is_cancelled() {
// Detect if a request took longer than the RPS rate
if let Some(period) = &rps_period {
let periods_passed_until_now =
usize::try_from(client_start.elapsed().as_micros() / period.as_micros()).unwrap();
if periods_passed_until_now > ticks_processed {
shared_state
.live_stats
.missed((periods_passed_until_now - ticks_processed) as u64);
}
ticks_processed = periods_passed_until_now;
}
while inflight.len() < args.queue_depth.get() {
let start = Instant::now();
let req = {
let mut rng = rand::thread_rng();
let r = &ranges[weights.sample(&mut rng)];
let key: i128 = rng.gen_range(r.start..r.end);
let key = Key::from_i128(key);
assert!(key.is_rel_block_key());
let (rel_tag, block_no) = key
.to_rel_block()
.expect("we filter non-rel-block keys out above");
pageserver_data_api::model::GetPageRequest {
common: pageserver_data_api::model::RequestCommon {
request_lsn: if rng.gen_bool(args.req_latest_probability) {
Lsn::MAX
} else {
r.timeline_lsn
},
not_modified_since_lsn: r.timeline_lsn,
},
rel: pageserver_data_api::model::RelTag {
spc_oid: rel_tag.spcnode,
db_oid: rel_tag.dbnode,
rel_number: rel_tag.relnode,
fork_number: rel_tag.forknum,
},
block_number: block_no,
}
};
let client_clone = client.clone();
let getpage_fut = async move {
let result = client_clone.get_page(&req).await;
(start, result)
};
inflight.push_back(getpage_fut);
}
let (start, result) = inflight.next().await.unwrap();
result.expect("getpage request should succeed");
let end = Instant::now();
shared_state.live_stats.request_done();
ticks_processed += 1;
STATS.with(|stats| {
stats
.borrow()
.lock()
.unwrap()
.observe(end.duration_since(start))
.unwrap();
});
if let Some(period) = &rps_period {
let next_at = client_start
+ Duration::from_micros(
(ticks_processed) as u64 * u64::try_from(period.as_micros()).unwrap(),
);
tokio::time::sleep_until(next_at.into()).await;
}
}
}

View File

@@ -151,10 +151,14 @@ where
.map_err(|_| BasebackupError::Shutdown)?,
),
};
basebackup
let res = basebackup
.send_tarball()
.instrument(info_span!("send_tarball", backup_lsn=%backup_lsn))
.await
.await;
info!("basebackup done!");
res
}
/// This is short-living object only for the time of tarball creation,

View File

@@ -16,6 +16,7 @@ use http_utils::tls_certs::ReloadingCertificateResolver;
use metrics::launch_timestamp::{LaunchTimestamp, set_launch_timestamp_metric};
use metrics::set_build_info_metric;
use nix::sys::socket::{setsockopt, sockopt};
use pageserver::compute_service;
use pageserver::config::{PageServerConf, PageserverIdentity, ignored_fields};
use pageserver::controller_upcall_client::StorageControllerUpcallClient;
use pageserver::deletion_queue::DeletionQueue;
@@ -27,7 +28,7 @@ use pageserver::task_mgr::{
use pageserver::tenant::{TenantSharedResources, mgr, secondary};
use pageserver::{
CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, HttpsEndpointListener, http,
page_cache, page_service, task_mgr, virtual_file,
page_cache, task_mgr, virtual_file,
};
use postgres_backend::AuthType;
use remote_storage::GenericRemoteStorage;
@@ -745,7 +746,7 @@ fn start_pageserver(
// Spawn a task to listen for libpq connections. It will spawn further tasks
// for each connection. We created the listener earlier already.
let perf_trace_dispatch = otel_guard.as_ref().map(|g| g.dispatch.clone());
let page_service = page_service::spawn(
let compute_service = compute_service::spawn(
conf,
tenant_manager.clone(),
pg_auth,
@@ -782,7 +783,7 @@ fn start_pageserver(
pageserver::shutdown_pageserver(
http_endpoint_listener,
https_endpoint_listener,
page_service,
compute_service,
consumption_metrics_tasks,
disk_usage_eviction_task,
&tenant_manager,

View File

@@ -0,0 +1,286 @@
//!
//! The Compute Service listens for compute connections, and serves requests like
//! the GetPage@LSN requests.
//!
//! We support two protocols:
//!
//! 1. Legacy, connection-oriented libpq based protocol. That's
//! handled by the code in page_service.rs.
//!
//! 2. gRPC based protocol. See compute_service_grpc.rs.
//!
//! To make the transition smooth, without having to open up new firewall ports
//! etc, both protocols are served on the same port. When a new TCP connection
//! is accepted, we peek at the first few bytes incoming from the client to
//! determine which protocol it speaks.
//!
//! TODO: This gets easier once we drop the legacy protocol support. Or if we
//! open a separate port for the gRPC protocol.
use std::sync::Arc;
use anyhow::Context;
use futures::FutureExt;
use pageserver_api::config::PageServicePipeliningConfig;
use postgres_backend::AuthType;
use tokio::task::JoinHandle;
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::auth::SwappableJwtAuth;
use utils::sync::gate::{Gate, GateGuard};
use crate::compute_service_grpc::launch_compute_service_grpc_server;
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder};
use crate::page_service::libpq_page_service_conn_main;
use crate::task_mgr::{self, COMPUTE_REQUEST_RUNTIME, TaskKind};
use crate::tenant::mgr::TenantManager;
///////////////////////////////////////////////////////////////////////////////
pub type ConnectionHandlerResult = anyhow::Result<()>;
pub struct Connections {
cancel: CancellationToken,
tasks: tokio::task::JoinSet<ConnectionHandlerResult>,
gate: Gate,
}
impl Connections {
pub(crate) async fn shutdown(self) {
let Self {
cancel,
mut tasks,
gate,
} = self;
cancel.cancel();
while let Some(res) = tasks.join_next().await {
Self::handle_connection_completion(res);
}
gate.close().await;
}
fn handle_connection_completion(res: Result<anyhow::Result<()>, tokio::task::JoinError>) {
match res {
Ok(Ok(())) => {}
Ok(Err(e)) => error!("error in page_service connection task: {:?}", e),
Err(e) => error!("page_service connection task panicked: {:?}", e),
}
}
}
pub struct Listener {
cancel: CancellationToken,
/// Cancel the listener task through `cancel` to shut down the listener
/// and get a handle on the existing connections by awaiting this task.
task: JoinHandle<Connections>,
}
pub fn spawn(
conf: &'static PageServerConf,
tenant_manager: Arc<TenantManager>,
pg_auth: Option<Arc<SwappableJwtAuth>>,
perf_trace_dispatch: Option<Dispatch>,
tcp_listener: tokio::net::TcpListener,
tls_config: Option<Arc<rustls::ServerConfig>>,
) -> Listener {
let cancel = CancellationToken::new();
let libpq_ctx = RequestContext::todo_child(
TaskKind::LibpqEndpointListener,
// listener task shouldn't need to download anything. (We will
// create a separate sub-contexts for each connection, with their
// own download behavior. This context is used only to listen and
// accept connections.)
DownloadBehavior::Error,
);
let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
"compute connection listener",
compute_connection_listener_main(
conf,
tenant_manager,
pg_auth,
perf_trace_dispatch,
tcp_listener,
conf.pg_auth_type,
tls_config,
conf.page_service_pipelining.clone(),
libpq_ctx,
cancel.clone(),
)
.map(anyhow::Ok),
));
Listener { cancel, task }
}
impl Listener {
pub async fn stop_accepting(self) -> Connections {
self.cancel.cancel();
self.task
.await
.expect("unreachable: we wrap the listener task in task_mgr::exit_on_panic_or_error")
}
}
/// Listener loop. Listens for connections, and launches a new handler
/// task for each.
///
/// Returns upon cancellation via `listener_cancel`, handing back the set of
/// open connections.
///
#[allow(clippy::too_many_arguments)]
pub async fn compute_connection_listener_main(
conf: &'static PageServerConf,
tenant_manager: Arc<TenantManager>,
auth: Option<Arc<SwappableJwtAuth>>,
perf_trace_dispatch: Option<Dispatch>,
listener: tokio::net::TcpListener,
auth_type: AuthType,
tls_config: Option<Arc<rustls::ServerConfig>>,
pipelining_config: PageServicePipeliningConfig,
listener_ctx: RequestContext,
listener_cancel: CancellationToken,
) -> Connections {
let connections_cancel = CancellationToken::new();
let connections_gate = Gate::default();
let mut connection_handler_tasks = tokio::task::JoinSet::default();
// The per-connection handler tasks pass gRPC protocol
// connections to this channel. The tonic gRPC server reads the
// channel and takes over the connections from there.
let (grpc_connections_tx, grpc_connections_rx) = tokio::sync::mpsc::channel(1000);
// Set up the gRPC service
launch_compute_service_grpc_server(
grpc_connections_rx,
conf,
tenant_manager.clone(),
auth.clone(),
auth_type,
connections_cancel.clone(),
&listener_ctx,
);
// Main listener loop
loop {
let gate_guard = match connections_gate.enter() {
Ok(guard) => guard,
Err(_) => break,
};
let accepted = tokio::select! {
biased;
_ = listener_cancel.cancelled() => break,
next = connection_handler_tasks.join_next(), if !connection_handler_tasks.is_empty() => {
let res = next.expect("we don't poll while empty");
Connections::handle_connection_completion(res);
continue;
}
accepted = listener.accept() => accepted,
};
match accepted {
Ok((socket, peer_addr)) => {
// Connection established. Spawn a new task to handle it.
debug!("accepted connection from {}", peer_addr);
let local_auth = auth.clone();
let connection_ctx = RequestContextBuilder::from(&listener_ctx)
.task_kind(TaskKind::PageRequestHandler)
.download_behavior(DownloadBehavior::Download)
.perf_span_dispatch(perf_trace_dispatch.clone())
.detached_child();
connection_handler_tasks.spawn(page_service_conn_main(
conf,
tenant_manager.clone(),
local_auth,
socket,
auth_type,
tls_config.clone(),
pipelining_config.clone(),
connection_ctx,
connections_cancel.child_token(),
gate_guard,
grpc_connections_tx.clone(),
));
}
Err(err) => {
// accept() failed. Log the error, and loop back to retry on next connection.
error!("accept() failed: {:?}", err);
}
}
}
debug!("page_service listener loop terminated");
Connections {
cancel: connections_cancel,
tasks: connection_handler_tasks,
gate: connections_gate,
}
}
/// Handle a new incoming connection.
///
/// This peeks at the first few incoming bytes and dispatches the connection
/// to the legacy libpq handler or the new gRPC handler accordingly.
#[instrument(skip_all, fields(peer_addr, application_name, compute_mode))]
#[allow(clippy::too_many_arguments)]
pub async fn page_service_conn_main(
conf: &'static PageServerConf,
tenant_manager: Arc<TenantManager>,
auth: Option<Arc<SwappableJwtAuth>>,
socket: tokio::net::TcpStream,
auth_type: AuthType,
tls_config: Option<Arc<rustls::ServerConfig>>,
pipelining_config: PageServicePipeliningConfig,
connection_ctx: RequestContext,
cancel: CancellationToken,
gate_guard: GateGuard,
grpc_connections_tx: tokio::sync::mpsc::Sender<tokio::io::Result<tokio::net::TcpStream>>,
) -> ConnectionHandlerResult {
let mut buf: [u8; 4] = [0; 4];
socket
.set_nodelay(true)
.context("could not set TCP_NODELAY")?;
// Peek
socket.peek(&mut buf).await?;
let mut grpc = false;
if buf[0] == 0x16 {
// looks like a TLS handshake. Assume gRPC.
// XXX: Starting with v17, PostgreSQL also supports "direct TLS mode". But
// the compute doesn't use it.
grpc = true;
}
if buf[0] == b'G' || buf[0] == b'P' {
// Looks like 'GET' or 'POST'
// or 'PRI', indicating gRPC over HTTP/2 with prior knowledge
grpc = true;
}
// Dispatch
if grpc {
grpc_connections_tx.send(Ok(socket)).await?;
info!("connection sent to channel");
Ok(())
} else {
libpq_page_service_conn_main(
conf,
tenant_manager,
auth,
socket,
auth_type,
tls_config,
pipelining_config,
connection_ctx,
cancel,
gate_guard,
)
.await
}
}

View File

@@ -0,0 +1,746 @@
//!
//! Compute <-> Pageserver API handler. This is for the new gRPC-based protocol
//!
//! TODO:
//!
//! - Many of the API endpoints are still missing
//!
//! - This is very much not optimized.
//!
//! - Much of the code was copy-pasted from page_service.rs. Like the code to get the
//! Timeline object, and the JWT auth. Could refactor and share.
//!
//!
use std::pin::Pin;
use std::str::FromStr;
use std::sync::Arc;
use std::task::Poll;
use std::time::Duration;
use std::time::Instant;
use crate::TenantManager;
use crate::auth::check_permission;
use crate::basebackup;
use crate::basebackup::BasebackupError;
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder};
use crate::task_mgr::TaskKind;
use crate::tenant::Timeline;
use crate::tenant::mgr::ShardResolveResult;
use crate::tenant::mgr::ShardSelector;
use crate::tenant::storage_layer::IoConcurrency;
use crate::tenant::timeline::WaitLsnTimeout;
use tokio::io::{AsyncWriteExt, ReadHalf, SimplexStream};
use tokio::task::JoinHandle;
use tokio_util::codec::{Decoder, FramedRead};
use tokio_util::sync::CancellationToken;
use futures::stream::StreamExt;
use pageserver_data_api::model;
use pageserver_data_api::proto::page_service_server::PageService;
use pageserver_data_api::proto::page_service_server::PageServiceServer;
use anyhow::Context;
use bytes::BytesMut;
use jsonwebtoken::TokenData;
use tracing::Instrument;
use tracing::{debug, error};
use utils::auth::SwappableJwtAuth;
use utils::id::{TenantId, TenantTimelineId, TimelineId};
use utils::lsn::Lsn;
use utils::simple_rcu::RcuReadGuard;
use crate::tenant::PageReconstructError;
use postgres_ffi::BLCKSZ;
use tonic;
use tonic::codec::CompressionEncoding;
use tonic::service::interceptor::InterceptedService;
use pageserver_api::key::rel_block_to_key;
use crate::pgdatadir_mapping::Version;
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
use postgres_backend::AuthType;
pub use pageserver_data_api::proto;
pub(super) fn launch_compute_service_grpc_server(
tcp_connections_rx: tokio::sync::mpsc::Receiver<tokio::io::Result<tokio::net::TcpStream>>,
conf: &'static PageServerConf,
tenant_manager: Arc<TenantManager>,
auth: Option<Arc<SwappableJwtAuth>>,
auth_type: AuthType,
connections_cancel: CancellationToken,
listener_ctx: &RequestContext,
) {
// Set up the gRPC service
let service_ctx = RequestContextBuilder::from(listener_ctx)
.task_kind(TaskKind::PageRequestHandler)
.download_behavior(DownloadBehavior::Download)
.attached_child();
let service = crate::compute_service_grpc::PageServiceService {
conf,
tenant_mgr: tenant_manager.clone(),
ctx: Arc::new(service_ctx),
};
let authenticator = PageServiceAuthenticator {
auth: auth.clone(),
auth_type,
};
let server = InterceptedService::new(
PageServiceServer::new(service).send_compressed(CompressionEncoding::Gzip),
authenticator,
);
let cc = connections_cancel.clone();
tokio::spawn(async move {
tonic::transport::Server::builder()
.add_service(server)
.serve_with_incoming_shutdown(
tokio_stream::wrappers::ReceiverStream::new(tcp_connections_rx),
cc.cancelled(),
)
.await
});
}
struct PageServiceService {
conf: &'static PageServerConf,
tenant_mgr: Arc<TenantManager>,
ctx: Arc<RequestContext>,
}
/// Map a PageReconstructError (an error during a get operation) to an appropriate gRPC status.
impl From<PageReconstructError> for tonic::Status {
fn from(e: PageReconstructError) -> Self {
match e {
PageReconstructError::Other(err) => tonic::Status::unknown(err.to_string()),
PageReconstructError::AncestorLsnTimeout(_) => {
tonic::Status::unavailable(e.to_string())
}
PageReconstructError::Cancelled => tonic::Status::aborted(e.to_string()),
PageReconstructError::WalRedo(_) => tonic::Status::internal(e.to_string()),
PageReconstructError::MissingKey(_) => tonic::Status::internal(e.to_string()),
}
}
}
fn convert_reltag(value: &model::RelTag) -> pageserver_api::reltag::RelTag {
pageserver_api::reltag::RelTag {
spcnode: value.spc_oid,
dbnode: value.db_oid,
relnode: value.rel_number,
forknum: value.fork_number,
}
}
#[tonic::async_trait]
impl PageService for PageServiceService {
type GetBaseBackupStream = GetBaseBackupStream;
async fn rel_exists(
&self,
request: tonic::Request<proto::RelExistsRequest>,
) -> std::result::Result<tonic::Response<proto::RelExistsResponse>, tonic::Status> {
let ttid = self.extract_ttid(request.metadata())?;
let req: model::RelExistsRequest = request.get_ref().try_into()?;
let rel = convert_reltag(&req.rel);
let span = tracing::info_span!("rel_exists", tenant_id = %ttid.tenant_id, timeline_id = %ttid.timeline_id, rel = %rel, req_lsn = %req.common.request_lsn);
async {
let timeline = self.get_timeline(ttid, ShardSelector::Zero).await?;
let ctx = self.ctx.with_scope_timeline(&timeline);
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
&timeline,
req.common.request_lsn,
req.common.not_modified_since_lsn,
&latest_gc_cutoff_lsn,
&ctx,
)
.await?;
let exists = timeline
.get_rel_exists(rel, Version::Lsn(lsn), &ctx)
.await?;
Ok(tonic::Response::new(proto::RelExistsResponse { exists }))
}
.instrument(span)
.await
}
/// Returns size of a relation, as # of blocks
async fn rel_size(
&self,
request: tonic::Request<proto::RelSizeRequest>,
) -> std::result::Result<tonic::Response<proto::RelSizeResponse>, tonic::Status> {
let ttid = self.extract_ttid(request.metadata())?;
let req: model::RelSizeRequest = request.get_ref().try_into()?;
let rel = convert_reltag(&req.rel);
let span = tracing::info_span!("rel_size", tenant_id = %ttid.tenant_id, timeline_id = %ttid.timeline_id, rel = %rel, req_lsn = %req.common.request_lsn);
async {
let timeline = self.get_timeline(ttid, ShardSelector::Zero).await?;
let ctx = self.ctx.with_scope_timeline(&timeline);
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
&timeline,
req.common.request_lsn,
req.common.not_modified_since_lsn,
&latest_gc_cutoff_lsn,
&ctx,
)
.await?;
let num_blocks = timeline.get_rel_size(rel, Version::Lsn(lsn), &ctx).await?;
Ok(tonic::Response::new(proto::RelSizeResponse { num_blocks }))
}
.instrument(span)
.await
}
async fn get_page(
&self,
request: tonic::Request<proto::GetPageRequest>,
) -> std::result::Result<tonic::Response<proto::GetPageResponse>, tonic::Status> {
let ttid = self.extract_ttid(request.metadata())?;
let req: model::GetPageRequest = request.get_ref().try_into()?;
// Calculate shard number.
//
// FIXME: this should probably be part of the data_api crate.
let rel = convert_reltag(&req.rel);
let key = rel_block_to_key(rel, req.block_number);
let timeline = self.get_timeline(ttid, ShardSelector::Page(key)).await?;
let ctx = self.ctx.with_scope_timeline(&timeline);
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
&timeline,
req.common.request_lsn,
req.common.not_modified_since_lsn,
&latest_gc_cutoff_lsn,
&ctx,
)
.await?;
let shard_id = timeline.tenant_shard_id.shard_number;
let span = tracing::info_span!("get_page", tenant_id = %ttid.tenant_id, shard_id = %shard_id, timeline_id = %ttid.timeline_id, rel = %rel, block_number = %req.block_number, req_lsn = %req.common.request_lsn);
async {
let gate_guard = match timeline.gate.enter() {
Ok(guard) => guard,
Err(_) => {
return Err(tonic::Status::unavailable("timeline is shutting down"));
}
};
let io_concurrency = IoConcurrency::spawn_from_conf(self.conf, gate_guard);
let page_image = timeline
.get_rel_page_at_lsn(
rel,
req.block_number,
Version::Lsn(lsn),
&ctx,
io_concurrency,
)
.await?;
Ok(tonic::Response::new(proto::GetPageResponse {
page_image,
}))
}
.instrument(span)
.await
}
async fn db_size(
&self,
request: tonic::Request<proto::DbSizeRequest>,
) -> Result<tonic::Response<proto::DbSizeResponse>, tonic::Status> {
let ttid = self.extract_ttid(request.metadata())?;
let req: model::DbSizeRequest = request.get_ref().try_into()?;
let span = tracing::info_span!("db_size", tenant_id = %ttid.tenant_id, timeline_id = %ttid.timeline_id, db_oid = %req.db_oid, req_lsn = %req.common.request_lsn);
async {
let timeline = self.get_timeline(ttid, ShardSelector::Zero).await?;
let ctx = self.ctx.with_scope_timeline(&timeline);
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
&timeline,
req.common.request_lsn,
req.common.not_modified_since_lsn,
&latest_gc_cutoff_lsn,
&ctx,
)
.await?;
let total_blocks = timeline
.get_db_size(DEFAULTTABLESPACE_OID, req.db_oid, Version::Lsn(lsn), &ctx)
.await?;
Ok(tonic::Response::new(proto::DbSizeResponse {
num_bytes: total_blocks as u64 * BLCKSZ as u64,
}))
}
.instrument(span)
.await
}
async fn get_base_backup(
&self,
request: tonic::Request<proto::GetBaseBackupRequest>,
) -> Result<tonic::Response<Self::GetBaseBackupStream>, tonic::Status> {
let ttid = self.extract_ttid(request.metadata())?;
let req: model::GetBaseBackupRequest = request.get_ref().try_into()?;
let timeline = self.get_timeline(ttid, ShardSelector::Zero).await?;
let ctx = self.ctx.with_scope_timeline(&timeline);
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
&timeline,
req.common.request_lsn,
req.common.not_modified_since_lsn,
&latest_gc_cutoff_lsn,
&ctx,
)
.await?;
let span = tracing::info_span!("get_base_backup", tenant_id = %ttid.tenant_id, timeline_id = %ttid.timeline_id, req_lsn = %req.common.request_lsn);
tracing::info!("starting basebackup");
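// TestMode selects between the real streaming basebackup and the dummy/materialized
// variants kept around for testing and benchmarking; Streaming, chosen below, is the
// variant actually in effect.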
#[allow(dead_code)]
enum TestMode {
/// Create real basebackup, in streaming fashion
Streaming,
/// Create real basebackup, but fully materialize it in the 'simplex' pipe buffer first
Materialize,
/// Create a dummy all-zeros basebackup, in streaming fashion
DummyStreaming,
/// Create a dummy all-zeros basebackup, but fully materialize it first
DummyMaterialize,
}
let mode = TestMode::Streaming;
let buf_size = match mode {
TestMode::Streaming | TestMode::DummyStreaming => 64 * 1024,
TestMode::Materialize | TestMode::DummyMaterialize => 64 * 1024 * 1024,
};
let (simplex_read, mut simplex_write) = tokio::io::simplex(buf_size);
let basebackup_task = match mode {
TestMode::DummyStreaming => {
tokio::spawn(
async move {
// hold onto the guard for as long as the basebackup runs
let _latest_gc_cutoff_lsn = latest_gc_cutoff_lsn;
let zerosbuf: [u8; 1024] = [0; 1024];
let nbytes = 16900000;
let mut bytes_written = 0;
while bytes_written < nbytes {
let s = std::cmp::min(1024, nbytes - bytes_written);
let _ = simplex_write.write_all(&zerosbuf[0..s]).await;
bytes_written += s;
}
simplex_write
.shutdown()
.await
.context("shutdown of basebackup pipe")?;
Ok(())
}
.instrument(span),
)
}
TestMode::DummyMaterialize => {
let zerosbuf: [u8; 1024] = [0; 1024];
let nbytes = 16900000;
let mut bytes_written = 0;
while bytes_written < nbytes {
let s = std::cmp::min(1024, nbytes - bytes_written);
let _ = simplex_write.write_all(&zerosbuf[0..s]).await;
bytes_written += s;
}
simplex_write
.shutdown()
.await
.expect("shutdown of basebackup pipe");
tracing::info!("basebackup (dummy) materialized");
let result = Ok(());
tokio::spawn(std::future::ready(result))
}
TestMode::Materialize => {
let result = basebackup::send_basebackup_tarball(
&mut simplex_write,
&timeline,
Some(lsn),
None,
false,
req.replica,
&ctx,
)
.await;
simplex_write
.shutdown()
.await
.expect("shutdown of basebackup pipe");
tracing::info!("basebackup materialized");
// Launch a task that writes the basebackup tarball to the simplex pipe
tokio::spawn(std::future::ready(result))
}
TestMode::Streaming => {
tokio::spawn(
async move {
// hold onto the guard for as long as the basebackup runs
let _latest_gc_cutoff_lsn = latest_gc_cutoff_lsn;
let result = basebackup::send_basebackup_tarball(
&mut simplex_write,
&timeline,
Some(lsn),
None,
false,
req.replica,
&ctx,
)
.await;
simplex_write
.shutdown()
.await
.context("shutdown of basebackup pipe")?;
result
}
.instrument(span),
)
}
};
let response = new_basebackup_response_stream(simplex_read, basebackup_task);
Ok(tonic::Response::new(response))
}
}
/// NB: this is a different value than [`crate::http::routes::ACTIVE_TENANT_TIMEOUT`].
/// NB: and also different from page_service::ACTIVE_TENANT_TIMEOUT
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
impl PageServiceService {
async fn get_timeline(
&self,
ttid: TenantTimelineId,
shard_selector: ShardSelector,
) -> Result<Arc<Timeline>, tonic::Status> {
let timeout = ACTIVE_TENANT_TIMEOUT;
let wait_start = Instant::now();
let deadline = wait_start + timeout;
let tenant_shard = loop {
let resolved = self
.tenant_mgr
.resolve_attached_shard(&ttid.tenant_id, shard_selector);
match resolved {
ShardResolveResult::Found(tenant_shard) => break tenant_shard,
ShardResolveResult::NotFound => {
return Err(tonic::Status::not_found("tenant not found"));
}
ShardResolveResult::InProgress(barrier) => {
// We can't authoritatively answer right now: wait for InProgress state
// to end, then try again
tokio::select! {
_ = barrier.wait() => {
// The barrier completed: proceed around the loop to try looking up again
},
_ = tokio::time::sleep(deadline.duration_since(Instant::now())) => {
return Err(tonic::Status::unavailable("tenant is in InProgress state"));
}
}
}
}
};
tracing::debug!("Waiting for tenant to enter active state...");
tenant_shard
.wait_to_become_active(deadline.duration_since(Instant::now()))
.await
.map_err(|e| {
tonic::Status::unavailable(format!("tenant is not in active state: {e}"))
})?;
let timeline = tenant_shard
.get_timeline(ttid.timeline_id, true)
.map_err(|e| tonic::Status::unavailable(format!("could not get timeline: {e}")))?;
// FIXME: need to do something with the 'gate' here?
Ok(timeline)
}
/// Extract TenantTimelineId from the request metadata
///
/// Note: the interceptor has already authenticated the request
///
/// TODO: Could we use "binary" metadata for these, for efficiency? gRPC has such a concept
fn extract_ttid(
&self,
metadata: &tonic::metadata::MetadataMap,
) -> Result<TenantTimelineId, tonic::Status> {
let tenant_id = metadata
.get("neon-tenant-id")
.ok_or(tonic::Status::invalid_argument(
"neon-tenant-id metadata missing",
))?;
let tenant_id = tenant_id.to_str().map_err(|_| {
tonic::Status::invalid_argument("invalid UTF-8 characters in neon-tenant-id metadata")
})?;
let tenant_id = TenantId::from_str(tenant_id)
.map_err(|_| tonic::Status::invalid_argument("invalid neon-tenant-id metadata"))?;
let timeline_id =
metadata
.get("neon-timeline-id")
.ok_or(tonic::Status::invalid_argument(
"neon-timeline-id metadata missing",
))?;
let timeline_id = timeline_id.to_str().map_err(|_| {
tonic::Status::invalid_argument("invalid UTF-8 characters in neon-timeline-id metadata")
})?;
let timeline_id = TimelineId::from_str(timeline_id)
.map_err(|_| tonic::Status::invalid_argument("invalid neon-timeline-id metadata"))?;
Ok(TenantTimelineId::new(tenant_id, timeline_id))
}
// XXX: copied from PageServerHandler
async fn wait_or_get_last_lsn(
timeline: &Timeline,
request_lsn: Lsn,
not_modified_since: Lsn,
latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
ctx: &RequestContext,
) -> Result<Lsn, tonic::Status> {
let last_record_lsn = timeline.get_last_record_lsn();
// Sanity check the request
if request_lsn < not_modified_since {
return Err(tonic::Status::invalid_argument(format!(
"invalid request with request LSN {} and not_modified_since {}",
request_lsn, not_modified_since,
)));
}
// Check explicitly for INVALID just to get a less scary error message if the request is obviously bogus
if request_lsn == Lsn::INVALID {
return Err(tonic::Status::invalid_argument("invalid LSN(0) in request"));
}
// Clients should only read from recent LSNs on their timeline, or from locations holding an LSN lease.
//
// We may have older data available, but we make a best effort to detect this case and return an error,
// to distinguish a misbehaving client (asking for old LSN) from a storage issue (data missing at a legitimate LSN).
if request_lsn < **latest_gc_cutoff_lsn && !timeline.is_gc_blocked_by_lsn_lease_deadline() {
let gc_info = &timeline.gc_info.read().unwrap();
if !gc_info.lsn_covered_by_lease(request_lsn) {
return Err(tonic::Status::not_found(format!(
"tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
request_lsn, **latest_gc_cutoff_lsn
)));
}
}
// Wait for WAL up to 'not_modified_since' to arrive, if necessary
if not_modified_since > last_record_lsn {
timeline
.wait_lsn(
not_modified_since,
crate::tenant::timeline::WaitLsnWaiter::PageService,
WaitLsnTimeout::Default,
ctx,
)
.await
.map_err(|_| {
tonic::Status::unavailable("not_modified_since LSN not arrived yet")
})?;
// Since we waited for 'not_modified_since' to arrive, that is now the last
// record LSN. (Or close enough for our purposes; the last-record LSN can
// advance immediately after we return anyway)
Ok(not_modified_since)
} else {
// It might be better to use max(not_modified_since, latest_gc_cutoff_lsn)
// here instead. That would give the same result, since we know that there
// haven't been any modifications since 'not_modified_since'. Using an older
// LSN might be faster, because that could allow skipping recent layers when
// finding the page. However, we have historically used 'last_record_lsn', so
// stick to that for now.
Ok(std::cmp::min(last_record_lsn, request_lsn))
}
}
}
#[derive(Clone)]
pub struct PageServiceAuthenticator {
pub auth: Option<Arc<SwappableJwtAuth>>,
pub auth_type: AuthType,
}
impl tonic::service::Interceptor for PageServiceAuthenticator {
fn call(
&mut self,
req: tonic::Request<()>,
) -> std::result::Result<tonic::Request<()>, tonic::Status> {
// Check the tenant_id in any case
let tenant_id =
req.metadata()
.get("neon-tenant-id")
.ok_or(tonic::Status::invalid_argument(
"neon-tenant-id metadata missing",
))?;
let tenant_id = tenant_id.to_str().map_err(|_| {
tonic::Status::invalid_argument("invalid UTF-8 characters in neon-tenant-id metadata")
})?;
let tenant_id = TenantId::from_str(tenant_id)
.map_err(|_| tonic::Status::invalid_argument("invalid neon-tenant-id metadata"))?;
// when accessing management api supply None as an argument
// when using to authorize tenant pass corresponding tenant id
let auth = if let Some(auth) = &self.auth {
auth
} else {
// auth is set to Trust, nothing to check so just return ok
return Ok(req);
};
let jwt = req
.metadata()
.get("neon-auth-token")
.ok_or(tonic::Status::unauthenticated("no neon-auth-token"))?;
let jwt = jwt.to_str().map_err(|_| {
tonic::Status::invalid_argument("invalid UTF-8 characters in neon-auth-token metadata")
})?;
let jwtdata: TokenData<utils::auth::Claims> = auth
.decode(jwt)
.map_err(|err| tonic::Status::unauthenticated(format!("invalid JWT token: {}", err)))?;
let claims = jwtdata.claims;
if matches!(claims.scope, utils::auth::Scope::Tenant) && claims.tenant_id.is_none() {
return Err(tonic::Status::unauthenticated(
"jwt token scope is Tenant, but tenant id is missing",
));
}
debug!(
"jwt scope check succeeded for scope: {:#?} by tenant id: {:?}",
claims.scope, claims.tenant_id,
);
// The token is valid. Check if it's allowed to access the tenant ID
// given in the request.
check_permission(&claims, Some(tenant_id))
.map_err(|err| tonic::Status::permission_denied(err.to_string()))?;
// All checks out
Ok(req)
}
}
/// Stream of GetBaseBackupResponseChunk messages.
///
/// The first part of the Chain chunks the tarball. The second part checks the return value
/// of the send_basebackup_tarball Future that created the tarball.
type GetBaseBackupStream = futures::stream::Chain<BasebackupChunkedStream, CheckResultStream>;
fn new_basebackup_response_stream(
simplex_read: ReadHalf<SimplexStream>,
basebackup_task: JoinHandle<Result<(), BasebackupError>>,
) -> GetBaseBackupStream {
let framed = FramedRead::new(simplex_read, GetBaseBackupResponseDecoder {});
framed.chain(CheckResultStream { basebackup_task })
}
/// Stream that uses GetBaseBackupResponseDecoder
type BasebackupChunkedStream =
tokio_util::codec::FramedRead<ReadHalf<SimplexStream>, GetBaseBackupResponseDecoder>;
struct GetBaseBackupResponseDecoder;
impl Decoder for GetBaseBackupResponseDecoder {
type Item = proto::GetBaseBackupResponseChunk;
type Error = tonic::Status;
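// Emit a chunk only once at least 64 KiB has accumulated in the buffer;
// decode_eof() below flushes whatever remains when the pipe is closed.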
fn decode(&mut self, src: &mut BytesMut) -> Result<Option<Self::Item>, Self::Error> {
if src.len() < 64 * 1024 {
return Ok(None);
}
let item = proto::GetBaseBackupResponseChunk {
chunk: bytes::Bytes::from(std::mem::take(src)),
};
Ok(Some(item))
}
fn decode_eof(&mut self, src: &mut BytesMut) -> Result<Option<Self::Item>, Self::Error> {
if src.is_empty() {
return Ok(None);
}
let item = proto::GetBaseBackupResponseChunk {
chunk: bytes::Bytes::from(std::mem::take(src)),
};
Ok(Some(item))
}
}
struct CheckResultStream {
basebackup_task: tokio::task::JoinHandle<Result<(), BasebackupError>>,
}
impl futures::Stream for CheckResultStream {
type Item = Result<proto::GetBaseBackupResponseChunk, tonic::Status>;
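// This stream yields no data chunks of its own: it resolves once the basebackup
// task finishes, ending the combined stream cleanly on success or surfacing the
// task's failure (or panic) as a single gRPC error status.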
fn poll_next(
mut self: Pin<&mut Self>,
ctx: &mut std::task::Context<'_>,
) -> Poll<Option<Self::Item>> {
let task = Pin::new(&mut self.basebackup_task);
match task.poll(ctx) {
Poll::Pending => Poll::Pending,
Poll::Ready(Ok(Ok(()))) => Poll::Ready(None),
Poll::Ready(Ok(Err(basebackup_err))) => {
error!(error=%basebackup_err, "error getting basebackup");
Poll::Ready(Some(Err(tonic::Status::internal(
"could not get basebackup",
))))
}
Poll::Ready(Err(join_err)) => {
error!(error=%join_err, "JoinError getting basebackup");
Poll::Ready(Some(Err(tonic::Status::internal(
"could not get basebackup",
))))
}
}
}
}

View File

@@ -21,6 +21,8 @@ pub use pageserver_api::keyspace;
use tokio_util::sync::CancellationToken;
mod assert_u64_eq_usize;
pub mod aux_file;
pub mod compute_service;
pub mod compute_service_grpc;
pub mod metrics;
pub mod page_cache;
pub mod page_service;
@@ -82,7 +84,7 @@ impl CancellableTask {
pub async fn shutdown_pageserver(
http_listener: HttpEndpointListener,
https_listener: Option<HttpsEndpointListener>,
page_service: page_service::Listener,
compute_service: compute_service::Listener,
consumption_metrics_worker: ConsumptionMetricsTasks,
disk_usage_eviction_task: Option<DiskUsageEvictionTask>,
tenant_manager: &TenantManager,
@@ -167,11 +169,11 @@ pub async fn shutdown_pageserver(
}
});
// Shut down the libpq endpoint task. This prevents new connections from
// Shut down the compute service endpoint task. This prevents new connections from
// being accepted.
let remaining_connections = timed(
page_service.stop_accepting(),
"shutdown LibpqEndpointListener",
compute_service.stop_accepting(),
"shutdown compute service listener",
Duration::from_secs(1),
)
.await;

View File

@@ -13,7 +13,6 @@ use crate::PERF_TRACE_TARGET;
use anyhow::{Context, bail};
use async_compression::tokio::write::GzipEncoder;
use bytes::Buf;
use futures::FutureExt;
use itertools::Itertools;
use jsonwebtoken::TokenData;
use once_cell::sync::OnceCell;
@@ -40,7 +39,6 @@ use pq_proto::framed::ConnectionError;
use pq_proto::{BeMessage, FeMessage, FeStartupPacket, RowDescriptor};
use strum_macros::IntoStaticStr;
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufWriter};
use tokio::task::JoinHandle;
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::auth::{Claims, Scope, SwappableJwtAuth};
@@ -49,15 +47,13 @@ use utils::id::{TenantId, TimelineId};
use utils::logging::log_slow;
use utils::lsn::Lsn;
use utils::simple_rcu::RcuReadGuard;
use utils::sync::gate::{Gate, GateGuard};
use utils::sync::gate::GateGuard;
use utils::sync::spsc_fold;
use crate::auth::check_permission;
use crate::basebackup::BasebackupError;
use crate::config::PageServerConf;
use crate::context::{
DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
};
use crate::context::{PerfInstrumentFutureExt, RequestContext, RequestContextBuilder};
use crate::metrics::{
self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS,
SmgrOpTimer, TimelineMetrics,
@@ -67,7 +63,6 @@ use crate::span::{
debug_assert_current_span_has_tenant_and_timeline_id,
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id,
};
use crate::task_mgr::{self, COMPUTE_REQUEST_RUNTIME, TaskKind};
use crate::tenant::mgr::{
GetActiveTenantError, GetTenantError, ShardResolveResult, ShardSelector, TenantManager,
};
@@ -85,171 +80,6 @@ const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
/// Threshold at which to log slow GetPage requests.
const LOG_SLOW_GETPAGE_THRESHOLD: Duration = Duration::from_secs(30);
///////////////////////////////////////////////////////////////////////////////
pub struct Listener {
cancel: CancellationToken,
/// Cancel the listener task through `listen_cancel` to shut down the listener
/// and get a handle on the existing connections.
task: JoinHandle<Connections>,
}
pub struct Connections {
cancel: CancellationToken,
tasks: tokio::task::JoinSet<ConnectionHandlerResult>,
gate: Gate,
}
pub fn spawn(
conf: &'static PageServerConf,
tenant_manager: Arc<TenantManager>,
pg_auth: Option<Arc<SwappableJwtAuth>>,
perf_trace_dispatch: Option<Dispatch>,
tcp_listener: tokio::net::TcpListener,
tls_config: Option<Arc<rustls::ServerConfig>>,
) -> Listener {
let cancel = CancellationToken::new();
let libpq_ctx = RequestContext::todo_child(
TaskKind::LibpqEndpointListener,
// listener task shouldn't need to download anything. (We will
// create a separate sub-contexts for each connection, with their
// own download behavior. This context is used only to listen and
// accept connections.)
DownloadBehavior::Error,
);
let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
"libpq listener",
libpq_listener_main(
conf,
tenant_manager,
pg_auth,
perf_trace_dispatch,
tcp_listener,
conf.pg_auth_type,
tls_config,
conf.page_service_pipelining.clone(),
libpq_ctx,
cancel.clone(),
)
.map(anyhow::Ok),
));
Listener { cancel, task }
}
impl Listener {
pub async fn stop_accepting(self) -> Connections {
self.cancel.cancel();
self.task
.await
.expect("unreachable: we wrap the listener task in task_mgr::exit_on_panic_or_error")
}
}
impl Connections {
pub(crate) async fn shutdown(self) {
let Self {
cancel,
mut tasks,
gate,
} = self;
cancel.cancel();
while let Some(res) = tasks.join_next().await {
Self::handle_connection_completion(res);
}
gate.close().await;
}
fn handle_connection_completion(res: Result<anyhow::Result<()>, tokio::task::JoinError>) {
match res {
Ok(Ok(())) => {}
Ok(Err(e)) => error!("error in page_service connection task: {:?}", e),
Err(e) => error!("page_service connection task panicked: {:?}", e),
}
}
}
///
/// Main loop of the page service.
///
/// Listens for connections, and launches a new handler task for each.
///
/// Returns Ok(()) upon cancellation via `cancel`, returning the set of
/// open connections.
///
#[allow(clippy::too_many_arguments)]
pub async fn libpq_listener_main(
conf: &'static PageServerConf,
tenant_manager: Arc<TenantManager>,
auth: Option<Arc<SwappableJwtAuth>>,
perf_trace_dispatch: Option<Dispatch>,
listener: tokio::net::TcpListener,
auth_type: AuthType,
tls_config: Option<Arc<rustls::ServerConfig>>,
pipelining_config: PageServicePipeliningConfig,
listener_ctx: RequestContext,
listener_cancel: CancellationToken,
) -> Connections {
let connections_cancel = CancellationToken::new();
let connections_gate = Gate::default();
let mut connection_handler_tasks = tokio::task::JoinSet::default();
loop {
let gate_guard = match connections_gate.enter() {
Ok(guard) => guard,
Err(_) => break,
};
let accepted = tokio::select! {
biased;
_ = listener_cancel.cancelled() => break,
next = connection_handler_tasks.join_next(), if !connection_handler_tasks.is_empty() => {
let res = next.expect("we dont poll while empty");
Connections::handle_connection_completion(res);
continue;
}
accepted = listener.accept() => accepted,
};
match accepted {
Ok((socket, peer_addr)) => {
// Connection established. Spawn a new task to handle it.
debug!("accepted connection from {}", peer_addr);
let local_auth = auth.clone();
let connection_ctx = RequestContextBuilder::from(&listener_ctx)
.task_kind(TaskKind::PageRequestHandler)
.download_behavior(DownloadBehavior::Download)
.perf_span_dispatch(perf_trace_dispatch.clone())
.detached_child();
connection_handler_tasks.spawn(page_service_conn_main(
conf,
tenant_manager.clone(),
local_auth,
socket,
auth_type,
tls_config.clone(),
pipelining_config.clone(),
connection_ctx,
connections_cancel.child_token(),
gate_guard,
));
}
Err(err) => {
// accept() failed. Log the error, and loop back to retry on next connection.
error!("accept() failed: {:?}", err);
}
}
}
debug!("page_service listener loop terminated");
Connections {
cancel: connections_cancel,
tasks: connection_handler_tasks,
gate: connections_gate,
}
}
type ConnectionHandlerResult = anyhow::Result<()>;
/// Perf root spans start at the per-request level, after shard routing.
@@ -261,9 +91,10 @@ struct ConnectionPerfSpanFields {
compute_mode: Option<String>,
}
/// note: the caller has already set TCP_NODELAY on the socket
#[instrument(skip_all, fields(peer_addr, application_name, compute_mode))]
#[allow(clippy::too_many_arguments)]
async fn page_service_conn_main(
pub async fn libpq_page_service_conn_main(
conf: &'static PageServerConf,
tenant_manager: Arc<TenantManager>,
auth: Option<Arc<SwappableJwtAuth>>,
@@ -279,10 +110,6 @@ async fn page_service_conn_main(
.with_label_values(&["page_service"])
.guard();
socket
.set_nodelay(true)
.context("could not set TCP_NODELAY")?;
let socket_fd = socket.as_raw_fd();
let peer_addr = socket.peer_addr().context("get peer address")?;
@@ -393,7 +220,7 @@ struct PageServerHandler {
gate_guard: GateGuard,
}
struct TimelineHandles {
pub struct TimelineHandles {
wrapper: TenantManagerWrapper,
/// Note on size: the typical size of this map is 1. The largest size we expect
/// to see is the number of shards divided by the number of pageservers (typically < 2),

View File

@@ -1,10 +1,10 @@
# pgxs/neon/Makefile
MODULE_big = neon
OBJS = \
$(WIN32RES) \
communicator.o \
communicator_new.o \
extension_server.o \
file_cache.o \
hll.o \
@@ -22,7 +22,8 @@ OBJS = \
walproposer.o \
walproposer_pg.o \
control_plane_connector.o \
walsender_hooks.o
walsender_hooks.o \
$(LIBCOMMUNICATOR_PATH)/libcommunicator.a
PG_CPPFLAGS = -I$(libpq_srcdir)
SHLIB_LINK_INTERNAL = $(libpq)

372
pgxn/neon/communicator/Cargo.lock generated Normal file
View File

@@ -0,0 +1,372 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 4
[[package]]
name = "addr2line"
version = "0.24.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1"
dependencies = [
"gimli",
]
[[package]]
name = "adler2"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627"
[[package]]
name = "backtrace"
version = "0.3.74"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8d82cb332cdfaed17ae235a638438ac4d4839913cc2af585c3c6746e8f8bee1a"
dependencies = [
"addr2line",
"cfg-if",
"libc",
"miniz_oxide",
"object",
"rustc-demangle",
"windows-targets",
]
[[package]]
name = "base64"
version = "0.22.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
[[package]]
name = "bytes"
version = "1.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a"
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "communicator"
version = "0.1.0"
dependencies = [
"tonic",
]
[[package]]
name = "fnv"
version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
[[package]]
name = "futures-core"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
[[package]]
name = "gimli"
version = "0.31.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f"
[[package]]
name = "http"
version = "1.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f4a85d31aea989eead29a3aaf9e1115a180df8282431156e533de47660892565"
dependencies = [
"bytes",
"fnv",
"itoa",
]
[[package]]
name = "http-body"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184"
dependencies = [
"bytes",
"http",
]
[[package]]
name = "http-body-util"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a"
dependencies = [
"bytes",
"futures-core",
"http",
"http-body",
"pin-project-lite",
]
[[package]]
name = "itoa"
version = "1.0.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
[[package]]
name = "libc"
version = "0.2.171"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c19937216e9d3aa9956d9bb8dfc0b0c8beb6058fc4f7a4dc4d850edf86a237d6"
[[package]]
name = "memchr"
version = "2.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
[[package]]
name = "miniz_oxide"
version = "0.8.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff70ce3e48ae43fa075863cef62e8b43b71a4f2382229920e0df362592919430"
dependencies = [
"adler2",
]
[[package]]
name = "object"
version = "0.36.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87"
dependencies = [
"memchr",
]
[[package]]
name = "once_cell"
version = "1.21.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
[[package]]
name = "percent-encoding"
version = "2.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
[[package]]
name = "pin-project"
version = "1.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a"
dependencies = [
"pin-project-internal",
]
[[package]]
name = "pin-project-internal"
version = "1.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "pin-project-lite"
version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b"
[[package]]
name = "proc-macro2"
version = "1.0.94"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a31971752e70b8b2686d7e46ec17fb38dad4051d94024c88df49b667caea9c84"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d"
dependencies = [
"proc-macro2",
]
[[package]]
name = "rustc-demangle"
version = "0.1.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f"
[[package]]
name = "syn"
version = "2.0.100"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "tokio"
version = "1.44.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6b88822cbe49de4185e3a4cbf8321dd487cf5fe0c5c65695fef6346371e9c48"
dependencies = [
"backtrace",
"pin-project-lite",
]
[[package]]
name = "tokio-stream"
version = "0.1.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eca58d7bba4a75707817a2c44174253f9236b2d5fbd055602e9d5c07c139a047"
dependencies = [
"futures-core",
"pin-project-lite",
"tokio",
]
[[package]]
name = "tonic"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85839f0b32fd242bb3209262371d07feda6d780d16ee9d2bc88581b89da1549b"
dependencies = [
"base64",
"bytes",
"http",
"http-body",
"http-body-util",
"percent-encoding",
"pin-project",
"tokio-stream",
"tower-layer",
"tower-service",
"tracing",
]
[[package]]
name = "tower-layer"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e"
[[package]]
name = "tower-service"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3"
[[package]]
name = "tracing"
version = "0.1.41"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0"
dependencies = [
"pin-project-lite",
"tracing-attributes",
"tracing-core",
]
[[package]]
name = "tracing-attributes"
version = "0.1.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "tracing-core"
version = "0.1.33"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c"
dependencies = [
"once_cell",
]
[[package]]
name = "unicode-ident"
version = "1.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
[[package]]
name = "windows-targets"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_gnullvm",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_gnullvm",
"windows_x86_64_msvc",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
[[package]]
name = "windows_i686_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
[[package]]
name = "windows_i686_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
[[package]]
name = "windows_i686_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"

View File

@@ -0,0 +1,35 @@
[package]
name = "communicator"
version = "0.1.0"
edition = "2024"
[lib]
crate-type = ["staticlib"]
[dependencies]
bytes.workspace = true
http.workspace = true
libc.workspace = true
nix.workspace = true
atomic_enum = "0.3.0"
prost.workspace = true
tonic = { version = "0.12.0", default-features = false, features=["codegen", "prost", "transport"] }
tokio = { version = "1.43.1", features = ["macros", "net", "io-util", "rt", "rt-multi-thread"] }
tokio-pipe = { version = "0.2.12" }
thiserror.workspace = true
tracing.workspace = true
tracing-subscriber.workspace = true
zerocopy = "0.8.0"
zerocopy-derive = "0.8.0"
tokio-epoll-uring.workspace = true
uring-common.workspace = true
pageserver_client_grpc.workspace = true
pageserver_data_api.workspace = true
neonart.workspace = true
utils.workspace = true
[build-dependencies]
cbindgen.workspace = true

View File

@@ -0,0 +1,123 @@
# Communicator
This package provides the so-called "compute-pageserver communicator",
or just "communicator" for short. It runs in a PostgreSQL server, as
part of the neon extension, and handles the communication with the
pageservers. On the PostgreSQL side, the glue code in pgxn/neon/ uses
the communicator to implement the PostgreSQL Storage Manager (SMGR)
interface.
## Design criteria
- Low latency
- Saturate a 10 Gbit/s network interface without becoming a bottleneck
## Source code view
- `pgxn/neon/communicator_new.c`: the glue that interacts with PostgreSQL code and the Rust communicator code.
- `pgxn/neon/communicator/src/backend_interface.rs`: the entry point for calls from each backend.
- `pgxn/neon/communicator/src/init.rs`: initialization at server startup.
- `pgxn/neon/communicator/src/worker_process/`: worker process main loop and glue code.
At compilation time, pgxn/neon/communicator/ produces a static
library, libcommunicator.a. It is linked to the neon.so extension
library.
The real networking code, which is independent of PostgreSQL, is in
the pageserver/client_grpc crate.
## Process view
The communicator runs in a dedicated background worker process, the
"communicator process". The communicator uses a multi-threaded Tokio
runtime to execute the IO requests. So the communicator process has
multiple threads running. That's unusual for Postgres processes and
care must be taken to make that work.
### Backend <-> worker communication
Each backend has a number of I/O request slots in shared memory. The
slots are statically allocated for each backend, and must not be
accessed by other backends. The worker process reads requests from the
shared memory slots, and writes responses back to the slots.
To submit an IO request, a backend first picks one of its free slots
and writes the details of the request into it. It then sets the slot's
'state' field to Submitted, which tells the worker process that it can
start processing the request. Once the state has been set to Submitted,
the backend *must not* access the slot anymore, until the worker
process sets its state to Completed. In other words, each slot is owned
by either the backend or the worker process at all times, and the
'state' field indicates who owns it at any given moment.
To inform the worker process that a request slot has a pending IO
request, there's a pipe shared by the worker process and all backend
processes. After you have changed the slot's state to Submitted, write
the index of the request slot to the pipe. This wakes up the worker
process.
(Note that the pipe is just used for wakeups, but the worker process
is free to pick up Submitted IO requests even without receiving the
wakeup. As of this writing, it doesn't do that, but it might be useful
in the future to reduce latency even further, for example.)
When the worker process has completed processing the request, it
writes the result back into the request slot. A GetPage request can
also contain a pointer to a buffer in the shared buffer cache; in that
case, the worker process writes the resulting page contents directly to
that buffer, and only a result code into the request slot. It then
updates the 'state' field to Completed, which passes ownership back to
the originating backend. Finally, it signals the process latch of the
originating backend, waking it up.
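A minimal, self-contained sketch of this ownership protocol (the real
implementation lives in `pgxn/neon/communicator/src/backend_comms.rs`; the
request/result payloads, the submission pipe and the process latch are only
hinted at in comments):

```rust
use std::sync::atomic::{AtomicU8, Ordering};

const IDLE: u8 = 0; // owned by the backend, free for a new request
const SUBMITTED: u8 = 1; // owned by the worker process
const COMPLETED: u8 = 2; // owned by the backend again, result is ready

struct Slot {
    state: AtomicU8,
    // request and result payloads omitted; in the real code only the
    // current owner of the slot may touch them
}

impl Slot {
    /// Backend side: fill in the request and hand the slot to the worker.
    fn submit(&self) {
        assert_eq!(self.state.load(Ordering::Relaxed), IDLE);
        // ... write the request payload while we still own the slot ...
        self.state.store(SUBMITTED, Ordering::Release);
        // ... write the slot index to the submission pipe to wake the worker ...
    }

    /// Backend side: called after each latch wakeup.
    fn poll_completion(&self) -> bool {
        if self.state.load(Ordering::Acquire) == COMPLETED {
            // ... read the result payload; we own the slot again ...
            self.state.store(IDLE, Ordering::Relaxed);
            true
        } else {
            false
        }
    }

    /// Worker side: process a submitted request and pass ownership back.
    fn complete(&self) {
        assert_eq!(self.state.load(Ordering::Acquire), SUBMITTED);
        // ... perform the IO and write the result payload ...
        self.state.store(COMPLETED, Ordering::Release);
        // ... set the originating backend's process latch ...
    }
}

fn main() {
    let slot = Slot { state: AtomicU8::new(IDLE) };
    slot.submit();
    slot.complete(); // normally runs in the communicator process
    assert!(slot.poll_completion());
}
```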
### Differences between PostgreSQL v16, v17 and v18
PostgreSQL v18 introduced the new AIO mechanism. For the communication
between AIO worker processes and backends, PostgreSQL's AIO uses a
scheme very similar to the one described in the previous section. With
our communicator, the AIO worker processes are not used, but we use the
same PgAioHandle request slots as upstream. For Neon-specific IO
requests like GetDbSize, a neon request slot is used, but for the
actual IO requests, the request slot merely contains a pointer to the
PgAioHandle slot. The worker process updates its status, calls the IO
callbacks upon completion, etc., just like the upstream AIO worker
processes do.
## Sequence diagram
(Sequence diagram, still incomplete: an smgr_read() call in the
PostgreSQL extension invokes rcommunicator_get_page_at_lsn() in
backend_interface.rs, which writes the request to a slot and calls
submit_request(); worker_process.rs picks the request up and forwards
it to the processor and on to tonic. TODO)
### Compute <-> pageserver protocol
The protocol between Compute and the pageserver is based on gRPC. See `protos/`.

View File

@@ -0,0 +1,24 @@
use cbindgen;
use std::env;
fn main() -> Result<(), Box<dyn std::error::Error>> {
let crate_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
cbindgen::generate(crate_dir).map_or_else(
|error| match error {
cbindgen::Error::ParseSyntaxError { .. } => {
// This means there was a syntax error in the Rust sources. Don't panic, because
// we want the build to continue and the Rust compiler to hit the error. The
// Rust compiler produces a better error message than cbindgen.
eprintln!("Generating C bindings failed because of a Rust syntax error");
}
e => panic!("Unable to generate C bindings: {:?}", e),
},
|bindings| {
bindings.write_to_file("communicator_bindings.h");
},
);
Ok(())
}

View File

@@ -0,0 +1,4 @@
language = "C"
[enum]
prefix_with_name = true

View File

@@ -0,0 +1,204 @@
//! This module implements a request/response "slot" for submitting requests from backends
//! to the communicator process.
//!
//! NB: The "backend" side of this code runs in Postgres backend processes,
//! which means that it is not safe to use the 'tracing' crate for logging, nor
//! to launch threads or use tokio tasks.
use std::cell::UnsafeCell;
use std::sync::atomic::fence;
use std::sync::atomic::{AtomicI32, Ordering};
use crate::neon_request::{NeonIORequest, NeonIOResult};
use atomic_enum::atomic_enum;
/// One request/response slot. Each backend has its own set of slots that it uses.
///
/// This is the moral equivalent of PgAioHandle for Postgres AIO requests
/// Like PgAioHandle, try to keep this small.
///
/// There is an array of these in shared memory. Therefore, this must be Sized.
///
/// ## Lifecycle of a request
///
/// The slot is always owned by either the backend process or the communicator
/// process, depending on the 'state'. Only the owning process is allowed to
/// read or modify the slot, except for reading the 'state' itself to check who
/// owns it.
///
/// A slot begins in the Idle state, where it is owned by the backend process.
/// To submit a request, the backend process fills the slot with the request
/// data, and changes it to the Submitted state. After changing the state, the
/// slot is owned by the communicator process, and the backend is not allowed
/// to access it until the communicator process marks it as Completed.
///
/// When the communicator process sees that the slot is in Submitted state, it
/// starts to process the request. After processing the request, it stores the
/// result in the slot, and changes the state to Completed. It is now owned by
/// the backend process again, which may now read the result, and reuse the
/// slot for a new request.
///
/// For correctness of the above protocol, we really only need two states:
/// "owned by backend" and "owned by communicator process". But to help with
/// debugging, there are a few more states. When the backend starts to fill in
/// the request details in the slot, it first sets the state from Idle to
/// Filling, and when it's done with that, from Filling to Submitted. In the
/// Filling state, the slot is still owned by the backend. Similarly, when the
/// communicator process starts to process a request, it sets it to Processing
/// state first, but the slot is still owned by the communicator process.
///
/// This struct doesn't handle waking up the communicator process when a request
/// has been submitted or when a response is ready. We only store the 'owner_procno'
/// which can be used for waking up the backend on completion, but the wakeups are
/// performed elsewhere.
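///
/// A condensed, illustrative call sequence (the `process()` step stands in
/// for whatever the communicator does with the request):
///
/// ```ignore
/// // backend process
/// slot.fill_request(&request, my_proc_number);
/// // ... write the slot index to the submission pipe ...
///
/// // communicator process
/// let guard = slot.start_processing_request().unwrap();
/// let result = process(guard.get_request());
/// guard.completed(result);
/// // ... set the owner's process latch ...
///
/// // backend process, after its latch has been set
/// let result = slot.try_get_result();
/// ```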
pub struct NeonIOHandle {
/// similar to PgAioHandleState
state: AtomicNeonIOHandleState,
/// The owning process's ProcNumber. The worker process uses this to set the process's
/// latch on completion.
///
/// (This could be calculated from num_neon_request_slots_per_backend and the index of
/// this slot in the overall 'neon_request_slots' array)
owner_procno: AtomicI32,
/// SAFETY: This is modified by fill_request(), after it has established ownership
/// of the slot by setting state from Idle to Filling
request: UnsafeCell<NeonIORequest>,
/// valid when state is Completed
///
/// SAFETY: This is modified by RequestProcessingGuard::complete(). There can be
/// only one RequestProcessingGuard outstanding for a slot at a time, because
/// it is returned by start_processing_request() which checks the state, so
/// RequestProcessingGuard has exclusive access to the slot.
result: UnsafeCell<NeonIOResult>,
}
// The protocol described in the "Lifecycle of a request" section above ensures
// the safe access to the fields
unsafe impl Send for NeonIOHandle {}
unsafe impl Sync for NeonIOHandle {}
impl Default for NeonIOHandle {
fn default() -> NeonIOHandle {
NeonIOHandle {
owner_procno: AtomicI32::new(-1),
request: UnsafeCell::new(NeonIORequest::Empty),
result: UnsafeCell::new(NeonIOResult::Empty),
state: AtomicNeonIOHandleState::new(NeonIOHandleState::Idle),
}
}
}
#[atomic_enum]
#[derive(Eq, PartialEq)]
pub enum NeonIOHandleState {
Idle,
/// backend is filling in the request
Filling,
/// Backend has submitted the request to the communicator, but the
/// communicator process has not yet started processing it.
Submitted,
/// Communicator is processing the request
Processing,
/// Communicator has completed the request, and the 'result' field is now
/// valid, but the backend has not read the result yet.
Completed,
}
pub struct RequestProcessingGuard<'a>(&'a NeonIOHandle);
unsafe impl<'a> Send for RequestProcessingGuard<'a> {}
unsafe impl<'a> Sync for RequestProcessingGuard<'a> {}
impl<'a> RequestProcessingGuard<'a> {
pub fn get_request(&self) -> &NeonIORequest {
unsafe { &*self.0.request.get() }
}
pub fn get_owner_procno(&self) -> i32 {
self.0.owner_procno.load(Ordering::Relaxed)
}
pub fn completed(self, result: NeonIOResult) {
unsafe {
*self.0.result.get() = result;
};
// Ok, we have completed the IO. Mark the request as completed. After that,
// we no longer have ownership of the slot, and must not modify it.
let old_state = self
.0
.state
.swap(NeonIOHandleState::Completed, Ordering::Release);
assert!(old_state == NeonIOHandleState::Processing);
}
}
impl NeonIOHandle {
pub fn fill_request(&self, request: &NeonIORequest, proc_number: i32) {
// Verify that the slot is in Idle state previously, and start filling it.
//
// XXX: This step isn't strictly necessary. Assuming the caller didn't screw up
// and try to use a slot that's already in use, we could fill the slot and
// switch it directly from Idle to Submitted state.
if let Err(s) = self.state.compare_exchange(
NeonIOHandleState::Idle,
NeonIOHandleState::Filling,
Ordering::Relaxed,
Ordering::Relaxed,
) {
panic!("unexpected state in request slot: {s:?}");
}
// This fence synchronizes-with store/swap in `communicator_process_main_loop`.
fence(Ordering::Acquire);
self.owner_procno.store(proc_number, Ordering::Relaxed);
unsafe { *self.request.get() = *request }
self.state
.store(NeonIOHandleState::Submitted, Ordering::Release);
}
pub fn try_get_result(&self) -> Option<NeonIOResult> {
// FIXME: ordering?
let state = self.state.load(Ordering::Relaxed);
if state == NeonIOHandleState::Completed {
// This fence synchronizes-with store/swap in `communicator_process_main_loop`.
fence(Ordering::Acquire);
let result = unsafe { *self.result.get() };
self.state.store(NeonIOHandleState::Idle, Ordering::Relaxed);
Some(result)
} else {
None
}
}
pub fn start_processing_request<'a>(&'a self) -> Option<RequestProcessingGuard<'a>> {
// Read the IO request from the slot indicated in the wakeup
//
// XXX: using compare_exchange for this is not strictly necessary, as long as
// the communicator process has _some_ means of tracking which requests it's
// already processing. That could be a flag somewhere in communicator's private
// memory, for example.
if let Err(s) = self.state.compare_exchange(
NeonIOHandleState::Submitted,
NeonIOHandleState::Processing,
Ordering::Relaxed,
Ordering::Relaxed,
) {
// FIXME surprising state. This is unexpected at the moment, but if we
// started to process requests more aggressively, without waiting for the
// read from the pipe, then this could happen
panic!("unexpected state in request slot: {s:?}");
}
fence(Ordering::Acquire);
Some(RequestProcessingGuard(self))
}
}

View File

@@ -0,0 +1,196 @@
//! This code runs in each backend process. That means that launching Rust threads, panicking
//! etc. is forbidden!
use crate::backend_comms::NeonIOHandle;
use crate::init::CommunicatorInitStruct;
use crate::integrated_cache::{BackendCacheReadOp, IntegratedCacheReadAccess};
use crate::neon_request::CCachedGetPageVResult;
use crate::neon_request::{NeonIORequest, NeonIOResult};
pub struct CommunicatorBackendStruct<'t> {
my_proc_number: i32,
next_neon_request_idx: u32,
my_start_idx: u32, // First request slot that belongs to this backend
my_end_idx: u32, // end + 1 request slot that belongs to this backend
neon_request_slots: &'t [NeonIOHandle],
submission_pipe_write_fd: std::ffi::c_int,
pending_cache_read_op: Option<BackendCacheReadOp<'t>>,
integrated_cache: &'t IntegratedCacheReadAccess<'t>,
}
#[unsafe(no_mangle)]
pub extern "C" fn rcommunicator_backend_init(
cis: Box<CommunicatorInitStruct>,
my_proc_number: i32,
) -> &'static mut CommunicatorBackendStruct<'static> {
let start_idx = my_proc_number as u32 * cis.num_neon_request_slots_per_backend;
let end_idx = start_idx + cis.num_neon_request_slots_per_backend;
let integrated_cache = Box::leak(Box::new(cis.integrated_cache_init_struct.backend_init()));
let bs: &'static mut CommunicatorBackendStruct =
Box::leak(Box::new(CommunicatorBackendStruct {
my_proc_number,
next_neon_request_idx: start_idx,
my_start_idx: start_idx,
my_end_idx: end_idx,
neon_request_slots: cis.neon_request_slots,
submission_pipe_write_fd: cis.submission_pipe_write_fd,
pending_cache_read_op: None,
integrated_cache,
}));
bs
}
/// Start a request. You can poll for its completion and get the result by
/// calling bcomm_poll_request_completion(). The communicator will wake
/// us up by setting our process latch, so to wait for the completion, wait on
/// the latch and call bcomm_poll_request_completion() every time the
/// latch is set.
///
/// Safety: The C caller must ensure that the references are valid.
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_start_io_request<'t>(
bs: &'t mut CommunicatorBackendStruct,
request: &NeonIORequest,
immediate_result_ptr: &mut NeonIOResult,
) -> i32 {
assert!(bs.pending_cache_read_op.is_none());
// Check if the request can be satisfied from the cache first
if let NeonIORequest::RelSize(req) = request {
if let Some(nblocks) = bs.integrated_cache.get_rel_size(&req.reltag()) {
*immediate_result_ptr = NeonIOResult::RelSize(nblocks);
return -1;
}
}
// Create neon request and submit it
let request_idx = bs.start_neon_request(request);
// Tell the communicator about it
bs.submit_request(request_idx);
return request_idx;
}
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_start_get_page_v_request<'t>(
bs: &'t mut CommunicatorBackendStruct,
request: &NeonIORequest,
immediate_result_ptr: &mut CCachedGetPageVResult,
) -> i32 {
let NeonIORequest::GetPageV(get_pagev_request) = request else {
panic!("invalid request passed to bcomm_start_get_page_v_request()");
};
assert!(matches!(request, NeonIORequest::GetPageV(_)));
assert!(bs.pending_cache_read_op.is_none());
// Check if the request can be satisfied from the cache first
let mut all_cached = true;
let read_op = bs.integrated_cache.start_read_op();
for i in 0..get_pagev_request.nblocks {
if let Some(cache_block) = read_op.get_page(
&get_pagev_request.reltag(),
get_pagev_request.block_number + i as u32,
) {
(*immediate_result_ptr).cache_block_numbers[i as usize] = cache_block;
} else {
// not found in cache
all_cached = false;
break;
}
}
if all_cached {
bs.pending_cache_read_op = Some(read_op);
return -1;
}
// Create neon request and submit it
let request_idx = bs.start_neon_request(request);
// Tell the communicator about it
bs.submit_request(request_idx);
return request_idx;
}
/// Check if a request has completed. Returns:
///
/// -1 if the request is still being processed
/// 0 on success
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_poll_request_completion(
bs: &mut CommunicatorBackendStruct,
request_idx: u32,
result_p: &mut NeonIOResult,
) -> i32 {
match bs.neon_request_slots[request_idx as usize].try_get_result() {
None => -1, // still processing
Some(result) => {
*result_p = result;
0
}
}
}
// LFC functions
/// Finish a local file cache read.
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_finish_cache_read(bs: &mut CommunicatorBackendStruct) -> bool {
if let Some(op) = bs.pending_cache_read_op.take() {
op.finish()
} else {
panic!("bcomm_finish_cache_read() called with no cached read pending");
}
}
impl<'t> CommunicatorBackendStruct<'t> {
/// Send a wakeup to the communicator process
fn submit_request(self: &CommunicatorBackendStruct<'t>, request_idx: i32) {
// wake up communicator by writing the idx to the submission pipe
//
// This can block, if the pipe is full. That should be very rare,
// because the communicator tries hard to drain the pipe to prevent
// that. Also, there's a natural upper bound on how many wakeups can be
// queued up: there is only a limited number of request slots for each
// backend.
//
// If it does block very briefly, that's not too serious.
let idxbuf = request_idx.to_ne_bytes();
let _res = nix::unistd::write(self.submission_pipe_write_fd, &idxbuf);
// FIXME: check result, return any errors
}
/// Note: there's no guarantee on when the communicator might pick it up. You should ring
/// the doorbell. But it might pick it up immediately.
pub(crate) fn start_neon_request(&mut self, request: &NeonIORequest) -> i32 {
let my_proc_number = self.my_proc_number;
// Grab next free slot
// FIXME: any guarantee that there will be any?
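// Slots are handed out round-robin within this backend's private range
// [my_start_idx, my_end_idx).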
let idx = self.next_neon_request_idx;
let next_idx = idx + 1;
self.next_neon_request_idx = if next_idx == self.my_end_idx {
self.my_start_idx
} else {
next_idx
};
self.neon_request_slots[idx as usize].fill_request(request, my_proc_number);
return idx as i32;
}
}

View File

@@ -0,0 +1,109 @@
//! Implement the "low-level" parts of the file cache.
//!
//! This module just deals with reading and writing the file, and keeping track
//! which blocks in the cache file are in use and which are free. The "high
//! level" parts of tracking which block in the cache file corresponds to which
//! relation block is handled in 'integrated_cache' instead.
//!
//! This module is only used to access the file from the communicator
//! process. The backend processes *also* read the file (and sometimes also
//! write it?), but the backends use direct C library calls for that.
use std::fs::File;
use std::path::Path;
use std::sync::Arc;
use std::sync::atomic::{AtomicU64, Ordering};
use tokio_epoll_uring;
use crate::BLCKSZ;
pub type CacheBlock = u64;
pub struct FileCache {
uring_system: tokio_epoll_uring::SystemHandle,
file: Arc<File>,
// TODO: there's no reclamation mechanism, the cache grows
// indefinitely. This is the next free block, i.e. the current
// size of the file
next_free_block: AtomicU64,
}
impl FileCache {
pub fn new(
file_cache_path: &Path,
uring_system: tokio_epoll_uring::SystemHandle,
) -> Result<FileCache, std::io::Error> {
let file = std::fs::OpenOptions::new()
.read(true)
.write(true)
.truncate(true)
.create(true)
.open(file_cache_path)?;
tracing::info!("Created cache file {file_cache_path:?}");
Ok(FileCache {
file: Arc::new(file),
uring_system,
next_free_block: AtomicU64::new(0),
})
}
// File cache management
pub async fn read_block(
&self,
cache_block: CacheBlock,
dst: impl uring_common::buf::IoBufMut + Send + Sync,
) -> Result<(), std::io::Error> {
assert!(dst.bytes_total() == BLCKSZ);
let file = self.file.clone();
let ((_file, _buf), res) = self
.uring_system
.read(file, cache_block as u64 * BLCKSZ as u64, dst)
.await;
let res = res.map_err(map_io_uring_error)?;
if res != BLCKSZ {
panic!("unexpected read result");
}
Ok(())
}
pub async fn write_block(
&self,
cache_block: CacheBlock,
src: impl uring_common::buf::IoBuf + Send + Sync,
) -> Result<(), std::io::Error> {
assert!(src.bytes_init() == BLCKSZ);
let file = self.file.clone();
let ((_file, _buf), res) = self
.uring_system
.write(file, cache_block as u64 * BLCKSZ as u64, src)
.await;
let res = res.map_err(map_io_uring_error)?;
if res != BLCKSZ {
panic!("unexpected write result");
}
Ok(())
}
pub fn alloc_block(&self) -> CacheBlock {
self.next_free_block.fetch_add(1, Ordering::Relaxed)
}
}
fn map_io_uring_error(err: tokio_epoll_uring::Error<std::io::Error>) -> std::io::Error {
match err {
tokio_epoll_uring::Error::Op(err) => err,
tokio_epoll_uring::Error::System(err) => {
std::io::Error::new(std::io::ErrorKind::Other, err)
}
}
}

View File

@@ -0,0 +1,130 @@
//! Initialization functions. These are executed in the postmaster process,
//! at different stages of server startup.
//!
//!
//! Communicator initialization steps:
//!
//! 1. At postmaster startup, before shared memory is allocated,
//! rcommunicator_shmem_size() is called to get the amount of
//! shared memory that this module needs.
//!
//! 2. Later, after the shared memory has been allocated,
//! rcommunicator_shmem_init() is called to initialize the shmem
//! area.
//!
//! Per process initialization:
//!
//! When a backend process starts up, it calls rcommunicator_backend_init().
//! In the communicator worker process, other functions are called, see
//! `worker_process` module.
use std::ffi::c_int;
use std::mem;
use crate::backend_comms::NeonIOHandle;
use crate::integrated_cache::IntegratedCacheInitStruct;
const NUM_NEON_REQUEST_SLOTS_PER_BACKEND: u32 = 5;
/// This struct is created in the postmaster process, and inherited to
/// the communicator process and all backend processes through fork()
#[repr(C)]
pub struct CommunicatorInitStruct {
#[allow(dead_code)]
pub max_procs: u32,
pub submission_pipe_read_fd: std::ffi::c_int,
pub submission_pipe_write_fd: std::ffi::c_int,
// Shared memory data structures
pub num_neon_request_slots_per_backend: u32,
pub neon_request_slots: &'static [NeonIOHandle],
pub integrated_cache_init_struct: IntegratedCacheInitStruct<'static>,
}
impl std::fmt::Debug for CommunicatorInitStruct {
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
fmt.debug_struct("CommunicatorInitStruct")
.field("max_procs", &self.max_procs)
.field("submission_pipe_read_fd", &self.submission_pipe_read_fd)
.field("submission_pipe_write_fd", &self.submission_pipe_write_fd)
.field(
"num_neon_request_slots_per_backend",
&self.num_neon_request_slots_per_backend,
)
.field("neon_request_slots length", &self.neon_request_slots.len())
.finish()
}
}
#[unsafe(no_mangle)]
pub extern "C" fn rcommunicator_shmem_size(max_procs: u32) -> u64 {
let mut size = 0;
let num_neon_request_slots = max_procs * NUM_NEON_REQUEST_SLOTS_PER_BACKEND;
size += mem::size_of::<NeonIOHandle>() * num_neon_request_slots as usize;
// For integrated_cache's Allocator. TODO: make this adjustable
size += IntegratedCacheInitStruct::shmem_size(max_procs);
size as u64
}
/// Initialize the shared memory segment. Returns a backend-private
/// struct, which will be inherited by backend processes through fork
#[unsafe(no_mangle)]
pub extern "C" fn rcommunicator_shmem_init(
submission_pipe_read_fd: c_int,
submission_pipe_write_fd: c_int,
max_procs: u32,
shmem_area_ptr: *mut u8,
shmem_area_len: u64,
) -> &'static mut CommunicatorInitStruct {
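// Shmem layout: alignment padding, then the array of NeonIOHandle request
// slots, then the remainder of the area is handed to the integrated cache.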
let mut ptr = shmem_area_ptr;
// Carve out the request slots from the shmem area and initialize them
let num_neon_request_slots_per_backend = NUM_NEON_REQUEST_SLOTS_PER_BACKEND;
let num_neon_request_slots = max_procs * num_neon_request_slots_per_backend;
let len_used;
let neon_request_slots: &mut [NeonIOHandle] = unsafe {
ptr = ptr.add(ptr.align_offset(std::mem::align_of::<NeonIOHandle>()));
let neon_request_slots_ptr: *mut NeonIOHandle = ptr.cast();
for _i in 0..num_neon_request_slots {
let slot: *mut NeonIOHandle = ptr.cast();
*slot = NeonIOHandle::default();
ptr = ptr.byte_add(mem::size_of::<NeonIOHandle>());
}
len_used = ptr.byte_offset_from(shmem_area_ptr) as usize;
assert!(len_used <= shmem_area_len as usize);
std::slice::from_raw_parts_mut(neon_request_slots_ptr, num_neon_request_slots as usize)
};
let remaining_area =
unsafe { std::slice::from_raw_parts_mut(ptr, shmem_area_len as usize - len_used) };
// Give the rest of the area to the integrated cache
let integrated_cache_init_struct =
IntegratedCacheInitStruct::shmem_init(max_procs, remaining_area);
eprintln!(
"PIPE READ {} WRITE {}",
submission_pipe_read_fd, submission_pipe_write_fd
);
let cis: &'static mut CommunicatorInitStruct = Box::leak(Box::new(CommunicatorInitStruct {
max_procs,
submission_pipe_read_fd,
submission_pipe_write_fd,
num_neon_request_slots_per_backend: NUM_NEON_REQUEST_SLOTS_PER_BACKEND,
neon_request_slots,
integrated_cache_init_struct,
}));
cis
}
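// Illustrative sketch (not part of this commit) of the postmaster-side call
// order described in the module comment above. A plain Vec<u8> stands in for
// the PostgreSQL shared memory segment and the pipe fds are made up; in the
// server, the segment and the pipe outlive this function.
#[allow(dead_code)]
fn postmaster_startup_sketch() {
    let max_procs = 8;
    let shmem_len = rcommunicator_shmem_size(max_procs);
    let mut shmem = vec![0u8; shmem_len as usize];
    let cis = rcommunicator_shmem_init(
        /* submission_pipe_read_fd */ 3,
        /* submission_pipe_write_fd */ 4,
        max_procs,
        shmem.as_mut_ptr(),
        shmem_len,
    );
    assert_eq!(cis.max_procs, max_procs);
}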

View File

@@ -0,0 +1,423 @@
//! Integrated communicator cache
//!
//! Tracks:
//! - Relation sizes and existence
//! - Last-written LSN
//! - TODO: Block cache (also known as LFC)
//!
//! TODO: limit the size
//! TODO: concurrency
//!
//! Note: This deals with "relations", which is really just one "relation fork" in Postgres
//! terms. RelFileLocator + ForkNumber is the key.
use utils::lsn::Lsn;
use crate::file_cache::{CacheBlock, FileCache};
use pageserver_data_api::model::RelTag;
use neonart;
use neonart::TreeInitStruct;
const CACHE_AREA_SIZE: usize = 10 * 1024 * 1024;
/// This struct is stored in the shared memory segment.
struct IntegratedCacheShmemData {
allocator: neonart::Allocator,
}
/// This struct is initialized at postmaster startup, and passed to all the processes via fork().
pub struct IntegratedCacheInitStruct<'t> {
shmem_data: &'t IntegratedCacheShmemData,
handle: TreeInitStruct<'t, TreeKey, TreeEntry>,
}
/// Represents write-access to the integrated cache. This is used by the communicator process.
pub struct IntegratedCacheWriteAccess<'t> {
cache_tree: neonart::TreeWriteAccess<'t, TreeKey, TreeEntry>,
global_lw_lsn: Lsn,
file_cache: Option<FileCache>,
}
/// Represents read-only access to the integrated cache. Backend processes have this.
pub struct IntegratedCacheReadAccess<'t> {
cache_tree: neonart::TreeReadAccess<'t, TreeKey, TreeEntry>,
}
impl<'t> IntegratedCacheInitStruct<'t> {
/// Return the desired size in bytes of the shared memory area to reserve for the integrated
/// cache.
pub fn shmem_size(_max_procs: u32) -> usize {
CACHE_AREA_SIZE
}
/// Initialize the shared memory segment. This runs once in postmaster. Returns a struct which
/// will be inherited by all processes through fork.
pub fn shmem_init(_max_procs: u32, shmem_area: &'t mut [u8]) -> IntegratedCacheInitStruct<'t> {
assert!(shmem_area.len() > std::mem::size_of::<IntegratedCacheShmemData>());
let mut ptr = shmem_area.as_mut_ptr();
let shmem_data_ptr;
let len_used;
unsafe {
ptr = ptr.byte_add(ptr.align_offset(align_of::<IntegratedCacheShmemData>()));
shmem_data_ptr = ptr.cast::<IntegratedCacheShmemData>();
ptr = ptr.byte_add(std::mem::size_of::<IntegratedCacheShmemData>());
len_used = ptr.byte_offset_from(shmem_area.as_mut_ptr()) as usize;
};
assert!(len_used < shmem_area.len());
let area_ptr = ptr;
let area_size = shmem_area.len() - len_used;
let cache_area: &mut [u8] = unsafe { std::slice::from_raw_parts_mut(area_ptr, area_size) };
let allocator = neonart::Allocator::new(cache_area);
// Initialize the shared memory area
let shmem_data = unsafe {
*shmem_data_ptr = IntegratedCacheShmemData { allocator };
&*shmem_data_ptr
};
let tree_handle = TreeInitStruct::new(&shmem_data.allocator);
IntegratedCacheInitStruct {
shmem_data,
handle: tree_handle,
}
}
pub fn worker_process_init(
self,
lsn: Lsn,
file_cache: Option<FileCache>,
) -> IntegratedCacheWriteAccess<'t> {
let IntegratedCacheInitStruct {
shmem_data: _shmem,
handle,
} = self;
let tree_writer = handle.attach_writer();
IntegratedCacheWriteAccess {
cache_tree: tree_writer,
global_lw_lsn: lsn,
file_cache,
}
}
pub fn backend_init(self) -> IntegratedCacheReadAccess<'t> {
let IntegratedCacheInitStruct {
shmem_data: _shmem,
handle,
} = self;
let tree_reader = handle.attach_reader();
IntegratedCacheReadAccess {
cache_tree: tree_reader,
}
}
}
#[derive(Clone)]
enum TreeEntry {
Rel(RelEntry),
Block(BlockEntry),
}
#[derive(Clone)]
struct BlockEntry {
lw_lsn: Lsn,
cache_block: Option<CacheBlock>,
}
#[derive(Clone, Default)]
struct RelEntry {
/// cached size of the relation
nblocks: Option<u32>,
}
#[derive(
Clone,
Debug,
PartialEq,
PartialOrd,
Eq,
Ord,
zerocopy_derive::IntoBytes,
zerocopy_derive::Immutable,
)]
#[repr(packed)]
struct TreeKey {
spc_oid: u32,
db_oid: u32,
rel_number: u32,
fork_number: u8,
block_number: u32,
}
impl From<&RelTag> for TreeKey {
fn from(val: &RelTag) -> TreeKey {
TreeKey {
spc_oid: val.spc_oid,
db_oid: val.db_oid,
rel_number: val.rel_number,
fork_number: val.fork_number,
block_number: u32::MAX,
}
}
}
impl From<(&RelTag, u32)> for TreeKey {
fn from(val: (&RelTag, u32)) -> TreeKey {
TreeKey {
spc_oid: val.0.spc_oid,
db_oid: val.0.db_oid,
rel_number: val.0.rel_number,
fork_number: val.0.fork_number,
block_number: val.1,
}
}
}
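// Sketch (illustration only, not part of this commit): a relation fork's
// metadata entry and its per-block entries live in the same tree. The
// metadata key reuses the block_number field with u32::MAX as a sentinel,
// which can never collide with a real block number.
#[allow(dead_code)]
fn key_layout_sketch(rel: &RelTag) {
    let rel_meta_key = TreeKey::from(rel); // block_number == u32::MAX
    let block_key = TreeKey::from((rel, 0u32)); // block 0 of the same fork
    assert_ne!(rel_meta_key, block_key);
}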
impl neonart::Key for TreeKey {
const KEY_LEN: usize = 4 + 4 + 4 + 1 + 4; // spc_oid + db_oid + rel_number + fork_number + block_number
fn as_bytes(&self) -> &[u8] {
zerocopy::IntoBytes::as_bytes(self)
}
}
impl neonart::Value for TreeEntry {}
/// Return type used in the cache's get_*() functions. 'Found' means that the page, or other
/// queried information, exists in the cache.
pub enum CacheResult<V> {
    /// The queried page or other information existed in the cache.
    Found(V),
    /// The cache doesn't contain the page (or other queried information, like relation size). The
    /// Lsn is the 'not_modified_since' LSN that should be used in the request to the pageserver to
    /// read the page.
    NotFound(Lsn),
}
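// Sketch of the intended calling pattern (hypothetical caller; the pageserver
// round trip is represented by a made-up stub below): on a cache miss, the
// returned LSN becomes the not_modified_since value of the pageserver request,
// and the answer is written back into the cache.
#[allow(dead_code)]
fn rel_size_sketch<'t>(cache: &'t IntegratedCacheWriteAccess<'t>, rel: &RelTag) -> u32 {
    match cache.get_rel_size(rel) {
        CacheResult::Found(nblocks) => nblocks,
        CacheResult::NotFound(not_modified_since) => {
            let nblocks = fetch_rel_size_stub(rel, not_modified_since);
            cache.remember_rel_size(rel, nblocks);
            nblocks
        }
    }
}

// Stand-in for a real pageserver request, for illustration only.
#[allow(dead_code)]
fn fetch_rel_size_stub(_rel: &RelTag, _not_modified_since: Lsn) -> u32 {
    0
}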
impl<'t> IntegratedCacheWriteAccess<'t> {
pub fn get_rel_size(&'t self, rel: &RelTag) -> CacheResult<u32> {
let r = self.cache_tree.start_read();
if let Some(nblocks) = get_rel_size(&r, rel) {
CacheResult::Found(nblocks)
} else {
CacheResult::NotFound(self.global_lw_lsn)
}
}
pub async fn get_page(
&'t self,
rel: &RelTag,
block_number: u32,
dst: impl uring_common::buf::IoBufMut + Send + Sync,
) -> Result<CacheResult<()>, std::io::Error> {
let r = self.cache_tree.start_read();
if let Some(block_tree_entry) = r.get(&TreeKey::from((rel, block_number))) {
let block_entry = if let TreeEntry::Block(e) = block_tree_entry {
e
} else {
panic!("unexpected tree entry type for block key");
};
if let Some(cache_block) = block_entry.cache_block {
self.file_cache
.as_ref()
.unwrap()
.read_block(cache_block, dst)
.await?;
Ok(CacheResult::Found(()))
} else {
Ok(CacheResult::NotFound(block_entry.lw_lsn))
}
} else {
Ok(CacheResult::NotFound(self.global_lw_lsn))
}
}
pub async fn page_is_cached(
&'t self,
rel: &RelTag,
block_number: u32,
) -> Result<CacheResult<()>, std::io::Error> {
let r = self.cache_tree.start_read();
if let Some(block_tree_entry) = r.get(&TreeKey::from((rel, block_number))) {
let block_entry = if let TreeEntry::Block(e) = block_tree_entry {
e
} else {
panic!("unexpected tree entry type for block key");
};
if let Some(_cache_block) = block_entry.cache_block {
Ok(CacheResult::Found(()))
} else {
Ok(CacheResult::NotFound(block_entry.lw_lsn))
}
} else {
Ok(CacheResult::NotFound(self.global_lw_lsn))
}
}
/// Does the relation exist? CacheResult::NotFound means that the cache doesn't contain that
/// information, i.e. we don't know if the relation exists or not.
pub fn get_rel_exists(&'t self, rel: &RelTag) -> CacheResult<bool> {
// we don't currently cache negative entries, so if the relation is in the cache, it exists
let r = self.cache_tree.start_read();
if let Some(_rel_entry) = r.get(&TreeKey::from(rel)) {
CacheResult::Found(true)
} else {
CacheResult::NotFound(self.global_lw_lsn)
}
}
pub fn get_db_size(&'t self, _db_oid: u32) -> CacheResult<u64> {
// FIXME: is this the right LSN?
CacheResult::NotFound(self.global_lw_lsn)
}
pub fn remember_rel_size(&'t self, rel: &RelTag, nblocks: u32) {
let mut w = self.cache_tree.start_write();
w.insert(
&TreeKey::from(rel),
TreeEntry::Rel(RelEntry {
nblocks: Some(nblocks),
}),
);
}
/// Remember the given page contents in the cache.
pub async fn remember_page(
&'t self,
rel: &RelTag,
block_number: u32,
src: impl uring_common::buf::IoBuf + Send + Sync,
lw_lsn: Lsn,
) {
if let Some(file_cache) = self.file_cache.as_ref() {
let mut w = self.cache_tree.start_write();
let key = TreeKey::from((rel, block_number));
let mut cache_block = None;
w.update_with_fn(&key, |existing| {
if let Some(existing) = existing {
let mut block_entry = if let TreeEntry::Block(e) = existing.clone() {
e
} else {
panic!("unexpected tree entry type for block key");
};
block_entry.lw_lsn = lw_lsn;
if block_entry.cache_block.is_none() {
block_entry.cache_block = Some(file_cache.alloc_block());
}
cache_block = block_entry.cache_block;
Some(TreeEntry::Block(block_entry))
} else {
cache_block = Some(file_cache.alloc_block());
Some(TreeEntry::Block(BlockEntry {
lw_lsn,
cache_block,
}))
}
});
let cache_block = cache_block.unwrap();
file_cache
.write_block(cache_block, src)
.await
.expect("error writing to cache");
}
}
/// Forget information about given relation in the cache. (For DROP TABLE and such)
pub fn forget_rel(&'t self, rel: &RelTag) {
// FIXME: not implemented properly. smgrexists() would still return true for this
let mut w = self.cache_tree.start_write();
w.insert(
&TreeKey::from(rel),
TreeEntry::Rel(RelEntry { nblocks: None }),
);
}
}
/// Read relation size from the cache.
///
/// This is in a separate function so that it can be shared by
/// IntegratedCacheReadAccess::get_rel_size() and IntegratedCacheWriteAccess::get_rel_size()
fn get_rel_size<'t>(r: &neonart::TreeReadGuard<TreeKey, TreeEntry>, rel: &RelTag) -> Option<u32> {
if let Some(existing) = r.get(&TreeKey::from(rel)) {
let rel_entry = if let TreeEntry::Rel(e) = existing {
e
} else {
panic!("unexpected tree entry type for rel key");
};
if let Some(nblocks) = rel_entry.nblocks {
Some(nblocks)
} else {
None
}
} else {
None
}
}
/// Accessor for other backends
///
/// This allows backends to read pages from the cache directly, on their own, without making a
/// request to the communicator process.
impl<'t> IntegratedCacheReadAccess<'t> {
pub fn get_rel_size(&'t self, rel: &RelTag) -> Option<u32> {
get_rel_size(&self.cache_tree.start_read(), rel)
}
pub fn start_read_op(&'t self) -> BackendCacheReadOp<'t> {
let r = self.cache_tree.start_read();
BackendCacheReadOp { read_guard: r }
}
}
pub struct BackendCacheReadOp<'t> {
read_guard: neonart::TreeReadGuard<'t, TreeKey, TreeEntry>,
}
impl<'e> BackendCacheReadOp<'e> {
/// Initiate a read of the page from the cache.
///
/// This returns the "cache block number", i.e. the block number within the cache file where
/// the page's contents are stored. To get the page contents, the caller needs to read that
/// block from the cache file. The BackendCacheReadOp itself acts as a guard that you must
/// hold while you perform the read. It's possible that the cache block is invalidated while
/// you are performing the read. After you have completed the read, call
/// BackendCacheReadOp::finish() to check whether the read was in fact valid. If it was
/// concurrently invalidated, you need to retry.
pub fn get_page(&self, rel: &RelTag, block_number: u32) -> Option<u64> {
if let Some(block_tree_entry) = self.read_guard.get(&TreeKey::from((rel, block_number))) {
let block_entry = if let TreeEntry::Block(e) = block_tree_entry {
e
} else {
panic!("unexpected tree entry type for block key");
};
block_entry.cache_block
} else {
None
}
}
pub fn finish(self) -> bool {
// TODO: currently, we use a spinlock to protect the in-memory tree, so concurrent
// invalidations are not possible. But the plan is to switch to optimistic locking,
// and once we do that, this would return 'false' if the optimistic locking failed and
// you need to retry.
true
}
}
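// Sketch of the retry protocol described in the get_page() comment above
// (hypothetical backend-side caller; the actual read of the cache file is done
// by the C code with the backend's own file descriptor, represented here by a
// made-up stub).
#[allow(dead_code)]
fn backend_read_sketch<'t>(
    access: &'t IntegratedCacheReadAccess<'t>,
    rel: &RelTag,
    block_number: u32,
    dst: &mut [u8; crate::BLCKSZ],
) -> bool {
    loop {
        let op = access.start_read_op();
        let Some(cache_block) = op.get_page(rel, block_number) else {
            // Not in the cache at all; the caller must go through the communicator.
            return false;
        };
        read_cache_file_block_stub(cache_block, dst);
        if op.finish() {
            return true; // the read was not concurrently invalidated
        }
        // The entry changed while we were reading; retry with a fresh read op.
    }
}

// Stand-in for reading the given block from the cache file, illustration only.
#[allow(dead_code)]
fn read_cache_file_block_stub(_cache_block: u64, _dst: &mut [u8; crate::BLCKSZ]) {}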

View File

@@ -0,0 +1,25 @@
//!
//! Three main parts:
//! - the async tokio communicator core, which receives requests and processes them
//! - the main loop and request queues, which route requests from backends to the core
//! - the per-backend glue code, which submits requests
//!
mod backend_comms;
// mark this 'pub', because these functions are called from C code. Otherwise, the compiler
// complains about a bunch of structs and enum variants being unused, because it thinks
// the functions that use them are never called. There are some C-callable functions in
// other modules too, but marking this as pub is currently enough to silence the warnings
//
// TODO: perhaps collect *all* the extern "C" functions to one module?
pub mod backend_interface;
mod file_cache;
mod init;
mod integrated_cache;
mod neon_request;
mod worker_process;
// FIXME get this from postgres headers somehow
pub const BLCKSZ: usize = 8192;

View File

@@ -0,0 +1,346 @@
type CLsn = u64;
type COid = u32;
// This conveniently matches PG_IOV_MAX
pub const MAX_GETPAGEV_PAGES: usize = 32;
use pageserver_data_api::model;
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub enum NeonIORequest {
Empty,
// Read requests. These are C-friendly variants of the corresponding structs in
// pageserver_data_api::model.
RelExists(CRelExistsRequest),
RelSize(CRelSizeRequest),
GetPageV(CGetPageVRequest),
PrefetchV(CPrefetchVRequest),
DbSize(CDbSizeRequest),
// Write requests. These are needed to keep the relation size cache and LFC up-to-date.
// They are not sent to the pageserver.
WritePage(CWritePageRequest),
RelExtend(CRelExtendRequest),
RelZeroExtend(CRelZeroExtendRequest),
RelCreate(CRelCreateRequest),
RelTruncate(CRelTruncateRequest),
RelUnlink(CRelUnlinkRequest),
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub enum NeonIOResult {
Empty,
RelExists(bool),
RelSize(u32),
/// the result pages are written to the shared memory addresses given in the request
GetPageV,
/// A prefetch request returns as soon as the request has been received by the communicator.
/// It is processed in the background.
PrefetchVLaunched,
DbSize(u64),
// FIXME design compact error codes. Can't easily pass a string or other dynamic data.
// currently, this is 'errno'
Error(i32),
Aborted,
/// used for all write requests
WriteOK,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CCachedGetPageVResult {
pub cache_block_numbers: [u64; MAX_GETPAGEV_PAGES],
}
/// ShmemBuf represents a buffer in shared memory.
///
/// SAFETY: The pointer must point to an area in shared memory. The functions allow you to liberally
/// get a mutable pointer to the contents; it is the caller's responsibility to ensure that you
/// don't access a buffer that you're not allowed to. Inappropriate access to the buffer doesn't
/// violate Rust's safety semantics, but it will mess up and crash Postgres.
///
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct ShmemBuf {
// These fields define where the result is written. Must point into a buffer in shared memory!
pub ptr: *mut u8,
}
unsafe impl Send for ShmemBuf {}
unsafe impl Sync for ShmemBuf {}
unsafe impl uring_common::buf::IoBuf for ShmemBuf {
fn stable_ptr(&self) -> *const u8 {
self.ptr
}
fn bytes_init(&self) -> usize {
crate::BLCKSZ
}
fn bytes_total(&self) -> usize {
crate::BLCKSZ
}
}
unsafe impl uring_common::buf::IoBufMut for ShmemBuf {
fn stable_mut_ptr(&mut self) -> *mut u8 {
self.ptr
}
unsafe fn set_init(&mut self, pos: usize) {
if pos > crate::BLCKSZ as usize {
panic!(
"set_init called past end of buffer, pos {}, buffer size {}",
pos,
crate::BLCKSZ
);
}
}
}
impl ShmemBuf {
pub fn as_mut_ptr(&self) -> *mut u8 {
self.ptr
}
}
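// Sketch (not part of this commit) of why ShmemBuf implements the io_uring
// buffer traits: a page buffer that lives in Postgres shared memory can be
// handed directly to the io_uring-based file cache without an intermediate
// copy. This mirrors how the worker process calls FileCache::read_block()
// elsewhere in this commit; the exact return type of read_block() is assumed.
#[allow(dead_code)]
async fn read_into_shared_buffer_sketch(
    file_cache: &crate::file_cache::FileCache,
    cache_block: crate::file_cache::CacheBlock,
    page_ptr: *mut u8, // must point to a BLCKSZ-sized buffer in shared memory
) -> Result<(), std::io::Error> {
    let dst = ShmemBuf { ptr: page_ptr };
    file_cache.read_block(cache_block, dst).await
}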
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelExistsRequest {
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelSizeRequest {
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CGetPageVRequest {
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub nblocks: u8,
// These fields define where the result is written. Must point into a buffer in shared memory!
pub dest: [ShmemBuf; MAX_GETPAGEV_PAGES],
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CPrefetchVRequest {
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub nblocks: u8,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CDbSizeRequest {
pub db_oid: COid,
pub request_lsn: CLsn,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CWritePageRequest {
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub lsn: CLsn,
// These fields define where the result is written. Must point into a buffer in shared memory!
pub src: ShmemBuf,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelExtendRequest {
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub lsn: CLsn,
// These fields define page contents. Must point into a buffer in shared memory!
pub src_ptr: usize,
pub src_size: u32,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelZeroExtendRequest {
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub nblocks: u32,
pub lsn: CLsn,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelCreateRequest {
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelTruncateRequest {
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub nblocks: u32,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelUnlinkRequest {
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub nblocks: u32,
}
impl CRelExistsRequest {
pub fn reltag(&self) -> model::RelTag {
model::RelTag {
spc_oid: self.spc_oid,
db_oid: self.db_oid,
rel_number: self.rel_number,
fork_number: self.fork_number,
}
}
}
impl CRelSizeRequest {
pub fn reltag(&self) -> model::RelTag {
model::RelTag {
spc_oid: self.spc_oid,
db_oid: self.db_oid,
rel_number: self.rel_number,
fork_number: self.fork_number,
}
}
}
impl CGetPageVRequest {
pub fn reltag(&self) -> model::RelTag {
model::RelTag {
spc_oid: self.spc_oid,
db_oid: self.db_oid,
rel_number: self.rel_number,
fork_number: self.fork_number,
}
}
}
impl CPrefetchVRequest {
pub fn reltag(&self) -> model::RelTag {
model::RelTag {
spc_oid: self.spc_oid,
db_oid: self.db_oid,
rel_number: self.rel_number,
fork_number: self.fork_number,
}
}
}
impl CWritePageRequest {
pub fn reltag(&self) -> model::RelTag {
model::RelTag {
spc_oid: self.spc_oid,
db_oid: self.db_oid,
rel_number: self.rel_number,
fork_number: self.fork_number,
}
}
}
impl CRelExtendRequest {
pub fn reltag(&self) -> model::RelTag {
model::RelTag {
spc_oid: self.spc_oid,
db_oid: self.db_oid,
rel_number: self.rel_number,
fork_number: self.fork_number,
}
}
}
impl CRelZeroExtendRequest {
pub fn reltag(&self) -> model::RelTag {
model::RelTag {
spc_oid: self.spc_oid,
db_oid: self.db_oid,
rel_number: self.rel_number,
fork_number: self.fork_number,
}
}
}
impl CRelCreateRequest {
pub fn reltag(&self) -> model::RelTag {
model::RelTag {
spc_oid: self.spc_oid,
db_oid: self.db_oid,
rel_number: self.rel_number,
fork_number: self.fork_number,
}
}
}
impl CRelTruncateRequest {
pub fn reltag(&self) -> model::RelTag {
model::RelTag {
spc_oid: self.spc_oid,
db_oid: self.db_oid,
rel_number: self.rel_number,
fork_number: self.fork_number,
}
}
}
impl CRelUnlinkRequest {
pub fn reltag(&self) -> model::RelTag {
model::RelTag {
spc_oid: self.spc_oid,
db_oid: self.db_oid,
rel_number: self.rel_number,
fork_number: self.fork_number,
}
}
}
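// Small sketch (illustration only): the request structs flatten the relation
// identity into plain C-compatible fields, and reltag() reassembles the
// pageserver_data_api model type. The OID values below are made up.
#[allow(dead_code)]
fn reltag_roundtrip_sketch() {
    let req = CRelSizeRequest {
        spc_oid: 1663,
        db_oid: 5,
        rel_number: 16384,
        fork_number: 0,
    };
    let rel: model::RelTag = req.reltag();
    assert_eq!(rel.rel_number, req.rel_number);
}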

View File

@@ -0,0 +1,28 @@
//! C callbacks to PostgreSQL facilities that the neon extension needs
//! to provide. These are implemented in `neon/pgxn/communicator_new.c`.
//! The function signatures had better match!
//!
//! These are called from the communicator threads! Careful what you do, most
//! Postgres functions are not safe to call in that context.
use utils::lsn::Lsn;
unsafe extern "C" {
pub fn notify_proc_unsafe(procno: std::ffi::c_int);
pub fn callback_set_my_latch_unsafe();
pub fn callback_get_request_lsn_unsafe() -> u64;
}
// safe wrappers
pub(super) fn notify_proc(procno: std::ffi::c_int) {
unsafe { notify_proc_unsafe(procno) };
}
pub(super) fn callback_set_my_latch() {
unsafe { callback_set_my_latch_unsafe() };
}
pub(super) fn get_request_lsn() -> Lsn {
Lsn(unsafe { callback_get_request_lsn_unsafe() })
}

View File

@@ -0,0 +1,229 @@
//! Glue code to hook up Rust logging, with the `tracing` crate, to the PostgreSQL log
//!
//! In the Rust threads, the log messages are written to a mpsc Channel, and the Postgres
//! process latch is raised. That wakes up the loop in the main thread. It reads the
//! message from the channel and ereport()s it. This ensures that only one thread, the main
//! thread, calls the PostgreSQL logging routines at any time.
use std::sync::mpsc::sync_channel;
use std::sync::mpsc::{Receiver, SyncSender};
use std::sync::mpsc::{TryRecvError, TrySendError};
use tracing::info;
use tracing::{Event, Level, Metadata, Subscriber};
use tracing_subscriber::filter::LevelFilter;
use tracing_subscriber::fmt::FmtContext;
use tracing_subscriber::fmt::FormatEvent;
use tracing_subscriber::fmt::FormatFields;
use tracing_subscriber::fmt::FormattedFields;
use tracing_subscriber::fmt::MakeWriter;
use tracing_subscriber::fmt::format::Writer;
use tracing_subscriber::registry::LookupSpan;
use crate::worker_process::callbacks::callback_set_my_latch;
pub struct LoggingState {
receiver: Receiver<FormattedEventWithMeta>,
}
/// Called once, at worker process startup. The returned LoggingState is passed back
/// in the subsequent calls to `pump_logging`. It is opaque to the C code.
#[unsafe(no_mangle)]
pub extern "C" fn configure_logging() -> Box<LoggingState> {
let (sender, receiver) = sync_channel(1000);
let maker = Maker { channel: sender };
use tracing_subscriber::prelude::*;
let r = tracing_subscriber::registry();
let r = r.with(
tracing_subscriber::fmt::layer()
.event_format(SimpleFormatter::new())
.with_writer(maker)
// TODO: derive this from log_min_messages?
.with_filter(LevelFilter::from_level(Level::INFO)),
);
r.init();
info!("communicator process logging started");
let state = LoggingState { receiver };
Box::new(state)
}
/// Read one message from the logging queue. This is essentially a wrapper to Receiver,
/// with a C-friendly signature.
///
/// The message is copied into *errbuf, which is a caller-supplied buffer of size `errbuf_len`.
/// If the message doesn't fit in the buffer, it is truncated. It is always NULL-terminated.
///
/// The error level is returned *elevel_p. It's one of the PostgreSQL error levels, see elog.h
#[unsafe(no_mangle)]
pub extern "C" fn pump_logging(
state: &mut LoggingState,
errbuf: *mut u8,
errbuf_len: u32,
elevel_p: &mut i32,
) -> i32 {
let msg = match state.receiver.try_recv() {
Err(TryRecvError::Empty) => return 0,
Err(TryRecvError::Disconnected) => return -1,
Ok(msg) => msg,
};
let src: &[u8] = &msg.message;
let dst = errbuf;
let len = std::cmp::min(src.len(), errbuf_len as usize - 1);
unsafe {
std::ptr::copy_nonoverlapping(src.as_ptr(), dst, len);
*(errbuf.add(len)) = b'\0'; // NULL terminator
}
// XXX: these levels are copied from PostgreSQL's elog.h. Introduce another enum
// to hide these?
*elevel_p = match msg.level {
Level::TRACE => 10, // DEBUG5
Level::DEBUG => 14, // DEBUG1
Level::INFO => 17, // INFO
Level::WARN => 19, // WARNING
Level::ERROR => 21, // ERROR
};
1
}
//---- The following functions can be called from any thread ----
#[derive(Clone)]
struct FormattedEventWithMeta {
message: Vec<u8>,
level: tracing::Level,
}
impl Default for FormattedEventWithMeta {
fn default() -> Self {
FormattedEventWithMeta {
message: Vec::new(),
level: tracing::Level::DEBUG,
}
}
}
struct EventBuilder<'a> {
event: FormattedEventWithMeta,
maker: &'a Maker,
}
impl<'a> std::io::Write for EventBuilder<'a> {
fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
self.event.message.write(buf)
}
fn flush(&mut self) -> std::io::Result<()> {
self.maker.send_event(self.event.clone());
Ok(())
}
}
impl<'a> Drop for EventBuilder<'a> {
fn drop(&mut self) {
let maker = self.maker;
let event = std::mem::take(&mut self.event);
maker.send_event(event);
}
}
struct Maker {
channel: SyncSender<FormattedEventWithMeta>,
}
impl<'a> MakeWriter<'a> for Maker {
type Writer = EventBuilder<'a>;
fn make_writer(&'a self) -> Self::Writer {
panic!("not expected to be called when make_writer_for is implemented");
}
fn make_writer_for(&'a self, meta: &Metadata<'_>) -> Self::Writer {
EventBuilder {
event: FormattedEventWithMeta {
message: Vec::new(),
level: *meta.level(),
},
maker: self,
}
}
}
impl Maker {
fn send_event(&self, e: FormattedEventWithMeta) {
match self.channel.try_send(e) {
Ok(()) => {
// notify the main thread
callback_set_my_latch();
}
Err(TrySendError::Disconnected(_)) => {}
Err(TrySendError::Full(_)) => {
// TODO: record that some messages were lost
}
}
}
}
/// Simple formatter implementation for tracing_subscriber, which prints the log
/// spans and the message part like the default formatter, but without a timestamp or
/// error level. The error level is captured separately in `FormattedEventWithMeta`,
/// and when the message is printed by the main thread with PostgreSQL ereport(),
/// it gets a timestamp at that point. (The printed timestamp will therefore lag
/// behind the timestamp of the event here, if the main thread doesn't process
/// the log message promptly.)
struct SimpleFormatter;
impl<S, N> FormatEvent<S, N> for SimpleFormatter
where
S: Subscriber + for<'a> LookupSpan<'a>,
N: for<'a> FormatFields<'a> + 'static,
{
fn format_event(
&self,
ctx: &FmtContext<'_, S, N>,
mut writer: Writer<'_>,
event: &Event<'_>,
) -> std::fmt::Result {
// Format all the spans in the event's span context.
if let Some(scope) = ctx.event_scope() {
for span in scope.from_root() {
write!(writer, "{}", span.name())?;
// `FormattedFields` is a formatted representation of the span's
// fields, which is stored in its extensions by the `fmt` layer's
// `new_span` method. The fields will have been formatted
// by the same field formatter that's provided to the event
// formatter in the `FmtContext`.
let ext = span.extensions();
let fields = &ext
.get::<FormattedFields<N>>()
.expect("will never be `None`");
// Skip formatting the fields if the span had no fields.
if !fields.is_empty() {
write!(writer, "{{{}}}", fields)?;
}
write!(writer, ": ")?;
}
}
// Write fields on the event
ctx.field_format().format_fields(writer.by_ref(), event)?;
writeln!(writer)
}
}
impl SimpleFormatter {
fn new() -> Self {
SimpleFormatter {}
}
}

View File

@@ -0,0 +1,384 @@
use std::collections::HashMap;
use std::path::PathBuf;
use crate::backend_comms::NeonIOHandle;
use crate::file_cache::FileCache;
use crate::init::CommunicatorInitStruct;
use crate::integrated_cache::{CacheResult, IntegratedCacheWriteAccess};
use crate::neon_request::{CGetPageVRequest, CPrefetchVRequest};
use crate::neon_request::{NeonIORequest, NeonIOResult};
use pageserver_client_grpc::PageserverClient;
use pageserver_data_api::model;
use tokio::io::AsyncReadExt;
use tokio_epoll_uring::IoBuf;
use tokio_pipe::PipeRead;
use super::callbacks::{get_request_lsn, notify_proc};
use tracing::{error, info, trace};
use utils::lsn::Lsn;
pub struct CommunicatorWorkerProcessStruct<'a> {
neon_request_slots: &'a [NeonIOHandle],
pageserver_client: PageserverClient,
cache: IntegratedCacheWriteAccess<'a>,
submission_pipe_read_raw_fd: i32,
}
pub(super) async fn init(
cis: Box<CommunicatorInitStruct>,
tenant_id: String,
timeline_id: String,
auth_token: Option<String>,
shard_map: HashMap<u16, String>,
_file_cache_size: u64,
file_cache_path: Option<PathBuf>,
) -> CommunicatorWorkerProcessStruct<'static> {
let last_lsn = get_request_lsn();
let uring_system = tokio_epoll_uring::System::launch().await.unwrap();
let file_cache = if let Some(path) = file_cache_path {
Some(FileCache::new(&path, uring_system).expect("could not create cache file"))
} else {
// FIXME: temporarily for testing, use LFC even if disabled
Some(
FileCache::new(&PathBuf::from("new_filecache"), uring_system)
.expect("could not create cache file"),
)
};
// Initialize subsystems
let cache = cis
.integrated_cache_init_struct
.worker_process_init(last_lsn, file_cache);
let pageserver_client = PageserverClient::new(&tenant_id, &timeline_id, &auth_token, shard_map);
let this = CommunicatorWorkerProcessStruct {
neon_request_slots: cis.neon_request_slots,
pageserver_client,
cache,
submission_pipe_read_raw_fd: cis.submission_pipe_read_fd,
};
this
}
impl<'t> CommunicatorWorkerProcessStruct<'t> {
/// Main loop of the worker process. Receive requests from the backends and process them.
pub(super) async fn run(self: &'static Self) {
let mut idxbuf: [u8; 4] = [0; 4];
let mut submission_pipe_read =
PipeRead::from_raw_fd_checked(self.submission_pipe_read_raw_fd)
.expect("invalid pipe fd");
loop {
// Wait for a backend to ring the doorbell
match submission_pipe_read.read(&mut idxbuf).await {
Ok(4) => {}
Ok(nbytes) => panic!("short read ({nbytes} bytes) on communicator pipe"),
Err(e) => panic!("error reading from communicator pipe: {e}"),
}
let request_idx = u32::from_ne_bytes(idxbuf);
// Read the IO request from the slot indicated in the wakeup
let Some(slot) =
self.neon_request_slots[request_idx as usize].start_processing_request()
else {
// This currently should not happen. But if we have multiple threads picking up
// requests, and without waiting for the notifications, it could.
panic!("no request in slot");
};
// Ok, we have ownership of this request now. We must process
// it now, there's no going back.
//trace!("processing request {request_idx}: {request:?}");
// Spawn a separate task for every request. That's a little excessive for requests that
// can be quickly satisfied from the cache, but we expect that to be rare, because the
// requesting backend would have already checked the cache.
tokio::spawn(async {
let result = self.handle_request(slot.get_request()).await;
let owner_procno = slot.get_owner_procno();
// Ok, we have completed the IO. Mark the request as completed. After that,
// we no longer have ownership of the slot, and must not modify it.
slot.completed(result);
// Notify the backend about the completion. (Note that the backend might see
// the completed status even before this; this is just a wakeup)
notify_proc(owner_procno);
});
}
}
fn request_common(&self, not_modified_since_lsn: Lsn) -> model::RequestCommon {
model::RequestCommon {
request_lsn: get_request_lsn(),
not_modified_since_lsn,
}
}
async fn handle_request<'x>(self: &'static Self, req: &'x NeonIORequest) -> NeonIOResult {
match req {
NeonIORequest::Empty => {
error!("unexpected Empty IO request");
NeonIOResult::Error(-1)
}
NeonIORequest::RelExists(req) => {
let rel = req.reltag();
let not_modified_since = match self.cache.get_rel_exists(&rel) {
CacheResult::Found(exists) => return NeonIOResult::RelExists(exists),
CacheResult::NotFound(lsn) => lsn,
};
match self
.pageserver_client
.process_rel_exists_request(&model::RelExistsRequest {
common: self.request_common(not_modified_since),
rel,
})
.await
{
Ok(exists) => NeonIOResult::RelExists(exists),
Err(err) => {
info!("tonic error: {err:?}");
NeonIOResult::Error(-1)
}
}
}
NeonIORequest::RelSize(req) => {
let rel = req.reltag();
// Check the cache first
let not_modified_since = match self.cache.get_rel_size(&rel) {
CacheResult::Found(nblocks) => {
tracing::trace!("found relsize for {:?} in cache: {}", rel, nblocks);
return NeonIOResult::RelSize(nblocks);
}
CacheResult::NotFound(lsn) => lsn,
};
let common = self.request_common(not_modified_since);
match self
.pageserver_client
.process_rel_size_request(&model::RelSizeRequest {
common: common.clone(),
rel: rel.clone(),
})
.await
{
Ok(nblocks) => {
// update the cache
tracing::info!("updated relsize for {:?} in cache: {}", rel, nblocks);
self.cache.remember_rel_size(&rel, nblocks);
NeonIOResult::RelSize(nblocks)
}
Err(err) => {
info!("tonic error: {err:?}");
NeonIOResult::Error(-1)
}
}
}
NeonIORequest::GetPageV(req) => match self.handle_get_pagev_request(req).await {
Ok(()) => NeonIOResult::GetPageV,
Err(errno) => NeonIOResult::Error(errno),
},
NeonIORequest::PrefetchV(req) => {
let req = req.clone();
tokio::spawn(async move { self.handle_prefetchv_request(&req).await });
NeonIOResult::PrefetchVLaunched
}
NeonIORequest::DbSize(req) => {
// Check the cache first
let not_modified_since = match self.cache.get_db_size(req.db_oid) {
CacheResult::Found(db_size) => {
// the cached database size can be returned directly
return NeonIOResult::DbSize(db_size);
}
CacheResult::NotFound(lsn) => lsn,
};
match self
.pageserver_client
.process_dbsize_request(&model::DbSizeRequest {
common: self.request_common(not_modified_since),
db_oid: req.db_oid,
})
.await
{
Ok(db_size) => NeonIOResult::DbSize(db_size),
Err(err) => {
info!("tonic error: {err:?}");
NeonIOResult::Error(-1)
}
}
}
// Write requests
NeonIORequest::WritePage(req) => {
// Also store it in the LFC while we still have it
let rel = req.reltag();
self.cache
.remember_page(&rel, req.block_number, req.src, Lsn(req.lsn))
.await;
NeonIOResult::WriteOK
}
NeonIORequest::RelExtend(req) => {
self.cache
.remember_rel_size(&req.reltag(), req.block_number + 1);
NeonIOResult::WriteOK
}
NeonIORequest::RelZeroExtend(req) => {
self.cache
.remember_rel_size(&req.reltag(), req.block_number + req.nblocks);
NeonIOResult::WriteOK
}
NeonIORequest::RelCreate(req) => {
self.cache.remember_rel_size(&req.reltag(), 0);
NeonIOResult::WriteOK
}
NeonIORequest::RelTruncate(req) => {
self.cache.remember_rel_size(&req.reltag(), req.nblocks);
NeonIOResult::WriteOK
}
NeonIORequest::RelUnlink(req) => {
self.cache.forget_rel(&req.reltag());
NeonIOResult::WriteOK
}
}
}
async fn handle_get_pagev_request(&'t self, req: &CGetPageVRequest) -> Result<(), i32> {
let rel = req.reltag();
// Check the cache first
let mut cache_misses = Vec::new();
for i in 0..req.nblocks {
let blkno = req.block_number + i as u32;
let dest = req.dest[i as usize];
let not_modified_since = match self.cache.get_page(&rel, blkno, dest).await {
Ok(CacheResult::Found(_)) => {
// get_page already copied the block content to the destination
trace!("found blk {} in rel {:?} in LFC ", blkno, rel);
continue;
}
Ok(CacheResult::NotFound(lsn)) => lsn,
Err(_io_error) => return Err(-1), // FIXME errno?
};
cache_misses.push((blkno, not_modified_since, dest));
}
if cache_misses.is_empty() {
return Ok(());
}
let not_modified_since = cache_misses
.iter()
.map(|(_blkno, lsn, _dest)| *lsn)
.max()
.unwrap();
// TODO: Use batched protocol
for (blkno, _lsn, dest) in cache_misses.iter() {
match self
.pageserver_client
.get_page(&model::GetPageRequest {
common: self.request_common(not_modified_since),
rel: rel.clone(),
block_number: *blkno,
})
.await
{
Ok(page_image) => {
// Write the received page image directly to the shared memory location
// that the backend requested.
let src: &[u8] = page_image.as_ref();
let len = std::cmp::min(src.len(), dest.bytes_total() as usize);
unsafe {
std::ptr::copy_nonoverlapping(src.as_ptr(), dest.as_mut_ptr(), len);
};
trace!("remembering blk {} in rel {:?} in LFC", blkno, rel);
// Also store it in the LFC while we have it
self.cache
.remember_page(&rel, *blkno, page_image, not_modified_since)
.await;
}
Err(err) => {
info!("tonic error: {err:?}");
return Err(-1);
}
}
}
Ok(())
}
async fn handle_prefetchv_request(
self: &'static Self,
req: &CPrefetchVRequest,
) -> Result<(), i32> {
let rel = req.reltag();
// Check the cache first
let mut cache_misses = Vec::new();
for i in 0..req.nblocks {
let blkno = req.block_number + i as u32;
let not_modified_since = match self.cache.page_is_cached(&rel, blkno).await {
Ok(CacheResult::Found(_)) => {
trace!("found blk {} in rel {:?} in LFC", blkno, rel);
continue;
}
Ok(CacheResult::NotFound(lsn)) => lsn,
Err(_io_error) => return Err(-1), // FIXME errno?
};
cache_misses.push((blkno, not_modified_since));
}
if cache_misses.is_empty() {
return Ok(());
}
let not_modified_since = cache_misses.iter().map(|(_blkno, lsn)| *lsn).max().unwrap();
// TODO: spawn separate tasks for these. Use the integrated cache to keep track of the
// in-flight requests
// TODO: Use batched protocol
for (blkno, _lsn) in cache_misses.iter() {
match self
.pageserver_client
.get_page(&model::GetPageRequest {
common: self.request_common(not_modified_since),
rel: rel.clone(),
block_number: *blkno,
})
.await
{
Ok(page_image) => {
trace!(
"prefetch completed, remembering blk {} in rel {:?} in LFC",
*blkno, rel
);
self.cache
.remember_page(&rel, *blkno, page_image, not_modified_since)
.await;
}
Err(err) => {
info!("tonic error: {err:?}");
return Err(-1);
}
}
}
Ok(())
}
}
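// Backend-side counterpart (sketch only; the real submission path lives in
// backend_interface.rs and the C glue, not shown here): after filling a
// request slot, a backend "rings the doorbell" by writing the slot index as
// four native-endian bytes to the submission pipe, which pairs with the
// `idxbuf` read in run() above.
#[allow(dead_code)]
fn ring_doorbell_sketch(submission_pipe_write_fd: std::os::fd::RawFd, slot_idx: u32) -> std::io::Result<()> {
    use std::io::Write;
    use std::os::fd::FromRawFd;
    // Wrap the raw fd just for this write; mem::forget keeps the shared fd
    // open (a sketch shortcut instead of proper BorrowedFd handling).
    let mut pipe = unsafe { std::fs::File::from_raw_fd(submission_pipe_write_fd) };
    let res = pipe.write_all(&slot_idx.to_ne_bytes());
    std::mem::forget(pipe);
    res
}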

View File

@@ -0,0 +1,11 @@
//! This code runs in the communicator worker process. This provides
//! the glue code to:
//!
//! - launch the 'processor',
//! - receive IO requests from backends and pass them to the processor,
//! - write results back to backends.
mod callbacks;
mod logging;
mod main_loop;
mod worker_interface;

View File

@@ -0,0 +1,93 @@
//! Functions called from the C code in the worker process
use std::collections::HashMap;
use std::ffi::{CStr, c_char};
use std::path::PathBuf;
use tracing::error;
use crate::init::CommunicatorInitStruct;
use crate::worker_process::main_loop;
/// Launch the communicator's tokio tasks, which do most of the work.
///
/// The caller has initialized the process as a regular PostgreSQL
/// background worker process. The shared memory segment used to
/// communicate with the backends has been allocated and initialized
/// earlier, at postmaster startup, in rcommunicator_shmem_init().
#[unsafe(no_mangle)]
pub extern "C" fn communicator_worker_process_launch(
cis: Box<CommunicatorInitStruct>,
tenant_id: *const c_char,
timeline_id: *const c_char,
auth_token: *const c_char,
shard_map: *mut *mut c_char,
nshards: u32,
file_cache_path: *const c_char,
file_cache_size: u64,
) {
// Convert the arguments into more convenient Rust types
let tenant_id = unsafe { CStr::from_ptr(tenant_id) }.to_str().unwrap();
let timeline_id = unsafe { CStr::from_ptr(timeline_id) }.to_str().unwrap();
let auth_token = {
if auth_token.is_null() {
None
} else {
let c_str = unsafe { CStr::from_ptr(auth_token) };
Some(c_str.to_str().unwrap().to_string())
}
};
let file_cache_path = {
if file_cache_path.is_null() {
None
} else {
let c_str = unsafe { CStr::from_ptr(file_cache_path) };
Some(PathBuf::from(c_str.to_str().unwrap()))
}
};
let shard_map = parse_shard_map(nshards, shard_map);
// start main loop
let runtime = tokio::runtime::Builder::new_multi_thread()
.enable_all()
.thread_name("communicator thread")
.build()
.unwrap();
let worker_struct = runtime.block_on(main_loop::init(
cis,
tenant_id.to_string(),
timeline_id.to_string(),
auth_token,
shard_map,
file_cache_size,
file_cache_path,
));
let worker_struct = Box::leak(Box::new(worker_struct));
let main_loop_handle = runtime.spawn(worker_struct.run());
runtime.spawn(async {
let err = main_loop_handle.await.unwrap_err();
error!("error: {err:?}");
});
// keep the runtime running after we exit this function
Box::leak(Box::new(runtime));
}
/// Convert the "shard map" from an array of C strings, indexed by shard number, to a Rust HashMap
fn parse_shard_map(nshards: u32, shard_map: *mut *mut c_char) -> HashMap<u16, String> {
let mut result: HashMap<u16, String> = HashMap::new();
let mut p = shard_map;
for i in 0..nshards {
let c_str = unsafe { CStr::from_ptr(*p) };
p = unsafe { p.add(1) };
let s = c_str.to_str().unwrap();
result.insert(i as u16, s.into());
}
result
}
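// Test-style sketch (not part of this commit) of the expected shard map
// layout: element i of the C string array is the connection string for
// shard i. The connection strings below are made up.
#[allow(dead_code)]
fn parse_shard_map_sketch() {
    use std::ffi::CString;
    let shard0 = CString::new("host=ps-0 port=6400").unwrap();
    let shard1 = CString::new("host=ps-1 port=6400").unwrap();
    let mut ptrs = [
        shard0.as_ptr() as *mut c_char,
        shard1.as_ptr() as *mut c_char,
    ];
    let map = parse_shard_map(2, ptrs.as_mut_ptr());
    assert_eq!(map.get(&0u16).map(String::as_str), Some("host=ps-0 port=6400"));
    assert_eq!(map.get(&1u16).map(String::as_str), Some("host=ps-1 port=6400"));
}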

View File

@@ -0,0 +1,953 @@
/*-------------------------------------------------------------------------
*
* communicator_new.c
* Functions for communicating with remote pageservers.
*
* This is the "new" communicator. It consists of functions that
* are called from the smgr implementation, in pagestore_smgr.c.
*
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/xlog.h"
#include "access/xlogdefs.h"
#if PG_VERSION_NUM >= 150000
#include "access/xlogrecovery.h"
#endif
#include "access/xlog_internal.h"
#include "access/xlogutils.h"
#include "executor/instrument.h"
#include "miscadmin.h"
#include "postmaster/bgworker.h"
#include "postmaster/interrupt.h"
#include "replication/walsender.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/latch.h"
#include "storage/procarray.h"
#if PG_VERSION_NUM >= 170000
#include "storage/procnumber.h"
#endif
#include "storage/spin.h"
#include "tcop/tcopprot.h"
#include "communicator_new.h"
#include "neon.h"
#include "neon_perf_counters.h"
#include "pagestore_client.h"
/*
* FIXME: these are in file_cache.h, but I don't want to #include that
 * here. This code shouldn't be using the C file cache for anything other than
* the GUCs.
*/
extern int lfc_size_limit;
extern char *lfc_path;
/* the rust bindings, generated by cbindgen */
#include "communicator/communicator_bindings.h"
#define MaxProcs (MaxBackends + NUM_AUXILIARY_PROCS)
static CommunicatorInitStruct *cis;
static CommunicatorBackendStruct *my_bs;
static File cache_file = 0;
typedef struct CommunicatorShmemPerBackendData
{
/*
* Latch used to notify backend of IO completion. We cannot use the
* standard process latch (MyProc->latch) because we cannot clear that
* latch as part of the IO handling, or we might cause the caller to miss
* some other events.
*/
Latch io_completion_latch;
/*
* Normally, when reading or writing pages from shared buffer cache, the
* worker process can operate directly on the shared buffer. But when
* working with a local buffer, we use this "bounce buffer" to pass the
* data to the worker process.
*
* TODO: That's slow, because it incurs an extra memory copy, and there's
* currently only one of these per backend, which means you can have only
* one such IO in progress at a time.
*/
PGIOAlignedBlock bounce_buffer;
} CommunicatorShmemPerBackendData;
typedef struct CommunicatorShmemData
{
int dummy;
CommunicatorShmemPerBackendData backends[]; /* MaxProcs */
/* rust-managed shmem area follows at next MAXALIGN boundary */
} CommunicatorShmemData;
static CommunicatorShmemData *communicator_shmem_ptr;
#define MyIOCompletionLatch (&communicator_shmem_ptr->backends[MyProcNumber].io_completion_latch)
static slock_t in_elog;
#define MAX_INFLIGHT_ASYNC_REQUESTS 5
/* request indexes of (prefetch) requests that have been started */
static int inflight_requests[MAX_INFLIGHT_ASYNC_REQUESTS];
static int num_inflight_requests = 0;
static int start_request(NeonIORequest *request, struct NeonIOResult *immediate_result_p);
static void wait_request_completion(int request_idx, struct NeonIOResult *result_p);
static void perform_request(NeonIORequest *request, struct NeonIOResult *result_p);
static void process_inflight_requests(void);
static bool bounce_needed(void *buffer);
static void *bounce_buf(void);
static void *bounce_write_if_needed(void *buffer);
PGDLLEXPORT void communicator_new_bgworker_main(Datum main_arg);
static void communicator_new_backend_exit(int code, Datum arg);
/**** Initialization functions. These run in postmaster ****/
void
pg_init_communicator_new(void)
{
BackgroundWorker bgw;
/* Initialize the background worker process */
memset(&bgw, 0, sizeof(bgw));
bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
bgw.bgw_start_time = BgWorkerStart_PostmasterStart;
snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon");
snprintf(bgw.bgw_function_name, BGW_MAXLEN, "communicator_new_bgworker_main");
snprintf(bgw.bgw_name, BGW_MAXLEN, "Storage communicator process");
snprintf(bgw.bgw_type, BGW_MAXLEN, "Storage communicator process");
bgw.bgw_restart_time = 5;
bgw.bgw_notify_pid = 0;
bgw.bgw_main_arg = (Datum) 0;
RegisterBackgroundWorker(&bgw);
SpinLockInit(&in_elog);
}
static size_t
communicator_new_shmem_size(void)
{
size_t size = 0;
size += MAXALIGN(
offsetof(CommunicatorShmemData, backends) +
MaxProcs * sizeof(CommunicatorShmemPerBackendData)
);
/* space needed by the rust code */
size += rcommunicator_shmem_size(MaxProcs);
return size;
}
void
communicator_new_shmem_request(void)
{
RequestAddinShmemSpace(communicator_new_shmem_size());
}
void
communicator_new_shmem_startup(void)
{
bool found;
int pipefd[2];
int rc;
size_t communicator_size;
size_t shmem_size;
void *shmem_ptr;
rc = pipe(pipefd);
if (rc != 0)
ereport(ERROR,
(errcode_for_file_access(),
errmsg_internal("could not create pipe between neon communicator and backends: %m")));
if (fcntl(pipefd[0], F_SETFL, O_NONBLOCK) == -1)
elog(FATAL, "fcntl(F_SETFL) failed on read-end of communicator pipe: %m");
if (fcntl(pipefd[1], F_SETFL, O_NONBLOCK) == -1)
elog(FATAL, "fcntl(F_SETFL) failed on write-end of communicator pipe: %m");
shmem_size = communicator_new_shmem_size();
shmem_ptr = ShmemInitStruct("Communicator shmem state",
shmem_size,
&found);
Assert(!found);
/* Initialize the C-managed parts */
communicator_shmem_ptr = (CommunicatorShmemData *) shmem_ptr;
communicator_size = MAXALIGN(offsetof(CommunicatorShmemData, backends) + MaxProcs * sizeof(CommunicatorShmemPerBackendData));
shmem_ptr = (char *) shmem_ptr + communicator_size;
shmem_size -= communicator_size;
for (int i = 0; i < MaxProcs; i++)
InitSharedLatch(&communicator_shmem_ptr->backends[i].io_completion_latch);
/* Initialize the rust-managed parts */
cis = rcommunicator_shmem_init(pipefd[0], pipefd[1], MaxProcs, shmem_ptr, shmem_size);
}
/**** Worker process functions. These run in the communicator worker process ****/
/* Entry point for the communicator bgworker process */
void
communicator_new_bgworker_main(Datum main_arg)
{
char **connstrs;
shardno_t num_shards;
struct LoggingState *logging;
char errbuf[1000];
int elevel;
/* Establish signal handlers. */
pqsignal(SIGUSR1, procsignal_sigusr1_handler);
pqsignal(SIGHUP, SignalHandlerForConfigReload);
pqsignal(SIGTERM, die);
BackgroundWorkerUnblockSignals();
get_shard_map(&connstrs, &num_shards);
logging = configure_logging();
communicator_worker_process_launch(
cis,
neon_tenant,
neon_timeline,
neon_auth_token,
connstrs,
num_shards,
lfc_path,
lfc_size_limit);
cis = NULL;
elog(LOG, "communicator threads started");
for (;;)
{
int32 rc;
CHECK_FOR_INTERRUPTS();
for (;;)
{
rc = pump_logging(logging, (uint8 *) errbuf, sizeof(errbuf), &elevel);
if (rc == 0)
{
/* nothing to do */
break;
}
else if (rc == 1)
{
/* Because we don't want to exit on error */
if (elevel == ERROR)
elevel = LOG;
if (elevel == INFO)
elevel = LOG;
elog(elevel, "[COMMUNICATOR] %s", errbuf);
}
else if (rc == -1)
{
elog(ERROR, "logging channel was closed unexpectedly");
}
}
(void) WaitLatch(MyLatch,
WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
0,
PG_WAIT_EXTENSION);
ResetLatch(MyLatch);
}
}
/*
* Callbacks from the rust code, in the communicator process.
*
* NOTE: These must be thread safe! It's very limited which PostgreSQL functions you can use!!!
*
* NOTE: the signatures of these better match the Rust definitions!
*/
void
notify_proc_unsafe(int procno)
{
SetLatch(&communicator_shmem_ptr->backends[procno].io_completion_latch);
}
void
callback_set_my_latch_unsafe(void)
{
SetLatch(MyLatch);
}
/*
* FIXME: The logic from neon_get_request_lsns() needs to go here, except for
* the last-written LSN cache stuff, which is managed by the rust code now.
*/
uint64
callback_get_request_lsn_unsafe(void)
{
/*
* NB: be very careful with what you do here! This is called from tokio
 * threads, so anything that tries to take LWLocks is unsafe, for example.
*
* RecoveryInProgress() is OK
*/
if (RecoveryInProgress())
{
XLogRecPtr replay_lsn = GetXLogReplayRecPtr(NULL);
return replay_lsn;
}
else
{
XLogRecPtr flushlsn;
#if PG_VERSION_NUM >= 150000
flushlsn = GetFlushRecPtr(NULL);
#else
flushlsn = GetFlushRecPtr();
#endif
return flushlsn;
}
}
/**** Backend functions. These run in each backend ****/
/* Initialize per-backend private state */
void
communicator_new_init(void)
{
Assert(cis != NULL);
Assert(my_bs == NULL);
if (MyBgworkerEntry && strcmp(MyBgworkerEntry->bgw_function_name, "communicator_new_bgworker_main") == 0)
return;
OwnLatch(MyIOCompletionLatch);
my_bs = rcommunicator_backend_init(cis, MyProcNumber);
cis = NULL;
/*
* Arrange to clean up at backend exit.
*/
on_shmem_exit(communicator_new_backend_exit, 0);
}
static void
communicator_new_backend_exit(int code, Datum arg)
{
DisownLatch(MyIOCompletionLatch);
}
/*
* prefetch_register_bufferv() - register and prefetch buffers
*
* Register that we may want the contents of BufferTag in the near future.
* This is used when issuing a speculative prefetch request, but also when
* performing a synchronous request and need the buffer right now.
*
* When performing a prefetch rather than a synchronous request,
* is_prefetch==true. Currently, it only affects how the request is accounted
* in the perf counters.
*
* NOTE: this function may indirectly update MyPState->pfs_hash; which
* invalidates any active pointers into the hash table.
*/
void
communicator_new_prefetch_register_bufferv(NRelFileInfo rinfo, ForkNumber forkNum,
BlockNumber blockno, BlockNumber nblocks)
{
int request_idx;
NeonIORequest request = {
.tag = NeonIORequest_PrefetchV,
.prefetch_v = {
.spc_oid = NInfoGetSpcOid(rinfo),
.db_oid = NInfoGetDbOid(rinfo),
.rel_number = NInfoGetRelNumber(rinfo),
.fork_number = forkNum,
.block_number = blockno,
.nblocks = nblocks,
}
};
struct NeonIOResult result;
elog(LOG, "prefetch called for rel %u/%u/%u.%u block %u (%u blocks)",
RelFileInfoFmt(rinfo), forkNum, blockno, nblocks);
if (num_inflight_requests >= MAX_INFLIGHT_ASYNC_REQUESTS)
process_inflight_requests();
request_idx = bcomm_start_io_request(my_bs, &request, &result);
if (request_idx == -1)
{
/* -1 means the request was satisfied immediately. */
/* FIXME: check and log errors */
return;
}
inflight_requests[num_inflight_requests] = request_idx;
num_inflight_requests++;
elog(LOG, "sent prefetch request with idx %d", request_idx);
}
static void
process_inflight_requests(void)
{
struct NeonIOResult result;
/* FIXME: log errors */
for (int i = 0; i < num_inflight_requests; i++)
wait_request_completion(inflight_requests[i], &result);
num_inflight_requests = 0;
}
/*
* Perform an IO request in a synchronous fashion.
*
 * The result is returned in *result_p.
*/
static void
perform_request(NeonIORequest * request, struct NeonIOResult *result_p)
{
int request_idx;
process_inflight_requests();
request_idx = start_request(request, result_p);
if (request_idx == -1)
{
/* it was completed immediately */
return;
}
wait_request_completion(request_idx, result_p);
}
static int
start_request(NeonIORequest * request, struct NeonIOResult *immediate_result_p)
{
int request_idx;
request_idx = bcomm_start_io_request(my_bs, request, immediate_result_p);
if (request_idx == -1)
{
/* -1 means the request was satisfied immediately. */
return -1;
}
elog(DEBUG5, "sent request with idx %d: tag %d", request_idx, request->tag);
return request_idx;
}
static void
wait_request_completion(int request_idx, struct NeonIOResult *result_p)
{
int32_t poll_res;
/* fixme: check 'request_idx' ? */
for (;;)
{
ResetLatch(MyIOCompletionLatch);
poll_res = bcomm_poll_request_completion(my_bs, request_idx, result_p);
if (poll_res == -1)
{
CHECK_FOR_INTERRUPTS();
/*
* TODO: wake up periodically for CHECK_FOR_INTERRUPTS(). Because
* we wait on MyIOCompletionLatch rather than MyLatch, we won't be
* woken up for the standard interrupts.
*/
(void) WaitLatch(MyIOCompletionLatch,
WL_EXIT_ON_PM_DEATH | WL_LATCH_SET,
0,
WAIT_EVENT_NEON_PS_STARTING);
continue; /* still busy */
}
else if (poll_res == 0)
{
return;
}
else
{
elog(ERROR, "unexpected return code from bcomm_poll_request_completion()");
}
}
}
/*
* Does the physical file exist?
*/
bool
communicator_new_rel_exists(NRelFileInfo rinfo, ForkNumber forkNum)
{
NeonIORequest request = {
.tag = NeonIORequest_RelExists,
.rel_exists = {
.spc_oid = NInfoGetSpcOid(rinfo),
.db_oid = NInfoGetDbOid(rinfo),
.rel_number = NInfoGetRelNumber(rinfo),
.fork_number = forkNum,
}
};
NeonIOResult result;
perform_request(&request, &result);
switch (result.tag)
{
case NeonIOResult_RelExists:
return result.rel_exists;
case NeonIOResult_Error:
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not check existence of rel %u/%u/%u.%u: %s",
RelFileInfoFmt(rinfo), forkNum, pg_strerror(result.error))));
break;
default:
elog(ERROR, "unexpected result for RelExists operation: %d", result.tag);
break;
}
}
/*
* Read N consecutive pages from a relation
*/
void
communicator_new_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blockno,
void **buffers, BlockNumber nblocks)
{
NeonIOResult result;
CCachedGetPageVResult cached_result;
void *bounce_buf_used = NULL;
int request_idx;
NeonIORequest request = {
.tag = NeonIORequest_GetPageV,
.get_page_v = {
.spc_oid = NInfoGetSpcOid(rinfo),
.db_oid = NInfoGetDbOid(rinfo),
.rel_number = NInfoGetRelNumber(rinfo),
.fork_number = forkNum,
.block_number = blockno,
.nblocks = nblocks,
}
};
elog(LOG, "getpagev called for rel %u/%u/%u.%u block %u (%u blocks)",
RelFileInfoFmt(rinfo), forkNum, blockno, nblocks);
/* Fill in the destination buffers in the request */
if (nblocks == 1)
{
if (bounce_needed(buffers[0]))
{
bounce_buf_used = bounce_buf();
request.get_page_v.dest[0].ptr = bounce_buf_used;
}
else
request.get_page_v.dest[0].ptr = buffers[0];
}
else
{
for (int i = 0; i < nblocks; i++)
{
if (bounce_needed(buffers[i]))
{
/* Split the vector-request into single page requests */
for (int j = 0; j < nblocks; j++)
{
communicator_new_read_at_lsnv(rinfo, forkNum, blockno + j,
&buffers[j], 1);
}
return;
}
request.get_page_v.dest[i].ptr = buffers[i];
}
}
process_inflight_requests();
retry:
request_idx = bcomm_start_get_page_v_request(my_bs, &request, &cached_result);
if (request_idx == -1)
{
bool completed;
/*
* LFC hit, but we are responsible for completing the I/O on the local
* file
*/
if (cache_file == 0)
cache_file = PathNameOpenFile(lfc_path, O_RDONLY | PG_BINARY);
for (int i = 0; i < nblocks; i++)
{
uint64_t cached_block = cached_result.cache_block_numbers[i];
ssize_t bytes_total = 0;
while (bytes_total < BLCKSZ)
{
ssize_t nbytes;
nbytes = FileRead(cache_file, ((char *) buffers[i]) + bytes_total, BLCKSZ - bytes_total, cached_block * BLCKSZ + bytes_total, WAIT_EVENT_NEON_LFC_READ);
if (nbytes == -1)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not read block %lu in local cache file: %m",
cached_block)));
bytes_total += nbytes;
}
}
completed = bcomm_finish_cache_read(my_bs);
if (!completed)
{
elog(DEBUG1, "read from local cache file was superseded by concurrent update");
goto retry;
}
return;
}
wait_request_completion(request_idx, &result);
switch (result.tag)
{
case NeonIOResult_GetPageV:
if (bounce_buf_used)
memcpy(buffers[0], bounce_buf_used, BLCKSZ);
return;
case NeonIOResult_Error:
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not read block %u in rel %u/%u/%u.%u: %s",
blockno, RelFileInfoFmt(rinfo), forkNum, pg_strerror(result.error))));
break;
default:
elog(ERROR, "unexpected result for GetPage operation: %d", result.tag);
break;
}
}
/*
* neon_nblocks() -- Get the number of blocks stored in a relation.
*/
BlockNumber
communicator_new_rel_nblocks(NRelFileInfo rinfo, ForkNumber forkNum)
{
NeonIORequest request = {
.tag = NeonIORequest_RelSize,
.rel_size = {
.spc_oid = NInfoGetSpcOid(rinfo),
.db_oid = NInfoGetDbOid(rinfo),
.rel_number = NInfoGetRelNumber(rinfo),
.fork_number = forkNum,
}
};
NeonIOResult result;
perform_request(&request, &result);
switch (result.tag)
{
case NeonIOResult_RelSize:
return result.rel_size;
case NeonIOResult_Error:
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not read size of rel %u/%u/%u.%u: %s",
RelFileInfoFmt(rinfo), forkNum, pg_strerror(result.error))));
break;
default:
elog(ERROR, "unexpected result for RelSize operation: %d", result.tag);
break;
}
}
/*
* neon_db_size() -- Get the size of the database in bytes.
*/
int64
communicator_new_dbsize(Oid dbNode)
{
NeonIORequest request = {
.tag = NeonIORequest_DbSize,
.db_size = {
.db_oid = dbNode,
}
};
NeonIOResult result;
perform_request(&request, &result);
switch (result.tag)
{
case NeonIOResult_DbSize:
return (int64) result.db_size;
case NeonIOResult_Error:
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not read database size of database %u: %s",
dbNode, pg_strerror(result.error))));
break;
default:
elog(ERROR, "unexpected result for DbSize operation: %d", result.tag);
break;
}
}
int
communicator_new_read_slru_segment(SlruKind kind, int64 segno, void *buffer)
{
/* TODO */
elog(ERROR, "not implemented");
}
/* Write requests */
void
communicator_new_write_page(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blockno,
const void *buffer, XLogRecPtr lsn)
{
void *src = bounce_write_if_needed((void *) buffer);
NeonIORequest request = {
.tag = NeonIORequest_WritePage,
.write_page = {
.spc_oid = NInfoGetSpcOid(rinfo),
.db_oid = NInfoGetDbOid(rinfo),
.rel_number = NInfoGetRelNumber(rinfo),
.fork_number = forkNum,
.block_number = blockno,
.lsn = lsn,
.src.ptr = src,
}
};
NeonIOResult result;
perform_request(&request, &result);
switch (result.tag)
{
case NeonIOResult_WriteOK:
return;
case NeonIOResult_Error:
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not write block %u in rel %u/%u/%u.%u: %s",
blockno, RelFileInfoFmt(rinfo), forkNum, pg_strerror(result.error))));
break;
default:
elog(ERROR, "unexpected result for WritePage operation: %d", result.tag);
break;
}
}
void
communicator_new_rel_extend(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blockno,
const void *buffer, XLogRecPtr lsn)
{
void *src = bounce_write_if_needed((void *) buffer);
NeonIORequest request = {
.tag = NeonIORequest_RelExtend,
.rel_extend = {
.spc_oid = NInfoGetSpcOid(rinfo),
.db_oid = NInfoGetDbOid(rinfo),
.rel_number = NInfoGetRelNumber(rinfo),
.fork_number = forkNum,
.block_number = blockno,
.lsn = lsn,
.src_ptr = (uintptr_t) src,
.src_size = BLCKSZ,
}
};
NeonIOResult result;
perform_request(&request, &result);
switch (result.tag)
{
case NeonIOResult_WriteOK:
return;
case NeonIOResult_Error:
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not extend to block %u in rel %u/%u/%u.%u: %s",
blockno, RelFileInfoFmt(rinfo), forkNum, pg_strerror(result.error))));
break;
default:
elog(ERROR, "unexpected result for Extend operation: %d", result.tag);
break;
}
}
void
communicator_new_rel_zeroextend(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blockno,
BlockNumber nblocks, XLogRecPtr lsn)
{
NeonIORequest request = {
.tag = NeonIORequest_RelZeroExtend,
.rel_zero_extend = {
.spc_oid = NInfoGetSpcOid(rinfo),
.db_oid = NInfoGetDbOid(rinfo),
.rel_number = NInfoGetRelNumber(rinfo),
.fork_number = forkNum,
.block_number = blockno,
.nblocks = nblocks,
.lsn = lsn,
}
};
NeonIOResult result;
perform_request(&request, &result);
switch (result.tag)
{
case NeonIOResult_WriteOK:
return;
case NeonIOResult_Error:
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not zeroextend to block %u in rel %u/%u/%u.%u: %s",
blockno, RelFileInfoFmt(rinfo), forkNum, pg_strerror(result.error))));
break;
default:
elog(ERROR, "unexpected result for ZeroExtend operation: %d", result.tag);
break;
}
}
void
communicator_new_rel_create(NRelFileInfo rinfo, ForkNumber forkNum)
{
NeonIORequest request = {
.tag = NeonIORequest_RelCreate,
.rel_create = {
.spc_oid = NInfoGetSpcOid(rinfo),
.db_oid = NInfoGetDbOid(rinfo),
.rel_number = NInfoGetRelNumber(rinfo),
.fork_number = forkNum,
}
};
NeonIOResult result;
perform_request(&request, &result);
switch (result.tag)
{
case NeonIOResult_WriteOK:
return;
case NeonIOResult_Error:
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not create rel %u/%u/%u.%u: %s",
RelFileInfoFmt(rinfo), forkNum, pg_strerror(result.error))));
break;
default:
elog(ERROR, "unexpected result for Create operation: %d", result.tag);
break;
}
}
void
communicator_new_rel_truncate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks)
{
NeonIORequest request = {
.tag = NeonIORequest_RelTruncate,
.rel_truncate = {
.spc_oid = NInfoGetSpcOid(rinfo),
.db_oid = NInfoGetDbOid(rinfo),
.rel_number = NInfoGetRelNumber(rinfo),
.fork_number = forkNum,
.nblocks = nblocks,
}
};
NeonIOResult result;
perform_request(&request, &result);
switch (result.tag)
{
case NeonIOResult_WriteOK:
return;
case NeonIOResult_Error:
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not truncate rel %u/%u/%u.%u to %u blocks: %s",
RelFileInfoFmt(rinfo), forkNum, nblocks, pg_strerror(result.error))));
break;
default:
elog(ERROR, "unexpected result for Truncate operation: %d", result.tag);
break;
}
}
void
communicator_new_rel_unlink(NRelFileInfo rinfo, ForkNumber forkNum)
{
NeonIORequest request = {
.tag = NeonIORequest_RelUnlink,
.rel_unlink = {
.spc_oid = NInfoGetSpcOid(rinfo),
.db_oid = NInfoGetDbOid(rinfo),
.rel_number = NInfoGetRelNumber(rinfo),
.fork_number = forkNum,
}
};
NeonIOResult result;
perform_request(&request, &result);
switch (result.tag)
{
case NeonIOResult_WriteOK:
return;
case NeonIOResult_Error:
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not unlink rel %u/%u/%u.%u: %s",
RelFileInfoFmt(rinfo), forkNum, pg_strerror(result.error))));
break;
default:
elog(ERROR, "unexpected result for Unlink operation: %d", result.tag);
break;
}
}
/*
* The worker process can read / write shared buffers directly. But if smgrread() or
* smgrwrite() is called with a private temporary buffer, we need to copy it to the
 * "bounce buffer", to make it available for the worker process.
*/
static bool
bounce_needed(void *buffer)
{
if ((uintptr_t) buffer >= (uintptr_t) BufferBlocks &&
(uintptr_t) buffer < (uintptr_t) BufferBlocks + NBuffers * BLCKSZ)
{
return false;
}
return true;
}
static void *
bounce_buf(void)
{
return &communicator_shmem_ptr->backends[MyProcNumber].bounce_buffer;
}
static void *
bounce_write_if_needed(void *buffer)
{
void *p;
if (!bounce_needed(buffer))
return buffer;
p = bounce_buf();
memcpy(p, buffer, BLCKSZ);
return p;
}
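
/*
 * Illustration only (hypothetical caller, not part of this file): a page
 * that lives in shared buffers can be handed to the worker process as-is,
 * whereas a backend-local page, such as a stack-allocated PGIOAlignedBlock,
 * must go through the per-backend bounce buffer:
 *
 *     PGIOAlignedBlock local_page;              // backend-local memory
 *     Block shared_page = BufferGetBlock(buf);  // points into BufferBlocks
 *
 *     Assert(bounce_needed(local_page.data));   // copied via the bounce buffer
 *     Assert(!bounce_needed(shared_page));      // worker reads it directly
 */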

View File

@@ -0,0 +1,54 @@
/*-------------------------------------------------------------------------
*
* communicator_new.h
 *	  New communicator implementation
*
*
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*-------------------------------------------------------------------------
*/
#ifndef COMMUNICATOR_NEW_H
#define COMMUNICATOR_NEW_H
#include "neon_pgversioncompat.h"
#include "storage/buf_internals.h"
#include "pagestore_client.h"
/* initialization at postmaster startup */
extern void pg_init_communicator_new(void);
extern void communicator_new_shmem_request(void);
extern void communicator_new_shmem_startup(void);
/* initialization at backend startup */
extern void communicator_new_init(void);
/* Read requests */
extern bool communicator_new_rel_exists(NRelFileInfo rinfo, ForkNumber forkNum);
extern BlockNumber communicator_new_rel_nblocks(NRelFileInfo rinfo, ForkNumber forknum);
extern int64 communicator_new_dbsize(Oid dbNode);
extern void communicator_new_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum,
BlockNumber base_blockno,
void **buffers, BlockNumber nblocks);
extern void communicator_new_prefetch_register_bufferv(NRelFileInfo rinfo, ForkNumber forkNum,
BlockNumber blockno,
BlockNumber nblocks);
extern int communicator_new_read_slru_segment(SlruKind kind, int64 segno,
void *buffer);
/* Write requests, to keep the caches up-to-date */
extern void communicator_new_write_page(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blockno,
const void *buffer, XLogRecPtr lsn);
extern void communicator_new_rel_extend(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blockno,
const void *buffer, XLogRecPtr lsn);
extern void communicator_new_rel_zeroextend(NRelFileInfo rinfo, ForkNumber forkNum,
BlockNumber blockno, BlockNumber nblocks,
XLogRecPtr lsn);
extern void communicator_new_rel_create(NRelFileInfo rinfo, ForkNumber forkNum);
extern void communicator_new_rel_truncate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks);
extern void communicator_new_rel_unlink(NRelFileInfo rinfo, ForkNumber forkNum);
#endif /* COMMUNICATOR_NEW_H */

View File

@@ -164,10 +164,10 @@ static HTAB *lfc_hash;
static int lfc_desc = -1;
static LWLockId lfc_lock;
static int lfc_max_size;
static int lfc_size_limit;
int lfc_size_limit;
static int lfc_chunk_size_log = MAX_BLOCKS_PER_CHUNK_LOG;
static int lfc_blocks_per_chunk = MAX_BLOCKS_PER_CHUNK;
static char *lfc_path;
char *lfc_path;
static uint64 lfc_generation;
static FileCacheControl *lfc_ctl;
static shmem_startup_hook_type prev_shmem_startup_hook;

View File

@@ -15,6 +15,8 @@
/* GUCs */
extern bool lfc_store_prefetch_result;
extern int lfc_size_limit;
extern char *lfc_path;
/* functions for local file cache */
extern void lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum,

View File

@@ -279,6 +279,55 @@ AssignPageserverConnstring(const char *newval, void *extra)
}
}
/* Return a copy of the whole shard map from shared memory */
void
get_shard_map(char ***connstrs_p, shardno_t *num_shards_p)
{
uint64 begin_update_counter;
uint64 end_update_counter;
ShardMap *shard_map = &pagestore_shared->shard_map;
shardno_t num_shards;
char *buf;
char **connstrs;
buf = palloc(MAX_SHARDS*MAX_PAGESERVER_CONNSTRING_SIZE);
connstrs = palloc(sizeof(char *) * MAX_SHARDS);
/*
 * The postmaster can update the shared memory values concurrently, in
 * which case we would copy a garbled mix of the old and new values. We
 * will detect that because the counters won't match, and retry. But it's
 * important that we don't do anything within the retry loop that would
 * depend on the strings having valid contents.
*/
do
{
char *p;
begin_update_counter = pg_atomic_read_u64(&pagestore_shared->begin_update_counter);
end_update_counter = pg_atomic_read_u64(&pagestore_shared->end_update_counter);
num_shards = shard_map->num_shards;
p = buf;
for (int i = 0; i < Min(num_shards, MAX_SHARDS); i++)
{
strlcpy(p, shard_map->connstring[i], MAX_PAGESERVER_CONNSTRING_SIZE);
connstrs[i] = p;
elog(LOG, "XX: connstrs[%d]: %p", i, p);
p += MAX_PAGESERVER_CONNSTRING_SIZE;
}
pg_memory_barrier();
}
while (begin_update_counter != end_update_counter
|| begin_update_counter != pg_atomic_read_u64(&pagestore_shared->begin_update_counter)
|| end_update_counter != pg_atomic_read_u64(&pagestore_shared->end_update_counter));
*connstrs_p = connstrs;
*num_shards_p = num_shards;
}
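
/*
 * For reference, a sketch of the update protocol that the retry loop above
 * relies on (hypothetical; the actual updater runs in the postmaster and is
 * not shown in this hunk):
 *
 *     pg_atomic_fetch_add_u64(&pagestore_shared->begin_update_counter, 1);
 *     pg_memory_barrier();
 *     ... overwrite shard_map->num_shards and shard_map->connstring[] ...
 *     pg_memory_barrier();
 *     pg_atomic_fetch_add_u64(&pagestore_shared->end_update_counter, 1);
 *
 * A reader that observes begin != end, or sees either counter change while
 * copying, discards its copy and retries.
 */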
/*
* Get the current number of shards, and/or the connection string for a
* particular shard from the shard map in shared memory.

View File

@@ -20,6 +20,7 @@
#include "replication/logicallauncher.h"
#include "replication/slot.h"
#include "replication/walsender.h"
#include "storage/ipc.h"
#include "storage/proc.h"
#include "funcapi.h"
#include "access/htup_details.h"
@@ -29,6 +30,7 @@
#include "utils/guc_tables.h"
#include "communicator.h"
#include "communicator_new.h"
#include "extension_server.h"
#include "file_cache.h"
#include "neon.h"
@@ -45,13 +47,17 @@ PG_MODULE_MAGIC;
void _PG_init(void);
bool neon_enable_new_communicator;
static int running_xacts_overflow_policy;
#if PG_MAJORVERSION_NUM >= 16
static shmem_startup_hook_type prev_shmem_startup_hook;
static void neon_shmem_startup_hook(void);
#if PG_VERSION_NUM>=150000
static shmem_request_hook_type prev_shmem_request_hook;
#endif
static void neon_shmem_request(void);
static void neon_shmem_startup_hook(void);
#if PG_MAJORVERSION_NUM >= 17
uint32 WAIT_EVENT_NEON_LFC_MAINTENANCE;
uint32 WAIT_EVENT_NEON_LFC_READ;
@@ -430,17 +436,36 @@ _PG_init(void)
*/
#if PG_VERSION_NUM >= 160000
load_file("$libdir/neon_rmgr", false);
#endif
prev_shmem_startup_hook = shmem_startup_hook;
shmem_startup_hook = neon_shmem_startup_hook;
#if PG_VERSION_NUM>=150000
prev_shmem_request_hook = shmem_request_hook;
shmem_request_hook = neon_shmem_request;
#else
neon_shmem_request();
#endif
DefineCustomBoolVariable(
"neon.enable_new_communicator",
"Enables new communicator implementation",
NULL,
&neon_enable_new_communicator,
true,
PGC_POSTMASTER,
0,
NULL, NULL, NULL);
pg_init_libpagestore();
lfc_init();
pg_init_walproposer();
init_lwlsncache();
pg_init_communicator();
if (neon_enable_new_communicator)
pg_init_communicator_new();
Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
InitUnstableExtensionsSupport();
@@ -559,7 +584,17 @@ backpressure_throttling_time(PG_FUNCTION_ARGS)
PG_RETURN_UINT64(BackpressureThrottlingTime());
}
#if PG_MAJORVERSION_NUM >= 16
static void
neon_shmem_request(void)
{
#if PG_VERSION_NUM>=150000
if (prev_shmem_request_hook)
prev_shmem_request_hook();
#endif
communicator_new_shmem_request();
}
static void
neon_shmem_startup_hook(void)
{
@@ -579,5 +614,6 @@ neon_shmem_startup_hook(void)
WAIT_EVENT_NEON_PS_READ = WaitEventExtensionNew("Neon/PS_ReadIO");
WAIT_EVENT_NEON_WAL_DL = WaitEventExtensionNew("Neon/WAL_Download");
#endif
communicator_new_shmem_startup();
}
#endif

View File

@@ -13,6 +13,7 @@
#include "utils/wait_event.h"
/* GUCs */
extern bool neon_enable_new_communicator;
extern char *neon_auth_token;
extern char *neon_timeline;
extern char *neon_tenant;

View File

@@ -9,6 +9,10 @@
#include "fmgr.h"
#include "storage/buf_internals.h"
#if PG_MAJORVERSION_NUM < 16
typedef PGAlignedBlock PGIOAlignedBlock;
#endif
#if PG_MAJORVERSION_NUM < 17
#define NRelFileInfoBackendIsTemp(rinfo) (rinfo.backend != InvalidBackendId)
#else
@@ -154,6 +158,10 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode,
#define AmAutoVacuumWorkerProcess() (IsAutoVacuumWorkerProcess())
#endif
#if PG_MAJORVERSION_NUM < 17
#define MyProcNumber (MyProc - &ProcGlobal->allProcs[0])
#endif
#if PG_MAJORVERSION_NUM < 15
extern void InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags);
#endif

View File

@@ -228,6 +228,7 @@ extern char *neon_tenant;
extern int32 max_cluster_size;
extern int neon_protocol_version;
extern void get_shard_map(char ***connstrs_p, shardno_t *num_shards_p);
extern shardno_t get_shard_number(BufferTag* tag);
extern const f_smgr *smgr_neon(ProcNumber backend, NRelFileInfo rinfo);

View File

@@ -62,6 +62,7 @@
#include "bitmap.h"
#include "communicator.h"
#include "communicator_new.h"
#include "file_cache.h"
#include "neon.h"
#include "neon_lwlsncache.h"
@@ -72,10 +73,6 @@
#include "access/xlogrecovery.h"
#endif
#if PG_VERSION_NUM < 160000
typedef PGAlignedBlock PGIOAlignedBlock;
#endif
/*
* If DEBUG_COMPARE_LOCAL is defined, we pass through all the SMGR API
* calls to md.c, and *also* do the calls to the Page Server. On every
@@ -97,7 +94,7 @@ static char *hexdump_page(char *page);
NInfoGetRelNumber(InfoFromSMgrRel(reln)) >= FirstNormalObjectId \
)
const int SmgrTrace = DEBUG5;
const int SmgrTrace = DEBUG1;
/* unlogged relation build states */
typedef enum
@@ -779,10 +776,15 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
return false;
}
neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum,
REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
if (neon_enable_new_communicator)
return communicator_new_rel_exists(InfoFromSMgrRel(reln), forkNum);
else
{
neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum,
REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
return communicator_exists(InfoFromSMgrRel(reln), forkNum, &request_lsns);
return communicator_exists(InfoFromSMgrRel(reln), forkNum, &request_lsns);
}
}
/*
@@ -820,33 +822,40 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forkNum);
/*
* Newly created relation is empty, remember that in the relsize cache.
*
* Note that in REDO, this is called to make sure the relation fork
* exists, but it does not truncate the relation. So, we can only update
* the relsize if it didn't exist before.
*
* Also, in redo, we must make sure to update the cached size of the
* relation, as that is the primary source of truth for REDO's file length
* considerations, and as file extension isn't (perfectly) logged, we need
* to take care of that before we hit file size checks.
*
* FIXME: This is currently not just an optimization, but required for
* correctness. Postgres can call smgrnblocks() on the newly-created
* relation. Currently, we don't call SetLastWrittenLSN() when a new
 * relation is created, so if we didn't remember the size in the relsize
* cache, we might call smgrnblocks() on the newly-created relation before
 * the creation WAL record has been received by the page server.
*/
if (isRedo)
if (neon_enable_new_communicator)
{
update_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0);
get_cached_relsize(InfoFromSMgrRel(reln), forkNum,
&reln->smgr_cached_nblocks[forkNum]);
communicator_new_rel_create(InfoFromSMgrRel(reln), forkNum);
}
else
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0);
{
/*
* Newly created relation is empty, remember that in the relsize cache.
*
* Note that in REDO, this is called to make sure the relation fork
* exists, but it does not truncate the relation. So, we can only update
* the relsize if it didn't exist before.
*
* Also, in redo, we must make sure to update the cached size of the
* relation, as that is the primary source of truth for REDO's file length
* considerations, and as file extension isn't (perfectly) logged, we need
* to take care of that before we hit file size checks.
*
* FIXME: This is currently not just an optimization, but required for
* correctness. Postgres can call smgrnblocks() on the newly-created
* relation. Currently, we don't call SetLastWrittenLSN() when a new
		 * relation is created, so if we didn't remember the size in the relsize
* cache, we might call smgrnblocks() on the newly-created relation before
		 * the creation WAL record has been received by the page server.
*/
if (isRedo)
{
update_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0);
get_cached_relsize(InfoFromSMgrRel(reln), forkNum,
&reln->smgr_cached_nblocks[forkNum]);
}
else
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0);
}
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
@@ -968,34 +977,43 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
forkNum, blkno,
(uint32) (lsn >> 32), (uint32) lsn);
lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer);
if (neon_enable_new_communicator)
{
// FIXME: this can pass lsn == invalid. Is that ok?
communicator_new_rel_extend(InfoFromSMgrRel(reln), forkNum, blkno, (const void *) buffer, lsn);
}
else
{
lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer);
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
mdextend(reln, forkNum, blkno, buffer, skipFsync);
if (IS_LOCAL_REL(reln))
mdextend(reln, forkNum, blkno, buffer, skipFsync);
#endif
/*
* smgr_extend is often called with an all-zeroes page, so
* lsn==InvalidXLogRecPtr. An smgr_write() call will come for the buffer
* later, after it has been initialized with the real page contents, and
* it is eventually evicted from the buffer cache. But we need a valid LSN
	 * for the relation metadata update now.
*/
if (lsn == InvalidXLogRecPtr)
{
lsn = GetXLogInsertRecPtr();
neon_set_lwlsn_block(lsn, InfoFromSMgrRel(reln), forkNum, blkno);
/*
* smgr_extend is often called with an all-zeroes page, so
* lsn==InvalidXLogRecPtr. An smgr_write() call will come for the buffer
* later, after it has been initialized with the real page contents, and
* it is eventually evicted from the buffer cache. But we need a valid LSN
		 * for the relation metadata update now.
*/
if (lsn == InvalidXLogRecPtr)
{
lsn = GetXLogInsertRecPtr();
neon_set_lwlsn_block(lsn, InfoFromSMgrRel(reln), forkNum, blkno);
}
neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum);
}
neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum);
}
#if PG_MAJORVERSION_NUM >= 16
static void
neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber start_block,
int nblocks, bool skipFsync)
{
const PGIOAlignedBlock buffer = {0};
BlockNumber blocknum = start_block;
int remblocks = nblocks;
XLogRecPtr lsn = 0;
@@ -1092,8 +1110,15 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
Assert(lsn != 0);
neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum);
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blocknum);
if (neon_enable_new_communicator)
{
communicator_new_rel_zeroextend(InfoFromSMgrRel(reln), forkNum, start_block, nblocks, lsn);
}
else
{
neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum);
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blocknum);
}
}
#endif
@@ -1153,11 +1178,17 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (neon_enable_new_communicator)
{
communicator_new_prefetch_register_bufferv(InfoFromSMgrRel(reln), forknum, blocknum, nblocks);
return false;
}
tag.spcOid = reln->smgr_rlocator.locator.spcOid;
tag.dbOid = reln->smgr_rlocator.locator.dbOid;
tag.relNumber = reln->smgr_rlocator.locator.relNumber;
tag.forkNum = forknum;
while (nblocks > 0)
{
int iterblocks = Min(nblocks, PG_IOV_MAX);
@@ -1179,7 +1210,8 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
blocknum += iterblocks;
}
communicator_prefetch_pump_state(false);
if (!neon_enable_new_communicator)
communicator_prefetch_pump_state(false);
return false;
}
@@ -1216,9 +1248,13 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln));
communicator_prefetch_register_bufferv(tag, NULL, 1, NULL);
if (neon_enable_new_communicator)
communicator_new_prefetch_register_bufferv(InfoFromSMgrRel(reln), forknum, blocknum, 1);
else
communicator_prefetch_register_bufferv(tag, NULL, 1, NULL);
communicator_prefetch_pump_state(false);
if (!neon_enable_new_communicator)
communicator_prefetch_pump_state(false);
return false;
}
@@ -1262,7 +1298,8 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
*/
neon_log(SmgrTrace, "writeback noop");
communicator_prefetch_pump_state(false);
if (!neon_enable_new_communicator)
communicator_prefetch_pump_state(false);
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
@@ -1278,7 +1315,14 @@ void
neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
neon_request_lsns request_lsns, void *buffer)
{
communicator_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL);
if (neon_enable_new_communicator)
{
// FIXME: request_lsns is ignored. That affects the neon_test_utils callers.
// Add the capability to specify the LSNs explicitly, for the sake of neon_test_utils ?
communicator_new_read_at_lsnv(rinfo, forkNum, blkno, &buffer, 1);
}
else
communicator_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL);
}
#if PG_MAJORVERSION_NUM < 17
@@ -1296,6 +1340,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
neon_request_lsns request_lsns;
bits8 present;
void *bufferp;
bool prefetch_hit;
switch (reln->smgr_relpersistence)
{
@@ -1314,33 +1359,62 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
/* Try to read PS results if they are available */
communicator_prefetch_pump_state(false);
neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1);
present = 0;
bufferp = buffer;
if (communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present))
if (neon_enable_new_communicator)
{
/* Prefetch hit */
return;
communicator_new_read_at_lsnv(InfoFromSMgrRel(reln), forkNum, blkno,
(void *) &buffer, 1);
}
/* Try to read from local file cache */
if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer))
else
{
MyNeonCounters->file_cache_hits_total++;
return;
prefetch_hit = communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present);
if (prefetch_hit)
{
/* Prefetch hit */
return;
}
/* Try to read from local file cache */
if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer))
{
MyNeonCounters->file_cache_hits_total++;
return;
}
/*
* Try to receive prefetch results once again just to make sure we
* don't leave the smgr code while the OS might still have buffered
* bytes.
*/
communicator_prefetch_pump_state(false);
neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1);
prefetch_hit = communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present);
if (prefetch_hit)
{
/* Prefetch hit */
return;
}
/* Try to read from local file cache */
if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer))
{
MyNeonCounters->file_cache_hits_total++;
return;
}
neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer);
/*
* Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
*/
communicator_prefetch_pump_state(false);
}
neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer);
/*
* Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
*/
communicator_prefetch_pump_state(false);
#ifdef DEBUG_COMPARE_LOCAL
if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
{
@@ -1449,38 +1523,47 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
nblocks, PG_IOV_MAX);
/* Try to read PS results if they are available */
communicator_prefetch_pump_state(false);
neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum,
request_lsns, nblocks);
if (!neon_enable_new_communicator)
communicator_prefetch_pump_state(false);
memset(read_pages, 0, sizeof(read_pages));
prefetch_result = communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forknum,
blocknum, request_lsns, nblocks,
buffers, read_pages);
if (neon_enable_new_communicator)
{
communicator_new_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum,
buffers, nblocks);
}
else
{
neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum,
request_lsns, nblocks);
prefetch_result = communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forknum,
blocknum, request_lsns, nblocks,
buffers, read_pages);
if (prefetch_result == nblocks)
return;
if (prefetch_result == nblocks)
return;
/* Try to read from local file cache */
lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers,
nblocks, read_pages);
/* Try to read from local file cache */
lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers,
nblocks, read_pages);
if (lfc_result > 0)
MyNeonCounters->file_cache_hits_total += lfc_result;
if (lfc_result > 0)
MyNeonCounters->file_cache_hits_total += lfc_result;
/* Read all blocks from LFC, so we're done */
if (prefetch_result + lfc_result == nblocks)
return;
/* Read all blocks from LFC, so we're done */
if (prefetch_result + lfc_result == nblocks)
return;
communicator_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns,
buffers, nblocks, read_pages);
communicator_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns,
buffers, nblocks, read_pages);
/*
* Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
*/
communicator_prefetch_pump_state(false);
/*
* Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
*/
communicator_prefetch_pump_state(false);
}
#ifdef DEBUG_COMPARE_LOCAL
if (forknum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
@@ -1663,9 +1746,16 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
forknum, blocknum,
(uint32) (lsn >> 32), (uint32) lsn);
lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);
if (neon_enable_new_communicator)
{
communicator_new_write_page(InfoFromSMgrRel(reln), forknum, blocknum, buffer, lsn);
}
else
{
lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);
communicator_prefetch_pump_state(false);
communicator_prefetch_pump_state(false);
}
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
@@ -1725,9 +1815,21 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
neon_wallog_pagev(reln, forknum, blkno, nblocks, (const char **) buffers, false);
lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);
if (neon_enable_new_communicator)
{
for (int i = 0; i < nblocks; i++)
{
XLogRecPtr lsn = PageGetLSN((Page) buffers[i]);
communicator_new_write_page(InfoFromSMgrRel(reln), forknum, blkno + i, buffers[i], lsn);
}
}
else
{
lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);
communicator_prefetch_pump_state(false);
}
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
@@ -1763,19 +1865,26 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (get_cached_relsize(InfoFromSMgrRel(reln), forknum, &n_blocks))
if (neon_enable_new_communicator)
{
neon_log(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum, n_blocks);
return n_blocks;
n_blocks = communicator_new_rel_nblocks(InfoFromSMgrRel(reln), forknum);
}
else
{
if (get_cached_relsize(InfoFromSMgrRel(reln), forknum, &n_blocks))
{
neon_log(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum, n_blocks);
return n_blocks;
}
neon_get_request_lsns(InfoFromSMgrRel(reln), forknum,
REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
neon_get_request_lsns(InfoFromSMgrRel(reln), forknum,
REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
n_blocks = communicator_nblocks(InfoFromSMgrRel(reln), forknum, &request_lsns);
update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
n_blocks = communicator_nblocks(InfoFromSMgrRel(reln), forknum, &request_lsns);
update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
}
neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
@@ -1796,10 +1905,17 @@ neon_dbsize(Oid dbNode)
neon_request_lsns request_lsns;
NRelFileInfo dummy_node = {0};
neon_get_request_lsns(dummy_node, MAIN_FORKNUM,
REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
if (neon_enable_new_communicator)
{
db_size = communicator_new_dbsize(dbNode);
}
else
{
neon_get_request_lsns(dummy_node, MAIN_FORKNUM,
REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
db_size = communicator_dbsize(dbNode, &request_lsns);
db_size = communicator_dbsize(dbNode, &request_lsns);
}
neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size);
@@ -1813,8 +1929,6 @@ neon_dbsize(Oid dbNode)
static void
neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, BlockNumber nblocks)
{
XLogRecPtr lsn;
switch (reln->smgr_relpersistence)
{
case 0:
@@ -1833,34 +1947,43 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, Blo
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
set_cached_relsize(InfoFromSMgrRel(reln), forknum, nblocks);
if (neon_enable_new_communicator)
{
communicator_new_rel_truncate(InfoFromSMgrRel(reln), forknum, nblocks);
}
else
{
XLogRecPtr lsn;
/*
* Truncating a relation drops all its buffers from the buffer cache
* without calling smgrwrite() on them. But we must account for that in
* our tracking of last-written-LSN all the same: any future smgrnblocks()
* request must return the new size after the truncation. We don't know
* what the LSN of the truncation record was, so be conservative and use
* the most recently inserted WAL record's LSN.
*/
lsn = GetXLogInsertRecPtr();
lsn = nm_adjust_lsn(lsn);
set_cached_relsize(InfoFromSMgrRel(reln), forknum, nblocks);
/*
* Flush it, too. We don't actually care about it here, but let's uphold
* the invariant that last-written LSN <= flush LSN.
*/
XLogFlush(lsn);
/*
* Truncating a relation drops all its buffers from the buffer cache
* without calling smgrwrite() on them. But we must account for that in
* our tracking of last-written-LSN all the same: any future smgrnblocks()
* request must return the new size after the truncation. We don't know
* what the LSN of the truncation record was, so be conservative and use
* the most recently inserted WAL record's LSN.
*/
lsn = GetXLogInsertRecPtr();
lsn = nm_adjust_lsn(lsn);
/*
	 * Truncate may affect several chunks of the relation. So we should
	 * either update the last-written LSN for all of them, or update the LSN
	 * for the "dummy" metadata block. The second approach seems more
	 * efficient. If the relation
* is extended again later, the extension will update the last-written LSN
* for the extended pages, so there's no harm in leaving behind obsolete
* entries for the truncated chunks.
*/
neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forknum);
/*
* Flush it, too. We don't actually care about it here, but let's uphold
* the invariant that last-written LSN <= flush LSN.
*/
XLogFlush(lsn);
/*
		 * Truncate may affect several chunks of the relation. So we should
		 * either update the last-written LSN for all of them, or update the
		 * LSN for the "dummy" metadata block. The second approach seems more
		 * efficient. If the relation
* is extended again later, the extension will update the last-written LSN
* for the extended pages, so there's no harm in leaving behind obsolete
* entries for the truncated chunks.
*/
neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forknum);
}
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
@@ -1902,7 +2025,8 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop");
communicator_prefetch_pump_state(false);
if (!neon_enable_new_communicator)
communicator_prefetch_pump_state(false);
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
@@ -2173,7 +2297,10 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
request_lsns.not_modified_since = not_modified_since;
request_lsns.effective_request_lsn = request_lsn;
n_blocks = communicator_read_slru_segment(kind, segno, &request_lsns, buffer);
if (neon_enable_new_communicator)
n_blocks = communicator_new_read_slru_segment(kind, segno, buffer);
else
n_blocks = communicator_read_slru_segment(kind, segno, &request_lsns, buffer);
return n_blocks;
}
@@ -2210,7 +2337,8 @@ AtEOXact_neon(XactEvent event, void *arg)
}
break;
}
communicator_reconfigure_timeout_if_needed();
if (!neon_enable_new_communicator)
communicator_reconfigure_timeout_if_needed();
}
static const struct f_smgr neon_smgr =
@@ -2268,7 +2396,10 @@ smgr_init_neon(void)
smgr_init_standard();
neon_init();
communicator_init();
if (neon_enable_new_communicator)
communicator_new_init();
else
communicator_init();
}
@@ -2280,6 +2411,12 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
/* This is only used in WAL replay */
Assert(RecoveryInProgress());
if (neon_enable_new_communicator)
{
// FIXME: broken, but this is only used in replica
elog(ERROR, "not implemented yet");
}
/* Extend the relation if we know its size */
if (get_cached_relsize(rinfo, forknum, &relsize))
{