mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-14 17:02:56 +00:00
New communicator, with "integrated" cache accessible from all processes
This commit is contained in:
230
Cargo.lock
generated
230
Cargo.lock
generated
@@ -253,6 +253,17 @@ version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a8ab6b55fe97976e46f91ddbed8d147d966475dc29b2032757ba47e02376fbc3"
|
||||
|
||||
[[package]]
|
||||
name = "atomic_enum"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "99e1aca718ea7b89985790c94aad72d77533063fe00bc497bb79a7c2dae6a661"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.100",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "autocfg"
|
||||
version = "1.1.0"
|
||||
@@ -687,13 +698,40 @@ dependencies = [
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "axum"
|
||||
version = "0.7.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"axum-core 0.4.5",
|
||||
"bytes",
|
||||
"futures-util",
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
"http-body-util",
|
||||
"itoa",
|
||||
"matchit 0.7.3",
|
||||
"memchr",
|
||||
"mime",
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
"rustversion",
|
||||
"serde",
|
||||
"sync_wrapper 1.0.1",
|
||||
"tower 0.5.2",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "axum"
|
||||
version = "0.8.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6d6fd624c75e18b3b4c6b9caf42b1afe24437daaee904069137d8bab077be8b8"
|
||||
dependencies = [
|
||||
"axum-core",
|
||||
"axum-core 0.5.0",
|
||||
"base64 0.22.1",
|
||||
"bytes",
|
||||
"form_urlencoded",
|
||||
@@ -704,7 +742,7 @@ dependencies = [
|
||||
"hyper 1.4.1",
|
||||
"hyper-util",
|
||||
"itoa",
|
||||
"matchit",
|
||||
"matchit 0.8.4",
|
||||
"memchr",
|
||||
"mime",
|
||||
"percent-encoding",
|
||||
@@ -724,6 +762,26 @@ dependencies = [
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "axum-core"
|
||||
version = "0.4.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"bytes",
|
||||
"futures-util",
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
"http-body-util",
|
||||
"mime",
|
||||
"pin-project-lite",
|
||||
"rustversion",
|
||||
"sync_wrapper 1.0.1",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "axum-core"
|
||||
version = "0.5.0"
|
||||
@@ -750,8 +808,8 @@ version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "460fc6f625a1f7705c6cf62d0d070794e94668988b1c38111baeec177c715f7b"
|
||||
dependencies = [
|
||||
"axum",
|
||||
"axum-core",
|
||||
"axum 0.8.1",
|
||||
"axum-core 0.5.0",
|
||||
"bytes",
|
||||
"futures-util",
|
||||
"headers",
|
||||
@@ -1086,6 +1144,25 @@ version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
|
||||
|
||||
[[package]]
|
||||
name = "cbindgen"
|
||||
version = "0.28.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eadd868a2ce9ca38de7eeafdcec9c7065ef89b42b32f0839278d55f35c54d1ff"
|
||||
dependencies = [
|
||||
"clap",
|
||||
"heck 0.4.1",
|
||||
"indexmap 2.9.0",
|
||||
"log",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"syn 2.0.100",
|
||||
"tempfile",
|
||||
"toml",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cc"
|
||||
version = "1.2.16"
|
||||
@@ -1206,7 +1283,7 @@ version = "4.5.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab"
|
||||
dependencies = [
|
||||
"heck",
|
||||
"heck 0.5.0",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.100",
|
||||
@@ -1264,13 +1341,40 @@ dependencies = [
|
||||
"unicode-width",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "communicator"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"atomic_enum",
|
||||
"bytes",
|
||||
"cbindgen",
|
||||
"http 1.1.0",
|
||||
"libc",
|
||||
"neonart",
|
||||
"nix 0.27.1",
|
||||
"pageserver_client_grpc",
|
||||
"pageserver_data_api",
|
||||
"prost 0.13.3",
|
||||
"thiserror 1.0.69",
|
||||
"tokio",
|
||||
"tokio-epoll-uring",
|
||||
"tokio-pipe",
|
||||
"tonic",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
"uring-common",
|
||||
"utils",
|
||||
"zerocopy 0.8.24",
|
||||
"zerocopy-derive 0.8.24",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "compute_api"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"chrono",
|
||||
"indexmap 2.0.1",
|
||||
"indexmap 2.9.0",
|
||||
"jsonwebtoken",
|
||||
"regex",
|
||||
"remote_storage",
|
||||
@@ -1288,7 +1392,7 @@ dependencies = [
|
||||
"aws-sdk-kms",
|
||||
"aws-sdk-s3",
|
||||
"aws-smithy-types",
|
||||
"axum",
|
||||
"axum 0.8.1",
|
||||
"axum-extra",
|
||||
"base64 0.13.1",
|
||||
"bytes",
|
||||
@@ -1301,7 +1405,7 @@ dependencies = [
|
||||
"flate2",
|
||||
"futures",
|
||||
"http 1.1.0",
|
||||
"indexmap 2.0.1",
|
||||
"indexmap 2.9.0",
|
||||
"jsonwebtoken",
|
||||
"metrics",
|
||||
"nix 0.27.1",
|
||||
@@ -1927,7 +2031,7 @@ checksum = "0892a17df262a24294c382f0d5997571006e7a4348b4327557c4ff1cd4a8bccc"
|
||||
dependencies = [
|
||||
"darling",
|
||||
"either",
|
||||
"heck",
|
||||
"heck 0.5.0",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.100",
|
||||
@@ -2041,7 +2145,7 @@ name = "endpoint_storage"
|
||||
version = "0.0.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"axum",
|
||||
"axum 0.8.1",
|
||||
"axum-extra",
|
||||
"camino",
|
||||
"camino-tempfile",
|
||||
@@ -2588,7 +2692,7 @@ dependencies = [
|
||||
"futures-sink",
|
||||
"futures-util",
|
||||
"http 0.2.9",
|
||||
"indexmap 2.0.1",
|
||||
"indexmap 2.9.0",
|
||||
"slab",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
@@ -2607,7 +2711,7 @@ dependencies = [
|
||||
"futures-sink",
|
||||
"futures-util",
|
||||
"http 1.1.0",
|
||||
"indexmap 2.0.1",
|
||||
"indexmap 2.9.0",
|
||||
"slab",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
@@ -2703,6 +2807,12 @@ dependencies = [
|
||||
"http 1.1.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.5.0"
|
||||
@@ -3191,12 +3301,12 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "indexmap"
|
||||
version = "2.0.1"
|
||||
version = "2.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ad227c3af19d4914570ad36d30409928b75967c298feb9ea1969db3a610bb14e"
|
||||
checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e"
|
||||
dependencies = [
|
||||
"equivalent",
|
||||
"hashbrown 0.14.5",
|
||||
"hashbrown 0.15.2",
|
||||
"serde",
|
||||
]
|
||||
|
||||
@@ -3219,7 +3329,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "232929e1d75fe899576a3d5c7416ad0d88dbfbb3c3d6aa00873a7408a50ddb88"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"indexmap 2.0.1",
|
||||
"indexmap 2.9.0",
|
||||
"is-terminal",
|
||||
"itoa",
|
||||
"log",
|
||||
@@ -3242,7 +3352,7 @@ dependencies = [
|
||||
"crossbeam-utils",
|
||||
"dashmap 6.1.0",
|
||||
"env_logger",
|
||||
"indexmap 2.0.1",
|
||||
"indexmap 2.9.0",
|
||||
"itoa",
|
||||
"log",
|
||||
"num-format",
|
||||
@@ -3594,6 +3704,12 @@ dependencies = [
|
||||
"regex-automata 0.1.10",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "matchit"
|
||||
version = "0.7.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94"
|
||||
|
||||
[[package]]
|
||||
name = "matchit"
|
||||
version = "0.8.4"
|
||||
@@ -3639,7 +3755,7 @@ version = "0.0.22"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b9e6777fc80a575f9503d908c8b498782a6c3ee88a06cb416dc3941401e43b94"
|
||||
dependencies = [
|
||||
"heck",
|
||||
"heck 0.5.0",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.100",
|
||||
@@ -3785,6 +3901,15 @@ version = "0.8.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"
|
||||
|
||||
[[package]]
|
||||
name = "neonart"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"rand 0.8.5",
|
||||
"tracing",
|
||||
"zerocopy 0.8.24",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "never-say-never"
|
||||
version = "6.6.666"
|
||||
@@ -4208,6 +4333,8 @@ dependencies = [
|
||||
"humantime-serde",
|
||||
"pageserver_api",
|
||||
"pageserver_client",
|
||||
"pageserver_client_grpc",
|
||||
"pageserver_data_api",
|
||||
"rand 0.8.5",
|
||||
"reqwest",
|
||||
"serde",
|
||||
@@ -4284,6 +4411,8 @@ dependencies = [
|
||||
"pageserver_api",
|
||||
"pageserver_client",
|
||||
"pageserver_compaction",
|
||||
"pageserver_data_api",
|
||||
"peekable",
|
||||
"pem",
|
||||
"pin-project-lite",
|
||||
"postgres-protocol",
|
||||
@@ -4295,6 +4424,7 @@ dependencies = [
|
||||
"pprof",
|
||||
"pq_proto",
|
||||
"procfs",
|
||||
"prost 0.13.3",
|
||||
"rand 0.8.5",
|
||||
"range-set-blaze",
|
||||
"regex",
|
||||
@@ -4326,6 +4456,7 @@ dependencies = [
|
||||
"tokio-tar",
|
||||
"tokio-util",
|
||||
"toml_edit",
|
||||
"tonic",
|
||||
"tracing",
|
||||
"tracing-utils",
|
||||
"url",
|
||||
@@ -4390,6 +4521,18 @@ dependencies = [
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pageserver_client_grpc"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"http 1.1.0",
|
||||
"pageserver_data_api",
|
||||
"thiserror 1.0.69",
|
||||
"tonic",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pageserver_compaction"
|
||||
version = "0.1.0"
|
||||
@@ -4413,6 +4556,17 @@ dependencies = [
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pageserver_data_api"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"prost 0.13.3",
|
||||
"thiserror 1.0.69",
|
||||
"tonic",
|
||||
"tonic-build",
|
||||
"utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "papaya"
|
||||
version = "0.2.1"
|
||||
@@ -4539,6 +4693,15 @@ dependencies = [
|
||||
"sha2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "peekable"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "225f9651e475709164f871dc2f5724956be59cb9edb055372ffeeab01ec2d20b"
|
||||
dependencies = [
|
||||
"smallvec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pem"
|
||||
version = "3.0.3"
|
||||
@@ -5010,7 +5173,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"heck",
|
||||
"heck 0.5.0",
|
||||
"itertools 0.12.1",
|
||||
"log",
|
||||
"multimap",
|
||||
@@ -5031,7 +5194,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"heck",
|
||||
"heck 0.5.0",
|
||||
"itertools 0.12.1",
|
||||
"log",
|
||||
"multimap",
|
||||
@@ -5134,7 +5297,7 @@ dependencies = [
|
||||
"hyper 0.14.30",
|
||||
"hyper 1.4.1",
|
||||
"hyper-util",
|
||||
"indexmap 2.0.1",
|
||||
"indexmap 2.9.0",
|
||||
"ipnet",
|
||||
"itertools 0.10.5",
|
||||
"itoa",
|
||||
@@ -5645,7 +5808,7 @@ dependencies = [
|
||||
"async-trait",
|
||||
"getrandom 0.2.11",
|
||||
"http 1.1.0",
|
||||
"matchit",
|
||||
"matchit 0.8.4",
|
||||
"opentelemetry",
|
||||
"reqwest",
|
||||
"reqwest-middleware",
|
||||
@@ -6806,7 +6969,7 @@ version = "0.26.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be"
|
||||
dependencies = [
|
||||
"heck",
|
||||
"heck 0.5.0",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"rustversion",
|
||||
@@ -7231,6 +7394,16 @@ dependencies = [
|
||||
"syn 2.0.100",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-pipe"
|
||||
version = "0.2.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f213a84bffbd61b8fa0ba8a044b4bbe35d471d0b518867181e82bd5c15542784"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-postgres"
|
||||
version = "0.7.10"
|
||||
@@ -7413,7 +7586,7 @@ version = "0.22.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f21c7aaf97f1bd9ca9d4f9e73b0a6c74bd5afef56f2bc931943a6e1c37e04e38"
|
||||
dependencies = [
|
||||
"indexmap 2.0.1",
|
||||
"indexmap 2.9.0",
|
||||
"serde",
|
||||
"serde_spanned",
|
||||
"toml_datetime",
|
||||
@@ -7426,9 +7599,13 @@ version = "0.12.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52"
|
||||
dependencies = [
|
||||
"async-stream",
|
||||
"async-trait",
|
||||
"axum 0.7.9",
|
||||
"base64 0.22.1",
|
||||
"bytes",
|
||||
"flate2",
|
||||
"h2 0.4.4",
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
"http-body-util",
|
||||
@@ -7440,6 +7617,7 @@ dependencies = [
|
||||
"prost 0.13.3",
|
||||
"rustls-native-certs 0.8.0",
|
||||
"rustls-pemfile 2.1.1",
|
||||
"socket2",
|
||||
"tokio",
|
||||
"tokio-rustls 0.26.0",
|
||||
"tokio-stream",
|
||||
@@ -7939,7 +8117,7 @@ name = "vm_monitor"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"axum",
|
||||
"axum 0.8.1",
|
||||
"cgroups-rs",
|
||||
"clap",
|
||||
"futures",
|
||||
@@ -8449,7 +8627,7 @@ dependencies = [
|
||||
"hyper 1.4.1",
|
||||
"hyper-util",
|
||||
"indexmap 1.9.3",
|
||||
"indexmap 2.0.1",
|
||||
"indexmap 2.9.0",
|
||||
"itertools 0.12.1",
|
||||
"lazy_static",
|
||||
"libc",
|
||||
|
||||
14
Cargo.toml
14
Cargo.toml
@@ -8,6 +8,7 @@ members = [
|
||||
"pageserver/compaction",
|
||||
"pageserver/ctl",
|
||||
"pageserver/client",
|
||||
"pageserver/client_grpc",
|
||||
"pageserver/pagebench",
|
||||
"proxy",
|
||||
"safekeeper",
|
||||
@@ -29,6 +30,7 @@ members = [
|
||||
"libs/pq_proto",
|
||||
"libs/tenant_size_model",
|
||||
"libs/metrics",
|
||||
"libs/neonart",
|
||||
"libs/postgres_connection",
|
||||
"libs/remote_storage",
|
||||
"libs/tracing-utils",
|
||||
@@ -41,6 +43,7 @@ members = [
|
||||
"libs/proxy/postgres-types2",
|
||||
"libs/proxy/tokio-postgres2",
|
||||
"endpoint_storage",
|
||||
"pgxn/neon/communicator",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
@@ -142,6 +145,7 @@ parquet = { version = "53", default-features = false, features = ["zstd"] }
|
||||
parquet_derive = "53"
|
||||
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
|
||||
pem = "3.0.3"
|
||||
peekable = "0.3.0"
|
||||
pin-project-lite = "0.2"
|
||||
pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] }
|
||||
procfs = "0.16"
|
||||
@@ -187,7 +191,6 @@ thiserror = "1.0"
|
||||
tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] }
|
||||
tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] }
|
||||
tokio = { version = "1.43.1", features = ["macros"] }
|
||||
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
|
||||
tokio-io-timeout = "1.2.0"
|
||||
tokio-postgres-rustls = "0.12.0"
|
||||
tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]}
|
||||
@@ -196,7 +199,7 @@ tokio-tar = "0.3"
|
||||
tokio-util = { version = "0.7.10", features = ["io", "rt"] }
|
||||
toml = "0.8"
|
||||
toml_edit = "0.22"
|
||||
tonic = {version = "0.12.3", default-features = false, features = ["channel", "tls", "tls-roots"]}
|
||||
tonic = {version = "0.12.3", default-features = false, features = ["channel", "server", "tls", "tls-roots", "gzip"]}
|
||||
tower = { version = "0.5.2", default-features = false }
|
||||
tower-http = { version = "0.6.2", features = ["auth", "request-id", "trace"] }
|
||||
|
||||
@@ -228,6 +231,9 @@ x509-cert = { version = "0.2.5" }
|
||||
env_logger = "0.11"
|
||||
log = "0.4"
|
||||
|
||||
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
|
||||
uring-common = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
|
||||
|
||||
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
|
||||
@@ -245,9 +251,12 @@ compute_api = { version = "0.1", path = "./libs/compute_api/" }
|
||||
consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
|
||||
http-utils = { version = "0.1", path = "./libs/http-utils/" }
|
||||
metrics = { version = "0.1", path = "./libs/metrics/" }
|
||||
neonart = { version = "0.1", path = "./libs/neonart/" }
|
||||
pageserver = { path = "./pageserver" }
|
||||
pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
|
||||
pageserver_client = { path = "./pageserver/client" }
|
||||
pageserver_client_grpc = { path = "./pageserver/client_grpc" }
|
||||
pageserver_data_api = { path = "./pageserver/data_api" }
|
||||
pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
|
||||
postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
|
||||
postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
|
||||
@@ -271,6 +280,7 @@ wal_decoder = { version = "0.1", path = "./libs/wal_decoder" }
|
||||
workspace_hack = { version = "0.1", path = "./workspace_hack/" }
|
||||
|
||||
## Build dependencies
|
||||
cbindgen = "0.28.0"
|
||||
criterion = "0.5.1"
|
||||
rcgen = "0.13"
|
||||
rstest = "0.18"
|
||||
|
||||
7
Makefile
7
Makefile
@@ -18,10 +18,12 @@ ifeq ($(BUILD_TYPE),release)
|
||||
PG_LDFLAGS = $(LDFLAGS)
|
||||
# Unfortunately, `--profile=...` is a nightly feature
|
||||
CARGO_BUILD_FLAGS += --release
|
||||
NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/release
|
||||
else ifeq ($(BUILD_TYPE),debug)
|
||||
PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
|
||||
PG_CFLAGS += -O0 -g3 $(CFLAGS)
|
||||
PG_LDFLAGS = $(LDFLAGS)
|
||||
NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/debug
|
||||
else
|
||||
$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
|
||||
endif
|
||||
@@ -180,11 +182,16 @@ postgres-check-%: postgres-%
|
||||
|
||||
.PHONY: neon-pg-ext-%
|
||||
neon-pg-ext-%: postgres-%
|
||||
+@echo "Compiling communicator $*"
|
||||
$(CARGO_CMD_PREFIX) cargo build -p communicator $(CARGO_BUILD_FLAGS)
|
||||
|
||||
+@echo "Compiling neon $*"
|
||||
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-$*
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
|
||||
LIBCOMMUNICATOR_PATH=$(NEON_CARGO_ARTIFACT_TARGET_DIR) \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-$* \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install
|
||||
|
||||
+@echo "Compiling neon_walredo $*"
|
||||
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$*
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
|
||||
|
||||
11
libs/neonart/Cargo.toml
Normal file
11
libs/neonart/Cargo.toml
Normal file
@@ -0,0 +1,11 @@
|
||||
[package]
|
||||
name = "neonart"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
tracing.workspace = true
|
||||
|
||||
rand.workspace = true # for tests
|
||||
zerocopy = "0.8"
|
||||
377
libs/neonart/src/algorithm.rs
Normal file
377
libs/neonart/src/algorithm.rs
Normal file
@@ -0,0 +1,377 @@
|
||||
mod lock_and_version;
|
||||
mod node_ptr;
|
||||
mod node_ref;
|
||||
|
||||
use std::vec::Vec;
|
||||
|
||||
use crate::algorithm::lock_and_version::ResultOrRestart;
|
||||
use crate::algorithm::node_ptr::{MAX_PREFIX_LEN, NodePtr};
|
||||
use crate::algorithm::node_ref::ChildOrValue;
|
||||
use crate::algorithm::node_ref::{NodeRef, ReadLockedNodeRef, WriteLockedNodeRef};
|
||||
|
||||
use crate::epoch::EpochPin;
|
||||
use crate::{Allocator, Key, Value};
|
||||
|
||||
pub(crate) type RootPtr<V> = node_ptr::NodePtr<V>;
|
||||
|
||||
pub fn new_root<V: Value>(allocator: &Allocator) -> RootPtr<V> {
|
||||
node_ptr::new_root(allocator)
|
||||
}
|
||||
|
||||
pub(crate) fn search<'e, K: Key, V: Value>(
|
||||
key: &K,
|
||||
root: RootPtr<V>,
|
||||
epoch_pin: &'e EpochPin,
|
||||
) -> Option<V> {
|
||||
loop {
|
||||
let root_ref = NodeRef::from_root_ptr(root);
|
||||
if let Ok(result) = lookup_recurse(key.as_bytes(), root_ref, None, epoch_pin) {
|
||||
break result;
|
||||
}
|
||||
// retry
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn update_fn<'e, K: Key, V: Value, F>(
|
||||
key: &K,
|
||||
value_fn: F,
|
||||
root: RootPtr<V>,
|
||||
allocator: &Allocator,
|
||||
epoch_pin: &'e EpochPin,
|
||||
) where
|
||||
F: FnOnce(Option<&V>) -> Option<V>,
|
||||
{
|
||||
let value_fn_cell = std::cell::Cell::new(Some(value_fn));
|
||||
loop {
|
||||
let root_ref = NodeRef::from_root_ptr(root);
|
||||
let this_value_fn = |arg: Option<&V>| value_fn_cell.take().unwrap()(arg);
|
||||
let key_bytes = key.as_bytes();
|
||||
if let Ok(()) = update_recurse(
|
||||
key_bytes,
|
||||
this_value_fn,
|
||||
root_ref,
|
||||
None,
|
||||
allocator,
|
||||
epoch_pin,
|
||||
0,
|
||||
key_bytes,
|
||||
) {
|
||||
break;
|
||||
}
|
||||
// retry
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn dump_tree<'e, V: Value + std::fmt::Debug>(root: RootPtr<V>, epoch_pin: &'e EpochPin) {
|
||||
let root_ref = NodeRef::from_root_ptr(root);
|
||||
|
||||
let _ = dump_recurse(&[], root_ref, &epoch_pin, 0);
|
||||
}
|
||||
|
||||
// Error means you must retry.
|
||||
//
|
||||
// This corresponds to the 'lookupOpt' function in the paper
|
||||
fn lookup_recurse<'e, V: Value>(
|
||||
key: &[u8],
|
||||
node: NodeRef<'e, V>,
|
||||
parent: Option<ReadLockedNodeRef<V>>,
|
||||
epoch_pin: &'e EpochPin,
|
||||
) -> ResultOrRestart<Option<V>> {
|
||||
let rnode = node.read_lock_or_restart()?;
|
||||
if let Some(parent) = parent {
|
||||
parent.read_unlock_or_restart()?;
|
||||
}
|
||||
|
||||
// check if prefix matches, may increment level
|
||||
let prefix_len = if let Some(prefix_len) = rnode.prefix_matches(key) {
|
||||
prefix_len
|
||||
} else {
|
||||
rnode.read_unlock_or_restart()?;
|
||||
return Ok(None);
|
||||
};
|
||||
let key = &key[prefix_len..];
|
||||
|
||||
// find child (or leaf value)
|
||||
let next_node = rnode.find_child_or_value_or_restart(key[0])?;
|
||||
|
||||
match next_node {
|
||||
None => Ok(None), // key not found
|
||||
Some(ChildOrValue::Value(vptr)) => {
|
||||
// safety: It's OK to follow the pointer because we checked the version.
|
||||
let v = unsafe { (*vptr).clone() };
|
||||
Ok(Some(v))
|
||||
}
|
||||
Some(ChildOrValue::Child(v)) => lookup_recurse(&key[1..], v, Some(rnode), epoch_pin),
|
||||
}
|
||||
}
|
||||
|
||||
// This corresponds to the 'insertOpt' function in the paper
|
||||
pub(crate) fn update_recurse<'e, V: Value, F>(
|
||||
key: &[u8],
|
||||
value_fn: F,
|
||||
node: NodeRef<'e, V>,
|
||||
rparent: Option<(ReadLockedNodeRef<V>, u8)>,
|
||||
allocator: &Allocator,
|
||||
epoch_pin: &'e EpochPin,
|
||||
level: usize,
|
||||
orig_key: &[u8],
|
||||
) -> ResultOrRestart<()>
|
||||
where
|
||||
F: FnOnce(Option<&V>) -> Option<V>,
|
||||
{
|
||||
let rnode = node.read_lock_or_restart()?;
|
||||
|
||||
let prefix_match_len = rnode.prefix_matches(key);
|
||||
if prefix_match_len.is_none() {
|
||||
let (rparent, parent_key) = rparent.expect("direct children of the root have no prefix");
|
||||
let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
|
||||
let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
|
||||
|
||||
if let Some(new_value) = value_fn(None) {
|
||||
insert_split_prefix(
|
||||
key,
|
||||
new_value,
|
||||
&mut wnode,
|
||||
&mut wparent,
|
||||
parent_key,
|
||||
allocator,
|
||||
);
|
||||
}
|
||||
wnode.write_unlock();
|
||||
wparent.write_unlock();
|
||||
return Ok(());
|
||||
}
|
||||
let prefix_match_len = prefix_match_len.unwrap();
|
||||
let key = &key[prefix_match_len as usize..];
|
||||
let level = level + prefix_match_len as usize;
|
||||
|
||||
let next_node = rnode.find_child_or_value_or_restart(key[0])?;
|
||||
|
||||
if next_node.is_none() {
|
||||
if rnode.is_full() {
|
||||
let (rparent, parent_key) = rparent.expect("root node cannot become full");
|
||||
let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
|
||||
let wnode = rnode.upgrade_to_write_lock_or_restart()?;
|
||||
|
||||
if let Some(new_value) = value_fn(None) {
|
||||
insert_and_grow(key, new_value, &wnode, &mut wparent, parent_key, allocator);
|
||||
wnode.write_unlock_obsolete();
|
||||
wparent.write_unlock();
|
||||
} else {
|
||||
wnode.write_unlock();
|
||||
wparent.write_unlock();
|
||||
}
|
||||
} else {
|
||||
let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
|
||||
if let Some((rparent, _)) = rparent {
|
||||
rparent.read_unlock_or_restart()?;
|
||||
}
|
||||
if let Some(new_value) = value_fn(None) {
|
||||
insert_to_node(&mut wnode, key, new_value, allocator);
|
||||
}
|
||||
wnode.write_unlock();
|
||||
}
|
||||
return Ok(());
|
||||
} else {
|
||||
let next_node = next_node.unwrap(); // checked above it's not None
|
||||
if let Some((rparent, _)) = rparent {
|
||||
rparent.read_unlock_or_restart()?;
|
||||
}
|
||||
|
||||
match next_node {
|
||||
ChildOrValue::Value(existing_value_ptr) => {
|
||||
assert!(key.len() == 1);
|
||||
let wnode = rnode.upgrade_to_write_lock_or_restart()?;
|
||||
|
||||
// safety: Now that we have acquired the write lock, we have exclusive access to the
|
||||
// value
|
||||
let vmut = unsafe { existing_value_ptr.cast_mut().as_mut() }.unwrap();
|
||||
if let Some(new_value) = value_fn(Some(vmut)) {
|
||||
*vmut = new_value;
|
||||
} else {
|
||||
// TODO: Treat this as deletion?
|
||||
}
|
||||
wnode.write_unlock();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
ChildOrValue::Child(next_child) => {
|
||||
// recurse to next level
|
||||
update_recurse(
|
||||
&key[1..],
|
||||
value_fn,
|
||||
next_child,
|
||||
Some((rnode, key[0])),
|
||||
allocator,
|
||||
epoch_pin,
|
||||
level + 1,
|
||||
orig_key,
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
enum PathElement {
|
||||
Prefix(Vec<u8>),
|
||||
KeyByte(u8),
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for PathElement {
|
||||
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
|
||||
match self {
|
||||
PathElement::Prefix(prefix) => write!(fmt, "{:?}", prefix),
|
||||
PathElement::KeyByte(key_byte) => write!(fmt, "{}", key_byte),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn dump_recurse<'e, V: Value + std::fmt::Debug>(
|
||||
path: &[PathElement],
|
||||
node: NodeRef<'e, V>,
|
||||
epoch_pin: &'e EpochPin,
|
||||
level: usize,
|
||||
) -> ResultOrRestart<()> {
|
||||
let indent = str::repeat(" ", level);
|
||||
|
||||
let rnode = node.read_lock_or_restart()?;
|
||||
let mut path = Vec::from(path);
|
||||
let prefix = rnode.get_prefix();
|
||||
if prefix.len() != 0 {
|
||||
path.push(PathElement::Prefix(Vec::from(prefix)));
|
||||
}
|
||||
|
||||
for key_byte in 0..u8::MAX {
|
||||
match rnode.find_child_or_value_or_restart(key_byte)? {
|
||||
None => continue,
|
||||
Some(ChildOrValue::Child(child_ref)) => {
|
||||
let rchild = child_ref.read_lock_or_restart()?;
|
||||
eprintln!(
|
||||
"{} {:?}, {}: prefix {:?}",
|
||||
indent,
|
||||
&path,
|
||||
key_byte,
|
||||
rchild.get_prefix()
|
||||
);
|
||||
|
||||
let mut child_path = path.clone();
|
||||
child_path.push(PathElement::KeyByte(key_byte));
|
||||
|
||||
dump_recurse(&child_path, child_ref, epoch_pin, level + 1)?;
|
||||
}
|
||||
Some(ChildOrValue::Value(val)) => {
|
||||
eprintln!("{} {:?}, {}: {:?}", indent, path, key_byte, unsafe {
|
||||
val.as_ref().unwrap()
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///```text
|
||||
/// [fooba]r -> value
|
||||
///
|
||||
/// [foo]b -> [a]r -> value
|
||||
/// e -> [ls]e -> value
|
||||
///```
|
||||
fn insert_split_prefix<'a, V: Value>(
|
||||
key: &[u8],
|
||||
value: V,
|
||||
node: &mut WriteLockedNodeRef<V>,
|
||||
parent: &mut WriteLockedNodeRef<V>,
|
||||
parent_key: u8,
|
||||
allocator: &Allocator,
|
||||
) {
|
||||
let old_node = node;
|
||||
let old_prefix = old_node.get_prefix();
|
||||
let common_prefix_len = common_prefix(key, old_prefix);
|
||||
|
||||
// Allocate a node for the new value.
|
||||
let new_value_node = allocate_node_for_value(&key[common_prefix_len + 1..], value, allocator);
|
||||
|
||||
// Allocate a new internal node with the common prefix
|
||||
let mut prefix_node = node_ref::new_internal(&key[..common_prefix_len], allocator);
|
||||
|
||||
// Add the old node and the new nodes to the new internal node
|
||||
prefix_node.insert_child(old_prefix[common_prefix_len], old_node.as_ptr());
|
||||
prefix_node.insert_child(key[common_prefix_len], new_value_node);
|
||||
|
||||
// Modify the prefix of the old child in place
|
||||
old_node.truncate_prefix(old_prefix.len() - common_prefix_len - 1);
|
||||
|
||||
// replace the pointer in the parent
|
||||
parent.replace_child(parent_key, prefix_node.into_ptr());
|
||||
}
|
||||
|
||||
fn insert_to_node<V: Value>(
|
||||
wnode: &mut WriteLockedNodeRef<V>,
|
||||
key: &[u8],
|
||||
value: V,
|
||||
allocator: &Allocator,
|
||||
) {
|
||||
if wnode.is_leaf() {
|
||||
wnode.insert_value(key[0], value);
|
||||
} else {
|
||||
let value_child = allocate_node_for_value(&key[1..], value, allocator);
|
||||
wnode.insert_child(key[0], value_child);
|
||||
}
|
||||
}
|
||||
|
||||
// On entry: 'parent' and 'node' are locked
|
||||
fn insert_and_grow<V: Value>(
|
||||
key: &[u8],
|
||||
value: V,
|
||||
wnode: &WriteLockedNodeRef<V>,
|
||||
parent: &mut WriteLockedNodeRef<V>,
|
||||
parent_key_byte: u8,
|
||||
allocator: &Allocator,
|
||||
) {
|
||||
let mut bigger_node = wnode.grow(allocator);
|
||||
|
||||
if wnode.is_leaf() {
|
||||
bigger_node.insert_value(key[0], value);
|
||||
} else {
|
||||
let value_child = allocate_node_for_value(&key[1..], value, allocator);
|
||||
bigger_node.insert_child(key[0], value_child);
|
||||
}
|
||||
|
||||
// Replace the pointer in the parent
|
||||
parent.replace_child(parent_key_byte, bigger_node.into_ptr());
|
||||
}
|
||||
|
||||
// Allocate a new leaf node to hold 'value'. If key is long, we may need to allocate
|
||||
// new internal nodes to hold it too
|
||||
fn allocate_node_for_value<V: Value>(key: &[u8], value: V, allocator: &Allocator) -> NodePtr<V> {
|
||||
let mut prefix_off = key.len().saturating_sub(MAX_PREFIX_LEN + 1);
|
||||
|
||||
let mut leaf_node = node_ref::new_leaf(&key[prefix_off..key.len() - 1], allocator);
|
||||
leaf_node.insert_value(*key.last().unwrap(), value);
|
||||
|
||||
let mut node = leaf_node;
|
||||
while prefix_off > 0 {
|
||||
// Need another internal node
|
||||
let remain_prefix = &key[0..prefix_off];
|
||||
|
||||
prefix_off = remain_prefix.len().saturating_sub(MAX_PREFIX_LEN + 1);
|
||||
let mut internal_node = node_ref::new_internal(
|
||||
&remain_prefix[prefix_off..remain_prefix.len() - 1],
|
||||
allocator,
|
||||
);
|
||||
internal_node.insert_child(*remain_prefix.last().unwrap(), node.into_ptr());
|
||||
node = internal_node;
|
||||
}
|
||||
|
||||
node.into_ptr()
|
||||
}
|
||||
|
||||
fn common_prefix(a: &[u8], b: &[u8]) -> usize {
|
||||
for i in 0..MAX_PREFIX_LEN {
|
||||
if a[i] != b[i] {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
panic!("prefixes are equal");
|
||||
}
|
||||
85
libs/neonart/src/algorithm/lock_and_version.rs
Normal file
85
libs/neonart/src/algorithm/lock_and_version.rs
Normal file
@@ -0,0 +1,85 @@
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
|
||||
/// A single 64-bit word combining an optimistic-locking version counter with
/// a write-lock bit (bit 1) and an "obsolete" bit (bit 0).
///
/// Readers proceed lock-free and re-validate the version afterwards; writers
/// set the lock bit and bump the version on unlock.
pub(crate) struct AtomicLockAndVersion {
    inner: AtomicU64,
}

impl AtomicLockAndVersion {
    /// A fresh word: version 0, unlocked, not obsolete.
    pub(crate) fn new() -> AtomicLockAndVersion {
        Self {
            inner: AtomicU64::new(0),
        }
    }
}
|
||||
|
||||
/// Result of an optimistic operation: `Err(())` means a concurrent writer
/// interfered and the caller must restart the traversal from the root.
pub(crate) type ResultOrRestart<T> = Result<T, ()>;

/// Shorthand for the "please restart" outcome.
const fn restart<T>() -> ResultOrRestart<T> {
    Err(())
}
|
||||
|
||||
impl AtomicLockAndVersion {
|
||||
pub(crate) fn read_lock_or_restart(&self) -> ResultOrRestart<u64> {
|
||||
let version = self.await_node_unlocked();
|
||||
if is_obsolete(version) {
|
||||
return restart();
|
||||
}
|
||||
Ok(version)
|
||||
}
|
||||
|
||||
pub(crate) fn check_or_restart(&self, version: u64) -> ResultOrRestart<()> {
|
||||
self.read_unlock_or_restart(version)
|
||||
}
|
||||
|
||||
pub(crate) fn read_unlock_or_restart(&self, version: u64) -> ResultOrRestart<()> {
|
||||
if self.inner.load(Ordering::Acquire) != version {
|
||||
return restart();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn upgrade_to_write_lock_or_restart(&self, version: u64) -> ResultOrRestart<()> {
|
||||
if self
|
||||
.inner
|
||||
.compare_exchange(
|
||||
version,
|
||||
set_locked_bit(version),
|
||||
Ordering::Acquire,
|
||||
Ordering::Relaxed,
|
||||
)
|
||||
.is_err()
|
||||
{
|
||||
return restart();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn write_unlock(&self) {
|
||||
// reset locked bit and overflow into version
|
||||
self.inner.fetch_add(2, Ordering::Release);
|
||||
}
|
||||
|
||||
pub(crate) fn write_unlock_obsolete(&self) {
|
||||
// set obsolete, reset locked, overflow into version
|
||||
self.inner.fetch_add(3, Ordering::Release);
|
||||
}
|
||||
|
||||
// Helper functions
|
||||
fn await_node_unlocked(&self) -> u64 {
|
||||
let mut version = self.inner.load(Ordering::Acquire);
|
||||
while (version & 2) == 2 {
|
||||
// spinlock
|
||||
std::thread::yield_now();
|
||||
version = self.inner.load(Ordering::Acquire)
|
||||
}
|
||||
version
|
||||
}
|
||||
}
|
||||
|
||||
/// Sets the write-lock bit (bit 1). Callers pass a version whose lock bit is
/// clear, so the addition cannot carry.
fn set_locked_bit(version: u64) -> u64 {
    version + 2
}

/// True if the obsolete bit (bit 0) is set.
fn is_obsolete(version: u64) -> bool {
    (version & 1) != 0
}
|
||||
983
libs/neonart/src/algorithm/node_ptr.rs
Normal file
983
libs/neonart/src/algorithm/node_ptr.rs
Normal file
@@ -0,0 +1,983 @@
|
||||
use std::marker::PhantomData;
|
||||
use std::ptr::NonNull;
|
||||
|
||||
use super::lock_and_version::AtomicLockAndVersion;
|
||||
|
||||
use crate::Allocator;
|
||||
use crate::Value;
|
||||
|
||||
/// Maximum number of key bytes stored inline as a node's compressed prefix.
pub(crate) const MAX_PREFIX_LEN: usize = 8;
|
||||
|
||||
/// Discriminator stored as the first field of every node layout; identifies
/// the concrete struct behind a type-erased node pointer.
enum NodeTag {
    Internal4,
    Internal16,
    Internal48,
    Internal256,
    Leaf4,
    Leaf16,
    Leaf48,
    Leaf256,
}

/// Header shared by all node layouts. Every concrete node is #[repr(C)] and
/// begins with these same two fields, so any node pointer can be read
/// through `NodeBase` to learn its tag and reach its lock word.
#[repr(C)]
struct NodeBase {
    tag: NodeTag,
    lock_and_version: AtomicLockAndVersion,
}
|
||||
|
||||
/// Type-erased, possibly-null pointer to a tree node. The pointee's concrete
/// layout is recovered at runtime via `NodeBase::tag`.
pub(crate) struct NodePtr<V> {
    ptr: *mut NodeBase,

    // Ties the untyped pointer to the value type stored in the leaves.
    phantom_value: PhantomData<V>,
}
|
||||
|
||||
impl<V> std::fmt::Debug for NodePtr<V> {
|
||||
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
|
||||
write!(fmt, "0x{}", self.ptr.addr())
|
||||
}
|
||||
}
|
||||
|
||||
impl<V> Copy for NodePtr<V> {}
|
||||
impl<V> Clone for NodePtr<V> {
|
||||
fn clone(&self) -> NodePtr<V> {
|
||||
NodePtr {
|
||||
ptr: self.ptr,
|
||||
phantom_value: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Typed shared view of a node, produced by decoding `NodeBase::tag`.
enum NodeVariant<'a, V> {
    Internal4(&'a NodeInternal4<V>),
    Internal16(&'a NodeInternal16<V>),
    Internal48(&'a NodeInternal48<V>),
    Internal256(&'a NodeInternal256<V>),
    Leaf4(&'a NodeLeaf4<V>),
    Leaf16(&'a NodeLeaf16<V>),
    Leaf48(&'a NodeLeaf48<V>),
    Leaf256(&'a NodeLeaf256<V>),
}

/// Typed exclusive view of a node; mutable counterpart of `NodeVariant`.
enum NodeVariantMut<'a, V> {
    Internal4(&'a mut NodeInternal4<V>),
    Internal16(&'a mut NodeInternal16<V>),
    Internal48(&'a mut NodeInternal48<V>),
    Internal256(&'a mut NodeInternal256<V>),
    Leaf4(&'a mut NodeLeaf4<V>),
    Leaf16(&'a mut NodeLeaf16<V>),
    Leaf48(&'a mut NodeLeaf48<V>),
    Leaf256(&'a mut NodeLeaf256<V>),
}

/// Lookup result for one key byte: internal nodes yield the next child
/// pointer, leaf nodes yield a pointer to the stored value.
pub(crate) enum ChildOrValuePtr<V> {
    Child(NodePtr<V>),
    Value(*const V),
}
|
||||
|
||||
/// Internal node with up to 4 children, stored as parallel unsorted
/// key/pointer arrays scanned linearly.
#[repr(C)]
struct NodeInternal4<V> {
    tag: NodeTag,
    lock_and_version: AtomicLockAndVersion,

    prefix: [u8; MAX_PREFIX_LEN],
    prefix_len: u8,
    num_children: u8,

    child_keys: [u8; 4],
    child_ptrs: [NodePtr<V>; 4],
}

/// Internal node with up to 16 children; same layout scheme as Internal4.
#[repr(C)]
struct NodeInternal16<V> {
    tag: NodeTag,
    lock_and_version: AtomicLockAndVersion,

    prefix: [u8; MAX_PREFIX_LEN],
    prefix_len: u8,

    num_children: u8,
    child_keys: [u8; 16],
    child_ptrs: [NodePtr<V>; 16],
}

// Sentinel in a 48-node's 256-entry index table marking "no child here".
const INVALID_CHILD_INDEX: u8 = u8::MAX;

/// Internal node with up to 48 children: a full 256-entry index table maps
/// each key byte to a slot in `child_ptrs` (or INVALID_CHILD_INDEX).
#[repr(C)]
struct NodeInternal48<V> {
    tag: NodeTag,
    lock_and_version: AtomicLockAndVersion,

    prefix: [u8; MAX_PREFIX_LEN],
    prefix_len: u8,

    num_children: u8,
    child_indexes: [u8; 256],
    child_ptrs: [NodePtr<V>; 48],
}

/// Internal node with one slot per possible key byte; a null pointer marks
/// an absent child. Also used as the (never-growing) root layout.
#[repr(C)]
pub(crate) struct NodeInternal256<V> {
    tag: NodeTag,
    lock_and_version: AtomicLockAndVersion,

    prefix: [u8; MAX_PREFIX_LEN],
    prefix_len: u8,

    // u16 because the count can reach 256.
    num_children: u16,
    child_ptrs: [NodePtr<V>; 256],
}
|
||||
|
||||
/// Leaf node holding up to 4 values in parallel unsorted key/value arrays.
#[repr(C)]
struct NodeLeaf4<V> {
    tag: NodeTag,
    lock_and_version: AtomicLockAndVersion,

    prefix: [u8; MAX_PREFIX_LEN],
    prefix_len: u8,

    num_values: u8,
    child_keys: [u8; 4],
    child_values: [Option<V>; 4],
}

/// Leaf node holding up to 16 values; same layout scheme as Leaf4.
#[repr(C)]
struct NodeLeaf16<V> {
    tag: NodeTag,
    lock_and_version: AtomicLockAndVersion,

    prefix: [u8; MAX_PREFIX_LEN],
    prefix_len: u8,

    num_values: u8,
    child_keys: [u8; 16],
    child_values: [Option<V>; 16],
}

/// Leaf node holding up to 48 values: a 256-entry index table maps each key
/// byte to a slot in `child_values` (or INVALID_CHILD_INDEX).
#[repr(C)]
struct NodeLeaf48<V> {
    tag: NodeTag,
    lock_and_version: AtomicLockAndVersion,

    prefix: [u8; MAX_PREFIX_LEN],
    prefix_len: u8,

    num_values: u8,
    child_indexes: [u8; 256],
    child_values: [Option<V>; 48],
}

/// Leaf node with one value slot per possible key byte.
#[repr(C)]
struct NodeLeaf256<V> {
    tag: NodeTag,
    lock_and_version: AtomicLockAndVersion,

    prefix: [u8; MAX_PREFIX_LEN],
    prefix_len: u8,

    // u16 because the count can reach 256.
    num_values: u16,
    child_values: [Option<V>; 256],
}
|
||||
|
||||
impl<V> NodePtr<V> {
|
||||
pub(crate) fn is_leaf(&self) -> bool {
|
||||
match self.variant() {
|
||||
NodeVariant::Internal4(_) => false,
|
||||
NodeVariant::Internal16(_) => false,
|
||||
NodeVariant::Internal48(_) => false,
|
||||
NodeVariant::Internal256(_) => false,
|
||||
NodeVariant::Leaf4(_) => true,
|
||||
NodeVariant::Leaf16(_) => true,
|
||||
NodeVariant::Leaf48(_) => true,
|
||||
NodeVariant::Leaf256(_) => true,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn lockword(&self) -> &AtomicLockAndVersion {
|
||||
match self.variant() {
|
||||
NodeVariant::Internal4(n) => &n.lock_and_version,
|
||||
NodeVariant::Internal16(n) => &n.lock_and_version,
|
||||
NodeVariant::Internal48(n) => &n.lock_and_version,
|
||||
NodeVariant::Internal256(n) => &n.lock_and_version,
|
||||
NodeVariant::Leaf4(n) => &n.lock_and_version,
|
||||
NodeVariant::Leaf16(n) => &n.lock_and_version,
|
||||
NodeVariant::Leaf48(n) => &n.lock_and_version,
|
||||
NodeVariant::Leaf256(n) => &n.lock_and_version,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn is_null(&self) -> bool {
|
||||
self.ptr.is_null()
|
||||
}
|
||||
|
||||
pub(crate) const fn null() -> NodePtr<V> {
|
||||
NodePtr {
|
||||
ptr: std::ptr::null_mut(),
|
||||
phantom_value: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
fn variant(&self) -> NodeVariant<V> {
|
||||
unsafe {
|
||||
match (*self.ptr).tag {
|
||||
NodeTag::Internal4 => NodeVariant::Internal4(
|
||||
NonNull::new_unchecked(self.ptr.cast::<NodeInternal4<V>>()).as_ref(),
|
||||
),
|
||||
NodeTag::Internal16 => NodeVariant::Internal16(
|
||||
NonNull::new_unchecked(self.ptr.cast::<NodeInternal16<V>>()).as_ref(),
|
||||
),
|
||||
NodeTag::Internal48 => NodeVariant::Internal48(
|
||||
NonNull::new_unchecked(self.ptr.cast::<NodeInternal48<V>>()).as_ref(),
|
||||
),
|
||||
NodeTag::Internal256 => NodeVariant::Internal256(
|
||||
NonNull::new_unchecked(self.ptr.cast::<NodeInternal256<V>>()).as_ref(),
|
||||
),
|
||||
NodeTag::Leaf4 => NodeVariant::Leaf4(
|
||||
NonNull::new_unchecked(self.ptr.cast::<NodeLeaf4<V>>()).as_ref(),
|
||||
),
|
||||
NodeTag::Leaf16 => NodeVariant::Leaf16(
|
||||
NonNull::new_unchecked(self.ptr.cast::<NodeLeaf16<V>>()).as_ref(),
|
||||
),
|
||||
NodeTag::Leaf48 => NodeVariant::Leaf48(
|
||||
NonNull::new_unchecked(self.ptr.cast::<NodeLeaf48<V>>()).as_ref(),
|
||||
),
|
||||
NodeTag::Leaf256 => NodeVariant::Leaf256(
|
||||
NonNull::new_unchecked(self.ptr.cast::<NodeLeaf256<V>>()).as_ref(),
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn variant_mut(&mut self) -> NodeVariantMut<V> {
|
||||
unsafe {
|
||||
match (*self.ptr).tag {
|
||||
NodeTag::Internal4 => NodeVariantMut::Internal4(
|
||||
NonNull::new_unchecked(self.ptr.cast::<NodeInternal4<V>>()).as_mut(),
|
||||
),
|
||||
NodeTag::Internal16 => NodeVariantMut::Internal16(
|
||||
NonNull::new_unchecked(self.ptr.cast::<NodeInternal16<V>>()).as_mut(),
|
||||
),
|
||||
NodeTag::Internal48 => NodeVariantMut::Internal48(
|
||||
NonNull::new_unchecked(self.ptr.cast::<NodeInternal48<V>>()).as_mut(),
|
||||
),
|
||||
NodeTag::Internal256 => NodeVariantMut::Internal256(
|
||||
NonNull::new_unchecked(self.ptr.cast::<NodeInternal256<V>>()).as_mut(),
|
||||
),
|
||||
NodeTag::Leaf4 => NodeVariantMut::Leaf4(
|
||||
NonNull::new_unchecked(self.ptr.cast::<NodeLeaf4<V>>()).as_mut(),
|
||||
),
|
||||
NodeTag::Leaf16 => NodeVariantMut::Leaf16(
|
||||
NonNull::new_unchecked(self.ptr.cast::<NodeLeaf16<V>>()).as_mut(),
|
||||
),
|
||||
NodeTag::Leaf48 => NodeVariantMut::Leaf48(
|
||||
NonNull::new_unchecked(self.ptr.cast::<NodeLeaf48<V>>()).as_mut(),
|
||||
),
|
||||
NodeTag::Leaf256 => NodeVariantMut::Leaf256(
|
||||
NonNull::new_unchecked(self.ptr.cast::<NodeLeaf256<V>>()).as_mut(),
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<V: Value> NodePtr<V> {
|
||||
pub(crate) fn prefix_matches(&self, key: &[u8]) -> Option<usize> {
|
||||
let node_prefix = self.get_prefix();
|
||||
assert!(node_prefix.len() <= key.len()); // because we only use fixed-size keys
|
||||
if &key[0..node_prefix.len()] != node_prefix {
|
||||
None
|
||||
} else {
|
||||
Some(node_prefix.len())
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_prefix(&self) -> &[u8] {
|
||||
match self.variant() {
|
||||
NodeVariant::Internal4(n) => n.get_prefix(),
|
||||
NodeVariant::Internal16(n) => n.get_prefix(),
|
||||
NodeVariant::Internal48(n) => n.get_prefix(),
|
||||
NodeVariant::Internal256(n) => n.get_prefix(),
|
||||
NodeVariant::Leaf4(n) => n.get_prefix(),
|
||||
NodeVariant::Leaf16(n) => n.get_prefix(),
|
||||
NodeVariant::Leaf48(n) => n.get_prefix(),
|
||||
NodeVariant::Leaf256(n) => n.get_prefix(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn is_full(&self) -> bool {
|
||||
match self.variant() {
|
||||
NodeVariant::Internal4(n) => n.is_full(),
|
||||
NodeVariant::Internal16(n) => n.is_full(),
|
||||
NodeVariant::Internal48(n) => n.is_full(),
|
||||
NodeVariant::Internal256(n) => n.is_full(),
|
||||
NodeVariant::Leaf4(n) => n.is_full(),
|
||||
NodeVariant::Leaf16(n) => n.is_full(),
|
||||
NodeVariant::Leaf48(n) => n.is_full(),
|
||||
NodeVariant::Leaf256(n) => n.is_full(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn find_child_or_value(&self, key_byte: u8) -> Option<ChildOrValuePtr<V>> {
|
||||
match self.variant() {
|
||||
NodeVariant::Internal4(n) => n.find_child(key_byte).map(|c| ChildOrValuePtr::Child(c)),
|
||||
NodeVariant::Internal16(n) => n.find_child(key_byte).map(|c| ChildOrValuePtr::Child(c)),
|
||||
NodeVariant::Internal48(n) => n.find_child(key_byte).map(|c| ChildOrValuePtr::Child(c)),
|
||||
NodeVariant::Internal256(n) => {
|
||||
n.find_child(key_byte).map(|c| ChildOrValuePtr::Child(c))
|
||||
}
|
||||
NodeVariant::Leaf4(n) => n
|
||||
.get_leaf_value(key_byte)
|
||||
.map(|v| ChildOrValuePtr::Value(v)),
|
||||
NodeVariant::Leaf16(n) => n
|
||||
.get_leaf_value(key_byte)
|
||||
.map(|v| ChildOrValuePtr::Value(v)),
|
||||
NodeVariant::Leaf48(n) => n
|
||||
.get_leaf_value(key_byte)
|
||||
.map(|v| ChildOrValuePtr::Value(v)),
|
||||
NodeVariant::Leaf256(n) => n
|
||||
.get_leaf_value(key_byte)
|
||||
.map(|v| ChildOrValuePtr::Value(v)),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn truncate_prefix(&mut self, new_prefix_len: usize) {
|
||||
match self.variant_mut() {
|
||||
NodeVariantMut::Internal4(n) => n.truncate_prefix(new_prefix_len),
|
||||
NodeVariantMut::Internal16(n) => n.truncate_prefix(new_prefix_len),
|
||||
NodeVariantMut::Internal48(n) => n.truncate_prefix(new_prefix_len),
|
||||
NodeVariantMut::Internal256(n) => n.truncate_prefix(new_prefix_len),
|
||||
NodeVariantMut::Leaf4(n) => n.truncate_prefix(new_prefix_len),
|
||||
NodeVariantMut::Leaf16(n) => n.truncate_prefix(new_prefix_len),
|
||||
NodeVariantMut::Leaf48(n) => n.truncate_prefix(new_prefix_len),
|
||||
NodeVariantMut::Leaf256(n) => n.truncate_prefix(new_prefix_len),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn grow(&self, allocator: &Allocator) -> NodePtr<V> {
|
||||
match self.variant() {
|
||||
NodeVariant::Internal4(n) => n.grow(allocator),
|
||||
NodeVariant::Internal16(n) => n.grow(allocator),
|
||||
NodeVariant::Internal48(n) => n.grow(allocator),
|
||||
NodeVariant::Internal256(_) => panic!("cannot grow Internal256 node"),
|
||||
NodeVariant::Leaf4(n) => n.grow(allocator),
|
||||
NodeVariant::Leaf16(n) => n.grow(allocator),
|
||||
NodeVariant::Leaf48(n) => n.grow(allocator),
|
||||
NodeVariant::Leaf256(_) => panic!("cannot grow Leaf256 node"),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
|
||||
match self.variant_mut() {
|
||||
NodeVariantMut::Internal4(n) => n.insert_child(key_byte, child),
|
||||
NodeVariantMut::Internal16(n) => n.insert_child(key_byte, child),
|
||||
NodeVariantMut::Internal48(n) => n.insert_child(key_byte, child),
|
||||
NodeVariantMut::Internal256(n) => n.insert_child(key_byte, child),
|
||||
NodeVariantMut::Leaf4(_)
|
||||
| NodeVariantMut::Leaf16(_)
|
||||
| NodeVariantMut::Leaf48(_)
|
||||
| NodeVariantMut::Leaf256(_) => panic!("insert_child called on leaf node"),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
|
||||
match self.variant_mut() {
|
||||
NodeVariantMut::Internal4(n) => n.replace_child(key_byte, replacement),
|
||||
NodeVariantMut::Internal16(n) => n.replace_child(key_byte, replacement),
|
||||
NodeVariantMut::Internal48(n) => n.replace_child(key_byte, replacement),
|
||||
NodeVariantMut::Internal256(n) => n.replace_child(key_byte, replacement),
|
||||
NodeVariantMut::Leaf4(_)
|
||||
| NodeVariantMut::Leaf16(_)
|
||||
| NodeVariantMut::Leaf48(_)
|
||||
| NodeVariantMut::Leaf256(_) => panic!("replace_child called on leaf node"),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn insert_value(&mut self, key_byte: u8, value: V) {
|
||||
match self.variant_mut() {
|
||||
NodeVariantMut::Internal4(_)
|
||||
| NodeVariantMut::Internal16(_)
|
||||
| NodeVariantMut::Internal48(_)
|
||||
| NodeVariantMut::Internal256(_) => panic!("insert_value called on internal node"),
|
||||
NodeVariantMut::Leaf4(n) => n.insert_value(key_byte, value),
|
||||
NodeVariantMut::Leaf16(n) => n.insert_value(key_byte, value),
|
||||
NodeVariantMut::Leaf48(n) => n.insert_value(key_byte, value),
|
||||
NodeVariantMut::Leaf256(n) => n.insert_value(key_byte, value),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_root<V: Value>(allocator: &Allocator) -> NodePtr<V> {
|
||||
NodePtr {
|
||||
ptr: allocator.alloc(NodeInternal256::<V>::new()).as_ptr().cast(),
|
||||
phantom_value: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_internal<V: Value>(prefix: &[u8], allocator: &Allocator) -> NodePtr<V> {
|
||||
let mut node = allocator.alloc(NodeInternal4 {
|
||||
tag: NodeTag::Internal4,
|
||||
lock_and_version: AtomicLockAndVersion::new(),
|
||||
|
||||
prefix: [8; MAX_PREFIX_LEN],
|
||||
prefix_len: prefix.len() as u8,
|
||||
num_children: 0,
|
||||
|
||||
child_keys: [0; 4],
|
||||
child_ptrs: [const { NodePtr::null() }; 4],
|
||||
});
|
||||
node.prefix[0..prefix.len()].copy_from_slice(prefix);
|
||||
|
||||
node.as_ptr().into()
|
||||
}
|
||||
|
||||
pub fn new_leaf<V: Value>(prefix: &[u8], allocator: &Allocator) -> NodePtr<V> {
|
||||
let mut node = allocator.alloc(NodeLeaf4 {
|
||||
tag: NodeTag::Leaf4,
|
||||
lock_and_version: AtomicLockAndVersion::new(),
|
||||
|
||||
prefix: [8; MAX_PREFIX_LEN],
|
||||
prefix_len: prefix.len() as u8,
|
||||
num_values: 0,
|
||||
|
||||
child_keys: [0; 4],
|
||||
child_values: [const { None }; 4],
|
||||
});
|
||||
node.prefix[0..prefix.len()].copy_from_slice(prefix);
|
||||
|
||||
node.as_ptr().into()
|
||||
}
|
||||
|
||||
impl<V: Value> NodeInternal4<V> {
|
||||
fn get_prefix(&self) -> &[u8] {
|
||||
&self.prefix[0..self.prefix_len as usize]
|
||||
}
|
||||
|
||||
fn truncate_prefix(&mut self, new_prefix_len: usize) {
|
||||
assert!(new_prefix_len < self.prefix_len as usize);
|
||||
let prefix = &mut self.prefix;
|
||||
let offset = self.prefix_len as usize - new_prefix_len;
|
||||
for i in 0..new_prefix_len {
|
||||
prefix[i] = prefix[i + offset];
|
||||
}
|
||||
self.prefix_len = new_prefix_len as u8;
|
||||
}
|
||||
|
||||
fn find_child(&self, key: u8) -> Option<NodePtr<V>> {
|
||||
for i in 0..self.num_children as usize {
|
||||
if self.child_keys[i] == key {
|
||||
return Some(self.child_ptrs[i]);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
|
||||
for i in 0..self.num_children as usize {
|
||||
if self.child_keys[i] == key_byte {
|
||||
self.child_ptrs[i] = replacement;
|
||||
return;
|
||||
}
|
||||
}
|
||||
panic!("could not re-find parent with key {}", key_byte);
|
||||
}
|
||||
|
||||
fn is_full(&self) -> bool {
|
||||
self.num_children == 4
|
||||
}
|
||||
|
||||
fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
|
||||
assert!(self.num_children < 4);
|
||||
|
||||
let idx = self.num_children as usize;
|
||||
self.child_keys[idx] = key_byte;
|
||||
self.child_ptrs[idx] = child;
|
||||
self.num_children += 1;
|
||||
}
|
||||
|
||||
fn grow(&self, allocator: &Allocator) -> NodePtr<V> {
|
||||
let mut node16 = allocator.alloc(NodeInternal16 {
|
||||
tag: NodeTag::Internal16,
|
||||
lock_and_version: AtomicLockAndVersion::new(),
|
||||
|
||||
prefix: self.prefix.clone(),
|
||||
prefix_len: self.prefix_len,
|
||||
num_children: self.num_children,
|
||||
|
||||
child_keys: [0; 16],
|
||||
child_ptrs: [const { NodePtr::null() }; 16],
|
||||
});
|
||||
for i in 0..self.num_children as usize {
|
||||
node16.child_keys[i] = self.child_keys[i];
|
||||
node16.child_ptrs[i] = self.child_ptrs[i];
|
||||
}
|
||||
|
||||
node16.as_ptr().into()
|
||||
}
|
||||
}
|
||||
|
||||
impl<V: Value> NodeInternal16<V> {
|
||||
fn get_prefix(&self) -> &[u8] {
|
||||
&self.prefix[0..self.prefix_len as usize]
|
||||
}
|
||||
|
||||
fn truncate_prefix(&mut self, new_prefix_len: usize) {
|
||||
assert!(new_prefix_len < self.prefix_len as usize);
|
||||
let prefix = &mut self.prefix;
|
||||
let offset = self.prefix_len as usize - new_prefix_len;
|
||||
for i in 0..new_prefix_len {
|
||||
prefix[i] = prefix[i + offset];
|
||||
}
|
||||
self.prefix_len = new_prefix_len as u8;
|
||||
}
|
||||
|
||||
fn find_child(&self, key_byte: u8) -> Option<NodePtr<V>> {
|
||||
for i in 0..self.num_children as usize {
|
||||
if self.child_keys[i] == key_byte {
|
||||
return Some(self.child_ptrs[i]);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
|
||||
for i in 0..self.num_children as usize {
|
||||
if self.child_keys[i] == key_byte {
|
||||
self.child_ptrs[i] = replacement;
|
||||
return;
|
||||
}
|
||||
}
|
||||
panic!("could not re-find parent with key {}", key_byte);
|
||||
}
|
||||
|
||||
fn is_full(&self) -> bool {
|
||||
self.num_children == 16
|
||||
}
|
||||
|
||||
fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
|
||||
assert!(self.num_children < 16);
|
||||
|
||||
let idx = self.num_children as usize;
|
||||
self.child_keys[idx] = key_byte;
|
||||
self.child_ptrs[idx] = child;
|
||||
self.num_children += 1;
|
||||
}
|
||||
|
||||
fn grow(&self, allocator: &Allocator) -> NodePtr<V> {
|
||||
let mut node48 = allocator.alloc(NodeInternal48 {
|
||||
tag: NodeTag::Internal48,
|
||||
lock_and_version: AtomicLockAndVersion::new(),
|
||||
|
||||
prefix: self.prefix.clone(),
|
||||
prefix_len: self.prefix_len,
|
||||
num_children: self.num_children,
|
||||
|
||||
child_indexes: [INVALID_CHILD_INDEX; 256],
|
||||
child_ptrs: [const { NodePtr::null() }; 48],
|
||||
});
|
||||
for i in 0..self.num_children as usize {
|
||||
let idx = self.child_keys[i] as usize;
|
||||
node48.child_indexes[idx] = i as u8;
|
||||
node48.child_ptrs[i] = self.child_ptrs[i];
|
||||
}
|
||||
|
||||
node48.as_ptr().into()
|
||||
}
|
||||
}
|
||||
|
||||
impl<V: Value> NodeInternal48<V> {
|
||||
fn get_prefix(&self) -> &[u8] {
|
||||
&self.prefix[0..self.prefix_len as usize]
|
||||
}
|
||||
|
||||
fn truncate_prefix(&mut self, new_prefix_len: usize) {
|
||||
assert!(new_prefix_len < self.prefix_len as usize);
|
||||
let prefix = &mut self.prefix;
|
||||
let offset = self.prefix_len as usize - new_prefix_len;
|
||||
for i in 0..new_prefix_len {
|
||||
prefix[i] = prefix[i + offset];
|
||||
}
|
||||
self.prefix_len = new_prefix_len as u8;
|
||||
}
|
||||
|
||||
fn find_child(&self, key_byte: u8) -> Option<NodePtr<V>> {
|
||||
let idx = self.child_indexes[key_byte as usize];
|
||||
if idx != INVALID_CHILD_INDEX {
|
||||
Some(self.child_ptrs[idx as usize])
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
|
||||
let idx = self.child_indexes[key_byte as usize];
|
||||
if idx != INVALID_CHILD_INDEX {
|
||||
self.child_ptrs[idx as usize] = replacement
|
||||
} else {
|
||||
panic!("could not re-find parent with key {}", key_byte);
|
||||
}
|
||||
}
|
||||
|
||||
fn is_full(&self) -> bool {
|
||||
self.num_children == 48
|
||||
}
|
||||
|
||||
fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
|
||||
assert!(self.num_children < 48);
|
||||
assert!(self.child_indexes[key_byte as usize] == INVALID_CHILD_INDEX);
|
||||
let idx = self.num_children;
|
||||
self.child_indexes[key_byte as usize] = idx;
|
||||
self.child_ptrs[idx as usize] = child;
|
||||
self.num_children += 1;
|
||||
}
|
||||
|
||||
fn grow(&self, allocator: &Allocator) -> NodePtr<V> {
|
||||
let mut node256 = allocator.alloc(NodeInternal256 {
|
||||
tag: NodeTag::Internal256,
|
||||
lock_and_version: AtomicLockAndVersion::new(),
|
||||
|
||||
prefix: self.prefix.clone(),
|
||||
prefix_len: self.prefix_len,
|
||||
num_children: self.num_children as u16,
|
||||
|
||||
child_ptrs: [const { NodePtr::null() }; 256],
|
||||
});
|
||||
for i in 0..256 {
|
||||
let idx = self.child_indexes[i];
|
||||
if idx != INVALID_CHILD_INDEX {
|
||||
node256.child_ptrs[i] = self.child_ptrs[idx as usize];
|
||||
}
|
||||
}
|
||||
node256.as_ptr().into()
|
||||
}
|
||||
}
|
||||
|
||||
impl<V: Value> NodeInternal256<V> {
|
||||
fn get_prefix(&self) -> &[u8] {
|
||||
&self.prefix[0..self.prefix_len as usize]
|
||||
}
|
||||
|
||||
fn truncate_prefix(&mut self, new_prefix_len: usize) {
|
||||
assert!(new_prefix_len < self.prefix_len as usize);
|
||||
let prefix = &mut self.prefix;
|
||||
let offset = self.prefix_len as usize - new_prefix_len;
|
||||
for i in 0..new_prefix_len {
|
||||
prefix[i] = prefix[i + offset];
|
||||
}
|
||||
self.prefix_len = new_prefix_len as u8;
|
||||
}
|
||||
|
||||
fn find_child(&self, key_byte: u8) -> Option<NodePtr<V>> {
|
||||
let idx = key_byte as usize;
|
||||
if !self.child_ptrs[idx].is_null() {
|
||||
Some(self.child_ptrs[idx])
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
|
||||
let idx = key_byte as usize;
|
||||
if !self.child_ptrs[idx].is_null() {
|
||||
self.child_ptrs[idx] = replacement
|
||||
} else {
|
||||
panic!("could not re-find parent with key {}", key_byte);
|
||||
}
|
||||
}
|
||||
|
||||
fn is_full(&self) -> bool {
|
||||
self.num_children == 256
|
||||
}
|
||||
|
||||
fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
|
||||
assert!(self.num_children < 256);
|
||||
assert!(self.child_ptrs[key_byte as usize].is_null());
|
||||
self.child_ptrs[key_byte as usize] = child;
|
||||
self.num_children += 1;
|
||||
}
|
||||
}
|
||||
|
||||
impl<V: Value> NodeLeaf4<V> {
|
||||
fn get_prefix(&self) -> &[u8] {
|
||||
&self.prefix[0..self.prefix_len as usize]
|
||||
}
|
||||
|
||||
fn truncate_prefix(&mut self, new_prefix_len: usize) {
|
||||
assert!(new_prefix_len < self.prefix_len as usize);
|
||||
let prefix = &mut self.prefix;
|
||||
let offset = self.prefix_len as usize - new_prefix_len;
|
||||
for i in 0..new_prefix_len {
|
||||
prefix[i] = prefix[i + offset];
|
||||
}
|
||||
self.prefix_len = new_prefix_len as u8;
|
||||
}
|
||||
|
||||
fn get_leaf_value<'a: 'b, 'b>(&'a self, key: u8) -> Option<&'b V> {
|
||||
for i in 0..self.num_values {
|
||||
if self.child_keys[i as usize] == key {
|
||||
assert!(self.child_values[i as usize].is_some());
|
||||
return self.child_values[i as usize].as_ref();
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
fn is_full(&self) -> bool {
|
||||
self.num_values == 4
|
||||
}
|
||||
|
||||
fn insert_value(&mut self, key_byte: u8, value: V) {
|
||||
assert!(self.num_values < 16);
|
||||
|
||||
let idx = self.num_values as usize;
|
||||
self.child_keys[idx] = key_byte;
|
||||
self.child_values[idx] = Some(value);
|
||||
self.num_values += 1;
|
||||
}
|
||||
|
||||
fn grow(&self, allocator: &Allocator) -> NodePtr<V> {
|
||||
let mut node16 = allocator.alloc(NodeLeaf16 {
|
||||
tag: NodeTag::Leaf16,
|
||||
lock_and_version: AtomicLockAndVersion::new(),
|
||||
|
||||
prefix: self.prefix.clone(),
|
||||
prefix_len: self.prefix_len,
|
||||
num_values: self.num_values,
|
||||
|
||||
child_keys: [0; 16],
|
||||
child_values: [const { None }; 16],
|
||||
});
|
||||
for i in 0..self.num_values as usize {
|
||||
node16.child_keys[i] = self.child_keys[i];
|
||||
node16.child_values[i] = self.child_values[i].clone();
|
||||
}
|
||||
node16.as_ptr().into()
|
||||
}
|
||||
}
|
||||
|
||||
impl<V: Value> NodeLeaf16<V> {
|
||||
fn get_prefix(&self) -> &[u8] {
|
||||
&self.prefix[0..self.prefix_len as usize]
|
||||
}
|
||||
|
||||
fn truncate_prefix(&mut self, new_prefix_len: usize) {
|
||||
assert!(new_prefix_len < self.prefix_len as usize);
|
||||
let prefix = &mut self.prefix;
|
||||
let offset = self.prefix_len as usize - new_prefix_len;
|
||||
for i in 0..new_prefix_len {
|
||||
prefix[i] = prefix[i + offset];
|
||||
}
|
||||
self.prefix_len = new_prefix_len as u8;
|
||||
}
|
||||
|
||||
fn get_leaf_value(&self, key: u8) -> Option<&V> {
|
||||
for i in 0..self.num_values {
|
||||
if self.child_keys[i as usize] == key {
|
||||
assert!(self.child_values[i as usize].is_some());
|
||||
return self.child_values[i as usize].as_ref();
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
fn is_full(&self) -> bool {
|
||||
self.num_values == 16
|
||||
}
|
||||
|
||||
fn insert_value(&mut self, key_byte: u8, value: V) {
|
||||
assert!(self.num_values < 16);
|
||||
|
||||
let idx = self.num_values as usize;
|
||||
self.child_keys[idx] = key_byte;
|
||||
self.child_values[idx] = Some(value);
|
||||
self.num_values += 1;
|
||||
}
|
||||
fn grow(&self, allocator: &Allocator) -> NodePtr<V> {
|
||||
let mut node48 = allocator.alloc(NodeLeaf48 {
|
||||
tag: NodeTag::Leaf48,
|
||||
lock_and_version: AtomicLockAndVersion::new(),
|
||||
|
||||
prefix: self.prefix.clone(),
|
||||
prefix_len: self.prefix_len,
|
||||
num_values: self.num_values,
|
||||
|
||||
child_indexes: [INVALID_CHILD_INDEX; 256],
|
||||
child_values: [const { None }; 48],
|
||||
});
|
||||
for i in 0..self.num_values {
|
||||
let idx = self.child_keys[i as usize];
|
||||
node48.child_indexes[idx as usize] = i;
|
||||
node48.child_values[i as usize] = self.child_values[i as usize].clone();
|
||||
}
|
||||
node48.as_ptr().into()
|
||||
}
|
||||
}
|
||||
|
||||
impl<V: Value> NodeLeaf48<V> {
|
||||
fn get_prefix(&self) -> &[u8] {
|
||||
&self.prefix[0..self.prefix_len as usize]
|
||||
}
|
||||
|
||||
fn truncate_prefix(&mut self, new_prefix_len: usize) {
|
||||
assert!(new_prefix_len < self.prefix_len as usize);
|
||||
let prefix = &mut self.prefix;
|
||||
let offset = self.prefix_len as usize - new_prefix_len;
|
||||
for i in 0..new_prefix_len {
|
||||
prefix[i] = prefix[i + offset];
|
||||
}
|
||||
self.prefix_len = new_prefix_len as u8;
|
||||
}
|
||||
|
||||
fn get_leaf_value(&self, key: u8) -> Option<&V> {
|
||||
let idx = self.child_indexes[key as usize];
|
||||
if idx != INVALID_CHILD_INDEX {
|
||||
assert!(self.child_values[idx as usize].is_some());
|
||||
self.child_values[idx as usize].as_ref()
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
fn is_full(&self) -> bool {
|
||||
self.num_values == 48
|
||||
}
|
||||
|
||||
fn insert_value(&mut self, key_byte: u8, value: V) {
|
||||
assert!(self.num_values < 48);
|
||||
assert!(self.child_indexes[key_byte as usize] == INVALID_CHILD_INDEX);
|
||||
let idx = self.num_values;
|
||||
self.child_indexes[key_byte as usize] = idx;
|
||||
self.child_values[idx as usize] = Some(value);
|
||||
self.num_values += 1;
|
||||
}
|
||||
fn grow(&self, allocator: &Allocator) -> NodePtr<V> {
|
||||
let mut node256 = allocator.alloc(NodeLeaf256 {
|
||||
tag: NodeTag::Leaf256,
|
||||
lock_and_version: AtomicLockAndVersion::new(),
|
||||
|
||||
prefix: self.prefix.clone(),
|
||||
prefix_len: self.prefix_len,
|
||||
num_values: self.num_values as u16,
|
||||
|
||||
child_values: [const { None }; 256],
|
||||
});
|
||||
for i in 0..256 {
|
||||
let idx = self.child_indexes[i];
|
||||
if idx != INVALID_CHILD_INDEX {
|
||||
node256.child_values[i] = self.child_values[idx as usize].clone();
|
||||
}
|
||||
}
|
||||
node256.as_ptr().into()
|
||||
}
|
||||
}
|
||||
|
||||
impl<V: Value> NodeLeaf256<V> {
    /// The node's compressed key prefix.
    fn get_prefix(&self) -> &[u8] {
        &self.prefix[..self.prefix_len as usize]
    }

    /// Shortens the prefix to its last 'new_prefix_len' bytes.
    fn truncate_prefix(&mut self, new_prefix_len: usize) {
        let old_len = self.prefix_len as usize;
        assert!(new_prefix_len < old_len);
        self.prefix.copy_within(old_len - new_prefix_len..old_len, 0);
        self.prefix_len = new_prefix_len as u8;
    }

    /// Direct-indexed lookup; `None` in the slot means "absent".
    fn get_leaf_value(&self, key: u8) -> Option<&V> {
        self.child_values[key as usize].as_ref()
    }

    fn is_full(&self) -> bool {
        self.num_values == 256
    }

    /// Stores a value under 'key_byte'; the slot must be empty.
    fn insert_value(&mut self, key_byte: u8, value: V) {
        assert!(self.num_values < 256);
        assert!(self.child_values[key_byte as usize].is_none());
        self.child_values[key_byte as usize] = Some(value);
        self.num_values += 1;
    }
}
|
||||
|
||||
impl<V: Value> NodeInternal256<V> {
|
||||
pub(crate) fn new() -> NodeInternal256<V> {
|
||||
NodeInternal256 {
|
||||
tag: NodeTag::Internal256,
|
||||
lock_and_version: AtomicLockAndVersion::new(),
|
||||
|
||||
prefix: [0; MAX_PREFIX_LEN],
|
||||
prefix_len: 0,
|
||||
num_children: 0,
|
||||
|
||||
child_ptrs: [const { NodePtr::null() }; 256],
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<V: Value> From<*mut NodeInternal4<V>> for NodePtr<V> {
|
||||
fn from(val: *mut NodeInternal4<V>) -> NodePtr<V> {
|
||||
NodePtr {
|
||||
ptr: val.cast(),
|
||||
phantom_value: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
impl<V: Value> From<*mut NodeInternal16<V>> for NodePtr<V> {
|
||||
fn from(val: *mut NodeInternal16<V>) -> NodePtr<V> {
|
||||
NodePtr {
|
||||
ptr: val.cast(),
|
||||
phantom_value: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<V: Value> From<*mut NodeInternal48<V>> for NodePtr<V> {
|
||||
fn from(val: *mut NodeInternal48<V>) -> NodePtr<V> {
|
||||
NodePtr {
|
||||
ptr: val.cast(),
|
||||
phantom_value: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<V: Value> From<*mut NodeInternal256<V>> for NodePtr<V> {
|
||||
fn from(val: *mut NodeInternal256<V>) -> NodePtr<V> {
|
||||
NodePtr {
|
||||
ptr: val.cast(),
|
||||
phantom_value: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<V: Value> From<*mut NodeLeaf4<V>> for NodePtr<V> {
|
||||
fn from(val: *mut NodeLeaf4<V>) -> NodePtr<V> {
|
||||
NodePtr {
|
||||
ptr: val.cast(),
|
||||
phantom_value: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
impl<V: Value> From<*mut NodeLeaf16<V>> for NodePtr<V> {
|
||||
fn from(val: *mut NodeLeaf16<V>) -> NodePtr<V> {
|
||||
NodePtr {
|
||||
ptr: val.cast(),
|
||||
phantom_value: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<V: Value> From<*mut NodeLeaf48<V>> for NodePtr<V> {
|
||||
fn from(val: *mut NodeLeaf48<V>) -> NodePtr<V> {
|
||||
NodePtr {
|
||||
ptr: val.cast(),
|
||||
phantom_value: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<V: Value> From<*mut NodeLeaf256<V>> for NodePtr<V> {
|
||||
fn from(val: *mut NodeLeaf256<V>) -> NodePtr<V> {
|
||||
NodePtr {
|
||||
ptr: val.cast(),
|
||||
phantom_value: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
202
libs/neonart/src/algorithm/node_ref.rs
Normal file
202
libs/neonart/src/algorithm/node_ref.rs
Normal file
@@ -0,0 +1,202 @@
|
||||
use std::fmt::Debug;
|
||||
use std::marker::PhantomData;
|
||||
|
||||
use super::lock_and_version::ResultOrRestart;
|
||||
use super::node_ptr;
|
||||
use super::node_ptr::ChildOrValuePtr;
|
||||
use super::node_ptr::NodePtr;
|
||||
use crate::EpochPin;
|
||||
use crate::algorithm::lock_and_version::AtomicLockAndVersion;
|
||||
use crate::{Allocator, Value};
|
||||
|
||||
/// A (non-owning) reference to a tree node. The `'e` lifetime ties the
/// reference to an [`EpochPin`], so the node cannot be reclaimed while the
/// reference is alive.
pub struct NodeRef<'e, V> {
    ptr: NodePtr<V>,

    // Ties this reference to the epoch pin's lifetime without storing it.
    phantom: PhantomData<&'e EpochPin>,
}

impl<'e, V> Debug for NodeRef<'e, V> {
    // Delegate to the underlying pointer's Debug formatting.
    fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
        write!(fmt, "{:?}", self.ptr)
    }
}

impl<'e, V: Value> NodeRef<'e, V> {
    /// Wraps the tree's root pointer in a `NodeRef`, anchoring the traversal.
    pub(crate) fn from_root_ptr(root_ptr: NodePtr<V>) -> NodeRef<'e, V> {
        NodeRef {
            ptr: root_ptr,
            phantom: PhantomData,
        }
    }

    /// Takes an optimistic read "lock" on this node: snapshots the version
    /// counter without writing anything. Returns `Restart` (via `?`) if the
    /// node cannot be read-locked right now, in which case the caller must
    /// restart its traversal from the root.
    pub(crate) fn read_lock_or_restart(&self) -> ResultOrRestart<ReadLockedNodeRef<'e, V>> {
        let version = self.lockword().read_lock_or_restart()?;
        Ok(ReadLockedNodeRef {
            ptr: self.ptr,
            version,
            phantom: self.phantom,
        })
    }

    // The node's combined lock-and-version word.
    fn lockword(&self) -> &AtomicLockAndVersion {
        self.ptr.lockword()
    }
}
|
||||
|
||||
/// A reference to a node that has been optimistically read-locked. The functions re-check
/// the version after each read.
pub struct ReadLockedNodeRef<'e, V> {
    ptr: NodePtr<V>,
    // Version snapshot taken at lock time; reads are validated against it.
    version: u64,

    phantom: PhantomData<&'e EpochPin>,
}

/// Result of descending one step in the tree: either another internal/leaf
/// node to visit, or a raw pointer to a stored value.
pub(crate) enum ChildOrValue<'e, V> {
    Child(NodeRef<'e, V>),
    Value(*const V),
}
|
||||
|
||||
impl<'e, V: Value> ReadLockedNodeRef<'e, V> {
    /// True if the node has no free slots and must be grown before insert.
    pub(crate) fn is_full(&self) -> bool {
        self.ptr.is_full()
    }

    /// The node's prefix bytes. Subject to the same concurrent-change caveat
    /// as `prefix_matches` below.
    pub(crate) fn get_prefix(&self) -> &[u8] {
        self.ptr.get_prefix()
    }

    /// Note: because we're only holding a read lock, the prefix can change concurrently.
    /// You must be prepared to restart, if read_unlock() returns error later.
    ///
    /// Returns the length of the prefix, or None if it's not a match
    pub(crate) fn prefix_matches(&self, key: &[u8]) -> Option<usize> {
        self.ptr.prefix_matches(key)
    }

    /// Finds the child node or value stored under `key_byte`.
    ///
    /// The version is re-checked *after* the read, so a returned
    /// `Ok(...)` is known to be consistent with the version snapshot.
    pub(crate) fn find_child_or_value_or_restart(
        &self,
        key_byte: u8,
    ) -> ResultOrRestart<Option<ChildOrValue<'e, V>>> {
        // Read first, then validate: the read may race with a writer, but
        // check_or_restart detects that and forces a restart.
        let child_or_value = self.ptr.find_child_or_value(key_byte);
        self.ptr.lockword().check_or_restart(self.version)?;

        match child_or_value {
            None => Ok(None),
            Some(ChildOrValuePtr::Value(vptr)) => Ok(Some(ChildOrValue::Value(vptr))),
            Some(ChildOrValuePtr::Child(child_ptr)) => Ok(Some(ChildOrValue::Child(NodeRef {
                ptr: child_ptr,
                phantom: self.phantom,
            }))),
        }
    }

    /// Atomically upgrades the optimistic read lock to an exclusive write
    /// lock. Fails (restart) if the node changed since the version snapshot.
    pub(crate) fn upgrade_to_write_lock_or_restart(
        self,
    ) -> ResultOrRestart<WriteLockedNodeRef<'e, V>> {
        self.ptr
            .lockword()
            .upgrade_to_write_lock_or_restart(self.version)?;

        Ok(WriteLockedNodeRef {
            ptr: self.ptr,
            phantom: self.phantom,
        })
    }

    /// Final validation when leaving the node: confirms nothing changed
    /// during the reads performed under this "lock".
    pub(crate) fn read_unlock_or_restart(self) -> ResultOrRestart<()> {
        self.ptr.lockword().check_or_restart(self.version)?;
        Ok(())
    }
}
|
||||
|
||||
/// A reference to a node that has been exclusively write-locked.
/// (The previous comment saying "optimistically read-locked" was a
/// copy-paste from ReadLockedNodeRef.) The lock is released explicitly via
/// `write_unlock()` / `write_unlock_obsolete()`, or by the Drop impl below.
pub struct WriteLockedNodeRef<'e, V> {
    ptr: NodePtr<V>,
    phantom: PhantomData<&'e EpochPin>,
}
|
||||
|
||||
impl<'e, V: Value> WriteLockedNodeRef<'e, V> {
    /// True if this is one of the leaf node variants.
    pub(crate) fn is_leaf(&self) -> bool {
        self.ptr.is_leaf()
    }

    /// Releases the write lock, bumping the version.
    pub(crate) fn write_unlock(mut self) {
        self.ptr.lockword().write_unlock();
        // Null the pointer so the Drop impl doesn't unlock a second time.
        self.ptr = NodePtr::null();
    }

    /// Releases the write lock and marks the node obsolete (it has been
    /// replaced in the tree; concurrent readers will restart).
    pub(crate) fn write_unlock_obsolete(mut self) {
        self.ptr.lockword().write_unlock_obsolete();
        // Null the pointer so the Drop impl doesn't unlock a second time.
        self.ptr = NodePtr::null();
    }

    /// The node's prefix bytes. Stable while the write lock is held.
    pub(crate) fn get_prefix(&self) -> &[u8] {
        self.ptr.get_prefix()
    }

    /// Shortens the node's prefix to `new_prefix_len` bytes.
    pub(crate) fn truncate_prefix(&mut self, new_prefix_len: usize) {
        self.ptr.truncate_prefix(new_prefix_len)
    }

    /// Adds a child pointer under `key_byte` (internal nodes).
    pub(crate) fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
        self.ptr.insert_child(key_byte, child)
    }

    /// Stores a value under `key_byte` (leaf nodes).
    pub(crate) fn insert_value(&mut self, key_byte: u8, value: V) {
        self.ptr.insert_value(key_byte, value)
    }

    /// Allocates the next-larger node variant and copies this node's
    /// contents into it. This node itself is left unchanged.
    pub(crate) fn grow(&self, allocator: &Allocator) -> NewNodeRef<V> {
        let new_node = self.ptr.grow(allocator);
        NewNodeRef { ptr: new_node }
    }

    /// The underlying raw node pointer.
    pub(crate) fn as_ptr(&self) -> NodePtr<V> {
        self.ptr
    }

    /// Swaps the child stored under `key_byte` for `replacement`.
    pub(crate) fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
        self.ptr.replace_child(key_byte, replacement);
    }
}
|
||||
|
||||
impl<'e, V> Drop for WriteLockedNodeRef<'e, V> {
    // Safety net: release the write lock if the guard is dropped without an
    // explicit write_unlock()/write_unlock_obsolete() call (which null the
    // pointer to signal that the lock has already been released).
    fn drop(&mut self) {
        if !self.ptr.is_null() {
            self.ptr.lockword().write_unlock();
        }
    }
}
|
||||
|
||||
/// A freshly allocated node that has not been linked into the tree yet, so
/// it is exclusively owned and needs no locking while being populated.
pub(crate) struct NewNodeRef<V> {
    ptr: NodePtr<V>,
}
|
||||
|
||||
impl<V: Value> NewNodeRef<V> {
|
||||
pub(crate) fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
|
||||
self.ptr.insert_child(key_byte, child)
|
||||
}
|
||||
|
||||
pub(crate) fn insert_value(&mut self, key_byte: u8, value: V) {
|
||||
self.ptr.insert_value(key_byte, value)
|
||||
}
|
||||
|
||||
pub(crate) fn into_ptr(self) -> NodePtr<V> {
|
||||
let ptr = self.ptr;
|
||||
ptr
|
||||
}
|
||||
}
|
||||
|
||||
/// Allocates a new, empty internal node with the given prefix.
pub(crate) fn new_internal<V: Value>(prefix: &[u8], allocator: &Allocator) -> NewNodeRef<V> {
    NewNodeRef {
        ptr: node_ptr::new_internal(prefix, allocator),
    }
}

/// Allocates a new, empty leaf node with the given prefix.
pub(crate) fn new_leaf<V: Value>(prefix: &[u8], allocator: &Allocator) -> NewNodeRef<V> {
    NewNodeRef {
        ptr: node_ptr::new_leaf(prefix, allocator),
    }
}
|
||||
107
libs/neonart/src/allocator.rs
Normal file
107
libs/neonart/src/allocator.rs
Normal file
@@ -0,0 +1,107 @@
|
||||
use std::marker::PhantomData;
|
||||
use std::mem::MaybeUninit;
|
||||
use std::ops::{Deref, DerefMut};
|
||||
use std::ptr::NonNull;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
/// A lock-free bump allocator over a fixed, pre-allocated memory area
/// (intended for a shared memory segment). Memory is never reused; see
/// `_dealloc_node`.
pub struct Allocator {
    // Start of the backing memory area.
    area: *mut MaybeUninit<u8>,
    // Bump offset: bytes handed out so far, including alignment padding.
    allocated: AtomicUsize,
    // Total size of the area, in bytes.
    size: usize,
}

// FIXME: I don't know if these are really safe...
unsafe impl Send for Allocator {}
unsafe impl Sync for Allocator {}

/// An allocator-backed owned value, analogous to `Box<T>` but carved out of
/// an [`Allocator`]'s area. Dropping it does NOT free the memory.
#[repr(transparent)]
pub struct AllocatedBox<'a, T> {
    inner: NonNull<T>,

    // Ties the box to the allocator's lifetime.
    _phantom: PhantomData<&'a Allocator>,
}

// FIXME: I don't know if these are really safe...
unsafe impl<'a, T> Send for AllocatedBox<'a, T> {}
unsafe impl<'a, T> Sync for AllocatedBox<'a, T> {}
|
||||
|
||||
impl<T> Deref for AllocatedBox<'_, T> {
    type Target = T;

    // Safety: `inner` points to a live, initialized T for the box's lifetime.
    fn deref(&self) -> &T {
        unsafe { self.inner.as_ref() }
    }
}

impl<T> DerefMut for AllocatedBox<'_, T> {
    // Safety: as above; `&mut self` guarantees exclusive access.
    fn deref_mut(&mut self) -> &mut T {
        unsafe { self.inner.as_mut() }
    }
}

impl<T> AsMut<T> for AllocatedBox<'_, T> {
    fn as_mut(&mut self) -> &mut T {
        unsafe { self.inner.as_mut() }
    }
}

impl<T> AllocatedBox<'_, T> {
    /// The raw pointer to the allocated value.
    pub fn as_ptr(&self) -> *mut T {
        self.inner.as_ptr()
    }
}
|
||||
|
||||
const MAXALIGN: usize = std::mem::align_of::<usize>();
|
||||
|
||||
impl Allocator {
|
||||
pub fn new_uninit(area: &'static mut [MaybeUninit<u8>]) -> Allocator {
|
||||
let ptr = area.as_mut_ptr();
|
||||
let size = area.len();
|
||||
Self::new_from_ptr(ptr, size)
|
||||
}
|
||||
|
||||
pub fn new(area: &'static mut [u8]) -> Allocator {
|
||||
let ptr: *mut MaybeUninit<u8> = area.as_mut_ptr().cast();
|
||||
let size = area.len();
|
||||
Self::new_from_ptr(ptr, size)
|
||||
}
|
||||
|
||||
pub fn new_from_ptr(ptr: *mut MaybeUninit<u8>, size: usize) -> Allocator {
|
||||
let padding = ptr.align_offset(MAXALIGN);
|
||||
|
||||
Allocator {
|
||||
area: ptr,
|
||||
allocated: AtomicUsize::new(padding),
|
||||
size,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn alloc<'a, T: Sized>(&'a self, value: T) -> AllocatedBox<'a, T> {
|
||||
let sz = std::mem::size_of::<T>();
|
||||
|
||||
// pad all allocations to MAXALIGN boundaries
|
||||
assert!(std::mem::align_of::<T>() <= MAXALIGN);
|
||||
let sz = sz.next_multiple_of(MAXALIGN);
|
||||
|
||||
let offset = self.allocated.fetch_add(sz, Ordering::Relaxed);
|
||||
|
||||
if offset + sz > self.size {
|
||||
panic!("out of memory");
|
||||
}
|
||||
|
||||
let inner = unsafe {
|
||||
let inner = self.area.offset(offset as isize).cast::<T>();
|
||||
*inner = value;
|
||||
NonNull::new_unchecked(inner)
|
||||
};
|
||||
|
||||
AllocatedBox {
|
||||
inner,
|
||||
_phantom: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn _dealloc_node<T>(&self, _node: AllocatedBox<T>) {
|
||||
// doesn't free it immediately.
|
||||
}
|
||||
}
|
||||
23
libs/neonart/src/epoch.rs
Normal file
23
libs/neonart/src/epoch.rs
Normal file
@@ -0,0 +1,23 @@
|
||||
//! This is similar to crossbeam_epoch crate, but works in shared memory
|
||||
//!
|
||||
//! FIXME: not implemented yet. (We haven't implemented removing any nodes from the ART
|
||||
//! tree, which is why we get away without this now)
|
||||
|
||||
/// Marker proving the holder has "pinned" the current epoch: while it is
/// alive, nodes unlinked from the tree must not be reclaimed.
pub(crate) struct EpochPin {}

/// Pins the current epoch. Currently a no-op; see the module comment (node
/// removal is not implemented yet, so nothing is ever reclaimed).
pub(crate) fn pin_epoch() -> EpochPin {
    EpochPin {}
}
|
||||
|
||||
/*
|
||||
struct CollectorGlobal {
|
||||
epoch: AtomicU64,
|
||||
|
||||
participants: CachePadded<AtomicU64>, // make it an array
|
||||
}
|
||||
|
||||
|
||||
struct CollectorQueue {
|
||||
|
||||
}
|
||||
*/
|
||||
301
libs/neonart/src/lib.rs
Normal file
301
libs/neonart/src/lib.rs
Normal file
@@ -0,0 +1,301 @@
|
||||
//! Adaptive Radix Tree (ART) implementation, with Optimistic Lock Coupling.
|
||||
//!
|
||||
//! The data structure is described in these two papers:
|
||||
//!
|
||||
//! [1] Leis, V. & Kemper, Alfons & Neumann, Thomas. (2013).
|
||||
//! The adaptive radix tree: ARTful indexing for main-memory databases.
|
||||
//! Proceedings - International Conference on Data Engineering. 38-49. 10.1109/ICDE.2013.6544812.
|
||||
//! https://db.in.tum.de/~leis/papers/ART.pdf
|
||||
//!
|
||||
//! [2] Leis, Viktor & Scheibner, Florian & Kemper, Alfons & Neumann, Thomas. (2016).
|
||||
//! The ART of practical synchronization.
|
||||
//! 1-8. 10.1145/2933349.2933352.
|
||||
//! https://db.in.tum.de/~leis/papers/artsync.pdf
|
||||
//!
|
||||
//! [1] describes the base data structure, and [2] describes the Optimistic Lock Coupling that we
|
||||
//! use.
|
||||
//!
|
||||
//! The papers mention a few different variants. We have made the following choices in this
|
||||
//! implementation:
|
||||
//!
|
||||
//! - All keys have the same length
|
||||
//!
|
||||
//! - Multi-value leaves. The values are stored directly in one of the four different leaf node
|
||||
//! types.
|
||||
//!
|
||||
//! - For collapsing inner nodes, we use the Pessimistic approach, where each inner node stores a
|
||||
//! variable length "prefix", which stores the keys of all the one-way nodes which have been
|
||||
//! removed. However, similar to the "hybrid" approach described in the paper, each node only has
|
||||
//! space for a constant-size prefix of 8 bytes. If a node would have a longer prefix, then we
|
||||
//!   create one-way nodes to store them. (There was no particular reason for this choice,
|
||||
//! the "hybrid" approach described in the paper might be better.)
|
||||
//!
|
||||
//! - For concurrency, we use Optimistic Lock Coupling. The paper [2] also describes another method,
|
||||
//! ROWEX, which generally performs better when there is contention, but that is not important
|
||||
//!   for us, and Optimistic Lock Coupling is simpler to implement.
|
||||
//!
|
||||
//! ## Requirements
|
||||
//!
|
||||
//! This data structure is currently used for the integrated LFC, relsize and last-written LSN cache
|
||||
//! in the compute communicator, part of the 'neon' Postgres extension. We have some unique
|
||||
//! requirements, which is why we had to write our own. Namely:
|
||||
//!
|
||||
//! - The data structure has to live in fixed-sized shared memory segment. That rules out any
|
||||
//! built-in Rust collections and most crates. (Except possibly with the 'allocator_api' rust
|
||||
//!   feature, which is still nightly-only and experimental as of this writing).
|
||||
//!
|
||||
//! - The data structure is accessed from multiple processes. Only one process updates the data
|
||||
//! structure, but other processes perform reads. That rules out using built-in Rust locking
|
||||
//! primitives like Mutex and RwLock, and most crates too.
|
||||
//!
|
||||
//! - Within the one process with write-access, multiple threads can perform updates concurrently.
|
||||
//! That rules out using PostgreSQL LWLocks for the locking.
|
||||
//!
|
||||
//! The implementation is generic, and doesn't depend on any PostgreSQL specifics, but it has been
|
||||
//! written with that usage and the above constraints in mind. Some noteworthy assumptions:
|
||||
//!
|
||||
//! - Contention is assumed to be rare. In the integrated cache in PostgreSQL, there's higher level
|
||||
//! locking in the PostgreSQL buffer manager, which ensures that two backends should not try to
|
||||
//! read / write the same page at the same time. (Prefetching can conflict with actual reads,
|
||||
//! however.)
|
||||
//!
|
||||
//! - The keys in the integrated cache are 17 bytes long.
|
||||
//!
|
||||
//! ## Usage
|
||||
//!
|
||||
//! Because this is designed to be used as a Postgres shared memory data structure, initialization
|
||||
//! happens in three stages:
|
||||
//!
|
||||
//! 0. A fixed area of shared memory is allocated at postmaster startup.
|
||||
//!
|
||||
//! 1. TreeInitStruct::new() is called to initialize it, still in Postmaster process, before any
|
||||
//! other process or thread is running. It returns a TreeInitStruct, which is inherited by all
|
||||
//! the processes through fork().
|
||||
//!
|
||||
//! 2. One process may have write-access to the struct, by calling
|
||||
//! [TreeInitStruct::attach_writer]. (That process is the communicator process.)
|
||||
//!
|
||||
//! 3. Other processes get read-access to the struct, by calling [TreeInitStruct::attach_reader]
|
||||
//!
|
||||
//! "Write access" means that you can insert / update / delete values in the tree.
|
||||
//!
|
||||
//! NOTE: The Values stored in the tree are sometimes moved, when a leaf node fills up and a new
|
||||
//! larger node needs to be allocated. The versioning and epoch-based allocator ensure that the data
|
||||
//! structure stays consistent, but if the Value has interior mutability, like atomic fields,
|
||||
//! updates to such fields might be lost if the leaf node is concurrently moved! If that becomes a
|
||||
//! problem, the version check could be passed up to the caller, so that the caller could detect the
|
||||
//! lost updates and retry the operation.
|
||||
//!
|
||||
//! ## Implementation
|
||||
//!
|
||||
//! node_ptr: Provides low-level implementations of the four different node types (eight actually,
|
||||
//! since there is an Internal and Leaf variant of each)
|
||||
//!
|
||||
//! lock_and_version.rs: Provides an abstraction for the combined lock and version counter on each
|
||||
//! node.
|
||||
//!
|
||||
//! node_ref.rs: The code in node_ptr.rs deals with raw pointers. node_ref.rs provides more type-safe
|
||||
//! abstractions on top.
|
||||
//!
|
||||
//! algorithm.rs: Contains the functions to implement lookups and updates in the tree
|
||||
//!
|
||||
//! allocator.rs: Provides a facility to allocate memory for the tree nodes. (We must provide our
|
||||
//! own abstraction for that because we need the data structure to live in a pre-allocated shared
|
||||
//! memory segment).
|
||||
//!
|
||||
//! epoch.rs: The data structure requires that when a node is removed from the tree, it is not
|
||||
//! immediately deallocated, but stays around for as long as concurrent readers might still have
|
||||
//! pointers to them. This is enforced by an epoch system. This is similar to
|
||||
//! e.g. crossbeam_epoch, but we couldn't use that either because it has to work across processes
|
||||
//! communicating over the shared memory segment.
|
||||
//!
|
||||
//! ## See also
|
||||
//!
|
||||
//! There are some existing Rust ART implementations out there, but none of them filled all
|
||||
//! the requirements:
|
||||
//!
|
||||
//! - https://github.com/XiangpengHao/congee
|
||||
//! - https://github.com/declanvk/blart
|
||||
//!
|
||||
//! ## TODO
|
||||
//!
|
||||
//! - Removing values has not been implemented
|
||||
|
||||
mod algorithm;
|
||||
mod allocator;
|
||||
mod epoch;
|
||||
|
||||
use algorithm::RootPtr;
|
||||
|
||||
use allocator::AllocatedBox;
|
||||
|
||||
use std::fmt::Debug;
|
||||
use std::marker::PhantomData;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
|
||||
use crate::epoch::EpochPin;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
pub use allocator::Allocator;
|
||||
|
||||
/// Fixed-length key type.
///
/// All keys stored in one tree must have the same length, `KEY_LEN`.
pub trait Key: Clone + Debug {
    /// Length of every key, in bytes.
    const KEY_LEN: usize;

    /// The key's raw bytes; must be exactly `KEY_LEN` bytes long.
    fn as_bytes(&self) -> &[u8];
}
|
||||
|
||||
/// Values stored in the tree
|
||||
///
|
||||
/// Values need to be Cloneable, because when a node "grows", the value is copied to a new node and
|
||||
/// the old sticks around until all readers that might see the old value are gone.
|
||||
pub trait Value: Clone {}
|
||||
|
||||
/// The shared-memory tree itself, allocated inside the Allocator's area.
struct Tree<K: Key, V: Value> {
    root: RootPtr<V>,

    // Set once by attach_writer(); guards against two writer processes.
    writer_attached: AtomicBool,

    phantom_key: PhantomData<K>,
}

/// Struct created at postmaster startup
pub struct TreeInitStruct<'t, K: Key, V: Value> {
    tree: AllocatedBox<'t, Tree<K, V>>,

    allocator: &'t Allocator,
}

/// The worker process has a reference to this. The write operations are only safe
/// from the worker process
pub struct TreeWriteAccess<'t, K: Key, V: Value>
where
    K: Key,
    V: Value,
{
    tree: AllocatedBox<'t, Tree<K, V>>,

    // Needed for inserts, which may allocate new nodes.
    allocator: &'t Allocator,
}

/// The backends have a reference to this. It cannot be used to modify the tree
pub struct TreeReadAccess<'t, K: Key, V: Value>
where
    K: Key,
    V: Value,
{
    tree: AllocatedBox<'t, Tree<K, V>>,
}
|
||||
|
||||
impl<'a, 't: 'a, K: Key, V: Value> TreeInitStruct<'t, K, V> {
    /// Initializes a new, empty tree inside `allocator`'s memory area.
    /// Called once at postmaster startup, before any other process or
    /// thread runs (see the module comment).
    pub fn new(allocator: &'t Allocator) -> TreeInitStruct<'t, K, V> {
        let tree = allocator.alloc(Tree {
            root: algorithm::new_root(allocator),
            writer_attached: AtomicBool::new(false),
            phantom_key: PhantomData,
        });

        TreeInitStruct { tree, allocator }
    }

    /// Claims write access to the tree. Panics if a writer has already
    /// attached: at most one process may hold write access.
    pub fn attach_writer(self) -> TreeWriteAccess<'t, K, V> {
        let previously_attached = self.tree.writer_attached.swap(true, Ordering::Relaxed);
        if previously_attached {
            panic!("writer already attached");
        }
        TreeWriteAccess {
            tree: self.tree,
            allocator: self.allocator,
        }
    }

    /// Grants read-only access; any number of readers may attach.
    pub fn attach_reader(self) -> TreeReadAccess<'t, K, V> {
        TreeReadAccess { tree: self.tree }
    }
}
|
||||
|
||||
impl<'t, K: Key + Clone, V: Value> TreeWriteAccess<'t, K, V> {
    /// Begins a write operation, pinning the current epoch for its duration.
    pub fn start_write(&'t self) -> TreeWriteGuard<'t, K, V> {
        // TODO: grab epoch guard
        TreeWriteGuard {
            allocator: self.allocator,
            tree: &self.tree,
            epoch_pin: epoch::pin_epoch(),
        }
    }

    /// Begins a read operation from the writer process.
    pub fn start_read(&'t self) -> TreeReadGuard<'t, K, V> {
        TreeReadGuard {
            tree: &self.tree,
            epoch_pin: epoch::pin_epoch(),
        }
    }
}

impl<'t, K: Key + Clone, V: Value> TreeReadAccess<'t, K, V> {
    /// Begins a read operation, pinning the current epoch for its duration.
    pub fn start_read(&'t self) -> TreeReadGuard<'t, K, V> {
        TreeReadGuard {
            tree: &self.tree,
            epoch_pin: epoch::pin_epoch(),
        }
    }
}
|
||||
|
||||
/// Guard for one read operation; holds the epoch pin that keeps concurrently
/// replaced nodes alive while the lookup runs.
pub struct TreeReadGuard<'t, K, V>
where
    K: Key,
    V: Value,
{
    tree: &'t AllocatedBox<'t, Tree<K, V>>,

    epoch_pin: EpochPin,
}

impl<'t, K: Key, V: Value> TreeReadGuard<'t, K, V> {
    /// Looks up `key`, returning a clone of the stored value, if any.
    pub fn get(&self, key: &K) -> Option<V> {
        algorithm::search(key, self.tree.root, &self.epoch_pin)
    }
}

/// Guard for one write operation; holds the epoch pin and the allocator
/// needed for inserts that create new nodes.
pub struct TreeWriteGuard<'t, K, V>
where
    K: Key,
    V: Value,
{
    tree: &'t AllocatedBox<'t, Tree<K, V>>,
    allocator: &'t Allocator,

    epoch_pin: EpochPin,
}

impl<'t, K: Key, V: Value> TreeWriteGuard<'t, K, V> {
    /// Inserts `value` under `key`, overwriting any existing value.
    pub fn insert(&mut self, key: &K, value: V) {
        self.update_with_fn(key, |_| Some(value))
    }

    /// Read-modify-write: `value_fn` receives the current value (if any) and
    /// returns the value to store.
    pub fn update_with_fn<F>(&mut self, key: &K, value_fn: F)
    where
        F: FnOnce(Option<&V>) -> Option<V>,
    {
        algorithm::update_fn(
            key,
            value_fn,
            self.tree.root,
            self.allocator,
            &self.epoch_pin,
        )
    }

    /// Looks up `key`, returning a clone of the stored value, if any.
    pub fn get(&mut self, key: &K) -> Option<V> {
        algorithm::search(key, self.tree.root, &self.epoch_pin)
    }
}

impl<'t, K: Key, V: Value + Debug> TreeWriteGuard<'t, K, V> {
    /// Debugging aid: prints the whole tree structure.
    pub fn dump(&mut self) {
        algorithm::dump_tree(self.tree.root, &self.epoch_pin)
    }
}
|
||||
90
libs/neonart/src/tests.rs
Normal file
90
libs/neonart/src/tests.rs
Normal file
@@ -0,0 +1,90 @@
|
||||
use std::collections::HashSet;
|
||||
|
||||
use crate::Allocator;
|
||||
use crate::TreeInitStruct;
|
||||
|
||||
use crate::{Key, Value};
|
||||
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::thread_rng;
|
||||
|
||||
const TEST_KEY_LEN: usize = 16;
|
||||
|
||||
// 16-byte test key (same size as a u128), matching the tree's
// fixed-length-key requirement.
#[derive(Clone, Copy, Debug)]
struct TestKey([u8; TEST_KEY_LEN]);

impl Key for TestKey {
    const KEY_LEN: usize = TEST_KEY_LEN;

    fn as_bytes(&self) -> &[u8] {
        &self.0
    }
}

impl From<u128> for TestKey {
    // Big-endian, so numerically-close keys share a byte prefix, which
    // exercises the tree's prefix handling.
    fn from(val: u128) -> TestKey {
        TestKey(val.to_be_bytes())
    }
}

impl Value for usize {}
|
||||
|
||||
/// Shared test driver: inserts `keys` into a fresh tree (value = insertion
/// index), then verifies every key reads back its index.
fn test_inserts<K: Into<TestKey> + Copy>(keys: &[K]) {
    const MEM_SIZE: usize = 10000000;
    // Leak the backing area and the allocator to obtain the 'static
    // lifetimes the API requires. Fine in tests; freed at process exit.
    let area = Box::leak(Box::new_uninit_slice(MEM_SIZE));

    let allocator = Box::leak(Box::new(Allocator::new_uninit(area)));

    let init_struct = TreeInitStruct::<TestKey, usize>::new(allocator);
    let tree_writer = init_struct.attach_writer();

    for (idx, k) in keys.iter().enumerate() {
        let mut w = tree_writer.start_write();
        w.insert(&(*k).into(), idx);
        eprintln!("INSERTED {:?}", Into::<TestKey>::into(*k));
    }

    //tree_writer.start_read().dump();

    for (idx, k) in keys.iter().enumerate() {
        let r = tree_writer.start_read();
        let value = r.get(&(*k).into());
        assert_eq!(value, Some(idx));
    }
}
|
||||
|
||||
#[test]
fn dense() {
    // This exercises splitting a node with prefix
    let keys: &[u128] = &[0, 1, 2, 3, 256];
    test_inserts(keys);

    // Dense keys
    let mut keys: Vec<u128> = (0..10000).collect();
    test_inserts(&keys);

    // Do the same in random orders
    for _ in 1..10 {
        keys.shuffle(&mut thread_rng());
        test_inserts(&keys);
    }
}
|
||||
|
||||
#[test]
fn sparse() {
    // Random (sparse) keys, deduplicated so every key is unique.
    let mut keys: Vec<TestKey> = Vec::new();
    let mut used_keys = HashSet::new();
    while keys.len() < 10000 {
        let key = rand::random::<u128>();
        // HashSet::insert returns false if the key was already present;
        // this replaces the previous get().is_some() + insert() pair with
        // a single idiomatic membership-check-and-insert.
        if used_keys.insert(key) {
            keys.push(key.into());
        }
    }
    test_inserts(&keys);
}
|
||||
@@ -42,12 +42,14 @@ nix.workspace = true
|
||||
num_cpus.workspace = true
|
||||
num-traits.workspace = true
|
||||
once_cell.workspace = true
|
||||
peekable.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
postgres_backend.workspace = true
|
||||
postgres-protocol.workspace = true
|
||||
postgres-types.workspace = true
|
||||
postgres_initdb.workspace = true
|
||||
pprof.workspace = true
|
||||
prost.workspace = true
|
||||
rand.workspace = true
|
||||
range-set-blaze = { version = "0.1.16", features = ["alloc"] }
|
||||
regex.workspace = true
|
||||
@@ -60,6 +62,7 @@ serde_path_to_error.workspace = true
|
||||
serde_with.workspace = true
|
||||
sysinfo.workspace = true
|
||||
tokio-tar.workspace = true
|
||||
tonic.workspace = true
|
||||
thiserror.workspace = true
|
||||
tikv-jemallocator.workspace = true
|
||||
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
|
||||
@@ -76,6 +79,7 @@ url.workspace = true
|
||||
walkdir.workspace = true
|
||||
metrics.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
pageserver_data_api.workspace = true
|
||||
pageserver_client.workspace = true # for ResponseErrorMessageExt TOOD refactor that
|
||||
pageserver_compaction.workspace = true
|
||||
pem.workspace = true
|
||||
|
||||
13
pageserver/client_grpc/Cargo.toml
Normal file
13
pageserver/client_grpc/Cargo.toml
Normal file
@@ -0,0 +1,13 @@
|
||||
[package]
|
||||
name = "pageserver_client_grpc"
|
||||
version = "0.1.0"
|
||||
edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
bytes.workspace = true
|
||||
http.workspace = true
|
||||
thiserror.workspace = true
|
||||
tonic.workspace = true
|
||||
tracing.workspace = true
|
||||
|
||||
pageserver_data_api.workspace = true
|
||||
221
pageserver/client_grpc/src/lib.rs
Normal file
221
pageserver/client_grpc/src/lib.rs
Normal file
@@ -0,0 +1,221 @@
|
||||
//! Pageserver Data API client
|
||||
//!
|
||||
//! - Manage connections to pageserver
|
||||
//! - Send requests to correct shards
|
||||
//!
|
||||
use std::collections::HashMap;
|
||||
use std::sync::RwLock;
|
||||
|
||||
use bytes::Bytes;
|
||||
use http;
|
||||
use thiserror::Error;
|
||||
use tonic;
|
||||
use tonic::metadata::AsciiMetadataValue;
|
||||
use tonic::transport::Channel;
|
||||
|
||||
use pageserver_data_api::model::*;
|
||||
use pageserver_data_api::proto;
|
||||
|
||||
type Shardno = u16;
|
||||
|
||||
use pageserver_data_api::client::PageServiceClient;
|
||||
|
||||
type MyPageServiceClient = pageserver_data_api::client::PageServiceClient<
|
||||
tonic::service::interceptor::InterceptedService<tonic::transport::Channel, AuthInterceptor>,
|
||||
>;
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum PageserverClientError {
|
||||
#[error("could not connect to service: {0}")]
|
||||
ConnectError(#[from] tonic::transport::Error),
|
||||
#[error("could not perform request: {0}`")]
|
||||
RequestError(#[from] tonic::Status),
|
||||
|
||||
#[error("could not perform request: {0}`")]
|
||||
InvalidUri(#[from] http::uri::InvalidUri),
|
||||
}
|
||||
|
||||
/// Client for the pageserver data API, routing requests to the proper shard
/// over gRPC.
pub struct PageserverClient {
    _tenant_id: String,
    _timeline_id: String,

    _auth_token: Option<String>,

    // Shard number -> endpoint URI. NOTE(review): appears static after
    // construction; `new` documents that it does not react to shard map
    // changes.
    shard_map: HashMap<Shardno, String>,

    // Lazily-populated gRPC channels, one per shard.
    channels: RwLock<HashMap<Shardno, Channel>>,

    // Interceptor carrying tenant/timeline/auth info for outgoing requests
    // (constructed from those fields in `new`).
    auth_interceptor: AuthInterceptor,
}
|
||||
|
||||
impl PageserverClient {
|
||||
/// TODO: this doesn't currently react to changes in the shard map.
|
||||
pub fn new(
|
||||
tenant_id: &str,
|
||||
timeline_id: &str,
|
||||
auth_token: &Option<String>,
|
||||
shard_map: HashMap<Shardno, String>,
|
||||
) -> Self {
|
||||
Self {
|
||||
_tenant_id: tenant_id.to_string(),
|
||||
_timeline_id: timeline_id.to_string(),
|
||||
_auth_token: auth_token.clone(),
|
||||
shard_map,
|
||||
channels: RwLock::new(HashMap::new()),
|
||||
auth_interceptor: AuthInterceptor::new(tenant_id, timeline_id, auth_token.as_ref()),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn process_rel_exists_request(
|
||||
&self,
|
||||
request: &RelExistsRequest,
|
||||
) -> Result<bool, PageserverClientError> {
|
||||
// Current sharding model assumes that all metadata is present only at shard 0.
|
||||
let shard_no = 0;
|
||||
|
||||
let mut client = self.get_client(shard_no).await?;
|
||||
|
||||
let request = proto::RelExistsRequest::from(request);
|
||||
let response = client.rel_exists(tonic::Request::new(request)).await?;
|
||||
|
||||
Ok(response.get_ref().exists)
|
||||
}
|
||||
|
||||
pub async fn process_rel_size_request(
|
||||
&self,
|
||||
request: &RelSizeRequest,
|
||||
) -> Result<u32, PageserverClientError> {
|
||||
// Current sharding model assumes that all metadata is present only at shard 0.
|
||||
let shard_no = 0;
|
||||
|
||||
let mut client = self.get_client(shard_no).await?;
|
||||
|
||||
let request = proto::RelSizeRequest::from(request);
|
||||
let response = client.rel_size(tonic::Request::new(request)).await?;
|
||||
|
||||
Ok(response.get_ref().num_blocks)
|
||||
}
|
||||
|
||||
pub async fn get_page(&self, request: &GetPageRequest) -> Result<Bytes, PageserverClientError> {
|
||||
// FIXME: calculate the shard number correctly
|
||||
let shard_no = 0;
|
||||
|
||||
let mut client = self.get_client(shard_no).await?;
|
||||
|
||||
let request = proto::GetPageRequest::from(request);
|
||||
let response = client.get_page(tonic::Request::new(request)).await?;
|
||||
|
||||
Ok(response.into_inner().page_image)
|
||||
}
|
||||
|
||||
/// Process a request to get the size of a database.
|
||||
pub async fn process_dbsize_request(
|
||||
&self,
|
||||
request: &DbSizeRequest,
|
||||
) -> Result<u64, PageserverClientError> {
|
||||
// Current sharding model assumes that all metadata is present only at shard 0.
|
||||
let shard_no = 0;
|
||||
|
||||
let mut client = self.get_client(shard_no).await?;
|
||||
|
||||
let request = proto::DbSizeRequest::from(request);
|
||||
let response = client.db_size(tonic::Request::new(request)).await?;
|
||||
|
||||
Ok(response.get_ref().num_bytes)
|
||||
}
|
||||
|
||||
/// Process a request to get the size of a database.
|
||||
pub async fn get_base_backup(
|
||||
&self,
|
||||
request: &GetBaseBackupRequest,
|
||||
gzip: bool,
|
||||
) -> std::result::Result<
|
||||
tonic::Response<tonic::codec::Streaming<proto::GetBaseBackupResponseChunk>>,
|
||||
PageserverClientError,
|
||||
> {
|
||||
// Current sharding model assumes that all metadata is present only at shard 0.
|
||||
let shard_no = 0;
|
||||
|
||||
let mut client = self.get_client(shard_no).await?;
|
||||
if gzip {
|
||||
client = client.accept_compressed(tonic::codec::CompressionEncoding::Gzip);
|
||||
}
|
||||
|
||||
let request = proto::GetBaseBackupRequest::from(request);
|
||||
let response = client.get_base_backup(tonic::Request::new(request)).await?;
|
||||
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
/// Get a client for given shard
|
||||
///
|
||||
/// This implements very basic caching. If we already have a client for the given shard,
|
||||
/// reuse it. If not, create a new client and put it to the cache.
|
||||
async fn get_client(
|
||||
&self,
|
||||
shard_no: u16,
|
||||
) -> Result<MyPageServiceClient, PageserverClientError> {
|
||||
let reused_channel: Option<Channel> = {
|
||||
let channels = self.channels.read().unwrap();
|
||||
|
||||
channels.get(&shard_no).cloned()
|
||||
};
|
||||
|
||||
let channel = if let Some(reused_channel) = reused_channel {
|
||||
reused_channel
|
||||
} else {
|
||||
let endpoint: tonic::transport::Endpoint = self
|
||||
.shard_map
|
||||
.get(&shard_no)
|
||||
.expect("no url for shard {shard_no}")
|
||||
.parse()?;
|
||||
let channel = endpoint.connect().await?;
|
||||
|
||||
// Insert it to the cache so that it can be reused on subsequent calls. It's possible
|
||||
// that another thread did the same concurrently, in which case we will overwrite the
|
||||
// client in the cache.
|
||||
{
|
||||
let mut channels = self.channels.write().unwrap();
|
||||
channels.insert(shard_no, channel.clone());
|
||||
}
|
||||
channel
|
||||
};
|
||||
|
||||
let client = PageServiceClient::with_interceptor(channel, self.auth_interceptor.clone());
|
||||
Ok(client)
|
||||
}
|
||||
}
|
||||
|
||||
/// Inject tenant_id, timeline_id and authentication token to all pageserver requests.
|
||||
#[derive(Clone)]
|
||||
struct AuthInterceptor {
|
||||
tenant_id: AsciiMetadataValue,
|
||||
timeline_id: AsciiMetadataValue,
|
||||
|
||||
auth_token: Option<AsciiMetadataValue>,
|
||||
}
|
||||
|
||||
impl AuthInterceptor {
|
||||
fn new(tenant_id: &str, timeline_id: &str, auth_token: Option<&String>) -> Self {
|
||||
Self {
|
||||
tenant_id: tenant_id.parse().expect("could not parse tenant id"),
|
||||
timeline_id: timeline_id.parse().expect("could not parse timeline id"),
|
||||
auth_token: auth_token.map(|x| x.parse().expect("could not parse auth token")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl tonic::service::Interceptor for AuthInterceptor {
|
||||
fn call(&mut self, mut req: tonic::Request<()>) -> Result<tonic::Request<()>, tonic::Status> {
|
||||
req.metadata_mut()
|
||||
.insert("neon-tenant-id", self.tenant_id.clone());
|
||||
req.metadata_mut()
|
||||
.insert("neon-timeline-id", self.timeline_id.clone());
|
||||
if let Some(auth_token) = &self.auth_token {
|
||||
req.metadata_mut()
|
||||
.insert("neon-auth-token", auth_token.clone());
|
||||
}
|
||||
|
||||
Ok(req)
|
||||
}
|
||||
}
|
||||
18
pageserver/data_api/Cargo.toml
Normal file
18
pageserver/data_api/Cargo.toml
Normal file
@@ -0,0 +1,18 @@
|
||||
[package]
|
||||
name = "pageserver_data_api"
|
||||
version = "0.1.0"
|
||||
edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
|
||||
# For Lsn.
|
||||
#
|
||||
# TODO: move Lsn to separate crate? This draws in a lot more dependencies
|
||||
utils.workspace = true
|
||||
|
||||
prost.workspace = true
|
||||
thiserror.workspace = true
|
||||
tonic.workspace = true
|
||||
|
||||
[build-dependencies]
|
||||
tonic-build.workspace = true
|
||||
8
pageserver/data_api/build.rs
Normal file
8
pageserver/data_api/build.rs
Normal file
@@ -0,0 +1,8 @@
|
||||
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
// Generate rust code from .proto protobuf.
|
||||
tonic_build::configure()
|
||||
.bytes(&["."])
|
||||
.compile_protos(&["proto/page_service.proto"], &["proto"])
|
||||
.unwrap_or_else(|e| panic!("failed to compile protos {:?}", e));
|
||||
Ok(())
|
||||
}
|
||||
84
pageserver/data_api/proto/page_service.proto
Normal file
84
pageserver/data_api/proto/page_service.proto
Normal file
@@ -0,0 +1,84 @@
|
||||
// Page service presented by pageservers, for computes
|
||||
//
|
||||
// Each request must come with the following metadata:
|
||||
// - neon-tenant-id
|
||||
// - neon-timeline-id
|
||||
// - neon-auth-token (if auth is enabled)
|
||||
//
|
||||
// TODO: what else? Priority? OpenTelemetry tracing?
|
||||
//
|
||||
|
||||
syntax = "proto3";
|
||||
package page_service;
|
||||
|
||||
service PageService {
|
||||
rpc RelExists(RelExistsRequest) returns (RelExistsResponse);
|
||||
|
||||
// Returns size of a relation, as # of blocks
|
||||
rpc RelSize (RelSizeRequest) returns (RelSizeResponse);
|
||||
|
||||
rpc GetPage (GetPageRequest) returns (GetPageResponse);
|
||||
|
||||
// Returns total size of a database, as # of bytes
|
||||
rpc DbSize (DbSizeRequest) returns (DbSizeResponse);
|
||||
|
||||
rpc GetBaseBackup (GetBaseBackupRequest) returns (stream GetBaseBackupResponseChunk);
|
||||
}
|
||||
|
||||
message RequestCommon {
|
||||
uint64 request_lsn = 1;
|
||||
uint64 not_modified_since_lsn = 2;
|
||||
}
|
||||
|
||||
message RelTag {
|
||||
uint32 spc_oid = 1;
|
||||
uint32 db_oid = 2;
|
||||
uint32 rel_number = 3;
|
||||
uint32 fork_number = 4;
|
||||
}
|
||||
|
||||
message RelExistsRequest {
|
||||
RequestCommon common = 1;
|
||||
RelTag rel = 2;
|
||||
}
|
||||
|
||||
message RelExistsResponse {
|
||||
bool exists = 1;
|
||||
}
|
||||
|
||||
message RelSizeRequest {
|
||||
RequestCommon common = 1;
|
||||
RelTag rel = 2;
|
||||
}
|
||||
|
||||
message RelSizeResponse {
|
||||
uint32 num_blocks = 1;
|
||||
}
|
||||
|
||||
message GetPageRequest {
|
||||
RequestCommon common = 1;
|
||||
RelTag rel = 2;
|
||||
uint32 block_number = 3;
|
||||
}
|
||||
|
||||
message GetPageResponse {
|
||||
bytes page_image = 1;
|
||||
}
|
||||
|
||||
message DbSizeRequest {
|
||||
RequestCommon common = 1;
|
||||
uint32 db_oid = 2;
|
||||
}
|
||||
|
||||
message DbSizeResponse {
|
||||
uint64 num_bytes = 1;
|
||||
}
|
||||
|
||||
message GetBaseBackupRequest {
|
||||
RequestCommon common = 1;
|
||||
bool replica = 2;
|
||||
}
|
||||
|
||||
message GetBaseBackupResponseChunk {
|
||||
bytes chunk = 1;
|
||||
}
|
||||
17
pageserver/data_api/src/lib.rs
Normal file
17
pageserver/data_api/src/lib.rs
Normal file
@@ -0,0 +1,17 @@
|
||||
//! This crate has two modules related to the Pageserver Data API:
|
||||
//!
|
||||
//! proto: code auto-generated from the protobuf definition
|
||||
//! model: slightly more ergonomic structs representing the same API
|
||||
//!
|
||||
//! See protobuf spec under the protos/ subdirectory.
|
||||
//!
|
||||
//! This crate is used by both the client and the server. Try to keep it slim.
|
||||
//!
|
||||
pub mod model;
|
||||
|
||||
// Code generated by protobuf.
|
||||
pub mod proto {
|
||||
tonic::include_proto!("page_service");
|
||||
}
|
||||
|
||||
pub use proto::page_service_client as client;
|
||||
239
pageserver/data_api/src/model.rs
Normal file
239
pageserver/data_api/src/model.rs
Normal file
@@ -0,0 +1,239 @@
|
||||
//! Structs representing the API
|
||||
//!
|
||||
//! These mirror the pageserver APIs and the structs automatically generated
|
||||
//! from the protobuf specification. The differences are:
|
||||
//!
|
||||
//! - Types that are in fact required by the API are not Options. The protobuf "required"
|
||||
//! attribute is deprecated and 'prost' marks a lot of members as optional because of that.
|
||||
//! (See https://github.com/tokio-rs/prost/issues/800 for a gripe on this)
|
||||
//!
|
||||
//! - Use more precise datatypes, e.g. Lsn and uints shorter than 32 bits.
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::proto;
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct RequestCommon {
|
||||
pub request_lsn: Lsn,
|
||||
pub not_modified_since_lsn: Lsn,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Eq, PartialEq, Hash, PartialOrd, Ord)]
|
||||
pub struct RelTag {
|
||||
pub spc_oid: u32,
|
||||
pub db_oid: u32,
|
||||
pub rel_number: u32,
|
||||
pub fork_number: u8,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct RelExistsRequest {
|
||||
pub common: RequestCommon,
|
||||
pub rel: RelTag,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct RelSizeRequest {
|
||||
pub common: RequestCommon,
|
||||
pub rel: RelTag,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct RelSizeResponse {
|
||||
pub num_blocks: u32,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct GetPageRequest {
|
||||
pub common: RequestCommon,
|
||||
pub rel: RelTag,
|
||||
pub block_number: u32,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct GetPageResponse {
|
||||
pub page_image: std::vec::Vec<u8>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct DbSizeRequest {
|
||||
pub common: RequestCommon,
|
||||
pub db_oid: u32,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct DbSizeResponse {
|
||||
pub num_bytes: u64,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct GetBaseBackupRequest {
|
||||
pub common: RequestCommon,
|
||||
pub replica: bool,
|
||||
}
|
||||
|
||||
//--- Conversions to/from the generated proto types
|
||||
|
||||
use thiserror::Error;
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum ProtocolError {
|
||||
#[error("the value for field `{0}` is invalid")]
|
||||
InvalidValue(&'static str),
|
||||
#[error("the required field `{0}` is missing ")]
|
||||
Missing(&'static str),
|
||||
}
|
||||
|
||||
impl From<ProtocolError> for tonic::Status {
|
||||
fn from(e: ProtocolError) -> Self {
|
||||
match e {
|
||||
ProtocolError::InvalidValue(_field) => tonic::Status::invalid_argument(e.to_string()),
|
||||
ProtocolError::Missing(_field) => tonic::Status::invalid_argument(e.to_string()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&RelTag> for proto::RelTag {
|
||||
fn from(value: &RelTag) -> proto::RelTag {
|
||||
proto::RelTag {
|
||||
spc_oid: value.spc_oid,
|
||||
db_oid: value.db_oid,
|
||||
rel_number: value.rel_number,
|
||||
fork_number: value.fork_number as u32,
|
||||
}
|
||||
}
|
||||
}
|
||||
impl TryFrom<&proto::RelTag> for RelTag {
|
||||
type Error = ProtocolError;
|
||||
|
||||
fn try_from(value: &proto::RelTag) -> Result<RelTag, ProtocolError> {
|
||||
Ok(RelTag {
|
||||
spc_oid: value.spc_oid,
|
||||
db_oid: value.db_oid,
|
||||
rel_number: value.rel_number,
|
||||
fork_number: value
|
||||
.fork_number
|
||||
.try_into()
|
||||
.or(Err(ProtocolError::InvalidValue("fork_number")))?,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&RequestCommon> for proto::RequestCommon {
|
||||
fn from(value: &RequestCommon) -> proto::RequestCommon {
|
||||
proto::RequestCommon {
|
||||
request_lsn: value.request_lsn.into(),
|
||||
not_modified_since_lsn: value.not_modified_since_lsn.into(),
|
||||
}
|
||||
}
|
||||
}
|
||||
impl From<&proto::RequestCommon> for RequestCommon {
|
||||
fn from(value: &proto::RequestCommon) -> RequestCommon {
|
||||
RequestCommon {
|
||||
request_lsn: value.request_lsn.into(),
|
||||
not_modified_since_lsn: value.not_modified_since_lsn.into(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&RelExistsRequest> for proto::RelExistsRequest {
|
||||
fn from(value: &RelExistsRequest) -> proto::RelExistsRequest {
|
||||
proto::RelExistsRequest {
|
||||
common: Some((&value.common).into()),
|
||||
rel: Some((&value.rel).into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
impl TryFrom<&proto::RelExistsRequest> for RelExistsRequest {
|
||||
type Error = ProtocolError;
|
||||
|
||||
fn try_from(value: &proto::RelExistsRequest) -> Result<RelExistsRequest, ProtocolError> {
|
||||
Ok(RelExistsRequest {
|
||||
common: (&value.common.ok_or(ProtocolError::Missing("common"))?).into(),
|
||||
rel: (&value.rel.ok_or(ProtocolError::Missing("rel"))?).try_into()?,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&RelSizeRequest> for proto::RelSizeRequest {
|
||||
fn from(value: &RelSizeRequest) -> proto::RelSizeRequest {
|
||||
proto::RelSizeRequest {
|
||||
common: Some((&value.common).into()),
|
||||
rel: Some((&value.rel).into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
impl TryFrom<&proto::RelSizeRequest> for RelSizeRequest {
|
||||
type Error = ProtocolError;
|
||||
|
||||
fn try_from(value: &proto::RelSizeRequest) -> Result<RelSizeRequest, ProtocolError> {
|
||||
Ok(RelSizeRequest {
|
||||
common: (&value.common.ok_or(ProtocolError::Missing("common"))?).into(),
|
||||
rel: (&value.rel.ok_or(ProtocolError::Missing("rel"))?).try_into()?,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&GetPageRequest> for proto::GetPageRequest {
|
||||
fn from(value: &GetPageRequest) -> proto::GetPageRequest {
|
||||
proto::GetPageRequest {
|
||||
common: Some((&value.common).into()),
|
||||
rel: Some((&value.rel).into()),
|
||||
block_number: value.block_number,
|
||||
}
|
||||
}
|
||||
}
|
||||
impl TryFrom<&proto::GetPageRequest> for GetPageRequest {
|
||||
type Error = ProtocolError;
|
||||
|
||||
fn try_from(value: &proto::GetPageRequest) -> Result<GetPageRequest, ProtocolError> {
|
||||
Ok(GetPageRequest {
|
||||
common: (&value.common.ok_or(ProtocolError::Missing("common"))?).into(),
|
||||
rel: (&value.rel.ok_or(ProtocolError::Missing("rel"))?).try_into()?,
|
||||
block_number: value.block_number,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&DbSizeRequest> for proto::DbSizeRequest {
|
||||
fn from(value: &DbSizeRequest) -> proto::DbSizeRequest {
|
||||
proto::DbSizeRequest {
|
||||
common: Some((&value.common).into()),
|
||||
db_oid: value.db_oid,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&proto::DbSizeRequest> for DbSizeRequest {
|
||||
type Error = ProtocolError;
|
||||
|
||||
fn try_from(value: &proto::DbSizeRequest) -> Result<DbSizeRequest, ProtocolError> {
|
||||
Ok(DbSizeRequest {
|
||||
common: (&value.common.ok_or(ProtocolError::Missing("common"))?).into(),
|
||||
db_oid: value.db_oid,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&GetBaseBackupRequest> for proto::GetBaseBackupRequest {
|
||||
fn from(value: &GetBaseBackupRequest) -> proto::GetBaseBackupRequest {
|
||||
proto::GetBaseBackupRequest {
|
||||
common: Some((&value.common).into()),
|
||||
replica: value.replica,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&proto::GetBaseBackupRequest> for GetBaseBackupRequest {
|
||||
type Error = ProtocolError;
|
||||
|
||||
fn try_from(
|
||||
value: &proto::GetBaseBackupRequest,
|
||||
) -> Result<GetBaseBackupRequest, ProtocolError> {
|
||||
Ok(GetBaseBackupRequest {
|
||||
common: (&value.common.ok_or(ProtocolError::Missing("common"))?).into(),
|
||||
replica: value.replica,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -23,6 +23,8 @@ tokio.workspace = true
|
||||
tokio-util.workspace = true
|
||||
|
||||
pageserver_client.workspace = true
|
||||
pageserver_client_grpc.workspace = true
|
||||
pageserver_data_api.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
utils = { path = "../../libs/utils/" }
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
|
||||
@@ -9,6 +9,9 @@ use anyhow::Context;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client::mgmt_api::ForceAwaitLogicalSize;
|
||||
use pageserver_client::page_service::BasebackupRequest;
|
||||
use pageserver_client_grpc;
|
||||
use pageserver_data_api::model::{GetBaseBackupRequest, RequestCommon};
|
||||
|
||||
use rand::prelude::*;
|
||||
use tokio::sync::Barrier;
|
||||
use tokio::task::JoinSet;
|
||||
@@ -22,6 +25,8 @@ use crate::util::{request_stats, tokio_thread_local_stats};
|
||||
/// basebackup@LatestLSN
|
||||
#[derive(clap::Parser)]
|
||||
pub(crate) struct Args {
|
||||
#[clap(long, default_value = "false")]
|
||||
grpc: bool,
|
||||
#[clap(long, default_value = "http://localhost:9898")]
|
||||
mgmt_api_endpoint: String,
|
||||
#[clap(long, default_value = "postgres://postgres@localhost:64000")]
|
||||
@@ -52,7 +57,7 @@ impl LiveStats {
|
||||
|
||||
struct Target {
|
||||
timeline: TenantTimelineId,
|
||||
lsn_range: Option<Range<Lsn>>,
|
||||
lsn_range: Range<Lsn>,
|
||||
}
|
||||
|
||||
#[derive(serde::Serialize)]
|
||||
@@ -105,7 +110,7 @@ async fn main_impl(
|
||||
anyhow::Ok(Target {
|
||||
timeline,
|
||||
// TODO: support lsn_range != latest LSN
|
||||
lsn_range: Some(info.last_record_lsn..(info.last_record_lsn + 1)),
|
||||
lsn_range: info.last_record_lsn..(info.last_record_lsn + 1),
|
||||
})
|
||||
}
|
||||
});
|
||||
@@ -149,14 +154,27 @@ async fn main_impl(
|
||||
for tl in &timelines {
|
||||
let (sender, receiver) = tokio::sync::mpsc::channel(1); // TODO: not sure what the implications of this are
|
||||
work_senders.insert(tl, sender);
|
||||
tasks.push(tokio::spawn(client(
|
||||
args,
|
||||
*tl,
|
||||
Arc::clone(&start_work_barrier),
|
||||
receiver,
|
||||
Arc::clone(&all_work_done_barrier),
|
||||
Arc::clone(&live_stats),
|
||||
)));
|
||||
|
||||
let client_task = if args.grpc {
|
||||
tokio::spawn(client_grpc(
|
||||
args,
|
||||
*tl,
|
||||
Arc::clone(&start_work_barrier),
|
||||
receiver,
|
||||
Arc::clone(&all_work_done_barrier),
|
||||
Arc::clone(&live_stats),
|
||||
))
|
||||
} else {
|
||||
tokio::spawn(client(
|
||||
args,
|
||||
*tl,
|
||||
Arc::clone(&start_work_barrier),
|
||||
receiver,
|
||||
Arc::clone(&all_work_done_barrier),
|
||||
Arc::clone(&live_stats),
|
||||
))
|
||||
};
|
||||
tasks.push(client_task);
|
||||
}
|
||||
|
||||
let work_sender = async move {
|
||||
@@ -165,7 +183,7 @@ async fn main_impl(
|
||||
let (timeline, work) = {
|
||||
let mut rng = rand::thread_rng();
|
||||
let target = all_targets.choose(&mut rng).unwrap();
|
||||
let lsn = target.lsn_range.clone().map(|r| rng.gen_range(r));
|
||||
let lsn = rng.gen_range(target.lsn_range.clone());
|
||||
(
|
||||
target.timeline,
|
||||
Work {
|
||||
@@ -215,7 +233,7 @@ async fn main_impl(
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
struct Work {
|
||||
lsn: Option<Lsn>,
|
||||
lsn: Lsn,
|
||||
gzip: bool,
|
||||
}
|
||||
|
||||
@@ -240,7 +258,7 @@ async fn client(
|
||||
.basebackup(&BasebackupRequest {
|
||||
tenant_id: timeline.tenant_id,
|
||||
timeline_id: timeline.timeline_id,
|
||||
lsn,
|
||||
lsn: Some(lsn),
|
||||
gzip,
|
||||
})
|
||||
.await
|
||||
@@ -270,3 +288,71 @@ async fn client(
|
||||
|
||||
all_work_done_barrier.wait().await;
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
async fn client_grpc(
|
||||
args: &'static Args,
|
||||
timeline: TenantTimelineId,
|
||||
start_work_barrier: Arc<Barrier>,
|
||||
mut work: tokio::sync::mpsc::Receiver<Work>,
|
||||
all_work_done_barrier: Arc<Barrier>,
|
||||
live_stats: Arc<LiveStats>,
|
||||
) {
|
||||
let shard_map = HashMap::from([(0, args.page_service_connstring.clone())]);
|
||||
let client = pageserver_client_grpc::PageserverClient::new(
|
||||
&timeline.tenant_id.to_string(),
|
||||
&timeline.timeline_id.to_string(),
|
||||
&None,
|
||||
shard_map,
|
||||
);
|
||||
|
||||
start_work_barrier.wait().await;
|
||||
|
||||
while let Some(Work { lsn, gzip }) = work.recv().await {
|
||||
let start = Instant::now();
|
||||
|
||||
//tokio::time::sleep(std::time::Duration::from_secs(1)).await;
|
||||
|
||||
info!("starting get_base_backup");
|
||||
let mut basebackup_stream = client
|
||||
.get_base_backup(
|
||||
&GetBaseBackupRequest {
|
||||
common: RequestCommon {
|
||||
request_lsn: lsn,
|
||||
not_modified_since_lsn: lsn,
|
||||
},
|
||||
replica: false,
|
||||
},
|
||||
gzip,
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("start basebackup for {timeline}"))
|
||||
.unwrap()
|
||||
.into_inner();
|
||||
|
||||
info!("starting receive");
|
||||
use futures::StreamExt;
|
||||
let mut size = 0;
|
||||
let mut nchunks = 0;
|
||||
while let Some(chunk) = basebackup_stream.next().await {
|
||||
let chunk = chunk
|
||||
.with_context(|| format!("error during basebackup"))
|
||||
.unwrap();
|
||||
size += chunk.chunk.len();
|
||||
nchunks += 1;
|
||||
}
|
||||
|
||||
info!(
|
||||
"basebackup size is {} bytes, avg chunk size {} bytes",
|
||||
size,
|
||||
size as f32 / nchunks as f32
|
||||
);
|
||||
let elapsed = start.elapsed();
|
||||
live_stats.inc();
|
||||
STATS.with(|stats| {
|
||||
stats.borrow().lock().unwrap().observe(elapsed).unwrap();
|
||||
});
|
||||
}
|
||||
|
||||
all_work_done_barrier.wait().await;
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use std::collections::{HashSet, VecDeque};
|
||||
use std::collections::{HashMap, HashSet, VecDeque};
|
||||
use std::future::Future;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::pin::Pin;
|
||||
@@ -8,6 +8,8 @@ use std::time::{Duration, Instant};
|
||||
|
||||
use anyhow::Context;
|
||||
use camino::Utf8PathBuf;
|
||||
use futures::StreamExt;
|
||||
use futures::stream::FuturesOrdered;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::keyspace::KeySpaceAccum;
|
||||
use pageserver_api::models::{PagestreamGetPageRequest, PagestreamRequest};
|
||||
@@ -25,6 +27,8 @@ use crate::util::{request_stats, tokio_thread_local_stats};
|
||||
/// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace.
|
||||
#[derive(clap::Parser)]
|
||||
pub(crate) struct Args {
|
||||
#[clap(long, default_value = "false")]
|
||||
grpc: bool,
|
||||
#[clap(long, default_value = "http://localhost:9898")]
|
||||
mgmt_api_endpoint: String,
|
||||
#[clap(long, default_value = "postgres://postgres@localhost:64000")]
|
||||
@@ -295,7 +299,29 @@ async fn main_impl(
|
||||
.unwrap();
|
||||
|
||||
Box::pin(async move {
|
||||
client_libpq(args, worker_id, ss, cancel, rps_period, ranges, weights).await
|
||||
if args.grpc {
|
||||
client_grpc(
|
||||
args,
|
||||
worker_id,
|
||||
ss,
|
||||
cancel,
|
||||
rps_period,
|
||||
ranges,
|
||||
weights,
|
||||
)
|
||||
.await
|
||||
} else {
|
||||
client_libpq(
|
||||
args,
|
||||
worker_id,
|
||||
ss,
|
||||
cancel,
|
||||
rps_period,
|
||||
ranges,
|
||||
weights,
|
||||
)
|
||||
.await
|
||||
}
|
||||
})
|
||||
};
|
||||
|
||||
@@ -434,3 +460,100 @@ async fn client_libpq(
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn client_grpc(
|
||||
args: &Args,
|
||||
worker_id: WorkerId,
|
||||
shared_state: Arc<SharedState>,
|
||||
cancel: CancellationToken,
|
||||
rps_period: Option<Duration>,
|
||||
ranges: Vec<KeyRange>,
|
||||
weights: rand::distributions::weighted::WeightedIndex<i128>,
|
||||
) {
|
||||
let shard_map = HashMap::from([(0, args.page_service_connstring.clone())]);
|
||||
let client = pageserver_client_grpc::PageserverClient::new(
|
||||
&worker_id.timeline.tenant_id.to_string(),
|
||||
&worker_id.timeline.timeline_id.to_string(),
|
||||
&None,
|
||||
shard_map,
|
||||
);
|
||||
let client = Arc::new(client);
|
||||
|
||||
shared_state.start_work_barrier.wait().await;
|
||||
let client_start = Instant::now();
|
||||
let mut ticks_processed = 0;
|
||||
let mut inflight = FuturesOrdered::new();
|
||||
while !cancel.is_cancelled() {
|
||||
// Detect if a request took longer than the RPS rate
|
||||
if let Some(period) = &rps_period {
|
||||
let periods_passed_until_now =
|
||||
usize::try_from(client_start.elapsed().as_micros() / period.as_micros()).unwrap();
|
||||
|
||||
if periods_passed_until_now > ticks_processed {
|
||||
shared_state
|
||||
.live_stats
|
||||
.missed((periods_passed_until_now - ticks_processed) as u64);
|
||||
}
|
||||
ticks_processed = periods_passed_until_now;
|
||||
}
|
||||
|
||||
while inflight.len() < args.queue_depth.get() {
|
||||
let start = Instant::now();
|
||||
let req = {
|
||||
let mut rng = rand::thread_rng();
|
||||
let r = &ranges[weights.sample(&mut rng)];
|
||||
let key: i128 = rng.gen_range(r.start..r.end);
|
||||
let key = Key::from_i128(key);
|
||||
assert!(key.is_rel_block_key());
|
||||
let (rel_tag, block_no) = key
|
||||
.to_rel_block()
|
||||
.expect("we filter non-rel-block keys out above");
|
||||
pageserver_data_api::model::GetPageRequest {
|
||||
common: pageserver_data_api::model::RequestCommon {
|
||||
request_lsn: if rng.gen_bool(args.req_latest_probability) {
|
||||
Lsn::MAX
|
||||
} else {
|
||||
r.timeline_lsn
|
||||
},
|
||||
not_modified_since_lsn: r.timeline_lsn,
|
||||
},
|
||||
rel: pageserver_data_api::model::RelTag {
|
||||
spc_oid: rel_tag.spcnode,
|
||||
db_oid: rel_tag.dbnode,
|
||||
rel_number: rel_tag.relnode,
|
||||
fork_number: rel_tag.forknum,
|
||||
},
|
||||
block_number: block_no,
|
||||
}
|
||||
};
|
||||
let client_clone = client.clone();
|
||||
let getpage_fut = async move {
|
||||
let result = client_clone.get_page(&req).await;
|
||||
(start, result)
|
||||
};
|
||||
inflight.push_back(getpage_fut);
|
||||
}
|
||||
|
||||
let (start, result) = inflight.next().await.unwrap();
|
||||
result.expect("getpage request should succeed");
|
||||
let end = Instant::now();
|
||||
shared_state.live_stats.request_done();
|
||||
ticks_processed += 1;
|
||||
STATS.with(|stats| {
|
||||
stats
|
||||
.borrow()
|
||||
.lock()
|
||||
.unwrap()
|
||||
.observe(end.duration_since(start))
|
||||
.unwrap();
|
||||
});
|
||||
|
||||
if let Some(period) = &rps_period {
|
||||
let next_at = client_start
|
||||
+ Duration::from_micros(
|
||||
(ticks_processed) as u64 * u64::try_from(period.as_micros()).unwrap(),
|
||||
);
|
||||
tokio::time::sleep_until(next_at.into()).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -151,10 +151,14 @@ where
|
||||
.map_err(|_| BasebackupError::Shutdown)?,
|
||||
),
|
||||
};
|
||||
basebackup
|
||||
let res = basebackup
|
||||
.send_tarball()
|
||||
.instrument(info_span!("send_tarball", backup_lsn=%backup_lsn))
|
||||
.await
|
||||
.await;
|
||||
|
||||
info!("basebackup done!");
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
/// This is short-living object only for the time of tarball creation,
|
||||
|
||||
@@ -16,6 +16,7 @@ use http_utils::tls_certs::ReloadingCertificateResolver;
|
||||
use metrics::launch_timestamp::{LaunchTimestamp, set_launch_timestamp_metric};
|
||||
use metrics::set_build_info_metric;
|
||||
use nix::sys::socket::{setsockopt, sockopt};
|
||||
use pageserver::compute_service;
|
||||
use pageserver::config::{PageServerConf, PageserverIdentity, ignored_fields};
|
||||
use pageserver::controller_upcall_client::StorageControllerUpcallClient;
|
||||
use pageserver::deletion_queue::DeletionQueue;
|
||||
@@ -27,7 +28,7 @@ use pageserver::task_mgr::{
|
||||
use pageserver::tenant::{TenantSharedResources, mgr, secondary};
|
||||
use pageserver::{
|
||||
CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, HttpsEndpointListener, http,
|
||||
page_cache, page_service, task_mgr, virtual_file,
|
||||
page_cache, task_mgr, virtual_file,
|
||||
};
|
||||
use postgres_backend::AuthType;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
@@ -745,7 +746,7 @@ fn start_pageserver(
|
||||
// Spawn a task to listen for libpq connections. It will spawn further tasks
|
||||
// for each connection. We created the listener earlier already.
|
||||
let perf_trace_dispatch = otel_guard.as_ref().map(|g| g.dispatch.clone());
|
||||
let page_service = page_service::spawn(
|
||||
let compute_service = compute_service::spawn(
|
||||
conf,
|
||||
tenant_manager.clone(),
|
||||
pg_auth,
|
||||
@@ -782,7 +783,7 @@ fn start_pageserver(
|
||||
pageserver::shutdown_pageserver(
|
||||
http_endpoint_listener,
|
||||
https_endpoint_listener,
|
||||
page_service,
|
||||
compute_service,
|
||||
consumption_metrics_tasks,
|
||||
disk_usage_eviction_task,
|
||||
&tenant_manager,
|
||||
|
||||
286
pageserver/src/compute_service.rs
Normal file
286
pageserver/src/compute_service.rs
Normal file
@@ -0,0 +1,286 @@
|
||||
//!
|
||||
//! The Compute Service listens for compute connections, and serves requests like
|
||||
//! the GetPage@LSN requests.
|
||||
//!
|
||||
//! We support two protocols:
|
||||
//!
|
||||
//! 1. Legacy, connection-oriented libpq based protocol. That's
|
||||
//! handled by the code in page_service.rs.
|
||||
//!
|
||||
//! 2. gRPC based protocol. See compute_service_grpc.rs.
|
||||
//!
|
||||
//! To make the transition smooth, without having to open up new firewall ports
|
||||
//! etc, both protocols are served on the same port. When a new TCP connection
|
||||
//! is accepted, we peek at the first few bytes incoming from the client to
|
||||
//! determine which protocol it speaks.
|
||||
//!
|
||||
//! TODO: This gets easier once we drop the legacy protocol support. Or if we
|
||||
//! open a separate port for them.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::Context;
|
||||
use futures::FutureExt;
|
||||
use pageserver_api::config::PageServicePipeliningConfig;
|
||||
use postgres_backend::AuthType;
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::auth::SwappableJwtAuth;
|
||||
use utils::sync::gate::{Gate, GateGuard};
|
||||
|
||||
use crate::compute_service_grpc::launch_compute_service_grpc_server;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder};
|
||||
use crate::page_service::libpq_page_service_conn_main;
|
||||
use crate::task_mgr::{self, COMPUTE_REQUEST_RUNTIME, TaskKind};
|
||||
use crate::tenant::mgr::TenantManager;
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
pub type ConnectionHandlerResult = anyhow::Result<()>;
|
||||
|
||||
pub struct Connections {
|
||||
cancel: CancellationToken,
|
||||
tasks: tokio::task::JoinSet<ConnectionHandlerResult>,
|
||||
gate: Gate,
|
||||
}
|
||||
|
||||
impl Connections {
    /// Shut down all connection handler tasks and wait for them to finish.
    ///
    /// Order matters: first signal cancellation, then join every task (logging
    /// errors and panics), and only then close the gate — the gate can only be
    /// closed once all guard holders (the handler tasks) are gone.
    pub(crate) async fn shutdown(self) {
        let Self {
            cancel,
            mut tasks,
            gate,
        } = self;
        cancel.cancel();
        while let Some(res) = tasks.join_next().await {
            Self::handle_connection_completion(res);
        }
        gate.close().await;
    }

    /// Log the outcome of a finished connection handler task.
    ///
    /// The outer `Result` is the JoinSet's join result (Err = the task
    /// panicked); the inner one is the handler's own result.
    fn handle_connection_completion(res: Result<anyhow::Result<()>, tokio::task::JoinError>) {
        match res {
            Ok(Ok(())) => {}
            Ok(Err(e)) => error!("error in page_service connection task: {:?}", e),
            Err(e) => error!("page_service connection task panicked: {:?}", e),
        }
    }
}
|
||||
|
||||
/// Handle on the listener task spawned by [`spawn`].
pub struct Listener {
    /// Cancel the listener task through this token to shut down the listener;
    /// the task then yields the set of existing connections.
    cancel: CancellationToken,
    /// The listener task itself. Resolves to the still-open [`Connections`]
    /// once cancelled via `cancel`.
    task: JoinHandle<Connections>,
}
|
||||
|
||||
/// Spawn the compute connection listener task on the compute-request runtime.
///
/// The task accepts connections on `tcp_listener` and serves both the legacy
/// libpq protocol and the gRPC protocol on the same port. Use the returned
/// [`Listener`] to stop accepting and to shut down open connections.
pub fn spawn(
    conf: &'static PageServerConf,
    tenant_manager: Arc<TenantManager>,
    pg_auth: Option<Arc<SwappableJwtAuth>>,
    perf_trace_dispatch: Option<Dispatch>,
    tcp_listener: tokio::net::TcpListener,
    tls_config: Option<Arc<rustls::ServerConfig>>,
) -> Listener {
    let cancel = CancellationToken::new();
    let libpq_ctx = RequestContext::todo_child(
        TaskKind::LibpqEndpointListener,
        // listener task shouldn't need to download anything. (We will
        // create a separate sub-contexts for each connection, with their
        // own download behavior. This context is used only to listen and
        // accept connections.)
        DownloadBehavior::Error,
    );

    // exit_on_panic_or_error: a dead listener is unrecoverable, so take the
    // whole process down rather than run without accepting connections.
    let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
        "compute connection listener",
        compute_connection_listener_main(
            conf,
            tenant_manager,
            pg_auth,
            perf_trace_dispatch,
            tcp_listener,
            conf.pg_auth_type,
            tls_config,
            conf.page_service_pipelining.clone(),
            libpq_ctx,
            cancel.clone(),
        )
        .map(anyhow::Ok),
    ));

    Listener { cancel, task }
}
|
||||
|
||||
impl Listener {
|
||||
pub async fn stop_accepting(self) -> Connections {
|
||||
self.cancel.cancel();
|
||||
self.task
|
||||
.await
|
||||
.expect("unreachable: we wrap the listener task in task_mgr::exit_on_panic_or_error")
|
||||
}
|
||||
}
|
||||
|
||||
/// Listener loop. Listens for connections, and launches a new handler
/// task for each.
///
/// Returns upon cancellation via `listener_cancel`, yielding the set of
/// still-open connections so the caller can shut them down.
#[allow(clippy::too_many_arguments)]
pub async fn compute_connection_listener_main(
    conf: &'static PageServerConf,
    tenant_manager: Arc<TenantManager>,
    auth: Option<Arc<SwappableJwtAuth>>,
    perf_trace_dispatch: Option<Dispatch>,
    listener: tokio::net::TcpListener,
    auth_type: AuthType,
    tls_config: Option<Arc<rustls::ServerConfig>>,
    pipelining_config: PageServicePipeliningConfig,
    listener_ctx: RequestContext,
    listener_cancel: CancellationToken,
) -> Connections {
    let connections_cancel = CancellationToken::new();
    let connections_gate = Gate::default();
    let mut connection_handler_tasks = tokio::task::JoinSet::default();

    // The connection handling task passes the gRPC protocol
    // connections to this channel. The tonic gRPC server reads the
    // channel and takes over the connections from there.
    let (grpc_connections_tx, grpc_connections_rx) = tokio::sync::mpsc::channel(1000);

    // Set up the gRPC service
    launch_compute_service_grpc_server(
        grpc_connections_rx,
        conf,
        tenant_manager.clone(),
        auth.clone(),
        auth_type,
        connections_cancel.clone(),
        &listener_ctx,
    );

    // Main listener loop
    loop {
        // Take a gate guard for the (potential) next connection up front;
        // if the gate is already closed, stop accepting.
        let gate_guard = match connections_gate.enter() {
            Ok(guard) => guard,
            Err(_) => break,
        };

        let accepted = tokio::select! {
            // `biased` so that cancellation is noticed before new work is
            // picked up.
            biased;
            _ = listener_cancel.cancelled() => break,
            // Reap finished handler tasks as we go, so the JoinSet doesn't
            // accumulate results. Guarded: join_next() on an empty set
            // returns None immediately.
            next = connection_handler_tasks.join_next(), if !connection_handler_tasks.is_empty() => {
                let res = next.expect("we dont poll while empty");
                Connections::handle_connection_completion(res);
                continue;
            }
            accepted = listener.accept() => accepted,
        };

        match accepted {
            Ok((socket, peer_addr)) => {
                // Connection established. Spawn a new task to handle it.
                debug!("accepted connection from {}", peer_addr);
                let local_auth = auth.clone();
                // Each connection gets its own detached context that allows
                // downloads (unlike the listener's context).
                let connection_ctx = RequestContextBuilder::from(&listener_ctx)
                    .task_kind(TaskKind::PageRequestHandler)
                    .download_behavior(DownloadBehavior::Download)
                    .perf_span_dispatch(perf_trace_dispatch.clone())
                    .detached_child();

                connection_handler_tasks.spawn(page_service_conn_main(
                    conf,
                    tenant_manager.clone(),
                    local_auth,
                    socket,
                    auth_type,
                    tls_config.clone(),
                    pipelining_config.clone(),
                    connection_ctx,
                    connections_cancel.child_token(),
                    gate_guard,
                    grpc_connections_tx.clone(),
                ));
            }
            Err(err) => {
                // accept() failed. Log the error, and loop back to retry on next connection.
                error!("accept() failed: {:?}", err);
            }
        }
    }

    debug!("page_service listener loop terminated");

    Connections {
        cancel: connections_cancel,
        tasks: connection_handler_tasks,
        gate: connections_gate,
    }
}
|
||||
|
||||
/// Handle a new incoming connection.
|
||||
///
|
||||
/// This peeks at the first few incoming bytes and dispatches the connection
|
||||
/// to the legacy libpq handler or the new gRPC handler accordingly.
|
||||
#[instrument(skip_all, fields(peer_addr, application_name, compute_mode))]
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn page_service_conn_main(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
socket: tokio::net::TcpStream,
|
||||
auth_type: AuthType,
|
||||
tls_config: Option<Arc<rustls::ServerConfig>>,
|
||||
pipelining_config: PageServicePipeliningConfig,
|
||||
connection_ctx: RequestContext,
|
||||
cancel: CancellationToken,
|
||||
gate_guard: GateGuard,
|
||||
grpc_connections_tx: tokio::sync::mpsc::Sender<tokio::io::Result<tokio::net::TcpStream>>,
|
||||
) -> ConnectionHandlerResult {
|
||||
let mut buf: [u8; 4] = [0; 4];
|
||||
|
||||
socket
|
||||
.set_nodelay(true)
|
||||
.context("could not set TCP_NODELAY")?;
|
||||
|
||||
// Peek
|
||||
socket.peek(&mut buf).await?;
|
||||
|
||||
let mut grpc = false;
|
||||
if buf[0] == 0x16 {
|
||||
// looks like a TLS handshake. Assume gRPC.
|
||||
// XXX: Starting with v17, PostgreSQL also supports "direct TLS mode". But
|
||||
// the compute doesn't use it.
|
||||
grpc = true;
|
||||
}
|
||||
|
||||
if buf[0] == b'G' || buf[0] == b'P' {
|
||||
// Looks like 'GET' or 'POST'
|
||||
// or 'PRI', indicating gRPC over HTTP/2 with prior knowledge
|
||||
grpc = true;
|
||||
}
|
||||
|
||||
// Dispatch
|
||||
if grpc {
|
||||
grpc_connections_tx.send(Ok(socket)).await?;
|
||||
info!("connection sent to channel");
|
||||
Ok(())
|
||||
} else {
|
||||
libpq_page_service_conn_main(
|
||||
conf,
|
||||
tenant_manager,
|
||||
auth,
|
||||
socket,
|
||||
auth_type,
|
||||
tls_config,
|
||||
pipelining_config,
|
||||
connection_ctx,
|
||||
cancel,
|
||||
gate_guard,
|
||||
)
|
||||
.await
|
||||
}
|
||||
}
|
||||
746
pageserver/src/compute_service_grpc.rs
Normal file
746
pageserver/src/compute_service_grpc.rs
Normal file
@@ -0,0 +1,746 @@
|
||||
//!
|
||||
//! Compute <-> Pageserver API handler. This is for the new gRPC-based protocol
|
||||
//!
|
||||
//! TODO:
|
||||
//!
|
||||
//! - Many of the API endpoints are still missing
|
||||
//!
|
||||
//! - This is very much not optimized.
|
||||
//!
|
||||
//! - Much of the code was copy-pasted from page_service.rs. Like the code to get the
|
||||
//! Timeline object, and the JWT auth. Could refactor and share.
|
||||
//!
|
||||
//!
|
||||
|
||||
use std::pin::Pin;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::task::Poll;
|
||||
use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
|
||||
use crate::TenantManager;
|
||||
use crate::auth::check_permission;
|
||||
use crate::basebackup;
|
||||
use crate::basebackup::BasebackupError;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder};
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::tenant::mgr::ShardResolveResult;
|
||||
use crate::tenant::mgr::ShardSelector;
|
||||
use crate::tenant::storage_layer::IoConcurrency;
|
||||
use crate::tenant::timeline::WaitLsnTimeout;
|
||||
use tokio::io::{AsyncWriteExt, ReadHalf, SimplexStream};
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio_util::codec::{Decoder, FramedRead};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
use futures::stream::StreamExt;
|
||||
|
||||
use pageserver_data_api::model;
|
||||
use pageserver_data_api::proto::page_service_server::PageService;
|
||||
use pageserver_data_api::proto::page_service_server::PageServiceServer;
|
||||
|
||||
use anyhow::Context;
|
||||
use bytes::BytesMut;
|
||||
use jsonwebtoken::TokenData;
|
||||
use tracing::Instrument;
|
||||
use tracing::{debug, error};
|
||||
use utils::auth::SwappableJwtAuth;
|
||||
|
||||
use utils::id::{TenantId, TenantTimelineId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::simple_rcu::RcuReadGuard;
|
||||
|
||||
use crate::tenant::PageReconstructError;
|
||||
|
||||
use postgres_ffi::BLCKSZ;
|
||||
|
||||
use tonic;
|
||||
use tonic::codec::CompressionEncoding;
|
||||
use tonic::service::interceptor::InterceptedService;
|
||||
|
||||
use pageserver_api::key::rel_block_to_key;
|
||||
|
||||
use crate::pgdatadir_mapping::Version;
|
||||
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
|
||||
|
||||
use postgres_backend::AuthType;
|
||||
|
||||
pub use pageserver_data_api::proto;
|
||||
|
||||
pub(super) fn launch_compute_service_grpc_server(
|
||||
tcp_connections_rx: tokio::sync::mpsc::Receiver<tokio::io::Result<tokio::net::TcpStream>>,
|
||||
conf: &'static PageServerConf,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
auth_type: AuthType,
|
||||
connections_cancel: CancellationToken,
|
||||
listener_ctx: &RequestContext,
|
||||
) {
|
||||
// Set up the gRPC service
|
||||
let service_ctx = RequestContextBuilder::from(listener_ctx)
|
||||
.task_kind(TaskKind::PageRequestHandler)
|
||||
.download_behavior(DownloadBehavior::Download)
|
||||
.attached_child();
|
||||
let service = crate::compute_service_grpc::PageServiceService {
|
||||
conf,
|
||||
tenant_mgr: tenant_manager.clone(),
|
||||
ctx: Arc::new(service_ctx),
|
||||
};
|
||||
let authenticator = PageServiceAuthenticator {
|
||||
auth: auth.clone(),
|
||||
auth_type,
|
||||
};
|
||||
|
||||
let server = InterceptedService::new(
|
||||
PageServiceServer::new(service).send_compressed(CompressionEncoding::Gzip),
|
||||
authenticator,
|
||||
);
|
||||
|
||||
let cc = connections_cancel.clone();
|
||||
tokio::spawn(async move {
|
||||
tonic::transport::Server::builder()
|
||||
.add_service(server)
|
||||
.serve_with_incoming_shutdown(
|
||||
tokio_stream::wrappers::ReceiverStream::new(tcp_connections_rx),
|
||||
cc.cancelled(),
|
||||
)
|
||||
.await
|
||||
});
|
||||
}
|
||||
|
||||
/// The gRPC page service implementation; one instance serves all tenants and
/// timelines attached to this pageserver.
struct PageServiceService {
    conf: &'static PageServerConf,
    /// Used to resolve (tenant, shard, timeline) for each incoming request.
    tenant_mgr: Arc<TenantManager>,
    /// Base request context; per-request contexts are scoped from this.
    ctx: Arc<RequestContext>,
}
|
||||
|
||||
/// Map a [`PageReconstructError`] (an error in a get() operation) to the
/// closest-matching gRPC status code.
impl From<PageReconstructError> for tonic::Status {
    fn from(e: PageReconstructError) -> Self {
        match e {
            PageReconstructError::Other(err) => tonic::Status::unknown(err.to_string()),
            PageReconstructError::AncestorLsnTimeout(_) => {
                tonic::Status::unavailable(e.to_string())
            }
            PageReconstructError::Cancelled => tonic::Status::aborted(e.to_string()),
            PageReconstructError::WalRedo(_) => tonic::Status::internal(e.to_string()),
            PageReconstructError::MissingKey(_) => tonic::Status::internal(e.to_string()),
        }
    }
}
|
||||
|
||||
/// Convert a data-api model relation tag into the pageserver's internal
/// `RelTag` representation (field-for-field mapping).
fn convert_reltag(value: &model::RelTag) -> pageserver_api::reltag::RelTag {
    let &model::RelTag {
        spc_oid,
        db_oid,
        rel_number,
        fork_number,
        ..
    } = value;
    pageserver_api::reltag::RelTag {
        spcnode: spc_oid,
        dbnode: db_oid,
        relnode: rel_number,
        forknum: fork_number,
    }
}
|
||||
|
||||
#[tonic::async_trait]
|
||||
impl PageService for PageServiceService {
|
||||
type GetBaseBackupStream = GetBaseBackupStream;
|
||||
|
||||
async fn rel_exists(
|
||||
&self,
|
||||
request: tonic::Request<proto::RelExistsRequest>,
|
||||
) -> std::result::Result<tonic::Response<proto::RelExistsResponse>, tonic::Status> {
|
||||
let ttid = self.extract_ttid(request.metadata())?;
|
||||
let req: model::RelExistsRequest = request.get_ref().try_into()?;
|
||||
|
||||
let rel = convert_reltag(&req.rel);
|
||||
let span = tracing::info_span!("rel_exists", tenant_id = %ttid.tenant_id, timeline_id = %ttid.timeline_id, rel = %rel, req_lsn = %req.common.request_lsn);
|
||||
|
||||
async {
|
||||
let timeline = self.get_timeline(ttid, ShardSelector::Zero).await?;
|
||||
let ctx = self.ctx.with_scope_timeline(&timeline);
|
||||
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(
|
||||
&timeline,
|
||||
req.common.request_lsn,
|
||||
req.common.not_modified_since_lsn,
|
||||
&latest_gc_cutoff_lsn,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let exists = timeline
|
||||
.get_rel_exists(rel, Version::Lsn(lsn), &ctx)
|
||||
.await?;
|
||||
|
||||
Ok(tonic::Response::new(proto::RelExistsResponse { exists }))
|
||||
}
|
||||
.instrument(span)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Returns size of a relation, as # of blocks
|
||||
async fn rel_size(
|
||||
&self,
|
||||
request: tonic::Request<proto::RelSizeRequest>,
|
||||
) -> std::result::Result<tonic::Response<proto::RelSizeResponse>, tonic::Status> {
|
||||
let ttid = self.extract_ttid(request.metadata())?;
|
||||
let req: model::RelSizeRequest = request.get_ref().try_into()?;
|
||||
let rel = convert_reltag(&req.rel);
|
||||
|
||||
let span = tracing::info_span!("rel_size", tenant_id = %ttid.tenant_id, timeline_id = %ttid.timeline_id, rel = %rel, req_lsn = %req.common.request_lsn);
|
||||
|
||||
async {
|
||||
let timeline = self.get_timeline(ttid, ShardSelector::Zero).await?;
|
||||
let ctx = self.ctx.with_scope_timeline(&timeline);
|
||||
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(
|
||||
&timeline,
|
||||
req.common.request_lsn,
|
||||
req.common.not_modified_since_lsn,
|
||||
&latest_gc_cutoff_lsn,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let num_blocks = timeline.get_rel_size(rel, Version::Lsn(lsn), &ctx).await?;
|
||||
|
||||
Ok(tonic::Response::new(proto::RelSizeResponse { num_blocks }))
|
||||
}
|
||||
.instrument(span)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn get_page(
|
||||
&self,
|
||||
request: tonic::Request<proto::GetPageRequest>,
|
||||
) -> std::result::Result<tonic::Response<proto::GetPageResponse>, tonic::Status> {
|
||||
let ttid = self.extract_ttid(request.metadata())?;
|
||||
let req: model::GetPageRequest = request.get_ref().try_into()?;
|
||||
|
||||
// Calculate shard number.
|
||||
//
|
||||
// FIXME: this should probably be part of the data_api crate.
|
||||
let rel = convert_reltag(&req.rel);
|
||||
let key = rel_block_to_key(rel, req.block_number);
|
||||
let timeline = self.get_timeline(ttid, ShardSelector::Page(key)).await?;
|
||||
|
||||
let ctx = self.ctx.with_scope_timeline(&timeline);
|
||||
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(
|
||||
&timeline,
|
||||
req.common.request_lsn,
|
||||
req.common.not_modified_since_lsn,
|
||||
&latest_gc_cutoff_lsn,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let shard_id = timeline.tenant_shard_id.shard_number;
|
||||
let span = tracing::info_span!("get_page", tenant_id = %ttid.tenant_id, shard_id = %shard_id, timeline_id = %ttid.timeline_id, rel = %rel, block_number = %req.block_number, req_lsn = %req.common.request_lsn);
|
||||
|
||||
async {
|
||||
let gate_guard = match timeline.gate.enter() {
|
||||
Ok(guard) => guard,
|
||||
Err(_) => {
|
||||
return Err(tonic::Status::unavailable("timeline is shutting down"));
|
||||
}
|
||||
};
|
||||
|
||||
let io_concurrency = IoConcurrency::spawn_from_conf(self.conf, gate_guard);
|
||||
|
||||
let page_image = timeline
|
||||
.get_rel_page_at_lsn(
|
||||
rel,
|
||||
req.block_number,
|
||||
Version::Lsn(lsn),
|
||||
&ctx,
|
||||
io_concurrency,
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(tonic::Response::new(proto::GetPageResponse {
|
||||
page_image: page_image,
|
||||
}))
|
||||
}
|
||||
.instrument(span)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn db_size(
|
||||
&self,
|
||||
request: tonic::Request<proto::DbSizeRequest>,
|
||||
) -> Result<tonic::Response<proto::DbSizeResponse>, tonic::Status> {
|
||||
let ttid = self.extract_ttid(request.metadata())?;
|
||||
let req: model::DbSizeRequest = request.get_ref().try_into()?;
|
||||
|
||||
let span = tracing::info_span!("get_page", tenant_id = %ttid.tenant_id, timeline_id = %ttid.timeline_id, db_oid = %req.db_oid, req_lsn = %req.common.request_lsn);
|
||||
|
||||
async {
|
||||
let timeline = self.get_timeline(ttid, ShardSelector::Zero).await?;
|
||||
let ctx = self.ctx.with_scope_timeline(&timeline);
|
||||
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(
|
||||
&timeline,
|
||||
req.common.request_lsn,
|
||||
req.common.not_modified_since_lsn,
|
||||
&latest_gc_cutoff_lsn,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let total_blocks = timeline
|
||||
.get_db_size(DEFAULTTABLESPACE_OID, req.db_oid, Version::Lsn(lsn), &ctx)
|
||||
.await?;
|
||||
|
||||
Ok(tonic::Response::new(proto::DbSizeResponse {
|
||||
num_bytes: total_blocks as u64 * BLCKSZ as u64,
|
||||
}))
|
||||
}
|
||||
.instrument(span)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn get_base_backup(
|
||||
&self,
|
||||
request: tonic::Request<proto::GetBaseBackupRequest>,
|
||||
) -> Result<tonic::Response<Self::GetBaseBackupStream>, tonic::Status> {
|
||||
let ttid = self.extract_ttid(request.metadata())?;
|
||||
let req: model::GetBaseBackupRequest = request.get_ref().try_into()?;
|
||||
|
||||
let timeline = self.get_timeline(ttid, ShardSelector::Zero).await?;
|
||||
|
||||
let ctx = self.ctx.with_scope_timeline(&timeline);
|
||||
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(
|
||||
&timeline,
|
||||
req.common.request_lsn,
|
||||
req.common.not_modified_since_lsn,
|
||||
&latest_gc_cutoff_lsn,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let span = tracing::info_span!("get_base_backup", tenant_id = %ttid.tenant_id, timeline_id = %ttid.timeline_id, req_lsn = %req.common.request_lsn);
|
||||
|
||||
tracing::info!("starting basebackup");
|
||||
|
||||
#[allow(dead_code)]
|
||||
enum TestMode {
|
||||
/// Create real basebackup, in streaming fashion
|
||||
Streaming,
|
||||
/// Create real basebackup, but fully materialize it in the 'simplex' pipe buffer first
|
||||
Materialize,
|
||||
/// Create a dummy all-zeros basebackup, in streaming fashion
|
||||
DummyStreaming,
|
||||
/// Create a dummy all-zeros basebackup, but fully materialize it first
|
||||
DummyMaterialize,
|
||||
}
|
||||
let mode = TestMode::Streaming;
|
||||
|
||||
let buf_size = match mode {
|
||||
TestMode::Streaming | TestMode::DummyStreaming => 64 * 1024,
|
||||
TestMode::Materialize | TestMode::DummyMaterialize => 64 * 1024 * 1024,
|
||||
};
|
||||
|
||||
let (simplex_read, mut simplex_write) = tokio::io::simplex(buf_size);
|
||||
|
||||
let basebackup_task = match mode {
|
||||
TestMode::DummyStreaming => {
|
||||
tokio::spawn(
|
||||
async move {
|
||||
// hold onto the guard for as long as the basebackup runs
|
||||
let _latest_gc_cutoff_lsn = latest_gc_cutoff_lsn;
|
||||
|
||||
let zerosbuf: [u8; 1024] = [0; 1024];
|
||||
let nbytes = 16900000;
|
||||
let mut bytes_written = 0;
|
||||
while bytes_written < nbytes {
|
||||
let s = std::cmp::min(1024, nbytes - bytes_written);
|
||||
let _ = simplex_write.write_all(&zerosbuf[0..s]).await;
|
||||
bytes_written += s;
|
||||
}
|
||||
simplex_write
|
||||
.shutdown()
|
||||
.await
|
||||
.context("shutdown of basebackup pipe")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
.instrument(span),
|
||||
)
|
||||
}
|
||||
TestMode::DummyMaterialize => {
|
||||
let zerosbuf: [u8; 1024] = [0; 1024];
|
||||
let nbytes = 16900000;
|
||||
let mut bytes_written = 0;
|
||||
while bytes_written < nbytes {
|
||||
let s = std::cmp::min(1024, nbytes - bytes_written);
|
||||
let _ = simplex_write.write_all(&zerosbuf[0..s]).await;
|
||||
bytes_written += s;
|
||||
}
|
||||
simplex_write
|
||||
.shutdown()
|
||||
.await
|
||||
.expect("shutdown of basebackup pipe");
|
||||
tracing::info!("basebackup (dummy) materialized");
|
||||
let result = Ok(());
|
||||
|
||||
tokio::spawn(std::future::ready(result))
|
||||
}
|
||||
TestMode::Materialize => {
|
||||
let result = basebackup::send_basebackup_tarball(
|
||||
&mut simplex_write,
|
||||
&timeline,
|
||||
Some(lsn),
|
||||
None,
|
||||
false,
|
||||
req.replica,
|
||||
&ctx,
|
||||
)
|
||||
.await;
|
||||
simplex_write
|
||||
.shutdown()
|
||||
.await
|
||||
.expect("shutdown of basebackup pipe");
|
||||
tracing::info!("basebackup materialized");
|
||||
|
||||
// Launch a task that writes the basebackup tarball to the simplex pipe
|
||||
tokio::spawn(std::future::ready(result))
|
||||
}
|
||||
TestMode::Streaming => {
|
||||
tokio::spawn(
|
||||
async move {
|
||||
// hold onto the guard for as long as the basebackup runs
|
||||
let _latest_gc_cutoff_lsn = latest_gc_cutoff_lsn;
|
||||
|
||||
let result = basebackup::send_basebackup_tarball(
|
||||
&mut simplex_write,
|
||||
&timeline,
|
||||
Some(lsn),
|
||||
None,
|
||||
false,
|
||||
req.replica,
|
||||
&ctx,
|
||||
)
|
||||
.await;
|
||||
simplex_write
|
||||
.shutdown()
|
||||
.await
|
||||
.context("shutdown of basebackup pipe")?;
|
||||
result
|
||||
}
|
||||
.instrument(span),
|
||||
)
|
||||
}
|
||||
};
|
||||
|
||||
let response = new_basebackup_response_stream(simplex_read, basebackup_task);
|
||||
|
||||
Ok(tonic::Response::new(response))
|
||||
}
|
||||
}
|
||||
|
||||
/// How long to wait for a tenant to become resolvable and Active.
///
/// NB: this is a different value than [`crate::http::routes::ACTIVE_TENANT_TIMEOUT`].
/// NB: and also different from page_service::ACTIVE_TENANT_TIMEOUT
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);

impl PageServiceService {
    /// Resolve a request's tenant/timeline to an attached, Active [`Timeline`].
    ///
    /// Waits up to [`ACTIVE_TENANT_TIMEOUT`] (a single shared deadline across
    /// the resolve and activation waits) for in-progress attach operations to
    /// finish and for the tenant to become Active.
    async fn get_timeline(
        &self,
        ttid: TenantTimelineId,
        shard_selector: ShardSelector,
    ) -> Result<Arc<Timeline>, tonic::Status> {
        let timeout = ACTIVE_TENANT_TIMEOUT;
        let wait_start = Instant::now();
        let deadline = wait_start + timeout;

        let tenant_shard = loop {
            let resolved = self
                .tenant_mgr
                .resolve_attached_shard(&ttid.tenant_id, shard_selector);

            match resolved {
                ShardResolveResult::Found(tenant_shard) => break tenant_shard,
                ShardResolveResult::NotFound => {
                    return Err(tonic::Status::not_found("tenant not found"));
                }
                ShardResolveResult::InProgress(barrier) => {
                    // We can't authoritatively answer right now: wait for InProgress state
                    // to end, then try again
                    tokio::select! {
                        _ = barrier.wait() => {
                            // The barrier completed: proceed around the loop to try looking up again
                        },
                        _ = tokio::time::sleep(deadline.duration_since(Instant::now())) => {
                            return Err(tonic::Status::unavailable("tenant is in InProgress state"));
                        }
                    }
                }
            }
        };

        tracing::debug!("Waiting for tenant to enter active state...");
        tenant_shard
            .wait_to_become_active(deadline.duration_since(Instant::now()))
            .await
            .map_err(|e| {
                tonic::Status::unavailable(format!("tenant is not in active state: {e}"))
            })?;

        let timeline = tenant_shard
            .get_timeline(ttid.timeline_id, true)
            .map_err(|e| tonic::Status::unavailable(format!("could not get timeline: {e}")))?;

        // FIXME: need to do something with the 'gate' here?

        Ok(timeline)
    }

    /// Extract TenantTimelineId from the request metadata
    ///
    /// Note: the interceptor has already authenticated the request
    ///
    /// TOOD: Could we use "binary" metadata for these, for efficiency? gRPC has such a concept
    fn extract_ttid(
        &self,
        metadata: &tonic::metadata::MetadataMap,
    ) -> Result<TenantTimelineId, tonic::Status> {
        let tenant_id = metadata
            .get("neon-tenant-id")
            .ok_or(tonic::Status::invalid_argument(
                "neon-tenant-id metadata missing",
            ))?;
        let tenant_id = tenant_id.to_str().map_err(|_| {
            tonic::Status::invalid_argument("invalid UTF-8 characters in neon-tenant-id metadata")
        })?;
        let tenant_id = TenantId::from_str(tenant_id)
            .map_err(|_| tonic::Status::invalid_argument("invalid neon-tenant-id metadata"))?;

        let timeline_id =
            metadata
                .get("neon-timeline-id")
                .ok_or(tonic::Status::invalid_argument(
                    "neon-timeline-id metadata missing",
                ))?;
        let timeline_id = timeline_id.to_str().map_err(|_| {
            tonic::Status::invalid_argument("invalid UTF-8 characters in neon-timeline-id metadata")
        })?;
        // NOTE(review): the message below says "neon-timelineid" (missing
        // hyphen) — looks like a typo; confirm before changing the
        // client-visible string.
        let timeline_id = TimelineId::from_str(timeline_id)
            .map_err(|_| tonic::Status::invalid_argument("invalid neon-timelineid metadata"))?;

        Ok(TenantTimelineId::new(tenant_id, timeline_id))
    }

    /// Validate an LSN request pair and pick the LSN to read at, waiting for
    /// WAL to arrive if needed.
    ///
    /// `request_lsn` is the newest LSN the client may read at;
    /// `not_modified_since` is the client's guarantee that the data hasn't
    /// changed since that LSN.
    // XXX: copied from PageServerHandler
    async fn wait_or_get_last_lsn(
        timeline: &Timeline,
        request_lsn: Lsn,
        not_modified_since: Lsn,
        latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
        ctx: &RequestContext,
    ) -> Result<Lsn, tonic::Status> {
        let last_record_lsn = timeline.get_last_record_lsn();

        // Sanity check the request
        if request_lsn < not_modified_since {
            return Err(tonic::Status::invalid_argument(format!(
                "invalid request with request LSN {} and not_modified_since {}",
                request_lsn, not_modified_since,
            )));
        }

        // Check explicitly for INVALID just to get a less scary error message if the request is obviously bogus
        if request_lsn == Lsn::INVALID {
            return Err(tonic::Status::invalid_argument("invalid LSN(0) in request"));
        }

        // Clients should only read from recent LSNs on their timeline, or from locations holding an LSN lease.
        //
        // We may have older data available, but we make a best effort to detect this case and return an error,
        // to distinguish a misbehaving client (asking for old LSN) from a storage issue (data missing at a legitimate LSN).
        if request_lsn < **latest_gc_cutoff_lsn && !timeline.is_gc_blocked_by_lsn_lease_deadline() {
            let gc_info = &timeline.gc_info.read().unwrap();
            if !gc_info.lsn_covered_by_lease(request_lsn) {
                return Err(tonic::Status::not_found(format!(
                    "tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
                    request_lsn, **latest_gc_cutoff_lsn
                )));
            }
        }

        // Wait for WAL up to 'not_modified_since' to arrive, if necessary
        if not_modified_since > last_record_lsn {
            timeline
                .wait_lsn(
                    not_modified_since,
                    crate::tenant::timeline::WaitLsnWaiter::PageService,
                    WaitLsnTimeout::Default,
                    ctx,
                )
                .await
                .map_err(|_| {
                    tonic::Status::unavailable("not_modified_since LSN not arrived yet")
                })?;
            // Since we waited for 'not_modified_since' to arrive, that is now the last
            // record LSN. (Or close enough for our purposes; the last-record LSN can
            // advance immediately after we return anyway)
            Ok(not_modified_since)
        } else {
            // It might be better to use max(not_modified_since, latest_gc_cutoff_lsn)
            // here instead. That would give the same result, since we know that there
            // haven't been any modifications since 'not_modified_since'. Using an older
            // LSN might be faster, because that could allow skipping recent layers when
            // finding the page. However, we have historically used 'last_record_lsn', so
            // stick to that for now.
            Ok(std::cmp::min(last_record_lsn, request_lsn))
        }
    }
}
|
||||
|
||||
/// tonic interceptor that performs JWT authentication for page service
/// requests, based on the `neon-tenant-id` and `neon-auth-token` metadata.
#[derive(Clone)]
pub struct PageServiceAuthenticator {
    /// JWT decoder; `None` means auth is disabled (Trust) and all requests pass.
    pub auth: Option<Arc<SwappableJwtAuth>>,
    pub auth_type: AuthType,
}
|
||||
|
||||
impl tonic::service::Interceptor for PageServiceAuthenticator {
    /// Authenticate one request: parse the tenant ID from metadata, decode and
    /// validate the JWT, and check that its claims permit access to that tenant.
    ///
    /// Returns the request unchanged on success, or a gRPC error status
    /// (invalid_argument / unauthenticated / permission_denied) on failure.
    fn call(
        &mut self,
        req: tonic::Request<()>,
    ) -> std::result::Result<tonic::Request<()>, tonic::Status> {
        // Check the tenant_id in any case
        let tenant_id =
            req.metadata()
                .get("neon-tenant-id")
                .ok_or(tonic::Status::invalid_argument(
                    "neon-tenant-id metadata missing",
                ))?;
        let tenant_id = tenant_id.to_str().map_err(|_| {
            tonic::Status::invalid_argument("invalid UTF-8 characters in neon-tenant-id metadata")
        })?;
        let tenant_id = TenantId::from_str(tenant_id)
            .map_err(|_| tonic::Status::invalid_argument("invalid neon-tenant-id metadata"))?;

        // when accessing management api supply None as an argument
        // when using to authorize tenant pass corresponding tenant id
        let auth = if let Some(auth) = &self.auth {
            auth
        } else {
            // auth is set to Trust, nothing to check so just return ok
            return Ok(req);
        };

        let jwt = req
            .metadata()
            .get("neon-auth-token")
            .ok_or(tonic::Status::unauthenticated("no neon-auth-token"))?;
        let jwt = jwt.to_str().map_err(|_| {
            tonic::Status::invalid_argument("invalid UTF-8 characters in neon-auth-token metadata")
        })?;

        // Decode and verify the token signature; rejects expired/forged tokens.
        let jwtdata: TokenData<utils::auth::Claims> = auth
            .decode(jwt)
            .map_err(|err| tonic::Status::unauthenticated(format!("invalid JWT token: {}", err)))?;
        let claims = jwtdata.claims;

        // A Tenant-scoped token must name the tenant it is scoped to.
        if matches!(claims.scope, utils::auth::Scope::Tenant) && claims.tenant_id.is_none() {
            return Err(tonic::Status::unauthenticated(
                "jwt token scope is Tenant, but tenant id is missing",
            ));
        }

        debug!(
            "jwt scope check succeeded for scope: {:#?} by tenant id: {:?}",
            claims.scope, claims.tenant_id,
        );

        // The token is valid. Check if it's allowed to access the tenant ID
        // given in the request.

        check_permission(&claims, Some(tenant_id))
            .map_err(|err| tonic::Status::permission_denied(err.to_string()))?;

        // All checks out
        Ok(req)
    }
}
|
||||
|
||||
/// Stream of GetBaseBackupResponseChunk messages.
|
||||
///
|
||||
/// The first part of the Chain chunks the tarball. The second part checks the return value
|
||||
/// of the send_basebackup_tarball Future that created the tarball.
|
||||
|
||||
type GetBaseBackupStream = futures::stream::Chain<BasebackupChunkedStream, CheckResultStream>;
|
||||
|
||||
fn new_basebackup_response_stream(
|
||||
simplex_read: ReadHalf<SimplexStream>,
|
||||
basebackup_task: JoinHandle<Result<(), BasebackupError>>,
|
||||
) -> GetBaseBackupStream {
|
||||
let framed = FramedRead::new(simplex_read, GetBaseBackupResponseDecoder {});
|
||||
|
||||
framed.chain(CheckResultStream { basebackup_task })
|
||||
}
|
||||
|
||||
/// Stream that uses GetBaseBackupResponseDecoder
|
||||
type BasebackupChunkedStream =
|
||||
tokio_util::codec::FramedRead<ReadHalf<SimplexStream>, GetBaseBackupResponseDecoder>;
|
||||
|
||||
struct GetBaseBackupResponseDecoder;
|
||||
impl Decoder for GetBaseBackupResponseDecoder {
|
||||
type Item = proto::GetBaseBackupResponseChunk;
|
||||
type Error = tonic::Status;
|
||||
|
||||
fn decode(&mut self, src: &mut BytesMut) -> Result<Option<Self::Item>, Self::Error> {
|
||||
if src.len() < 64 * 1024 {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let item = proto::GetBaseBackupResponseChunk {
|
||||
chunk: bytes::Bytes::from(std::mem::take(src)),
|
||||
};
|
||||
|
||||
Ok(Some(item))
|
||||
}
|
||||
|
||||
fn decode_eof(&mut self, src: &mut BytesMut) -> Result<Option<Self::Item>, Self::Error> {
|
||||
if src.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let item = proto::GetBaseBackupResponseChunk {
|
||||
chunk: bytes::Bytes::from(std::mem::take(src)),
|
||||
};
|
||||
|
||||
Ok(Some(item))
|
||||
}
|
||||
}
|
||||
|
||||
struct CheckResultStream {
|
||||
basebackup_task: tokio::task::JoinHandle<Result<(), BasebackupError>>,
|
||||
}
|
||||
impl futures::Stream for CheckResultStream {
|
||||
type Item = Result<proto::GetBaseBackupResponseChunk, tonic::Status>;
|
||||
|
||||
fn poll_next(
|
||||
mut self: Pin<&mut Self>,
|
||||
ctx: &mut std::task::Context<'_>,
|
||||
) -> Poll<Option<Self::Item>> {
|
||||
let task = Pin::new(&mut self.basebackup_task);
|
||||
match task.poll(ctx) {
|
||||
Poll::Pending => Poll::Pending,
|
||||
Poll::Ready(Ok(Ok(()))) => Poll::Ready(None),
|
||||
Poll::Ready(Ok(Err(basebackup_err))) => {
|
||||
error!(error=%basebackup_err, "error getting basebackup");
|
||||
Poll::Ready(Some(Err(tonic::Status::internal(
|
||||
"could not get basebackup",
|
||||
))))
|
||||
}
|
||||
Poll::Ready(Err(join_err)) => {
|
||||
error!(error=%join_err, "JoinError getting basebackup");
|
||||
Poll::Ready(Some(Err(tonic::Status::internal(
|
||||
"could not get basebackup",
|
||||
))))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -21,6 +21,8 @@ pub use pageserver_api::keyspace;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
mod assert_u64_eq_usize;
|
||||
pub mod aux_file;
|
||||
pub mod compute_service;
|
||||
pub mod compute_service_grpc;
|
||||
pub mod metrics;
|
||||
pub mod page_cache;
|
||||
pub mod page_service;
|
||||
@@ -82,7 +84,7 @@ impl CancellableTask {
|
||||
pub async fn shutdown_pageserver(
|
||||
http_listener: HttpEndpointListener,
|
||||
https_listener: Option<HttpsEndpointListener>,
|
||||
page_service: page_service::Listener,
|
||||
compute_service: compute_service::Listener,
|
||||
consumption_metrics_worker: ConsumptionMetricsTasks,
|
||||
disk_usage_eviction_task: Option<DiskUsageEvictionTask>,
|
||||
tenant_manager: &TenantManager,
|
||||
@@ -167,11 +169,11 @@ pub async fn shutdown_pageserver(
|
||||
}
|
||||
});
|
||||
|
||||
// Shut down the libpq endpoint task. This prevents new connections from
|
||||
// Shut down the compute service endpoint task. This prevents new connections from
|
||||
// being accepted.
|
||||
let remaining_connections = timed(
|
||||
page_service.stop_accepting(),
|
||||
"shutdown LibpqEndpointListener",
|
||||
compute_service.stop_accepting(),
|
||||
"shutdown compte service listener",
|
||||
Duration::from_secs(1),
|
||||
)
|
||||
.await;
|
||||
|
||||
@@ -13,7 +13,6 @@ use crate::PERF_TRACE_TARGET;
|
||||
use anyhow::{Context, bail};
|
||||
use async_compression::tokio::write::GzipEncoder;
|
||||
use bytes::Buf;
|
||||
use futures::FutureExt;
|
||||
use itertools::Itertools;
|
||||
use jsonwebtoken::TokenData;
|
||||
use once_cell::sync::OnceCell;
|
||||
@@ -40,7 +39,6 @@ use pq_proto::framed::ConnectionError;
|
||||
use pq_proto::{BeMessage, FeMessage, FeStartupPacket, RowDescriptor};
|
||||
use strum_macros::IntoStaticStr;
|
||||
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufWriter};
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::auth::{Claims, Scope, SwappableJwtAuth};
|
||||
@@ -49,15 +47,13 @@ use utils::id::{TenantId, TimelineId};
|
||||
use utils::logging::log_slow;
|
||||
use utils::lsn::Lsn;
|
||||
use utils::simple_rcu::RcuReadGuard;
|
||||
use utils::sync::gate::{Gate, GateGuard};
|
||||
use utils::sync::gate::GateGuard;
|
||||
use utils::sync::spsc_fold;
|
||||
|
||||
use crate::auth::check_permission;
|
||||
use crate::basebackup::BasebackupError;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{
|
||||
DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
|
||||
};
|
||||
use crate::context::{PerfInstrumentFutureExt, RequestContext, RequestContextBuilder};
|
||||
use crate::metrics::{
|
||||
self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS,
|
||||
SmgrOpTimer, TimelineMetrics,
|
||||
@@ -67,7 +63,6 @@ use crate::span::{
|
||||
debug_assert_current_span_has_tenant_and_timeline_id,
|
||||
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id,
|
||||
};
|
||||
use crate::task_mgr::{self, COMPUTE_REQUEST_RUNTIME, TaskKind};
|
||||
use crate::tenant::mgr::{
|
||||
GetActiveTenantError, GetTenantError, ShardResolveResult, ShardSelector, TenantManager,
|
||||
};
|
||||
@@ -85,171 +80,6 @@ const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
|
||||
/// Threshold at which to log slow GetPage requests.
|
||||
const LOG_SLOW_GETPAGE_THRESHOLD: Duration = Duration::from_secs(30);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
pub struct Listener {
|
||||
cancel: CancellationToken,
|
||||
/// Cancel the listener task through `listen_cancel` to shut down the listener
|
||||
/// and get a handle on the existing connections.
|
||||
task: JoinHandle<Connections>,
|
||||
}
|
||||
|
||||
pub struct Connections {
|
||||
cancel: CancellationToken,
|
||||
tasks: tokio::task::JoinSet<ConnectionHandlerResult>,
|
||||
gate: Gate,
|
||||
}
|
||||
|
||||
pub fn spawn(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
pg_auth: Option<Arc<SwappableJwtAuth>>,
|
||||
perf_trace_dispatch: Option<Dispatch>,
|
||||
tcp_listener: tokio::net::TcpListener,
|
||||
tls_config: Option<Arc<rustls::ServerConfig>>,
|
||||
) -> Listener {
|
||||
let cancel = CancellationToken::new();
|
||||
let libpq_ctx = RequestContext::todo_child(
|
||||
TaskKind::LibpqEndpointListener,
|
||||
// listener task shouldn't need to download anything. (We will
|
||||
// create a separate sub-contexts for each connection, with their
|
||||
// own download behavior. This context is used only to listen and
|
||||
// accept connections.)
|
||||
DownloadBehavior::Error,
|
||||
);
|
||||
let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
|
||||
"libpq listener",
|
||||
libpq_listener_main(
|
||||
conf,
|
||||
tenant_manager,
|
||||
pg_auth,
|
||||
perf_trace_dispatch,
|
||||
tcp_listener,
|
||||
conf.pg_auth_type,
|
||||
tls_config,
|
||||
conf.page_service_pipelining.clone(),
|
||||
libpq_ctx,
|
||||
cancel.clone(),
|
||||
)
|
||||
.map(anyhow::Ok),
|
||||
));
|
||||
|
||||
Listener { cancel, task }
|
||||
}
|
||||
|
||||
impl Listener {
|
||||
pub async fn stop_accepting(self) -> Connections {
|
||||
self.cancel.cancel();
|
||||
self.task
|
||||
.await
|
||||
.expect("unreachable: we wrap the listener task in task_mgr::exit_on_panic_or_error")
|
||||
}
|
||||
}
|
||||
impl Connections {
|
||||
pub(crate) async fn shutdown(self) {
|
||||
let Self {
|
||||
cancel,
|
||||
mut tasks,
|
||||
gate,
|
||||
} = self;
|
||||
cancel.cancel();
|
||||
while let Some(res) = tasks.join_next().await {
|
||||
Self::handle_connection_completion(res);
|
||||
}
|
||||
gate.close().await;
|
||||
}
|
||||
|
||||
fn handle_connection_completion(res: Result<anyhow::Result<()>, tokio::task::JoinError>) {
|
||||
match res {
|
||||
Ok(Ok(())) => {}
|
||||
Ok(Err(e)) => error!("error in page_service connection task: {:?}", e),
|
||||
Err(e) => error!("page_service connection task panicked: {:?}", e),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Main loop of the page service.
|
||||
///
|
||||
/// Listens for connections, and launches a new handler task for each.
|
||||
///
|
||||
/// Returns Ok(()) upon cancellation via `cancel`, returning the set of
|
||||
/// open connections.
|
||||
///
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn libpq_listener_main(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
perf_trace_dispatch: Option<Dispatch>,
|
||||
listener: tokio::net::TcpListener,
|
||||
auth_type: AuthType,
|
||||
tls_config: Option<Arc<rustls::ServerConfig>>,
|
||||
pipelining_config: PageServicePipeliningConfig,
|
||||
listener_ctx: RequestContext,
|
||||
listener_cancel: CancellationToken,
|
||||
) -> Connections {
|
||||
let connections_cancel = CancellationToken::new();
|
||||
let connections_gate = Gate::default();
|
||||
let mut connection_handler_tasks = tokio::task::JoinSet::default();
|
||||
|
||||
loop {
|
||||
let gate_guard = match connections_gate.enter() {
|
||||
Ok(guard) => guard,
|
||||
Err(_) => break,
|
||||
};
|
||||
|
||||
let accepted = tokio::select! {
|
||||
biased;
|
||||
_ = listener_cancel.cancelled() => break,
|
||||
next = connection_handler_tasks.join_next(), if !connection_handler_tasks.is_empty() => {
|
||||
let res = next.expect("we dont poll while empty");
|
||||
Connections::handle_connection_completion(res);
|
||||
continue;
|
||||
}
|
||||
accepted = listener.accept() => accepted,
|
||||
};
|
||||
|
||||
match accepted {
|
||||
Ok((socket, peer_addr)) => {
|
||||
// Connection established. Spawn a new task to handle it.
|
||||
debug!("accepted connection from {}", peer_addr);
|
||||
let local_auth = auth.clone();
|
||||
let connection_ctx = RequestContextBuilder::from(&listener_ctx)
|
||||
.task_kind(TaskKind::PageRequestHandler)
|
||||
.download_behavior(DownloadBehavior::Download)
|
||||
.perf_span_dispatch(perf_trace_dispatch.clone())
|
||||
.detached_child();
|
||||
|
||||
connection_handler_tasks.spawn(page_service_conn_main(
|
||||
conf,
|
||||
tenant_manager.clone(),
|
||||
local_auth,
|
||||
socket,
|
||||
auth_type,
|
||||
tls_config.clone(),
|
||||
pipelining_config.clone(),
|
||||
connection_ctx,
|
||||
connections_cancel.child_token(),
|
||||
gate_guard,
|
||||
));
|
||||
}
|
||||
Err(err) => {
|
||||
// accept() failed. Log the error, and loop back to retry on next connection.
|
||||
error!("accept() failed: {:?}", err);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
debug!("page_service listener loop terminated");
|
||||
|
||||
Connections {
|
||||
cancel: connections_cancel,
|
||||
tasks: connection_handler_tasks,
|
||||
gate: connections_gate,
|
||||
}
|
||||
}
|
||||
|
||||
type ConnectionHandlerResult = anyhow::Result<()>;
|
||||
|
||||
/// Perf root spans start at the per-request level, after shard routing.
|
||||
@@ -261,9 +91,10 @@ struct ConnectionPerfSpanFields {
|
||||
compute_mode: Option<String>,
|
||||
}
|
||||
|
||||
/// note: the caller has already set TCP_NODELAY on the socket
|
||||
#[instrument(skip_all, fields(peer_addr, application_name, compute_mode))]
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn page_service_conn_main(
|
||||
pub async fn libpq_page_service_conn_main(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
@@ -279,10 +110,6 @@ async fn page_service_conn_main(
|
||||
.with_label_values(&["page_service"])
|
||||
.guard();
|
||||
|
||||
socket
|
||||
.set_nodelay(true)
|
||||
.context("could not set TCP_NODELAY")?;
|
||||
|
||||
let socket_fd = socket.as_raw_fd();
|
||||
|
||||
let peer_addr = socket.peer_addr().context("get peer address")?;
|
||||
@@ -393,7 +220,7 @@ struct PageServerHandler {
|
||||
gate_guard: GateGuard,
|
||||
}
|
||||
|
||||
struct TimelineHandles {
|
||||
pub struct TimelineHandles {
|
||||
wrapper: TenantManagerWrapper,
|
||||
/// Note on size: the typical size of this map is 1. The largest size we expect
|
||||
/// to see is the number of shards divided by the number of pageservers (typically < 2),
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
# pgxs/neon/Makefile
|
||||
|
||||
|
||||
MODULE_big = neon
|
||||
OBJS = \
|
||||
$(WIN32RES) \
|
||||
communicator.o \
|
||||
communicator_new.o \
|
||||
extension_server.o \
|
||||
file_cache.o \
|
||||
hll.o \
|
||||
@@ -22,7 +22,8 @@ OBJS = \
|
||||
walproposer.o \
|
||||
walproposer_pg.o \
|
||||
control_plane_connector.o \
|
||||
walsender_hooks.o
|
||||
walsender_hooks.o \
|
||||
$(LIBCOMMUNICATOR_PATH)/libcommunicator.a
|
||||
|
||||
PG_CPPFLAGS = -I$(libpq_srcdir)
|
||||
SHLIB_LINK_INTERNAL = $(libpq)
|
||||
|
||||
372
pgxn/neon/communicator/Cargo.lock
generated
Normal file
372
pgxn/neon/communicator/Cargo.lock
generated
Normal file
@@ -0,0 +1,372 @@
|
||||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
version = 4
|
||||
|
||||
[[package]]
|
||||
name = "addr2line"
|
||||
version = "0.24.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1"
|
||||
dependencies = [
|
||||
"gimli",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "adler2"
|
||||
version = "2.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627"
|
||||
|
||||
[[package]]
|
||||
name = "backtrace"
|
||||
version = "0.3.74"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8d82cb332cdfaed17ae235a638438ac4d4839913cc2af585c3c6746e8f8bee1a"
|
||||
dependencies = [
|
||||
"addr2line",
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"miniz_oxide",
|
||||
"object",
|
||||
"rustc-demangle",
|
||||
"windows-targets",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "base64"
|
||||
version = "0.22.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
|
||||
|
||||
[[package]]
|
||||
name = "bytes"
|
||||
version = "1.10.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a"
|
||||
|
||||
[[package]]
|
||||
name = "cfg-if"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
||||
|
||||
[[package]]
|
||||
name = "communicator"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"tonic",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fnv"
|
||||
version = "1.0.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
|
||||
|
||||
[[package]]
|
||||
name = "futures-core"
|
||||
version = "0.3.31"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
|
||||
|
||||
[[package]]
|
||||
name = "gimli"
|
||||
version = "0.31.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f"
|
||||
|
||||
[[package]]
|
||||
name = "http"
|
||||
version = "1.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f4a85d31aea989eead29a3aaf9e1115a180df8282431156e533de47660892565"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fnv",
|
||||
"itoa",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "http-body"
|
||||
version = "1.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"http",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "http-body-util"
|
||||
version = "0.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"futures-core",
|
||||
"http",
|
||||
"http-body",
|
||||
"pin-project-lite",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "itoa"
|
||||
version = "1.0.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.171"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c19937216e9d3aa9956d9bb8dfc0b0c8beb6058fc4f7a4dc4d850edf86a237d6"
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
|
||||
|
||||
[[package]]
|
||||
name = "miniz_oxide"
|
||||
version = "0.8.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ff70ce3e48ae43fa075863cef62e8b43b71a4f2382229920e0df362592919430"
|
||||
dependencies = [
|
||||
"adler2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "object"
|
||||
version = "0.36.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "once_cell"
|
||||
version = "1.21.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
|
||||
|
||||
[[package]]
|
||||
name = "percent-encoding"
|
||||
version = "2.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
|
||||
|
||||
[[package]]
|
||||
name = "pin-project"
|
||||
version = "1.1.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a"
|
||||
dependencies = [
|
||||
"pin-project-internal",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pin-project-internal"
|
||||
version = "1.1.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pin-project-lite"
|
||||
version = "0.2.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b"
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.94"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a31971752e70b8b2686d7e46ec17fb38dad4051d94024c88df49b667caea9c84"
|
||||
dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.40"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustc-demangle"
|
||||
version = "0.1.24"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f"
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "2.0.100"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio"
|
||||
version = "1.44.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e6b88822cbe49de4185e3a4cbf8321dd487cf5fe0c5c65695fef6346371e9c48"
|
||||
dependencies = [
|
||||
"backtrace",
|
||||
"pin-project-lite",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-stream"
|
||||
version = "0.1.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eca58d7bba4a75707817a2c44174253f9236b2d5fbd055602e9d5c07c139a047"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"pin-project-lite",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tonic"
|
||||
version = "0.13.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "85839f0b32fd242bb3209262371d07feda6d780d16ee9d2bc88581b89da1549b"
|
||||
dependencies = [
|
||||
"base64",
|
||||
"bytes",
|
||||
"http",
|
||||
"http-body",
|
||||
"http-body-util",
|
||||
"percent-encoding",
|
||||
"pin-project",
|
||||
"tokio-stream",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tower-layer"
|
||||
version = "0.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e"
|
||||
|
||||
[[package]]
|
||||
name = "tower-service"
|
||||
version = "0.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3"
|
||||
|
||||
[[package]]
|
||||
name = "tracing"
|
||||
version = "0.1.41"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0"
|
||||
dependencies = [
|
||||
"pin-project-lite",
|
||||
"tracing-attributes",
|
||||
"tracing-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-attributes"
|
||||
version = "0.1.28"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-core"
|
||||
version = "0.1.33"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c"
|
||||
dependencies = [
|
||||
"once_cell",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicode-ident"
|
||||
version = "1.0.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
|
||||
|
||||
[[package]]
|
||||
name = "windows-targets"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
|
||||
dependencies = [
|
||||
"windows_aarch64_gnullvm",
|
||||
"windows_aarch64_msvc",
|
||||
"windows_i686_gnu",
|
||||
"windows_i686_gnullvm",
|
||||
"windows_i686_msvc",
|
||||
"windows_x86_64_gnu",
|
||||
"windows_x86_64_gnullvm",
|
||||
"windows_x86_64_msvc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_gnullvm"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_msvc"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_gnu"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_gnullvm"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_msvc"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnu"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnullvm"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_msvc"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
|
||||
35
pgxn/neon/communicator/Cargo.toml
Normal file
35
pgxn/neon/communicator/Cargo.toml
Normal file
@@ -0,0 +1,35 @@
|
||||
[package]
|
||||
name = "communicator"
|
||||
version = "0.1.0"
|
||||
edition = "2024"
|
||||
|
||||
[lib]
|
||||
crate-type = ["staticlib"]
|
||||
|
||||
[dependencies]
|
||||
bytes.workspace = true
|
||||
http.workspace = true
|
||||
libc.workspace = true
|
||||
nix.workspace = true
|
||||
atomic_enum = "0.3.0"
|
||||
prost.workspace = true
|
||||
tonic = { version = "0.12.0", default-features = false, features=["codegen", "prost", "transport"] }
|
||||
tokio = { version = "1.43.1", features = ["macros", "net", "io-util", "rt", "rt-multi-thread"] }
|
||||
tokio-pipe = { version = "0.2.12" }
|
||||
thiserror.workspace = true
|
||||
tracing.workspace = true
|
||||
tracing-subscriber.workspace = true
|
||||
zerocopy = "0.8.0"
|
||||
zerocopy-derive = "0.8.0"
|
||||
|
||||
tokio-epoll-uring.workspace = true
|
||||
uring-common.workspace = true
|
||||
|
||||
pageserver_client_grpc.workspace = true
|
||||
pageserver_data_api.workspace = true
|
||||
|
||||
neonart.workspace = true
|
||||
utils.workspace = true
|
||||
|
||||
[build-dependencies]
|
||||
cbindgen.workspace = true
|
||||
123
pgxn/neon/communicator/README.md
Normal file
123
pgxn/neon/communicator/README.md
Normal file
@@ -0,0 +1,123 @@
|
||||
# Communicator
|
||||
|
||||
This package provides the so-called "compute-pageserver communicator",
|
||||
or just "communicator" in short. It runs in a PostgreSQL server, as
|
||||
part of the neon extension, and handles the communication with the
|
||||
pageservers. On the PostgreSQL side, the glue code in pgxn/neon/ uses
|
||||
the communicator to implement the PostgreSQL Storage Manager (SMGR)
|
||||
interface.
|
||||
|
||||
## Design criteria
|
||||
|
||||
- Low latency
|
||||
- Saturate a 10 Gbit / s network interface without becoming a bottleneck
|
||||
|
||||
## Source code view
|
||||
|
||||
pgxn/neon/communicator_new.c
|
||||
Contains the glue that interact with PostgreSQL code and the Rust
|
||||
communicator code.
|
||||
|
||||
pgxn/neon/communicator/src/backend_interface.rs
|
||||
The entry point for calls from each backend.
|
||||
|
||||
pgxn/neon/communicator/src/init.rs
|
||||
Initialization at server startup
|
||||
|
||||
pgxn/neon/communicator/src/worker_process/
|
||||
Worker process main loop and glue code
|
||||
|
||||
At compilation time, pgxn/neon/communicator/ produces a static
|
||||
library, libcommunicator.a. It is linked to the neon.so extension
|
||||
library.
|
||||
|
||||
The real networking code, which is independent of PostgreSQL, is in
|
||||
the pageserver/client_grpc crate.
|
||||
|
||||
## Process view
|
||||
|
||||
The communicator runs in a dedicated background worker process, the
|
||||
"communicator process". The communicator uses a multi-threaded Tokio
|
||||
runtime to execute the IO requests. So the communicator process has
|
||||
multiple threads running. That's unusual for Postgres processes and
|
||||
care must be taken to make that work.
|
||||
|
||||
### Backend <-> worker communication
|
||||
|
||||
Each backend has a number of I/O request slots in shared memory. The
|
||||
slots are statically allocated for each backend, and must not be
|
||||
accessed by other backends. The worker process reads requests from the
|
||||
shared memory slots, and writes responses back to the slots.
|
||||
|
||||
To submit an IO request, first pick one of your backend's free slots,
|
||||
and write the details of the IO request in the slot. Finally, update
|
||||
the 'state' field of the slot to Submitted. That informs the worker
|
||||
process that it can start processing the request. Once the state has
|
||||
been set to Submitted, the backend *must not* access the slot anymore,
|
||||
until the worker process sets its state to 'Completed'. In other
|
||||
words, each slot is owned by either the backend or the worker process
|
||||
at all times, and the 'state' field indicates who has ownership at the
|
||||
moment.
|
||||
|
||||
To inform the worker process that a request slot has a pending IO
|
||||
request, there's a pipe shared by the worker process and all backend
|
||||
processes. After you have changed the slot's state to Submitted, write
|
||||
the index of the request slot to the pipe. This wakes up the worker
|
||||
process.
|
||||
|
||||
(Note that the pipe is just used for wakeups, but the worker process
|
||||
is free to pick up Submitted IO requests even without receiving the
|
||||
wakeup. As of this writing, it doesn't do that, but it might be useful
|
||||
in the future to reduce latency even further, for example.)
|
||||
|
||||
When the worker process has completed processing the request, it
|
||||
writes the result back in the request slot. A GetPage request can also
|
||||
contain a pointer to buffer in the shared buffer cache. In that case,
|
||||
the worker process writes the resulting page contents directly to the
|
||||
buffer, and just a result code in the request slot. It then updates
|
||||
the 'state' field to Completed, which passes the owner ship back to
|
||||
the originating backend. Finally, it signals the process Latch of the
|
||||
originating backend, waking it up.
|
||||
|
||||
### Differences between PostgreSQL v16, v17 and v18
|
||||
|
||||
PostgreSQL v18 introduced the new AIO mechanism. The PostgreSQL AIO
|
||||
mechanism uses a very similar mechanism as described in the previous
|
||||
section, for the communication between AIO worker processes and
|
||||
backends. With our communicator, the AIO worker processes are not
|
||||
used, but we use the same PgAioHandle request slots as in upstream.
|
||||
For Neon-specific IO requests like GetDbSize, a neon request slot is
|
||||
used. But for the actual IO requests, the request slot merely contains
|
||||
a pointer to the PgAioHandle slot. The worker process updates the
|
||||
status of that, calls the IO callbacks upon completionetc, just like
|
||||
the upstream AIO worker processes do.
|
||||
|
||||
## Sequence diagram
|
||||
|
||||
neon
|
||||
PostgreSQL extension backend_interface.rs worker_process.rs processor tonic
|
||||
| . . . .
|
||||
| smgr_read() . . . .
|
||||
+-------------> + . . .
|
||||
. | . . .
|
||||
. | rcommunicator_ . . .
|
||||
. | get_page_at_lsn . . .
|
||||
. +------------------> + . .
|
||||
| . .
|
||||
| write request to . . .
|
||||
| slot . .
|
||||
| . .
|
||||
| . .
|
||||
| submit_request() . .
|
||||
+-----------------> + .
|
||||
| | .
|
||||
| | db_size_request . .
|
||||
+---------------->.
|
||||
. TODO
|
||||
|
||||
|
||||
|
||||
### Compute <-> pageserver protocol
|
||||
|
||||
The protocol between Compute and the pageserver is based on gRPC. See `protos/`.
|
||||
|
||||
24
pgxn/neon/communicator/build.rs
Normal file
24
pgxn/neon/communicator/build.rs
Normal file
@@ -0,0 +1,24 @@
|
||||
use cbindgen;
|
||||
|
||||
use std::env;
|
||||
|
||||
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
let crate_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
|
||||
|
||||
cbindgen::generate(crate_dir).map_or_else(
|
||||
|error| match error {
|
||||
cbindgen::Error::ParseSyntaxError { .. } => {
|
||||
// This means there was a syntax error in the Rust sources. Don't panic, because
|
||||
// we want the build to continue and the Rust compiler to hit the error. The
|
||||
// Rust compiler produces a better error message than cbindgen.
|
||||
eprintln!("Generating C bindings failed because of a Rust syntax error");
|
||||
}
|
||||
e => panic!("Unable to generate C bindings: {:?}", e),
|
||||
},
|
||||
|bindings| {
|
||||
bindings.write_to_file("communicator_bindings.h");
|
||||
},
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
4
pgxn/neon/communicator/cbindgen.toml
Normal file
4
pgxn/neon/communicator/cbindgen.toml
Normal file
@@ -0,0 +1,4 @@
|
||||
language = "C"
|
||||
|
||||
[enum]
|
||||
prefix_with_name = true
|
||||
204
pgxn/neon/communicator/src/backend_comms.rs
Normal file
204
pgxn/neon/communicator/src/backend_comms.rs
Normal file
@@ -0,0 +1,204 @@
|
||||
//! This module implements a request/response "slot" for submitting requests from backends
|
||||
//! to the communicator process.
|
||||
//!
|
||||
//! NB: The "backend" side of this code runs in Postgres backend processes,
|
||||
//! which means that it is not safe to use the 'tracing' crate for logging, nor
|
||||
//! to launch threads or use tokio tasks.
|
||||
use std::cell::UnsafeCell;
|
||||
use std::sync::atomic::fence;
|
||||
use std::sync::atomic::{AtomicI32, Ordering};
|
||||
|
||||
use crate::neon_request::{NeonIORequest, NeonIOResult};
|
||||
|
||||
use atomic_enum::atomic_enum;
|
||||
|
||||
/// One request/response slot. Each backend has its own set of slots that it uses.
|
||||
///
|
||||
/// This is the moral equivalent of PgAioHandle for Postgres AIO requests
|
||||
/// Like PgAioHandle, try to keep this small.
|
||||
///
|
||||
/// There is an array of these in shared memory. Therefore, this must be Sized.
|
||||
///
|
||||
/// ## Lifecycle of a request
|
||||
///
|
||||
/// The slot is always owned by either the backend process or the communicator
|
||||
/// process, depending on the 'state'. Only the owning process is allowed to
|
||||
/// read or modify the slot, except for reading the 'state' itself to check who
|
||||
/// owns it.
|
||||
///
|
||||
/// A slot begins in the Idle state, where it is owned by the backend process.
|
||||
/// To submit a request, the backend process fills the slot with the request
|
||||
/// data, and changes it to the Submitted state. After changing the state, the
|
||||
/// slot is owned by the communicator process, and the backend is not allowed
|
||||
/// to access it until the communicator process marks it as Completed.
|
||||
///
|
||||
/// When the communicator process sees that the slot is in Submitted state, it
|
||||
/// starts to process the request. After processing the request, it stores the
|
||||
/// result in the slot, and changes the state to Completed. It is now owned by
|
||||
/// the backend process again, which may now read the result, and reuse the
|
||||
/// slot for a new request.
|
||||
///
|
||||
/// For correctness of the above protocol, we really only need two states:
|
||||
/// "owned by backend" and "owned by communicator process. But to help with
|
||||
/// debugging, there are a few more states. When the backend starts to fill in
|
||||
/// the request details in the slot, it first sets the state from Idle to
|
||||
/// Filling, and when it's done with that, from Filling to Submitted. In the
|
||||
/// Filling state, the slot is still owned by the backend. Similarly, when the
|
||||
/// communicator process starts to process a request, it sets it to Processing
|
||||
/// state first, but the slot is still owned by the communicator process.
|
||||
///
|
||||
/// This struct doesn't handle waking up the communicator process when a request
|
||||
/// has been submitted or when a response is ready. We only store the 'owner_procno'
|
||||
/// which can be used for waking up the backend on completion, but the wakeups are
|
||||
/// performed elsewhere.
|
||||
pub struct NeonIOHandle {
    /// Tracks which side currently owns the slot; similar to PgAioHandleState
    state: AtomicNeonIOHandleState,

    /// The owning process's ProcNumber. The worker process uses this to set the process's
    /// latch on completion.
    ///
    /// (This could be calculated from num_neon_request_slots_per_backend and the index of
    /// this slot in the overall 'neon_request_slots' array)
    owner_procno: AtomicI32,

    /// The request payload. Valid while state is Submitted or Processing.
    ///
    /// SAFETY: This is modified by fill_request(), after it has established ownership
    /// of the slot by setting state from Idle to Filling
    request: UnsafeCell<NeonIORequest>,

    /// valid when state is Completed
    ///
    /// SAFETY: This is modified by RequestProcessingGuard::completed(). There can be
    /// only one RequestProcessingGuard outstanding for a slot at a time, because
    /// it is returned by start_processing_request() which checks the state, so
    /// RequestProcessingGuard has exclusive access to the slot.
    result: UnsafeCell<NeonIOResult>,
}
|
||||
|
||||
// The protocol described in the "Lifecycle of a request" section above ensures
|
||||
// the safe access to the fields
|
||||
unsafe impl Send for NeonIOHandle {}
|
||||
unsafe impl Sync for NeonIOHandle {}
|
||||
|
||||
impl Default for NeonIOHandle {
|
||||
fn default() -> NeonIOHandle {
|
||||
NeonIOHandle {
|
||||
owner_procno: AtomicI32::new(-1),
|
||||
request: UnsafeCell::new(NeonIORequest::Empty),
|
||||
result: UnsafeCell::new(NeonIOResult::Empty),
|
||||
state: AtomicNeonIOHandleState::new(NeonIOHandleState::Idle),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[atomic_enum]
#[derive(Eq, PartialEq)]
pub enum NeonIOHandleState {
    /// Slot is free, owned by the backend process.
    Idle,

    /// backend is filling in the request
    Filling,

    /// Backend has submitted the request to the communicator, but the
    /// communicator process has not yet started processing it.
    Submitted,

    /// Communicator is processing the request
    Processing,

    /// Communicator has completed the request, and the 'result' field is now
    /// valid, but the backend has not read the result yet.
    Completed,
}
|
||||
|
||||
/// Grants the communicator process exclusive access to one request slot while
/// it processes the request. Obtained from NeonIOHandle::start_processing_request();
/// consumed by completed(), which hands the slot back to the backend.
pub struct RequestProcessingGuard<'a>(&'a NeonIOHandle);

// SAFETY: the ownership protocol described on NeonIOHandle guarantees that at
// most one guard exists per slot, so it can move between threads/tasks.
unsafe impl<'a> Send for RequestProcessingGuard<'a> {}
unsafe impl<'a> Sync for RequestProcessingGuard<'a> {}

impl<'a> RequestProcessingGuard<'a> {
    /// The request stored in the slot by the backend.
    pub fn get_request(&self) -> &NeonIORequest {
        unsafe { &*self.0.request.get() }
    }

    /// ProcNumber of the backend that submitted the request (used for the
    /// latch wakeup, which is performed elsewhere).
    pub fn get_owner_procno(&self) -> i32 {
        self.0.owner_procno.load(Ordering::Relaxed)
    }

    /// Store the result and pass slot ownership back to the backend.
    pub fn completed(self, result: NeonIOResult) {
        unsafe {
            *self.0.result.get() = result;
        };

        // Ok, we have completed the IO. Mark the request as completed. After that,
        // we no longer have ownership of the slot, and must not modify it.
        // The Release ordering publishes the result write to the backend's
        // matching Acquire fence in try_get_result().
        let old_state = self
            .0
            .state
            .swap(NeonIOHandleState::Completed, Ordering::Release);
        assert!(old_state == NeonIOHandleState::Processing);
    }
}
|
||||
|
||||
impl NeonIOHandle {
    /// Fill the slot with 'request' and mark it Submitted, passing ownership
    /// of the slot to the communicator process. 'proc_number' identifies the
    /// submitting backend so the communicator can set its latch on completion.
    ///
    /// Note: this only publishes the request; waking up the communicator (via
    /// the submission pipe) is done separately by the caller.
    pub fn fill_request(&self, request: &NeonIORequest, proc_number: i32) {
        // Verify that the slot is in Idle state previously, and start filling it.
        //
        // XXX: This step isn't strictly necessary. Assuming the caller didn't screw up
        // and try to use a slot that's already in use, we could fill the slot and
        // switch it directly from Idle to Submitted state.
        if let Err(s) = self.state.compare_exchange(
            NeonIOHandleState::Idle,
            NeonIOHandleState::Filling,
            Ordering::Relaxed,
            Ordering::Relaxed,
        ) {
            panic!("unexpected state in request slot: {s:?}");
        }

        // This fence synchronizes-with store/swap in `communicator_process_main_loop`.
        fence(Ordering::Acquire);

        self.owner_procno.store(proc_number, Ordering::Relaxed);
        unsafe { *self.request.get() = *request }
        // Release publishes the request contents to the communicator's Acquire
        // fence in start_processing_request().
        self.state
            .store(NeonIOHandleState::Submitted, Ordering::Release);
    }

    /// Poll for the result of a previously submitted request. Returns None
    /// while the request is still in flight. On Some, the slot has been
    /// returned to Idle and may be reused by this backend.
    pub fn try_get_result(&self) -> Option<NeonIOResult> {
        // FIXME: ordering?
        let state = self.state.load(Ordering::Relaxed);
        if state == NeonIOHandleState::Completed {
            // This fence synchronizes-with store/swap in `communicator_process_main_loop`.
            fence(Ordering::Acquire);
            let result = unsafe { *self.result.get() };
            self.state.store(NeonIOHandleState::Idle, Ordering::Relaxed);
            Some(result)
        } else {
            None
        }
    }

    /// Claim a Submitted slot for processing (communicator process only).
    /// Returns a guard with exclusive access to the slot; complete the request
    /// with RequestProcessingGuard::completed().
    ///
    /// NOTE(review): currently either returns Some or panics; the Option
    /// return anticipates the more aggressive processing described below.
    pub fn start_processing_request<'a>(&'a self) -> Option<RequestProcessingGuard<'a>> {
        // Read the IO request from the slot indicated in the wakeup
        //
        // XXX: using compare_exchange for this is not strictly necessary, as long as
        // the communicator process has _some_ means of tracking which requests it's
        // already processing. That could be a flag somewhere in communicator's private
        // memory, for example.
        if let Err(s) = self.state.compare_exchange(
            NeonIOHandleState::Submitted,
            NeonIOHandleState::Processing,
            Ordering::Relaxed,
            Ordering::Relaxed,
        ) {
            // FIXME surprising state. This is unexpected at the moment, but if we
            // started to process requests more aggressively, without waiting for the
            // read from the pipe, then this could happen
            panic!("unexpected state in request slot: {s:?}");
        }
        // Pairs with the backend's Release store of Submitted in fill_request(),
        // making the request contents visible to this process.
        fence(Ordering::Acquire);

        Some(RequestProcessingGuard(self))
    }
}
|
||||
196
pgxn/neon/communicator/src/backend_interface.rs
Normal file
196
pgxn/neon/communicator/src/backend_interface.rs
Normal file
@@ -0,0 +1,196 @@
|
||||
//! This code runs in each backend process. That means that launching Rust threads, panicking
|
||||
//! etc. is forbidden!
|
||||
|
||||
use crate::backend_comms::NeonIOHandle;
|
||||
use crate::init::CommunicatorInitStruct;
|
||||
use crate::integrated_cache::{BackendCacheReadOp, IntegratedCacheReadAccess};
|
||||
use crate::neon_request::CCachedGetPageVResult;
|
||||
use crate::neon_request::{NeonIORequest, NeonIOResult};
|
||||
|
||||
/// Per-backend, backend-private state for talking to the communicator process.
/// Created once per backend by rcommunicator_backend_init().
pub struct CommunicatorBackendStruct<'t> {
    /// This backend's ProcNumber; stored into request slots so the
    /// communicator knows whose latch to set on completion.
    my_proc_number: i32,

    /// Round-robin cursor over this backend's slot range.
    next_neon_request_idx: u32,

    my_start_idx: u32, // First request slot that belongs to this backend
    my_end_idx: u32, // end + 1 request slot that belongs to this backend

    /// All request slots in shared memory (this backend only touches the
    /// range my_start_idx..my_end_idx).
    neon_request_slots: &'t [NeonIOHandle],

    /// Write end of the pipe used to wake up the communicator process.
    submission_pipe_write_fd: std::ffi::c_int,

    /// Cache read started by bcomm_start_get_page_v_request() and finished by
    /// bcomm_finish_cache_read().
    pending_cache_read_op: Option<BackendCacheReadOp<'t>>,

    /// Read-only view of the integrated cache in shared memory.
    integrated_cache: &'t IntegratedCacheReadAccess<'t>,
}
|
||||
|
||||
/// Per-backend initialization, called once when a backend process starts up.
///
/// Consumes the init struct inherited from postmaster through fork() and
/// returns this backend's private state. The allocations are intentionally
/// leaked: they live for the rest of the backend's lifetime.
///
/// NOTE(review): assumes 0 <= my_proc_number < max_procs so that the computed
/// slot range stays within neon_request_slots — confirm at the C call site.
#[unsafe(no_mangle)]
pub extern "C" fn rcommunicator_backend_init(
    cis: Box<CommunicatorInitStruct>,
    my_proc_number: i32,
) -> &'static mut CommunicatorBackendStruct<'static> {
    // Each backend owns a contiguous range of request slots.
    let start_idx = my_proc_number as u32 * cis.num_neon_request_slots_per_backend;
    let end_idx = start_idx + cis.num_neon_request_slots_per_backend;

    let integrated_cache = Box::leak(Box::new(cis.integrated_cache_init_struct.backend_init()));

    let bs: &'static mut CommunicatorBackendStruct =
        Box::leak(Box::new(CommunicatorBackendStruct {
            my_proc_number,
            next_neon_request_idx: start_idx,
            my_start_idx: start_idx,
            my_end_idx: end_idx,
            neon_request_slots: cis.neon_request_slots,

            submission_pipe_write_fd: cis.submission_pipe_write_fd,
            pending_cache_read_op: None,

            integrated_cache,
        }));
    bs
}
|
||||
|
||||
/// Start a request. You can poll for its completion and get the result by
/// calling bcomm_poll_request_completion(). The communicator will wake
/// us up by setting our process latch, so to wait for the completion, wait on
/// the latch and call bcomm_poll_request_completion() every time the
/// latch is set.
///
/// Returns -1 if the request was satisfied immediately from the cache (the
/// result is stored in *immediate_result_ptr); otherwise returns the request
/// slot index to poll.
///
/// Safety: The C caller must ensure that the references are valid.
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_start_io_request<'t>(
    bs: &'t mut CommunicatorBackendStruct,
    request: &NeonIORequest,
    immediate_result_ptr: &mut NeonIOResult,
) -> i32 {
    assert!(bs.pending_cache_read_op.is_none());

    // Check if the request can be satisfied from the cache first
    if let NeonIORequest::RelSize(req) = request {
        if let Some(nblocks) = bs.integrated_cache.get_rel_size(&req.reltag()) {
            *immediate_result_ptr = NeonIOResult::RelSize(nblocks);
            return -1;
        }
    }

    // Create neon request and submit it
    let request_idx = bs.start_neon_request(request);

    // Tell the communicator about it
    bs.submit_request(request_idx);

    return request_idx;
}
|
||||
|
||||
#[unsafe(no_mangle)]
|
||||
pub extern "C" fn bcomm_start_get_page_v_request<'t>(
|
||||
bs: &'t mut CommunicatorBackendStruct,
|
||||
request: &NeonIORequest,
|
||||
immediate_result_ptr: &mut CCachedGetPageVResult,
|
||||
) -> i32 {
|
||||
let NeonIORequest::GetPageV(get_pagev_request) = request else {
|
||||
panic!("invalid request passed to bcomm_start_get_page_v_request()");
|
||||
};
|
||||
assert!(matches!(request, NeonIORequest::GetPageV(_)));
|
||||
assert!(bs.pending_cache_read_op.is_none());
|
||||
|
||||
// Check if the request can be satisfied from the cache first
|
||||
let mut all_cached = true;
|
||||
let read_op = bs.integrated_cache.start_read_op();
|
||||
for i in 0..get_pagev_request.nblocks {
|
||||
if let Some(cache_block) = read_op.get_page(
|
||||
&get_pagev_request.reltag(),
|
||||
get_pagev_request.block_number + i as u32,
|
||||
) {
|
||||
(*immediate_result_ptr).cache_block_numbers[i as usize] = cache_block;
|
||||
} else {
|
||||
// not found in cache
|
||||
all_cached = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if all_cached {
|
||||
bs.pending_cache_read_op = Some(read_op);
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Create neon request and submit it
|
||||
let request_idx = bs.start_neon_request(request);
|
||||
|
||||
// Tell the communicator about it
|
||||
bs.submit_request(request_idx);
|
||||
|
||||
return request_idx;
|
||||
}
|
||||
|
||||
/// Check if a request has completed. Returns:
|
||||
///
|
||||
/// -1 if the request is still being processed
|
||||
/// 0 on success
|
||||
#[unsafe(no_mangle)]
|
||||
pub extern "C" fn bcomm_poll_request_completion(
|
||||
bs: &mut CommunicatorBackendStruct,
|
||||
request_idx: u32,
|
||||
result_p: &mut NeonIOResult,
|
||||
) -> i32 {
|
||||
match bs.neon_request_slots[request_idx as usize].try_get_result() {
|
||||
None => -1, // still processing
|
||||
Some(result) => {
|
||||
*result_p = result;
|
||||
0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// LFC functions
|
||||
|
||||
/// Finish a local file cache read
|
||||
///
|
||||
//
|
||||
#[unsafe(no_mangle)]
|
||||
pub extern "C" fn bcomm_finish_cache_read(bs: &mut CommunicatorBackendStruct) -> bool {
|
||||
if let Some(op) = bs.pending_cache_read_op.take() {
|
||||
op.finish()
|
||||
} else {
|
||||
panic!("bcomm_finish_cache_read() called with no cached read pending");
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t> CommunicatorBackendStruct<'t> {
    /// Send a wakeup to the communicator process
    fn submit_request(self: &CommunicatorBackendStruct<'t>, request_idx: i32) {
        // wake up communicator by writing the idx to the submission pipe
        //
        // This can block, if the pipe is full. That should be very rare,
        // because the communicator tries hard to drain the pipe to prevent
        // that. Also, there's a natural upper bound on how many wakeups can be
        // queued up: there is only a limited number of request slots for each
        // backend.
        //
        // If it does block very briefly, that's not too serious.
        let idxbuf = request_idx.to_ne_bytes();
        let _res = nix::unistd::write(self.submission_pipe_write_fd, &idxbuf);
        // FIXME: check result, return any errors
    }

    /// Pick the next request slot, publish 'request' in it (state becomes
    /// Submitted), and return the slot index.
    ///
    /// Note: there's no guarantee on when the communicator might pick it up. You should ring
    /// the doorbell. But it might pick it up immediately.
    ///
    /// NOTE(review): slots are reused round-robin; if all of this backend's
    /// slots are still in flight, fill_request() will panic on the non-Idle
    /// slot (see the FIXME below).
    pub(crate) fn start_neon_request(&mut self, request: &NeonIORequest) -> i32 {
        let my_proc_number = self.my_proc_number;

        // Grab next free slot
        // FIXME: any guarantee that there will be any?
        let idx = self.next_neon_request_idx;

        // Advance the cursor, wrapping around within this backend's range.
        let next_idx = idx + 1;
        self.next_neon_request_idx = if next_idx == self.my_end_idx {
            self.my_start_idx
        } else {
            next_idx
        };

        self.neon_request_slots[idx as usize].fill_request(request, my_proc_number);

        return idx as i32;
    }
}
|
||||
109
pgxn/neon/communicator/src/file_cache.rs
Normal file
109
pgxn/neon/communicator/src/file_cache.rs
Normal file
@@ -0,0 +1,109 @@
|
||||
//! Implement the "low-level" parts of the file cache.
|
||||
//!
|
||||
//! This module just deals with reading and writing the file, and keeping track
|
||||
//! which blocks in the cache file are in use and which are free. The "high
|
||||
//! level" parts of tracking which block in the cache file corresponds to which
|
||||
//! relation block is handled in 'integrated_cache' instead.
|
||||
//!
|
||||
//! This module is only used to access the file from the communicator
|
||||
//! process. The backend processes *also* read the file (and sometimes also
|
||||
//! write it? ), but the backends use direct C library calls for that.
|
||||
use std::fs::File;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
|
||||
use tokio_epoll_uring;
|
||||
|
||||
use crate::BLCKSZ;
|
||||
|
||||
pub type CacheBlock = u64;
|
||||
|
||||
/// Handle to the local cache file, accessed via io_uring.
pub struct FileCache {
    /// Handle for submitting io_uring operations.
    uring_system: tokio_epoll_uring::SystemHandle,

    /// The cache file. Arc because ownership is passed into io_uring
    /// operations while they are in flight.
    file: Arc<File>,

    // TODO: there's no reclamation mechanism, the cache grows
    // indefinitely. This is the next free block, i.e. the current
    // size of the file
    next_free_block: AtomicU64,
}
|
||||
|
||||
impl FileCache {
|
||||
pub fn new(
|
||||
file_cache_path: &Path,
|
||||
uring_system: tokio_epoll_uring::SystemHandle,
|
||||
) -> Result<FileCache, std::io::Error> {
|
||||
let file = std::fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
.truncate(true)
|
||||
.create(true)
|
||||
.open(file_cache_path)?;
|
||||
|
||||
tracing::info!("Created cache file {file_cache_path:?}");
|
||||
|
||||
Ok(FileCache {
|
||||
file: Arc::new(file),
|
||||
uring_system,
|
||||
next_free_block: AtomicU64::new(0),
|
||||
})
|
||||
}
|
||||
|
||||
// File cache management
|
||||
|
||||
pub async fn read_block(
|
||||
&self,
|
||||
cache_block: CacheBlock,
|
||||
dst: impl uring_common::buf::IoBufMut + Send + Sync,
|
||||
) -> Result<(), std::io::Error> {
|
||||
assert!(dst.bytes_total() == BLCKSZ);
|
||||
let file = self.file.clone();
|
||||
|
||||
let ((_file, _buf), res) = self
|
||||
.uring_system
|
||||
.read(file, cache_block as u64 * BLCKSZ as u64, dst)
|
||||
.await;
|
||||
|
||||
let res = res.map_err(map_io_uring_error)?;
|
||||
if res != BLCKSZ {
|
||||
panic!("unexpected read result");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn write_block(
|
||||
&self,
|
||||
cache_block: CacheBlock,
|
||||
src: impl uring_common::buf::IoBuf + Send + Sync,
|
||||
) -> Result<(), std::io::Error> {
|
||||
assert!(src.bytes_init() == BLCKSZ);
|
||||
let file = self.file.clone();
|
||||
|
||||
let ((_file, _buf), res) = self
|
||||
.uring_system
|
||||
.write(file, cache_block as u64 * BLCKSZ as u64, src)
|
||||
.await;
|
||||
let res = res.map_err(map_io_uring_error)?;
|
||||
if res != BLCKSZ {
|
||||
panic!("unexpected read result");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn alloc_block(&self) -> CacheBlock {
|
||||
self.next_free_block.fetch_add(1, Ordering::Relaxed)
|
||||
}
|
||||
}
|
||||
|
||||
fn map_io_uring_error(err: tokio_epoll_uring::Error<std::io::Error>) -> std::io::Error {
|
||||
match err {
|
||||
tokio_epoll_uring::Error::Op(err) => err,
|
||||
tokio_epoll_uring::Error::System(err) => {
|
||||
std::io::Error::new(std::io::ErrorKind::Other, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
130
pgxn/neon/communicator/src/init.rs
Normal file
130
pgxn/neon/communicator/src/init.rs
Normal file
@@ -0,0 +1,130 @@
|
||||
//! Initialization functions. These are executed in the postmaster process,
|
||||
//! at different stages of server startup.
|
||||
//!
|
||||
//!
|
||||
//! Communicator initialization steps:
|
||||
//!
|
||||
//! 1. At postmaster startup, before shared memory is allocated,
|
||||
//! rcommunicator_shmem_size() is called to get the amount of
|
||||
//! shared memory that this module needs.
|
||||
//!
|
||||
//! 2. Later, after the shared memory has been allocated,
|
||||
//! rcommunicator_shmem_init() is called to initialize the shmem
|
||||
//! area.
|
||||
//!
|
||||
//! Per process initialization:
|
||||
//!
|
||||
//! When a backend process starts up, it calls rcommunicator_backend_init().
|
||||
//! In the communicator worker process, other functions are called, see
|
||||
//! `worker_process` module.
|
||||
|
||||
use std::ffi::c_int;
|
||||
use std::mem;
|
||||
|
||||
use crate::backend_comms::NeonIOHandle;
|
||||
use crate::integrated_cache::IntegratedCacheInitStruct;
|
||||
|
||||
const NUM_NEON_REQUEST_SLOTS_PER_BACKEND: u32 = 5;
|
||||
|
||||
/// This struct is created in the postmaster process, and inherited to
|
||||
/// the communicator process and all backend processes through fork()
|
||||
#[repr(C)]
|
||||
pub struct CommunicatorInitStruct {
    /// Maximum number of Postgres processes; bounds the request-slot array.
    #[allow(dead_code)]
    pub max_procs: u32,

    /// Pipe used by backends to wake up the communicator process.
    pub submission_pipe_read_fd: std::ffi::c_int,
    pub submission_pipe_write_fd: std::ffi::c_int,

    // Shared memory data structures
    /// Number of request slots reserved for each backend.
    pub num_neon_request_slots_per_backend: u32,

    /// All request slots in shared memory
    /// (max_procs * num_neon_request_slots_per_backend of them).
    pub neon_request_slots: &'static [NeonIOHandle],

    /// Init struct for the integrated cache (also backed by shared memory).
    pub integrated_cache_init_struct: IntegratedCacheInitStruct<'static>,
}
|
||||
|
||||
// Manual Debug impl: prints the slot count instead of dumping every slot
// (NeonIOHandle presumably doesn't implement Debug — TODO confirm).
impl std::fmt::Debug for CommunicatorInitStruct {
    fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
        fmt.debug_struct("CommunicatorInitStruct")
            .field("max_procs", &self.max_procs)
            .field("submission_pipe_read_fd", &self.submission_pipe_read_fd)
            .field("submission_pipe_write_fd", &self.submission_pipe_write_fd)
            .field(
                "num_neon_request_slots_per_backend",
                &self.num_neon_request_slots_per_backend,
            )
            .field("neon_request_slots length", &self.neon_request_slots.len())
            .finish()
    }
}
|
||||
|
||||
#[unsafe(no_mangle)]
|
||||
pub extern "C" fn rcommunicator_shmem_size(max_procs: u32) -> u64 {
|
||||
let mut size = 0;
|
||||
|
||||
let num_neon_request_slots = max_procs * NUM_NEON_REQUEST_SLOTS_PER_BACKEND;
|
||||
size += mem::size_of::<NeonIOHandle>() * num_neon_request_slots as usize;
|
||||
|
||||
// For integrated_cache's Allocator. TODO: make this adjustable
|
||||
size += IntegratedCacheInitStruct::shmem_size(max_procs);
|
||||
|
||||
size as u64
|
||||
}
|
||||
|
||||
/// Initialize the shared memory segment. Returns a backend-private
|
||||
/// struct, which will be inherited by backend processes through fork
|
||||
#[unsafe(no_mangle)]
|
||||
pub extern "C" fn rcommunicator_shmem_init(
|
||||
submission_pipe_read_fd: c_int,
|
||||
submission_pipe_write_fd: c_int,
|
||||
max_procs: u32,
|
||||
shmem_area_ptr: *mut u8,
|
||||
shmem_area_len: u64,
|
||||
) -> &'static mut CommunicatorInitStruct {
|
||||
let mut ptr = shmem_area_ptr;
|
||||
|
||||
// Carve out the request slots from the shmem area and initialize them
|
||||
let num_neon_request_slots_per_backend = NUM_NEON_REQUEST_SLOTS_PER_BACKEND;
|
||||
let num_neon_request_slots = max_procs * num_neon_request_slots_per_backend;
|
||||
|
||||
let len_used;
|
||||
let neon_request_slots: &mut [NeonIOHandle] = unsafe {
|
||||
ptr = ptr.add(ptr.align_offset(std::mem::align_of::<NeonIOHandle>()));
|
||||
let neon_request_slots_ptr: *mut NeonIOHandle = ptr.cast();
|
||||
for _i in 0..num_neon_request_slots {
|
||||
let slot: *mut NeonIOHandle = ptr.cast();
|
||||
*slot = NeonIOHandle::default();
|
||||
ptr = ptr.byte_add(mem::size_of::<NeonIOHandle>());
|
||||
}
|
||||
len_used = ptr.byte_offset_from(shmem_area_ptr) as usize;
|
||||
assert!(len_used <= shmem_area_len as usize);
|
||||
|
||||
std::slice::from_raw_parts_mut(neon_request_slots_ptr, num_neon_request_slots as usize)
|
||||
};
|
||||
|
||||
let remaining_area =
|
||||
unsafe { std::slice::from_raw_parts_mut(ptr, shmem_area_len as usize - len_used) };
|
||||
|
||||
// Give the rest of the area to the integrated cache
|
||||
let integrated_cache_init_struct =
|
||||
IntegratedCacheInitStruct::shmem_init(max_procs, remaining_area);
|
||||
|
||||
eprintln!(
|
||||
"PIPE READ {} WRITE {}",
|
||||
submission_pipe_read_fd, submission_pipe_write_fd
|
||||
);
|
||||
|
||||
let cis: &'static mut CommunicatorInitStruct = Box::leak(Box::new(CommunicatorInitStruct {
|
||||
max_procs,
|
||||
submission_pipe_read_fd,
|
||||
submission_pipe_write_fd,
|
||||
|
||||
num_neon_request_slots_per_backend: NUM_NEON_REQUEST_SLOTS_PER_BACKEND,
|
||||
neon_request_slots,
|
||||
|
||||
integrated_cache_init_struct,
|
||||
}));
|
||||
|
||||
cis
|
||||
}
|
||||
423
pgxn/neon/communicator/src/integrated_cache.rs
Normal file
423
pgxn/neon/communicator/src/integrated_cache.rs
Normal file
@@ -0,0 +1,423 @@
|
||||
//! Integrated communicator cache
|
||||
//!
|
||||
//! Tracks:
|
||||
//! - Relation sizes and existence
|
||||
//! - Last-written LSN
|
||||
//! - TODO: Block cache (also known as LFC)
|
||||
//!
|
||||
//! TODO: limit the size
|
||||
//! TODO: concurrency
|
||||
//!
|
||||
//! Note: This deals with "relations", which is really just one "relation fork" in Postgres
|
||||
//! terms. RelFileLocator + ForkNumber is the key.
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::file_cache::{CacheBlock, FileCache};
|
||||
use pageserver_data_api::model::RelTag;
|
||||
|
||||
use neonart;
|
||||
use neonart::TreeInitStruct;
|
||||
|
||||
const CACHE_AREA_SIZE: usize = 10 * 1024 * 1024;
|
||||
|
||||
/// This struct is stored in the shared memory segment.
struct IntegratedCacheShmemData {
    /// Shared-memory allocator backing the cache's ART tree.
    allocator: neonart::Allocator,
}
|
||||
|
||||
/// This struct is initialized at postmaster startup, and passed to all the processes via fork().
pub struct IntegratedCacheInitStruct<'t> {
    /// Pointer to the shared-memory portion (the allocator).
    shmem_data: &'t IntegratedCacheShmemData,
    /// Handle used to attach per-process readers/writers to the shared tree.
    handle: TreeInitStruct<'t, TreeKey, TreeEntry>,
}
|
||||
|
||||
/// Represents write-access to the integrated cache. This is used by the communicator process.
pub struct IntegratedCacheWriteAccess<'t> {
    cache_tree: neonart::TreeWriteAccess<'t, TreeKey, TreeEntry>,

    /// Fallback last-written LSN, used for blocks/relations that have no
    /// entry in the tree.
    global_lw_lsn: Lsn,

    /// Local file cache for page contents, if enabled.
    file_cache: Option<FileCache>,
}
|
||||
|
||||
/// Represents read-only access to the integrated cache. Backend processes have this.
pub struct IntegratedCacheReadAccess<'t> {
    /// Read handle to the shared ART tree.
    cache_tree: neonart::TreeReadAccess<'t, TreeKey, TreeEntry>,
}
|
||||
|
||||
impl<'t> IntegratedCacheInitStruct<'t> {
|
||||
/// Return the desired size in bytes of the shared memory area to reserve for the integrated
|
||||
/// cache.
|
||||
pub fn shmem_size(_max_procs: u32) -> usize {
|
||||
CACHE_AREA_SIZE
|
||||
}
|
||||
|
||||
/// Initialize the shared memory segment. This runs once in postmaster. Returns a struct which
|
||||
/// will be inherited by all processes through fork.
|
||||
pub fn shmem_init(_max_procs: u32, shmem_area: &'t mut [u8]) -> IntegratedCacheInitStruct<'t> {
|
||||
assert!(shmem_area.len() > std::mem::size_of::<IntegratedCacheShmemData>());
|
||||
|
||||
let mut ptr = shmem_area.as_mut_ptr();
|
||||
let shmem_data_ptr;
|
||||
let len_used;
|
||||
unsafe {
|
||||
ptr = ptr.byte_add(ptr.align_offset(align_of::<IntegratedCacheShmemData>()));
|
||||
shmem_data_ptr = ptr.cast::<IntegratedCacheShmemData>();
|
||||
ptr = ptr.byte_add(std::mem::size_of::<IntegratedCacheShmemData>());
|
||||
len_used = ptr.byte_offset_from(shmem_area.as_mut_ptr()) as usize;
|
||||
};
|
||||
assert!(len_used < shmem_area.len());
|
||||
|
||||
let area_ptr = ptr;
|
||||
let area_size = shmem_area.len() - len_used;
|
||||
|
||||
let cache_area: &mut [u8] = unsafe { std::slice::from_raw_parts_mut(area_ptr, area_size) };
|
||||
let allocator = neonart::Allocator::new(cache_area);
|
||||
|
||||
// Initialize the shared memory area
|
||||
let shmem_data = unsafe {
|
||||
*shmem_data_ptr = IntegratedCacheShmemData { allocator };
|
||||
&*shmem_data_ptr
|
||||
};
|
||||
|
||||
let tree_handle = TreeInitStruct::new(&shmem_data.allocator);
|
||||
|
||||
IntegratedCacheInitStruct {
|
||||
shmem_data,
|
||||
handle: tree_handle,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn worker_process_init(
|
||||
self,
|
||||
lsn: Lsn,
|
||||
file_cache: Option<FileCache>,
|
||||
) -> IntegratedCacheWriteAccess<'t> {
|
||||
let IntegratedCacheInitStruct {
|
||||
shmem_data: _shmem,
|
||||
handle,
|
||||
} = self;
|
||||
let tree_writer = handle.attach_writer();
|
||||
|
||||
IntegratedCacheWriteAccess {
|
||||
cache_tree: tree_writer,
|
||||
global_lw_lsn: lsn,
|
||||
file_cache,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn backend_init(self) -> IntegratedCacheReadAccess<'t> {
|
||||
let IntegratedCacheInitStruct {
|
||||
shmem_data: _shmem,
|
||||
handle,
|
||||
} = self;
|
||||
|
||||
let tree_reader = handle.attach_reader();
|
||||
|
||||
IntegratedCacheReadAccess {
|
||||
cache_tree: tree_reader,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A tree entry is either a per-relation entry or a per-block entry.
#[derive(Clone)]
enum TreeEntry {
    Rel(RelEntry),
    Block(BlockEntry),
}

/// Per-block cache entry.
#[derive(Clone)]
struct BlockEntry {
    /// Last-written LSN of this block.
    lw_lsn: Lsn,
    /// Block in the local cache file holding the page contents, if cached.
    cache_block: Option<CacheBlock>,
}

/// Per-relation cache entry.
#[derive(Clone, Default)]
struct RelEntry {
    /// cached size of the relation
    nblocks: Option<u32>,
}
|
||||
|
||||
#[derive(
    Clone,
    Debug,
    PartialEq,
    PartialOrd,
    Eq,
    Ord,
    zerocopy_derive::IntoBytes,
    zerocopy_derive::Immutable,
)]
#[repr(packed)]
struct TreeKey {
    // repr(packed): the field order defines the serialized key layout.
    // NOTE(review): integers are stored native-endian, so on little-endian
    // hardware the byte order is not numerically lexicographic — confirm the
    // tree only relies on equality/prefix grouping, not numeric ordering.
    spc_oid: u32,
    db_oid: u32,
    rel_number: u32,
    fork_number: u8,
    /// Block number; u32::MAX serves as the sentinel for the relation-level
    /// entry (see From<&RelTag> below).
    block_number: u32,
}
|
||||
|
||||
/// Key for the relation-level entry: block_number is the u32::MAX sentinel.
impl From<&RelTag> for TreeKey {
    fn from(val: &RelTag) -> TreeKey {
        TreeKey {
            spc_oid: val.spc_oid,
            db_oid: val.db_oid,
            rel_number: val.rel_number,
            fork_number: val.fork_number,
            block_number: u32::MAX,
        }
    }
}

/// Key for a block-level entry: (relation fork, block number).
impl From<(&RelTag, u32)> for TreeKey {
    fn from(val: (&RelTag, u32)) -> TreeKey {
        TreeKey {
            spc_oid: val.0.spc_oid,
            db_oid: val.0.db_oid,
            rel_number: val.0.rel_number,
            fork_number: val.0.fork_number,
            block_number: val.1,
        }
    }
}
|
||||
|
||||
impl neonart::Key for TreeKey {
|
||||
const KEY_LEN: usize = 4 + 4 + 4 + 1 + 32;
|
||||
|
||||
fn as_bytes(&self) -> &[u8] {
|
||||
zerocopy::IntoBytes::as_bytes(self)
|
||||
}
|
||||
}
|
||||
|
||||
// Marker impl: TreeEntry can be stored as a value in the ART tree.
impl neonart::Value for TreeEntry {}
|
||||
|
||||
/// Return type used in the cache's get_*() functions. 'Found' means that the page, or other
/// information that was queried, exists in the cache. 'NotFound' means it does not, and carries
/// the LSN to use when requesting the information from the pageserver instead.
pub enum CacheResult<V> {
    /// The queried page or other information existed in the cache.
    Found(V),

    /// The cache doesn't contain the page (or other queried information, like relation size). The
    /// Lsn is the 'not_modified_since' LSN that should be used in the request to the pageserver to
    /// read the page.
    NotFound(Lsn),
}
|
||||
|
||||
/// Writer-side access to the integrated cache, used by the communicator
/// worker process. Can both query and update the cache.
impl<'t> IntegratedCacheWriteAccess<'t> {
    /// Get the cached size (in blocks) of a relation, or the LSN to use as
    /// 'not_modified_since' when asking the pageserver for it.
    pub fn get_rel_size(&'t self, rel: &RelTag) -> CacheResult<u32> {
        let r = self.cache_tree.start_read();
        if let Some(nblocks) = get_rel_size(&r, rel) {
            CacheResult::Found(nblocks)
        } else {
            // Not in the cache; fall back to the global last-written LSN.
            CacheResult::NotFound(self.global_lw_lsn)
        }
    }

    /// Read a page from the cache into `dst`, if present.
    ///
    /// On a miss, returns the 'not_modified_since' LSN for the pageserver
    /// request: per-block if the block is tracked, global otherwise.
    pub async fn get_page(
        &'t self,
        rel: &RelTag,
        block_number: u32,
        dst: impl uring_common::buf::IoBufMut + Send + Sync,
    ) -> Result<CacheResult<()>, std::io::Error> {
        let r = self.cache_tree.start_read();
        if let Some(block_tree_entry) = r.get(&TreeKey::from((rel, block_number))) {
            let block_entry = if let TreeEntry::Block(e) = block_tree_entry {
                e
            } else {
                panic!("unexpected tree entry type for block key");
            };

            if let Some(cache_block) = block_entry.cache_block {
                // NOTE(review): assumes the file cache is always present when
                // an entry carries a cache_block — confirm that invariant.
                self.file_cache
                    .as_ref()
                    .unwrap()
                    .read_block(cache_block, dst)
                    .await?;
                Ok(CacheResult::Found(()))
            } else {
                Ok(CacheResult::NotFound(block_entry.lw_lsn))
            }
        } else {
            Ok(CacheResult::NotFound(self.global_lw_lsn))
        }
    }

    /// Check whether a page's contents are present in the file cache,
    /// without actually reading them.
    pub async fn page_is_cached(
        &'t self,
        rel: &RelTag,
        block_number: u32,
    ) -> Result<CacheResult<()>, std::io::Error> {
        let r = self.cache_tree.start_read();
        if let Some(block_tree_entry) = r.get(&TreeKey::from((rel, block_number))) {
            let block_entry = if let TreeEntry::Block(e) = block_tree_entry {
                e
            } else {
                panic!("unexpected tree entry type for block key");
            };

            if let Some(_cache_block) = block_entry.cache_block {
                Ok(CacheResult::Found(()))
            } else {
                Ok(CacheResult::NotFound(block_entry.lw_lsn))
            }
        } else {
            Ok(CacheResult::NotFound(self.global_lw_lsn))
        }
    }

    /// Does the relation exists? CacheResult::NotFound means that the cache doesn't contain that
    /// information, i.e. we don't know if the relation exists or not.
    pub fn get_rel_exists(&'t self, rel: &RelTag) -> CacheResult<bool> {
        // we don't currently cache negative entries, so if the relation is in the cache, it exists
        let r = self.cache_tree.start_read();
        if let Some(_rel_entry) = r.get(&TreeKey::from(rel)) {
            CacheResult::Found(true)
        } else {
            CacheResult::NotFound(self.global_lw_lsn)
        }
    }

    /// Database sizes are not cached; always sends the caller to the
    /// pageserver.
    pub fn get_db_size(&'t self, _db_oid: u32) -> CacheResult<u64> {
        // fixme: is this right lsn?
        CacheResult::NotFound(self.global_lw_lsn)
    }

    /// Store (or overwrite) the cached size of a relation.
    pub fn remember_rel_size(&'t self, rel: &RelTag, nblocks: u32) {
        let mut w = self.cache_tree.start_write();

        w.insert(
            &TreeKey::from(rel),
            TreeEntry::Rel(RelEntry {
                nblocks: Some(nblocks),
            }),
        );
    }

    /// Remember the given page contents in the cache.
    pub async fn remember_page(
        &'t self,
        rel: &RelTag,
        block_number: u32,
        src: impl uring_common::buf::IoBuf + Send + Sync,
        lw_lsn: Lsn,
    ) {
        // No-op when there is no file cache to store the page in.
        if let Some(file_cache) = self.file_cache.as_ref() {
            let mut w = self.cache_tree.start_write();

            let key = TreeKey::from((rel, block_number));

            let mut cache_block = None;

            // Update the tree entry: refresh the last-written LSN and make
            // sure a cache file block is allocated for the page.
            w.update_with_fn(&key, |existing| {
                if let Some(existing) = existing {
                    let mut block_entry = if let TreeEntry::Block(e) = existing.clone() {
                        e
                    } else {
                        panic!("unexpected tree entry type for block key");
                    };
                    block_entry.lw_lsn = lw_lsn;
                    if block_entry.cache_block.is_none() {
                        block_entry.cache_block = Some(file_cache.alloc_block());
                    }
                    cache_block = block_entry.cache_block;
                    Some(TreeEntry::Block(block_entry))
                } else {
                    cache_block = Some(file_cache.alloc_block());
                    Some(TreeEntry::Block(BlockEntry {
                        lw_lsn: lw_lsn,
                        cache_block: cache_block,
                    }))
                }
            });
            // The closure always sets cache_block, so the unwrap cannot fail.
            let cache_block = cache_block.unwrap();
            // NOTE(review): the tree points at the cache block before the page
            // contents are written below — confirm concurrent readers cannot
            // observe a torn or stale block in between.
            file_cache
                .write_block(cache_block, src)
                .await
                .expect("error writing to cache");
        }
    }

    /// Forget information about given relation in the cache. (For DROP TABLE and such)
    pub fn forget_rel(&'t self, rel: &RelTag) {
        // FIXME: not implemented properly. smgrexists() would still return true for this
        let mut w = self.cache_tree.start_write();
        w.insert(
            &TreeKey::from(rel),
            TreeEntry::Rel(RelEntry { nblocks: None }),
        );
    }
}
|
||||
|
||||
/// Read relation size from the cache.
|
||||
///
|
||||
/// This is in a separate function so that it can be shared by
|
||||
/// IntegratedCacheReadAccess::get_rel_size() and IntegratedCacheWriteAccess::get_rel_size()
|
||||
fn get_rel_size<'t>(r: &neonart::TreeReadGuard<TreeKey, TreeEntry>, rel: &RelTag) -> Option<u32> {
|
||||
if let Some(existing) = r.get(&TreeKey::from(rel)) {
|
||||
let rel_entry = if let TreeEntry::Rel(e) = existing {
|
||||
e
|
||||
} else {
|
||||
panic!("unexpected tree entry type for rel key");
|
||||
};
|
||||
|
||||
if let Some(nblocks) = rel_entry.nblocks {
|
||||
Some(nblocks)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Accessor for other backends
|
||||
///
|
||||
/// This allows backends to read pages from the cache directly, on their own, without making a
|
||||
/// request to the communicator process.
|
||||
impl<'t> IntegratedCacheReadAccess<'t> {
|
||||
pub fn get_rel_size(&'t self, rel: &RelTag) -> Option<u32> {
|
||||
get_rel_size(&self.cache_tree.start_read(), rel)
|
||||
}
|
||||
|
||||
pub fn start_read_op(&'t self) -> BackendCacheReadOp<'t> {
|
||||
let r = self.cache_tree.start_read();
|
||||
BackendCacheReadOp { read_guard: r }
|
||||
}
|
||||
}
|
||||
|
||||
/// An in-progress cache read operation in a backend process. Created by
/// IntegratedCacheReadAccess::start_read_op(); see get_page() and finish().
pub struct BackendCacheReadOp<'t> {
    // Read guard on the shared cache tree, held for the whole operation.
    read_guard: neonart::TreeReadGuard<'t, TreeKey, TreeEntry>,
}
|
||||
|
||||
impl<'e> BackendCacheReadOp<'e> {
|
||||
/// Initiate a read of the page from the cache.
|
||||
///
|
||||
/// This returns the "cache block number", i.e. the block number within the cache file, where
|
||||
/// the page's contents is stored. To get the page contents, the caller needs to read that block
|
||||
/// from the cache file. This returns a guard object that you must hold while it performs the
|
||||
/// read. It's possible that while you are performing the read, the cache block is invalidated.
|
||||
/// After you have completed the read, call BackendCacheReadResult::finish() to check if the
|
||||
/// read was in fact valid or not. If it was concurrently invalidated, you need to retry.
|
||||
pub fn get_page(&self, rel: &RelTag, block_number: u32) -> Option<u64> {
|
||||
if let Some(block_tree_entry) = self.read_guard.get(&TreeKey::from((rel, block_number))) {
|
||||
let block_entry = if let TreeEntry::Block(e) = block_tree_entry {
|
||||
e
|
||||
} else {
|
||||
panic!("unexpected tree entry type for block key");
|
||||
};
|
||||
|
||||
block_entry.cache_block
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
pub fn finish(self) -> bool {
|
||||
// TODO: currently, we use a spinlock to protect the in-memory tree, so concurrent
|
||||
// invalidations are not possible. But the plan is to switch to optimistic locking,
|
||||
// and once we do that, this would return 'false' if the optimistic locking failed and
|
||||
// you need to retry.
|
||||
true
|
||||
}
|
||||
}
|
||||
25
pgxn/neon/communicator/src/lib.rs
Normal file
25
pgxn/neon/communicator/src/lib.rs
Normal file
@@ -0,0 +1,25 @@
|
||||
//!
//! Three main parts:
//! - async tokio communicator core, which receives requests and processes them.
//! - Main loop and requests queues, which routes requests from backends to the core
//! - the per-backend glue code, which submits requests
//!

mod backend_comms;

// mark this 'pub', because these functions are called from C code. Otherwise, the compiler
// complains about a bunch of structs and enum variants being unused, because it thinks
// the functions that use them are never called. There are some C-callable functions in
// other modules too, but marking this as pub is currently enough to silence the warnings
//
// TODO: perhaps collect *all* the extern "C" functions to one module?
pub mod backend_interface;

mod file_cache;
mod init;
mod integrated_cache;
mod neon_request;
mod worker_process;

// FIXME get this from postgres headers somehow
/// PostgreSQL block (page) size in bytes. Must match the BLCKSZ that the
/// server binary was compiled with.
pub const BLCKSZ: usize = 8192;
|
||||
346
pgxn/neon/communicator/src/neon_request.rs
Normal file
346
pgxn/neon/communicator/src/neon_request.rs
Normal file
@@ -0,0 +1,346 @@
|
||||
/// C-friendly alias for LSN values (plain u64 across the FFI boundary).
type CLsn = u64;
/// C-friendly alias for PostgreSQL Oid values.
type COid = u32;

// This conveniently matches PG_IOV_MAX
pub const MAX_GETPAGEV_PAGES: usize = 32;

use pageserver_data_api::model;
|
||||
|
||||
/// An I/O request submitted by a backend to the communicator, in a
/// C-compatible (repr(C)) representation.
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub enum NeonIORequest {
    /// No request in the slot.
    Empty,

    // Read requests. These are C-friendly variants of the corresponding structs in
    // pageserver_data_api::model.
    RelExists(CRelExistsRequest),
    RelSize(CRelSizeRequest),
    GetPageV(CGetPageVRequest),
    PrefetchV(CPrefetchVRequest),
    DbSize(CDbSizeRequest),

    // Write requests. These are needed to keep the relation size cache and LFC up-to-date.
    // They are not sent to the pageserver.
    WritePage(CWritePageRequest),
    RelExtend(CRelExtendRequest),
    RelZeroExtend(CRelZeroExtendRequest),
    RelCreate(CRelCreateRequest),
    RelTruncate(CRelTruncateRequest),
    RelUnlink(CRelUnlinkRequest),
}
|
||||
|
||||
/// Result of a NeonIORequest, written back to the request slot for the
/// backend to pick up. C-compatible representation.
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub enum NeonIOResult {
    /// No result (slot is idle).
    Empty,
    RelExists(bool),
    RelSize(u32),

    /// the result pages are written to the shared memory addresses given in the request
    GetPageV,

    /// A prefetch request returns as soon as the request has been received by the communicator.
    /// It is processed in the background.
    PrefetchVLaunched,

    DbSize(u64),

    // FIXME design compact error codes. Can't easily pass a string or other dynamic data.
    // currently, this is 'errno'
    Error(i32),

    Aborted,

    /// used for all write requests
    WriteOK,
}
|
||||
|
||||
/// Result of a backend-side cache probe for a GetPageV request: for each
/// requested page, the block number within the cache file.
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CCachedGetPageVResult {
    pub cache_block_numbers: [u64; MAX_GETPAGEV_PAGES],
}
|
||||
|
||||
/// ShmemBuf represents a buffer in shared memory.
///
/// SAFETY: The pointer must point to an area in shared memory. The functions allow you to liberally
/// get a mutable pointer to the contents; it is the caller's responsibility to ensure that you
/// don't access a buffer that's you're not allowed to. Inappropriate access to the buffer doesn't
/// violate Rust's safety semantics, but it will mess up and crash Postgres.
///
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct ShmemBuf {
    // These fields define where the result is written. Must point into a buffer in shared memory!
    pub ptr: *mut u8,
}

// SAFETY: ShmemBuf is just a raw pointer into shared memory; per the
// struct-level contract above, validity and exclusive access are the
// caller's responsibility, so moving/sharing the pointer across threads
// adds no new hazard.
unsafe impl Send for ShmemBuf {}
unsafe impl Sync for ShmemBuf {}
|
||||
|
||||
// A ShmemBuf is always treated as one full, fully-initialized page of
// BLCKSZ bytes.
unsafe impl uring_common::buf::IoBuf for ShmemBuf {
    fn stable_ptr(&self) -> *const u8 {
        self.ptr
    }

    fn bytes_init(&self) -> usize {
        crate::BLCKSZ
    }

    fn bytes_total(&self) -> usize {
        crate::BLCKSZ
    }
}
|
||||
|
||||
unsafe impl uring_common::buf::IoBufMut for ShmemBuf {
|
||||
fn stable_mut_ptr(&mut self) -> *mut u8 {
|
||||
self.ptr
|
||||
}
|
||||
|
||||
unsafe fn set_init(&mut self, pos: usize) {
|
||||
if pos > crate::BLCKSZ as usize {
|
||||
panic!(
|
||||
"set_init called past end of buffer, pos {}, buffer size {}",
|
||||
pos,
|
||||
crate::BLCKSZ
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ShmemBuf {
    /// Raw mutable pointer to the shared-memory buffer. See the safety
    /// contract on the ShmemBuf type.
    pub fn as_mut_ptr(&self) -> *mut u8 {
        self.ptr
    }
}
|
||||
|
||||
/// Does the relation (fork) exist? C-friendly form of model::RelExistsRequest.
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelExistsRequest {
    pub spc_oid: COid,
    pub db_oid: COid,
    pub rel_number: u32,
    pub fork_number: u8,
}

/// Get the size of a relation fork. C-friendly form of model::RelSizeRequest.
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelSizeRequest {
    pub spc_oid: COid,
    pub db_oid: COid,
    pub rel_number: u32,
    pub fork_number: u8,
}

/// Read `nblocks` pages starting at `block_number` into the shared-memory
/// buffers in `dest`.
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CGetPageVRequest {
    pub spc_oid: COid,
    pub db_oid: COid,
    pub rel_number: u32,
    pub fork_number: u8,
    pub block_number: u32,
    pub nblocks: u8,

    // These fields define where the result is written. Must point into a buffer in shared memory!
    pub dest: [ShmemBuf; MAX_GETPAGEV_PAGES],
}

/// Prefetch `nblocks` pages starting at `block_number` in the background.
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CPrefetchVRequest {
    pub spc_oid: COid,
    pub db_oid: COid,
    pub rel_number: u32,
    pub fork_number: u8,
    pub block_number: u32,
    pub nblocks: u8,
}

/// Get the total size of a database.
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CDbSizeRequest {
    pub db_oid: COid,
    pub request_lsn: CLsn,
}

/// A page was written out by the backend; used to keep the cache up to date.
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CWritePageRequest {
    pub spc_oid: COid,
    pub db_oid: COid,
    pub rel_number: u32,
    pub fork_number: u8,
    pub block_number: u32,
    pub lsn: CLsn,

    // These fields define where the result is written. Must point into a buffer in shared memory!
    pub src: ShmemBuf,
}

/// The relation was extended with a new page at `block_number`.
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelExtendRequest {
    pub spc_oid: COid,
    pub db_oid: COid,
    pub rel_number: u32,
    pub fork_number: u8,
    pub block_number: u32,
    pub lsn: CLsn,

    // These fields define page contents. Must point into a buffer in shared memory!
    pub src_ptr: usize,
    pub src_size: u32,
}

/// The relation was extended with `nblocks` zero pages starting at
/// `block_number`.
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelZeroExtendRequest {
    pub spc_oid: COid,
    pub db_oid: COid,
    pub rel_number: u32,
    pub fork_number: u8,
    pub block_number: u32,
    pub nblocks: u32,
    pub lsn: CLsn,
}

/// A relation fork was created.
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelCreateRequest {
    pub spc_oid: COid,
    pub db_oid: COid,
    pub rel_number: u32,
    pub fork_number: u8,
}

/// A relation fork was truncated to `nblocks` blocks.
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelTruncateRequest {
    pub spc_oid: COid,
    pub db_oid: COid,
    pub rel_number: u32,
    pub fork_number: u8,
    pub nblocks: u32,
}

/// A relation fork was dropped.
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelUnlinkRequest {
    pub spc_oid: COid,
    pub db_oid: COid,
    pub rel_number: u32,
    pub fork_number: u8,
    pub block_number: u32,
    pub nblocks: u32,
}
|
||||
|
||||
impl CRelExistsRequest {
|
||||
pub fn reltag(&self) -> model::RelTag {
|
||||
model::RelTag {
|
||||
spc_oid: self.spc_oid,
|
||||
db_oid: self.db_oid,
|
||||
rel_number: self.rel_number,
|
||||
fork_number: self.fork_number,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl CRelSizeRequest {
|
||||
pub fn reltag(&self) -> model::RelTag {
|
||||
model::RelTag {
|
||||
spc_oid: self.spc_oid,
|
||||
db_oid: self.db_oid,
|
||||
rel_number: self.rel_number,
|
||||
fork_number: self.fork_number,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl CGetPageVRequest {
|
||||
pub fn reltag(&self) -> model::RelTag {
|
||||
model::RelTag {
|
||||
spc_oid: self.spc_oid,
|
||||
db_oid: self.db_oid,
|
||||
rel_number: self.rel_number,
|
||||
fork_number: self.fork_number,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl CPrefetchVRequest {
|
||||
pub fn reltag(&self) -> model::RelTag {
|
||||
model::RelTag {
|
||||
spc_oid: self.spc_oid,
|
||||
db_oid: self.db_oid,
|
||||
rel_number: self.rel_number,
|
||||
fork_number: self.fork_number,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl CWritePageRequest {
|
||||
pub fn reltag(&self) -> model::RelTag {
|
||||
model::RelTag {
|
||||
spc_oid: self.spc_oid,
|
||||
db_oid: self.db_oid,
|
||||
rel_number: self.rel_number,
|
||||
fork_number: self.fork_number,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl CRelExtendRequest {
|
||||
pub fn reltag(&self) -> model::RelTag {
|
||||
model::RelTag {
|
||||
spc_oid: self.spc_oid,
|
||||
db_oid: self.db_oid,
|
||||
rel_number: self.rel_number,
|
||||
fork_number: self.fork_number,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl CRelZeroExtendRequest {
|
||||
pub fn reltag(&self) -> model::RelTag {
|
||||
model::RelTag {
|
||||
spc_oid: self.spc_oid,
|
||||
db_oid: self.db_oid,
|
||||
rel_number: self.rel_number,
|
||||
fork_number: self.fork_number,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl CRelCreateRequest {
|
||||
pub fn reltag(&self) -> model::RelTag {
|
||||
model::RelTag {
|
||||
spc_oid: self.spc_oid,
|
||||
db_oid: self.db_oid,
|
||||
rel_number: self.rel_number,
|
||||
fork_number: self.fork_number,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl CRelTruncateRequest {
|
||||
pub fn reltag(&self) -> model::RelTag {
|
||||
model::RelTag {
|
||||
spc_oid: self.spc_oid,
|
||||
db_oid: self.db_oid,
|
||||
rel_number: self.rel_number,
|
||||
fork_number: self.fork_number,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl CRelUnlinkRequest {
|
||||
pub fn reltag(&self) -> model::RelTag {
|
||||
model::RelTag {
|
||||
spc_oid: self.spc_oid,
|
||||
db_oid: self.db_oid,
|
||||
rel_number: self.rel_number,
|
||||
fork_number: self.fork_number,
|
||||
}
|
||||
}
|
||||
}
|
||||
28
pgxn/neon/communicator/src/worker_process/callbacks.rs
Normal file
28
pgxn/neon/communicator/src/worker_process/callbacks.rs
Normal file
@@ -0,0 +1,28 @@
|
||||
//! C callbacks to PostgreSQL facilities that the neon extension needs
//! to provide. These are implemented in `neon/pgxn/communicator_new.c`.
//! The function signatures better match!
//!
//! These are called from the communicator threads! Careful what you do, most
//! Postgres functions are not safe to call in that context.

use utils::lsn::Lsn;

unsafe extern "C" {
    // Wake up the backend identified by `procno`.
    pub fn notify_proc_unsafe(procno: std::ffi::c_int);
    // Set this process's own latch, waking the main thread's loop.
    pub fn callback_set_my_latch_unsafe();
    // Get the LSN to use as the request LSN in pageserver requests.
    pub fn callback_get_request_lsn_unsafe() -> u64;
}

// safe wrappers

/// Wake up the backend identified by `procno` (e.g. after completing its
/// request).
pub(super) fn notify_proc(procno: std::ffi::c_int) {
    unsafe { notify_proc_unsafe(procno) };
}

/// Set this process's own latch to wake up the main thread.
pub(super) fn callback_set_my_latch() {
    unsafe { callback_set_my_latch_unsafe() };
}

/// Get the current request LSN, wrapped in the Lsn type.
pub(super) fn get_request_lsn() -> Lsn {
    Lsn(unsafe { callback_get_request_lsn_unsafe() })
}
|
||||
229
pgxn/neon/communicator/src/worker_process/logging.rs
Normal file
229
pgxn/neon/communicator/src/worker_process/logging.rs
Normal file
@@ -0,0 +1,229 @@
|
||||
//! Glue code to hook up Rust logging, with the `tracing` crate, to the PostgreSQL log
|
||||
//!
|
||||
//! In the Rust threads, the log messages are written to a mpsc Channel, and the Postgres
|
||||
//! process latch is raised. That wakes up the loop in the main thread. It reads the
|
||||
//! message from the channel and ereport()s it. This ensures that only one thread, the main
|
||||
//! thread, calls the PostgreSQL logging routines at any time.
|
||||
|
||||
use std::sync::mpsc::sync_channel;
|
||||
use std::sync::mpsc::{Receiver, SyncSender};
|
||||
use std::sync::mpsc::{TryRecvError, TrySendError};
|
||||
|
||||
use tracing::info;
|
||||
use tracing::{Event, Level, Metadata, Subscriber};
|
||||
use tracing_subscriber::filter::LevelFilter;
|
||||
use tracing_subscriber::fmt::FmtContext;
|
||||
use tracing_subscriber::fmt::FormatEvent;
|
||||
use tracing_subscriber::fmt::FormatFields;
|
||||
use tracing_subscriber::fmt::FormattedFields;
|
||||
use tracing_subscriber::fmt::MakeWriter;
|
||||
use tracing_subscriber::fmt::format::Writer;
|
||||
use tracing_subscriber::registry::LookupSpan;
|
||||
|
||||
use crate::worker_process::callbacks::callback_set_my_latch;
|
||||
|
||||
/// Receiving end of the log-message queue. Held (as an opaque box) by the
/// C code in the worker's main thread, which drains it via pump_logging().
pub struct LoggingState {
    receiver: Receiver<FormattedEventWithMeta>,
}
|
||||
|
||||
/// Called once, at worker process startup. The returned LoggingState is passed back
/// in the subsequent calls to `pump_logging`. It is opaque to the C code.
#[unsafe(no_mangle)]
pub extern "C" fn configure_logging() -> Box<LoggingState> {
    // Bounded queue: if the main thread falls behind, send_event() drops
    // messages rather than blocking the logging thread.
    let (sender, receiver) = sync_channel(1000);

    let maker = Maker { channel: sender };

    use tracing_subscriber::prelude::*;
    let r = tracing_subscriber::registry();

    // Route all `tracing` events through our formatter and channel writer.
    let r = r.with(
        tracing_subscriber::fmt::layer()
            .event_format(SimpleFormatter::new())
            .with_writer(maker)
            // TODO: derive this from log_min_messages?
            .with_filter(LevelFilter::from_level(Level::INFO)),
    );
    r.init();

    info!("communicator process logging started");

    let state = LoggingState { receiver };

    Box::new(state)
}
|
||||
|
||||
/// Read one message from the logging queue. This is essentially a wrapper to Receiver,
|
||||
/// with a C-friendly signature.
|
||||
///
|
||||
/// The message is copied into *errbuf, which is a caller-supplied buffer of size `errbuf_len`.
|
||||
/// If the message doesn't fit in the buffer, it is truncated. It is always NULL-terminated.
|
||||
///
|
||||
/// The error level is returned *elevel_p. It's one of the PostgreSQL error levels, see elog.h
|
||||
#[unsafe(no_mangle)]
|
||||
pub extern "C" fn pump_logging(
|
||||
state: &mut LoggingState,
|
||||
errbuf: *mut u8,
|
||||
errbuf_len: u32,
|
||||
elevel_p: &mut i32,
|
||||
) -> i32 {
|
||||
let msg = match state.receiver.try_recv() {
|
||||
Err(TryRecvError::Empty) => return 0,
|
||||
Err(TryRecvError::Disconnected) => return -1,
|
||||
Ok(msg) => msg,
|
||||
};
|
||||
|
||||
let src: &[u8] = &msg.message;
|
||||
let dst = errbuf;
|
||||
let len = std::cmp::min(src.len(), errbuf_len as usize - 1);
|
||||
unsafe {
|
||||
std::ptr::copy_nonoverlapping(src.as_ptr(), dst, len);
|
||||
*(errbuf.add(len)) = b'\0'; // NULL terminator
|
||||
}
|
||||
|
||||
// XXX: these levels are copied from PostgreSQL's elog.h. Introduce another enum
|
||||
// to hide these?
|
||||
*elevel_p = match msg.level {
|
||||
Level::TRACE => 10, // DEBUG5
|
||||
Level::DEBUG => 14, // DEBUG1
|
||||
Level::INFO => 17, // INFO
|
||||
Level::WARN => 19, // WARNING
|
||||
Level::ERROR => 21, // ERROR
|
||||
};
|
||||
1
|
||||
}
|
||||
|
||||
//---- The following functions can be called from any thread ----
|
||||
|
||||
/// A log event that has already been formatted to text, plus the metadata
/// (level) needed to map it to a PostgreSQL elevel on the receiving side.
#[derive(Clone)]
struct FormattedEventWithMeta {
    // Formatted message bytes (no trailing metadata).
    message: Vec<u8>,
    level: tracing::Level,
}
|
||||
|
||||
// Manual Default: empty message with DEBUG as a placeholder level.
// (NOTE(review): presumably manual because tracing::Level provides no
// Default impl — confirm.)
impl Default for FormattedEventWithMeta {
    fn default() -> Self {
        FormattedEventWithMeta {
            message: Vec::new(),
            level: tracing::Level::DEBUG,
        }
    }
}
|
||||
|
||||
/// Accumulates the formatted bytes of one log event, handing the finished
/// event to the Maker's channel when flushed or dropped.
struct EventBuilder<'a> {
    // The event being accumulated.
    event: FormattedEventWithMeta,

    // Destination channel owner.
    maker: &'a Maker,
}
|
||||
|
||||
impl<'a> std::io::Write for EventBuilder<'a> {
|
||||
fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
|
||||
self.event.message.write(buf)
|
||||
}
|
||||
fn flush(&mut self) -> std::io::Result<()> {
|
||||
self.maker.send_event(self.event.clone());
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Drop for EventBuilder<'a> {
|
||||
fn drop(&mut self) {
|
||||
let maker = self.maker;
|
||||
let event = std::mem::take(&mut self.event);
|
||||
|
||||
maker.send_event(event);
|
||||
}
|
||||
}
|
||||
|
||||
/// Factory for per-event writers; owns the sending side of the log queue.
struct Maker {
    channel: SyncSender<FormattedEventWithMeta>,
}
|
||||
|
||||
impl<'a> MakeWriter<'a> for Maker {
    type Writer = EventBuilder<'a>;

    fn make_writer(&'a self) -> Self::Writer {
        // We need the event metadata (the level), so only the
        // make_writer_for() variant below is expected to be used.
        panic!("not expected to be called when make_writer_for is implemented");
    }

    /// Create a writer that buffers one formatted event, capturing its level
    /// from the event metadata.
    fn make_writer_for(&'a self, meta: &Metadata<'_>) -> Self::Writer {
        EventBuilder {
            event: FormattedEventWithMeta {
                message: Vec::new(),
                level: *meta.level(),
            },
            maker: self,
        }
    }
}
|
||||
|
||||
impl Maker {
    /// Queue a formatted event for the main thread and wake it up.
    /// Non-blocking: if the queue is full, the message is dropped.
    fn send_event(&self, e: FormattedEventWithMeta) {
        match self.channel.try_send(e) {
            Ok(()) => {
                // notify the main thread
                callback_set_my_latch();
            }
            // Receiver side is gone; nothing useful to do with the message.
            Err(TrySendError::Disconnected(_)) => {}
            Err(TrySendError::Full(_)) => {
                // TODO: record that some messages were lost
            }
        }
    }
}
|
||||
|
||||
/// Simple formatter implementation for tracing_subscriber, which prints the log
/// spans and message part like the default formatter, but no timestamp or error
/// level. The error level is captured separately by `FormattedEventWithMeta`,
/// and when the error is printed by the main thread, with PostgreSQL ereport(),
/// it gets a timestamp at that point. (The timestamp printed will therefore lag
/// behind the timestamp on the event here, if the main thread doesn't process
/// the log message promptly)
struct SimpleFormatter;
|
||||
|
||||
impl<S, N> FormatEvent<S, N> for SimpleFormatter
where
    S: Subscriber + for<'a> LookupSpan<'a>,
    N: for<'a> FormatFields<'a> + 'static,
{
    /// Write "span1{fields}: span2{fields}: event fields" plus a newline.
    fn format_event(
        &self,
        ctx: &FmtContext<'_, S, N>,
        mut writer: Writer<'_>,
        event: &Event<'_>,
    ) -> std::fmt::Result {
        // Format all the spans in the event's span context.
        if let Some(scope) = ctx.event_scope() {
            for span in scope.from_root() {
                write!(writer, "{}", span.name())?;

                // `FormattedFields` is a formatted representation of the span's
                // fields, which is stored in its extensions by the `fmt` layer's
                // `new_span` method. The fields will have been formatted
                // by the same field formatter that's provided to the event
                // formatter in the `FmtContext`.
                let ext = span.extensions();
                let fields = &ext
                    .get::<FormattedFields<N>>()
                    .expect("will never be `None`");

                // Skip formatting the fields if the span had no fields.
                if !fields.is_empty() {
                    write!(writer, "{{{}}}", fields)?;
                }
                write!(writer, ": ")?;
            }
        }

        // Write fields on the event
        ctx.field_format().format_fields(writer.by_ref(), event)?;

        writeln!(writer)
    }
}
||||
|
||||
impl SimpleFormatter {
|
||||
fn new() -> Self {
|
||||
SimpleFormatter {}
|
||||
}
|
||||
}
|
||||
384
pgxn/neon/communicator/src/worker_process/main_loop.rs
Normal file
384
pgxn/neon/communicator/src/worker_process/main_loop.rs
Normal file
@@ -0,0 +1,384 @@
|
||||
use std::collections::HashMap;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use crate::backend_comms::NeonIOHandle;
|
||||
use crate::file_cache::FileCache;
|
||||
use crate::init::CommunicatorInitStruct;
|
||||
use crate::integrated_cache::{CacheResult, IntegratedCacheWriteAccess};
|
||||
use crate::neon_request::{CGetPageVRequest, CPrefetchVRequest};
|
||||
use crate::neon_request::{NeonIORequest, NeonIOResult};
|
||||
use pageserver_client_grpc::PageserverClient;
|
||||
use pageserver_data_api::model;
|
||||
|
||||
use tokio::io::AsyncReadExt;
|
||||
use tokio_epoll_uring::IoBuf;
|
||||
use tokio_pipe::PipeRead;
|
||||
|
||||
use super::callbacks::{get_request_lsn, notify_proc};
|
||||
|
||||
use tracing::{error, info, trace};
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
/// State of the communicator worker process.
pub struct CommunicatorWorkerProcessStruct<'a> {
    // Request slots shared with the backends.
    neon_request_slots: &'a [NeonIOHandle],

    // Client for talking to the pageserver(s).
    pageserver_client: PageserverClient,

    // Writer-side access to the integrated cache.
    cache: IntegratedCacheWriteAccess<'a>,

    // Read end of the pipe that backends write slot indexes to when
    // submitting requests.
    submission_pipe_read_raw_fd: i32,
}
|
||||
|
||||
/// One-time initialization of the worker process: sets up the io_uring
/// system, the file cache, writer-side cache access, and the pageserver
/// client.
pub(super) async fn init(
    cis: Box<CommunicatorInitStruct>,
    tenant_id: String,
    timeline_id: String,
    auth_token: Option<String>,
    shard_map: HashMap<u16, String>,
    _file_cache_size: u64,
    file_cache_path: Option<PathBuf>,
) -> CommunicatorWorkerProcessStruct<'static> {
    // Starting point for the cache's global last-written LSN.
    let last_lsn = get_request_lsn();

    let uring_system = tokio_epoll_uring::System::launch().await.unwrap();

    let file_cache = if let Some(path) = file_cache_path {
        Some(FileCache::new(&path, uring_system).expect("could not create cache file"))
    } else {
        // FIXME: temporarily for testing, use LFC even if disabled
        Some(
            FileCache::new(&PathBuf::from("new_filecache"), uring_system)
                .expect("could not create cache file"),
        )
    };

    // Initialize subsystems
    let cache = cis
        .integrated_cache_init_struct
        .worker_process_init(last_lsn, file_cache);

    let pageserver_client = PageserverClient::new(&tenant_id, &timeline_id, &auth_token, shard_map);

    let this = CommunicatorWorkerProcessStruct {
        neon_request_slots: cis.neon_request_slots,
        pageserver_client,
        cache,
        submission_pipe_read_raw_fd: cis.submission_pipe_read_fd,
    };

    this
}
|
||||
|
||||
impl<'t> CommunicatorWorkerProcessStruct<'t> {
|
||||
/// Main loop of the worker process. Receive requests from the backends and process them.
pub(super) async fn run(self: &'static Self) {
    // Buffer for one 4-byte slot index read from the submission pipe.
    let mut idxbuf: [u8; 4] = [0; 4];

    let mut submission_pipe_read =
        PipeRead::from_raw_fd_checked(self.submission_pipe_read_raw_fd)
            .expect("invalid pipe fd");

    loop {
        // Wait for a backend to ring the doorbell

        match submission_pipe_read.read(&mut idxbuf).await {
            Ok(4) => {}
            Ok(nbytes) => panic!("short read ({nbytes} bytes) on communicator pipe"),
            Err(e) => panic!("error reading from communicator pipe: {e}"),
        }
        let request_idx = u32::from_ne_bytes(idxbuf);

        // Read the IO request from the slot indicated in the wakeup
        let Some(slot) =
            self.neon_request_slots[request_idx as usize].start_processing_request()
        else {
            // This currently should not happen. But if we have multiple threads picking up
            // requests, and without waiting for the notifications, it could.
            panic!("no request in slot");
        };

        // Ok, we have ownership of this request now. We must process
        // it now, there's no going back.

        //trace!("processing request {request_idx}: {request:?}");

        // Spawn a separate task for every request. That's a little excessive for requests that
        // can be quickly satisfied from the cache, but we expect that to be rare, because the
        // requesting backend would have already checked the cache.
        tokio::spawn(async {
            let result = self.handle_request(slot.get_request()).await;
            let owner_procno = slot.get_owner_procno();

            // Ok, we have completed the IO. Mark the request as completed. After that,
            // we no longer have ownership of the slot, and must not modify it.
            slot.completed(result);

            // Notify the backend about the completion. (Note that the backend might see
            // the completed status even before this; this is just a wakeup)
            notify_proc(owner_procno);
        });
    }
}
|
||||
|
||||
fn request_common(&self, not_modified_since_lsn: Lsn) -> model::RequestCommon {
|
||||
model::RequestCommon {
|
||||
request_lsn: get_request_lsn(),
|
||||
not_modified_since_lsn,
|
||||
}
|
||||
}
|
||||
|
||||
async fn handle_request<'x>(self: &'static Self, req: &'x NeonIORequest) -> NeonIOResult {
|
||||
match req {
|
||||
NeonIORequest::Empty => {
|
||||
error!("unexpected Empty IO request");
|
||||
NeonIOResult::Error(-1)
|
||||
}
|
||||
NeonIORequest::RelExists(req) => {
|
||||
let rel = req.reltag();
|
||||
|
||||
let not_modified_since = match self.cache.get_rel_exists(&rel) {
|
||||
CacheResult::Found(exists) => return NeonIOResult::RelExists(exists),
|
||||
CacheResult::NotFound(lsn) => lsn,
|
||||
};
|
||||
|
||||
match self
|
||||
.pageserver_client
|
||||
.process_rel_exists_request(&model::RelExistsRequest {
|
||||
common: self.request_common(not_modified_since),
|
||||
rel,
|
||||
})
|
||||
.await
|
||||
{
|
||||
Ok(exists) => NeonIOResult::RelExists(exists),
|
||||
Err(err) => {
|
||||
info!("tonic error: {err:?}");
|
||||
NeonIOResult::Error(-1)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
NeonIORequest::RelSize(req) => {
|
||||
let rel = req.reltag();
|
||||
|
||||
// Check the cache first
|
||||
let not_modified_since = match self.cache.get_rel_size(&rel) {
|
||||
CacheResult::Found(nblocks) => {
|
||||
tracing::trace!("found relsize for {:?} in cache: {}", rel, nblocks);
|
||||
return NeonIOResult::RelSize(nblocks);
|
||||
}
|
||||
CacheResult::NotFound(lsn) => lsn,
|
||||
};
|
||||
|
||||
let common = self.request_common(not_modified_since);
|
||||
match self
|
||||
.pageserver_client
|
||||
.process_rel_size_request(&model::RelSizeRequest {
|
||||
common: common.clone(),
|
||||
rel: rel.clone(),
|
||||
})
|
||||
.await
|
||||
{
|
||||
Ok(nblocks) => {
|
||||
// update the cache
|
||||
tracing::info!("updated relsize for {:?} in cache: {}", rel, nblocks);
|
||||
self.cache.remember_rel_size(&rel, nblocks);
|
||||
|
||||
NeonIOResult::RelSize(nblocks)
|
||||
}
|
||||
Err(err) => {
|
||||
info!("tonic error: {err:?}");
|
||||
NeonIOResult::Error(-1)
|
||||
}
|
||||
}
|
||||
}
|
||||
NeonIORequest::GetPageV(req) => match self.handle_get_pagev_request(req).await {
|
||||
Ok(()) => NeonIOResult::GetPageV,
|
||||
Err(errno) => NeonIOResult::Error(errno),
|
||||
},
|
||||
NeonIORequest::PrefetchV(req) => {
|
||||
let req = req.clone();
|
||||
tokio::spawn(async move { self.handle_prefetchv_request(&req).await });
|
||||
NeonIOResult::PrefetchVLaunched
|
||||
}
|
||||
NeonIORequest::DbSize(req) => {
|
||||
// Check the cache first
|
||||
let not_modified_since = match self.cache.get_db_size(req.db_oid) {
|
||||
CacheResult::Found(db_size) => {
|
||||
// get_page already copied the block content to the destination
|
||||
return NeonIOResult::DbSize(db_size);
|
||||
}
|
||||
CacheResult::NotFound(lsn) => lsn,
|
||||
};
|
||||
|
||||
match self
|
||||
.pageserver_client
|
||||
.process_dbsize_request(&model::DbSizeRequest {
|
||||
common: self.request_common(not_modified_since),
|
||||
db_oid: req.db_oid,
|
||||
})
|
||||
.await
|
||||
{
|
||||
Ok(db_size) => NeonIOResult::DbSize(db_size),
|
||||
Err(err) => {
|
||||
info!("tonic error: {err:?}");
|
||||
NeonIOResult::Error(-1)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Write requests
|
||||
NeonIORequest::WritePage(req) => {
|
||||
// Also store it in the LFC while we still have it
|
||||
let rel = req.reltag();
|
||||
self.cache
|
||||
.remember_page(&rel, req.block_number, req.src, Lsn(req.lsn))
|
||||
.await;
|
||||
NeonIOResult::WriteOK
|
||||
}
|
||||
NeonIORequest::RelExtend(req) => {
|
||||
self.cache
|
||||
.remember_rel_size(&req.reltag(), req.block_number + 1);
|
||||
NeonIOResult::WriteOK
|
||||
}
|
||||
NeonIORequest::RelZeroExtend(req) => {
|
||||
self.cache
|
||||
.remember_rel_size(&req.reltag(), req.block_number + req.nblocks);
|
||||
NeonIOResult::WriteOK
|
||||
}
|
||||
NeonIORequest::RelCreate(req) => {
|
||||
self.cache.remember_rel_size(&req.reltag(), 0);
|
||||
NeonIOResult::WriteOK
|
||||
}
|
||||
NeonIORequest::RelTruncate(req) => {
|
||||
self.cache.remember_rel_size(&req.reltag(), req.nblocks);
|
||||
NeonIOResult::WriteOK
|
||||
}
|
||||
NeonIORequest::RelUnlink(req) => {
|
||||
self.cache.forget_rel(&req.reltag());
|
||||
NeonIOResult::WriteOK
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn handle_get_pagev_request(&'t self, req: &CGetPageVRequest) -> Result<(), i32> {
|
||||
let rel = req.reltag();
|
||||
|
||||
// Check the cache first
|
||||
let mut cache_misses = Vec::new();
|
||||
for i in 0..req.nblocks {
|
||||
let blkno = req.block_number + i as u32;
|
||||
let dest = req.dest[i as usize];
|
||||
let not_modified_since = match self.cache.get_page(&rel, blkno, dest).await {
|
||||
Ok(CacheResult::Found(_)) => {
|
||||
// get_page already copied the block content to the destination
|
||||
trace!("found blk {} in rel {:?} in LFC ", blkno, rel);
|
||||
continue;
|
||||
}
|
||||
Ok(CacheResult::NotFound(lsn)) => lsn,
|
||||
Err(_io_error) => return Err(-1), // FIXME errno?
|
||||
};
|
||||
cache_misses.push((blkno, not_modified_since, dest));
|
||||
}
|
||||
if cache_misses.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
let not_modified_since = cache_misses
|
||||
.iter()
|
||||
.map(|(_blkno, lsn, _dest)| *lsn)
|
||||
.max()
|
||||
.unwrap();
|
||||
|
||||
// TODO: Use batched protocol
|
||||
for (blkno, _lsn, dest) in cache_misses.iter() {
|
||||
match self
|
||||
.pageserver_client
|
||||
.get_page(&model::GetPageRequest {
|
||||
common: self.request_common(not_modified_since),
|
||||
rel: rel.clone(),
|
||||
block_number: *blkno,
|
||||
})
|
||||
.await
|
||||
{
|
||||
Ok(page_image) => {
|
||||
// Write the received page image directly to the shared memory location
|
||||
// that the backend requested.
|
||||
let src: &[u8] = page_image.as_ref();
|
||||
let len = std::cmp::min(src.len(), dest.bytes_total() as usize);
|
||||
unsafe {
|
||||
std::ptr::copy_nonoverlapping(src.as_ptr(), dest.as_mut_ptr(), len);
|
||||
};
|
||||
|
||||
trace!("remembering blk {} in rel {:?} in LFC", blkno, rel);
|
||||
|
||||
// Also store it in the LFC while we have it
|
||||
self.cache
|
||||
.remember_page(&rel, *blkno, page_image, not_modified_since)
|
||||
.await;
|
||||
}
|
||||
Err(err) => {
|
||||
info!("tonic error: {err:?}");
|
||||
return Err(-1);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_prefetchv_request(
|
||||
self: &'static Self,
|
||||
req: &CPrefetchVRequest,
|
||||
) -> Result<(), i32> {
|
||||
let rel = req.reltag();
|
||||
|
||||
// Check the cache first
|
||||
let mut cache_misses = Vec::new();
|
||||
for i in 0..req.nblocks {
|
||||
let blkno = req.block_number + i as u32;
|
||||
let not_modified_since = match self.cache.page_is_cached(&rel, blkno).await {
|
||||
Ok(CacheResult::Found(_)) => {
|
||||
trace!("found blk {} in rel {:?} in LFC ", req.block_number, rel);
|
||||
continue;
|
||||
}
|
||||
Ok(CacheResult::NotFound(lsn)) => lsn,
|
||||
Err(_io_error) => return Err(-1), // FIXME errno?
|
||||
};
|
||||
cache_misses.push((req.block_number, not_modified_since));
|
||||
}
|
||||
if cache_misses.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
let not_modified_since = cache_misses.iter().map(|(_blkno, lsn)| *lsn).max().unwrap();
|
||||
|
||||
// TODO: spawn separate tasks for these. Use the integrated cache to keep track of the
|
||||
// in-flight requests
|
||||
|
||||
// TODO: Use batched protocol
|
||||
for (blkno, _lsn) in cache_misses.iter() {
|
||||
match self
|
||||
.pageserver_client
|
||||
.get_page(&model::GetPageRequest {
|
||||
common: self.request_common(not_modified_since),
|
||||
rel: rel.clone(),
|
||||
block_number: *blkno,
|
||||
})
|
||||
.await
|
||||
{
|
||||
Ok(page_image) => {
|
||||
trace!(
|
||||
"prefetch completed, remembering blk {} in rel {:?} in LFC",
|
||||
req.block_number, rel
|
||||
);
|
||||
self.cache
|
||||
.remember_page(&rel, req.block_number, page_image, not_modified_since)
|
||||
.await;
|
||||
}
|
||||
Err(err) => {
|
||||
info!("tonic error: {err:?}");
|
||||
return Err(-1);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
11
pgxn/neon/communicator/src/worker_process/mod.rs
Normal file
11
pgxn/neon/communicator/src/worker_process/mod.rs
Normal file
@@ -0,0 +1,11 @@
|
||||
//! This code runs in the communicator worker process. This provides
|
||||
//! the glue code to:
|
||||
//!
|
||||
//! - launch the 'processor',
|
||||
//! - receive IO requests from backends and pass them to the processor,
|
||||
//! - write results back to backends.
|
||||
|
||||
mod callbacks;
|
||||
mod logging;
|
||||
mod main_loop;
|
||||
mod worker_interface;
|
||||
@@ -0,0 +1,93 @@
|
||||
//! Functions called from the C code in the worker process
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::ffi::{CStr, c_char};
|
||||
use std::path::PathBuf;
|
||||
|
||||
use tracing::error;
|
||||
|
||||
use crate::init::CommunicatorInitStruct;
|
||||
use crate::worker_process::main_loop;
|
||||
|
||||
/// Launch the communicator's tokio tasks, which do most of the work.
|
||||
///
|
||||
/// The caller has initialized the process as a regular PostgreSQL
|
||||
/// background worker process. The shared memory segment used to
|
||||
/// communicate with the backends has been allocated and initialized
|
||||
/// earlier, at postmaster startup, in rcommunicator_shmem_init().
|
||||
#[unsafe(no_mangle)]
|
||||
pub extern "C" fn communicator_worker_process_launch(
|
||||
cis: Box<CommunicatorInitStruct>,
|
||||
tenant_id: *const c_char,
|
||||
timeline_id: *const c_char,
|
||||
auth_token: *const c_char,
|
||||
shard_map: *mut *mut c_char,
|
||||
nshards: u32,
|
||||
file_cache_path: *const c_char,
|
||||
file_cache_size: u64,
|
||||
) {
|
||||
// Convert the arguments into more convenient Rust types
|
||||
let tenant_id = unsafe { CStr::from_ptr(tenant_id) }.to_str().unwrap();
|
||||
let timeline_id = unsafe { CStr::from_ptr(timeline_id) }.to_str().unwrap();
|
||||
let auth_token = {
|
||||
if auth_token.is_null() {
|
||||
None
|
||||
} else {
|
||||
let c_str = unsafe { CStr::from_ptr(auth_token) };
|
||||
Some(c_str.to_str().unwrap().to_string())
|
||||
}
|
||||
};
|
||||
let file_cache_path = {
|
||||
if file_cache_path.is_null() {
|
||||
None
|
||||
} else {
|
||||
let c_str = unsafe { CStr::from_ptr(file_cache_path) };
|
||||
Some(PathBuf::from(c_str.to_str().unwrap()))
|
||||
}
|
||||
};
|
||||
let shard_map = parse_shard_map(nshards, shard_map);
|
||||
|
||||
// start main loop
|
||||
let runtime = tokio::runtime::Builder::new_multi_thread()
|
||||
.enable_all()
|
||||
.thread_name("communicator thread")
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let worker_struct = runtime.block_on(main_loop::init(
|
||||
cis,
|
||||
tenant_id.to_string(),
|
||||
timeline_id.to_string(),
|
||||
auth_token,
|
||||
shard_map,
|
||||
file_cache_size,
|
||||
file_cache_path,
|
||||
));
|
||||
let worker_struct = Box::leak(Box::new(worker_struct));
|
||||
|
||||
let main_loop_handle = runtime.spawn(worker_struct.run());
|
||||
|
||||
runtime.spawn(async {
|
||||
let err = main_loop_handle.await.unwrap_err();
|
||||
error!("error: {err:?}");
|
||||
});
|
||||
|
||||
// keep the runtime running after we exit this function
|
||||
Box::leak(Box::new(runtime));
|
||||
}
|
||||
|
||||
/// Convert the "shard map" from an array of C strings, indexed by shard no to a rust HashMap
|
||||
fn parse_shard_map(nshards: u32, shard_map: *mut *mut c_char) -> HashMap<u16, String> {
|
||||
let mut result: HashMap<u16, String> = HashMap::new();
|
||||
let mut p = shard_map;
|
||||
|
||||
for i in 0..nshards {
|
||||
let c_str = unsafe { CStr::from_ptr(*p) };
|
||||
|
||||
p = unsafe { p.add(1) };
|
||||
|
||||
let s = c_str.to_str().unwrap();
|
||||
result.insert(i as u16, s.into());
|
||||
}
|
||||
result
|
||||
}
|
||||
953
pgxn/neon/communicator_new.c
Normal file
953
pgxn/neon/communicator_new.c
Normal file
@@ -0,0 +1,953 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* communicator_new.c
|
||||
* Functions for communicating with remote pageservers.
|
||||
*
|
||||
* This is the "new" communicator. It consists of functions that
|
||||
* are called from the smgr implementation, in pagestore_smgr.c.
|
||||
*
|
||||
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include "access/xlog.h"
|
||||
#include "access/xlogdefs.h"
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
#include "access/xlogrecovery.h"
|
||||
#endif
|
||||
#include "access/xlog_internal.h"
|
||||
#include "access/xlogutils.h"
|
||||
#include "executor/instrument.h"
|
||||
#include "miscadmin.h"
|
||||
#include "postmaster/bgworker.h"
|
||||
#include "postmaster/interrupt.h"
|
||||
#include "replication/walsender.h"
|
||||
#include "storage/fd.h"
|
||||
#include "storage/ipc.h"
|
||||
#include "storage/latch.h"
|
||||
#include "storage/procarray.h"
|
||||
#if PG_VERSION_NUM >= 170000
|
||||
#include "storage/procnumber.h"
|
||||
#endif
|
||||
#include "storage/spin.h"
|
||||
#include "tcop/tcopprot.h"
|
||||
|
||||
#include "communicator_new.h"
|
||||
#include "neon.h"
|
||||
#include "neon_perf_counters.h"
|
||||
#include "pagestore_client.h"
|
||||
|
||||
/*
|
||||
* FIXME: these are in file_cache.h, but I don't want to #include that
|
||||
* here. This code shouldn't be using the C file cache for anything else than
|
||||
* the GUCs.
|
||||
*/
|
||||
extern int lfc_size_limit;
|
||||
extern char *lfc_path;
|
||||
|
||||
|
||||
/* the rust bindings, generated by cbindgen */
|
||||
#include "communicator/communicator_bindings.h"
|
||||
|
||||
#define MaxProcs (MaxBackends + NUM_AUXILIARY_PROCS)
|
||||
|
||||
static CommunicatorInitStruct *cis;
|
||||
static CommunicatorBackendStruct *my_bs;
|
||||
|
||||
static File cache_file = 0;
|
||||
|
||||
typedef struct CommunicatorShmemPerBackendData
|
||||
{
|
||||
/*
|
||||
* Latch used to notify backend of IO completion. We cannot use the
|
||||
* standard process latch (MyProc->latch) because we cannot clear that
|
||||
* latch as part of the IO handling, or we might cause the caller to miss
|
||||
* some other events.
|
||||
*/
|
||||
Latch io_completion_latch;
|
||||
|
||||
/*
|
||||
* Normally, when reading or writing pages from shared buffer cache, the
|
||||
* worker process can operate directly on the shared buffer. But when
|
||||
* working with a local buffer, we use this "bounce buffer" to pass the
|
||||
* data to the worker process.
|
||||
*
|
||||
* TODO: That's slow, because it incurs an extra memory copy, and there's
|
||||
* currently only one of these per backend, which means you can have only
|
||||
* one such IO in progress at a time.
|
||||
*/
|
||||
PGIOAlignedBlock bounce_buffer;
|
||||
} CommunicatorShmemPerBackendData;
|
||||
|
||||
typedef struct CommunicatorShmemData
|
||||
{
|
||||
int dummy;
|
||||
|
||||
CommunicatorShmemPerBackendData backends[]; /* MaxProcs */
|
||||
|
||||
/* rust-managed shmem area follows at next MAXALIGN boundary */
|
||||
} CommunicatorShmemData;
|
||||
|
||||
static CommunicatorShmemData *communicator_shmem_ptr;
|
||||
|
||||
#define MyIOCompletionLatch (&communicator_shmem_ptr->backends[MyProcNumber].io_completion_latch)
|
||||
|
||||
static slock_t in_elog;
|
||||
|
||||
#define MAX_INFLIGHT_ASYNC_REQUESTS 5
|
||||
|
||||
/* request indexes of (prefetch) requests that have been started */
|
||||
static int inflight_requests[MAX_INFLIGHT_ASYNC_REQUESTS];
|
||||
static int num_inflight_requests = 0;
|
||||
|
||||
static int start_request(NeonIORequest *request, struct NeonIOResult *immediate_result_p);
|
||||
static void wait_request_completion(int request_idx, struct NeonIOResult *result_p);
|
||||
static void perform_request(NeonIORequest *request, struct NeonIOResult *result_p);
|
||||
static void process_inflight_requests(void);
|
||||
|
||||
static bool bounce_needed(void *buffer);
|
||||
static void *bounce_buf(void);
|
||||
static void *bounce_write_if_needed(void *buffer);
|
||||
|
||||
PGDLLEXPORT void communicator_new_bgworker_main(Datum main_arg);
|
||||
static void communicator_new_backend_exit(int code, Datum arg);
|
||||
|
||||
/**** Initialization functions. These run in postmaster ****/
|
||||
|
||||
void
|
||||
pg_init_communicator_new(void)
|
||||
{
|
||||
BackgroundWorker bgw;
|
||||
|
||||
/* Initialize the background worker process */
|
||||
memset(&bgw, 0, sizeof(bgw));
|
||||
bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
|
||||
bgw.bgw_start_time = BgWorkerStart_PostmasterStart;
|
||||
snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon");
|
||||
snprintf(bgw.bgw_function_name, BGW_MAXLEN, "communicator_new_bgworker_main");
|
||||
snprintf(bgw.bgw_name, BGW_MAXLEN, "Storage communicator process");
|
||||
snprintf(bgw.bgw_type, BGW_MAXLEN, "Storage communicator process");
|
||||
bgw.bgw_restart_time = 5;
|
||||
bgw.bgw_notify_pid = 0;
|
||||
bgw.bgw_main_arg = (Datum) 0;
|
||||
|
||||
RegisterBackgroundWorker(&bgw);
|
||||
|
||||
SpinLockInit(&in_elog);
|
||||
}
|
||||
|
||||
static size_t
|
||||
communicator_new_shmem_size(void)
|
||||
{
|
||||
size_t size = 0;
|
||||
|
||||
size += MAXALIGN(
|
||||
offsetof(CommunicatorShmemData, backends) +
|
||||
MaxProcs * sizeof(CommunicatorShmemPerBackendData)
|
||||
);
|
||||
|
||||
/* space needed by the rust code */
|
||||
size += rcommunicator_shmem_size(MaxProcs);
|
||||
|
||||
return size;
|
||||
}
|
||||
|
||||
void
|
||||
communicator_new_shmem_request(void)
|
||||
{
|
||||
RequestAddinShmemSpace(communicator_new_shmem_size());
|
||||
}
|
||||
|
||||
void
|
||||
communicator_new_shmem_startup(void)
|
||||
{
|
||||
bool found;
|
||||
int pipefd[2];
|
||||
int rc;
|
||||
size_t communicator_size;
|
||||
size_t shmem_size;
|
||||
void *shmem_ptr;
|
||||
|
||||
rc = pipe(pipefd);
|
||||
if (rc != 0)
|
||||
ereport(ERROR,
|
||||
(errcode_for_file_access(),
|
||||
errmsg_internal("could not create pipe between neon communicator and backends : %m")));
|
||||
if (fcntl(pipefd[0], F_SETFL, O_NONBLOCK) == -1)
|
||||
elog(FATAL, "fcntl(F_SETFL) failed on read-end of communicator pipe: %m");
|
||||
if (fcntl(pipefd[1], F_SETFL, O_NONBLOCK) == -1)
|
||||
elog(FATAL, "fcntl(F_SETFL) failed on write-end of communicator pipe: %m");
|
||||
|
||||
shmem_size = communicator_new_shmem_size();
|
||||
shmem_ptr = ShmemInitStruct("Communicator shmem state",
|
||||
shmem_size,
|
||||
&found);
|
||||
Assert(!found);
|
||||
|
||||
/* Initialize the C-managed parts */
|
||||
communicator_shmem_ptr = (CommunicatorShmemData *) shmem_ptr;
|
||||
communicator_size = MAXALIGN(offsetof(CommunicatorShmemData, backends) + MaxProcs * sizeof(CommunicatorShmemPerBackendData));
|
||||
shmem_ptr = (char *) shmem_ptr + communicator_size;
|
||||
shmem_size -= communicator_size;
|
||||
|
||||
for (int i = 0; i < MaxProcs; i++)
|
||||
InitSharedLatch(&communicator_shmem_ptr->backends[i].io_completion_latch);
|
||||
|
||||
/* Initialize the rust-managed parts */
|
||||
cis = rcommunicator_shmem_init(pipefd[0], pipefd[1], MaxProcs, shmem_ptr, shmem_size);
|
||||
}
|
||||
|
||||
/**** Worker process functions. These run in the communicator worker process ****/
|
||||
|
||||
/* Entry point for the communicator bgworker process */
|
||||
void
|
||||
communicator_new_bgworker_main(Datum main_arg)
|
||||
{
|
||||
char **connstrs;
|
||||
shardno_t num_shards;
|
||||
struct LoggingState *logging;
|
||||
char errbuf[1000];
|
||||
int elevel;
|
||||
|
||||
/* Establish signal handlers. */
|
||||
pqsignal(SIGUSR1, procsignal_sigusr1_handler);
|
||||
pqsignal(SIGHUP, SignalHandlerForConfigReload);
|
||||
pqsignal(SIGTERM, die);
|
||||
|
||||
BackgroundWorkerUnblockSignals();
|
||||
|
||||
get_shard_map(&connstrs, &num_shards);
|
||||
|
||||
logging = configure_logging();
|
||||
|
||||
communicator_worker_process_launch(
|
||||
cis,
|
||||
neon_tenant,
|
||||
neon_timeline,
|
||||
neon_auth_token,
|
||||
connstrs,
|
||||
num_shards,
|
||||
lfc_path,
|
||||
lfc_size_limit);
|
||||
cis = NULL;
|
||||
|
||||
elog(LOG, "communicator threads started");
|
||||
for (;;)
|
||||
{
|
||||
int32 rc;
|
||||
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
|
||||
for (;;)
|
||||
{
|
||||
rc = pump_logging(logging, (uint8 *) errbuf, sizeof(errbuf), &elevel);
|
||||
if (rc == 0)
|
||||
{
|
||||
/* nothing to do */
|
||||
break;
|
||||
}
|
||||
else if (rc == 1)
|
||||
{
|
||||
/* Because we don't want to exit on error */
|
||||
if (elevel == ERROR)
|
||||
elevel = LOG;
|
||||
if (elevel == INFO)
|
||||
elevel = LOG;
|
||||
elog(elevel, "[COMMUNICATOR] %s", errbuf);
|
||||
}
|
||||
else if (rc == -1)
|
||||
{
|
||||
elog(ERROR, "logging channel was closed unexpectedly");
|
||||
}
|
||||
}
|
||||
|
||||
(void) WaitLatch(MyLatch,
|
||||
WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
|
||||
0,
|
||||
PG_WAIT_EXTENSION);
|
||||
ResetLatch(MyLatch);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Callbacks from the rust code, in the communicator process.
|
||||
*
|
||||
* NOTE: These must be thread safe! It's very limited which PostgreSQL functions you can use!!!
|
||||
*
|
||||
* NOTE: the signatures of these better match the Rust definitions!
|
||||
*/
|
||||
|
||||
void
|
||||
notify_proc_unsafe(int procno)
|
||||
{
|
||||
SetLatch(&communicator_shmem_ptr->backends[procno].io_completion_latch);
|
||||
|
||||
}
|
||||
|
||||
void
|
||||
callback_set_my_latch_unsafe(void)
|
||||
{
|
||||
SetLatch(MyLatch);
|
||||
}
|
||||
|
||||
/*
|
||||
* FIXME: The logic from neon_get_request_lsns() needs to go here, except for
|
||||
* the last-written LSN cache stuff, which is managed by the rust code now.
|
||||
*/
|
||||
uint64
|
||||
callback_get_request_lsn_unsafe(void)
|
||||
{
|
||||
/*
|
||||
* NB: be very careful with what you do here! This is called from tokio
|
||||
* threads, so anything tha tries to take LWLocks is unsafe, for example.
|
||||
*
|
||||
* RecoveryInProgress() is OK
|
||||
*/
|
||||
if (RecoveryInProgress())
|
||||
{
|
||||
XLogRecPtr replay_lsn = GetXLogReplayRecPtr(NULL);
|
||||
|
||||
return replay_lsn;
|
||||
}
|
||||
else
|
||||
{
|
||||
XLogRecPtr flushlsn;
|
||||
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
flushlsn = GetFlushRecPtr(NULL);
|
||||
#else
|
||||
flushlsn = GetFlushRecPtr();
|
||||
#endif
|
||||
|
||||
return flushlsn;
|
||||
}
|
||||
}
|
||||
|
||||
/**** Backend functions. These run in each backend ****/
|
||||
|
||||
/* Initialize per-backend private state */
|
||||
void
|
||||
communicator_new_init(void)
|
||||
{
|
||||
Assert(cis != NULL);
|
||||
Assert(my_bs == NULL);
|
||||
|
||||
if (MyBgworkerEntry && strcmp(MyBgworkerEntry->bgw_function_name, "communicator_new_bgworker_main") == 0)
|
||||
return;
|
||||
|
||||
OwnLatch(MyIOCompletionLatch);
|
||||
|
||||
my_bs = rcommunicator_backend_init(cis, MyProcNumber);
|
||||
cis = NULL;
|
||||
|
||||
/*
|
||||
* Arrange to clean up at backend exit.
|
||||
*/
|
||||
on_shmem_exit(communicator_new_backend_exit, 0);
|
||||
}
|
||||
|
||||
static void
|
||||
communicator_new_backend_exit(int code, Datum arg)
|
||||
{
|
||||
DisownLatch(MyIOCompletionLatch);
|
||||
}
|
||||
|
||||
/*
|
||||
* prefetch_register_bufferv() - register and prefetch buffers
|
||||
*
|
||||
* Register that we may want the contents of BufferTag in the near future.
|
||||
* This is used when issuing a speculative prefetch request, but also when
|
||||
* performing a synchronous request and need the buffer right now.
|
||||
*
|
||||
* When performing a prefetch rather than a synchronous request,
|
||||
* is_prefetch==true. Currently, it only affects how the request is accounted
|
||||
* in the perf counters.
|
||||
*
|
||||
* NOTE: this function may indirectly update MyPState->pfs_hash; which
|
||||
* invalidates any active pointers into the hash table.
|
||||
*/
|
||||
void
|
||||
communicator_new_prefetch_register_bufferv(NRelFileInfo rinfo, ForkNumber forkNum,
|
||||
BlockNumber blockno, BlockNumber nblocks)
|
||||
{
|
||||
int request_idx;
|
||||
NeonIORequest request = {
|
||||
.tag = NeonIORequest_PrefetchV,
|
||||
.prefetch_v = {
|
||||
.spc_oid = NInfoGetSpcOid(rinfo),
|
||||
.db_oid = NInfoGetDbOid(rinfo),
|
||||
.rel_number = NInfoGetRelNumber(rinfo),
|
||||
.fork_number = forkNum,
|
||||
.block_number = blockno,
|
||||
.nblocks = nblocks,
|
||||
}
|
||||
};
|
||||
struct NeonIOResult result;
|
||||
|
||||
elog(LOG, "prefetch called for rel %u/%u/%u.%u block %u (%u blocks)",
|
||||
RelFileInfoFmt(rinfo), forkNum, blockno, nblocks);
|
||||
|
||||
if (num_inflight_requests >= MAX_INFLIGHT_ASYNC_REQUESTS)
|
||||
process_inflight_requests();
|
||||
|
||||
request_idx = bcomm_start_io_request(my_bs, &request, &result);
|
||||
if (request_idx == -1)
|
||||
{
|
||||
/* -1 means the request was satisfied immediately. */
|
||||
/* FIXME: check and log errors */
|
||||
return;
|
||||
}
|
||||
inflight_requests[num_inflight_requests] = request_idx;
|
||||
num_inflight_requests++;
|
||||
|
||||
elog(LOG, "sent prefetch request with idx %d", request_idx);
|
||||
}
|
||||
|
||||
static void
|
||||
process_inflight_requests(void)
|
||||
{
|
||||
struct NeonIOResult result;
|
||||
|
||||
/* FIXME: log errors */
|
||||
for (int i = 0; i < num_inflight_requests; i++)
|
||||
wait_request_completion(inflight_requests[i], &result);
|
||||
num_inflight_requests = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Perform an IO request in a synchronous fashion.
|
||||
*
|
||||
* Returns a pointer to the result slot. It is valid until the next time a
|
||||
* request is submitted.
|
||||
*/
|
||||
static void
|
||||
perform_request(NeonIORequest * request, struct NeonIOResult *result_p)
|
||||
{
|
||||
int request_idx;
|
||||
|
||||
process_inflight_requests();
|
||||
|
||||
request_idx = start_request(request, result_p);
|
||||
if (request_idx == -1)
|
||||
{
|
||||
/* it was completed immediately */
|
||||
return;
|
||||
}
|
||||
wait_request_completion(request_idx, result_p);
|
||||
}
|
||||
|
||||
static int
|
||||
start_request(NeonIORequest * request, struct NeonIOResult *immediate_result_p)
|
||||
{
|
||||
int request_idx;
|
||||
|
||||
request_idx = bcomm_start_io_request(my_bs, request, immediate_result_p);
|
||||
if (request_idx == -1)
|
||||
{
|
||||
/* -1 means the request was satisfied immediately. */
|
||||
return -1;
|
||||
}
|
||||
elog(DEBUG5, "sent request with idx %d: tag %d", request_idx, request->tag);
|
||||
return request_idx;
|
||||
}
|
||||
|
||||
static void
|
||||
wait_request_completion(int request_idx, struct NeonIOResult *result_p)
|
||||
{
|
||||
int32_t poll_res;
|
||||
|
||||
/* fixme: check 'request_idx' ? */
|
||||
|
||||
for (;;)
|
||||
{
|
||||
ResetLatch(MyIOCompletionLatch);
|
||||
|
||||
poll_res = bcomm_poll_request_completion(my_bs, request_idx, result_p);
|
||||
if (poll_res == -1)
|
||||
{
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
|
||||
/*
|
||||
* TODO: wake up periodically for CHECK_FOR_INTERRUPTS(). Because
|
||||
* we wait on MyIOCompletionLatch rather than MyLatch, we won't be
|
||||
* woken up for the standard interrupts.
|
||||
*/
|
||||
(void) WaitLatch(MyIOCompletionLatch,
|
||||
WL_EXIT_ON_PM_DEATH | WL_LATCH_SET,
|
||||
0,
|
||||
WAIT_EVENT_NEON_PS_STARTING);
|
||||
continue; /* still busy */
|
||||
}
|
||||
else if (poll_res == 0)
|
||||
{
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
elog(ERROR, "unexpected return code from bcomm_poll_request_completion()");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Does the physical file exist?
|
||||
*/
|
||||
bool
|
||||
communicator_new_rel_exists(NRelFileInfo rinfo, ForkNumber forkNum)
|
||||
{
|
||||
NeonIORequest request = {
|
||||
.tag = NeonIORequest_RelExists,
|
||||
.rel_exists = {
|
||||
.spc_oid = NInfoGetSpcOid(rinfo),
|
||||
.db_oid = NInfoGetDbOid(rinfo),
|
||||
.rel_number = NInfoGetRelNumber(rinfo),
|
||||
.fork_number = forkNum,
|
||||
}
|
||||
};
|
||||
NeonIOResult result;
|
||||
|
||||
perform_request(&request, &result);
|
||||
switch (result.tag)
|
||||
{
|
||||
case NeonIOResult_RelExists:
|
||||
return result.rel_exists;
|
||||
case NeonIOResult_Error:
|
||||
ereport(ERROR,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not check existence of rel %u/%u/%u.%u: %s",
|
||||
RelFileInfoFmt(rinfo), forkNum, pg_strerror(result.error))));
|
||||
break;
|
||||
default:
|
||||
elog(ERROR, "unexpected result for RelExists operation: %d", result.tag);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Read N consecutive pages from a relation
|
||||
*/
|
||||
void
|
||||
communicator_new_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blockno,
|
||||
void **buffers, BlockNumber nblocks)
|
||||
{
|
||||
NeonIOResult result;
|
||||
CCachedGetPageVResult cached_result;
|
||||
void *bounce_buf_used = NULL;
|
||||
int request_idx;
|
||||
NeonIORequest request = {
|
||||
.tag = NeonIORequest_GetPageV,
|
||||
.get_page_v = {
|
||||
.spc_oid = NInfoGetSpcOid(rinfo),
|
||||
.db_oid = NInfoGetDbOid(rinfo),
|
||||
.rel_number = NInfoGetRelNumber(rinfo),
|
||||
.fork_number = forkNum,
|
||||
.block_number = blockno,
|
||||
.nblocks = nblocks,
|
||||
}
|
||||
};
|
||||
|
||||
elog(LOG, "getpagev called for rel %u/%u/%u.%u block %u (%u blocks)",
|
||||
RelFileInfoFmt(rinfo), forkNum, blockno, nblocks);
|
||||
|
||||
/* Fill in the destination buffers in the request */
|
||||
if (nblocks == 1)
|
||||
{
|
||||
if (bounce_needed(buffers[0]))
|
||||
{
|
||||
bounce_buf_used = bounce_buf();
|
||||
request.get_page_v.dest[0].ptr = bounce_buf_used;
|
||||
}
|
||||
else
|
||||
request.get_page_v.dest[0].ptr = buffers[0];
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int i = 0; i < nblocks; i++)
|
||||
{
|
||||
if (bounce_needed(buffers[i]))
|
||||
{
|
||||
/* Split the vector-request into single page requests */
|
||||
for (int j = 0; j < nblocks; j++)
|
||||
{
|
||||
communicator_new_read_at_lsnv(rinfo, forkNum, blockno + j,
|
||||
&buffers[j], 1);
|
||||
}
|
||||
return;
|
||||
}
|
||||
request.get_page_v.dest[i].ptr = buffers[i];
|
||||
}
|
||||
}
|
||||
|
||||
process_inflight_requests();
|
||||
|
||||
retry:
|
||||
request_idx = bcomm_start_get_page_v_request(my_bs, &request, &cached_result);
|
||||
if (request_idx == -1)
|
||||
{
|
||||
bool completed;
|
||||
|
||||
/*
|
||||
* LFC hit, but we are responsible for completing the I/O on the local
|
||||
* file
|
||||
*/
|
||||
if (cache_file == 0)
|
||||
cache_file = PathNameOpenFile(lfc_path, O_RDONLY | PG_BINARY);
|
||||
|
||||
for (int i = 0; i < nblocks; i++)
|
||||
{
|
||||
uint64_t cached_block = cached_result.cache_block_numbers[i];
|
||||
ssize_t bytes_total = 0;
|
||||
|
||||
while (bytes_total < BLCKSZ)
|
||||
{
|
||||
ssize_t nbytes;
|
||||
|
||||
nbytes = FileRead(cache_file, ((char *) buffers[i]) + bytes_total, BLCKSZ - bytes_total, cached_block * BLCKSZ + bytes_total, WAIT_EVENT_NEON_LFC_READ);
|
||||
if (nbytes == -1)
|
||||
ereport(ERROR,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not read block %lu in local cache file: %m",
|
||||
cached_block)));
|
||||
bytes_total += nbytes;
|
||||
}
|
||||
}
|
||||
completed = bcomm_finish_cache_read(my_bs);
|
||||
if (!completed)
|
||||
{
|
||||
elog(DEBUG1, "read from local cache file was superseded by concurrent update");
|
||||
goto retry;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
wait_request_completion(request_idx, &result);
|
||||
switch (result.tag)
|
||||
{
|
||||
case NeonIOResult_GetPageV:
|
||||
if (bounce_buf_used)
|
||||
memcpy(buffers[0], bounce_buf_used, BLCKSZ);
|
||||
return;
|
||||
case NeonIOResult_Error:
|
||||
ereport(ERROR,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not read block %u in rel %u/%u/%u.%u: %s",
|
||||
blockno, RelFileInfoFmt(rinfo), forkNum, pg_strerror(result.error))));
|
||||
break;
|
||||
default:
|
||||
elog(ERROR, "unexpected result for GetPage operation: %d", result.tag);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* neon_nblocks() -- Get the number of blocks stored in a relation.
|
||||
*/
|
||||
BlockNumber
|
||||
communicator_new_rel_nblocks(NRelFileInfo rinfo, ForkNumber forkNum)
|
||||
{
|
||||
NeonIORequest request = {
|
||||
.tag = NeonIORequest_RelSize,
|
||||
.rel_size = {
|
||||
.spc_oid = NInfoGetSpcOid(rinfo),
|
||||
.db_oid = NInfoGetDbOid(rinfo),
|
||||
.rel_number = NInfoGetRelNumber(rinfo),
|
||||
.fork_number = forkNum,
|
||||
}
|
||||
};
|
||||
NeonIOResult result;
|
||||
|
||||
perform_request(&request, &result);
|
||||
switch (result.tag)
|
||||
{
|
||||
case NeonIOResult_RelSize:
|
||||
return result.rel_size;
|
||||
case NeonIOResult_Error:
|
||||
ereport(ERROR,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not read size of rel %u/%u/%u.%u: %s",
|
||||
RelFileInfoFmt(rinfo), forkNum, pg_strerror(result.error))));
|
||||
break;
|
||||
default:
|
||||
elog(ERROR, "unexpected result for RelSize operation: %d", result.tag);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* neon_db_size() -- Get the size of the database in bytes.
|
||||
*/
|
||||
int64
|
||||
communicator_new_dbsize(Oid dbNode)
|
||||
{
|
||||
NeonIORequest request = {
|
||||
.tag = NeonIORequest_DbSize,
|
||||
.db_size = {
|
||||
.db_oid = dbNode,
|
||||
}
|
||||
};
|
||||
NeonIOResult result;
|
||||
|
||||
perform_request(&request, &result);
|
||||
switch (result.tag)
|
||||
{
|
||||
case NeonIOResult_DbSize:
|
||||
return (int64) result.db_size;
|
||||
case NeonIOResult_Error:
|
||||
ereport(ERROR,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not read database size of database %u: %s",
|
||||
dbNode, pg_strerror(result.error))));
|
||||
break;
|
||||
default:
|
||||
elog(ERROR, "unexpected result for DbSize operation: %d", result.tag);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Read one SLRU segment into 'buffer'.
 *
 * Not implemented in the new communicator yet; always raises an error.
 */
int
communicator_new_read_slru_segment(SlruKind kind, int64 segno, void *buffer)
{
	/* TODO */
	elog(ERROR, "not implemented");
}
|
||||
|
||||
/* Write requests */
|
||||
/*
 * Notify the communicator of one written page, to keep the caches
 * up-to-date.
 *
 * 'lsn' is the LSN associated with the page.  Raises an ERROR if the
 * request fails.
 */
void
communicator_new_write_page(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blockno,
							const void *buffer, XLogRecPtr lsn)
{
	/* copy a backend-private page into the bounce buffer, if needed */
	void	   *src = bounce_write_if_needed((void *) buffer);
	NeonIORequest request = {
		.tag = NeonIORequest_WritePage,
		.write_page = {
			.spc_oid = NInfoGetSpcOid(rinfo),
			.db_oid = NInfoGetDbOid(rinfo),
			.rel_number = NInfoGetRelNumber(rinfo),
			.fork_number = forkNum,
			.block_number = blockno,
			.lsn = lsn,
			.src.ptr = src,
		}
	};
	NeonIOResult result;

	/* blocks until the communicator has processed the request */
	perform_request(&request, &result);
	switch (result.tag)
	{
		case NeonIOResult_WriteOK:
			return;
		case NeonIOResult_Error:
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not write block %u in rel %u/%u/%u.%u: %s",
							blockno, RelFileInfoFmt(rinfo), forkNum, pg_strerror(result.error))));
			break;
		default:
			elog(ERROR, "unexpected result for WritePage operation: %d", result.tag);
			break;
	}
}
|
||||
|
||||
/*
 * Notify the communicator that the relation was extended with one new page,
 * to keep the caches up-to-date.
 *
 * 'blockno' is the number of the new block and 'buffer' its contents; 'lsn'
 * is the LSN associated with the page.  Raises an ERROR on failure.
 */
void
communicator_new_rel_extend(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blockno,
							const void *buffer, XLogRecPtr lsn)
{
	/* copy a backend-private page into the bounce buffer, if needed */
	void	   *src = bounce_write_if_needed((void *) buffer);
	NeonIORequest request = {
		.tag = NeonIORequest_RelExtend,
		.rel_extend = {
			.spc_oid = NInfoGetSpcOid(rinfo),
			.db_oid = NInfoGetDbOid(rinfo),
			.rel_number = NInfoGetRelNumber(rinfo),
			.fork_number = forkNum,
			.block_number = blockno,
			.lsn = lsn,

			/*
			 * NOTE(review): this initializes .src_ptr/.src_size, while the
			 * WritePage request above uses .src.ptr — confirm both spellings
			 * refer to the same source-buffer representation in
			 * NeonIORequest.
			 */
			.src_ptr = (uintptr_t) src,
			.src_size = BLCKSZ,
		}
	};
	NeonIOResult result;

	/* blocks until the communicator has processed the request */
	perform_request(&request, &result);
	switch (result.tag)
	{
		case NeonIOResult_WriteOK:
			return;
		case NeonIOResult_Error:
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not extend to block %u in rel %u/%u/%u.%u: %s",
							blockno, RelFileInfoFmt(rinfo), forkNum, pg_strerror(result.error))));
			break;
		default:
			elog(ERROR, "unexpected result for Extend operation: %d", result.tag);
			break;
	}
}
|
||||
|
||||
/*
 * Notify the communicator that the relation was extended with 'nblocks'
 * all-zeros pages starting at 'blockno', to keep the caches up-to-date.
 *
 * Unlike communicator_new_rel_extend(), no page contents are passed: the
 * new pages are implicitly all-zeros.  Raises an ERROR on failure.
 */
void
communicator_new_rel_zeroextend(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blockno,
								BlockNumber nblocks, XLogRecPtr lsn)
{
	NeonIORequest request = {
		.tag = NeonIORequest_RelZeroExtend,
		.rel_zero_extend = {
			.spc_oid = NInfoGetSpcOid(rinfo),
			.db_oid = NInfoGetDbOid(rinfo),
			.rel_number = NInfoGetRelNumber(rinfo),
			.fork_number = forkNum,
			.block_number = blockno,
			.nblocks = nblocks,
			.lsn = lsn,
		}
	};
	NeonIOResult result;

	/* blocks until the communicator has processed the request */
	perform_request(&request, &result);
	switch (result.tag)
	{
		case NeonIOResult_WriteOK:
			return;
		case NeonIOResult_Error:
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not zeroextend to block %u in rel %u/%u/%u.%u: %s",
							blockno, RelFileInfoFmt(rinfo), forkNum, pg_strerror(result.error))));
			break;
		default:
			elog(ERROR, "unexpected result for ZeroExtend operation: %d", result.tag);
			break;
	}
}
|
||||
|
||||
/*
 * Notify the communicator that a relation fork was created, to keep the
 * caches up-to-date.  Raises an ERROR on failure.
 */
void
communicator_new_rel_create(NRelFileInfo rinfo, ForkNumber forkNum)
{
	NeonIORequest request = {
		.tag = NeonIORequest_RelCreate,
		.rel_create = {
			.spc_oid = NInfoGetSpcOid(rinfo),
			.db_oid = NInfoGetDbOid(rinfo),
			.rel_number = NInfoGetRelNumber(rinfo),
			.fork_number = forkNum,
		}
	};
	NeonIOResult result;

	/* blocks until the communicator has processed the request */
	perform_request(&request, &result);
	switch (result.tag)
	{
		case NeonIOResult_WriteOK:
			return;
		case NeonIOResult_Error:
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not create rel %u/%u/%u.%u: %s",
							RelFileInfoFmt(rinfo), forkNum, pg_strerror(result.error))));
			break;
		default:
			elog(ERROR, "unexpected result for Create operation: %d", result.tag);
			break;
	}
}
|
||||
|
||||
/*
 * Notify the communicator that the relation fork was truncated to 'nblocks'
 * blocks, to keep the caches up-to-date.  Raises an ERROR on failure.
 */
void
communicator_new_rel_truncate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks)
{
	NeonIORequest request = {
		.tag = NeonIORequest_RelTruncate,
		.rel_truncate = {
			.spc_oid = NInfoGetSpcOid(rinfo),
			.db_oid = NInfoGetDbOid(rinfo),
			.rel_number = NInfoGetRelNumber(rinfo),
			.fork_number = forkNum,
			.nblocks = nblocks,
		}
	};
	NeonIOResult result;

	/* blocks until the communicator has processed the request */
	perform_request(&request, &result);
	switch (result.tag)
	{
		case NeonIOResult_WriteOK:
			return;
		case NeonIOResult_Error:
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not truncate rel %u/%u/%u.%u to %u blocks: %s",
							RelFileInfoFmt(rinfo), forkNum, nblocks, pg_strerror(result.error))));
			break;
		default:
			elog(ERROR, "unexpected result for Truncate operation: %d", result.tag);
			break;
	}
}
|
||||
|
||||
/*
 * Notify the communicator that the relation fork was removed, to keep the
 * caches up-to-date.  Raises an ERROR on failure.
 */
void
communicator_new_rel_unlink(NRelFileInfo rinfo, ForkNumber forkNum)
{
	NeonIORequest request = {
		.tag = NeonIORequest_RelUnlink,
		.rel_unlink = {
			.spc_oid = NInfoGetSpcOid(rinfo),
			.db_oid = NInfoGetDbOid(rinfo),
			.rel_number = NInfoGetRelNumber(rinfo),
			.fork_number = forkNum,
		}
	};
	NeonIOResult result;

	/* blocks until the communicator has processed the request */
	perform_request(&request, &result);
	switch (result.tag)
	{
		case NeonIOResult_WriteOK:
			return;
		case NeonIOResult_Error:
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not unlink rel %u/%u/%u.%u: %s",
							RelFileInfoFmt(rinfo), forkNum, pg_strerror(result.error))));
			break;
		default:
			elog(ERROR, "unexpected result for Unlink operation: %d", result.tag);
			break;
	}
}
|
||||
|
||||
/*
|
||||
* The worker process can read / write shared buffers directly. But if smgrread() or
|
||||
* smgrwrite() is called with a private temporary buffer, we need to copy it to the
|
||||
* "bounce buffer", to make it available fro the worker process.
|
||||
*/
|
||||
static bool
|
||||
bounce_needed(void *buffer)
|
||||
{
|
||||
if ((uintptr_t) buffer >= (uintptr_t) BufferBlocks &&
|
||||
(uintptr_t) buffer < (uintptr_t) BufferBlocks + NBuffers * BLCKSZ)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
 * Return a pointer to this backend's bounce buffer in the communicator's
 * shared memory area.  Each backend has its own slot, indexed by
 * MyProcNumber.
 */
static void *
bounce_buf(void)
{
	return &communicator_shmem_ptr->backends[MyProcNumber].bounce_buffer;
}
|
||||
|
||||
static void *
|
||||
bounce_write_if_needed(void *buffer)
|
||||
{
|
||||
void *p;
|
||||
|
||||
if (!bounce_needed(buffer))
|
||||
return buffer;
|
||||
|
||||
p = bounce_buf();
|
||||
memcpy(p, buffer, BLCKSZ);
|
||||
return p;
|
||||
}
|
||||
54
pgxn/neon/communicator_new.h
Normal file
54
pgxn/neon/communicator_new.h
Normal file
@@ -0,0 +1,54 @@
|
||||
/*-------------------------------------------------------------------------
 *
 * communicator_new.h
 *	  New communicator implementation, with an "integrated" cache
 *	  accessible from all processes.
 *
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *-------------------------------------------------------------------------
 */
#ifndef COMMUNICATOR_NEW_H
#define COMMUNICATOR_NEW_H

#include "neon_pgversioncompat.h"

#include "storage/buf_internals.h"

#include "pagestore_client.h"

/* initialization at postmaster startup */
extern void pg_init_communicator_new(void);
extern void communicator_new_shmem_request(void);
extern void communicator_new_shmem_startup(void);

/* initialization at backend startup */
extern void communicator_new_init(void);

/* Read requests (raise ERROR on failure) */
extern bool communicator_new_rel_exists(NRelFileInfo rinfo, ForkNumber forkNum);
extern BlockNumber communicator_new_rel_nblocks(NRelFileInfo rinfo, ForkNumber forknum);
extern int64 communicator_new_dbsize(Oid dbNode);
extern void communicator_new_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum,
										  BlockNumber base_blockno,
										  void **buffers, BlockNumber nblocks);
extern void communicator_new_prefetch_register_bufferv(NRelFileInfo rinfo, ForkNumber forkNum,
													   BlockNumber blockno,
													   BlockNumber nblocks);
extern int communicator_new_read_slru_segment(SlruKind kind, int64 segno,
											  void *buffer);

/* Write requests, to keep the caches up-to-date */
extern void communicator_new_write_page(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blockno,
										const void *buffer, XLogRecPtr lsn);
extern void communicator_new_rel_extend(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blockno,
										const void *buffer, XLogRecPtr lsn);
extern void communicator_new_rel_zeroextend(NRelFileInfo rinfo, ForkNumber forkNum,
											BlockNumber blockno, BlockNumber nblocks,
											XLogRecPtr lsn);
extern void communicator_new_rel_create(NRelFileInfo rinfo, ForkNumber forkNum);
extern void communicator_new_rel_truncate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks);
extern void communicator_new_rel_unlink(NRelFileInfo rinfo, ForkNumber forkNum);

#endif							/* COMMUNICATOR_NEW_H */
|
||||
@@ -164,10 +164,10 @@ static HTAB *lfc_hash;
|
||||
static int lfc_desc = -1;
|
||||
static LWLockId lfc_lock;
|
||||
static int lfc_max_size;
|
||||
static int lfc_size_limit;
|
||||
int lfc_size_limit;
|
||||
static int lfc_chunk_size_log = MAX_BLOCKS_PER_CHUNK_LOG;
|
||||
static int lfc_blocks_per_chunk = MAX_BLOCKS_PER_CHUNK;
|
||||
static char *lfc_path;
|
||||
char *lfc_path;
|
||||
static uint64 lfc_generation;
|
||||
static FileCacheControl *lfc_ctl;
|
||||
static shmem_startup_hook_type prev_shmem_startup_hook;
|
||||
|
||||
@@ -15,6 +15,8 @@
|
||||
|
||||
/* GUCs */
|
||||
extern bool lfc_store_prefetch_result;
|
||||
extern int lfc_size_limit;
|
||||
extern char *lfc_path;
|
||||
|
||||
/* functions for local file cache */
|
||||
extern void lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum,
|
||||
|
||||
@@ -279,6 +279,55 @@ AssignPageserverConnstring(const char *newval, void *extra)
|
||||
}
|
||||
}
|
||||
|
||||
/* Return a copy of the whole shard map from shared memory */
|
||||
void
|
||||
get_shard_map(char ***connstrs_p, shardno_t *num_shards_p)
|
||||
{
|
||||
uint64 begin_update_counter;
|
||||
uint64 end_update_counter;
|
||||
ShardMap *shard_map = &pagestore_shared->shard_map;
|
||||
shardno_t num_shards;
|
||||
char *buf;
|
||||
char **connstrs;
|
||||
|
||||
buf = palloc(MAX_SHARDS*MAX_PAGESERVER_CONNSTRING_SIZE);
|
||||
connstrs = palloc(sizeof(char *) * MAX_SHARDS);
|
||||
|
||||
/*
|
||||
* Postmaster can update the shared memory values concurrently, in which
|
||||
* case we would copy a garbled mix of the old and new values. We will
|
||||
* detect it because the counter's won't match, and retry. But it's
|
||||
* important that we don't do anything within the retry-loop that would
|
||||
* depend on the string having valid contents.
|
||||
*/
|
||||
do
|
||||
{
|
||||
char *p;
|
||||
|
||||
begin_update_counter = pg_atomic_read_u64(&pagestore_shared->begin_update_counter);
|
||||
end_update_counter = pg_atomic_read_u64(&pagestore_shared->end_update_counter);
|
||||
|
||||
num_shards = shard_map->num_shards;
|
||||
|
||||
p = buf;
|
||||
for (int i = 0; i < Min(num_shards, MAX_SHARDS); i++)
|
||||
{
|
||||
strlcpy(p, shard_map->connstring[i], MAX_PAGESERVER_CONNSTRING_SIZE);
|
||||
connstrs[i] = p;
|
||||
elog(LOG, "XX: connstrs[%d]: %p", i, p);
|
||||
p += MAX_PAGESERVER_CONNSTRING_SIZE;
|
||||
}
|
||||
|
||||
pg_memory_barrier();
|
||||
}
|
||||
while (begin_update_counter != end_update_counter
|
||||
|| begin_update_counter != pg_atomic_read_u64(&pagestore_shared->begin_update_counter)
|
||||
|| end_update_counter != pg_atomic_read_u64(&pagestore_shared->end_update_counter));
|
||||
|
||||
*connstrs_p = connstrs;
|
||||
*num_shards_p = num_shards;
|
||||
}
|
||||
|
||||
/*
|
||||
* Get the current number of shards, and/or the connection string for a
|
||||
* particular shard from the shard map in shared memory.
|
||||
|
||||
@@ -20,6 +20,7 @@
|
||||
#include "replication/logicallauncher.h"
|
||||
#include "replication/slot.h"
|
||||
#include "replication/walsender.h"
|
||||
#include "storage/ipc.h"
|
||||
#include "storage/proc.h"
|
||||
#include "funcapi.h"
|
||||
#include "access/htup_details.h"
|
||||
@@ -29,6 +30,7 @@
|
||||
#include "utils/guc_tables.h"
|
||||
|
||||
#include "communicator.h"
|
||||
#include "communicator_new.h"
|
||||
#include "extension_server.h"
|
||||
#include "file_cache.h"
|
||||
#include "neon.h"
|
||||
@@ -45,13 +47,17 @@ PG_MODULE_MAGIC;
|
||||
void _PG_init(void);
|
||||
|
||||
|
||||
bool neon_enable_new_communicator;
|
||||
static int running_xacts_overflow_policy;
|
||||
|
||||
#if PG_MAJORVERSION_NUM >= 16
|
||||
static shmem_startup_hook_type prev_shmem_startup_hook;
|
||||
|
||||
static void neon_shmem_startup_hook(void);
|
||||
#if PG_VERSION_NUM>=150000
|
||||
static shmem_request_hook_type prev_shmem_request_hook;
|
||||
#endif
|
||||
|
||||
static void neon_shmem_request(void);
|
||||
static void neon_shmem_startup_hook(void);
|
||||
|
||||
#if PG_MAJORVERSION_NUM >= 17
|
||||
uint32 WAIT_EVENT_NEON_LFC_MAINTENANCE;
|
||||
uint32 WAIT_EVENT_NEON_LFC_READ;
|
||||
@@ -430,17 +436,36 @@ _PG_init(void)
|
||||
*/
|
||||
#if PG_VERSION_NUM >= 160000
|
||||
load_file("$libdir/neon_rmgr", false);
|
||||
#endif
|
||||
|
||||
prev_shmem_startup_hook = shmem_startup_hook;
|
||||
shmem_startup_hook = neon_shmem_startup_hook;
|
||||
#if PG_VERSION_NUM>=150000
|
||||
prev_shmem_request_hook = shmem_request_hook;
|
||||
shmem_request_hook = neon_shmem_request;
|
||||
#else
|
||||
neon_shmem_request();
|
||||
#endif
|
||||
|
||||
DefineCustomBoolVariable(
|
||||
"neon.enable_new_communicator",
|
||||
"Enables new communicator implementation",
|
||||
NULL,
|
||||
&neon_enable_new_communicator,
|
||||
true,
|
||||
PGC_POSTMASTER,
|
||||
0,
|
||||
NULL, NULL, NULL);
|
||||
|
||||
pg_init_libpagestore();
|
||||
lfc_init();
|
||||
pg_init_walproposer();
|
||||
init_lwlsncache();
|
||||
|
||||
pg_init_communicator();
|
||||
if (neon_enable_new_communicator)
|
||||
pg_init_communicator_new();
|
||||
|
||||
Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
|
||||
|
||||
InitUnstableExtensionsSupport();
|
||||
@@ -559,7 +584,17 @@ backpressure_throttling_time(PG_FUNCTION_ARGS)
|
||||
PG_RETURN_UINT64(BackpressureThrottlingTime());
|
||||
}
|
||||
|
||||
#if PG_MAJORVERSION_NUM >= 16
|
||||
static void
|
||||
neon_shmem_request(void)
|
||||
{
|
||||
#if PG_VERSION_NUM>=150000
|
||||
if (prev_shmem_request_hook)
|
||||
prev_shmem_request_hook();
|
||||
#endif
|
||||
|
||||
communicator_new_shmem_request();
|
||||
}
|
||||
|
||||
static void
|
||||
neon_shmem_startup_hook(void)
|
||||
{
|
||||
@@ -579,5 +614,6 @@ neon_shmem_startup_hook(void)
|
||||
WAIT_EVENT_NEON_PS_READ = WaitEventExtensionNew("Neon/PS_ReadIO");
|
||||
WAIT_EVENT_NEON_WAL_DL = WaitEventExtensionNew("Neon/WAL_Download");
|
||||
#endif
|
||||
|
||||
communicator_new_shmem_startup();
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
#include "utils/wait_event.h"
|
||||
|
||||
/* GUCs */
|
||||
extern bool neon_enable_new_communicator;
|
||||
extern char *neon_auth_token;
|
||||
extern char *neon_timeline;
|
||||
extern char *neon_tenant;
|
||||
|
||||
@@ -9,6 +9,10 @@
|
||||
#include "fmgr.h"
|
||||
#include "storage/buf_internals.h"
|
||||
|
||||
#if PG_MAJORVERSION_NUM < 16
|
||||
typedef PGAlignedBlock PGIOAlignedBlock;
|
||||
#endif
|
||||
|
||||
#if PG_MAJORVERSION_NUM < 17
|
||||
#define NRelFileInfoBackendIsTemp(rinfo) (rinfo.backend != InvalidBackendId)
|
||||
#else
|
||||
@@ -154,6 +158,10 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode,
|
||||
#define AmAutoVacuumWorkerProcess() (IsAutoVacuumWorkerProcess())
|
||||
#endif
|
||||
|
||||
#if PG_MAJORVERSION_NUM < 17
|
||||
#define MyProcNumber (MyProc - &ProcGlobal->allProcs[0])
|
||||
#endif
|
||||
|
||||
#if PG_MAJORVERSION_NUM < 15
|
||||
extern void InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags);
|
||||
#endif
|
||||
|
||||
@@ -228,6 +228,7 @@ extern char *neon_tenant;
|
||||
extern int32 max_cluster_size;
|
||||
extern int neon_protocol_version;
|
||||
|
||||
extern void get_shard_map(char ***connstrs_p, shardno_t *num_shards_p);
|
||||
extern shardno_t get_shard_number(BufferTag* tag);
|
||||
|
||||
extern const f_smgr *smgr_neon(ProcNumber backend, NRelFileInfo rinfo);
|
||||
|
||||
@@ -62,6 +62,7 @@
|
||||
|
||||
#include "bitmap.h"
|
||||
#include "communicator.h"
|
||||
#include "communicator_new.h"
|
||||
#include "file_cache.h"
|
||||
#include "neon.h"
|
||||
#include "neon_lwlsncache.h"
|
||||
@@ -72,10 +73,6 @@
|
||||
#include "access/xlogrecovery.h"
|
||||
#endif
|
||||
|
||||
#if PG_VERSION_NUM < 160000
|
||||
typedef PGAlignedBlock PGIOAlignedBlock;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* If DEBUG_COMPARE_LOCAL is defined, we pass through all the SMGR API
|
||||
* calls to md.c, and *also* do the calls to the Page Server. On every
|
||||
@@ -97,7 +94,7 @@ static char *hexdump_page(char *page);
|
||||
NInfoGetRelNumber(InfoFromSMgrRel(reln)) >= FirstNormalObjectId \
|
||||
)
|
||||
|
||||
const int SmgrTrace = DEBUG5;
|
||||
const int SmgrTrace = DEBUG1;
|
||||
|
||||
/* unlogged relation build states */
|
||||
typedef enum
|
||||
@@ -779,10 +776,15 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
|
||||
return false;
|
||||
}
|
||||
|
||||
neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum,
|
||||
REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
|
||||
if (neon_enable_new_communicator)
|
||||
return communicator_new_rel_exists(InfoFromSMgrRel(reln), forkNum);
|
||||
else
|
||||
{
|
||||
neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum,
|
||||
REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
|
||||
|
||||
return communicator_exists(InfoFromSMgrRel(reln), forkNum, &request_lsns);
|
||||
return communicator_exists(InfoFromSMgrRel(reln), forkNum, &request_lsns);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -820,33 +822,40 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forkNum);
|
||||
|
||||
/*
|
||||
* Newly created relation is empty, remember that in the relsize cache.
|
||||
*
|
||||
* Note that in REDO, this is called to make sure the relation fork
|
||||
* exists, but it does not truncate the relation. So, we can only update
|
||||
* the relsize if it didn't exist before.
|
||||
*
|
||||
* Also, in redo, we must make sure to update the cached size of the
|
||||
* relation, as that is the primary source of truth for REDO's file length
|
||||
* considerations, and as file extension isn't (perfectly) logged, we need
|
||||
* to take care of that before we hit file size checks.
|
||||
*
|
||||
* FIXME: This is currently not just an optimization, but required for
|
||||
* correctness. Postgres can call smgrnblocks() on the newly-created
|
||||
* relation. Currently, we don't call SetLastWrittenLSN() when a new
|
||||
* relation created, so if we didn't remember the size in the relsize
|
||||
* cache, we might call smgrnblocks() on the newly-created relation before
|
||||
* the creation WAL record has been received by the page server.
|
||||
*/
|
||||
if (isRedo)
|
||||
if (neon_enable_new_communicator)
|
||||
{
|
||||
update_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0);
|
||||
get_cached_relsize(InfoFromSMgrRel(reln), forkNum,
|
||||
&reln->smgr_cached_nblocks[forkNum]);
|
||||
communicator_new_rel_create(InfoFromSMgrRel(reln), forkNum);
|
||||
}
|
||||
else
|
||||
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0);
|
||||
{
|
||||
/*
|
||||
* Newly created relation is empty, remember that in the relsize cache.
|
||||
*
|
||||
* Note that in REDO, this is called to make sure the relation fork
|
||||
* exists, but it does not truncate the relation. So, we can only update
|
||||
* the relsize if it didn't exist before.
|
||||
*
|
||||
* Also, in redo, we must make sure to update the cached size of the
|
||||
* relation, as that is the primary source of truth for REDO's file length
|
||||
* considerations, and as file extension isn't (perfectly) logged, we need
|
||||
* to take care of that before we hit file size checks.
|
||||
*
|
||||
* FIXME: This is currently not just an optimization, but required for
|
||||
* correctness. Postgres can call smgrnblocks() on the newly-created
|
||||
* relation. Currently, we don't call SetLastWrittenLSN() when a new
|
||||
* relation created, so if we didn't remember the size in the relsize
|
||||
* cache, we might call smgrnblocks() on the newly-created relation before
|
||||
* the creation WAL record has been received by the page server.
|
||||
*/
|
||||
if (isRedo)
|
||||
{
|
||||
update_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0);
|
||||
get_cached_relsize(InfoFromSMgrRel(reln), forkNum,
|
||||
&reln->smgr_cached_nblocks[forkNum]);
|
||||
}
|
||||
else
|
||||
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0);
|
||||
}
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (IS_LOCAL_REL(reln))
|
||||
@@ -968,34 +977,43 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
||||
forkNum, blkno,
|
||||
(uint32) (lsn >> 32), (uint32) lsn);
|
||||
|
||||
lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer);
|
||||
if (neon_enable_new_communicator)
|
||||
{
|
||||
// FIXME: this can pass lsn == invalid. Is that ok?
|
||||
communicator_new_rel_extend(InfoFromSMgrRel(reln), forkNum, blkno, (const void *) buffer, lsn);
|
||||
}
|
||||
else
|
||||
{
|
||||
lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer);
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (IS_LOCAL_REL(reln))
|
||||
mdextend(reln, forkNum, blkno, buffer, skipFsync);
|
||||
if (IS_LOCAL_REL(reln))
|
||||
mdextend(reln, forkNum, blkno, buffer, skipFsync);
|
||||
#endif
|
||||
|
||||
/*
|
||||
* smgr_extend is often called with an all-zeroes page, so
|
||||
* lsn==InvalidXLogRecPtr. An smgr_write() call will come for the buffer
|
||||
* later, after it has been initialized with the real page contents, and
|
||||
* it is eventually evicted from the buffer cache. But we need a valid LSN
|
||||
* to the relation metadata update now.
|
||||
*/
|
||||
if (lsn == InvalidXLogRecPtr)
|
||||
{
|
||||
lsn = GetXLogInsertRecPtr();
|
||||
neon_set_lwlsn_block(lsn, InfoFromSMgrRel(reln), forkNum, blkno);
|
||||
/*
|
||||
* smgr_extend is often called with an all-zeroes page, so
|
||||
* lsn==InvalidXLogRecPtr. An smgr_write() call will come for the buffer
|
||||
* later, after it has been initialized with the real page contents, and
|
||||
* it is eventually evicted from the buffer cache. But we need a valid LSN
|
||||
* to the relation metadata update now.
|
||||
*/
|
||||
if (lsn == InvalidXLogRecPtr)
|
||||
{
|
||||
lsn = GetXLogInsertRecPtr();
|
||||
neon_set_lwlsn_block(lsn, InfoFromSMgrRel(reln), forkNum, blkno);
|
||||
}
|
||||
neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum);
|
||||
}
|
||||
neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum);
|
||||
}
|
||||
|
||||
#if PG_MAJORVERSION_NUM >= 16
|
||||
static void
|
||||
neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
|
||||
neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber start_block,
|
||||
int nblocks, bool skipFsync)
|
||||
{
|
||||
const PGIOAlignedBlock buffer = {0};
|
||||
BlockNumber blocknum = start_block;
|
||||
int remblocks = nblocks;
|
||||
XLogRecPtr lsn = 0;
|
||||
|
||||
@@ -1092,8 +1110,15 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
|
||||
|
||||
Assert(lsn != 0);
|
||||
|
||||
neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum);
|
||||
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blocknum);
|
||||
if (neon_enable_new_communicator)
|
||||
{
|
||||
communicator_new_rel_zeroextend(InfoFromSMgrRel(reln), forkNum, start_block, nblocks, lsn);
|
||||
}
|
||||
else
|
||||
{
|
||||
neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum);
|
||||
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blocknum);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -1153,11 +1178,17 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
}
|
||||
|
||||
if (neon_enable_new_communicator)
|
||||
{
|
||||
communicator_new_prefetch_register_bufferv(InfoFromSMgrRel(reln), forknum, blocknum, nblocks);
|
||||
return false;
|
||||
}
|
||||
|
||||
tag.spcOid = reln->smgr_rlocator.locator.spcOid;
|
||||
tag.dbOid = reln->smgr_rlocator.locator.dbOid;
|
||||
tag.relNumber = reln->smgr_rlocator.locator.relNumber;
|
||||
tag.forkNum = forknum;
|
||||
|
||||
|
||||
while (nblocks > 0)
|
||||
{
|
||||
int iterblocks = Min(nblocks, PG_IOV_MAX);
|
||||
@@ -1179,7 +1210,8 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
blocknum += iterblocks;
|
||||
}
|
||||
|
||||
communicator_prefetch_pump_state(false);
|
||||
if (!neon_enable_new_communicator)
|
||||
communicator_prefetch_pump_state(false);
|
||||
|
||||
return false;
|
||||
}
|
||||
@@ -1216,9 +1248,13 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
|
||||
|
||||
CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln));
|
||||
|
||||
communicator_prefetch_register_bufferv(tag, NULL, 1, NULL);
|
||||
if (neon_enable_new_communicator)
|
||||
communicator_new_prefetch_register_bufferv(InfoFromSMgrRel(reln), forknum, blocknum, 1);
|
||||
else
|
||||
communicator_prefetch_register_bufferv(tag, NULL, 1, NULL);
|
||||
|
||||
communicator_prefetch_pump_state(false);
|
||||
if (!neon_enable_new_communicator)
|
||||
communicator_prefetch_pump_state(false);
|
||||
|
||||
return false;
|
||||
}
|
||||
@@ -1262,7 +1298,8 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
|
||||
*/
|
||||
neon_log(SmgrTrace, "writeback noop");
|
||||
|
||||
communicator_prefetch_pump_state(false);
|
||||
if (!neon_enable_new_communicator)
|
||||
communicator_prefetch_pump_state(false);
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (IS_LOCAL_REL(reln))
|
||||
@@ -1278,7 +1315,14 @@ void
|
||||
neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
neon_request_lsns request_lsns, void *buffer)
|
||||
{
|
||||
communicator_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL);
|
||||
if (neon_enable_new_communicator)
|
||||
{
|
||||
// FIXME: request_lsns is ignored. That affects the neon_test_utils callers.
|
||||
// Add the capability to specify the LSNs explicitly, for the sake of neon_test_utils ?
|
||||
communicator_new_read_at_lsnv(rinfo, forkNum, blkno, &buffer, 1);
|
||||
}
|
||||
else
|
||||
communicator_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL);
|
||||
}
|
||||
|
||||
#if PG_MAJORVERSION_NUM < 17
|
||||
@@ -1296,6 +1340,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
|
||||
neon_request_lsns request_lsns;
|
||||
bits8 present;
|
||||
void *bufferp;
|
||||
bool prefetch_hit;
|
||||
|
||||
switch (reln->smgr_relpersistence)
|
||||
{
|
||||
@@ -1314,33 +1359,62 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
|
||||
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
}
|
||||
|
||||
/* Try to read PS results if they are available */
|
||||
communicator_prefetch_pump_state(false);
|
||||
|
||||
neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1);
|
||||
|
||||
present = 0;
|
||||
bufferp = buffer;
|
||||
if (communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present))
|
||||
|
||||
if (neon_enable_new_communicator)
|
||||
{
|
||||
/* Prefetch hit */
|
||||
return;
|
||||
communicator_new_read_at_lsnv(InfoFromSMgrRel(reln), forkNum, blkno,
|
||||
(void *) &buffer, 1);
|
||||
}
|
||||
|
||||
/* Try to read from local file cache */
|
||||
if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer))
|
||||
else
|
||||
{
|
||||
MyNeonCounters->file_cache_hits_total++;
|
||||
return;
|
||||
prefetch_hit = communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present);
|
||||
if (prefetch_hit)
|
||||
{
|
||||
/* Prefetch hit */
|
||||
return;
|
||||
}
|
||||
|
||||
/* Try to read from local file cache */
|
||||
if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer))
|
||||
{
|
||||
MyNeonCounters->file_cache_hits_total++;
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Try to receive prefetch results once again just to make sure we
|
||||
* don't leave the smgr code while the OS might still have buffered
|
||||
* bytes.
|
||||
*/
|
||||
communicator_prefetch_pump_state(false);
|
||||
|
||||
neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1);
|
||||
|
||||
prefetch_hit = communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present);
|
||||
|
||||
if (prefetch_hit)
|
||||
{
|
||||
/* Prefetch hit */
|
||||
return;
|
||||
}
|
||||
|
||||
/* Try to read from local file cache */
|
||||
if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer))
|
||||
{
|
||||
MyNeonCounters->file_cache_hits_total++;
|
||||
return;
|
||||
}
|
||||
|
||||
neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer);
|
||||
|
||||
/*
|
||||
* Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
|
||||
*/
|
||||
communicator_prefetch_pump_state(false);
|
||||
}
|
||||
|
||||
neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer);
|
||||
|
||||
/*
|
||||
* Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
|
||||
*/
|
||||
communicator_prefetch_pump_state(false);
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
|
||||
{
|
||||
@@ -1449,38 +1523,47 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
nblocks, PG_IOV_MAX);
|
||||
|
||||
/* Try to read PS results if they are available */
|
||||
communicator_prefetch_pump_state(false);
|
||||
|
||||
neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum,
|
||||
request_lsns, nblocks);
|
||||
if (!neon_enable_new_communicator)
|
||||
communicator_prefetch_pump_state(false);
|
||||
|
||||
memset(read_pages, 0, sizeof(read_pages));
|
||||
|
||||
prefetch_result = communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forknum,
|
||||
blocknum, request_lsns, nblocks,
|
||||
buffers, read_pages);
|
||||
if (neon_enable_new_communicator)
|
||||
{
|
||||
communicator_new_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum,
|
||||
buffers, nblocks);
|
||||
}
|
||||
else
|
||||
{
|
||||
neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum,
|
||||
request_lsns, nblocks);
|
||||
|
||||
prefetch_result = communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forknum,
|
||||
blocknum, request_lsns, nblocks,
|
||||
buffers, read_pages);
|
||||
|
||||
if (prefetch_result == nblocks)
|
||||
return;
|
||||
if (prefetch_result == nblocks)
|
||||
return;
|
||||
|
||||
/* Try to read from local file cache */
|
||||
lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers,
|
||||
nblocks, read_pages);
|
||||
/* Try to read from local file cache */
|
||||
lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers,
|
||||
nblocks, read_pages);
|
||||
|
||||
if (lfc_result > 0)
|
||||
MyNeonCounters->file_cache_hits_total += lfc_result;
|
||||
if (lfc_result > 0)
|
||||
MyNeonCounters->file_cache_hits_total += lfc_result;
|
||||
|
||||
/* Read all blocks from LFC, so we're done */
|
||||
if (prefetch_result + lfc_result == nblocks)
|
||||
return;
|
||||
/* Read all blocks from LFC, so we're done */
|
||||
if (prefetch_result + lfc_result == nblocks)
|
||||
return;
|
||||
|
||||
communicator_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns,
|
||||
buffers, nblocks, read_pages);
|
||||
communicator_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns,
|
||||
buffers, nblocks, read_pages);
|
||||
|
||||
/*
|
||||
* Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
|
||||
*/
|
||||
communicator_prefetch_pump_state(false);
|
||||
/*
|
||||
* Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
|
||||
*/
|
||||
communicator_prefetch_pump_state(false);
|
||||
}
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (forknum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
|
||||
@@ -1663,9 +1746,16 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
|
||||
forknum, blocknum,
|
||||
(uint32) (lsn >> 32), (uint32) lsn);
|
||||
|
||||
lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);
|
||||
if (neon_enable_new_communicator)
|
||||
{
|
||||
communicator_new_write_page(InfoFromSMgrRel(reln), forknum, blocknum, buffer, lsn);
|
||||
}
|
||||
else
|
||||
{
|
||||
lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);
|
||||
|
||||
communicator_prefetch_pump_state(false);
|
||||
communicator_prefetch_pump_state(false);
|
||||
}
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (IS_LOCAL_REL(reln))
|
||||
@@ -1725,9 +1815,21 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
|
||||
|
||||
neon_wallog_pagev(reln, forknum, blkno, nblocks, (const char **) buffers, false);
|
||||
|
||||
lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);
|
||||
if (neon_enable_new_communicator)
|
||||
{
|
||||
for (int i = 0; i < nblocks; i++)
|
||||
{
|
||||
XLogRecPtr lsn = PageGetLSN((Page) buffers[i]);
|
||||
|
||||
communicator_prefetch_pump_state(false);
|
||||
communicator_new_write_page(InfoFromSMgrRel(reln), forknum, blkno + i, buffers[i], lsn);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);
|
||||
|
||||
communicator_prefetch_pump_state(false);
|
||||
}
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (IS_LOCAL_REL(reln))
|
||||
@@ -1763,19 +1865,26 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
|
||||
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
}
|
||||
|
||||
if (get_cached_relsize(InfoFromSMgrRel(reln), forknum, &n_blocks))
|
||||
if (neon_enable_new_communicator)
|
||||
{
|
||||
neon_log(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks",
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forknum, n_blocks);
|
||||
return n_blocks;
|
||||
n_blocks = communicator_new_rel_nblocks(InfoFromSMgrRel(reln), forknum);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (get_cached_relsize(InfoFromSMgrRel(reln), forknum, &n_blocks))
|
||||
{
|
||||
neon_log(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks",
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forknum, n_blocks);
|
||||
return n_blocks;
|
||||
}
|
||||
|
||||
neon_get_request_lsns(InfoFromSMgrRel(reln), forknum,
|
||||
REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
|
||||
neon_get_request_lsns(InfoFromSMgrRel(reln), forknum,
|
||||
REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
|
||||
|
||||
n_blocks = communicator_nblocks(InfoFromSMgrRel(reln), forknum, &request_lsns);
|
||||
update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
|
||||
n_blocks = communicator_nblocks(InfoFromSMgrRel(reln), forknum, &request_lsns);
|
||||
update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
|
||||
}
|
||||
|
||||
neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
@@ -1796,10 +1905,17 @@ neon_dbsize(Oid dbNode)
|
||||
neon_request_lsns request_lsns;
|
||||
NRelFileInfo dummy_node = {0};
|
||||
|
||||
neon_get_request_lsns(dummy_node, MAIN_FORKNUM,
|
||||
REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
|
||||
if (neon_enable_new_communicator)
|
||||
{
|
||||
db_size = communicator_new_dbsize(dbNode);
|
||||
}
|
||||
else
|
||||
{
|
||||
neon_get_request_lsns(dummy_node, MAIN_FORKNUM,
|
||||
REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
|
||||
|
||||
db_size = communicator_dbsize(dbNode, &request_lsns);
|
||||
db_size = communicator_dbsize(dbNode, &request_lsns);
|
||||
}
|
||||
|
||||
neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
|
||||
dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size);
|
||||
@@ -1813,8 +1929,6 @@ neon_dbsize(Oid dbNode)
|
||||
static void
|
||||
neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, BlockNumber nblocks)
|
||||
{
|
||||
XLogRecPtr lsn;
|
||||
|
||||
switch (reln->smgr_relpersistence)
|
||||
{
|
||||
case 0:
|
||||
@@ -1833,34 +1947,43 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, Blo
|
||||
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
}
|
||||
|
||||
set_cached_relsize(InfoFromSMgrRel(reln), forknum, nblocks);
|
||||
if (neon_enable_new_communicator)
|
||||
{
|
||||
communicator_new_rel_truncate(InfoFromSMgrRel(reln), forknum, nblocks);
|
||||
}
|
||||
else
|
||||
{
|
||||
XLogRecPtr lsn;
|
||||
|
||||
/*
|
||||
* Truncating a relation drops all its buffers from the buffer cache
|
||||
* without calling smgrwrite() on them. But we must account for that in
|
||||
* our tracking of last-written-LSN all the same: any future smgrnblocks()
|
||||
* request must return the new size after the truncation. We don't know
|
||||
* what the LSN of the truncation record was, so be conservative and use
|
||||
* the most recently inserted WAL record's LSN.
|
||||
*/
|
||||
lsn = GetXLogInsertRecPtr();
|
||||
lsn = nm_adjust_lsn(lsn);
|
||||
set_cached_relsize(InfoFromSMgrRel(reln), forknum, nblocks);
|
||||
|
||||
/*
|
||||
* Flush it, too. We don't actually care about it here, but let's uphold
|
||||
* the invariant that last-written LSN <= flush LSN.
|
||||
*/
|
||||
XLogFlush(lsn);
|
||||
/*
|
||||
* Truncating a relation drops all its buffers from the buffer cache
|
||||
* without calling smgrwrite() on them. But we must account for that in
|
||||
* our tracking of last-written-LSN all the same: any future smgrnblocks()
|
||||
* request must return the new size after the truncation. We don't know
|
||||
* what the LSN of the truncation record was, so be conservative and use
|
||||
* the most recently inserted WAL record's LSN.
|
||||
*/
|
||||
lsn = GetXLogInsertRecPtr();
|
||||
lsn = nm_adjust_lsn(lsn);
|
||||
|
||||
/*
|
||||
* Truncate may affect several chunks of relations. So we should either
|
||||
* update last written LSN for all of them, or update LSN for "dummy"
|
||||
* metadata block. Second approach seems more efficient. If the relation
|
||||
* is extended again later, the extension will update the last-written LSN
|
||||
* for the extended pages, so there's no harm in leaving behind obsolete
|
||||
* entries for the truncated chunks.
|
||||
*/
|
||||
neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forknum);
|
||||
/*
|
||||
* Flush it, too. We don't actually care about it here, but let's uphold
|
||||
* the invariant that last-written LSN <= flush LSN.
|
||||
*/
|
||||
XLogFlush(lsn);
|
||||
|
||||
/*
|
||||
* Truncate may affect several chunks of relations. So we should either
|
||||
* update last written LSN for all of them, or update LSN for "dummy"
|
||||
* metadata block. Second approach seems more efficient. If the relation
|
||||
* is extended again later, the extension will update the last-written LSN
|
||||
* for the extended pages, so there's no harm in leaving behind obsolete
|
||||
* entries for the truncated chunks.
|
||||
*/
|
||||
neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forknum);
|
||||
}
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (IS_LOCAL_REL(reln))
|
||||
@@ -1902,7 +2025,8 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
|
||||
|
||||
neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop");
|
||||
|
||||
communicator_prefetch_pump_state(false);
|
||||
if (!neon_enable_new_communicator)
|
||||
communicator_prefetch_pump_state(false);
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (IS_LOCAL_REL(reln))
|
||||
@@ -2173,7 +2297,10 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
|
||||
request_lsns.not_modified_since = not_modified_since;
|
||||
request_lsns.effective_request_lsn = request_lsn;
|
||||
|
||||
n_blocks = communicator_read_slru_segment(kind, segno, &request_lsns, buffer);
|
||||
if (neon_enable_new_communicator)
|
||||
n_blocks = communicator_new_read_slru_segment(kind, segno, buffer);
|
||||
else
|
||||
n_blocks = communicator_read_slru_segment(kind, segno, &request_lsns, buffer);
|
||||
|
||||
return n_blocks;
|
||||
}
|
||||
@@ -2210,7 +2337,8 @@ AtEOXact_neon(XactEvent event, void *arg)
|
||||
}
|
||||
break;
|
||||
}
|
||||
communicator_reconfigure_timeout_if_needed();
|
||||
if (!neon_enable_new_communicator)
|
||||
communicator_reconfigure_timeout_if_needed();
|
||||
}
|
||||
|
||||
static const struct f_smgr neon_smgr =
|
||||
@@ -2268,7 +2396,10 @@ smgr_init_neon(void)
|
||||
|
||||
smgr_init_standard();
|
||||
neon_init();
|
||||
communicator_init();
|
||||
if (neon_enable_new_communicator)
|
||||
communicator_new_init();
|
||||
else
|
||||
communicator_init();
|
||||
}
|
||||
|
||||
|
||||
@@ -2280,6 +2411,12 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
|
||||
/* This is only used in WAL replay */
|
||||
Assert(RecoveryInProgress());
|
||||
|
||||
if (neon_enable_new_communicator)
|
||||
{
|
||||
// FIXME: broken, but this is only used in replica
|
||||
elog(ERROR, "not implemented yet");
|
||||
}
|
||||
|
||||
/* Extend the relation if we know its size */
|
||||
if (get_cached_relsize(rinfo, forknum, &relsize))
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user