mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-15 12:10:37 +00:00
Compare commits
132 Commits
release-co
...
elizabeth/
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
09791177a2 | ||
|
|
782062014e | ||
|
|
d0b3629412 | ||
|
|
f4d51c0f5c | ||
|
|
ec17ae0658 | ||
|
|
9ecce60ded | ||
|
|
e74a957045 | ||
|
|
396a16a3b2 | ||
|
|
7140a50225 | ||
|
|
68f18ccacf | ||
|
|
786888d93f | ||
|
|
255537dda1 | ||
|
|
8b494f6a24 | ||
|
|
28a61741b3 | ||
|
|
2fb6164bf8 | ||
|
|
328f28dfe5 | ||
|
|
95838056da | ||
|
|
6d451654f1 | ||
|
|
37c58522a2 | ||
|
|
6ae4b89000 | ||
|
|
f7ec7668a2 | ||
|
|
038e967daf | ||
|
|
6a43f23eca | ||
|
|
868f194a3b | ||
|
|
9c6c780201 | ||
|
|
6123fe2d5e | ||
|
|
1577665c20 | ||
|
|
d8ebd1d771 | ||
|
|
c8a96cf722 | ||
|
|
56d505bce6 | ||
|
|
dae203ef69 | ||
|
|
1fb1315aed | ||
|
|
838622c594 | ||
|
|
3fd5a94a85 | ||
|
|
e7d6f525b3 | ||
|
|
e4ca3ac745 | ||
|
|
b69d103b90 | ||
|
|
208cbd52d4 | ||
|
|
4b6f02e47d | ||
|
|
c567ed0de0 | ||
|
|
c698cee19a | ||
|
|
4a3f32bf4a | ||
|
|
a963aab14b | ||
|
|
8202c6172f | ||
|
|
5bdba70f7d | ||
|
|
25fffd3a55 | ||
|
|
e00fd45bba | ||
|
|
69a47d789d | ||
|
|
b36f880710 | ||
|
|
745b750f33 | ||
|
|
3b8be98b67 | ||
|
|
3e72edede5 | ||
|
|
a650f7f5af | ||
|
|
fc3994eb71 | ||
|
|
781bf4945d | ||
|
|
a21c1174ed | ||
|
|
8d7ed2a4ee | ||
|
|
5b62749c42 | ||
|
|
af5bb67f08 | ||
|
|
589bfdfd02 | ||
|
|
87179e26b3 | ||
|
|
f05df409bd | ||
|
|
f6c0f6c4ec | ||
|
|
62cd3b8d3d | ||
|
|
8d26978ed9 | ||
|
|
35372a8f12 | ||
|
|
6d95a3fe2d | ||
|
|
99726495c7 | ||
|
|
4a4a457312 | ||
|
|
e78d1e2ec6 | ||
|
|
af429b4a62 | ||
|
|
f06bb2bbd8 | ||
|
|
b3c25418a6 | ||
|
|
33549bad1d | ||
|
|
009168d711 | ||
|
|
7c9bd542a6 | ||
|
|
014823b305 | ||
|
|
af9379ccf6 | ||
|
|
bb28109ffa | ||
|
|
60a0bec1c0 | ||
|
|
31fa7a545d | ||
|
|
ac464c5f2c | ||
|
|
0dddb1e373 | ||
|
|
3acb263e62 | ||
|
|
1e83398cdd | ||
|
|
be8ed81532 | ||
|
|
12b08c4b82 | ||
|
|
827358dd03 | ||
|
|
d367273000 | ||
|
|
e2bad5d9e9 | ||
|
|
5623e4665b | ||
|
|
8abb4dab6d | ||
|
|
731667ac37 | ||
|
|
6a1374d106 | ||
|
|
f7c908f2f0 | ||
|
|
86671e3a0b | ||
|
|
319cd74f73 | ||
|
|
0efefbf77c | ||
|
|
e6a4171fa1 | ||
|
|
0c25ea9e31 | ||
|
|
6692321026 | ||
|
|
791df28755 | ||
|
|
d20da994f4 | ||
|
|
6dbbdaae73 | ||
|
|
977bc09d2a | ||
|
|
44269fcd5e | ||
|
|
44cc648dc8 | ||
|
|
884e028a4a | ||
|
|
42df3e5453 | ||
|
|
fc743e284f | ||
|
|
d02f9a2139 | ||
|
|
083118e98e | ||
|
|
54cd2272f1 | ||
|
|
e40193e3c8 | ||
|
|
ce9f7bacc1 | ||
|
|
b7891f8fe8 | ||
|
|
5f2adaa9ad | ||
|
|
3e5e396c8d | ||
|
|
9d781c6fda | ||
|
|
cf5d038472 | ||
|
|
d785100c02 | ||
|
|
2c0d930e3d | ||
|
|
66171a117b | ||
|
|
df2806e7a0 | ||
|
|
07631692db | ||
|
|
4c77397943 | ||
|
|
7bb58be546 | ||
|
|
b5373de208 | ||
|
|
b86c610f42 | ||
|
|
0f520d79ab | ||
|
|
93eb7bb6b8 | ||
|
|
e58d0fece1 |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -13,6 +13,7 @@ neon.iml
|
||||
/.neon
|
||||
/integration_tests/.neon
|
||||
compaction-suite-results.*
|
||||
pgxn/neon/communicator/communicator_bindings.h
|
||||
|
||||
# Coverage
|
||||
*.profraw
|
||||
|
||||
384
Cargo.lock
generated
384
Cargo.lock
generated
@@ -253,6 +253,17 @@ version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a8ab6b55fe97976e46f91ddbed8d147d966475dc29b2032757ba47e02376fbc3"
|
||||
|
||||
[[package]]
|
||||
name = "atomic_enum"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "99e1aca718ea7b89985790c94aad72d77533063fe00bc497bb79a7c2dae6a661"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.100",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "autocfg"
|
||||
version = "1.1.0"
|
||||
@@ -687,13 +698,40 @@ dependencies = [
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "axum"
|
||||
version = "0.7.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"axum-core 0.4.5",
|
||||
"bytes",
|
||||
"futures-util",
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
"http-body-util",
|
||||
"itoa",
|
||||
"matchit 0.7.3",
|
||||
"memchr",
|
||||
"mime",
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
"rustversion",
|
||||
"serde",
|
||||
"sync_wrapper 1.0.1",
|
||||
"tower 0.5.2",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "axum"
|
||||
version = "0.8.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6d6fd624c75e18b3b4c6b9caf42b1afe24437daaee904069137d8bab077be8b8"
|
||||
dependencies = [
|
||||
"axum-core",
|
||||
"axum-core 0.5.0",
|
||||
"base64 0.22.1",
|
||||
"bytes",
|
||||
"form_urlencoded",
|
||||
@@ -701,10 +739,10 @@ dependencies = [
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
"http-body-util",
|
||||
"hyper 1.4.1",
|
||||
"hyper 1.6.0",
|
||||
"hyper-util",
|
||||
"itoa",
|
||||
"matchit",
|
||||
"matchit 0.8.4",
|
||||
"memchr",
|
||||
"mime",
|
||||
"percent-encoding",
|
||||
@@ -724,6 +762,26 @@ dependencies = [
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "axum-core"
|
||||
version = "0.4.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"bytes",
|
||||
"futures-util",
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
"http-body-util",
|
||||
"mime",
|
||||
"pin-project-lite",
|
||||
"rustversion",
|
||||
"sync_wrapper 1.0.1",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "axum-core"
|
||||
version = "0.5.0"
|
||||
@@ -750,8 +808,8 @@ version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "460fc6f625a1f7705c6cf62d0d070794e94668988b1c38111baeec177c715f7b"
|
||||
dependencies = [
|
||||
"axum",
|
||||
"axum-core",
|
||||
"axum 0.8.1",
|
||||
"axum-core 0.5.0",
|
||||
"bytes",
|
||||
"futures-util",
|
||||
"headers",
|
||||
@@ -1086,6 +1144,25 @@ version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
|
||||
|
||||
[[package]]
|
||||
name = "cbindgen"
|
||||
version = "0.28.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eadd868a2ce9ca38de7eeafdcec9c7065ef89b42b32f0839278d55f35c54d1ff"
|
||||
dependencies = [
|
||||
"clap",
|
||||
"heck 0.4.1",
|
||||
"indexmap 2.9.0",
|
||||
"log",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"syn 2.0.100",
|
||||
"tempfile",
|
||||
"toml",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cc"
|
||||
version = "1.2.16"
|
||||
@@ -1212,7 +1289,7 @@ version = "4.5.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab"
|
||||
dependencies = [
|
||||
"heck",
|
||||
"heck 0.5.0",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.100",
|
||||
@@ -1270,6 +1347,34 @@ dependencies = [
|
||||
"unicode-width",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "communicator"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"atomic_enum",
|
||||
"axum 0.8.1",
|
||||
"bytes",
|
||||
"cbindgen",
|
||||
"clashmap",
|
||||
"http 1.1.0",
|
||||
"libc",
|
||||
"metrics",
|
||||
"neon-shmem",
|
||||
"nix 0.30.1",
|
||||
"pageserver_client_grpc",
|
||||
"pageserver_page_api",
|
||||
"prometheus",
|
||||
"prost 0.13.5",
|
||||
"thiserror 1.0.69",
|
||||
"tokio",
|
||||
"tokio-pipe",
|
||||
"tonic 0.12.3",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
"uring-common",
|
||||
"utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "compute_api"
|
||||
version = "0.1.0"
|
||||
@@ -1295,7 +1400,7 @@ dependencies = [
|
||||
"aws-sdk-kms",
|
||||
"aws-sdk-s3",
|
||||
"aws-smithy-types",
|
||||
"axum",
|
||||
"axum 0.8.1",
|
||||
"axum-extra",
|
||||
"base64 0.13.1",
|
||||
"bytes",
|
||||
@@ -1319,6 +1424,7 @@ dependencies = [
|
||||
"opentelemetry",
|
||||
"opentelemetry_sdk",
|
||||
"p256 0.13.2",
|
||||
"pageserver_page_api",
|
||||
"postgres",
|
||||
"postgres_initdb",
|
||||
"regex",
|
||||
@@ -1337,6 +1443,7 @@ dependencies = [
|
||||
"tokio-postgres",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"tonic 0.13.1",
|
||||
"tower 0.5.2",
|
||||
"tower-http",
|
||||
"tower-otel",
|
||||
@@ -1594,9 +1701,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-utils"
|
||||
version = "0.8.19"
|
||||
version = "0.8.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345"
|
||||
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
|
||||
|
||||
[[package]]
|
||||
name = "crossterm"
|
||||
@@ -1936,7 +2043,7 @@ checksum = "0892a17df262a24294c382f0d5997571006e7a4348b4327557c4ff1cd4a8bccc"
|
||||
dependencies = [
|
||||
"darling",
|
||||
"either",
|
||||
"heck",
|
||||
"heck 0.5.0",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.100",
|
||||
@@ -2050,7 +2157,7 @@ name = "endpoint_storage"
|
||||
version = "0.0.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"axum",
|
||||
"axum 0.8.1",
|
||||
"axum-extra",
|
||||
"camino",
|
||||
"camino-tempfile",
|
||||
@@ -2330,7 +2437,7 @@ dependencies = [
|
||||
"futures-core",
|
||||
"futures-sink",
|
||||
"http-body-util",
|
||||
"hyper 1.4.1",
|
||||
"hyper 1.6.0",
|
||||
"hyper-util",
|
||||
"pin-project",
|
||||
"rand 0.8.5",
|
||||
@@ -2500,6 +2607,18 @@ dependencies = [
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "getrandom"
|
||||
version = "0.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "73fea8450eea4bac3940448fb7ae50d91f034f941199fcd9d909a5a07aa455f0"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"r-efi",
|
||||
"wasi 0.14.2+wasi-0.2.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "gettid"
|
||||
version = "0.1.3"
|
||||
@@ -2712,6 +2831,12 @@ dependencies = [
|
||||
"http 1.1.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.5.0"
|
||||
@@ -2883,9 +3008,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "httparse"
|
||||
version = "1.8.0"
|
||||
version = "1.10.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904"
|
||||
checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87"
|
||||
|
||||
[[package]]
|
||||
name = "httpdate"
|
||||
@@ -2935,9 +3060,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "hyper"
|
||||
version = "1.4.1"
|
||||
version = "1.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "50dfd22e0e76d0f662d429a5f80fcaf3855009297eab6a0a9f8543834744ba05"
|
||||
checksum = "cc2b571658e38e0c01b1fdca3bbbe93c00d3d71693ff2770043f8c29bc7d6f80"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"futures-channel",
|
||||
@@ -2977,7 +3102,7 @@ checksum = "a0bea761b46ae2b24eb4aef630d8d1c398157b6fc29e6350ecf090a0b70c952c"
|
||||
dependencies = [
|
||||
"futures-util",
|
||||
"http 1.1.0",
|
||||
"hyper 1.4.1",
|
||||
"hyper 1.6.0",
|
||||
"hyper-util",
|
||||
"rustls 0.22.4",
|
||||
"rustls-pki-types",
|
||||
@@ -2992,7 +3117,7 @@ version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3203a961e5c83b6f5498933e78b6b263e208c197b63e9c6c53cc82ffd3f63793"
|
||||
dependencies = [
|
||||
"hyper 1.4.1",
|
||||
"hyper 1.6.0",
|
||||
"hyper-util",
|
||||
"pin-project-lite",
|
||||
"tokio",
|
||||
@@ -3001,20 +3126,20 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "hyper-util"
|
||||
version = "0.1.7"
|
||||
version = "0.1.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cde7055719c54e36e95e8719f95883f22072a48ede39db7fc17a4e1d5281e9b9"
|
||||
checksum = "cf9f1e950e0d9d1d3c47184416723cf29c0d1f93bd8cccf37e4beb6b44f31710"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"futures-channel",
|
||||
"futures-util",
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
"hyper 1.4.1",
|
||||
"hyper 1.6.0",
|
||||
"libc",
|
||||
"pin-project-lite",
|
||||
"socket2",
|
||||
"tokio",
|
||||
"tower 0.4.13",
|
||||
"tower-service",
|
||||
"tracing",
|
||||
]
|
||||
@@ -3603,6 +3728,12 @@ dependencies = [
|
||||
"regex-automata 0.1.10",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "matchit"
|
||||
version = "0.7.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94"
|
||||
|
||||
[[package]]
|
||||
name = "matchit"
|
||||
version = "0.8.4"
|
||||
@@ -3648,7 +3779,7 @@ version = "0.0.22"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b9e6777fc80a575f9503d908c8b498782a6c3ee88a06cb416dc3941401e43b94"
|
||||
dependencies = [
|
||||
"heck",
|
||||
"heck 0.5.0",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.100",
|
||||
@@ -3710,7 +3841,7 @@ dependencies = [
|
||||
"procfs",
|
||||
"prometheus",
|
||||
"rand 0.8.5",
|
||||
"rand_distr",
|
||||
"rand_distr 0.4.3",
|
||||
"twox-hash",
|
||||
]
|
||||
|
||||
@@ -3799,11 +3930,25 @@ name = "neon-shmem"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"nix 0.30.1",
|
||||
"rand 0.9.1",
|
||||
"rand_distr 0.5.1",
|
||||
"spin",
|
||||
"tempfile",
|
||||
"thiserror 1.0.69",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "neonart"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"crossbeam-utils",
|
||||
"rand 0.9.1",
|
||||
"rand_distr 0.5.1",
|
||||
"spin",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "never-say-never"
|
||||
version = "6.6.666"
|
||||
@@ -4236,20 +4381,29 @@ name = "pagebench"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
"axum 0.8.1",
|
||||
"bytes",
|
||||
"camino",
|
||||
"clap",
|
||||
"futures",
|
||||
"hdrhistogram",
|
||||
"http 1.1.0",
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"metrics",
|
||||
"pageserver_api",
|
||||
"pageserver_client",
|
||||
"pageserver_client_grpc",
|
||||
"pageserver_page_api",
|
||||
"rand 0.8.5",
|
||||
"reqwest",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"tonic 0.13.1",
|
||||
"tracing",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
@@ -4305,6 +4459,7 @@ dependencies = [
|
||||
"hashlink",
|
||||
"hex",
|
||||
"hex-literal",
|
||||
"http 1.1.0",
|
||||
"http-utils",
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
@@ -4322,6 +4477,7 @@ dependencies = [
|
||||
"pageserver_client",
|
||||
"pageserver_compaction",
|
||||
"pageserver_page_api",
|
||||
"peekable",
|
||||
"pem",
|
||||
"pin-project-lite",
|
||||
"postgres-protocol",
|
||||
@@ -4334,6 +4490,7 @@ dependencies = [
|
||||
"pprof",
|
||||
"pq_proto",
|
||||
"procfs",
|
||||
"prost 0.13.5",
|
||||
"rand 0.8.5",
|
||||
"range-set-blaze",
|
||||
"regex",
|
||||
@@ -4367,6 +4524,7 @@ dependencies = [
|
||||
"toml_edit",
|
||||
"tonic 0.13.1",
|
||||
"tonic-reflection",
|
||||
"tower 0.5.2",
|
||||
"tracing",
|
||||
"tracing-utils",
|
||||
"twox-hash",
|
||||
@@ -4432,6 +4590,34 @@ dependencies = [
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pageserver_client_grpc"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"dashmap 5.5.0",
|
||||
"futures",
|
||||
"http 1.1.0",
|
||||
"hyper 1.6.0",
|
||||
"hyper-util",
|
||||
"metrics",
|
||||
"pageserver_api",
|
||||
"pageserver_page_api",
|
||||
"priority-queue",
|
||||
"rand 0.8.5",
|
||||
"thiserror 1.0.69",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"tonic 0.13.1",
|
||||
"tower 0.4.13",
|
||||
"tracing",
|
||||
"utils",
|
||||
"uuid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pageserver_compaction"
|
||||
version = "0.1.0"
|
||||
@@ -4463,7 +4649,6 @@ dependencies = [
|
||||
"pageserver_api",
|
||||
"postgres_ffi",
|
||||
"prost 0.13.5",
|
||||
"smallvec",
|
||||
"thiserror 1.0.69",
|
||||
"tonic 0.13.1",
|
||||
"tonic-build",
|
||||
@@ -4597,6 +4782,15 @@ dependencies = [
|
||||
"sha2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "peekable"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "225f9651e475709164f871dc2f5724956be59cb9edb055372ffeeab01ec2d20b"
|
||||
dependencies = [
|
||||
"smallvec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pem"
|
||||
version = "3.0.3"
|
||||
@@ -5008,6 +5202,17 @@ dependencies = [
|
||||
"elliptic-curve 0.13.8",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "priority-queue"
|
||||
version = "2.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ef08705fa1589a1a59aa924ad77d14722cb0cd97b67dd5004ed5f4a4873fce8d"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"equivalent",
|
||||
"indexmap 2.9.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.94"
|
||||
@@ -5086,7 +5291,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"heck",
|
||||
"heck 0.5.0",
|
||||
"itertools 0.12.1",
|
||||
"log",
|
||||
"multimap",
|
||||
@@ -5107,7 +5312,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"heck",
|
||||
"heck 0.5.0",
|
||||
"itertools 0.12.1",
|
||||
"log",
|
||||
"multimap",
|
||||
@@ -5208,7 +5413,7 @@ dependencies = [
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"hyper 0.14.30",
|
||||
"hyper 1.4.1",
|
||||
"hyper 1.6.0",
|
||||
"hyper-util",
|
||||
"indexmap 2.9.0",
|
||||
"ipnet",
|
||||
@@ -5232,7 +5437,7 @@ dependencies = [
|
||||
"postgres_backend",
|
||||
"pq_proto",
|
||||
"rand 0.8.5",
|
||||
"rand_distr",
|
||||
"rand_distr 0.4.3",
|
||||
"rcgen",
|
||||
"redis",
|
||||
"regex",
|
||||
@@ -5336,6 +5541,12 @@ dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "r-efi"
|
||||
version = "5.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5"
|
||||
|
||||
[[package]]
|
||||
name = "rand"
|
||||
version = "0.7.3"
|
||||
@@ -5360,6 +5571,16 @@ dependencies = [
|
||||
"rand_core 0.6.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand"
|
||||
version = "0.9.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97"
|
||||
dependencies = [
|
||||
"rand_chacha 0.9.0",
|
||||
"rand_core 0.9.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_chacha"
|
||||
version = "0.2.2"
|
||||
@@ -5380,6 +5601,16 @@ dependencies = [
|
||||
"rand_core 0.6.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_chacha"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
|
||||
dependencies = [
|
||||
"ppv-lite86",
|
||||
"rand_core 0.9.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_core"
|
||||
version = "0.5.1"
|
||||
@@ -5398,6 +5629,15 @@ dependencies = [
|
||||
"getrandom 0.2.11",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_core"
|
||||
version = "0.9.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38"
|
||||
dependencies = [
|
||||
"getrandom 0.3.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_distr"
|
||||
version = "0.4.3"
|
||||
@@ -5408,6 +5648,16 @@ dependencies = [
|
||||
"rand 0.8.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_distr"
|
||||
version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463"
|
||||
dependencies = [
|
||||
"num-traits",
|
||||
"rand 0.9.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_hc"
|
||||
version = "0.2.0"
|
||||
@@ -5604,7 +5854,7 @@ dependencies = [
|
||||
"http-body-util",
|
||||
"http-types",
|
||||
"humantime-serde",
|
||||
"hyper 1.4.1",
|
||||
"hyper 1.6.0",
|
||||
"itertools 0.10.5",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
@@ -5644,7 +5894,7 @@ dependencies = [
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
"http-body-util",
|
||||
"hyper 1.4.1",
|
||||
"hyper 1.6.0",
|
||||
"hyper-rustls 0.26.0",
|
||||
"hyper-util",
|
||||
"ipnet",
|
||||
@@ -5701,7 +5951,7 @@ dependencies = [
|
||||
"futures",
|
||||
"getrandom 0.2.11",
|
||||
"http 1.1.0",
|
||||
"hyper 1.4.1",
|
||||
"hyper 1.6.0",
|
||||
"parking_lot 0.11.2",
|
||||
"reqwest",
|
||||
"reqwest-middleware",
|
||||
@@ -5722,7 +5972,7 @@ dependencies = [
|
||||
"async-trait",
|
||||
"getrandom 0.2.11",
|
||||
"http 1.1.0",
|
||||
"matchit",
|
||||
"matchit 0.8.4",
|
||||
"opentelemetry",
|
||||
"reqwest",
|
||||
"reqwest-middleware",
|
||||
@@ -6642,12 +6892,12 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "socket2"
|
||||
version = "0.5.5"
|
||||
version = "0.5.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7b5fac59a5cb5dd637972e5fca70daf0523c9067fcdc4842f053dae04a18f8e9"
|
||||
checksum = "4f5fd57c80058a56cf5c777ab8a126398ece8e442983605d280a44ce79d0edef"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"windows-sys 0.48.0",
|
||||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6655,6 +6905,9 @@ name = "spin"
|
||||
version = "0.9.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
|
||||
dependencies = [
|
||||
"lock_api",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "spinning_top"
|
||||
@@ -6713,7 +6966,7 @@ dependencies = [
|
||||
"http-body-util",
|
||||
"http-utils",
|
||||
"humantime",
|
||||
"hyper 1.4.1",
|
||||
"hyper 1.6.0",
|
||||
"hyper-util",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
@@ -6894,7 +7147,7 @@ version = "0.26.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be"
|
||||
dependencies = [
|
||||
"heck",
|
||||
"heck 0.5.0",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"rustversion",
|
||||
@@ -7319,6 +7572,16 @@ dependencies = [
|
||||
"syn 2.0.100",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-pipe"
|
||||
version = "0.2.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f213a84bffbd61b8fa0ba8a044b4bbe35d471d0b518867181e82bd5c15542784"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-postgres"
|
||||
version = "0.7.10"
|
||||
@@ -7513,16 +7776,25 @@ version = "0.12.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52"
|
||||
dependencies = [
|
||||
"async-stream",
|
||||
"async-trait",
|
||||
"axum 0.7.9",
|
||||
"base64 0.22.1",
|
||||
"bytes",
|
||||
"h2 0.4.4",
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
"http-body-util",
|
||||
"hyper 1.6.0",
|
||||
"hyper-timeout",
|
||||
"hyper-util",
|
||||
"percent-encoding",
|
||||
"pin-project",
|
||||
"prost 0.13.5",
|
||||
"socket2",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tower 0.4.13",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
"tracing",
|
||||
@@ -7535,14 +7807,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7e581ba15a835f4d9ea06c55ab1bd4dce26fc53752c69a04aac00703bfb49ba9"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"axum",
|
||||
"axum 0.8.1",
|
||||
"base64 0.22.1",
|
||||
"bytes",
|
||||
"flate2",
|
||||
"h2 0.4.4",
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
"http-body-util",
|
||||
"hyper 1.4.1",
|
||||
"hyper 1.6.0",
|
||||
"hyper-timeout",
|
||||
"hyper-util",
|
||||
"percent-encoding",
|
||||
@@ -7594,11 +7867,16 @@ checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-util",
|
||||
"indexmap 1.9.3",
|
||||
"pin-project",
|
||||
"pin-project-lite",
|
||||
"rand 0.8.5",
|
||||
"slab",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -8082,7 +8360,7 @@ name = "vm_monitor"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"axum",
|
||||
"axum 0.8.1",
|
||||
"cgroups-rs",
|
||||
"clap",
|
||||
"futures",
|
||||
@@ -8193,6 +8471,15 @@ version = "0.11.0+wasi-snapshot-preview1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
|
||||
|
||||
[[package]]
|
||||
name = "wasi"
|
||||
version = "0.14.2+wasi-0.2.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3"
|
||||
dependencies = [
|
||||
"wit-bindgen-rt",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasite"
|
||||
version = "0.1.0"
|
||||
@@ -8550,6 +8837,15 @@ dependencies = [
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wit-bindgen-rt"
|
||||
version = "0.39.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1"
|
||||
dependencies = [
|
||||
"bitflags 2.8.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "workspace_hack"
|
||||
version = "0.1.0"
|
||||
@@ -8557,8 +8853,8 @@ dependencies = [
|
||||
"ahash",
|
||||
"anstream",
|
||||
"anyhow",
|
||||
"axum",
|
||||
"axum-core",
|
||||
"axum 0.8.1",
|
||||
"axum-core 0.5.0",
|
||||
"base64 0.13.1",
|
||||
"base64 0.21.7",
|
||||
"base64ct",
|
||||
@@ -8591,7 +8887,7 @@ dependencies = [
|
||||
"hex",
|
||||
"hmac",
|
||||
"hyper 0.14.30",
|
||||
"hyper 1.4.1",
|
||||
"hyper 1.6.0",
|
||||
"hyper-util",
|
||||
"indexmap 2.9.0",
|
||||
"itertools 0.12.1",
|
||||
|
||||
18
Cargo.toml
18
Cargo.toml
@@ -8,6 +8,7 @@ members = [
|
||||
"pageserver/compaction",
|
||||
"pageserver/ctl",
|
||||
"pageserver/client",
|
||||
"pageserver/client_grpc",
|
||||
"pageserver/pagebench",
|
||||
"pageserver/page_api",
|
||||
"proxy",
|
||||
@@ -32,6 +33,7 @@ members = [
|
||||
"libs/pq_proto",
|
||||
"libs/tenant_size_model",
|
||||
"libs/metrics",
|
||||
"libs/neonart",
|
||||
"libs/postgres_connection",
|
||||
"libs/remote_storage",
|
||||
"libs/tracing-utils",
|
||||
@@ -44,6 +46,7 @@ members = [
|
||||
"libs/proxy/postgres-types2",
|
||||
"libs/proxy/tokio-postgres2",
|
||||
"endpoint_storage",
|
||||
"pgxn/neon/communicator",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
@@ -87,6 +90,7 @@ clap = { version = "4.0", features = ["derive", "env"] }
|
||||
clashmap = { version = "1.0", features = ["raw-api"] }
|
||||
comfy-table = "7.1"
|
||||
const_format = "0.2"
|
||||
crossbeam-utils = "0.8.21"
|
||||
crc32c = "0.6"
|
||||
diatomic-waker = { version = "0.2.3" }
|
||||
either = "1.8"
|
||||
@@ -145,6 +149,7 @@ parquet = { version = "53", default-features = false, features = ["zstd"] }
|
||||
parquet_derive = "53"
|
||||
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
|
||||
pem = "3.0.3"
|
||||
peekable = "0.3.0"
|
||||
pin-project-lite = "0.2"
|
||||
pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] }
|
||||
procfs = "0.16"
|
||||
@@ -179,6 +184,7 @@ smallvec = "1.11"
|
||||
smol_str = { version = "0.2.0", features = ["serde"] }
|
||||
socket2 = "0.5"
|
||||
spki = "0.7.3"
|
||||
spin = "0.9.8"
|
||||
strum = "0.26"
|
||||
strum_macros = "0.26"
|
||||
"subtle" = "2.5.0"
|
||||
@@ -190,16 +196,15 @@ thiserror = "1.0"
|
||||
tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] }
|
||||
tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] }
|
||||
tokio = { version = "1.43.1", features = ["macros"] }
|
||||
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
|
||||
tokio-io-timeout = "1.2.0"
|
||||
tokio-postgres-rustls = "0.12.0"
|
||||
tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]}
|
||||
tokio-stream = "0.1"
|
||||
tokio-tar = "0.3"
|
||||
tokio-util = { version = "0.7.10", features = ["io", "rt"] }
|
||||
tokio-util = { version = "0.7.10", features = ["io", "io-util", "rt"] }
|
||||
toml = "0.8"
|
||||
toml_edit = "0.22"
|
||||
tonic = { version = "0.13.1", default-features = false, features = ["channel", "codegen", "prost", "router", "server", "tls-ring", "tls-native-roots"] }
|
||||
tonic = { version = "0.13.1", default-features = false, features = ["channel", "codegen", "gzip", "prost", "router", "server", "tls-ring", "tls-native-roots"] }
|
||||
tonic-reflection = { version = "0.13.1", features = ["server"] }
|
||||
tower = { version = "0.5.2", default-features = false }
|
||||
tower-http = { version = "0.6.2", features = ["auth", "request-id", "trace"] }
|
||||
@@ -232,6 +237,9 @@ x509-cert = { version = "0.2.5" }
|
||||
env_logger = "0.11"
|
||||
log = "0.4"
|
||||
|
||||
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
|
||||
uring-common = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
|
||||
|
||||
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
|
||||
@@ -251,9 +259,12 @@ desim = { version = "0.1", path = "./libs/desim" }
|
||||
endpoint_storage = { version = "0.0.1", path = "./endpoint_storage/" }
|
||||
http-utils = { version = "0.1", path = "./libs/http-utils/" }
|
||||
metrics = { version = "0.1", path = "./libs/metrics/" }
|
||||
neonart = { version = "0.1", path = "./libs/neonart/" }
|
||||
neon-shmem = { version = "0.1", path = "./libs/neon-shmem/" }
|
||||
pageserver = { path = "./pageserver" }
|
||||
pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
|
||||
pageserver_client = { path = "./pageserver/client" }
|
||||
pageserver_client_grpc = { path = "./pageserver/client_grpc" }
|
||||
pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
|
||||
pageserver_page_api = { path = "./pageserver/page_api" }
|
||||
postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
|
||||
@@ -278,6 +289,7 @@ walproposer = { version = "0.1", path = "./libs/walproposer/" }
|
||||
workspace_hack = { version = "0.1", path = "./workspace_hack/" }
|
||||
|
||||
## Build dependencies
|
||||
cbindgen = "0.28.0"
|
||||
criterion = "0.5.1"
|
||||
rcgen = "0.13"
|
||||
rstest = "0.18"
|
||||
|
||||
7
Makefile
7
Makefile
@@ -18,10 +18,12 @@ ifeq ($(BUILD_TYPE),release)
|
||||
PG_LDFLAGS = $(LDFLAGS)
|
||||
# Unfortunately, `--profile=...` is a nightly feature
|
||||
CARGO_BUILD_FLAGS += --release
|
||||
NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/release
|
||||
else ifeq ($(BUILD_TYPE),debug)
|
||||
PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
|
||||
PG_CFLAGS += -O0 -g3 $(CFLAGS)
|
||||
PG_LDFLAGS = $(LDFLAGS)
|
||||
NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/debug
|
||||
else
|
||||
$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
|
||||
endif
|
||||
@@ -180,11 +182,16 @@ postgres-check-%: postgres-%
|
||||
|
||||
.PHONY: neon-pg-ext-%
|
||||
neon-pg-ext-%: postgres-%
|
||||
+@echo "Compiling communicator $*"
|
||||
$(CARGO_CMD_PREFIX) cargo build -p communicator $(CARGO_BUILD_FLAGS)
|
||||
|
||||
+@echo "Compiling neon $*"
|
||||
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-$*
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
|
||||
LIBCOMMUNICATOR_PATH=$(NEON_CARGO_ARTIFACT_TARGET_DIR) \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-$* \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install
|
||||
|
||||
+@echo "Compiling neon_walredo $*"
|
||||
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$*
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
|
||||
|
||||
@@ -310,13 +310,13 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
|
||||
. "$HOME/.cargo/env" && \
|
||||
cargo --version && rustup --version && \
|
||||
rustup component add llvm-tools rustfmt clippy && \
|
||||
cargo install rustfilt --version ${RUSTFILT_VERSION} && \
|
||||
cargo install cargo-hakari --version ${CARGO_HAKARI_VERSION} && \
|
||||
cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \
|
||||
cargo install cargo-hack --version ${CARGO_HACK_VERSION} && \
|
||||
cargo install cargo-nextest --version ${CARGO_NEXTEST_VERSION} && \
|
||||
cargo install cargo-chef --locked --version ${CARGO_CHEF_VERSION} && \
|
||||
cargo install diesel_cli --version ${CARGO_DIESEL_CLI_VERSION} \
|
||||
cargo install rustfilt --version ${RUSTFILT_VERSION} --locked && \
|
||||
cargo install cargo-hakari --version ${CARGO_HAKARI_VERSION} --locked && \
|
||||
cargo install cargo-deny --version ${CARGO_DENY_VERSION} --locked && \
|
||||
cargo install cargo-hack --version ${CARGO_HACK_VERSION} --locked && \
|
||||
cargo install cargo-nextest --version ${CARGO_NEXTEST_VERSION} --locked && \
|
||||
cargo install cargo-chef --version ${CARGO_CHEF_VERSION} --locked && \
|
||||
cargo install diesel_cli --version ${CARGO_DIESEL_CLI_VERSION} --locked \
|
||||
--features postgres-bundled --no-default-features && \
|
||||
rm -rf /home/nonroot/.cargo/registry && \
|
||||
rm -rf /home/nonroot/.cargo/git
|
||||
|
||||
@@ -297,6 +297,7 @@ RUN ./autogen.sh && \
|
||||
./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
make staged-install && \
|
||||
cd extensions/postgis && \
|
||||
make clean && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
@@ -602,7 +603,7 @@ RUN case "${PG_VERSION:?}" in \
|
||||
;; \
|
||||
esac && \
|
||||
wget https://github.com/knizhnik/online_advisor/archive/refs/tags/1.0.tar.gz -O online_advisor.tar.gz && \
|
||||
echo "059b7d9e5a90013a58bdd22e9505b88406ce05790675eb2d8434e5b215652d54 online_advisor.tar.gz" | sha256sum --check && \
|
||||
echo "37dcadf8f7cc8d6cc1f8831276ee245b44f1b0274f09e511e47a67738ba9ed0f online_advisor.tar.gz" | sha256sum --check && \
|
||||
mkdir online_advisor-src && cd online_advisor-src && tar xzf ../online_advisor.tar.gz --strip-components=1 -C .
|
||||
|
||||
FROM pg-build AS online_advisor-build
|
||||
@@ -1180,14 +1181,14 @@ RUN cd exts/rag && \
|
||||
RUN cd exts/rag_bge_small_en_v15 && \
|
||||
sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \
|
||||
REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/bge_small_en_v15.onnx \
|
||||
REMOTE_ONNX_URL=http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local/pgrag-data/bge_small_en_v15.onnx \
|
||||
cargo pgrx install --release --features remote_onnx && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control
|
||||
|
||||
RUN cd exts/rag_jina_reranker_v1_tiny_en && \
|
||||
sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \
|
||||
REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/jina_reranker_v1_tiny_en.onnx \
|
||||
REMOTE_ONNX_URL=http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local/pgrag-data/jina_reranker_v1_tiny_en.onnx \
|
||||
cargo pgrx install --release --features remote_onnx && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_jina_reranker_v1_tiny_en.control
|
||||
|
||||
@@ -1842,10 +1843,25 @@ RUN make PG_VERSION="${PG_VERSION:?}" -C compute
|
||||
|
||||
FROM pg-build AS extension-tests
|
||||
ARG PG_VERSION
|
||||
# This is required for the PostGIS test
|
||||
RUN apt-get update && case $DEBIAN_VERSION in \
|
||||
bullseye) \
|
||||
apt-get install -y libproj19 libgdal28 time; \
|
||||
;; \
|
||||
bookworm) \
|
||||
apt-get install -y libgdal32 libproj25 time; \
|
||||
;; \
|
||||
*) \
|
||||
echo "Unknown Debian version ${DEBIAN_VERSION}" && exit 1 \
|
||||
;; \
|
||||
esac
|
||||
|
||||
COPY docker-compose/ext-src/ /ext-src/
|
||||
|
||||
COPY --from=pg-build /postgres /postgres
|
||||
#COPY --from=postgis-src /ext-src/ /ext-src/
|
||||
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=postgis-build /ext-src/postgis-src /ext-src/postgis-src
|
||||
COPY --from=postgis-build /sfcgal/* /usr
|
||||
COPY --from=plv8-src /ext-src/ /ext-src/
|
||||
COPY --from=h3-pg-src /ext-src/h3-pg-src /ext-src/h3-pg-src
|
||||
COPY --from=postgresql-unit-src /ext-src/ /ext-src/
|
||||
@@ -1886,6 +1902,7 @@ COPY compute/patches/pg_repack.patch /ext-src
|
||||
RUN cd /ext-src/pg_repack-src && patch -p1 </ext-src/pg_repack.patch && rm -f /ext-src/pg_repack.patch
|
||||
|
||||
COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
|
||||
RUN echo /usr/local/pgsql/lib > /etc/ld.so.conf.d/00-neon.conf && /sbin/ldconfig
|
||||
RUN apt-get update && apt-get install -y libtap-parser-sourcehandler-pgtap-perl jq \
|
||||
&& apt clean && rm -rf /ext-src/*.tar.gz /ext-src/*.patch /var/lib/apt/lists/*
|
||||
ENV PATH=/usr/local/pgsql/bin:$PATH
|
||||
|
||||
121
compute/manifest.yaml
Normal file
121
compute/manifest.yaml
Normal file
@@ -0,0 +1,121 @@
|
||||
pg_settings:
|
||||
# Common settings for primaries and replicas of all versions.
|
||||
common:
|
||||
# Check for client disconnection every 1 minute. By default, Postgres will detect the
|
||||
# loss of the connection only at the next interaction with the socket, when it waits
|
||||
# for, receives or sends data, so it will likely waste resources till the end of the
|
||||
# query execution. There should be no drawbacks in setting this for everyone, so enable
|
||||
# it by default. If anyone will complain, we can allow editing it.
|
||||
# https://www.postgresql.org/docs/16/runtime-config-connection.html#GUC-CLIENT-CONNECTION-CHECK-INTERVAL
|
||||
client_connection_check_interval: "60000" # 1 minute
|
||||
# ---- IO ----
|
||||
effective_io_concurrency: "20"
|
||||
maintenance_io_concurrency: "100"
|
||||
fsync: "off"
|
||||
hot_standby: "off"
|
||||
# We allow users to change this if needed, but by default we
|
||||
# just don't want to see long-lasting idle transactions, as they
|
||||
# prevent activity monitor from suspending projects.
|
||||
idle_in_transaction_session_timeout: "300000" # 5 minutes
|
||||
listen_addresses: "*"
|
||||
# --- LOGGING ---- helps investigations
|
||||
log_connections: "on"
|
||||
log_disconnections: "on"
|
||||
# 1GB, unit is KB
|
||||
log_temp_files: "1048576"
|
||||
# Disable dumping customer data to logs, both to increase data privacy
|
||||
# and to reduce the amount the logs.
|
||||
log_error_verbosity: "terse"
|
||||
log_min_error_statement: "panic"
|
||||
max_connections: "100"
|
||||
# --- WAL ---
|
||||
# - flush lag is the max amount of WAL that has been generated but not yet stored
|
||||
# to disk in the page server. A smaller value means less delay after a pageserver
|
||||
# restart, but if you set it too small you might again need to slow down writes if the
|
||||
# pageserver cannot flush incoming WAL to disk fast enough. This must be larger
|
||||
# than the pageserver's checkpoint interval, currently 1 GB! Otherwise you get a
|
||||
# a deadlock where the compute node refuses to generate more WAL before the
|
||||
# old WAL has been uploaded to S3, but the pageserver is waiting for more WAL
|
||||
# to be generated before it is uploaded to S3.
|
||||
max_replication_flush_lag: "10GB"
|
||||
max_replication_slots: "10"
|
||||
# Backpressure configuration:
|
||||
# - write lag is the max amount of WAL that has been generated by Postgres but not yet
|
||||
# processed by the page server. Making this smaller reduces the worst case latency
|
||||
# of a GetPage request, if you request a page that was recently modified. On the other
|
||||
# hand, if this is too small, the compute node might need to wait on a write if there is a
|
||||
# hiccup in the network or page server so that the page server has temporarily fallen
|
||||
# behind.
|
||||
#
|
||||
# Previously it was set to 500 MB, but it caused compute being unresponsive under load
|
||||
# https://github.com/neondatabase/neon/issues/2028
|
||||
max_replication_write_lag: "500MB"
|
||||
max_wal_senders: "10"
|
||||
# A Postgres checkpoint is cheap in storage, as doesn't involve any significant amount
|
||||
# of real I/O. Only the SLRU buffers and some other small files are flushed to disk.
|
||||
# However, as long as we have full_page_writes=on, page updates after a checkpoint
|
||||
# include full-page images which bloats the WAL. So may want to bump max_wal_size to
|
||||
# reduce the WAL bloating, but at the same it will increase pg_wal directory size on
|
||||
# compute and can lead to out of disk error on k8s nodes.
|
||||
max_wal_size: "1024"
|
||||
wal_keep_size: "0"
|
||||
wal_level: "replica"
|
||||
# Reduce amount of WAL generated by default.
|
||||
wal_log_hints: "off"
|
||||
# - without wal_sender_timeout set we don't get feedback messages,
|
||||
# required for backpressure.
|
||||
wal_sender_timeout: "10000"
|
||||
# We have some experimental extensions, which we don't want users to install unconsciously.
|
||||
# To install them, users would need to set the `neon.allow_unstable_extensions` setting.
|
||||
# There are two of them currently:
|
||||
# - `pgrag` - https://github.com/neondatabase-labs/pgrag - extension is actually called just `rag`,
|
||||
# and two dependencies:
|
||||
# - `rag_bge_small_en_v15`
|
||||
# - `rag_jina_reranker_v1_tiny_en`
|
||||
# - `pg_mooncake` - https://github.com/Mooncake-Labs/pg_mooncake/
|
||||
neon.unstable_extensions: "rag,rag_bge_small_en_v15,rag_jina_reranker_v1_tiny_en,pg_mooncake,anon"
|
||||
neon.protocol_version: "3"
|
||||
password_encryption: "scram-sha-256"
|
||||
# This is important to prevent Postgres from trying to perform
|
||||
# a local WAL redo after backend crash. It should exit and let
|
||||
# the systemd or k8s to do a fresh startup with compute_ctl.
|
||||
restart_after_crash: "off"
|
||||
# By default 3. We have the following persistent connections in the VM:
|
||||
# * compute_activity_monitor (from compute_ctl)
|
||||
# * postgres-exporter (metrics collector; it has 2 connections)
|
||||
# * sql_exporter (metrics collector; we have 2 instances [1 for us & users; 1 for autoscaling])
|
||||
# * vm-monitor (to query & change file cache size)
|
||||
# i.e. total of 6. Let's reserve 7, so there's still at least one left over.
|
||||
superuser_reserved_connections: "7"
|
||||
synchronous_standby_names: "walproposer"
|
||||
|
||||
replica:
|
||||
hot_standby: "on"
|
||||
|
||||
per_version:
|
||||
17:
|
||||
common:
|
||||
# PostgreSQL 17 has a new IO system called "read stream", which can combine IOs up to some
|
||||
# size. It still has some issues with readahead, though, so we default to disabled/
|
||||
# "no combining of IOs" to make sure we get the maximum prefetch depth.
|
||||
# See also: https://github.com/neondatabase/neon/pull/9860
|
||||
io_combine_limit: "1"
|
||||
replica:
|
||||
# prefetching of blocks referenced in WAL doesn't make sense for us
|
||||
# Neon hot standby ignores pages that are not in the shared_buffers
|
||||
recovery_prefetch: "off"
|
||||
16:
|
||||
common:
|
||||
replica:
|
||||
# prefetching of blocks referenced in WAL doesn't make sense for us
|
||||
# Neon hot standby ignores pages that are not in the shared_buffers
|
||||
recovery_prefetch: "off"
|
||||
15:
|
||||
common:
|
||||
replica:
|
||||
# prefetching of blocks referenced in WAL doesn't make sense for us
|
||||
# Neon hot standby ignores pages that are not in the shared_buffers
|
||||
recovery_prefetch: "off"
|
||||
14:
|
||||
common:
|
||||
replica:
|
||||
@@ -38,6 +38,7 @@ once_cell.workspace = true
|
||||
opentelemetry.workspace = true
|
||||
opentelemetry_sdk.workspace = true
|
||||
p256 = { version = "0.13", features = ["pem"] }
|
||||
pageserver_page_api.workspace = true
|
||||
postgres.workspace = true
|
||||
regex.workspace = true
|
||||
reqwest = { workspace = true, features = ["json"] }
|
||||
@@ -53,6 +54,7 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
|
||||
tokio-postgres.workspace = true
|
||||
tokio-util.workspace = true
|
||||
tokio-stream.workspace = true
|
||||
tonic.workspace = true
|
||||
tower-otel.workspace = true
|
||||
tracing.workspace = true
|
||||
tracing-opentelemetry.workspace = true
|
||||
|
||||
@@ -40,7 +40,7 @@ use std::sync::mpsc;
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use anyhow::{Context, Result, bail};
|
||||
use clap::Parser;
|
||||
use compute_api::responses::ComputeConfig;
|
||||
use compute_tools::compute::{
|
||||
@@ -57,31 +57,15 @@ use tracing::{error, info};
|
||||
use url::Url;
|
||||
use utils::failpoint_support;
|
||||
|
||||
// Compatibility hack: if the control plane specified any remote-ext-config
|
||||
// use the default value for extension storage proxy gateway.
|
||||
// Remove this once the control plane is updated to pass the gateway URL
|
||||
fn parse_remote_ext_base_url(arg: &str) -> Result<String> {
|
||||
const FALLBACK_PG_EXT_GATEWAY_BASE_URL: &str =
|
||||
"http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local";
|
||||
|
||||
Ok(if arg.starts_with("http") {
|
||||
arg
|
||||
} else {
|
||||
FALLBACK_PG_EXT_GATEWAY_BASE_URL
|
||||
}
|
||||
.to_owned())
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
#[derive(Debug, Parser)]
|
||||
#[command(rename_all = "kebab-case")]
|
||||
struct Cli {
|
||||
#[arg(short = 'b', long, default_value = "postgres", env = "POSTGRES_PATH")]
|
||||
pub pgbin: String,
|
||||
|
||||
/// The base URL for the remote extension storage proxy gateway.
|
||||
/// Should be in the form of `http(s)://<gateway-hostname>[:<port>]`.
|
||||
#[arg(short = 'r', long, value_parser = parse_remote_ext_base_url, alias = "remote-ext-config")]
|
||||
pub remote_ext_base_url: Option<String>,
|
||||
#[arg(short = 'r', long, value_parser = Self::parse_remote_ext_base_url)]
|
||||
pub remote_ext_base_url: Option<Url>,
|
||||
|
||||
/// The port to bind the external listening HTTP server to. Clients running
|
||||
/// outside the compute will talk to the compute through this port. Keep
|
||||
@@ -142,6 +126,25 @@ struct Cli {
|
||||
pub installed_extensions_collection_interval: u64,
|
||||
}
|
||||
|
||||
impl Cli {
|
||||
/// Parse a URL from an argument. By default, this isn't necessary, but we
|
||||
/// want to do some sanity checking.
|
||||
fn parse_remote_ext_base_url(value: &str) -> Result<Url> {
|
||||
// Remove extra trailing slashes, and add one. We use Url::join() later
|
||||
// when downloading remote extensions. If the base URL is something like
|
||||
// http://example.com/pg-ext-s3-gateway, and join() is called with
|
||||
// something like "xyz", the resulting URL is http://example.com/xyz.
|
||||
let value = value.trim_end_matches('/').to_owned() + "/";
|
||||
let url = Url::parse(&value)?;
|
||||
|
||||
if url.query_pairs().count() != 0 {
|
||||
bail!("parameters detected in remote extensions base URL")
|
||||
}
|
||||
|
||||
Ok(url)
|
||||
}
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let cli = Cli::parse();
|
||||
|
||||
@@ -268,7 +271,8 @@ fn handle_exit_signal(sig: i32) {
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use clap::CommandFactory;
|
||||
use clap::{CommandFactory, Parser};
|
||||
use url::Url;
|
||||
|
||||
use super::Cli;
|
||||
|
||||
@@ -278,16 +282,41 @@ mod test {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_pg_ext_gateway_base_url() {
|
||||
let arg = "http://pg-ext-s3-gateway2";
|
||||
let result = super::parse_remote_ext_base_url(arg).unwrap();
|
||||
assert_eq!(result, arg);
|
||||
|
||||
let arg = "pg-ext-s3-gateway";
|
||||
let result = super::parse_remote_ext_base_url(arg).unwrap();
|
||||
fn verify_remote_ext_base_url() {
|
||||
let cli = Cli::parse_from([
|
||||
"compute_ctl",
|
||||
"--pgdata=test",
|
||||
"--connstr=test",
|
||||
"--compute-id=test",
|
||||
"--remote-ext-base-url",
|
||||
"https://example.com/subpath",
|
||||
]);
|
||||
assert_eq!(
|
||||
result,
|
||||
"http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local"
|
||||
cli.remote_ext_base_url.unwrap(),
|
||||
Url::parse("https://example.com/subpath/").unwrap()
|
||||
);
|
||||
|
||||
let cli = Cli::parse_from([
|
||||
"compute_ctl",
|
||||
"--pgdata=test",
|
||||
"--connstr=test",
|
||||
"--compute-id=test",
|
||||
"--remote-ext-base-url",
|
||||
"https://example.com//",
|
||||
]);
|
||||
assert_eq!(
|
||||
cli.remote_ext_base_url.unwrap(),
|
||||
Url::parse("https://example.com").unwrap()
|
||||
);
|
||||
|
||||
Cli::try_parse_from([
|
||||
"compute_ctl",
|
||||
"--pgdata=test",
|
||||
"--connstr=test",
|
||||
"--compute-id=test",
|
||||
"--remote-ext-base-url",
|
||||
"https://example.com?hello=world",
|
||||
])
|
||||
.expect_err("URL parameters are not allowed");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
use anyhow::{Context, Result};
|
||||
use anyhow::{Context, Result, anyhow};
|
||||
use chrono::{DateTime, Utc};
|
||||
use compute_api::privilege::Privilege;
|
||||
use compute_api::responses::{
|
||||
ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus, LfcOffloadState,
|
||||
LfcPrewarmState,
|
||||
LfcPrewarmState, TlsConfig,
|
||||
};
|
||||
use compute_api::spec::{
|
||||
ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent,
|
||||
@@ -15,6 +15,7 @@ use itertools::Itertools;
|
||||
use nix::sys::signal::{Signal, kill};
|
||||
use nix::unistd::Pid;
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_page_api as page_api;
|
||||
use postgres;
|
||||
use postgres::NoTls;
|
||||
use postgres::error::SqlState;
|
||||
@@ -29,8 +30,11 @@ use std::sync::atomic::{AtomicU32, Ordering};
|
||||
use std::sync::{Arc, Condvar, Mutex, RwLock};
|
||||
use std::time::{Duration, Instant};
|
||||
use std::{env, fs};
|
||||
use tokio::io::AsyncReadExt;
|
||||
use tokio::spawn;
|
||||
use tokio_util::io::StreamReader;
|
||||
use tracing::{Instrument, debug, error, info, instrument, warn};
|
||||
use url::Url;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::measured_stream::MeasuredReader;
|
||||
@@ -96,7 +100,7 @@ pub struct ComputeNodeParams {
|
||||
pub internal_http_port: u16,
|
||||
|
||||
/// the address of extension storage proxy gateway
|
||||
pub remote_ext_base_url: Option<String>,
|
||||
pub remote_ext_base_url: Option<Url>,
|
||||
|
||||
/// Interval for installed extensions collection
|
||||
pub installed_extensions_collection_interval: u64,
|
||||
@@ -368,7 +372,7 @@ impl ComputeNode {
|
||||
|
||||
let mut new_state = ComputeState::new();
|
||||
if let Some(spec) = config.spec {
|
||||
let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
|
||||
let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow!(msg))?;
|
||||
new_state.pspec = Some(pspec);
|
||||
}
|
||||
|
||||
@@ -395,7 +399,7 @@ impl ComputeNode {
|
||||
// because QEMU will already have its memory allocated from the host, and
|
||||
// the necessary binaries will already be cached.
|
||||
if cli_spec.is_none() {
|
||||
this.prewarm_postgres()?;
|
||||
this.prewarm_postgres_vm_memory()?;
|
||||
}
|
||||
|
||||
// Set the up metric with Empty status before starting the HTTP server.
|
||||
@@ -602,6 +606,8 @@ impl ComputeNode {
|
||||
});
|
||||
}
|
||||
|
||||
let tls_config = self.tls_config(&pspec.spec);
|
||||
|
||||
// If there are any remote extensions in shared_preload_libraries, start downloading them
|
||||
if pspec.spec.remote_extensions.is_some() {
|
||||
let (this, spec) = (self.clone(), pspec.spec.clone());
|
||||
@@ -658,7 +664,7 @@ impl ComputeNode {
|
||||
info!("tuning pgbouncer");
|
||||
|
||||
let pgbouncer_settings = pgbouncer_settings.clone();
|
||||
let tls_config = self.compute_ctl_config.tls.clone();
|
||||
let tls_config = tls_config.clone();
|
||||
|
||||
// Spawn a background task to do the tuning,
|
||||
// so that we don't block the main thread that starts Postgres.
|
||||
@@ -677,7 +683,10 @@ impl ComputeNode {
|
||||
|
||||
// Spawn a background task to do the configuration,
|
||||
// so that we don't block the main thread that starts Postgres.
|
||||
let local_proxy = local_proxy.clone();
|
||||
|
||||
let mut local_proxy = local_proxy.clone();
|
||||
local_proxy.tls = tls_config.clone();
|
||||
|
||||
let _handle = tokio::spawn(async move {
|
||||
if let Err(err) = local_proxy::configure(&local_proxy) {
|
||||
error!("error while configuring local_proxy: {err:?}");
|
||||
@@ -778,7 +787,7 @@ impl ComputeNode {
|
||||
// Spawn the extension stats background task
|
||||
self.spawn_extension_stats_task();
|
||||
|
||||
if pspec.spec.prewarm_lfc_on_startup {
|
||||
if pspec.spec.autoprewarm {
|
||||
self.prewarm_lfc();
|
||||
}
|
||||
Ok(())
|
||||
@@ -935,6 +944,74 @@ impl ComputeNode {
|
||||
#[instrument(skip_all, fields(%lsn))]
|
||||
fn try_get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
|
||||
let spec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
|
||||
|
||||
match Url::parse(shard0_connstr)?.scheme() {
|
||||
"postgres" | "postgresql" => self.try_get_basebackup_libpq(spec, lsn),
|
||||
"grpc" => self.try_get_basebackup_grpc(spec, lsn),
|
||||
scheme => return Err(anyhow!("unknown URL scheme {scheme}")),
|
||||
}
|
||||
}
|
||||
|
||||
fn try_get_basebackup_grpc(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<()> {
|
||||
let start_time = Instant::now();
|
||||
|
||||
let shard0_connstr = spec
|
||||
.pageserver_connstr
|
||||
.split(',')
|
||||
.next()
|
||||
.unwrap()
|
||||
.to_string();
|
||||
|
||||
let chunks = tokio::runtime::Handle::current().block_on(async move {
|
||||
let mut client = page_api::proto::PageServiceClient::connect(shard0_connstr).await?;
|
||||
|
||||
let req = page_api::proto::GetBaseBackupRequest {
|
||||
lsn: lsn.0,
|
||||
replica: false, // TODO: handle replicas, with LSN 0
|
||||
};
|
||||
let mut req = tonic::Request::new(req);
|
||||
let metadata = req.metadata_mut();
|
||||
metadata.insert("neon-tenant-id", spec.tenant_id.to_string().parse()?);
|
||||
metadata.insert("neon-timeline-id", spec.timeline_id.to_string().parse()?);
|
||||
metadata.insert("neon-shard-id", "0000".to_string().parse()?); // TODO: shard count
|
||||
if let Some(auth) = spec.storage_auth_token.as_ref() {
|
||||
metadata.insert("authorization", format!("Bearer {auth}").parse()?);
|
||||
}
|
||||
|
||||
let chunks = client.get_base_backup(req).await?.into_inner();
|
||||
anyhow::Ok(chunks)
|
||||
})?;
|
||||
let pageserver_connect_micros = start_time.elapsed().as_micros() as u64;
|
||||
|
||||
// Convert the chunks stream into an AsyncRead
|
||||
let stream_reader = StreamReader::new(
|
||||
chunks.map(|chunk| chunk.map(|c| c.chunk).map_err(std::io::Error::other)),
|
||||
);
|
||||
|
||||
// Wrap the AsyncRead into a blocking reader for compatibility with tar::Archive
|
||||
let reader = tokio_util::io::SyncIoBridge::new(stream_reader);
|
||||
let mut measured_reader = MeasuredReader::new(reader);
|
||||
let mut bufreader = std::io::BufReader::new(&mut measured_reader);
|
||||
|
||||
// Read the archive directly from the `CopyOutReader`
|
||||
//
|
||||
// Set `ignore_zeros` so that unpack() reads all the Copy data and
|
||||
// doesn't stop at the end-of-archive marker. Otherwise, if the server
|
||||
// sends an Error after finishing the tarball, we will not notice it.
|
||||
let mut ar = tar::Archive::new(&mut bufreader);
|
||||
ar.set_ignore_zeros(true);
|
||||
ar.unpack(&self.params.pgdata)?;
|
||||
|
||||
// Report metrics
|
||||
let mut state = self.state.lock().unwrap();
|
||||
state.metrics.pageserver_connect_micros = pageserver_connect_micros;
|
||||
state.metrics.basebackup_bytes = measured_reader.get_byte_count() as u64;
|
||||
state.metrics.basebackup_ms = start_time.elapsed().as_millis() as u64;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn try_get_basebackup_libpq(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<()> {
|
||||
let start_time = Instant::now();
|
||||
|
||||
let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
|
||||
@@ -950,12 +1027,10 @@ impl ComputeNode {
|
||||
}
|
||||
|
||||
config.application_name("compute_ctl");
|
||||
if let Some(spec) = &compute_state.pspec {
|
||||
config.options(&format!(
|
||||
"-c neon.compute_mode={}",
|
||||
spec.spec.mode.to_type_str()
|
||||
));
|
||||
}
|
||||
config.options(&format!(
|
||||
"-c neon.compute_mode={}",
|
||||
spec.spec.mode.to_type_str()
|
||||
));
|
||||
|
||||
// Connect to pageserver
|
||||
let mut client = config.connect(NoTls)?;
|
||||
@@ -1029,10 +1104,7 @@ impl ComputeNode {
|
||||
return result;
|
||||
}
|
||||
Err(ref e) if attempts < max_attempts => {
|
||||
warn!(
|
||||
"Failed to get basebackup: {} (attempt {}/{})",
|
||||
e, attempts, max_attempts
|
||||
);
|
||||
warn!("Failed to get basebackup: {e:?} (attempt {attempts}/{max_attempts})");
|
||||
std::thread::sleep(std::time::Duration::from_millis(retry_period_ms as u64));
|
||||
retry_period_ms *= 1.5;
|
||||
}
|
||||
@@ -1204,13 +1276,15 @@ impl ComputeNode {
|
||||
let spec = &pspec.spec;
|
||||
let pgdata_path = Path::new(&self.params.pgdata);
|
||||
|
||||
let tls_config = self.tls_config(&pspec.spec);
|
||||
|
||||
// Remove/create an empty pgdata directory and put configuration there.
|
||||
self.create_pgdata()?;
|
||||
config::write_postgres_conf(
|
||||
pgdata_path,
|
||||
&pspec.spec,
|
||||
self.params.internal_http_port,
|
||||
&self.compute_ctl_config.tls,
|
||||
tls_config,
|
||||
)?;
|
||||
|
||||
// Syncing safekeepers is only safe with primary nodes: if a primary
|
||||
@@ -1306,8 +1380,8 @@ impl ComputeNode {
|
||||
}
|
||||
|
||||
/// Start and stop a postgres process to warm up the VM for startup.
|
||||
pub fn prewarm_postgres(&self) -> Result<()> {
|
||||
info!("prewarming");
|
||||
pub fn prewarm_postgres_vm_memory(&self) -> Result<()> {
|
||||
info!("prewarming VM memory");
|
||||
|
||||
// Create pgdata
|
||||
let pgdata = &format!("{}.warmup", self.params.pgdata);
|
||||
@@ -1349,7 +1423,7 @@ impl ComputeNode {
|
||||
kill(pm_pid, Signal::SIGQUIT)?;
|
||||
info!("sent SIGQUIT signal");
|
||||
pg.wait()?;
|
||||
info!("done prewarming");
|
||||
info!("done prewarming vm memory");
|
||||
|
||||
// clean up
|
||||
let _ok = fs::remove_dir_all(pgdata);
|
||||
@@ -1535,14 +1609,22 @@ impl ComputeNode {
|
||||
.clone(),
|
||||
);
|
||||
|
||||
let mut tls_config = None::<TlsConfig>;
|
||||
if spec.features.contains(&ComputeFeature::TlsExperimental) {
|
||||
tls_config = self.compute_ctl_config.tls.clone();
|
||||
}
|
||||
|
||||
let max_concurrent_connections = self.max_service_connections(compute_state, &spec);
|
||||
|
||||
// Merge-apply spec & changes to PostgreSQL state.
|
||||
self.apply_spec_sql(spec.clone(), conf.clone(), max_concurrent_connections)?;
|
||||
|
||||
if let Some(local_proxy) = &spec.clone().local_proxy_config {
|
||||
let mut local_proxy = local_proxy.clone();
|
||||
local_proxy.tls = tls_config.clone();
|
||||
|
||||
info!("configuring local_proxy");
|
||||
local_proxy::configure(local_proxy).context("apply_config local_proxy")?;
|
||||
local_proxy::configure(&local_proxy).context("apply_config local_proxy")?;
|
||||
}
|
||||
|
||||
// Run migrations separately to not hold up cold starts
|
||||
@@ -1594,11 +1676,13 @@ impl ComputeNode {
|
||||
pub fn reconfigure(&self) -> Result<()> {
|
||||
let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec;
|
||||
|
||||
let tls_config = self.tls_config(&spec);
|
||||
|
||||
if let Some(ref pgbouncer_settings) = spec.pgbouncer_settings {
|
||||
info!("tuning pgbouncer");
|
||||
|
||||
let pgbouncer_settings = pgbouncer_settings.clone();
|
||||
let tls_config = self.compute_ctl_config.tls.clone();
|
||||
let tls_config = tls_config.clone();
|
||||
|
||||
// Spawn a background task to do the tuning,
|
||||
// so that we don't block the main thread that starts Postgres.
|
||||
@@ -1616,7 +1700,7 @@ impl ComputeNode {
|
||||
// Spawn a background task to do the configuration,
|
||||
// so that we don't block the main thread that starts Postgres.
|
||||
let mut local_proxy = local_proxy.clone();
|
||||
local_proxy.tls = self.compute_ctl_config.tls.clone();
|
||||
local_proxy.tls = tls_config.clone();
|
||||
tokio::spawn(async move {
|
||||
if let Err(err) = local_proxy::configure(&local_proxy) {
|
||||
error!("error while configuring local_proxy: {err:?}");
|
||||
@@ -1634,7 +1718,7 @@ impl ComputeNode {
|
||||
pgdata_path,
|
||||
&spec,
|
||||
self.params.internal_http_port,
|
||||
&self.compute_ctl_config.tls,
|
||||
tls_config,
|
||||
)?;
|
||||
|
||||
if !spec.skip_pg_catalog_updates {
|
||||
@@ -1754,6 +1838,14 @@ impl ComputeNode {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn tls_config(&self, spec: &ComputeSpec) -> &Option<TlsConfig> {
|
||||
if spec.features.contains(&ComputeFeature::TlsExperimental) {
|
||||
&self.compute_ctl_config.tls
|
||||
} else {
|
||||
&None::<TlsConfig>
|
||||
}
|
||||
}
|
||||
|
||||
/// Update the `last_active` in the shared state, but ensure that it's a more recent one.
|
||||
pub fn update_last_active(&self, last_active: Option<DateTime<Utc>>) {
|
||||
let mut state = self.state.lock().unwrap();
|
||||
@@ -1890,7 +1982,7 @@ LIMIT 100",
|
||||
self.params
|
||||
.remote_ext_base_url
|
||||
.as_ref()
|
||||
.ok_or(DownloadError::BadInput(anyhow::anyhow!(
|
||||
.ok_or(DownloadError::BadInput(anyhow!(
|
||||
"Remote extensions storage is not configured",
|
||||
)))?;
|
||||
|
||||
@@ -2086,7 +2178,7 @@ LIMIT 100",
|
||||
let remote_extensions = spec
|
||||
.remote_extensions
|
||||
.as_ref()
|
||||
.ok_or(anyhow::anyhow!("Remote extensions are not configured"))?;
|
||||
.ok_or(anyhow!("Remote extensions are not configured"))?;
|
||||
|
||||
info!("parse shared_preload_libraries from spec.cluster.settings");
|
||||
let mut libs_vec = Vec::new();
|
||||
|
||||
@@ -83,6 +83,7 @@ use reqwest::StatusCode;
|
||||
use tar::Archive;
|
||||
use tracing::info;
|
||||
use tracing::log::warn;
|
||||
use url::Url;
|
||||
use zstd::stream::read::Decoder;
|
||||
|
||||
use crate::metrics::{REMOTE_EXT_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS};
|
||||
@@ -158,7 +159,7 @@ fn parse_pg_version(human_version: &str) -> PostgresMajorVersion {
|
||||
pub async fn download_extension(
|
||||
ext_name: &str,
|
||||
ext_path: &RemotePath,
|
||||
remote_ext_base_url: &str,
|
||||
remote_ext_base_url: &Url,
|
||||
pgbin: &str,
|
||||
) -> Result<u64> {
|
||||
info!("Download extension {:?} from {:?}", ext_name, ext_path);
|
||||
@@ -270,10 +271,14 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
|
||||
}
|
||||
|
||||
// Do request to extension storage proxy, e.g.,
|
||||
// curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst
|
||||
// curl http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local/latest/v15/extensions/anon.tar.zst
|
||||
// using HTTP GET and return the response body as bytes.
|
||||
async fn download_extension_tar(remote_ext_base_url: &str, ext_path: &str) -> Result<Bytes> {
|
||||
let uri = format!("{}/{}", remote_ext_base_url, ext_path);
|
||||
async fn download_extension_tar(remote_ext_base_url: &Url, ext_path: &str) -> Result<Bytes> {
|
||||
let uri = remote_ext_base_url.join(ext_path).with_context(|| {
|
||||
format!(
|
||||
"failed to create the remote extension URI for {ext_path} using {remote_ext_base_url}"
|
||||
)
|
||||
})?;
|
||||
let filename = Path::new(ext_path)
|
||||
.file_name()
|
||||
.unwrap_or_else(|| std::ffi::OsStr::new("unknown"))
|
||||
@@ -283,7 +288,7 @@ async fn download_extension_tar(remote_ext_base_url: &str, ext_path: &str) -> Re
|
||||
|
||||
info!("Downloading extension file '{}' from uri {}", filename, uri);
|
||||
|
||||
match do_extension_server_request(&uri).await {
|
||||
match do_extension_server_request(uri).await {
|
||||
Ok(resp) => {
|
||||
info!("Successfully downloaded remote extension data {}", ext_path);
|
||||
REMOTE_EXT_REQUESTS_TOTAL
|
||||
@@ -302,7 +307,7 @@ async fn download_extension_tar(remote_ext_base_url: &str, ext_path: &str) -> Re
|
||||
|
||||
// Do a single remote extensions server request.
|
||||
// Return result or (error message + stringified status code) in case of any failures.
|
||||
async fn do_extension_server_request(uri: &str) -> Result<Bytes, (String, String)> {
|
||||
async fn do_extension_server_request(uri: Url) -> Result<Bytes, (String, String)> {
|
||||
let resp = reqwest::get(uri).await.map_err(|e| {
|
||||
(
|
||||
format!(
|
||||
|
||||
@@ -48,11 +48,9 @@ impl JsonResponse {
|
||||
|
||||
/// Create an error response related to the compute being in an invalid state
|
||||
pub(self) fn invalid_status(status: ComputeStatus) -> Response {
|
||||
Self::create_response(
|
||||
Self::error(
|
||||
StatusCode::PRECONDITION_FAILED,
|
||||
&GenericAPIError {
|
||||
error: format!("invalid compute status: {status}"),
|
||||
},
|
||||
format!("invalid compute status: {status}"),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -22,7 +22,7 @@ pub(in crate::http) async fn configure(
|
||||
State(compute): State<Arc<ComputeNode>>,
|
||||
request: Json<ConfigurationRequest>,
|
||||
) -> Response {
|
||||
let pspec = match ParsedSpec::try_from(request.spec.clone()) {
|
||||
let pspec = match ParsedSpec::try_from(request.0.spec) {
|
||||
Ok(p) => p,
|
||||
Err(e) => return JsonResponse::error(StatusCode::BAD_REQUEST, e),
|
||||
};
|
||||
|
||||
@@ -13,6 +13,12 @@ use crate::metrics::{PG_CURR_DOWNTIME_MS, PG_TOTAL_DOWNTIME_MS};
|
||||
|
||||
const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);
|
||||
|
||||
/// Struct to store runtime state of the compute monitor thread.
|
||||
/// In theory, this could be a part of `Compute`, but i)
|
||||
/// this state is expected to be accessed only by single thread,
|
||||
/// so we don't need to care about locking; ii) `Compute` is
|
||||
/// already quite big. Thus, it seems to be a good idea to keep
|
||||
/// all the activity/health monitoring parts here.
|
||||
struct ComputeMonitor {
|
||||
compute: Arc<ComputeNode>,
|
||||
|
||||
@@ -70,12 +76,36 @@ impl ComputeMonitor {
|
||||
)
|
||||
}
|
||||
|
||||
/// Check if compute is in some terminal or soon-to-be-terminal
|
||||
/// state, then return `true`, signalling the caller that it
|
||||
/// should exit gracefully. Otherwise, return `false`.
|
||||
fn check_interrupts(&mut self) -> bool {
|
||||
let compute_status = self.compute.get_status();
|
||||
if matches!(
|
||||
compute_status,
|
||||
ComputeStatus::Terminated | ComputeStatus::TerminationPending | ComputeStatus::Failed
|
||||
) {
|
||||
info!(
|
||||
"compute is in {} status, stopping compute monitor",
|
||||
compute_status
|
||||
);
|
||||
return true;
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
/// Spin in a loop and figure out the last activity time in the Postgres.
|
||||
/// Then update it in the shared state. This function never errors out.
|
||||
/// Then update it in the shared state. This function currently never
|
||||
/// errors out explicitly, but there is a graceful termination path.
|
||||
/// Every time we receive an error trying to check Postgres, we use
|
||||
/// [`ComputeMonitor::check_interrupts()`] because it could be that
|
||||
/// compute is being terminated already, then we can exit gracefully
|
||||
/// to not produce errors' noise in the log.
|
||||
/// NB: the only expected panic is at `Mutex` unwrap(), all other errors
|
||||
/// should be handled gracefully.
|
||||
#[instrument(skip_all)]
|
||||
pub fn run(&mut self) {
|
||||
pub fn run(&mut self) -> anyhow::Result<()> {
|
||||
// Suppose that `connstr` doesn't change
|
||||
let connstr = self.compute.params.connstr.clone();
|
||||
let conf = self
|
||||
@@ -93,6 +123,10 @@ impl ComputeMonitor {
|
||||
info!("starting compute monitor for {}", connstr);
|
||||
|
||||
loop {
|
||||
if self.check_interrupts() {
|
||||
break;
|
||||
}
|
||||
|
||||
match &mut client {
|
||||
Ok(cli) => {
|
||||
if cli.is_closed() {
|
||||
@@ -100,6 +134,10 @@ impl ComputeMonitor {
|
||||
downtime_info = self.downtime_info(),
|
||||
"connection to Postgres is closed, trying to reconnect"
|
||||
);
|
||||
if self.check_interrupts() {
|
||||
break;
|
||||
}
|
||||
|
||||
self.report_down();
|
||||
|
||||
// Connection is closed, reconnect and try again.
|
||||
@@ -111,15 +149,19 @@ impl ComputeMonitor {
|
||||
self.compute.update_last_active(self.last_active);
|
||||
}
|
||||
Err(e) => {
|
||||
error!(
|
||||
downtime_info = self.downtime_info(),
|
||||
"could not check Postgres: {}", e
|
||||
);
|
||||
if self.check_interrupts() {
|
||||
break;
|
||||
}
|
||||
|
||||
// Although we have many places where we can return errors in `check()`,
|
||||
// normally it shouldn't happen. I.e., we will likely return error if
|
||||
// connection got broken, query timed out, Postgres returned invalid data, etc.
|
||||
// In all such cases it's suspicious, so let's report this as downtime.
|
||||
self.report_down();
|
||||
error!(
|
||||
downtime_info = self.downtime_info(),
|
||||
"could not check Postgres: {}", e
|
||||
);
|
||||
|
||||
// Reconnect to Postgres just in case. During tests, I noticed
|
||||
// that queries in `check()` can fail with `connection closed`,
|
||||
@@ -136,6 +178,10 @@ impl ComputeMonitor {
|
||||
downtime_info = self.downtime_info(),
|
||||
"could not connect to Postgres: {}, retrying", e
|
||||
);
|
||||
if self.check_interrupts() {
|
||||
break;
|
||||
}
|
||||
|
||||
self.report_down();
|
||||
|
||||
// Establish a new connection and try again.
|
||||
@@ -147,6 +193,9 @@ impl ComputeMonitor {
|
||||
self.last_checked = Utc::now();
|
||||
thread::sleep(MONITOR_CHECK_INTERVAL);
|
||||
}
|
||||
|
||||
// Graceful termination path
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
@@ -429,7 +478,10 @@ pub fn launch_monitor(compute: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
|
||||
.spawn(move || {
|
||||
let span = span!(Level::INFO, "compute_monitor");
|
||||
let _enter = span.enter();
|
||||
monitor.run();
|
||||
match monitor.run() {
|
||||
Ok(_) => info!("compute monitor thread terminated gracefully"),
|
||||
Err(err) => error!("compute monitor thread terminated abnormally {:?}", err),
|
||||
}
|
||||
})
|
||||
.expect("cannot launch compute monitor thread")
|
||||
}
|
||||
|
||||
@@ -30,7 +30,7 @@ mod pg_helpers_tests {
|
||||
r#"fsync = off
|
||||
wal_level = logical
|
||||
hot_standby = on
|
||||
prewarm_lfc_on_startup = off
|
||||
autoprewarm = off
|
||||
neon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'
|
||||
wal_log_hints = on
|
||||
log_connections = on
|
||||
|
||||
@@ -18,7 +18,7 @@ use clap::Parser;
|
||||
use compute_api::requests::ComputeClaimsScope;
|
||||
use compute_api::spec::ComputeMode;
|
||||
use control_plane::broker::StorageBroker;
|
||||
use control_plane::endpoint::ComputeControlPlane;
|
||||
use control_plane::endpoint::{ComputeControlPlane, PageserverProtocol};
|
||||
use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_ADDR, EndpointStorage};
|
||||
use control_plane::local_env;
|
||||
use control_plane::local_env::{
|
||||
@@ -664,6 +664,10 @@ struct EndpointStartCmdArgs {
|
||||
#[clap(short = 't', long, value_parser= humantime::parse_duration, help = "timeout until we fail the command")]
|
||||
#[arg(default_value = "90s")]
|
||||
start_timeout: Duration,
|
||||
|
||||
/// If enabled, use gRPC (and the communicator) to talk to Pageservers.
|
||||
#[clap(long)]
|
||||
grpc: bool,
|
||||
}
|
||||
|
||||
#[derive(clap::Args)]
|
||||
@@ -682,6 +686,10 @@ struct EndpointReconfigureCmdArgs {
|
||||
|
||||
#[clap(long)]
|
||||
safekeepers: Option<String>,
|
||||
|
||||
/// If enabled, use gRPC (and communicator) to talk to Pageservers.
|
||||
#[clap(long)]
|
||||
grpc: bool,
|
||||
}
|
||||
|
||||
#[derive(clap::Args)]
|
||||
@@ -1452,13 +1460,18 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
|
||||
let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id {
|
||||
let conf = env.get_pageserver_conf(pageserver_id).unwrap();
|
||||
let parsed = parse_host_port(&conf.listen_pg_addr).expect("Bad config");
|
||||
(
|
||||
vec![(parsed.0, parsed.1.unwrap_or(5432))],
|
||||
// If caller is telling us what pageserver to use, this is not a tenant which is
|
||||
// full managed by storage controller, therefore not sharded.
|
||||
DEFAULT_STRIPE_SIZE,
|
||||
)
|
||||
// Use gRPC if requested.
|
||||
let (protocol, host, port) = if args.grpc {
|
||||
let grpc_addr = conf.listen_grpc_addr.as_ref().expect("bad config");
|
||||
let (host, port) = parse_host_port(grpc_addr).expect("bad config");
|
||||
(PageserverProtocol::Grpc, host, port.unwrap_or(51051))
|
||||
} else {
|
||||
let (host, port) = parse_host_port(&conf.listen_pg_addr).expect("bad config");
|
||||
(PageserverProtocol::Libpq, host, port.unwrap_or(5432))
|
||||
};
|
||||
// If caller is telling us what pageserver to use, this is not a tenant which is
|
||||
// fully managed by storage controller, therefore not sharded.
|
||||
(vec![(protocol, host, port)], DEFAULT_STRIPE_SIZE)
|
||||
} else {
|
||||
// Look up the currently attached location of the tenant, and its striping metadata,
|
||||
// to pass these on to postgres.
|
||||
@@ -1477,11 +1490,22 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
.await?;
|
||||
}
|
||||
|
||||
anyhow::Ok((
|
||||
Host::parse(&shard.listen_pg_addr)
|
||||
.expect("Storage controller reported bad hostname"),
|
||||
shard.listen_pg_port,
|
||||
))
|
||||
let pageserver = if args.grpc {
|
||||
(
|
||||
PageserverProtocol::Grpc,
|
||||
Host::parse(&shard.listen_grpc_addr.expect("no gRPC addr"))
|
||||
.expect("bad hostname"),
|
||||
shard.listen_grpc_port.expect("no gRPC port"),
|
||||
)
|
||||
} else {
|
||||
(
|
||||
PageserverProtocol::Libpq,
|
||||
Host::parse(&shard.listen_pg_addr).expect("bad hostname"),
|
||||
shard.listen_pg_port,
|
||||
)
|
||||
};
|
||||
|
||||
anyhow::Ok(pageserver)
|
||||
}),
|
||||
)
|
||||
.await?;
|
||||
@@ -1536,11 +1560,17 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
.get(endpoint_id.as_str())
|
||||
.with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
|
||||
let pageservers = if let Some(ps_id) = args.endpoint_pageserver_id {
|
||||
let pageserver = PageServerNode::from_env(env, env.get_pageserver_conf(ps_id)?);
|
||||
vec![(
|
||||
pageserver.pg_connection_config.host().clone(),
|
||||
pageserver.pg_connection_config.port(),
|
||||
)]
|
||||
let conf = env.get_pageserver_conf(ps_id)?;
|
||||
// Use gRPC if requested.
|
||||
let (protocol, host, port) = if args.grpc {
|
||||
let grpc_addr = conf.listen_grpc_addr.as_ref().expect("bad config");
|
||||
let (host, port) = parse_host_port(grpc_addr).expect("bad config");
|
||||
(PageserverProtocol::Grpc, host, port.unwrap_or(51051))
|
||||
} else {
|
||||
let (host, port) = parse_host_port(&conf.listen_pg_addr).expect("bad config");
|
||||
(PageserverProtocol::Libpq, host, port.unwrap_or(5432))
|
||||
};
|
||||
vec![(protocol, host, port)]
|
||||
} else {
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
storage_controller
|
||||
@@ -1549,11 +1579,20 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
.shards
|
||||
.into_iter()
|
||||
.map(|shard| {
|
||||
(
|
||||
Host::parse(&shard.listen_pg_addr)
|
||||
.expect("Storage controller reported malformed host"),
|
||||
shard.listen_pg_port,
|
||||
)
|
||||
if args.grpc {
|
||||
(
|
||||
PageserverProtocol::Grpc,
|
||||
Host::parse(&shard.listen_grpc_addr.expect("no gRPC addr"))
|
||||
.expect("bad hostname"),
|
||||
shard.listen_grpc_port.expect("no gRPC port"),
|
||||
)
|
||||
} else {
|
||||
(
|
||||
PageserverProtocol::Libpq,
|
||||
Host::parse(&shard.listen_pg_addr).expect("bad hostname"),
|
||||
shard.listen_pg_port,
|
||||
)
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
@@ -37,6 +37,7 @@
|
||||
//! ```
|
||||
//!
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt::Display;
|
||||
use std::net::{IpAddr, Ipv4Addr, SocketAddr, TcpStream};
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
@@ -74,7 +75,6 @@ use utils::id::{NodeId, TenantId, TimelineId};
|
||||
|
||||
use crate::local_env::LocalEnv;
|
||||
use crate::postgresql_conf::PostgresConf;
|
||||
use crate::storage_controller::StorageController;
|
||||
|
||||
// contents of a endpoint.json file
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
||||
@@ -331,7 +331,7 @@ pub enum EndpointStatus {
|
||||
RunningNoPidfile,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for EndpointStatus {
|
||||
impl Display for EndpointStatus {
|
||||
fn fmt(&self, writer: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
let s = match self {
|
||||
Self::Running => "running",
|
||||
@@ -343,6 +343,28 @@ impl std::fmt::Display for EndpointStatus {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub enum PageserverProtocol {
|
||||
Libpq,
|
||||
Grpc,
|
||||
}
|
||||
|
||||
impl PageserverProtocol {
|
||||
/// Returns the URL scheme for the protocol, used in connstrings.
|
||||
pub fn scheme(&self) -> &'static str {
|
||||
match self {
|
||||
Self::Libpq => "postgresql",
|
||||
Self::Grpc => "grpc",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for PageserverProtocol {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.write_str(self.scheme())
|
||||
}
|
||||
}
|
||||
|
||||
impl Endpoint {
|
||||
fn from_dir_entry(entry: std::fs::DirEntry, env: &LocalEnv) -> Result<Endpoint> {
|
||||
if !entry.file_type()?.is_dir() {
|
||||
@@ -606,10 +628,10 @@ impl Endpoint {
|
||||
}
|
||||
}
|
||||
|
||||
fn build_pageserver_connstr(pageservers: &[(Host, u16)]) -> String {
|
||||
fn build_pageserver_connstr(pageservers: &[(PageserverProtocol, Host, u16)]) -> String {
|
||||
pageservers
|
||||
.iter()
|
||||
.map(|(host, port)| format!("postgresql://no_user@{host}:{port}"))
|
||||
.map(|(scheme, host, port)| format!("{scheme}://no_user@{host}:{port}"))
|
||||
.collect::<Vec<_>>()
|
||||
.join(",")
|
||||
}
|
||||
@@ -654,7 +676,7 @@ impl Endpoint {
|
||||
endpoint_storage_addr: String,
|
||||
safekeepers_generation: Option<SafekeeperGeneration>,
|
||||
safekeepers: Vec<NodeId>,
|
||||
pageservers: Vec<(Host, u16)>,
|
||||
pageservers: Vec<(PageserverProtocol, Host, u16)>,
|
||||
remote_ext_base_url: Option<&String>,
|
||||
shard_stripe_size: usize,
|
||||
create_test_user: bool,
|
||||
@@ -747,7 +769,7 @@ impl Endpoint {
|
||||
logs_export_host: None::<String>,
|
||||
endpoint_storage_addr: Some(endpoint_storage_addr),
|
||||
endpoint_storage_token: Some(endpoint_storage_token),
|
||||
prewarm_lfc_on_startup: false,
|
||||
autoprewarm: false,
|
||||
};
|
||||
|
||||
// this strange code is needed to support respec() in tests
|
||||
@@ -939,10 +961,12 @@ impl Endpoint {
|
||||
|
||||
pub async fn reconfigure(
|
||||
&self,
|
||||
mut pageservers: Vec<(Host, u16)>,
|
||||
pageservers: Vec<(PageserverProtocol, Host, u16)>,
|
||||
stripe_size: Option<ShardStripeSize>,
|
||||
safekeepers: Option<Vec<NodeId>>,
|
||||
) -> Result<()> {
|
||||
anyhow::ensure!(!pageservers.is_empty(), "no pageservers provided");
|
||||
|
||||
let (mut spec, compute_ctl_config) = {
|
||||
let config_path = self.endpoint_path().join("config.json");
|
||||
let file = std::fs::File::open(config_path)?;
|
||||
@@ -954,25 +978,7 @@ impl Endpoint {
|
||||
let postgresql_conf = self.read_postgresql_conf()?;
|
||||
spec.cluster.postgresql_conf = Some(postgresql_conf);
|
||||
|
||||
// If we weren't given explicit pageservers, query the storage controller
|
||||
if pageservers.is_empty() {
|
||||
let storage_controller = StorageController::from_env(&self.env);
|
||||
let locate_result = storage_controller.tenant_locate(self.tenant_id).await?;
|
||||
pageservers = locate_result
|
||||
.shards
|
||||
.into_iter()
|
||||
.map(|shard| {
|
||||
(
|
||||
Host::parse(&shard.listen_pg_addr)
|
||||
.expect("Storage controller reported bad hostname"),
|
||||
shard.listen_pg_port,
|
||||
)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
}
|
||||
|
||||
let pageserver_connstr = Self::build_pageserver_connstr(&pageservers);
|
||||
assert!(!pageserver_connstr.is_empty());
|
||||
spec.pageserver_connstring = Some(pageserver_connstr);
|
||||
if stripe_size.is_some() {
|
||||
spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
|
||||
|
||||
@@ -265,6 +265,14 @@ impl PageServerNode {
|
||||
None => None,
|
||||
};
|
||||
|
||||
let mut grpc_host = None;
|
||||
let mut grpc_port = None;
|
||||
if let Some(grpc_addr) = &self.conf.listen_grpc_addr {
|
||||
let (_, port) = parse_host_port(grpc_addr).expect("Unable to parse listen_grpc_addr");
|
||||
grpc_host = Some("localhost".to_string());
|
||||
grpc_port = Some(port.unwrap_or(51051));
|
||||
}
|
||||
|
||||
// Intentionally hand-craft JSON: this acts as an implicit format compat test
|
||||
// in case the pageserver-side structure is edited, and reflects the real life
|
||||
// situation: the metadata is written by some other script.
|
||||
@@ -273,6 +281,8 @@ impl PageServerNode {
|
||||
serde_json::to_vec(&pageserver_api::config::NodeMetadata {
|
||||
postgres_host: "localhost".to_string(),
|
||||
postgres_port: self.pg_connection_config.port(),
|
||||
grpc_host,
|
||||
grpc_port,
|
||||
http_host: "localhost".to_string(),
|
||||
http_port,
|
||||
https_port,
|
||||
@@ -513,11 +523,6 @@ impl PageServerNode {
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'timeline_offloading' as bool")?,
|
||||
wal_receiver_protocol_override: settings
|
||||
.remove("wal_receiver_protocol_override")
|
||||
.map(serde_json::from_str)
|
||||
.transpose()
|
||||
.context("parse `wal_receiver_protocol_override` from json")?,
|
||||
rel_size_v2_enabled: settings
|
||||
.remove("rel_size_v2_enabled")
|
||||
.map(|x| x.parse::<bool>())
|
||||
|
||||
@@ -37,6 +37,11 @@ enum Command {
|
||||
#[arg(long)]
|
||||
listen_pg_port: u16,
|
||||
|
||||
#[arg(long)]
|
||||
listen_grpc_addr: Option<String>,
|
||||
#[arg(long)]
|
||||
listen_grpc_port: Option<u16>,
|
||||
|
||||
#[arg(long)]
|
||||
listen_http_addr: String,
|
||||
#[arg(long)]
|
||||
@@ -410,6 +415,8 @@ async fn main() -> anyhow::Result<()> {
|
||||
node_id,
|
||||
listen_pg_addr,
|
||||
listen_pg_port,
|
||||
listen_grpc_addr,
|
||||
listen_grpc_port,
|
||||
listen_http_addr,
|
||||
listen_http_port,
|
||||
listen_https_port,
|
||||
@@ -423,6 +430,8 @@ async fn main() -> anyhow::Result<()> {
|
||||
node_id,
|
||||
listen_pg_addr,
|
||||
listen_pg_port,
|
||||
listen_grpc_addr,
|
||||
listen_grpc_port,
|
||||
listen_http_addr,
|
||||
listen_http_port,
|
||||
listen_https_port,
|
||||
|
||||
@@ -13,6 +13,6 @@ RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
|
||||
jq \
|
||||
netcat-openbsd
|
||||
#This is required for the pg_hintplan test
|
||||
RUN mkdir -p /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw && chown postgres /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw
|
||||
RUN mkdir -p /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw /ext-src/postgis-src/ && chown postgres /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw /ext-src/postgis-src
|
||||
|
||||
USER postgres
|
||||
|
||||
@@ -1,18 +1,18 @@
|
||||
#!/bin/bash
|
||||
#!/usr/bin/env bash
|
||||
set -eux
|
||||
|
||||
# Generate a random tenant or timeline ID
|
||||
#
|
||||
# Takes a variable name as argument. The result is stored in that variable.
|
||||
generate_id() {
|
||||
local -n resvar=$1
|
||||
printf -v resvar '%08x%08x%08x%08x' $SRANDOM $SRANDOM $SRANDOM $SRANDOM
|
||||
local -n resvar=${1}
|
||||
printf -v resvar '%08x%08x%08x%08x' ${SRANDOM} ${SRANDOM} ${SRANDOM} ${SRANDOM}
|
||||
}
|
||||
|
||||
PG_VERSION=${PG_VERSION:-14}
|
||||
|
||||
CONFIG_FILE_ORG=/var/db/postgres/configs/config.json
|
||||
CONFIG_FILE=/tmp/config.json
|
||||
readonly CONFIG_FILE_ORG=/var/db/postgres/configs/config.json
|
||||
readonly CONFIG_FILE=/tmp/config.json
|
||||
|
||||
# Test that the first library path that the dynamic loader looks in is the path
|
||||
# that we use for custom compiled software
|
||||
@@ -20,17 +20,17 @@ first_path="$(ldconfig --verbose 2>/dev/null \
|
||||
| grep --invert-match ^$'\t' \
|
||||
| cut --delimiter=: --fields=1 \
|
||||
| head --lines=1)"
|
||||
test "$first_path" == '/usr/local/lib'
|
||||
test "${first_path}" = '/usr/local/lib'
|
||||
|
||||
echo "Waiting pageserver become ready."
|
||||
while ! nc -z pageserver 6400; do
|
||||
sleep 1;
|
||||
sleep 1
|
||||
done
|
||||
echo "Page server is ready."
|
||||
|
||||
cp ${CONFIG_FILE_ORG} ${CONFIG_FILE}
|
||||
cp "${CONFIG_FILE_ORG}" "${CONFIG_FILE}"
|
||||
|
||||
if [ -n "${TENANT_ID:-}" ] && [ -n "${TIMELINE_ID:-}" ]; then
|
||||
if [[ -n "${TENANT_ID:-}" && -n "${TIMELINE_ID:-}" ]]; then
|
||||
tenant_id=${TENANT_ID}
|
||||
timeline_id=${TIMELINE_ID}
|
||||
else
|
||||
@@ -41,7 +41,7 @@ else
|
||||
"http://pageserver:9898/v1/tenant"
|
||||
)
|
||||
tenant_id=$(curl "${PARAMS[@]}" | jq -r .[0].id)
|
||||
if [ -z "${tenant_id}" ] || [ "${tenant_id}" = null ]; then
|
||||
if [[ -z "${tenant_id}" || "${tenant_id}" = null ]]; then
|
||||
echo "Create a tenant"
|
||||
generate_id tenant_id
|
||||
PARAMS=(
|
||||
@@ -51,7 +51,7 @@ else
|
||||
"http://pageserver:9898/v1/tenant/${tenant_id}/location_config"
|
||||
)
|
||||
result=$(curl "${PARAMS[@]}")
|
||||
echo $result | jq .
|
||||
printf '%s\n' "${result}" | jq .
|
||||
fi
|
||||
|
||||
echo "Check if a timeline present"
|
||||
@@ -61,7 +61,7 @@ else
|
||||
"http://pageserver:9898/v1/tenant/${tenant_id}/timeline"
|
||||
)
|
||||
timeline_id=$(curl "${PARAMS[@]}" | jq -r .[0].timeline_id)
|
||||
if [ -z "${timeline_id}" ] || [ "${timeline_id}" = null ]; then
|
||||
if [[ -z "${timeline_id}" || "${timeline_id}" = null ]]; then
|
||||
generate_id timeline_id
|
||||
PARAMS=(
|
||||
-sbf
|
||||
@@ -71,7 +71,7 @@ else
|
||||
"http://pageserver:9898/v1/tenant/${tenant_id}/timeline/"
|
||||
)
|
||||
result=$(curl "${PARAMS[@]}")
|
||||
echo $result | jq .
|
||||
printf '%s\n' "${result}" | jq .
|
||||
fi
|
||||
fi
|
||||
|
||||
@@ -82,10 +82,10 @@ else
|
||||
fi
|
||||
echo "Adding pgx_ulid"
|
||||
shared_libraries=$(jq -r '.spec.cluster.settings[] | select(.name=="shared_preload_libraries").value' ${CONFIG_FILE})
|
||||
sed -i "s/${shared_libraries}/${shared_libraries},${ulid_extension}/" ${CONFIG_FILE}
|
||||
sed -i "s|${shared_libraries}|${shared_libraries},${ulid_extension}|" ${CONFIG_FILE}
|
||||
echo "Overwrite tenant id and timeline id in spec file"
|
||||
sed -i "s/TENANT_ID/${tenant_id}/" ${CONFIG_FILE}
|
||||
sed -i "s/TIMELINE_ID/${timeline_id}/" ${CONFIG_FILE}
|
||||
sed -i "s|TENANT_ID|${tenant_id}|" ${CONFIG_FILE}
|
||||
sed -i "s|TIMELINE_ID|${timeline_id}|" ${CONFIG_FILE}
|
||||
|
||||
cat ${CONFIG_FILE}
|
||||
|
||||
@@ -93,5 +93,5 @@ echo "Start compute node"
|
||||
/usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \
|
||||
-C "postgresql://cloud_admin@localhost:55433/postgres" \
|
||||
-b /usr/local/bin/postgres \
|
||||
--compute-id "compute-$RANDOM" \
|
||||
--config "$CONFIG_FILE"
|
||||
--compute-id "compute-${RANDOM}" \
|
||||
--config "${CONFIG_FILE}"
|
||||
|
||||
@@ -186,13 +186,14 @@ services:
|
||||
|
||||
neon-test-extensions:
|
||||
profiles: ["test-extensions"]
|
||||
image: ${REPOSITORY:-ghcr.io/neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TEST_EXTENSIONS_TAG:-${TAG:-latest}}
|
||||
image: ${REPOSITORY:-ghcr.io/neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-${PG_VERSION:-16}}:${TEST_EXTENSIONS_TAG:-${TAG:-latest}}
|
||||
environment:
|
||||
- PGPASSWORD=cloud_admin
|
||||
- PGUSER=${PGUSER:-cloud_admin}
|
||||
- PGPASSWORD=${PGPASSWORD:-cloud_admin}
|
||||
entrypoint:
|
||||
- "/bin/bash"
|
||||
- "-c"
|
||||
command:
|
||||
- sleep 1800
|
||||
- sleep 3600
|
||||
depends_on:
|
||||
- compute
|
||||
|
||||
@@ -54,6 +54,15 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
|
||||
# It cannot be moved to Dockerfile now because the database directory is created after the start of the container
|
||||
echo Adding dummy config
|
||||
docker compose exec compute touch /var/db/postgres/compute/compute_ctl_temp_override.conf
|
||||
# Prepare for the PostGIS test
|
||||
docker compose exec compute mkdir -p /tmp/pgis_reg/pgis_reg_tmp
|
||||
TMPDIR=$(mktemp -d)
|
||||
docker compose cp neon-test-extensions:/ext-src/postgis-src/raster/test "${TMPDIR}"
|
||||
docker compose cp neon-test-extensions:/ext-src/postgis-src/regress/00-regress-install "${TMPDIR}"
|
||||
docker compose exec compute mkdir -p /ext-src/postgis-src/raster /ext-src/postgis-src/regress /ext-src/postgis-src/regress/00-regress-install
|
||||
docker compose cp "${TMPDIR}/test" compute:/ext-src/postgis-src/raster/test
|
||||
docker compose cp "${TMPDIR}/00-regress-install" compute:/ext-src/postgis-src/regress
|
||||
rm -rf "${TMPDIR}"
|
||||
# The following block copies the files for the pg_hintplan test to the compute node for the extension test in an isolated docker-compose environment
|
||||
TMPDIR=$(mktemp -d)
|
||||
docker compose cp neon-test-extensions:/ext-src/pg_hint_plan-src/data "${TMPDIR}/data"
|
||||
@@ -68,7 +77,7 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
|
||||
docker compose exec -T neon-test-extensions bash -c "(cd /postgres && patch -p1)" <"../compute/patches/contrib_pg${pg_version}.patch"
|
||||
# We are running tests now
|
||||
rm -f testout.txt testout_contrib.txt
|
||||
docker compose exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,postgis-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src,rag_jina_reranker_v1_tiny_en-src,rag_bge_small_en_v15-src \
|
||||
docker compose exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src,rag_jina_reranker_v1_tiny_en-src,rag_bge_small_en_v15-src \
|
||||
neon-test-extensions /run-tests.sh /ext-src | tee testout.txt && EXT_SUCCESS=1 || EXT_SUCCESS=0
|
||||
docker compose exec -e SKIP=start-scripts,postgres_fdw,ltree_plpython,jsonb_plpython,jsonb_plperl,hstore_plpython,hstore_plperl,dblink,bool_plperl \
|
||||
neon-test-extensions /run-tests.sh /postgres/contrib | tee testout_contrib.txt && CONTRIB_SUCCESS=1 || CONTRIB_SUCCESS=0
|
||||
|
||||
70
docker-compose/ext-src/postgis-src/README-Neon.md
Normal file
70
docker-compose/ext-src/postgis-src/README-Neon.md
Normal file
@@ -0,0 +1,70 @@
|
||||
# PostGIS Testing in Neon
|
||||
|
||||
This directory contains configuration files and patches for running PostGIS tests in the Neon database environment.
|
||||
|
||||
## Overview
|
||||
|
||||
PostGIS is a spatial database extension for PostgreSQL that adds support for geographic objects. Testing PostGIS compatibility ensures that Neon's modifications to PostgreSQL don't break compatibility with this critical extension.
|
||||
|
||||
## PostGIS Versions
|
||||
|
||||
- PostgreSQL v17: PostGIS 3.5.0
|
||||
- PostgreSQL v14/v15/v16: PostGIS 3.3.3
|
||||
|
||||
## Test Configuration
|
||||
|
||||
The test setup includes:
|
||||
|
||||
- `postgis-no-upgrade-test.patch`: Disables upgrade tests by removing the upgrade test section from regress/runtest.mk
|
||||
- `postgis-regular-v16.patch`: Version-specific patch for PostgreSQL v16
|
||||
- `postgis-regular-v17.patch`: Version-specific patch for PostgreSQL v17
|
||||
- `regular-test.sh`: Script to run PostGIS tests as a regular user
|
||||
- `neon-test.sh`: Script to handle version-specific test configurations
|
||||
- `raster_outdb_template.sql`: Template for raster tests with explicit file paths
|
||||
|
||||
## Excluded Tests
|
||||
|
||||
**Important Note:** The test exclusions listed below are specifically for regular-user tests against staging instances. These exclusions are necessary because staging instances run with limited privileges and cannot perform operations requiring superuser access. Docker-compose based tests are not affected by these exclusions.
|
||||
|
||||
### Tests Requiring Superuser Permissions
|
||||
|
||||
These tests cannot be run as a regular user:
|
||||
- `estimatedextent`
|
||||
- `regress/core/legacy`
|
||||
- `regress/core/typmod`
|
||||
- `regress/loader/TestSkipANALYZE`
|
||||
- `regress/loader/TestANALYZE`
|
||||
|
||||
### Tests Requiring Filesystem Access
|
||||
|
||||
These tests need direct filesystem access that is only possible for superusers:
|
||||
- `loader/load_outdb`
|
||||
|
||||
### Tests with Flaky Results
|
||||
|
||||
These tests have assumptions that don't always hold true:
|
||||
- `regress/core/computed_columns` - Assumes computed columns always outperform alternatives, which is not consistently true
|
||||
|
||||
### Tests Requiring Tunable Parameter Modifications
|
||||
|
||||
These tests attempt to modify the `postgis.gdal_enabled_drivers` parameter, which is only accessible to superusers:
|
||||
- `raster/test/regress/rt_wkb`
|
||||
- `raster/test/regress/rt_addband`
|
||||
- `raster/test/regress/rt_setbandpath`
|
||||
- `raster/test/regress/rt_fromgdalraster`
|
||||
- `raster/test/regress/rt_asgdalraster`
|
||||
- `raster/test/regress/rt_astiff`
|
||||
- `raster/test/regress/rt_asjpeg`
|
||||
- `raster/test/regress/rt_aspng`
|
||||
- `raster/test/regress/permitted_gdal_drivers`
|
||||
- Loader tests: `BasicOutDB`, `Tiled10x10`, `Tiled10x10Copy`, `Tiled8x8`, `TiledAuto`, `TiledAutoSkipNoData`, `TiledAutoCopyn`
|
||||
|
||||
### Topology Tests (v17 only)
|
||||
- `populate_topology_layer`
|
||||
- `renametopogeometrycolumn`
|
||||
|
||||
## Other Modifications
|
||||
|
||||
- Binary.sql tests are modified to use explicit file paths
|
||||
- Server-side SQL COPY commands (which require superuser privileges) are converted to client-side `\copy` commands
|
||||
- Upgrade tests are disabled
|
||||
9
docker-compose/ext-src/postgis-src/neon-test.sh
Executable file
9
docker-compose/ext-src/postgis-src/neon-test.sh
Executable file
@@ -0,0 +1,9 @@
|
||||
#!/bin/bash
|
||||
set -ex
|
||||
cd "$(dirname "$0")"
|
||||
if [[ ${PG_VERSION} = v17 ]]; then
|
||||
sed -i '/computed_columns/d' regress/core/tests.mk
|
||||
fi
|
||||
patch -p1 <postgis-no-upgrade-test.patch
|
||||
trap 'echo Cleaning up; patch -R -p1 <postgis-no-upgrade-test.patch' EXIT
|
||||
make installcheck-base
|
||||
@@ -0,0 +1,21 @@
|
||||
diff --git a/regress/runtest.mk b/regress/runtest.mk
|
||||
index c051f03..010e493 100644
|
||||
--- a/regress/runtest.mk
|
||||
+++ b/regress/runtest.mk
|
||||
@@ -24,16 +24,6 @@ check-regress:
|
||||
|
||||
POSTGIS_TOP_BUILD_DIR=$(abs_top_builddir) $(PERL) $(top_srcdir)/regress/run_test.pl $(RUNTESTFLAGS) $(RUNTESTFLAGS_INTERNAL) $(TESTS)
|
||||
|
||||
- @if echo "$(RUNTESTFLAGS)" | grep -vq -- --upgrade; then \
|
||||
- echo "Running upgrade test as RUNTESTFLAGS did not contain that"; \
|
||||
- POSTGIS_TOP_BUILD_DIR=$(abs_top_builddir) $(PERL) $(top_srcdir)/regress/run_test.pl \
|
||||
- --upgrade \
|
||||
- $(RUNTESTFLAGS) \
|
||||
- $(RUNTESTFLAGS_INTERNAL) \
|
||||
- $(TESTS); \
|
||||
- else \
|
||||
- echo "Skipping upgrade test as RUNTESTFLAGS already requested upgrades"; \
|
||||
- fi
|
||||
|
||||
check-long:
|
||||
$(PERL) $(top_srcdir)/regress/run_test.pl $(RUNTESTFLAGS) $(TESTS) $(TESTS_SLOW)
|
||||
198
docker-compose/ext-src/postgis-src/postgis-regular-v16.patch
Normal file
198
docker-compose/ext-src/postgis-src/postgis-regular-v16.patch
Normal file
@@ -0,0 +1,198 @@
|
||||
diff --git a/raster/test/regress/tests.mk b/raster/test/regress/tests.mk
|
||||
index 00918e1..7e2b6cd 100644
|
||||
--- a/raster/test/regress/tests.mk
|
||||
+++ b/raster/test/regress/tests.mk
|
||||
@@ -17,9 +17,7 @@ override RUNTESTFLAGS_INTERNAL := \
|
||||
$(RUNTESTFLAGS_INTERNAL) \
|
||||
--after-upgrade-script $(top_srcdir)/raster/test/regress/hooks/hook-after-upgrade-raster.sql
|
||||
|
||||
-RASTER_TEST_FIRST = \
|
||||
- $(top_srcdir)/raster/test/regress/check_gdal \
|
||||
- $(top_srcdir)/raster/test/regress/loader/load_outdb
|
||||
+RASTER_TEST_FIRST =
|
||||
|
||||
RASTER_TEST_LAST = \
|
||||
$(top_srcdir)/raster/test/regress/clean
|
||||
@@ -33,9 +31,7 @@ RASTER_TEST_IO = \
|
||||
|
||||
RASTER_TEST_BASIC_FUNC = \
|
||||
$(top_srcdir)/raster/test/regress/rt_bytea \
|
||||
- $(top_srcdir)/raster/test/regress/rt_wkb \
|
||||
$(top_srcdir)/raster/test/regress/box3d \
|
||||
- $(top_srcdir)/raster/test/regress/rt_addband \
|
||||
$(top_srcdir)/raster/test/regress/rt_band \
|
||||
$(top_srcdir)/raster/test/regress/rt_tile
|
||||
|
||||
@@ -73,16 +69,10 @@ RASTER_TEST_BANDPROPS = \
|
||||
$(top_srcdir)/raster/test/regress/rt_neighborhood \
|
||||
$(top_srcdir)/raster/test/regress/rt_nearestvalue \
|
||||
$(top_srcdir)/raster/test/regress/rt_pixelofvalue \
|
||||
- $(top_srcdir)/raster/test/regress/rt_polygon \
|
||||
- $(top_srcdir)/raster/test/regress/rt_setbandpath
|
||||
+ $(top_srcdir)/raster/test/regress/rt_polygon
|
||||
|
||||
RASTER_TEST_UTILITY = \
|
||||
$(top_srcdir)/raster/test/regress/rt_utility \
|
||||
- $(top_srcdir)/raster/test/regress/rt_fromgdalraster \
|
||||
- $(top_srcdir)/raster/test/regress/rt_asgdalraster \
|
||||
- $(top_srcdir)/raster/test/regress/rt_astiff \
|
||||
- $(top_srcdir)/raster/test/regress/rt_asjpeg \
|
||||
- $(top_srcdir)/raster/test/regress/rt_aspng \
|
||||
$(top_srcdir)/raster/test/regress/rt_reclass \
|
||||
$(top_srcdir)/raster/test/regress/rt_gdalwarp \
|
||||
$(top_srcdir)/raster/test/regress/rt_gdalcontour \
|
||||
@@ -120,21 +110,13 @@ RASTER_TEST_SREL = \
|
||||
|
||||
RASTER_TEST_BUGS = \
|
||||
$(top_srcdir)/raster/test/regress/bug_test_car5 \
|
||||
- $(top_srcdir)/raster/test/regress/permitted_gdal_drivers \
|
||||
$(top_srcdir)/raster/test/regress/tickets
|
||||
|
||||
RASTER_TEST_LOADER = \
|
||||
$(top_srcdir)/raster/test/regress/loader/Basic \
|
||||
$(top_srcdir)/raster/test/regress/loader/Projected \
|
||||
$(top_srcdir)/raster/test/regress/loader/BasicCopy \
|
||||
- $(top_srcdir)/raster/test/regress/loader/BasicFilename \
|
||||
- $(top_srcdir)/raster/test/regress/loader/BasicOutDB \
|
||||
- $(top_srcdir)/raster/test/regress/loader/Tiled10x10 \
|
||||
- $(top_srcdir)/raster/test/regress/loader/Tiled10x10Copy \
|
||||
- $(top_srcdir)/raster/test/regress/loader/Tiled8x8 \
|
||||
- $(top_srcdir)/raster/test/regress/loader/TiledAuto \
|
||||
- $(top_srcdir)/raster/test/regress/loader/TiledAutoSkipNoData \
|
||||
- $(top_srcdir)/raster/test/regress/loader/TiledAutoCopyn
|
||||
+ $(top_srcdir)/raster/test/regress/loader/BasicFilename
|
||||
|
||||
RASTER_TESTS := $(RASTER_TEST_FIRST) \
|
||||
$(RASTER_TEST_METADATA) $(RASTER_TEST_IO) $(RASTER_TEST_BASIC_FUNC) \
|
||||
diff --git a/regress/core/binary.sql b/regress/core/binary.sql
|
||||
index 7a36b65..ad78fc7 100644
|
||||
--- a/regress/core/binary.sql
|
||||
+++ b/regress/core/binary.sql
|
||||
@@ -1,4 +1,5 @@
|
||||
SET client_min_messages TO warning;
|
||||
+
|
||||
CREATE SCHEMA tm;
|
||||
|
||||
CREATE TABLE tm.geoms (id serial, g geometry);
|
||||
@@ -31,24 +32,39 @@ SELECT st_force4d(g) FROM tm.geoms WHERE id < 15 ORDER BY id;
|
||||
INSERT INTO tm.geoms(g)
|
||||
SELECT st_setsrid(g,4326) FROM tm.geoms ORDER BY id;
|
||||
|
||||
-COPY tm.geoms TO :tmpfile WITH BINARY;
|
||||
+-- define temp file path
|
||||
+\set tmpfile '/tmp/postgis_binary_test.dat'
|
||||
+
|
||||
+-- export
|
||||
+\set command '\\copy tm.geoms TO ':tmpfile' WITH (FORMAT BINARY)'
|
||||
+:command
|
||||
+
|
||||
+-- import
|
||||
CREATE TABLE tm.geoms_in AS SELECT * FROM tm.geoms LIMIT 0;
|
||||
-COPY tm.geoms_in FROM :tmpfile WITH BINARY;
|
||||
-SELECT 'geometry', count(*) FROM tm.geoms_in i, tm.geoms o WHERE i.id = o.id
|
||||
- AND ST_OrderingEquals(i.g, o.g);
|
||||
+\set command '\\copy tm.geoms_in FROM ':tmpfile' WITH (FORMAT BINARY)'
|
||||
+:command
|
||||
+
|
||||
+SELECT 'geometry', count(*) FROM tm.geoms_in i, tm.geoms o
|
||||
+WHERE i.id = o.id AND ST_OrderingEquals(i.g, o.g);
|
||||
|
||||
CREATE TABLE tm.geogs AS SELECT id,g::geography FROM tm.geoms
|
||||
WHERE geometrytype(g) NOT LIKE '%CURVE%'
|
||||
AND geometrytype(g) NOT LIKE '%CIRCULAR%'
|
||||
AND geometrytype(g) NOT LIKE '%SURFACE%'
|
||||
AND geometrytype(g) NOT LIKE 'TRIANGLE%'
|
||||
- AND geometrytype(g) NOT LIKE 'TIN%'
|
||||
-;
|
||||
+ AND geometrytype(g) NOT LIKE 'TIN%';
|
||||
|
||||
-COPY tm.geogs TO :tmpfile WITH BINARY;
|
||||
+-- export
|
||||
+\set command '\\copy tm.geogs TO ':tmpfile' WITH (FORMAT BINARY)'
|
||||
+:command
|
||||
+
|
||||
+-- import
|
||||
CREATE TABLE tm.geogs_in AS SELECT * FROM tm.geogs LIMIT 0;
|
||||
-COPY tm.geogs_in FROM :tmpfile WITH BINARY;
|
||||
-SELECT 'geometry', count(*) FROM tm.geogs_in i, tm.geogs o WHERE i.id = o.id
|
||||
- AND ST_OrderingEquals(i.g::geometry, o.g::geometry);
|
||||
+\set command '\\copy tm.geogs_in FROM ':tmpfile' WITH (FORMAT BINARY)'
|
||||
+:command
|
||||
+
|
||||
+SELECT 'geometry', count(*) FROM tm.geogs_in i, tm.geogs o
|
||||
+WHERE i.id = o.id AND ST_OrderingEquals(i.g::geometry, o.g::geometry);
|
||||
|
||||
DROP SCHEMA tm CASCADE;
|
||||
+
|
||||
diff --git a/regress/core/tests.mk b/regress/core/tests.mk
|
||||
index 3abd7bc..94903c3 100644
|
||||
--- a/regress/core/tests.mk
|
||||
+++ b/regress/core/tests.mk
|
||||
@@ -23,7 +23,6 @@ current_dir := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
|
||||
RUNTESTFLAGS_INTERNAL += \
|
||||
--before-upgrade-script $(top_srcdir)/regress/hooks/hook-before-upgrade.sql \
|
||||
--after-upgrade-script $(top_srcdir)/regress/hooks/hook-after-upgrade.sql \
|
||||
- --after-create-script $(top_srcdir)/regress/hooks/hook-after-create.sql \
|
||||
--before-uninstall-script $(top_srcdir)/regress/hooks/hook-before-uninstall.sql
|
||||
|
||||
TESTS += \
|
||||
@@ -40,7 +39,6 @@ TESTS += \
|
||||
$(top_srcdir)/regress/core/dumppoints \
|
||||
$(top_srcdir)/regress/core/dumpsegments \
|
||||
$(top_srcdir)/regress/core/empty \
|
||||
- $(top_srcdir)/regress/core/estimatedextent \
|
||||
$(top_srcdir)/regress/core/forcecurve \
|
||||
$(top_srcdir)/regress/core/flatgeobuf \
|
||||
$(top_srcdir)/regress/core/geography \
|
||||
@@ -55,7 +53,6 @@ TESTS += \
|
||||
$(top_srcdir)/regress/core/out_marc21 \
|
||||
$(top_srcdir)/regress/core/in_encodedpolyline \
|
||||
$(top_srcdir)/regress/core/iscollection \
|
||||
- $(top_srcdir)/regress/core/legacy \
|
||||
$(top_srcdir)/regress/core/letters \
|
||||
$(top_srcdir)/regress/core/long_xact \
|
||||
$(top_srcdir)/regress/core/lwgeom_regress \
|
||||
@@ -112,7 +109,6 @@ TESTS += \
|
||||
$(top_srcdir)/regress/core/temporal_knn \
|
||||
$(top_srcdir)/regress/core/tickets \
|
||||
$(top_srcdir)/regress/core/twkb \
|
||||
- $(top_srcdir)/regress/core/typmod \
|
||||
$(top_srcdir)/regress/core/wkb \
|
||||
$(top_srcdir)/regress/core/wkt \
|
||||
$(top_srcdir)/regress/core/wmsservers \
|
||||
@@ -144,11 +140,6 @@ TESTS_SLOW = \
|
||||
$(top_srcdir)/regress/core/concave_hull_hard \
|
||||
$(top_srcdir)/regress/core/knn_recheck
|
||||
|
||||
-ifeq ($(shell expr "$(POSTGIS_PGSQL_VERSION)" ">=" 120),1)
|
||||
- TESTS += \
|
||||
- $(top_srcdir)/regress/core/computed_columns
|
||||
-endif
|
||||
-
|
||||
ifeq ($(shell expr "$(POSTGIS_GEOS_VERSION)" ">=" 30700),1)
|
||||
# GEOS-3.7 adds:
|
||||
# ST_FrechetDistance
|
||||
diff --git a/regress/loader/tests.mk b/regress/loader/tests.mk
|
||||
index 1fc77ac..c3cb9de 100644
|
||||
--- a/regress/loader/tests.mk
|
||||
+++ b/regress/loader/tests.mk
|
||||
@@ -38,7 +38,5 @@ TESTS += \
|
||||
$(top_srcdir)/regress/loader/Latin1 \
|
||||
$(top_srcdir)/regress/loader/Latin1-implicit \
|
||||
$(top_srcdir)/regress/loader/mfile \
|
||||
- $(top_srcdir)/regress/loader/TestSkipANALYZE \
|
||||
- $(top_srcdir)/regress/loader/TestANALYZE \
|
||||
$(top_srcdir)/regress/loader/CharNoWidth
|
||||
|
||||
diff --git a/regress/run_test.pl b/regress/run_test.pl
|
||||
index 0ec5b2d..1c331f4 100755
|
||||
--- a/regress/run_test.pl
|
||||
+++ b/regress/run_test.pl
|
||||
@@ -147,7 +147,6 @@ $ENV{"LANG"} = "C";
|
||||
# Add locale info to the psql options
|
||||
# Add pg12 precision suppression
|
||||
my $PGOPTIONS = $ENV{"PGOPTIONS"};
|
||||
-$PGOPTIONS .= " -c lc_messages=C";
|
||||
$PGOPTIONS .= " -c client_min_messages=NOTICE";
|
||||
$PGOPTIONS .= " -c extra_float_digits=0";
|
||||
$ENV{"PGOPTIONS"} = $PGOPTIONS;
|
||||
218
docker-compose/ext-src/postgis-src/postgis-regular-v17.patch
Normal file
218
docker-compose/ext-src/postgis-src/postgis-regular-v17.patch
Normal file
@@ -0,0 +1,218 @@
|
||||
diff --git a/raster/test/regress/tests.mk b/raster/test/regress/tests.mk
|
||||
index 00918e1..7e2b6cd 100644
|
||||
--- a/raster/test/regress/tests.mk
|
||||
+++ b/raster/test/regress/tests.mk
|
||||
@@ -17,9 +17,7 @@ override RUNTESTFLAGS_INTERNAL := \
|
||||
$(RUNTESTFLAGS_INTERNAL) \
|
||||
--after-upgrade-script $(top_srcdir)/raster/test/regress/hooks/hook-after-upgrade-raster.sql
|
||||
|
||||
-RASTER_TEST_FIRST = \
|
||||
- $(top_srcdir)/raster/test/regress/check_gdal \
|
||||
- $(top_srcdir)/raster/test/regress/loader/load_outdb
|
||||
+RASTER_TEST_FIRST =
|
||||
|
||||
RASTER_TEST_LAST = \
|
||||
$(top_srcdir)/raster/test/regress/clean
|
||||
@@ -33,9 +31,7 @@ RASTER_TEST_IO = \
|
||||
|
||||
RASTER_TEST_BASIC_FUNC = \
|
||||
$(top_srcdir)/raster/test/regress/rt_bytea \
|
||||
- $(top_srcdir)/raster/test/regress/rt_wkb \
|
||||
$(top_srcdir)/raster/test/regress/box3d \
|
||||
- $(top_srcdir)/raster/test/regress/rt_addband \
|
||||
$(top_srcdir)/raster/test/regress/rt_band \
|
||||
$(top_srcdir)/raster/test/regress/rt_tile
|
||||
|
||||
@@ -73,16 +69,10 @@ RASTER_TEST_BANDPROPS = \
|
||||
$(top_srcdir)/raster/test/regress/rt_neighborhood \
|
||||
$(top_srcdir)/raster/test/regress/rt_nearestvalue \
|
||||
$(top_srcdir)/raster/test/regress/rt_pixelofvalue \
|
||||
- $(top_srcdir)/raster/test/regress/rt_polygon \
|
||||
- $(top_srcdir)/raster/test/regress/rt_setbandpath
|
||||
+ $(top_srcdir)/raster/test/regress/rt_polygon
|
||||
|
||||
RASTER_TEST_UTILITY = \
|
||||
$(top_srcdir)/raster/test/regress/rt_utility \
|
||||
- $(top_srcdir)/raster/test/regress/rt_fromgdalraster \
|
||||
- $(top_srcdir)/raster/test/regress/rt_asgdalraster \
|
||||
- $(top_srcdir)/raster/test/regress/rt_astiff \
|
||||
- $(top_srcdir)/raster/test/regress/rt_asjpeg \
|
||||
- $(top_srcdir)/raster/test/regress/rt_aspng \
|
||||
$(top_srcdir)/raster/test/regress/rt_reclass \
|
||||
$(top_srcdir)/raster/test/regress/rt_gdalwarp \
|
||||
$(top_srcdir)/raster/test/regress/rt_gdalcontour \
|
||||
@@ -120,21 +110,13 @@ RASTER_TEST_SREL = \
|
||||
|
||||
RASTER_TEST_BUGS = \
|
||||
$(top_srcdir)/raster/test/regress/bug_test_car5 \
|
||||
- $(top_srcdir)/raster/test/regress/permitted_gdal_drivers \
|
||||
$(top_srcdir)/raster/test/regress/tickets
|
||||
|
||||
RASTER_TEST_LOADER = \
|
||||
$(top_srcdir)/raster/test/regress/loader/Basic \
|
||||
$(top_srcdir)/raster/test/regress/loader/Projected \
|
||||
$(top_srcdir)/raster/test/regress/loader/BasicCopy \
|
||||
- $(top_srcdir)/raster/test/regress/loader/BasicFilename \
|
||||
- $(top_srcdir)/raster/test/regress/loader/BasicOutDB \
|
||||
- $(top_srcdir)/raster/test/regress/loader/Tiled10x10 \
|
||||
- $(top_srcdir)/raster/test/regress/loader/Tiled10x10Copy \
|
||||
- $(top_srcdir)/raster/test/regress/loader/Tiled8x8 \
|
||||
- $(top_srcdir)/raster/test/regress/loader/TiledAuto \
|
||||
- $(top_srcdir)/raster/test/regress/loader/TiledAutoSkipNoData \
|
||||
- $(top_srcdir)/raster/test/regress/loader/TiledAutoCopyn
|
||||
+ $(top_srcdir)/raster/test/regress/loader/BasicFilename
|
||||
|
||||
RASTER_TESTS := $(RASTER_TEST_FIRST) \
|
||||
$(RASTER_TEST_METADATA) $(RASTER_TEST_IO) $(RASTER_TEST_BASIC_FUNC) \
|
||||
diff --git a/regress/core/binary.sql b/regress/core/binary.sql
|
||||
index 7a36b65..ad78fc7 100644
|
||||
--- a/regress/core/binary.sql
|
||||
+++ b/regress/core/binary.sql
|
||||
@@ -1,4 +1,5 @@
|
||||
SET client_min_messages TO warning;
|
||||
+
|
||||
CREATE SCHEMA tm;
|
||||
|
||||
CREATE TABLE tm.geoms (id serial, g geometry);
|
||||
@@ -31,24 +32,39 @@ SELECT st_force4d(g) FROM tm.geoms WHERE id < 15 ORDER BY id;
|
||||
INSERT INTO tm.geoms(g)
|
||||
SELECT st_setsrid(g,4326) FROM tm.geoms ORDER BY id;
|
||||
|
||||
-COPY tm.geoms TO :tmpfile WITH BINARY;
|
||||
+-- define temp file path
|
||||
+\set tmpfile '/tmp/postgis_binary_test.dat'
|
||||
+
|
||||
+-- export
|
||||
+\set command '\\copy tm.geoms TO ':tmpfile' WITH (FORMAT BINARY)'
|
||||
+:command
|
||||
+
|
||||
+-- import
|
||||
CREATE TABLE tm.geoms_in AS SELECT * FROM tm.geoms LIMIT 0;
|
||||
-COPY tm.geoms_in FROM :tmpfile WITH BINARY;
|
||||
-SELECT 'geometry', count(*) FROM tm.geoms_in i, tm.geoms o WHERE i.id = o.id
|
||||
- AND ST_OrderingEquals(i.g, o.g);
|
||||
+\set command '\\copy tm.geoms_in FROM ':tmpfile' WITH (FORMAT BINARY)'
|
||||
+:command
|
||||
+
|
||||
+SELECT 'geometry', count(*) FROM tm.geoms_in i, tm.geoms o
|
||||
+WHERE i.id = o.id AND ST_OrderingEquals(i.g, o.g);
|
||||
|
||||
CREATE TABLE tm.geogs AS SELECT id,g::geography FROM tm.geoms
|
||||
WHERE geometrytype(g) NOT LIKE '%CURVE%'
|
||||
AND geometrytype(g) NOT LIKE '%CIRCULAR%'
|
||||
AND geometrytype(g) NOT LIKE '%SURFACE%'
|
||||
AND geometrytype(g) NOT LIKE 'TRIANGLE%'
|
||||
- AND geometrytype(g) NOT LIKE 'TIN%'
|
||||
-;
|
||||
+ AND geometrytype(g) NOT LIKE 'TIN%';
|
||||
|
||||
-COPY tm.geogs TO :tmpfile WITH BINARY;
|
||||
+-- export
|
||||
+\set command '\\copy tm.geogs TO ':tmpfile' WITH (FORMAT BINARY)'
|
||||
+:command
|
||||
+
|
||||
+-- import
|
||||
CREATE TABLE tm.geogs_in AS SELECT * FROM tm.geogs LIMIT 0;
|
||||
-COPY tm.geogs_in FROM :tmpfile WITH BINARY;
|
||||
-SELECT 'geometry', count(*) FROM tm.geogs_in i, tm.geogs o WHERE i.id = o.id
|
||||
- AND ST_OrderingEquals(i.g::geometry, o.g::geometry);
|
||||
+\set command '\\copy tm.geogs_in FROM ':tmpfile' WITH (FORMAT BINARY)'
|
||||
+:command
|
||||
+
|
||||
+SELECT 'geometry', count(*) FROM tm.geogs_in i, tm.geogs o
|
||||
+WHERE i.id = o.id AND ST_OrderingEquals(i.g::geometry, o.g::geometry);
|
||||
|
||||
DROP SCHEMA tm CASCADE;
|
||||
+
|
||||
diff --git a/regress/core/tests.mk b/regress/core/tests.mk
|
||||
index 9e05244..a63a3e1 100644
|
||||
--- a/regress/core/tests.mk
|
||||
+++ b/regress/core/tests.mk
|
||||
@@ -16,14 +16,13 @@ POSTGIS_PGSQL_VERSION=170
|
||||
POSTGIS_GEOS_VERSION=31101
|
||||
HAVE_JSON=yes
|
||||
HAVE_SPGIST=yes
|
||||
-INTERRUPTTESTS=yes
|
||||
+INTERRUPTTESTS=no
|
||||
|
||||
current_dir := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
|
||||
|
||||
RUNTESTFLAGS_INTERNAL += \
|
||||
--before-upgrade-script $(top_srcdir)/regress/hooks/hook-before-upgrade.sql \
|
||||
--after-upgrade-script $(top_srcdir)/regress/hooks/hook-after-upgrade.sql \
|
||||
- --after-create-script $(top_srcdir)/regress/hooks/hook-after-create.sql \
|
||||
--before-uninstall-script $(top_srcdir)/regress/hooks/hook-before-uninstall.sql
|
||||
|
||||
TESTS += \
|
||||
@@ -40,7 +39,6 @@ TESTS += \
|
||||
$(top_srcdir)/regress/core/dumppoints \
|
||||
$(top_srcdir)/regress/core/dumpsegments \
|
||||
$(top_srcdir)/regress/core/empty \
|
||||
- $(top_srcdir)/regress/core/estimatedextent \
|
||||
$(top_srcdir)/regress/core/forcecurve \
|
||||
$(top_srcdir)/regress/core/flatgeobuf \
|
||||
$(top_srcdir)/regress/core/frechet \
|
||||
@@ -60,7 +58,6 @@ TESTS += \
|
||||
$(top_srcdir)/regress/core/out_marc21 \
|
||||
$(top_srcdir)/regress/core/in_encodedpolyline \
|
||||
$(top_srcdir)/regress/core/iscollection \
|
||||
- $(top_srcdir)/regress/core/legacy \
|
||||
$(top_srcdir)/regress/core/letters \
|
||||
$(top_srcdir)/regress/core/lwgeom_regress \
|
||||
$(top_srcdir)/regress/core/measures \
|
||||
@@ -119,7 +116,6 @@ TESTS += \
|
||||
$(top_srcdir)/regress/core/temporal_knn \
|
||||
$(top_srcdir)/regress/core/tickets \
|
||||
$(top_srcdir)/regress/core/twkb \
|
||||
- $(top_srcdir)/regress/core/typmod \
|
||||
$(top_srcdir)/regress/core/wkb \
|
||||
$(top_srcdir)/regress/core/wkt \
|
||||
$(top_srcdir)/regress/core/wmsservers \
|
||||
@@ -143,8 +139,7 @@ TESTS += \
|
||||
$(top_srcdir)/regress/core/oriented_envelope \
|
||||
$(top_srcdir)/regress/core/point_coordinates \
|
||||
$(top_srcdir)/regress/core/out_geojson \
|
||||
- $(top_srcdir)/regress/core/wrapx \
|
||||
- $(top_srcdir)/regress/core/computed_columns
|
||||
+ $(top_srcdir)/regress/core/wrapx
|
||||
|
||||
# Slow slow tests
|
||||
TESTS_SLOW = \
|
||||
diff --git a/regress/loader/tests.mk b/regress/loader/tests.mk
|
||||
index ac4f8ad..4bad4fc 100644
|
||||
--- a/regress/loader/tests.mk
|
||||
+++ b/regress/loader/tests.mk
|
||||
@@ -38,7 +38,5 @@ TESTS += \
|
||||
$(top_srcdir)/regress/loader/Latin1 \
|
||||
$(top_srcdir)/regress/loader/Latin1-implicit \
|
||||
$(top_srcdir)/regress/loader/mfile \
|
||||
- $(top_srcdir)/regress/loader/TestSkipANALYZE \
|
||||
- $(top_srcdir)/regress/loader/TestANALYZE \
|
||||
$(top_srcdir)/regress/loader/CharNoWidth \
|
||||
|
||||
diff --git a/regress/run_test.pl b/regress/run_test.pl
|
||||
index cac4b2e..4c7c82b 100755
|
||||
--- a/regress/run_test.pl
|
||||
+++ b/regress/run_test.pl
|
||||
@@ -238,7 +238,6 @@ $ENV{"LANG"} = "C";
|
||||
# Add locale info to the psql options
|
||||
# Add pg12 precision suppression
|
||||
my $PGOPTIONS = $ENV{"PGOPTIONS"};
|
||||
-$PGOPTIONS .= " -c lc_messages=C";
|
||||
$PGOPTIONS .= " -c client_min_messages=NOTICE";
|
||||
$PGOPTIONS .= " -c extra_float_digits=0";
|
||||
$ENV{"PGOPTIONS"} = $PGOPTIONS;
|
||||
diff --git a/topology/test/tests.mk b/topology/test/tests.mk
|
||||
index cbe2633..2c7c18f 100644
|
||||
--- a/topology/test/tests.mk
|
||||
+++ b/topology/test/tests.mk
|
||||
@@ -46,9 +46,7 @@ TESTS += \
|
||||
$(top_srcdir)/topology/test/regress/legacy_query.sql \
|
||||
$(top_srcdir)/topology/test/regress/legacy_validate.sql \
|
||||
$(top_srcdir)/topology/test/regress/polygonize.sql \
|
||||
- $(top_srcdir)/topology/test/regress/populate_topology_layer.sql \
|
||||
$(top_srcdir)/topology/test/regress/removeunusedprimitives.sql \
|
||||
- $(top_srcdir)/topology/test/regress/renametopogeometrycolumn.sql \
|
||||
$(top_srcdir)/topology/test/regress/renametopology.sql \
|
||||
$(top_srcdir)/topology/test/regress/share_sequences.sql \
|
||||
$(top_srcdir)/topology/test/regress/sqlmm.sql \
|
||||
46
docker-compose/ext-src/postgis-src/raster_outdb_template.sql
Normal file
46
docker-compose/ext-src/postgis-src/raster_outdb_template.sql
Normal file
File diff suppressed because one or more lines are too long
17
docker-compose/ext-src/postgis-src/regular-test.sh
Executable file
17
docker-compose/ext-src/postgis-src/regular-test.sh
Executable file
@@ -0,0 +1,17 @@
|
||||
#!/bin/bash
|
||||
set -ex
|
||||
cd "$(dirname "${0}")"
|
||||
dropdb --if-exist contrib_regression
|
||||
createdb contrib_regression
|
||||
psql -d contrib_regression -c "ALTER DATABASE contrib_regression SET TimeZone='UTC'" \
|
||||
-c "ALTER DATABASE contrib_regression SET DateStyle='ISO, MDY'" \
|
||||
-c "CREATE EXTENSION postgis SCHEMA public" \
|
||||
-c "CREATE EXTENSION postgis_topology" \
|
||||
-c "CREATE EXTENSION postgis_tiger_geocoder CASCADE" \
|
||||
-c "CREATE EXTENSION postgis_raster SCHEMA public" \
|
||||
-c "CREATE EXTENSION postgis_sfcgal SCHEMA public"
|
||||
patch -p1 <postgis-no-upgrade-test.patch
|
||||
patch -p1 <"postgis-regular-${PG_VERSION}.patch"
|
||||
psql -d contrib_regression -f raster_outdb_template.sql
|
||||
trap 'patch -R -p1 <postgis-no-upgrade-test.patch && patch -R -p1 <"postgis-regular-${PG_VERSION}.patch"' EXIT
|
||||
POSTGIS_REGRESS_DB=contrib_regression RUNTESTFLAGS=--nocreate make installcheck-base
|
||||
@@ -63,5 +63,9 @@ done
|
||||
for d in ${FAILED}; do
|
||||
cat "$(find $d -name regression.diffs)"
|
||||
done
|
||||
for postgis_diff in /tmp/pgis_reg/*_diff; do
|
||||
echo "${postgis_diff}:"
|
||||
cat "${postgis_diff}"
|
||||
done
|
||||
echo "${FAILED}"
|
||||
exit 1
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
//! provide it by calling the compute_ctl's `/compute_ctl` endpoint, or
|
||||
//! compute_ctl can fetch it by calling the control plane's API.
|
||||
use std::collections::HashMap;
|
||||
use std::fmt::Display;
|
||||
|
||||
use indexmap::IndexMap;
|
||||
use regex::Regex;
|
||||
@@ -178,9 +179,9 @@ pub struct ComputeSpec {
|
||||
/// JWT for authorizing requests to endpoint storage service
|
||||
pub endpoint_storage_token: Option<String>,
|
||||
|
||||
/// If true, download LFC state from endpoint_storage and pass it to Postgres on startup
|
||||
/// Download LFC state from endpoint_storage and pass it to Postgres on startup
|
||||
#[serde(default)]
|
||||
pub prewarm_lfc_on_startup: bool,
|
||||
pub autoprewarm: bool,
|
||||
}
|
||||
|
||||
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
|
||||
@@ -192,6 +193,9 @@ pub enum ComputeFeature {
|
||||
/// track short-lived connections as user activity.
|
||||
ActivityMonitorExperimental,
|
||||
|
||||
/// Enable TLS functionality.
|
||||
TlsExperimental,
|
||||
|
||||
/// This is a special feature flag that is used to represent unknown feature flags.
|
||||
/// Basically all unknown to enum flags are represented as this one. See unit test
|
||||
/// `parse_unknown_features()` for more details.
|
||||
@@ -250,34 +254,44 @@ impl RemoteExtSpec {
|
||||
}
|
||||
|
||||
match self.extension_data.get(real_ext_name) {
|
||||
Some(_ext_data) => {
|
||||
// We have decided to use the Go naming convention due to Kubernetes.
|
||||
|
||||
let arch = match std::env::consts::ARCH {
|
||||
"x86_64" => "amd64",
|
||||
"aarch64" => "arm64",
|
||||
arch => arch,
|
||||
};
|
||||
|
||||
// Construct the path to the extension archive
|
||||
// BUILD_TAG/PG_MAJOR_VERSION/extensions/EXTENSION_NAME.tar.zst
|
||||
//
|
||||
// Keep it in sync with path generation in
|
||||
// https://github.com/neondatabase/build-custom-extensions/tree/main
|
||||
let archive_path_str = format!(
|
||||
"{build_tag}/{arch}/{pg_major_version}/extensions/{real_ext_name}.tar.zst"
|
||||
);
|
||||
Ok((
|
||||
real_ext_name.to_string(),
|
||||
RemotePath::from_string(&archive_path_str)?,
|
||||
))
|
||||
}
|
||||
Some(_ext_data) => Ok((
|
||||
real_ext_name.to_string(),
|
||||
Self::build_remote_path(build_tag, pg_major_version, real_ext_name)?,
|
||||
)),
|
||||
None => Err(anyhow::anyhow!(
|
||||
"real_ext_name {} is not found",
|
||||
real_ext_name
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the architecture-specific portion of the remote extension path. We
|
||||
/// use the Go naming convention due to Kubernetes.
|
||||
fn get_arch() -> &'static str {
|
||||
match std::env::consts::ARCH {
|
||||
"x86_64" => "amd64",
|
||||
"aarch64" => "arm64",
|
||||
arch => arch,
|
||||
}
|
||||
}
|
||||
|
||||
/// Build a [`RemotePath`] for an extension.
|
||||
fn build_remote_path(
|
||||
build_tag: &str,
|
||||
pg_major_version: &str,
|
||||
ext_name: &str,
|
||||
) -> anyhow::Result<RemotePath> {
|
||||
let arch = Self::get_arch();
|
||||
|
||||
// Construct the path to the extension archive
|
||||
// BUILD_TAG/PG_MAJOR_VERSION/extensions/EXTENSION_NAME.tar.zst
|
||||
//
|
||||
// Keep it in sync with path generation in
|
||||
// https://github.com/neondatabase/build-custom-extensions/tree/main
|
||||
RemotePath::from_string(&format!(
|
||||
"{build_tag}/{arch}/{pg_major_version}/extensions/{ext_name}.tar.zst"
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
|
||||
@@ -306,6 +320,12 @@ impl ComputeMode {
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for ComputeMode {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.write_str(self.to_type_str())
|
||||
}
|
||||
}
|
||||
|
||||
/// Log level for audit logging
|
||||
#[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
|
||||
pub enum ComputeAudit {
|
||||
@@ -518,6 +538,37 @@ mod tests {
|
||||
.expect("Library should be found");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn remote_extension_path() {
|
||||
let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({
|
||||
"public_extensions": ["ext"],
|
||||
"custom_extensions": [],
|
||||
"library_index": {
|
||||
"extlib": "ext",
|
||||
},
|
||||
"extension_data": {
|
||||
"ext": {
|
||||
"control_data": {
|
||||
"ext.control": ""
|
||||
},
|
||||
"archive_path": ""
|
||||
}
|
||||
},
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let (_ext_name, ext_path) = rspec
|
||||
.get_ext("ext", false, "latest", "v17")
|
||||
.expect("Extension should be found");
|
||||
// Starting with a forward slash would have consequences for the
|
||||
// Url::join() that occurs when downloading a remote extension.
|
||||
assert!(!ext_path.to_string().starts_with("/"));
|
||||
assert_eq!(
|
||||
ext_path,
|
||||
RemoteExtSpec::build_remote_path("latest", "v17", "ext").unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_spec_file() {
|
||||
let file = File::open("tests/cluster_spec.json").unwrap();
|
||||
|
||||
@@ -85,7 +85,7 @@
|
||||
"vartype": "bool"
|
||||
},
|
||||
{
|
||||
"name": "prewarm_lfc_on_startup",
|
||||
"name": "autoprewarm",
|
||||
"value": "off",
|
||||
"vartype": "bool"
|
||||
},
|
||||
|
||||
@@ -107,7 +107,7 @@ impl<const N: usize> MetricType for HyperLogLogState<N> {
|
||||
}
|
||||
|
||||
impl<const N: usize> HyperLogLogState<N> {
|
||||
pub fn measure(&self, item: &impl Hash) {
|
||||
pub fn measure(&self, item: &(impl Hash + ?Sized)) {
|
||||
// changing the hasher will break compatibility with previous measurements.
|
||||
self.record(BuildHasherDefault::<xxh3::Hash64>::default().hash_one(item));
|
||||
}
|
||||
|
||||
@@ -27,6 +27,7 @@ pub use prometheus::{
|
||||
|
||||
pub mod launch_timestamp;
|
||||
mod wrappers;
|
||||
pub use prometheus;
|
||||
pub use wrappers::{CountedReader, CountedWriter};
|
||||
mod hll;
|
||||
pub use hll::{HyperLogLog, HyperLogLogState, HyperLogLogVec};
|
||||
|
||||
@@ -6,8 +6,13 @@ license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
thiserror.workspace = true
|
||||
nix.workspace=true
|
||||
nix.workspace = true
|
||||
spin.workspace = true
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
|
||||
[dev-dependencies]
|
||||
rand = "0.9.1"
|
||||
rand_distr = "0.5.1"
|
||||
|
||||
[target.'cfg(target_os = "macos")'.dependencies]
|
||||
tempfile = "3.14.0"
|
||||
|
||||
366
libs/neon-shmem/src/hash.rs
Normal file
366
libs/neon-shmem/src/hash.rs
Normal file
@@ -0,0 +1,366 @@
|
||||
//! Hash table implementation on top of 'shmem'
|
||||
//!
|
||||
//! Features required in the long run by the communicator project:
|
||||
//!
|
||||
//! [X] Accessible from both Postgres processes and rust threads in the communicator process
|
||||
//! [X] Low latency
|
||||
//! [ ] Scalable to lots of concurrent accesses (currently uses a single spinlock)
|
||||
//! [ ] Resizable
|
||||
|
||||
use std::fmt::Debug;
|
||||
use std::hash::Hash;
|
||||
use std::mem::MaybeUninit;
|
||||
use std::ops::Deref;
|
||||
|
||||
use crate::shmem::ShmemHandle;
|
||||
|
||||
use spin;
|
||||
|
||||
mod core;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
use core::CoreHashMap;
|
||||
|
||||
pub enum UpdateAction<V> {
|
||||
Nothing,
|
||||
Insert(V),
|
||||
Remove,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct OutOfMemoryError();
|
||||
|
||||
pub struct HashMapInit<'a, K, V> {
|
||||
// Hash table can be allocated in a fixed memory area, or in a resizeable ShmemHandle.
|
||||
shmem_handle: Option<ShmemHandle>,
|
||||
shared_ptr: *mut HashMapShared<'a, K, V>,
|
||||
}
|
||||
|
||||
pub struct HashMapAccess<'a, K, V> {
|
||||
shmem_handle: Option<ShmemHandle>,
|
||||
shared_ptr: *mut HashMapShared<'a, K, V>,
|
||||
}
|
||||
|
||||
unsafe impl<'a, K: Sync, V: Sync> Sync for HashMapAccess<'a, K, V> {}
|
||||
unsafe impl<'a, K: Send, V: Send> Send for HashMapAccess<'a, K, V> {}
|
||||
|
||||
impl<'a, K, V> HashMapInit<'a, K, V> {
|
||||
pub fn attach_writer(self) -> HashMapAccess<'a, K, V> {
|
||||
HashMapAccess {
|
||||
shmem_handle: self.shmem_handle,
|
||||
shared_ptr: self.shared_ptr,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn attach_reader(self) -> HashMapAccess<'a, K, V> {
|
||||
// no difference to attach_writer currently
|
||||
self.attach_writer()
|
||||
}
|
||||
}
|
||||
|
||||
// This is stored in the shared memory area
|
||||
struct HashMapShared<'a, K, V> {
|
||||
inner: spin::RwLock<CoreHashMap<'a, K, V>>,
|
||||
}
|
||||
|
||||
impl<'a, K, V> HashMapInit<'a, K, V>
|
||||
where
|
||||
K: Clone + Hash + Eq,
|
||||
{
|
||||
pub fn estimate_size(num_buckets: u32) -> usize {
|
||||
// add some margin to cover alignment etc.
|
||||
CoreHashMap::<K, V>::estimate_size(num_buckets) + size_of::<HashMapShared<K, V>>() + 1000
|
||||
}
|
||||
|
||||
pub fn init_in_fixed_area(
|
||||
num_buckets: u32,
|
||||
area: &'a mut [MaybeUninit<u8>],
|
||||
) -> HashMapInit<'a, K, V> {
|
||||
Self::init_common(num_buckets, None, area.as_mut_ptr().cast(), area.len())
|
||||
}
|
||||
|
||||
/// Initialize a new hash map in the given shared memory area
|
||||
pub fn init_in_shmem(num_buckets: u32, mut shmem: ShmemHandle) -> HashMapInit<'a, K, V> {
|
||||
let size = Self::estimate_size(num_buckets);
|
||||
shmem
|
||||
.set_size(size)
|
||||
.expect("could not resize shared memory area");
|
||||
|
||||
let ptr = unsafe { shmem.data_ptr.as_mut() };
|
||||
Self::init_common(num_buckets, Some(shmem), ptr, size)
|
||||
}
|
||||
|
||||
fn init_common(
|
||||
num_buckets: u32,
|
||||
shmem_handle: Option<ShmemHandle>,
|
||||
area_ptr: *mut u8,
|
||||
area_len: usize,
|
||||
) -> HashMapInit<'a, K, V> {
|
||||
// carve out HashMapShared from the area. This does not include the hashmap's dictionary
|
||||
// and buckets.
|
||||
let mut ptr: *mut u8 = area_ptr;
|
||||
ptr = unsafe { ptr.add(ptr.align_offset(align_of::<HashMapShared<K, V>>())) };
|
||||
let shared_ptr: *mut HashMapShared<K, V> = ptr.cast();
|
||||
ptr = unsafe { ptr.add(size_of::<HashMapShared<K, V>>()) };
|
||||
|
||||
// the rest of the space is given to the hash map's dictionary and buckets
|
||||
let remaining_area = unsafe {
|
||||
std::slice::from_raw_parts_mut(ptr, area_len - ptr.offset_from(area_ptr) as usize)
|
||||
};
|
||||
|
||||
let hashmap = CoreHashMap::new(num_buckets, remaining_area);
|
||||
unsafe {
|
||||
std::ptr::write(
|
||||
shared_ptr,
|
||||
HashMapShared {
|
||||
inner: spin::RwLock::new(hashmap),
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
HashMapInit {
|
||||
shmem_handle: shmem_handle,
|
||||
shared_ptr,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, K, V> HashMapAccess<'a, K, V>
|
||||
where
|
||||
K: Clone + Hash + Eq,
|
||||
{
|
||||
pub fn get<'e>(&'e self, key: &K) -> Option<ValueReadGuard<'e, K, V>> {
|
||||
let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
|
||||
let lock_guard = map.inner.read();
|
||||
|
||||
match lock_guard.get(key) {
|
||||
None => None,
|
||||
Some(val_ref) => {
|
||||
let val_ptr = std::ptr::from_ref(val_ref);
|
||||
Some(ValueReadGuard {
|
||||
_lock_guard: lock_guard,
|
||||
value: val_ptr,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Insert a value
|
||||
pub fn insert(&self, key: &K, value: V) -> Result<bool, OutOfMemoryError> {
|
||||
let mut success = None;
|
||||
|
||||
self.update_with_fn(key, |existing| {
|
||||
if let Some(_) = existing {
|
||||
success = Some(false);
|
||||
UpdateAction::Nothing
|
||||
} else {
|
||||
success = Some(true);
|
||||
UpdateAction::Insert(value)
|
||||
}
|
||||
})?;
|
||||
Ok(success.expect("value_fn not called"))
|
||||
}
|
||||
|
||||
/// Remove value. Returns true if it existed
|
||||
pub fn remove(&self, key: &K) -> bool {
|
||||
let mut result = false;
|
||||
self.update_with_fn(key, |existing| match existing {
|
||||
Some(_) => {
|
||||
result = true;
|
||||
UpdateAction::Remove
|
||||
}
|
||||
None => UpdateAction::Nothing,
|
||||
})
|
||||
.expect("out of memory while removing");
|
||||
result
|
||||
}
|
||||
|
||||
/// Update key using the given function. All the other modifying operations are based on this.
|
||||
pub fn update_with_fn<F>(&self, key: &K, value_fn: F) -> Result<(), OutOfMemoryError>
|
||||
where
|
||||
F: FnOnce(Option<&V>) -> UpdateAction<V>,
|
||||
{
|
||||
let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
|
||||
let mut lock_guard = map.inner.write();
|
||||
|
||||
let old_val = lock_guard.get(key);
|
||||
let action = value_fn(old_val);
|
||||
match (old_val, action) {
|
||||
(_, UpdateAction::Nothing) => {}
|
||||
(_, UpdateAction::Insert(new_val)) => {
|
||||
let _ = lock_guard.insert(key, new_val);
|
||||
}
|
||||
(None, UpdateAction::Remove) => panic!("Remove action with no old value"),
|
||||
(Some(_), UpdateAction::Remove) => {
|
||||
let _ = lock_guard.remove(key);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Update key using the given function. All the other modifying operations are based on this.
|
||||
pub fn update_with_fn_at_bucket<F>(
|
||||
&self,
|
||||
pos: usize,
|
||||
value_fn: F,
|
||||
) -> Result<(), OutOfMemoryError>
|
||||
where
|
||||
F: FnOnce(Option<&V>) -> UpdateAction<V>,
|
||||
{
|
||||
let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
|
||||
let mut lock_guard = map.inner.write();
|
||||
|
||||
let old_val = lock_guard.get_bucket(pos);
|
||||
let action = value_fn(old_val.map(|(_k, v)| v));
|
||||
match (old_val, action) {
|
||||
(_, UpdateAction::Nothing) => {}
|
||||
(_, UpdateAction::Insert(_new_val)) => panic!("cannot insert without key"),
|
||||
(None, UpdateAction::Remove) => panic!("Remove action with no old value"),
|
||||
(Some((key, _value)), UpdateAction::Remove) => {
|
||||
let key = key.clone();
|
||||
let _ = lock_guard.remove(&key);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn get_num_buckets(&self) -> usize {
|
||||
let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
|
||||
map.inner.read().get_num_buckets()
|
||||
}
|
||||
|
||||
/// Return the key and value stored in bucket with given index. This can be used to
|
||||
/// iterate through the hash map. (An Iterator might be nicer. The communicator's
|
||||
/// clock algorithm needs to _slowly_ iterate through all buckets with its clock hand,
|
||||
/// without holding a lock. If we switch to an Iterator, it must not hold the lock.)
|
||||
pub fn get_bucket<'e>(&'e self, pos: usize) -> Option<ValueReadGuard<'e, K, V>> {
|
||||
let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
|
||||
let lock_guard = map.inner.read();
|
||||
|
||||
match lock_guard.get_bucket(pos) {
|
||||
None => None,
|
||||
Some((_key, val_ref)) => {
|
||||
let val_ptr = std::ptr::from_ref(val_ref);
|
||||
Some(ValueReadGuard {
|
||||
_lock_guard: lock_guard,
|
||||
value: val_ptr,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// for metrics
|
||||
pub fn get_num_buckets_in_use(&self) -> usize {
|
||||
let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
|
||||
map.inner.read().buckets_in_use as usize
|
||||
}
|
||||
|
||||
/// Grow
|
||||
///
|
||||
/// 1. grow the underlying shared memory area
|
||||
/// 2. Initialize new buckets. This overwrites the current dictionary
|
||||
/// 3. Recalculate the dictionary
|
||||
pub fn grow(&self, num_buckets: u32) -> Result<(), crate::shmem::Error> {
|
||||
let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
|
||||
let mut lock_guard = map.inner.write();
|
||||
let inner = &mut *lock_guard;
|
||||
let old_num_buckets = inner.buckets.len() as u32;
|
||||
|
||||
if num_buckets < old_num_buckets {
|
||||
panic!("grow called with a smaller number of buckets");
|
||||
}
|
||||
if num_buckets == old_num_buckets {
|
||||
return Ok(());
|
||||
}
|
||||
let shmem_handle = self
|
||||
.shmem_handle
|
||||
.as_ref()
|
||||
.expect("grow called on a fixed-size hash table");
|
||||
|
||||
let size_bytes = HashMapInit::<K, V>::estimate_size(num_buckets);
|
||||
shmem_handle.set_size(size_bytes)?;
|
||||
let end_ptr: *mut u8 = unsafe { shmem_handle.data_ptr.as_ptr().add(size_bytes) };
|
||||
|
||||
// Initialize new buckets. The new buckets are linked to the free list. NB: This overwrites
|
||||
// the dictionary!
|
||||
let buckets_ptr = inner.buckets.as_mut_ptr();
|
||||
unsafe {
|
||||
for i in old_num_buckets..num_buckets {
|
||||
let bucket_ptr = buckets_ptr.add(i as usize);
|
||||
bucket_ptr.write(core::Bucket {
|
||||
hash: 0,
|
||||
next: if i < num_buckets {
|
||||
i as u32 + 1
|
||||
} else {
|
||||
inner.free_head
|
||||
},
|
||||
inner: None,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Recalculate the dictionary
|
||||
let buckets;
|
||||
let dictionary;
|
||||
unsafe {
|
||||
let buckets_end_ptr = buckets_ptr.add(num_buckets as usize);
|
||||
let dictionary_ptr: *mut u32 = buckets_end_ptr
|
||||
.byte_add(buckets_end_ptr.align_offset(align_of::<u32>()))
|
||||
.cast();
|
||||
let dictionary_size: usize =
|
||||
end_ptr.byte_offset_from(buckets_end_ptr) as usize / size_of::<u32>();
|
||||
|
||||
buckets = std::slice::from_raw_parts_mut(buckets_ptr, num_buckets as usize);
|
||||
dictionary = std::slice::from_raw_parts_mut(dictionary_ptr, dictionary_size);
|
||||
}
|
||||
for i in 0..dictionary.len() {
|
||||
dictionary[i] = core::INVALID_POS;
|
||||
}
|
||||
|
||||
for i in 0..old_num_buckets as usize {
|
||||
if buckets[i].inner.is_none() {
|
||||
continue;
|
||||
}
|
||||
let pos: usize = (buckets[i].hash % dictionary.len() as u64) as usize;
|
||||
buckets[i].next = dictionary[pos];
|
||||
dictionary[pos] = i as u32;
|
||||
}
|
||||
|
||||
// Finally, update the CoreHashMap struct
|
||||
inner.dictionary = dictionary;
|
||||
inner.buckets = buckets;
|
||||
inner.free_head = old_num_buckets;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// TODO: Shrinking is a multi-step process that requires co-operation from the caller
|
||||
//
|
||||
// 1. The caller must first call begin_shrink(). That forbids allocation of higher-numbered
|
||||
// buckets.
|
||||
//
|
||||
// 2. Next, the caller must evict all entries in higher-numbered buckets.
|
||||
//
|
||||
// 3. Finally, call finish_shrink(). This recomputes the dictionary and shrinks the underlying
|
||||
// shmem area
|
||||
}
|
||||
|
||||
pub struct ValueReadGuard<'a, K, V> {
|
||||
_lock_guard: spin::RwLockReadGuard<'a, CoreHashMap<'a, K, V>>,
|
||||
value: *const V,
|
||||
}
|
||||
|
||||
impl<'a, K, V> Deref for ValueReadGuard<'a, K, V> {
|
||||
type Target = V;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
// SAFETY: The `lock_guard` ensures that the underlying map (and thus the value pointed to
|
||||
// by `value`) remains valid for the lifetime `'a`. The `value` has been obtained from a
|
||||
// valid reference within the map.
|
||||
unsafe { &*self.value }
|
||||
}
|
||||
}
|
||||
233
libs/neon-shmem/src/hash/core.rs
Normal file
233
libs/neon-shmem/src/hash/core.rs
Normal file
@@ -0,0 +1,233 @@
|
||||
//! Simple hash table with chaining
|
||||
//!
|
||||
//! # Resizing
|
||||
//!
|
||||
|
||||
use std::hash::{DefaultHasher, Hash, Hasher};
|
||||
use std::mem::MaybeUninit;
|
||||
|
||||
pub(crate) const INVALID_POS: u32 = u32::MAX;
|
||||
|
||||
// Bucket
|
||||
pub(crate) struct Bucket<K, V> {
|
||||
pub(crate) hash: u64,
|
||||
pub(crate) next: u32,
|
||||
pub(crate) inner: Option<(K, V)>,
|
||||
}
|
||||
|
||||
pub(crate) struct CoreHashMap<'a, K, V> {
|
||||
pub(crate) dictionary: &'a mut [u32],
|
||||
pub(crate) buckets: &'a mut [Bucket<K, V>],
|
||||
pub(crate) free_head: u32,
|
||||
|
||||
// metrics
|
||||
pub(crate) buckets_in_use: u32,
|
||||
}
|
||||
|
||||
pub struct FullError();
|
||||
|
||||
impl<'a, K, V> CoreHashMap<'a, K, V>
|
||||
where
|
||||
K: Clone + Hash + Eq,
|
||||
{
|
||||
const FILL_FACTOR: f32 = 0.60;
|
||||
|
||||
pub fn estimate_size(num_buckets: u32) -> usize {
|
||||
let mut size = 0;
|
||||
|
||||
// buckets
|
||||
size += size_of::<Bucket<K, V>>() * num_buckets as usize;
|
||||
|
||||
// dictionary
|
||||
size += (f32::ceil((size_of::<u32>() * num_buckets as usize) as f32 / Self::FILL_FACTOR))
|
||||
as usize;
|
||||
|
||||
size
|
||||
}
|
||||
|
||||
pub fn new(num_buckets: u32, area: &'a mut [u8]) -> CoreHashMap<'a, K, V> {
|
||||
let len = area.len();
|
||||
|
||||
let mut ptr: *mut u8 = area.as_mut_ptr();
|
||||
let end_ptr: *mut u8 = unsafe { area.as_mut_ptr().add(len) };
|
||||
|
||||
// carve out the buckets
|
||||
ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::<Bucket<K, V>>())) };
|
||||
let buckets_ptr = ptr;
|
||||
ptr = unsafe { ptr.add(size_of::<Bucket<K, V>>() * num_buckets as usize) };
|
||||
|
||||
// use remaining space for the dictionary
|
||||
ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::<u32>())) };
|
||||
let dictionary_ptr = ptr;
|
||||
|
||||
assert!(ptr.addr() < end_ptr.addr());
|
||||
let dictionary_size = unsafe { end_ptr.byte_offset_from(ptr) / size_of::<u32>() as isize };
|
||||
assert!(dictionary_size > 0);
|
||||
|
||||
// Initialize the buckets
|
||||
let buckets = {
|
||||
let buckets_ptr: *mut MaybeUninit<Bucket<K, V>> = buckets_ptr.cast();
|
||||
let buckets =
|
||||
unsafe { std::slice::from_raw_parts_mut(buckets_ptr, num_buckets as usize) };
|
||||
for i in 0..buckets.len() {
|
||||
buckets[i].write(Bucket {
|
||||
hash: 0,
|
||||
next: if i < buckets.len() - 1 {
|
||||
i as u32 + 1
|
||||
} else {
|
||||
INVALID_POS
|
||||
},
|
||||
inner: None,
|
||||
});
|
||||
}
|
||||
// TODO: use std::slice::assume_init_mut() once it stabilizes
|
||||
unsafe { std::slice::from_raw_parts_mut(buckets_ptr.cast(), num_buckets as usize) }
|
||||
};
|
||||
|
||||
// Initialize the dictionary
|
||||
let dictionary = {
|
||||
let dictionary_ptr: *mut MaybeUninit<u32> = dictionary_ptr.cast();
|
||||
let dictionary =
|
||||
unsafe { std::slice::from_raw_parts_mut(dictionary_ptr, dictionary_size as usize) };
|
||||
|
||||
for i in 0..dictionary.len() {
|
||||
dictionary[i].write(INVALID_POS);
|
||||
}
|
||||
// TODO: use std::slice::assume_init_mut() once it stabilizes
|
||||
unsafe {
|
||||
std::slice::from_raw_parts_mut(dictionary_ptr.cast(), dictionary_size as usize)
|
||||
}
|
||||
};
|
||||
|
||||
CoreHashMap {
|
||||
dictionary,
|
||||
buckets,
|
||||
free_head: 0,
|
||||
buckets_in_use: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get(&self, key: &K) -> Option<&V> {
|
||||
let mut hasher = DefaultHasher::new();
|
||||
key.hash(&mut hasher);
|
||||
let hash = hasher.finish();
|
||||
|
||||
let mut next = self.dictionary[hash as usize % self.dictionary.len()];
|
||||
loop {
|
||||
if next == INVALID_POS {
|
||||
return None;
|
||||
}
|
||||
|
||||
let bucket = &self.buckets[next as usize];
|
||||
let (bucket_key, bucket_value) = bucket.inner.as_ref().expect("entry is in use");
|
||||
if bucket_key == key {
|
||||
return Some(&bucket_value);
|
||||
}
|
||||
next = bucket.next;
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, key: &K, value: V) -> Result<(), FullError> {
|
||||
let mut hasher = DefaultHasher::new();
|
||||
key.hash(&mut hasher);
|
||||
let hash = hasher.finish();
|
||||
|
||||
let first = self.dictionary[hash as usize % self.dictionary.len()];
|
||||
if first == INVALID_POS {
|
||||
// no existing entry
|
||||
let pos = self.alloc_bucket(key.clone(), value, hash)?;
|
||||
if pos == INVALID_POS {
|
||||
return Err(FullError());
|
||||
}
|
||||
self.dictionary[hash as usize % self.dictionary.len()] = pos;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let mut next = first;
|
||||
loop {
|
||||
let bucket = &mut self.buckets[next as usize];
|
||||
let (bucket_key, bucket_value) = bucket.inner.as_mut().expect("entry is in use");
|
||||
if bucket_key == key {
|
||||
// found existing entry, update its value
|
||||
*bucket_value = value;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
if bucket.next == INVALID_POS {
|
||||
// No existing entry found. Append to the chain
|
||||
let pos = self.alloc_bucket(key.clone(), value, hash)?;
|
||||
if pos == INVALID_POS {
|
||||
return Err(FullError());
|
||||
}
|
||||
self.buckets[next as usize].next = pos;
|
||||
return Ok(());
|
||||
}
|
||||
next = bucket.next;
|
||||
}
|
||||
}
|
||||
|
||||
pub fn remove(&mut self, key: &K) -> Result<(), FullError> {
|
||||
let mut hasher = DefaultHasher::new();
|
||||
key.hash(&mut hasher);
|
||||
let hash = hasher.finish();
|
||||
|
||||
let mut next = self.dictionary[hash as usize % self.dictionary.len()];
|
||||
let mut prev_pos: u32 = INVALID_POS;
|
||||
loop {
|
||||
if next == INVALID_POS {
|
||||
// no existing entry
|
||||
return Ok(());
|
||||
}
|
||||
let bucket = &mut self.buckets[next as usize];
|
||||
let (bucket_key, _) = bucket.inner.as_mut().expect("entry is in use");
|
||||
if bucket_key == key {
|
||||
// found existing entry, unlink it from the chain
|
||||
if prev_pos == INVALID_POS {
|
||||
self.dictionary[hash as usize % self.dictionary.len()] = bucket.next;
|
||||
} else {
|
||||
self.buckets[prev_pos as usize].next = bucket.next;
|
||||
}
|
||||
|
||||
// and add it to the freelist
|
||||
let bucket = &mut self.buckets[next as usize];
|
||||
bucket.hash = 0;
|
||||
bucket.inner = None;
|
||||
bucket.next = self.free_head;
|
||||
self.free_head = next;
|
||||
self.buckets_in_use -= 1;
|
||||
return Ok(());
|
||||
}
|
||||
prev_pos = next;
|
||||
next = bucket.next;
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_num_buckets(&self) -> usize {
|
||||
self.buckets.len()
|
||||
}
|
||||
|
||||
pub fn get_bucket(&self, pos: usize) -> Option<&(K, V)> {
|
||||
if pos >= self.buckets.len() {
|
||||
return None;
|
||||
}
|
||||
|
||||
self.buckets[pos].inner.as_ref()
|
||||
}
|
||||
|
||||
fn alloc_bucket(&mut self, key: K, value: V, hash: u64) -> Result<u32, FullError> {
|
||||
let pos = self.free_head;
|
||||
if pos == INVALID_POS {
|
||||
return Err(FullError());
|
||||
}
|
||||
|
||||
let bucket = &mut self.buckets[pos as usize];
|
||||
self.free_head = bucket.next;
|
||||
self.buckets_in_use += 1;
|
||||
|
||||
bucket.hash = hash;
|
||||
bucket.next = INVALID_POS;
|
||||
bucket.inner = Some((key, value));
|
||||
|
||||
return Ok(pos);
|
||||
}
|
||||
}
|
||||
220
libs/neon-shmem/src/hash/tests.rs
Normal file
220
libs/neon-shmem/src/hash/tests.rs
Normal file
@@ -0,0 +1,220 @@
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::HashSet;
|
||||
use std::fmt::{Debug, Formatter};
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
use crate::hash::HashMapAccess;
|
||||
use crate::hash::HashMapInit;
|
||||
use crate::hash::UpdateAction;
|
||||
use crate::shmem::ShmemHandle;
|
||||
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::{Rng, RngCore};
|
||||
use rand_distr::Zipf;
|
||||
|
||||
const TEST_KEY_LEN: usize = 16;
|
||||
|
||||
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
|
||||
struct TestKey([u8; TEST_KEY_LEN]);
|
||||
|
||||
impl From<&TestKey> for u128 {
|
||||
fn from(val: &TestKey) -> u128 {
|
||||
u128::from_be_bytes(val.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<u128> for TestKey {
|
||||
fn from(val: u128) -> TestKey {
|
||||
TestKey(val.to_be_bytes())
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> From<&'a [u8]> for TestKey {
|
||||
fn from(bytes: &'a [u8]) -> TestKey {
|
||||
TestKey(bytes.try_into().unwrap())
|
||||
}
|
||||
}
|
||||
|
||||
fn test_inserts<K: Into<TestKey> + Copy>(keys: &[K]) {
|
||||
const MAX_MEM_SIZE: usize = 10000000;
|
||||
let shmem = ShmemHandle::new("test_inserts", 0, MAX_MEM_SIZE).unwrap();
|
||||
|
||||
let init_struct = HashMapInit::<TestKey, usize>::init_in_shmem(100000, shmem);
|
||||
let w = init_struct.attach_writer();
|
||||
|
||||
for (idx, k) in keys.iter().enumerate() {
|
||||
let res = w.insert(&(*k).into(), idx);
|
||||
assert!(res.is_ok());
|
||||
}
|
||||
|
||||
for (idx, k) in keys.iter().enumerate() {
|
||||
let x = w.get(&(*k).into());
|
||||
let value = x.as_deref().copied();
|
||||
assert_eq!(value, Some(idx));
|
||||
}
|
||||
|
||||
//eprintln!("stats: {:?}", tree_writer.get_statistics());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn dense() {
|
||||
// This exercises splitting a node with prefix
|
||||
let keys: &[u128] = &[0, 1, 2, 3, 256];
|
||||
test_inserts(keys);
|
||||
|
||||
// Dense keys
|
||||
let mut keys: Vec<u128> = (0..10000).collect();
|
||||
test_inserts(&keys);
|
||||
|
||||
// Do the same in random orders
|
||||
for _ in 1..10 {
|
||||
keys.shuffle(&mut rand::rng());
|
||||
test_inserts(&keys);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn sparse() {
|
||||
// sparse keys
|
||||
let mut keys: Vec<TestKey> = Vec::new();
|
||||
let mut used_keys = HashSet::new();
|
||||
for _ in 0..10000 {
|
||||
loop {
|
||||
let key = rand::random::<u128>();
|
||||
if used_keys.get(&key).is_some() {
|
||||
continue;
|
||||
}
|
||||
used_keys.insert(key);
|
||||
keys.push(key.into());
|
||||
break;
|
||||
}
|
||||
}
|
||||
test_inserts(&keys);
|
||||
}
|
||||
|
||||
struct TestValue(AtomicUsize);
|
||||
|
||||
impl TestValue {
|
||||
fn new(val: usize) -> TestValue {
|
||||
TestValue(AtomicUsize::new(val))
|
||||
}
|
||||
|
||||
fn load(&self) -> usize {
|
||||
self.0.load(Ordering::Relaxed)
|
||||
}
|
||||
}
|
||||
|
||||
impl Clone for TestValue {
|
||||
fn clone(&self) -> TestValue {
|
||||
TestValue::new(self.load())
|
||||
}
|
||||
}
|
||||
|
||||
impl Debug for TestValue {
|
||||
fn fmt(&self, fmt: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
|
||||
write!(fmt, "{:?}", self.load())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
struct TestOp(TestKey, Option<usize>);
|
||||
|
||||
fn apply_op(
|
||||
op: &TestOp,
|
||||
sut: &HashMapAccess<TestKey, TestValue>,
|
||||
shadow: &mut BTreeMap<TestKey, usize>,
|
||||
) {
|
||||
eprintln!("applying op: {op:?}");
|
||||
|
||||
// apply the change to the shadow tree first
|
||||
let shadow_existing = if let Some(v) = op.1 {
|
||||
shadow.insert(op.0, v)
|
||||
} else {
|
||||
shadow.remove(&op.0)
|
||||
};
|
||||
|
||||
// apply to Art tree
|
||||
sut.update_with_fn(&op.0, |existing| {
|
||||
assert_eq!(existing.map(TestValue::load), shadow_existing);
|
||||
|
||||
match (existing, op.1) {
|
||||
(None, None) => UpdateAction::Nothing,
|
||||
(None, Some(new_val)) => UpdateAction::Insert(TestValue::new(new_val)),
|
||||
(Some(_old_val), None) => UpdateAction::Remove,
|
||||
(Some(old_val), Some(new_val)) => {
|
||||
old_val.0.store(new_val, Ordering::Relaxed);
|
||||
UpdateAction::Nothing
|
||||
}
|
||||
}
|
||||
})
|
||||
.expect("out of memory");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn random_ops() {
|
||||
const MAX_MEM_SIZE: usize = 10000000;
|
||||
let shmem = ShmemHandle::new("test_inserts", 0, MAX_MEM_SIZE).unwrap();
|
||||
|
||||
let init_struct = HashMapInit::<TestKey, TestValue>::init_in_shmem(100000, shmem);
|
||||
let writer = init_struct.attach_writer();
|
||||
|
||||
let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
|
||||
|
||||
let distribution = Zipf::new(u128::MAX as f64, 1.1).unwrap();
|
||||
let mut rng = rand::rng();
|
||||
for i in 0..100000 {
|
||||
let key: TestKey = (rng.sample(distribution) as u128).into();
|
||||
|
||||
let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
|
||||
|
||||
apply_op(&op, &writer, &mut shadow);
|
||||
|
||||
if i % 1000 == 0 {
|
||||
eprintln!("{i} ops processed");
|
||||
//eprintln!("stats: {:?}", tree_writer.get_statistics());
|
||||
//test_iter(&tree_writer, &shadow);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_grow() {
|
||||
const MEM_SIZE: usize = 10000000;
|
||||
let shmem = ShmemHandle::new("test_grow", 0, MEM_SIZE).unwrap();
|
||||
|
||||
let init_struct = HashMapInit::<TestKey, TestValue>::init_in_shmem(1000, shmem);
|
||||
let writer = init_struct.attach_writer();
|
||||
|
||||
let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
|
||||
|
||||
let mut rng = rand::rng();
|
||||
for i in 0..10000 {
|
||||
let key: TestKey = ((rng.next_u32() % 1000) as u128).into();
|
||||
|
||||
let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
|
||||
|
||||
apply_op(&op, &writer, &mut shadow);
|
||||
|
||||
if i % 1000 == 0 {
|
||||
eprintln!("{i} ops processed");
|
||||
//eprintln!("stats: {:?}", tree_writer.get_statistics());
|
||||
//test_iter(&tree_writer, &shadow);
|
||||
}
|
||||
}
|
||||
|
||||
writer.grow(1500).unwrap();
|
||||
|
||||
for i in 0..10000 {
|
||||
let key: TestKey = ((rng.next_u32() % 1500) as u128).into();
|
||||
|
||||
let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
|
||||
|
||||
apply_op(&op, &writer, &mut shadow);
|
||||
|
||||
if i % 1000 == 0 {
|
||||
eprintln!("{i} ops processed");
|
||||
//eprintln!("stats: {:?}", tree_writer.get_statistics());
|
||||
//test_iter(&tree_writer, &shadow);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,418 +1,4 @@
|
||||
//! Shared memory utilities for neon communicator
|
||||
|
||||
use std::num::NonZeroUsize;
|
||||
use std::os::fd::{AsFd, BorrowedFd, OwnedFd};
|
||||
use std::ptr::NonNull;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
use nix::errno::Errno;
|
||||
use nix::sys::mman::MapFlags;
|
||||
use nix::sys::mman::ProtFlags;
|
||||
use nix::sys::mman::mmap as nix_mmap;
|
||||
use nix::sys::mman::munmap as nix_munmap;
|
||||
use nix::unistd::ftruncate as nix_ftruncate;
|
||||
|
||||
/// ShmemHandle represents a shared memory area that can be shared by processes over fork().
|
||||
/// Unlike shared memory allocated by Postgres, this area is resizable, up to 'max_size' that's
|
||||
/// specified at creation.
|
||||
///
|
||||
/// The area is backed by an anonymous file created with memfd_create(). The full address space for
|
||||
/// 'max_size' is reserved up-front with mmap(), but whenever you call [`ShmemHandle::set_size`],
|
||||
/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
|
||||
/// will cause the file to be expanded, but we might use mprotect() etc. to enforce that in the
|
||||
/// future.
|
||||
pub struct ShmemHandle {
|
||||
/// memfd file descriptor
|
||||
fd: OwnedFd,
|
||||
|
||||
max_size: usize,
|
||||
|
||||
// Pointer to the beginning of the shared memory area. The header is stored there.
|
||||
shared_ptr: NonNull<SharedStruct>,
|
||||
|
||||
// Pointer to the beginning of the user data
|
||||
pub data_ptr: NonNull<u8>,
|
||||
}
|
||||
|
||||
/// This is stored at the beginning in the shared memory area.
|
||||
struct SharedStruct {
|
||||
max_size: usize,
|
||||
|
||||
/// Current size of the backing file. The high-order bit is used for the RESIZE_IN_PROGRESS flag
|
||||
current_size: AtomicUsize,
|
||||
}
|
||||
|
||||
const RESIZE_IN_PROGRESS: usize = 1 << 63;
|
||||
|
||||
const HEADER_SIZE: usize = std::mem::size_of::<SharedStruct>();
|
||||
|
||||
/// Error type returned by the ShmemHandle functions.
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
#[error("{msg}: {errno}")]
|
||||
pub struct Error {
|
||||
pub msg: String,
|
||||
pub errno: Errno,
|
||||
}
|
||||
|
||||
impl Error {
|
||||
fn new(msg: &str, errno: Errno) -> Error {
|
||||
Error {
|
||||
msg: msg.to_string(),
|
||||
errno,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ShmemHandle {
|
||||
/// Create a new shared memory area. To communicate between processes, the processes need to be
|
||||
/// fork()'d after calling this, so that the ShmemHandle is inherited by all processes.
|
||||
///
|
||||
/// If the ShmemHandle is dropped, the memory is unmapped from the current process. Other
|
||||
/// processes can continue using it, however.
|
||||
pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result<ShmemHandle, Error> {
|
||||
// create the backing anonymous file.
|
||||
let fd = create_backing_file(name)?;
|
||||
|
||||
Self::new_with_fd(fd, initial_size, max_size)
|
||||
}
|
||||
|
||||
fn new_with_fd(
|
||||
fd: OwnedFd,
|
||||
initial_size: usize,
|
||||
max_size: usize,
|
||||
) -> Result<ShmemHandle, Error> {
|
||||
// We reserve the high-order bit for the RESIZE_IN_PROGRESS flag, and the actual size
|
||||
// is a little larger than this because of the SharedStruct header. Make the upper limit
|
||||
// somewhat smaller than that, because with anything close to that, you'll run out of
|
||||
// memory anyway.
|
||||
if max_size >= 1 << 48 {
|
||||
panic!("max size {} too large", max_size);
|
||||
}
|
||||
if initial_size > max_size {
|
||||
panic!("initial size {initial_size} larger than max size {max_size}");
|
||||
}
|
||||
|
||||
// The actual initial / max size is the one given by the caller, plus the size of
|
||||
// 'SharedStruct'.
|
||||
let initial_size = HEADER_SIZE + initial_size;
|
||||
let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap();
|
||||
|
||||
// Reserve address space for it with mmap
|
||||
//
|
||||
// TODO: Use MAP_HUGETLB if possible
|
||||
let start_ptr = unsafe {
|
||||
nix_mmap(
|
||||
None,
|
||||
max_size,
|
||||
ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
|
||||
MapFlags::MAP_SHARED,
|
||||
&fd,
|
||||
0,
|
||||
)
|
||||
}
|
||||
.map_err(|e| Error::new("mmap failed: {e}", e))?;
|
||||
|
||||
// Reserve space for the initial size
|
||||
enlarge_file(fd.as_fd(), initial_size as u64)?;
|
||||
|
||||
// Initialize the header
|
||||
let shared: NonNull<SharedStruct> = start_ptr.cast();
|
||||
unsafe {
|
||||
shared.write(SharedStruct {
|
||||
max_size: max_size.into(),
|
||||
current_size: AtomicUsize::new(initial_size),
|
||||
})
|
||||
};
|
||||
|
||||
// The user data begins after the header
|
||||
let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) };
|
||||
|
||||
Ok(ShmemHandle {
|
||||
fd,
|
||||
max_size: max_size.into(),
|
||||
shared_ptr: shared,
|
||||
data_ptr,
|
||||
})
|
||||
}
|
||||
|
||||
// return reference to the header
|
||||
fn shared(&self) -> &SharedStruct {
|
||||
unsafe { self.shared_ptr.as_ref() }
|
||||
}
|
||||
|
||||
/// Resize the shared memory area. 'new_size' must not be larger than the 'max_size' specified
|
||||
/// when creating the area.
|
||||
///
|
||||
/// This may only be called from one process/thread concurrently. We detect that case
|
||||
/// and return an Error.
|
||||
pub fn set_size(&self, new_size: usize) -> Result<(), Error> {
|
||||
let new_size = new_size + HEADER_SIZE;
|
||||
let shared = self.shared();
|
||||
|
||||
if new_size > self.max_size {
|
||||
panic!(
|
||||
"new size ({} is greater than max size ({})",
|
||||
new_size, self.max_size
|
||||
);
|
||||
}
|
||||
assert_eq!(self.max_size, shared.max_size);
|
||||
|
||||
// Lock the area by setting the bit in 'current_size'
|
||||
//
|
||||
// Ordering::Relaxed would probably be sufficient here, as we don't access any other memory
|
||||
// and the posix_fallocate/ftruncate call is surely a synchronization point anyway. But
|
||||
// since this is not performance-critical, better safe than sorry .
|
||||
let mut old_size = shared.current_size.load(Ordering::Acquire);
|
||||
loop {
|
||||
if (old_size & RESIZE_IN_PROGRESS) != 0 {
|
||||
return Err(Error::new(
|
||||
"concurrent resize detected",
|
||||
Errno::UnknownErrno,
|
||||
));
|
||||
}
|
||||
match shared.current_size.compare_exchange(
|
||||
old_size,
|
||||
new_size,
|
||||
Ordering::Acquire,
|
||||
Ordering::Relaxed,
|
||||
) {
|
||||
Ok(_) => break,
|
||||
Err(x) => old_size = x,
|
||||
}
|
||||
}
|
||||
|
||||
// Ok, we got the lock.
|
||||
//
|
||||
// NB: If anything goes wrong, we *must* clear the bit!
|
||||
let result = {
|
||||
use std::cmp::Ordering::{Equal, Greater, Less};
|
||||
match new_size.cmp(&old_size) {
|
||||
Less => nix_ftruncate(&self.fd, new_size as i64).map_err(|e| {
|
||||
Error::new("could not shrink shmem segment, ftruncate failed: {e}", e)
|
||||
}),
|
||||
Equal => Ok(()),
|
||||
Greater => enlarge_file(self.fd.as_fd(), new_size as u64),
|
||||
}
|
||||
};
|
||||
|
||||
// Unlock
|
||||
shared.current_size.store(
|
||||
if result.is_ok() { new_size } else { old_size },
|
||||
Ordering::Release,
|
||||
);
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
/// Returns the current user-visible size of the shared memory segment.
|
||||
///
|
||||
/// NOTE: a concurrent set_size() call can change the size at any time. It is the caller's
|
||||
/// responsibility not to access the area beyond the current size.
|
||||
pub fn current_size(&self) -> usize {
|
||||
let total_current_size =
|
||||
self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS;
|
||||
total_current_size - HEADER_SIZE
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for ShmemHandle {
|
||||
fn drop(&mut self) {
|
||||
// SAFETY: The pointer was obtained from mmap() with the given size.
|
||||
// We unmap the entire region.
|
||||
let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) };
|
||||
// The fd is dropped automatically by OwnedFd.
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a "backing file" for the shared memory area. On Linux, use memfd_create(), to create an
|
||||
/// anonymous in-memory file. One macos, fall back to a regular file. That's good enough for
|
||||
/// development and testing, but in production we want the file to stay in memory.
|
||||
///
|
||||
/// disable 'unused_variables' warnings, because in the macos path, 'name' is unused.
|
||||
#[allow(unused_variables)]
|
||||
fn create_backing_file(name: &str) -> Result<OwnedFd, Error> {
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
{
|
||||
nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
|
||||
.map_err(|e| Error::new("memfd_create failed: {e}", e))
|
||||
}
|
||||
#[cfg(target_os = "macos")]
|
||||
{
|
||||
let file = tempfile::tempfile().map_err(|e| {
|
||||
Error::new(
|
||||
"could not create temporary file to back shmem area: {e}",
|
||||
nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
|
||||
)
|
||||
})?;
|
||||
Ok(OwnedFd::from(file))
|
||||
}
|
||||
}
|
||||
|
||||
fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
|
||||
// Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
|
||||
// we don't get a segfault later when trying to actually use it.
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
{
|
||||
nix::fcntl::posix_fallocate(fd, 0, size as i64).map_err(|e| {
|
||||
Error::new(
|
||||
"could not grow shmem segment, posix_fallocate failed: {e}",
|
||||
e,
|
||||
)
|
||||
})
|
||||
}
|
||||
// As a fallback on macos, which doesn't have posix_fallocate, use plain 'fallocate'
|
||||
#[cfg(target_os = "macos")]
|
||||
{
|
||||
nix::unistd::ftruncate(fd, size as i64)
|
||||
.map_err(|e| Error::new("could not grow shmem segment, ftruncate failed: {e}", e))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
use nix::unistd::ForkResult;
|
||||
use std::ops::Range;
|
||||
|
||||
/// check that all bytes in given range have the expected value.
|
||||
fn assert_range(ptr: *const u8, expected: u8, range: Range<usize>) {
|
||||
for i in range {
|
||||
let b = unsafe { *(ptr.add(i)) };
|
||||
assert_eq!(expected, b, "unexpected byte at offset {}", i);
|
||||
}
|
||||
}
|
||||
|
||||
/// Write 'b' to all bytes in the given range
|
||||
fn write_range(ptr: *mut u8, b: u8, range: Range<usize>) {
|
||||
unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) };
|
||||
}
|
||||
|
||||
// simple single-process test of growing and shrinking
|
||||
#[test]
|
||||
fn test_shmem_resize() -> Result<(), Error> {
|
||||
let max_size = 1024 * 1024;
|
||||
let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?;
|
||||
|
||||
assert_eq!(init_struct.current_size(), 0);
|
||||
|
||||
// Initial grow
|
||||
let size1 = 10000;
|
||||
init_struct.set_size(size1).unwrap();
|
||||
assert_eq!(init_struct.current_size(), size1);
|
||||
|
||||
// Write some data
|
||||
let data_ptr = init_struct.data_ptr.as_ptr();
|
||||
write_range(data_ptr, 0xAA, 0..size1);
|
||||
assert_range(data_ptr, 0xAA, 0..size1);
|
||||
|
||||
// Shrink
|
||||
let size2 = 5000;
|
||||
init_struct.set_size(size2).unwrap();
|
||||
assert_eq!(init_struct.current_size(), size2);
|
||||
|
||||
// Grow again
|
||||
let size3 = 20000;
|
||||
init_struct.set_size(size3).unwrap();
|
||||
assert_eq!(init_struct.current_size(), size3);
|
||||
|
||||
// Try to read it. The area that was shrunk and grown again should read as all zeros now
|
||||
assert_range(data_ptr, 0xAA, 0..5000);
|
||||
assert_range(data_ptr, 0, 5000..size1);
|
||||
|
||||
// Try to grow beyond max_size
|
||||
//let size4 = max_size + 1;
|
||||
//assert!(init_struct.set_size(size4).is_err());
|
||||
|
||||
// Dropping init_struct should unmap the memory
|
||||
drop(init_struct);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// This is used in tests to coordinate between test processes. It's like std::sync::Barrier,
|
||||
/// but is stored in the shared memory area and works across processes. It's implemented by
|
||||
/// polling, because e.g. standard rust mutexes are not guaranteed to work across processes.
|
||||
struct SimpleBarrier {
|
||||
num_procs: usize,
|
||||
count: AtomicUsize,
|
||||
}
|
||||
|
||||
impl SimpleBarrier {
|
||||
unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) {
|
||||
unsafe {
|
||||
*ptr = SimpleBarrier {
|
||||
num_procs,
|
||||
count: AtomicUsize::new(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn wait(&self) {
|
||||
let old = self.count.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
let generation = old / self.num_procs;
|
||||
|
||||
let mut current = old + 1;
|
||||
while current < (generation + 1) * self.num_procs {
|
||||
std::thread::sleep(std::time::Duration::from_millis(10));
|
||||
current = self.count.load(Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_multi_process() {
|
||||
// Initialize
|
||||
let max_size = 1_000_000_000_000;
|
||||
let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap();
|
||||
let ptr = init_struct.data_ptr.as_ptr();
|
||||
|
||||
// Store the SimpleBarrier in the first 1k of the area.
|
||||
init_struct.set_size(10000).unwrap();
|
||||
let barrier_ptr: *mut SimpleBarrier = unsafe {
|
||||
ptr.add(ptr.align_offset(std::mem::align_of::<SimpleBarrier>()))
|
||||
.cast()
|
||||
};
|
||||
unsafe { SimpleBarrier::init(barrier_ptr, 2) };
|
||||
let barrier = unsafe { barrier_ptr.as_ref().unwrap() };
|
||||
|
||||
// Fork another test process. The code after this runs in both processes concurrently.
|
||||
let fork_result = unsafe { nix::unistd::fork().unwrap() };
|
||||
|
||||
// In the parent, fill bytes between 1000..2000. In the child, between 2000..3000
|
||||
if fork_result.is_parent() {
|
||||
write_range(ptr, 0xAA, 1000..2000);
|
||||
} else {
|
||||
write_range(ptr, 0xBB, 2000..3000);
|
||||
}
|
||||
barrier.wait();
|
||||
// Verify the contents. (in both processes)
|
||||
assert_range(ptr, 0xAA, 1000..2000);
|
||||
assert_range(ptr, 0xBB, 2000..3000);
|
||||
|
||||
// Grow, from the child this time
|
||||
let size = 10_000_000;
|
||||
if !fork_result.is_parent() {
|
||||
init_struct.set_size(size).unwrap();
|
||||
}
|
||||
barrier.wait();
|
||||
|
||||
// make some writes at the end
|
||||
if fork_result.is_parent() {
|
||||
write_range(ptr, 0xAA, (size - 10)..size);
|
||||
} else {
|
||||
write_range(ptr, 0xBB, (size - 20)..(size - 10));
|
||||
}
|
||||
barrier.wait();
|
||||
|
||||
// Verify the contents. (This runs in both processes)
|
||||
assert_range(ptr, 0, (size - 1000)..(size - 20));
|
||||
assert_range(ptr, 0xBB, (size - 20)..(size - 10));
|
||||
assert_range(ptr, 0xAA, (size - 10)..size);
|
||||
|
||||
if let ForkResult::Parent { child } = fork_result {
|
||||
nix::sys::wait::waitpid(child, None).unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
pub mod hash;
|
||||
pub mod shmem;
|
||||
|
||||
418
libs/neon-shmem/src/shmem.rs
Normal file
418
libs/neon-shmem/src/shmem.rs
Normal file
@@ -0,0 +1,418 @@
|
||||
//! Dynamically resizable contiguous chunk of shared memory
|
||||
|
||||
use std::num::NonZeroUsize;
|
||||
use std::os::fd::{AsFd, BorrowedFd, OwnedFd};
|
||||
use std::ptr::NonNull;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
use nix::errno::Errno;
|
||||
use nix::sys::mman::MapFlags;
|
||||
use nix::sys::mman::ProtFlags;
|
||||
use nix::sys::mman::mmap as nix_mmap;
|
||||
use nix::sys::mman::munmap as nix_munmap;
|
||||
use nix::unistd::ftruncate as nix_ftruncate;
|
||||
|
||||
/// ShmemHandle represents a shared memory area that can be shared by processes over fork().
|
||||
/// Unlike shared memory allocated by Postgres, this area is resizable, up to 'max_size' that's
|
||||
/// specified at creation.
|
||||
///
|
||||
/// The area is backed by an anonymous file created with memfd_create(). The full address space for
|
||||
/// 'max_size' is reserved up-front with mmap(), but whenever you call [`ShmemHandle::set_size`],
|
||||
/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
|
||||
/// will cause the file to be expanded, but we might use mprotect() etc. to enforce that in the
|
||||
/// future.
|
||||
pub struct ShmemHandle {
|
||||
/// memfd file descriptor
|
||||
fd: OwnedFd,
|
||||
|
||||
max_size: usize,
|
||||
|
||||
// Pointer to the beginning of the shared memory area. The header is stored there.
|
||||
shared_ptr: NonNull<SharedStruct>,
|
||||
|
||||
// Pointer to the beginning of the user data
|
||||
pub data_ptr: NonNull<u8>,
|
||||
}
|
||||
|
||||
/// This is stored at the beginning in the shared memory area.
|
||||
struct SharedStruct {
|
||||
max_size: usize,
|
||||
|
||||
/// Current size of the backing file. The high-order bit is used for the RESIZE_IN_PROGRESS flag
|
||||
current_size: AtomicUsize,
|
||||
}
|
||||
|
||||
const RESIZE_IN_PROGRESS: usize = 1 << 63;
|
||||
|
||||
const HEADER_SIZE: usize = std::mem::size_of::<SharedStruct>();
|
||||
|
||||
/// Error type returned by the ShmemHandle functions.
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
#[error("{msg}: {errno}")]
|
||||
pub struct Error {
|
||||
pub msg: String,
|
||||
pub errno: Errno,
|
||||
}
|
||||
|
||||
impl Error {
|
||||
fn new(msg: &str, errno: Errno) -> Error {
|
||||
Error {
|
||||
msg: msg.to_string(),
|
||||
errno,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ShmemHandle {
|
||||
/// Create a new shared memory area. To communicate between processes, the processes need to be
|
||||
/// fork()'d after calling this, so that the ShmemHandle is inherited by all processes.
|
||||
///
|
||||
/// If the ShmemHandle is dropped, the memory is unmapped from the current process. Other
|
||||
/// processes can continue using it, however.
|
||||
pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result<ShmemHandle, Error> {
|
||||
// create the backing anonymous file.
|
||||
let fd = create_backing_file(name)?;
|
||||
|
||||
Self::new_with_fd(fd, initial_size, max_size)
|
||||
}
|
||||
|
||||
fn new_with_fd(
|
||||
fd: OwnedFd,
|
||||
initial_size: usize,
|
||||
max_size: usize,
|
||||
) -> Result<ShmemHandle, Error> {
|
||||
// We reserve the high-order bit for the RESIZE_IN_PROGRESS flag, and the actual size
|
||||
// is a little larger than this because of the SharedStruct header. Make the upper limit
|
||||
// somewhat smaller than that, because with anything close to that, you'll run out of
|
||||
// memory anyway.
|
||||
if max_size >= 1 << 48 {
|
||||
panic!("max size {} too large", max_size);
|
||||
}
|
||||
if initial_size > max_size {
|
||||
panic!("initial size {initial_size} larger than max size {max_size}");
|
||||
}
|
||||
|
||||
// The actual initial / max size is the one given by the caller, plus the size of
|
||||
// 'SharedStruct'.
|
||||
let initial_size = HEADER_SIZE + initial_size;
|
||||
let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap();
|
||||
|
||||
// Reserve address space for it with mmap
|
||||
//
|
||||
// TODO: Use MAP_HUGETLB if possible
|
||||
let start_ptr = unsafe {
|
||||
nix_mmap(
|
||||
None,
|
||||
max_size,
|
||||
ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
|
||||
MapFlags::MAP_SHARED,
|
||||
&fd,
|
||||
0,
|
||||
)
|
||||
}
|
||||
.map_err(|e| Error::new("mmap failed: {e}", e))?;
|
||||
|
||||
// Reserve space for the initial size
|
||||
enlarge_file(fd.as_fd(), initial_size as u64)?;
|
||||
|
||||
// Initialize the header
|
||||
let shared: NonNull<SharedStruct> = start_ptr.cast();
|
||||
unsafe {
|
||||
shared.write(SharedStruct {
|
||||
max_size: max_size.into(),
|
||||
current_size: AtomicUsize::new(initial_size),
|
||||
})
|
||||
};
|
||||
|
||||
// The user data begins after the header
|
||||
let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) };
|
||||
|
||||
Ok(ShmemHandle {
|
||||
fd,
|
||||
max_size: max_size.into(),
|
||||
shared_ptr: shared,
|
||||
data_ptr,
|
||||
})
|
||||
}
|
||||
|
||||
// return reference to the header
|
||||
fn shared(&self) -> &SharedStruct {
|
||||
unsafe { self.shared_ptr.as_ref() }
|
||||
}
|
||||
|
||||
/// Resize the shared memory area. 'new_size' must not be larger than the 'max_size' specified
|
||||
/// when creating the area.
|
||||
///
|
||||
/// This may only be called from one process/thread concurrently. We detect that case
|
||||
/// and return an Error.
|
||||
pub fn set_size(&self, new_size: usize) -> Result<(), Error> {
|
||||
let new_size = new_size + HEADER_SIZE;
|
||||
let shared = self.shared();
|
||||
|
||||
if new_size > self.max_size {
|
||||
panic!(
|
||||
"new size ({} is greater than max size ({})",
|
||||
new_size, self.max_size
|
||||
);
|
||||
}
|
||||
assert_eq!(self.max_size, shared.max_size);
|
||||
|
||||
// Lock the area by setting the bit in 'current_size'
|
||||
//
|
||||
// Ordering::Relaxed would probably be sufficient here, as we don't access any other memory
|
||||
// and the posix_fallocate/ftruncate call is surely a synchronization point anyway. But
|
||||
// since this is not performance-critical, better safe than sorry .
|
||||
let mut old_size = shared.current_size.load(Ordering::Acquire);
|
||||
loop {
|
||||
if (old_size & RESIZE_IN_PROGRESS) != 0 {
|
||||
return Err(Error::new(
|
||||
"concurrent resize detected",
|
||||
Errno::UnknownErrno,
|
||||
));
|
||||
}
|
||||
match shared.current_size.compare_exchange(
|
||||
old_size,
|
||||
new_size,
|
||||
Ordering::Acquire,
|
||||
Ordering::Relaxed,
|
||||
) {
|
||||
Ok(_) => break,
|
||||
Err(x) => old_size = x,
|
||||
}
|
||||
}
|
||||
|
||||
// Ok, we got the lock.
|
||||
//
|
||||
// NB: If anything goes wrong, we *must* clear the bit!
|
||||
let result = {
|
||||
use std::cmp::Ordering::{Equal, Greater, Less};
|
||||
match new_size.cmp(&old_size) {
|
||||
Less => nix_ftruncate(&self.fd, new_size as i64).map_err(|e| {
|
||||
Error::new("could not shrink shmem segment, ftruncate failed: {e}", e)
|
||||
}),
|
||||
Equal => Ok(()),
|
||||
Greater => enlarge_file(self.fd.as_fd(), new_size as u64),
|
||||
}
|
||||
};
|
||||
|
||||
// Unlock
|
||||
shared.current_size.store(
|
||||
if result.is_ok() { new_size } else { old_size },
|
||||
Ordering::Release,
|
||||
);
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
/// Returns the current user-visible size of the shared memory segment.
|
||||
///
|
||||
/// NOTE: a concurrent set_size() call can change the size at any time. It is the caller's
|
||||
/// responsibility not to access the area beyond the current size.
|
||||
pub fn current_size(&self) -> usize {
|
||||
let total_current_size =
|
||||
self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS;
|
||||
total_current_size - HEADER_SIZE
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for ShmemHandle {
|
||||
fn drop(&mut self) {
|
||||
// SAFETY: The pointer was obtained from mmap() with the given size.
|
||||
// We unmap the entire region.
|
||||
let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) };
|
||||
// The fd is dropped automatically by OwnedFd.
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a "backing file" for the shared memory area. On Linux, use memfd_create(), to create an
|
||||
/// anonymous in-memory file. One macos, fall back to a regular file. That's good enough for
|
||||
/// development and testing, but in production we want the file to stay in memory.
|
||||
///
|
||||
/// disable 'unused_variables' warnings, because in the macos path, 'name' is unused.
|
||||
#[allow(unused_variables)]
|
||||
fn create_backing_file(name: &str) -> Result<OwnedFd, Error> {
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
{
|
||||
nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
|
||||
.map_err(|e| Error::new("memfd_create failed: {e}", e))
|
||||
}
|
||||
#[cfg(target_os = "macos")]
|
||||
{
|
||||
let file = tempfile::tempfile().map_err(|e| {
|
||||
Error::new(
|
||||
"could not create temporary file to back shmem area: {e}",
|
||||
nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
|
||||
)
|
||||
})?;
|
||||
Ok(OwnedFd::from(file))
|
||||
}
|
||||
}
|
||||
|
||||
fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
|
||||
// Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
|
||||
// we don't get a segfault later when trying to actually use it.
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
{
|
||||
nix::fcntl::posix_fallocate(fd, 0, size as i64).map_err(|e| {
|
||||
Error::new(
|
||||
"could not grow shmem segment, posix_fallocate failed: {e}",
|
||||
e,
|
||||
)
|
||||
})
|
||||
}
|
||||
// As a fallback on macos, which doesn't have posix_fallocate, use plain 'fallocate'
|
||||
#[cfg(target_os = "macos")]
|
||||
{
|
||||
nix::unistd::ftruncate(fd, size as i64)
|
||||
.map_err(|e| Error::new("could not grow shmem segment, ftruncate failed: {e}", e))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
use nix::unistd::ForkResult;
|
||||
use std::ops::Range;
|
||||
|
||||
/// check that all bytes in given range have the expected value.
|
||||
fn assert_range(ptr: *const u8, expected: u8, range: Range<usize>) {
|
||||
for i in range {
|
||||
let b = unsafe { *(ptr.add(i)) };
|
||||
assert_eq!(expected, b, "unexpected byte at offset {}", i);
|
||||
}
|
||||
}
|
||||
|
||||
/// Write 'b' to all bytes in the given range
|
||||
fn write_range(ptr: *mut u8, b: u8, range: Range<usize>) {
|
||||
unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) };
|
||||
}
|
||||
|
||||
// simple single-process test of growing and shrinking
|
||||
#[test]
|
||||
fn test_shmem_resize() -> Result<(), Error> {
|
||||
let max_size = 1024 * 1024;
|
||||
let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?;
|
||||
|
||||
assert_eq!(init_struct.current_size(), 0);
|
||||
|
||||
// Initial grow
|
||||
let size1 = 10000;
|
||||
init_struct.set_size(size1).unwrap();
|
||||
assert_eq!(init_struct.current_size(), size1);
|
||||
|
||||
// Write some data
|
||||
let data_ptr = init_struct.data_ptr.as_ptr();
|
||||
write_range(data_ptr, 0xAA, 0..size1);
|
||||
assert_range(data_ptr, 0xAA, 0..size1);
|
||||
|
||||
// Shrink
|
||||
let size2 = 5000;
|
||||
init_struct.set_size(size2).unwrap();
|
||||
assert_eq!(init_struct.current_size(), size2);
|
||||
|
||||
// Grow again
|
||||
let size3 = 20000;
|
||||
init_struct.set_size(size3).unwrap();
|
||||
assert_eq!(init_struct.current_size(), size3);
|
||||
|
||||
// Try to read it. The area that was shrunk and grown again should read as all zeros now
|
||||
assert_range(data_ptr, 0xAA, 0..5000);
|
||||
assert_range(data_ptr, 0, 5000..size1);
|
||||
|
||||
// Try to grow beyond max_size
|
||||
//let size4 = max_size + 1;
|
||||
//assert!(init_struct.set_size(size4).is_err());
|
||||
|
||||
// Dropping init_struct should unmap the memory
|
||||
drop(init_struct);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// This is used in tests to coordinate between test processes. It's like std::sync::Barrier,
|
||||
/// but is stored in the shared memory area and works across processes. It's implemented by
|
||||
/// polling, because e.g. standard rust mutexes are not guaranteed to work across processes.
|
||||
struct SimpleBarrier {
|
||||
num_procs: usize,
|
||||
count: AtomicUsize,
|
||||
}
|
||||
|
||||
impl SimpleBarrier {
|
||||
unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) {
|
||||
unsafe {
|
||||
*ptr = SimpleBarrier {
|
||||
num_procs,
|
||||
count: AtomicUsize::new(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn wait(&self) {
|
||||
let old = self.count.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
let generation = old / self.num_procs;
|
||||
|
||||
let mut current = old + 1;
|
||||
while current < (generation + 1) * self.num_procs {
|
||||
std::thread::sleep(std::time::Duration::from_millis(10));
|
||||
current = self.count.load(Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_multi_process() {
|
||||
// Initialize
|
||||
let max_size = 1_000_000_000_000;
|
||||
let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap();
|
||||
let ptr = init_struct.data_ptr.as_ptr();
|
||||
|
||||
// Store the SimpleBarrier in the first 1k of the area.
|
||||
init_struct.set_size(10000).unwrap();
|
||||
let barrier_ptr: *mut SimpleBarrier = unsafe {
|
||||
ptr.add(ptr.align_offset(std::mem::align_of::<SimpleBarrier>()))
|
||||
.cast()
|
||||
};
|
||||
unsafe { SimpleBarrier::init(barrier_ptr, 2) };
|
||||
let barrier = unsafe { barrier_ptr.as_ref().unwrap() };
|
||||
|
||||
// Fork another test process. The code after this runs in both processes concurrently.
|
||||
let fork_result = unsafe { nix::unistd::fork().unwrap() };
|
||||
|
||||
// In the parent, fill bytes between 1000..2000. In the child, between 2000..3000
|
||||
if fork_result.is_parent() {
|
||||
write_range(ptr, 0xAA, 1000..2000);
|
||||
} else {
|
||||
write_range(ptr, 0xBB, 2000..3000);
|
||||
}
|
||||
barrier.wait();
|
||||
// Verify the contents. (in both processes)
|
||||
assert_range(ptr, 0xAA, 1000..2000);
|
||||
assert_range(ptr, 0xBB, 2000..3000);
|
||||
|
||||
// Grow, from the child this time
|
||||
let size = 10_000_000;
|
||||
if !fork_result.is_parent() {
|
||||
init_struct.set_size(size).unwrap();
|
||||
}
|
||||
barrier.wait();
|
||||
|
||||
// make some writes at the end
|
||||
if fork_result.is_parent() {
|
||||
write_range(ptr, 0xAA, (size - 10)..size);
|
||||
} else {
|
||||
write_range(ptr, 0xBB, (size - 20)..(size - 10));
|
||||
}
|
||||
barrier.wait();
|
||||
|
||||
// Verify the contents. (This runs in both processes)
|
||||
assert_range(ptr, 0, (size - 1000)..(size - 20));
|
||||
assert_range(ptr, 0xBB, (size - 20)..(size - 10));
|
||||
assert_range(ptr, 0xAA, (size - 10)..size);
|
||||
|
||||
if let ForkResult::Parent { child } = fork_result {
|
||||
nix::sys::wait::waitpid(child, None).unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
14
libs/neonart/Cargo.toml
Normal file
14
libs/neonart/Cargo.toml
Normal file
@@ -0,0 +1,14 @@
|
||||
[package]
|
||||
name = "neonart"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
crossbeam-utils.workspace = true
|
||||
spin.workspace = true
|
||||
tracing.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
rand = "0.9.1"
|
||||
rand_distr = "0.5.1"
|
||||
594
libs/neonart/src/algorithm.rs
Normal file
594
libs/neonart/src/algorithm.rs
Normal file
@@ -0,0 +1,594 @@
|
||||
mod lock_and_version;
|
||||
pub(crate) mod node_ptr;
|
||||
mod node_ref;
|
||||
|
||||
use std::vec::Vec;
|
||||
|
||||
use crate::algorithm::lock_and_version::ConcurrentUpdateError;
|
||||
use crate::algorithm::node_ptr::MAX_PREFIX_LEN;
|
||||
use crate::algorithm::node_ref::{NewNodeRef, NodeRef, ReadLockedNodeRef, WriteLockedNodeRef};
|
||||
use crate::allocator::OutOfMemoryError;
|
||||
|
||||
use crate::TreeWriteGuard;
|
||||
use crate::UpdateAction;
|
||||
use crate::allocator::ArtAllocator;
|
||||
use crate::epoch::EpochPin;
|
||||
use crate::{Key, Value};
|
||||
|
||||
pub(crate) type RootPtr<V> = node_ptr::NodePtr<V>;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum ArtError {
|
||||
ConcurrentUpdate, // need to retry
|
||||
OutOfMemory,
|
||||
}
|
||||
|
||||
impl From<ConcurrentUpdateError> for ArtError {
|
||||
fn from(_: ConcurrentUpdateError) -> ArtError {
|
||||
ArtError::ConcurrentUpdate
|
||||
}
|
||||
}
|
||||
|
||||
impl From<OutOfMemoryError> for ArtError {
|
||||
fn from(_: OutOfMemoryError) -> ArtError {
|
||||
ArtError::OutOfMemory
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_root<V: Value>(
|
||||
allocator: &impl ArtAllocator<V>,
|
||||
) -> Result<RootPtr<V>, OutOfMemoryError> {
|
||||
node_ptr::new_root(allocator)
|
||||
}
|
||||
|
||||
pub(crate) fn search<'e, K: Key, V: Value>(
|
||||
key: &K,
|
||||
root: RootPtr<V>,
|
||||
epoch_pin: &'e EpochPin,
|
||||
) -> Option<&'e V> {
|
||||
loop {
|
||||
let root_ref = NodeRef::from_root_ptr(root);
|
||||
if let Ok(result) = lookup_recurse(key.as_bytes(), root_ref, None, epoch_pin) {
|
||||
break result;
|
||||
}
|
||||
// retry
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn iter_next<'e, V: Value>(
|
||||
key: &[u8],
|
||||
root: RootPtr<V>,
|
||||
epoch_pin: &'e EpochPin,
|
||||
) -> Option<(Vec<u8>, &'e V)> {
|
||||
loop {
|
||||
let mut path = Vec::new();
|
||||
let root_ref = NodeRef::from_root_ptr(root);
|
||||
|
||||
match next_recurse(key, &mut path, root_ref, epoch_pin) {
|
||||
Ok(Some(v)) => {
|
||||
assert_eq!(path.len(), key.len());
|
||||
break Some((path, v));
|
||||
}
|
||||
Ok(None) => break None,
|
||||
Err(ConcurrentUpdateError()) => {
|
||||
// retry
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn update_fn<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>, F>(
|
||||
key: &K,
|
||||
value_fn: F,
|
||||
root: RootPtr<V>,
|
||||
guard: &'g mut TreeWriteGuard<'e, K, V, A>,
|
||||
) -> Result<(), OutOfMemoryError>
|
||||
where
|
||||
F: FnOnce(Option<&V>) -> UpdateAction<V>,
|
||||
{
|
||||
let value_fn_cell = std::cell::Cell::new(Some(value_fn));
|
||||
loop {
|
||||
let root_ref = NodeRef::from_root_ptr(root);
|
||||
let this_value_fn = |arg: Option<&V>| value_fn_cell.take().unwrap()(arg);
|
||||
let key_bytes = key.as_bytes();
|
||||
|
||||
match update_recurse(
|
||||
key_bytes,
|
||||
this_value_fn,
|
||||
root_ref,
|
||||
None,
|
||||
None,
|
||||
guard,
|
||||
0,
|
||||
key_bytes,
|
||||
) {
|
||||
Ok(()) => break Ok(()),
|
||||
Err(ArtError::ConcurrentUpdate) => {
|
||||
continue; // retry
|
||||
}
|
||||
Err(ArtError::OutOfMemory) => break Err(OutOfMemoryError()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Error means you must retry.
|
||||
//
|
||||
// This corresponds to the 'lookupOpt' function in the paper
|
||||
fn lookup_recurse<'e, V: Value>(
|
||||
key: &[u8],
|
||||
node: NodeRef<'e, V>,
|
||||
parent: Option<ReadLockedNodeRef<V>>,
|
||||
epoch_pin: &'e EpochPin,
|
||||
) -> Result<Option<&'e V>, ConcurrentUpdateError> {
|
||||
let rnode = node.read_lock_or_restart()?;
|
||||
if let Some(parent) = parent {
|
||||
parent.read_unlock_or_restart()?;
|
||||
}
|
||||
|
||||
// check if the prefix matches, may increment level
|
||||
let prefix_len = if let Some(prefix_len) = rnode.prefix_matches(key) {
|
||||
prefix_len
|
||||
} else {
|
||||
rnode.read_unlock_or_restart()?;
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
if rnode.is_leaf() {
|
||||
assert_eq!(key.len(), prefix_len);
|
||||
let vptr = rnode.get_leaf_value_ptr()?;
|
||||
// safety: It's OK to return a ref of the pointer because we checked the version
|
||||
// and the lifetime of 'epoch_pin' enforces that the reference is only accessible
|
||||
// as long as the epoch is pinned.
|
||||
let v = unsafe { vptr.as_ref().unwrap() };
|
||||
return Ok(Some(v));
|
||||
}
|
||||
|
||||
let key = &key[prefix_len..];
|
||||
|
||||
// find child (or leaf value)
|
||||
let next_node = rnode.find_child_or_restart(key[0])?;
|
||||
|
||||
match next_node {
|
||||
None => Ok(None), // key not found
|
||||
Some(child) => lookup_recurse(&key[1..], child, Some(rnode), epoch_pin),
|
||||
}
|
||||
}
|
||||
|
||||
fn next_recurse<'e, V: Value>(
|
||||
min_key: &[u8],
|
||||
path: &mut Vec<u8>,
|
||||
node: NodeRef<'e, V>,
|
||||
epoch_pin: &'e EpochPin,
|
||||
) -> Result<Option<&'e V>, ConcurrentUpdateError> {
|
||||
let rnode = node.read_lock_or_restart()?;
|
||||
let prefix = rnode.get_prefix();
|
||||
if prefix.len() != 0 {
|
||||
path.extend_from_slice(prefix);
|
||||
}
|
||||
|
||||
use std::cmp::Ordering;
|
||||
let comparison = path.as_slice().cmp(&min_key[0..path.len()]);
|
||||
if comparison == Ordering::Less {
|
||||
rnode.read_unlock_or_restart()?;
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
if rnode.is_leaf() {
|
||||
assert_eq!(path.len(), min_key.len());
|
||||
let vptr = rnode.get_leaf_value_ptr()?;
|
||||
// safety: It's OK to return a ref of the pointer because we checked the version
|
||||
// and the lifetime of 'epoch_pin' enforces that the reference is only accessible
|
||||
// as long as the epoch is pinned.
|
||||
let v = unsafe { vptr.as_ref().unwrap() };
|
||||
return Ok(Some(v));
|
||||
}
|
||||
|
||||
let mut min_key_byte = match comparison {
|
||||
Ordering::Less => unreachable!(), // checked this above already
|
||||
Ordering::Equal => min_key[path.len()],
|
||||
Ordering::Greater => 0,
|
||||
};
|
||||
|
||||
loop {
|
||||
match rnode.find_next_child_or_restart(min_key_byte)? {
|
||||
None => {
|
||||
return Ok(None);
|
||||
}
|
||||
Some((key_byte, child_ref)) => {
|
||||
let path_len = path.len();
|
||||
path.push(key_byte);
|
||||
let result = next_recurse(min_key, path, child_ref, epoch_pin)?;
|
||||
if result.is_some() {
|
||||
return Ok(result);
|
||||
}
|
||||
if key_byte == u8::MAX {
|
||||
return Ok(None);
|
||||
}
|
||||
path.truncate(path_len);
|
||||
min_key_byte = key_byte + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// This corresponds to the 'insertOpt' function in the paper
|
||||
pub(crate) fn update_recurse<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>, F>(
|
||||
key: &[u8],
|
||||
value_fn: F,
|
||||
node: NodeRef<'e, V>,
|
||||
rparent: Option<(ReadLockedNodeRef<V>, u8)>,
|
||||
rgrandparent: Option<(ReadLockedNodeRef<V>, u8)>,
|
||||
guard: &'g mut TreeWriteGuard<'e, K, V, A>,
|
||||
level: usize,
|
||||
orig_key: &[u8],
|
||||
) -> Result<(), ArtError>
|
||||
where
|
||||
F: FnOnce(Option<&V>) -> UpdateAction<V>,
|
||||
{
|
||||
let rnode = node.read_lock_or_restart()?;
|
||||
|
||||
let prefix_match_len = rnode.prefix_matches(key);
|
||||
if prefix_match_len.is_none() {
|
||||
let (rparent, parent_key) = rparent.expect("direct children of the root have no prefix");
|
||||
let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
|
||||
let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
|
||||
|
||||
match value_fn(None) {
|
||||
UpdateAction::Nothing => {}
|
||||
UpdateAction::Insert(new_value) => {
|
||||
insert_split_prefix(key, new_value, &mut wnode, &mut wparent, parent_key, guard)?;
|
||||
}
|
||||
UpdateAction::Remove => {
|
||||
panic!("unexpected Remove action on insertion");
|
||||
}
|
||||
}
|
||||
wnode.write_unlock();
|
||||
wparent.write_unlock();
|
||||
return Ok(());
|
||||
}
|
||||
let prefix_match_len = prefix_match_len.unwrap();
|
||||
let key = &key[prefix_match_len as usize..];
|
||||
let level = level + prefix_match_len as usize;
|
||||
|
||||
if rnode.is_leaf() {
|
||||
assert_eq!(key.len(), 0);
|
||||
let (rparent, parent_key) = rparent.expect("root cannot be leaf");
|
||||
let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
|
||||
let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
|
||||
|
||||
// safety: Now that we have acquired the write lock, we have exclusive access to the
|
||||
// value. XXX: There might be concurrent reads though?
|
||||
let value_mut = wnode.get_leaf_value_mut();
|
||||
|
||||
match value_fn(Some(value_mut)) {
|
||||
UpdateAction::Nothing => {
|
||||
wparent.write_unlock();
|
||||
wnode.write_unlock();
|
||||
}
|
||||
UpdateAction::Insert(_) => panic!("cannot insert over existing value"),
|
||||
UpdateAction::Remove => {
|
||||
guard.remember_obsolete_node(wnode.as_ptr());
|
||||
wparent.delete_child(parent_key);
|
||||
wnode.write_unlock_obsolete();
|
||||
|
||||
if let Some(rgrandparent) = rgrandparent {
|
||||
// FIXME: Ignore concurrency error. It doesn't lead to
|
||||
// corruption, but it means we might leak something. Until
|
||||
// another update cleans it up.
|
||||
let _ = cleanup_parent(wparent, rgrandparent, guard);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let next_node = rnode.find_child_or_restart(key[0])?;
|
||||
|
||||
if next_node.is_none() {
|
||||
if rnode.is_full() {
|
||||
let (rparent, parent_key) = rparent.expect("root node cannot become full");
|
||||
let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
|
||||
let wnode = rnode.upgrade_to_write_lock_or_restart()?;
|
||||
|
||||
match value_fn(None) {
|
||||
UpdateAction::Nothing => {
|
||||
wnode.write_unlock();
|
||||
wparent.write_unlock();
|
||||
}
|
||||
UpdateAction::Insert(new_value) => {
|
||||
insert_and_grow(key, new_value, wnode, &mut wparent, parent_key, guard)?;
|
||||
wparent.write_unlock();
|
||||
}
|
||||
UpdateAction::Remove => {
|
||||
panic!("unexpected Remove action on insertion");
|
||||
}
|
||||
};
|
||||
} else {
|
||||
let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
|
||||
if let Some((rparent, _)) = rparent {
|
||||
rparent.read_unlock_or_restart()?;
|
||||
}
|
||||
match value_fn(None) {
|
||||
UpdateAction::Nothing => {}
|
||||
UpdateAction::Insert(new_value) => {
|
||||
insert_to_node(&mut wnode, key, new_value, guard)?;
|
||||
}
|
||||
UpdateAction::Remove => {
|
||||
panic!("unexpected Remove action on insertion");
|
||||
}
|
||||
};
|
||||
wnode.write_unlock();
|
||||
}
|
||||
return Ok(());
|
||||
} else {
|
||||
let next_child = next_node.unwrap(); // checked above it's not None
|
||||
if let Some((ref rparent, _)) = rparent {
|
||||
rparent.check_or_restart()?;
|
||||
}
|
||||
|
||||
// recurse to next level
|
||||
update_recurse(
|
||||
&key[1..],
|
||||
value_fn,
|
||||
next_child,
|
||||
Some((rnode, key[0])),
|
||||
rparent,
|
||||
guard,
|
||||
level + 1,
|
||||
orig_key,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
enum PathElement {
|
||||
Prefix(Vec<u8>),
|
||||
KeyByte(u8),
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for PathElement {
|
||||
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
|
||||
match self {
|
||||
PathElement::Prefix(prefix) => write!(fmt, "{:?}", prefix),
|
||||
PathElement::KeyByte(key_byte) => write!(fmt, "{}", key_byte),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn dump_tree<'e, V: Value + std::fmt::Debug>(
|
||||
root: RootPtr<V>,
|
||||
epoch_pin: &'e EpochPin,
|
||||
dst: &mut dyn std::io::Write,
|
||||
) {
|
||||
let root_ref = NodeRef::from_root_ptr(root);
|
||||
|
||||
let _ = dump_recurse(&[], root_ref, &epoch_pin, 0, dst);
|
||||
}
|
||||
|
||||
// TODO: return an Err if writeln!() returns error, instead of unwrapping
|
||||
fn dump_recurse<'e, V: Value + std::fmt::Debug>(
|
||||
path: &[PathElement],
|
||||
node: NodeRef<'e, V>,
|
||||
epoch_pin: &'e EpochPin,
|
||||
level: usize,
|
||||
dst: &mut dyn std::io::Write,
|
||||
) -> Result<(), ConcurrentUpdateError> {
|
||||
let indent = str::repeat(" ", level);
|
||||
|
||||
let rnode = node.read_lock_or_restart()?;
|
||||
let mut path = Vec::from(path);
|
||||
let prefix = rnode.get_prefix();
|
||||
if prefix.len() != 0 {
|
||||
path.push(PathElement::Prefix(Vec::from(prefix)));
|
||||
}
|
||||
|
||||
if rnode.is_leaf() {
|
||||
let vptr = rnode.get_leaf_value_ptr()?;
|
||||
// safety: It's OK to return a ref of the pointer because we checked the version
|
||||
// and the lifetime of 'epoch_pin' enforces that the reference is only accessible
|
||||
// as long as the epoch is pinned.
|
||||
let val = unsafe { vptr.as_ref().unwrap() };
|
||||
writeln!(dst, "{} {:?}: {:?}", indent, path, val).unwrap();
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
for key_byte in 0..=u8::MAX {
|
||||
match rnode.find_child_or_restart(key_byte)? {
|
||||
None => continue,
|
||||
Some(child_ref) => {
|
||||
let rchild = child_ref.read_lock_or_restart()?;
|
||||
writeln!(
|
||||
dst,
|
||||
"{} {:?}, {}: prefix {:?}",
|
||||
indent,
|
||||
&path,
|
||||
key_byte,
|
||||
rchild.get_prefix()
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let mut child_path = path.clone();
|
||||
child_path.push(PathElement::KeyByte(key_byte));
|
||||
|
||||
dump_recurse(&child_path, child_ref, epoch_pin, level + 1, dst)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///```text
|
||||
/// [fooba]r -> value
|
||||
///
|
||||
/// [foo]b -> [a]r -> value
|
||||
/// e -> [ls]e -> value
|
||||
///```
|
||||
fn insert_split_prefix<'e, K: Key, V: Value, A: ArtAllocator<V>>(
|
||||
key: &[u8],
|
||||
value: V,
|
||||
node: &mut WriteLockedNodeRef<V>,
|
||||
parent: &mut WriteLockedNodeRef<V>,
|
||||
parent_key: u8,
|
||||
guard: &'e TreeWriteGuard<K, V, A>,
|
||||
) -> Result<(), OutOfMemoryError> {
|
||||
let old_node = node;
|
||||
let old_prefix = old_node.get_prefix();
|
||||
let common_prefix_len = common_prefix(key, old_prefix);
|
||||
|
||||
// Allocate a node for the new value.
|
||||
let new_value_node = allocate_node_for_value(
|
||||
&key[common_prefix_len + 1..],
|
||||
value,
|
||||
guard.tree_writer.allocator,
|
||||
)?;
|
||||
|
||||
// Allocate a new internal node with the common prefix
|
||||
// FIXME: deallocate 'new_value_node' on OOM
|
||||
let mut prefix_node =
|
||||
node_ref::new_internal(&key[..common_prefix_len], guard.tree_writer.allocator)?;
|
||||
|
||||
// Add the old node and the new nodes to the new internal node
|
||||
prefix_node.insert_old_child(old_prefix[common_prefix_len], old_node);
|
||||
prefix_node.insert_new_child(key[common_prefix_len], new_value_node);
|
||||
|
||||
// Modify the prefix of the old child in place
|
||||
old_node.truncate_prefix(old_prefix.len() - common_prefix_len - 1);
|
||||
|
||||
// replace the pointer in the parent
|
||||
parent.replace_child(parent_key, prefix_node.into_ptr());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn insert_to_node<'e, K: Key, V: Value, A: ArtAllocator<V>>(
|
||||
wnode: &mut WriteLockedNodeRef<V>,
|
||||
key: &[u8],
|
||||
value: V,
|
||||
guard: &'e TreeWriteGuard<K, V, A>,
|
||||
) -> Result<(), OutOfMemoryError> {
|
||||
let value_child = allocate_node_for_value(&key[1..], value, guard.tree_writer.allocator)?;
|
||||
wnode.insert_child(key[0], value_child.into_ptr());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// On entry: 'parent' and 'node' are locked
|
||||
fn insert_and_grow<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>>(
|
||||
key: &[u8],
|
||||
value: V,
|
||||
wnode: WriteLockedNodeRef<V>,
|
||||
parent: &mut WriteLockedNodeRef<V>,
|
||||
parent_key_byte: u8,
|
||||
guard: &'g mut TreeWriteGuard<'e, K, V, A>,
|
||||
) -> Result<(), ArtError> {
|
||||
let mut bigger_node = wnode.grow(guard.tree_writer.allocator)?;
|
||||
|
||||
// FIXME: deallocate 'bigger_node' on OOM
|
||||
let value_child = allocate_node_for_value(&key[1..], value, guard.tree_writer.allocator)?;
|
||||
bigger_node.insert_new_child(key[0], value_child);
|
||||
|
||||
// Replace the pointer in the parent
|
||||
parent.replace_child(parent_key_byte, bigger_node.into_ptr());
|
||||
|
||||
guard.remember_obsolete_node(wnode.as_ptr());
|
||||
wnode.write_unlock_obsolete();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn cleanup_parent<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>>(
|
||||
wparent: WriteLockedNodeRef<V>,
|
||||
rgrandparent: (ReadLockedNodeRef<V>, u8),
|
||||
guard: &'g mut TreeWriteGuard<'e, K, V, A>,
|
||||
) -> Result<(), ArtError> {
|
||||
let (rgrandparent, grandparent_key_byte) = rgrandparent;
|
||||
|
||||
// If the parent becomes completely empty after the deletion, remove the parent from the
|
||||
// grandparent. (This case is possible because we reserve only 8 bytes for the prefix.)
|
||||
// TODO: not implemented.
|
||||
|
||||
// If the parent has only one child, replace the parent with the remaining child. (This is not
|
||||
// possible if the child's prefix field cannot absorb the parent's)
|
||||
if wparent.num_children() == 1 {
|
||||
// Try to lock the remaining child. This can fail if the child is updated
|
||||
// concurrently.
|
||||
let (key_byte, remaining_child) = wparent.find_remaining_child();
|
||||
|
||||
let mut wremaining_child = remaining_child.write_lock_or_restart()?;
|
||||
|
||||
if 1 + wremaining_child.get_prefix().len() + wparent.get_prefix().len() <= MAX_PREFIX_LEN {
|
||||
let mut wgrandparent = rgrandparent.upgrade_to_write_lock_or_restart()?;
|
||||
|
||||
// Ok, we have locked the leaf, the parent, the grandparent, and the parent's only
|
||||
// remaining leaf. Proceed with the updates.
|
||||
|
||||
// Update the prefix on the remaining leaf
|
||||
wremaining_child.prepend_prefix(wparent.get_prefix(), key_byte);
|
||||
|
||||
// Replace the pointer in the grandparent to point directly to the remaining leaf
|
||||
wgrandparent.replace_child(grandparent_key_byte, wremaining_child.as_ptr());
|
||||
|
||||
// Mark the parent as deleted.
|
||||
guard.remember_obsolete_node(wparent.as_ptr());
|
||||
wparent.write_unlock_obsolete();
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
// If the parent's children would fit on a smaller node type after the deletion, replace it with
|
||||
// a smaller node.
|
||||
if wparent.can_shrink() {
|
||||
let mut wgrandparent = rgrandparent.upgrade_to_write_lock_or_restart()?;
|
||||
let smaller_node = wparent.shrink(guard.tree_writer.allocator)?;
|
||||
|
||||
// Replace the pointer in the grandparent
|
||||
wgrandparent.replace_child(grandparent_key_byte, smaller_node.into_ptr());
|
||||
|
||||
guard.remember_obsolete_node(wparent.as_ptr());
|
||||
wparent.write_unlock_obsolete();
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// nothing to do
|
||||
wparent.write_unlock();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Allocate a new leaf node to hold 'value'. If the key is long, we
|
||||
// may need to allocate new internal nodes to hold it too
|
||||
fn allocate_node_for_value<'a, V: Value, A: ArtAllocator<V>>(
|
||||
key: &[u8],
|
||||
value: V,
|
||||
allocator: &'a A,
|
||||
) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError> {
|
||||
let mut prefix_off = key.len().saturating_sub(MAX_PREFIX_LEN);
|
||||
|
||||
let leaf_node = node_ref::new_leaf(&key[prefix_off..key.len()], value, allocator)?;
|
||||
|
||||
let mut node = leaf_node;
|
||||
while prefix_off > 0 {
|
||||
// Need another internal node
|
||||
let remain_prefix = &key[0..prefix_off];
|
||||
|
||||
prefix_off = remain_prefix.len().saturating_sub(MAX_PREFIX_LEN + 1);
|
||||
let mut internal_node = node_ref::new_internal(
|
||||
&remain_prefix[prefix_off..remain_prefix.len() - 1],
|
||||
allocator,
|
||||
)?;
|
||||
internal_node.insert_new_child(*remain_prefix.last().unwrap(), node);
|
||||
node = internal_node;
|
||||
}
|
||||
|
||||
Ok(node)
|
||||
}
|
||||
|
||||
fn common_prefix(a: &[u8], b: &[u8]) -> usize {
|
||||
for i in 0..MAX_PREFIX_LEN {
|
||||
if a[i] != b[i] {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
panic!("prefixes are equal");
|
||||
}
|
||||
117
libs/neonart/src/algorithm/lock_and_version.rs
Normal file
117
libs/neonart/src/algorithm/lock_and_version.rs
Normal file
@@ -0,0 +1,117 @@
|
||||
//! Each node in the tree has contains one atomic word that stores three things:
|
||||
//!
|
||||
//! Bit 0: set if the node is "obsolete". An obsolete node has been removed from the tree,
|
||||
//! but might still be accessed by concurrent readers until the epoch expires.
|
||||
//! Bit 1: set if the node is currently write-locked. Used as a spinlock.
|
||||
//! Bits 2-63: Version number, incremented every time the node is modified.
|
||||
//!
|
||||
//! AtomicLockAndVersion represents that.
|
||||
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
|
||||
pub(crate) struct ConcurrentUpdateError();
|
||||
|
||||
pub(crate) struct AtomicLockAndVersion {
|
||||
inner: AtomicU64,
|
||||
}
|
||||
|
||||
impl AtomicLockAndVersion {
|
||||
pub(crate) fn new() -> AtomicLockAndVersion {
|
||||
AtomicLockAndVersion {
|
||||
inner: AtomicU64::new(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AtomicLockAndVersion {
|
||||
pub(crate) fn read_lock_or_restart(&self) -> Result<u64, ConcurrentUpdateError> {
|
||||
let version = self.await_node_unlocked();
|
||||
if is_obsolete(version) {
|
||||
return Err(ConcurrentUpdateError());
|
||||
}
|
||||
Ok(version)
|
||||
}
|
||||
|
||||
pub(crate) fn check_or_restart(&self, version: u64) -> Result<(), ConcurrentUpdateError> {
|
||||
self.read_unlock_or_restart(version)
|
||||
}
|
||||
|
||||
pub(crate) fn read_unlock_or_restart(&self, version: u64) -> Result<(), ConcurrentUpdateError> {
|
||||
if self.inner.load(Ordering::Acquire) != version {
|
||||
return Err(ConcurrentUpdateError());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn upgrade_to_write_lock_or_restart(
|
||||
&self,
|
||||
version: u64,
|
||||
) -> Result<(), ConcurrentUpdateError> {
|
||||
if self
|
||||
.inner
|
||||
.compare_exchange(
|
||||
version,
|
||||
set_locked_bit(version),
|
||||
Ordering::Acquire,
|
||||
Ordering::Relaxed,
|
||||
)
|
||||
.is_err()
|
||||
{
|
||||
return Err(ConcurrentUpdateError());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn write_lock_or_restart(&self) -> Result<(), ConcurrentUpdateError> {
|
||||
let old = self.inner.load(Ordering::Relaxed);
|
||||
if is_obsolete(old) || is_locked(old) {
|
||||
return Err(ConcurrentUpdateError());
|
||||
}
|
||||
if self
|
||||
.inner
|
||||
.compare_exchange(
|
||||
old,
|
||||
set_locked_bit(old),
|
||||
Ordering::Acquire,
|
||||
Ordering::Relaxed,
|
||||
)
|
||||
.is_err()
|
||||
{
|
||||
return Err(ConcurrentUpdateError());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn write_unlock(&self) {
|
||||
// reset locked bit and overflow into version
|
||||
self.inner.fetch_add(2, Ordering::Release);
|
||||
}
|
||||
|
||||
pub(crate) fn write_unlock_obsolete(&self) {
|
||||
// set obsolete, reset locked, overflow into version
|
||||
self.inner.fetch_add(3, Ordering::Release);
|
||||
}
|
||||
|
||||
// Helper functions
|
||||
fn await_node_unlocked(&self) -> u64 {
|
||||
let mut version = self.inner.load(Ordering::Acquire);
|
||||
while is_locked(version) {
|
||||
// spinlock
|
||||
std::thread::yield_now();
|
||||
version = self.inner.load(Ordering::Acquire)
|
||||
}
|
||||
version
|
||||
}
|
||||
}
|
||||
|
||||
fn set_locked_bit(version: u64) -> u64 {
|
||||
return version + 2;
|
||||
}
|
||||
|
||||
fn is_obsolete(version: u64) -> bool {
|
||||
return (version & 1) == 1;
|
||||
}
|
||||
|
||||
fn is_locked(version: u64) -> bool {
|
||||
return (version & 2) == 2;
|
||||
}
|
||||
1102
libs/neonart/src/algorithm/node_ptr.rs
Normal file
1102
libs/neonart/src/algorithm/node_ptr.rs
Normal file
File diff suppressed because it is too large
Load Diff
349
libs/neonart/src/algorithm/node_ref.rs
Normal file
349
libs/neonart/src/algorithm/node_ref.rs
Normal file
@@ -0,0 +1,349 @@
|
||||
use std::fmt::Debug;
|
||||
use std::marker::PhantomData;
|
||||
|
||||
use super::node_ptr;
|
||||
use super::node_ptr::NodePtr;
|
||||
use crate::EpochPin;
|
||||
use crate::Value;
|
||||
use crate::algorithm::lock_and_version::AtomicLockAndVersion;
|
||||
use crate::algorithm::lock_and_version::ConcurrentUpdateError;
|
||||
use crate::allocator::ArtAllocator;
|
||||
use crate::allocator::OutOfMemoryError;
|
||||
|
||||
pub struct NodeRef<'e, V> {
|
||||
ptr: NodePtr<V>,
|
||||
|
||||
phantom: PhantomData<&'e EpochPin<'e>>,
|
||||
}
|
||||
|
||||
impl<'e, V> Debug for NodeRef<'e, V> {
|
||||
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
|
||||
write!(fmt, "{:?}", self.ptr)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'e, V: Value> NodeRef<'e, V> {
|
||||
pub(crate) fn from_root_ptr(root_ptr: NodePtr<V>) -> NodeRef<'e, V> {
|
||||
NodeRef {
|
||||
ptr: root_ptr,
|
||||
phantom: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn read_lock_or_restart(
|
||||
&self,
|
||||
) -> Result<ReadLockedNodeRef<'e, V>, ConcurrentUpdateError> {
|
||||
let version = self.lockword().read_lock_or_restart()?;
|
||||
Ok(ReadLockedNodeRef {
|
||||
ptr: self.ptr,
|
||||
version,
|
||||
phantom: self.phantom,
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn write_lock_or_restart(
|
||||
&self,
|
||||
) -> Result<WriteLockedNodeRef<'e, V>, ConcurrentUpdateError> {
|
||||
self.lockword().write_lock_or_restart()?;
|
||||
Ok(WriteLockedNodeRef {
|
||||
ptr: self.ptr,
|
||||
phantom: self.phantom,
|
||||
})
|
||||
}
|
||||
|
||||
fn lockword(&self) -> &AtomicLockAndVersion {
|
||||
self.ptr.lockword()
|
||||
}
|
||||
}
|
||||
|
||||
/// A reference to a node that has been optimistically read-locked. The functions re-check
|
||||
/// the version after each read.
|
||||
pub struct ReadLockedNodeRef<'e, V> {
|
||||
ptr: NodePtr<V>,
|
||||
version: u64,
|
||||
|
||||
phantom: PhantomData<&'e EpochPin<'e>>,
|
||||
}
|
||||
|
||||
impl<'e, V: Value> ReadLockedNodeRef<'e, V> {
|
||||
pub(crate) fn is_leaf(&self) -> bool {
|
||||
self.ptr.is_leaf()
|
||||
}
|
||||
|
||||
pub(crate) fn is_full(&self) -> bool {
|
||||
self.ptr.is_full()
|
||||
}
|
||||
|
||||
pub(crate) fn get_prefix(&self) -> &[u8] {
|
||||
self.ptr.get_prefix()
|
||||
}
|
||||
|
||||
/// Note: because we're only holding a read lock, the prefix can change concurrently.
|
||||
/// You must be prepared to restart, if read_unlock() returns error later.
|
||||
///
|
||||
/// Returns the length of the prefix, or None if it's not a match
|
||||
pub(crate) fn prefix_matches(&self, key: &[u8]) -> Option<usize> {
|
||||
self.ptr.prefix_matches(key)
|
||||
}
|
||||
|
||||
pub(crate) fn find_child_or_restart(
|
||||
&self,
|
||||
key_byte: u8,
|
||||
) -> Result<Option<NodeRef<'e, V>>, ConcurrentUpdateError> {
|
||||
let child_or_value = self.ptr.find_child(key_byte);
|
||||
self.ptr.lockword().check_or_restart(self.version)?;
|
||||
|
||||
match child_or_value {
|
||||
None => Ok(None),
|
||||
Some(child_ptr) => Ok(Some(NodeRef {
|
||||
ptr: child_ptr,
|
||||
phantom: self.phantom,
|
||||
})),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn find_next_child_or_restart(
|
||||
&self,
|
||||
min_key_byte: u8,
|
||||
) -> Result<Option<(u8, NodeRef<'e, V>)>, ConcurrentUpdateError> {
|
||||
let child_or_value = self.ptr.find_next_child(min_key_byte);
|
||||
self.ptr.lockword().check_or_restart(self.version)?;
|
||||
|
||||
match child_or_value {
|
||||
None => Ok(None),
|
||||
Some((k, child_ptr)) => Ok(Some((
|
||||
k,
|
||||
NodeRef {
|
||||
ptr: child_ptr,
|
||||
phantom: self.phantom,
|
||||
},
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_leaf_value_ptr(&self) -> Result<*const V, ConcurrentUpdateError> {
|
||||
let result = self.ptr.get_leaf_value();
|
||||
self.ptr.lockword().check_or_restart(self.version)?;
|
||||
|
||||
// Extend the lifetime.
|
||||
let result = std::ptr::from_ref(result);
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
pub(crate) fn upgrade_to_write_lock_or_restart(
|
||||
self,
|
||||
) -> Result<WriteLockedNodeRef<'e, V>, ConcurrentUpdateError> {
|
||||
self.ptr
|
||||
.lockword()
|
||||
.upgrade_to_write_lock_or_restart(self.version)?;
|
||||
|
||||
Ok(WriteLockedNodeRef {
|
||||
ptr: self.ptr,
|
||||
phantom: self.phantom,
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn read_unlock_or_restart(self) -> Result<(), ConcurrentUpdateError> {
|
||||
self.ptr.lockword().check_or_restart(self.version)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn check_or_restart(&self) -> Result<(), ConcurrentUpdateError> {
|
||||
self.ptr.lockword().check_or_restart(self.version)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// A reference to a node that has been optimistically read-locked. The functions re-check
|
||||
/// the version after each read.
|
||||
pub struct WriteLockedNodeRef<'e, V> {
|
||||
ptr: NodePtr<V>,
|
||||
phantom: PhantomData<&'e EpochPin<'e>>,
|
||||
}
|
||||
|
||||
impl<'e, V: Value> WriteLockedNodeRef<'e, V> {
|
||||
pub(crate) fn can_shrink(&self) -> bool {
|
||||
self.ptr.can_shrink()
|
||||
}
|
||||
|
||||
pub(crate) fn num_children(&self) -> usize {
|
||||
self.ptr.num_children()
|
||||
}
|
||||
|
||||
pub(crate) fn write_unlock(mut self) {
|
||||
self.ptr.lockword().write_unlock();
|
||||
self.ptr = NodePtr::null();
|
||||
}
|
||||
|
||||
pub(crate) fn write_unlock_obsolete(mut self) {
|
||||
self.ptr.lockword().write_unlock_obsolete();
|
||||
self.ptr = NodePtr::null();
|
||||
}
|
||||
|
||||
pub(crate) fn get_prefix(&self) -> &[u8] {
|
||||
self.ptr.get_prefix()
|
||||
}
|
||||
|
||||
pub(crate) fn truncate_prefix(&mut self, new_prefix_len: usize) {
|
||||
self.ptr.truncate_prefix(new_prefix_len)
|
||||
}
|
||||
|
||||
pub(crate) fn prepend_prefix(&mut self, prefix: &[u8], prefix_byte: u8) {
|
||||
self.ptr.prepend_prefix(prefix, prefix_byte)
|
||||
}
|
||||
|
||||
pub(crate) fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
|
||||
self.ptr.insert_child(key_byte, child)
|
||||
}
|
||||
|
||||
pub(crate) fn get_leaf_value_mut(&mut self) -> &mut V {
|
||||
self.ptr.get_leaf_value_mut()
|
||||
}
|
||||
|
||||
pub(crate) fn grow<'a, A>(
|
||||
&self,
|
||||
allocator: &'a A,
|
||||
) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
|
||||
where
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
let new_node = self.ptr.grow(allocator)?;
|
||||
Ok(NewNodeRef {
|
||||
ptr: new_node,
|
||||
allocator,
|
||||
extra_nodes: Vec::new(),
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn shrink<'a, A>(
|
||||
&self,
|
||||
allocator: &'a A,
|
||||
) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
|
||||
where
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
let new_node = self.ptr.shrink(allocator)?;
|
||||
Ok(NewNodeRef {
|
||||
ptr: new_node,
|
||||
allocator,
|
||||
extra_nodes: Vec::new(),
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn as_ptr(&self) -> NodePtr<V> {
|
||||
self.ptr
|
||||
}
|
||||
|
||||
pub(crate) fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
|
||||
self.ptr.replace_child(key_byte, replacement);
|
||||
}
|
||||
|
||||
pub(crate) fn delete_child(&mut self, key_byte: u8) {
|
||||
self.ptr.delete_child(key_byte);
|
||||
}
|
||||
|
||||
pub(crate) fn find_remaining_child(&self) -> (u8, NodeRef<'e, V>) {
|
||||
assert_eq!(self.num_children(), 1);
|
||||
let child_or_value = self.ptr.find_next_child(0);
|
||||
|
||||
match child_or_value {
|
||||
None => panic!("could not find only child in node"),
|
||||
Some((k, child_ptr)) => (
|
||||
k,
|
||||
NodeRef {
|
||||
ptr: child_ptr,
|
||||
phantom: self.phantom,
|
||||
},
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'e, V> Drop for WriteLockedNodeRef<'e, V> {
|
||||
fn drop(&mut self) {
|
||||
if !self.ptr.is_null() {
|
||||
self.ptr.lockword().write_unlock();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct NewNodeRef<'a, V, A>
|
||||
where
|
||||
V: Value,
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
ptr: NodePtr<V>,
|
||||
allocator: &'a A,
|
||||
|
||||
extra_nodes: Vec<NodePtr<V>>,
|
||||
}
|
||||
|
||||
impl<'a, V, A> NewNodeRef<'a, V, A>
|
||||
where
|
||||
V: Value,
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
pub(crate) fn insert_old_child(&mut self, key_byte: u8, child: &WriteLockedNodeRef<V>) {
|
||||
self.ptr.insert_child(key_byte, child.as_ptr())
|
||||
}
|
||||
|
||||
pub(crate) fn into_ptr(mut self) -> NodePtr<V> {
|
||||
let ptr = self.ptr;
|
||||
self.ptr = NodePtr::null();
|
||||
ptr
|
||||
}
|
||||
|
||||
pub(crate) fn insert_new_child(&mut self, key_byte: u8, child: NewNodeRef<'a, V, A>) {
|
||||
let child_ptr = child.into_ptr();
|
||||
self.ptr.insert_child(key_byte, child_ptr);
|
||||
self.extra_nodes.push(child_ptr);
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, V, A> Drop for NewNodeRef<'a, V, A>
|
||||
where
|
||||
V: Value,
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
/// This drop implementation deallocates the newly allocated node, if into_ptr() was not called.
|
||||
fn drop(&mut self) {
|
||||
if !self.ptr.is_null() {
|
||||
self.ptr.deallocate(self.allocator);
|
||||
for p in self.extra_nodes.iter() {
|
||||
p.deallocate(self.allocator);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn new_internal<'a, V, A>(
|
||||
prefix: &[u8],
|
||||
allocator: &'a A,
|
||||
) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
|
||||
where
|
||||
V: Value,
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
Ok(NewNodeRef {
|
||||
ptr: node_ptr::new_internal(prefix, allocator)?,
|
||||
allocator,
|
||||
extra_nodes: Vec::new(),
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn new_leaf<'a, V, A>(
|
||||
prefix: &[u8],
|
||||
value: V,
|
||||
allocator: &'a A,
|
||||
) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
|
||||
where
|
||||
V: Value,
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
Ok(NewNodeRef {
|
||||
ptr: node_ptr::new_leaf(prefix, value, allocator)?,
|
||||
allocator,
|
||||
extra_nodes: Vec::new(),
|
||||
})
|
||||
}
|
||||
158
libs/neonart/src/allocator.rs
Normal file
158
libs/neonart/src/allocator.rs
Normal file
@@ -0,0 +1,158 @@
|
||||
pub mod block;
|
||||
mod multislab;
|
||||
mod slab;
|
||||
pub mod r#static;
|
||||
|
||||
use std::alloc::Layout;
|
||||
use std::marker::PhantomData;
|
||||
use std::mem::MaybeUninit;
|
||||
use std::sync::atomic::Ordering;
|
||||
|
||||
use crate::allocator::multislab::MultiSlabAllocator;
|
||||
use crate::allocator::r#static::alloc_from_slice;
|
||||
|
||||
use spin;
|
||||
|
||||
use crate::Tree;
|
||||
pub use crate::algorithm::node_ptr::{
|
||||
NodeInternal4, NodeInternal16, NodeInternal48, NodeInternal256, NodeLeaf,
|
||||
};
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct OutOfMemoryError();
|
||||
|
||||
pub trait ArtAllocator<V: crate::Value> {
|
||||
fn alloc_tree(&self) -> *mut Tree<V>;
|
||||
|
||||
fn alloc_node_internal4(&self) -> *mut NodeInternal4<V>;
|
||||
fn alloc_node_internal16(&self) -> *mut NodeInternal16<V>;
|
||||
fn alloc_node_internal48(&self) -> *mut NodeInternal48<V>;
|
||||
fn alloc_node_internal256(&self) -> *mut NodeInternal256<V>;
|
||||
fn alloc_node_leaf(&self) -> *mut NodeLeaf<V>;
|
||||
|
||||
fn dealloc_node_internal4(&self, ptr: *mut NodeInternal4<V>);
|
||||
fn dealloc_node_internal16(&self, ptr: *mut NodeInternal16<V>);
|
||||
fn dealloc_node_internal48(&self, ptr: *mut NodeInternal48<V>);
|
||||
fn dealloc_node_internal256(&self, ptr: *mut NodeInternal256<V>);
|
||||
fn dealloc_node_leaf(&self, ptr: *mut NodeLeaf<V>);
|
||||
}
|
||||
|
||||
pub struct ArtMultiSlabAllocator<'t, V>
|
||||
where
|
||||
V: crate::Value,
|
||||
{
|
||||
tree_area: spin::Mutex<Option<&'t mut MaybeUninit<Tree<V>>>>,
|
||||
|
||||
pub(crate) inner: MultiSlabAllocator<'t, 5>,
|
||||
|
||||
phantom_val: PhantomData<V>,
|
||||
}
|
||||
|
||||
impl<'t, V: crate::Value> ArtMultiSlabAllocator<'t, V> {
|
||||
const LAYOUTS: [Layout; 5] = [
|
||||
Layout::new::<NodeInternal4<V>>(),
|
||||
Layout::new::<NodeInternal16<V>>(),
|
||||
Layout::new::<NodeInternal48<V>>(),
|
||||
Layout::new::<NodeInternal256<V>>(),
|
||||
Layout::new::<NodeLeaf<V>>(),
|
||||
];
|
||||
|
||||
pub fn new(area: &'t mut [MaybeUninit<u8>]) -> &'t mut ArtMultiSlabAllocator<'t, V> {
|
||||
let (allocator_area, remain) = alloc_from_slice::<ArtMultiSlabAllocator<V>>(area);
|
||||
let (tree_area, remain) = alloc_from_slice::<Tree<V>>(remain);
|
||||
|
||||
let allocator = allocator_area.write(ArtMultiSlabAllocator {
|
||||
tree_area: spin::Mutex::new(Some(tree_area)),
|
||||
inner: MultiSlabAllocator::new(remain, &Self::LAYOUTS),
|
||||
phantom_val: PhantomData,
|
||||
});
|
||||
|
||||
allocator
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t, V: crate::Value> ArtAllocator<V> for ArtMultiSlabAllocator<'t, V> {
|
||||
fn alloc_tree(&self) -> *mut Tree<V> {
|
||||
let mut t = self.tree_area.lock();
|
||||
if let Some(tree_area) = t.take() {
|
||||
return tree_area.as_mut_ptr().cast();
|
||||
}
|
||||
panic!("cannot allocate more than one tree");
|
||||
}
|
||||
|
||||
fn alloc_node_internal4(&self) -> *mut NodeInternal4<V> {
|
||||
self.inner.alloc_slab(0).cast()
|
||||
}
|
||||
fn alloc_node_internal16(&self) -> *mut NodeInternal16<V> {
|
||||
self.inner.alloc_slab(1).cast()
|
||||
}
|
||||
fn alloc_node_internal48(&self) -> *mut NodeInternal48<V> {
|
||||
self.inner.alloc_slab(2).cast()
|
||||
}
|
||||
fn alloc_node_internal256(&self) -> *mut NodeInternal256<V> {
|
||||
self.inner.alloc_slab(3).cast()
|
||||
}
|
||||
fn alloc_node_leaf(&self) -> *mut NodeLeaf<V> {
|
||||
self.inner.alloc_slab(4).cast()
|
||||
}
|
||||
|
||||
fn dealloc_node_internal4(&self, ptr: *mut NodeInternal4<V>) {
|
||||
self.inner.dealloc_slab(0, ptr.cast())
|
||||
}
|
||||
|
||||
fn dealloc_node_internal16(&self, ptr: *mut NodeInternal16<V>) {
|
||||
self.inner.dealloc_slab(1, ptr.cast())
|
||||
}
|
||||
fn dealloc_node_internal48(&self, ptr: *mut NodeInternal48<V>) {
|
||||
self.inner.dealloc_slab(2, ptr.cast())
|
||||
}
|
||||
fn dealloc_node_internal256(&self, ptr: *mut NodeInternal256<V>) {
|
||||
self.inner.dealloc_slab(3, ptr.cast())
|
||||
}
|
||||
fn dealloc_node_leaf(&self, ptr: *mut NodeLeaf<V>) {
|
||||
self.inner.dealloc_slab(4, ptr.cast())
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t, V: crate::Value> ArtMultiSlabAllocator<'t, V> {
|
||||
pub(crate) fn get_statistics(&self) -> ArtMultiSlabStats {
|
||||
ArtMultiSlabStats {
|
||||
num_internal4: self.inner.slab_descs[0]
|
||||
.num_allocated
|
||||
.load(Ordering::Relaxed),
|
||||
num_internal16: self.inner.slab_descs[1]
|
||||
.num_allocated
|
||||
.load(Ordering::Relaxed),
|
||||
num_internal48: self.inner.slab_descs[2]
|
||||
.num_allocated
|
||||
.load(Ordering::Relaxed),
|
||||
num_internal256: self.inner.slab_descs[3]
|
||||
.num_allocated
|
||||
.load(Ordering::Relaxed),
|
||||
num_leaf: self.inner.slab_descs[4]
|
||||
.num_allocated
|
||||
.load(Ordering::Relaxed),
|
||||
|
||||
num_blocks_internal4: self.inner.slab_descs[0].num_blocks.load(Ordering::Relaxed),
|
||||
num_blocks_internal16: self.inner.slab_descs[1].num_blocks.load(Ordering::Relaxed),
|
||||
num_blocks_internal48: self.inner.slab_descs[2].num_blocks.load(Ordering::Relaxed),
|
||||
num_blocks_internal256: self.inner.slab_descs[3].num_blocks.load(Ordering::Relaxed),
|
||||
num_blocks_leaf: self.inner.slab_descs[4].num_blocks.load(Ordering::Relaxed),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct ArtMultiSlabStats {
|
||||
pub num_internal4: u64,
|
||||
pub num_internal16: u64,
|
||||
pub num_internal48: u64,
|
||||
pub num_internal256: u64,
|
||||
pub num_leaf: u64,
|
||||
|
||||
pub num_blocks_internal4: u64,
|
||||
pub num_blocks_internal16: u64,
|
||||
pub num_blocks_internal48: u64,
|
||||
pub num_blocks_internal256: u64,
|
||||
pub num_blocks_leaf: u64,
|
||||
}
|
||||
191
libs/neonart/src/allocator/block.rs
Normal file
191
libs/neonart/src/allocator/block.rs
Normal file
@@ -0,0 +1,191 @@
|
||||
//! Simple allocator of fixed-size blocks
|
||||
|
||||
use std::mem::MaybeUninit;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
|
||||
use spin;
|
||||
|
||||
pub const BLOCK_SIZE: usize = 16 * 1024;
|
||||
|
||||
const INVALID_BLOCK: u64 = u64::MAX;
|
||||
|
||||
pub(crate) struct BlockAllocator<'t> {
|
||||
blocks_ptr: &'t [MaybeUninit<u8>],
|
||||
num_blocks: u64,
|
||||
num_initialized: AtomicU64,
|
||||
|
||||
freelist_head: spin::Mutex<u64>,
|
||||
}
|
||||
|
||||
struct FreeListBlock {
|
||||
inner: spin::Mutex<FreeListBlockInner>,
|
||||
}
|
||||
|
||||
struct FreeListBlockInner {
|
||||
next: u64,
|
||||
|
||||
num_free_blocks: u64,
|
||||
free_blocks: [u64; 100], // FIXME: fill the rest of the block
|
||||
}
|
||||
|
||||
impl<'t> BlockAllocator<'t> {
|
||||
pub(crate) fn new(area: &'t mut [MaybeUninit<u8>]) -> Self {
|
||||
// Use all the space for the blocks
|
||||
let padding = area.as_ptr().align_offset(BLOCK_SIZE);
|
||||
let remain = &mut area[padding..];
|
||||
|
||||
let num_blocks = (remain.len() / BLOCK_SIZE) as u64;
|
||||
|
||||
BlockAllocator {
|
||||
blocks_ptr: remain,
|
||||
num_blocks,
|
||||
num_initialized: AtomicU64::new(0),
|
||||
freelist_head: spin::Mutex::new(INVALID_BLOCK),
|
||||
}
|
||||
}
|
||||
|
||||
/// safety: you must hold a lock on the pointer to this block, otherwise it might get
|
||||
/// reused for another kind of block
|
||||
fn read_freelist_block(&self, blkno: u64) -> &FreeListBlock {
|
||||
let ptr: *const FreeListBlock = self.get_block_ptr(blkno).cast();
|
||||
unsafe { ptr.as_ref().unwrap() }
|
||||
}
|
||||
|
||||
fn get_block_ptr(&self, blkno: u64) -> *mut u8 {
|
||||
assert!(blkno < self.num_blocks);
|
||||
unsafe {
|
||||
self.blocks_ptr
|
||||
.as_ptr()
|
||||
.byte_offset(blkno as isize * BLOCK_SIZE as isize)
|
||||
}
|
||||
.cast_mut()
|
||||
.cast()
|
||||
}
|
||||
|
||||
#[allow(clippy::mut_from_ref)]
|
||||
pub(crate) fn alloc_block(&self) -> &mut [MaybeUninit<u8>] {
|
||||
// FIXME: handle OOM
|
||||
let blkno = self.alloc_block_internal();
|
||||
if blkno == INVALID_BLOCK {
|
||||
panic!("out of memory");
|
||||
}
|
||||
|
||||
let ptr: *mut MaybeUninit<u8> = self.get_block_ptr(blkno).cast();
|
||||
unsafe { std::slice::from_raw_parts_mut(ptr, BLOCK_SIZE) }
|
||||
}
|
||||
|
||||
fn alloc_block_internal(&self) -> u64 {
|
||||
// check the free list.
|
||||
{
|
||||
let mut freelist_head = self.freelist_head.lock();
|
||||
if *freelist_head != INVALID_BLOCK {
|
||||
let freelist_block = self.read_freelist_block(*freelist_head);
|
||||
|
||||
// acquire lock on the freelist block before releasing the lock on the parent (i.e. lock coupling)
|
||||
let mut g = freelist_block.inner.lock();
|
||||
|
||||
if g.num_free_blocks > 0 {
|
||||
g.num_free_blocks -= 1;
|
||||
let result = g.free_blocks[g.num_free_blocks as usize];
|
||||
return result;
|
||||
} else {
|
||||
// consume the freelist block itself
|
||||
let result = *freelist_head;
|
||||
*freelist_head = g.next;
|
||||
// This freelist block is now unlinked and can be repurposed
|
||||
drop(g);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If there are some blocks left that we've never used, pick next such block
|
||||
let mut next_uninitialized = self.num_initialized.load(Ordering::Relaxed);
|
||||
while next_uninitialized < self.num_blocks {
|
||||
match self.num_initialized.compare_exchange(
|
||||
next_uninitialized,
|
||||
next_uninitialized + 1,
|
||||
Ordering::Relaxed,
|
||||
Ordering::Relaxed,
|
||||
) {
|
||||
Ok(_) => {
|
||||
return next_uninitialized;
|
||||
}
|
||||
Err(old) => {
|
||||
next_uninitialized = old;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// out of blocks
|
||||
return INVALID_BLOCK;
|
||||
}
|
||||
|
||||
// TODO: this is currently unused. The slab allocator never releases blocks
|
||||
#[allow(dead_code)]
|
||||
pub(crate) fn release_block(&self, block_ptr: *mut u8) {
|
||||
let blockno = unsafe { block_ptr.byte_offset_from(self.blocks_ptr) / BLOCK_SIZE as isize };
|
||||
self.release_block_internal(blockno as u64);
|
||||
}
|
||||
|
||||
fn release_block_internal(&self, blockno: u64) {
|
||||
let mut freelist_head = self.freelist_head.lock();
|
||||
if *freelist_head != INVALID_BLOCK {
|
||||
let freelist_block = self.read_freelist_block(*freelist_head);
|
||||
|
||||
// acquire lock on the freelist block before releasing the lock on the parent (i.e. lock coupling)
|
||||
let mut g = freelist_block.inner.lock();
|
||||
|
||||
let num_free_blocks = g.num_free_blocks;
|
||||
if num_free_blocks < g.free_blocks.len() as u64 {
|
||||
g.free_blocks[num_free_blocks as usize] = blockno;
|
||||
g.num_free_blocks += 1;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Convert the block into a new freelist block
|
||||
let block_ptr: *mut FreeListBlock = self.get_block_ptr(blockno).cast();
|
||||
let init = FreeListBlock {
|
||||
inner: spin::Mutex::new(FreeListBlockInner {
|
||||
next: *freelist_head,
|
||||
num_free_blocks: 0,
|
||||
free_blocks: [INVALID_BLOCK; 100],
|
||||
}),
|
||||
};
|
||||
unsafe { (*block_ptr) = init };
|
||||
*freelist_head = blockno;
|
||||
}
|
||||
|
||||
// for debugging
|
||||
pub(crate) fn get_statistics(&self) -> BlockAllocatorStats {
|
||||
let mut num_free_blocks = 0;
|
||||
|
||||
let mut _prev_lock = None;
|
||||
let head_lock = self.freelist_head.lock();
|
||||
let mut next_blk = *head_lock;
|
||||
let mut _head_lock = Some(head_lock);
|
||||
while next_blk != INVALID_BLOCK {
|
||||
let freelist_block = self.read_freelist_block(next_blk);
|
||||
let lock = freelist_block.inner.lock();
|
||||
num_free_blocks += lock.num_free_blocks;
|
||||
next_blk = lock.next;
|
||||
_prev_lock = Some(lock); // hold the lock until we've read the next block
|
||||
_head_lock = None;
|
||||
}
|
||||
|
||||
BlockAllocatorStats {
|
||||
num_blocks: self.num_blocks,
|
||||
num_initialized: self.num_initialized.load(Ordering::Relaxed),
|
||||
num_free_blocks,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct BlockAllocatorStats {
|
||||
pub num_blocks: u64,
|
||||
pub num_initialized: u64,
|
||||
pub num_free_blocks: u64,
|
||||
}
|
||||
33
libs/neonart/src/allocator/multislab.rs
Normal file
33
libs/neonart/src/allocator/multislab.rs
Normal file
@@ -0,0 +1,33 @@
|
||||
use std::alloc::Layout;
|
||||
use std::mem::MaybeUninit;
|
||||
|
||||
use crate::allocator::block::BlockAllocator;
|
||||
use crate::allocator::slab::SlabDesc;
|
||||
|
||||
pub struct MultiSlabAllocator<'t, const N: usize> {
|
||||
pub(crate) block_allocator: BlockAllocator<'t>,
|
||||
|
||||
pub(crate) slab_descs: [SlabDesc; N],
|
||||
}
|
||||
|
||||
impl<'t, const N: usize> MultiSlabAllocator<'t, N> {
|
||||
pub(crate) fn new(
|
||||
area: &'t mut [MaybeUninit<u8>],
|
||||
layouts: &[Layout; N],
|
||||
) -> MultiSlabAllocator<'t, N> {
|
||||
let block_allocator = BlockAllocator::new(area);
|
||||
MultiSlabAllocator {
|
||||
block_allocator,
|
||||
|
||||
slab_descs: std::array::from_fn(|i| SlabDesc::new(&layouts[i])),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn alloc_slab(&self, slab_idx: usize) -> *mut u8 {
|
||||
self.slab_descs[slab_idx].alloc_chunk(&self.block_allocator)
|
||||
}
|
||||
|
||||
pub(crate) fn dealloc_slab(&self, slab_idx: usize, ptr: *mut u8) {
|
||||
self.slab_descs[slab_idx].dealloc_chunk(ptr, &self.block_allocator)
|
||||
}
|
||||
}
|
||||
432
libs/neonart/src/allocator/slab.rs
Normal file
432
libs/neonart/src/allocator/slab.rs
Normal file
@@ -0,0 +1,432 @@
|
||||
//! A slab allocator that carves out fixed-size chunks from larger blocks.
|
||||
//!
|
||||
//!
|
||||
|
||||
use std::alloc::Layout;
|
||||
use std::mem::MaybeUninit;
|
||||
use std::ops::Deref;
|
||||
use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
|
||||
|
||||
use spin;
|
||||
|
||||
use super::alloc_from_slice;
|
||||
use super::block::BlockAllocator;
|
||||
|
||||
use crate::allocator::block::BLOCK_SIZE;
|
||||
|
||||
pub(crate) struct SlabDesc {
|
||||
pub(crate) layout: Layout,
|
||||
|
||||
block_lists: spin::RwLock<BlockLists>,
|
||||
|
||||
pub(crate) num_blocks: AtomicU64,
|
||||
pub(crate) num_allocated: AtomicU64,
|
||||
}
|
||||
|
||||
// FIXME: Not sure if SlabDesc is really Sync or Send. It probably is when it's empty, but
|
||||
// 'block_lists' contains pointers when it's not empty. In the current use as part of the
|
||||
// the art tree, SlabDescs are only moved during initialization.
|
||||
unsafe impl Sync for SlabDesc {}
|
||||
unsafe impl Send for SlabDesc {}
|
||||
|
||||
#[derive(Default, Debug)]
|
||||
struct BlockLists {
|
||||
full_blocks: BlockList,
|
||||
nonfull_blocks: BlockList,
|
||||
}
|
||||
|
||||
impl BlockLists {
|
||||
// Unlink a node. It must be in either one of the two lists.
|
||||
unsafe fn unlink(&mut self, elem: *mut SlabBlockHeader) {
|
||||
let list = unsafe {
|
||||
if (*elem).next.is_null() {
|
||||
if self.full_blocks.tail == elem {
|
||||
Some(&mut self.full_blocks)
|
||||
} else {
|
||||
Some(&mut self.nonfull_blocks)
|
||||
}
|
||||
} else if (*elem).prev.is_null() {
|
||||
if self.full_blocks.head == elem {
|
||||
Some(&mut self.full_blocks)
|
||||
} else {
|
||||
Some(&mut self.nonfull_blocks)
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
};
|
||||
unsafe { unlink_slab_block(list, elem) };
|
||||
}
|
||||
}
|
||||
|
||||
unsafe fn unlink_slab_block(mut list: Option<&mut BlockList>, elem: *mut SlabBlockHeader) {
|
||||
unsafe {
|
||||
if (*elem).next.is_null() {
|
||||
assert_eq!(list.as_ref().unwrap().tail, elem);
|
||||
list.as_mut().unwrap().tail = (*elem).prev;
|
||||
} else {
|
||||
assert_eq!((*(*elem).next).prev, elem);
|
||||
(*(*elem).next).prev = (*elem).prev;
|
||||
}
|
||||
if (*elem).prev.is_null() {
|
||||
assert_eq!(list.as_ref().unwrap().head, elem);
|
||||
list.as_mut().unwrap().head = (*elem).next;
|
||||
} else {
|
||||
assert_eq!((*(*elem).prev).next, elem);
|
||||
(*(*elem).prev).next = (*elem).next;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct BlockList {
|
||||
head: *mut SlabBlockHeader,
|
||||
tail: *mut SlabBlockHeader,
|
||||
}
|
||||
|
||||
impl Default for BlockList {
|
||||
fn default() -> Self {
|
||||
BlockList {
|
||||
head: std::ptr::null_mut(),
|
||||
tail: std::ptr::null_mut(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl BlockList {
|
||||
unsafe fn push_head(&mut self, elem: *mut SlabBlockHeader) {
|
||||
unsafe {
|
||||
if self.is_empty() {
|
||||
self.tail = elem;
|
||||
(*elem).next = std::ptr::null_mut();
|
||||
} else {
|
||||
(*elem).next = self.head;
|
||||
(*self.head).prev = elem;
|
||||
}
|
||||
(*elem).prev = std::ptr::null_mut();
|
||||
self.head = elem;
|
||||
}
|
||||
}
|
||||
|
||||
fn is_empty(&self) -> bool {
|
||||
self.head.is_null()
|
||||
}
|
||||
|
||||
unsafe fn unlink(&mut self, elem: *mut SlabBlockHeader) {
|
||||
unsafe { unlink_slab_block(Some(self), elem) }
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
fn dump(&self) {
|
||||
let mut next = self.head;
|
||||
|
||||
while !next.is_null() {
|
||||
let n = unsafe { next.as_ref() }.unwrap();
|
||||
eprintln!(
|
||||
" blk {:?} (free {}/{})",
|
||||
next,
|
||||
n.num_free_chunks.load(Ordering::Relaxed),
|
||||
n.num_chunks
|
||||
);
|
||||
next = n.next;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl SlabDesc {
|
||||
pub(crate) fn new(layout: &Layout) -> SlabDesc {
|
||||
SlabDesc {
|
||||
layout: *layout,
|
||||
block_lists: spin::RwLock::new(BlockLists::default()),
|
||||
num_allocated: AtomicU64::new(0),
|
||||
num_blocks: AtomicU64::new(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct SlabBlockHeader {
|
||||
free_chunks_head: spin::Mutex<*mut FreeChunk>,
|
||||
num_free_chunks: AtomicU32,
|
||||
num_chunks: u32, // this is really a constant for a given Layout
|
||||
|
||||
// these fields are protected by the lock on the BlockLists
|
||||
prev: *mut SlabBlockHeader,
|
||||
next: *mut SlabBlockHeader,
|
||||
}
|
||||
|
||||
struct FreeChunk {
|
||||
next: *mut FreeChunk,
|
||||
}
|
||||
|
||||
enum ReadOrWriteGuard<'a, T> {
|
||||
Read(spin::RwLockReadGuard<'a, T>),
|
||||
Write(spin::RwLockWriteGuard<'a, T>),
|
||||
}
|
||||
|
||||
impl<'a, T> Deref for ReadOrWriteGuard<'a, T> {
|
||||
type Target = T;
|
||||
|
||||
fn deref(&self) -> &<Self as Deref>::Target {
|
||||
match self {
|
||||
ReadOrWriteGuard::Read(g) => g.deref(),
|
||||
ReadOrWriteGuard::Write(g) => g.deref(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl SlabDesc {
|
||||
pub fn alloc_chunk(&self, block_allocator: &BlockAllocator) -> *mut u8 {
|
||||
// Are there any free chunks?
|
||||
let mut acquire_write = false;
|
||||
'outer: loop {
|
||||
let mut block_lists_guard = if acquire_write {
|
||||
ReadOrWriteGuard::Write(self.block_lists.write())
|
||||
} else {
|
||||
ReadOrWriteGuard::Read(self.block_lists.read())
|
||||
};
|
||||
'inner: loop {
|
||||
let block_ptr = block_lists_guard.nonfull_blocks.head;
|
||||
if block_ptr.is_null() {
|
||||
break 'outer;
|
||||
}
|
||||
unsafe {
|
||||
let mut free_chunks_head = (*block_ptr).free_chunks_head.lock();
|
||||
if !(*free_chunks_head).is_null() {
|
||||
let result = *free_chunks_head;
|
||||
(*free_chunks_head) = (*result).next;
|
||||
let _old = (*block_ptr).num_free_chunks.fetch_sub(1, Ordering::Relaxed);
|
||||
|
||||
self.num_allocated.fetch_add(1, Ordering::Relaxed);
|
||||
return result.cast();
|
||||
}
|
||||
}
|
||||
|
||||
// The block at the head of the list was full. Grab write lock and retry
|
||||
match block_lists_guard {
|
||||
ReadOrWriteGuard::Read(_) => {
|
||||
acquire_write = true;
|
||||
continue 'outer;
|
||||
}
|
||||
ReadOrWriteGuard::Write(ref mut g) => {
|
||||
// move the node to the list of full blocks
|
||||
unsafe {
|
||||
g.nonfull_blocks.unlink(block_ptr);
|
||||
g.full_blocks.push_head(block_ptr);
|
||||
};
|
||||
continue 'inner;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// no free chunks. Allocate a new block (and the chunk from that)
|
||||
let (new_block, new_chunk) = self.alloc_block_and_chunk(block_allocator);
|
||||
self.num_blocks.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
// Add the block to the list in the SlabDesc
|
||||
unsafe {
|
||||
let mut block_lists_guard = self.block_lists.write();
|
||||
block_lists_guard.nonfull_blocks.push_head(new_block);
|
||||
}
|
||||
self.num_allocated.fetch_add(1, Ordering::Relaxed);
|
||||
new_chunk
|
||||
}
|
||||
|
||||
pub fn dealloc_chunk(&self, chunk_ptr: *mut u8, _block_allocator: &BlockAllocator) {
|
||||
// Find the block it belongs to. You can find the block from the address. (And knowing the
|
||||
// layout, you could calculate the chunk number too.)
|
||||
let block_ptr: *mut SlabBlockHeader = {
|
||||
let block_addr = (chunk_ptr.addr() / BLOCK_SIZE) * BLOCK_SIZE;
|
||||
chunk_ptr.with_addr(block_addr).cast()
|
||||
};
|
||||
let chunk_ptr: *mut FreeChunk = chunk_ptr.cast();
|
||||
|
||||
// Mark the chunk as free in 'freechunks' list
|
||||
let num_chunks;
|
||||
let num_free_chunks;
|
||||
unsafe {
|
||||
let mut free_chunks_head = (*block_ptr).free_chunks_head.lock();
|
||||
(*chunk_ptr).next = *free_chunks_head;
|
||||
*free_chunks_head = chunk_ptr;
|
||||
|
||||
num_free_chunks = (*block_ptr).num_free_chunks.fetch_add(1, Ordering::Relaxed) + 1;
|
||||
num_chunks = (*block_ptr).num_chunks;
|
||||
}
|
||||
|
||||
if num_free_chunks == 1 {
|
||||
// If the block was full previously, add it to the nonfull blocks list. Note that
|
||||
// we're not holding the lock anymore, so it can immediately become full again.
|
||||
// That's harmless, it will be moved back to the full list again when a call
|
||||
// to alloc_chunk() sees it.
|
||||
let mut block_lists = self.block_lists.write();
|
||||
unsafe {
|
||||
block_lists.unlink(block_ptr);
|
||||
block_lists.nonfull_blocks.push_head(block_ptr);
|
||||
};
|
||||
} else if num_free_chunks == num_chunks {
|
||||
// If the block became completely empty, move it to the free list
|
||||
// TODO
|
||||
// FIXME: we're still holding the spinlock. It's not exactly safe to return it to
|
||||
// the free blocks list, is it? Defer it as garbage to wait out concurrent updates?
|
||||
//block_allocator.release_block()
|
||||
}
|
||||
|
||||
// update stats
|
||||
self.num_allocated.fetch_sub(1, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
fn alloc_block_and_chunk(
|
||||
&self,
|
||||
block_allocator: &BlockAllocator,
|
||||
) -> (*mut SlabBlockHeader, *mut u8) {
|
||||
// fixme: handle OOM
|
||||
let block_slice: &mut [MaybeUninit<u8>] = block_allocator.alloc_block();
|
||||
let (block_header, remain) = alloc_from_slice::<SlabBlockHeader>(block_slice);
|
||||
|
||||
let padding = remain.as_ptr().align_offset(self.layout.align());
|
||||
|
||||
let num_chunks = (remain.len() - padding) / self.layout.size();
|
||||
|
||||
let first_chunk_ptr: *mut FreeChunk = remain[padding..].as_mut_ptr().cast();
|
||||
|
||||
unsafe {
|
||||
let mut chunk_ptr = first_chunk_ptr;
|
||||
for _ in 0..num_chunks - 1 {
|
||||
let next_chunk_ptr = chunk_ptr.byte_add(self.layout.size());
|
||||
(*chunk_ptr).next = next_chunk_ptr;
|
||||
chunk_ptr = next_chunk_ptr;
|
||||
}
|
||||
(*chunk_ptr).next = std::ptr::null_mut();
|
||||
|
||||
let result_chunk = first_chunk_ptr;
|
||||
|
||||
let block_header = block_header.write(SlabBlockHeader {
|
||||
free_chunks_head: spin::Mutex::new((*first_chunk_ptr).next),
|
||||
prev: std::ptr::null_mut(),
|
||||
next: std::ptr::null_mut(),
|
||||
num_chunks: num_chunks as u32,
|
||||
num_free_chunks: AtomicU32::new(num_chunks as u32 - 1),
|
||||
});
|
||||
|
||||
(block_header, result_chunk.cast())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
fn dump(&self) {
|
||||
eprintln!(
|
||||
"slab dump ({} blocks, {} allocated chunks)",
|
||||
self.num_blocks.load(Ordering::Relaxed),
|
||||
self.num_allocated.load(Ordering::Relaxed)
|
||||
);
|
||||
let lists = self.block_lists.read();
|
||||
|
||||
eprintln!("nonfull blocks:");
|
||||
lists.nonfull_blocks.dump();
|
||||
eprintln!("full blocks:");
|
||||
lists.full_blocks.dump();
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
use rand::Rng;
|
||||
use rand_distr::Zipf;
|
||||
|
||||
struct TestObject {
|
||||
val: usize,
|
||||
_dummy: [u8; BLOCK_SIZE / 4],
|
||||
}
|
||||
|
||||
struct TestObjectSlab<'a>(SlabDesc, BlockAllocator<'a>);
|
||||
impl<'a> TestObjectSlab<'a> {
|
||||
fn new(block_allocator: BlockAllocator) -> TestObjectSlab {
|
||||
TestObjectSlab(SlabDesc::new(&Layout::new::<TestObject>()), block_allocator)
|
||||
}
|
||||
|
||||
fn alloc(&self, val: usize) -> *mut TestObject {
|
||||
let obj: *mut TestObject = self.0.alloc_chunk(&self.1).cast();
|
||||
unsafe { (*obj).val = val };
|
||||
obj
|
||||
}
|
||||
|
||||
fn dealloc(&self, obj: *mut TestObject) {
|
||||
self.0.dealloc_chunk(obj.cast(), &self.1)
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_slab_alloc() {
|
||||
const MEM_SIZE: usize = 100000000;
|
||||
let mut area = Box::new_uninit_slice(MEM_SIZE);
|
||||
let block_allocator = BlockAllocator::new(&mut area);
|
||||
|
||||
let slab = TestObjectSlab::new(block_allocator);
|
||||
|
||||
let mut all: Vec<*mut TestObject> = Vec::new();
|
||||
for i in 0..11 {
|
||||
all.push(slab.alloc(i));
|
||||
}
|
||||
for i in 0..11 {
|
||||
assert!(unsafe { (*all[i]).val == i });
|
||||
}
|
||||
|
||||
let distribution = Zipf::new(10 as f64, 1.1).unwrap();
|
||||
let mut rng = rand::rng();
|
||||
for _ in 0..100000 {
|
||||
slab.0.dump();
|
||||
let idx = (rng.sample(distribution) as usize).into();
|
||||
let ptr: *mut TestObject = all[idx];
|
||||
if !ptr.is_null() {
|
||||
assert_eq!(unsafe { (*ptr).val }, idx);
|
||||
slab.dealloc(ptr);
|
||||
all[idx] = std::ptr::null_mut();
|
||||
} else {
|
||||
all[idx] = slab.alloc(idx);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn new_test_blk(i: u32) -> *mut SlabBlockHeader {
|
||||
Box::into_raw(Box::new(SlabBlockHeader {
|
||||
free_chunks_head: spin::Mutex::new(std::ptr::null_mut()),
|
||||
num_free_chunks: AtomicU32::new(0),
|
||||
num_chunks: i,
|
||||
prev: std::ptr::null_mut(),
|
||||
next: std::ptr::null_mut(),
|
||||
}))
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_block_linked_list() {
|
||||
// note: these are leaked, but that's OK for tests
|
||||
let a = new_test_blk(0);
|
||||
let b = new_test_blk(1);
|
||||
|
||||
let mut list = BlockList::default();
|
||||
assert!(list.is_empty());
|
||||
|
||||
unsafe {
|
||||
list.push_head(a);
|
||||
assert!(!list.is_empty());
|
||||
list.unlink(a);
|
||||
}
|
||||
assert!(list.is_empty());
|
||||
|
||||
unsafe {
|
||||
list.push_head(b);
|
||||
list.push_head(a);
|
||||
assert_eq!(list.head, a);
|
||||
assert_eq!((*a).next, b);
|
||||
assert_eq!((*b).prev, a);
|
||||
assert_eq!(list.tail, b);
|
||||
|
||||
list.unlink(a);
|
||||
list.unlink(b);
|
||||
assert!(list.is_empty());
|
||||
}
|
||||
}
|
||||
}
|
||||
44
libs/neonart/src/allocator/static.rs
Normal file
44
libs/neonart/src/allocator/static.rs
Normal file
@@ -0,0 +1,44 @@
|
||||
use std::mem::MaybeUninit;
|
||||
|
||||
pub fn alloc_from_slice<T>(
|
||||
area: &mut [MaybeUninit<u8>],
|
||||
) -> (&mut MaybeUninit<T>, &mut [MaybeUninit<u8>]) {
|
||||
let layout = std::alloc::Layout::new::<T>();
|
||||
|
||||
let area_start = area.as_mut_ptr();
|
||||
|
||||
// pad to satisfy alignment requirements
|
||||
let padding = area_start.align_offset(layout.align());
|
||||
if padding + layout.size() > area.len() {
|
||||
panic!("out of memory");
|
||||
}
|
||||
let area = &mut area[padding..];
|
||||
let (result_area, remain) = area.split_at_mut(layout.size());
|
||||
|
||||
let result_ptr: *mut MaybeUninit<T> = result_area.as_mut_ptr().cast();
|
||||
let result = unsafe { result_ptr.as_mut().unwrap() };
|
||||
|
||||
(result, remain)
|
||||
}
|
||||
|
||||
pub fn alloc_array_from_slice<T>(
|
||||
area: &mut [MaybeUninit<u8>],
|
||||
len: usize,
|
||||
) -> (&mut [MaybeUninit<T>], &mut [MaybeUninit<u8>]) {
|
||||
let layout = std::alloc::Layout::new::<T>();
|
||||
|
||||
let area_start = area.as_mut_ptr();
|
||||
|
||||
// pad to satisfy alignment requirements
|
||||
let padding = area_start.align_offset(layout.align());
|
||||
if padding + layout.size() * len > area.len() {
|
||||
panic!("out of memory");
|
||||
}
|
||||
let area = &mut area[padding..];
|
||||
let (result_area, remain) = area.split_at_mut(layout.size() * len);
|
||||
|
||||
let result_ptr: *mut MaybeUninit<T> = result_area.as_mut_ptr().cast();
|
||||
let result = unsafe { std::slice::from_raw_parts_mut(result_ptr.as_mut().unwrap(), len) };
|
||||
|
||||
(result, remain)
|
||||
}
|
||||
147
libs/neonart/src/epoch.rs
Normal file
147
libs/neonart/src/epoch.rs
Normal file
@@ -0,0 +1,147 @@
|
||||
//! This is similar to crossbeam_epoch crate, but works in shared memory
|
||||
|
||||
use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
|
||||
|
||||
use crossbeam_utils::CachePadded;
|
||||
use spin;
|
||||
|
||||
const NUM_SLOTS: usize = 1000;
|
||||
|
||||
/// This is the struct that is stored in shmem
|
||||
///
|
||||
/// bit 0: is it pinned or not?
|
||||
/// rest of the bits are the epoch counter.
|
||||
pub struct EpochShared {
|
||||
global_epoch: AtomicU64,
|
||||
participants: [CachePadded<AtomicU64>; NUM_SLOTS],
|
||||
|
||||
broadcast_lock: spin::Mutex<()>,
|
||||
}
|
||||
|
||||
impl EpochShared {
|
||||
pub fn new() -> EpochShared {
|
||||
EpochShared {
|
||||
global_epoch: AtomicU64::new(2),
|
||||
participants: [const { CachePadded::new(AtomicU64::new(2)) }; NUM_SLOTS],
|
||||
broadcast_lock: spin::Mutex::new(()),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn register(&self) -> LocalHandle {
|
||||
LocalHandle {
|
||||
global: self,
|
||||
last_slot: AtomicUsize::new(0), // todo: choose more intelligently
|
||||
}
|
||||
}
|
||||
|
||||
fn release_pin(&self, slot: usize, _epoch: u64) {
|
||||
let global_epoch = self.global_epoch.load(Ordering::Relaxed);
|
||||
self.participants[slot].store(global_epoch, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
fn pin_internal(&self, slot_hint: usize) -> (usize, u64) {
|
||||
// pick a slot
|
||||
let mut slot = slot_hint;
|
||||
let epoch = loop {
|
||||
let old = self.participants[slot].fetch_or(1, Ordering::Relaxed);
|
||||
if old & 1 == 0 {
|
||||
// Got this slot
|
||||
break old;
|
||||
}
|
||||
|
||||
// the slot was busy by another thread / process. try a different slot
|
||||
slot += 1;
|
||||
if slot == NUM_SLOTS {
|
||||
slot = 0;
|
||||
}
|
||||
continue;
|
||||
};
|
||||
(slot, epoch)
|
||||
}
|
||||
|
||||
pub(crate) fn advance(&self) -> u64 {
|
||||
// Advance the global epoch
|
||||
let old_epoch = self.global_epoch.fetch_add(2, Ordering::Relaxed);
|
||||
let new_epoch = old_epoch + 2;
|
||||
|
||||
// Anyone that release their pin after this will update their slot.
|
||||
new_epoch
|
||||
}
|
||||
|
||||
pub(crate) fn broadcast(&self) {
|
||||
let Some(_guard) = self.broadcast_lock.try_lock() else {
|
||||
return;
|
||||
};
|
||||
|
||||
let epoch = self.global_epoch.load(Ordering::Relaxed);
|
||||
let old_epoch = epoch.wrapping_sub(2);
|
||||
|
||||
// Update all free slots.
|
||||
for i in 0..NUM_SLOTS {
|
||||
// TODO: check result, as a sanity check. It should either be the old epoch, or pinned
|
||||
let _ = self.participants[i].compare_exchange(
|
||||
old_epoch,
|
||||
epoch,
|
||||
Ordering::Relaxed,
|
||||
Ordering::Relaxed,
|
||||
);
|
||||
}
|
||||
|
||||
// FIXME: memory fence here, since we used Relaxed?
|
||||
}
|
||||
|
||||
pub(crate) fn get_oldest(&self) -> u64 {
|
||||
// Read all slots.
|
||||
let now = self.global_epoch.load(Ordering::Relaxed);
|
||||
let mut oldest = now;
|
||||
for i in 0..NUM_SLOTS {
|
||||
let this_epoch = self.participants[i].load(Ordering::Relaxed);
|
||||
let delta = now.wrapping_sub(this_epoch);
|
||||
if delta > u64::MAX / 2 {
|
||||
// this is very recent
|
||||
} else {
|
||||
if delta > now.wrapping_sub(oldest) {
|
||||
oldest = this_epoch;
|
||||
}
|
||||
}
|
||||
}
|
||||
oldest
|
||||
}
|
||||
|
||||
pub(crate) fn get_current(&self) -> u64 {
|
||||
self.global_epoch.load(Ordering::Relaxed)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct EpochPin<'e> {
|
||||
slot: usize,
|
||||
pub(crate) epoch: u64,
|
||||
|
||||
handle: &'e LocalHandle<'e>,
|
||||
}
|
||||
|
||||
impl<'e> Drop for EpochPin<'e> {
|
||||
fn drop(&mut self) {
|
||||
self.handle.global.release_pin(self.slot, self.epoch);
|
||||
}
|
||||
}
|
||||
|
||||
pub struct LocalHandle<'g> {
|
||||
global: &'g EpochShared,
|
||||
|
||||
last_slot: AtomicUsize,
|
||||
}
|
||||
|
||||
impl<'g> LocalHandle<'g> {
|
||||
pub fn pin(&self) -> EpochPin {
|
||||
let (slot, epoch) = self
|
||||
.global
|
||||
.pin_internal(self.last_slot.load(Ordering::Relaxed));
|
||||
self.last_slot.store(slot, Ordering::Relaxed);
|
||||
EpochPin {
|
||||
handle: self,
|
||||
epoch,
|
||||
slot,
|
||||
}
|
||||
}
|
||||
}
|
||||
587
libs/neonart/src/lib.rs
Normal file
587
libs/neonart/src/lib.rs
Normal file
@@ -0,0 +1,587 @@
|
||||
//! Adaptive Radix Tree (ART) implementation, with Optimistic Lock Coupling.
|
||||
//!
|
||||
//! The data structure is described in these two papers:
|
||||
//!
|
||||
//! [1] Leis, V. & Kemper, Alfons & Neumann, Thomas. (2013).
|
||||
//! The adaptive radix tree: ARTful indexing for main-memory databases.
|
||||
//! Proceedings - International Conference on Data Engineering. 38-49. 10.1109/ICDE.2013.6544812.
|
||||
//! https://db.in.tum.de/~leis/papers/ART.pdf
|
||||
//!
|
||||
//! [2] Leis, Viktor & Scheibner, Florian & Kemper, Alfons & Neumann, Thomas. (2016).
|
||||
//! The ART of practical synchronization.
|
||||
//! 1-8. 10.1145/2933349.2933352.
|
||||
//! https://db.in.tum.de/~leis/papers/artsync.pdf
|
||||
//!
|
||||
//! [1] describes the base data structure, and [2] describes the Optimistic Lock Coupling that we
|
||||
//! use.
|
||||
//!
|
||||
//! The papers mention a few different variants. We have made the following choices in this
|
||||
//! implementation:
|
||||
//!
|
||||
//! - All keys have the same length
|
||||
//!
|
||||
//! - Single-value leaves.
|
||||
//!
|
||||
//! - For collapsing inner nodes, we use the Pessimistic approach, where each inner node stores a
|
||||
//! variable length "prefix", which stores the keys of all the one-way nodes which have been
|
||||
//! removed. However, similar to the "hybrid" approach described in the paper, each node only has
|
||||
//! space for a constant-size prefix of 8 bytes. If a node would have a longer prefix, then we
|
||||
//! create create one-way nodes to store them. (There was no particular reason for this choice,
|
||||
//! the "hybrid" approach described in the paper might be better.)
|
||||
//!
|
||||
//! - For concurrency, we use Optimistic Lock Coupling. The paper [2] also describes another method,
|
||||
//! ROWEX, which generally performs better when there is contention, but that is not important
|
||||
//! for use and Optimisic Lock Coupling is simpler to implement.
|
||||
//!
|
||||
//! ## Requirements
|
||||
//!
|
||||
//! This data structure is currently used for the integrated LFC, relsize and last-written LSN cache
|
||||
//! in the compute communicator, part of the 'neon' Postgres extension. We have some unique
|
||||
//! requirements, which is why we had to write our own. Namely:
|
||||
//!
|
||||
//! - The data structure has to live in fixed-sized shared memory segment. That rules out any
|
||||
//! built-in Rust collections and most crates. (Except possibly with the 'allocator_api' rust
|
||||
//! feature, which still nightly-only experimental as of this writing).
|
||||
//!
|
||||
//! - The data structure is accessed from multiple processes. Only one process updates the data
|
||||
//! structure, but other processes perform reads. That rules out using built-in Rust locking
|
||||
//! primitives like Mutex and RwLock, and most crates too.
|
||||
//!
|
||||
//! - Within the one process with write-access, multiple threads can perform updates concurrently.
|
||||
//! That rules out using PostgreSQL LWLocks for the locking.
|
||||
//!
|
||||
//! The implementation is generic, and doesn't depend on any PostgreSQL specifics, but it has been
|
||||
//! written with that usage and the above constraints in mind. Some noteworthy assumptions:
|
||||
//!
|
||||
//! - Contention is assumed to be rare. In the integrated cache in PostgreSQL, there's higher level
|
||||
//! locking in the PostgreSQL buffer manager, which ensures that two backends should not try to
|
||||
//! read / write the same page at the same time. (Prefetching can conflict with actual reads,
|
||||
//! however.)
|
||||
//!
|
||||
//! - The keys in the integrated cache are 17 bytes long.
|
||||
//!
|
||||
//! ## Usage
|
||||
//!
|
||||
//! Because this is designed to be used as a Postgres shared memory data structure, initialization
|
||||
//! happens in three stages:
|
||||
//!
|
||||
//! 0. A fixed area of shared memory is allocated at postmaster startup.
|
||||
//!
|
||||
//! 1. TreeInitStruct::new() is called to initialize it, still in Postmaster process, before any
|
||||
//! other process or thread is running. It returns a TreeInitStruct, which is inherited by all
|
||||
//! the processes through fork().
|
||||
//!
|
||||
//! 2. One process may have write-access to the struct, by calling
|
||||
//! [TreeInitStruct::attach_writer]. (That process is the communicator process.)
|
||||
//!
|
||||
//! 3. Other processes get read-access to the struct, by calling [TreeInitStruct::attach_reader]
|
||||
//!
|
||||
//! "Write access" means that you can insert / update / delete values in the tree.
|
||||
//!
|
||||
//! NOTE: The Values stored in the tree are sometimes moved, when a leaf node fills up and a new
|
||||
//! larger node needs to be allocated. The versioning and epoch-based allocator ensure that the data
|
||||
//! structure stays consistent, but if the Value has interior mutability, like atomic fields,
|
||||
//! updates to such fields might be lost if the leaf node is concurrently moved! If that becomes a
|
||||
//! problem, the version check could be passed up to the caller, so that the caller could detect the
|
||||
//! lost updates and retry the operation.
|
||||
//!
|
||||
//! ## Implementation
|
||||
//!
|
||||
//! node_ptr: Provides low-level implementations of the four different node types (eight actually,
|
||||
//! since there is an Internal and Leaf variant of each)
|
||||
//!
|
||||
//! lock_and_version.rs: Provides an abstraction for the combined lock and version counter on each
|
||||
//! node.
|
||||
//!
|
||||
//! node_ref.rs: The code in node_ptr.rs deals with raw pointers. node_ref.rs provides more type-safe
|
||||
//! abstractions on top.
|
||||
//!
|
||||
//! algorithm.rs: Contains the functions to implement lookups and updates in the tree
|
||||
//!
|
||||
//! allocator.rs: Provides a facility to allocate memory for the tree nodes. (We must provide our
|
||||
//! own abstraction for that because we need the data structure to live in a pre-allocated shared
|
||||
//! memory segment).
|
||||
//!
|
||||
//! epoch.rs: The data structure requires that when a node is removed from the tree, it is not
|
||||
//! immediately deallocated, but stays around for as long as concurrent readers might still have
|
||||
//! pointers to them. This is enforced by an epoch system. This is similar to
|
||||
//! e.g. crossbeam_epoch, but we couldn't use that either because it has to work across processes
|
||||
//! communicating over the shared memory segment.
|
||||
//!
|
||||
//! ## See also
|
||||
//!
|
||||
//! There are some existing Rust ART implementations out there, but none of them filled all
|
||||
//! the requirements:
|
||||
//!
|
||||
//! - https://github.com/XiangpengHao/congee
|
||||
//! - https://github.com/declanvk/blart
|
||||
//!
|
||||
//! ## TODO
|
||||
//!
|
||||
//! - Removing values has not been implemented
|
||||
|
||||
mod algorithm;
|
||||
pub mod allocator;
|
||||
mod epoch;
|
||||
|
||||
use algorithm::RootPtr;
|
||||
use algorithm::node_ptr::NodePtr;
|
||||
|
||||
use std::collections::VecDeque;
|
||||
use std::fmt::Debug;
|
||||
use std::marker::PhantomData;
|
||||
use std::ptr::NonNull;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
|
||||
use crate::epoch::EpochPin;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
use allocator::ArtAllocator;
|
||||
pub use allocator::ArtMultiSlabAllocator;
|
||||
pub use allocator::OutOfMemoryError;
|
||||
|
||||
/// Fixed-length key type.
|
||||
///
|
||||
pub trait Key: Debug {
|
||||
const KEY_LEN: usize;
|
||||
|
||||
fn as_bytes(&self) -> &[u8];
|
||||
}
|
||||
|
||||
/// Values stored in the tree
|
||||
///
|
||||
/// Values need to be Cloneable, because when a node "grows", the value is copied to a new node and
|
||||
/// the old sticks around until all readers that might see the old value are gone.
|
||||
// fixme obsolete, no longer needs Clone
|
||||
pub trait Value {}
|
||||
|
||||
const MAX_GARBAGE: usize = 1024;
|
||||
|
||||
/// The root of the tree, plus other tree-wide data. This is stored in the shared memory.
|
||||
pub struct Tree<V: Value> {
|
||||
/// For simplicity, so that we never need to grow or shrink the root, the root node is always an
|
||||
/// Internal256 node. Also, it never has a prefix (that's actually a bit wasteful, incurring one
|
||||
/// indirection to every lookup)
|
||||
root: RootPtr<V>,
|
||||
|
||||
writer_attached: AtomicBool,
|
||||
|
||||
epoch: epoch::EpochShared,
|
||||
}
|
||||
|
||||
unsafe impl<V: Value + Sync> Sync for Tree<V> {}
|
||||
unsafe impl<V: Value + Send> Send for Tree<V> {}
|
||||
|
||||
struct GarbageQueue<V>(VecDeque<(NodePtr<V>, u64)>);
|
||||
|
||||
unsafe impl<V: Value + Sync> Sync for GarbageQueue<V> {}
|
||||
unsafe impl<V: Value + Send> Send for GarbageQueue<V> {}
|
||||
|
||||
impl<V> GarbageQueue<V> {
|
||||
fn new() -> GarbageQueue<V> {
|
||||
GarbageQueue(VecDeque::with_capacity(MAX_GARBAGE))
|
||||
}
|
||||
|
||||
fn remember_obsolete_node(&mut self, ptr: NodePtr<V>, epoch: u64) {
|
||||
self.0.push_front((ptr, epoch));
|
||||
}
|
||||
|
||||
fn next_obsolete(&mut self, cutoff_epoch: u64) -> Option<NodePtr<V>> {
|
||||
if let Some(back) = self.0.back() {
|
||||
if back.1 < cutoff_epoch {
|
||||
return Some(self.0.pop_back().unwrap().0);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Struct created at postmaster startup
|
||||
pub struct TreeInitStruct<'t, K: Key, V: Value, A: ArtAllocator<V>> {
|
||||
tree: &'t Tree<V>,
|
||||
|
||||
allocator: &'t A,
|
||||
|
||||
phantom_key: PhantomData<K>,
|
||||
}
|
||||
|
||||
/// The worker process has a reference to this. The write operations are only safe
|
||||
/// from the worker process
|
||||
pub struct TreeWriteAccess<'t, K: Key, V: Value, A: ArtAllocator<V>>
|
||||
where
|
||||
K: Key,
|
||||
V: Value,
|
||||
{
|
||||
tree: &'t Tree<V>,
|
||||
|
||||
pub allocator: &'t A,
|
||||
|
||||
epoch_handle: epoch::LocalHandle<'t>,
|
||||
|
||||
phantom_key: PhantomData<K>,
|
||||
|
||||
/// Obsolete nodes that cannot be recycled until their epoch expires.
|
||||
garbage: spin::Mutex<GarbageQueue<V>>,
|
||||
}
|
||||
|
||||
/// The backends have a reference to this. It cannot be used to modify the tree
|
||||
pub struct TreeReadAccess<'t, K: Key, V: Value>
|
||||
where
|
||||
K: Key,
|
||||
V: Value,
|
||||
{
|
||||
tree: &'t Tree<V>,
|
||||
|
||||
epoch_handle: epoch::LocalHandle<'t>,
|
||||
|
||||
phantom_key: PhantomData<K>,
|
||||
}
|
||||
|
||||
impl<'a, 't: 'a, K: Key, V: Value, A: ArtAllocator<V>> TreeInitStruct<'t, K, V, A> {
|
||||
pub fn new(allocator: &'t A) -> TreeInitStruct<'t, K, V, A> {
|
||||
let tree_ptr = allocator.alloc_tree();
|
||||
let tree_ptr = NonNull::new(tree_ptr).expect("out of memory");
|
||||
let init = Tree {
|
||||
root: algorithm::new_root(allocator).expect("out of memory"),
|
||||
writer_attached: AtomicBool::new(false),
|
||||
epoch: epoch::EpochShared::new(),
|
||||
};
|
||||
unsafe { tree_ptr.write(init) };
|
||||
|
||||
TreeInitStruct {
|
||||
tree: unsafe { tree_ptr.as_ref() },
|
||||
allocator,
|
||||
phantom_key: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn attach_writer(self) -> TreeWriteAccess<'t, K, V, A> {
|
||||
let previously_attached = self.tree.writer_attached.swap(true, Ordering::Relaxed);
|
||||
if previously_attached {
|
||||
panic!("writer already attached");
|
||||
}
|
||||
TreeWriteAccess {
|
||||
tree: self.tree,
|
||||
allocator: self.allocator,
|
||||
phantom_key: PhantomData,
|
||||
epoch_handle: self.tree.epoch.register(),
|
||||
garbage: spin::Mutex::new(GarbageQueue::new()),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn attach_reader(self) -> TreeReadAccess<'t, K, V> {
|
||||
TreeReadAccess {
|
||||
tree: self.tree,
|
||||
phantom_key: PhantomData,
|
||||
epoch_handle: self.tree.epoch.register(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t, K: Key, V: Value, A: ArtAllocator<V>> TreeWriteAccess<'t, K, V, A> {
|
||||
pub fn start_write<'g>(&'t self) -> TreeWriteGuard<'g, K, V, A>
|
||||
where
|
||||
't: 'g,
|
||||
{
|
||||
TreeWriteGuard {
|
||||
tree_writer: self,
|
||||
epoch_pin: self.epoch_handle.pin(),
|
||||
phantom_key: PhantomData,
|
||||
created_garbage: false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn start_read(&'t self) -> TreeReadGuard<'t, K, V> {
|
||||
TreeReadGuard {
|
||||
tree: &self.tree,
|
||||
epoch_pin: self.epoch_handle.pin(),
|
||||
phantom_key: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t, K: Key, V: Value> TreeReadAccess<'t, K, V> {
|
||||
pub fn start_read(&'t self) -> TreeReadGuard<'t, K, V> {
|
||||
TreeReadGuard {
|
||||
tree: &self.tree,
|
||||
epoch_pin: self.epoch_handle.pin(),
|
||||
phantom_key: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TreeReadGuard<'e, K, V>
|
||||
where
|
||||
K: Key,
|
||||
V: Value,
|
||||
{
|
||||
tree: &'e Tree<V>,
|
||||
|
||||
epoch_pin: EpochPin<'e>,
|
||||
phantom_key: PhantomData<K>,
|
||||
}
|
||||
|
||||
impl<'e, K: Key, V: Value> TreeReadGuard<'e, K, V> {
|
||||
pub fn get(&'e self, key: &K) -> Option<&'e V> {
|
||||
algorithm::search(key, self.tree.root, &self.epoch_pin)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TreeWriteGuard<'e, K, V, A>
|
||||
where
|
||||
K: Key,
|
||||
V: Value,
|
||||
A: ArtAllocator<V>,
|
||||
{
|
||||
tree_writer: &'e TreeWriteAccess<'e, K, V, A>,
|
||||
|
||||
epoch_pin: EpochPin<'e>,
|
||||
phantom_key: PhantomData<K>,
|
||||
|
||||
created_garbage: bool,
|
||||
}
|
||||
|
||||
pub enum UpdateAction<V> {
|
||||
Nothing,
|
||||
Insert(V),
|
||||
Remove,
|
||||
}
|
||||
|
||||
impl<'e, K: Key, V: Value, A: ArtAllocator<V>> TreeWriteGuard<'e, K, V, A> {
|
||||
/// Get a value
|
||||
pub fn get(&'e mut self, key: &K) -> Option<&'e V> {
|
||||
algorithm::search(key, self.tree_writer.tree.root, &self.epoch_pin)
|
||||
}
|
||||
|
||||
/// Insert a value
|
||||
pub fn insert(self, key: &K, value: V) -> Result<bool, OutOfMemoryError> {
|
||||
let mut success = None;
|
||||
|
||||
self.update_with_fn(key, |existing| {
|
||||
if let Some(_) = existing {
|
||||
success = Some(false);
|
||||
UpdateAction::Nothing
|
||||
} else {
|
||||
success = Some(true);
|
||||
UpdateAction::Insert(value)
|
||||
}
|
||||
})?;
|
||||
Ok(success.expect("value_fn not called"))
|
||||
}
|
||||
|
||||
/// Remove value. Returns true if it existed
|
||||
pub fn remove(self, key: &K) -> bool {
|
||||
let mut result = false;
|
||||
// FIXME: It's not clear if OOM is expected while removing. It seems
|
||||
// not nice, but shrinking a node can OOM. Then again, we could opt
|
||||
// to not shrink a node if we cannot allocate, to live a little longer.
|
||||
self.update_with_fn(key, |existing| match existing {
|
||||
Some(_) => {
|
||||
result = true;
|
||||
UpdateAction::Remove
|
||||
}
|
||||
None => UpdateAction::Nothing,
|
||||
})
|
||||
.expect("out of memory while removing");
|
||||
result
|
||||
}
|
||||
|
||||
/// Try to remove value and return the old value.
|
||||
pub fn remove_and_return(self, key: &K) -> Option<V>
|
||||
where
|
||||
V: Clone,
|
||||
{
|
||||
let mut old = None;
|
||||
self.update_with_fn(key, |existing| {
|
||||
old = existing.cloned();
|
||||
UpdateAction::Remove
|
||||
})
|
||||
.expect("out of memory while removing");
|
||||
old
|
||||
}
|
||||
|
||||
/// Update key using the given function. All the other modifying operations are based on this.
|
||||
///
|
||||
/// The function is passed a reference to the existing value, if any. If the function
|
||||
/// returns None, the value is removed from the tree (or if there was no existing value,
|
||||
/// does nothing). If the function returns Some, the existing value is replaced, of if there
|
||||
/// was no existing value, it is inserted. FIXME: update comment
|
||||
pub fn update_with_fn<F>(mut self, key: &K, value_fn: F) -> Result<(), OutOfMemoryError>
|
||||
where
|
||||
F: FnOnce(Option<&V>) -> UpdateAction<V>,
|
||||
{
|
||||
algorithm::update_fn(key, value_fn, self.tree_writer.tree.root, &mut self)?;
|
||||
|
||||
if self.created_garbage {
|
||||
let _ = self.collect_garbage();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn remember_obsolete_node(&mut self, ptr: NodePtr<V>) {
|
||||
self.tree_writer
|
||||
.garbage
|
||||
.lock()
|
||||
.remember_obsolete_node(ptr, self.epoch_pin.epoch);
|
||||
self.created_garbage = true;
|
||||
}
|
||||
|
||||
// returns number of nodes recycled
|
||||
fn collect_garbage(&self) -> usize {
|
||||
self.tree_writer.tree.epoch.advance();
|
||||
self.tree_writer.tree.epoch.broadcast();
|
||||
|
||||
let cutoff_epoch = self.tree_writer.tree.epoch.get_oldest();
|
||||
|
||||
let mut result = 0;
|
||||
let mut garbage_queue = self.tree_writer.garbage.lock();
|
||||
while let Some(ptr) = garbage_queue.next_obsolete(cutoff_epoch) {
|
||||
ptr.deallocate(self.tree_writer.allocator);
|
||||
result += 1;
|
||||
}
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TreeIterator<K>
|
||||
where
|
||||
K: Key + for<'a> From<&'a [u8]>,
|
||||
{
|
||||
done: bool,
|
||||
pub next_key: Vec<u8>,
|
||||
max_key: Option<Vec<u8>>,
|
||||
|
||||
phantom_key: PhantomData<K>,
|
||||
}
|
||||
|
||||
impl<K> TreeIterator<K>
|
||||
where
|
||||
K: Key + for<'a> From<&'a [u8]>,
|
||||
{
|
||||
pub fn new_wrapping() -> TreeIterator<K> {
|
||||
let mut next_key = Vec::new();
|
||||
next_key.resize(K::KEY_LEN, 0);
|
||||
TreeIterator {
|
||||
done: false,
|
||||
next_key,
|
||||
max_key: None,
|
||||
phantom_key: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new(range: &std::ops::Range<K>) -> TreeIterator<K> {
|
||||
let result = TreeIterator {
|
||||
done: false,
|
||||
next_key: Vec::from(range.start.as_bytes()),
|
||||
max_key: Some(Vec::from(range.end.as_bytes())),
|
||||
phantom_key: PhantomData,
|
||||
};
|
||||
assert_eq!(result.next_key.len(), K::KEY_LEN);
|
||||
assert_eq!(result.max_key.as_ref().unwrap().len(), K::KEY_LEN);
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
pub fn next<'g, V>(&mut self, read_guard: &'g TreeReadGuard<'g, K, V>) -> Option<(K, &'g V)>
|
||||
where
|
||||
V: Value,
|
||||
{
|
||||
if self.done {
|
||||
return None;
|
||||
}
|
||||
|
||||
let mut wrapped_around = false;
|
||||
loop {
|
||||
assert_eq!(self.next_key.len(), K::KEY_LEN);
|
||||
if let Some((k, v)) = algorithm::iter_next(
|
||||
&mut self.next_key,
|
||||
read_guard.tree.root,
|
||||
&read_guard.epoch_pin,
|
||||
) {
|
||||
assert_eq!(k.len(), K::KEY_LEN);
|
||||
assert_eq!(self.next_key.len(), K::KEY_LEN);
|
||||
|
||||
// Check if we reached the end of the range
|
||||
if let Some(max_key) = &self.max_key {
|
||||
if k.as_slice() >= max_key.as_slice() {
|
||||
self.done = true;
|
||||
break None;
|
||||
}
|
||||
}
|
||||
|
||||
// increment the key
|
||||
self.next_key = k.clone();
|
||||
increment_key(self.next_key.as_mut_slice());
|
||||
let k = k.as_slice().into();
|
||||
|
||||
break Some((k, v));
|
||||
} else {
|
||||
if self.max_key.is_some() {
|
||||
self.done = true;
|
||||
} else {
|
||||
// Start from beginning
|
||||
if !wrapped_around {
|
||||
for i in 0..K::KEY_LEN {
|
||||
self.next_key[i] = 0;
|
||||
}
|
||||
wrapped_around = true;
|
||||
continue;
|
||||
} else {
|
||||
// The tree is completely empty
|
||||
// FIXME: perhaps we should remember the starting point instead.
|
||||
// Currently this will scan some ranges twice.
|
||||
break None;
|
||||
}
|
||||
}
|
||||
break None;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn increment_key(key: &mut [u8]) -> bool {
|
||||
for i in (0..key.len()).rev() {
|
||||
let (byte, overflow) = key[i].overflowing_add(1);
|
||||
key[i] = byte;
|
||||
if !overflow {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
// Debugging functions
|
||||
impl<'e, K: Key, V: Value + Debug, A: ArtAllocator<V>> TreeWriteGuard<'e, K, V, A> {
|
||||
pub fn dump(&mut self, dst: &mut dyn std::io::Write) {
|
||||
algorithm::dump_tree(self.tree_writer.tree.root, &self.epoch_pin, dst)
|
||||
}
|
||||
}
|
||||
impl<'e, K: Key, V: Value + Debug> TreeReadGuard<'e, K, V> {
|
||||
pub fn dump(&mut self, dst: &mut dyn std::io::Write) {
|
||||
algorithm::dump_tree(self.tree.root, &self.epoch_pin, dst)
|
||||
}
|
||||
}
|
||||
impl<'e, K: Key, V: Value> TreeWriteAccess<'e, K, V, ArtMultiSlabAllocator<'e, V>> {
|
||||
pub fn get_statistics(&self) -> ArtTreeStatistics {
|
||||
self.allocator.get_statistics();
|
||||
ArtTreeStatistics {
|
||||
blocks: self.allocator.inner.block_allocator.get_statistics(),
|
||||
slabs: self.allocator.get_statistics(),
|
||||
epoch: self.tree.epoch.get_current(),
|
||||
oldest_epoch: self.tree.epoch.get_oldest(),
|
||||
num_garbage: self.garbage.lock().0.len() as u64,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct ArtTreeStatistics {
|
||||
pub blocks: allocator::block::BlockAllocatorStats,
|
||||
pub slabs: allocator::ArtMultiSlabStats,
|
||||
|
||||
pub epoch: u64,
|
||||
pub oldest_epoch: u64,
|
||||
pub num_garbage: u64,
|
||||
}
|
||||
243
libs/neonart/src/tests.rs
Normal file
243
libs/neonart/src/tests.rs
Normal file
@@ -0,0 +1,243 @@
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::HashSet;
|
||||
use std::fmt::{Debug, Formatter};
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
use crate::ArtAllocator;
|
||||
use crate::ArtMultiSlabAllocator;
|
||||
use crate::TreeInitStruct;
|
||||
use crate::TreeIterator;
|
||||
use crate::TreeWriteAccess;
|
||||
use crate::UpdateAction;
|
||||
|
||||
use crate::{Key, Value};
|
||||
|
||||
use rand::Rng;
|
||||
use rand::seq::SliceRandom;
|
||||
use rand_distr::Zipf;
|
||||
|
||||
const TEST_KEY_LEN: usize = 16;
|
||||
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
|
||||
struct TestKey([u8; TEST_KEY_LEN]);
|
||||
|
||||
impl TestKey {
|
||||
const MIN: TestKey = TestKey([0; TEST_KEY_LEN]);
|
||||
const MAX: TestKey = TestKey([u8::MAX; TEST_KEY_LEN]);
|
||||
}
|
||||
|
||||
impl Key for TestKey {
|
||||
const KEY_LEN: usize = TEST_KEY_LEN;
|
||||
fn as_bytes(&self) -> &[u8] {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&TestKey> for u128 {
|
||||
fn from(val: &TestKey) -> u128 {
|
||||
u128::from_be_bytes(val.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<u128> for TestKey {
|
||||
fn from(val: u128) -> TestKey {
|
||||
TestKey(val.to_be_bytes())
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> From<&'a [u8]> for TestKey {
|
||||
fn from(bytes: &'a [u8]) -> TestKey {
|
||||
TestKey(bytes.try_into().unwrap())
|
||||
}
|
||||
}
|
||||
|
||||
impl Value for usize {}
|
||||
|
||||
fn test_inserts<K: Into<TestKey> + Copy>(keys: &[K]) {
|
||||
const MEM_SIZE: usize = 10000000;
|
||||
let mut area = Box::new_uninit_slice(MEM_SIZE);
|
||||
|
||||
let allocator = ArtMultiSlabAllocator::new(&mut area);
|
||||
|
||||
let init_struct = TreeInitStruct::<TestKey, usize, _>::new(allocator);
|
||||
let tree_writer = init_struct.attach_writer();
|
||||
|
||||
for (idx, k) in keys.iter().enumerate() {
|
||||
let w = tree_writer.start_write();
|
||||
let res = w.insert(&(*k).into(), idx);
|
||||
assert!(res.is_ok());
|
||||
}
|
||||
|
||||
for (idx, k) in keys.iter().enumerate() {
|
||||
let r = tree_writer.start_read();
|
||||
let value = r.get(&(*k).into());
|
||||
assert_eq!(value, Some(idx).as_ref());
|
||||
}
|
||||
|
||||
eprintln!("stats: {:?}", tree_writer.get_statistics());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn dense() {
|
||||
// This exercises splitting a node with prefix
|
||||
let keys: &[u128] = &[0, 1, 2, 3, 256];
|
||||
test_inserts(keys);
|
||||
|
||||
// Dense keys
|
||||
let mut keys: Vec<u128> = (0..10000).collect();
|
||||
test_inserts(&keys);
|
||||
|
||||
// Do the same in random orders
|
||||
for _ in 1..10 {
|
||||
keys.shuffle(&mut rand::rng());
|
||||
test_inserts(&keys);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn sparse() {
|
||||
// sparse keys
|
||||
let mut keys: Vec<TestKey> = Vec::new();
|
||||
let mut used_keys = HashSet::new();
|
||||
for _ in 0..10000 {
|
||||
loop {
|
||||
let key = rand::random::<u128>();
|
||||
if used_keys.get(&key).is_some() {
|
||||
continue;
|
||||
}
|
||||
used_keys.insert(key);
|
||||
keys.push(key.into());
|
||||
break;
|
||||
}
|
||||
}
|
||||
test_inserts(&keys);
|
||||
}
|
||||
|
||||
struct TestValue(AtomicUsize);
|
||||
|
||||
impl TestValue {
|
||||
fn new(val: usize) -> TestValue {
|
||||
TestValue(AtomicUsize::new(val))
|
||||
}
|
||||
|
||||
fn load(&self) -> usize {
|
||||
self.0.load(Ordering::Relaxed)
|
||||
}
|
||||
}
|
||||
|
||||
impl Value for TestValue {}
|
||||
|
||||
impl Clone for TestValue {
|
||||
fn clone(&self) -> TestValue {
|
||||
TestValue::new(self.load())
|
||||
}
|
||||
}
|
||||
|
||||
impl Debug for TestValue {
|
||||
fn fmt(&self, fmt: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
|
||||
write!(fmt, "{:?}", self.load())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
struct TestOp(TestKey, Option<usize>);
|
||||
|
||||
fn apply_op<A: ArtAllocator<TestValue>>(
|
||||
op: &TestOp,
|
||||
tree: &TreeWriteAccess<TestKey, TestValue, A>,
|
||||
shadow: &mut BTreeMap<TestKey, usize>,
|
||||
) {
|
||||
eprintln!("applying op: {op:?}");
|
||||
|
||||
// apply the change to the shadow tree first
|
||||
let shadow_existing = if let Some(v) = op.1 {
|
||||
shadow.insert(op.0, v)
|
||||
} else {
|
||||
shadow.remove(&op.0)
|
||||
};
|
||||
|
||||
// apply to Art tree
|
||||
let w = tree.start_write();
|
||||
w.update_with_fn(&op.0, |existing| {
|
||||
assert_eq!(existing.map(TestValue::load), shadow_existing);
|
||||
|
||||
match (existing, op.1) {
|
||||
(None, None) => UpdateAction::Nothing,
|
||||
(None, Some(new_val)) => UpdateAction::Insert(TestValue::new(new_val)),
|
||||
(Some(_old_val), None) => UpdateAction::Remove,
|
||||
(Some(old_val), Some(new_val)) => {
|
||||
old_val.0.store(new_val, Ordering::Relaxed);
|
||||
UpdateAction::Nothing
|
||||
}
|
||||
}
|
||||
})
|
||||
.expect("out of memory");
|
||||
}
|
||||
|
||||
fn test_iter<A: ArtAllocator<TestValue>>(
|
||||
tree: &TreeWriteAccess<TestKey, TestValue, A>,
|
||||
shadow: &BTreeMap<TestKey, usize>,
|
||||
) {
|
||||
let mut shadow_iter = shadow.iter();
|
||||
let mut iter = TreeIterator::new(&(TestKey::MIN..TestKey::MAX));
|
||||
|
||||
loop {
|
||||
let shadow_item = shadow_iter.next().map(|(k, v)| (k.clone(), v.clone()));
|
||||
let r = tree.start_read();
|
||||
let item = iter.next(&r);
|
||||
|
||||
if shadow_item != item.map(|(k, v)| (k, v.load())) {
|
||||
eprintln!(
|
||||
"FAIL: iterator returned {:?}, expected {:?}",
|
||||
item, shadow_item
|
||||
);
|
||||
tree.start_read().dump(&mut std::io::stderr());
|
||||
|
||||
eprintln!("SHADOW:");
|
||||
let mut si = shadow.iter();
|
||||
while let Some(si) = si.next() {
|
||||
eprintln!("key: {:?}, val: {}", si.0, si.1);
|
||||
}
|
||||
panic!(
|
||||
"FAIL: iterator returned {:?}, expected {:?}",
|
||||
item, shadow_item
|
||||
);
|
||||
}
|
||||
if item.is_none() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn random_ops() {
|
||||
const MEM_SIZE: usize = 10000000;
|
||||
let mut area = Box::new_uninit_slice(MEM_SIZE);
|
||||
|
||||
let allocator = ArtMultiSlabAllocator::new(&mut area);
|
||||
|
||||
let init_struct = TreeInitStruct::<TestKey, TestValue, _>::new(allocator);
|
||||
let tree_writer = init_struct.attach_writer();
|
||||
|
||||
let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
|
||||
|
||||
let distribution = Zipf::new(u128::MAX as f64, 1.1).unwrap();
|
||||
let mut rng = rand::rng();
|
||||
for i in 0..100000 {
|
||||
let mut key: TestKey = (rng.sample(distribution) as u128).into();
|
||||
|
||||
if rng.random_bool(0.10) {
|
||||
key = TestKey::from(u128::from(&key) | 0xffffffff);
|
||||
}
|
||||
|
||||
let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
|
||||
|
||||
apply_op(&op, &tree_writer, &mut shadow);
|
||||
|
||||
if i % 1000 == 0 {
|
||||
eprintln!("{i} ops processed");
|
||||
eprintln!("stats: {:?}", tree_writer.get_statistics());
|
||||
test_iter(&tree_writer, &shadow);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -20,7 +20,6 @@ use postgres_backend::AuthType;
|
||||
use remote_storage::RemoteStorageConfig;
|
||||
use serde_with::serde_as;
|
||||
use utils::logging::LogFormat;
|
||||
use utils::postgres_client::PostgresClientProtocol;
|
||||
|
||||
use crate::models::{ImageCompressionAlgorithm, LsnLease};
|
||||
|
||||
@@ -35,6 +34,8 @@ pub struct NodeMetadata {
|
||||
pub postgres_host: String,
|
||||
#[serde(rename = "port")]
|
||||
pub postgres_port: u16,
|
||||
pub grpc_host: Option<String>,
|
||||
pub grpc_port: Option<u16>,
|
||||
pub http_host: String,
|
||||
pub http_port: u16,
|
||||
pub https_port: Option<u16>,
|
||||
@@ -181,6 +182,7 @@ pub struct ConfigToml {
|
||||
pub virtual_file_io_engine: Option<crate::models::virtual_file::IoEngineKind>,
|
||||
pub ingest_batch_size: u64,
|
||||
pub max_vectored_read_bytes: MaxVectoredReadBytes,
|
||||
pub max_get_vectored_keys: MaxGetVectoredKeys,
|
||||
pub image_compression: ImageCompressionAlgorithm,
|
||||
pub timeline_offloading: bool,
|
||||
pub ephemeral_bytes_per_memory_kb: usize,
|
||||
@@ -188,7 +190,6 @@ pub struct ConfigToml {
|
||||
pub virtual_file_io_mode: Option<crate::models::virtual_file::IoMode>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub no_sync: Option<bool>,
|
||||
pub wal_receiver_protocol: PostgresClientProtocol,
|
||||
pub page_service_pipelining: PageServicePipeliningConfig,
|
||||
pub get_vectored_concurrent_io: GetVectoredConcurrentIo,
|
||||
pub enable_read_path_debugging: Option<bool>,
|
||||
@@ -229,7 +230,7 @@ pub enum PageServicePipeliningConfig {
|
||||
}
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
pub struct PageServicePipeliningConfigPipelined {
|
||||
/// Causes runtime errors if larger than max get_vectored batch size.
|
||||
/// Failed config parsing and validation if larger than `max_get_vectored_keys`.
|
||||
pub max_batch_size: NonZeroUsize,
|
||||
pub execution: PageServiceProtocolPipelinedExecutionStrategy,
|
||||
// The default below is such that new versions of the software can start
|
||||
@@ -329,6 +330,8 @@ pub struct TimelineImportConfig {
|
||||
pub import_job_concurrency: NonZeroUsize,
|
||||
pub import_job_soft_size_limit: NonZeroUsize,
|
||||
pub import_job_checkpoint_threshold: NonZeroUsize,
|
||||
/// Max size of the remote storage partial read done by any job
|
||||
pub import_job_max_byte_range_size: NonZeroUsize,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
@@ -403,6 +406,16 @@ impl Default for EvictionOrder {
|
||||
#[serde(transparent)]
|
||||
pub struct MaxVectoredReadBytes(pub NonZeroUsize);
|
||||
|
||||
#[derive(Copy, Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
#[serde(transparent)]
|
||||
pub struct MaxGetVectoredKeys(NonZeroUsize);
|
||||
|
||||
impl MaxGetVectoredKeys {
|
||||
pub fn get(&self) -> usize {
|
||||
self.0.get()
|
||||
}
|
||||
}
|
||||
|
||||
/// Tenant-level configuration values, used for various purposes.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
#[serde(default)]
|
||||
@@ -514,8 +527,6 @@ pub struct TenantConfigToml {
|
||||
/// (either this flag or the pageserver-global one need to be set)
|
||||
pub timeline_offloading: bool,
|
||||
|
||||
pub wal_receiver_protocol_override: Option<PostgresClientProtocol>,
|
||||
|
||||
/// Enable rel_size_v2 for this tenant. Once enabled, the tenant will persist this information into
|
||||
/// `index_part.json`, and it cannot be reversed.
|
||||
pub rel_size_v2_enabled: bool,
|
||||
@@ -587,6 +598,8 @@ pub mod defaults {
|
||||
/// That is, slightly above 128 kB.
|
||||
pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 130 * 1024; // 130 KiB
|
||||
|
||||
pub const DEFAULT_MAX_GET_VECTORED_KEYS: usize = 32;
|
||||
|
||||
pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
|
||||
ImageCompressionAlgorithm::Zstd { level: Some(1) };
|
||||
|
||||
@@ -594,9 +607,6 @@ pub mod defaults {
|
||||
|
||||
pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512;
|
||||
|
||||
pub const DEFAULT_WAL_RECEIVER_PROTOCOL: utils::postgres_client::PostgresClientProtocol =
|
||||
utils::postgres_client::PostgresClientProtocol::Vanilla;
|
||||
|
||||
pub const DEFAULT_SSL_KEY_FILE: &str = "server.key";
|
||||
pub const DEFAULT_SSL_CERT_FILE: &str = "server.crt";
|
||||
}
|
||||
@@ -685,6 +695,9 @@ impl Default for ConfigToml {
|
||||
max_vectored_read_bytes: (MaxVectoredReadBytes(
|
||||
NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
|
||||
)),
|
||||
max_get_vectored_keys: (MaxGetVectoredKeys(
|
||||
NonZeroUsize::new(DEFAULT_MAX_GET_VECTORED_KEYS).unwrap(),
|
||||
)),
|
||||
image_compression: (DEFAULT_IMAGE_COMPRESSION),
|
||||
timeline_offloading: true,
|
||||
ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
|
||||
@@ -692,7 +705,6 @@ impl Default for ConfigToml {
|
||||
virtual_file_io_mode: None,
|
||||
tenant_config: TenantConfigToml::default(),
|
||||
no_sync: None,
|
||||
wal_receiver_protocol: DEFAULT_WAL_RECEIVER_PROTOCOL,
|
||||
page_service_pipelining: PageServicePipeliningConfig::Pipelined(
|
||||
PageServicePipeliningConfigPipelined {
|
||||
max_batch_size: NonZeroUsize::new(32).unwrap(),
|
||||
@@ -713,9 +725,10 @@ impl Default for ConfigToml {
|
||||
enable_tls_page_service_api: false,
|
||||
dev_mode: false,
|
||||
timeline_import_config: TimelineImportConfig {
|
||||
import_job_concurrency: NonZeroUsize::new(128).unwrap(),
|
||||
import_job_soft_size_limit: NonZeroUsize::new(1024 * 1024 * 1024).unwrap(),
|
||||
import_job_checkpoint_threshold: NonZeroUsize::new(128).unwrap(),
|
||||
import_job_concurrency: NonZeroUsize::new(32).unwrap(),
|
||||
import_job_soft_size_limit: NonZeroUsize::new(256 * 1024 * 1024).unwrap(),
|
||||
import_job_checkpoint_threshold: NonZeroUsize::new(32).unwrap(),
|
||||
import_job_max_byte_range_size: NonZeroUsize::new(4 * 1024 * 1024).unwrap(),
|
||||
},
|
||||
basebackup_cache_config: None,
|
||||
posthog_config: None,
|
||||
@@ -836,7 +849,6 @@ impl Default for TenantConfigToml {
|
||||
lsn_lease_length: LsnLease::DEFAULT_LENGTH,
|
||||
lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
|
||||
timeline_offloading: true,
|
||||
wal_receiver_protocol_override: None,
|
||||
rel_size_v2_enabled: false,
|
||||
gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED,
|
||||
gc_compaction_verification: DEFAULT_GC_COMPACTION_VERIFICATION,
|
||||
|
||||
@@ -14,6 +14,8 @@ fn test_node_metadata_v1_backward_compatibilty() {
|
||||
NodeMetadata {
|
||||
postgres_host: "localhost".to_string(),
|
||||
postgres_port: 23,
|
||||
grpc_host: None,
|
||||
grpc_port: None,
|
||||
http_host: "localhost".to_string(),
|
||||
http_port: 42,
|
||||
https_port: None,
|
||||
@@ -37,6 +39,35 @@ fn test_node_metadata_v2_backward_compatibilty() {
|
||||
NodeMetadata {
|
||||
postgres_host: "localhost".to_string(),
|
||||
postgres_port: 23,
|
||||
grpc_host: None,
|
||||
grpc_port: None,
|
||||
http_host: "localhost".to_string(),
|
||||
http_port: 42,
|
||||
https_port: Some(123),
|
||||
other: HashMap::new(),
|
||||
}
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_node_metadata_v3_backward_compatibilty() {
|
||||
let v3 = serde_json::to_vec(&serde_json::json!({
|
||||
"host": "localhost",
|
||||
"port": 23,
|
||||
"grpc_host": "localhost",
|
||||
"grpc_port": 51,
|
||||
"http_host": "localhost",
|
||||
"http_port": 42,
|
||||
"https_port": 123,
|
||||
}));
|
||||
|
||||
assert_eq!(
|
||||
serde_json::from_slice::<NodeMetadata>(&v3.unwrap()).unwrap(),
|
||||
NodeMetadata {
|
||||
postgres_host: "localhost".to_string(),
|
||||
postgres_port: 23,
|
||||
grpc_host: Some("localhost".to_string()),
|
||||
grpc_port: Some(51),
|
||||
http_host: "localhost".to_string(),
|
||||
http_port: 42,
|
||||
https_port: Some(123),
|
||||
|
||||
@@ -53,6 +53,9 @@ pub struct NodeRegisterRequest {
|
||||
pub listen_pg_addr: String,
|
||||
pub listen_pg_port: u16,
|
||||
|
||||
pub listen_grpc_addr: Option<String>,
|
||||
pub listen_grpc_port: Option<u16>,
|
||||
|
||||
pub listen_http_addr: String,
|
||||
pub listen_http_port: u16,
|
||||
pub listen_https_port: Option<u16>,
|
||||
@@ -102,6 +105,9 @@ pub struct TenantLocateResponseShard {
|
||||
pub listen_pg_addr: String,
|
||||
pub listen_pg_port: u16,
|
||||
|
||||
pub listen_grpc_addr: Option<String>,
|
||||
pub listen_grpc_port: Option<u16>,
|
||||
|
||||
pub listen_http_addr: String,
|
||||
pub listen_http_port: u16,
|
||||
pub listen_https_port: Option<u16>,
|
||||
@@ -152,6 +158,8 @@ pub struct NodeDescribeResponse {
|
||||
|
||||
pub listen_pg_addr: String,
|
||||
pub listen_pg_port: u16,
|
||||
pub listen_grpc_addr: Option<String>,
|
||||
pub listen_grpc_port: Option<u16>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
|
||||
@@ -20,7 +20,6 @@ use serde_with::serde_as;
|
||||
pub use utilization::PageserverUtilization;
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::postgres_client::PostgresClientProtocol;
|
||||
use utils::{completion, serde_system_time};
|
||||
|
||||
use crate::config::Ratio;
|
||||
@@ -622,8 +621,6 @@ pub struct TenantConfigPatch {
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub timeline_offloading: FieldPatch<bool>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub wal_receiver_protocol_override: FieldPatch<PostgresClientProtocol>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub rel_size_v2_enabled: FieldPatch<bool>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub gc_compaction_enabled: FieldPatch<bool>,
|
||||
@@ -748,9 +745,6 @@ pub struct TenantConfig {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub timeline_offloading: Option<bool>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub wal_receiver_protocol_override: Option<PostgresClientProtocol>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub rel_size_v2_enabled: Option<bool>,
|
||||
|
||||
@@ -812,7 +806,6 @@ impl TenantConfig {
|
||||
mut lsn_lease_length,
|
||||
mut lsn_lease_length_for_ts,
|
||||
mut timeline_offloading,
|
||||
mut wal_receiver_protocol_override,
|
||||
mut rel_size_v2_enabled,
|
||||
mut gc_compaction_enabled,
|
||||
mut gc_compaction_verification,
|
||||
@@ -905,9 +898,6 @@ impl TenantConfig {
|
||||
.map(|v| humantime::parse_duration(&v))?
|
||||
.apply(&mut lsn_lease_length_for_ts);
|
||||
patch.timeline_offloading.apply(&mut timeline_offloading);
|
||||
patch
|
||||
.wal_receiver_protocol_override
|
||||
.apply(&mut wal_receiver_protocol_override);
|
||||
patch.rel_size_v2_enabled.apply(&mut rel_size_v2_enabled);
|
||||
patch
|
||||
.gc_compaction_enabled
|
||||
@@ -960,7 +950,6 @@ impl TenantConfig {
|
||||
lsn_lease_length,
|
||||
lsn_lease_length_for_ts,
|
||||
timeline_offloading,
|
||||
wal_receiver_protocol_override,
|
||||
rel_size_v2_enabled,
|
||||
gc_compaction_enabled,
|
||||
gc_compaction_verification,
|
||||
@@ -1058,9 +1047,6 @@ impl TenantConfig {
|
||||
timeline_offloading: self
|
||||
.timeline_offloading
|
||||
.unwrap_or(global_conf.timeline_offloading),
|
||||
wal_receiver_protocol_override: self
|
||||
.wal_receiver_protocol_override
|
||||
.or(global_conf.wal_receiver_protocol_override),
|
||||
rel_size_v2_enabled: self
|
||||
.rel_size_v2_enabled
|
||||
.unwrap_or(global_conf.rel_size_v2_enabled),
|
||||
@@ -1934,7 +1920,7 @@ pub enum PagestreamFeMessage {
|
||||
}
|
||||
|
||||
// Wrapped in libpq CopyData
|
||||
#[derive(strum_macros::EnumProperty)]
|
||||
#[derive(Debug, strum_macros::EnumProperty)]
|
||||
pub enum PagestreamBeMessage {
|
||||
Exists(PagestreamExistsResponse),
|
||||
Nblocks(PagestreamNblocksResponse),
|
||||
@@ -2045,7 +2031,7 @@ pub enum PagestreamProtocolVersion {
|
||||
|
||||
pub type RequestId = u64;
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||
#[derive(Debug, Default, PartialEq, Eq, Clone, Copy)]
|
||||
pub struct PagestreamRequest {
|
||||
pub reqid: RequestId,
|
||||
pub request_lsn: Lsn,
|
||||
@@ -2064,7 +2050,7 @@ pub struct PagestreamNblocksRequest {
|
||||
pub rel: RelTag,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||
#[derive(Debug, Default, PartialEq, Eq, Clone, Copy)]
|
||||
pub struct PagestreamGetPageRequest {
|
||||
pub hdr: PagestreamRequest,
|
||||
pub rel: RelTag,
|
||||
|
||||
@@ -24,7 +24,7 @@ use serde::{Deserialize, Serialize};
|
||||
// FIXME: should move 'forknum' as last field to keep this consistent with Postgres.
|
||||
// Then we could replace the custom Ord and PartialOrd implementations below with
|
||||
// deriving them. This will require changes in walredoproc.c.
|
||||
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize)]
|
||||
#[derive(Debug, Default, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize)]
|
||||
pub struct RelTag {
|
||||
pub forknum: u8,
|
||||
pub spcnode: Oid,
|
||||
@@ -184,12 +184,12 @@ pub enum SlruKind {
|
||||
MultiXactOffsets,
|
||||
}
|
||||
|
||||
impl SlruKind {
|
||||
pub fn to_str(&self) -> &'static str {
|
||||
impl fmt::Display for SlruKind {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
Self::Clog => "pg_xact",
|
||||
Self::MultiXactMembers => "pg_multixact/members",
|
||||
Self::MultiXactOffsets => "pg_multixact/offsets",
|
||||
Self::Clog => write!(f, "pg_xact"),
|
||||
Self::MultiXactMembers => write!(f, "pg_multixact/members"),
|
||||
Self::MultiXactOffsets => write!(f, "pg_multixact/offsets"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,8 +4,9 @@ use std::{sync::Arc, time::Duration};
|
||||
|
||||
use arc_swap::ArcSwap;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{Instrument, info_span};
|
||||
|
||||
use crate::{FeatureStore, PostHogClient, PostHogClientConfig};
|
||||
use crate::{CaptureEvent, FeatureStore, PostHogClient, PostHogClientConfig};
|
||||
|
||||
/// A background loop that fetches feature flags from PostHog and updates the feature store.
|
||||
pub struct FeatureResolverBackgroundLoop {
|
||||
@@ -23,34 +24,61 @@ impl FeatureResolverBackgroundLoop {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn spawn(self: Arc<Self>, handle: &tokio::runtime::Handle, refresh_period: Duration) {
|
||||
pub fn spawn(
|
||||
self: Arc<Self>,
|
||||
handle: &tokio::runtime::Handle,
|
||||
refresh_period: Duration,
|
||||
fake_tenants: Vec<CaptureEvent>,
|
||||
) {
|
||||
let this = self.clone();
|
||||
let cancel = self.cancel.clone();
|
||||
handle.spawn(async move {
|
||||
tracing::info!("Starting PostHog feature resolver");
|
||||
let mut ticker = tokio::time::interval(refresh_period);
|
||||
ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = ticker.tick() => {}
|
||||
_ = cancel.cancelled() => break
|
||||
}
|
||||
let resp = match this
|
||||
.posthog_client
|
||||
.get_feature_flags_local_evaluation()
|
||||
.await
|
||||
{
|
||||
Ok(resp) => resp,
|
||||
Err(e) => {
|
||||
tracing::warn!("Cannot get feature flags: {}", e);
|
||||
continue;
|
||||
|
||||
// Main loop of updating the feature flags.
|
||||
handle.spawn(
|
||||
async move {
|
||||
tracing::info!("Starting PostHog feature resolver");
|
||||
let mut ticker = tokio::time::interval(refresh_period);
|
||||
ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = ticker.tick() => {}
|
||||
_ = cancel.cancelled() => break
|
||||
}
|
||||
};
|
||||
let feature_store = FeatureStore::new_with_flags(resp.flags);
|
||||
this.feature_store.store(Arc::new(feature_store));
|
||||
let resp = match this
|
||||
.posthog_client
|
||||
.get_feature_flags_local_evaluation()
|
||||
.await
|
||||
{
|
||||
Ok(resp) => resp,
|
||||
Err(e) => {
|
||||
tracing::warn!("Cannot get feature flags: {}", e);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
let feature_store = FeatureStore::new_with_flags(resp.flags);
|
||||
this.feature_store.store(Arc::new(feature_store));
|
||||
tracing::info!("Feature flag updated");
|
||||
}
|
||||
tracing::info!("PostHog feature resolver stopped");
|
||||
}
|
||||
tracing::info!("PostHog feature resolver stopped");
|
||||
});
|
||||
.instrument(info_span!("posthog_feature_resolver")),
|
||||
);
|
||||
|
||||
// Report fake tenants to PostHog so that we have the combination of all the properties in the UI.
|
||||
// Do one report per pageserver restart.
|
||||
let this = self.clone();
|
||||
handle.spawn(
|
||||
async move {
|
||||
tracing::info!("Starting PostHog feature reporter");
|
||||
for tenant in &fake_tenants {
|
||||
tracing::info!("Reporting fake tenant: {:?}", tenant);
|
||||
}
|
||||
if let Err(e) = this.posthog_client.capture_event_batch(&fake_tenants).await {
|
||||
tracing::warn!("Cannot report fake tenants: {}", e);
|
||||
}
|
||||
}
|
||||
.instrument(info_span!("posthog_feature_reporter")),
|
||||
);
|
||||
}
|
||||
|
||||
pub fn feature_store(&self) -> Arc<FeatureStore> {
|
||||
|
||||
@@ -22,6 +22,16 @@ pub enum PostHogEvaluationError {
|
||||
Internal(String),
|
||||
}
|
||||
|
||||
impl PostHogEvaluationError {
|
||||
pub fn as_variant_str(&self) -> &'static str {
|
||||
match self {
|
||||
PostHogEvaluationError::NotAvailable(_) => "not_available",
|
||||
PostHogEvaluationError::NoConditionGroupMatched => "no_condition_group_matched",
|
||||
PostHogEvaluationError::Internal(_) => "internal",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
pub struct LocalEvaluationResponse {
|
||||
pub flags: Vec<LocalEvaluationFlag>,
|
||||
@@ -54,7 +64,7 @@ pub struct LocalEvaluationFlagFilterProperty {
|
||||
operator: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
#[serde(untagged)]
|
||||
pub enum PostHogFlagFilterPropertyValue {
|
||||
String(String),
|
||||
@@ -448,6 +458,18 @@ impl FeatureStore {
|
||||
)))
|
||||
}
|
||||
}
|
||||
|
||||
/// Infer whether a feature flag is a boolean flag by checking if it has a multivariate filter.
|
||||
pub fn is_feature_flag_boolean(&self, flag_key: &str) -> Result<bool, PostHogEvaluationError> {
|
||||
if let Some(flag_config) = self.flags.get(flag_key) {
|
||||
Ok(flag_config.filters.multivariate.is_none())
|
||||
} else {
|
||||
Err(PostHogEvaluationError::NotAvailable(format!(
|
||||
"Not found in the local evaluation spec: {}",
|
||||
flag_key
|
||||
)))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PostHogClientConfig {
|
||||
@@ -485,6 +507,13 @@ pub struct PostHogClient {
|
||||
client: reqwest::Client,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Debug)]
|
||||
pub struct CaptureEvent {
|
||||
pub event: String,
|
||||
pub distinct_id: String,
|
||||
pub properties: serde_json::Value,
|
||||
}
|
||||
|
||||
impl PostHogClient {
|
||||
pub fn new(config: PostHogClientConfig) -> Self {
|
||||
let client = reqwest::Client::new();
|
||||
@@ -528,7 +557,15 @@ impl PostHogClient {
|
||||
.bearer_auth(&self.config.server_api_key)
|
||||
.send()
|
||||
.await?;
|
||||
let status = response.status();
|
||||
let body = response.text().await?;
|
||||
if !status.is_success() {
|
||||
return Err(anyhow::anyhow!(
|
||||
"Failed to get feature flags: {}, {}",
|
||||
status,
|
||||
body
|
||||
));
|
||||
}
|
||||
Ok(serde_json::from_str(&body)?)
|
||||
}
|
||||
|
||||
@@ -540,12 +577,12 @@ impl PostHogClient {
|
||||
&self,
|
||||
event: &str,
|
||||
distinct_id: &str,
|
||||
properties: &HashMap<String, PostHogFlagFilterPropertyValue>,
|
||||
properties: &serde_json::Value,
|
||||
) -> anyhow::Result<()> {
|
||||
// PUBLIC_URL/capture/
|
||||
// with bearer token of self.client_api_key
|
||||
let url = format!("{}/capture/", self.config.public_api_url);
|
||||
self.client
|
||||
let response = self
|
||||
.client
|
||||
.post(url)
|
||||
.body(serde_json::to_string(&json!({
|
||||
"api_key": self.config.client_api_key,
|
||||
@@ -555,6 +592,39 @@ impl PostHogClient {
|
||||
}))?)
|
||||
.send()
|
||||
.await?;
|
||||
let status = response.status();
|
||||
let body = response.text().await?;
|
||||
if !status.is_success() {
|
||||
return Err(anyhow::anyhow!(
|
||||
"Failed to capture events: {}, {}",
|
||||
status,
|
||||
body
|
||||
));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn capture_event_batch(&self, events: &[CaptureEvent]) -> anyhow::Result<()> {
|
||||
// PUBLIC_URL/batch/
|
||||
let url = format!("{}/batch/", self.config.public_api_url);
|
||||
let response = self
|
||||
.client
|
||||
.post(url)
|
||||
.body(serde_json::to_string(&json!({
|
||||
"api_key": self.config.client_api_key,
|
||||
"batch": events,
|
||||
}))?)
|
||||
.send()
|
||||
.await?;
|
||||
let status = response.status();
|
||||
let body = response.text().await?;
|
||||
if !status.is_success() {
|
||||
return Err(anyhow::anyhow!(
|
||||
"Failed to capture events: {}, {}",
|
||||
status,
|
||||
body
|
||||
));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -28,6 +28,7 @@ use std::time::Duration;
|
||||
use tokio::sync::Notify;
|
||||
use tokio::time::Instant;
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
pub struct LeakyBucketConfig {
|
||||
/// This is the "time cost" of a single request unit.
|
||||
/// Should loosely represent how long it takes to handle a request unit in active resource time.
|
||||
|
||||
@@ -73,6 +73,7 @@ pub mod error;
|
||||
/// async timeout helper
|
||||
pub mod timeout;
|
||||
|
||||
pub mod span;
|
||||
pub mod sync;
|
||||
|
||||
pub mod failpoint_support;
|
||||
|
||||
19
libs/utils/src/span.rs
Normal file
19
libs/utils/src/span.rs
Normal file
@@ -0,0 +1,19 @@
|
||||
//! Tracing span helpers.
|
||||
|
||||
/// Records the given fields in the current span, as a single call. The fields must already have
|
||||
/// been declared for the span (typically with empty values).
|
||||
#[macro_export]
|
||||
macro_rules! span_record {
|
||||
($($tokens:tt)*) => {$crate::span_record_in!(::tracing::Span::current(), $($tokens)*)};
|
||||
}
|
||||
|
||||
/// Records the given fields in the given span, as a single call. The fields must already have been
|
||||
/// declared for the span (typically with empty values).
|
||||
#[macro_export]
|
||||
macro_rules! span_record_in {
|
||||
($span:expr, $($tokens:tt)*) => {
|
||||
if let Some(meta) = $span.metadata() {
|
||||
$span.record_all(&tracing::valueset!(meta.fields(), $($tokens)*));
|
||||
}
|
||||
};
|
||||
}
|
||||
@@ -439,6 +439,7 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState {
|
||||
currentClusterSize: crate::bindings::pg_atomic_uint64 { value: 0 },
|
||||
shard_ps_feedback: [empty_feedback; 128],
|
||||
num_shards: 0,
|
||||
replica_promote: false,
|
||||
min_ps_feedback: empty_feedback,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -34,6 +34,7 @@ fail.workspace = true
|
||||
futures.workspace = true
|
||||
hashlink.workspace = true
|
||||
hex.workspace = true
|
||||
http.workspace = true
|
||||
http-utils.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
humantime.workspace = true
|
||||
@@ -50,6 +51,7 @@ pageserver_api.workspace = true
|
||||
pageserver_client.workspace = true # for ResponseErrorMessageExt TOOD refactor that
|
||||
pageserver_compaction.workspace = true
|
||||
pageserver_page_api.workspace = true
|
||||
peekable.workspace = true
|
||||
pem.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
postgres_backend.workspace = true
|
||||
@@ -61,6 +63,7 @@ postgres-types.workspace = true
|
||||
posthog_client_lite.workspace = true
|
||||
pprof.workspace = true
|
||||
pq_proto.workspace = true
|
||||
prost.workspace = true
|
||||
rand.workspace = true
|
||||
range-set-blaze = { version = "0.1.16", features = ["alloc"] }
|
||||
regex.workspace = true
|
||||
@@ -93,6 +96,7 @@ tokio-util.workspace = true
|
||||
toml_edit = { workspace = true, features = [ "serde" ] }
|
||||
tonic.workspace = true
|
||||
tonic-reflection.workspace = true
|
||||
tower.workspace = true
|
||||
tracing.workspace = true
|
||||
tracing-utils.workspace = true
|
||||
url.workspace = true
|
||||
|
||||
@@ -264,10 +264,56 @@ mod propagation_of_cached_label_value {
|
||||
}
|
||||
}
|
||||
|
||||
criterion_group!(histograms, histograms::bench_bucket_scalability);
|
||||
mod histograms {
|
||||
use std::time::Instant;
|
||||
|
||||
use criterion::{BenchmarkId, Criterion};
|
||||
use metrics::core::Collector;
|
||||
|
||||
pub fn bench_bucket_scalability(c: &mut Criterion) {
|
||||
let mut g = c.benchmark_group("bucket_scalability");
|
||||
|
||||
for n in [1, 4, 8, 16, 32, 64, 128, 256] {
|
||||
g.bench_with_input(BenchmarkId::new("nbuckets", n), &n, |b, n| {
|
||||
b.iter_custom(|iters| {
|
||||
let buckets: Vec<f64> = (0..*n).map(|i| i as f64 * 100.0).collect();
|
||||
let histo = metrics::Histogram::with_opts(
|
||||
metrics::prometheus::HistogramOpts::new("name", "help")
|
||||
.buckets(buckets.clone()),
|
||||
)
|
||||
.unwrap();
|
||||
let start = Instant::now();
|
||||
for i in 0..usize::try_from(iters).unwrap() {
|
||||
histo.observe(buckets[i % buckets.len()]);
|
||||
}
|
||||
let elapsed = start.elapsed();
|
||||
// self-test
|
||||
let mfs = histo.collect();
|
||||
assert_eq!(mfs.len(), 1);
|
||||
let metrics = mfs[0].get_metric();
|
||||
assert_eq!(metrics.len(), 1);
|
||||
let histo = metrics[0].get_histogram();
|
||||
let buckets = histo.get_bucket();
|
||||
assert!(
|
||||
buckets
|
||||
.iter()
|
||||
.enumerate()
|
||||
.all(|(i, b)| b.get_cumulative_count()
|
||||
>= i as u64 * (iters / buckets.len() as u64))
|
||||
);
|
||||
elapsed
|
||||
})
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
criterion_main!(
|
||||
label_values,
|
||||
single_metric_multicore_scalability,
|
||||
propagation_of_cached_label_value
|
||||
propagation_of_cached_label_value,
|
||||
histograms,
|
||||
);
|
||||
|
||||
/*
|
||||
@@ -290,6 +336,14 @@ propagation_of_cached_label_value__naive/nthreads/8 time: [211.50 ns 214.44 ns
|
||||
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1 time: [14.135 ns 14.147 ns 14.160 ns]
|
||||
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4 time: [14.243 ns 14.255 ns 14.268 ns]
|
||||
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8 time: [14.470 ns 14.682 ns 14.895 ns]
|
||||
bucket_scalability/nbuckets/1 time: [30.352 ns 30.353 ns 30.354 ns]
|
||||
bucket_scalability/nbuckets/4 time: [30.464 ns 30.465 ns 30.467 ns]
|
||||
bucket_scalability/nbuckets/8 time: [30.569 ns 30.575 ns 30.584 ns]
|
||||
bucket_scalability/nbuckets/16 time: [30.961 ns 30.965 ns 30.969 ns]
|
||||
bucket_scalability/nbuckets/32 time: [35.691 ns 35.707 ns 35.722 ns]
|
||||
bucket_scalability/nbuckets/64 time: [47.829 ns 47.898 ns 47.974 ns]
|
||||
bucket_scalability/nbuckets/128 time: [73.479 ns 73.512 ns 73.545 ns]
|
||||
bucket_scalability/nbuckets/256 time: [127.92 ns 127.94 ns 127.96 ns]
|
||||
|
||||
Results on an i3en.3xlarge instance
|
||||
|
||||
@@ -344,6 +398,14 @@ propagation_of_cached_label_value__naive/nthreads/8 time: [434.87 ns 456.4
|
||||
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1 time: [3.3767 ns 3.3974 ns 3.4220 ns]
|
||||
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4 time: [3.6105 ns 4.2355 ns 5.1463 ns]
|
||||
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8 time: [4.0889 ns 4.9714 ns 6.0779 ns]
|
||||
bucket_scalability/nbuckets/1 time: [4.8455 ns 4.8542 ns 4.8646 ns]
|
||||
bucket_scalability/nbuckets/4 time: [4.5663 ns 4.5722 ns 4.5787 ns]
|
||||
bucket_scalability/nbuckets/8 time: [4.5531 ns 4.5670 ns 4.5842 ns]
|
||||
bucket_scalability/nbuckets/16 time: [4.6392 ns 4.6524 ns 4.6685 ns]
|
||||
bucket_scalability/nbuckets/32 time: [6.0302 ns 6.0439 ns 6.0589 ns]
|
||||
bucket_scalability/nbuckets/64 time: [10.608 ns 10.644 ns 10.691 ns]
|
||||
bucket_scalability/nbuckets/128 time: [22.178 ns 22.316 ns 22.483 ns]
|
||||
bucket_scalability/nbuckets/256 time: [42.190 ns 42.328 ns 42.492 ns]
|
||||
|
||||
Results on a Hetzner AX102 AMD Ryzen 9 7950X3D 16-Core Processor
|
||||
|
||||
@@ -362,5 +424,13 @@ propagation_of_cached_label_value__naive/nthreads/8 time: [164.24 ns 170.1
|
||||
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1 time: [2.2915 ns 2.2960 ns 2.3012 ns]
|
||||
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4 time: [2.5726 ns 2.6158 ns 2.6624 ns]
|
||||
propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8 time: [2.7068 ns 2.8243 ns 2.9824 ns]
|
||||
bucket_scalability/nbuckets/1 time: [6.3998 ns 6.4288 ns 6.4684 ns]
|
||||
bucket_scalability/nbuckets/4 time: [6.3603 ns 6.3620 ns 6.3637 ns]
|
||||
bucket_scalability/nbuckets/8 time: [6.1646 ns 6.1654 ns 6.1667 ns]
|
||||
bucket_scalability/nbuckets/16 time: [6.1341 ns 6.1391 ns 6.1454 ns]
|
||||
bucket_scalability/nbuckets/32 time: [8.2206 ns 8.2254 ns 8.2301 ns]
|
||||
bucket_scalability/nbuckets/64 time: [13.988 ns 13.994 ns 14.000 ns]
|
||||
bucket_scalability/nbuckets/128 time: [28.180 ns 28.216 ns 28.251 ns]
|
||||
bucket_scalability/nbuckets/256 time: [54.914 ns 54.931 ns 54.951 ns]
|
||||
|
||||
*/
|
||||
|
||||
30
pageserver/client_grpc/Cargo.toml
Normal file
30
pageserver/client_grpc/Cargo.toml
Normal file
@@ -0,0 +1,30 @@
|
||||
[package]
|
||||
name = "pageserver_client_grpc"
|
||||
version = "0.1.0"
|
||||
edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
bytes.workspace = true
|
||||
futures.workspace = true
|
||||
http.workspace = true
|
||||
thiserror.workspace = true
|
||||
tonic.workspace = true
|
||||
tracing.workspace = true
|
||||
tokio = { version = "1.43.1", features = ["full", "macros", "net", "io-util", "rt", "rt-multi-thread"] }
|
||||
uuid = { version = "1", features = ["v4"] }
|
||||
tower = { version = "0.4", features = ["timeout", "util"] }
|
||||
rand = "0.8"
|
||||
tokio-util = { version = "0.7", features = ["compat"] }
|
||||
hyper-util = "0.1.9"
|
||||
hyper = "1.6.0"
|
||||
metrics.workspace = true
|
||||
priority-queue = "2.3.1"
|
||||
async-trait = { version = "0.1" }
|
||||
tokio-stream = "0.1"
|
||||
dashmap = "5"
|
||||
chrono = { version = "0.4", features = ["serde"] }
|
||||
|
||||
|
||||
pageserver_page_api.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
utils.workspace = true
|
||||
296
pageserver/client_grpc/examples/load_test.rs
Normal file
296
pageserver/client_grpc/examples/load_test.rs
Normal file
@@ -0,0 +1,296 @@
|
||||
// examples/load_test.rs, generated by AI
|
||||
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::sync::{
|
||||
Arc,
|
||||
Mutex,
|
||||
atomic::{AtomicU64, AtomicUsize, Ordering},
|
||||
};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use tokio::task;
|
||||
use tokio::time::sleep;
|
||||
use rand::Rng;
|
||||
use tonic::Status;
|
||||
use uuid::Uuid;
|
||||
|
||||
// Pull in your ConnectionPool and PooledItemFactory from the pageserver_client_grpc crate.
|
||||
// Adjust these paths if necessary.
|
||||
use pageserver_client_grpc::client_cache::ConnectionPool;
|
||||
use pageserver_client_grpc::client_cache::PooledItemFactory;
|
||||
|
||||
// --------------------------------------
|
||||
// GLOBAL COUNTERS FOR “CREATED” / “DROPPED” MockConnections
|
||||
// --------------------------------------
|
||||
static CREATED: AtomicU64 = AtomicU64::new(0);
|
||||
static DROPPED: AtomicU64 = AtomicU64::new(0);
|
||||
|
||||
// --------------------------------------
|
||||
// MockConnection + Factory
|
||||
// --------------------------------------
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct MockConnection {
|
||||
pub id: u64,
|
||||
}
|
||||
|
||||
impl Clone for MockConnection {
|
||||
fn clone(&self) -> Self {
|
||||
// Cloning a MockConnection does NOT count as “creating” a brand‐new connection,
|
||||
// so we do NOT bump CREATED here. We only bump CREATED in the factory’s `create()`.
|
||||
CREATED.fetch_add(1, Ordering::Relaxed);
|
||||
MockConnection { id: self.id }
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for MockConnection {
|
||||
fn drop(&mut self) {
|
||||
// When a MockConnection actually gets dropped, bump the counter.
|
||||
DROPPED.fetch_add(1, Ordering::SeqCst);
|
||||
}
|
||||
}
|
||||
|
||||
pub struct MockConnectionFactory {
|
||||
counter: AtomicU64,
|
||||
}
|
||||
|
||||
impl MockConnectionFactory {
|
||||
pub fn new() -> Self {
|
||||
MockConnectionFactory {
|
||||
counter: AtomicU64::new(1),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl PooledItemFactory<MockConnection> for MockConnectionFactory {
|
||||
/// The trait on ConnectionPool expects:
|
||||
/// async fn create(&self, timeout: Duration)
|
||||
/// -> Result<Result<MockConnection, Status>, tokio::time::error::Elapsed>;
|
||||
///
|
||||
/// On success: Ok(Ok(MockConnection))
|
||||
/// On a simulated “gRPC” failure: Ok(Err(Status::…))
|
||||
/// On a transport/factory error: Err(Box<…>)
|
||||
async fn create(
|
||||
&self,
|
||||
_timeout: Duration,
|
||||
) -> Result<Result<MockConnection, Status>, tokio::time::error::Elapsed> {
|
||||
// Simulate connection creation immediately succeeding.
|
||||
CREATED.fetch_add(1, Ordering::SeqCst);
|
||||
let next_id = self.counter.fetch_add(1, Ordering::Relaxed);
|
||||
Ok(Ok(MockConnection { id: next_id }))
|
||||
}
|
||||
}
|
||||
|
||||
// --------------------------------------
|
||||
// CLIENT WORKER
|
||||
// --------------------------------------
|
||||
//
|
||||
// Each worker repeatedly calls `pool.get_client().await`. When it succeeds, we:
|
||||
// 1. Lock the shared Mutex<HashMap<u64, Arc<AtomicUsize>>> to fetch/insert an Arc<AtomicUsize> for this conn_id.
|
||||
// 2. Lock the shared Mutex<HashSet<u64>> to record this conn_id as “seen.”
|
||||
// 3. Drop both locks, then atomically increment that counter and assert it ≤ max_consumers.
|
||||
// 4. Sleep 10–100 ms to simulate “work.”
|
||||
// 5. Atomically decrement the counter.
|
||||
// 6. Call `pooled.finish(Ok(()))` to return to the pool.
|
||||
|
||||
async fn client_worker(
|
||||
pool: Arc<ConnectionPool<MockConnection>>,
|
||||
usage_map: Arc<Mutex<HashMap<u64, Arc<AtomicUsize>>>>,
|
||||
seen_set: Arc<Mutex<HashSet<u64>>>,
|
||||
max_consumers: usize,
|
||||
worker_id: usize,
|
||||
) {
|
||||
for iteration in 0..10 {
|
||||
match pool.clone().get_client().await {
|
||||
Ok(pooled) => {
|
||||
let conn: MockConnection = pooled.channel();
|
||||
let conn_id = conn.id;
|
||||
|
||||
// 1. Fetch or insert the Arc<AtomicUsize> for this conn_id:
|
||||
let counter_arc: Arc<AtomicUsize> = {
|
||||
let mut guard = usage_map.lock().unwrap();
|
||||
guard
|
||||
.entry(conn_id)
|
||||
.or_insert_with(|| Arc::new(AtomicUsize::new(0)))
|
||||
.clone()
|
||||
// MutexGuard is dropped here
|
||||
};
|
||||
|
||||
// 2. Record this conn_id in the shared HashSet of “seen” IDs:
|
||||
{
|
||||
let mut seen_guard = seen_set.lock().unwrap();
|
||||
seen_guard.insert(conn_id);
|
||||
// MutexGuard is dropped immediately
|
||||
}
|
||||
|
||||
// 3. Atomically bump the count for this connection ID
|
||||
let prev = counter_arc.fetch_add(1, Ordering::SeqCst);
|
||||
let current = prev + 1;
|
||||
assert!(
|
||||
current <= max_consumers,
|
||||
"Connection {} exceeded max_consumers (got {})",
|
||||
conn_id,
|
||||
current
|
||||
);
|
||||
|
||||
println!(
|
||||
"[worker {}][iter {}] got MockConnection id={} ({} concurrent)",
|
||||
worker_id, iteration, conn_id, current
|
||||
);
|
||||
|
||||
// 4. Simulate some work (10–100 ms)
|
||||
let delay_ms = rand::thread_rng().gen_range(10..100);
|
||||
sleep(Duration::from_millis(delay_ms)).await;
|
||||
|
||||
// 5. Decrement the usage counter
|
||||
let prev2 = counter_arc.fetch_sub(1, Ordering::SeqCst);
|
||||
let after = prev2 - 1;
|
||||
println!(
|
||||
"[worker {}][iter {}] returning MockConnection id={} (now {} remain)",
|
||||
worker_id, iteration, conn_id, after
|
||||
);
|
||||
|
||||
// 6. Return to the pool (mark success)
|
||||
pooled.finish(Ok(())).await;
|
||||
}
|
||||
Err(status) => {
|
||||
eprintln!(
|
||||
"[worker {}][iter {}] failed to get client: {:?}",
|
||||
worker_id, iteration, status
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Small random pause before next iteration to spread out load
|
||||
let pause = rand::thread_rng().gen_range(0..20);
|
||||
sleep(Duration::from_millis(pause)).await;
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::main(flavor = "multi_thread", worker_threads = 8)]
|
||||
async fn main() {
|
||||
// --------------------------------------
|
||||
// 1. Create factory and shared instrumentation
|
||||
// --------------------------------------
|
||||
let factory = Arc::new(MockConnectionFactory::new());
|
||||
|
||||
// Shared map: connection ID → Arc<AtomicUsize>
|
||||
let usage_map: Arc<Mutex<HashMap<u64, Arc<AtomicUsize>>>> =
|
||||
Arc::new(Mutex::new(HashMap::new()));
|
||||
|
||||
// Shared set: record each unique connection ID we actually saw
|
||||
let seen_set: Arc<Mutex<HashSet<u64>>> = Arc::new(Mutex::new(HashSet::new()));
|
||||
|
||||
// --------------------------------------
|
||||
// 2. Pool parameters
|
||||
// --------------------------------------
|
||||
let connect_timeout = Duration::from_millis(500);
|
||||
let connect_backoff = Duration::from_millis(100);
|
||||
let max_consumers = 100; // test limit
|
||||
let error_threshold = 2; // mock never fails
|
||||
let max_idle_duration = Duration::from_secs(2);
|
||||
let max_total_connections = 3;
|
||||
let aggregate_metrics = None;
|
||||
|
||||
let pool: Arc<ConnectionPool<MockConnection>> = ConnectionPool::new(
|
||||
factory,
|
||||
connect_timeout,
|
||||
connect_backoff,
|
||||
max_consumers,
|
||||
error_threshold,
|
||||
max_idle_duration,
|
||||
max_total_connections,
|
||||
aggregate_metrics,
|
||||
);
|
||||
|
||||
// --------------------------------------
|
||||
// 3. Spawn worker tasks
|
||||
// --------------------------------------
|
||||
let num_workers = 10000;
|
||||
let mut handles = Vec::with_capacity(num_workers);
|
||||
let start_time = Instant::now();
|
||||
|
||||
for worker_id in 0..num_workers {
|
||||
let pool_clone = Arc::clone(&pool);
|
||||
let usage_clone = Arc::clone(&usage_map);
|
||||
let seen_clone = Arc::clone(&seen_set);
|
||||
let mc = max_consumers;
|
||||
|
||||
let handle = task::spawn(async move {
|
||||
client_worker(pool_clone, usage_clone, seen_clone, mc, worker_id).await;
|
||||
});
|
||||
handles.push(handle);
|
||||
}
|
||||
|
||||
// --------------------------------------
|
||||
// 4. Wait for workers to finish
|
||||
// --------------------------------------
|
||||
for handle in handles {
|
||||
let _ = handle.await;
|
||||
}
|
||||
let elapsed = Instant::now().duration_since(start_time);
|
||||
println!(
|
||||
"All {} workers completed in {:?}",
|
||||
num_workers, elapsed
|
||||
);
|
||||
|
||||
// --------------------------------------
|
||||
// 5. Print the total number of unique connections seen so far
|
||||
// --------------------------------------
|
||||
let unique_count = {
|
||||
let seen_guard = seen_set.lock().unwrap();
|
||||
seen_guard.len()
|
||||
};
|
||||
println!("Total unique connections used by workers: {}", unique_count);
|
||||
|
||||
// --------------------------------------
|
||||
// 6. Sleep so the background sweeper can run (max_idle_duration = 2 s)
|
||||
// --------------------------------------
|
||||
sleep(Duration::from_secs(3)).await;
|
||||
|
||||
// --------------------------------------
|
||||
// 7. Shutdown the pool
|
||||
// --------------------------------------
|
||||
let shutdown_pool = Arc::clone(&pool);
|
||||
shutdown_pool.shutdown().await;
|
||||
println!("Pool.shutdown() returned.");
|
||||
|
||||
// --------------------------------------
|
||||
// 8. Verify that no background task still holds an Arc clone of `pool`.
|
||||
// If any task is still alive (sweeper/create_connection), strong_count > 1.
|
||||
// --------------------------------------
|
||||
sleep(Duration::from_secs(1)).await; // give tasks time to exit
|
||||
let sc = Arc::strong_count(&pool);
|
||||
assert!(
|
||||
sc == 1,
|
||||
"Pool tasks did not all terminate: Arc::strong_count = {} (expected 1)",
|
||||
sc
|
||||
);
|
||||
println!("Verified: all pool tasks have terminated (strong_count == 1).");
|
||||
|
||||
// --------------------------------------
|
||||
// 9. Verify no MockConnection was leaked:
|
||||
// CREATED must equal DROPPED.
|
||||
// --------------------------------------
|
||||
let created = CREATED.load(Ordering::SeqCst);
|
||||
let dropped = DROPPED.load(Ordering::SeqCst);
|
||||
assert!(
|
||||
created == dropped,
|
||||
"Leaked connections: created={} but dropped={}",
|
||||
created,
|
||||
dropped
|
||||
);
|
||||
println!(
|
||||
"Verified: no connections leaked (created = {}, dropped = {}).",
|
||||
created, dropped
|
||||
);
|
||||
|
||||
// --------------------------------------
|
||||
// 10. Because `client_worker` asserted inside that no connection
|
||||
// ever exceeded `max_consumers`, reaching this point means that check passed.
|
||||
// --------------------------------------
|
||||
println!("All per-connection usage stayed within max_consumers = {}.", max_consumers);
|
||||
|
||||
println!("Load test complete; exiting cleanly.");
|
||||
}
|
||||
160
pageserver/client_grpc/examples/request_tracker_load_test.rs
Normal file
160
pageserver/client_grpc/examples/request_tracker_load_test.rs
Normal file
@@ -0,0 +1,160 @@
|
||||
// examples/request_tracker_load_test.rs
|
||||
|
||||
use std::{sync::Arc, time::Duration};
|
||||
use tokio;
|
||||
use pageserver_client_grpc::request_tracker::RequestTracker;
|
||||
use pageserver_client_grpc::request_tracker::MockStreamFactory;
|
||||
use pageserver_client_grpc::request_tracker::StreamReturner;
|
||||
use pageserver_client_grpc::client_cache::ConnectionPool;
|
||||
use pageserver_client_grpc::client_cache::PooledItemFactory;
|
||||
use pageserver_client_grpc::ClientCacheOptions;
|
||||
use pageserver_client_grpc::PageserverClientAggregateMetrics;
|
||||
use pageserver_client_grpc::AuthInterceptor;
|
||||
|
||||
use pageserver_client_grpc::client_cache::ChannelFactory;
|
||||
|
||||
use tonic::{transport::{Channel}, Request};
|
||||
|
||||
use rand::prelude::*;
|
||||
|
||||
use pageserver_api::key::Key;
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
use futures::stream::FuturesOrdered;
|
||||
use futures::StreamExt;
|
||||
// use chrono
|
||||
use chrono::Utc;
|
||||
|
||||
use pageserver_page_api::{GetPageClass, GetPageResponse};
|
||||
use pageserver_page_api::proto;
|
||||
#[derive(Clone)]
|
||||
struct KeyRange {
|
||||
timeline: TenantTimelineId,
|
||||
timeline_lsn: Lsn,
|
||||
start: i128,
|
||||
end: i128,
|
||||
}
|
||||
|
||||
impl KeyRange {
|
||||
fn len(&self) -> i128 {
|
||||
self.end - self.start
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
// 1) configure the client‐pool behavior
|
||||
let client_cache_options = ClientCacheOptions {
|
||||
max_delay_ms: 0,
|
||||
drop_rate: 0.0,
|
||||
hang_rate: 0.0,
|
||||
connect_timeout: Duration::from_secs(10),
|
||||
connect_backoff: Duration::from_millis(200),
|
||||
max_consumers: 64,
|
||||
error_threshold: 10,
|
||||
max_idle_duration: Duration::from_secs(60),
|
||||
max_total_connections: 12,
|
||||
};
|
||||
|
||||
// 2) metrics collector (we assume Default is implemented)
|
||||
let metrics = Arc::new(PageserverClientAggregateMetrics::new());
|
||||
let pool = ConnectionPool::<StreamReturner>::new(
|
||||
Arc::new(MockStreamFactory::new(
|
||||
)),
|
||||
client_cache_options.connect_timeout,
|
||||
client_cache_options.connect_backoff,
|
||||
client_cache_options.max_consumers,
|
||||
client_cache_options.error_threshold,
|
||||
client_cache_options.max_idle_duration,
|
||||
client_cache_options.max_total_connections,
|
||||
Some(Arc::clone(&metrics)),
|
||||
);
|
||||
|
||||
// -----------
|
||||
// There is no mock for the unary connection pool, so for now just
|
||||
// don't use this pool
|
||||
//
|
||||
let channel_fact : Arc<dyn PooledItemFactory<Channel> + Send + Sync> = Arc::new(ChannelFactory::new(
|
||||
"".to_string(),
|
||||
client_cache_options.max_delay_ms,
|
||||
client_cache_options.drop_rate,
|
||||
client_cache_options.hang_rate,
|
||||
));
|
||||
let unary_pool: Arc<ConnectionPool<Channel>> = ConnectionPool::new(
|
||||
Arc::clone(&channel_fact),
|
||||
client_cache_options.connect_timeout,
|
||||
client_cache_options.connect_backoff,
|
||||
client_cache_options.max_consumers,
|
||||
client_cache_options.error_threshold,
|
||||
client_cache_options.max_idle_duration,
|
||||
client_cache_options.max_total_connections,
|
||||
Some(Arc::clone(&metrics)),
|
||||
);
|
||||
|
||||
// -----------
|
||||
// Dummy auth interceptor. This is not used in this test.
|
||||
let auth_interceptor = AuthInterceptor::new("dummy_tenant_id",
|
||||
"dummy_timeline_id",
|
||||
None);
|
||||
let mut tracker = RequestTracker::new(
|
||||
pool,
|
||||
unary_pool,
|
||||
auth_interceptor,
|
||||
);
|
||||
|
||||
// 4) fire off 10 000 requests in parallel
|
||||
let mut handles = FuturesOrdered::new();
|
||||
for i in 0..500000 {
|
||||
|
||||
let mut rng = rand::thread_rng();
|
||||
let r = 0..=1000000i128;
|
||||
let key: i128 = rng.gen_range(r.clone());
|
||||
let key = Key::from_i128(key);
|
||||
let (rel_tag, block_no) = key
|
||||
.to_rel_block()
|
||||
.expect("we filter non-rel-block keys out above");
|
||||
|
||||
let req2 = proto::GetPageRequest {
|
||||
request_id: 0,
|
||||
request_class: proto::GetPageClass::Normal as i32,
|
||||
read_lsn: Some(proto::ReadLsn {
|
||||
request_lsn: if rng.gen_bool(0.5) {
|
||||
u64::from(Lsn::MAX)
|
||||
} else {
|
||||
10000
|
||||
},
|
||||
not_modified_since_lsn: 10000,
|
||||
}),
|
||||
rel: Some(rel_tag.into()),
|
||||
block_number: vec![block_no],
|
||||
};
|
||||
let req_model = pageserver_page_api::GetPageRequest::try_from(req2.clone());
|
||||
|
||||
// RequestTracker is Clone, so we can share it
|
||||
let mut tr = tracker.clone();
|
||||
let fut = async move {
|
||||
let resp = tr.send_getpage_request(req_model.unwrap()).await.unwrap();
|
||||
// sanity‐check: the mock echo returns the same request_id
|
||||
assert!(resp.request_id > 0);
|
||||
};
|
||||
handles.push_back(fut);
|
||||
|
||||
// empty future
|
||||
let fut = async move {};
|
||||
fut.await;
|
||||
}
|
||||
|
||||
// print timestamp
|
||||
println!("Starting 5000000 requests at: {}", chrono::Utc::now());
|
||||
// 5) wait for them all
|
||||
for i in 0..500000 {
|
||||
handles.next().await.expect("Failed to get next handle");
|
||||
}
|
||||
|
||||
// print timestamp
|
||||
println!("Finished 5000000 requests at: {}", chrono::Utc::now());
|
||||
|
||||
println!("✅ All 100000 requests completed successfully");
|
||||
}
|
||||
741
pageserver/client_grpc/src/client_cache.rs
Normal file
741
pageserver/client_grpc/src/client_cache.rs
Normal file
@@ -0,0 +1,741 @@
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
io::{self, Error, ErrorKind},
|
||||
sync::Arc,
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
|
||||
use priority_queue::PriorityQueue;
|
||||
|
||||
use tokio::{
|
||||
io::{AsyncRead, AsyncWrite, ReadBuf},
|
||||
net::TcpStream,
|
||||
sync::{Mutex, OwnedSemaphorePermit, Semaphore},
|
||||
time::sleep,
|
||||
};
|
||||
use tonic::transport::{Channel, Endpoint};
|
||||
|
||||
use uuid;
|
||||
|
||||
use std::{
|
||||
pin::Pin,
|
||||
task::{Context, Poll},
|
||||
};
|
||||
|
||||
use futures::future;
|
||||
use rand::{Rng, SeedableRng, rngs::StdRng};
|
||||
|
||||
use bytes::BytesMut;
|
||||
use http::Uri;
|
||||
use hyper_util::rt::TokioIo;
|
||||
use tower::service_fn;
|
||||
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use async_trait::async_trait;
|
||||
|
||||
//
|
||||
// The "TokioTcp" is flakey TCP network for testing purposes, in order
|
||||
// to simulate network errors and delays.
|
||||
//
|
||||
|
||||
/// Wraps a `TcpStream`, buffers incoming data, and injects a random delay per fresh read/write.
|
||||
pub struct TokioTcp {
|
||||
tcp: TcpStream,
|
||||
/// Maximum randomized delay in milliseconds
|
||||
delay_ms: u64,
|
||||
|
||||
/// Next deadline instant for delay
|
||||
deadline: Instant,
|
||||
/// Internal buffer of previously-read data
|
||||
buffer: BytesMut,
|
||||
}
|
||||
|
||||
impl TokioTcp {
|
||||
/// Create a new wrapper with given max delay (ms)
|
||||
pub fn new(stream: TcpStream, delay_ms: u64) -> Self {
|
||||
let initial = if delay_ms > 0 {
|
||||
rand::thread_rng().gen_range(0..delay_ms)
|
||||
} else {
|
||||
0
|
||||
};
|
||||
let deadline = Instant::now() + Duration::from_millis(initial);
|
||||
TokioTcp {
|
||||
tcp: stream,
|
||||
delay_ms,
|
||||
deadline,
|
||||
buffer: BytesMut::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AsyncRead for TokioTcp {
|
||||
fn poll_read(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut Context<'_>,
|
||||
buf: &mut ReadBuf<'_>,
|
||||
) -> Poll<io::Result<()>> {
|
||||
// Safe because TokioTcp is Unpin
|
||||
let this = self.get_mut();
|
||||
|
||||
// 1) Drain any buffered data
|
||||
if !this.buffer.is_empty() {
|
||||
let to_copy = this.buffer.len().min(buf.remaining());
|
||||
buf.put_slice(&this.buffer.split_to(to_copy));
|
||||
return Poll::Ready(Ok(()));
|
||||
}
|
||||
|
||||
// 2) If we're still before the deadline, schedule a wake and return Pending
|
||||
let now = Instant::now();
|
||||
if this.delay_ms > 0 && now < this.deadline {
|
||||
let waker = cx.waker().clone();
|
||||
let wait = this.deadline - now;
|
||||
tokio::spawn(async move {
|
||||
sleep(wait).await;
|
||||
waker.wake_by_ref();
|
||||
});
|
||||
return Poll::Pending;
|
||||
}
|
||||
|
||||
// 3) Past deadline: compute next random deadline
|
||||
if this.delay_ms > 0 {
|
||||
let next_ms = rand::thread_rng().gen_range(0..=this.delay_ms);
|
||||
this.deadline = Instant::now() + Duration::from_millis(next_ms);
|
||||
}
|
||||
|
||||
// 4) Perform actual read into a temporary buffer
|
||||
let mut tmp = [0u8; 4096];
|
||||
let mut rb = ReadBuf::new(&mut tmp);
|
||||
match Pin::new(&mut this.tcp).poll_read(cx, &mut rb) {
|
||||
Poll::Pending => Poll::Pending,
|
||||
Poll::Ready(Ok(())) => {
|
||||
let filled = rb.filled();
|
||||
if filled.is_empty() {
|
||||
// EOF or zero bytes
|
||||
Poll::Ready(Ok(()))
|
||||
} else {
|
||||
this.buffer.extend_from_slice(filled);
|
||||
let to_copy = this.buffer.len().min(buf.remaining());
|
||||
buf.put_slice(&this.buffer.split_to(to_copy));
|
||||
Poll::Ready(Ok(()))
|
||||
}
|
||||
}
|
||||
Poll::Ready(Err(e)) => Poll::Ready(Err(e)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AsyncWrite for TokioTcp {
|
||||
fn poll_write(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut Context<'_>,
|
||||
data: &[u8],
|
||||
) -> Poll<io::Result<usize>> {
|
||||
let this = self.get_mut();
|
||||
|
||||
// 1) If before deadline, schedule wake and return Pending
|
||||
let now = Instant::now();
|
||||
if this.delay_ms > 0 && now < this.deadline {
|
||||
let waker = cx.waker().clone();
|
||||
let wait = this.deadline - now;
|
||||
tokio::spawn(async move {
|
||||
sleep(wait).await;
|
||||
waker.wake_by_ref();
|
||||
});
|
||||
return Poll::Pending;
|
||||
}
|
||||
|
||||
// 2) Past deadline: compute next random deadline
|
||||
if this.delay_ms > 0 {
|
||||
let next_ms = rand::thread_rng().gen_range(0..=this.delay_ms);
|
||||
this.deadline = Instant::now() + Duration::from_millis(next_ms);
|
||||
}
|
||||
|
||||
// 3) Actual write
|
||||
Pin::new(&mut this.tcp).poll_write(cx, data)
|
||||
}
|
||||
|
||||
fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
|
||||
let this = self.get_mut();
|
||||
Pin::new(&mut this.tcp).poll_flush(cx)
|
||||
}
|
||||
|
||||
fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
|
||||
let this = self.get_mut();
|
||||
Pin::new(&mut this.tcp).poll_shutdown(cx)
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
pub trait PooledItemFactory<T>: Send + Sync + 'static {
|
||||
/// Create a new pooled item.
|
||||
async fn create(&self, connect_timeout: Duration) -> Result<Result<T, tonic::Status>, tokio::time::error::Elapsed>;
|
||||
}
|
||||
|
||||
pub struct ChannelFactory {
|
||||
endpoint: String,
|
||||
max_delay_ms: u64,
|
||||
drop_rate: f64,
|
||||
hang_rate: f64,
|
||||
}
|
||||
|
||||
|
||||
impl ChannelFactory {
|
||||
pub fn new(
|
||||
endpoint: String,
|
||||
max_delay_ms: u64,
|
||||
drop_rate: f64,
|
||||
hang_rate: f64,
|
||||
) -> Self {
|
||||
ChannelFactory {
|
||||
endpoint,
|
||||
max_delay_ms,
|
||||
drop_rate,
|
||||
hang_rate,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl PooledItemFactory<Channel> for ChannelFactory {
|
||||
async fn create(&self, connect_timeout: Duration) -> Result<Result<Channel, tonic::Status>, tokio::time::error::Elapsed> {
|
||||
let max_delay_ms = self.max_delay_ms;
|
||||
let drop_rate = self.drop_rate;
|
||||
let hang_rate = self.hang_rate;
|
||||
|
||||
// This is a custom connector that inserts delays and errors, for
|
||||
// testing purposes. It would normally be disabled by the config.
|
||||
let connector = service_fn(move |uri: Uri| {
|
||||
let drop_rate = drop_rate;
|
||||
let hang_rate = hang_rate;
|
||||
async move {
|
||||
let mut rng = StdRng::from_entropy();
|
||||
// Simulate an indefinite hang
|
||||
if hang_rate > 0.0 && rng.gen_bool(hang_rate) {
|
||||
// never completes, to test timeout
|
||||
return future::pending::<Result<TokioIo<TokioTcp>, std::io::Error>>().await;
|
||||
}
|
||||
|
||||
// Random drop (connect error)
|
||||
if drop_rate > 0.0 && rng.gen_bool(drop_rate) {
|
||||
return Err(std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
"simulated connect drop",
|
||||
));
|
||||
}
|
||||
|
||||
// Otherwise perform real TCP connect
|
||||
let addr = match (uri.host(), uri.port()) {
|
||||
// host + explicit port
|
||||
(Some(host), Some(port)) => format!("{}:{}", host, port.as_str()),
|
||||
// host only (no port)
|
||||
(Some(host), None) => host.to_string(),
|
||||
// neither? error out
|
||||
_ => return Err(Error::new(ErrorKind::InvalidInput, "no host or port")),
|
||||
};
|
||||
|
||||
let tcp = TcpStream::connect(addr).await?;
|
||||
let tcpwrapper = TokioTcp::new(tcp, max_delay_ms);
|
||||
Ok(TokioIo::new(tcpwrapper))
|
||||
}
|
||||
});
|
||||
|
||||
|
||||
let attempt = tokio::time::timeout(
|
||||
connect_timeout,
|
||||
Endpoint::from_shared(self.endpoint.clone())
|
||||
.expect("invalid endpoint")
|
||||
.timeout(connect_timeout)
|
||||
.connect_with_connector(connector),
|
||||
)
|
||||
.await;
|
||||
match attempt {
|
||||
Ok(Ok(channel)) => {
|
||||
// Connection succeeded
|
||||
Ok(Ok(channel))
|
||||
}
|
||||
Ok(Err(e)) => {
|
||||
Ok(Err(tonic::Status::new(
|
||||
tonic::Code::Unavailable,
|
||||
format!("Failed to connect: {}", e),
|
||||
)))
|
||||
}
|
||||
Err(e) => {
|
||||
Err(e)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// A pooled gRPC client with capacity tracking and error handling.
|
||||
pub struct ConnectionPool<T> {
|
||||
inner: Mutex<Inner<T>>,
|
||||
|
||||
fact: Arc<dyn PooledItemFactory<T> + Send + Sync>,
|
||||
|
||||
connect_timeout: Duration,
|
||||
connect_backoff: Duration,
|
||||
/// The maximum number of consumers that can use a single connection.
|
||||
max_consumers: usize,
|
||||
/// The number of consecutive errors before a connection is removed from the pool.
|
||||
error_threshold: usize,
|
||||
/// The maximum duration a connection can be idle before being removed.
|
||||
max_idle_duration: Duration,
|
||||
max_total_connections: usize,
|
||||
|
||||
channel_semaphore: Arc<Semaphore>,
|
||||
|
||||
shutdown_token: CancellationToken,
|
||||
aggregate_metrics: Option<Arc<crate::PageserverClientAggregateMetrics>>,
|
||||
}
|
||||
|
||||
struct Inner<T> {
|
||||
entries: HashMap<uuid::Uuid, ConnectionEntry<T>>,
|
||||
pq: PriorityQueue<uuid::Uuid, usize>,
|
||||
// This is updated when a connection is dropped, or we fail
|
||||
// to create a new connection.
|
||||
last_connect_failure: Option<Instant>,
|
||||
waiters: usize,
|
||||
in_progress: usize,
|
||||
}
|
||||
struct ConnectionEntry<T> {
|
||||
channel: T,
|
||||
active_consumers: usize,
|
||||
consecutive_errors: usize,
|
||||
last_used: Instant,
|
||||
}
|
||||
|
||||
/// A client borrowed from the pool.
|
||||
pub struct PooledClient<T> {
|
||||
pub channel: T,
|
||||
pool: Arc<ConnectionPool<T>>,
|
||||
is_ok: bool,
|
||||
id: uuid::Uuid,
|
||||
permit: OwnedSemaphorePermit,
|
||||
}
|
||||
|
||||
impl<T: Clone + Send + 'static> ConnectionPool<T> {
|
||||
pub fn new(
|
||||
fact: Arc<dyn PooledItemFactory<T> + Send + Sync>,
|
||||
connect_timeout: Duration,
|
||||
connect_backoff: Duration,
|
||||
max_consumers: usize,
|
||||
error_threshold: usize,
|
||||
max_idle_duration: Duration,
|
||||
max_total_connections: usize,
|
||||
aggregate_metrics: Option<Arc<crate::PageserverClientAggregateMetrics>>,
|
||||
) -> Arc<Self> {
|
||||
let shutdown_token = CancellationToken::new();
|
||||
let pool = Arc::new(Self {
|
||||
inner: Mutex::new(Inner::<T> {
|
||||
entries: HashMap::new(),
|
||||
pq: PriorityQueue::new(),
|
||||
last_connect_failure: None,
|
||||
waiters: 0,
|
||||
in_progress: 0,
|
||||
}),
|
||||
fact: Arc::clone(&fact),
|
||||
connect_timeout,
|
||||
connect_backoff,
|
||||
max_consumers,
|
||||
error_threshold,
|
||||
max_idle_duration,
|
||||
max_total_connections,
|
||||
channel_semaphore: Arc::new(Semaphore::new(0)),
|
||||
shutdown_token: shutdown_token.clone(),
|
||||
aggregate_metrics: aggregate_metrics.clone(),
|
||||
});
|
||||
|
||||
// Cancelable background task to sweep idle connections
|
||||
let sweeper_token = shutdown_token.clone();
|
||||
let sweeper_pool = Arc::clone(&pool);
|
||||
tokio::spawn(async move {
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = sweeper_token.cancelled() => break,
|
||||
_ = async {
|
||||
sweeper_pool.sweep_idle_connections().await;
|
||||
sleep(Duration::from_secs(5)).await;
|
||||
} => {}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
pool
|
||||
}
|
||||
|
||||
pub async fn shutdown(self: Arc<Self>) {
|
||||
self.shutdown_token.cancel();
|
||||
|
||||
loop {
|
||||
let all_idle = {
|
||||
let inner = self.inner.lock().await;
|
||||
inner.entries.values().all(|e| e.active_consumers == 0)
|
||||
};
|
||||
if all_idle {
|
||||
break;
|
||||
}
|
||||
sleep(Duration::from_millis(100)).await;
|
||||
}
|
||||
|
||||
// 4. Remove all entries
|
||||
let mut inner = self.inner.lock().await;
|
||||
inner.entries.clear();
|
||||
}
|
||||
|
||||
/// Sweep and remove idle connections safely, burning their permits.
|
||||
async fn sweep_idle_connections(self: &Arc<Self>) {
|
||||
let mut ids_to_remove = Vec::new();
|
||||
let now = Instant::now();
|
||||
|
||||
// Remove idle entries. First collect permits for those connections so that
|
||||
// no consumer will reserve them, then remove them from the pool.
|
||||
{
|
||||
let mut inner = self.inner.lock().await;
|
||||
inner.entries.retain(|id, entry| {
|
||||
if entry.active_consumers == 0
|
||||
&& now.duration_since(entry.last_used) > self.max_idle_duration
|
||||
{
|
||||
// metric
|
||||
match self.aggregate_metrics {
|
||||
Some(ref metrics) => {
|
||||
metrics
|
||||
.retry_counters
|
||||
.with_label_values(&["connection_swept"])
|
||||
.inc();
|
||||
}
|
||||
None => {}
|
||||
}
|
||||
ids_to_remove.push(*id);
|
||||
return false; // remove this entry
|
||||
}
|
||||
true
|
||||
});
|
||||
// Remove the entries from the priority queue
|
||||
for id in ids_to_remove {
|
||||
inner.pq.remove(&id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If we have a permit already, get a connection out of the heap
|
||||
async fn get_conn_with_permit(
|
||||
self: Arc<Self>,
|
||||
permit: OwnedSemaphorePermit,
|
||||
) -> Option<PooledClient<T>> {
|
||||
let mut inner = self.inner.lock().await;
|
||||
|
||||
// Pop the highest-active-consumers connection. There are no connections
|
||||
// in the heap that have more than max_consumers active consumers.
|
||||
if let Some((id, _cons)) = inner.pq.pop() {
|
||||
let entry = inner
|
||||
.entries
|
||||
.get_mut(&id)
|
||||
.expect("pq and entries got out of sync");
|
||||
|
||||
let mut active_consumers = entry.active_consumers;
|
||||
entry.active_consumers += 1;
|
||||
entry.last_used = Instant::now();
|
||||
|
||||
let client = PooledClient::<T> {
|
||||
channel: entry.channel.clone(),
|
||||
pool: Arc::clone(&self),
|
||||
is_ok: true,
|
||||
id,
|
||||
permit: permit,
|
||||
};
|
||||
|
||||
// re‐insert with updated priority
|
||||
active_consumers += 1;
|
||||
if active_consumers < self.max_consumers {
|
||||
inner.pq.push(id, active_consumers as usize);
|
||||
}
|
||||
return Some(client);
|
||||
} else {
|
||||
// If there is no connection to take, it is because permits for a connection
|
||||
// need to drain. This can happen if a connection is removed because it has
|
||||
// too many errors. It is taken out of the heap/hash table in this case, but
|
||||
// we can't remove it's permits until now.
|
||||
//
|
||||
// Just forget the permit and retry.
|
||||
permit.forget();
|
||||
return None;
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn get_client(self: Arc<Self>) -> Result<PooledClient<T>, tonic::Status> {
|
||||
// The pool is shutting down. Don't accept new connections.
|
||||
if self.shutdown_token.is_cancelled() {
|
||||
return Err(tonic::Status::unavailable("Pool is shutting down"));
|
||||
}
|
||||
|
||||
// A loop is necessary because when a connection is draining, we have to return
|
||||
// a permit and retry.
|
||||
loop {
|
||||
let self_clone = Arc::clone(&self);
|
||||
let mut semaphore = Arc::clone(&self_clone.channel_semaphore);
|
||||
|
||||
match semaphore.try_acquire_owned() {
|
||||
Ok(permit_) => {
|
||||
// We got a permit, so check the heap for a connection
|
||||
// we can use.
|
||||
let pool_conn = self_clone.get_conn_with_permit(permit_).await;
|
||||
match pool_conn {
|
||||
Some(pool_conn_) => {
|
||||
return Ok(pool_conn_);
|
||||
}
|
||||
None => {
|
||||
// No connection available. Forget the permit and retry.
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(_) => {
|
||||
match self_clone.aggregate_metrics {
|
||||
Some(ref metrics) => {
|
||||
metrics
|
||||
.retry_counters
|
||||
.with_label_values(&["sema_acquire_failed"])
|
||||
.inc();
|
||||
}
|
||||
None => {}
|
||||
}
|
||||
|
||||
{
|
||||
//
|
||||
// This is going to generate enough connections to handle a burst,
|
||||
// but it may generate up to twice the number of connections needed
|
||||
// in the worst case. Extra connections will go idle and be cleaned
|
||||
// up.
|
||||
//
|
||||
let mut inner = self_clone.inner.lock().await;
|
||||
inner.waiters += 1;
|
||||
if inner.waiters > (inner.in_progress * self_clone.max_consumers) {
|
||||
if (inner.entries.len() + inner.in_progress) < self_clone.max_total_connections {
|
||||
|
||||
let self_clone_spawn = Arc::clone(&self_clone);
|
||||
tokio::task::spawn(async move {
|
||||
self_clone_spawn.create_connection().await;
|
||||
});
|
||||
inner.in_progress += 1;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
// Wait for a connection to become available, either because it
|
||||
// was created or because a connection was returned to the pool
|
||||
// by another consumer.
|
||||
semaphore = Arc::clone(&self_clone.channel_semaphore);
|
||||
let conn_permit = semaphore.acquire_owned().await.unwrap();
|
||||
{
|
||||
let mut inner = self_clone.inner.lock().await;
|
||||
inner.waiters -= 1;
|
||||
}
|
||||
// We got a permit, check the heap for a connection.
|
||||
let pool_conn = self_clone.get_conn_with_permit(conn_permit).await;
|
||||
match pool_conn {
|
||||
Some(pool_conn_) => {
|
||||
return Ok(pool_conn_);
|
||||
}
|
||||
None => {
|
||||
// No connection was found, forget the permit and retry.
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn create_connection(&self) -> () {
|
||||
|
||||
// Generate a random backoff to add some jitter so that connections
|
||||
// don't all retry at the same time.
|
||||
let mut backoff_delay = Duration::from_millis(
|
||||
rand::thread_rng().gen_range(0..=self.connect_backoff.as_millis() as u64),
|
||||
);
|
||||
|
||||
loop {
|
||||
if self.shutdown_token.is_cancelled() {
|
||||
return;
|
||||
}
|
||||
|
||||
// Back off.
|
||||
// Loop because failure can occur while we are sleeping, so wait
|
||||
// until the failure stopped for at least one backoff period. Backoff
|
||||
// period includes some jitter, so that if multiple connections are
|
||||
// failing, they don't all retry at the same time.
|
||||
loop {
|
||||
if let Some(delay) = {
|
||||
let inner = self.inner.lock().await;
|
||||
inner.last_connect_failure.and_then(|at| {
|
||||
(at.elapsed() < backoff_delay).then(|| backoff_delay - at.elapsed())
|
||||
})
|
||||
} {
|
||||
sleep(delay).await;
|
||||
} else {
|
||||
break; // No delay, so we can create a connection
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Create a new connection.
|
||||
//
|
||||
// The connect timeout is also the timeout for an individual gRPC request
|
||||
// on this connection. (Requests made later on this channel will time out
|
||||
// with the same timeout.)
|
||||
//
|
||||
match self.aggregate_metrics {
|
||||
Some(ref metrics) => {
|
||||
metrics
|
||||
.retry_counters
|
||||
.with_label_values(&["connection_attempt"])
|
||||
.inc();
|
||||
}
|
||||
None => {}
|
||||
}
|
||||
|
||||
let attempt = self.fact
|
||||
.create(self.connect_timeout)
|
||||
.await;
|
||||
|
||||
match attempt {
|
||||
// Connection succeeded
|
||||
Ok(Ok(channel)) => {
|
||||
{
|
||||
match self.aggregate_metrics {
|
||||
Some(ref metrics) => {
|
||||
metrics
|
||||
.retry_counters
|
||||
.with_label_values(&["connection_success"])
|
||||
.inc();
|
||||
}
|
||||
None => {}
|
||||
}
|
||||
let mut inner = self.inner.lock().await;
|
||||
let id = uuid::Uuid::new_v4();
|
||||
inner.entries.insert(
|
||||
id,
|
||||
ConnectionEntry::<T> {
|
||||
channel: channel.clone(),
|
||||
active_consumers: 0,
|
||||
consecutive_errors: 0,
|
||||
last_used: Instant::now(),
|
||||
},
|
||||
);
|
||||
inner.pq.push(id, 0);
|
||||
inner.in_progress -= 1;
|
||||
self.channel_semaphore.add_permits(self.max_consumers);
|
||||
return;
|
||||
};
|
||||
}
|
||||
// Connection failed, back off and retry
|
||||
Ok(Err(_)) | Err(_) => {
|
||||
match self.aggregate_metrics {
|
||||
Some(ref metrics) => {
|
||||
metrics
|
||||
.retry_counters
|
||||
.with_label_values(&["connect_failed"])
|
||||
.inc();
|
||||
}
|
||||
None => {}
|
||||
}
|
||||
let mut inner = self.inner.lock().await;
|
||||
inner.last_connect_failure = Some(Instant::now());
|
||||
// Add some jitter so that every connection doesn't retry at once
|
||||
let jitter = rand::thread_rng().gen_range(0..=backoff_delay.as_millis() as u64);
|
||||
backoff_delay =
|
||||
Duration::from_millis(backoff_delay.as_millis() as u64 + jitter);
|
||||
|
||||
// Do not backoff longer than one minute
|
||||
if backoff_delay > Duration::from_secs(60) {
|
||||
backoff_delay = Duration::from_secs(60);
|
||||
}
|
||||
// continue the loop to retry
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Return client to the pool, indicating success or error.
|
||||
pub async fn return_client(&self, id: uuid::Uuid, success: bool, permit: OwnedSemaphorePermit) {
|
||||
let mut inner = self.inner.lock().await;
|
||||
if let Some(entry) = inner.entries.get_mut(&id) {
|
||||
entry.last_used = Instant::now();
|
||||
if entry.active_consumers <= 0 {
|
||||
panic!("A consumer completed when active_consumers was zero!")
|
||||
}
|
||||
entry.active_consumers = entry.active_consumers - 1;
|
||||
if success {
|
||||
if entry.consecutive_errors < self.error_threshold {
|
||||
entry.consecutive_errors = 0;
|
||||
}
|
||||
} else {
|
||||
entry.consecutive_errors += 1;
|
||||
if entry.consecutive_errors == self.error_threshold {
|
||||
match self.aggregate_metrics {
|
||||
Some(ref metrics) => {
|
||||
metrics
|
||||
.retry_counters
|
||||
.with_label_values(&["connection_dropped"])
|
||||
.inc();
|
||||
}
|
||||
None => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Too many errors on this connection. If there are no active users,
|
||||
// remove it. Otherwise just wait for active_consumers to go to zero.
|
||||
// This connection will not be selected for new consumers.
|
||||
//
|
||||
let active_consumers = entry.active_consumers;
|
||||
if entry.consecutive_errors >= self.error_threshold {
|
||||
// too many errors, remove the connection permanently. Once it drains,
|
||||
// it will be dropped.
|
||||
if inner.pq.get_priority(&id).is_some() {
|
||||
inner.pq.remove(&id);
|
||||
}
|
||||
|
||||
// remove from entries
|
||||
// check if entry is in inner
|
||||
if inner.entries.contains_key(&id) {
|
||||
inner.entries.remove(&id);
|
||||
}
|
||||
inner.last_connect_failure = Some(Instant::now());
|
||||
|
||||
// The connection has been removed, it's permits will be
|
||||
// drained because if we look for a connection and it's not there
|
||||
// we just forget the permit. However, this process can be a little
|
||||
// bit faster if we just forget permits as the connections are returned.
|
||||
permit.forget();
|
||||
} else {
|
||||
// update its priority in the queue
|
||||
if inner.pq.get_priority(&id).is_some() {
|
||||
inner.pq.change_priority(&id, active_consumers);
|
||||
} else {
|
||||
// This connection is not in the heap, but it has space
|
||||
// for more consumers. Put it back in the heap.
|
||||
if active_consumers < self.max_consumers {
|
||||
inner.pq.push(id, active_consumers);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Clone + Send + 'static> PooledClient<T> {
|
||||
pub fn channel(&self) -> T {
|
||||
return self.channel.clone();
|
||||
}
|
||||
pub async fn finish(mut self, result: Result<(), tonic::Status>) {
|
||||
self.is_ok = result.is_ok();
|
||||
self.pool.return_client(
|
||||
self.id,
|
||||
self.is_ok,
|
||||
self.permit,
|
||||
).await;
|
||||
}
|
||||
}
|
||||
456
pageserver/client_grpc/src/lib.rs
Normal file
456
pageserver/client_grpc/src/lib.rs
Normal file
@@ -0,0 +1,456 @@
|
||||
//! Pageserver Data API client
|
||||
//!
|
||||
//! - Manage connections to pageserver
|
||||
//! - Send requests to correct shards
|
||||
//!
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use std::sync::RwLock;
|
||||
use std::time::Duration;
|
||||
|
||||
use bytes::Bytes;
|
||||
use futures::{Stream, StreamExt};
|
||||
use thiserror::Error;
|
||||
use tonic::metadata::AsciiMetadataValue;
|
||||
|
||||
use pageserver_page_api::proto;
|
||||
use pageserver_page_api::*;
|
||||
|
||||
use pageserver_page_api::proto::PageServiceClient;
|
||||
use utils::shard::ShardIndex;
|
||||
|
||||
use std::fmt::Debug;
|
||||
pub mod client_cache;
|
||||
pub mod request_tracker;
|
||||
use tonic::transport::Channel;
|
||||
|
||||
use metrics::{IntCounterVec, core::Collector};
|
||||
use crate::client_cache::{PooledItemFactory};
|
||||
|
||||
use tokio::sync::mpsc;
|
||||
use async_trait::async_trait;
|
||||
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum PageserverClientError {
|
||||
#[error("could not connect to service: {0}")]
|
||||
ConnectError(#[from] tonic::transport::Error),
|
||||
#[error("could not perform request: {0}`")]
|
||||
RequestError(#[from] tonic::Status),
|
||||
#[error("protocol error: {0}")]
|
||||
ProtocolError(#[from] ProtocolError),
|
||||
|
||||
#[error("could not perform request: {0}`")]
|
||||
InvalidUri(#[from] http::uri::InvalidUri),
|
||||
|
||||
#[error("could not perform request: {0}`")]
|
||||
Other(String),
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct PageserverClientAggregateMetrics {
|
||||
pub request_counters: IntCounterVec,
|
||||
pub retry_counters: IntCounterVec,
|
||||
}
|
||||
impl PageserverClientAggregateMetrics {
|
||||
pub fn new() -> Self {
|
||||
let request_counters = IntCounterVec::new(
|
||||
metrics::core::Opts::new(
|
||||
"backend_requests_total",
|
||||
"Number of requests from backends.",
|
||||
),
|
||||
&["request_kind"],
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let retry_counters = IntCounterVec::new(
|
||||
metrics::core::Opts::new(
|
||||
"backend_requests_retries_total",
|
||||
"Number of retried requests from backends.",
|
||||
),
|
||||
&["request_kind"],
|
||||
)
|
||||
.unwrap();
|
||||
Self {
|
||||
request_counters,
|
||||
retry_counters,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
|
||||
let mut metrics = Vec::new();
|
||||
metrics.append(&mut self.request_counters.collect());
|
||||
metrics.append(&mut self.retry_counters.collect());
|
||||
metrics
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PageserverClient {
|
||||
_tenant_id: String,
|
||||
_timeline_id: String,
|
||||
|
||||
_auth_token: Option<String>,
|
||||
|
||||
shard_map: HashMap<ShardIndex, String>,
|
||||
|
||||
channels: RwLock<HashMap<ShardIndex, Arc<client_cache::ConnectionPool<Channel>>>>,
|
||||
|
||||
auth_interceptor: AuthInterceptor,
|
||||
|
||||
client_cache_options: ClientCacheOptions,
|
||||
|
||||
aggregate_metrics: Option<Arc<PageserverClientAggregateMetrics>>,
|
||||
}
|
||||
#[derive(Clone)]
|
||||
pub struct ClientCacheOptions {
|
||||
pub max_consumers: usize,
|
||||
pub error_threshold: usize,
|
||||
pub connect_timeout: Duration,
|
||||
pub connect_backoff: Duration,
|
||||
pub max_idle_duration: Duration,
|
||||
pub max_total_connections: usize,
|
||||
pub max_delay_ms: u64,
|
||||
pub drop_rate: f64,
|
||||
pub hang_rate: f64,
|
||||
}
|
||||
|
||||
impl PageserverClient {
|
||||
/// TODO: this doesn't currently react to changes in the shard map.
|
||||
pub fn new(
|
||||
tenant_id: &str,
|
||||
timeline_id: &str,
|
||||
auth_token: &Option<String>,
|
||||
shard_map: HashMap<ShardIndex, String>,
|
||||
) -> Self {
|
||||
let options = ClientCacheOptions {
|
||||
max_consumers: 5000,
|
||||
error_threshold: 5,
|
||||
connect_timeout: Duration::from_secs(5),
|
||||
connect_backoff: Duration::from_secs(1),
|
||||
max_idle_duration: Duration::from_secs(60),
|
||||
max_total_connections: 100000,
|
||||
max_delay_ms: 0,
|
||||
drop_rate: 0.0,
|
||||
hang_rate: 0.0,
|
||||
};
|
||||
Self::new_with_config(tenant_id, timeline_id, auth_token, shard_map, options, None)
|
||||
}
|
||||
pub fn new_with_config(
|
||||
tenant_id: &str,
|
||||
timeline_id: &str,
|
||||
auth_token: &Option<String>,
|
||||
shard_map: HashMap<ShardIndex, String>,
|
||||
options: ClientCacheOptions,
|
||||
metrics: Option<Arc<PageserverClientAggregateMetrics>>,
|
||||
) -> Self {
|
||||
Self {
|
||||
_tenant_id: tenant_id.to_string(),
|
||||
_timeline_id: timeline_id.to_string(),
|
||||
_auth_token: auth_token.clone(),
|
||||
shard_map,
|
||||
channels: RwLock::new(HashMap::new()),
|
||||
auth_interceptor: AuthInterceptor::new(tenant_id, timeline_id, auth_token.as_deref()),
|
||||
client_cache_options: options,
|
||||
aggregate_metrics: metrics,
|
||||
}
|
||||
}
|
||||
pub async fn process_check_rel_exists_request(
|
||||
&self,
|
||||
request: CheckRelExistsRequest,
|
||||
) -> Result<bool, PageserverClientError> {
|
||||
// Current sharding model assumes that all metadata is present only at shard 0.
|
||||
let shard = ShardIndex::unsharded();
|
||||
let pooled_client = self.get_client(shard).await;
|
||||
let chan = pooled_client.channel();
|
||||
|
||||
let mut client =
|
||||
PageServiceClient::with_interceptor(chan, self.auth_interceptor.for_shard(shard));
|
||||
|
||||
let request = proto::CheckRelExistsRequest::from(request);
|
||||
let response = client.check_rel_exists(tonic::Request::new(request)).await;
|
||||
|
||||
match response {
|
||||
Err(status) => {
|
||||
pooled_client.finish(Err(status.clone())).await; // Pass error to finish
|
||||
return Err(PageserverClientError::RequestError(status));
|
||||
}
|
||||
Ok(resp) => {
|
||||
pooled_client.finish(Ok(())).await; // Pass success to finish
|
||||
return Ok(resp.get_ref().exists);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn process_get_rel_size_request(
|
||||
&self,
|
||||
request: GetRelSizeRequest,
|
||||
) -> Result<u32, PageserverClientError> {
|
||||
// Current sharding model assumes that all metadata is present only at shard 0.
|
||||
let shard = ShardIndex::unsharded();
|
||||
let pooled_client = self.get_client(shard).await;
|
||||
let chan = pooled_client.channel();
|
||||
|
||||
let mut client =
|
||||
PageServiceClient::with_interceptor(chan, self.auth_interceptor.for_shard(shard));
|
||||
|
||||
let request = proto::GetRelSizeRequest::from(request);
|
||||
let response = client.get_rel_size(tonic::Request::new(request)).await;
|
||||
|
||||
match response {
|
||||
Err(status) => {
|
||||
pooled_client.finish(Err(status.clone())).await; // Pass error to finish
|
||||
return Err(PageserverClientError::RequestError(status));
|
||||
}
|
||||
Ok(resp) => {
|
||||
pooled_client.finish(Ok(())).await; // Pass success to finish
|
||||
return Ok(resp.get_ref().num_blocks);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Request a single batch of pages
|
||||
//
|
||||
// TODO: This opens a new gRPC stream for every request, which is extremely inefficient
|
||||
pub async fn get_page(
|
||||
&self,
|
||||
request: GetPageRequest,
|
||||
) -> Result<Vec<Bytes>, PageserverClientError> {
|
||||
// FIXME: calculate the shard number correctly
|
||||
let shard = ShardIndex::unsharded();
|
||||
let pooled_client = self.get_client(shard).await;
|
||||
let chan = pooled_client.channel();
|
||||
|
||||
let mut client =
|
||||
PageServiceClient::with_interceptor(chan, self.auth_interceptor.for_shard(shard));
|
||||
|
||||
let request = proto::GetPageRequest::from(request);
|
||||
|
||||
let request_stream = futures::stream::once(std::future::ready(request));
|
||||
|
||||
let mut response_stream = client
|
||||
.get_pages(tonic::Request::new(request_stream))
|
||||
.await?
|
||||
.into_inner();
|
||||
|
||||
let Some(response) = response_stream.next().await else {
|
||||
return Err(PageserverClientError::Other(
|
||||
"no response received for getpage request".to_string(),
|
||||
));
|
||||
};
|
||||
|
||||
match self.aggregate_metrics {
|
||||
Some(ref metrics) => {
|
||||
metrics
|
||||
.request_counters
|
||||
.with_label_values(&["get_page"])
|
||||
.inc();
|
||||
}
|
||||
None => {}
|
||||
}
|
||||
|
||||
match response {
|
||||
Err(status) => {
|
||||
pooled_client.finish(Err(status.clone())).await; // Pass error to finish
|
||||
return Err(PageserverClientError::RequestError(status));
|
||||
}
|
||||
Ok(resp) => {
|
||||
pooled_client.finish(Ok(())).await; // Pass success to finish
|
||||
let response: GetPageResponse = resp.into();
|
||||
return Ok(response.page_images.to_vec());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Open a stream for requesting pages
|
||||
//
|
||||
// TODO: This is a pretty low level interface, the caller should not need to be concerned
|
||||
// with streams. But 'get_page' is currently very naive and inefficient.
|
||||
pub async fn get_pages(
|
||||
&self,
|
||||
requests: impl Stream<Item = proto::GetPageRequest> + Send + 'static,
|
||||
) -> std::result::Result<
|
||||
tonic::Response<tonic::codec::Streaming<proto::GetPageResponse>>,
|
||||
PageserverClientError,
|
||||
> {
|
||||
// FIXME: calculate the shard number correctly
|
||||
let shard = ShardIndex::unsharded();
|
||||
let pooled_client = self.get_client(shard).await;
|
||||
let chan = pooled_client.channel();
|
||||
|
||||
let mut client =
|
||||
PageServiceClient::with_interceptor(chan, self.auth_interceptor.for_shard(shard));
|
||||
|
||||
let response = client.get_pages(tonic::Request::new(requests)).await;
|
||||
|
||||
match response {
|
||||
Err(status) => {
|
||||
pooled_client.finish(Err(status.clone())).await; // Pass error to finish
|
||||
return Err(PageserverClientError::RequestError(status));
|
||||
}
|
||||
Ok(resp) => {
|
||||
return Ok(resp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Process a request to get the size of a database.
|
||||
pub async fn process_get_dbsize_request(
|
||||
&self,
|
||||
request: GetDbSizeRequest,
|
||||
) -> Result<u64, PageserverClientError> {
|
||||
// Current sharding model assumes that all metadata is present only at shard 0.
|
||||
let shard = ShardIndex::unsharded();
|
||||
let pooled_client = self.get_client(shard).await;
|
||||
let chan = pooled_client.channel();
|
||||
|
||||
let mut client =
|
||||
PageServiceClient::with_interceptor(chan, self.auth_interceptor.for_shard(shard));
|
||||
|
||||
let request = proto::GetDbSizeRequest::from(request);
|
||||
let response = client.get_db_size(tonic::Request::new(request)).await;
|
||||
|
||||
match response {
|
||||
Err(status) => {
|
||||
pooled_client.finish(Err(status.clone())).await; // Pass error to finish
|
||||
return Err(PageserverClientError::RequestError(status));
|
||||
}
|
||||
Ok(resp) => {
|
||||
pooled_client.finish(Ok(())).await; // Pass success to finish
|
||||
return Ok(resp.get_ref().num_bytes);
|
||||
}
|
||||
}
|
||||
}
|
||||
/// Process a request to get the size of a database.
|
||||
pub async fn get_base_backup(
|
||||
&self,
|
||||
request: GetBaseBackupRequest,
|
||||
gzip: bool,
|
||||
) -> std::result::Result<
|
||||
tonic::Response<tonic::codec::Streaming<proto::GetBaseBackupResponseChunk>>,
|
||||
PageserverClientError,
|
||||
> {
|
||||
// Current sharding model assumes that all metadata is present only at shard 0.
|
||||
let shard = ShardIndex::unsharded();
|
||||
let pooled_client = self.get_client(shard).await;
|
||||
let chan = pooled_client.channel();
|
||||
|
||||
let mut client =
|
||||
PageServiceClient::with_interceptor(chan, self.auth_interceptor.for_shard(shard));
|
||||
|
||||
if gzip {
|
||||
client = client.accept_compressed(tonic::codec::CompressionEncoding::Gzip);
|
||||
}
|
||||
|
||||
let request = proto::GetBaseBackupRequest::from(request);
|
||||
let response = client.get_base_backup(tonic::Request::new(request)).await;
|
||||
|
||||
match response {
|
||||
Err(status) => {
|
||||
pooled_client.finish(Err(status.clone())).await; // Pass error to finish
|
||||
return Err(PageserverClientError::RequestError(status));
|
||||
}
|
||||
Ok(resp) => {
|
||||
pooled_client.finish(Ok(())).await; // Pass success to finish
|
||||
return Ok(resp);
|
||||
}
|
||||
}
|
||||
}
|
||||
/// Get a client for given shard
|
||||
///
|
||||
/// Get a client from the pool for this shard, also creating the pool if it doesn't exist.
|
||||
///
|
||||
async fn get_client(&self, shard: ShardIndex) -> client_cache::PooledClient<Channel> {
|
||||
let reused_pool: Option<Arc<client_cache::ConnectionPool<Channel>>> = {
|
||||
let channels = self.channels.read().unwrap();
|
||||
channels.get(&shard).cloned()
|
||||
};
|
||||
|
||||
let usable_pool: Arc<client_cache::ConnectionPool<Channel>>;
|
||||
match reused_pool {
|
||||
Some(pool) => {
|
||||
let pooled_client = pool.get_client().await.unwrap();
|
||||
return pooled_client;
|
||||
}
|
||||
None => {
|
||||
// Create a new pool using client_cache_options
|
||||
// declare new_pool
|
||||
|
||||
let new_pool: Arc<client_cache::ConnectionPool<Channel>>;
|
||||
let channel_fact = Arc::new(client_cache::ChannelFactory::new(
|
||||
self.shard_map.get(&shard).unwrap().clone(),
|
||||
self.client_cache_options.max_delay_ms,
|
||||
self.client_cache_options.drop_rate,
|
||||
self.client_cache_options.hang_rate,
|
||||
));
|
||||
new_pool = client_cache::ConnectionPool::new(
|
||||
channel_fact,
|
||||
self.client_cache_options.connect_timeout,
|
||||
self.client_cache_options.connect_backoff,
|
||||
self.client_cache_options.max_consumers,
|
||||
self.client_cache_options.error_threshold,
|
||||
self.client_cache_options.max_idle_duration,
|
||||
self.client_cache_options.max_total_connections,
|
||||
self.aggregate_metrics.clone(),
|
||||
);
|
||||
let mut write_pool = self.channels.write().unwrap();
|
||||
write_pool.insert(shard, new_pool.clone());
|
||||
usable_pool = new_pool.clone();
|
||||
}
|
||||
}
|
||||
|
||||
let pooled_client = usable_pool.get_client().await.unwrap();
|
||||
return pooled_client;
|
||||
}
|
||||
}
|
||||
|
||||
/// Inject tenant_id, timeline_id and authentication token to all pageserver requests.
|
||||
#[derive(Clone)]
|
||||
pub struct AuthInterceptor {
|
||||
tenant_id: AsciiMetadataValue,
|
||||
shard_id: Option<AsciiMetadataValue>,
|
||||
timeline_id: AsciiMetadataValue,
|
||||
|
||||
auth_header: Option<AsciiMetadataValue>, // including "Bearer " prefix
|
||||
}
|
||||
|
||||
impl AuthInterceptor {
|
||||
pub fn new(tenant_id: &str, timeline_id: &str, auth_token: Option<&str>) -> Self {
|
||||
Self {
|
||||
tenant_id: tenant_id.parse().expect("could not parse tenant id"),
|
||||
shard_id: None,
|
||||
timeline_id: timeline_id.parse().expect("could not parse timeline id"),
|
||||
auth_header: auth_token
|
||||
.map(|t| format!("Bearer {t}"))
|
||||
.map(|t| t.parse().expect("could not parse auth token")),
|
||||
}
|
||||
}
|
||||
|
||||
fn for_shard(&self, shard_id: ShardIndex) -> Self {
|
||||
let mut with_shard = self.clone();
|
||||
with_shard.shard_id = Some(
|
||||
shard_id
|
||||
.to_string()
|
||||
.parse()
|
||||
.expect("could not parse shard id"),
|
||||
);
|
||||
with_shard
|
||||
}
|
||||
}
|
||||
|
||||
impl tonic::service::Interceptor for AuthInterceptor {
|
||||
fn call(&mut self, mut req: tonic::Request<()>) -> Result<tonic::Request<()>, tonic::Status> {
|
||||
req.metadata_mut()
|
||||
.insert("neon-tenant-id", self.tenant_id.clone());
|
||||
if let Some(shard_id) = &self.shard_id {
|
||||
req.metadata_mut().insert("neon-shard-id", shard_id.clone());
|
||||
}
|
||||
req.metadata_mut()
|
||||
.insert("neon-timeline-id", self.timeline_id.clone());
|
||||
if let Some(auth_header) = &self.auth_header {
|
||||
req.metadata_mut()
|
||||
.insert("authorization", auth_header.clone());
|
||||
}
|
||||
|
||||
Ok(req)
|
||||
}
|
||||
}
|
||||
590
pageserver/client_grpc/src/request_tracker.rs
Normal file
590
pageserver/client_grpc/src/request_tracker.rs
Normal file
@@ -0,0 +1,590 @@
|
||||
|
||||
//
|
||||
// API Visible to the spawner, just a function call that is async
|
||||
//
|
||||
use std::sync::Arc;
|
||||
use crate::client_cache;
|
||||
use pageserver_page_api::GetPageRequest;
|
||||
use pageserver_page_api::GetPageResponse;
|
||||
use pageserver_page_api::*;
|
||||
use pageserver_page_api::proto;
|
||||
use crate::client_cache::ConnectionPool;
|
||||
use crate::client_cache::ChannelFactory;
|
||||
use crate::AuthInterceptor;
|
||||
use tonic::{transport::{Channel}, Request};
|
||||
use crate::ClientCacheOptions;
|
||||
use crate::PageserverClientAggregateMetrics;
|
||||
use tokio::sync::Mutex;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
|
||||
use utils::shard::ShardIndex;
|
||||
|
||||
use tokio_stream::wrappers::ReceiverStream;
|
||||
use pageserver_page_api::proto::PageServiceClient;
|
||||
|
||||
use tonic::{
|
||||
Status,
|
||||
Code,
|
||||
};
|
||||
|
||||
use async_trait::async_trait;
|
||||
use std::time::Duration;
|
||||
|
||||
use client_cache::PooledItemFactory;
|
||||
//use tracing::info;
|
||||
//
|
||||
// A mock stream pool that just returns a sending channel, and whenever a GetPageRequest
|
||||
// comes in on that channel, it randomly sleeps before sending a GetPageResponse
|
||||
//
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct StreamReturner {
|
||||
sender: tokio::sync::mpsc::Sender<proto::GetPageRequest>,
|
||||
sender_hashmap: Arc<Mutex<std::collections::HashMap<u64, tokio::sync::mpsc::Sender<Result<proto::GetPageResponse, Status>>>>>,
|
||||
}
|
||||
pub struct MockStreamFactory {
|
||||
}
|
||||
|
||||
impl MockStreamFactory {
|
||||
pub fn new() -> Self {
|
||||
MockStreamFactory {
|
||||
}
|
||||
}
|
||||
}
|
||||
#[async_trait]
|
||||
impl PooledItemFactory<StreamReturner> for MockStreamFactory {
|
||||
async fn create(&self, _connect_timeout: Duration) -> Result<Result<StreamReturner, tonic::Status>, tokio::time::error::Elapsed> {
|
||||
let (sender, mut receiver) = tokio::sync::mpsc::channel::<proto::GetPageRequest>(1000);
|
||||
// Create a StreamReturner that will send requests to the receiver channel
|
||||
let stream_returner = StreamReturner {
|
||||
sender: sender.clone(),
|
||||
sender_hashmap: Arc::new(Mutex::new(std::collections::HashMap::new())),
|
||||
};
|
||||
|
||||
let map : Arc<Mutex<std::collections::HashMap<u64, tokio::sync::mpsc::Sender<Result<proto::GetPageResponse, _>>>>>
|
||||
= Arc::clone(&stream_returner.sender_hashmap);
|
||||
tokio::spawn(async move {
|
||||
while let Some(request) = receiver.recv().await {
|
||||
|
||||
// Break out of the loop with 1% chance
|
||||
if rand::random::<f32>() < 0.001 {
|
||||
break;
|
||||
}
|
||||
// Generate a random number between 0 and 100
|
||||
// Simulate some processing time
|
||||
let mapclone = Arc::clone(&map);
|
||||
tokio::spawn(async move {
|
||||
let sleep_ms = rand::random::<u64>() % 100;
|
||||
tokio::time::sleep(tokio::time::Duration::from_millis(sleep_ms)).await;
|
||||
let response = proto::GetPageResponse {
|
||||
request_id: request.request_id,
|
||||
..Default::default()
|
||||
};
|
||||
// look up stream in hash map
|
||||
let mut hashmap = mapclone.lock().await;
|
||||
if let Some(sender) = hashmap.get(&request.request_id) {
|
||||
// Send the response to the original request sender
|
||||
if let Err(e) = sender.send(Ok(response.clone())).await {
|
||||
eprintln!("Failed to send response: {}", e);
|
||||
}
|
||||
hashmap.remove(&request.request_id);
|
||||
} else {
|
||||
eprintln!("No sender found for request ID: {}", request.request_id);
|
||||
}
|
||||
});
|
||||
}
|
||||
// Close every sender stream in the hashmap
|
||||
let hashmap = map.lock().await;
|
||||
for sender in hashmap.values() {
|
||||
let error = Status::new(Code::Unknown, "Stream closed");
|
||||
if let Err(e) = sender.send(Err(error)).await {
|
||||
eprintln!("Failed to send close response: {}", e);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
Ok(Ok(stream_returner))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub struct StreamFactory {
|
||||
connection_pool: Arc<client_cache::ConnectionPool<Channel>>,
|
||||
auth_interceptor: AuthInterceptor,
|
||||
shard: ShardIndex,
|
||||
}
|
||||
|
||||
impl StreamFactory {
|
||||
pub fn new(
|
||||
connection_pool: Arc<ConnectionPool<Channel>>,
|
||||
auth_interceptor: AuthInterceptor,
|
||||
shard: ShardIndex,
|
||||
) -> Self {
|
||||
StreamFactory {
|
||||
connection_pool,
|
||||
auth_interceptor,
|
||||
shard,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl PooledItemFactory<StreamReturner> for StreamFactory {
|
||||
async fn create(&self, _connect_timeout: Duration) ->
|
||||
Result<Result<StreamReturner, tonic::Status>, tokio::time::error::Elapsed>
|
||||
{
|
||||
let pool_clone : Arc<ConnectionPool<Channel>> = Arc::clone(&self.connection_pool);
|
||||
let pooled_client = pool_clone.get_client().await;
|
||||
let channel = pooled_client.unwrap().channel();
|
||||
let mut client =
|
||||
PageServiceClient::with_interceptor(channel, self.auth_interceptor.for_shard(self.shard));
|
||||
|
||||
let (sender, receiver) = tokio::sync::mpsc::channel::<proto::GetPageRequest>(1000);
|
||||
let outbound = ReceiverStream::new(receiver);
|
||||
|
||||
let client_resp = client
|
||||
.get_pages(Request::new(outbound))
|
||||
.await;
|
||||
|
||||
match client_resp {
|
||||
Err(status) => {
|
||||
// TODO: Convert this error correctly
|
||||
Ok(Err(tonic::Status::new(
|
||||
status.code(),
|
||||
format!("Failed to connect to pageserver: {}", status.message()),
|
||||
)))
|
||||
}
|
||||
Ok(resp) => {
|
||||
let stream_returner = StreamReturner {
|
||||
sender: sender.clone(),
|
||||
sender_hashmap: Arc::new(Mutex::new(std::collections::HashMap::new())),
|
||||
};
|
||||
let map : Arc<Mutex<std::collections::HashMap<u64, tokio::sync::mpsc::Sender<Result<proto::GetPageResponse, _>>>>>
|
||||
= Arc::clone(&stream_returner.sender_hashmap);
|
||||
|
||||
tokio::spawn(async move {
|
||||
|
||||
let map_clone = Arc::clone(&map);
|
||||
let mut inner = resp.into_inner();
|
||||
loop {
|
||||
|
||||
let resp = inner.message().await;
|
||||
if !resp.is_ok() {
|
||||
break; // Exit the loop if no more messages
|
||||
}
|
||||
let response = resp.unwrap().unwrap();
|
||||
|
||||
// look up stream in hash map
|
||||
let mut hashmap = map_clone.lock().await;
|
||||
if let Some(sender) = hashmap.get(&response.request_id) {
|
||||
// Send the response to the original request sender
|
||||
if let Err(e) = sender.send(Ok(response.clone())).await {
|
||||
eprintln!("Failed to send response: {}", e);
|
||||
}
|
||||
hashmap.remove(&response.request_id);
|
||||
} else {
|
||||
eprintln!("No sender found for request ID: {}", response.request_id);
|
||||
}
|
||||
}
|
||||
// Close every sender stream in the hashmap
|
||||
let hashmap = map_clone.lock().await;
|
||||
for sender in hashmap.values() {
|
||||
let error = Status::new(Code::Unknown, "Stream closed");
|
||||
if let Err(e) = sender.send(Err(error)).await {
|
||||
eprintln!("Failed to send close response: {}", e);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
Ok(Ok(stream_returner))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct RequestTracker {
|
||||
cur_id: Arc<AtomicU64>,
|
||||
stream_pool: Arc<ConnectionPool<StreamReturner>>,
|
||||
unary_pool: Arc<ConnectionPool<Channel>>,
|
||||
auth_interceptor: AuthInterceptor,
|
||||
shard: ShardIndex,
|
||||
}
|
||||
|
||||
impl RequestTracker {
|
||||
pub fn new(stream_pool: Arc<ConnectionPool<StreamReturner>>,
|
||||
unary_pool: Arc<ConnectionPool<Channel>>,
|
||||
auth_interceptor: AuthInterceptor,
|
||||
shard: ShardIndex,
|
||||
) -> Self {
|
||||
let cur_id = Arc::new(AtomicU64::new(0));
|
||||
|
||||
RequestTracker {
|
||||
cur_id: cur_id.clone(),
|
||||
stream_pool: stream_pool,
|
||||
unary_pool: unary_pool,
|
||||
auth_interceptor: auth_interceptor,
|
||||
shard: shard.clone()
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn send_process_check_rel_exists_request(
|
||||
&self,
|
||||
req: CheckRelExistsRequest,
|
||||
) -> Result<bool, tonic::Status> {
|
||||
loop {
|
||||
let unary_pool = Arc::clone(&self.unary_pool);
|
||||
let pooled_client = unary_pool.get_client().await.unwrap();
|
||||
let channel = pooled_client.channel();
|
||||
let mut ps_client = PageServiceClient::with_interceptor(channel, self.auth_interceptor.for_shard(self.shard));
|
||||
let request = proto::CheckRelExistsRequest::from(req.clone());
|
||||
let response = ps_client.check_rel_exists(tonic::Request::new(request)).await;
|
||||
|
||||
match response {
|
||||
Err(status) => {
|
||||
pooled_client.finish(Err(status.clone())).await; // Pass error to finish
|
||||
continue;
|
||||
}
|
||||
Ok(resp) => {
|
||||
pooled_client.finish(Ok(())).await; // Pass success to finish
|
||||
return Ok(resp.get_ref().exists);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn send_process_get_rel_size_request(
|
||||
&self,
|
||||
req: GetRelSizeRequest,
|
||||
) -> Result<u32, tonic::Status> {
|
||||
loop {
|
||||
// Current sharding model assumes that all metadata is present only at shard 0.
|
||||
let unary_pool = Arc::clone(&self.unary_pool);
|
||||
let pooled_client = unary_pool.get_client().await.unwrap();
|
||||
let channel = pooled_client.channel();
|
||||
let mut ps_client = PageServiceClient::with_interceptor(channel, self.auth_interceptor.for_shard(self.shard));
|
||||
|
||||
let request = proto::GetRelSizeRequest::from(req.clone());
|
||||
let response = ps_client.get_rel_size(tonic::Request::new(request)).await;
|
||||
|
||||
match response {
|
||||
Err(status) => {
|
||||
pooled_client.finish(Err(status.clone())).await; // Pass error to finish
|
||||
continue;
|
||||
}
|
||||
Ok(resp) => {
|
||||
pooled_client.finish(Ok(())).await; // Pass success to finish
|
||||
return Ok(resp.get_ref().num_blocks);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn send_process_get_dbsize_request(
|
||||
&self,
|
||||
req: GetDbSizeRequest,
|
||||
) -> Result<u64, tonic::Status> {
|
||||
loop {
|
||||
// Current sharding model assumes that all metadata is present only at shard 0.
|
||||
let unary_pool = Arc::clone(&self.unary_pool);
|
||||
let pooled_client = unary_pool.get_client().await.unwrap();let channel = pooled_client.channel();
|
||||
let mut ps_client = PageServiceClient::with_interceptor(channel, self.auth_interceptor.for_shard(self.shard));
|
||||
|
||||
let request = proto::GetDbSizeRequest::from(req.clone());
|
||||
let response = ps_client.get_db_size(tonic::Request::new(request)).await;
|
||||
|
||||
match response {
|
||||
Err(status) => {
|
||||
pooled_client.finish(Err(status.clone())).await; // Pass error to finish
|
||||
continue;
|
||||
}
|
||||
Ok(resp) => {
|
||||
pooled_client.finish(Ok(())).await; // Pass success to finish
|
||||
return Ok(resp.get_ref().num_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn send_getpage_request(
|
||||
&mut self,
|
||||
req: GetPageRequest,
|
||||
) -> Result<GetPageResponse, tonic::Status> {
|
||||
loop {
|
||||
let mut request = req.clone();
|
||||
// Increment cur_id
|
||||
//let request_id = self.cur_id.fetch_add(1, Ordering::SeqCst) + 1;
|
||||
let request_id = request.request_id;
|
||||
let response_sender: tokio::sync::mpsc::Sender<Result<proto::GetPageResponse, Status>>;
|
||||
let mut response_receiver: tokio::sync::mpsc::Receiver<Result<proto::GetPageResponse, Status>>;
|
||||
|
||||
(response_sender, response_receiver) = tokio::sync::mpsc::channel(1);
|
||||
//request.request_id = request_id;
|
||||
|
||||
// Get a stream from the stream pool
|
||||
let pool_clone = Arc::clone(&self.stream_pool);
|
||||
let sender_stream_pool = pool_clone.get_client().await;
|
||||
let stream_returner = match sender_stream_pool {
|
||||
Ok(stream_ret) => stream_ret,
|
||||
Err(_e) => {
|
||||
// retry
|
||||
continue;
|
||||
}
|
||||
};
|
||||
let returner = stream_returner.channel();
|
||||
let map = returner.sender_hashmap.clone();
|
||||
// Insert the response sender into the hashmap
|
||||
{
|
||||
let mut map_inner = map.lock().await;
|
||||
map_inner.insert(request_id, response_sender);
|
||||
}
|
||||
let sent = returner.sender.send(proto::GetPageRequest::from(request))
|
||||
.await;
|
||||
|
||||
if let Err(_e) = sent {
|
||||
// Remove the request from the map if sending failed
|
||||
{
|
||||
let mut map_inner = map.lock().await;
|
||||
// remove from hashmap
|
||||
map_inner.remove(&request_id);
|
||||
}
|
||||
stream_returner.finish(Err(Status::new(Code::Unknown,
|
||||
"Failed to send request"))).await;
|
||||
continue;
|
||||
}
|
||||
|
||||
let response: Option<Result<proto::GetPageResponse, Status>>;
|
||||
response = response_receiver.recv().await;
|
||||
match response {
|
||||
Some (resp) => {
|
||||
match resp {
|
||||
Err(_status) => {
|
||||
// Handle the case where the response was not received
|
||||
stream_returner.finish(Err(Status::new(Code::Unknown,
|
||||
"Failed to receive response"))).await;
|
||||
continue;
|
||||
},
|
||||
Ok(resp) => {
|
||||
stream_returner.finish(Result::Ok(())).await;
|
||||
return Ok(resp.clone().into());
|
||||
}
|
||||
}
|
||||
}
|
||||
None => {
|
||||
// Handle the case where the response channel was closed
|
||||
stream_returner.finish(Err(Status::new(Code::Unknown,
|
||||
"Response channel closed"))).await;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct ShardedRequestTrackerInner {
|
||||
// Hashmap of shard index to RequestTracker
|
||||
trackers: std::collections::HashMap<ShardIndex, RequestTracker>,
|
||||
}
|
||||
pub struct ShardedRequestTracker {
|
||||
inner: Arc<Mutex<ShardedRequestTrackerInner>>,
|
||||
tcp_client_cache_options: ClientCacheOptions,
|
||||
stream_client_cache_options: ClientCacheOptions,
|
||||
}
|
||||
|
||||
//
|
||||
// TODO: Functions in the ShardedRequestTracker should be able to timeout and
|
||||
// cancel a reqeust. The request should return an error if it is cancelled.
|
||||
//
|
||||
impl ShardedRequestTracker {
|
||||
pub fn new() -> Self {
|
||||
//
|
||||
// Default configuration for the client. These could be added to a config file
|
||||
//
|
||||
let tcp_client_cache_options = ClientCacheOptions {
|
||||
max_delay_ms: 0,
|
||||
drop_rate: 0.0,
|
||||
hang_rate: 0.0,
|
||||
connect_timeout: Duration::from_secs(1),
|
||||
connect_backoff: Duration::from_millis(100),
|
||||
max_consumers: 8, // Streams per connection
|
||||
error_threshold: 10,
|
||||
max_idle_duration: Duration::from_secs(5),
|
||||
max_total_connections: 8,
|
||||
};
|
||||
let stream_client_cache_options = ClientCacheOptions {
|
||||
max_delay_ms: 0,
|
||||
drop_rate: 0.0,
|
||||
hang_rate: 0.0,
|
||||
connect_timeout: Duration::from_secs(1),
|
||||
connect_backoff: Duration::from_millis(100),
|
||||
max_consumers: 64, // Requests per stream
|
||||
error_threshold: 10,
|
||||
max_idle_duration: Duration::from_secs(5),
|
||||
max_total_connections: 64, // Total allowable number of streams
|
||||
};
|
||||
ShardedRequestTracker {
|
||||
inner: Arc::new(Mutex::new(ShardedRequestTrackerInner {
|
||||
trackers: std::collections::HashMap::new(),
|
||||
})),
|
||||
tcp_client_cache_options,
|
||||
stream_client_cache_options,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn update_shard_map(&self,
|
||||
shard_urls: std::collections::HashMap<ShardIndex, String>,
|
||||
metrics: Option<Arc<PageserverClientAggregateMetrics>>,
|
||||
tenant_id: String, timeline_id: String, auth_str: Option<&str>) {
|
||||
|
||||
|
||||
let mut trackers = std::collections::HashMap::new();
|
||||
for (shard, endpoint_url) in shard_urls {
|
||||
//
|
||||
// Create a pool of streams for streaming get_page requests
|
||||
//
|
||||
let channel_fact : Arc<dyn PooledItemFactory<Channel> + Send + Sync> = Arc::new(ChannelFactory::new(
|
||||
endpoint_url.clone(),
|
||||
self.tcp_client_cache_options.max_delay_ms,
|
||||
self.tcp_client_cache_options.drop_rate,
|
||||
self.tcp_client_cache_options.hang_rate,
|
||||
));
|
||||
let new_pool: Arc<ConnectionPool<Channel>>;
|
||||
new_pool = ConnectionPool::new(
|
||||
Arc::clone(&channel_fact),
|
||||
self.tcp_client_cache_options.connect_timeout,
|
||||
self.tcp_client_cache_options.connect_backoff,
|
||||
self.tcp_client_cache_options.max_consumers,
|
||||
self.tcp_client_cache_options.error_threshold,
|
||||
self.tcp_client_cache_options.max_idle_duration,
|
||||
self.tcp_client_cache_options.max_total_connections,
|
||||
metrics.clone(),
|
||||
);
|
||||
|
||||
let auth_interceptor = AuthInterceptor::new(tenant_id.as_str(),
|
||||
timeline_id.as_str(),
|
||||
auth_str);
|
||||
|
||||
let stream_pool = ConnectionPool::<StreamReturner>::new(
|
||||
Arc::new(StreamFactory::new(new_pool.clone(),
|
||||
auth_interceptor.clone(), ShardIndex::unsharded())),
|
||||
self.stream_client_cache_options.connect_timeout,
|
||||
self.stream_client_cache_options.connect_backoff,
|
||||
self.stream_client_cache_options.max_consumers,
|
||||
self.stream_client_cache_options.error_threshold,
|
||||
self.stream_client_cache_options.max_idle_duration,
|
||||
self.stream_client_cache_options.max_total_connections,
|
||||
metrics.clone(),
|
||||
);
|
||||
|
||||
//
|
||||
// Create a client pool for unary requests
|
||||
//
|
||||
|
||||
let unary_pool: Arc<ConnectionPool<Channel>>;
|
||||
unary_pool = ConnectionPool::new(
|
||||
Arc::clone(&channel_fact),
|
||||
self.tcp_client_cache_options.connect_timeout,
|
||||
self.tcp_client_cache_options.connect_backoff,
|
||||
self.tcp_client_cache_options.max_consumers,
|
||||
self.tcp_client_cache_options.error_threshold,
|
||||
self.tcp_client_cache_options.max_idle_duration,
|
||||
self.tcp_client_cache_options.max_total_connections,
|
||||
metrics.clone()
|
||||
);
|
||||
//
|
||||
// Create a new RequestTracker for this shard
|
||||
//
|
||||
let new_tracker = RequestTracker::new(stream_pool, unary_pool, auth_interceptor, shard);
|
||||
trackers.insert(shard, new_tracker);
|
||||
}
|
||||
let mut inner = self.inner.lock().await;
|
||||
inner.trackers = trackers;
|
||||
}
|
||||
|
||||
pub async fn get_page(
|
||||
&self,
|
||||
req: GetPageRequest,
|
||||
) -> Result<GetPageResponse, tonic::Status> {
|
||||
|
||||
// Get shard index from the request
|
||||
let shard_index = ShardIndex::unsharded();
|
||||
let inner = self.inner.lock().await;
|
||||
let mut tracker : RequestTracker;
|
||||
if let Some(t) = inner.trackers.get(&shard_index) {
|
||||
tracker = t.clone();
|
||||
} else {
|
||||
return Err(tonic::Status::not_found(format!("Shard {} not found", shard_index)));
|
||||
}
|
||||
drop(inner);
|
||||
// Call the send_getpage_request method on the tracker
|
||||
let response = tracker.send_getpage_request(req).await;
|
||||
match response {
|
||||
Ok(resp) => Ok(resp),
|
||||
Err(e) => Err(tonic::Status::unknown(format!("Failed to get page: {}", e))),
|
||||
}
|
||||
}
|
||||
pub async fn process_get_dbsize_request(
|
||||
&self,
|
||||
request: GetDbSizeRequest,
|
||||
) -> Result<u64, tonic::Status> {
|
||||
let shard_index = ShardIndex::unsharded();
|
||||
let inner = self.inner.lock().await;
|
||||
let mut tracker: RequestTracker;
|
||||
if let Some(t) = inner.trackers.get(&shard_index) {
|
||||
tracker = t.clone();
|
||||
} else {
|
||||
return Err(tonic::Status::not_found(format!("Shard {} not found", shard_index)));
|
||||
}
|
||||
drop(inner); // Release the lock before calling send_process_get_dbsize_request
|
||||
// Call the send_process_get_dbsize_request method on the tracker
|
||||
let response = tracker.send_process_get_dbsize_request(request).await;
|
||||
match response {
|
||||
Ok(resp) => Ok(resp),
|
||||
Err(e) => Err(e),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn process_get_rel_size_request(
|
||||
&self,
|
||||
request: GetRelSizeRequest,
|
||||
) -> Result<u32, tonic::Status> {
|
||||
let shard_index = ShardIndex::unsharded();
|
||||
let inner = self.inner.lock().await;
|
||||
let mut tracker: RequestTracker;
|
||||
if let Some(t) = inner.trackers.get(&shard_index) {
|
||||
tracker = t.clone();
|
||||
} else {
|
||||
return Err(tonic::Status::not_found(format!("Shard {} not found", shard_index)));
|
||||
}
|
||||
drop(inner); // Release the lock before calling send_process_get_rel_size_request
|
||||
// Call the send_process_get_rel_size_request method on the tracker
|
||||
let response = tracker.send_process_get_rel_size_request(request).await;
|
||||
match response {
|
||||
Ok(resp) => Ok(resp),
|
||||
Err(e) => Err(e),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn process_check_rel_exists_request(
|
||||
&self,
|
||||
request: CheckRelExistsRequest,
|
||||
) -> Result<bool, tonic::Status> {
|
||||
let shard_index = ShardIndex::unsharded();
|
||||
let inner = self.inner.lock().await;
|
||||
let mut tracker: RequestTracker;
|
||||
if let Some(t) = inner.trackers.get(&shard_index) {
|
||||
tracker = t.clone();
|
||||
} else {
|
||||
return Err(tonic::Status::not_found(format!("Shard {} not found", shard_index)));
|
||||
}
|
||||
drop(inner); // Release the lock before calling send_process_check_rel_exists_request
|
||||
// Call the send_process_check_rel_exists_request method on the tracker
|
||||
let response = tracker.send_process_check_rel_exists_request(request).await;
|
||||
match response {
|
||||
Ok(resp) => Ok(resp),
|
||||
Err(e) => Err(e),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -9,7 +9,6 @@ bytes.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
postgres_ffi.workspace = true
|
||||
prost.workspace = true
|
||||
smallvec.workspace = true
|
||||
thiserror.workspace = true
|
||||
tonic.workspace = true
|
||||
utils.workspace = true
|
||||
|
||||
@@ -104,8 +104,8 @@ message CheckRelExistsResponse {
|
||||
|
||||
// Requests a base backup at a given LSN.
|
||||
message GetBaseBackupRequest {
|
||||
// The LSN to fetch a base backup at.
|
||||
ReadLsn read_lsn = 1;
|
||||
// The LSN to fetch a base backup at. 0 or absent means the latest LSN known to the Pageserver.
|
||||
uint64 lsn = 1;
|
||||
// If true, logical replication slots will not be created.
|
||||
bool replica = 2;
|
||||
}
|
||||
|
||||
@@ -9,10 +9,16 @@
|
||||
//! - Use more precise datatypes, e.g. Lsn and uints shorter than 32 bits.
|
||||
//!
|
||||
//! - Validate protocol invariants, via try_from() and try_into().
|
||||
//!
|
||||
//! Validation only happens on the receiver side, i.e. when converting from Protobuf to domain
|
||||
//! types. This is where it matters -- the Protobuf types are less strict than the domain types, and
|
||||
//! receivers should expect all sorts of junk from senders. This also allows the sender to use e.g.
|
||||
//! stream combinators without dealing with errors, and avoids validating the same message twice.
|
||||
|
||||
use std::fmt::Display;
|
||||
|
||||
use bytes::Bytes;
|
||||
use postgres_ffi::Oid;
|
||||
use smallvec::SmallVec;
|
||||
// TODO: split out Lsn, RelTag, SlruKind, Oid and other basic types to a separate crate, to avoid
|
||||
// pulling in all of their other crate dependencies when building the client.
|
||||
use utils::lsn::Lsn;
|
||||
@@ -48,7 +54,8 @@ pub struct ReadLsn {
|
||||
pub request_lsn: Lsn,
|
||||
/// If given, the caller guarantees that the page has not been modified since this LSN. Must be
|
||||
/// smaller than or equal to request_lsn. This allows the Pageserver to serve an old page
|
||||
/// without waiting for the request LSN to arrive. Valid for all request types.
|
||||
/// without waiting for the request LSN to arrive. If not given, the request will read at the
|
||||
/// request_lsn and wait for it to arrive if necessary. Valid for all request types.
|
||||
///
|
||||
/// It is undefined behaviour to make a request such that the page was, in fact, modified
|
||||
/// between request_lsn and not_modified_since_lsn. The Pageserver might detect it and return an
|
||||
@@ -58,19 +65,14 @@ pub struct ReadLsn {
|
||||
pub not_modified_since_lsn: Option<Lsn>,
|
||||
}
|
||||
|
||||
impl ReadLsn {
|
||||
/// Validates the ReadLsn.
|
||||
pub fn validate(&self) -> Result<(), ProtocolError> {
|
||||
if self.request_lsn == Lsn::INVALID {
|
||||
return Err(ProtocolError::invalid("request_lsn", self.request_lsn));
|
||||
impl Display for ReadLsn {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let req_lsn = self.request_lsn;
|
||||
if let Some(mod_lsn) = self.not_modified_since_lsn {
|
||||
write!(f, "{req_lsn}>={mod_lsn}")
|
||||
} else {
|
||||
req_lsn.fmt(f)
|
||||
}
|
||||
if self.not_modified_since_lsn > Some(self.request_lsn) {
|
||||
return Err(ProtocolError::invalid(
|
||||
"not_modified_since_lsn",
|
||||
self.not_modified_since_lsn,
|
||||
));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -78,27 +80,31 @@ impl TryFrom<proto::ReadLsn> for ReadLsn {
|
||||
type Error = ProtocolError;
|
||||
|
||||
fn try_from(pb: proto::ReadLsn) -> Result<Self, Self::Error> {
|
||||
let read_lsn = Self {
|
||||
if pb.request_lsn == 0 {
|
||||
return Err(ProtocolError::invalid("request_lsn", pb.request_lsn));
|
||||
}
|
||||
if pb.not_modified_since_lsn > pb.request_lsn {
|
||||
return Err(ProtocolError::invalid(
|
||||
"not_modified_since_lsn",
|
||||
pb.not_modified_since_lsn,
|
||||
));
|
||||
}
|
||||
Ok(Self {
|
||||
request_lsn: Lsn(pb.request_lsn),
|
||||
not_modified_since_lsn: match pb.not_modified_since_lsn {
|
||||
0 => None,
|
||||
lsn => Some(Lsn(lsn)),
|
||||
},
|
||||
};
|
||||
read_lsn.validate()?;
|
||||
Ok(read_lsn)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<ReadLsn> for proto::ReadLsn {
|
||||
type Error = ProtocolError;
|
||||
|
||||
fn try_from(read_lsn: ReadLsn) -> Result<Self, Self::Error> {
|
||||
read_lsn.validate()?;
|
||||
Ok(Self {
|
||||
impl From<ReadLsn> for proto::ReadLsn {
|
||||
fn from(read_lsn: ReadLsn) -> Self {
|
||||
Self {
|
||||
request_lsn: read_lsn.request_lsn.0,
|
||||
not_modified_since_lsn: read_lsn.not_modified_since_lsn.unwrap_or_default().0,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -153,6 +159,15 @@ impl TryFrom<proto::CheckRelExistsRequest> for CheckRelExistsRequest {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<CheckRelExistsRequest> for proto::CheckRelExistsRequest {
|
||||
fn from(request: CheckRelExistsRequest) -> Self {
|
||||
Self {
|
||||
read_lsn: Some(request.read_lsn.into()),
|
||||
rel: Some(request.rel.into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub type CheckRelExistsResponse = bool;
|
||||
|
||||
impl From<proto::CheckRelExistsResponse> for CheckRelExistsResponse {
|
||||
@@ -170,34 +185,27 @@ impl From<CheckRelExistsResponse> for proto::CheckRelExistsResponse {
|
||||
/// Requests a base backup at a given LSN.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub struct GetBaseBackupRequest {
|
||||
/// The LSN to fetch a base backup at.
|
||||
pub read_lsn: ReadLsn,
|
||||
/// The LSN to fetch a base backup at. If None, uses the latest LSN known to the Pageserver.
|
||||
pub lsn: Option<Lsn>,
|
||||
/// If true, logical replication slots will not be created.
|
||||
pub replica: bool,
|
||||
}
|
||||
|
||||
impl TryFrom<proto::GetBaseBackupRequest> for GetBaseBackupRequest {
|
||||
type Error = ProtocolError;
|
||||
|
||||
fn try_from(pb: proto::GetBaseBackupRequest) -> Result<Self, Self::Error> {
|
||||
Ok(Self {
|
||||
read_lsn: pb
|
||||
.read_lsn
|
||||
.ok_or(ProtocolError::Missing("read_lsn"))?
|
||||
.try_into()?,
|
||||
impl From<proto::GetBaseBackupRequest> for GetBaseBackupRequest {
|
||||
fn from(pb: proto::GetBaseBackupRequest) -> Self {
|
||||
Self {
|
||||
lsn: (pb.lsn != 0).then_some(Lsn(pb.lsn)),
|
||||
replica: pb.replica,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<GetBaseBackupRequest> for proto::GetBaseBackupRequest {
|
||||
type Error = ProtocolError;
|
||||
|
||||
fn try_from(request: GetBaseBackupRequest) -> Result<Self, Self::Error> {
|
||||
Ok(Self {
|
||||
read_lsn: Some(request.read_lsn.try_into()?),
|
||||
impl From<GetBaseBackupRequest> for proto::GetBaseBackupRequest {
|
||||
fn from(request: GetBaseBackupRequest) -> Self {
|
||||
Self {
|
||||
lsn: request.lsn.unwrap_or_default().0,
|
||||
replica: request.replica,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -214,14 +222,9 @@ impl TryFrom<proto::GetBaseBackupResponseChunk> for GetBaseBackupResponseChunk {
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<GetBaseBackupResponseChunk> for proto::GetBaseBackupResponseChunk {
|
||||
type Error = ProtocolError;
|
||||
|
||||
fn try_from(chunk: GetBaseBackupResponseChunk) -> Result<Self, Self::Error> {
|
||||
if chunk.is_empty() {
|
||||
return Err(ProtocolError::Missing("chunk"));
|
||||
}
|
||||
Ok(Self { chunk })
|
||||
impl From<GetBaseBackupResponseChunk> for proto::GetBaseBackupResponseChunk {
|
||||
fn from(chunk: GetBaseBackupResponseChunk) -> Self {
|
||||
Self { chunk }
|
||||
}
|
||||
}
|
||||
|
||||
@@ -246,14 +249,12 @@ impl TryFrom<proto::GetDbSizeRequest> for GetDbSizeRequest {
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<GetDbSizeRequest> for proto::GetDbSizeRequest {
|
||||
type Error = ProtocolError;
|
||||
|
||||
fn try_from(request: GetDbSizeRequest) -> Result<Self, Self::Error> {
|
||||
Ok(Self {
|
||||
read_lsn: Some(request.read_lsn.try_into()?),
|
||||
impl From<GetDbSizeRequest> for proto::GetDbSizeRequest {
|
||||
fn from(request: GetDbSizeRequest) -> Self {
|
||||
Self {
|
||||
read_lsn: Some(request.read_lsn.into()),
|
||||
db_oid: request.db_oid,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -288,7 +289,7 @@ pub struct GetPageRequest {
|
||||
/// Multiple pages will be executed as a single batch by the Pageserver, amortizing layer access
|
||||
/// costs and parallelizing them. This may increase the latency of any individual request, but
|
||||
/// improves the overall latency and throughput of the batch as a whole.
|
||||
pub block_numbers: SmallVec<[u32; 1]>,
|
||||
pub block_numbers: Vec<u32>,
|
||||
}
|
||||
|
||||
impl TryFrom<proto::GetPageRequest> for GetPageRequest {
|
||||
@@ -306,25 +307,20 @@ impl TryFrom<proto::GetPageRequest> for GetPageRequest {
|
||||
.ok_or(ProtocolError::Missing("read_lsn"))?
|
||||
.try_into()?,
|
||||
rel: pb.rel.ok_or(ProtocolError::Missing("rel"))?.try_into()?,
|
||||
block_numbers: pb.block_number.into(),
|
||||
block_numbers: pb.block_number,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<GetPageRequest> for proto::GetPageRequest {
|
||||
type Error = ProtocolError;
|
||||
|
||||
fn try_from(request: GetPageRequest) -> Result<Self, Self::Error> {
|
||||
if request.block_numbers.is_empty() {
|
||||
return Err(ProtocolError::Missing("block_number"));
|
||||
}
|
||||
Ok(Self {
|
||||
impl From<GetPageRequest> for proto::GetPageRequest {
|
||||
fn from(request: GetPageRequest) -> Self {
|
||||
Self {
|
||||
request_id: request.request_id,
|
||||
request_class: request.request_class.into(),
|
||||
read_lsn: Some(request.read_lsn.try_into()?),
|
||||
read_lsn: Some(request.read_lsn.into()),
|
||||
rel: Some(request.rel.into()),
|
||||
block_number: request.block_numbers.into_vec(),
|
||||
})
|
||||
block_number: request.block_numbers,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -396,7 +392,7 @@ pub struct GetPageResponse {
|
||||
/// A string describing the status, if any.
|
||||
pub reason: Option<String>,
|
||||
/// The 8KB page images, in the same order as the request. Empty if status != OK.
|
||||
pub page_images: SmallVec<[Bytes; 1]>,
|
||||
pub page_images: Vec<Bytes>,
|
||||
}
|
||||
|
||||
impl From<proto::GetPageResponse> for GetPageResponse {
|
||||
@@ -405,7 +401,7 @@ impl From<proto::GetPageResponse> for GetPageResponse {
|
||||
request_id: pb.request_id,
|
||||
status_code: pb.status_code.into(),
|
||||
reason: Some(pb.reason).filter(|r| !r.is_empty()),
|
||||
page_images: pb.page_image.into(),
|
||||
page_images: pb.page_image,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -416,7 +412,7 @@ impl From<GetPageResponse> for proto::GetPageResponse {
|
||||
request_id: response.request_id,
|
||||
status_code: response.status_code.into(),
|
||||
reason: response.reason.unwrap_or_default(),
|
||||
page_image: response.page_images.into_vec(),
|
||||
page_image: response.page_images,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -486,6 +482,7 @@ impl From<GetPageStatusCode> for i32 {
|
||||
|
||||
// Fetches the size of a relation at a given LSN, as # of blocks. Only valid on shard 0, other
|
||||
// shards will error.
|
||||
#[derive(Clone)]
|
||||
pub struct GetRelSizeRequest {
|
||||
pub read_lsn: ReadLsn,
|
||||
pub rel: RelTag,
|
||||
@@ -505,14 +502,12 @@ impl TryFrom<proto::GetRelSizeRequest> for GetRelSizeRequest {
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<GetRelSizeRequest> for proto::GetRelSizeRequest {
|
||||
type Error = ProtocolError;
|
||||
|
||||
fn try_from(request: GetRelSizeRequest) -> Result<Self, Self::Error> {
|
||||
Ok(Self {
|
||||
read_lsn: Some(request.read_lsn.try_into()?),
|
||||
impl From<GetRelSizeRequest> for proto::GetRelSizeRequest {
|
||||
fn from(request: GetRelSizeRequest) -> Self {
|
||||
Self {
|
||||
read_lsn: Some(request.read_lsn.into()),
|
||||
rel: Some(request.rel.into()),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -555,15 +550,13 @@ impl TryFrom<proto::GetSlruSegmentRequest> for GetSlruSegmentRequest {
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<GetSlruSegmentRequest> for proto::GetSlruSegmentRequest {
|
||||
type Error = ProtocolError;
|
||||
|
||||
fn try_from(request: GetSlruSegmentRequest) -> Result<Self, Self::Error> {
|
||||
Ok(Self {
|
||||
read_lsn: Some(request.read_lsn.try_into()?),
|
||||
impl From<GetSlruSegmentRequest> for proto::GetSlruSegmentRequest {
|
||||
fn from(request: GetSlruSegmentRequest) -> Self {
|
||||
Self {
|
||||
read_lsn: Some(request.read_lsn.into()),
|
||||
kind: request.kind as u32,
|
||||
segno: request.segno,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -580,14 +573,9 @@ impl TryFrom<proto::GetSlruSegmentResponse> for GetSlruSegmentResponse {
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<GetSlruSegmentResponse> for proto::GetSlruSegmentResponse {
|
||||
type Error = ProtocolError;
|
||||
|
||||
fn try_from(segment: GetSlruSegmentResponse) -> Result<Self, Self::Error> {
|
||||
if segment.is_empty() {
|
||||
return Err(ProtocolError::Missing("segment"));
|
||||
}
|
||||
Ok(Self { segment })
|
||||
impl From<GetSlruSegmentResponse> for proto::GetSlruSegmentResponse {
|
||||
fn from(segment: GetSlruSegmentResponse) -> Self {
|
||||
Self { segment }
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -8,6 +8,8 @@ license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
async-trait.workspace = true
|
||||
bytes.workspace = true
|
||||
camino.workspace = true
|
||||
clap.workspace = true
|
||||
futures.workspace = true
|
||||
@@ -15,14 +17,21 @@ hdrhistogram.workspace = true
|
||||
humantime.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
rand.workspace = true
|
||||
reqwest.workspace=true
|
||||
reqwest.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
tracing.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-stream.workspace = true
|
||||
tokio-util.workspace = true
|
||||
axum.workspace = true
|
||||
http.workspace = true
|
||||
metrics.workspace = true
|
||||
tonic.workspace = true
|
||||
|
||||
pageserver_client.workspace = true
|
||||
pageserver_client_grpc.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
pageserver_page_api.workspace = true
|
||||
utils = { path = "../../libs/utils/" }
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
|
||||
@@ -9,12 +9,15 @@ use anyhow::Context;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client::mgmt_api::ForceAwaitLogicalSize;
|
||||
use pageserver_client::page_service::BasebackupRequest;
|
||||
use pageserver_page_api::{GetBaseBackupRequest, ReadLsn};
|
||||
|
||||
use rand::prelude::*;
|
||||
use tokio::sync::Barrier;
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::{info, instrument};
|
||||
use utils::id::TenantTimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
use utils::shard::ShardIndex;
|
||||
|
||||
use crate::util::tokio_thread_local_stats::AllThreadLocalStats;
|
||||
use crate::util::{request_stats, tokio_thread_local_stats};
|
||||
@@ -22,6 +25,8 @@ use crate::util::{request_stats, tokio_thread_local_stats};
|
||||
/// basebackup@LatestLSN
|
||||
#[derive(clap::Parser)]
|
||||
pub(crate) struct Args {
|
||||
#[clap(long, default_value = "false")]
|
||||
grpc: bool,
|
||||
#[clap(long, default_value = "http://localhost:9898")]
|
||||
mgmt_api_endpoint: String,
|
||||
#[clap(long, default_value = "postgres://postgres@localhost:64000")]
|
||||
@@ -52,7 +57,7 @@ impl LiveStats {
|
||||
|
||||
struct Target {
|
||||
timeline: TenantTimelineId,
|
||||
lsn_range: Option<Range<Lsn>>,
|
||||
lsn_range: Range<Lsn>,
|
||||
}
|
||||
|
||||
#[derive(serde::Serialize)]
|
||||
@@ -105,7 +110,7 @@ async fn main_impl(
|
||||
anyhow::Ok(Target {
|
||||
timeline,
|
||||
// TODO: support lsn_range != latest LSN
|
||||
lsn_range: Some(info.last_record_lsn..(info.last_record_lsn + 1)),
|
||||
lsn_range: info.last_record_lsn..(info.last_record_lsn + 1),
|
||||
})
|
||||
}
|
||||
});
|
||||
@@ -149,14 +154,27 @@ async fn main_impl(
|
||||
for tl in &timelines {
|
||||
let (sender, receiver) = tokio::sync::mpsc::channel(1); // TODO: not sure what the implications of this are
|
||||
work_senders.insert(tl, sender);
|
||||
tasks.push(tokio::spawn(client(
|
||||
args,
|
||||
*tl,
|
||||
Arc::clone(&start_work_barrier),
|
||||
receiver,
|
||||
Arc::clone(&all_work_done_barrier),
|
||||
Arc::clone(&live_stats),
|
||||
)));
|
||||
|
||||
let client_task = if args.grpc {
|
||||
tokio::spawn(client_grpc(
|
||||
args,
|
||||
*tl,
|
||||
Arc::clone(&start_work_barrier),
|
||||
receiver,
|
||||
Arc::clone(&all_work_done_barrier),
|
||||
Arc::clone(&live_stats),
|
||||
))
|
||||
} else {
|
||||
tokio::spawn(client(
|
||||
args,
|
||||
*tl,
|
||||
Arc::clone(&start_work_barrier),
|
||||
receiver,
|
||||
Arc::clone(&all_work_done_barrier),
|
||||
Arc::clone(&live_stats),
|
||||
))
|
||||
};
|
||||
tasks.push(client_task);
|
||||
}
|
||||
|
||||
let work_sender = async move {
|
||||
@@ -165,7 +183,7 @@ async fn main_impl(
|
||||
let (timeline, work) = {
|
||||
let mut rng = rand::thread_rng();
|
||||
let target = all_targets.choose(&mut rng).unwrap();
|
||||
let lsn = target.lsn_range.clone().map(|r| rng.gen_range(r));
|
||||
let lsn = rng.gen_range(target.lsn_range.clone());
|
||||
(
|
||||
target.timeline,
|
||||
Work {
|
||||
@@ -215,7 +233,7 @@ async fn main_impl(
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
struct Work {
|
||||
lsn: Option<Lsn>,
|
||||
lsn: Lsn,
|
||||
gzip: bool,
|
||||
}
|
||||
|
||||
@@ -240,7 +258,7 @@ async fn client(
|
||||
.basebackup(&BasebackupRequest {
|
||||
tenant_id: timeline.tenant_id,
|
||||
timeline_id: timeline.timeline_id,
|
||||
lsn,
|
||||
lsn: Some(lsn),
|
||||
gzip,
|
||||
})
|
||||
.await
|
||||
@@ -270,3 +288,71 @@ async fn client(
|
||||
|
||||
all_work_done_barrier.wait().await;
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
async fn client_grpc(
|
||||
args: &'static Args,
|
||||
timeline: TenantTimelineId,
|
||||
start_work_barrier: Arc<Barrier>,
|
||||
mut work: tokio::sync::mpsc::Receiver<Work>,
|
||||
all_work_done_barrier: Arc<Barrier>,
|
||||
live_stats: Arc<LiveStats>,
|
||||
) {
|
||||
let shard_map = HashMap::from([(
|
||||
ShardIndex::unsharded(),
|
||||
args.page_service_connstring.clone(),
|
||||
)]);
|
||||
let client = pageserver_client_grpc::PageserverClient::new(
|
||||
&timeline.tenant_id.to_string(),
|
||||
&timeline.timeline_id.to_string(),
|
||||
&None,
|
||||
shard_map,
|
||||
);
|
||||
|
||||
start_work_barrier.wait().await;
|
||||
|
||||
while let Some(Work { lsn, gzip }) = work.recv().await {
|
||||
let start = Instant::now();
|
||||
|
||||
//tokio::time::sleep(std::time::Duration::from_secs(1)).await;
|
||||
|
||||
info!("starting get_base_backup");
|
||||
let mut basebackup_stream = client
|
||||
.get_base_backup(
|
||||
GetBaseBackupRequest {
|
||||
lsn: Some(lsn),
|
||||
replica: false,
|
||||
},
|
||||
gzip,
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("start basebackup for {timeline}"))
|
||||
.unwrap()
|
||||
.into_inner();
|
||||
|
||||
info!("starting receive");
|
||||
use futures::StreamExt;
|
||||
let mut size = 0;
|
||||
let mut nchunks = 0;
|
||||
while let Some(chunk) = basebackup_stream.next().await {
|
||||
let chunk = chunk
|
||||
.with_context(|| format!("error during basebackup"))
|
||||
.unwrap();
|
||||
size += chunk.chunk.len();
|
||||
nchunks += 1;
|
||||
}
|
||||
|
||||
info!(
|
||||
"basebackup size is {} bytes, avg chunk size {} bytes",
|
||||
size,
|
||||
size as f32 / nchunks as f32
|
||||
);
|
||||
let elapsed = start.elapsed();
|
||||
live_stats.inc();
|
||||
STATS.with(|stats| {
|
||||
stats.borrow().lock().unwrap().observe(elapsed).unwrap();
|
||||
});
|
||||
}
|
||||
|
||||
all_work_done_barrier.wait().await;
|
||||
}
|
||||
|
||||
@@ -1,17 +1,22 @@
|
||||
use std::collections::{HashSet, VecDeque};
|
||||
use std::collections::{HashSet, HashMap, VecDeque};
|
||||
use std::future::Future;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::pin::Pin;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::{Duration, Instant};
|
||||
use std::io::Error;
|
||||
|
||||
use anyhow::Context;
|
||||
use async_trait::async_trait;
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::keyspace::KeySpaceAccum;
|
||||
use pageserver_api::models::{PagestreamGetPageRequest, PagestreamRequest};
|
||||
use pageserver_api::reltag::RelTag;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_page_api::proto;
|
||||
use rand::prelude::*;
|
||||
use tokio::task::JoinSet;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -19,12 +24,36 @@ use tracing::info;
|
||||
use utils::id::TenantTimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use tonic::transport::Channel;
|
||||
|
||||
use axum::Router;
|
||||
use axum::body::Body;
|
||||
use axum::extract::State;
|
||||
use axum::response::Response;
|
||||
|
||||
use http::StatusCode;
|
||||
use http::header::CONTENT_TYPE;
|
||||
|
||||
use metrics;
|
||||
use metrics::proto::MetricFamily;
|
||||
use metrics::{Encoder, TextEncoder};
|
||||
|
||||
use crate::util::tokio_thread_local_stats::AllThreadLocalStats;
|
||||
use crate::util::{request_stats, tokio_thread_local_stats};
|
||||
|
||||
#[derive(clap::ValueEnum, Clone, Debug)]
|
||||
enum Protocol {
|
||||
Libpq,
|
||||
Grpc,
|
||||
}
|
||||
|
||||
/// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace.
|
||||
#[derive(clap::Parser)]
|
||||
pub(crate) struct Args {
|
||||
#[clap(long, default_value = "false")]
|
||||
grpc: bool,
|
||||
#[clap(long, default_value = "false")]
|
||||
grpc_stream: bool,
|
||||
#[clap(long, default_value = "http://localhost:9898")]
|
||||
mgmt_api_endpoint: String,
|
||||
#[clap(long, default_value = "postgres://postgres@localhost:64000")]
|
||||
@@ -35,6 +64,8 @@ pub(crate) struct Args {
|
||||
num_clients: NonZeroUsize,
|
||||
#[clap(long)]
|
||||
runtime: Option<humantime::Duration>,
|
||||
#[clap(long, value_enum, default_value = "libpq")]
|
||||
protocol: Protocol,
|
||||
/// Each client sends requests at the given rate.
|
||||
///
|
||||
/// If a request takes too long and we should be issuing a new request already,
|
||||
@@ -61,14 +92,48 @@ pub(crate) struct Args {
|
||||
#[clap(long)]
|
||||
set_io_mode: Option<pageserver_api::models::virtual_file::IoMode>,
|
||||
|
||||
#[clap(long)]
|
||||
only_relnode: Option<u32>,
|
||||
|
||||
/// Queue depth generated in each client.
|
||||
#[clap(long, default_value = "1")]
|
||||
queue_depth: NonZeroUsize,
|
||||
|
||||
#[clap(long)]
|
||||
only_relnode: Option<u32>,
|
||||
/// Batch size of contiguous pages generated by each client. This is equivalent to how Postgres
|
||||
/// will request page batches (e.g. prefetches or vectored reads). A batch counts as 1 RPS and
|
||||
/// 1 queue depth.
|
||||
///
|
||||
/// The libpq protocol does not support client-side batching, and will submit batches as many
|
||||
/// individual requests, in the hope that the server will batch them. Each batch still counts as
|
||||
/// 1 RPS and 1 queue depth.
|
||||
#[clap(long, default_value = "1")]
|
||||
batch_size: NonZeroUsize,
|
||||
|
||||
targets: Option<Vec<TenantTimelineId>>,
|
||||
|
||||
#[clap(long, default_value = "100")]
|
||||
pool_max_consumers: NonZeroUsize,
|
||||
|
||||
#[clap(long, default_value = "5")]
|
||||
pool_error_threshold: NonZeroUsize,
|
||||
|
||||
#[clap(long, default_value = "5000")]
|
||||
pool_connect_timeout: NonZeroUsize,
|
||||
|
||||
#[clap(long, default_value = "1000")]
|
||||
pool_connect_backoff: NonZeroUsize,
|
||||
|
||||
#[clap(long, default_value = "60000")]
|
||||
pool_max_idle_duration: NonZeroUsize,
|
||||
|
||||
#[clap(long, default_value = "0")]
|
||||
max_delay_ms: usize,
|
||||
|
||||
#[clap(long, default_value = "0")]
|
||||
percent_drops: usize,
|
||||
|
||||
#[clap(long, default_value = "0")]
|
||||
percent_hangs: usize,
|
||||
}
|
||||
|
||||
/// State shared by all clients
|
||||
@@ -125,6 +190,37 @@ pub(crate) fn main(args: Args) -> anyhow::Result<()> {
|
||||
main_impl(args, thread_local_stats)
|
||||
})
|
||||
}
|
||||
async fn get_metrics(
|
||||
State(state): State<Arc<pageserver_client_grpc::PageserverClientAggregateMetrics>>,
|
||||
) -> Response {
|
||||
let metrics = state.collect();
|
||||
|
||||
info!("metrics: {metrics:?}");
|
||||
// When we call TextEncoder::encode() below, it will immediately return an
|
||||
// error if a metric family has no metrics, so we need to preemptively
|
||||
// filter out metric families with no metrics.
|
||||
let metrics = metrics
|
||||
.into_iter()
|
||||
.filter(|m| !m.get_metric().is_empty())
|
||||
.collect::<Vec<MetricFamily>>();
|
||||
|
||||
let encoder = TextEncoder::new();
|
||||
let mut buffer = vec![];
|
||||
|
||||
if let Err(e) = encoder.encode(&metrics, &mut buffer) {
|
||||
Response::builder()
|
||||
.status(StatusCode::INTERNAL_SERVER_ERROR)
|
||||
.header(CONTENT_TYPE, "application/text")
|
||||
.body(Body::from(e.to_string()))
|
||||
.unwrap()
|
||||
} else {
|
||||
Response::builder()
|
||||
.status(StatusCode::OK)
|
||||
.header(CONTENT_TYPE, encoder.format_type())
|
||||
.body(Body::from(buffer))
|
||||
.unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
async fn main_impl(
|
||||
args: Args,
|
||||
@@ -132,6 +228,24 @@ async fn main_impl(
|
||||
) -> anyhow::Result<()> {
|
||||
let args: &'static Args = Box::leak(Box::new(args));
|
||||
|
||||
// Vector of pageserver clients
|
||||
let client_metrics = Arc::new(pageserver_client_grpc::PageserverClientAggregateMetrics::new());
|
||||
|
||||
use axum::routing::get;
|
||||
let app = Router::new()
|
||||
.route("/metrics", get(get_metrics))
|
||||
.with_state(client_metrics.clone());
|
||||
|
||||
// TODO: make configurable. Or listen on unix domain socket?
|
||||
let listener = tokio::net::TcpListener::bind("127.0.0.1:9090")
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
tokio::spawn(async {
|
||||
tracing::info!("metrics listener spawned");
|
||||
axum::serve(listener, app).await.unwrap()
|
||||
});
|
||||
|
||||
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
|
||||
reqwest::Client::new(), // TODO: support ssl_ca_file for https APIs in pagebench.
|
||||
args.mgmt_api_endpoint.clone(),
|
||||
@@ -290,6 +404,7 @@ async fn main_impl(
|
||||
let rps_period = args
|
||||
.per_client_rate
|
||||
.map(|rps_limit| Duration::from_secs_f64(1.0 / (rps_limit as f64)));
|
||||
|
||||
let make_worker: &dyn Fn(WorkerId) -> Pin<Box<dyn Send + Future<Output = ()>>> = &|worker_id| {
|
||||
let ss = shared_state.clone();
|
||||
let cancel = cancel.clone();
|
||||
@@ -303,7 +418,21 @@ async fn main_impl(
|
||||
.unwrap();
|
||||
|
||||
Box::pin(async move {
|
||||
client_libpq(args, worker_id, ss, cancel, rps_period, ranges, weights).await
|
||||
let client: Box<dyn Client> = match args.protocol {
|
||||
Protocol::Libpq => Box::new(
|
||||
LibpqClient::new(args.page_service_connstring.clone(), worker_id.timeline)
|
||||
.await
|
||||
.unwrap(),
|
||||
),
|
||||
|
||||
Protocol::Grpc => Box::new(
|
||||
GrpcClient::new(args.page_service_connstring.clone(), worker_id.timeline)
|
||||
.await
|
||||
.unwrap(),
|
||||
),
|
||||
|
||||
};
|
||||
run_worker(args, client, ss, cancel, rps_period, ranges, weights).await
|
||||
})
|
||||
};
|
||||
|
||||
@@ -355,27 +484,28 @@ async fn main_impl(
|
||||
anyhow::Ok(())
|
||||
}
|
||||
|
||||
async fn client_libpq(
|
||||
async fn run_worker(
|
||||
args: &Args,
|
||||
worker_id: WorkerId,
|
||||
mut client: Box<dyn Client>,
|
||||
shared_state: Arc<SharedState>,
|
||||
cancel: CancellationToken,
|
||||
rps_period: Option<Duration>,
|
||||
ranges: Vec<KeyRange>,
|
||||
weights: rand::distributions::weighted::WeightedIndex<i128>,
|
||||
) {
|
||||
let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
|
||||
.await
|
||||
.unwrap();
|
||||
let mut client = client
|
||||
.pagestream(worker_id.timeline.tenant_id, worker_id.timeline.timeline_id)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
shared_state.start_work_barrier.wait().await;
|
||||
let client_start = Instant::now();
|
||||
let mut ticks_processed = 0;
|
||||
let mut inflight = VecDeque::new();
|
||||
let mut req_id = 0;
|
||||
let batch_size: usize = args.batch_size.into();
|
||||
|
||||
// Track inflight requests by request ID and start time. This times the request duration, and
|
||||
// ensures responses match requests. We don't expect responses back in any particular order.
|
||||
//
|
||||
// NB: this does not check that all requests received a response, because we don't wait for the
|
||||
// inflight requests to complete when the duration elapses.
|
||||
let mut inflight: HashMap<u64, Instant> = HashMap::new();
|
||||
|
||||
while !cancel.is_cancelled() {
|
||||
// Detect if a request took longer than the RPS rate
|
||||
if let Some(period) = &rps_period {
|
||||
@@ -391,36 +521,72 @@ async fn client_libpq(
|
||||
}
|
||||
|
||||
while inflight.len() < args.queue_depth.get() {
|
||||
req_id += 1;
|
||||
let start = Instant::now();
|
||||
let req = {
|
||||
let (req_lsn, mod_lsn, rel, blks) = {
|
||||
/// Converts a compact i128 key to a relation tag and block number.
|
||||
fn key_to_block(key: i128) -> (RelTag, u32) {
|
||||
let key = Key::from_i128(key);
|
||||
assert!(key.is_rel_block_key());
|
||||
key.to_rel_block()
|
||||
.expect("we filter non-rel-block keys out above")
|
||||
}
|
||||
|
||||
// Pick a random page from a random relation.
|
||||
let mut rng = rand::thread_rng();
|
||||
let r = &ranges[weights.sample(&mut rng)];
|
||||
let key: i128 = rng.gen_range(r.start..r.end);
|
||||
let key = Key::from_i128(key);
|
||||
assert!(key.is_rel_block_key());
|
||||
let (rel_tag, block_no) = key
|
||||
.to_rel_block()
|
||||
.expect("we filter non-rel-block keys out above");
|
||||
PagestreamGetPageRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid: 0,
|
||||
request_lsn: if rng.gen_bool(args.req_latest_probability) {
|
||||
Lsn::MAX
|
||||
} else {
|
||||
r.timeline_lsn
|
||||
},
|
||||
not_modified_since: r.timeline_lsn,
|
||||
},
|
||||
rel: rel_tag,
|
||||
blkno: block_no,
|
||||
let (rel_tag, block_no) = key_to_block(key);
|
||||
|
||||
let mut blks = VecDeque::with_capacity(batch_size);
|
||||
blks.push_back(block_no);
|
||||
|
||||
// If requested, populate a batch of sequential pages. This is how Postgres will
|
||||
// request page batches (e.g. prefetches). If we hit the end of the relation, we
|
||||
// grow the batch towards the start too.
|
||||
for i in 1..batch_size {
|
||||
let (r, b) = key_to_block(key + i as i128);
|
||||
if r != rel_tag {
|
||||
break; // went outside relation
|
||||
}
|
||||
blks.push_back(b)
|
||||
}
|
||||
|
||||
if blks.len() < batch_size {
|
||||
// Grow batch backwards if needed.
|
||||
for i in 1..batch_size {
|
||||
let (r, b) = key_to_block(key - i as i128);
|
||||
if r != rel_tag {
|
||||
break; // went outside relation
|
||||
}
|
||||
blks.push_front(b)
|
||||
}
|
||||
}
|
||||
|
||||
// We assume that the entire batch can fit within the relation.
|
||||
assert_eq!(blks.len(), batch_size, "incomplete batch");
|
||||
|
||||
let req_lsn = if rng.gen_bool(args.req_latest_probability) {
|
||||
Lsn::MAX
|
||||
} else {
|
||||
r.timeline_lsn
|
||||
};
|
||||
(req_lsn, r.timeline_lsn, rel_tag, blks.into())
|
||||
};
|
||||
client.getpage_send(req).await.unwrap();
|
||||
inflight.push_back(start);
|
||||
client
|
||||
.send_get_page(req_id, req_lsn, mod_lsn, rel, blks)
|
||||
.await
|
||||
.unwrap();
|
||||
let old = inflight.insert(req_id, start);
|
||||
assert!(old.is_none(), "duplicate request ID {req_id}");
|
||||
}
|
||||
|
||||
let start = inflight.pop_front().unwrap();
|
||||
client.getpage_recv().await.unwrap();
|
||||
let (req_id, pages) = client.recv_get_page().await.unwrap();
|
||||
assert_eq!(pages.len(), batch_size, "unexpected page count");
|
||||
assert!(pages.iter().all(|p| !p.is_empty()), "empty page");
|
||||
let start = inflight
|
||||
.remove(&req_id)
|
||||
.expect("response for unknown request ID");
|
||||
let end = Instant::now();
|
||||
shared_state.live_stats.request_done();
|
||||
ticks_processed += 1;
|
||||
@@ -442,3 +608,158 @@ async fn client_libpq(
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A benchmark client, to allow switching out the transport protocol.
|
||||
///
|
||||
/// For simplicity, this just uses separate asynchronous send/recv methods. The send method could
|
||||
/// return a future that resolves when the response is received, but we don't really need it.
|
||||
#[async_trait]
|
||||
trait Client: Send {
|
||||
/// Sends an asynchronous GetPage request to the pageserver.
|
||||
async fn send_get_page(
|
||||
&mut self,
|
||||
req_id: u64,
|
||||
req_lsn: Lsn,
|
||||
mod_lsn: Lsn,
|
||||
rel: RelTag,
|
||||
blks: Vec<u32>,
|
||||
) -> anyhow::Result<()>;
|
||||
|
||||
/// Receives the next GetPage response from the pageserver.
|
||||
async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec<Bytes>)>;
|
||||
}
|
||||
|
||||
/// A libpq-based Pageserver client.
|
||||
struct LibpqClient {
|
||||
inner: pageserver_client::page_service::PagestreamClient,
|
||||
// Track sent batches, so we know how many responses to expect.
|
||||
batch_sizes: VecDeque<usize>,
|
||||
}
|
||||
|
||||
impl LibpqClient {
|
||||
async fn new(connstring: String, ttid: TenantTimelineId) -> anyhow::Result<Self> {
|
||||
let inner = pageserver_client::page_service::Client::new(connstring)
|
||||
.await?
|
||||
.pagestream(ttid.tenant_id, ttid.timeline_id)
|
||||
.await?;
|
||||
Ok(Self {
|
||||
inner,
|
||||
batch_sizes: VecDeque::new(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Client for LibpqClient {
|
||||
async fn send_get_page(
|
||||
&mut self,
|
||||
req_id: u64,
|
||||
req_lsn: Lsn,
|
||||
mod_lsn: Lsn,
|
||||
rel: RelTag,
|
||||
blks: Vec<u32>,
|
||||
) -> anyhow::Result<()> {
|
||||
// libpq doesn't support client-side batches, so we send a bunch of individual requests
|
||||
// instead in the hope that the server will batch them for us. We use the same request ID
|
||||
// for all, because we'll return a single batch response.
|
||||
self.batch_sizes.push_back(blks.len());
|
||||
for blkno in blks {
|
||||
let req = PagestreamGetPageRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid: req_id,
|
||||
request_lsn: req_lsn,
|
||||
not_modified_since: mod_lsn,
|
||||
},
|
||||
rel,
|
||||
blkno,
|
||||
};
|
||||
self.inner.getpage_send(req).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec<Bytes>)> {
|
||||
let batch_size = self.batch_sizes.pop_front().unwrap();
|
||||
let mut batch = Vec::with_capacity(batch_size);
|
||||
let mut req_id = None;
|
||||
for _ in 0..batch_size {
|
||||
let resp = self.inner.getpage_recv().await?;
|
||||
if req_id.is_none() {
|
||||
req_id = Some(resp.req.hdr.reqid);
|
||||
}
|
||||
assert_eq!(req_id, Some(resp.req.hdr.reqid), "request ID mismatch");
|
||||
batch.push(resp.page);
|
||||
}
|
||||
Ok((req_id.unwrap(), batch))
|
||||
}
|
||||
}
|
||||
|
||||
/// A gRPC client using the raw, no-frills gRPC client.
|
||||
struct GrpcClient {
|
||||
req_tx: tokio::sync::mpsc::Sender<proto::GetPageRequest>,
|
||||
resp_rx: tonic::Streaming<proto::GetPageResponse>,
|
||||
start_times: Vec<Instant>,
|
||||
}
|
||||
|
||||
impl GrpcClient {
|
||||
async fn new(connstring: String, ttid: TenantTimelineId) -> anyhow::Result<Self> {
|
||||
let mut client = pageserver_page_api::proto::PageServiceClient::connect(connstring).await?;
|
||||
|
||||
// The channel has a buffer size of 1, since 0 is not allowed. It does not matter, since the
|
||||
// benchmark will control the queue depth (i.e. in-flight requests) anyway, and requests are
|
||||
// buffered by Tonic and the OS too.
|
||||
let (req_tx, req_rx) = tokio::sync::mpsc::channel(1);
|
||||
let req_stream = tokio_stream::wrappers::ReceiverStream::new(req_rx);
|
||||
let mut req = tonic::Request::new(req_stream);
|
||||
let metadata = req.metadata_mut();
|
||||
metadata.insert("neon-tenant-id", ttid.tenant_id.to_string().try_into()?);
|
||||
metadata.insert("neon-timeline-id", ttid.timeline_id.to_string().try_into()?);
|
||||
metadata.insert("neon-shard-id", "0000".try_into()?);
|
||||
|
||||
let resp = client.get_pages(req).await?;
|
||||
let resp_stream = resp.into_inner();
|
||||
|
||||
Ok(Self {
|
||||
req_tx,
|
||||
resp_rx: resp_stream,
|
||||
start_times: Vec::new(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Client for GrpcClient {
|
||||
async fn send_get_page(
|
||||
&mut self,
|
||||
req_id: u64,
|
||||
req_lsn: Lsn,
|
||||
mod_lsn: Lsn,
|
||||
rel: RelTag,
|
||||
blks: Vec<u32>,
|
||||
) -> anyhow::Result<()> {
|
||||
let req = proto::GetPageRequest {
|
||||
request_id: req_id,
|
||||
request_class: proto::GetPageClass::Normal as i32,
|
||||
read_lsn: Some(proto::ReadLsn {
|
||||
request_lsn: req_lsn.0,
|
||||
not_modified_since_lsn: mod_lsn.0,
|
||||
}),
|
||||
rel: Some(rel.into()),
|
||||
block_number: blks,
|
||||
};
|
||||
self.start_times.push(Instant::now());
|
||||
self.req_tx.send(req).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec<Bytes>)> {
|
||||
let resp = self.resp_rx.message().await?.unwrap();
|
||||
anyhow::ensure!(
|
||||
resp.status_code == proto::GetPageStatusCode::Ok as i32,
|
||||
"unexpected status code: {}",
|
||||
resp.status_code
|
||||
);
|
||||
Ok((resp.request_id, resp.page_image))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -65,6 +65,30 @@ impl From<GetVectoredError> for BasebackupError {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<BasebackupError> for postgres_backend::QueryError {
|
||||
fn from(err: BasebackupError) -> Self {
|
||||
use postgres_backend::QueryError;
|
||||
use pq_proto::framed::ConnectionError;
|
||||
match err {
|
||||
BasebackupError::Client(err, _) => QueryError::Disconnected(ConnectionError::Io(err)),
|
||||
BasebackupError::Server(err) => QueryError::Other(err),
|
||||
BasebackupError::Shutdown => QueryError::Shutdown,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<BasebackupError> for tonic::Status {
|
||||
fn from(err: BasebackupError) -> Self {
|
||||
use tonic::Code;
|
||||
let code = match &err {
|
||||
BasebackupError::Client(_, _) => Code::Cancelled,
|
||||
BasebackupError::Server(_) => Code::Internal,
|
||||
BasebackupError::Shutdown => Code::Unavailable,
|
||||
};
|
||||
tonic::Status::new(code, err.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
/// Create basebackup with non-rel data in it.
|
||||
/// Only include relational data if 'full_backup' is true.
|
||||
///
|
||||
@@ -248,7 +272,7 @@ where
|
||||
async fn flush(&mut self) -> Result<(), BasebackupError> {
|
||||
let nblocks = self.buf.len() / BLCKSZ as usize;
|
||||
let (kind, segno) = self.current_segment.take().unwrap();
|
||||
let segname = format!("{}/{:>04X}", kind.to_str(), segno);
|
||||
let segname = format!("{kind}/{segno:>04X}");
|
||||
let header = new_tar_header(&segname, self.buf.len() as u64)?;
|
||||
self.ar
|
||||
.append(&header, self.buf.as_slice())
|
||||
@@ -347,7 +371,7 @@ where
|
||||
.await?
|
||||
.partition(
|
||||
self.timeline.get_shard_identity(),
|
||||
Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
|
||||
self.timeline.conf.max_get_vectored_keys.get() as u64 * BLCKSZ as u64,
|
||||
);
|
||||
|
||||
let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);
|
||||
|
||||
@@ -158,7 +158,6 @@ fn main() -> anyhow::Result<()> {
|
||||
// (maybe we should automate this with a visitor?).
|
||||
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
|
||||
info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode");
|
||||
info!(?conf.wal_receiver_protocol, "starting with WAL receiver protocol");
|
||||
info!(?conf.validate_wal_contiguity, "starting with WAL contiguity validation");
|
||||
info!(?conf.page_service_pipelining, "starting with page service pipelining config");
|
||||
info!(?conf.get_vectored_concurrent_io, "starting with get_vectored IO concurrency config");
|
||||
@@ -804,7 +803,7 @@ fn start_pageserver(
|
||||
} else {
|
||||
None
|
||||
},
|
||||
basebackup_cache.clone(),
|
||||
basebackup_cache,
|
||||
);
|
||||
|
||||
// Spawn a Pageserver gRPC server task. It will spawn separate tasks for
|
||||
@@ -816,12 +815,11 @@ fn start_pageserver(
|
||||
let mut page_service_grpc = None;
|
||||
if let Some(grpc_listener) = grpc_listener {
|
||||
page_service_grpc = Some(page_service::spawn_grpc(
|
||||
conf,
|
||||
tenant_manager.clone(),
|
||||
grpc_auth,
|
||||
otel_guard.as_ref().map(|g| g.dispatch.clone()),
|
||||
conf.get_vectored_concurrent_io,
|
||||
grpc_listener,
|
||||
basebackup_cache,
|
||||
)?);
|
||||
}
|
||||
|
||||
|
||||
@@ -14,7 +14,10 @@ use std::time::Duration;
|
||||
use anyhow::{Context, bail, ensure};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use once_cell::sync::OnceCell;
|
||||
use pageserver_api::config::{DiskUsageEvictionTaskConfig, MaxVectoredReadBytes, PostHogConfig};
|
||||
use pageserver_api::config::{
|
||||
DiskUsageEvictionTaskConfig, MaxGetVectoredKeys, MaxVectoredReadBytes,
|
||||
PageServicePipeliningConfig, PageServicePipeliningConfigPipelined, PostHogConfig,
|
||||
};
|
||||
use pageserver_api::models::ImageCompressionAlgorithm;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pem::Pem;
|
||||
@@ -24,7 +27,6 @@ use reqwest::Url;
|
||||
use storage_broker::Uri;
|
||||
use utils::id::{NodeId, TimelineId};
|
||||
use utils::logging::{LogFormat, SecretString};
|
||||
use utils::postgres_client::PostgresClientProtocol;
|
||||
|
||||
use crate::tenant::storage_layer::inmemory_layer::IndexEntry;
|
||||
use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
|
||||
@@ -185,6 +187,9 @@ pub struct PageServerConf {
|
||||
|
||||
pub max_vectored_read_bytes: MaxVectoredReadBytes,
|
||||
|
||||
/// Maximum number of keys to be read in a single get_vectored call.
|
||||
pub max_get_vectored_keys: MaxGetVectoredKeys,
|
||||
|
||||
pub image_compression: ImageCompressionAlgorithm,
|
||||
|
||||
/// Whether to offload archived timelines automatically
|
||||
@@ -205,8 +210,6 @@ pub struct PageServerConf {
|
||||
/// Optionally disable disk syncs (unsafe!)
|
||||
pub no_sync: bool,
|
||||
|
||||
pub wal_receiver_protocol: PostgresClientProtocol,
|
||||
|
||||
pub page_service_pipelining: pageserver_api::config::PageServicePipeliningConfig,
|
||||
|
||||
pub get_vectored_concurrent_io: pageserver_api::config::GetVectoredConcurrentIo,
|
||||
@@ -404,6 +407,7 @@ impl PageServerConf {
|
||||
secondary_download_concurrency,
|
||||
ingest_batch_size,
|
||||
max_vectored_read_bytes,
|
||||
max_get_vectored_keys,
|
||||
image_compression,
|
||||
timeline_offloading,
|
||||
ephemeral_bytes_per_memory_kb,
|
||||
@@ -414,7 +418,6 @@ impl PageServerConf {
|
||||
virtual_file_io_engine,
|
||||
tenant_config,
|
||||
no_sync,
|
||||
wal_receiver_protocol,
|
||||
page_service_pipelining,
|
||||
get_vectored_concurrent_io,
|
||||
enable_read_path_debugging,
|
||||
@@ -470,13 +473,13 @@ impl PageServerConf {
|
||||
secondary_download_concurrency,
|
||||
ingest_batch_size,
|
||||
max_vectored_read_bytes,
|
||||
max_get_vectored_keys,
|
||||
image_compression,
|
||||
timeline_offloading,
|
||||
ephemeral_bytes_per_memory_kb,
|
||||
import_pgdata_upcall_api,
|
||||
import_pgdata_upcall_api_token: import_pgdata_upcall_api_token.map(SecretString::from),
|
||||
import_pgdata_aws_endpoint_url,
|
||||
wal_receiver_protocol,
|
||||
page_service_pipelining,
|
||||
get_vectored_concurrent_io,
|
||||
tracing,
|
||||
@@ -598,6 +601,19 @@ impl PageServerConf {
|
||||
)
|
||||
})?;
|
||||
|
||||
if let PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined {
|
||||
max_batch_size,
|
||||
..
|
||||
}) = conf.page_service_pipelining
|
||||
{
|
||||
if max_batch_size.get() > conf.max_get_vectored_keys.get() {
|
||||
return Err(anyhow::anyhow!(
|
||||
"`max_batch_size` ({max_batch_size}) must be less than or equal to `max_get_vectored_keys` ({})",
|
||||
conf.max_get_vectored_keys.get()
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
Ok(conf)
|
||||
}
|
||||
|
||||
@@ -685,6 +701,7 @@ impl ConfigurableSemaphore {
|
||||
mod tests {
|
||||
|
||||
use camino::Utf8PathBuf;
|
||||
use rstest::rstest;
|
||||
use utils::id::NodeId;
|
||||
|
||||
use super::PageServerConf;
|
||||
@@ -724,4 +741,28 @@ mod tests {
|
||||
PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir)
|
||||
.expect_err("parse_and_validate should fail for endpoint without scheme");
|
||||
}
|
||||
|
||||
#[rstest]
|
||||
#[case(32, 32, true)]
|
||||
#[case(64, 32, false)]
|
||||
#[case(64, 64, true)]
|
||||
#[case(128, 128, true)]
|
||||
fn test_config_max_batch_size_is_valid(
|
||||
#[case] max_batch_size: usize,
|
||||
#[case] max_get_vectored_keys: usize,
|
||||
#[case] is_valid: bool,
|
||||
) {
|
||||
let input = format!(
|
||||
r#"
|
||||
control_plane_api = "http://localhost:6666"
|
||||
max_get_vectored_keys = {max_get_vectored_keys}
|
||||
page_service_pipelining = {{ mode="pipelined", execution="concurrent-futures", max_batch_size={max_batch_size}, batching="uniform-lsn" }}
|
||||
"#,
|
||||
);
|
||||
let config_toml = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(&input)
|
||||
.expect("config has valid fields");
|
||||
let workdir = Utf8PathBuf::from("/nonexistent");
|
||||
let result = PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir);
|
||||
assert_eq!(result.is_ok(), is_valid);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -195,6 +195,8 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
|
||||
node_id: conf.id,
|
||||
listen_pg_addr: m.postgres_host,
|
||||
listen_pg_port: m.postgres_port,
|
||||
listen_grpc_addr: m.grpc_host,
|
||||
listen_grpc_port: m.grpc_port,
|
||||
listen_http_addr: m.http_host,
|
||||
listen_http_port: m.http_port,
|
||||
listen_https_port: m.https_port,
|
||||
|
||||
@@ -1,21 +1,28 @@
|
||||
use std::{collections::HashMap, sync::Arc, time::Duration};
|
||||
|
||||
use posthog_client_lite::{
|
||||
FeatureResolverBackgroundLoop, PostHogClientConfig, PostHogEvaluationError,
|
||||
CaptureEvent, FeatureResolverBackgroundLoop, PostHogClientConfig, PostHogEvaluationError,
|
||||
PostHogFlagFilterPropertyValue,
|
||||
};
|
||||
use remote_storage::RemoteStorageKind;
|
||||
use serde_json::json;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::id::TenantId;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::{config::PageServerConf, metrics::FEATURE_FLAG_EVALUATION};
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct FeatureResolver {
|
||||
inner: Option<Arc<FeatureResolverBackgroundLoop>>,
|
||||
internal_properties: Option<Arc<HashMap<String, PostHogFlagFilterPropertyValue>>>,
|
||||
}
|
||||
|
||||
impl FeatureResolver {
|
||||
pub fn new_disabled() -> Self {
|
||||
Self { inner: None }
|
||||
Self {
|
||||
inner: None,
|
||||
internal_properties: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn spawn(
|
||||
@@ -36,14 +43,114 @@ impl FeatureResolver {
|
||||
shutdown_pageserver,
|
||||
);
|
||||
let inner = Arc::new(inner);
|
||||
// TODO: make this configurable
|
||||
inner.clone().spawn(handle, Duration::from_secs(60));
|
||||
Ok(FeatureResolver { inner: Some(inner) })
|
||||
|
||||
// The properties shared by all tenants on this pageserver.
|
||||
let internal_properties = {
|
||||
let mut properties = HashMap::new();
|
||||
properties.insert(
|
||||
"pageserver_id".to_string(),
|
||||
PostHogFlagFilterPropertyValue::String(conf.id.to_string()),
|
||||
);
|
||||
if let Some(availability_zone) = &conf.availability_zone {
|
||||
properties.insert(
|
||||
"availability_zone".to_string(),
|
||||
PostHogFlagFilterPropertyValue::String(availability_zone.clone()),
|
||||
);
|
||||
}
|
||||
// Infer region based on the remote storage config.
|
||||
if let Some(remote_storage) = &conf.remote_storage_config {
|
||||
match &remote_storage.storage {
|
||||
RemoteStorageKind::AwsS3(config) => {
|
||||
properties.insert(
|
||||
"region".to_string(),
|
||||
PostHogFlagFilterPropertyValue::String(format!(
|
||||
"aws-{}",
|
||||
config.bucket_region
|
||||
)),
|
||||
);
|
||||
}
|
||||
RemoteStorageKind::AzureContainer(config) => {
|
||||
properties.insert(
|
||||
"region".to_string(),
|
||||
PostHogFlagFilterPropertyValue::String(format!(
|
||||
"azure-{}",
|
||||
config.container_region
|
||||
)),
|
||||
);
|
||||
}
|
||||
RemoteStorageKind::LocalFs { .. } => {
|
||||
properties.insert(
|
||||
"region".to_string(),
|
||||
PostHogFlagFilterPropertyValue::String("local".to_string()),
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
// TODO: add pageserver URL.
|
||||
Arc::new(properties)
|
||||
};
|
||||
let fake_tenants = {
|
||||
let mut tenants = Vec::new();
|
||||
for i in 0..10 {
|
||||
let distinct_id = format!(
|
||||
"fake_tenant_{}_{}_{}",
|
||||
conf.availability_zone.as_deref().unwrap_or_default(),
|
||||
conf.id,
|
||||
i
|
||||
);
|
||||
let properties = Self::collect_properties_inner(
|
||||
distinct_id.clone(),
|
||||
Some(&internal_properties),
|
||||
);
|
||||
tenants.push(CaptureEvent {
|
||||
event: "initial_tenant_report".to_string(),
|
||||
distinct_id,
|
||||
properties: json!({ "$set": properties }), // use `$set` to set the person properties instead of the event properties
|
||||
});
|
||||
}
|
||||
tenants
|
||||
};
|
||||
// TODO: make refresh period configurable
|
||||
inner
|
||||
.clone()
|
||||
.spawn(handle, Duration::from_secs(60), fake_tenants);
|
||||
Ok(FeatureResolver {
|
||||
inner: Some(inner),
|
||||
internal_properties: Some(internal_properties),
|
||||
})
|
||||
} else {
|
||||
Ok(FeatureResolver { inner: None })
|
||||
Ok(FeatureResolver {
|
||||
inner: None,
|
||||
internal_properties: None,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn collect_properties_inner(
|
||||
tenant_id: String,
|
||||
internal_properties: Option<&HashMap<String, PostHogFlagFilterPropertyValue>>,
|
||||
) -> HashMap<String, PostHogFlagFilterPropertyValue> {
|
||||
let mut properties = HashMap::new();
|
||||
if let Some(internal_properties) = internal_properties {
|
||||
for (key, value) in internal_properties.iter() {
|
||||
properties.insert(key.clone(), value.clone());
|
||||
}
|
||||
}
|
||||
properties.insert(
|
||||
"tenant_id".to_string(),
|
||||
PostHogFlagFilterPropertyValue::String(tenant_id),
|
||||
);
|
||||
properties
|
||||
}
|
||||
|
||||
/// Collect all properties availble for the feature flag evaluation.
|
||||
pub(crate) fn collect_properties(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
) -> HashMap<String, PostHogFlagFilterPropertyValue> {
|
||||
Self::collect_properties_inner(tenant_id.to_string(), self.internal_properties.as_deref())
|
||||
}
|
||||
|
||||
/// Evaluate a multivariate feature flag. Currently, we do not support any properties.
|
||||
///
|
||||
/// Error handling: the caller should inspect the error and decide the behavior when a feature flag
|
||||
@@ -55,11 +162,24 @@ impl FeatureResolver {
|
||||
tenant_id: TenantId,
|
||||
) -> Result<String, PostHogEvaluationError> {
|
||||
if let Some(inner) = &self.inner {
|
||||
inner.feature_store().evaluate_multivariate(
|
||||
let res = inner.feature_store().evaluate_multivariate(
|
||||
flag_key,
|
||||
&tenant_id.to_string(),
|
||||
&HashMap::new(),
|
||||
)
|
||||
&self.collect_properties(tenant_id),
|
||||
);
|
||||
match &res {
|
||||
Ok(value) => {
|
||||
FEATURE_FLAG_EVALUATION
|
||||
.with_label_values(&[flag_key, "ok", value])
|
||||
.inc();
|
||||
}
|
||||
Err(e) => {
|
||||
FEATURE_FLAG_EVALUATION
|
||||
.with_label_values(&[flag_key, "error", e.as_variant_str()])
|
||||
.inc();
|
||||
}
|
||||
}
|
||||
res
|
||||
} else {
|
||||
Err(PostHogEvaluationError::NotAvailable(
|
||||
"PostHog integration is not enabled".to_string(),
|
||||
@@ -80,11 +200,34 @@ impl FeatureResolver {
|
||||
tenant_id: TenantId,
|
||||
) -> Result<(), PostHogEvaluationError> {
|
||||
if let Some(inner) = &self.inner {
|
||||
inner.feature_store().evaluate_boolean(
|
||||
let res = inner.feature_store().evaluate_boolean(
|
||||
flag_key,
|
||||
&tenant_id.to_string(),
|
||||
&HashMap::new(),
|
||||
)
|
||||
&self.collect_properties(tenant_id),
|
||||
);
|
||||
match &res {
|
||||
Ok(()) => {
|
||||
FEATURE_FLAG_EVALUATION
|
||||
.with_label_values(&[flag_key, "ok", "true"])
|
||||
.inc();
|
||||
}
|
||||
Err(e) => {
|
||||
FEATURE_FLAG_EVALUATION
|
||||
.with_label_values(&[flag_key, "error", e.as_variant_str()])
|
||||
.inc();
|
||||
}
|
||||
}
|
||||
res
|
||||
} else {
|
||||
Err(PostHogEvaluationError::NotAvailable(
|
||||
"PostHog integration is not enabled".to_string(),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_feature_flag_boolean(&self, flag_key: &str) -> Result<bool, PostHogEvaluationError> {
|
||||
if let Some(inner) = &self.inner {
|
||||
inner.feature_store().is_feature_flag_boolean(flag_key)
|
||||
} else {
|
||||
Err(PostHogEvaluationError::NotAvailable(
|
||||
"PostHog integration is not enabled".to_string(),
|
||||
|
||||
@@ -43,6 +43,7 @@ use pageserver_api::models::{
|
||||
use pageserver_api::shard::{ShardCount, TenantShardId};
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError};
|
||||
use scopeguard::defer;
|
||||
use serde_json::json;
|
||||
use tenant_size_model::svg::SvgBranchKind;
|
||||
use tenant_size_model::{SizeResult, StorageModel};
|
||||
use tokio::time::Instant;
|
||||
@@ -3663,6 +3664,47 @@ async fn read_tar_eof(mut reader: (impl tokio::io::AsyncRead + Unpin)) -> anyhow
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn tenant_evaluate_feature_flag(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
let flag: String = must_parse_query_param(&request, "flag")?;
|
||||
let as_type: String = must_parse_query_param(&request, "as")?;
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
async {
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
let properties = tenant.feature_resolver.collect_properties(tenant_shard_id.tenant_id);
|
||||
if as_type == "boolean" {
|
||||
let result = tenant.feature_resolver.evaluate_boolean(&flag, tenant_shard_id.tenant_id);
|
||||
let result = result.map(|_| true).map_err(|e| e.to_string());
|
||||
json_response(StatusCode::OK, json!({ "result": result, "properties": properties }))
|
||||
} else if as_type == "multivariate" {
|
||||
let result = tenant.feature_resolver.evaluate_multivariate(&flag, tenant_shard_id.tenant_id).map_err(|e| e.to_string());
|
||||
json_response(StatusCode::OK, json!({ "result": result, "properties": properties }))
|
||||
} else {
|
||||
// Auto infer the type of the feature flag.
|
||||
let is_boolean = tenant.feature_resolver.is_feature_flag_boolean(&flag).map_err(|e| ApiError::InternalServerError(anyhow::anyhow!("{e}")))?;
|
||||
if is_boolean {
|
||||
let result = tenant.feature_resolver.evaluate_boolean(&flag, tenant_shard_id.tenant_id);
|
||||
let result = result.map(|_| true).map_err(|e| e.to_string());
|
||||
json_response(StatusCode::OK, json!({ "result": result, "properties": properties }))
|
||||
} else {
|
||||
let result = tenant.feature_resolver.evaluate_multivariate(&flag, tenant_shard_id.tenant_id).map_err(|e| e.to_string());
|
||||
json_response(StatusCode::OK, json!({ "result": result, "properties": properties }))
|
||||
}
|
||||
}
|
||||
}
|
||||
.instrument(info_span!("tenant_evaluate_feature_flag", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
|
||||
.await
|
||||
}
|
||||
|
||||
/// Common functionality of all the HTTP API handlers.
|
||||
///
|
||||
/// - Adds a tracing span to each request (by `request_span`)
|
||||
@@ -4039,5 +4081,8 @@ pub fn make_router(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/activate_post_import",
|
||||
|r| api_handler(r, activate_post_import_handler),
|
||||
)
|
||||
.get("/v1/tenant/:tenant_shard_id/feature_flag", |r| {
|
||||
api_handler(r, tenant_evaluate_feature_flag)
|
||||
})
|
||||
.any(handler_404))
|
||||
}
|
||||
|
||||
@@ -15,6 +15,7 @@ use metrics::{
|
||||
register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec,
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::config::defaults::DEFAULT_MAX_GET_VECTORED_KEYS;
|
||||
use pageserver_api::config::{
|
||||
PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
|
||||
PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy,
|
||||
@@ -32,7 +33,6 @@ use crate::config::PageServerConf;
|
||||
use crate::context::{PageContentKind, RequestContext};
|
||||
use crate::pgdatadir_mapping::DatadirModificationStats;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::tenant::layer_map::LayerMap;
|
||||
use crate::tenant::mgr::TenantSlot;
|
||||
use crate::tenant::storage_layer::{InMemoryLayer, PersistentLayerDesc};
|
||||
@@ -446,6 +446,15 @@ static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static FEATURE_FLAG_EVALUATION: Lazy<CounterVec> = Lazy::new(|| {
|
||||
register_counter_vec!(
|
||||
"pageserver_feature_flag_evaluation",
|
||||
"Number of times a feature flag is evaluated",
|
||||
&["flag_key", "status", "value"],
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
#[derive(IntoStaticStr)]
|
||||
#[strum(serialize_all = "kebab_case")]
|
||||
pub(crate) enum PageCacheErrorKind {
|
||||
@@ -1312,11 +1321,44 @@ impl EvictionsWithLowResidenceDuration {
|
||||
//
|
||||
// Roughly logarithmic scale.
|
||||
const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
|
||||
0.000030, // 30 usec
|
||||
0.001000, // 1000 usec
|
||||
0.030, // 30 ms
|
||||
1.000, // 1000 ms
|
||||
30.000, // 30000 ms
|
||||
0.00005, // 50us
|
||||
0.00006, // 60us
|
||||
0.00007, // 70us
|
||||
0.00008, // 80us
|
||||
0.00009, // 90us
|
||||
0.0001, // 100us
|
||||
0.000110, // 110us
|
||||
0.000120, // 120us
|
||||
0.000130, // 130us
|
||||
0.000140, // 140us
|
||||
0.000150, // 150us
|
||||
0.000160, // 160us
|
||||
0.000170, // 170us
|
||||
0.000180, // 180us
|
||||
0.000190, // 190us
|
||||
0.000200, // 200us
|
||||
0.000210, // 210us
|
||||
0.000220, // 220us
|
||||
0.000230, // 230us
|
||||
0.000240, // 240us
|
||||
0.000250, // 250us
|
||||
0.000300, // 300us
|
||||
0.000350, // 350us
|
||||
0.000400, // 400us
|
||||
0.000450, // 450us
|
||||
0.000500, // 500us
|
||||
0.000600, // 600us
|
||||
0.000700, // 700us
|
||||
0.000800, // 800us
|
||||
0.000900, // 900us
|
||||
0.001000, // 1ms
|
||||
0.002000, // 2ms
|
||||
0.003000, // 3ms
|
||||
0.004000, // 4ms
|
||||
0.005000, // 5ms
|
||||
0.01000, // 10ms
|
||||
0.02000, // 20ms
|
||||
0.05000, // 50ms
|
||||
];
|
||||
|
||||
/// VirtualFile fs operation variants.
|
||||
@@ -1906,7 +1948,7 @@ static SMGR_QUERY_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
});
|
||||
|
||||
static PAGE_SERVICE_BATCH_SIZE_BUCKETS_GLOBAL: Lazy<Vec<f64>> = Lazy::new(|| {
|
||||
(1..=u32::try_from(Timeline::MAX_GET_VECTORED_KEYS).unwrap())
|
||||
(1..=u32::try_from(DEFAULT_MAX_GET_VECTORED_KEYS).unwrap())
|
||||
.map(|v| v.into())
|
||||
.collect()
|
||||
});
|
||||
@@ -1924,7 +1966,7 @@ static PAGE_SERVICE_BATCH_SIZE_BUCKETS_PER_TIMELINE: Lazy<Vec<f64>> = Lazy::new(
|
||||
let mut buckets = Vec::new();
|
||||
for i in 0.. {
|
||||
let bucket = 1 << i;
|
||||
if bucket > u32::try_from(Timeline::MAX_GET_VECTORED_KEYS).unwrap() {
|
||||
if bucket > u32::try_from(DEFAULT_MAX_GET_VECTORED_KEYS).unwrap() {
|
||||
break;
|
||||
}
|
||||
buckets.push(bucket.into());
|
||||
@@ -2813,7 +2855,6 @@ pub(crate) struct WalIngestMetrics {
|
||||
pub(crate) records_received: IntCounter,
|
||||
pub(crate) records_observed: IntCounter,
|
||||
pub(crate) records_committed: IntCounter,
|
||||
pub(crate) records_filtered: IntCounter,
|
||||
pub(crate) values_committed_metadata_images: IntCounter,
|
||||
pub(crate) values_committed_metadata_deltas: IntCounter,
|
||||
pub(crate) values_committed_data_images: IntCounter,
|
||||
@@ -2869,11 +2910,6 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| {
|
||||
"Number of WAL records which resulted in writes to pageserver storage"
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
records_filtered: register_int_counter!(
|
||||
"pageserver_wal_ingest_records_filtered",
|
||||
"Number of WAL records filtered out due to sharding"
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
values_committed_metadata_images: values_committed.with_label_values(&["metadata", "image"]),
|
||||
values_committed_metadata_deltas: values_committed.with_label_values(&["metadata", "delta"]),
|
||||
values_committed_data_images: values_committed.with_label_values(&["data", "image"]),
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -431,10 +431,10 @@ impl Timeline {
|
||||
GetVectoredError::InvalidLsn(e) => {
|
||||
Err(anyhow::anyhow!("invalid LSN: {e:?}").into())
|
||||
}
|
||||
// NB: this should never happen in practice because we limit MAX_GET_VECTORED_KEYS
|
||||
// NB: this should never happen in practice because we limit batch size to be smaller than max_get_vectored_keys
|
||||
// TODO: we can prevent this error class by moving this check into the type system
|
||||
GetVectoredError::Oversized(err) => {
|
||||
Err(anyhow::anyhow!("batching oversized: {err:?}").into())
|
||||
GetVectoredError::Oversized(err, max) => {
|
||||
Err(anyhow::anyhow!("batching oversized: {err} > {max}").into())
|
||||
}
|
||||
};
|
||||
|
||||
@@ -471,8 +471,19 @@ impl Timeline {
|
||||
|
||||
let rels = self.list_rels(spcnode, dbnode, version, ctx).await?;
|
||||
|
||||
if rels.is_empty() {
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
// Pre-deserialize the rel directory to avoid duplicated work in `get_relsize_cached`.
|
||||
let reldir_key = rel_dir_to_key(spcnode, dbnode);
|
||||
let buf = version.get(self, reldir_key, ctx).await?;
|
||||
let reldir = RelDirectory::des(&buf)?;
|
||||
|
||||
for rel in rels {
|
||||
let n_blocks = self.get_rel_size(rel, version, ctx).await?;
|
||||
let n_blocks = self
|
||||
.get_rel_size_in_reldir(rel, version, Some((reldir_key, &reldir)), ctx)
|
||||
.await?;
|
||||
total_blocks += n_blocks as usize;
|
||||
}
|
||||
Ok(total_blocks)
|
||||
@@ -487,6 +498,19 @@ impl Timeline {
|
||||
tag: RelTag,
|
||||
version: Version<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BlockNumber, PageReconstructError> {
|
||||
self.get_rel_size_in_reldir(tag, version, None, ctx).await
|
||||
}
|
||||
|
||||
/// Get size of a relation file. The relation must exist, otherwise an error is returned.
|
||||
///
|
||||
/// See [`Self::get_rel_exists_in_reldir`] on why we need `deserialized_reldir_v1`.
|
||||
pub(crate) async fn get_rel_size_in_reldir(
|
||||
&self,
|
||||
tag: RelTag,
|
||||
version: Version<'_>,
|
||||
deserialized_reldir_v1: Option<(Key, &RelDirectory)>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BlockNumber, PageReconstructError> {
|
||||
if tag.relnode == 0 {
|
||||
return Err(PageReconstructError::Other(
|
||||
@@ -499,7 +523,9 @@ impl Timeline {
|
||||
}
|
||||
|
||||
if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM)
|
||||
&& !self.get_rel_exists(tag, version, ctx).await?
|
||||
&& !self
|
||||
.get_rel_exists_in_reldir(tag, version, deserialized_reldir_v1, ctx)
|
||||
.await?
|
||||
{
|
||||
// FIXME: Postgres sometimes calls smgrcreate() to create
|
||||
// FSM, and smgrnblocks() on it immediately afterwards,
|
||||
@@ -521,11 +547,28 @@ impl Timeline {
|
||||
///
|
||||
/// Only shard 0 has a full view of the relations. Other shards only know about relations that
|
||||
/// the shard stores pages for.
|
||||
///
|
||||
pub(crate) async fn get_rel_exists(
|
||||
&self,
|
||||
tag: RelTag,
|
||||
version: Version<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<bool, PageReconstructError> {
|
||||
self.get_rel_exists_in_reldir(tag, version, None, ctx).await
|
||||
}
|
||||
|
||||
/// Does the relation exist? With a cached deserialized `RelDirectory`.
|
||||
///
|
||||
/// There are some cases where the caller loops across all relations. In that specific case,
|
||||
/// the caller should obtain the deserialized `RelDirectory` first and then call this function
|
||||
/// to avoid duplicated work of deserliazation. This is a hack and should be removed by introducing
|
||||
/// a new API (e.g., `get_rel_exists_batched`).
|
||||
pub(crate) async fn get_rel_exists_in_reldir(
|
||||
&self,
|
||||
tag: RelTag,
|
||||
version: Version<'_>,
|
||||
deserialized_reldir_v1: Option<(Key, &RelDirectory)>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<bool, PageReconstructError> {
|
||||
if tag.relnode == 0 {
|
||||
return Err(PageReconstructError::Other(
|
||||
@@ -568,6 +611,17 @@ impl Timeline {
|
||||
// fetch directory listing (old)
|
||||
|
||||
let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
|
||||
|
||||
if let Some((cached_key, dir)) = deserialized_reldir_v1 {
|
||||
if cached_key == key {
|
||||
return Ok(dir.rels.contains(&(tag.relnode, tag.forknum)));
|
||||
} else if cfg!(test) || cfg!(feature = "testing") {
|
||||
panic!("cached reldir key mismatch: {cached_key} != {key}");
|
||||
} else {
|
||||
warn!("cached reldir key mismatch: {cached_key} != {key}");
|
||||
}
|
||||
// Fallback to reading the directory from the datadir.
|
||||
}
|
||||
let buf = version.get(self, key, ctx).await?;
|
||||
|
||||
let dir = RelDirectory::des(&buf)?;
|
||||
@@ -665,7 +719,7 @@ impl Timeline {
|
||||
|
||||
let batches = keyspace.partition(
|
||||
self.get_shard_identity(),
|
||||
Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
|
||||
self.conf.max_get_vectored_keys.get() as u64 * BLCKSZ as u64,
|
||||
);
|
||||
|
||||
let io_concurrency = IoConcurrency::spawn_from_conf(
|
||||
@@ -905,7 +959,7 @@ impl Timeline {
|
||||
|
||||
let batches = keyspace.partition(
|
||||
self.get_shard_identity(),
|
||||
Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
|
||||
self.conf.max_get_vectored_keys.get() as u64 * BLCKSZ as u64,
|
||||
);
|
||||
|
||||
let io_concurrency = IoConcurrency::spawn_from_conf(
|
||||
|
||||
@@ -383,7 +383,7 @@ pub struct TenantShard {
|
||||
|
||||
l0_flush_global_state: L0FlushGlobalState,
|
||||
|
||||
feature_resolver: FeatureResolver,
|
||||
pub(crate) feature_resolver: FeatureResolver,
|
||||
}
|
||||
impl std::fmt::Debug for TenantShard {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
@@ -5832,6 +5832,7 @@ pub(crate) mod harness {
|
||||
pub conf: &'static PageServerConf,
|
||||
pub tenant_conf: pageserver_api::models::TenantConfig,
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
pub shard_identity: ShardIdentity,
|
||||
pub generation: Generation,
|
||||
pub shard: ShardIndex,
|
||||
pub remote_storage: GenericRemoteStorage,
|
||||
@@ -5899,6 +5900,7 @@ pub(crate) mod harness {
|
||||
conf,
|
||||
tenant_conf,
|
||||
tenant_shard_id,
|
||||
shard_identity,
|
||||
generation,
|
||||
shard,
|
||||
remote_storage,
|
||||
@@ -5960,8 +5962,7 @@ pub(crate) mod harness {
|
||||
&ShardParameters::default(),
|
||||
))
|
||||
.unwrap(),
|
||||
// This is a legacy/test code path: sharding isn't supported here.
|
||||
ShardIdentity::unsharded(),
|
||||
self.shard_identity,
|
||||
Some(walredo_mgr),
|
||||
self.tenant_shard_id,
|
||||
self.remote_storage.clone(),
|
||||
@@ -6083,6 +6084,7 @@ mod tests {
|
||||
use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn};
|
||||
use timeline::{CompactOptions, DeltaLayerTestDesc, VersionedKeySpaceQuery};
|
||||
use utils::id::TenantId;
|
||||
use utils::shard::{ShardCount, ShardNumber};
|
||||
|
||||
use super::*;
|
||||
use crate::DEFAULT_PG_VERSION;
|
||||
@@ -7195,7 +7197,7 @@ mod tests {
|
||||
let end = desc
|
||||
.key_range
|
||||
.start
|
||||
.add(Timeline::MAX_GET_VECTORED_KEYS.try_into().unwrap());
|
||||
.add(tenant.conf.max_get_vectored_keys.get() as u32);
|
||||
reads.push(KeySpace {
|
||||
ranges: vec![start..end],
|
||||
});
|
||||
@@ -9418,6 +9420,77 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_failed_flush_should_not_update_disk_consistent_lsn() -> anyhow::Result<()> {
|
||||
//
|
||||
// Setup
|
||||
//
|
||||
let harness = TenantHarness::create_custom(
|
||||
"test_failed_flush_should_not_upload_disk_consistent_lsn",
|
||||
pageserver_api::models::TenantConfig::default(),
|
||||
TenantId::generate(),
|
||||
ShardIdentity::new(ShardNumber(0), ShardCount(4), ShardStripeSize(128)).unwrap(),
|
||||
Generation::new(1),
|
||||
)
|
||||
.await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let timeline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
assert_eq!(timeline.get_shard_identity().count, ShardCount(4));
|
||||
let mut writer = timeline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
Lsn(0x20),
|
||||
&Value::Image(test_img("foo at 0x20")),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
writer.finish_write(Lsn(0x20));
|
||||
drop(writer);
|
||||
timeline.freeze_and_flush().await.unwrap();
|
||||
|
||||
timeline.remote_client.wait_completion().await.unwrap();
|
||||
let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
|
||||
let remote_consistent_lsn = timeline.get_remote_consistent_lsn_projected();
|
||||
assert_eq!(Some(disk_consistent_lsn), remote_consistent_lsn);
|
||||
|
||||
//
|
||||
// Test
|
||||
//
|
||||
|
||||
let mut writer = timeline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
Lsn(0x30),
|
||||
&Value::Image(test_img("foo at 0x30")),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
writer.finish_write(Lsn(0x30));
|
||||
drop(writer);
|
||||
|
||||
fail::cfg(
|
||||
"flush-layer-before-update-remote-consistent-lsn",
|
||||
"return()",
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let flush_res = timeline.freeze_and_flush().await;
|
||||
// if flush failed, the disk/remote consistent LSN should not be updated
|
||||
assert!(flush_res.is_err());
|
||||
assert_eq!(disk_consistent_lsn, timeline.get_disk_consistent_lsn());
|
||||
assert_eq!(
|
||||
remote_consistent_lsn,
|
||||
timeline.get_remote_consistent_lsn_projected()
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn test_simple_bottom_most_compaction_deltas_1() -> anyhow::Result<()> {
|
||||
@@ -11187,11 +11260,11 @@ mod tests {
|
||||
let mut keyspaces_at_lsn: HashMap<Lsn, KeySpaceRandomAccum> = HashMap::default();
|
||||
let mut used_keys: HashSet<Key> = HashSet::default();
|
||||
|
||||
while used_keys.len() < Timeline::MAX_GET_VECTORED_KEYS as usize {
|
||||
while used_keys.len() < tenant.conf.max_get_vectored_keys.get() {
|
||||
let selected_lsn = interesting_lsns.choose(&mut random).expect("not empty");
|
||||
let mut selected_key = start_key.add(random.gen_range(0..KEY_DIMENSION_SIZE));
|
||||
|
||||
while used_keys.len() < Timeline::MAX_GET_VECTORED_KEYS as usize {
|
||||
while used_keys.len() < tenant.conf.max_get_vectored_keys.get() {
|
||||
if used_keys.contains(&selected_key)
|
||||
|| selected_key >= start_key.add(KEY_DIMENSION_SIZE)
|
||||
{
|
||||
|
||||
@@ -817,8 +817,8 @@ pub(crate) enum GetVectoredError {
|
||||
#[error("timeline shutting down")]
|
||||
Cancelled,
|
||||
|
||||
#[error("requested too many keys: {0} > {}", Timeline::MAX_GET_VECTORED_KEYS)]
|
||||
Oversized(u64),
|
||||
#[error("requested too many keys: {0} > {1}")]
|
||||
Oversized(u64, u64),
|
||||
|
||||
#[error("requested at invalid LSN: {0}")]
|
||||
InvalidLsn(Lsn),
|
||||
@@ -950,6 +950,18 @@ pub(crate) enum WaitLsnError {
|
||||
Timeout(String),
|
||||
}
|
||||
|
||||
impl From<WaitLsnError> for tonic::Status {
|
||||
fn from(err: WaitLsnError) -> Self {
|
||||
use tonic::Code;
|
||||
let code = match &err {
|
||||
WaitLsnError::Timeout(_) => Code::Internal,
|
||||
WaitLsnError::BadState(_) => Code::Internal,
|
||||
WaitLsnError::Shutdown => Code::Unavailable,
|
||||
};
|
||||
tonic::Status::new(code, err.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
// The impls below achieve cancellation mapping for errors.
|
||||
// Perhaps there's a way of achieving this with less cruft.
|
||||
|
||||
@@ -1007,7 +1019,7 @@ impl From<GetVectoredError> for PageReconstructError {
|
||||
match e {
|
||||
GetVectoredError::Cancelled => PageReconstructError::Cancelled,
|
||||
GetVectoredError::InvalidLsn(_) => PageReconstructError::Other(anyhow!("Invalid LSN")),
|
||||
err @ GetVectoredError::Oversized(_) => PageReconstructError::Other(err.into()),
|
||||
err @ GetVectoredError::Oversized(_, _) => PageReconstructError::Other(err.into()),
|
||||
GetVectoredError::MissingKey(err) => PageReconstructError::MissingKey(err),
|
||||
GetVectoredError::GetReadyAncestorError(err) => PageReconstructError::from(err),
|
||||
GetVectoredError::Other(err) => PageReconstructError::Other(err),
|
||||
@@ -1187,7 +1199,6 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32;
|
||||
pub(crate) const LAYERS_VISITED_WARN_THRESHOLD: u32 = 100;
|
||||
|
||||
/// Look up multiple page versions at a given LSN
|
||||
@@ -1202,9 +1213,12 @@ impl Timeline {
|
||||
) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
|
||||
let total_keyspace = query.total_keyspace();
|
||||
|
||||
let key_count = total_keyspace.total_raw_size().try_into().unwrap();
|
||||
if key_count > Timeline::MAX_GET_VECTORED_KEYS {
|
||||
return Err(GetVectoredError::Oversized(key_count));
|
||||
let key_count = total_keyspace.total_raw_size();
|
||||
if key_count > self.conf.max_get_vectored_keys.get() {
|
||||
return Err(GetVectoredError::Oversized(
|
||||
key_count as u64,
|
||||
self.conf.max_get_vectored_keys.get() as u64,
|
||||
));
|
||||
}
|
||||
|
||||
for range in &total_keyspace.ranges {
|
||||
@@ -2492,6 +2506,13 @@ impl Timeline {
|
||||
// Preparing basebackup doesn't make sense for shards other than shard zero.
|
||||
return;
|
||||
}
|
||||
if !self.is_active() {
|
||||
// May happen during initial timeline creation.
|
||||
// Such timeline is not in the global timeline map yet,
|
||||
// so basebackup cache will not be able to find it.
|
||||
// TODO(diko): We can prepare such timelines in finish_creation().
|
||||
return;
|
||||
}
|
||||
|
||||
let res = self
|
||||
.basebackup_prepare_sender
|
||||
@@ -2831,21 +2852,6 @@ impl Timeline {
|
||||
)
|
||||
}
|
||||
|
||||
/// Resolve the effective WAL receiver protocol to use for this tenant.
|
||||
///
|
||||
/// Priority order is:
|
||||
/// 1. Tenant config override
|
||||
/// 2. Default value for tenant config override
|
||||
/// 3. Pageserver config override
|
||||
/// 4. Pageserver config default
|
||||
pub fn resolve_wal_receiver_protocol(&self) -> PostgresClientProtocol {
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
tenant_conf
|
||||
.wal_receiver_protocol_override
|
||||
.or(self.conf.default_tenant_conf.wal_receiver_protocol_override)
|
||||
.unwrap_or(self.conf.wal_receiver_protocol)
|
||||
}
|
||||
|
||||
pub(super) fn tenant_conf_updated(&self, new_conf: &AttachedTenantConf) {
|
||||
// NB: Most tenant conf options are read by background loops, so,
|
||||
// changes will automatically be picked up.
|
||||
@@ -3201,10 +3207,16 @@ impl Timeline {
|
||||
guard.is_none(),
|
||||
"multiple launches / re-launches of WAL receiver are not supported"
|
||||
);
|
||||
|
||||
let protocol = PostgresClientProtocol::Interpreted {
|
||||
format: utils::postgres_client::InterpretedFormat::Protobuf,
|
||||
compression: Some(utils::postgres_client::Compression::Zstd { level: 1 }),
|
||||
};
|
||||
|
||||
*guard = Some(WalReceiver::start(
|
||||
Arc::clone(self),
|
||||
WalReceiverConf {
|
||||
protocol: self.resolve_wal_receiver_protocol(),
|
||||
protocol,
|
||||
wal_connect_timeout,
|
||||
lagging_wal_timeout,
|
||||
max_lsn_wal_lag,
|
||||
@@ -4767,7 +4779,10 @@ impl Timeline {
|
||||
|| !flushed_to_lsn.is_valid()
|
||||
);
|
||||
|
||||
if flushed_to_lsn < frozen_to_lsn && self.shard_identity.count.count() > 1 {
|
||||
if flushed_to_lsn < frozen_to_lsn
|
||||
&& self.shard_identity.count.count() > 1
|
||||
&& result.is_ok()
|
||||
{
|
||||
// If our layer flushes didn't carry disk_consistent_lsn up to the `to_lsn` advertised
|
||||
// to us via layer_flush_start_rx, then advance it here.
|
||||
//
|
||||
@@ -4946,6 +4961,10 @@ impl Timeline {
|
||||
return Err(FlushLayerError::Cancelled);
|
||||
}
|
||||
|
||||
fail_point!("flush-layer-before-update-remote-consistent-lsn", |_| {
|
||||
Err(FlushLayerError::Other(anyhow!("failpoint").into()))
|
||||
});
|
||||
|
||||
let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1);
|
||||
|
||||
// The new on-disk layers are now in the layer map. We can remove the
|
||||
@@ -5251,7 +5270,7 @@ impl Timeline {
|
||||
key = key.next();
|
||||
|
||||
// Maybe flush `key_rest_accum`
|
||||
if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS
|
||||
if key_request_accum.raw_size() >= self.conf.max_get_vectored_keys.get() as u64
|
||||
|| (last_key_in_range && key_request_accum.raw_size() > 0)
|
||||
{
|
||||
let query =
|
||||
|
||||
@@ -106,6 +106,8 @@ pub async fn doit(
|
||||
);
|
||||
}
|
||||
|
||||
tracing::info!("Import plan executed. Flushing remote changes and notifying storcon");
|
||||
|
||||
timeline
|
||||
.remote_client
|
||||
.schedule_index_upload_for_file_changes()?;
|
||||
@@ -199,8 +201,8 @@ async fn prepare_import(
|
||||
.await;
|
||||
match res {
|
||||
Ok(_) => break,
|
||||
Err(err) => {
|
||||
info!(?err, "indefinitely waiting for pgdata to finish");
|
||||
Err(_err) => {
|
||||
info!("indefinitely waiting for pgdata to finish");
|
||||
if tokio::time::timeout(std::time::Duration::from_secs(10), cancel.cancelled())
|
||||
.await
|
||||
.is_ok()
|
||||
|
||||
@@ -11,19 +11,7 @@
|
||||
//! - => S3 as the source for the PGDATA instead of local filesystem
|
||||
//!
|
||||
//! TODOs before productionization:
|
||||
//! - ChunkProcessingJob size / ImportJob::total_size does not account for sharding.
|
||||
//! => produced image layers likely too small.
|
||||
//! - ChunkProcessingJob should cut up an ImportJob to hit exactly target image layer size.
|
||||
//! - asserts / unwraps need to be replaced with errors
|
||||
//! - don't trust remote objects will be small (=prevent OOMs in those cases)
|
||||
//! - limit all in-memory buffers in size, or download to disk and read from there
|
||||
//! - limit task concurrency
|
||||
//! - generally play nice with other tenants in the system
|
||||
//! - importbucket is different bucket than main pageserver storage, so, should be fine wrt S3 rate limits
|
||||
//! - but concerns like network bandwidth, local disk write bandwidth, local disk capacity, etc
|
||||
//! - integrate with layer eviction system
|
||||
//! - audit for Tenant::cancel nor Timeline::cancel responsivity
|
||||
//! - audit for Tenant/Timeline gate holding (we spawn tokio tasks during this flow!)
|
||||
//!
|
||||
//! An incomplete set of TODOs from the Hackathon:
|
||||
//! - version-specific CheckPointData (=> pgv abstraction, already exists for regular walingest)
|
||||
@@ -44,7 +32,7 @@ use pageserver_api::key::{
|
||||
rel_dir_to_key, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
|
||||
slru_segment_size_to_key,
|
||||
};
|
||||
use pageserver_api::keyspace::{contiguous_range_len, is_contiguous_range, singleton_range};
|
||||
use pageserver_api::keyspace::{ShardedRange, singleton_range};
|
||||
use pageserver_api::models::{ShardImportProgress, ShardImportProgressV1, ShardImportStatus};
|
||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
@@ -112,6 +100,7 @@ async fn run_v1(
|
||||
.unwrap(),
|
||||
import_job_concurrency: base.import_job_concurrency,
|
||||
import_job_checkpoint_threshold: base.import_job_checkpoint_threshold,
|
||||
import_job_max_byte_range_size: base.import_job_max_byte_range_size,
|
||||
}
|
||||
}
|
||||
None => timeline.conf.timeline_import_config.clone(),
|
||||
@@ -142,7 +131,15 @@ async fn run_v1(
|
||||
|
||||
pausable_failpoint!("import-timeline-pre-execute-pausable");
|
||||
|
||||
let jobs_count = import_progress.as_ref().map(|p| p.jobs);
|
||||
let start_from_job_idx = import_progress.map(|progress| progress.completed);
|
||||
|
||||
tracing::info!(
|
||||
start_from_job_idx=?start_from_job_idx,
|
||||
jobs=?jobs_count,
|
||||
"Executing import plan"
|
||||
);
|
||||
|
||||
plan.execute(timeline, start_from_job_idx, plan_hash, &import_config, ctx)
|
||||
.await
|
||||
}
|
||||
@@ -167,6 +164,7 @@ impl Planner {
|
||||
/// This function is and must remain pure: given the same input, it will generate the same import plan.
|
||||
async fn plan(mut self, import_config: &TimelineImportConfig) -> anyhow::Result<Plan> {
|
||||
let pgdata_lsn = Lsn(self.control_file.control_file_data().checkPoint).align();
|
||||
anyhow::ensure!(pgdata_lsn.is_valid());
|
||||
|
||||
let datadir = PgDataDir::new(&self.storage).await?;
|
||||
|
||||
@@ -249,14 +247,22 @@ impl Planner {
|
||||
});
|
||||
|
||||
// Assigns parts of key space to later parallel jobs
|
||||
// Note: The image layers produced here may have gaps, meaning,
|
||||
// there is not an image for each key in the layer's key range.
|
||||
// The read path stops traversal at the first image layer, regardless
|
||||
// of whether a base image has been found for a key or not.
|
||||
// (Concept of sparse image layers doesn't exist.)
|
||||
// This behavior is exactly right for the base image layers we're producing here.
|
||||
// But, since no other place in the code currently produces image layers with gaps,
|
||||
// it seems noteworthy.
|
||||
let mut last_end_key = Key::MIN;
|
||||
let mut current_chunk = Vec::new();
|
||||
let mut current_chunk_size: usize = 0;
|
||||
let mut jobs = Vec::new();
|
||||
for task in std::mem::take(&mut self.tasks).into_iter() {
|
||||
if current_chunk_size + task.total_size()
|
||||
> import_config.import_job_soft_size_limit.into()
|
||||
{
|
||||
let task_size = task.total_size(&self.shard);
|
||||
let projected_chunk_size = current_chunk_size.saturating_add(task_size);
|
||||
if projected_chunk_size > import_config.import_job_soft_size_limit.into() {
|
||||
let key_range = last_end_key..task.key_range().start;
|
||||
jobs.push(ChunkProcessingJob::new(
|
||||
key_range.clone(),
|
||||
@@ -266,7 +272,7 @@ impl Planner {
|
||||
last_end_key = key_range.end;
|
||||
current_chunk_size = 0;
|
||||
}
|
||||
current_chunk_size += task.total_size();
|
||||
current_chunk_size = current_chunk_size.saturating_add(task_size);
|
||||
current_chunk.push(task);
|
||||
}
|
||||
jobs.push(ChunkProcessingJob::new(
|
||||
@@ -436,6 +442,7 @@ impl Plan {
|
||||
|
||||
let mut last_completed_job_idx = start_after_job_idx.unwrap_or(0);
|
||||
let checkpoint_every: usize = import_config.import_job_checkpoint_threshold.into();
|
||||
let max_byte_range_size: usize = import_config.import_job_max_byte_range_size.into();
|
||||
|
||||
// Run import jobs concurrently up to the limit specified by the pageserver configuration.
|
||||
// Note that we process completed futures in the oreder of insertion. This will be the
|
||||
@@ -451,7 +458,7 @@ impl Plan {
|
||||
|
||||
work.push_back(tokio::task::spawn(async move {
|
||||
let _permit = permit;
|
||||
let res = job.run(job_timeline, &ctx).await;
|
||||
let res = job.run(job_timeline, max_byte_range_size, &ctx).await;
|
||||
(job_idx, res)
|
||||
}));
|
||||
},
|
||||
@@ -466,6 +473,8 @@ impl Plan {
|
||||
last_completed_job_idx = job_idx;
|
||||
|
||||
if last_completed_job_idx % checkpoint_every == 0 {
|
||||
tracing::info!(last_completed_job_idx, jobs=%jobs_in_plan, "Checkpointing import status");
|
||||
|
||||
let progress = ShardImportProgressV1 {
|
||||
jobs: jobs_in_plan,
|
||||
completed: last_completed_job_idx,
|
||||
@@ -604,18 +613,18 @@ impl PgDataDirDb {
|
||||
};
|
||||
|
||||
let path = datadir_path.join(rel_tag.to_segfile_name(segno));
|
||||
assert!(filesize % BLCKSZ as usize == 0); // TODO: this should result in an error
|
||||
anyhow::ensure!(filesize % BLCKSZ as usize == 0);
|
||||
let nblocks = filesize / BLCKSZ as usize;
|
||||
|
||||
PgDataDirDbFile {
|
||||
Ok(PgDataDirDbFile {
|
||||
path,
|
||||
filesize,
|
||||
rel_tag,
|
||||
segno,
|
||||
nblocks: Some(nblocks), // first non-cummulative sizes
|
||||
}
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
.collect::<anyhow::Result<_, _>>()?;
|
||||
|
||||
// Set cummulative sizes. Do all of that math here, so that later we could easier
|
||||
// parallelize over segments and know with which segments we need to write relsize
|
||||
@@ -650,18 +659,29 @@ impl PgDataDirDb {
|
||||
trait ImportTask {
|
||||
fn key_range(&self) -> Range<Key>;
|
||||
|
||||
fn total_size(&self) -> usize {
|
||||
// TODO: revisit this
|
||||
if is_contiguous_range(&self.key_range()) {
|
||||
contiguous_range_len(&self.key_range()) as usize * 8192
|
||||
fn total_size(&self, shard_identity: &ShardIdentity) -> usize {
|
||||
let range = ShardedRange::new(self.key_range(), shard_identity);
|
||||
let page_count = range.page_count();
|
||||
if page_count == u32::MAX {
|
||||
tracing::warn!(
|
||||
"Import task has non contiguous key range: {}..{}",
|
||||
self.key_range().start,
|
||||
self.key_range().end
|
||||
);
|
||||
|
||||
// Tasks should operate on contiguous ranges. It is unexpected for
|
||||
// ranges to violate this assumption. Calling code handles this by mapping
|
||||
// any task on a non contiguous range to its own image layer.
|
||||
usize::MAX
|
||||
} else {
|
||||
u32::MAX as usize
|
||||
page_count as usize * 8192
|
||||
}
|
||||
}
|
||||
|
||||
async fn doit(
|
||||
self,
|
||||
layer_writer: &mut ImageLayerWriter,
|
||||
max_byte_range_size: usize,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<usize>;
|
||||
}
|
||||
@@ -698,6 +718,7 @@ impl ImportTask for ImportSingleKeyTask {
|
||||
async fn doit(
|
||||
self,
|
||||
layer_writer: &mut ImageLayerWriter,
|
||||
_max_byte_range_size: usize,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<usize> {
|
||||
layer_writer.put_image(self.key, self.buf, ctx).await?;
|
||||
@@ -751,6 +772,7 @@ impl ImportTask for ImportRelBlocksTask {
|
||||
async fn doit(
|
||||
self,
|
||||
layer_writer: &mut ImageLayerWriter,
|
||||
max_byte_range_size: usize,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<usize> {
|
||||
debug!("Importing relation file");
|
||||
@@ -777,7 +799,7 @@ impl ImportTask for ImportRelBlocksTask {
|
||||
assert_eq!(key.len(), 1);
|
||||
assert!(!acc.is_empty());
|
||||
assert!(acc_end > acc_start);
|
||||
if acc_end == start /* TODO additional max range check here, to limit memory consumption per task to X */ {
|
||||
if acc_end == start && end - acc_start <= max_byte_range_size {
|
||||
acc.push(key.pop().unwrap());
|
||||
Ok((acc, acc_start, end))
|
||||
} else {
|
||||
@@ -792,8 +814,8 @@ impl ImportTask for ImportRelBlocksTask {
|
||||
.get_range(&self.path, range_start.into_u64(), range_end.into_u64())
|
||||
.await?;
|
||||
let mut buf = Bytes::from(range_buf);
|
||||
// TODO: batched writes
|
||||
for key in keys {
|
||||
// The writer buffers writes internally
|
||||
let image = buf.split_to(8192);
|
||||
layer_writer.put_image(key, image, ctx).await?;
|
||||
nimages += 1;
|
||||
@@ -841,11 +863,15 @@ impl ImportTask for ImportSlruBlocksTask {
|
||||
async fn doit(
|
||||
self,
|
||||
layer_writer: &mut ImageLayerWriter,
|
||||
_max_byte_range_size: usize,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<usize> {
|
||||
debug!("Importing SLRU segment file {}", self.path);
|
||||
let buf = self.storage.get(&self.path).await?;
|
||||
|
||||
// TODO(vlad): Does timestamp to LSN work for imported timelines?
|
||||
// Probably not since we don't append the `xact_time` to it as in
|
||||
// [`WalIngest::ingest_xact_record`].
|
||||
let (kind, segno, start_blk) = self.key_range.start.to_slru_block()?;
|
||||
let (_kind, _segno, end_blk) = self.key_range.end.to_slru_block()?;
|
||||
let mut blknum = start_blk;
|
||||
@@ -884,12 +910,13 @@ impl ImportTask for AnyImportTask {
|
||||
async fn doit(
|
||||
self,
|
||||
layer_writer: &mut ImageLayerWriter,
|
||||
max_byte_range_size: usize,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<usize> {
|
||||
match self {
|
||||
Self::SingleKey(t) => t.doit(layer_writer, ctx).await,
|
||||
Self::RelBlocks(t) => t.doit(layer_writer, ctx).await,
|
||||
Self::SlruBlocks(t) => t.doit(layer_writer, ctx).await,
|
||||
Self::SingleKey(t) => t.doit(layer_writer, max_byte_range_size, ctx).await,
|
||||
Self::RelBlocks(t) => t.doit(layer_writer, max_byte_range_size, ctx).await,
|
||||
Self::SlruBlocks(t) => t.doit(layer_writer, max_byte_range_size, ctx).await,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -930,7 +957,12 @@ impl ChunkProcessingJob {
|
||||
}
|
||||
}
|
||||
|
||||
async fn run(self, timeline: Arc<Timeline>, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
async fn run(
|
||||
self,
|
||||
timeline: Arc<Timeline>,
|
||||
max_byte_range_size: usize,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut writer = ImageLayerWriter::new(
|
||||
timeline.conf,
|
||||
timeline.timeline_id,
|
||||
@@ -945,7 +977,7 @@ impl ChunkProcessingJob {
|
||||
|
||||
let mut nimages = 0;
|
||||
for task in self.tasks {
|
||||
nimages += task.doit(&mut writer, ctx).await?;
|
||||
nimages += task.doit(&mut writer, max_byte_range_size, ctx).await?;
|
||||
}
|
||||
|
||||
let resident_layer = if nimages > 0 {
|
||||
|
||||
@@ -6,7 +6,7 @@ use bytes::Bytes;
|
||||
use postgres_ffi::ControlFileData;
|
||||
use remote_storage::{
|
||||
Download, DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, Listing,
|
||||
ListingObject, RemotePath,
|
||||
ListingObject, RemotePath, RemoteStorageConfig,
|
||||
};
|
||||
use serde::de::DeserializeOwned;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -22,11 +22,9 @@ pub async fn new(
|
||||
location: &index_part_format::Location,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<RemoteStorageWrapper, anyhow::Error> {
|
||||
// FIXME: we probably want some timeout, and we might be able to assume the max file
|
||||
// size on S3 is 1GiB (postgres segment size). But the problem is that the individual
|
||||
// downloaders don't know enough about concurrent downloads to make a guess on the
|
||||
// expected bandwidth and resulting best timeout.
|
||||
let timeout = std::time::Duration::from_secs(24 * 60 * 60);
|
||||
// Downloads should be reasonably sized. We do ranged reads for relblock raw data
|
||||
// and full reads for SLRU segments which are bounded by Postgres.
|
||||
let timeout = RemoteStorageConfig::DEFAULT_TIMEOUT;
|
||||
let location_storage = match location {
|
||||
#[cfg(feature = "testing")]
|
||||
index_part_format::Location::LocalFs { path } => {
|
||||
@@ -50,9 +48,12 @@ pub async fn new(
|
||||
.import_pgdata_aws_endpoint_url
|
||||
.clone()
|
||||
.map(|url| url.to_string()), // by specifying None here, remote_storage/aws-sdk-rust will infer from env
|
||||
concurrency_limit: 100.try_into().unwrap(), // TODO: think about this
|
||||
max_keys_per_list_response: Some(1000), // TODO: think about this
|
||||
upload_storage_class: None, // irrelevant
|
||||
// This matches the default import job concurrency. This is managed
|
||||
// separately from the usual S3 client, but the concern here is bandwidth
|
||||
// usage.
|
||||
concurrency_limit: 128.try_into().unwrap(),
|
||||
max_keys_per_list_response: Some(1000),
|
||||
upload_storage_class: None, // irrelevant
|
||||
},
|
||||
timeout,
|
||||
)
|
||||
|
||||
@@ -113,7 +113,7 @@ impl WalReceiver {
|
||||
}
|
||||
connection_manager_state.shutdown().await;
|
||||
*loop_status.write().unwrap() = None;
|
||||
debug!("task exits");
|
||||
info!("task exits");
|
||||
}
|
||||
.instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), timeline_id = %timeline_id))
|
||||
});
|
||||
|
||||
@@ -32,9 +32,7 @@ use utils::backoff::{
|
||||
};
|
||||
use utils::id::{NodeId, TenantTimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::postgres_client::{
|
||||
ConnectionConfigArgs, PostgresClientProtocol, wal_stream_connection_config,
|
||||
};
|
||||
use utils::postgres_client::{ConnectionConfigArgs, wal_stream_connection_config};
|
||||
|
||||
use super::walreceiver_connection::{WalConnectionStatus, WalReceiverError};
|
||||
use super::{TaskEvent, TaskHandle, TaskStateUpdate, WalReceiverConf};
|
||||
@@ -991,19 +989,12 @@ impl ConnectionManagerState {
|
||||
return None; // no connection string, ignore sk
|
||||
}
|
||||
|
||||
let (shard_number, shard_count, shard_stripe_size) = match self.conf.protocol {
|
||||
PostgresClientProtocol::Vanilla => {
|
||||
(None, None, None)
|
||||
},
|
||||
PostgresClientProtocol::Interpreted { .. } => {
|
||||
let shard_identity = self.timeline.get_shard_identity();
|
||||
(
|
||||
Some(shard_identity.number.0),
|
||||
Some(shard_identity.count.0),
|
||||
Some(shard_identity.stripe_size.0),
|
||||
)
|
||||
}
|
||||
};
|
||||
let shard_identity = self.timeline.get_shard_identity();
|
||||
let (shard_number, shard_count, shard_stripe_size) = (
|
||||
Some(shard_identity.number.0),
|
||||
Some(shard_identity.count.0),
|
||||
Some(shard_identity.stripe_size.0),
|
||||
);
|
||||
|
||||
let connection_conf_args = ConnectionConfigArgs {
|
||||
protocol: self.conf.protocol,
|
||||
@@ -1120,8 +1111,8 @@ impl ReconnectReason {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use pageserver_api::config::defaults::DEFAULT_WAL_RECEIVER_PROTOCOL;
|
||||
use url::Host;
|
||||
use utils::postgres_client::PostgresClientProtocol;
|
||||
|
||||
use super::*;
|
||||
use crate::tenant::harness::{TIMELINE_ID, TenantHarness};
|
||||
@@ -1552,6 +1543,11 @@ mod tests {
|
||||
.await
|
||||
.expect("Failed to create an empty timeline for dummy wal connection manager");
|
||||
|
||||
let protocol = PostgresClientProtocol::Interpreted {
|
||||
format: utils::postgres_client::InterpretedFormat::Protobuf,
|
||||
compression: Some(utils::postgres_client::Compression::Zstd { level: 1 }),
|
||||
};
|
||||
|
||||
ConnectionManagerState {
|
||||
id: TenantTimelineId {
|
||||
tenant_id: harness.tenant_shard_id.tenant_id,
|
||||
@@ -1560,7 +1556,7 @@ mod tests {
|
||||
timeline,
|
||||
cancel: CancellationToken::new(),
|
||||
conf: WalReceiverConf {
|
||||
protocol: DEFAULT_WAL_RECEIVER_PROTOCOL,
|
||||
protocol,
|
||||
wal_connect_timeout: Duration::from_secs(1),
|
||||
lagging_wal_timeout: Duration::from_secs(1),
|
||||
max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(),
|
||||
|
||||
@@ -15,7 +15,7 @@ use postgres_backend::is_expected_io_error;
|
||||
use postgres_connection::PgConnectionConfig;
|
||||
use postgres_ffi::WAL_SEGMENT_SIZE;
|
||||
use postgres_ffi::v14::xlog_utils::normalize_lsn;
|
||||
use postgres_ffi::waldecoder::{WalDecodeError, WalStreamDecoder};
|
||||
use postgres_ffi::waldecoder::WalDecodeError;
|
||||
use postgres_protocol::message::backend::ReplicationMessage;
|
||||
use postgres_types::PgLsn;
|
||||
use tokio::sync::watch;
|
||||
@@ -31,7 +31,7 @@ use utils::lsn::Lsn;
|
||||
use utils::pageserver_feedback::PageserverFeedback;
|
||||
use utils::postgres_client::PostgresClientProtocol;
|
||||
use utils::sync::gate::GateError;
|
||||
use wal_decoder::models::{FlushUncommittedRecords, InterpretedWalRecord, InterpretedWalRecords};
|
||||
use wal_decoder::models::{FlushUncommittedRecords, InterpretedWalRecords};
|
||||
use wal_decoder::wire_format::FromWireFormat;
|
||||
|
||||
use super::TaskStateUpdate;
|
||||
@@ -275,8 +275,6 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
let copy_stream = replication_client.copy_both_simple(&query).await?;
|
||||
let mut physical_stream = pin!(ReplicationStream::new(copy_stream));
|
||||
|
||||
let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version);
|
||||
|
||||
let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx)
|
||||
.await
|
||||
.map_err(|e| match e.kind {
|
||||
@@ -284,19 +282,22 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
_ => WalReceiverError::Other(e.into()),
|
||||
})?;
|
||||
|
||||
let shard = vec![*timeline.get_shard_identity()];
|
||||
|
||||
let interpreted_proto_config = match protocol {
|
||||
PostgresClientProtocol::Vanilla => None,
|
||||
let (format, compression) = match protocol {
|
||||
PostgresClientProtocol::Interpreted {
|
||||
format,
|
||||
compression,
|
||||
} => Some((format, compression)),
|
||||
} => (format, compression),
|
||||
PostgresClientProtocol::Vanilla => {
|
||||
return Err(WalReceiverError::Other(anyhow!(
|
||||
"Vanilla WAL receiver protocol is no longer supported for ingest"
|
||||
)));
|
||||
}
|
||||
};
|
||||
|
||||
let mut expected_wal_start = startpoint;
|
||||
while let Some(replication_message) = {
|
||||
select! {
|
||||
biased;
|
||||
_ = cancellation.cancelled() => {
|
||||
debug!("walreceiver interrupted");
|
||||
None
|
||||
@@ -312,16 +313,6 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
// Update the connection status before processing the message. If the message processing
|
||||
// fails (e.g. in walingest), we still want to know latests LSNs from the safekeeper.
|
||||
match &replication_message {
|
||||
ReplicationMessage::XLogData(xlog_data) => {
|
||||
connection_status.latest_connection_update = now;
|
||||
connection_status.commit_lsn = Some(Lsn::from(xlog_data.wal_end()));
|
||||
connection_status.streaming_lsn = Some(Lsn::from(
|
||||
xlog_data.wal_start() + xlog_data.data().len() as u64,
|
||||
));
|
||||
if !xlog_data.data().is_empty() {
|
||||
connection_status.latest_wal_update = now;
|
||||
}
|
||||
}
|
||||
ReplicationMessage::PrimaryKeepAlive(keepalive) => {
|
||||
connection_status.latest_connection_update = now;
|
||||
connection_status.commit_lsn = Some(Lsn::from(keepalive.wal_end()));
|
||||
@@ -352,7 +343,6 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
// were interpreted.
|
||||
let streaming_lsn = Lsn::from(raw.streaming_lsn());
|
||||
|
||||
let (format, compression) = interpreted_proto_config.unwrap();
|
||||
let batch = InterpretedWalRecords::from_wire(raw.data(), format, compression)
|
||||
.await
|
||||
.with_context(|| {
|
||||
@@ -508,138 +498,6 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
Some(streaming_lsn)
|
||||
}
|
||||
|
||||
ReplicationMessage::XLogData(xlog_data) => {
|
||||
async fn commit(
|
||||
modification: &mut DatadirModification<'_>,
|
||||
uncommitted: &mut u64,
|
||||
filtered: &mut u64,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let stats = modification.stats();
|
||||
modification.commit(ctx).await?;
|
||||
WAL_INGEST
|
||||
.records_committed
|
||||
.inc_by(*uncommitted - *filtered);
|
||||
WAL_INGEST.inc_values_committed(&stats);
|
||||
*uncommitted = 0;
|
||||
*filtered = 0;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Pass the WAL data to the decoder, and see if we can decode
|
||||
// more records as a result.
|
||||
let data = xlog_data.data();
|
||||
let startlsn = Lsn::from(xlog_data.wal_start());
|
||||
let endlsn = startlsn + data.len() as u64;
|
||||
|
||||
trace!("received XLogData between {startlsn} and {endlsn}");
|
||||
|
||||
WAL_INGEST.bytes_received.inc_by(data.len() as u64);
|
||||
waldecoder.feed_bytes(data);
|
||||
|
||||
{
|
||||
let mut modification = timeline.begin_modification(startlsn);
|
||||
let mut uncommitted_records = 0;
|
||||
let mut filtered_records = 0;
|
||||
|
||||
while let Some((next_record_lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
// It is important to deal with the aligned records as lsn in getPage@LSN is
|
||||
// aligned and can be several bytes bigger. Without this alignment we are
|
||||
// at risk of hitting a deadlock.
|
||||
if !next_record_lsn.is_aligned() {
|
||||
return Err(WalReceiverError::Other(anyhow!("LSN not aligned")));
|
||||
}
|
||||
|
||||
// Deserialize and interpret WAL record
|
||||
let interpreted = InterpretedWalRecord::from_bytes_filtered(
|
||||
recdata,
|
||||
&shard,
|
||||
next_record_lsn,
|
||||
modification.tline.pg_version,
|
||||
)?
|
||||
.remove(timeline.get_shard_identity())
|
||||
.unwrap();
|
||||
|
||||
if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes)
|
||||
&& uncommitted_records > 0
|
||||
{
|
||||
// Special case: legacy PG database creations operate by reading pages from a 'template' database:
|
||||
// these are the only kinds of WAL record that require reading data blocks while ingesting. Ensure
|
||||
// all earlier writes of data blocks are visible by committing any modification in flight.
|
||||
commit(
|
||||
&mut modification,
|
||||
&mut uncommitted_records,
|
||||
&mut filtered_records,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
|
||||
// Ingest the records without immediately committing them.
|
||||
timeline.metrics.wal_records_received.inc();
|
||||
let ingested = walingest
|
||||
.ingest_record(interpreted, &mut modification, &ctx)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("could not ingest record at {next_record_lsn}")
|
||||
})
|
||||
.inspect_err(|err| {
|
||||
// TODO: we can't differentiate cancellation errors with
|
||||
// anyhow::Error, so just ignore it if we're cancelled.
|
||||
if !cancellation.is_cancelled() && !timeline.is_stopping() {
|
||||
critical!("{err:?}")
|
||||
}
|
||||
})?;
|
||||
if !ingested {
|
||||
tracing::debug!("ingest: filtered out record @ LSN {next_record_lsn}");
|
||||
WAL_INGEST.records_filtered.inc();
|
||||
filtered_records += 1;
|
||||
}
|
||||
|
||||
// FIXME: this cannot be made pausable_failpoint without fixing the
|
||||
// failpoint library; in tests, the added amount of debugging will cause us
|
||||
// to timeout the tests.
|
||||
fail_point!("walreceiver-after-ingest");
|
||||
|
||||
last_rec_lsn = next_record_lsn;
|
||||
|
||||
// Commit every ingest_batch_size records. Even if we filtered out
|
||||
// all records, we still need to call commit to advance the LSN.
|
||||
uncommitted_records += 1;
|
||||
if uncommitted_records >= ingest_batch_size
|
||||
|| modification.approx_pending_bytes()
|
||||
> DatadirModification::MAX_PENDING_BYTES
|
||||
{
|
||||
commit(
|
||||
&mut modification,
|
||||
&mut uncommitted_records,
|
||||
&mut filtered_records,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
|
||||
// Commit the remaining records.
|
||||
if uncommitted_records > 0 {
|
||||
commit(
|
||||
&mut modification,
|
||||
&mut uncommitted_records,
|
||||
&mut filtered_records,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
|
||||
if !caught_up && endlsn >= end_of_wal {
|
||||
info!("caught up at LSN {endlsn}");
|
||||
caught_up = true;
|
||||
}
|
||||
|
||||
Some(endlsn)
|
||||
}
|
||||
|
||||
ReplicationMessage::PrimaryKeepAlive(keepalive) => {
|
||||
let wal_end = keepalive.wal_end();
|
||||
let timestamp = keepalive.timestamp();
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
# pgxs/neon/Makefile
|
||||
|
||||
|
||||
MODULE_big = neon
|
||||
OBJS = \
|
||||
$(WIN32RES) \
|
||||
communicator.o \
|
||||
communicator_new.o \
|
||||
extension_server.o \
|
||||
file_cache.o \
|
||||
hll.o \
|
||||
@@ -22,11 +22,13 @@ OBJS = \
|
||||
walproposer.o \
|
||||
walproposer_pg.o \
|
||||
control_plane_connector.o \
|
||||
walsender_hooks.o
|
||||
walsender_hooks.o \
|
||||
$(LIBCOMMUNICATOR_PATH)/libcommunicator.a
|
||||
|
||||
PG_CPPFLAGS = -I$(libpq_srcdir)
|
||||
SHLIB_LINK_INTERNAL = $(libpq)
|
||||
SHLIB_LINK = -lcurl
|
||||
SHLIB_LINK += -framework Security -framework CoreFoundation -framework CoreServices -framework IOKit -framework SystemConfiguration
|
||||
|
||||
EXTENSION = neon
|
||||
DATA = \
|
||||
|
||||
372
pgxn/neon/communicator/Cargo.lock
generated
Normal file
372
pgxn/neon/communicator/Cargo.lock
generated
Normal file
@@ -0,0 +1,372 @@
|
||||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
version = 4
|
||||
|
||||
[[package]]
|
||||
name = "addr2line"
|
||||
version = "0.24.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1"
|
||||
dependencies = [
|
||||
"gimli",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "adler2"
|
||||
version = "2.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627"
|
||||
|
||||
[[package]]
|
||||
name = "backtrace"
|
||||
version = "0.3.74"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8d82cb332cdfaed17ae235a638438ac4d4839913cc2af585c3c6746e8f8bee1a"
|
||||
dependencies = [
|
||||
"addr2line",
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"miniz_oxide",
|
||||
"object",
|
||||
"rustc-demangle",
|
||||
"windows-targets",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "base64"
|
||||
version = "0.22.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
|
||||
|
||||
[[package]]
|
||||
name = "bytes"
|
||||
version = "1.10.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a"
|
||||
|
||||
[[package]]
|
||||
name = "cfg-if"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
||||
|
||||
[[package]]
|
||||
name = "communicator"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"tonic",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fnv"
|
||||
version = "1.0.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
|
||||
|
||||
[[package]]
|
||||
name = "futures-core"
|
||||
version = "0.3.31"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
|
||||
|
||||
[[package]]
|
||||
name = "gimli"
|
||||
version = "0.31.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f"
|
||||
|
||||
[[package]]
|
||||
name = "http"
|
||||
version = "1.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f4a85d31aea989eead29a3aaf9e1115a180df8282431156e533de47660892565"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fnv",
|
||||
"itoa",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "http-body"
|
||||
version = "1.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"http",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "http-body-util"
|
||||
version = "0.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"futures-core",
|
||||
"http",
|
||||
"http-body",
|
||||
"pin-project-lite",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "itoa"
|
||||
version = "1.0.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.171"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c19937216e9d3aa9956d9bb8dfc0b0c8beb6058fc4f7a4dc4d850edf86a237d6"
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
|
||||
|
||||
[[package]]
|
||||
name = "miniz_oxide"
|
||||
version = "0.8.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ff70ce3e48ae43fa075863cef62e8b43b71a4f2382229920e0df362592919430"
|
||||
dependencies = [
|
||||
"adler2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "object"
|
||||
version = "0.36.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "once_cell"
|
||||
version = "1.21.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
|
||||
|
||||
[[package]]
|
||||
name = "percent-encoding"
|
||||
version = "2.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
|
||||
|
||||
[[package]]
|
||||
name = "pin-project"
|
||||
version = "1.1.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a"
|
||||
dependencies = [
|
||||
"pin-project-internal",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pin-project-internal"
|
||||
version = "1.1.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pin-project-lite"
|
||||
version = "0.2.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b"
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.94"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a31971752e70b8b2686d7e46ec17fb38dad4051d94024c88df49b667caea9c84"
|
||||
dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.40"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustc-demangle"
|
||||
version = "0.1.24"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f"
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "2.0.100"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio"
|
||||
version = "1.44.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e6b88822cbe49de4185e3a4cbf8321dd487cf5fe0c5c65695fef6346371e9c48"
|
||||
dependencies = [
|
||||
"backtrace",
|
||||
"pin-project-lite",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-stream"
|
||||
version = "0.1.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eca58d7bba4a75707817a2c44174253f9236b2d5fbd055602e9d5c07c139a047"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"pin-project-lite",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tonic"
|
||||
version = "0.13.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "85839f0b32fd242bb3209262371d07feda6d780d16ee9d2bc88581b89da1549b"
|
||||
dependencies = [
|
||||
"base64",
|
||||
"bytes",
|
||||
"http",
|
||||
"http-body",
|
||||
"http-body-util",
|
||||
"percent-encoding",
|
||||
"pin-project",
|
||||
"tokio-stream",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tower-layer"
|
||||
version = "0.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e"
|
||||
|
||||
[[package]]
|
||||
name = "tower-service"
|
||||
version = "0.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3"
|
||||
|
||||
[[package]]
|
||||
name = "tracing"
|
||||
version = "0.1.41"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0"
|
||||
dependencies = [
|
||||
"pin-project-lite",
|
||||
"tracing-attributes",
|
||||
"tracing-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-attributes"
|
||||
version = "0.1.28"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-core"
|
||||
version = "0.1.33"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c"
|
||||
dependencies = [
|
||||
"once_cell",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicode-ident"
|
||||
version = "1.0.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
|
||||
|
||||
[[package]]
|
||||
name = "windows-targets"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
|
||||
dependencies = [
|
||||
"windows_aarch64_gnullvm",
|
||||
"windows_aarch64_msvc",
|
||||
"windows_i686_gnu",
|
||||
"windows_i686_gnullvm",
|
||||
"windows_i686_msvc",
|
||||
"windows_x86_64_gnu",
|
||||
"windows_x86_64_gnullvm",
|
||||
"windows_x86_64_msvc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_gnullvm"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_msvc"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_gnu"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_gnullvm"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_msvc"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnu"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnullvm"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_msvc"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
|
||||
46
pgxn/neon/communicator/Cargo.toml
Normal file
46
pgxn/neon/communicator/Cargo.toml
Normal file
@@ -0,0 +1,46 @@
|
||||
[package]
|
||||
name = "communicator"
|
||||
version = "0.1.0"
|
||||
edition = "2024"
|
||||
|
||||
[features]
|
||||
testing = []
|
||||
|
||||
[lib]
|
||||
crate-type = ["staticlib"]
|
||||
|
||||
[dependencies]
|
||||
axum.workspace = true
|
||||
bytes.workspace = true
|
||||
clashmap.workspace = true
|
||||
http.workspace = true
|
||||
libc.workspace = true
|
||||
nix.workspace = true
|
||||
atomic_enum = "0.3.0"
|
||||
prometheus.workspace = true
|
||||
prost.workspace = true
|
||||
tonic = { version = "0.12.0", default-features = false, features=["codegen", "prost", "transport"] }
|
||||
tokio = { version = "1.43.1", features = ["macros", "net", "io-util", "rt", "rt-multi-thread"] }
|
||||
tokio-pipe = { version = "0.2.12" }
|
||||
thiserror.workspace = true
|
||||
tracing.workspace = true
|
||||
tracing-subscriber.workspace = true
|
||||
|
||||
metrics.workspace = true
|
||||
uring-common = { workspace = true, features = ["bytes"] }
|
||||
|
||||
pageserver_client_grpc.workspace = true
|
||||
pageserver_page_api.workspace = true
|
||||
|
||||
neon-shmem.workspace = true
|
||||
utils.workspace = true
|
||||
|
||||
[build-dependencies]
|
||||
cbindgen.workspace = true
|
||||
|
||||
[target.'cfg(target_os = "macos")']
|
||||
rustflags = [
|
||||
"-l", "framework=Security",
|
||||
"-l", "framework=CoreFoundation",
|
||||
"-l", "framework=CoreServices",
|
||||
]
|
||||
123
pgxn/neon/communicator/README.md
Normal file
123
pgxn/neon/communicator/README.md
Normal file
@@ -0,0 +1,123 @@
|
||||
# Communicator
|
||||
|
||||
This package provides the so-called "compute-pageserver communicator",
|
||||
or just "communicator" in short. It runs in a PostgreSQL server, as
|
||||
part of the neon extension, and handles the communication with the
|
||||
pageservers. On the PostgreSQL side, the glue code in pgxn/neon/ uses
|
||||
the communicator to implement the PostgreSQL Storage Manager (SMGR)
|
||||
interface.
|
||||
|
||||
## Design criteria
|
||||
|
||||
- Low latency
|
||||
- Saturate a 10 Gbit / s network interface without becoming a bottleneck
|
||||
|
||||
## Source code view
|
||||
|
||||
pgxn/neon/communicator_new.c
|
||||
Contains the glue that interact with PostgreSQL code and the Rust
|
||||
communicator code.
|
||||
|
||||
pgxn/neon/communicator/src/backend_interface.rs
|
||||
The entry point for calls from each backend.
|
||||
|
||||
pgxn/neon/communicator/src/init.rs
|
||||
Initialization at server startup
|
||||
|
||||
pgxn/neon/communicator/src/worker_process/
|
||||
Worker process main loop and glue code
|
||||
|
||||
At compilation time, pgxn/neon/communicator/ produces a static
|
||||
library, libcommunicator.a. It is linked to the neon.so extension
|
||||
library.
|
||||
|
||||
The real networking code, which is independent of PostgreSQL, is in
|
||||
the pageserver/client_grpc crate.
|
||||
|
||||
## Process view
|
||||
|
||||
The communicator runs in a dedicated background worker process, the
|
||||
"communicator process". The communicator uses a multi-threaded Tokio
|
||||
runtime to execute the IO requests. So the communicator process has
|
||||
multiple threads running. That's unusual for Postgres processes and
|
||||
care must be taken to make that work.
|
||||
|
||||
### Backend <-> worker communication
|
||||
|
||||
Each backend has a number of I/O request slots in shared memory. The
|
||||
slots are statically allocated for each backend, and must not be
|
||||
accessed by other backends. The worker process reads requests from the
|
||||
shared memory slots, and writes responses back to the slots.
|
||||
|
||||
To submit an IO request, first pick one of your backend's free slots,
|
||||
and write the details of the IO request in the slot. Finally, update
|
||||
the 'state' field of the slot to Submitted. That informs the worker
|
||||
process that it can start processing the request. Once the state has
|
||||
been set to Submitted, the backend *must not* access the slot anymore,
|
||||
until the worker process sets its state to 'Completed'. In other
|
||||
words, each slot is owned by either the backend or the worker process
|
||||
at all times, and the 'state' field indicates who has ownership at the
|
||||
moment.
|
||||
|
||||
To inform the worker process that a request slot has a pending IO
|
||||
request, there's a pipe shared by the worker process and all backend
|
||||
processes. After you have changed the slot's state to Submitted, write
|
||||
the index of the request slot to the pipe. This wakes up the worker
|
||||
process.
|
||||
|
||||
(Note that the pipe is just used for wakeups, but the worker process
|
||||
is free to pick up Submitted IO requests even without receiving the
|
||||
wakeup. As of this writing, it doesn't do that, but it might be useful
|
||||
in the future to reduce latency even further, for example.)
|
||||
|
||||
When the worker process has completed processing the request, it
|
||||
writes the result back in the request slot. A GetPage request can also
|
||||
contain a pointer to buffer in the shared buffer cache. In that case,
|
||||
the worker process writes the resulting page contents directly to the
|
||||
buffer, and just a result code in the request slot. It then updates
|
||||
the 'state' field to Completed, which passes the owner ship back to
|
||||
the originating backend. Finally, it signals the process Latch of the
|
||||
originating backend, waking it up.
|
||||
|
||||
### Differences between PostgreSQL v16, v17 and v18
|
||||
|
||||
PostgreSQL v18 introduced the new AIO mechanism. The PostgreSQL AIO
|
||||
mechanism uses a very similar mechanism as described in the previous
|
||||
section, for the communication between AIO worker processes and
|
||||
backends. With our communicator, the AIO worker processes are not
|
||||
used, but we use the same PgAioHandle request slots as in upstream.
|
||||
For Neon-specific IO requests like GetDbSize, a neon request slot is
|
||||
used. But for the actual IO requests, the request slot merely contains
|
||||
a pointer to the PgAioHandle slot. The worker process updates the
|
||||
status of that, calls the IO callbacks upon completionetc, just like
|
||||
the upstream AIO worker processes do.
|
||||
|
||||
## Sequence diagram
|
||||
|
||||
neon
|
||||
PostgreSQL extension backend_interface.rs worker_process.rs processor tonic
|
||||
| . . . .
|
||||
| smgr_read() . . . .
|
||||
+-------------> + . . .
|
||||
. | . . .
|
||||
. | rcommunicator_ . . .
|
||||
. | get_page_at_lsn . . .
|
||||
. +------------------> + . .
|
||||
| . .
|
||||
| write request to . . .
|
||||
| slot . .
|
||||
| . .
|
||||
| . .
|
||||
| submit_request() . .
|
||||
+-----------------> + .
|
||||
| | .
|
||||
| | db_size_request . .
|
||||
+---------------->.
|
||||
. TODO
|
||||
|
||||
|
||||
|
||||
### Compute <-> pageserver protocol
|
||||
|
||||
The protocol between Compute and the pageserver is based on gRPC. See `protos/`.
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user