diff --git a/.github/ISSUE_TEMPLATE/epic-template.md b/.github/ISSUE_TEMPLATE/epic-template.md index 019e6e7345..c442f50fde 100644 --- a/.github/ISSUE_TEMPLATE/epic-template.md +++ b/.github/ISSUE_TEMPLATE/epic-template.md @@ -16,9 +16,9 @@ assignees: '' ## Implementation ideas - +## Tasks ```[tasklist] -### Tasks +- [ ] Example Task ``` diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 2e52e7c28f..276c71c6e0 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1132,11 +1132,9 @@ jobs: -f branch=main \ -f dockerTag=${{needs.tag.outputs.build-tag}} elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then - gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \ + gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \ -f deployPgSniRouter=true \ -f deployProxy=true \ - -f deployStorage=false \ - -f deployStorageBroker=false \ -f branch=main \ -f dockerTag=${{needs.tag.outputs.build-tag}} else diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 80a718d61a..b2c9a19588 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -97,7 +97,7 @@ jobs: **Please merge this Pull Request using 'Create a merge commit' button** EOF - gh pr create --title "Proxy release ${RELEASE_DATE}}" \ + gh pr create --title "Proxy release ${RELEASE_DATE}" \ --body-file "body.md" \ --head "${RELEASE_BRANCH}" \ --base "release-proxy" diff --git a/Cargo.lock b/Cargo.lock index c23162971e..7fd9053f62 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -241,7 +241,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -252,7 +252,7 @@ checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -626,7 +626,7 @@ dependencies = [ "once_cell", "pin-project-lite", "pin-utils", - "rustls", + "rustls 0.21.9", "tokio", "tracing", ] @@ -907,6 +907,16 @@ version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" +[[package]] +name = "bcder" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c627747a6774aab38beb35990d88309481378558875a41da1a4b2e373c906ef0" +dependencies = [ + "bytes", + "smallvec", +] + [[package]] name = "bincode" version = "1.3.3" @@ -935,7 +945,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.32", + "syn 2.0.52", "which", ] @@ -986,9 +996,9 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" [[package]] name = "bytes" -version = "1.4.0" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89b2fd2a0dcf38d7971e2194b6b6eebab45ae01067456a7fd93d5547a61b70be" +checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" dependencies = [ "serde", ] @@ -1149,7 +1159,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -1574,7 +1584,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -1585,7 +1595,7 @@ checksum = "29a358ff9f12ec09c3e61fef9b5a9902623a695a46a917b07f269bff1445611a" dependencies = [ "darling_core", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -1627,6 +1637,16 @@ dependencies = [ "zeroize", ] +[[package]] +name = "der" +version = "0.7.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fffa369a668c8af7dbf8b5e56c9f744fbd399949ed171606040001947de40b1c" +dependencies = [ + "const-oid", + "zeroize", +] + [[package]] name = "der-parser" version = "8.2.0" @@ -1681,7 +1701,7 @@ dependencies = [ "diesel_table_macro_syntax", "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -1701,7 +1721,7 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc5557efc453706fed5e4fa85006fe9817c224c3f480a34c7e5959fd700921c5" dependencies = [ - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -1723,7 +1743,7 @@ checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -1747,10 +1767,10 @@ version = "0.14.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "413301934810f597c1d19ca71c8710e99a3f1ba28a0d2ebc01551a2daeea3c5c" dependencies = [ - "der", + "der 0.6.1", "elliptic-curve", "rfc6979", - "signature", + "signature 1.6.4", ] [[package]] @@ -1767,7 +1787,7 @@ checksum = "e7bb888ab5300a19b8e5bceef25ac745ad065f3c9f7efc6de1b91958110891d3" dependencies = [ "base16ct", "crypto-bigint 0.4.9", - "der", + "der 0.6.1", "digest", "ff", "generic-array", @@ -1827,7 +1847,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -2087,7 +2107,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -2470,10 +2490,10 @@ dependencies = [ "http 0.2.9", "hyper", "log", - "rustls", + "rustls 0.21.9", "rustls-native-certs", "tokio", - "tokio-rustls", + "tokio-rustls 0.24.0", ] [[package]] @@ -2711,7 +2731,7 @@ checksum = "5c7ea04a7c5c055c175f189b6dc6ba036fd62306b58c66c9f6389036c503a3f4" dependencies = [ "base64 0.21.1", "js-sys", - "pem 3.0.3", + "pem", "ring 0.17.6", "serde", "serde_json", @@ -2959,9 +2979,9 @@ dependencies = [ [[package]] name = "mio" -version = "0.8.10" +version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f3d0b296e374a4e6f3c7b0a1f5a51d748a0d34c85e7dc48fc3fa9a87657fe09" +checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" dependencies = [ "libc", "log", @@ -3234,7 +3254,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -3716,7 +3736,7 @@ dependencies = [ "parquet", "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -3754,16 +3774,6 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" -[[package]] -name = "pem" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b13fe415cdf3c8e44518e18a7c95a13431d9bdf6d15367d82b23c377fdd441a" -dependencies = [ - "base64 0.21.1", - "serde", -] - [[package]] name = "pem" version = "3.0.3" @@ -3825,7 +3835,7 @@ checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -3846,8 +3856,8 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9eca2c590a5f85da82668fa685c09ce2888b9430e83299debf1f34b65fd4a4ba" dependencies = [ - "der", - "spki", + "der 0.6.1", + "spki 0.6.0", ] [[package]] @@ -3946,14 +3956,14 @@ dependencies = [ "futures", "once_cell", "pq_proto", - "rustls", - "rustls-pemfile", + "rustls 0.22.2", + "rustls-pemfile 2.1.1", "serde", "thiserror", "tokio", "tokio-postgres", "tokio-postgres-rustls", - "tokio-rustls", + "tokio-rustls 0.25.0", "tracing", "workspace_hack", ] @@ -4042,7 +4052,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1" dependencies = [ "proc-macro2", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -4053,9 +4063,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" [[package]] name = "proc-macro2" -version = "1.0.66" +version = "1.0.78" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9" +checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" dependencies = [ "unicode-ident", ] @@ -4202,8 +4212,8 @@ dependencies = [ "routerify", "rstest", "rustc-hash", - "rustls", - "rustls-pemfile", + "rustls 0.22.2", + "rustls-pemfile 2.1.1", "scopeguard", "serde", "serde_json", @@ -4216,11 +4226,10 @@ dependencies = [ "thiserror", "tikv-jemalloc-ctl", "tikv-jemallocator", - "tls-listener", "tokio", "tokio-postgres", "tokio-postgres-rustls", - "tokio-rustls", + "tokio-rustls 0.25.0", "tokio-util", "tracing", "tracing-opentelemetry", @@ -4248,9 +4257,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.32" +version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50f3b39ccfb720540debaa0164757101c08ecb8d326b15358ce76a62c7e85965" +checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" dependencies = [ "proc-macro2", ] @@ -4371,12 +4380,12 @@ dependencies = [ [[package]] name = "rcgen" -version = "0.11.1" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4954fbc00dcd4d8282c987710e50ba513d351400dbdd00e803a05172a90d8976" +checksum = "48406db8ac1f3cbc7dcdb56ec355343817958a356ff430259bb07baf7607e1e1" dependencies = [ - "pem 2.0.1", - "ring 0.16.20", + "pem", + "ring 0.17.6", "time", "yasna", ] @@ -4394,15 +4403,15 @@ dependencies = [ "itoa", "percent-encoding", "pin-project-lite", - "rustls", + "rustls 0.21.9", "rustls-native-certs", - "rustls-pemfile", + "rustls-pemfile 1.0.2", "rustls-webpki 0.101.7", "ryu", "sha1_smol", "socket2 0.4.9", "tokio", - "tokio-rustls", + "tokio-rustls 0.24.0", "tokio-util", "url", ] @@ -4548,14 +4557,14 @@ dependencies = [ "once_cell", "percent-encoding", "pin-project-lite", - "rustls", - "rustls-pemfile", + "rustls 0.21.9", + "rustls-pemfile 1.0.2", "serde", "serde_json", "serde_urlencoded", "tokio", "tokio-native-tls", - "tokio-rustls", + "tokio-rustls 0.24.0", "tokio-util", "tower-service", "url", @@ -4721,7 +4730,7 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn 2.0.32", + "syn 2.0.52", "unicode-ident", ] @@ -4805,6 +4814,20 @@ dependencies = [ "sct", ] +[[package]] +name = "rustls" +version = "0.22.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e87c9956bd9807afa1f77e0f7594af32566e830e088a5576d27c5b6f30f49d41" +dependencies = [ + "log", + "ring 0.17.6", + "rustls-pki-types", + "rustls-webpki 0.102.2", + "subtle", + "zeroize", +] + [[package]] name = "rustls-native-certs" version = "0.6.2" @@ -4812,7 +4835,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0167bac7a9f490495f3c33013e7722b53cb087ecbe082fb0c6387c96f634ea50" dependencies = [ "openssl-probe", - "rustls-pemfile", + "rustls-pemfile 1.0.2", "schannel", "security-framework", ] @@ -4826,6 +4849,22 @@ dependencies = [ "base64 0.21.1", ] +[[package]] +name = "rustls-pemfile" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f48172685e6ff52a556baa527774f61fcaa884f59daf3375c62a3f1cd2549dab" +dependencies = [ + "base64 0.21.1", + "rustls-pki-types", +] + +[[package]] +name = "rustls-pki-types" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ede67b28608b4c60685c7d54122d4400d90f62b40caee7700e700380a390fa8" + [[package]] name = "rustls-webpki" version = "0.100.2" @@ -4846,6 +4885,17 @@ dependencies = [ "untrusted 0.9.0", ] +[[package]] +name = "rustls-webpki" +version = "0.102.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "faaa0a62740bedb9b2ef5afa303da42764c012f743917351dc9a237ea1663610" +dependencies = [ + "ring 0.17.6", + "rustls-pki-types", + "untrusted 0.9.0", +] + [[package]] name = "rustversion" version = "1.0.12" @@ -4888,7 +4938,7 @@ dependencies = [ "serde_with", "thiserror", "tokio", - "tokio-rustls", + "tokio-rustls 0.25.0", "tokio-stream", "tracing", "tracing-appender", @@ -5023,7 +5073,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3be24c1842290c45df0a7bf069e0c268a747ad05a192f2fd7dcfdbc1cba40928" dependencies = [ "base16ct", - "der", + "der 0.6.1", "generic-array", "pkcs8", "subtle", @@ -5067,7 +5117,7 @@ checksum = "2e95efd0cefa32028cdb9766c96de71d96671072f9fb494dc9fb84c0ef93e52b" dependencies = [ "httpdate", "reqwest", - "rustls", + "rustls 0.21.9", "sentry-backtrace", "sentry-contexts", "sentry-core", @@ -5189,7 +5239,7 @@ checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -5270,7 +5320,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -5356,6 +5406,15 @@ dependencies = [ "rand_core 0.6.4", ] +[[package]] +name = "signature" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" +dependencies = [ + "rand_core 0.6.4", +] + [[package]] name = "simple_asn1" version = "0.6.2" @@ -5440,7 +5499,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "67cf02bbac7a337dc36e4f5a693db6c21e7863f45070f7064577eb4367a3212b" dependencies = [ "base64ct", - "der", + "der 0.6.1", +] + +[[package]] +name = "spki" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d" +dependencies = [ + "base64ct", + "der 0.7.8", ] [[package]] @@ -5526,9 +5595,9 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" [[package]] name = "svg_fmt" -version = "0.4.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fb1df15f412ee2e9dfc1c504260fa695c1c3f10fe9f4a6ee2d2184d7d6450e2" +checksum = "f83ba502a3265efb76efb89b0a2f7782ad6f2675015d4ce37e4b547dda42b499" [[package]] name = "syn" @@ -5543,9 +5612,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.32" +version = "2.0.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "239814284fd6f1a4ffe4ca893952cdd93c224b6a1571c9a9eadd670295c0c9e2" +checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07" dependencies = [ "proc-macro2", "quote", @@ -5660,22 +5729,22 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.47" +version = "1.0.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97a802ec30afc17eee47b2855fc72e0c4cd62be9b4efe6591edde0ec5bd68d8f" +checksum = "1e45bcbe8ed29775f228095caf2cd67af7a4ccf756ebff23a306bf3e8b47b24b" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.47" +version = "1.0.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bb623b56e39ab7dcd4b1b98bb6c8f8d907ed255b18de254088016b27a8ee19b" +checksum = "a953cb265bef375dae3de6663da4d3804eee9682ea80d8e2542529b73c531c81" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -5794,25 +5863,11 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" -[[package]] -name = "tls-listener" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81294c017957a1a69794f506723519255879e15a870507faf45dfed288b763dd" -dependencies = [ - "futures-util", - "hyper", - "pin-project-lite", - "thiserror", - "tokio", - "tokio-rustls", -] - [[package]] name = "tokio" -version = "1.34.0" +version = "1.36.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0c014766411e834f7af5b8f4cf46257aab4036ca95e9d2c144a10f59ad6f5b9" +checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931" dependencies = [ "backtrace", "bytes", @@ -5860,7 +5915,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -5898,16 +5953,17 @@ dependencies = [ [[package]] name = "tokio-postgres-rustls" -version = "0.10.0" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd5831152cb0d3f79ef5523b357319ba154795d64c7078b2daa95a803b54057f" +checksum = "0ea13f22eda7127c827983bdaf0d7fff9df21c8817bab02815ac277a21143677" dependencies = [ "futures", - "ring 0.16.20", - "rustls", + "ring 0.17.6", + "rustls 0.22.2", "tokio", "tokio-postgres", - "tokio-rustls", + "tokio-rustls 0.25.0", + "x509-certificate", ] [[package]] @@ -5916,7 +5972,18 @@ version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0d409377ff5b1e3ca6437aa86c1eb7d40c134bfec254e44c830defa92669db5" dependencies = [ - "rustls", + "rustls 0.21.9", + "tokio", +] + +[[package]] +name = "tokio-rustls" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "775e0c0f0adb3a2f22a00c4745d728b479985fc15ee7ca6a2608388c5569860f" +dependencies = [ + "rustls 0.22.2", + "rustls-pki-types", "tokio", ] @@ -6031,9 +6098,9 @@ dependencies = [ "pin-project", "prost", "rustls-native-certs", - "rustls-pemfile", + "rustls-pemfile 1.0.2", "tokio", - "tokio-rustls", + "tokio-rustls 0.24.0", "tokio-stream", "tower", "tower-layer", @@ -6129,7 +6196,7 @@ checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -6345,7 +6412,7 @@ dependencies = [ "base64 0.21.1", "log", "once_cell", - "rustls", + "rustls 0.21.9", "rustls-webpki 0.100.2", "url", "webpki-roots 0.23.1", @@ -6587,7 +6654,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", "wasm-bindgen-shared", ] @@ -6621,7 +6688,7 @@ checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -6954,19 +7021,18 @@ dependencies = [ "regex-automata 0.4.3", "regex-syntax 0.8.2", "reqwest", - "ring 0.16.20", - "rustls", + "rustls 0.21.9", "scopeguard", "serde", "serde_json", "smallvec", "subtle", "syn 1.0.109", - "syn 2.0.32", + "syn 2.0.52", "time", "time-macros", "tokio", - "tokio-rustls", + "tokio-rustls 0.24.0", "tokio-util", "toml_datetime", "toml_edit", @@ -6977,11 +7043,31 @@ dependencies = [ "tungstenite", "url", "uuid", + "zeroize", "zstd", "zstd-safe", "zstd-sys", ] +[[package]] +name = "x509-certificate" +version = "0.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66534846dec7a11d7c50a74b7cdb208b9a581cad890b7866430d438455847c85" +dependencies = [ + "bcder", + "bytes", + "chrono", + "der 0.7.8", + "hex", + "pem", + "ring 0.17.6", + "signature 2.2.0", + "spki 0.7.3", + "thiserror", + "zeroize", +] + [[package]] name = "x509-parser" version = "0.15.0" @@ -7040,7 +7126,7 @@ checksum = "b3c129550b3e6de3fd0ba67ba5c81818f9805e58b8d7fee80a3a59d2c9fc601a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -7048,6 +7134,20 @@ name = "zeroize" version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9" +dependencies = [ + "zeroize_derive", +] + +[[package]] +name = "zeroize_derive" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.52", +] [[package]] name = "zstd" diff --git a/Cargo.toml b/Cargo.toml index 90b02b30ec..76f4ff041c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -129,8 +129,8 @@ reqwest-retry = "0.2.2" routerify = "3" rpds = "0.13" rustc-hash = "1.1.0" -rustls = "0.21" -rustls-pemfile = "1" +rustls = "0.22" +rustls-pemfile = "2" rustls-split = "0.3" scopeguard = "1.1" sysinfo = "0.29.2" @@ -156,12 +156,11 @@ test-context = "0.1" thiserror = "1.0" tikv-jemallocator = "0.5" tikv-jemalloc-ctl = "0.5" -tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] } tokio = { version = "1.17", features = ["macros"] } tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" } tokio-io-timeout = "1.2.0" -tokio-postgres-rustls = "0.10.0" -tokio-rustls = "0.24" +tokio-postgres-rustls = "0.11.0" +tokio-rustls = "0.25" tokio-stream = "0.1" tokio-tar = "0.3" tokio-util = { version = "0.7.10", features = ["io", "rt"] } @@ -220,7 +219,7 @@ workspace_hack = { version = "0.1", path = "./workspace_hack/" } ## Build dependencies criterion = "0.5.1" -rcgen = "0.11" +rcgen = "0.12" rstest = "0.18" camino-tempfile = "1.0.2" tonic-build = "0.9" diff --git a/Dockerfile b/Dockerfile index 47954a671b..5f82df3e18 100644 --- a/Dockerfile +++ b/Dockerfile @@ -53,7 +53,7 @@ RUN set -e \ --bin pagectl \ --bin safekeeper \ --bin storage_broker \ - --bin attachment_service \ + --bin storage_controller \ --bin proxy \ --bin neon_local \ --locked --release \ @@ -81,7 +81,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin -COPY --from=build --chown=neon:neon /home/nonroot/target/release/attachment_service /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_controller /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin diff --git a/clippy.toml b/clippy.toml index d788afc84d..5f7dc66152 100644 --- a/clippy.toml +++ b/clippy.toml @@ -3,3 +3,10 @@ disallowed-methods = [ # Allow this for now, to deny it later once we stop using Handle::block_on completely # "tokio::runtime::Handle::block_on", ] + +disallowed-macros = [ + # use std::pin::pin + "futures::pin_mut", + # cannot disallow this, because clippy finds used from tokio macros + #"tokio::pin", +] diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index a82b999cfb..96ab4a06a5 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -17,6 +17,7 @@ use chrono::{DateTime, Utc}; use futures::future::join_all; use futures::stream::FuturesUnordered; use futures::StreamExt; +use postgres::error::SqlState; use postgres::{Client, NoTls}; use tracing::{debug, error, info, instrument, warn}; use utils::id::{TenantId, TimelineId}; @@ -395,9 +396,9 @@ impl ComputeNode { // Gets the basebackup in a retry loop #[instrument(skip_all, fields(%lsn))] pub fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> { - let mut retry_period_ms = 500; + let mut retry_period_ms = 500.0; let mut attempts = 0; - let max_attempts = 5; + let max_attempts = 10; loop { let result = self.try_get_basebackup(compute_state, lsn); match result { @@ -409,8 +410,8 @@ impl ComputeNode { "Failed to get basebackup: {} (attempt {}/{})", e, attempts, max_attempts ); - std::thread::sleep(std::time::Duration::from_millis(retry_period_ms)); - retry_period_ms *= 2; + std::thread::sleep(std::time::Duration::from_millis(retry_period_ms as u64)); + retry_period_ms *= 1.5; } Err(_) => { return result; @@ -763,6 +764,26 @@ impl ComputeNode { Ok((pg, logs_handle)) } + /// Do post configuration of the already started Postgres. This function spawns a background thread to + /// configure the database after applying the compute spec. Currently, it upgrades the neon extension + /// version. In the future, it may upgrade all 3rd-party extensions. + #[instrument(skip_all)] + pub fn post_apply_config(&self) -> Result<()> { + let connstr = self.connstr.clone(); + thread::spawn(move || { + let func = || { + let mut client = Client::connect(connstr.as_str(), NoTls)?; + handle_neon_extension_upgrade(&mut client) + .context("handle_neon_extension_upgrade")?; + Ok::<_, anyhow::Error>(()) + }; + if let Err(err) = func() { + error!("error while post_apply_config: {err:#}"); + } + }); + Ok(()) + } + /// Do initial configuration of the already started Postgres. #[instrument(skip_all)] pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> { @@ -774,27 +795,34 @@ impl ComputeNode { // but we can create a new one and grant it all privileges. let connstr = self.connstr.clone(); let mut client = match Client::connect(connstr.as_str(), NoTls) { - Err(e) => { - info!( - "cannot connect to postgres: {}, retrying with `zenith_admin` username", - e - ); - let mut zenith_admin_connstr = connstr.clone(); + Err(e) => match e.code() { + Some(&SqlState::INVALID_PASSWORD) + | Some(&SqlState::INVALID_AUTHORIZATION_SPECIFICATION) => { + // connect with zenith_admin if cloud_admin could not authenticate + info!( + "cannot connect to postgres: {}, retrying with `zenith_admin` username", + e + ); + let mut zenith_admin_connstr = connstr.clone(); - zenith_admin_connstr - .set_username("zenith_admin") - .map_err(|_| anyhow::anyhow!("invalid connstr"))?; + zenith_admin_connstr + .set_username("zenith_admin") + .map_err(|_| anyhow::anyhow!("invalid connstr"))?; - let mut client = Client::connect(zenith_admin_connstr.as_str(), NoTls)?; - // Disable forwarding so that users don't get a cloud_admin role - client.simple_query("SET neon.forward_ddl = false")?; - client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?; - client.simple_query("GRANT zenith_admin TO cloud_admin")?; - drop(client); + let mut client = + Client::connect(zenith_admin_connstr.as_str(), NoTls) + .context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?; + // Disable forwarding so that users don't get a cloud_admin role + client.simple_query("SET neon.forward_ddl = false")?; + client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?; + client.simple_query("GRANT zenith_admin TO cloud_admin")?; + drop(client); - // reconnect with connstring with expected name - Client::connect(connstr.as_str(), NoTls)? - } + // reconnect with connstring with expected name + Client::connect(connstr.as_str(), NoTls)? + } + _ => return Err(e.into()), + }, Ok(client) => client, }; @@ -990,18 +1018,21 @@ impl ComputeNode { let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?; let config_time = Utc::now(); - if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates { - let pgdata_path = Path::new(&self.pgdata); - // temporarily reset max_cluster_size in config - // to avoid the possibility of hitting the limit, while we are applying config: - // creating new extensions, roles, etc... - config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?; - self.pg_reload_conf()?; + if pspec.spec.mode == ComputeMode::Primary { + if !pspec.spec.skip_pg_catalog_updates { + let pgdata_path = Path::new(&self.pgdata); + // temporarily reset max_cluster_size in config + // to avoid the possibility of hitting the limit, while we are applying config: + // creating new extensions, roles, etc... + config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?; + self.pg_reload_conf()?; - self.apply_config(&compute_state)?; + self.apply_config(&compute_state)?; - config::compute_ctl_temp_override_remove(pgdata_path)?; - self.pg_reload_conf()?; + config::compute_ctl_temp_override_remove(pgdata_path)?; + self.pg_reload_conf()?; + } + self.post_apply_config()?; } let startup_end_time = Utc::now(); diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index d5fd2c9462..ba3a84cda8 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -302,9 +302,9 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { RoleAction::Create => { // This branch only runs when roles are created through the console, so it is // safe to add more permissions here. BYPASSRLS and REPLICATION are inherited - // from neon_superuser. + // from neon_superuser. (NOTE: REPLICATION has been removed from here for now). let mut query: String = format!( - "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser", + "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser", name.pg_quote() ); info!("running role create query: '{}'", &query); @@ -744,7 +744,17 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> { // - extension was just installed // - extension was already installed and is up to date let query = "ALTER EXTENSION neon UPDATE"; - info!("update neon extension schema with query: {}", query); + info!("update neon extension version with query: {}", query); + client.simple_query(query)?; + + Ok(()) +} + +#[instrument(skip_all)] +pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> { + info!("handle neon extension upgrade"); + let query = "ALTER EXTENSION neon UPDATE"; + info!("update neon extension version with query: {}", query); client.simple_query(query)?; Ok(()) @@ -795,6 +805,18 @@ $$;"#, "", "", // Add new migrations below. + r#" +DO $$ +DECLARE + role_name TEXT; +BEGIN + FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE + LOOP + RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name); + EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION'; + END LOOP; +END +$$;"#, ]; let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml index bfdfd4c77d..a5fad7216c 100644 --- a/control_plane/attachment_service/Cargo.toml +++ b/control_plane/attachment_service/Cargo.toml @@ -4,6 +4,10 @@ version = "0.1.0" edition.workspace = true license.workspace = true +[[bin]] +name = "storage_controller" +path = "src/main.rs" + [features] default = [] # Enables test-only APIs and behaviors diff --git a/control_plane/attachment_service/src/compute_hook.rs b/control_plane/attachment_service/src/compute_hook.rs index b5e90491c6..bebc62ac2f 100644 --- a/control_plane/attachment_service/src/compute_hook.rs +++ b/control_plane/attachment_service/src/compute_hook.rs @@ -3,7 +3,7 @@ use std::{collections::HashMap, time::Duration}; use control_plane::endpoint::{ComputeControlPlane, EndpointStatus}; use control_plane::local_env::LocalEnv; use hyper::{Method, StatusCode}; -use pageserver_api::shard::{ShardIndex, ShardNumber, TenantShardId}; +use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShardId}; use postgres_connection::parse_host_port; use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; @@ -19,8 +19,66 @@ const SLOWDOWN_DELAY: Duration = Duration::from_secs(5); pub(crate) const API_CONCURRENCY: usize = 32; -pub(super) struct ComputeHookTenant { - shards: Vec<(ShardIndex, NodeId)>, +struct ShardedComputeHookTenant { + stripe_size: ShardStripeSize, + shard_count: ShardCount, + shards: Vec<(ShardNumber, NodeId)>, +} + +enum ComputeHookTenant { + Unsharded(NodeId), + Sharded(ShardedComputeHookTenant), +} + +impl ComputeHookTenant { + /// Construct with at least one shard's information + fn new(tenant_shard_id: TenantShardId, stripe_size: ShardStripeSize, node_id: NodeId) -> Self { + if tenant_shard_id.shard_count.count() > 1 { + Self::Sharded(ShardedComputeHookTenant { + shards: vec![(tenant_shard_id.shard_number, node_id)], + stripe_size, + shard_count: tenant_shard_id.shard_count, + }) + } else { + Self::Unsharded(node_id) + } + } + + /// Set one shard's location. If stripe size or shard count have changed, Self is reset + /// and drops existing content. + fn update( + &mut self, + tenant_shard_id: TenantShardId, + stripe_size: ShardStripeSize, + node_id: NodeId, + ) { + match self { + Self::Unsharded(existing_node_id) if tenant_shard_id.shard_count.count() == 1 => { + *existing_node_id = node_id + } + Self::Sharded(sharded_tenant) + if sharded_tenant.stripe_size == stripe_size + && sharded_tenant.shard_count == tenant_shard_id.shard_count => + { + if let Some(existing) = sharded_tenant + .shards + .iter() + .position(|s| s.0 == tenant_shard_id.shard_number) + { + sharded_tenant.shards.get_mut(existing).unwrap().1 = node_id; + } else { + sharded_tenant + .shards + .push((tenant_shard_id.shard_number, node_id)); + sharded_tenant.shards.sort_by_key(|s| s.0) + } + } + _ => { + // Shard count changed: reset struct. + *self = Self::new(tenant_shard_id, stripe_size, node_id); + } + } + } } #[derive(Serialize, Deserialize, Debug)] @@ -33,6 +91,7 @@ struct ComputeHookNotifyRequestShard { #[derive(Serialize, Deserialize, Debug)] struct ComputeHookNotifyRequest { tenant_id: TenantId, + stripe_size: Option, shards: Vec, } @@ -63,42 +122,43 @@ pub(crate) enum NotifyError { } impl ComputeHookTenant { - async fn maybe_reconfigure(&mut self, tenant_id: TenantId) -> Option { - // Find the highest shard count and drop any shards that aren't - // for that shard count. - let shard_count = self.shards.iter().map(|(k, _v)| k.shard_count).max(); - let Some(shard_count) = shard_count else { - // No shards, nothing to do. - tracing::info!("ComputeHookTenant::maybe_reconfigure: no shards"); - return None; - }; - - self.shards.retain(|(k, _v)| k.shard_count == shard_count); - self.shards - .sort_by_key(|(shard, _node_id)| shard.shard_number); - - if self.shards.len() == shard_count.count() as usize || shard_count.is_unsharded() { - // We have pageservers for all the shards: emit a configuration update - return Some(ComputeHookNotifyRequest { + fn maybe_reconfigure(&self, tenant_id: TenantId) -> Option { + match self { + Self::Unsharded(node_id) => Some(ComputeHookNotifyRequest { tenant_id, - shards: self - .shards - .iter() - .map(|(shard, node_id)| ComputeHookNotifyRequestShard { - shard_number: shard.shard_number, - node_id: *node_id, - }) - .collect(), - }); - } else { - tracing::info!( - "ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})", - self.shards.len(), - shard_count.count() - ); - } + shards: vec![ComputeHookNotifyRequestShard { + shard_number: ShardNumber(0), + node_id: *node_id, + }], + stripe_size: None, + }), + Self::Sharded(sharded_tenant) + if sharded_tenant.shards.len() == sharded_tenant.shard_count.count() as usize => + { + Some(ComputeHookNotifyRequest { + tenant_id, + shards: sharded_tenant + .shards + .iter() + .map(|(shard_number, node_id)| ComputeHookNotifyRequestShard { + shard_number: *shard_number, + node_id: *node_id, + }) + .collect(), + stripe_size: Some(sharded_tenant.stripe_size), + }) + } + Self::Sharded(sharded_tenant) => { + // Sharded tenant doesn't yet have information for all its shards - None + tracing::info!( + "ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})", + sharded_tenant.shards.len(), + sharded_tenant.shard_count.count() + ); + None + } + } } } @@ -139,7 +199,11 @@ impl ComputeHook { }; let cplane = ComputeControlPlane::load(env.clone()).expect("Error loading compute control plane"); - let ComputeHookNotifyRequest { tenant_id, shards } = reconfigure_request; + let ComputeHookNotifyRequest { + tenant_id, + shards, + stripe_size, + } = reconfigure_request; let compute_pageservers = shards .into_iter() @@ -156,7 +220,9 @@ impl ComputeHook { for (endpoint_name, endpoint) in &cplane.endpoints { if endpoint.tenant_id == tenant_id && endpoint.status() == EndpointStatus::Running { tracing::info!("Reconfiguring endpoint {}", endpoint_name,); - endpoint.reconfigure(compute_pageservers.clone()).await?; + endpoint + .reconfigure(compute_pageservers.clone(), stripe_size) + .await?; } } @@ -271,30 +337,26 @@ impl ComputeHook { &self, tenant_shard_id: TenantShardId, node_id: NodeId, + stripe_size: ShardStripeSize, cancel: &CancellationToken, ) -> Result<(), NotifyError> { let mut locked = self.state.lock().await; - let entry = locked - .entry(tenant_shard_id.tenant_id) - .or_insert_with(|| ComputeHookTenant { shards: Vec::new() }); - let shard_index = ShardIndex { - shard_count: tenant_shard_id.shard_count, - shard_number: tenant_shard_id.shard_number, + use std::collections::hash_map::Entry; + let tenant = match locked.entry(tenant_shard_id.tenant_id) { + Entry::Vacant(e) => e.insert(ComputeHookTenant::new( + tenant_shard_id, + stripe_size, + node_id, + )), + Entry::Occupied(e) => { + let tenant = e.into_mut(); + tenant.update(tenant_shard_id, stripe_size, node_id); + tenant + } }; - let mut set = false; - for (existing_shard, existing_node) in &mut entry.shards { - if *existing_shard == shard_index { - *existing_node = node_id; - set = true; - } - } - if !set { - entry.shards.push((shard_index, node_id)); - } - - let reconfigure_request = entry.maybe_reconfigure(tenant_shard_id.tenant_id).await; + let reconfigure_request = tenant.maybe_reconfigure(tenant_shard_id.tenant_id); let Some(reconfigure_request) = reconfigure_request else { // The tenant doesn't yet have pageservers for all its shards: we won't notify anything // until it does. @@ -316,3 +378,85 @@ impl ComputeHook { } } } + +#[cfg(test)] +pub(crate) mod tests { + use pageserver_api::shard::{ShardCount, ShardNumber}; + use utils::id::TenantId; + + use super::*; + + #[test] + fn tenant_updates() -> anyhow::Result<()> { + let tenant_id = TenantId::generate(); + let mut tenant_state = ComputeHookTenant::new( + TenantShardId { + tenant_id, + shard_count: ShardCount::new(0), + shard_number: ShardNumber(0), + }, + ShardStripeSize(12345), + NodeId(1), + ); + + // An unsharded tenant is always ready to emit a notification + assert!(tenant_state.maybe_reconfigure(tenant_id).is_some()); + assert_eq!( + tenant_state + .maybe_reconfigure(tenant_id) + .unwrap() + .shards + .len(), + 1 + ); + assert!(tenant_state + .maybe_reconfigure(tenant_id) + .unwrap() + .stripe_size + .is_none()); + + // Writing the first shard of a multi-sharded situation (i.e. in a split) + // resets the tenant state and puts it in an non-notifying state (need to + // see all shards) + tenant_state.update( + TenantShardId { + tenant_id, + shard_count: ShardCount::new(2), + shard_number: ShardNumber(1), + }, + ShardStripeSize(32768), + NodeId(1), + ); + assert!(tenant_state.maybe_reconfigure(tenant_id).is_none()); + + // Writing the second shard makes it ready to notify + tenant_state.update( + TenantShardId { + tenant_id, + shard_count: ShardCount::new(2), + shard_number: ShardNumber(0), + }, + ShardStripeSize(32768), + NodeId(1), + ); + + assert!(tenant_state.maybe_reconfigure(tenant_id).is_some()); + assert_eq!( + tenant_state + .maybe_reconfigure(tenant_id) + .unwrap() + .shards + .len(), + 2 + ); + assert_eq!( + tenant_state + .maybe_reconfigure(tenant_id) + .unwrap() + .stripe_size, + Some(ShardStripeSize(32768)) + ); + + Ok(()) + } +} diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs index 384bdcef0c..7e4030b221 100644 --- a/control_plane/attachment_service/src/http.rs +++ b/control_plane/attachment_service/src/http.rs @@ -1,6 +1,5 @@ use crate::reconciler::ReconcileError; use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT}; -use crate::PlacementPolicy; use hyper::{Body, Request, Response}; use hyper::{StatusCode, Uri}; use pageserver_api::models::{ @@ -119,13 +118,9 @@ async fn handle_tenant_create( let create_req = json_request::(&mut req).await?; - // TODO: enable specifying this. Using Single as a default helps legacy tests to work (they - // have no expectation of HA). - let placement_policy = PlacementPolicy::Single; - json_response( StatusCode::CREATED, - service.tenant_create(create_req, placement_policy).await?, + service.tenant_create(create_req).await?, ) } diff --git a/control_plane/attachment_service/src/lib.rs b/control_plane/attachment_service/src/lib.rs index 7ae7e264c7..796b465c10 100644 --- a/control_plane/attachment_service/src/lib.rs +++ b/control_plane/attachment_service/src/lib.rs @@ -1,4 +1,4 @@ -use serde::{Deserialize, Serialize}; +use serde::Serialize; use utils::seqwait::MonotonicCounter; mod auth; @@ -13,23 +13,6 @@ mod schema; pub mod service; mod tenant_state; -#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)] -enum PlacementPolicy { - /// Cheapest way to attach a tenant: just one pageserver, no secondary - Single, - /// Production-ready way to attach a tenant: one attached pageserver and - /// some number of secondaries. - Double(usize), - /// Create one secondary mode locations. This is useful when onboarding - /// a tenant, or for an idle tenant that we might want to bring online quickly. - Secondary, - - /// Do not attach to any pageservers. This is appropriate for tenants that - /// have been idle for a long time, where we do not mind some delay in making - /// them available in future. - Detached, -} - #[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Serialize)] struct Sequence(u64); @@ -66,9 +49,3 @@ impl Sequence { Sequence(self.0 + 1) } } - -impl Default for PlacementPolicy { - fn default() -> Self { - PlacementPolicy::Double(1) - } -} diff --git a/control_plane/attachment_service/src/node.rs b/control_plane/attachment_service/src/node.rs index 1f9dcef033..27b03608fa 100644 --- a/control_plane/attachment_service/src/node.rs +++ b/control_plane/attachment_service/src/node.rs @@ -1,6 +1,16 @@ -use pageserver_api::controller_api::{NodeAvailability, NodeSchedulingPolicy}; +use std::{str::FromStr, time::Duration}; + +use hyper::StatusCode; +use pageserver_api::{ + controller_api::{ + NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, TenantLocateResponseShard, + }, + shard::TenantShardId, +}; +use pageserver_client::mgmt_api; use serde::Serialize; -use utils::id::NodeId; +use tokio_util::sync::CancellationToken; +use utils::{backoff, id::NodeId}; use crate::persistence::NodePersistence; @@ -12,16 +22,29 @@ use crate::persistence::NodePersistence; /// implementation of serialization on this type is only for debug dumps. #[derive(Clone, Serialize)] pub(crate) struct Node { - pub(crate) id: NodeId, + id: NodeId, - pub(crate) availability: NodeAvailability, - pub(crate) scheduling: NodeSchedulingPolicy, + availability: NodeAvailability, + scheduling: NodeSchedulingPolicy, - pub(crate) listen_http_addr: String, - pub(crate) listen_http_port: u16, + listen_http_addr: String, + listen_http_port: u16, - pub(crate) listen_pg_addr: String, - pub(crate) listen_pg_port: u16, + listen_pg_addr: String, + listen_pg_port: u16, + + // This cancellation token means "stop any RPCs in flight to this node, and don't start + // any more". It is not related to process shutdown. + #[serde(skip)] + cancel: CancellationToken, +} + +/// When updating [`Node::availability`] we use this type to indicate to the caller +/// whether/how they changed it. +pub(crate) enum AvailabilityTransition { + ToActive, + ToOffline, + Unchanged, } impl Node { @@ -29,6 +52,71 @@ impl Node { format!("http://{}:{}", self.listen_http_addr, self.listen_http_port) } + pub(crate) fn get_id(&self) -> NodeId { + self.id + } + + pub(crate) fn set_scheduling(&mut self, scheduling: NodeSchedulingPolicy) { + self.scheduling = scheduling + } + + /// Does this registration request match `self`? This is used when deciding whether a registration + /// request should be allowed to update an existing record with the same node ID. + pub(crate) fn registration_match(&self, register_req: &NodeRegisterRequest) -> bool { + self.id == register_req.node_id + && self.listen_http_addr == register_req.listen_http_addr + && self.listen_http_port == register_req.listen_http_port + && self.listen_pg_addr == register_req.listen_pg_addr + && self.listen_pg_port == register_req.listen_pg_port + } + + /// For a shard located on this node, populate a response object + /// with this node's address information. + pub(crate) fn shard_location(&self, shard_id: TenantShardId) -> TenantLocateResponseShard { + TenantLocateResponseShard { + shard_id, + node_id: self.id, + listen_http_addr: self.listen_http_addr.clone(), + listen_http_port: self.listen_http_port, + listen_pg_addr: self.listen_pg_addr.clone(), + listen_pg_port: self.listen_pg_port, + } + } + + pub(crate) fn set_availability( + &mut self, + availability: NodeAvailability, + ) -> AvailabilityTransition { + use NodeAvailability::*; + let transition = match (self.availability, availability) { + (Offline, Active) => { + // Give the node a new cancellation token, effectively resetting it to un-cancelled. Any + // users of previously-cloned copies of the node will still see the old cancellation + // state. For example, Reconcilers in flight will have to complete and be spawned + // again to realize that the node has become available. + self.cancel = CancellationToken::new(); + AvailabilityTransition::ToActive + } + (Active, Offline) => { + // Fire the node's cancellation token to cancel any in-flight API requests to it + self.cancel.cancel(); + AvailabilityTransition::ToOffline + } + _ => AvailabilityTransition::Unchanged, + }; + self.availability = availability; + transition + } + + /// Whether we may send API requests to this node. + pub(crate) fn is_available(&self) -> bool { + // When we clone a node, [`Self::availability`] is a snapshot, but [`Self::cancel`] holds + // a reference to the original Node's cancellation status. Checking both of these results + // in a "pessimistic" check where we will consider a Node instance unavailable if it was unavailable + // when we cloned it, or if the original Node instance's cancellation token was fired. + matches!(self.availability, NodeAvailability::Active) && !self.cancel.is_cancelled() + } + /// Is this node elegible to have work scheduled onto it? pub(crate) fn may_schedule(&self) -> bool { match self.availability { @@ -44,6 +132,26 @@ impl Node { } } + pub(crate) fn new( + id: NodeId, + listen_http_addr: String, + listen_http_port: u16, + listen_pg_addr: String, + listen_pg_port: u16, + ) -> Self { + Self { + id, + listen_http_addr, + listen_http_port, + listen_pg_addr, + listen_pg_port, + scheduling: NodeSchedulingPolicy::Filling, + // TODO: we shouldn't really call this Active until we've heartbeated it. + availability: NodeAvailability::Active, + cancel: CancellationToken::new(), + } + } + pub(crate) fn to_persistent(&self) -> NodePersistence { NodePersistence { node_id: self.id.0 as i64, @@ -54,4 +162,96 @@ impl Node { listen_pg_port: self.listen_pg_port as i32, } } + + pub(crate) fn from_persistent(np: NodePersistence) -> Self { + Self { + id: NodeId(np.node_id as u64), + // At startup we consider a node offline until proven otherwise. + availability: NodeAvailability::Offline, + scheduling: NodeSchedulingPolicy::from_str(&np.scheduling_policy) + .expect("Bad scheduling policy in DB"), + listen_http_addr: np.listen_http_addr, + listen_http_port: np.listen_http_port as u16, + listen_pg_addr: np.listen_pg_addr, + listen_pg_port: np.listen_pg_port as u16, + cancel: CancellationToken::new(), + } + } + + /// Wrapper for issuing requests to pageserver management API: takes care of generic + /// retry/backoff for retryable HTTP status codes. + /// + /// This will return None to indicate cancellation. Cancellation may happen from + /// the cancellation token passed in, or from Self's cancellation token (i.e. node + /// going offline). + pub(crate) async fn with_client_retries( + &self, + mut op: O, + jwt: &Option, + warn_threshold: u32, + max_retries: u32, + timeout: Duration, + cancel: &CancellationToken, + ) -> Option> + where + O: FnMut(mgmt_api::Client) -> F, + F: std::future::Future>, + { + fn is_fatal(e: &mgmt_api::Error) -> bool { + use mgmt_api::Error::*; + match e { + ReceiveBody(_) | ReceiveErrorBody(_) => false, + ApiError(StatusCode::SERVICE_UNAVAILABLE, _) + | ApiError(StatusCode::GATEWAY_TIMEOUT, _) + | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false, + ApiError(_, _) => true, + Cancelled => true, + } + } + + backoff::retry( + || { + let http_client = reqwest::ClientBuilder::new() + .timeout(timeout) + .build() + .expect("Failed to construct HTTP client"); + + let client = + mgmt_api::Client::from_client(http_client, self.base_url(), jwt.as_deref()); + + let node_cancel_fut = self.cancel.cancelled(); + + let op_fut = op(client); + + async { + tokio::select! { + r = op_fut=> {r}, + _ = node_cancel_fut => { + Err(mgmt_api::Error::Cancelled) + }} + } + }, + is_fatal, + warn_threshold, + max_retries, + &format!( + "Call to node {} ({}:{}) management API", + self.id, self.listen_http_addr, self.listen_http_port + ), + cancel, + ) + .await + } +} + +impl std::fmt::Display for Node { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{} ({})", self.id, self.listen_http_addr) + } +} + +impl std::fmt::Debug for Node { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{} ({})", self.id, self.listen_http_addr) + } } diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs index d5c304385c..d5c6d74ebe 100644 --- a/control_plane/attachment_service/src/persistence.rs +++ b/control_plane/attachment_service/src/persistence.rs @@ -7,11 +7,9 @@ use self::split_state::SplitState; use camino::Utf8Path; use camino::Utf8PathBuf; use diesel::pg::PgConnection; -use diesel::{ - Connection, ExpressionMethods, Insertable, QueryDsl, QueryResult, Queryable, RunQueryDsl, - Selectable, SelectableHelper, -}; -use pageserver_api::controller_api::NodeSchedulingPolicy; +use diesel::prelude::*; +use diesel::Connection; +use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy}; use pageserver_api::models::TenantConfig; use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId}; use serde::{Deserialize, Serialize}; @@ -19,7 +17,6 @@ use utils::generation::Generation; use utils::id::{NodeId, TenantId}; use crate::node::Node; -use crate::PlacementPolicy; /// ## What do we store? /// @@ -210,7 +207,7 @@ impl Persistence { tenant.tenant_id = tenant_id.to_string(); tenant.config = serde_json::to_string(&TenantConfig::default()) .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?; - tenant.placement_policy = serde_json::to_string(&PlacementPolicy::default()) + tenant.placement_policy = serde_json::to_string(&PlacementPolicy::Single) .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?; } } diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs index b633b217c7..603da9bf02 100644 --- a/control_plane/attachment_service/src/reconciler.rs +++ b/control_plane/attachment_service/src/reconciler.rs @@ -1,6 +1,5 @@ use crate::persistence::Persistence; use crate::service; -use pageserver_api::controller_api::NodeAvailability; use pageserver_api::models::{ LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, }; @@ -28,15 +27,16 @@ pub(super) struct Reconciler { pub(crate) shard: ShardIdentity, pub(crate) generation: Option, pub(crate) intent: TargetState, + + /// Nodes not referenced by [`Self::intent`], from which we should try + /// to detach this tenant shard. + pub(crate) detach: Vec, + pub(crate) config: TenantConfig, pub(crate) observed: ObservedState, pub(crate) service_config: service::Config, - /// A snapshot of the pageservers as they were when we were asked - /// to reconcile. - pub(crate) pageservers: Arc>, - /// A hook to notify the running postgres instances when we change the location /// of a tenant. Use this via [`Self::compute_notify`] to update our failure flag /// and guarantee eventual retries. @@ -67,29 +67,37 @@ pub(super) struct Reconciler { /// and the TargetState is just the instruction for a particular Reconciler run. #[derive(Debug)] pub(crate) struct TargetState { - pub(crate) attached: Option, - pub(crate) secondary: Vec, + pub(crate) attached: Option, + pub(crate) secondary: Vec, } impl TargetState { - pub(crate) fn from_intent(intent: &IntentState) -> Self { + pub(crate) fn from_intent(nodes: &HashMap, intent: &IntentState) -> Self { Self { - attached: *intent.get_attached(), - secondary: intent.get_secondary().clone(), + attached: intent.get_attached().map(|n| { + nodes + .get(&n) + .expect("Intent attached referenced non-existent node") + .clone() + }), + secondary: intent + .get_secondary() + .iter() + .map(|n| { + nodes + .get(n) + .expect("Intent secondary referenced non-existent node") + .clone() + }) + .collect(), } } - - fn all_pageservers(&self) -> Vec { - let mut result = self.secondary.clone(); - if let Some(node_id) = &self.attached { - result.push(*node_id); - } - result - } } #[derive(thiserror::Error, Debug)] pub(crate) enum ReconcileError { + #[error(transparent)] + Remote(#[from] mgmt_api::Error), #[error(transparent)] Notify(#[from] NotifyError), #[error("Cancelled")] @@ -101,44 +109,83 @@ pub(crate) enum ReconcileError { impl Reconciler { async fn location_config( &mut self, - node_id: NodeId, + node: &Node, config: LocationConfig, flush_ms: Option, - ) -> anyhow::Result<()> { - let node = self - .pageservers - .get(&node_id) - .expect("Pageserver may not be removed while referenced"); + lazy: bool, + ) -> Result<(), ReconcileError> { + self.observed + .locations + .insert(node.get_id(), ObservedStateLocation { conf: None }); + + // TODO: amend locations that use long-polling: they will hit this timeout. + let timeout = Duration::from_secs(25); + + tracing::info!("location_config({node}) calling: {:?}", config); + let tenant_shard_id = self.tenant_shard_id; + let config_ref = &config; + match node + .with_client_retries( + |client| async move { + let config = config_ref.clone(); + client + .location_config(tenant_shard_id, config.clone(), flush_ms, lazy) + .await + }, + &self.service_config.jwt_token, + 1, + 3, + timeout, + &self.cancel, + ) + .await + { + Some(Ok(_)) => {} + Some(Err(e)) => return Err(e.into()), + None => return Err(ReconcileError::Cancel), + }; + tracing::info!("location_config({node}) complete: {:?}", config); self.observed .locations - .insert(node.id, ObservedStateLocation { conf: None }); - - tracing::info!("location_config({}) calling: {:?}", node_id, config); - let client = - mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref()); - client - .location_config(self.tenant_shard_id, config.clone(), flush_ms) - .await?; - tracing::info!("location_config({}) complete: {:?}", node_id, config); - - self.observed - .locations - .insert(node.id, ObservedStateLocation { conf: Some(config) }); + .insert(node.get_id(), ObservedStateLocation { conf: Some(config) }); Ok(()) } + fn get_node(&self, node_id: &NodeId) -> Option<&Node> { + if let Some(node) = self.intent.attached.as_ref() { + if node.get_id() == *node_id { + return Some(node); + } + } + + if let Some(node) = self + .intent + .secondary + .iter() + .find(|n| n.get_id() == *node_id) + { + return Some(node); + } + + if let Some(node) = self.detach.iter().find(|n| n.get_id() == *node_id) { + return Some(node); + } + + None + } + async fn maybe_live_migrate(&mut self) -> Result<(), ReconcileError> { - let destination = if let Some(node_id) = self.intent.attached { - match self.observed.locations.get(&node_id) { + let destination = if let Some(node) = &self.intent.attached { + match self.observed.locations.get(&node.get_id()) { Some(conf) => { // We will do a live migration only if the intended destination is not // currently in an attached state. match &conf.conf { Some(conf) if conf.mode == LocationConfigMode::Secondary => { // Fall through to do a live migration - node_id + node } None | Some(_) => { // Attached or uncertain: don't do a live migration, proceed @@ -151,7 +198,7 @@ impl Reconciler { None => { // Our destination is not attached: maybe live migrate if some other // node is currently attached. Fall through. - node_id + node } } } else { @@ -164,15 +211,13 @@ impl Reconciler { for (node_id, state) in &self.observed.locations { if let Some(observed_conf) = &state.conf { if observed_conf.mode == LocationConfigMode::AttachedSingle { - let node = self - .pageservers - .get(node_id) - .expect("Nodes may not be removed while referenced"); // We will only attempt live migration if the origin is not offline: this // avoids trying to do it while reconciling after responding to an HA failover. - if !matches!(node.availability, NodeAvailability::Offline) { - origin = Some(*node_id); - break; + if let Some(node) = self.get_node(node_id) { + if node.is_available() { + origin = Some(node.clone()); + break; + } } } } @@ -185,7 +230,7 @@ impl Reconciler { // We have an origin and a destination: proceed to do the live migration tracing::info!("Live migrating {}->{}", origin, destination); - self.live_migrate(origin, destination).await?; + self.live_migrate(origin, destination.clone()).await?; Ok(()) } @@ -193,13 +238,8 @@ impl Reconciler { async fn get_lsns( &self, tenant_shard_id: TenantShardId, - node_id: &NodeId, + node: &Node, ) -> anyhow::Result> { - let node = self - .pageservers - .get(node_id) - .expect("Pageserver may not be removed while referenced"); - let client = mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref()); @@ -210,19 +250,27 @@ impl Reconciler { .collect()) } - async fn secondary_download(&self, tenant_shard_id: TenantShardId, node_id: &NodeId) { - let node = self - .pageservers - .get(node_id) - .expect("Pageserver may not be removed while referenced"); - - let client = - mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref()); - - match client.tenant_secondary_download(tenant_shard_id).await { - Ok(()) => {} - Err(_) => { - tracing::info!(" (skipping, destination wasn't in secondary mode)") + async fn secondary_download( + &self, + tenant_shard_id: TenantShardId, + node: &Node, + ) -> Result<(), ReconcileError> { + match node + .with_client_retries( + |client| async move { client.tenant_secondary_download(tenant_shard_id).await }, + &self.service_config.jwt_token, + 1, + 1, + Duration::from_secs(60), + &self.cancel, + ) + .await + { + None => Err(ReconcileError::Cancel), + Some(Ok(_)) => Ok(()), + Some(Err(e)) => { + tracing::info!(" (skipping destination download: {})", e); + Ok(()) } } } @@ -230,17 +278,14 @@ impl Reconciler { async fn await_lsn( &self, tenant_shard_id: TenantShardId, - pageserver_id: &NodeId, + node: &Node, baseline: HashMap, ) -> anyhow::Result<()> { loop { - let latest = match self.get_lsns(tenant_shard_id, pageserver_id).await { + let latest = match self.get_lsns(tenant_shard_id, node).await { Ok(l) => l, Err(e) => { - println!( - "🕑 Can't get LSNs on pageserver {} yet, waiting ({e})", - pageserver_id - ); + tracing::info!("🕑 Can't get LSNs on node {node} yet, waiting ({e})",); std::thread::sleep(Duration::from_millis(500)); continue; } @@ -250,7 +295,7 @@ impl Reconciler { for (timeline_id, baseline_lsn) in &baseline { match latest.get(timeline_id) { Some(latest_lsn) => { - println!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}"); + tracing::info!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}"); if latest_lsn < baseline_lsn { any_behind = true; } @@ -265,7 +310,7 @@ impl Reconciler { } if !any_behind { - println!("✅ LSN caught up. Proceeding..."); + tracing::info!("✅ LSN caught up. Proceeding..."); break; } else { std::thread::sleep(Duration::from_millis(500)); @@ -277,11 +322,11 @@ impl Reconciler { pub async fn live_migrate( &mut self, - origin_ps_id: NodeId, - dest_ps_id: NodeId, - ) -> anyhow::Result<()> { + origin_ps: Node, + dest_ps: Node, + ) -> Result<(), ReconcileError> { // `maybe_live_migrate` is responsibble for sanity of inputs - assert!(origin_ps_id != dest_ps_id); + assert!(origin_ps.get_id() != dest_ps.get_id()); fn build_location_config( shard: &ShardIdentity, @@ -301,10 +346,7 @@ impl Reconciler { } } - tracing::info!( - "🔁 Switching origin pageserver {} to stale mode", - origin_ps_id - ); + tracing::info!("🔁 Switching origin node {origin_ps} to stale mode",); // FIXME: it is incorrect to use self.generation here, we should use the generation // from the ObservedState of the origin pageserver (it might be older than self.generation) @@ -315,21 +357,18 @@ impl Reconciler { self.generation, None, ); - self.location_config(origin_ps_id, stale_conf, Some(Duration::from_secs(10))) + self.location_config(&origin_ps, stale_conf, Some(Duration::from_secs(10)), false) .await?; - let baseline_lsns = Some(self.get_lsns(self.tenant_shard_id, &origin_ps_id).await?); + let baseline_lsns = Some(self.get_lsns(self.tenant_shard_id, &origin_ps).await?); // If we are migrating to a destination that has a secondary location, warm it up first - if let Some(destination_conf) = self.observed.locations.get(&dest_ps_id) { + if let Some(destination_conf) = self.observed.locations.get(&dest_ps.get_id()) { if let Some(destination_conf) = &destination_conf.conf { if destination_conf.mode == LocationConfigMode::Secondary { - tracing::info!( - "🔁 Downloading latest layers to destination pageserver {}", - dest_ps_id, - ); - self.secondary_download(self.tenant_shard_id, &dest_ps_id) - .await; + tracing::info!("🔁 Downloading latest layers to destination node {dest_ps}",); + self.secondary_download(self.tenant_shard_id, &dest_ps) + .await?; } } } @@ -337,7 +376,7 @@ impl Reconciler { // Increment generation before attaching to new pageserver self.generation = Some( self.persistence - .increment_generation(self.tenant_shard_id, dest_ps_id) + .increment_generation(self.tenant_shard_id, dest_ps.get_id()) .await?, ); @@ -349,22 +388,23 @@ impl Reconciler { None, ); - tracing::info!("🔁 Attaching to pageserver {}", dest_ps_id); - self.location_config(dest_ps_id, dest_conf, None).await?; + tracing::info!("🔁 Attaching to pageserver {dest_ps}"); + self.location_config(&dest_ps, dest_conf, None, false) + .await?; if let Some(baseline) = baseline_lsns { tracing::info!("🕑 Waiting for LSN to catch up..."); - self.await_lsn(self.tenant_shard_id, &dest_ps_id, baseline) + self.await_lsn(self.tenant_shard_id, &dest_ps, baseline) .await?; } - tracing::info!("🔁 Notifying compute to use pageserver {}", dest_ps_id); + tracing::info!("🔁 Notifying compute to use pageserver {dest_ps}"); // During a live migration it is unhelpful to proceed if we couldn't notify compute: if we detach // the origin without notifying compute, we will render the tenant unavailable. while let Err(e) = self.compute_notify().await { match e { - NotifyError::Fatal(_) => return Err(anyhow::anyhow!(e)), + NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)), _ => { tracing::warn!( "Live migration blocked by compute notification error, retrying: {e}" @@ -382,22 +422,19 @@ impl Reconciler { None, Some(LocationConfigSecondary { warm: true }), ); - self.location_config(origin_ps_id, origin_secondary_conf.clone(), None) + self.location_config(&origin_ps, origin_secondary_conf.clone(), None, false) .await?; // TODO: we should also be setting the ObservedState on earlier API calls, in case we fail // partway through. In fact, all location conf API calls should be in a wrapper that sets // the observed state to None, then runs, then sets it to what we wrote. self.observed.locations.insert( - origin_ps_id, + origin_ps.get_id(), ObservedStateLocation { conf: Some(origin_secondary_conf), }, ); - println!( - "🔁 Switching to AttachedSingle mode on pageserver {}", - dest_ps_id - ); + tracing::info!("🔁 Switching to AttachedSingle mode on node {dest_ps}",); let dest_final_conf = build_location_config( &self.shard, &self.config, @@ -405,16 +442,61 @@ impl Reconciler { self.generation, None, ); - self.location_config(dest_ps_id, dest_final_conf.clone(), None) + self.location_config(&dest_ps, dest_final_conf.clone(), None, false) .await?; self.observed.locations.insert( - dest_ps_id, + dest_ps.get_id(), ObservedStateLocation { conf: Some(dest_final_conf), }, ); - println!("✅ Migration complete"); + tracing::info!("✅ Migration complete"); + + Ok(()) + } + + async fn maybe_refresh_observed(&mut self) -> Result<(), ReconcileError> { + // If the attached node has uncertain state, read it from the pageserver before proceeding: this + // is important to avoid spurious generation increments. + // + // We don't need to do this for secondary/detach locations because it's harmless to just PUT their + // location conf, whereas for attached locations it can interrupt clients if we spuriously destroy/recreate + // the `Timeline` object in the pageserver. + + let Some(attached_node) = self.intent.attached.as_ref() else { + // Nothing to do + return Ok(()); + }; + + if matches!( + self.observed.locations.get(&attached_node.get_id()), + Some(ObservedStateLocation { conf: None }) + ) { + let tenant_shard_id = self.tenant_shard_id; + let observed_conf = match attached_node + .with_client_retries( + |client| async move { client.get_location_config(tenant_shard_id).await }, + &self.service_config.jwt_token, + 1, + 1, + Duration::from_secs(5), + &self.cancel, + ) + .await + { + Some(Ok(observed)) => observed, + Some(Err(e)) => return Err(e.into()), + None => return Err(ReconcileError::Cancel), + }; + tracing::info!("Scanned location configuration on {attached_node}: {observed_conf:?}"); + self.observed.locations.insert( + attached_node.get_id(), + ObservedStateLocation { + conf: observed_conf, + }, + ); + } Ok(()) } @@ -426,14 +508,14 @@ impl Reconciler { /// general case reconciliation where we walk through the intent by pageserver /// and call out to the pageserver to apply the desired state. pub(crate) async fn reconcile(&mut self) -> Result<(), ReconcileError> { - // TODO: if any of self.observed is None, call to remote pageservers - // to learn correct state. + // Prepare: if we have uncertain `observed` state for our would-be attachement location, then refresh it + self.maybe_refresh_observed().await?; // Special case: live migration self.maybe_live_migrate().await?; // If the attached pageserver is not attached, do so now. - if let Some(node_id) = self.intent.attached { + if let Some(node) = self.intent.attached.as_ref() { // If we are in an attached policy, then generation must have been set (null generations // are only present when a tenant is initially loaded with a secondary policy) debug_assert!(self.generation.is_some()); @@ -444,10 +526,10 @@ impl Reconciler { }; let mut wanted_conf = attached_location_conf(generation, &self.shard, &self.config); - match self.observed.locations.get(&node_id) { + match self.observed.locations.get(&node.get_id()) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => { // Nothing to do - tracing::info!(%node_id, "Observed configuration already correct.") + tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.") } observed => { // In all cases other than a matching observed configuration, we will @@ -485,13 +567,21 @@ impl Reconciler { if increment_generation { let generation = self .persistence - .increment_generation(self.tenant_shard_id, node_id) + .increment_generation(self.tenant_shard_id, node.get_id()) .await?; self.generation = Some(generation); wanted_conf.generation = generation.into(); } - tracing::info!(%node_id, "Observed configuration requires update."); - self.location_config(node_id, wanted_conf, None).await?; + tracing::info!(node_id=%node.get_id(), "Observed configuration requires update."); + + // Because `node` comes from a ref to &self, clone it before calling into a &mut self + // function: this could be avoided by refactoring the state mutated by location_config into + // a separate type to Self. + let node = node.clone(); + + // Use lazy=true, because we may run many of Self concurrently, and do not want to + // overload the pageserver with logical size calculations. + self.location_config(&node, wanted_conf, None, true).await?; self.compute_notify().await?; } } @@ -500,33 +590,27 @@ impl Reconciler { // Configure secondary locations: if these were previously attached this // implicitly downgrades them from attached to secondary. let mut changes = Vec::new(); - for node_id in &self.intent.secondary { + for node in &self.intent.secondary { let wanted_conf = secondary_location_conf(&self.shard, &self.config); - match self.observed.locations.get(node_id) { + match self.observed.locations.get(&node.get_id()) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => { // Nothing to do - tracing::info!(%node_id, "Observed configuration already correct.") + tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.") } _ => { // In all cases other than a matching observed configuration, we will // reconcile this location. - tracing::info!(%node_id, "Observed configuration requires update."); - changes.push((*node_id, wanted_conf)) + tracing::info!(node_id=%node.get_id(), "Observed configuration requires update."); + changes.push((node.clone(), wanted_conf)) } } } // Detach any extraneous pageservers that are no longer referenced // by our intent. - let all_pageservers = self.intent.all_pageservers(); - for node_id in self.observed.locations.keys() { - if all_pageservers.contains(node_id) { - // We are only detaching pageservers that aren't used at all. - continue; - } - + for node in &self.detach { changes.push(( - *node_id, + node.clone(), LocationConfig { mode: LocationConfigMode::Detached, generation: None, @@ -539,11 +623,11 @@ impl Reconciler { )); } - for (node_id, conf) in changes { + for (node, conf) in changes { if self.cancel.is_cancelled() { return Err(ReconcileError::Cancel); } - self.location_config(node_id, conf, None).await?; + self.location_config(&node, conf, None, false).await?; } Ok(()) @@ -552,16 +636,21 @@ impl Reconciler { pub(crate) async fn compute_notify(&mut self) -> Result<(), NotifyError> { // Whenever a particular Reconciler emits a notification, it is always notifying for the intended // destination. - if let Some(node_id) = self.intent.attached { + if let Some(node) = &self.intent.attached { let result = self .compute_hook - .notify(self.tenant_shard_id, node_id, &self.cancel) + .notify( + self.tenant_shard_id, + node.get_id(), + self.shard.stripe_size, + &self.cancel, + ) .await; if let Err(e) = &result { // It is up to the caller whether they want to drop out on this error, but they don't have to: // in general we should avoid letting unavailability of the cloud control plane stop us from // making progress. - tracing::warn!("Failed to notify compute of attached pageserver {node_id}: {e}"); + tracing::warn!("Failed to notify compute of attached pageserver {node}: {e}"); // Set this flag so that in our ReconcileResult we will set the flag on the shard that it // needs to retry at some point. self.compute_notify_failure = true; diff --git a/control_plane/attachment_service/src/scheduler.rs b/control_plane/attachment_service/src/scheduler.rs index 87fce3df25..26a2707e8d 100644 --- a/control_plane/attachment_service/src/scheduler.rs +++ b/control_plane/attachment_service/src/scheduler.rs @@ -43,7 +43,7 @@ impl Scheduler { let mut scheduler_nodes = HashMap::new(); for node in nodes { scheduler_nodes.insert( - node.id, + node.get_id(), SchedulerNode { shard_count: 0, may_schedule: node.may_schedule(), @@ -68,7 +68,7 @@ impl Scheduler { let mut expect_nodes: HashMap = HashMap::new(); for node in nodes { expect_nodes.insert( - node.id, + node.get_id(), SchedulerNode { shard_count: 0, may_schedule: node.may_schedule(), @@ -156,7 +156,7 @@ impl Scheduler { pub(crate) fn node_upsert(&mut self, node: &Node) { use std::collections::hash_map::Entry::*; - match self.nodes.entry(node.id) { + match self.nodes.entry(node.get_id()) { Occupied(mut entry) => { entry.get_mut().may_schedule = node.may_schedule(); } @@ -255,7 +255,6 @@ impl Scheduler { pub(crate) mod test_utils { use crate::node::Node; - use pageserver_api::controller_api::{NodeAvailability, NodeSchedulingPolicy}; use std::collections::HashMap; use utils::id::NodeId; /// Test helper: synthesize the requested number of nodes, all in active state. @@ -264,18 +263,17 @@ pub(crate) mod test_utils { pub(crate) fn make_test_nodes(n: u64) -> HashMap { (1..n + 1) .map(|i| { - ( - NodeId(i), - Node { - id: NodeId(i), - availability: NodeAvailability::Active, - scheduling: NodeSchedulingPolicy::Active, - listen_http_addr: format!("httphost-{i}"), - listen_http_port: 80 + i as u16, - listen_pg_addr: format!("pghost-{i}"), - listen_pg_port: 5432 + i as u16, - }, - ) + (NodeId(i), { + let node = Node::new( + NodeId(i), + format!("httphost-{i}"), + 80 + i as u16, + format!("pghost-{i}"), + 5432 + i as u16, + ); + assert!(node.is_available()); + node + }) }) .collect() } diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index 4209b62db3..556d6a6828 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -16,9 +16,9 @@ use futures::{stream::FuturesUnordered, StreamExt}; use hyper::StatusCode; use pageserver_api::{ controller_api::{ - NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, + NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, PlacementPolicy, TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse, - TenantLocateResponseShard, TenantShardMigrateRequest, TenantShardMigrateResponse, + TenantShardMigrateRequest, TenantShardMigrateResponse, }, models::TenantConfigRequest, }; @@ -39,7 +39,6 @@ use pageserver_client::mgmt_api; use tokio_util::sync::CancellationToken; use tracing::instrument; use utils::{ - backoff, completion::Barrier, generation::Generation, http::error::ApiError, @@ -50,7 +49,7 @@ use utils::{ use crate::{ compute_hook::{self, ComputeHook}, - node::Node, + node::{AvailabilityTransition, Node}, persistence::{split_state::SplitState, DatabaseError, Persistence, TenantShardPersistence}, reconciler::attached_location_conf, scheduler::Scheduler, @@ -58,7 +57,7 @@ use crate::{ IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError, ReconcilerWaiter, TenantState, }, - PlacementPolicy, Sequence, + Sequence, }; // For operations that should be quick, like attaching a new tenant @@ -177,7 +176,7 @@ impl From for ApiError { #[allow(clippy::large_enum_variant)] enum TenantCreateOrUpdate { - Create((TenantCreateRequest, PlacementPolicy)), + Create(TenantCreateRequest), Update(Vec), } @@ -201,7 +200,8 @@ impl Service { async fn startup_reconcile(self: &Arc) { // For all tenant shards, a vector of observed states on nodes (where None means // indeterminate, same as in [`ObservedStateLocation`]) - let mut observed = HashMap::new(); + let mut observed: HashMap)>> = + HashMap::new(); let mut nodes_online = HashSet::new(); @@ -236,7 +236,8 @@ impl Service { nodes_online.insert(node_id); for (tenant_shard_id, conf_opt) in tenant_shards { - observed.insert(tenant_shard_id, (node_id, conf_opt)); + let shard_observations = observed.entry(tenant_shard_id).or_default(); + shard_observations.push((node_id, conf_opt)); } } @@ -252,27 +253,28 @@ impl Service { let mut new_nodes = (**nodes).clone(); for (node_id, node) in new_nodes.iter_mut() { if nodes_online.contains(node_id) { - node.availability = NodeAvailability::Active; + node.set_availability(NodeAvailability::Active); scheduler.node_upsert(node); } } *nodes = Arc::new(new_nodes); - for (tenant_shard_id, (node_id, observed_loc)) in observed { - let Some(tenant_state) = tenants.get_mut(&tenant_shard_id) else { - cleanup.push((tenant_shard_id, node_id)); - continue; - }; - - tenant_state - .observed - .locations - .insert(node_id, ObservedStateLocation { conf: observed_loc }); + for (tenant_shard_id, shard_observations) in observed { + for (node_id, observed_loc) in shard_observations { + let Some(tenant_state) = tenants.get_mut(&tenant_shard_id) else { + cleanup.push((tenant_shard_id, node_id)); + continue; + }; + tenant_state + .observed + .locations + .insert(node_id, ObservedStateLocation { conf: observed_loc }); + } } // Populate each tenant's intent state for (tenant_shard_id, tenant_state) in tenants.iter_mut() { - tenant_state.intent_from_observed(); + tenant_state.intent_from_observed(scheduler); if let Err(e) = tenant_state.schedule(scheduler) { // Non-fatal error: we are unable to properly schedule the tenant, perhaps because // not enough pageservers are available. The tenant may well still be available @@ -283,7 +285,11 @@ impl Service { // emit a compute notification for this. In the case where our observed state does not // yet match our intent, we will eventually reconcile, and that will emit a compute notification. if let Some(attached_at) = tenant_state.stably_attached() { - compute_notifications.push((*tenant_shard_id, attached_at)); + compute_notifications.push(( + *tenant_shard_id, + attached_at, + tenant_state.shard.stripe_size, + )); } } } @@ -355,40 +361,19 @@ impl Service { for node in nodes.values() { node_list_futs.push({ async move { - let http_client = reqwest::ClientBuilder::new() - .timeout(Duration::from_secs(5)) - .build() - .expect("Failed to construct HTTP client"); - let client = mgmt_api::Client::from_client( - http_client, - node.base_url(), - self.config.jwt_token.as_deref(), - ); - - fn is_fatal(e: &mgmt_api::Error) -> bool { - use mgmt_api::Error::*; - match e { - ReceiveBody(_) | ReceiveErrorBody(_) => false, - ApiError(StatusCode::SERVICE_UNAVAILABLE, _) - | ApiError(StatusCode::GATEWAY_TIMEOUT, _) - | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false, - ApiError(_, _) => true, - } - } - - tracing::info!("Scanning shards on node {}...", node.id); - let description = format!("List locations on {}", node.id); - let response = backoff::retry( - || client.list_location_config(), - is_fatal, - 1, - 5, - &description, - &self.cancel, - ) - .await; - - (node.id, response) + tracing::info!("Scanning shards on node {node}..."); + let timeout = Duration::from_secs(5); + let response = node + .with_client_retries( + |client| async move { client.list_location_config().await }, + &self.config.jwt_token, + 1, + 5, + timeout, + &self.cancel, + ) + .await; + (node.get_id(), response) } }); } @@ -468,6 +453,7 @@ impl Service { tenant_conf: models::TenantConfig::default(), }, None, + false, ) .await { @@ -492,7 +478,7 @@ impl Service { /// Returns a set of any shards for which notifications where not acked within the deadline. async fn compute_notify_many( &self, - notifications: Vec<(TenantShardId, NodeId)>, + notifications: Vec<(TenantShardId, NodeId, ShardStripeSize)>, deadline: Instant, ) -> HashSet { let compute_hook = self.inner.read().unwrap().compute_hook.clone(); @@ -503,11 +489,14 @@ impl Service { // Construct an async stream of futures to invoke the compute notify function: we do this // in order to subsequently use .buffered() on the stream to execute with bounded parallelism. let mut stream = futures::stream::iter(notifications.into_iter()) - .map(|(tenant_shard_id, node_id)| { + .map(|(tenant_shard_id, node_id, stripe_size)| { let compute_hook = compute_hook.clone(); let cancel = self.cancel.clone(); async move { - if let Err(e) = compute_hook.notify(tenant_shard_id, node_id, &cancel).await { + if let Err(e) = compute_hook + .notify(tenant_shard_id, node_id, stripe_size, &cancel) + .await + { tracing::error!( %tenant_shard_id, %node_id, @@ -654,19 +643,9 @@ impl Service { .list_nodes() .await? .into_iter() - .map(|n| Node { - id: NodeId(n.node_id as u64), - // At startup we consider a node offline until proven otherwise. - availability: NodeAvailability::Offline, - scheduling: NodeSchedulingPolicy::from_str(&n.scheduling_policy) - .expect("Bad scheduling policy in DB"), - listen_http_addr: n.listen_http_addr, - listen_http_port: n.listen_http_port as u16, - listen_pg_addr: n.listen_pg_addr, - listen_pg_port: n.listen_pg_port as u16, - }) + .map(Node::from_persistent) .collect::>(); - let nodes: HashMap = nodes.into_iter().map(|n| (n.id, n)).collect(); + let nodes: HashMap = nodes.into_iter().map(|n| (n.get_id(), n)).collect(); tracing::info!("Loaded {} nodes from database.", nodes.len()); tracing::info!("Loading shards from database..."); @@ -693,15 +672,13 @@ impl Service { } for node_id in node_ids { tracing::info!("Creating node {} in scheduler for tests", node_id); - let node = Node { - id: NodeId(node_id as u64), - availability: NodeAvailability::Active, - scheduling: NodeSchedulingPolicy::Active, - listen_http_addr: "".to_string(), - listen_http_port: 123, - listen_pg_addr: "".to_string(), - listen_pg_port: 123, - }; + let node = Node::new( + NodeId(node_id as u64), + "".to_string(), + 123, + "".to_string(), + 123, + ); scheduler.node_upsert(&node); } @@ -815,7 +792,7 @@ impl Service { shard_stripe_size: 0, generation: Some(0), generation_pageserver: None, - placement_policy: serde_json::to_string(&PlacementPolicy::default()).unwrap(), + placement_policy: serde_json::to_string(&PlacementPolicy::Single).unwrap(), config: serde_json::to_string(&TenantConfig::default()).unwrap(), splitting: SplitState::default(), }; @@ -967,6 +944,12 @@ impl Service { // Ordering: we must persist generation number updates before making them visible in the in-memory state let incremented_generations = self.persistence.re_attach(reattach_req.node_id).await?; + tracing::info!( + node_id=%reattach_req.node_id, + "Incremented {} tenant shards' generations", + incremented_generations.len() + ); + // Apply the updated generation to our in-memory state let mut locked = self.inner.write().unwrap(); @@ -979,7 +962,6 @@ impl Service { id: tenant_shard_id, gen: new_gen.into().unwrap(), }); - // Apply the new generation number to our in-memory state let shard_state = locked.tenants.get_mut(&tenant_shard_id); let Some(shard_state) = shard_state else { @@ -1015,6 +997,14 @@ impl Service { if let Some(conf) = observed.conf.as_mut() { conf.generation = new_gen.into(); } + } else { + // This node has no observed state for the shard: perhaps it was offline + // when the pageserver restarted. Insert a None, so that the Reconciler + // will be prompted to learn the location's state before it makes changes. + shard_state + .observed + .locations + .insert(reattach_req.node_id, ObservedStateLocation { conf: None }); } // TODO: cancel/restart any running reconciliation for this tenant, it might be trying @@ -1063,9 +1053,8 @@ impl Service { pub(crate) async fn tenant_create( &self, create_req: TenantCreateRequest, - placement_policy: PlacementPolicy, ) -> Result { - let (response, waiters) = self.do_tenant_create(create_req, placement_policy).await?; + let (response, waiters) = self.do_tenant_create(create_req).await?; self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await?; Ok(response) @@ -1074,8 +1063,13 @@ impl Service { pub(crate) async fn do_tenant_create( &self, create_req: TenantCreateRequest, - placement_policy: PlacementPolicy, ) -> Result<(TenantCreateResponse, Vec), ApiError> { + // As a default, single is convenient for tests that don't choose a policy. + let placement_policy = create_req + .placement_policy + .clone() + .unwrap_or(PlacementPolicy::Single); + // This service expects to handle sharding itself: it is an error to try and directly create // a particular shard here. let tenant_id = if !create_req.new_tenant_id.is_unsharded() { @@ -1151,9 +1145,12 @@ impl Service { let (waiters, response_shards) = { let mut locked = self.inner.write().unwrap(); - let (_nodes, tenants, scheduler) = locked.parts_mut(); + let result_tx = locked.result_tx.clone(); + let compute_hook = locked.compute_hook.clone(); + let (nodes, tenants, scheduler) = locked.parts_mut(); let mut response_shards = Vec::new(); + let mut schcedule_error = None; for tenant_shard_id in create_ids { tracing::info!("Creating shard {tenant_shard_id}..."); @@ -1190,23 +1187,20 @@ impl Service { continue; } Entry::Vacant(entry) => { - let mut state = TenantState::new( + let state = entry.insert(TenantState::new( tenant_shard_id, ShardIdentity::from_params( tenant_shard_id.shard_number, &create_req.shard_parameters, ), placement_policy.clone(), - ); + )); state.generation = initial_generation; state.config = create_req.config.clone(); - - state.schedule(scheduler).map_err(|e| { - ApiError::Conflict(format!( - "Failed to schedule shard {tenant_shard_id}: {e}" - )) - })?; + if let Err(e) = state.schedule(scheduler) { + schcedule_error = Some(e); + } // Only include shards in result if we are attaching: the purpose // of the response is to tell the caller where the shards are attached. @@ -1220,24 +1214,27 @@ impl Service { generation: generation.into().unwrap(), }); } - entry.insert(state) } }; } - // Take a snapshot of pageservers - let pageservers = locked.nodes.clone(); + // If we failed to schedule shards, then they are still created in the controller, + // but we return an error to the requester to avoid a silent failure when someone + // tries to e.g. create a tenant whose placement policy requires more nodes than + // are present in the system. We do this here rather than in the above loop, to + // avoid situations where we only create a subset of shards in the tenant. + if let Some(e) = schcedule_error { + return Err(ApiError::Conflict(format!( + "Failed to schedule shard(s): {e}" + ))); + } - let result_tx = locked.result_tx.clone(); - let compute_hook = locked.compute_hook.clone(); - - let waiters = locked - .tenants + let waiters = tenants .range_mut(TenantShardId::tenant_range(tenant_id)) .filter_map(|(_shard_id, shard)| { shard.maybe_reconcile( result_tx.clone(), - &pageservers, + nodes, &compute_hook, &self.config, &self.persistence, @@ -1346,22 +1343,20 @@ impl Service { TenantCreateOrUpdate::Create( // Synthesize a creation request - ( - TenantCreateRequest { - new_tenant_id: TenantShardId::unsharded(tenant_id), - generation, - shard_parameters: ShardParameters { - // Must preserve the incoming shard_count do distinguish unsharded (0) - // from single-sharded (1): this distinction appears in the S3 keys of the tenant. - count: req.tenant_id.shard_count, - // We only import un-sharded or single-sharded tenants, so stripe - // size can be made up arbitrarily here. - stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE, - }, - config: req.config.tenant_conf, + TenantCreateRequest { + new_tenant_id: TenantShardId::unsharded(tenant_id), + generation, + shard_parameters: ShardParameters { + // Must preserve the incoming shard_count do distinguish unsharded (0) + // from single-sharded (1): this distinction appears in the S3 keys of the tenant. + count: req.tenant_id.shard_count, + // We only import un-sharded or single-sharded tenants, so stripe + // size can be made up arbitrarily here. + stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE, }, - placement_policy, - ), + placement_policy: Some(placement_policy), + config: req.config.tenant_conf, + }, ) } else { TenantCreateOrUpdate::Update(updates) @@ -1395,11 +1390,13 @@ impl Service { // First check if this is a creation or an update let create_or_update = self.tenant_location_config_prepare(tenant_id, req); - let mut result = TenantLocationConfigResponse { shards: Vec::new() }; + let mut result = TenantLocationConfigResponse { + shards: Vec::new(), + stripe_size: None, + }; let waiters = match create_or_update { - TenantCreateOrUpdate::Create((create_req, placement_policy)) => { - let (create_resp, waiters) = - self.do_tenant_create(create_req, placement_policy).await?; + TenantCreateOrUpdate::Create(create_req) => { + let (create_resp, waiters) = self.do_tenant_create(create_req).await?; result.shards = create_resp .shards .into_iter() @@ -1451,6 +1448,11 @@ impl Service { continue; }; + // Update stripe size + if result.stripe_size.is_none() && shard.shard.count.count() > 1 { + result.stripe_size = Some(shard.shard.stripe_size); + } + shard.policy = placement_policy; shard.config = tenant_config; if let Some(generation) = update_generation { @@ -1666,7 +1668,7 @@ impl Service { .map_err(|e| { ApiError::InternalServerError(anyhow::anyhow!( "Error doing time travel recovery for shard {tenant_shard_id} on node {}: {e}", - node.id + node )) })?; } @@ -1720,10 +1722,7 @@ impl Service { // Secondary downloads are always advisory: if something fails, we nevertheless report success, so that whoever // is calling us will proceed with whatever migration they're doing, albeit with a slightly less warm cache // than they had hoped for. - tracing::warn!( - "Ignoring tenant secondary download error from pageserver {}: {e}", - node.id, - ); + tracing::warn!("Ignoring tenant secondary download error from pageserver {node}: {e}",); } Ok(()) @@ -1761,13 +1760,11 @@ impl Service { // surface immediately as an error to our caller. let status = client.tenant_delete(tenant_shard_id).await.map_err(|e| { ApiError::InternalServerError(anyhow::anyhow!( - "Error deleting shard {tenant_shard_id} on node {}: {e}", - node.id + "Error deleting shard {tenant_shard_id} on node {node}: {e}", )) })?; tracing::info!( - "Shard {tenant_shard_id} on node {}, delete returned {}", - node.id, + "Shard {tenant_shard_id} on node {node}, delete returned {}", status ); if status == StatusCode::ACCEPTED { @@ -1866,10 +1863,9 @@ impl Service { create_req: TimelineCreateRequest, ) -> Result { tracing::info!( - "Creating timeline on shard {}/{}, attached to node {}", + "Creating timeline on shard {}/{}, attached to node {node}", tenant_shard_id, create_req.new_timeline_id, - node.id ); let client = mgmt_api::Client::new(node.base_url(), jwt.as_deref()); @@ -1993,10 +1989,7 @@ impl Service { jwt: Option, ) -> Result { tracing::info!( - "Deleting timeline on shard {}/{}, attached to node {}", - tenant_shard_id, - timeline_id, - node.id + "Deleting timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}", ); let client = mgmt_api::Client::new(node.base_url(), jwt.as_deref()); @@ -2005,8 +1998,7 @@ impl Service { .await .map_err(|e| { ApiError::InternalServerError(anyhow::anyhow!( - "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {}: {e}", - node.id + "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}", )) }) } @@ -2107,14 +2099,7 @@ impl Service { .get(&node_id) .expect("Pageservers may not be deleted while referenced"); - result.push(TenantLocateResponseShard { - shard_id: *tenant_shard_id, - node_id, - listen_http_addr: node.listen_http_addr.clone(), - listen_http_port: node.listen_http_port, - listen_pg_addr: node.listen_pg_addr.clone(), - listen_pg_port: node.listen_pg_port, - }); + result.push(node.shard_location(*tenant_shard_id)); match &shard_params { None => { @@ -2305,7 +2290,7 @@ impl Service { // populate the correct generation as part of its transaction, to protect us // against racing with changes in the state of the parent. generation: None, - generation_pageserver: Some(target.node.id.0 as i64), + generation_pageserver: Some(target.node.get_id().0 as i64), placement_policy: serde_json::to_string(&policy).unwrap(), // TODO: get the config out of the map config: serde_json::to_string(&TenantConfig::default()).unwrap(), @@ -2455,7 +2440,7 @@ impl Service { // as at this point in the split process we have succeeded and this part is infallible: // we will never need to do any special recovery from this state. - child_locations.push((child, pageserver)); + child_locations.push((child, pageserver, child_shard.stripe_size)); tenants.insert(child, child_state); response.new_shards.push(child); @@ -2465,8 +2450,11 @@ impl Service { // Send compute notifications for all the new shards let mut failed_notifications = Vec::new(); - for (child_id, child_ps) in child_locations { - if let Err(e) = compute_hook.notify(child_id, child_ps, &self.cancel).await { + for (child_id, child_ps, stripe_size) in child_locations { + if let Err(e) = compute_hook + .notify(child_id, child_ps, stripe_size, &self.cancel) + .await + { tracing::warn!("Failed to update compute of {}->{} during split, proceeding anyway to complete split ({e})", child_id, child_ps); failed_notifications.push(child_id); @@ -2497,6 +2485,19 @@ impl Service { let compute_hook = locked.compute_hook.clone(); let (nodes, tenants, scheduler) = locked.parts_mut(); + let Some(node) = nodes.get(&migrate_req.node_id) else { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Node {} not found", + migrate_req.node_id + ))); + }; + + if !node.is_available() { + // Warn but proceed: the caller may intend to manually adjust the placement of + // a shard even if the node is down, e.g. if intervening during an incident. + tracing::warn!("Migrating to unavailable node {node}"); + } + let Some(shard) = tenants.get_mut(&tenant_shard_id) else { return Err(ApiError::NotFound( anyhow::anyhow!("Tenant shard not found").into(), @@ -2626,6 +2627,18 @@ impl Service { .map(|t| t.to_persistent()) .collect::>(); + // This method can only validate the state of an idle system: if a reconcile is in + // progress, fail out early to avoid giving false errors on state that won't match + // between database and memory under a ReconcileResult is processed. + for t in locked.tenants.values() { + if t.reconciler.is_some() { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Shard {} reconciliation in progress", + t.tenant_shard_id + ))); + } + } + (expect_nodes, expect_shards) }; @@ -2737,11 +2750,7 @@ impl Service { if let Some(node) = locked.nodes.get(®ister_req.node_id) { // Note that we do not do a total equality of the struct, because we don't require // the availability/scheduling states to agree for a POST to be idempotent. - if node.listen_http_addr == register_req.listen_http_addr - && node.listen_http_port == register_req.listen_http_port - && node.listen_pg_addr == register_req.listen_pg_addr - && node.listen_pg_port == register_req.listen_pg_port - { + if node.registration_match(®ister_req) { tracing::info!( "Node {} re-registered with matching address", register_req.node_id @@ -2765,16 +2774,14 @@ impl Service { // Ordering: we must persist the new node _before_ adding it to in-memory state. // This ensures that before we use it for anything or expose it via any external // API, it is guaranteed to be available after a restart. - let new_node = Node { - id: register_req.node_id, - listen_http_addr: register_req.listen_http_addr, - listen_http_port: register_req.listen_http_port, - listen_pg_addr: register_req.listen_pg_addr, - listen_pg_port: register_req.listen_pg_port, - scheduling: NodeSchedulingPolicy::Filling, - // TODO: we shouldn't really call this Active until we've heartbeated it. - availability: NodeAvailability::Active, - }; + let new_node = Node::new( + register_req.node_id, + register_req.listen_http_addr, + register_req.listen_http_port, + register_req.listen_pg_addr, + register_req.listen_pg_port, + ); + // TODO: idempotency if the node already exists in the database self.persistence.insert_node(&new_node).await?; @@ -2819,29 +2826,14 @@ impl Service { )); }; - let mut offline_transition = false; - let mut active_transition = false; - - if let Some(availability) = &config_req.availability { - match (availability, &node.availability) { - (NodeAvailability::Offline, NodeAvailability::Active) => { - tracing::info!("Node {} transition to offline", config_req.node_id); - offline_transition = true; - } - (NodeAvailability::Active, NodeAvailability::Offline) => { - tracing::info!("Node {} transition to active", config_req.node_id); - active_transition = true; - } - _ => { - tracing::info!("Node {} no change during config", config_req.node_id); - // No change - } - }; - node.availability = *availability; - } + let availability_transition = if let Some(availability) = &config_req.availability { + node.set_availability(*availability) + } else { + AvailabilityTransition::Unchanged + }; if let Some(scheduling) = config_req.scheduling { - node.scheduling = scheduling; + node.set_scheduling(scheduling); // TODO: once we have a background scheduling ticker for fill/drain, kick it // to wake up and start working. @@ -2852,74 +2844,80 @@ impl Service { let new_nodes = Arc::new(new_nodes); - if offline_transition { - let mut tenants_affected: usize = 0; - for (tenant_shard_id, tenant_state) in tenants { - if let Some(observed_loc) = - tenant_state.observed.locations.get_mut(&config_req.node_id) - { - // When a node goes offline, we set its observed configuration to None, indicating unknown: we will - // not assume our knowledge of the node's configuration is accurate until it comes back online - observed_loc.conf = None; - } + match availability_transition { + AvailabilityTransition::ToOffline => { + tracing::info!("Node {} transition to offline", config_req.node_id); + let mut tenants_affected: usize = 0; + for (tenant_shard_id, tenant_state) in tenants { + if let Some(observed_loc) = + tenant_state.observed.locations.get_mut(&config_req.node_id) + { + // When a node goes offline, we set its observed configuration to None, indicating unknown: we will + // not assume our knowledge of the node's configuration is accurate until it comes back online + observed_loc.conf = None; + } - if tenant_state.intent.demote_attached(config_req.node_id) { - tenant_state.sequence = tenant_state.sequence.next(); - match tenant_state.schedule(scheduler) { - Err(e) => { - // It is possible that some tenants will become unschedulable when too many pageservers - // go offline: in this case there isn't much we can do other than make the issue observable. - // TODO: give TenantState a scheduling error attribute to be queried later. - tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", config_req.node_id); - } - Ok(()) => { - if tenant_state - .maybe_reconcile( - result_tx.clone(), - &new_nodes, - &compute_hook, - &self.config, - &self.persistence, - &self.gate, - &self.cancel, - ) - .is_some() - { - tenants_affected += 1; - }; + if tenant_state.intent.demote_attached(config_req.node_id) { + tenant_state.sequence = tenant_state.sequence.next(); + match tenant_state.schedule(scheduler) { + Err(e) => { + // It is possible that some tenants will become unschedulable when too many pageservers + // go offline: in this case there isn't much we can do other than make the issue observable. + // TODO: give TenantState a scheduling error attribute to be queried later. + tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", config_req.node_id); + } + Ok(()) => { + if tenant_state + .maybe_reconcile( + result_tx.clone(), + &new_nodes, + &compute_hook, + &self.config, + &self.persistence, + &self.gate, + &self.cancel, + ) + .is_some() + { + tenants_affected += 1; + }; + } } } } + tracing::info!( + "Launched {} reconciler tasks for tenants affected by node {} going offline", + tenants_affected, + config_req.node_id + ) } - tracing::info!( - "Launched {} reconciler tasks for tenants affected by node {} going offline", - tenants_affected, - config_req.node_id - ) - } - - if active_transition { - // When a node comes back online, we must reconcile any tenant that has a None observed - // location on the node. - for tenant_state in locked.tenants.values_mut() { - if let Some(observed_loc) = - tenant_state.observed.locations.get_mut(&config_req.node_id) - { - if observed_loc.conf.is_none() { - tenant_state.maybe_reconcile( - result_tx.clone(), - &new_nodes, - &compute_hook, - &self.config, - &self.persistence, - &self.gate, - &self.cancel, - ); + AvailabilityTransition::ToActive => { + tracing::info!("Node {} transition to active", config_req.node_id); + // When a node comes back online, we must reconcile any tenant that has a None observed + // location on the node. + for tenant_state in locked.tenants.values_mut() { + if let Some(observed_loc) = + tenant_state.observed.locations.get_mut(&config_req.node_id) + { + if observed_loc.conf.is_none() { + tenant_state.maybe_reconcile( + result_tx.clone(), + &new_nodes, + &compute_hook, + &self.config, + &self.persistence, + &self.gate, + &self.cancel, + ); + } } } - } - // TODO: in the background, we should balance work back onto this pageserver + // TODO: in the background, we should balance work back onto this pageserver + } + AvailabilityTransition::Unchanged => { + tracing::info!("Node {} no change during config", config_req.node_id); + } } locked.nodes = new_nodes; diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs index 33b7d578c7..c775736b31 100644 --- a/control_plane/attachment_service/src/tenant_state.rs +++ b/control_plane/attachment_service/src/tenant_state.rs @@ -1,7 +1,11 @@ -use std::{collections::HashMap, sync::Arc, time::Duration}; +use std::{ + collections::{HashMap, HashSet}, + sync::Arc, + time::Duration, +}; use crate::{metrics, persistence::TenantShardPersistence}; -use pageserver_api::controller_api::NodeAvailability; +use pageserver_api::controller_api::PlacementPolicy; use pageserver_api::{ models::{LocationConfig, LocationConfigMode, TenantConfig}, shard::{ShardIdentity, TenantShardId}, @@ -25,7 +29,7 @@ use crate::{ attached_location_conf, secondary_location_conf, ReconcileError, Reconciler, TargetState, }, scheduler::{ScheduleError, Scheduler}, - service, PlacementPolicy, Sequence, + service, Sequence, }; /// Serialization helper @@ -370,7 +374,7 @@ impl TenantState { /// [`ObservedState`], even if it violates my [`PlacementPolicy`]. Call [`Self::schedule`] next, /// to get an intent state that complies with placement policy. The overall goal is to do scheduling /// in a way that makes use of any configured locations that already exist in the outside world. - pub(crate) fn intent_from_observed(&mut self) { + pub(crate) fn intent_from_observed(&mut self, scheduler: &mut Scheduler) { // Choose an attached location by filtering observed locations, and then sorting to get the highest // generation let mut attached_locs = self @@ -395,7 +399,7 @@ impl TenantState { attached_locs.sort_by_key(|i| i.1); if let Some((node_id, _gen)) = attached_locs.into_iter().last() { - self.intent.attached = Some(*node_id); + self.intent.set_attached(scheduler, Some(*node_id)); } // All remaining observed locations generate secondary intents. This includes None @@ -406,7 +410,7 @@ impl TenantState { // will take care of promoting one of these secondaries to be attached. self.observed.locations.keys().for_each(|node_id| { if Some(*node_id) != self.intent.attached { - self.intent.secondary.push(*node_id); + self.intent.push_secondary(scheduler, *node_id); } }); } @@ -564,7 +568,9 @@ impl TenantState { } } - fn dirty(&self) -> bool { + fn dirty(&self, nodes: &Arc>) -> bool { + let mut dirty_nodes = HashSet::new(); + if let Some(node_id) = self.intent.attached { // Maybe panic: it is a severe bug if we try to attach while generation is null. let generation = self @@ -575,7 +581,7 @@ impl TenantState { match self.observed.locations.get(&node_id) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {} Some(_) | None => { - return true; + dirty_nodes.insert(node_id); } } } @@ -585,7 +591,7 @@ impl TenantState { match self.observed.locations.get(node_id) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {} Some(_) | None => { - return true; + dirty_nodes.insert(*node_id); } } } @@ -593,17 +599,18 @@ impl TenantState { for node_id in self.observed.locations.keys() { if self.intent.attached != Some(*node_id) && !self.intent.secondary.contains(node_id) { // We have observed state that isn't part of our intent: need to clean it up. - return true; + dirty_nodes.insert(*node_id); } } - // Even if there is no pageserver work to be done, if we have a pending notification to computes, - // wake up a reconciler to send it. - if self.pending_compute_notification { - return true; - } + dirty_nodes.retain(|node_id| { + nodes + .get(node_id) + .map(|n| n.is_available()) + .unwrap_or(false) + }); - false + !dirty_nodes.is_empty() } #[allow(clippy::too_many_arguments)] @@ -625,15 +632,20 @@ impl TenantState { let node = pageservers .get(node_id) .expect("Nodes may not be removed while referenced"); - if observed_loc.conf.is_none() - && !matches!(node.availability, NodeAvailability::Offline) - { + if observed_loc.conf.is_none() && node.is_available() { dirty_observed = true; break; } } - if !self.dirty() && !dirty_observed { + let active_nodes_dirty = self.dirty(pageservers); + + // Even if there is no pageserver work to be done, if we have a pending notification to computes, + // wake up a reconciler to send it. + let do_reconcile = + active_nodes_dirty || dirty_observed || self.pending_compute_notification; + + if !do_reconcile { tracing::info!("Not dirty, no reconciliation needed."); return None; } @@ -663,6 +675,21 @@ impl TenantState { } } + // Build list of nodes from which the reconciler should detach + let mut detach = Vec::new(); + for node_id in self.observed.locations.keys() { + if self.intent.get_attached() != &Some(*node_id) + && !self.intent.secondary.contains(node_id) + { + detach.push( + pageservers + .get(node_id) + .expect("Intent references non-existent pageserver") + .clone(), + ) + } + } + // Reconcile in flight for a stale sequence? Our sequence's task will wait for it before // doing our sequence's work. let old_handle = self.reconciler.take(); @@ -677,14 +704,15 @@ impl TenantState { self.sequence = self.sequence.next(); let reconciler_cancel = cancel.child_token(); + let reconciler_intent = TargetState::from_intent(pageservers, &self.intent); let mut reconciler = Reconciler { tenant_shard_id: self.tenant_shard_id, shard: self.shard, generation: self.generation, - intent: TargetState::from_intent(&self.intent), + intent: reconciler_intent, + detach, config: self.config.clone(), observed: self.observed.clone(), - pageservers: pageservers.clone(), compute_hook: compute_hook.clone(), service_config: service_config.clone(), _gate_guard: gate_guard, @@ -819,7 +847,10 @@ impl TenantState { #[cfg(test)] pub(crate) mod tests { - use pageserver_api::shard::{ShardCount, ShardNumber}; + use pageserver_api::{ + controller_api::NodeAvailability, + shard::{ShardCount, ShardNumber}, + }; use utils::id::TenantId; use crate::scheduler::test_utils::make_test_nodes; @@ -878,7 +909,10 @@ pub(crate) mod tests { assert_eq!(tenant_state.intent.secondary.len(), 2); // Update the scheduler state to indicate the node is offline - nodes.get_mut(&attached_node_id).unwrap().availability = NodeAvailability::Offline; + nodes + .get_mut(&attached_node_id) + .unwrap() + .set_availability(NodeAvailability::Offline); scheduler.node_upsert(nodes.get(&attached_node_id).unwrap()); // Scheduling the node should promote the still-available secondary node to attached @@ -897,4 +931,54 @@ pub(crate) mod tests { Ok(()) } + + #[test] + fn intent_from_observed() -> anyhow::Result<()> { + let nodes = make_test_nodes(3); + let mut scheduler = Scheduler::new(nodes.values()); + + let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1)); + + tenant_state.observed.locations.insert( + NodeId(3), + ObservedStateLocation { + conf: Some(LocationConfig { + mode: LocationConfigMode::AttachedMulti, + generation: Some(2), + secondary_conf: None, + shard_number: tenant_state.shard.number.0, + shard_count: tenant_state.shard.count.literal(), + shard_stripe_size: tenant_state.shard.stripe_size.0, + tenant_conf: TenantConfig::default(), + }), + }, + ); + + tenant_state.observed.locations.insert( + NodeId(2), + ObservedStateLocation { + conf: Some(LocationConfig { + mode: LocationConfigMode::AttachedStale, + generation: Some(1), + secondary_conf: None, + shard_number: tenant_state.shard.number.0, + shard_count: tenant_state.shard.count.literal(), + shard_stripe_size: tenant_state.shard.stripe_size.0, + tenant_conf: TenantConfig::default(), + }), + }, + ); + + tenant_state.intent_from_observed(&mut scheduler); + + // The highest generationed attached location gets used as attached + assert_eq!(tenant_state.intent.attached, Some(NodeId(3))); + // Other locations get used as secondary + assert_eq!(tenant_state.intent.secondary, vec![NodeId(2)]); + + scheduler.consistency_check(nodes.values(), [&tenant_state].into_iter())?; + + tenant_state.intent.clear(&mut scheduler); + Ok(()) + } } diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs index 610d7386d9..5c97561985 100644 --- a/control_plane/src/attachment_service.rs +++ b/control_plane/src/attachment_service.rs @@ -34,7 +34,7 @@ pub struct AttachmentService { client: reqwest::Client, } -const COMMAND: &str = "attachment_service"; +const COMMAND: &str = "storage_controller"; const ATTACHMENT_SERVICE_POSTGRES_VERSION: u32 = 16; diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index cf647a5f9b..27abcb182a 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -15,7 +15,7 @@ use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR}; use control_plane::safekeeper::SafekeeperNode; use control_plane::{broker, local_env}; use pageserver_api::controller_api::{ - NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, + NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, PlacementPolicy, }; use pageserver_api::models::{ ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo, @@ -435,6 +435,11 @@ async fn handle_tenant( let shard_stripe_size: Option = create_match.get_one::("shard-stripe-size").cloned(); + let placement_policy = match create_match.get_one::("placement-policy") { + Some(s) if !s.is_empty() => serde_json::from_str::(s)?, + _ => PlacementPolicy::Single, + }; + let tenant_conf = PageServerNode::parse_config(tenant_conf)?; // If tenant ID was not specified, generate one @@ -456,6 +461,7 @@ async fn handle_tenant( .map(ShardStripeSize) .unwrap_or(ShardParameters::DEFAULT_STRIPE_SIZE), }, + placement_policy: Some(placement_policy), config: tenant_conf, }) .await?; @@ -1024,7 +1030,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re }) .collect::>() }; - endpoint.reconfigure(pageservers).await?; + endpoint.reconfigure(pageservers, None).await?; } "stop" => { let endpoint_id = sub_args @@ -1562,6 +1568,7 @@ fn cli() -> Command { .help("Use this tenant in future CLI commands where tenant_id is needed, but not specified")) .arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)")) .arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages")) + .arg(Arg::new("placement-policy").value_parser(value_parser!(String)).long("placement-policy").action(ArgAction::Set).help("Placement policy shards in this tenant")) ) .subcommand(Command::new("set-default").arg(tenant_id_arg.clone().required(true)) .about("Set a particular tenant as default in future CLI commands where tenant_id is needed, but not specified")) diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 5a75bc2a1d..ac0a8417ae 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -52,6 +52,7 @@ use compute_api::spec::RemoteExtSpec; use compute_api::spec::Role; use nix::sys::signal::kill; use nix::sys::signal::Signal; +use pageserver_api::shard::ShardStripeSize; use serde::{Deserialize, Serialize}; use url::Host; use utils::id::{NodeId, TenantId, TimelineId}; @@ -655,7 +656,7 @@ impl Endpoint { // Wait for it to start let mut attempt = 0; const ATTEMPT_INTERVAL: Duration = Duration::from_millis(100); - const MAX_ATTEMPTS: u32 = 10 * 30; // Wait up to 30 s + const MAX_ATTEMPTS: u32 = 10 * 90; // Wait up to 1.5 min loop { attempt += 1; match self.get_status().await { @@ -735,7 +736,11 @@ impl Endpoint { } } - pub async fn reconfigure(&self, mut pageservers: Vec<(Host, u16)>) -> Result<()> { + pub async fn reconfigure( + &self, + mut pageservers: Vec<(Host, u16)>, + stripe_size: Option, + ) -> Result<()> { let mut spec: ComputeSpec = { let spec_path = self.endpoint_path().join("spec.json"); let file = std::fs::File::open(spec_path)?; @@ -765,6 +770,9 @@ impl Endpoint { let pageserver_connstr = Self::build_pageserver_connstr(&pageservers); assert!(!pageserver_connstr.is_empty()); spec.pageserver_connstring = Some(pageserver_connstr); + if stripe_size.is_some() { + spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize); + } let client = reqwest::Client::new(); let response = client diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index a5e1325cfe..03270723a6 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -232,7 +232,7 @@ impl LocalEnv { // run from the same location as neon_local. This means that for compatibility // tests that run old pageserver/safekeeper, they still run latest attachment service. let neon_local_bin_dir = env::current_exe().unwrap().parent().unwrap().to_owned(); - neon_local_bin_dir.join("attachment_service") + neon_local_bin_dir.join("storage_controller") } pub fn safekeeper_bin(&self) -> PathBuf { diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 642f153f2d..ae1bd60c52 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -429,6 +429,8 @@ impl PageServerNode { generation, config, shard_parameters: ShardParameters::default(), + // Placement policy is not meaningful for creations not done via storage controller + placement_policy: None, }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") @@ -537,10 +539,11 @@ impl PageServerNode { tenant_shard_id: TenantShardId, config: LocationConfig, flush_ms: Option, + lazy: bool, ) -> anyhow::Result<()> { Ok(self .http_client - .location_config(tenant_shard_id, config, flush_ms) + .location_config(tenant_shard_id, config, flush_ms, lazy) .await?) } @@ -605,7 +608,7 @@ impl PageServerNode { eprintln!("connection error: {}", e); } }); - tokio::pin!(client); + let client = std::pin::pin!(client); // Init base reader let (start_lsn, base_tarfile_path) = base; diff --git a/docs/rfcs/002-storage.md b/docs/rfcs/002-storage.md index f99683cf09..d11b750e73 100644 --- a/docs/rfcs/002-storage.md +++ b/docs/rfcs/002-storage.md @@ -1,4 +1,4 @@ -# Zenith storage node — alternative +# Neon storage node — alternative ## **Design considerations** diff --git a/docs/rfcs/003-laptop-cli.md b/docs/rfcs/003-laptop-cli.md index 1a549c2df5..003a05bd16 100644 --- a/docs/rfcs/003-laptop-cli.md +++ b/docs/rfcs/003-laptop-cli.md @@ -1,6 +1,6 @@ # Command line interface (end-user) -Zenith CLI as it is described here mostly resides on the same conceptual level as pg_ctl/initdb/pg_recvxlog/etc and replaces some of them in an opinionated way. I would also suggest bundling our patched postgres inside zenith distribution at least at the start. +Neon CLI as it is described here mostly resides on the same conceptual level as pg_ctl/initdb/pg_recvxlog/etc and replaces some of them in an opinionated way. I would also suggest bundling our patched postgres inside neon distribution at least at the start. This proposal is focused on managing local installations. For cluster operations, different tooling would be needed. The point of integration between the two is storage URL: no matter how complex cluster setup is it may provide an endpoint where the user may push snapshots. @@ -8,40 +8,40 @@ The most important concept here is a snapshot, which can be created/pushed/pulle # Possible usage scenarios -## Install zenith, run a postgres +## Install neon, run a postgres ``` -> brew install pg-zenith -> zenith pg create # creates pgdata with default pattern pgdata$i -> zenith pg list +> brew install pg-neon +> neon pg create # creates pgdata with default pattern pgdata$i +> neon pg list ID PGDATA USED STORAGE ENDPOINT -primary1 pgdata1 0G zenith-local localhost:5432 +primary1 pgdata1 0G neon-local localhost:5432 ``` -## Import standalone postgres to zenith +## Import standalone postgres to neon ``` -> zenith snapshot import --from=basebackup://replication@localhost:5432/ oldpg +> neon snapshot import --from=basebackup://replication@localhost:5432/ oldpg [====================------------] 60% | 20MB/s -> zenith snapshot list +> neon snapshot list ID SIZE PARENT oldpg 5G - -> zenith pg create --snapshot oldpg +> neon pg create --snapshot oldpg Started postgres on localhost:5432 -> zenith pg list +> neon pg list ID PGDATA USED STORAGE ENDPOINT -primary1 pgdata1 5G zenith-local localhost:5432 +primary1 pgdata1 5G neon-local localhost:5432 -> zenith snapshot destroy oldpg +> neon snapshot destroy oldpg Ok ``` Also, we may start snapshot import implicitly by looking at snapshot schema ``` -> zenith pg create --snapshot basebackup://replication@localhost:5432/ +> neon pg create --snapshot basebackup://replication@localhost:5432/ Downloading snapshot... Done. Started postgres on localhost:5432 Destroying snapshot... Done. @@ -52,39 +52,39 @@ Destroying snapshot... Done. Since we may export the whole snapshot as one big file (tar of basebackup, maybe with some manifest) it may be shared over conventional means: http, ssh, [git+lfs](https://docs.github.com/en/github/managing-large-files/about-git-large-file-storage). ``` -> zenith pg create --snapshot http://learn-postgres.com/movies_db.zenith movies +> neon pg create --snapshot http://learn-postgres.com/movies_db.neon movies ``` ## Create snapshot and push it to the cloud ``` -> zenith snapshot create pgdata1@snap1 -> zenith snapshot push --to ssh://stas@zenith.tech pgdata1@snap1 +> neon snapshot create pgdata1@snap1 +> neon snapshot push --to ssh://stas@neon.tech pgdata1@snap1 ``` ## Rollback database to the snapshot -One way to rollback the database is just to init a new database from the snapshot and destroy the old one. But creating a new database from a snapshot would require a copy of that snapshot which is time consuming operation. Another option that would be cool to support is the ability to create the copy-on-write database from the snapshot without copying data, and store updated pages in a separate location, however that way would have performance implications. So to properly rollback the database to the older state we have `zenith pg checkout`. +One way to rollback the database is just to init a new database from the snapshot and destroy the old one. But creating a new database from a snapshot would require a copy of that snapshot which is time consuming operation. Another option that would be cool to support is the ability to create the copy-on-write database from the snapshot without copying data, and store updated pages in a separate location, however that way would have performance implications. So to properly rollback the database to the older state we have `neon pg checkout`. ``` -> zenith pg list +> neon pg list ID PGDATA USED STORAGE ENDPOINT -primary1 pgdata1 5G zenith-local localhost:5432 +primary1 pgdata1 5G neon-local localhost:5432 -> zenith snapshot create pgdata1@snap1 +> neon snapshot create pgdata1@snap1 -> zenith snapshot list +> neon snapshot list ID SIZE PARENT oldpg 5G - pgdata1@snap1 6G - pgdata1@CURRENT 6G - -> zenith pg checkout pgdata1@snap1 +> neon pg checkout pgdata1@snap1 Stopping postgres on pgdata1. Rolling back pgdata1@CURRENT to pgdata1@snap1. Starting postgres on pgdata1. -> zenith snapshot list +> neon snapshot list ID SIZE PARENT oldpg 5G - pgdata1@snap1 6G - @@ -99,7 +99,7 @@ Some notes: pgdata1@CURRENT -- implicit snapshot representing the current state PITR area acts like a continuous snapshot where you can reset the database to any point in time within this area (by area I mean some TTL period or some size limit, both possibly infinite). ``` -> zenith pitr create --storage s3tank --ttl 30d --name pitr_last_month +> neon pitr create --storage s3tank --ttl 30d --name pitr_last_month ``` Resetting the database to some state in past would require creating a snapshot on some lsn / time in this pirt area. @@ -108,29 +108,29 @@ Resetting the database to some state in past would require creating a snapshot o ## storage -Storage is either zenith pagestore or s3. Users may create a database in a pagestore and create/move *snapshots* and *pitr regions* in both pagestore and s3. Storage is a concept similar to `git remote`. After installation, I imagine one local storage is available by default. +Storage is either neon pagestore or s3. Users may create a database in a pagestore and create/move *snapshots* and *pitr regions* in both pagestore and s3. Storage is a concept similar to `git remote`. After installation, I imagine one local storage is available by default. -**zenith storage attach** -t [native|s3] -c key=value -n name +**neon storage attach** -t [native|s3] -c key=value -n name -Attaches/initializes storage. For --type=s3, user credentials and path should be provided. For --type=native we may support --path=/local/path and --url=zenith.tech/stas/mystore. Other possible term for native is 'zstore'. +Attaches/initializes storage. For --type=s3, user credentials and path should be provided. For --type=native we may support --path=/local/path and --url=neon.tech/stas/mystore. Other possible term for native is 'zstore'. -**zenith storage list** +**neon storage list** Show currently attached storages. For example: ``` -> zenith storage list +> neon storage list NAME USED TYPE OPTIONS PATH -local 5.1G zenith-local /opt/zenith/store/local -local.compr 20.4G zenith-local compression=on /opt/zenith/store/local.compr -zcloud 60G zenith-remote zenith.tech/stas/mystore +local 5.1G neon-local /opt/neon/store/local +local.compr 20.4G neon-local compression=on /opt/neon/store/local.compr +zcloud 60G neon-remote neon.tech/stas/mystore s3tank 80G S3 ``` -**zenith storage detach** +**neon storage detach** -**zenith storage show** +**neon storage show** @@ -140,29 +140,29 @@ Manages postgres data directories and can start postgres instances with proper c Pg is a term for a single postgres running on some data. I'm trying to avoid separation of datadir management and postgres instance management -- both that concepts bundled here together. -**zenith pg create** [--no-start --snapshot --cow] -s storage-name -n pgdata +**neon pg create** [--no-start --snapshot --cow] -s storage-name -n pgdata Creates (initializes) new data directory in given storage and starts postgres. I imagine that storage for this operation may be only local and data movement to remote location happens through snapshots/pitr. --no-start: just init datadir without creating ---snapshot snap: init from the snapshot. Snap is a name or URL (zenith.tech/stas/mystore/snap1) +--snapshot snap: init from the snapshot. Snap is a name or URL (neon.tech/stas/mystore/snap1) --cow: initialize Copy-on-Write data directory on top of some snapshot (makes sense if it is a snapshot of currently running a database) -**zenith pg destroy** +**neon pg destroy** -**zenith pg start** [--replica] pgdata +**neon pg start** [--replica] pgdata Start postgres with proper extensions preloaded/installed. -**zenith pg checkout** +**neon pg checkout** Rollback data directory to some previous snapshot. -**zenith pg stop** pg_id +**neon pg stop** pg_id -**zenith pg list** +**neon pg list** ``` ROLE PGDATA USED STORAGE ENDPOINT @@ -173,7 +173,7 @@ primary my_pg2 3.2G local.compr localhost:5435 - my_pg3 9.2G local.compr - ``` -**zenith pg show** +**neon pg show** ``` my_pg: @@ -194,7 +194,7 @@ my_pg: ``` -**zenith pg start-rest/graphql** pgdata +**neon pg start-rest/graphql** pgdata Starts REST/GraphQL proxy on top of postgres master. Not sure we should do that, just an idea. @@ -203,35 +203,35 @@ Starts REST/GraphQL proxy on top of postgres master. Not sure we should do that, Snapshot creation is cheap -- no actual data is copied, we just start retaining old pages. Snapshot size means the amount of retained data, not all data. Snapshot name looks like pgdata_name@tag_name. tag_name is set by the user during snapshot creation. There are some reserved tag names: CURRENT represents the current state of the data directory; HEAD{i} represents the data directory state that resided in the database before i-th checkout. -**zenith snapshot create** pgdata_name@snap_name +**neon snapshot create** pgdata_name@snap_name Creates a new snapshot in the same storage where pgdata_name exists. -**zenith snapshot push** --to url pgdata_name@snap_name +**neon snapshot push** --to url pgdata_name@snap_name -Produces binary stream of a given snapshot. Under the hood starts temp read-only postgres over this snapshot and sends basebackup stream. Receiving side should start `zenith snapshot recv` before push happens. If url has some special schema like zenith:// receiving side may require auth start `zenith snapshot recv` on the go. +Produces binary stream of a given snapshot. Under the hood starts temp read-only postgres over this snapshot and sends basebackup stream. Receiving side should start `neon snapshot recv` before push happens. If url has some special schema like neon:// receiving side may require auth start `neon snapshot recv` on the go. -**zenith snapshot recv** +**neon snapshot recv** Starts a port listening for a basebackup stream, prints connection info to stdout (so that user may use that in push command), and expects data on that socket. -**zenith snapshot pull** --from url or path +**neon snapshot pull** --from url or path -Connects to a remote zenith/s3/file and pulls snapshot. The remote site should be zenith service or files in our format. +Connects to a remote neon/s3/file and pulls snapshot. The remote site should be neon service or files in our format. -**zenith snapshot import** --from basebackup://<...> or path +**neon snapshot import** --from basebackup://<...> or path Creates a new snapshot out of running postgres via basebackup protocol or basebackup files. -**zenith snapshot export** +**neon snapshot export** -Starts read-only postgres over this snapshot and exports data in some format (pg_dump, or COPY TO on some/all tables). One of the options may be zenith own format which is handy for us (but I think just tar of basebackup would be okay). +Starts read-only postgres over this snapshot and exports data in some format (pg_dump, or COPY TO on some/all tables). One of the options may be neon own format which is handy for us (but I think just tar of basebackup would be okay). -**zenith snapshot diff** snap1 snap2 +**neon snapshot diff** snap1 snap2 Shows size of data changed between two snapshots. We also may provide options to diff schema/data in tables. To do that start temp read-only postgreses. -**zenith snapshot destroy** +**neon snapshot destroy** ## pitr @@ -239,7 +239,7 @@ Pitr represents wal stream and ttl policy for that stream XXX: any suggestions on a better name? -**zenith pitr create** name +**neon pitr create** name --ttl = inf | period @@ -247,21 +247,21 @@ XXX: any suggestions on a better name? --storage = storage_name -**zenith pitr extract-snapshot** pitr_name --lsn xxx +**neon pitr extract-snapshot** pitr_name --lsn xxx Creates a snapshot out of some lsn in PITR area. The obtained snapshot may be managed with snapshot routines (move/send/export) -**zenith pitr gc** pitr_name +**neon pitr gc** pitr_name Force garbage collection on some PITR area. -**zenith pitr list** +**neon pitr list** -**zenith pitr destroy** +**neon pitr destroy** ## console -**zenith console** +**neon console** Opens browser targeted at web console with the more or less same functionality as described here. diff --git a/docs/rfcs/004-durability.md b/docs/rfcs/004-durability.md index d4716156d1..6b83c77403 100644 --- a/docs/rfcs/004-durability.md +++ b/docs/rfcs/004-durability.md @@ -6,7 +6,7 @@ When do we consider the WAL record as durable, so that we can acknowledge the commit to the client and be reasonably certain that we will not lose the transaction? -Zenith uses a group of WAL safekeeper nodes to hold the generated WAL. +Neon uses a group of WAL safekeeper nodes to hold the generated WAL. A WAL record is considered durable, when it has been written to a majority of WAL safekeeper nodes. In this document, I use 5 safekeepers, because I have five fingers. A WAL record is durable, diff --git a/docs/rfcs/005-zenith_local.md b/docs/rfcs/005-zenith_local.md index e36d0a9ae3..6c283d7a37 100644 --- a/docs/rfcs/005-zenith_local.md +++ b/docs/rfcs/005-zenith_local.md @@ -1,23 +1,23 @@ -# Zenith local +# Neon local -Here I list some objectives to keep in mind when discussing zenith-local design and a proposal that brings all components together. Your comments on both parts are very welcome. +Here I list some objectives to keep in mind when discussing neon-local design and a proposal that brings all components together. Your comments on both parts are very welcome. #### Why do we need it? - For distribution - this easy to use binary will help us to build adoption among developers. - For internal use - to test all components together. -In my understanding, we consider it to be just a mock-up version of zenith-cloud. +In my understanding, we consider it to be just a mock-up version of neon-cloud. > Question: How much should we care about durability and security issues for a local setup? #### Why is it better than a simple local postgres? -- Easy one-line setup. As simple as `cargo install zenith && zenith start` +- Easy one-line setup. As simple as `cargo install neon && neon start` - Quick and cheap creation of compute nodes over the same storage. > Question: How can we describe a use-case for this feature? -- Zenith-local can work with S3 directly. +- Neon-local can work with S3 directly. - Push and pull images (snapshots) to remote S3 to exchange data with other users. @@ -31,50 +31,50 @@ Ideally, just one binary that incorporates all elements we need. #### Components: -- **zenith-CLI** - interface for end-users. Turns commands to REST requests and handles responses to show them in a user-friendly way. -CLI proposal is here https://github.com/libzenith/rfcs/blob/003-laptop-cli.md/003-laptop-cli.md -WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src/bin/cli +- **neon-CLI** - interface for end-users. Turns commands to REST requests and handles responses to show them in a user-friendly way. +CLI proposal is here https://github.com/neondatabase/rfcs/blob/003-laptop-cli.md/003-laptop-cli.md +WIP code is here: https://github.com/neondatabase/postgres/tree/main/pageserver/src/bin/cli -- **zenith-console** - WEB UI with same functionality as CLI. +- **neon-console** - WEB UI with same functionality as CLI. >Note: not for the first release. -- **zenith-local** - entrypoint. Service that starts all other components and handles REST API requests. See REST API proposal below. - > Idea: spawn all other components as child processes, so that we could shutdown everything by stopping zenith-local. +- **neon-local** - entrypoint. Service that starts all other components and handles REST API requests. See REST API proposal below. + > Idea: spawn all other components as child processes, so that we could shutdown everything by stopping neon-local. -- **zenith-pageserver** - consists of a storage and WAL-replaying service (modified PG in current implementation). +- **neon-pageserver** - consists of a storage and WAL-replaying service (modified PG in current implementation). > Question: Probably, for local setup we should be able to bypass page-storage and interact directly with S3 to avoid double caching in shared buffers and page-server? -WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src +WIP code is here: https://github.com/neondatabase/postgres/tree/main/pageserver/src -- **zenith-S3** - stores base images of the database and WAL in S3 object storage. Import and export images from/to zenith. +- **neon-S3** - stores base images of the database and WAL in S3 object storage. Import and export images from/to neon. > Question: How should it operate in a local setup? Will we manage it ourselves or ask user to provide credentials for existing S3 object storage (i.e. minio)? > Question: Do we use it together with local page store or they are interchangeable? WIP code is ??? -- **zenith-safekeeper** - receives WAL from postgres, stores it durably, answers to Postgres that "sync" is succeed. +- **neon-safekeeper** - receives WAL from postgres, stores it durably, answers to Postgres that "sync" is succeed. > Question: How should it operate in a local setup? In my understanding it should push WAL directly to S3 (if we use it) or store all data locally (if we use local page storage). The latter option seems meaningless (extra overhead and no gain), but it is still good to test the system. -WIP code is here: https://github.com/libzenith/postgres/tree/main/src/bin/safekeeper +WIP code is here: https://github.com/neondatabase/postgres/tree/main/src/bin/safekeeper -- **zenith-computenode** - bottomless PostgreSQL, ideally upstream, but for a start - our modified version. User can quickly create and destroy them and work with it as a regular postgres database. +- **neon-computenode** - bottomless PostgreSQL, ideally upstream, but for a start - our modified version. User can quickly create and destroy them and work with it as a regular postgres database. - WIP code is in main branch and here: https://github.com/libzenith/postgres/commits/compute_node + WIP code is in main branch and here: https://github.com/neondatabase/postgres/commits/compute_node #### REST API: Service endpoint: `http://localhost:3000` Resources: -- /storages - Where data lives: zenith-pageserver or zenith-s3 -- /pgs - Postgres - zenith-computenode +- /storages - Where data lives: neon-pageserver or neon-s3 +- /pgs - Postgres - neon-computenode - /snapshots - snapshots **TODO** ->Question: Do we want to extend this API to manage zenith components? I.e. start page-server, manage safekeepers and so on? Or they will be hardcoded to just start once and for all? +>Question: Do we want to extend this API to manage neon components? I.e. start page-server, manage safekeepers and so on? Or they will be hardcoded to just start once and for all? Methods and their mapping to CLI: -- /storages - zenith-pageserver or zenith-s3 +- /storages - neon-pageserver or neon-s3 CLI | REST API ------------- | ------------- @@ -84,7 +84,7 @@ storage list | GET /storages storage show -n name | GET /storages/:storage_name -- /pgs - zenith-computenode +- /pgs - neon-computenode CLI | REST API ------------- | ------------- diff --git a/docs/rfcs/006-laptop-cli-v2-CLI.md b/docs/rfcs/006-laptop-cli-v2-CLI.md index 84dc932211..5030ecc7e7 100644 --- a/docs/rfcs/006-laptop-cli-v2-CLI.md +++ b/docs/rfcs/006-laptop-cli-v2-CLI.md @@ -1,45 +1,45 @@ -Zenith CLI allows you to operate database clusters (catalog clusters) and their commit history locally and in the cloud. Since ANSI calls them catalog clusters and cluster is a loaded term in the modern infrastructure we will call it "catalog". +Neon CLI allows you to operate database clusters (catalog clusters) and their commit history locally and in the cloud. Since ANSI calls them catalog clusters and cluster is a loaded term in the modern infrastructure we will call it "catalog". # CLI v2 (after chatting with Carl) -Zenith introduces the notion of a repository. +Neon introduces the notion of a repository. ```bash -zenith init -zenith clone zenith://zenith.tech/piedpiper/northwind -- clones a repo to the northwind directory +neon init +neon clone neon://neon.tech/piedpiper/northwind -- clones a repo to the northwind directory ``` Once you have a cluster catalog you can explore it ```bash -zenith log -- returns a list of commits -zenith status -- returns if there are changes in the catalog that can be committed -zenith commit -- commits the changes and generates a new commit hash -zenith branch experimental -- creates a branch called testdb based on a given commit hash +neon log -- returns a list of commits +neon status -- returns if there are changes in the catalog that can be committed +neon commit -- commits the changes and generates a new commit hash +neon branch experimental -- creates a branch called testdb based on a given commit hash ``` To make changes in the catalog you need to run compute nodes ```bash -- here is how you a compute node -zenith start /home/pipedpiper/northwind:main -- starts a compute instance -zenith start zenith://zenith.tech/northwind:main -- starts a compute instance in the cloud +neon start /home/pipedpiper/northwind:main -- starts a compute instance +neon start neon://neon.tech/northwind:main -- starts a compute instance in the cloud -- you can start a compute node against any hash or branch -zenith start /home/pipedpiper/northwind:experimental --port 8008 -- start another compute instance (on different port) +neon start /home/pipedpiper/northwind:experimental --port 8008 -- start another compute instance (on different port) -- you can start a compute node against any hash or branch -zenith start /home/pipedpiper/northwind: --port 8009 -- start another compute instance (on different port) +neon start /home/pipedpiper/northwind: --port 8009 -- start another compute instance (on different port) -- After running some DML you can run --- zenith status and see how there are two WAL streams one on top of +-- neon status and see how there are two WAL streams one on top of -- the main branch -zenith status +neon status -- and another on top of the experimental branch -zenith status -b experimental +neon status -b experimental -- you can commit each branch separately -zenith commit main +neon commit main -- or -zenith commit -c /home/pipedpiper/northwind:experimental +neon commit -c /home/pipedpiper/northwind:experimental ``` Starting compute instances against cloud environments @@ -47,18 +47,18 @@ Starting compute instances against cloud environments ```bash -- you can start a compute instance against the cloud environment -- in this case all of the changes will be streamed into the cloud -zenith start https://zenith:tech/pipedpiper/northwind:main -zenith start https://zenith:tech/pipedpiper/northwind:main -zenith status -c https://zenith:tech/pipedpiper/northwind:main -zenith commit -c https://zenith:tech/pipedpiper/northwind:main -zenith branch -c https://zenith:tech/pipedpiper/northwind: experimental +neon start https://neon:tecj/pipedpiper/northwind:main +neon start https://neon:tecj/pipedpiper/northwind:main +neon status -c https://neon:tecj/pipedpiper/northwind:main +neon commit -c https://neon:tecj/pipedpiper/northwind:main +neon branch -c https://neon:tecj/pipedpiper/northwind: experimental ``` Pushing data into the cloud ```bash -- pull all the commits from the cloud -zenith pull +neon pull -- push all the commits to the cloud -zenith push +neon push ``` diff --git a/docs/rfcs/006-laptop-cli-v2-repository-structure.md b/docs/rfcs/006-laptop-cli-v2-repository-structure.md index e6e6e172ad..749a940313 100644 --- a/docs/rfcs/006-laptop-cli-v2-repository-structure.md +++ b/docs/rfcs/006-laptop-cli-v2-repository-structure.md @@ -1,14 +1,14 @@ # Repository format -A Zenith repository is similar to a traditional PostgreSQL backup +A Neon repository is similar to a traditional PostgreSQL backup archive, like a WAL-G bucket or pgbarman backup catalogue. It holds multiple versions of a PostgreSQL database cluster. -The distinguishing feature is that you can launch a Zenith Postgres +The distinguishing feature is that you can launch a Neon Postgres server directly against a branch in the repository, without having to -"restore" it first. Also, Zenith manages the storage automatically, +"restore" it first. Also, Neon manages the storage automatically, there is no separation between full and incremental backups nor WAL -archive. Zenith relies heavily on the WAL, and uses concepts similar +archive. Neon relies heavily on the WAL, and uses concepts similar to incremental backups and WAL archiving internally, but it is hidden from the user. @@ -19,15 +19,15 @@ efficient. Just something to get us started. The repository directory looks like this: - .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/wal/ - .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/snapshots// - .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/history + .neon/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/wal/ + .neon/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/snapshots// + .neon/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/history - .zenith/refs/branches/mybranch - .zenith/refs/tags/foo - .zenith/refs/tags/bar + .neon/refs/branches/mybranch + .neon/refs/tags/foo + .neon/refs/tags/bar - .zenith/datadirs/ + .neon/datadirs/ ### Timelines @@ -39,7 +39,7 @@ All WAL is generated on a timeline. You can launch a read-only node against a tag or arbitrary LSN on a timeline, but in order to write, you need to create a timeline. -Each timeline is stored in a directory under .zenith/timelines. It +Each timeline is stored in a directory under .neon/timelines. It consists of a WAL archive, containing all the WAL in the standard PostgreSQL format, under the wal/ subdirectory. @@ -66,18 +66,18 @@ contains the UUID of the timeline (and LSN, for tags). ### Datadirs -.zenith/datadirs contains PostgreSQL data directories. You can launch +.neon/datadirs contains PostgreSQL data directories. You can launch a Postgres instance on one of them with: ``` - postgres -D .zenith/datadirs/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c + postgres -D .neon/datadirs/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c ``` All the actual data is kept in the timeline directories, under -.zenith/timelines. The data directories are only needed for active +.neon/timelines. The data directories are only needed for active PostgreQSL instances. After an instance is stopped, the data directory -can be safely removed. "zenith start" will recreate it quickly from -the data in .zenith/timelines, if it's missing. +can be safely removed. "neon start" will recreate it quickly from +the data in .neon/timelines, if it's missing. ## Version 2 @@ -103,14 +103,14 @@ more advanced. The exact format is TODO. But it should support: ### Garbage collection -When you run "zenith gc", old timelines that are no longer needed are +When you run "neon gc", old timelines that are no longer needed are removed. That involves collecting the list of "unreachable" objects, starting from the named branches and tags. Also, if enough WAL has been generated on a timeline since last snapshot, a new snapshot or delta is created. -### zenith push/pull +### neon push/pull Compare the tags and branches on both servers, and copy missing ones. For each branch, compare the timeline it points to in both servers. If @@ -123,7 +123,7 @@ every time you start up an instance? Then you would detect that the timelines have diverged. That would match with the "epoch" concept that we have in the WAL safekeeper -### zenith checkout/commit +### neon checkout/commit In this format, there is no concept of a "working tree", and hence no concept of checking out or committing. All modifications are done on @@ -134,7 +134,7 @@ You can easily fork off a temporary timeline to emulate a "working tree". You can later remove it and have it garbage collected, or to "commit", re-point the branch to the new timeline. -If we want to have a worktree and "zenith checkout/commit" concept, we can +If we want to have a worktree and "neon checkout/commit" concept, we can emulate that with a temporary timeline. Create the temporary timeline at -"zenith checkout", and have "zenith commit" modify the branch to point to +"neon checkout", and have "neon commit" modify the branch to point to the new timeline. diff --git a/docs/rfcs/007-serverless-on-laptop.md b/docs/rfcs/007-serverless-on-laptop.md index e6355f4a03..96f117bfe9 100644 --- a/docs/rfcs/007-serverless-on-laptop.md +++ b/docs/rfcs/007-serverless-on-laptop.md @@ -4,27 +4,27 @@ How it works now 1. Create repository, start page server on it ``` -$ zenith init +$ neon init ... created main branch -new zenith repository was created in .zenith +new neon repository was created in .neon -$ zenith pageserver start -Starting pageserver at '127.0.0.1:64000' in .zenith +$ neon pageserver start +Starting pageserver at '127.0.0.1:64000' in .neon Page server started ``` 2. Create a branch, and start a Postgres instance on it ``` -$ zenith branch heikki main +$ neon branch heikki main branching at end of WAL: 0/15ECF68 -$ zenith pg create heikki +$ neon pg create heikki Initializing Postgres on timeline 76cf9279915be7797095241638e64644... -Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/pg1 port=55432 +Extracting base backup to create postgres instance: path=.neon/pgdatadirs/pg1 port=55432 -$ zenith pg start pg1 +$ neon pg start pg1 Starting postgres node at 'host=127.0.0.1 port=55432 user=heikki' waiting for server to start.... done server started @@ -52,20 +52,20 @@ serverless on your laptop, so that the workflow becomes just: 1. Create repository, start page server on it (same as before) ``` -$ zenith init +$ neon init ... created main branch -new zenith repository was created in .zenith +new neon repository was created in .neon -$ zenith pageserver start -Starting pageserver at '127.0.0.1:64000' in .zenith +$ neon pageserver start +Starting pageserver at '127.0.0.1:64000' in .neon Page server started ``` 2. Create branch ``` -$ zenith branch heikki main +$ neon branch heikki main branching at end of WAL: 0/15ECF68 ``` diff --git a/docs/rfcs/008-push-pull.md b/docs/rfcs/008-push-pull.md index 272628e1ce..a36932222a 100644 --- a/docs/rfcs/008-push-pull.md +++ b/docs/rfcs/008-push-pull.md @@ -7,22 +7,22 @@ Here is a proposal about implementing push/pull mechanics between pageservers. W The origin represents connection info for some remote pageserver. Let's use here same commands as git uses except using explicit list subcommand (git uses `origin -v` for that). ``` -zenith origin add -zenith origin list -zenith origin remove +neon origin add +neon origin list +neon origin remove ``` Connection URI a string of form `postgresql://user:pass@hostname:port` (https://www.postgresql.org/docs/13/libpq-connect.html#id-1.7.3.8.3.6). We can start with libpq password auth and later add support for client certs or require ssh as transport or invent some other kind of transport. -Behind the scenes, this commands may update toml file inside .zenith directory. +Behind the scenes, this commands may update toml file inside .neon directory. ## Push ### Pushing branch ``` -zenith push mybranch cloudserver # push to eponymous branch in cloudserver -zenith push mybranch cloudserver:otherbranch # push to a different branch in cloudserver +neon push mybranch cloudserver # push to eponymous branch in cloudserver +neon push mybranch cloudserver:otherbranch # push to a different branch in cloudserver ``` Exact mechanics would be slightly different in the following situations: diff --git a/docs/rfcs/009-snapshot-first-storage-cli.md b/docs/rfcs/009-snapshot-first-storage-cli.md index 0acbd68f86..bbd0f75fe2 100644 --- a/docs/rfcs/009-snapshot-first-storage-cli.md +++ b/docs/rfcs/009-snapshot-first-storage-cli.md @@ -2,7 +2,7 @@ While working on export/import commands, I understood that they fit really well We may think about backups as snapshots in a different format (i.e plain pgdata format, basebackup tar format, WAL-G format (if they want to support it) and so on). They use same storage API, the only difference is the code that packs/unpacks files. -Even if zenith aims to maintains durability using it's own snapshots, backups will be useful for uploading data from postgres to zenith. +Even if neon aims to maintains durability using it's own snapshots, backups will be useful for uploading data from postgres to neon. So here is an attempt to design consistent CLI for different usage scenarios: @@ -16,8 +16,8 @@ Save`storage_dest` and other parameters in config. Push snapshots to `storage_dest` in background. ``` -zenith init --storage_dest=S3_PREFIX -zenith start +neon init --storage_dest=S3_PREFIX +neon start ``` #### 2. Restart pageserver (manually or crash-recovery). @@ -25,7 +25,7 @@ Take `storage_dest` from pageserver config, start pageserver from latest snapsho Push snapshots to `storage_dest` in background. ``` -zenith start +neon start ``` #### 3. Import. @@ -35,22 +35,22 @@ Do not save `snapshot_path` and `snapshot_format` in config, as it is a one-time Save`storage_dest` parameters in config. Push snapshots to `storage_dest` in background. ``` -//I.e. we want to start zenith on top of existing $PGDATA and use s3 as a persistent storage. -zenith init --snapshot_path=FILE_PREFIX --snapshot_format=pgdata --storage_dest=S3_PREFIX -zenith start +//I.e. we want to start neon on top of existing $PGDATA and use s3 as a persistent storage. +neon init --snapshot_path=FILE_PREFIX --snapshot_format=pgdata --storage_dest=S3_PREFIX +neon start ``` How to pass credentials needed for `snapshot_path`? #### 4. Export. Manually push snapshot to `snapshot_path` which differs from `storage_dest` -Optionally set `snapshot_format`, which can be plain pgdata format or zenith format. +Optionally set `snapshot_format`, which can be plain pgdata format or neon format. ``` -zenith export --snapshot_path=FILE_PREFIX --snapshot_format=pgdata +neon export --snapshot_path=FILE_PREFIX --snapshot_format=pgdata ``` #### Notes and questions - safekeeper s3_offload should use same (similar) syntax for storage. How to set it in UI? -- Why do we need `zenith init` as a separate command? Can't we init everything at first start? +- Why do we need `neon init` as a separate command? Can't we init everything at first start? - We can think of better names for all options. - Export to plain postgres format will be useless, if we are not 100% compatible on page level. I can recall at least one such difference - PD_WAL_LOGGED flag in pages. diff --git a/docs/rfcs/013-term-history.md b/docs/rfcs/013-term-history.md index 7e815abf73..2f3ccbc09b 100644 --- a/docs/rfcs/013-term-history.md +++ b/docs/rfcs/013-term-history.md @@ -9,7 +9,7 @@ receival and this might lag behind `term`; safekeeper switches to epoch `n` when it has received all committed log records from all `< n` terms. This roughly corresponds to proposed in -https://github.com/zenithdb/rfcs/pull/3/files +https://github.com/neondatabase/rfcs/pull/3/files This makes our biggest our difference from Raft. In Raft, every log record is diff --git a/docs/rfcs/014-safekeepers-gossip.md b/docs/rfcs/014-safekeepers-gossip.md index 3d6cc04b94..ff38a0a0ef 100644 --- a/docs/rfcs/014-safekeepers-gossip.md +++ b/docs/rfcs/014-safekeepers-gossip.md @@ -1,6 +1,6 @@ # Safekeeper gossip -Extracted from this [PR](https://github.com/zenithdb/rfcs/pull/13) +Extracted from this [PR](https://github.com/neondatabase/rfcs/pull/13) ## Motivation diff --git a/docs/rfcs/015-storage-messaging.md b/docs/rfcs/015-storage-messaging.md index a415b90459..7702311d65 100644 --- a/docs/rfcs/015-storage-messaging.md +++ b/docs/rfcs/015-storage-messaging.md @@ -2,7 +2,7 @@ Created on 19.01.22 -Initially created [here](https://github.com/zenithdb/rfcs/pull/16) by @kelvich. +Initially created [here](https://github.com/neondatabase/rfcs/pull/16) by @kelvich. That it is an alternative to (014-safekeeper-gossip)[] @@ -292,4 +292,4 @@ But with an etcd we are in a bit different situation: 1. We don't need persistency and strong consistency guarantees for the data we store in the etcd 2. etcd uses Grpc as a protocol, and messages are pretty simple -So it looks like implementing in-mem store with etcd interface is straightforward thing _if we will want that in future_. At the same time, we can avoid implementing it right now, and we will be able to run local zenith installation with etcd running somewhere in the background (as opposed to building and running console, which in turn requires Postgres). +So it looks like implementing in-mem store with etcd interface is straightforward thing _if we will want that in future_. At the same time, we can avoid implementing it right now, and we will be able to run local neon installation with etcd running somewhere in the background (as opposed to building and running console, which in turn requires Postgres). diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 744fc18e61..22b0a18933 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -29,7 +29,6 @@ pub mod launch_timestamp; mod wrappers; pub use wrappers::{CountedReader, CountedWriter}; mod hll; -pub mod metric_vec_duration; pub use hll::{HyperLogLog, HyperLogLogVec}; #[cfg(target_os = "linux")] pub mod more_process_metrics; diff --git a/libs/metrics/src/metric_vec_duration.rs b/libs/metrics/src/metric_vec_duration.rs deleted file mode 100644 index e9a0a65570..0000000000 --- a/libs/metrics/src/metric_vec_duration.rs +++ /dev/null @@ -1,23 +0,0 @@ -//! Helpers for observing duration on `HistogramVec` / `CounterVec` / `GaugeVec` / `MetricVec`. - -use std::{future::Future, time::Instant}; - -pub trait DurationResultObserver { - fn observe_result(&self, res: &Result, duration: std::time::Duration); -} - -pub async fn observe_async_block_duration_by_result< - T, - E, - F: Future>, - O: DurationResultObserver, ->( - observer: &O, - block: F, -) -> Result { - let start = Instant::now(); - let result = block.await; - let duration = start.elapsed(); - observer.observe_result(&result, duration); - result -} diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 64b70a1a51..38e61239c5 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -125,5 +125,45 @@ impl From for String { } } +/// Controls how tenant shards are mapped to locations on pageservers, e.g. whether +/// to create secondary locations. +#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)] +pub enum PlacementPolicy { + /// Cheapest way to attach a tenant: just one pageserver, no secondary + Single, + /// Production-ready way to attach a tenant: one attached pageserver and + /// some number of secondaries. + Double(usize), + /// Create one secondary mode locations. This is useful when onboarding + /// a tenant, or for an idle tenant that we might want to bring online quickly. + Secondary, + + /// Do not attach to any pageservers. This is appropriate for tenants that + /// have been idle for a long time, where we do not mind some delay in making + /// them available in future. + Detached, +} + #[derive(Serialize, Deserialize, Debug)] pub struct TenantShardMigrateResponse {} + +#[cfg(test)] +mod test { + use super::*; + use serde_json; + + /// Check stability of PlacementPolicy's serialization + #[test] + fn placement_policy_encoding() -> anyhow::Result<()> { + let v = PlacementPolicy::Double(1); + let encoded = serde_json::to_string(&v)?; + assert_eq!(encoded, "{\"Double\":1}"); + assert_eq!(serde_json::from_str::(&encoded)?, v); + + let v = PlacementPolicy::Single; + let encoded = serde_json::to_string(&v)?; + assert_eq!(encoded, "\"Single\""); + assert_eq!(serde_json::from_str::(&encoded)?, v); + Ok(()) + } +} diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index d583866290..fe5bbd1c06 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -21,6 +21,7 @@ use utils::{ lsn::Lsn, }; +use crate::controller_api::PlacementPolicy; use crate::{ reltag::RelTag, shard::{ShardCount, ShardStripeSize, TenantShardId}, @@ -242,6 +243,11 @@ pub struct TenantCreateRequest { #[serde(skip_serializing_if = "ShardParameters::is_unsharded")] pub shard_parameters: ShardParameters, + // This parameter is only meaningful in requests sent to the storage controller + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub placement_policy: Option, + #[serde(flatten)] pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it } @@ -435,6 +441,8 @@ pub struct TenantShardLocation { #[serde(deny_unknown_fields)] pub struct TenantLocationConfigResponse { pub shards: Vec, + // If the shards' ShardCount count is >1, stripe_size will be set. + pub stripe_size: Option, } #[derive(Serialize, Deserialize, Debug)] diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 73d25619c3..260018ad89 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -6,7 +6,6 @@ #![deny(clippy::undocumented_unsafe_blocks)] use anyhow::Context; use bytes::Bytes; -use futures::pin_mut; use serde::{Deserialize, Serialize}; use std::io::ErrorKind; use std::net::SocketAddr; @@ -378,8 +377,7 @@ impl PostgresBackend { &mut self, cx: &mut std::task::Context<'_>, ) -> Poll> { - let flush_fut = self.flush(); - pin_mut!(flush_fut); + let flush_fut = std::pin::pin!(self.flush()); flush_fut.poll(cx) } diff --git a/libs/postgres_backend/tests/simple_select.rs b/libs/postgres_backend/tests/simple_select.rs index e046fa5260..80df9db858 100644 --- a/libs/postgres_backend/tests/simple_select.rs +++ b/libs/postgres_backend/tests/simple_select.rs @@ -72,14 +72,19 @@ async fn simple_select() { } } -static KEY: Lazy = Lazy::new(|| { +static KEY: Lazy> = Lazy::new(|| { let mut cursor = Cursor::new(include_bytes!("key.pem")); - rustls::PrivateKey(rustls_pemfile::rsa_private_keys(&mut cursor).unwrap()[0].clone()) + let key = rustls_pemfile::rsa_private_keys(&mut cursor) + .next() + .unwrap() + .unwrap(); + rustls::pki_types::PrivateKeyDer::Pkcs1(key) }); -static CERT: Lazy = Lazy::new(|| { +static CERT: Lazy> = Lazy::new(|| { let mut cursor = Cursor::new(include_bytes!("cert.pem")); - rustls::Certificate(rustls_pemfile::certs(&mut cursor).unwrap()[0].clone()) + let cert = rustls_pemfile::certs(&mut cursor).next().unwrap().unwrap(); + cert }); // test that basic select with ssl works @@ -88,9 +93,8 @@ async fn simple_select_ssl() { let (client_sock, server_sock) = make_tcp_pair().await; let server_cfg = rustls::ServerConfig::builder() - .with_safe_defaults() .with_no_client_auth() - .with_single_cert(vec![CERT.clone()], KEY.clone()) + .with_single_cert(vec![CERT.clone()], KEY.clone_key()) .unwrap(); let tls_config = Some(Arc::new(server_cfg)); let pgbackend = @@ -102,10 +106,9 @@ async fn simple_select_ssl() { }); let client_cfg = rustls::ClientConfig::builder() - .with_safe_defaults() .with_root_certificates({ let mut store = rustls::RootCertStore::empty(); - store.add(&CERT).unwrap(); + store.add(CERT.clone()).unwrap(); store }) .with_no_client_auth(); diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index e927b40e80..d8b9824d99 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -17,6 +17,7 @@ use remote_storage::{ }; use test_context::test_context; use test_context::AsyncTestContext; +use tokio::io::AsyncBufReadExt; use tokio_util::sync::CancellationToken; use tracing::info; @@ -484,32 +485,33 @@ async fn download_is_cancelled(ctx: &mut MaybeEnabledStorage) { )) .unwrap(); - let len = upload_large_enough_file(&ctx.client, &path, &cancel).await; + let file_len = upload_large_enough_file(&ctx.client, &path, &cancel).await; { - let mut stream = ctx + let stream = ctx .client .download(&path, &cancel) .await .expect("download succeeds") .download_stream; - let first = stream - .next() - .await - .expect("should have the first blob") - .expect("should have succeeded"); + let mut reader = std::pin::pin!(tokio_util::io::StreamReader::new(stream)); - tracing::info!(len = first.len(), "downloaded first chunk"); + let first = reader.fill_buf().await.expect("should have the first blob"); + + let len = first.len(); + tracing::info!(len, "downloaded first chunk"); assert!( - first.len() < len, + first.len() < file_len, "uploaded file is too small, we downloaded all on first chunk" ); + reader.consume(len); + cancel.cancel(); - let next = stream.next().await.expect("stream should have more"); + let next = reader.fill_buf().await; let e = next.expect_err("expected an error, but got a chunk?"); @@ -520,6 +522,10 @@ async fn download_is_cancelled(ctx: &mut MaybeEnabledStorage) { .is_some_and(|e| matches!(e, DownloadError::Cancelled)), "{inner:?}" ); + + let e = DownloadError::from(e); + + assert!(matches!(e, DownloadError::Cancelled), "{e:?}"); } let cancel = CancellationToken::new(); diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 969d0d99c0..732eb951c9 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -7,7 +7,7 @@ use utils::{ pub mod util; -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct Client { mgmt_api_endpoint: String, authorization_header: Option, @@ -24,6 +24,9 @@ pub enum Error { #[error("pageserver API: {1}")] ApiError(StatusCode, String), + + #[error("Cancelled")] + Cancelled, } pub type Result = std::result::Result; @@ -251,21 +254,30 @@ impl Client { tenant_shard_id: TenantShardId, config: LocationConfig, flush_ms: Option, + lazy: bool, ) -> Result<()> { let req_body = TenantLocationConfigRequest { tenant_id: tenant_shard_id, config, }; - let path = format!( + + let mut path = reqwest::Url::parse(&format!( "{}/v1/tenant/{}/location_config", self.mgmt_api_endpoint, tenant_shard_id - ); - let path = if let Some(flush_ms) = flush_ms { - format!("{}?flush_ms={}", path, flush_ms.as_millis()) - } else { - path - }; - self.request(Method::PUT, &path, &req_body).await?; + )) + // Should always work: mgmt_api_endpoint is configuration, not user input. + .expect("Cannot build URL"); + + if lazy { + path.query_pairs_mut().append_pair("lazy", "true"); + } + + if let Some(flush_ms) = flush_ms { + path.query_pairs_mut() + .append_pair("flush_ms", &format!("{}", flush_ms.as_millis())); + } + + self.request(Method::PUT, path, &req_body).await?; Ok(()) } @@ -278,6 +290,21 @@ impl Client { .map_err(Error::ReceiveBody) } + pub async fn get_location_config( + &self, + tenant_shard_id: TenantShardId, + ) -> Result> { + let path = format!( + "{}/v1/location_config/{tenant_shard_id}", + self.mgmt_api_endpoint + ); + self.request(Method::GET, &path, ()) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + pub async fn timeline_create( &self, tenant_shard_id: TenantShardId, diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs index 52219a014c..60fc7ac925 100644 --- a/pageserver/compaction/src/compact_tiered.rs +++ b/pageserver/compaction/src/compact_tiered.rs @@ -63,7 +63,7 @@ pub async fn compact_tiered( ); // Identify the range of LSNs that belong to this level. We assume that - // each file in this level span an LSN range up to 1.75x target file + // each file in this level spans an LSN range up to 1.75x target file // size. That should give us enough slop that if we created a slightly // oversized L0 layer, e.g. because flushing the in-memory layer was // delayed for some reason, we don't consider the oversized layer to @@ -248,7 +248,6 @@ enum CompactionStrategy { CreateImage, } -#[allow(dead_code)] // Todo struct CompactionJob { key_range: Range, lsn_range: Range, @@ -345,7 +344,7 @@ where /// /// TODO: Currently, this is called exactly once for the level, and we /// decide whether to create new image layers to cover the whole level, or - /// write a new set of delta. In the future, this should try to partition + /// write a new set of deltas. In the future, this should try to partition /// the key space, and make the decision separately for each partition. async fn divide_job(&mut self, job_id: JobId, ctx: &E::RequestContext) -> anyhow::Result<()> { let job = &self.jobs[job_id.0]; @@ -709,18 +708,6 @@ where } } -// Sliding window through keyspace and values -// -// This is used to decide what layer to write next, from the beginning of the window. -// -// Candidates: -// -// 1. Create an image layer, snapping to previous images -// 2. Create a delta layer, snapping to previous images -// 3. Create an image layer, snapping to -// -// - // Take previous partitioning, based on the image layers below. // // Candidate is at the front: @@ -739,6 +726,10 @@ struct WindowElement { last_key: K, // inclusive accum_size: u64, } + +// Sliding window through keyspace and values +// +// This is used to decide what layer to write next, from the beginning of the window. struct Window { elems: VecDeque>, diff --git a/pageserver/compaction/src/identify_levels.rs b/pageserver/compaction/src/identify_levels.rs index ef388fd92b..98dd46925c 100644 --- a/pageserver/compaction/src/identify_levels.rs +++ b/pageserver/compaction/src/identify_levels.rs @@ -1,5 +1,5 @@ -//! An LSM tree consists of multiple levels, each exponential larger than the -//! previous level. And each level consists of be multiple "tiers". With tiered +//! An LSM tree consists of multiple levels, each exponentially larger than the +//! previous level. And each level consists of multiple "tiers". With tiered //! compaction, a level is compacted when it has accumulated more than N tiers, //! forming one tier on the next level. //! @@ -170,13 +170,6 @@ where }) } -// helper struct used in depth() -struct Event { - key: K, - layer_idx: usize, - start: bool, -} - impl Level { /// Count the number of deltas stacked on each other. pub fn depth(&self) -> u64 @@ -184,6 +177,11 @@ impl Level { K: CompactionKey, L: CompactionLayer, { + struct Event { + key: K, + layer_idx: usize, + start: bool, + } let mut events: Vec> = Vec::new(); for (idx, l) in self.layers.iter().enumerate() { events.push(Event { @@ -202,7 +200,7 @@ impl Level { // Sweep the key space left to right. Stop at each distinct key, and // count the number of deltas on top of the highest image at that key. // - // This is a little enefficient, as we walk through the active_set on + // This is a little inefficient, as we walk through the active_set on // every key. We could increment/decrement a counter on each step // instead, but that'd require a bit more complex bookkeeping. let mut active_set: BTreeSet<(Lsn, bool, usize)> = BTreeSet::new(); @@ -236,6 +234,7 @@ impl Level { } } } + debug_assert_eq!(active_set, BTreeSet::new()); max_depth } } diff --git a/pageserver/compaction/src/interface.rs b/pageserver/compaction/src/interface.rs index 979ceebf0e..2bb2e749c0 100644 --- a/pageserver/compaction/src/interface.rs +++ b/pageserver/compaction/src/interface.rs @@ -4,12 +4,12 @@ //! All the heavy lifting is done by the create_image and create_delta //! functions that the implementor provides. use async_trait::async_trait; +use futures::Future; use pageserver_api::{key::Key, keyspace::key_range_size}; use std::ops::Range; use utils::lsn::Lsn; /// Public interface. This is the main thing that the implementor needs to provide -#[async_trait] pub trait CompactionJobExecutor { // Type system. // @@ -17,8 +17,7 @@ pub trait CompactionJobExecutor { // compaction doesn't distinguish whether they are stored locally or // remotely. // - // The keyspace is defined by CompactionKey trait. - // + // The keyspace is defined by the CompactionKey trait. type Key: CompactionKey; type Layer: CompactionLayer + Clone; @@ -35,27 +34,27 @@ pub trait CompactionJobExecutor { // ---- /// Return all layers that overlap the given bounding box. - async fn get_layers( + fn get_layers( &mut self, key_range: &Range, lsn_range: &Range, ctx: &Self::RequestContext, - ) -> anyhow::Result>; + ) -> impl Future>> + Send; - async fn get_keyspace( + fn get_keyspace( &mut self, key_range: &Range, lsn: Lsn, ctx: &Self::RequestContext, - ) -> anyhow::Result>; + ) -> impl Future>> + Send; /// NB: This is a pretty expensive operation. In the real pageserver /// implementation, it downloads the layer, and keeps it resident /// until the DeltaLayer is dropped. - async fn downcast_delta_layer( + fn downcast_delta_layer( &self, layer: &Self::Layer, - ) -> anyhow::Result>; + ) -> impl Future>> + Send; // ---- // Functions to execute the plan @@ -63,33 +62,33 @@ pub trait CompactionJobExecutor { /// Create a new image layer, materializing all the values in the key range, /// at given 'lsn'. - async fn create_image( + fn create_image( &mut self, lsn: Lsn, key_range: &Range, ctx: &Self::RequestContext, - ) -> anyhow::Result<()>; + ) -> impl Future> + Send; /// Create a new delta layer, containing all the values from 'input_layers' /// in the given key and LSN range. - async fn create_delta( + fn create_delta( &mut self, lsn_range: &Range, key_range: &Range, input_layers: &[Self::DeltaLayer], ctx: &Self::RequestContext, - ) -> anyhow::Result<()>; + ) -> impl Future> + Send; /// Delete a layer. The compaction implementation will call this only after /// all the create_image() or create_delta() calls that deletion of this /// layer depends on have finished. But if the implementor has extra lazy - /// background tasks, like uploading the index json file to remote storage, + /// background tasks, like uploading the index json file to remote storage. /// it is the implementation's responsibility to track those. - async fn delete_layer( + fn delete_layer( &mut self, layer: &Self::Layer, ctx: &Self::RequestContext, - ) -> anyhow::Result<()>; + ) -> impl Future> + Send; } pub trait CompactionKey: std::cmp::Ord + Clone + Copy + std::fmt::Display { diff --git a/pageserver/compaction/src/simulator.rs b/pageserver/compaction/src/simulator.rs index 6d07038dcd..def7983e75 100644 --- a/pageserver/compaction/src/simulator.rs +++ b/pageserver/compaction/src/simulator.rs @@ -429,7 +429,6 @@ impl From<&Arc> for MockLayer { } } -#[async_trait] impl interface::CompactionJobExecutor for MockTimeline { type Key = Key; type Layer = MockLayer; diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index ee331ea154..86d0390c30 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -88,13 +88,16 @@ use crate::task_mgr::TaskKind; +pub(crate) mod optional_counter; + // The main structure of this module, see module-level comment. -#[derive(Clone, Debug)] +#[derive(Debug)] pub struct RequestContext { task_kind: TaskKind, download_behavior: DownloadBehavior, access_stats_behavior: AccessStatsBehavior, page_content_kind: PageContentKind, + pub micros_spent_throttled: optional_counter::MicroSecondsCounterU32, } /// The kind of access to the page cache. @@ -150,6 +153,7 @@ impl RequestContextBuilder { download_behavior: DownloadBehavior::Download, access_stats_behavior: AccessStatsBehavior::Update, page_content_kind: PageContentKind::Unknown, + micros_spent_throttled: Default::default(), }, } } @@ -163,6 +167,7 @@ impl RequestContextBuilder { download_behavior: original.download_behavior, access_stats_behavior: original.access_stats_behavior, page_content_kind: original.page_content_kind, + micros_spent_throttled: Default::default(), }, } } diff --git a/pageserver/src/context/optional_counter.rs b/pageserver/src/context/optional_counter.rs new file mode 100644 index 0000000000..100c649f18 --- /dev/null +++ b/pageserver/src/context/optional_counter.rs @@ -0,0 +1,101 @@ +use std::{ + sync::atomic::{AtomicU32, Ordering}, + time::Duration, +}; + +#[derive(Debug)] +pub struct CounterU32 { + inner: AtomicU32, +} +impl Default for CounterU32 { + fn default() -> Self { + Self { + inner: AtomicU32::new(u32::MAX), + } + } +} +impl CounterU32 { + pub fn open(&self) -> Result<(), &'static str> { + match self + .inner + .compare_exchange(u32::MAX, 0, Ordering::Relaxed, Ordering::Relaxed) + { + Ok(_) => Ok(()), + Err(_) => Err("open() called on clsoed state"), + } + } + pub fn close(&self) -> Result { + match self.inner.swap(u32::MAX, Ordering::Relaxed) { + u32::MAX => Err("close() called on closed state"), + x => Ok(x), + } + } + + pub fn add(&self, count: u32) -> Result<(), &'static str> { + if count == 0 { + return Ok(()); + } + let mut had_err = None; + self.inner + .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |cur| match cur { + u32::MAX => { + had_err = Some("add() called on closed state"); + None + } + x => { + let (new, overflowed) = x.overflowing_add(count); + if new == u32::MAX || overflowed { + had_err = Some("add() overflowed the counter"); + None + } else { + Some(new) + } + } + }) + .map_err(|_| had_err.expect("we set it whenever the function returns None")) + .map(|_| ()) + } +} + +#[derive(Default, Debug)] +pub struct MicroSecondsCounterU32 { + inner: CounterU32, +} + +impl MicroSecondsCounterU32 { + pub fn open(&self) -> Result<(), &'static str> { + self.inner.open() + } + pub fn add(&self, duration: Duration) -> Result<(), &'static str> { + match duration.as_micros().try_into() { + Ok(x) => self.inner.add(x), + Err(_) => Err("add(): duration conversion error"), + } + } + pub fn close_and_checked_sub_from(&self, from: Duration) -> Result { + let val = self.inner.close()?; + let val = Duration::from_micros(val as u64); + let subbed = match from.checked_sub(val) { + Some(v) => v, + None => return Err("Duration::checked_sub"), + }; + Ok(subbed) + } +} + +#[cfg(test)] +mod tests { + + use super::*; + + #[test] + fn test_basic() { + let counter = MicroSecondsCounterU32::default(); + counter.open().unwrap(); + counter.add(Duration::from_micros(23)).unwrap(); + let res = counter + .close_and_checked_sub_from(Duration::from_micros(42)) + .unwrap(); + assert_eq!(res, Duration::from_micros(42 - 23)); + } +} diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 19b5fb7e79..d924224a32 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -1339,6 +1339,10 @@ components: type: array items: $ref: "#/components/schemas/TenantShardLocation" + stripe_size: + description: If multiple shards are present, this field contains the sharding stripe size, else it is null. + type: integer + nullable: true TenantShardLocation: type: object required: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 9d92fbaee0..eafad9ab73 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -14,6 +14,7 @@ use hyper::header; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; +use pageserver_api::models::LocationConfig; use pageserver_api::models::LocationConfigListResponse; use pageserver_api::models::ShardParameters; use pageserver_api::models::TenantDetails; @@ -1451,11 +1452,12 @@ async fn put_tenant_location_config_handler( tenant::SpawnMode::Eager }; - let attached = state + let tenant = state .tenant_manager .upsert_location(tenant_shard_id, location_conf, flush, spawn_mode, &ctx) - .await? - .is_some(); + .await?; + let stripe_size = tenant.as_ref().map(|t| t.get_shard_stripe_size()); + let attached = tenant.is_some(); if let Some(_flush_ms) = flush { match state @@ -1477,12 +1479,20 @@ async fn put_tenant_location_config_handler( // This API returns a vector of pageservers where the tenant is attached: this is // primarily for use in the sharding service. For compatibilty, we also return this // when called directly on a pageserver, but the payload is always zero or one shards. - let mut response = TenantLocationConfigResponse { shards: Vec::new() }; + let mut response = TenantLocationConfigResponse { + shards: Vec::new(), + stripe_size: None, + }; if attached { response.shards.push(TenantShardLocation { shard_id: tenant_shard_id, node_id: state.conf.id, - }) + }); + if tenant_shard_id.shard_count.count() > 1 { + // Stripe size should be set if we are attached + debug_assert!(stripe_size.is_some()); + response.stripe_size = stripe_size; + } } json_response(StatusCode::OK, response) @@ -1510,6 +1520,29 @@ async fn list_location_config_handler( json_response(StatusCode::OK, result) } +async fn get_location_config_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let state = get_state(&request); + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let slot = state.tenant_manager.get(tenant_shard_id); + + let Some(slot) = slot else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant shard not found").into(), + )); + }; + + let result: Option = match slot { + TenantSlot::Attached(t) => Some(t.get_location_conf()), + TenantSlot::Secondary(s) => Some(s.get_location_conf()), + TenantSlot::InProgress(_) => None, + }; + + json_response(StatusCode::OK, result) +} + // Do a time travel recovery on the given tenant/tenant shard. Tenant needs to be detached // (from all pageservers) as it invalidates consistency assumptions. async fn tenant_time_travel_remote_storage_handler( @@ -2214,6 +2247,9 @@ pub fn make_router( .get("/v1/location_config", |r| { api_handler(r, list_location_config_handler) }) + .get("/v1/location_config/:tenant_id", |r| { + api_handler(r, get_location_config_handler) + }) .put( "/v1/tenant/:tenant_shard_id/time_travel_remote_storage", |r| api_handler(r, tenant_time_travel_remote_storage_handler), diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index ce5561b431..27e754e999 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1,5 +1,4 @@ use enum_map::EnumMap; -use metrics::metric_vec_duration::DurationResultObserver; use metrics::{ register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, @@ -11,6 +10,7 @@ use once_cell::sync::Lazy; use pageserver_api::shard::TenantShardId; use strum::{EnumCount, IntoEnumIterator, VariantNames}; use strum_macros::{EnumVariantNames, IntoStaticStr}; +use tracing::warn; use utils::id::TimelineId; /// Prometheus histogram buckets (in seconds) for operations in the critical @@ -1005,15 +1005,39 @@ impl GlobalAndPerTimelineHistogram { } } -struct GlobalAndPerTimelineHistogramTimer<'a> { +struct GlobalAndPerTimelineHistogramTimer<'a, 'c> { h: &'a GlobalAndPerTimelineHistogram, + ctx: &'c RequestContext, start: std::time::Instant, + op: SmgrQueryType, } -impl<'a> Drop for GlobalAndPerTimelineHistogramTimer<'a> { +impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> { fn drop(&mut self) { let elapsed = self.start.elapsed(); - self.h.observe(elapsed.as_secs_f64()); + let ex_throttled = self + .ctx + .micros_spent_throttled + .close_and_checked_sub_from(elapsed); + let ex_throttled = match ex_throttled { + Ok(res) => res, + Err(error) => { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy>> = + Lazy::new(|| { + Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| { + RateLimit::new(Duration::from_secs(10)) + }))) + }); + let mut guard = LOGGED.lock().unwrap(); + let rate_limit = &mut guard[self.op]; + rate_limit.call(|| { + warn!(op=?self.op, error, "error deducting time spent throttled; this message is logged at a global rate limit"); + }); + elapsed + } + }; + self.h.observe(ex_throttled.as_secs_f64()); } } @@ -1025,6 +1049,7 @@ impl<'a> Drop for GlobalAndPerTimelineHistogramTimer<'a> { strum_macros::EnumCount, strum_macros::EnumIter, strum_macros::FromRepr, + enum_map::Enum, )] #[strum(serialize_all = "snake_case")] pub enum SmgrQueryType { @@ -1130,11 +1155,35 @@ impl SmgrQueryTimePerTimeline { }); Self { metrics } } - pub(crate) fn start_timer(&self, op: SmgrQueryType) -> impl Drop + '_ { + pub(crate) fn start_timer<'c: 'a, 'a>( + &'a self, + op: SmgrQueryType, + ctx: &'c RequestContext, + ) -> impl Drop + '_ { let metric = &self.metrics[op as usize]; + let start = Instant::now(); + match ctx.micros_spent_throttled.open() { + Ok(()) => (), + Err(error) => { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy>> = + Lazy::new(|| { + Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| { + RateLimit::new(Duration::from_secs(10)) + }))) + }); + let mut guard = LOGGED.lock().unwrap(); + let rate_limit = &mut guard[op]; + rate_limit.call(|| { + warn!(?op, error, "error opening micros_spent_throttled; this message is logged at a global rate limit"); + }); + } + } GlobalAndPerTimelineHistogramTimer { h: metric, - start: std::time::Instant::now(), + ctx, + start, + op, } } } @@ -1145,6 +1194,11 @@ mod smgr_query_time_tests { use strum::IntoEnumIterator; use utils::id::{TenantId, TimelineId}; + use crate::{ + context::{DownloadBehavior, RequestContext}, + task_mgr::TaskKind, + }; + // Regression test, we used hard-coded string constants before using an enum. #[test] fn op_label_name() { @@ -1193,7 +1247,8 @@ mod smgr_query_time_tests { let (pre_global, pre_per_tenant_timeline) = get_counts(); assert_eq!(pre_per_tenant_timeline, 0); - let timer = metrics.start_timer(*op); + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download); + let timer = metrics.start_timer(*op, &ctx); drop(timer); let (post_global, post_per_tenant_timeline) = get_counts(); @@ -1227,11 +1282,65 @@ pub(crate) static BASEBACKUP_QUERY_TIME: Lazy = Lazy::new(| }) }); -impl DurationResultObserver for BasebackupQueryTime { - fn observe_result(&self, res: &Result, duration: std::time::Duration) { +pub(crate) struct BasebackupQueryTimeOngoingRecording<'a, 'c> { + parent: &'a BasebackupQueryTime, + ctx: &'c RequestContext, + start: std::time::Instant, +} + +impl BasebackupQueryTime { + pub(crate) fn start_recording<'c: 'a, 'a>( + &'a self, + ctx: &'c RequestContext, + ) -> BasebackupQueryTimeOngoingRecording<'_, '_> { + let start = Instant::now(); + match ctx.micros_spent_throttled.open() { + Ok(()) => (), + Err(error) => { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut rate_limit = LOGGED.lock().unwrap(); + rate_limit.call(|| { + warn!(error, "error opening micros_spent_throttled; this message is logged at a global rate limit"); + }); + } + } + BasebackupQueryTimeOngoingRecording { + parent: self, + ctx, + start, + } + } +} + +impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> { + pub(crate) fn observe(self, res: &Result) { + let elapsed = self.start.elapsed(); + let ex_throttled = self + .ctx + .micros_spent_throttled + .close_and_checked_sub_from(elapsed); + let ex_throttled = match ex_throttled { + Ok(ex_throttled) => ex_throttled, + Err(error) => { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut rate_limit = LOGGED.lock().unwrap(); + rate_limit.call(|| { + warn!(error, "error deducting time spent throttled; this message is logged at a global rate limit"); + }); + elapsed + } + }; let label_value = if res.is_ok() { "ok" } else { "error" }; - let metric = self.0.get_metric_with_label_values(&[label_value]).unwrap(); - metric.observe(duration.as_secs_f64()); + let metric = self + .parent + .0 + .get_metric_with_label_values(&[label_value]) + .unwrap(); + metric.observe(ex_throttled.as_secs_f64()); } } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 689bc5cb3c..f3ceb7d3e6 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -910,7 +910,7 @@ impl PageServerHandler { let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?; let _timer = timeline .query_metrics - .start_timer(metrics::SmgrQueryType::GetRelExists); + .start_timer(metrics::SmgrQueryType::GetRelExists, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = @@ -938,7 +938,7 @@ impl PageServerHandler { let _timer = timeline .query_metrics - .start_timer(metrics::SmgrQueryType::GetRelSize); + .start_timer(metrics::SmgrQueryType::GetRelSize, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = @@ -966,7 +966,7 @@ impl PageServerHandler { let _timer = timeline .query_metrics - .start_timer(metrics::SmgrQueryType::GetDbSize); + .start_timer(metrics::SmgrQueryType::GetDbSize, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = @@ -1144,7 +1144,7 @@ impl PageServerHandler { let _timer = timeline .query_metrics - .start_timer(metrics::SmgrQueryType::GetPageAtLsn); + .start_timer(metrics::SmgrQueryType::GetPageAtLsn, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = @@ -1172,7 +1172,7 @@ impl PageServerHandler { let _timer = timeline .query_metrics - .start_timer(metrics::SmgrQueryType::GetSlruSegment); + .start_timer(metrics::SmgrQueryType::GetSlruSegment, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = @@ -1199,7 +1199,7 @@ impl PageServerHandler { prev_lsn: Option, full_backup: bool, gzip: bool, - ctx: RequestContext, + ctx: &RequestContext, ) -> Result<(), QueryError> where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, @@ -1214,7 +1214,7 @@ impl PageServerHandler { if let Some(lsn) = lsn { // Backup was requested at a particular LSN. Wait for it to arrive. info!("waiting for {}", lsn); - timeline.wait_lsn(lsn, &ctx).await?; + timeline.wait_lsn(lsn, ctx).await?; timeline .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn) .context("invalid basebackup lsn")?; @@ -1236,7 +1236,7 @@ impl PageServerHandler { lsn, prev_lsn, full_backup, - &ctx, + ctx, ) .await?; } else { @@ -1257,7 +1257,7 @@ impl PageServerHandler { lsn, prev_lsn, full_backup, - &ctx, + ctx, ) .await?; // shutdown the encoder to ensure the gzip footer is written @@ -1269,7 +1269,7 @@ impl PageServerHandler { lsn, prev_lsn, full_backup, - &ctx, + ctx, ) .await?; } @@ -1449,25 +1449,25 @@ where false }; - ::metrics::metric_vec_duration::observe_async_block_duration_by_result( - &*metrics::BASEBACKUP_QUERY_TIME, - async move { - self.handle_basebackup_request( - pgb, - tenant_id, - timeline_id, - lsn, - None, - false, - gzip, - ctx, - ) - .await?; - pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; - Result::<(), QueryError>::Ok(()) - }, - ) - .await?; + let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx); + let res = async { + self.handle_basebackup_request( + pgb, + tenant_id, + timeline_id, + lsn, + None, + false, + gzip, + &ctx, + ) + .await?; + pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + Result::<(), QueryError>::Ok(()) + } + .await; + metric_recording.observe(&res); + res?; } // return pair of prev_lsn and last_lsn else if query_string.starts_with("get_last_record_rlsn ") { @@ -1563,7 +1563,7 @@ where prev_lsn, true, false, - ctx, + &ctx, ) .await?; pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 7be08f86b1..727650a5a5 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -15,6 +15,7 @@ use crate::walrecord::NeonWalRecord; use anyhow::{ensure, Context}; use bytes::{Buf, Bytes, BytesMut}; use enum_map::Enum; +use itertools::Itertools; use pageserver_api::key::{ dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key, @@ -1498,7 +1499,7 @@ impl<'a> DatadirModification<'a> { return Ok(()); } - let writer = self.tline.writer().await; + let mut writer = self.tline.writer().await; // Flush relation and SLRU data blocks, keep metadata. let mut retained_pending_updates = HashMap::<_, Vec<_>>::new(); @@ -1537,14 +1538,22 @@ impl<'a> DatadirModification<'a> { /// All the modifications in this atomic update are stamped by the specified LSN. /// pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { - let writer = self.tline.writer().await; + let mut writer = self.tline.writer().await; let pending_nblocks = self.pending_nblocks; self.pending_nblocks = 0; if !self.pending_updates.is_empty() { - writer.put_batch(&self.pending_updates, ctx).await?; - self.pending_updates.clear(); + // The put_batch call below expects expects the inputs to be sorted by Lsn, + // so we do that first. + let lsn_ordered_batch: Vec<(Key, Lsn, Value)> = self + .pending_updates + .drain() + .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (key, lsn, val))) + .kmerge_by(|lhs, rhs| lhs.1 .0 < rhs.1 .0) + .collect(); + + writer.put_batch(lsn_ordered_batch, ctx).await?; } if !self.pending_deletions.is_empty() { @@ -1677,7 +1686,7 @@ struct RelDirectory { rels: HashSet<(Oid, u8)>, } -#[derive(Debug, Serialize, Deserialize, Default)] +#[derive(Debug, Serialize, Deserialize, Default, PartialEq)] pub(crate) struct AuxFilesDirectory { pub(crate) files: HashMap, } diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index adaa55c179..275a72c0b0 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -272,9 +272,6 @@ pub enum TaskKind { // Task that uploads a file to remote storage RemoteUploadTask, - // Task that downloads a file from remote storage - RemoteDownloadTask, - // task that handles the initial downloading of all tenants InitialLoad, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 4158133111..4f4654422b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -22,6 +22,7 @@ use pageserver_api::models; use pageserver_api::models::TimelineState; use pageserver_api::models::WalRedoManagerStatus; use pageserver_api::shard::ShardIdentity; +use pageserver_api::shard::ShardStripeSize; use pageserver_api::shard::TenantShardId; use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; @@ -151,7 +152,6 @@ pub(crate) mod ephemeral_file; pub mod layer_map; pub mod metadata; -mod par_fsync; pub mod remote_timeline_client; pub mod storage_layer; @@ -2087,6 +2087,10 @@ impl Tenant { &self.tenant_shard_id } + pub(crate) fn get_shard_stripe_size(&self) -> ShardStripeSize { + self.shard_identity.stripe_size + } + pub(crate) fn get_generation(&self) -> Generation { self.generation } @@ -3675,7 +3679,10 @@ pub(crate) mod harness { } impl TenantHarness { - pub fn create(test_name: &'static str) -> anyhow::Result { + pub fn create_custom( + test_name: &'static str, + tenant_conf: TenantConf, + ) -> anyhow::Result { setup_logging(); let repo_dir = PageServerConf::test_repo_dir(test_name); @@ -3687,14 +3694,6 @@ pub(crate) mod harness { // OK in a test. let conf: &'static PageServerConf = Box::leak(Box::new(conf)); - // Disable automatic GC and compaction to make the unit tests more deterministic. - // The tests perform them manually if needed. - let tenant_conf = TenantConf { - gc_period: Duration::ZERO, - compaction_period: Duration::ZERO, - ..TenantConf::default() - }; - let tenant_id = TenantId::generate(); let tenant_shard_id = TenantShardId::unsharded(tenant_id); fs::create_dir_all(conf.tenant_path(&tenant_shard_id))?; @@ -3722,6 +3721,18 @@ pub(crate) mod harness { }) } + pub fn create(test_name: &'static str) -> anyhow::Result { + // Disable automatic GC and compaction to make the unit tests more deterministic. + // The tests perform them manually if needed. + let tenant_conf = TenantConf { + gc_period: Duration::ZERO, + compaction_period: Duration::ZERO, + ..TenantConf::default() + }; + + Self::create_custom(test_name, tenant_conf) + } + pub fn span(&self) -> tracing::Span { info_span!("TenantHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()) } @@ -3829,6 +3840,7 @@ mod tests { use crate::keyspace::KeySpaceAccum; use crate::repository::{Key, Value}; use crate::tenant::harness::*; + use crate::tenant::timeline::CompactFlags; use crate::DEFAULT_PG_VERSION; use bytes::BytesMut; use hex_literal::hex; @@ -3845,7 +3857,7 @@ mod tests { .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, @@ -3857,7 +3869,7 @@ mod tests { writer.finish_write(Lsn(0x10)); drop(writer); - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, @@ -3923,7 +3935,7 @@ mod tests { let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; - let writer = tline.writer().await; + let mut writer = tline.writer().await; #[allow(non_snake_case)] let TEST_KEY_A: Key = Key::from_hex("110000000033333333444444445500000001").unwrap(); @@ -3957,7 +3969,7 @@ mod tests { let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) .expect("Should have a local timeline"); - let new_writer = newtline.writer().await; + let mut new_writer = newtline.writer().await; new_writer .put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"), &ctx) .await?; @@ -3989,7 +4001,7 @@ mod tests { ) -> anyhow::Result<()> { let mut lsn = start_lsn; { - let writer = tline.writer().await; + let mut writer = tline.writer().await; // Create a relation on the timeline writer .put( @@ -4014,7 +4026,7 @@ mod tests { } tline.freeze_and_flush().await?; { - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, @@ -4377,7 +4389,7 @@ mod tests { .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, @@ -4394,7 +4406,7 @@ mod tests { .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) .await?; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, @@ -4411,7 +4423,7 @@ mod tests { .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) .await?; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, @@ -4428,7 +4440,7 @@ mod tests { .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) .await?; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, @@ -4485,7 +4497,7 @@ mod tests { for _ in 0..repeat { for _ in 0..key_count { test_key.field6 = blknum; - let writer = timeline.writer().await; + let mut writer = timeline.writer().await; writer .put( test_key, @@ -4633,6 +4645,145 @@ mod tests { Ok(()) } + // Test that vectored get handles layer gaps correctly + // by advancing into the next ancestor timeline if required. + // + // The test generates timelines that look like the diagram below. + // We leave a gap in one of the L1 layers at `gap_at_key` (`/` in the diagram). + // The reconstruct data for that key lies in the ancestor timeline (`X` in the diagram). + // + // ``` + //-------------------------------+ + // ... | + // [ L1 ] | + // [ / L1 ] | Child Timeline + // ... | + // ------------------------------+ + // [ X L1 ] | Parent Timeline + // ------------------------------+ + // ``` + #[tokio::test] + async fn test_get_vectored_key_gap() -> anyhow::Result<()> { + let tenant_conf = TenantConf { + // Make compaction deterministic + gc_period: Duration::ZERO, + compaction_period: Duration::ZERO, + // Encourage creation of L1 layers + checkpoint_distance: 16 * 1024, + compaction_target_size: 8 * 1024, + ..TenantConf::default() + }; + + let harness = TenantHarness::create_custom("test_get_vectored_key_gap", tenant_conf)?; + let (tenant, ctx) = harness.load().await; + + let mut current_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); + let gap_at_key = current_key.add(100); + let mut current_lsn = Lsn(0x10); + + const KEY_COUNT: usize = 10_000; + + let timeline_id = TimelineId::generate(); + let current_timeline = tenant + .create_test_timeline(timeline_id, current_lsn, DEFAULT_PG_VERSION, &ctx) + .await?; + + current_lsn += 0x100; + + let mut writer = current_timeline.writer().await; + writer + .put( + gap_at_key, + current_lsn, + &Value::Image(test_img(&format!("{} at {}", gap_at_key, current_lsn))), + &ctx, + ) + .await?; + writer.finish_write(current_lsn); + drop(writer); + + let mut latest_lsns = HashMap::new(); + latest_lsns.insert(gap_at_key, current_lsn); + + current_timeline.freeze_and_flush().await?; + + let child_timeline_id = TimelineId::generate(); + + tenant + .branch_timeline_test( + ¤t_timeline, + child_timeline_id, + Some(current_lsn), + &ctx, + ) + .await?; + let child_timeline = tenant + .get_timeline(child_timeline_id, true) + .expect("Should have the branched timeline"); + + for i in 0..KEY_COUNT { + if current_key == gap_at_key { + current_key = current_key.next(); + continue; + } + + current_lsn += 0x10; + + let mut writer = child_timeline.writer().await; + writer + .put( + current_key, + current_lsn, + &Value::Image(test_img(&format!("{} at {}", current_key, current_lsn))), + &ctx, + ) + .await?; + writer.finish_write(current_lsn); + drop(writer); + + latest_lsns.insert(current_key, current_lsn); + current_key = current_key.next(); + + // Flush every now and then to encourage layer file creation. + if i % 500 == 0 { + child_timeline.freeze_and_flush().await?; + } + } + + child_timeline.freeze_and_flush().await?; + let mut flags = EnumSet::new(); + flags.insert(CompactFlags::ForceRepartition); + child_timeline + .compact(&CancellationToken::new(), flags, &ctx) + .await?; + + let key_near_end = { + let mut tmp = current_key; + tmp.field6 -= 10; + tmp + }; + + let key_near_gap = { + let mut tmp = gap_at_key; + tmp.field6 -= 10; + tmp + }; + + let read = KeySpace { + ranges: vec![key_near_gap..gap_at_key.next(), key_near_end..current_key], + }; + let results = child_timeline + .get_vectored_impl(read.clone(), current_lsn, &ctx) + .await?; + + for (key, img_res) in results { + let expected = test_img(&format!("{} at {}", key, latest_lsns[&key])); + assert_eq!(img_res?, expected); + } + + Ok(()) + } + #[tokio::test] async fn test_random_updates() -> anyhow::Result<()> { let harness = TenantHarness::create("test_random_updates")?; @@ -4656,7 +4807,7 @@ mod tests { for blknum in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); test_key.field6 = blknum as u32; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( test_key, @@ -4677,7 +4828,7 @@ mod tests { lsn = Lsn(lsn.0 + 0x10); let blknum = thread_rng().gen_range(0..NUM_KEYS); test_key.field6 = blknum as u32; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( test_key, @@ -4745,7 +4896,7 @@ mod tests { for blknum in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); test_key.field6 = blknum as u32; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( test_key, @@ -4774,7 +4925,7 @@ mod tests { lsn = Lsn(lsn.0 + 0x10); let blknum = thread_rng().gen_range(0..NUM_KEYS); test_key.field6 = blknum as u32; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( test_key, @@ -4851,7 +5002,7 @@ mod tests { lsn = Lsn(lsn.0 + 0x10); let blknum = thread_rng().gen_range(0..NUM_KEYS); test_key.field6 = blknum as u32; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( test_key, diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index ec70bdc679..0d33100ead 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -12,7 +12,7 @@ //! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX //! use bytes::{BufMut, BytesMut}; -use tokio_epoll_uring::{BoundedBuf, Slice}; +use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; use crate::context::RequestContext; use crate::page_cache::PAGE_SZ; @@ -127,7 +127,7 @@ impl BlobWriter { /// You need to make sure that the internal buffer is empty, otherwise /// data will be written in wrong order. #[inline(always)] - async fn write_all_unbuffered( + async fn write_all_unbuffered, Buf: IoBuf + Send>( &mut self, src_buf: B, ) -> (B::Buf, Result<(), Error>) { @@ -162,7 +162,10 @@ impl BlobWriter { } /// Internal, possibly buffered, write function - async fn write_all(&mut self, src_buf: B) -> (B::Buf, Result<(), Error>) { + async fn write_all, Buf: IoBuf + Send>( + &mut self, + src_buf: B, + ) -> (B::Buf, Result<(), Error>) { if !BUFFERED { assert!(self.buf.is_empty()); return self.write_all_unbuffered(src_buf).await; @@ -210,7 +213,10 @@ impl BlobWriter { /// Write a blob of data. Returns the offset that it was written to, /// which can be used to retrieve the data later. - pub async fn write_blob(&mut self, srcbuf: B) -> (B::Buf, Result) { + pub async fn write_blob, Buf: IoBuf + Send>( + &mut self, + srcbuf: B, + ) -> (B::Buf, Result) { let offset = self.offset; let len = srcbuf.bytes_init(); diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index ca30b0ac4f..6d85d1e60e 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -18,10 +18,19 @@ //! - An Iterator interface would be more convenient for the callers than the //! 'visit' function //! +use async_stream::try_stream; use byteorder::{ReadBytesExt, BE}; use bytes::{BufMut, Bytes, BytesMut}; use either::Either; -use std::{cmp::Ordering, io, result}; +use futures::Stream; +use hex; +use std::{ + cmp::Ordering, + io, + iter::Rev, + ops::{Range, RangeInclusive}, + result, +}; use thiserror::Error; use tracing::error; @@ -250,6 +259,90 @@ where Ok(result) } + /// Return a stream which yields all key, value pairs from the index + /// starting from the first key greater or equal to `start_key`. + /// + /// Note that this is a copy of [`Self::visit`]. + /// TODO: Once the sequential read path is removed this will become + /// the only index traversal method. + pub fn get_stream_from<'a>( + &'a self, + start_key: &'a [u8; L], + ctx: &'a RequestContext, + ) -> impl Stream, u64), DiskBtreeError>> + 'a { + try_stream! { + let mut stack = Vec::new(); + stack.push((self.root_blk, None)); + let block_cursor = self.reader.block_cursor(); + while let Some((node_blknum, opt_iter)) = stack.pop() { + // Locate the node. + let node_buf = block_cursor + .read_blk(self.start_blk + node_blknum, ctx) + .await?; + + let node = OnDiskNode::deparse(node_buf.as_ref())?; + let prefix_len = node.prefix_len as usize; + let suffix_len = node.suffix_len as usize; + + assert!(node.num_children > 0); + + let mut keybuf = Vec::new(); + keybuf.extend(node.prefix); + keybuf.resize(prefix_len + suffix_len, 0); + + let mut iter: Either, Rev>> = if let Some(iter) = opt_iter { + iter + } else { + // Locate the first match + let idx = match node.binary_search(start_key, keybuf.as_mut_slice()) { + Ok(idx) => idx, + Err(idx) => { + if node.level == 0 { + // Imagine that the node contains the following keys: + // + // 1 + // 3 <-- idx + // 5 + // + // If the search key is '2' and there is exact match, + // the binary search would return the index of key + // '3'. That's cool, '3' is the first key to return. + idx + } else { + // This is an internal page, so each key represents a lower + // bound for what's in the child page. If there is no exact + // match, we have to return the *previous* entry. + // + // 1 <-- return this + // 3 <-- idx + // 5 + idx.saturating_sub(1) + } + } + }; + Either::Left(idx..node.num_children.into()) + }; + + // idx points to the first match now. Keep going from there + while let Some(idx) = iter.next() { + let key_off = idx * suffix_len; + let suffix = &node.keys[key_off..key_off + suffix_len]; + keybuf[prefix_len..].copy_from_slice(suffix); + let value = node.value(idx); + #[allow(clippy::collapsible_if)] + if node.level == 0 { + // leaf + yield (keybuf.clone(), value.to_u64()); + } else { + stack.push((node_blknum, Some(iter))); + stack.push((value.to_blknum(), None)); + break; + } + } + } + } + } + /// /// Scan the tree, starting from 'search_key', in the given direction. 'visitor' /// will be called for every key >= 'search_key' (or <= 'search_key', if scanning diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 5f4814cc6b..b8ed69052f 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -460,15 +460,22 @@ impl LayerMap { } } - pub fn range_search(&self, key_range: Range, end_lsn: Lsn) -> Option { - let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?; + pub fn range_search(&self, key_range: Range, end_lsn: Lsn) -> RangeSearchResult { + let version = match self.historic.get().unwrap().get_version(end_lsn.0 - 1) { + Some(version) => version, + None => { + let mut result = RangeSearchResult::new(); + result.not_found.add_range(key_range); + return result; + } + }; let raw_range = key_range.start.to_i128()..key_range.end.to_i128(); let delta_changes = version.delta_coverage.range_overlaps(&raw_range); let image_changes = version.image_coverage.range_overlaps(&raw_range); let collector = RangeSearchCollector::new(key_range, end_lsn, delta_changes, image_changes); - Some(collector.collect()) + collector.collect() } /// Start a batch of updates, applied on drop @@ -995,8 +1002,13 @@ mod tests { let layer_map = LayerMap::default(); let range = Key::from_i128(100)..Key::from_i128(200); - let res = layer_map.range_search(range, Lsn(100)); - assert!(res.is_none()); + let res = layer_map.range_search(range.clone(), Lsn(100)); + assert_eq!( + res.not_found.to_keyspace(), + KeySpace { + ranges: vec![range] + } + ); } #[test] @@ -1033,7 +1045,7 @@ mod tests { for start in 0..60 { for end in (start + 1)..60 { let range = Key::from_i128(start)..Key::from_i128(end); - let result = layer_map.range_search(range.clone(), Lsn(100)).unwrap(); + let result = layer_map.range_search(range.clone(), Lsn(100)); let expected = brute_force_range_search(&layer_map, range, Lsn(100)); assert_range_search_result_eq(result, expected); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 06b61d4631..fc08b3c82e 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1358,6 +1358,16 @@ impl TenantManager { } } + pub(crate) fn get(&self, tenant_shard_id: TenantShardId) -> Option { + let locked = self.tenants.read().unwrap(); + match &*locked { + TenantsMap::Initializing => None, + TenantsMap::Open(map) | TenantsMap::ShuttingDown(map) => { + map.get(&tenant_shard_id).cloned() + } + } + } + pub(crate) async fn delete_tenant( &self, tenant_shard_id: TenantShardId, diff --git a/pageserver/src/tenant/par_fsync.rs b/pageserver/src/tenant/par_fsync.rs deleted file mode 100644 index 3acb0fb431..0000000000 --- a/pageserver/src/tenant/par_fsync.rs +++ /dev/null @@ -1,84 +0,0 @@ -use std::{ - io, - sync::atomic::{AtomicUsize, Ordering}, -}; - -use camino::{Utf8Path, Utf8PathBuf}; - -fn fsync_path(path: &Utf8Path) -> io::Result<()> { - // TODO use VirtualFile::fsync_all once we fully go async. - let file = std::fs::File::open(path)?; - file.sync_all() -} - -fn parallel_worker(paths: &[Utf8PathBuf], next_path_idx: &AtomicUsize) -> io::Result<()> { - while let Some(path) = paths.get(next_path_idx.fetch_add(1, Ordering::Relaxed)) { - fsync_path(path)?; - } - - Ok(()) -} - -fn fsync_in_thread_pool(paths: &[Utf8PathBuf]) -> io::Result<()> { - // TODO: remove this function in favor of `par_fsync_async` once we asyncify everything. - - /// Use at most this number of threads. - /// Increasing this limit will - /// - use more memory - /// - increase the cost of spawn/join latency - const MAX_NUM_THREADS: usize = 64; - let num_threads = paths.len().min(MAX_NUM_THREADS); - let next_path_idx = AtomicUsize::new(0); - - std::thread::scope(|s| -> io::Result<()> { - let mut handles = vec![]; - // Spawn `num_threads - 1`, as the current thread is also a worker. - for _ in 1..num_threads { - handles.push(s.spawn(|| parallel_worker(paths, &next_path_idx))); - } - - parallel_worker(paths, &next_path_idx)?; - - for handle in handles { - handle.join().unwrap()?; - } - - Ok(()) - }) -} - -/// Parallel fsync all files. Can be used in non-async context as it is using rayon thread pool. -pub fn par_fsync(paths: &[Utf8PathBuf]) -> io::Result<()> { - if paths.len() == 1 { - fsync_path(&paths[0])?; - return Ok(()); - } - - fsync_in_thread_pool(paths) -} - -/// Parallel fsync asynchronously. -pub async fn par_fsync_async(paths: &[Utf8PathBuf]) -> io::Result<()> { - const MAX_CONCURRENT_FSYNC: usize = 64; - let mut next = paths.iter().peekable(); - let mut js = tokio::task::JoinSet::new(); - loop { - while js.len() < MAX_CONCURRENT_FSYNC && next.peek().is_some() { - let next = next.next().expect("just peeked"); - let next = next.to_owned(); - js.spawn_blocking(move || fsync_path(&next)); - } - - // now the joinset has been filled up, wait for next to complete - if let Some(res) = js.join_next().await { - res??; - } else { - // last item had already completed - assert!( - next.peek().is_none(), - "joinset emptied, we shouldn't have more work" - ); - return Ok(()); - } - } -} diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 167e18a829..6fff6e78e2 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -14,14 +14,14 @@ use tokio::io::{AsyncSeekExt, AsyncWriteExt}; use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; use tracing::warn; -use utils::{backoff, crashsafe}; +use utils::backoff; use crate::config::PageServerConf; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; use crate::tenant::storage_layer::LayerFileName; use crate::tenant::Generation; -use crate::virtual_file::on_fatal_io_error; +use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; use crate::TEMP_FILE_SUFFIX; use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode}; use utils::crashsafe::path_with_suffix_extension; @@ -50,9 +50,8 @@ pub async fn download_layer_file<'a>( ) -> Result { debug_assert_current_span_has_tenant_and_timeline_id(); - let local_path = conf - .timeline_path(&tenant_shard_id, &timeline_id) - .join(layer_file_name.file_name()); + let timeline_path = conf.timeline_path(&tenant_shard_id, &timeline_id); + let local_path = timeline_path.join(layer_file_name.file_name()); let remote_path = remote_layer_path( &tenant_shard_id.tenant_id, @@ -149,10 +148,21 @@ pub async fn download_layer_file<'a>( .with_context(|| format!("rename download layer file to {local_path}")) .map_err(DownloadError::Other)?; - crashsafe::fsync_async(&local_path) - .await - .with_context(|| format!("fsync layer file {local_path}")) - .map_err(DownloadError::Other)?; + // We use fatal_err() below because the after the rename above, + // the in-memory state of the filesystem already has the layer file in its final place, + // and subsequent pageserver code could think it's durable while it really isn't. + let work = async move { + let timeline_dir = VirtualFile::open(&timeline_path) + .await + .fatal_err("VirtualFile::open for timeline dir fsync"); + timeline_dir + .sync_all() + .await + .fatal_err("VirtualFile::sync_all timeline dir"); + }; + crate::virtual_file::io_engine::get() + .spawn_blocking_and_block_on_if_std(work) + .await; tracing::debug!("download complete: {local_path}"); diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 5eaf1cc1ce..b7132ee3bf 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -46,6 +46,7 @@ use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{anyhow, bail, ensure, Context, Result}; use bytes::BytesMut; use camino::{Utf8Path, Utf8PathBuf}; +use futures::StreamExt; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::LayerAccessKind; use pageserver_api::shard::TenantShardId; @@ -847,10 +848,33 @@ impl DeltaLayerInner { reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result<(), GetVectoredError> { - let reads = self - .plan_reads(keyspace, lsn_range, reconstruct_state, ctx) - .await - .map_err(GetVectoredError::Other)?; + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + self.index_start_blk, + self.index_root_blk, + block_reader, + ); + + let planner = VectoredReadPlanner::new( + self.max_vectored_read_bytes + .expect("Layer is loaded with max vectored bytes config") + .0 + .into(), + ); + + let data_end_offset = self.index_start_blk as u64 * PAGE_SZ as u64; + + let reads = Self::plan_reads( + keyspace, + lsn_range, + data_end_offset, + index_reader, + planner, + reconstruct_state, + ctx, + ) + .await + .map_err(GetVectoredError::Other)?; self.do_reads_and_update_state(reads, reconstruct_state) .await; @@ -858,73 +882,64 @@ impl DeltaLayerInner { Ok(()) } - async fn plan_reads( - &self, + async fn plan_reads( keyspace: KeySpace, lsn_range: Range, + data_end_offset: u64, + index_reader: DiskBtreeReader, + mut planner: VectoredReadPlanner, reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, - ) -> anyhow::Result> { - let mut planner = VectoredReadPlanner::new( - self.max_vectored_read_bytes - .expect("Layer is loaded with max vectored bytes config") - .0 - .into(), - ); - - let block_reader = FileBlockReader::new(&self.file, self.file_id); - let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( - self.index_start_blk, - self.index_root_blk, - block_reader, - ); + ) -> anyhow::Result> + where + Reader: BlockReader, + { + let ctx = RequestContextBuilder::extend(ctx) + .page_content_kind(PageContentKind::DeltaLayerBtreeNode) + .build(); for range in keyspace.ranges.iter() { let mut range_end_handled = false; let start_key = DeltaKey::from_key_lsn(&range.start, lsn_range.start); - tree_reader - .visit( - &start_key.0, - VisitDirection::Forwards, - |raw_key, value| { - let key = Key::from_slice(&raw_key[..KEY_SIZE]); - let lsn = DeltaKey::extract_lsn_from_buf(raw_key); - let blob_ref = BlobRef(value); + let index_stream = index_reader.get_stream_from(&start_key.0, &ctx); + let mut index_stream = std::pin::pin!(index_stream); - assert!(key >= range.start && lsn >= lsn_range.start); + while let Some(index_entry) = index_stream.next().await { + let (raw_key, value) = index_entry?; + let key = Key::from_slice(&raw_key[..KEY_SIZE]); + let lsn = DeltaKey::extract_lsn_from_buf(&raw_key); + let blob_ref = BlobRef(value); - let cached_lsn = reconstruct_state.get_cached_lsn(&key); - let flag = { - if cached_lsn >= Some(lsn) { - BlobFlag::Ignore - } else if blob_ref.will_init() { - BlobFlag::Replaces - } else { - BlobFlag::None - } - }; + // Lsns are not monotonically increasing across keys, so we don't assert on them. + assert!(key >= range.start); - if key >= range.end || (key.next() == range.end && lsn >= lsn_range.end) { - planner.handle_range_end(blob_ref.pos()); - range_end_handled = true; - false - } else { - planner.handle(key, lsn, blob_ref.pos(), flag); - true - } - }, - &RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::DeltaLayerBtreeNode) - .build(), - ) - .await - .map_err(|err| anyhow!(err))?; + let outside_lsn_range = !lsn_range.contains(&lsn); + let below_cached_lsn = reconstruct_state.get_cached_lsn(&key) >= Some(lsn); + + let flag = { + if outside_lsn_range || below_cached_lsn { + BlobFlag::Ignore + } else if blob_ref.will_init() { + BlobFlag::ReplaceAll + } else { + // Usual path: add blob to the read + BlobFlag::None + } + }; + + if key >= range.end || (key.next() == range.end && lsn >= lsn_range.end) { + planner.handle_range_end(blob_ref.pos()); + range_end_handled = true; + break; + } else { + planner.handle(key, lsn, blob_ref.pos(), flag); + } + } if !range_end_handled { - let payload_end = self.index_start_blk as u64 * PAGE_SZ as u64; - tracing::info!("Handling range end fallback at {}", payload_end); - planner.handle_range_end(payload_end); + tracing::info!("Handling range end fallback at {}", data_end_offset); + planner.handle_range_end(data_end_offset); } } @@ -1190,3 +1205,131 @@ impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for Del self.size } } + +#[cfg(test)] +mod test { + use std::collections::BTreeMap; + + use super::*; + use crate::{ + context::DownloadBehavior, task_mgr::TaskKind, tenant::disk_btree::tests::TestDisk, + }; + + /// Construct an index for a fictional delta layer and and then + /// traverse in order to plan vectored reads for a query. Finally, + /// verify that the traversal fed the right index key and value + /// pairs into the planner. + #[tokio::test] + async fn test_delta_layer_index_traversal() { + let base_key = Key { + field1: 0, + field2: 1663, + field3: 12972, + field4: 16396, + field5: 0, + field6: 246080, + }; + + // Populate the index with some entries + let entries: BTreeMap> = BTreeMap::from([ + (base_key, vec![Lsn(1), Lsn(5), Lsn(25), Lsn(26), Lsn(28)]), + (base_key.add(1), vec![Lsn(2), Lsn(5), Lsn(10), Lsn(50)]), + (base_key.add(2), vec![Lsn(2), Lsn(5), Lsn(10), Lsn(50)]), + (base_key.add(5), vec![Lsn(10), Lsn(15), Lsn(16), Lsn(20)]), + ]); + + let mut disk = TestDisk::default(); + let mut writer = DiskBtreeBuilder::<_, DELTA_KEY_SIZE>::new(&mut disk); + + let mut disk_offset = 0; + for (key, lsns) in &entries { + for lsn in lsns { + let index_key = DeltaKey::from_key_lsn(key, *lsn); + let blob_ref = BlobRef::new(disk_offset, false); + writer + .append(&index_key.0, blob_ref.0) + .expect("In memory disk append should never fail"); + + disk_offset += 1; + } + } + + // Prepare all the arguments for the call into `plan_reads` below + let (root_offset, _writer) = writer + .finish() + .expect("In memory disk finish should never fail"); + let reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(0, root_offset, disk); + let planner = VectoredReadPlanner::new(100); + let mut reconstruct_state = ValuesReconstructState::new(); + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + + let keyspace = KeySpace { + ranges: vec![ + base_key..base_key.add(3), + base_key.add(3)..base_key.add(100), + ], + }; + let lsn_range = Lsn(2)..Lsn(40); + + // Plan and validate + let vectored_reads = DeltaLayerInner::plan_reads( + keyspace.clone(), + lsn_range.clone(), + disk_offset, + reader, + planner, + &mut reconstruct_state, + &ctx, + ) + .await + .expect("Read planning should not fail"); + + validate(keyspace, lsn_range, vectored_reads, entries); + } + + fn validate( + keyspace: KeySpace, + lsn_range: Range, + vectored_reads: Vec, + index_entries: BTreeMap>, + ) { + #[derive(Debug, PartialEq, Eq)] + struct BlobSpec { + key: Key, + lsn: Lsn, + at: u64, + } + + let mut planned_blobs = Vec::new(); + for read in vectored_reads { + for (at, meta) in read.blobs_at.as_slice() { + planned_blobs.push(BlobSpec { + key: meta.key, + lsn: meta.lsn, + at: *at, + }); + } + } + + let mut expected_blobs = Vec::new(); + let mut disk_offset = 0; + for (key, lsns) in index_entries { + for lsn in lsns { + let key_included = keyspace.ranges.iter().any(|range| range.contains(&key)); + let lsn_included = lsn_range.contains(&lsn); + + if key_included && lsn_included { + expected_blobs.push(BlobSpec { + key, + lsn, + at: disk_offset, + }); + } + + disk_offset += 1; + } + } + + assert_eq!(planned_blobs, expected_blobs); + } +} diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 56cfaeda15..14c79e413c 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -43,6 +43,7 @@ use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; use anyhow::{anyhow, bail, ensure, Context, Result}; use bytes::{Bytes, BytesMut}; use camino::{Utf8Path, Utf8PathBuf}; +use hex; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::LayerAccessKind; use pageserver_api::shard::TenantShardId; @@ -54,6 +55,7 @@ use std::ops::Range; use std::os::unix::prelude::FileExt; use std::sync::Arc; use tokio::sync::OnceCell; +use tokio_stream::StreamExt; use tracing::*; use utils::{ @@ -488,35 +490,33 @@ impl ImageLayerInner { let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader); + let ctx = RequestContextBuilder::extend(ctx) + .page_content_kind(PageContentKind::ImageLayerBtreeNode) + .build(); + for range in keyspace.ranges.iter() { let mut range_end_handled = false; let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; range.start.write_to_byte_slice(&mut search_key); - tree_reader - .visit( - &search_key, - VisitDirection::Forwards, - |raw_key, offset| { - let key = Key::from_slice(&raw_key[..KEY_SIZE]); - assert!(key >= range.start); + let index_stream = tree_reader.get_stream_from(&search_key, &ctx); + let mut index_stream = std::pin::pin!(index_stream); - if key >= range.end { - planner.handle_range_end(offset); - range_end_handled = true; - false - } else { - planner.handle(key, self.lsn, offset, BlobFlag::None); - true - } - }, - &RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::ImageLayerBtreeNode) - .build(), - ) - .await - .map_err(|err| GetVectoredError::Other(anyhow!(err)))?; + while let Some(index_entry) = index_stream.next().await { + let (raw_key, offset) = index_entry?; + + let key = Key::from_slice(&raw_key[..KEY_SIZE]); + assert!(key >= range.start); + + if key >= range.end { + planner.handle_range_end(offset); + range_end_handled = true; + break; + } else { + planner.handle(key, self.lsn, offset, BlobFlag::None); + } + } if !range_end_handled { let payload_end = self.index_start_blk as u64 * PAGE_SZ as u64; diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index e7da28b8d6..5f1db21d49 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -336,32 +336,17 @@ impl InMemoryLayer { /// Common subroutine of the public put_wal_record() and put_page_image() functions. /// Adds the page version to the in-memory tree + pub(crate) async fn put_value( &self, key: Key, lsn: Lsn, - val: &Value, + buf: &[u8], ctx: &RequestContext, ) -> Result<()> { let mut inner = self.inner.write().await; self.assert_writable(); - self.put_value_locked(&mut inner, key, lsn, val, ctx).await - } - - pub(crate) async fn put_values( - &self, - values: &HashMap>, - ctx: &RequestContext, - ) -> Result<()> { - let mut inner = self.inner.write().await; - self.assert_writable(); - for (key, vals) in values { - for (lsn, val) in vals { - self.put_value_locked(&mut inner, *key, *lsn, val, ctx) - .await?; - } - } - Ok(()) + self.put_value_locked(&mut inner, key, lsn, buf, ctx).await } async fn put_value_locked( @@ -369,22 +354,16 @@ impl InMemoryLayer { locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>, key: Key, lsn: Lsn, - val: &Value, + buf: &[u8], ctx: &RequestContext, ) -> Result<()> { trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn); let off = { - // Avoid doing allocations for "small" values. - // In the regression test suite, the limit of 256 avoided allocations in 95% of cases: - // https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061 - let mut buf = smallvec::SmallVec::<[u8; 256]>::new(); - buf.clear(); - val.ser_into(&mut buf)?; locked_inner .file .write_blob( - &buf, + buf, &RequestContextBuilder::extend(ctx) .page_content_kind(PageContentKind::InMemoryLayer) .build(), @@ -412,7 +391,12 @@ impl InMemoryLayer { pub async fn freeze(&self, end_lsn: Lsn) { let inner = self.inner.write().await; - assert!(self.start_lsn < end_lsn); + assert!( + self.start_lsn < end_lsn, + "{} >= {}", + self.start_lsn, + end_lsn + ); self.end_lsn.set(end_lsn).expect("end_lsn set only once"); for vec_map in inner.index.values() { diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 247dd1a8e4..aabb13b15c 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -195,6 +195,7 @@ impl Layer { let downloaded = resident.expect("just initialized"); // if the rename works, the path is as expected + // TODO: sync system call std::fs::rename(temp_path, owner.local_path()) .with_context(|| format!("rename temporary file as correct path for {owner}"))?; @@ -879,23 +880,18 @@ impl LayerInner { ) -> Result { debug_assert_current_span_has_tenant_and_timeline_id(); - let task_name = format!("download layer {}", self); - let (tx, rx) = tokio::sync::oneshot::channel(); - // this is sadly needed because of task_mgr::shutdown_tasks, otherwise we cannot - // block tenant::mgr::remove_tenant_from_memory. - let this: Arc = self.clone(); - crate::task_mgr::spawn( - &tokio::runtime::Handle::current(), - crate::task_mgr::TaskKind::RemoteDownloadTask, - Some(self.desc.tenant_shard_id), - Some(self.desc.timeline_id), - &task_name, - false, - async move { + let guard = timeline + .gate + .enter() + .map_err(|_| DownloadError::DownloadCancelled)?; + + tokio::task::spawn(async move { + + let _guard = guard; let client = timeline .remote_client @@ -905,7 +901,7 @@ impl LayerInner { let result = client.download_layer_file( &this.desc.filename(), &this.metadata(), - &crate::task_mgr::shutdown_token() + &timeline.cancel ) .await; @@ -928,7 +924,6 @@ impl LayerInner { tokio::select! { _ = tokio::time::sleep(backoff) => {}, - _ = crate::task_mgr::shutdown_token().cancelled_owned() => {}, _ = timeline.cancel.cancelled() => {}, }; @@ -958,11 +953,10 @@ impl LayerInner { } } } - - Ok(()) } .in_current_span(), ); + match rx.await { Ok((Ok(()), permit)) => { if let Some(reason) = self @@ -975,7 +969,7 @@ impl LayerInner { } self.consecutive_failures.store(0, Ordering::Relaxed); - tracing::info!("on-demand download successful"); + tracing::info!(size=%self.desc.file_size, "on-demand download successful"); Ok(permit) } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 45ce6c9381..e4f5f75132 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -101,6 +101,7 @@ pub fn start_background_loops( _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {} }; compaction_loop(tenant, cancel) + // If you rename this span, change the RUST_LOG env variable in test_runner/performance/test_branch_creation.py .instrument(info_span!("compaction_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug())) .await; Ok(()) @@ -198,7 +199,11 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { } }; - warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction); + let elapsed = started_at.elapsed(); + warn_when_period_overrun(elapsed, period, BackgroundLoopKind::Compaction); + + // the duration is recorded by performance tests by enabling debug in this function + tracing::debug!(elapsed_ms=elapsed.as_millis(), "compaction iteration complete"); // Perhaps we did no work and the walredo process has been idle for some time: // give it a chance to shut down to avoid leaving walredo process running indefinitely. @@ -217,7 +222,7 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { } let allowed_rps = tenant.timeline_get_throttle.steady_rps(); let delta = now - prev; - warn!( + info!( n_seconds=%format_args!("{:.3}", delta.as_secs_f64()), count_accounted, diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs index 6894a88b93..280773e9c3 100644 --- a/pageserver/src/tenant/throttle.rs +++ b/pageserver/src/tenant/throttle.rs @@ -2,14 +2,14 @@ use std::{ str::FromStr, sync::{ atomic::{AtomicU64, Ordering}, - Arc, + Arc, Mutex, }, time::{Duration, Instant}, }; use arc_swap::ArcSwap; use enumset::EnumSet; -use tracing::error; +use tracing::{error, warn}; use crate::{context::RequestContext, task_mgr::TaskKind}; @@ -157,6 +157,19 @@ where .fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed); let observation = Observation { wait_time }; self.metric.observe_throttling(&observation); + match ctx.micros_spent_throttled.add(wait_time) { + Ok(res) => res, + Err(error) => { + use once_cell::sync::Lazy; + use utils::rate_limit::RateLimit; + static WARN_RATE_LIMIT: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut guard = WARN_RATE_LIMIT.lock().unwrap(); + guard.call(move || { + warn!(error, "error adding time spent throttled; this message is logged at a global rate limit"); + }); + } + } } } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 206f20306e..7004db1cb5 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -10,13 +10,13 @@ mod walreceiver; use anyhow::{anyhow, bail, ensure, Context, Result}; use bytes::Bytes; -use camino::{Utf8Path, Utf8PathBuf}; +use camino::Utf8Path; use enumset::EnumSet; use fail::fail_point; use futures::stream::StreamExt; -use itertools::Itertools; use once_cell::sync::Lazy; use pageserver_api::{ + key::AUX_FILES_KEY, keyspace::KeySpaceAccum, models::{ CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, @@ -27,20 +27,6 @@ use pageserver_api::{ }; use rand::Rng; use serde_with::serde_as; -use std::ops::{Deref, Range}; -use std::pin::pin; -use std::sync::atomic::Ordering as AtomicOrdering; -use std::sync::{Arc, Mutex, RwLock, Weak}; -use std::time::{Duration, Instant, SystemTime}; -use std::{ - array, - collections::{BTreeMap, BinaryHeap, HashMap, HashSet}, - sync::atomic::AtomicU64, -}; -use std::{ - cmp::{max, min, Ordering}, - ops::ControlFlow, -}; use storage_broker::BrokerClientChannel; use tokio::{ runtime::Handle, @@ -48,17 +34,33 @@ use tokio::{ }; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::sync::gate::{Gate, GateGuard}; +use utils::{ + bin_ser::BeSer, + sync::gate::{Gate, GateGuard}, +}; + +use std::ops::{Deref, Range}; +use std::pin::pin; +use std::sync::atomic::Ordering as AtomicOrdering; +use std::sync::{Arc, Mutex, RwLock, Weak}; +use std::time::{Duration, Instant, SystemTime}; +use std::{ + array, + collections::{BTreeMap, HashMap, HashSet}, + sync::atomic::AtomicU64, +}; +use std::{ + cmp::{max, min, Ordering}, + ops::ControlFlow, +}; -use crate::pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}; use crate::tenant::timeline::logical_size::CurrentLogicalSize; use crate::tenant::{ layer_map::{LayerMap, SearchResult}, metadata::TimelineMetadata, - par_fsync, }; use crate::{ - context::{AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder}, + context::{DownloadBehavior, RequestContext}, disk_usage_eviction_task::DiskUsageEvictionInfo, pgdatadir_mapping::CollectKeySpaceError, }; @@ -75,6 +77,10 @@ use crate::{ disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry, }; use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; +use crate::{ + pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}, + virtual_file::{MaybeFatalIo, VirtualFile}, +}; use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; @@ -269,7 +275,7 @@ pub struct Timeline { /// Locked automatically by [`TimelineWriter`] and checkpointer. /// Must always be acquired before the layer map/individual layer lock /// to avoid deadlock. - write_lock: tokio::sync::Mutex<()>, + write_lock: tokio::sync::Mutex>, /// Used to avoid multiple `flush_loop` tasks running pub(super) flush_loop_state: Mutex, @@ -861,8 +867,6 @@ impl Timeline { fn errors_match(lhs: &GetVectoredError, rhs: &GetVectoredError) -> bool { use GetVectoredError::*; match (lhs, rhs) { - (Cancelled, Cancelled) => true, - (_, Cancelled) => true, (Oversized(l), Oversized(r)) => l == r, (InvalidLsn(l), InvalidLsn(r)) => l == r, (MissingKey(l), MissingKey(r)) => l == r, @@ -873,6 +877,8 @@ impl Timeline { } match (&sequential_res, vectored_res) { + (Err(GetVectoredError::Cancelled), _) => {}, + (_, Err(GetVectoredError::Cancelled)) => {}, (Err(seq_err), Ok(_)) => { panic!(concat!("Sequential get failed with {}, but vectored get did not", " - keyspace={:?} lsn={}"), @@ -889,8 +895,7 @@ impl Timeline { assert_eq!(seq_key, vec_key); match (seq_res, vec_res) { (Ok(seq_blob), Ok(vec_blob)) => { - assert_eq!(seq_blob, vec_blob, - "Image mismatch for key {seq_key} - keyspace={keyspace:?} lsn={lsn}"); + Self::validate_key_equivalence(seq_key, &keyspace, lsn, seq_blob, vec_blob); }, (Err(err), Ok(_)) => { panic!( @@ -909,6 +914,41 @@ impl Timeline { } } + fn validate_key_equivalence( + key: &Key, + keyspace: &KeySpace, + lsn: Lsn, + seq: &Bytes, + vec: &Bytes, + ) { + if *key == AUX_FILES_KEY { + // The value reconstruct of AUX_FILES_KEY from records is not deterministic + // since it uses a hash map under the hood. Hence, deserialise both results + // before comparing. + let seq_aux_dir_res = AuxFilesDirectory::des(seq); + let vec_aux_dir_res = AuxFilesDirectory::des(vec); + match (&seq_aux_dir_res, &vec_aux_dir_res) { + (Ok(seq_aux_dir), Ok(vec_aux_dir)) => { + assert_eq!( + seq_aux_dir, vec_aux_dir, + "Mismatch for key {} - keyspace={:?} lsn={}", + key, keyspace, lsn + ); + } + (Err(_), Err(_)) => {} + _ => { + panic!("Mismatch for {key}: {seq_aux_dir_res:?} != {vec_aux_dir_res:?}"); + } + } + } else { + // All other keys should reconstruct deterministically, so we simply compare the blobs. + assert_eq!( + seq, vec, + "Image mismatch for key {key} - keyspace={keyspace:?} lsn={lsn}" + ); + } + } + /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev. pub(crate) fn get_last_record_lsn(&self) -> Lsn { self.last_record_lsn.load().last @@ -1107,174 +1147,14 @@ impl Timeline { } } - /// TODO: cancellation - async fn compact_legacy( - self: &Arc, - _cancel: &CancellationToken, - flags: EnumSet, - ctx: &RequestContext, - ) -> Result<(), CompactionError> { - // High level strategy for compaction / image creation: - // - // 1. First, calculate the desired "partitioning" of the - // currently in-use key space. The goal is to partition the - // key space into roughly fixed-size chunks, but also take into - // account any existing image layers, and try to align the - // chunk boundaries with the existing image layers to avoid - // too much churn. Also try to align chunk boundaries with - // relation boundaries. In principle, we don't know about - // relation boundaries here, we just deal with key-value - // pairs, and the code in pgdatadir_mapping.rs knows how to - // map relations into key-value pairs. But in practice we know - // that 'field6' is the block number, and the fields 1-5 - // identify a relation. This is just an optimization, - // though. - // - // 2. Once we know the partitioning, for each partition, - // decide if it's time to create a new image layer. The - // criteria is: there has been too much "churn" since the last - // image layer? The "churn" is fuzzy concept, it's a - // combination of too many delta files, or too much WAL in - // total in the delta file. Or perhaps: if creating an image - // file would allow to delete some older files. - // - // 3. After that, we compact all level0 delta files if there - // are too many of them. While compacting, we also garbage - // collect any page versions that are no longer needed because - // of the new image layers we created in step 2. - // - // TODO: This high level strategy hasn't been implemented yet. - // Below are functions compact_level0() and create_image_layers() - // but they are a bit ad hoc and don't quite work like it's explained - // above. Rewrite it. - - // Is the timeline being deleted? - if self.is_stopping() { - trace!("Dropping out of compaction on timeline shutdown"); - return Err(CompactionError::ShuttingDown); - } - - let target_file_size = self.get_checkpoint_distance(); - - // Define partitioning schema if needed - - // FIXME: the match should only cover repartitioning, not the next steps - match self - .repartition( - self.get_last_record_lsn(), - self.get_compaction_target_size(), - flags, - ctx, - ) - .await - { - Ok((partitioning, lsn)) => { - // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them - let image_ctx = RequestContextBuilder::extend(ctx) - .access_stats_behavior(AccessStatsBehavior::Skip) - .build(); - - // 2. Compact - let timer = self.metrics.compact_time_histo.start_timer(); - self.compact_level0(target_file_size, ctx).await?; - timer.stop_and_record(); - - // 3. Create new image layers for partitions that have been modified - // "enough". - let layers = self - .create_image_layers( - &partitioning, - lsn, - flags.contains(CompactFlags::ForceImageLayerCreation), - &image_ctx, - ) - .await - .map_err(anyhow::Error::from)?; - if let Some(remote_client) = &self.remote_client { - for layer in layers { - remote_client.schedule_layer_file_upload(layer)?; - } - } - - if let Some(remote_client) = &self.remote_client { - // should any new image layer been created, not uploading index_part will - // result in a mismatch between remote_physical_size and layermap calculated - // size, which will fail some tests, but should not be an issue otherwise. - remote_client.schedule_index_upload_for_file_changes()?; - } - } - Err(err) => { - // no partitioning? This is normal, if the timeline was just created - // as an empty timeline. Also in unit tests, when we use the timeline - // as a simple key-value store, ignoring the datadir layout. Log the - // error but continue. - // - // Suppress error when it's due to cancellation - if !self.cancel.is_cancelled() { - error!("could not compact, repartitioning keyspace failed: {err:?}"); - } - } - }; - - Ok(()) - } - /// Mutate the timeline with a [`TimelineWriter`]. pub(crate) async fn writer(&self) -> TimelineWriter<'_> { TimelineWriter { tl: self, - _write_guard: self.write_lock.lock().await, + write_guard: self.write_lock.lock().await, } } - /// Check if more than 'checkpoint_distance' of WAL has been accumulated in - /// the in-memory layer, and initiate flushing it if so. - /// - /// Also flush after a period of time without new data -- it helps - /// safekeepers to regard pageserver as caught up and suspend activity. - pub(crate) async fn check_checkpoint_distance(self: &Arc) -> anyhow::Result<()> { - let last_lsn = self.get_last_record_lsn(); - let open_layer_size = { - let guard = self.layers.read().await; - let layers = guard.layer_map(); - let Some(open_layer) = layers.open_layer.as_ref() else { - return Ok(()); - }; - open_layer.size().await? - }; - let last_freeze_at = self.last_freeze_at.load(); - let last_freeze_ts = *(self.last_freeze_ts.read().unwrap()); - let distance = last_lsn.widening_sub(last_freeze_at); - // Rolling the open layer can be triggered by: - // 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that - // the safekeepers need to store. For sharded tenants, we multiply by shard count to - // account for how writes are distributed across shards: we expect each node to consume - // 1/count of the LSN on average. - // 2. The size of the currently open layer. - // 3. The time since the last roll. It helps safekeepers to regard pageserver as caught - // up and suspend activity. - if (distance - >= self.get_checkpoint_distance() as i128 * self.shard_identity.count.count() as i128) - || open_layer_size > self.get_checkpoint_distance() - || (distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout()) - { - info!( - "check_checkpoint_distance {}, layer size {}, elapsed since last flush {:?}", - distance, - open_layer_size, - last_freeze_ts.elapsed() - ); - - self.freeze_inmem_layer(true).await; - self.last_freeze_at.store(last_lsn); - *(self.last_freeze_ts.write().unwrap()) = Instant::now(); - - // Wake up the layer flusher - self.flush_frozen_layers(); - } - Ok(()) - } - pub(crate) fn activate( self: &Arc, broker_client: BrokerClientChannel, @@ -1709,7 +1589,7 @@ impl Timeline { layer_flush_start_tx, layer_flush_done_tx, - write_lock: tokio::sync::Mutex::new(()), + write_lock: tokio::sync::Mutex::new(None), gc_info: std::sync::RwLock::new(GcInfo { retain_lsns: Vec::new(), @@ -2858,7 +2738,7 @@ impl Timeline { let guard = timeline.layers.read().await; let layers = guard.layer_map(); - 'outer: loop { + loop { if cancel.is_cancelled() { return Err(GetVectoredError::Cancelled); } @@ -2884,12 +2764,7 @@ impl Timeline { } None => { for range in unmapped_keyspace.ranges.iter() { - let results = match layers.range_search(range.clone(), cont_lsn) { - Some(res) => res, - None => { - break 'outer; - } - }; + let results = layers.range_search(range.clone(), cont_lsn); results .found @@ -3040,43 +2915,6 @@ impl Timeline { Ok(layer) } - async fn put_value( - &self, - key: Key, - lsn: Lsn, - val: &Value, - ctx: &RequestContext, - ) -> anyhow::Result<()> { - //info!("PUT: key {} at {}", key, lsn); - let layer = self.get_layer_for_write(lsn).await?; - layer.put_value(key, lsn, val, ctx).await?; - Ok(()) - } - - async fn put_values( - &self, - values: &HashMap>, - ctx: &RequestContext, - ) -> anyhow::Result<()> { - // Pick the first LSN in the batch to get the layer to write to. - for lsns in values.values() { - if let Some((lsn, _)) = lsns.first() { - let layer = self.get_layer_for_write(*lsn).await?; - layer.put_values(values, ctx).await?; - break; - } - } - Ok(()) - } - - async fn put_tombstones(&self, tombstones: &[(Range, Lsn)]) -> anyhow::Result<()> { - if let Some((_, lsn)) = tombstones.first() { - let layer = self.get_layer_for_write(*lsn).await?; - layer.put_tombstones(tombstones).await?; - } - Ok(()) - } - pub(crate) fn finish_write(&self, new_lsn: Lsn) { assert!(new_lsn.is_aligned()); @@ -3087,14 +2925,20 @@ impl Timeline { async fn freeze_inmem_layer(&self, write_lock_held: bool) { // Freeze the current open in-memory layer. It will be written to disk on next // iteration. + let _write_guard = if write_lock_held { None } else { Some(self.write_lock.lock().await) }; + + self.freeze_inmem_layer_at(self.get_last_record_lsn()).await; + } + + async fn freeze_inmem_layer_at(&self, at: Lsn) { let mut guard = self.layers.write().await; guard - .try_freeze_in_memory_layer(self.get_last_record_lsn(), &self.last_freeze_at) + .try_freeze_in_memory_layer(at, &self.last_freeze_at) .await; } @@ -3408,53 +3252,48 @@ impl Timeline { frozen_layer: &Arc, ctx: &RequestContext, ) -> anyhow::Result { - let span = tracing::info_span!("blocking"); - let new_delta: ResidentLayer = tokio::task::spawn_blocking({ - let self_clone = Arc::clone(self); - let frozen_layer = Arc::clone(frozen_layer); - let ctx = ctx.attached_child(); - move || { - // Write it out - // Keep this inside `spawn_blocking` and `Handle::current` - // as long as the write path is still sync and the read impl - // is still not fully async. Otherwise executor threads would - // be blocked. - let _g = span.entered(); - let new_delta = - Handle::current().block_on(frozen_layer.write_to_disk(&self_clone, &ctx))?; - let new_delta_path = new_delta.local_path().to_owned(); - - // Sync it to disk. - // - // We must also fsync the timeline dir to ensure the directory entries for - // new layer files are durable. - // - // NB: timeline dir must be synced _after_ the file contents are durable. - // So, two separate fsyncs are required, they mustn't be batched. - // - // TODO: If we're running inside 'flush_frozen_layers' and there are multiple - // files to flush, the fsync overhead can be reduces as follows: - // 1. write them all to temporary file names - // 2. fsync them - // 3. rename to the final name - // 4. fsync the parent directory. - // Note that (1),(2),(3) today happen inside write_to_disk(). - // - // FIXME: the writer already fsyncs all data, only rename needs to be fsynced here - par_fsync::par_fsync(&[new_delta_path]).context("fsync of delta layer")?; - par_fsync::par_fsync(&[self_clone + let self_clone = Arc::clone(self); + let frozen_layer = Arc::clone(frozen_layer); + let ctx = ctx.attached_child(); + let work = async move { + let new_delta = frozen_layer.write_to_disk(&self_clone, &ctx).await?; + // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes. + // We just need to fsync the directory in which these inodes are linked, + // which we know to be the timeline directory. + // + // We use fatal_err() below because the after write_to_disk returns with success, + // the in-memory state of the filesystem already has the layer file in its final place, + // and subsequent pageserver code could think it's durable while it really isn't. + let timeline_dir = VirtualFile::open( + &self_clone .conf - .timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id)]) - .context("fsync of timeline dir")?; - - anyhow::Ok(new_delta) + .timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id), + ) + .await + .fatal_err("VirtualFile::open for timeline dir fsync"); + timeline_dir + .sync_all() + .await + .fatal_err("VirtualFile::sync_all timeline dir"); + anyhow::Ok(new_delta) + }; + // Before tokio-epoll-uring, we ran write_to_disk & the sync_all inside spawn_blocking. + // Preserve that behavior to maintain the same behavior for `virtual_file_io_engine=std-fs`. + use crate::virtual_file::io_engine::IoEngine; + match crate::virtual_file::io_engine::get() { + IoEngine::NotSet => panic!("io engine not set"), + IoEngine::StdFs => { + let span = tracing::info_span!("blocking"); + tokio::task::spawn_blocking({ + move || Handle::current().block_on(work.instrument(span)) + }) + .await + .context("spawn_blocking") + .and_then(|x| x) } - }) - .await - .context("spawn_blocking") - .and_then(|x| x)?; - - Ok(new_delta) + #[cfg(target_os = "linux")] + IoEngine::TokioEpollUring => work.await, + } } async fn repartition( @@ -3674,30 +3513,24 @@ impl Timeline { } } - // Sync the new layer to disk before adding it to the layer map, to make sure - // we don't garbage collect something based on the new layer, before it has - // reached the disk. - // - // We must also fsync the timeline dir to ensure the directory entries for - // new layer files are durable - // - // Compaction creates multiple image layers. It would be better to create them all - // and fsync them all in parallel. - let all_paths = image_layers - .iter() - .map(|layer| layer.local_path().to_owned()) - .collect::>(); - - par_fsync::par_fsync_async(&all_paths) + // The writer.finish() above already did the fsync of the inodes. + // We just need to fsync the directory in which these inodes are linked, + // which we know to be the timeline directory. + if !image_layers.is_empty() { + // We use fatal_err() below because the after writer.finish() returns with success, + // the in-memory state of the filesystem already has the layer file in its final place, + // and subsequent pageserver code could think it's durable while it really isn't. + let timeline_dir = VirtualFile::open( + &self + .conf + .timeline_path(&self.tenant_shard_id, &self.timeline_id), + ) .await - .context("fsync of newly created layer files")?; - - if !all_paths.is_empty() { - par_fsync::par_fsync_async(&[self - .conf - .timeline_path(&self.tenant_shard_id, &self.timeline_id)]) - .await - .context("fsync of timeline dir")?; + .fatal_err("VirtualFile::open for timeline dir fsync"); + timeline_dir + .sync_all() + .await + .fatal_err("VirtualFile::sync_all timeline dir"); } let mut guard = self.layers.write().await; @@ -3738,12 +3571,6 @@ impl Timeline { } } -#[derive(Default)] -struct CompactLevel0Phase1Result { - new_layers: Vec, - deltas_to_compact: Vec, -} - /// Top-level failure to compact. #[derive(Debug, thiserror::Error)] pub(crate) enum CompactionError { @@ -3797,578 +3624,7 @@ impl DurationRecorder { } } -#[derive(Default)] -struct CompactLevel0Phase1StatsBuilder { - version: Option, - tenant_id: Option, - timeline_id: Option, - read_lock_acquisition_micros: DurationRecorder, - read_lock_held_spawn_blocking_startup_micros: DurationRecorder, - read_lock_held_key_sort_micros: DurationRecorder, - read_lock_held_prerequisites_micros: DurationRecorder, - read_lock_held_compute_holes_micros: DurationRecorder, - read_lock_drop_micros: DurationRecorder, - write_layer_files_micros: DurationRecorder, - level0_deltas_count: Option, - new_deltas_count: Option, - new_deltas_size: Option, -} - -#[derive(serde::Serialize)] -struct CompactLevel0Phase1Stats { - version: u64, - tenant_id: TenantShardId, - timeline_id: TimelineId, - read_lock_acquisition_micros: RecordedDuration, - read_lock_held_spawn_blocking_startup_micros: RecordedDuration, - read_lock_held_key_sort_micros: RecordedDuration, - read_lock_held_prerequisites_micros: RecordedDuration, - read_lock_held_compute_holes_micros: RecordedDuration, - read_lock_drop_micros: RecordedDuration, - write_layer_files_micros: RecordedDuration, - level0_deltas_count: usize, - new_deltas_count: usize, - new_deltas_size: u64, -} - -impl TryFrom for CompactLevel0Phase1Stats { - type Error = anyhow::Error; - - fn try_from(value: CompactLevel0Phase1StatsBuilder) -> Result { - Ok(Self { - version: value.version.ok_or_else(|| anyhow!("version not set"))?, - tenant_id: value - .tenant_id - .ok_or_else(|| anyhow!("tenant_id not set"))?, - timeline_id: value - .timeline_id - .ok_or_else(|| anyhow!("timeline_id not set"))?, - read_lock_acquisition_micros: value - .read_lock_acquisition_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_acquisition_micros not set"))?, - read_lock_held_spawn_blocking_startup_micros: value - .read_lock_held_spawn_blocking_startup_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_held_spawn_blocking_startup_micros not set"))?, - read_lock_held_key_sort_micros: value - .read_lock_held_key_sort_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_held_key_sort_micros not set"))?, - read_lock_held_prerequisites_micros: value - .read_lock_held_prerequisites_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_held_prerequisites_micros not set"))?, - read_lock_held_compute_holes_micros: value - .read_lock_held_compute_holes_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_held_compute_holes_micros not set"))?, - read_lock_drop_micros: value - .read_lock_drop_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_drop_micros not set"))?, - write_layer_files_micros: value - .write_layer_files_micros - .into_recorded() - .ok_or_else(|| anyhow!("write_layer_files_micros not set"))?, - level0_deltas_count: value - .level0_deltas_count - .ok_or_else(|| anyhow!("level0_deltas_count not set"))?, - new_deltas_count: value - .new_deltas_count - .ok_or_else(|| anyhow!("new_deltas_count not set"))?, - new_deltas_size: value - .new_deltas_size - .ok_or_else(|| anyhow!("new_deltas_size not set"))?, - }) - } -} - impl Timeline { - /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment. - async fn compact_level0_phase1( - self: &Arc, - guard: tokio::sync::OwnedRwLockReadGuard, - mut stats: CompactLevel0Phase1StatsBuilder, - target_file_size: u64, - ctx: &RequestContext, - ) -> Result { - stats.read_lock_held_spawn_blocking_startup_micros = - stats.read_lock_acquisition_micros.till_now(); // set by caller - let layers = guard.layer_map(); - let level0_deltas = layers.get_level0_deltas()?; - let mut level0_deltas = level0_deltas - .into_iter() - .map(|x| guard.get_from_desc(&x)) - .collect_vec(); - stats.level0_deltas_count = Some(level0_deltas.len()); - // Only compact if enough layers have accumulated. - let threshold = self.get_compaction_threshold(); - if level0_deltas.is_empty() || level0_deltas.len() < threshold { - debug!( - level0_deltas = level0_deltas.len(), - threshold, "too few deltas to compact" - ); - return Ok(CompactLevel0Phase1Result::default()); - } - - // This failpoint is used together with `test_duplicate_layers` integration test. - // It returns the compaction result exactly the same layers as input to compaction. - // We want to ensure that this will not cause any problem when updating the layer map - // after the compaction is finished. - // - // Currently, there are two rare edge cases that will cause duplicated layers being - // inserted. - // 1. The compaction job is inturrupted / did not finish successfully. Assume we have file 1, 2, 3, 4, which - // is compacted to 5, but the page server is shut down, next time we start page server we will get a layer - // map containing 1, 2, 3, 4, and 5, whereas 5 has the same content as 4. If we trigger L0 compation at this - // point again, it is likely that we will get a file 6 which has the same content and the key range as 5, - // and this causes an overwrite. This is acceptable because the content is the same, and we should do a - // layer replace instead of the normal remove / upload process. - // 2. The input workload pattern creates exactly n files that are sorted, non-overlapping and is of target file - // size length. Compaction will likely create the same set of n files afterwards. - // - // This failpoint is a superset of both of the cases. - if cfg!(feature = "testing") { - let active = (|| { - ::fail::fail_point!("compact-level0-phase1-return-same", |_| true); - false - })(); - - if active { - let mut new_layers = Vec::with_capacity(level0_deltas.len()); - for delta in &level0_deltas { - // we are just faking these layers as being produced again for this failpoint - new_layers.push( - delta - .download_and_keep_resident() - .await - .context("download layer for failpoint")?, - ); - } - tracing::info!("compact-level0-phase1-return-same"); // so that we can check if we hit the failpoint - return Ok(CompactLevel0Phase1Result { - new_layers, - deltas_to_compact: level0_deltas, - }); - } - } - - // Gather the files to compact in this iteration. - // - // Start with the oldest Level 0 delta file, and collect any other - // level 0 files that form a contiguous sequence, such that the end - // LSN of previous file matches the start LSN of the next file. - // - // Note that if the files don't form such a sequence, we might - // "compact" just a single file. That's a bit pointless, but it allows - // us to get rid of the level 0 file, and compact the other files on - // the next iteration. This could probably made smarter, but such - // "gaps" in the sequence of level 0 files should only happen in case - // of a crash, partial download from cloud storage, or something like - // that, so it's not a big deal in practice. - level0_deltas.sort_by_key(|l| l.layer_desc().lsn_range.start); - let mut level0_deltas_iter = level0_deltas.iter(); - - let first_level0_delta = level0_deltas_iter.next().unwrap(); - let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end; - let mut deltas_to_compact = Vec::with_capacity(level0_deltas.len()); - - deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?); - for l in level0_deltas_iter { - let lsn_range = &l.layer_desc().lsn_range; - - if lsn_range.start != prev_lsn_end { - break; - } - deltas_to_compact.push(l.download_and_keep_resident().await?); - prev_lsn_end = lsn_range.end; - } - let lsn_range = Range { - start: deltas_to_compact - .first() - .unwrap() - .layer_desc() - .lsn_range - .start, - end: deltas_to_compact.last().unwrap().layer_desc().lsn_range.end, - }; - - info!( - "Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)", - lsn_range.start, - lsn_range.end, - deltas_to_compact.len(), - level0_deltas.len() - ); - - for l in deltas_to_compact.iter() { - info!("compact includes {l}"); - } - - // We don't need the original list of layers anymore. Drop it so that - // we don't accidentally use it later in the function. - drop(level0_deltas); - - stats.read_lock_held_prerequisites_micros = stats - .read_lock_held_spawn_blocking_startup_micros - .till_now(); - - // Determine N largest holes where N is number of compacted layers. - let max_holes = deltas_to_compact.len(); - let last_record_lsn = self.get_last_record_lsn(); - let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128; - let min_hole_coverage_size = 3; // TODO: something more flexible? - - // min-heap (reserve space for one more element added before eviction) - let mut heap: BinaryHeap = BinaryHeap::with_capacity(max_holes + 1); - let mut prev: Option = None; - - let mut all_keys = Vec::new(); - - for l in deltas_to_compact.iter() { - all_keys.extend(l.load_keys(ctx).await?); - } - - // FIXME: should spawn_blocking the rest of this function - - // The current stdlib sorting implementation is designed in a way where it is - // particularly fast where the slice is made up of sorted sub-ranges. - all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn)); - - stats.read_lock_held_key_sort_micros = stats.read_lock_held_prerequisites_micros.till_now(); - - for &DeltaEntry { key: next_key, .. } in all_keys.iter() { - if let Some(prev_key) = prev { - // just first fast filter - if next_key.to_i128() - prev_key.to_i128() >= min_hole_range { - let key_range = prev_key..next_key; - // Measuring hole by just subtraction of i128 representation of key range boundaries - // has not so much sense, because largest holes will corresponds field1/field2 changes. - // But we are mostly interested to eliminate holes which cause generation of excessive image layers. - // That is why it is better to measure size of hole as number of covering image layers. - let coverage_size = layers.image_coverage(&key_range, last_record_lsn).len(); - if coverage_size >= min_hole_coverage_size { - heap.push(Hole { - key_range, - coverage_size, - }); - if heap.len() > max_holes { - heap.pop(); // remove smallest hole - } - } - } - } - prev = Some(next_key.next()); - } - stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now(); - drop_rlock(guard); - stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now(); - let mut holes = heap.into_vec(); - holes.sort_unstable_by_key(|hole| hole.key_range.start); - let mut next_hole = 0; // index of next hole in holes vector - - // This iterator walks through all key-value pairs from all the layers - // we're compacting, in key, LSN order. - let all_values_iter = all_keys.iter(); - - // This iterator walks through all keys and is needed to calculate size used by each key - let mut all_keys_iter = all_keys - .iter() - .map(|DeltaEntry { key, lsn, size, .. }| (*key, *lsn, *size)) - .coalesce(|mut prev, cur| { - // Coalesce keys that belong to the same key pair. - // This ensures that compaction doesn't put them - // into different layer files. - // Still limit this by the target file size, - // so that we keep the size of the files in - // check. - if prev.0 == cur.0 && prev.2 < target_file_size { - prev.2 += cur.2; - Ok(prev) - } else { - Err((prev, cur)) - } - }); - - // Merge the contents of all the input delta layers into a new set - // of delta layers, based on the current partitioning. - // - // We split the new delta layers on the key dimension. We iterate through the key space, and for each key, check if including the next key to the current output layer we're building would cause the layer to become too large. If so, dump the current output layer and start new one. - // It's possible that there is a single key with so many page versions that storing all of them in a single layer file - // would be too large. In that case, we also split on the LSN dimension. - // - // LSN - // ^ - // | - // | +-----------+ +--+--+--+--+ - // | | | | | | | | - // | +-----------+ | | | | | - // | | | | | | | | - // | +-----------+ ==> | | | | | - // | | | | | | | | - // | +-----------+ | | | | | - // | | | | | | | | - // | +-----------+ +--+--+--+--+ - // | - // +--------------> key - // - // - // If one key (X) has a lot of page versions: - // - // LSN - // ^ - // | (X) - // | +-----------+ +--+--+--+--+ - // | | | | | | | | - // | +-----------+ | | +--+ | - // | | | | | | | | - // | +-----------+ ==> | | | | | - // | | | | | +--+ | - // | +-----------+ | | | | | - // | | | | | | | | - // | +-----------+ +--+--+--+--+ - // | - // +--------------> key - // TODO: this actually divides the layers into fixed-size chunks, not - // based on the partitioning. - // - // TODO: we should also opportunistically materialize and - // garbage collect what we can. - let mut new_layers = Vec::new(); - let mut prev_key: Option = None; - let mut writer: Option = None; - let mut key_values_total_size = 0u64; - let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key - let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key - - for &DeltaEntry { - key, lsn, ref val, .. - } in all_values_iter - { - let value = val.load(ctx).await?; - let same_key = prev_key.map_or(false, |prev_key| prev_key == key); - // We need to check key boundaries once we reach next key or end of layer with the same key - if !same_key || lsn == dup_end_lsn { - let mut next_key_size = 0u64; - let is_dup_layer = dup_end_lsn.is_valid(); - dup_start_lsn = Lsn::INVALID; - if !same_key { - dup_end_lsn = Lsn::INVALID; - } - // Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size - for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() { - next_key_size = next_size; - if key != next_key { - if dup_end_lsn.is_valid() { - // We are writting segment with duplicates: - // place all remaining values of this key in separate segment - dup_start_lsn = dup_end_lsn; // new segments starts where old stops - dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range - } - break; - } - key_values_total_size += next_size; - // Check if it is time to split segment: if total keys size is larger than target file size. - // We need to avoid generation of empty segments if next_size > target_file_size. - if key_values_total_size > target_file_size && lsn != next_lsn { - // Split key between multiple layers: such layer can contain only single key - dup_start_lsn = if dup_end_lsn.is_valid() { - dup_end_lsn // new segment with duplicates starts where old one stops - } else { - lsn // start with the first LSN for this key - }; - dup_end_lsn = next_lsn; // upper LSN boundary is exclusive - break; - } - } - // handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set. - if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() { - dup_start_lsn = dup_end_lsn; - dup_end_lsn = lsn_range.end; - } - if writer.is_some() { - let written_size = writer.as_mut().unwrap().size(); - let contains_hole = - next_hole < holes.len() && key >= holes[next_hole].key_range.end; - // check if key cause layer overflow or contains hole... - if is_dup_layer - || dup_end_lsn.is_valid() - || written_size + key_values_total_size > target_file_size - || contains_hole - { - // ... if so, flush previous layer and prepare to write new one - new_layers.push( - writer - .take() - .unwrap() - .finish(prev_key.unwrap().next(), self) - .await?, - ); - writer = None; - - if contains_hole { - // skip hole - next_hole += 1; - } - } - } - // Remember size of key value because at next iteration we will access next item - key_values_total_size = next_key_size; - } - fail_point!("delta-layer-writer-fail-before-finish", |_| { - Err(CompactionError::Other(anyhow::anyhow!( - "failpoint delta-layer-writer-fail-before-finish" - ))) - }); - - if !self.shard_identity.is_key_disposable(&key) { - if writer.is_none() { - // Create writer if not initiaized yet - writer = Some( - DeltaLayerWriter::new( - self.conf, - self.timeline_id, - self.tenant_shard_id, - key, - if dup_end_lsn.is_valid() { - // this is a layer containing slice of values of the same key - debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn); - dup_start_lsn..dup_end_lsn - } else { - debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end); - lsn_range.clone() - }, - ) - .await?, - ); - } - - writer.as_mut().unwrap().put_value(key, lsn, value).await?; - } else { - debug!( - "Dropping key {} during compaction (it belongs on shard {:?})", - key, - self.shard_identity.get_shard_number(&key) - ); - } - - if !new_layers.is_empty() { - fail_point!("after-timeline-compacted-first-L1"); - } - - prev_key = Some(key); - } - if let Some(writer) = writer { - new_layers.push(writer.finish(prev_key.unwrap().next(), self).await?); - } - - // Sync layers - if !new_layers.is_empty() { - // Print a warning if the created layer is larger than double the target size - // Add two pages for potential overhead. This should in theory be already - // accounted for in the target calculation, but for very small targets, - // we still might easily hit the limit otherwise. - let warn_limit = target_file_size * 2 + page_cache::PAGE_SZ as u64 * 2; - for layer in new_layers.iter() { - if layer.layer_desc().file_size > warn_limit { - warn!( - %layer, - "created delta file of size {} larger than double of target of {target_file_size}", layer.layer_desc().file_size - ); - } - } - - // FIXME: the writer already fsyncs all data, only rename needs to be fsynced here - let layer_paths: Vec = new_layers - .iter() - .map(|l| l.local_path().to_owned()) - .collect(); - - // Fsync all the layer files and directory using multiple threads to - // minimize latency. - par_fsync::par_fsync_async(&layer_paths) - .await - .context("fsync all new layers")?; - - let timeline_dir = self - .conf - .timeline_path(&self.tenant_shard_id, &self.timeline_id); - - par_fsync::par_fsync_async(&[timeline_dir]) - .await - .context("fsync of timeline dir")?; - } - - stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now(); - stats.new_deltas_count = Some(new_layers.len()); - stats.new_deltas_size = Some(new_layers.iter().map(|l| l.layer_desc().file_size).sum()); - - match TryInto::::try_into(stats) - .and_then(|stats| serde_json::to_string(&stats).context("serde_json::to_string")) - { - Ok(stats_json) => { - info!( - stats_json = stats_json.as_str(), - "compact_level0_phase1 stats available" - ) - } - Err(e) => { - warn!("compact_level0_phase1 stats failed to serialize: {:#}", e); - } - } - - Ok(CompactLevel0Phase1Result { - new_layers, - deltas_to_compact: deltas_to_compact - .into_iter() - .map(|x| x.drop_eviction_guard()) - .collect::>(), - }) - } - - /// - /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as - /// as Level 1 files. - /// - async fn compact_level0( - self: &Arc, - target_file_size: u64, - ctx: &RequestContext, - ) -> Result<(), CompactionError> { - let CompactLevel0Phase1Result { - new_layers, - deltas_to_compact, - } = { - let phase1_span = info_span!("compact_level0_phase1"); - let ctx = ctx.attached_child(); - let mut stats = CompactLevel0Phase1StatsBuilder { - version: Some(2), - tenant_id: Some(self.tenant_shard_id), - timeline_id: Some(self.timeline_id), - ..Default::default() - }; - - let begin = tokio::time::Instant::now(); - let phase1_layers_locked = Arc::clone(&self.layers).read_owned().await; - let now = tokio::time::Instant::now(); - stats.read_lock_acquisition_micros = - DurationRecorder::Recorded(RecordedDuration(now - begin), now); - self.compact_level0_phase1(phase1_layers_locked, stats, target_file_size, &ctx) - .instrument(phase1_span) - .await? - }; - - if new_layers.is_empty() && deltas_to_compact.is_empty() { - // nothing to do - return Ok(()); - } - - self.finish_compact_batch(&new_layers, &Vec::new(), &deltas_to_compact) - .await?; - Ok(()) - } - async fn finish_compact_batch( self: &Arc, new_deltas: &[ResidentLayer], @@ -5059,13 +4315,43 @@ fn layer_traversal_error(msg: String, path: Vec) -> PageRecon PageReconstructError::from(msg) } +struct TimelineWriterState { + open_layer: Arc, + current_size: u64, + // Previous Lsn which passed through + prev_lsn: Option, + // Largest Lsn which passed through the current writer + max_lsn: Option, + // Cached details of the last freeze. Avoids going trough the atomic/lock on every put. + cached_last_freeze_at: Lsn, + cached_last_freeze_ts: Instant, +} + +impl TimelineWriterState { + fn new( + open_layer: Arc, + current_size: u64, + last_freeze_at: Lsn, + last_freeze_ts: Instant, + ) -> Self { + Self { + open_layer, + current_size, + prev_lsn: None, + max_lsn: None, + cached_last_freeze_at: last_freeze_at, + cached_last_freeze_ts: last_freeze_ts, + } + } +} + /// Various functions to mutate the timeline. // TODO Currently, Deref is used to allow easy access to read methods from this trait. // This is probably considered a bad practice in Rust and should be fixed eventually, // but will cause large code changes. pub(crate) struct TimelineWriter<'a> { tl: &'a Timeline, - _write_guard: tokio::sync::MutexGuard<'a, ()>, + write_guard: tokio::sync::MutexGuard<'a, Option>, } impl Deref for TimelineWriter<'_> { @@ -5076,31 +4362,239 @@ impl Deref for TimelineWriter<'_> { } } +impl Drop for TimelineWriter<'_> { + fn drop(&mut self) { + self.write_guard.take(); + } +} + +#[derive(PartialEq)] +enum OpenLayerAction { + Roll, + Open, + None, +} + impl<'a> TimelineWriter<'a> { /// Put a new page version that can be constructed from a WAL record /// /// This will implicitly extend the relation, if the page is beyond the /// current end-of-file. pub(crate) async fn put( - &self, + &mut self, key: Key, lsn: Lsn, value: &Value, ctx: &RequestContext, ) -> anyhow::Result<()> { - self.tl.put_value(key, lsn, value, ctx).await + // Avoid doing allocations for "small" values. + // In the regression test suite, the limit of 256 avoided allocations in 95% of cases: + // https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061 + let mut buf = smallvec::SmallVec::<[u8; 256]>::new(); + value.ser_into(&mut buf)?; + let buf_size: u64 = buf.len().try_into().expect("oversized value buf"); + + let action = self.get_open_layer_action(lsn, buf_size); + let layer = self.handle_open_layer_action(lsn, action).await?; + let res = layer.put_value(key, lsn, &buf, ctx).await; + + if res.is_ok() { + // Update the current size only when the entire write was ok. + // In case of failures, we may have had partial writes which + // render the size tracking out of sync. That's ok because + // the checkpoint distance should be significantly smaller + // than the S3 single shot upload limit of 5GiB. + let state = self.write_guard.as_mut().unwrap(); + + state.current_size += buf_size; + state.prev_lsn = Some(lsn); + state.max_lsn = std::cmp::max(state.max_lsn, Some(lsn)); + } + + res } + /// "Tick" the timeline writer: it will roll the open layer if required + /// and do nothing else. + pub(crate) async fn tick(&mut self) -> anyhow::Result<()> { + self.open_layer_if_present().await?; + + let last_record_lsn = self.get_last_record_lsn(); + let action = self.get_open_layer_action(last_record_lsn, 0); + if action == OpenLayerAction::Roll { + self.roll_layer(last_record_lsn).await?; + } + + Ok(()) + } + + /// Populate the timeline writer state only if an in-memory layer + /// is already open. + async fn open_layer_if_present(&mut self) -> anyhow::Result<()> { + assert!(self.write_guard.is_none()); + + let open_layer = { + let guard = self.layers.read().await; + let layers = guard.layer_map(); + match layers.open_layer { + Some(ref open_layer) => open_layer.clone(), + None => { + return Ok(()); + } + } + }; + + let initial_size = open_layer.size().await?; + let last_freeze_at = self.last_freeze_at.load(); + let last_freeze_ts = *self.last_freeze_ts.read().unwrap(); + self.write_guard.replace(TimelineWriterState::new( + open_layer, + initial_size, + last_freeze_at, + last_freeze_ts, + )); + + Ok(()) + } + + async fn handle_open_layer_action( + &mut self, + at: Lsn, + action: OpenLayerAction, + ) -> anyhow::Result<&Arc> { + match action { + OpenLayerAction::Roll => { + let freeze_at = self.write_guard.as_ref().unwrap().max_lsn.unwrap(); + self.roll_layer(freeze_at).await?; + self.open_layer(at).await?; + } + OpenLayerAction::Open => self.open_layer(at).await?, + OpenLayerAction::None => { + assert!(self.write_guard.is_some()); + } + } + + Ok(&self.write_guard.as_ref().unwrap().open_layer) + } + + async fn open_layer(&mut self, at: Lsn) -> anyhow::Result<()> { + let layer = self.tl.get_layer_for_write(at).await?; + let initial_size = layer.size().await?; + + let last_freeze_at = self.last_freeze_at.load(); + let last_freeze_ts = *self.last_freeze_ts.read().unwrap(); + self.write_guard.replace(TimelineWriterState::new( + layer, + initial_size, + last_freeze_at, + last_freeze_ts, + )); + + Ok(()) + } + + async fn roll_layer(&mut self, freeze_at: Lsn) -> anyhow::Result<()> { + assert!(self.write_guard.is_some()); + + self.tl.freeze_inmem_layer_at(freeze_at).await; + + let now = Instant::now(); + *(self.last_freeze_ts.write().unwrap()) = now; + + self.tl.flush_frozen_layers(); + + let current_size = self.write_guard.as_ref().unwrap().current_size; + if current_size > self.get_checkpoint_distance() { + warn!("Flushed oversized open layer with size {}", current_size) + } + + Ok(()) + } + + fn get_open_layer_action(&self, lsn: Lsn, new_value_size: u64) -> OpenLayerAction { + let state = &*self.write_guard; + let Some(state) = &state else { + return OpenLayerAction::Open; + }; + + if state.prev_lsn == Some(lsn) { + // Rolling mid LSN is not supported by downstream code. + // Hence, only roll at LSN boundaries. + return OpenLayerAction::None; + } + + if state.current_size == 0 { + // Don't roll empty layers + return OpenLayerAction::None; + } + + let distance = lsn.widening_sub(state.cached_last_freeze_at); + let proposed_open_layer_size = state.current_size + new_value_size; + + // Rolling the open layer can be triggered by: + // 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that + // the safekeepers need to store. For sharded tenants, we multiply by shard count to + // account for how writes are distributed across shards: we expect each node to consume + // 1/count of the LSN on average. + // 2. The size of the currently open layer. + // 3. The time since the last roll. It helps safekeepers to regard pageserver as caught + // up and suspend activity. + if distance + >= self.get_checkpoint_distance() as i128 * self.shard_identity.count.count() as i128 + { + info!( + "Will roll layer at {} with layer size {} due to LSN distance ({})", + lsn, state.current_size, distance + ); + + OpenLayerAction::Roll + } else if proposed_open_layer_size >= self.get_checkpoint_distance() { + info!( + "Will roll layer at {} with layer size {} due to layer size ({})", + lsn, state.current_size, proposed_open_layer_size + ); + + OpenLayerAction::Roll + } else if distance > 0 + && state.cached_last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() + { + info!( + "Will roll layer at {} with layer size {} due to time since last flush ({:?})", + lsn, + state.current_size, + state.cached_last_freeze_ts.elapsed() + ); + + OpenLayerAction::Roll + } else { + OpenLayerAction::None + } + } + + /// Put a batch keys at the specified Lsns. + /// + /// The batch should be sorted by Lsn such that it's safe + /// to roll the open layer mid batch. pub(crate) async fn put_batch( - &self, - batch: &HashMap>, + &mut self, + batch: Vec<(Key, Lsn, Value)>, ctx: &RequestContext, ) -> anyhow::Result<()> { - self.tl.put_values(batch, ctx).await + for (key, lsn, val) in batch { + self.put(key, lsn, &val, ctx).await? + } + + Ok(()) } - pub(crate) async fn delete_batch(&self, batch: &[(Range, Lsn)]) -> anyhow::Result<()> { - self.tl.put_tombstones(batch).await + pub(crate) async fn delete_batch(&mut self, batch: &[(Range, Lsn)]) -> anyhow::Result<()> { + if let Some((_, lsn)) = batch.first() { + let action = self.get_open_layer_action(*lsn, 0); + let layer = self.handle_open_layer_action(*lsn, action).await?; + layer.put_tombstones(batch).await?; + } + + Ok(()) } /// Track the end of the latest digested WAL record. diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 914e3948ef..74b75dabf0 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -4,24 +4,32 @@ //! //! The old legacy algorithm is implemented directly in `timeline.rs`. +use std::collections::BinaryHeap; use std::ops::{Deref, Range}; use std::sync::Arc; -use super::Timeline; +use super::layer_manager::LayerManager; +use super::{CompactFlags, DurationRecorder, RecordedDuration, Timeline}; +use anyhow::{anyhow, Context}; use async_trait::async_trait; +use enumset::EnumSet; use fail::fail_point; +use itertools::Itertools; +use pageserver_api::shard::TenantShardId; use tokio_util::sync::CancellationToken; -use tracing::{debug, trace, warn}; +use tracing::{debug, info, info_span, trace, warn, Instrument}; +use utils::id::TimelineId; -use crate::context::RequestContext; +use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc}; -use crate::tenant::timeline::{is_rel_fsm_block_key, is_rel_vm_block_key}; +use crate::tenant::timeline::{drop_rlock, is_rel_fsm_block_key, is_rel_vm_block_key, Hole}; use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter}; use crate::tenant::timeline::{Layer, ResidentLayer}; use crate::tenant::DeltaLayer; use crate::tenant::PageReconstructError; -use crate::ZERO_PAGE; +use crate::virtual_file::{MaybeFatalIo, VirtualFile}; +use crate::{page_cache, ZERO_PAGE}; use crate::keyspace::KeySpace; use crate::repository::Key; @@ -33,6 +41,694 @@ use pageserver_compaction::interface::*; use super::CompactionError; +impl Timeline { + /// TODO: cancellation + pub(crate) async fn compact_legacy( + self: &Arc, + _cancel: &CancellationToken, + flags: EnumSet, + ctx: &RequestContext, + ) -> Result<(), CompactionError> { + // High level strategy for compaction / image creation: + // + // 1. First, calculate the desired "partitioning" of the + // currently in-use key space. The goal is to partition the + // key space into roughly fixed-size chunks, but also take into + // account any existing image layers, and try to align the + // chunk boundaries with the existing image layers to avoid + // too much churn. Also try to align chunk boundaries with + // relation boundaries. In principle, we don't know about + // relation boundaries here, we just deal with key-value + // pairs, and the code in pgdatadir_mapping.rs knows how to + // map relations into key-value pairs. But in practice we know + // that 'field6' is the block number, and the fields 1-5 + // identify a relation. This is just an optimization, + // though. + // + // 2. Once we know the partitioning, for each partition, + // decide if it's time to create a new image layer. The + // criteria is: there has been too much "churn" since the last + // image layer? The "churn" is fuzzy concept, it's a + // combination of too many delta files, or too much WAL in + // total in the delta file. Or perhaps: if creating an image + // file would allow to delete some older files. + // + // 3. After that, we compact all level0 delta files if there + // are too many of them. While compacting, we also garbage + // collect any page versions that are no longer needed because + // of the new image layers we created in step 2. + // + // TODO: This high level strategy hasn't been implemented yet. + // Below are functions compact_level0() and create_image_layers() + // but they are a bit ad hoc and don't quite work like it's explained + // above. Rewrite it. + + // Is the timeline being deleted? + if self.is_stopping() { + trace!("Dropping out of compaction on timeline shutdown"); + return Err(CompactionError::ShuttingDown); + } + + let target_file_size = self.get_checkpoint_distance(); + + // Define partitioning schema if needed + + // FIXME: the match should only cover repartitioning, not the next steps + match self + .repartition( + self.get_last_record_lsn(), + self.get_compaction_target_size(), + flags, + ctx, + ) + .await + { + Ok((partitioning, lsn)) => { + // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them + let image_ctx = RequestContextBuilder::extend(ctx) + .access_stats_behavior(AccessStatsBehavior::Skip) + .build(); + + // 2. Compact + let timer = self.metrics.compact_time_histo.start_timer(); + self.compact_level0(target_file_size, ctx).await?; + timer.stop_and_record(); + + // 3. Create new image layers for partitions that have been modified + // "enough". + let layers = self + .create_image_layers( + &partitioning, + lsn, + flags.contains(CompactFlags::ForceImageLayerCreation), + &image_ctx, + ) + .await + .map_err(anyhow::Error::from)?; + if let Some(remote_client) = &self.remote_client { + for layer in layers { + remote_client.schedule_layer_file_upload(layer)?; + } + } + + if let Some(remote_client) = &self.remote_client { + // should any new image layer been created, not uploading index_part will + // result in a mismatch between remote_physical_size and layermap calculated + // size, which will fail some tests, but should not be an issue otherwise. + remote_client.schedule_index_upload_for_file_changes()?; + } + } + Err(err) => { + // no partitioning? This is normal, if the timeline was just created + // as an empty timeline. Also in unit tests, when we use the timeline + // as a simple key-value store, ignoring the datadir layout. Log the + // error but continue. + // + // Suppress error when it's due to cancellation + if !self.cancel.is_cancelled() { + tracing::error!("could not compact, repartitioning keyspace failed: {err:?}"); + } + } + }; + + Ok(()) + } + + /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as + /// as Level 1 files. + async fn compact_level0( + self: &Arc, + target_file_size: u64, + ctx: &RequestContext, + ) -> Result<(), CompactionError> { + let CompactLevel0Phase1Result { + new_layers, + deltas_to_compact, + } = { + let phase1_span = info_span!("compact_level0_phase1"); + let ctx = ctx.attached_child(); + let mut stats = CompactLevel0Phase1StatsBuilder { + version: Some(2), + tenant_id: Some(self.tenant_shard_id), + timeline_id: Some(self.timeline_id), + ..Default::default() + }; + + let begin = tokio::time::Instant::now(); + let phase1_layers_locked = Arc::clone(&self.layers).read_owned().await; + let now = tokio::time::Instant::now(); + stats.read_lock_acquisition_micros = + DurationRecorder::Recorded(RecordedDuration(now - begin), now); + self.compact_level0_phase1(phase1_layers_locked, stats, target_file_size, &ctx) + .instrument(phase1_span) + .await? + }; + + if new_layers.is_empty() && deltas_to_compact.is_empty() { + // nothing to do + return Ok(()); + } + + self.finish_compact_batch(&new_layers, &Vec::new(), &deltas_to_compact) + .await?; + Ok(()) + } + + /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment. + async fn compact_level0_phase1( + self: &Arc, + guard: tokio::sync::OwnedRwLockReadGuard, + mut stats: CompactLevel0Phase1StatsBuilder, + target_file_size: u64, + ctx: &RequestContext, + ) -> Result { + stats.read_lock_held_spawn_blocking_startup_micros = + stats.read_lock_acquisition_micros.till_now(); // set by caller + let layers = guard.layer_map(); + let level0_deltas = layers.get_level0_deltas()?; + let mut level0_deltas = level0_deltas + .into_iter() + .map(|x| guard.get_from_desc(&x)) + .collect_vec(); + stats.level0_deltas_count = Some(level0_deltas.len()); + // Only compact if enough layers have accumulated. + let threshold = self.get_compaction_threshold(); + if level0_deltas.is_empty() || level0_deltas.len() < threshold { + debug!( + level0_deltas = level0_deltas.len(), + threshold, "too few deltas to compact" + ); + return Ok(CompactLevel0Phase1Result::default()); + } + + // This failpoint is used together with `test_duplicate_layers` integration test. + // It returns the compaction result exactly the same layers as input to compaction. + // We want to ensure that this will not cause any problem when updating the layer map + // after the compaction is finished. + // + // Currently, there are two rare edge cases that will cause duplicated layers being + // inserted. + // 1. The compaction job is inturrupted / did not finish successfully. Assume we have file 1, 2, 3, 4, which + // is compacted to 5, but the page server is shut down, next time we start page server we will get a layer + // map containing 1, 2, 3, 4, and 5, whereas 5 has the same content as 4. If we trigger L0 compation at this + // point again, it is likely that we will get a file 6 which has the same content and the key range as 5, + // and this causes an overwrite. This is acceptable because the content is the same, and we should do a + // layer replace instead of the normal remove / upload process. + // 2. The input workload pattern creates exactly n files that are sorted, non-overlapping and is of target file + // size length. Compaction will likely create the same set of n files afterwards. + // + // This failpoint is a superset of both of the cases. + if cfg!(feature = "testing") { + let active = (|| { + ::fail::fail_point!("compact-level0-phase1-return-same", |_| true); + false + })(); + + if active { + let mut new_layers = Vec::with_capacity(level0_deltas.len()); + for delta in &level0_deltas { + // we are just faking these layers as being produced again for this failpoint + new_layers.push( + delta + .download_and_keep_resident() + .await + .context("download layer for failpoint")?, + ); + } + tracing::info!("compact-level0-phase1-return-same"); // so that we can check if we hit the failpoint + return Ok(CompactLevel0Phase1Result { + new_layers, + deltas_to_compact: level0_deltas, + }); + } + } + + // Gather the files to compact in this iteration. + // + // Start with the oldest Level 0 delta file, and collect any other + // level 0 files that form a contiguous sequence, such that the end + // LSN of previous file matches the start LSN of the next file. + // + // Note that if the files don't form such a sequence, we might + // "compact" just a single file. That's a bit pointless, but it allows + // us to get rid of the level 0 file, and compact the other files on + // the next iteration. This could probably made smarter, but such + // "gaps" in the sequence of level 0 files should only happen in case + // of a crash, partial download from cloud storage, or something like + // that, so it's not a big deal in practice. + level0_deltas.sort_by_key(|l| l.layer_desc().lsn_range.start); + let mut level0_deltas_iter = level0_deltas.iter(); + + let first_level0_delta = level0_deltas_iter.next().unwrap(); + let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end; + let mut deltas_to_compact = Vec::with_capacity(level0_deltas.len()); + + deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?); + for l in level0_deltas_iter { + let lsn_range = &l.layer_desc().lsn_range; + + if lsn_range.start != prev_lsn_end { + break; + } + deltas_to_compact.push(l.download_and_keep_resident().await?); + prev_lsn_end = lsn_range.end; + } + let lsn_range = Range { + start: deltas_to_compact + .first() + .unwrap() + .layer_desc() + .lsn_range + .start, + end: deltas_to_compact.last().unwrap().layer_desc().lsn_range.end, + }; + + info!( + "Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)", + lsn_range.start, + lsn_range.end, + deltas_to_compact.len(), + level0_deltas.len() + ); + + for l in deltas_to_compact.iter() { + info!("compact includes {l}"); + } + + // We don't need the original list of layers anymore. Drop it so that + // we don't accidentally use it later in the function. + drop(level0_deltas); + + stats.read_lock_held_prerequisites_micros = stats + .read_lock_held_spawn_blocking_startup_micros + .till_now(); + + // Determine N largest holes where N is number of compacted layers. + let max_holes = deltas_to_compact.len(); + let last_record_lsn = self.get_last_record_lsn(); + let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128; + let min_hole_coverage_size = 3; // TODO: something more flexible? + + // min-heap (reserve space for one more element added before eviction) + let mut heap: BinaryHeap = BinaryHeap::with_capacity(max_holes + 1); + let mut prev: Option = None; + + let mut all_keys = Vec::new(); + + for l in deltas_to_compact.iter() { + all_keys.extend(l.load_keys(ctx).await?); + } + + // FIXME: should spawn_blocking the rest of this function + + // The current stdlib sorting implementation is designed in a way where it is + // particularly fast where the slice is made up of sorted sub-ranges. + all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn)); + + stats.read_lock_held_key_sort_micros = stats.read_lock_held_prerequisites_micros.till_now(); + + for &DeltaEntry { key: next_key, .. } in all_keys.iter() { + if let Some(prev_key) = prev { + // just first fast filter + if next_key.to_i128() - prev_key.to_i128() >= min_hole_range { + let key_range = prev_key..next_key; + // Measuring hole by just subtraction of i128 representation of key range boundaries + // has not so much sense, because largest holes will corresponds field1/field2 changes. + // But we are mostly interested to eliminate holes which cause generation of excessive image layers. + // That is why it is better to measure size of hole as number of covering image layers. + let coverage_size = layers.image_coverage(&key_range, last_record_lsn).len(); + if coverage_size >= min_hole_coverage_size { + heap.push(Hole { + key_range, + coverage_size, + }); + if heap.len() > max_holes { + heap.pop(); // remove smallest hole + } + } + } + } + prev = Some(next_key.next()); + } + stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now(); + drop_rlock(guard); + stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now(); + let mut holes = heap.into_vec(); + holes.sort_unstable_by_key(|hole| hole.key_range.start); + let mut next_hole = 0; // index of next hole in holes vector + + // This iterator walks through all key-value pairs from all the layers + // we're compacting, in key, LSN order. + let all_values_iter = all_keys.iter(); + + // This iterator walks through all keys and is needed to calculate size used by each key + let mut all_keys_iter = all_keys + .iter() + .map(|DeltaEntry { key, lsn, size, .. }| (*key, *lsn, *size)) + .coalesce(|mut prev, cur| { + // Coalesce keys that belong to the same key pair. + // This ensures that compaction doesn't put them + // into different layer files. + // Still limit this by the target file size, + // so that we keep the size of the files in + // check. + if prev.0 == cur.0 && prev.2 < target_file_size { + prev.2 += cur.2; + Ok(prev) + } else { + Err((prev, cur)) + } + }); + + // Merge the contents of all the input delta layers into a new set + // of delta layers, based on the current partitioning. + // + // We split the new delta layers on the key dimension. We iterate through the key space, and for each key, check if including the next key to the current output layer we're building would cause the layer to become too large. If so, dump the current output layer and start new one. + // It's possible that there is a single key with so many page versions that storing all of them in a single layer file + // would be too large. In that case, we also split on the LSN dimension. + // + // LSN + // ^ + // | + // | +-----------+ +--+--+--+--+ + // | | | | | | | | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ ==> | | | | | + // | | | | | | | | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ +--+--+--+--+ + // | + // +--------------> key + // + // + // If one key (X) has a lot of page versions: + // + // LSN + // ^ + // | (X) + // | +-----------+ +--+--+--+--+ + // | | | | | | | | + // | +-----------+ | | +--+ | + // | | | | | | | | + // | +-----------+ ==> | | | | | + // | | | | | +--+ | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ +--+--+--+--+ + // | + // +--------------> key + // TODO: this actually divides the layers into fixed-size chunks, not + // based on the partitioning. + // + // TODO: we should also opportunistically materialize and + // garbage collect what we can. + let mut new_layers = Vec::new(); + let mut prev_key: Option = None; + let mut writer: Option = None; + let mut key_values_total_size = 0u64; + let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key + let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key + + for &DeltaEntry { + key, lsn, ref val, .. + } in all_values_iter + { + let value = val.load(ctx).await?; + let same_key = prev_key.map_or(false, |prev_key| prev_key == key); + // We need to check key boundaries once we reach next key or end of layer with the same key + if !same_key || lsn == dup_end_lsn { + let mut next_key_size = 0u64; + let is_dup_layer = dup_end_lsn.is_valid(); + dup_start_lsn = Lsn::INVALID; + if !same_key { + dup_end_lsn = Lsn::INVALID; + } + // Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size + for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() { + next_key_size = next_size; + if key != next_key { + if dup_end_lsn.is_valid() { + // We are writting segment with duplicates: + // place all remaining values of this key in separate segment + dup_start_lsn = dup_end_lsn; // new segments starts where old stops + dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range + } + break; + } + key_values_total_size += next_size; + // Check if it is time to split segment: if total keys size is larger than target file size. + // We need to avoid generation of empty segments if next_size > target_file_size. + if key_values_total_size > target_file_size && lsn != next_lsn { + // Split key between multiple layers: such layer can contain only single key + dup_start_lsn = if dup_end_lsn.is_valid() { + dup_end_lsn // new segment with duplicates starts where old one stops + } else { + lsn // start with the first LSN for this key + }; + dup_end_lsn = next_lsn; // upper LSN boundary is exclusive + break; + } + } + // handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set. + if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() { + dup_start_lsn = dup_end_lsn; + dup_end_lsn = lsn_range.end; + } + if writer.is_some() { + let written_size = writer.as_mut().unwrap().size(); + let contains_hole = + next_hole < holes.len() && key >= holes[next_hole].key_range.end; + // check if key cause layer overflow or contains hole... + if is_dup_layer + || dup_end_lsn.is_valid() + || written_size + key_values_total_size > target_file_size + || contains_hole + { + // ... if so, flush previous layer and prepare to write new one + new_layers.push( + writer + .take() + .unwrap() + .finish(prev_key.unwrap().next(), self) + .await?, + ); + writer = None; + + if contains_hole { + // skip hole + next_hole += 1; + } + } + } + // Remember size of key value because at next iteration we will access next item + key_values_total_size = next_key_size; + } + fail_point!("delta-layer-writer-fail-before-finish", |_| { + Err(CompactionError::Other(anyhow::anyhow!( + "failpoint delta-layer-writer-fail-before-finish" + ))) + }); + + if !self.shard_identity.is_key_disposable(&key) { + if writer.is_none() { + // Create writer if not initiaized yet + writer = Some( + DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + key, + if dup_end_lsn.is_valid() { + // this is a layer containing slice of values of the same key + debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn); + dup_start_lsn..dup_end_lsn + } else { + debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end); + lsn_range.clone() + }, + ) + .await?, + ); + } + + writer.as_mut().unwrap().put_value(key, lsn, value).await?; + } else { + debug!( + "Dropping key {} during compaction (it belongs on shard {:?})", + key, + self.shard_identity.get_shard_number(&key) + ); + } + + if !new_layers.is_empty() { + fail_point!("after-timeline-compacted-first-L1"); + } + + prev_key = Some(key); + } + if let Some(writer) = writer { + new_layers.push(writer.finish(prev_key.unwrap().next(), self).await?); + } + + // Sync layers + if !new_layers.is_empty() { + // Print a warning if the created layer is larger than double the target size + // Add two pages for potential overhead. This should in theory be already + // accounted for in the target calculation, but for very small targets, + // we still might easily hit the limit otherwise. + let warn_limit = target_file_size * 2 + page_cache::PAGE_SZ as u64 * 2; + for layer in new_layers.iter() { + if layer.layer_desc().file_size > warn_limit { + warn!( + %layer, + "created delta file of size {} larger than double of target of {target_file_size}", layer.layer_desc().file_size + ); + } + } + + // The writer.finish() above already did the fsync of the inodes. + // We just need to fsync the directory in which these inodes are linked, + // which we know to be the timeline directory. + // + // We use fatal_err() below because the after writer.finish() returns with success, + // the in-memory state of the filesystem already has the layer file in its final place, + // and subsequent pageserver code could think it's durable while it really isn't. + let timeline_dir = VirtualFile::open( + &self + .conf + .timeline_path(&self.tenant_shard_id, &self.timeline_id), + ) + .await + .fatal_err("VirtualFile::open for timeline dir fsync"); + timeline_dir + .sync_all() + .await + .fatal_err("VirtualFile::sync_all timeline dir"); + } + + stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now(); + stats.new_deltas_count = Some(new_layers.len()); + stats.new_deltas_size = Some(new_layers.iter().map(|l| l.layer_desc().file_size).sum()); + + match TryInto::::try_into(stats) + .and_then(|stats| serde_json::to_string(&stats).context("serde_json::to_string")) + { + Ok(stats_json) => { + info!( + stats_json = stats_json.as_str(), + "compact_level0_phase1 stats available" + ) + } + Err(e) => { + warn!("compact_level0_phase1 stats failed to serialize: {:#}", e); + } + } + + Ok(CompactLevel0Phase1Result { + new_layers, + deltas_to_compact: deltas_to_compact + .into_iter() + .map(|x| x.drop_eviction_guard()) + .collect::>(), + }) + } +} + +#[derive(Default)] +struct CompactLevel0Phase1Result { + new_layers: Vec, + deltas_to_compact: Vec, +} + +#[derive(Default)] +struct CompactLevel0Phase1StatsBuilder { + version: Option, + tenant_id: Option, + timeline_id: Option, + read_lock_acquisition_micros: DurationRecorder, + read_lock_held_spawn_blocking_startup_micros: DurationRecorder, + read_lock_held_key_sort_micros: DurationRecorder, + read_lock_held_prerequisites_micros: DurationRecorder, + read_lock_held_compute_holes_micros: DurationRecorder, + read_lock_drop_micros: DurationRecorder, + write_layer_files_micros: DurationRecorder, + level0_deltas_count: Option, + new_deltas_count: Option, + new_deltas_size: Option, +} + +#[derive(serde::Serialize)] +struct CompactLevel0Phase1Stats { + version: u64, + tenant_id: TenantShardId, + timeline_id: TimelineId, + read_lock_acquisition_micros: RecordedDuration, + read_lock_held_spawn_blocking_startup_micros: RecordedDuration, + read_lock_held_key_sort_micros: RecordedDuration, + read_lock_held_prerequisites_micros: RecordedDuration, + read_lock_held_compute_holes_micros: RecordedDuration, + read_lock_drop_micros: RecordedDuration, + write_layer_files_micros: RecordedDuration, + level0_deltas_count: usize, + new_deltas_count: usize, + new_deltas_size: u64, +} + +impl TryFrom for CompactLevel0Phase1Stats { + type Error = anyhow::Error; + + fn try_from(value: CompactLevel0Phase1StatsBuilder) -> Result { + Ok(Self { + version: value.version.ok_or_else(|| anyhow!("version not set"))?, + tenant_id: value + .tenant_id + .ok_or_else(|| anyhow!("tenant_id not set"))?, + timeline_id: value + .timeline_id + .ok_or_else(|| anyhow!("timeline_id not set"))?, + read_lock_acquisition_micros: value + .read_lock_acquisition_micros + .into_recorded() + .ok_or_else(|| anyhow!("read_lock_acquisition_micros not set"))?, + read_lock_held_spawn_blocking_startup_micros: value + .read_lock_held_spawn_blocking_startup_micros + .into_recorded() + .ok_or_else(|| anyhow!("read_lock_held_spawn_blocking_startup_micros not set"))?, + read_lock_held_key_sort_micros: value + .read_lock_held_key_sort_micros + .into_recorded() + .ok_or_else(|| anyhow!("read_lock_held_key_sort_micros not set"))?, + read_lock_held_prerequisites_micros: value + .read_lock_held_prerequisites_micros + .into_recorded() + .ok_or_else(|| anyhow!("read_lock_held_prerequisites_micros not set"))?, + read_lock_held_compute_holes_micros: value + .read_lock_held_compute_holes_micros + .into_recorded() + .ok_or_else(|| anyhow!("read_lock_held_compute_holes_micros not set"))?, + read_lock_drop_micros: value + .read_lock_drop_micros + .into_recorded() + .ok_or_else(|| anyhow!("read_lock_drop_micros not set"))?, + write_layer_files_micros: value + .write_layer_files_micros + .into_recorded() + .ok_or_else(|| anyhow!("write_layer_files_micros not set"))?, + level0_deltas_count: value + .level0_deltas_count + .ok_or_else(|| anyhow!("level0_deltas_count not set"))?, + new_deltas_count: value + .new_deltas_count + .ok_or_else(|| anyhow!("new_deltas_count not set"))?, + new_deltas_size: value + .new_deltas_size + .ok_or_else(|| anyhow!("new_deltas_size not set"))?, + }) + } +} + impl Timeline { /// Entry point for new tiered compaction algorithm. /// @@ -134,7 +830,6 @@ struct ResidentDeltaLayer(ResidentLayer); #[derive(Clone)] struct ResidentImageLayer(ResidentLayer); -#[async_trait] impl CompactionJobExecutor for TimelineAdaptor { type Key = crate::repository::Key; diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 9cb53f46d1..8297ca6563 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -343,23 +343,6 @@ pub(super) async fn handle_walreceiver_connection( modification.commit(&ctx).await?; uncommitted_records = 0; filtered_records = 0; - - // - // We should check checkpoint distance after appending each ingest_batch_size bytes because otherwise - // layer size can become much larger than `checkpoint_distance`. - // It can append because wal-sender is sending WAL using 125kb chucks and some WAL records can cause writing large - // amount of data to key-value storage. So performing this check only after processing - // all WAL records in the chunk, can cause huge L0 layer files. - // - timeline - .check_checkpoint_distance() - .await - .with_context(|| { - format!( - "Failed to check checkpoint distance for timeline {}", - timeline.timeline_id - ) - })?; } } @@ -406,15 +389,16 @@ pub(super) async fn handle_walreceiver_connection( } } - timeline - .check_checkpoint_distance() - .await - .with_context(|| { - format!( - "Failed to check checkpoint distance for timeline {}", - timeline.timeline_id - ) - })?; + { + // This is a hack. It piggybacks on the keepalive messages sent by the + // safekeeper in order to enforce `checkpoint_timeout` on the currently + // open layer. This hack doesn't provide a bound on the total size of + // in-memory layers on a pageserver. See https://github.com/neondatabase/neon/issues/6916. + let mut writer = timeline.writer().await; + if let Err(err) = writer.tick().await { + warn!("Timeline writer tick failed: {err}"); + } + } if let Some(last_lsn) = status_update { let timeline_remote_consistent_lsn = timeline diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index a8d9649d36..805f70b23b 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -128,7 +128,7 @@ impl VectoredReadBuilder { pub enum BlobFlag { None, Ignore, - Replaces, + ReplaceAll, } /// Planner for vectored blob reads. @@ -170,7 +170,7 @@ impl VectoredReadPlanner { /// incorrect data to the user. /// /// The `flag` argument has two interesting values: - /// * [`BlobFlag::Replaces`]: The blob for this key should replace all existing blobs. + /// * [`BlobFlag::ReplaceAll`]: The blob for this key should replace all existing blobs. /// This is used for WAL records that `will_init`. /// * [`BlobFlag::Ignore`]: This blob should not be included in the read. This happens /// if the blob is cached. @@ -204,7 +204,7 @@ impl VectoredReadPlanner { let blobs_for_key = self.blobs.entry(key).or_default(); blobs_for_key.push((lsn, start_offset, end_offset)); } - BlobFlag::Replaces => { + BlobFlag::ReplaceAll => { let blobs_for_key = self.blobs.entry(key).or_default(); blobs_for_key.clear(); blobs_for_key.push((lsn, start_offset, end_offset)); @@ -411,10 +411,10 @@ mod tests { let blob_descriptions = vec![ (first_key, lsn, 0, BlobFlag::None), // First in read 1 (first_key, lsn, 1024, BlobFlag::None), // Last in read 1 - (second_key, lsn, 2 * 1024, BlobFlag::Replaces), + (second_key, lsn, 2 * 1024, BlobFlag::ReplaceAll), (second_key, lsn, 3 * 1024, BlobFlag::None), - (second_key, lsn, 4 * 1024, BlobFlag::Replaces), // First in read 2 - (second_key, lsn, 5 * 1024, BlobFlag::None), // Last in read 2 + (second_key, lsn, 4 * 1024, BlobFlag::ReplaceAll), // First in read 2 + (second_key, lsn, 5 * 1024, BlobFlag::None), // Last in read 2 ]; let ranges = [&blob_descriptions[0..2], &blob_descriptions[4..]]; diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index b7112108f2..6d4774cf75 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -17,20 +17,21 @@ use crate::tenant::TENANTS_SEGMENT_NAME; use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; use pageserver_api::shard::TenantShardId; -use std::fs::{self, File}; +use std::fs::File; use std::io::{Error, ErrorKind, Seek, SeekFrom}; use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice}; use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd}; -use std::os::unix::fs::FileExt; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; use tokio::time::Instant; pub use pageserver_api::models::virtual_file as api; pub(crate) mod io_engine; +mod metadata; mod open_options; pub(crate) use io_engine::IoEngineKind; +pub(crate) use metadata::Metadata; pub(crate) use open_options::*; /// @@ -435,13 +436,25 @@ impl VirtualFile { /// Call File::sync_all() on the underlying File. pub async fn sync_all(&self) -> Result<(), Error> { - with_file!(self, StorageIoOperation::Fsync, |file_guard| file_guard - .with_std_file(|std_file| std_file.sync_all())) + with_file!(self, StorageIoOperation::Fsync, |file_guard| { + let (_file_guard, res) = io_engine::get().sync_all(file_guard).await; + res + }) } - pub async fn metadata(&self) -> Result { - with_file!(self, StorageIoOperation::Metadata, |file_guard| file_guard - .with_std_file(|std_file| std_file.metadata())) + /// Call File::sync_data() on the underlying File. + pub async fn sync_data(&self) -> Result<(), Error> { + with_file!(self, StorageIoOperation::Fsync, |file_guard| { + let (_file_guard, res) = io_engine::get().sync_data(file_guard).await; + res + }) + } + + pub async fn metadata(&self) -> Result { + with_file!(self, StorageIoOperation::Metadata, |file_guard| { + let (_file_guard, res) = io_engine::get().metadata(file_guard).await; + res + }) } /// Helper function internal to `VirtualFile` that looks up the underlying File, @@ -579,7 +592,7 @@ impl VirtualFile { } // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235 - pub async fn write_all_at( + pub async fn write_all_at, Buf: IoBuf + Send>( &self, buf: B, mut offset: u64, @@ -590,8 +603,9 @@ impl VirtualFile { } let mut buf = buf.slice(0..buf_len); while !buf.is_empty() { - // TODO: push `buf` further down - match self.write_at(&buf, offset).await { + let res; + (buf, res) = self.write_at(buf, offset).await; + match res { Ok(0) => { return ( Slice::into_inner(buf), @@ -605,7 +619,7 @@ impl VirtualFile { buf = buf.slice(n..); offset += n as u64; } - Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} + Err(e) if e.kind() == std::io::ErrorKind::Interrupted => {} Err(e) => return (Slice::into_inner(buf), Err(e)), } } @@ -616,15 +630,19 @@ impl VirtualFile { /// Returns the IoBuf that is underlying the BoundedBuf `buf`. /// I.e., the returned value's `bytes_init()` method returns something different than the `bytes_init()` that was passed in. /// It's quite brittle and easy to mis-use, so, we return the size in the Ok() variant. - pub async fn write_all(&mut self, buf: B) -> (B::Buf, Result) { + pub async fn write_all, Buf: IoBuf + Send>( + &mut self, + buf: B, + ) -> (B::Buf, Result) { let nbytes = buf.bytes_init(); if nbytes == 0 { return (Slice::into_inner(buf.slice_full()), Ok(0)); } let mut buf = buf.slice(0..nbytes); while !buf.is_empty() { - // TODO: push `Slice` further down - match self.write(&buf).await { + let res; + (buf, res) = self.write(buf).await; + match res { Ok(0) => { return ( Slice::into_inner(buf), @@ -644,11 +662,18 @@ impl VirtualFile { (Slice::into_inner(buf), Ok(nbytes)) } - async fn write(&mut self, buf: &[u8]) -> Result { + async fn write( + &mut self, + buf: Slice, + ) -> (Slice, Result) { let pos = self.pos; - let n = self.write_at(buf, pos).await?; + let (buf, res) = self.write_at(buf, pos).await; + let n = match res { + Ok(n) => n, + Err(e) => return (buf, Err(e)), + }; self.pos += n as u64; - Ok(n) + (buf, Ok(n)) } pub(crate) async fn read_at(&self, buf: B, offset: u64) -> (B, Result) @@ -676,16 +701,30 @@ impl VirtualFile { }) } - async fn write_at(&self, buf: &[u8], offset: u64) -> Result { - let result = with_file!(self, StorageIoOperation::Write, |file_guard| { - file_guard.with_std_file(|std_file| std_file.write_at(buf, offset)) - }); - if let Ok(size) = result { - STORAGE_IO_SIZE - .with_label_values(&["write", &self.tenant_id, &self.shard_id, &self.timeline_id]) - .add(size as i64); - } - result + async fn write_at( + &self, + buf: Slice, + offset: u64, + ) -> (Slice, Result) { + let file_guard = match self.lock_file().await { + Ok(file_guard) => file_guard, + Err(e) => return (buf, Err(e)), + }; + observe_duration!(StorageIoOperation::Write, { + let ((_file_guard, buf), result) = + io_engine::get().write_at(file_guard, offset, buf).await; + if let Ok(size) = result { + STORAGE_IO_SIZE + .with_label_values(&[ + "write", + &self.tenant_id, + &self.shard_id, + &self.timeline_id, + ]) + .add(size as i64); + } + (buf, result) + }) } } @@ -1083,6 +1122,7 @@ mod tests { use rand::Rng; use std::future::Future; use std::io::Write; + use std::os::unix::fs::FileExt; use std::sync::Arc; enum MaybeVirtualFile { @@ -1103,7 +1143,11 @@ mod tests { MaybeVirtualFile::File(file) => file.read_exact_at(&mut buf, offset).map(|()| buf), } } - async fn write_all_at(&self, buf: B, offset: u64) -> Result<(), Error> { + async fn write_all_at, Buf: IoBuf + Send>( + &self, + buf: B, + offset: u64, + ) -> Result<(), Error> { match self { MaybeVirtualFile::VirtualFile(file) => { let (_buf, res) = file.write_all_at(buf, offset).await; @@ -1124,7 +1168,10 @@ mod tests { MaybeVirtualFile::File(file) => file.seek(pos), } } - async fn write_all(&mut self, buf: B) -> Result<(), Error> { + async fn write_all, Buf: IoBuf + Send>( + &mut self, + buf: B, + ) -> Result<(), Error> { match self { MaybeVirtualFile::VirtualFile(file) => { let (_buf, res) = file.write_all(buf).await; diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index 892affa326..e369d28711 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -7,6 +7,9 @@ //! //! Then use [`get`] and [`super::OpenOptions`]. +use tokio_epoll_uring::{IoBuf, Slice}; +use tracing::Instrument; + pub(crate) use super::api::IoEngineKind; #[derive(Clone, Copy)] #[repr(u8)] @@ -61,7 +64,8 @@ pub(super) fn init(engine_kind: IoEngineKind) { set(engine_kind); } -pub(super) fn get() -> IoEngine { +/// Longer-term, this API should only be used by [`super::VirtualFile`]. +pub(crate) fn get() -> IoEngine { let cur = IoEngine::try_from(IO_ENGINE.load(Ordering::Relaxed)).unwrap(); if cfg!(test) { let env_var_name = "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE"; @@ -98,7 +102,17 @@ use std::{ sync::atomic::{AtomicU8, Ordering}, }; -use super::FileGuard; +use super::{FileGuard, Metadata}; + +#[cfg(target_os = "linux")] +fn epoll_uring_error_to_std(e: tokio_epoll_uring::Error) -> std::io::Error { + match e { + tokio_epoll_uring::Error::Op(e) => e, + tokio_epoll_uring::Error::System(system) => { + std::io::Error::new(std::io::ErrorKind::Other, system) + } + } +} impl IoEngine { pub(super) async fn read_at( @@ -133,16 +147,109 @@ impl IoEngine { IoEngine::TokioEpollUring => { let system = tokio_epoll_uring::thread_local_system().await; let (resources, res) = system.read(file_guard, offset, buf).await; + (resources, res.map_err(epoll_uring_error_to_std)) + } + } + } + pub(super) async fn sync_all(&self, file_guard: FileGuard) -> (FileGuard, std::io::Result<()>) { + match self { + IoEngine::NotSet => panic!("not initialized"), + IoEngine::StdFs => { + let res = file_guard.with_std_file(|std_file| std_file.sync_all()); + (file_guard, res) + } + #[cfg(target_os = "linux")] + IoEngine::TokioEpollUring => { + let system = tokio_epoll_uring::thread_local_system().await; + let (resources, res) = system.fsync(file_guard).await; + (resources, res.map_err(epoll_uring_error_to_std)) + } + } + } + pub(super) async fn sync_data( + &self, + file_guard: FileGuard, + ) -> (FileGuard, std::io::Result<()>) { + match self { + IoEngine::NotSet => panic!("not initialized"), + IoEngine::StdFs => { + let res = file_guard.with_std_file(|std_file| std_file.sync_data()); + (file_guard, res) + } + #[cfg(target_os = "linux")] + IoEngine::TokioEpollUring => { + let system = tokio_epoll_uring::thread_local_system().await; + let (resources, res) = system.fdatasync(file_guard).await; + (resources, res.map_err(epoll_uring_error_to_std)) + } + } + } + pub(super) async fn metadata( + &self, + file_guard: FileGuard, + ) -> (FileGuard, std::io::Result) { + match self { + IoEngine::NotSet => panic!("not initialized"), + IoEngine::StdFs => { + let res = + file_guard.with_std_file(|std_file| std_file.metadata().map(Metadata::from)); + (file_guard, res) + } + #[cfg(target_os = "linux")] + IoEngine::TokioEpollUring => { + let system = tokio_epoll_uring::thread_local_system().await; + let (resources, res) = system.statx(file_guard).await; ( resources, - res.map_err(|e| match e { - tokio_epoll_uring::Error::Op(e) => e, - tokio_epoll_uring::Error::System(system) => { - std::io::Error::new(std::io::ErrorKind::Other, system) - } - }), + res.map_err(epoll_uring_error_to_std).map(Metadata::from), ) } } } + pub(super) async fn write_at( + &self, + file_guard: FileGuard, + offset: u64, + buf: Slice, + ) -> ((FileGuard, Slice), std::io::Result) { + match self { + IoEngine::NotSet => panic!("not initialized"), + IoEngine::StdFs => { + let result = file_guard.with_std_file(|std_file| std_file.write_at(&buf, offset)); + ((file_guard, buf), result) + } + #[cfg(target_os = "linux")] + IoEngine::TokioEpollUring => { + let system = tokio_epoll_uring::thread_local_system().await; + let (resources, res) = system.write(file_guard, offset, buf).await; + (resources, res.map_err(epoll_uring_error_to_std)) + } + } + } + + /// If we switch a user of [`tokio::fs`] to use [`super::io_engine`], + /// they'd start blocking the executor thread if [`IoEngine::StdFs`] is configured + /// whereas before the switch to [`super::io_engine`], that wasn't the case. + /// This method helps avoid such a regression. + /// + /// Panics if the `spawn_blocking` fails, see [`tokio::task::JoinError`] for reasons why that can happen. + pub(crate) async fn spawn_blocking_and_block_on_if_std(&self, work: Fut) -> R + where + Fut: 'static + Send + std::future::Future, + R: 'static + Send, + { + match self { + IoEngine::NotSet => panic!("not initialized"), + IoEngine::StdFs => { + let span = tracing::info_span!("spawn_blocking_block_on_if_std"); + tokio::task::spawn_blocking({ + move || tokio::runtime::Handle::current().block_on(work.instrument(span)) + }) + .await + .expect("failed to join blocking code most likely it panicked, panicking as well") + } + #[cfg(target_os = "linux")] + IoEngine::TokioEpollUring => work.await, + } + } } diff --git a/pageserver/src/virtual_file/metadata.rs b/pageserver/src/virtual_file/metadata.rs new file mode 100644 index 0000000000..f530c50988 --- /dev/null +++ b/pageserver/src/virtual_file/metadata.rs @@ -0,0 +1,30 @@ +use std::fs; + +pub enum Metadata { + StdFs(fs::Metadata), + #[cfg(target_os = "linux")] + TokioEpollUring(Box), +} + +#[cfg(target_os = "linux")] +impl From> for Metadata { + fn from(value: Box) -> Self { + Metadata::TokioEpollUring(value) + } +} + +impl From for Metadata { + fn from(value: std::fs::Metadata) -> Self { + Metadata::StdFs(value) + } +} + +impl Metadata { + pub fn len(&self) -> u64 { + match self { + Metadata::StdFs(metadata) => metadata.len(), + #[cfg(target_os = "linux")] + Metadata::TokioEpollUring(statx) => statx.stx_size, + } + } +} diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 35cbefb92c..0004f4f3c9 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -262,7 +262,7 @@ impl PostgresRedoManager { // next request will launch a new one. if let Err(e) = result.as_ref() { error!( - "error applying {} WAL records {}..{} ({} bytes) to base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}", + "error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}", records.len(), records.first().map(|p| p.0).unwrap_or(Lsn(0)), records.last().map(|p| p.0).unwrap_or(Lsn(0)), diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index 7ea767ec74..0bcb9545a6 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -21,7 +21,7 @@ SHLIB_LINK_INTERNAL = $(libpq) SHLIB_LINK = -lcurl EXTENSION = neon -DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql +DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql PGFILEDESC = "neon - cloud storage for PostgreSQL" EXTRA_CLEAN = \ diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c index 00a582d718..93252e6b29 100644 --- a/pgxn/neon/control_plane_connector.c +++ b/pgxn/neon/control_plane_connector.c @@ -35,6 +35,7 @@ #include "utils/memutils.h" #include "utils/jsonb.h" +#include "control_plane_connector.h" #include "neon_utils.h" static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL; @@ -113,6 +114,8 @@ ConstructDeltaMessage() if (RootTable.db_table) { JsonbValue dbs; + HASH_SEQ_STATUS status; + DbEntry *entry; dbs.type = jbvString; dbs.val.string.val = "dbs"; @@ -120,9 +123,6 @@ ConstructDeltaMessage() pushJsonbValue(&state, WJB_KEY, &dbs); pushJsonbValue(&state, WJB_BEGIN_ARRAY, NULL); - HASH_SEQ_STATUS status; - DbEntry *entry; - hash_seq_init(&status, RootTable.db_table); while ((entry = hash_seq_search(&status)) != NULL) { @@ -168,8 +168,9 @@ ConstructDeltaMessage() #else const char *logdetail; #endif + char *encrypted_password; PushKeyValue(&state, "password", (char *) entry->password); - char *encrypted_password = get_role_password(entry->name, &logdetail); + encrypted_password = get_role_password(entry->name, &logdetail); if (encrypted_password) { @@ -831,7 +832,7 @@ NeonProcessUtility( } } -extern void +void InitControlPlaneConnector() { PreviousProcessUtilityHook = ProcessUtility_hook; diff --git a/pgxn/neon/control_plane_connector.h b/pgxn/neon/control_plane_connector.h index 12d6a97562..7eed449200 100644 --- a/pgxn/neon/control_plane_connector.h +++ b/pgxn/neon/control_plane_connector.h @@ -1,6 +1,6 @@ #ifndef CONTROL_PLANE_CONNECTOR_H #define CONTROL_PLANE_CONNECTOR_H -void InitControlPlaneConnector(); +void InitControlPlaneConnector(void); #endif diff --git a/pgxn/neon/extension_server.c b/pgxn/neon/extension_server.c index 039405e2cd..1329e2d17b 100644 --- a/pgxn/neon/extension_server.c +++ b/pgxn/neon/extension_server.c @@ -14,6 +14,7 @@ #include "utils/guc.h" +#include "extension_server.h" #include "neon_utils.h" static int extension_server_port = 0; diff --git a/pgxn/neon/extension_server.h b/pgxn/neon/extension_server.h new file mode 100644 index 0000000000..3e67708b85 --- /dev/null +++ b/pgxn/neon/extension_server.h @@ -0,0 +1,17 @@ +/*------------------------------------------------------------------------- + * + * extension_server.h + * Request compute_ctl to download extension files. + * + * IDENTIFICATION + * contrib/neon/extension_server.h + * + *------------------------------------------------------------------------- + */ + +#ifndef EXTENSION_SERVER_H +#define EXTENSION_SERVER_H + +void pg_init_extension_server(void); + +#endif /* EXTENSION_SERVER_H */ diff --git a/pgxn/neon/neon--1.1--1.0.sql b/pgxn/neon/neon--1.1--1.0.sql new file mode 100644 index 0000000000..e83e3104e8 --- /dev/null +++ b/pgxn/neon/neon--1.1--1.0.sql @@ -0,0 +1,6 @@ +-- the order of operations is important here +-- because the view depends on the function + +DROP VIEW IF EXISTS neon_lfc_stats CASCADE; + +DROP FUNCTION IF EXISTS neon_get_lfc_stats CASCADE; diff --git a/pgxn/neon/neon--1.2--1.1.sql b/pgxn/neon/neon--1.2--1.1.sql new file mode 100644 index 0000000000..c9f6a40f73 --- /dev/null +++ b/pgxn/neon/neon--1.2--1.1.sql @@ -0,0 +1 @@ +DROP VIEW IF EXISTS NEON_STAT_FILE_CACHE CASCADE; diff --git a/pgxn/neon/neon--1.3--1.2.sql b/pgxn/neon/neon--1.3--1.2.sql new file mode 100644 index 0000000000..2733a15c75 --- /dev/null +++ b/pgxn/neon/neon--1.3--1.2.sql @@ -0,0 +1 @@ +DROP FUNCTION IF EXISTS approximate_working_set_size(bool) CASCADE; diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index a14288b33a..1f456d9a3f 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -29,6 +29,7 @@ #include "utils/guc.h" #include "utils/wait_event.h" +#include "extension_server.h" #include "neon.h" #include "walproposer.h" #include "pagestore_client.h" diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index c3afecc679..a0f8c97497 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -25,12 +25,11 @@ extern int wal_acceptor_connection_timeout; extern void pg_init_libpagestore(void); extern void pg_init_walproposer(void); -extern void pg_init_extension_server(void); - extern uint64 BackpressureThrottlingTime(void); extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); extern void PGDLLEXPORT WalProposerSync(int argc, char *argv[]); extern void PGDLLEXPORT WalProposerMain(Datum main_arg); +PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg); #endif /* NEON_H */ diff --git a/pgxn/neon/neon_utils.c b/pgxn/neon/neon_utils.c index ce554c89df..1fb4ed9522 100644 --- a/pgxn/neon/neon_utils.c +++ b/pgxn/neon/neon_utils.c @@ -6,6 +6,7 @@ #include "postgres.h" +#include "neon_utils.h" #include "lib/stringinfo.h" #include "libpq/pqformat.h" @@ -14,7 +15,7 @@ * * Returns -1 if the character is not a hexadecimal digit. */ -int +static int HexDecodeChar(char c) { if (c >= '0' && c <= '9') diff --git a/pgxn/neon/neon_utils.h b/pgxn/neon/neon_utils.h index 10d41db102..89683714f1 100644 --- a/pgxn/neon/neon_utils.h +++ b/pgxn/neon/neon_utils.h @@ -12,7 +12,7 @@ uint32 pq_getmsgint32_le(StringInfo msg); uint64 pq_getmsgint64_le(StringInfo msg); void pq_sendint32_le(StringInfo buf, uint32 i); void pq_sendint64_le(StringInfo buf, uint64 i); -extern void disable_core_dump(); +void disable_core_dump(void); #ifndef WALPROPOSER_LIB diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 0d5007ef73..10487636ae 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -1460,7 +1460,7 @@ RecvAppendResponses(Safekeeper *sk) } /* Parse a PageserverFeedback message, or the PageserverFeedback part of an AppendResponse */ -void +static void ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, PageserverFeedback *rf) { uint8 nkeys; @@ -1590,9 +1590,9 @@ GetAcknowledgedByQuorumWALPosition(WalProposer *wp) Safekeeper * GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn) { - *donor_lsn = InvalidXLogRecPtr; Safekeeper *donor = NULL; int i; + *donor_lsn = InvalidXLogRecPtr; if (wp->n_votes < wp->quorum) { diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 61a2a54809..7f07913fa6 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -398,7 +398,7 @@ walprop_pg_get_shmem_state(WalProposer *wp) return walprop_shared; } -void +static void replication_feedback_set(PageserverFeedback *rf) { SpinLockAcquire(&walprop_shared->mutex); diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 0777d361d2..d8112c8bf0 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -68,7 +68,6 @@ task-local-extensions.workspace = true thiserror.workspace = true tikv-jemallocator.workspace = true tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] } -tls-listener.workspace = true tokio-postgres.workspace = true tokio-rustls.workspace = true tokio-util.workspace = true diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index ec7d891247..7db76f3d9e 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -102,6 +102,8 @@ pub(super) async fn authenticate( ctx.set_user(db_info.user.into()); ctx.set_project(db_info.aux.clone()); + let cold_start_info = db_info.aux.cold_start_info.clone().unwrap_or_default(); + info!(?cold_start_info, "woken up a compute node"); // Backwards compatibility. pg_sni_proxy uses "--" in domain names // while direct connections do not. Once we migrate to pg_sni_proxy diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index d5ab66d6aa..385f7820cb 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -10,6 +10,7 @@ use itertools::Itertools; use proxy::config::TlsServerEndPoint; use proxy::context::RequestMonitoring; use proxy::proxy::run_until_cancelled; +use rustls::pki_types::PrivateKeyDer; use tokio::net::TcpListener; use anyhow::{anyhow, bail, ensure, Context}; @@ -76,37 +77,40 @@ async fn main() -> anyhow::Result<()> { (Some(key_path), Some(cert_path)) => { let key = { let key_bytes = std::fs::read(key_path).context("TLS key file")?; - let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]) - .context(format!("Failed to read TLS keys at '{key_path}'"))?; + + let mut keys = + rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]).collect_vec(); ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); - keys.pop().map(rustls::PrivateKey).unwrap() + PrivateKeyDer::Pkcs8( + keys.pop() + .unwrap() + .context(format!("Failed to read TLS keys at '{key_path}'"))?, + ) }; let cert_chain_bytes = std::fs::read(cert_path) .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; - let cert_chain = { + let cert_chain: Vec<_> = { rustls_pemfile::certs(&mut &cert_chain_bytes[..]) - .context(format!( - "Failed to read TLS certificate chain from bytes from file at '{cert_path}'." - ))? - .into_iter() - .map(rustls::Certificate) - .collect_vec() + .try_collect() + .with_context(|| { + format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.") + })? }; // needed for channel bindings let first_cert = cert_chain.first().context("missing certificate")?; let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; - let tls_config = rustls::ServerConfig::builder() - .with_safe_default_cipher_suites() - .with_safe_default_kx_groups() - .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])? - .with_no_client_auth() - .with_single_cert(cert_chain, key)? - .into(); + let tls_config = rustls::ServerConfig::builder_with_protocol_versions(&[ + &rustls::version::TLS13, + &rustls::version::TLS12, + ]) + .with_no_client_auth() + .with_single_cert(cert_chain, key)? + .into(); (tls_config, tls_server_end_point) } diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 9f276c3c24..437ec9f401 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,6 +1,10 @@ use crate::{auth, rate_limiter::RateBucketInfo, serverless::GlobalConnPoolOptions}; use anyhow::{bail, ensure, Context, Ok}; -use rustls::{sign, Certificate, PrivateKey}; +use itertools::Itertools; +use rustls::{ + crypto::ring::sign, + pki_types::{CertificateDer, PrivateKeyDer}, +}; use sha2::{Digest, Sha256}; use std::{ collections::{HashMap, HashSet}, @@ -88,14 +92,14 @@ pub fn configure_tls( let cert_resolver = Arc::new(cert_resolver); - let config = rustls::ServerConfig::builder() - .with_safe_default_cipher_suites() - .with_safe_default_kx_groups() - // allow TLS 1.2 to be compatible with older client libraries - .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])? - .with_no_client_auth() - .with_cert_resolver(cert_resolver.clone()) - .into(); + // allow TLS 1.2 to be compatible with older client libraries + let config = rustls::ServerConfig::builder_with_protocol_versions(&[ + &rustls::version::TLS13, + &rustls::version::TLS12, + ]) + .with_no_client_auth() + .with_cert_resolver(cert_resolver.clone()) + .into(); Ok(TlsConfig { config, @@ -133,14 +137,14 @@ pub enum TlsServerEndPoint { } impl TlsServerEndPoint { - pub fn new(cert: &Certificate) -> anyhow::Result { + pub fn new(cert: &CertificateDer) -> anyhow::Result { let sha256_oids = [ // I'm explicitly not adding MD5 or SHA1 here... They're bad. oid_registry::OID_SIG_ECDSA_WITH_SHA256, oid_registry::OID_PKCS1_SHA256WITHRSA, ]; - let pem = x509_parser::parse_x509_certificate(&cert.0) + let pem = x509_parser::parse_x509_certificate(cert) .context("Failed to parse PEM object from cerficiate")? .1; @@ -150,8 +154,7 @@ impl TlsServerEndPoint { let oid = pem.signature_algorithm.oid(); let alg = reg.get(oid); if sha256_oids.contains(oid) { - let tls_server_end_point: [u8; 32] = - Sha256::new().chain_update(&cert.0).finalize().into(); + let tls_server_end_point: [u8; 32] = Sha256::new().chain_update(cert).finalize().into(); info!(subject = %pem.subject, signature_algorithm = alg.map(|a| a.description()), tls_server_end_point = %base64::encode(tls_server_end_point), "determined channel binding"); Ok(Self::Sha256(tls_server_end_point)) } else { @@ -165,7 +168,7 @@ impl TlsServerEndPoint { } } -#[derive(Default)] +#[derive(Default, Debug)] pub struct CertResolver { certs: HashMap, TlsServerEndPoint)>, default: Option<(Arc, TlsServerEndPoint)>, @@ -185,11 +188,14 @@ impl CertResolver { let priv_key = { let key_bytes = std::fs::read(key_path) .context(format!("Failed to read TLS keys at '{key_path}'"))?; - let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]) - .context(format!("Failed to parse TLS keys at '{key_path}'"))?; + let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]).collect_vec(); ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); - keys.pop().map(rustls::PrivateKey).unwrap() + PrivateKeyDer::Pkcs8( + keys.pop() + .unwrap() + .context(format!("Failed to parse TLS keys at '{key_path}'"))?, + ) }; let cert_chain_bytes = std::fs::read(cert_path) @@ -197,14 +203,10 @@ impl CertResolver { let cert_chain = { rustls_pemfile::certs(&mut &cert_chain_bytes[..]) + .try_collect() .with_context(|| { - format!( - "Failed to read TLS certificate chain from bytes from file at '{cert_path}'." - ) + format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.") })? - .into_iter() - .map(rustls::Certificate) - .collect() }; self.add_cert(priv_key, cert_chain, is_default) @@ -212,15 +214,15 @@ impl CertResolver { pub fn add_cert( &mut self, - priv_key: PrivateKey, - cert_chain: Vec, + priv_key: PrivateKeyDer<'static>, + cert_chain: Vec>, is_default: bool, ) -> anyhow::Result<()> { let key = sign::any_supported_type(&priv_key).context("invalid private key")?; let first_cert = &cert_chain[0]; let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; - let pem = x509_parser::parse_x509_certificate(&first_cert.0) + let pem = x509_parser::parse_x509_certificate(first_cert) .context("Failed to parse PEM object from cerficiate")? .1; diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index 1f94059f1e..102076f2c6 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -1,4 +1,4 @@ -use serde::Deserialize; +use serde::{Deserialize, Serialize}; use std::fmt; use crate::auth::IpPattern; @@ -98,7 +98,17 @@ pub struct MetricsAuxInfo { pub endpoint_id: EndpointId, pub project_id: ProjectId, pub branch_id: BranchId, - pub is_cold_start: Option, + pub cold_start_info: Option, +} + +#[derive(Debug, Default, Serialize, Deserialize, Clone)] +#[serde(rename_all = "snake_case")] +pub enum ColdStartInfo { + #[default] + Unknown = 0, + Warm = 1, + PoolHit = 2, + PoolMiss = 3, } #[cfg(test)] @@ -111,6 +121,7 @@ mod tests { "endpoint_id": "endpoint", "project_id": "project", "branch_id": "branch", + "cold_start_info": "unknown", }) } diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 71b34cb676..f3befa33e0 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -259,6 +259,9 @@ impl super::Api for Api { } let node = self.do_wake_compute(ctx, user_info).await?; + ctx.set_project(node.aux.clone()); + let cold_start_info = node.aux.cold_start_info.clone().unwrap_or_default(); + info!(?cold_start_info, "woken up a compute node"); let (_, cached) = self.caches.node_info.insert(key.clone(), node); info!(key = &*key, "created a cache entry for compute node info"); diff --git a/proxy/src/context.rs b/proxy/src/context.rs index abad8a6412..1b48e01358 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -9,7 +9,7 @@ use tracing::{field::display, info_span, Span}; use uuid::Uuid; use crate::{ - console::messages::MetricsAuxInfo, + console::messages::{ColdStartInfo, MetricsAuxInfo}, error::ErrorKind, metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND}, BranchId, DbName, EndpointId, ProjectId, RoleName, @@ -42,7 +42,7 @@ pub struct RequestMonitoring { error_kind: Option, pub(crate) auth_method: Option, success: bool, - is_cold_start: Option, + cold_start_info: Option, // extra // This sender is here to keep the request monitoring channel open while requests are taking place. @@ -91,7 +91,7 @@ impl RequestMonitoring { error_kind: None, auth_method: None, success: false, - is_cold_start: None, + cold_start_info: None, sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()), latency_timer: LatencyTimer::new(protocol), @@ -115,7 +115,7 @@ impl RequestMonitoring { self.set_endpoint_id(x.endpoint_id); self.branch = Some(x.branch_id); self.project = Some(x.project_id); - self.is_cold_start = x.is_cold_start; + self.cold_start_info = x.cold_start_info; } pub fn set_project_id(&mut self, project_id: ProjectId) { diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 54f51604bf..1b1274b196 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -93,7 +93,7 @@ struct RequestData { /// Or if we make it to proxy_pass success: bool, /// Indicates if the cplane started the new compute node for this request. - is_cold_start: Option, + cold_start_info: Option, /// Tracks time from session start (HTTP request/libpq TCP handshake) /// Through to success/failure duration_us: u64, @@ -121,7 +121,10 @@ impl From for RequestData { region: value.region, error: value.error_kind.as_ref().map(|e| e.to_metric_label()), success: value.success, - is_cold_start: value.is_cold_start, + cold_start_info: value + .cold_start_info + .as_ref() + .map(|x| serde_json::to_string(x).unwrap_or_default()), duration_us: SystemTime::from(value.first_packet) .elapsed() .unwrap_or_default() @@ -455,7 +458,7 @@ mod tests { region: "us-east-1", error: None, success: rng.gen(), - is_cold_start: Some(true), + cold_start_info: Some("no".into()), duration_us: rng.gen_range(0..30_000_000), } } @@ -525,16 +528,16 @@ mod tests { assert_eq!( file_stats, [ - (1315032, 3, 6000), - (1315025, 3, 6000), - (1315085, 3, 6000), - (1315042, 3, 6000), - (1315172, 3, 6000), - (1315014, 3, 6000), - (1314806, 3, 6000), - (1315042, 3, 6000), - (438563, 1, 2000) - ], + (1314406, 3, 6000), + (1314399, 3, 6000), + (1314459, 3, 6000), + (1314416, 3, 6000), + (1314546, 3, 6000), + (1314388, 3, 6000), + (1314180, 3, 6000), + (1314416, 3, 6000), + (438359, 1, 2000) + ] ); tmpdir.close().unwrap(); @@ -563,12 +566,12 @@ mod tests { assert_eq!( file_stats, [ - (1220433, 5, 10000), - (1226583, 5, 10000), - (1228377, 5, 10000), - (1227739, 5, 10000), - (1219017, 5, 10000) - ], + (1220668, 5, 10000), + (1226818, 5, 10000), + (1228612, 5, 10000), + (1227974, 5, 10000), + (1219252, 5, 10000) + ] ); tmpdir.close().unwrap(); @@ -599,12 +602,12 @@ mod tests { assert_eq!( file_stats, [ - (1206080, 5, 10000), - (1205811, 5, 10000), - (1206104, 5, 10000), - (1206092, 5, 10000), - (1206347, 5, 10000) - ], + (1206315, 5, 10000), + (1206046, 5, 10000), + (1206339, 5, 10000), + (1206327, 5, 10000), + (1206582, 5, 10000) + ] ); tmpdir.close().unwrap(); @@ -628,16 +631,16 @@ mod tests { assert_eq!( file_stats, [ - (1315032, 3, 6000), - (1315025, 3, 6000), - (1315085, 3, 6000), - (1315042, 3, 6000), - (1315172, 3, 6000), - (1315014, 3, 6000), - (1314806, 3, 6000), - (1315042, 3, 6000), - (438563, 1, 2000) - ], + (1314406, 3, 6000), + (1314399, 3, 6000), + (1314459, 3, 6000), + (1314416, 3, 6000), + (1314546, 3, 6000), + (1314388, 3, 6000), + (1314180, 3, 6000), + (1314416, 3, 6000), + (438359, 1, 2000) + ] ); tmpdir.close().unwrap(); @@ -673,7 +676,7 @@ mod tests { // files are smaller than the size threshold, but they took too long to fill so were flushed early assert_eq!( file_stats, - [(659129, 2, 3001), (658842, 2, 3000), (658638, 2, 2999)], + [(658837, 2, 3001), (658551, 2, 3000), (658347, 2, 2999)] ); tmpdir.close().unwrap(); diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 66031f5eb2..0477176c45 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -4,7 +4,7 @@ use ::metrics::{ register_int_gauge_vec, Histogram, HistogramVec, HyperLogLogVec, IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, }; -use metrics::{register_int_counter_pair, IntCounterPair}; +use metrics::{register_int_counter, register_int_counter_pair, IntCounter, IntCounterPair}; use once_cell::sync::Lazy; use tokio::time; @@ -303,3 +303,20 @@ pub static ENDPOINT_ERRORS_BY_KIND: Lazy> = Lazy::new(|| { ) .unwrap() }); + +pub static REDIS_BROKEN_MESSAGES: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "proxy_redis_errors_total", + "Number of errors by a given classification", + &["channel"], + ) + .unwrap() +}); + +pub static TLS_HANDSHAKE_FAILURES: Lazy = Lazy::new(|| { + register_int_counter!( + "proxy_tls_handshake_failures", + "Number of TLS handshake failures", + ) + .unwrap() +}); diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 1d8931be85..3a7aabca32 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -1,22 +1,27 @@ //! Proxy Protocol V2 implementation use std::{ - future::poll_fn, - future::Future, + future::{poll_fn, Future}, io, net::SocketAddr, pin::{pin, Pin}, + sync::Mutex, task::{ready, Context, Poll}, }; use bytes::{Buf, BytesMut}; +use hyper::server::accept::Accept; use hyper::server::conn::{AddrIncoming, AddrStream}; +use metrics::IntCounterPairGuard; use pin_project_lite::pin_project; -use tls_listener::AsyncAccept; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf}; +use uuid::Uuid; + +use crate::{metrics::NUM_CLIENT_CONNECTION_GAUGE, serverless::tls_listener::AsyncAccept}; pub struct ProxyProtocolAccept { pub incoming: AddrIncoming, + pub protocol: &'static str, } pin_project! { @@ -327,7 +332,7 @@ impl AsyncRead for WithClientIp { } impl AsyncAccept for ProxyProtocolAccept { - type Connection = WithClientIp; + type Connection = WithConnectionGuard>; type Error = io::Error; @@ -336,11 +341,74 @@ impl AsyncAccept for ProxyProtocolAccept { cx: &mut Context<'_>, ) -> Poll>> { let conn = ready!(Pin::new(&mut self.incoming).poll_accept(cx)?); + tracing::info!(protocol = self.protocol, "accepted new TCP connection"); let Some(conn) = conn else { return Poll::Ready(None); }; - Poll::Ready(Some(Ok(WithClientIp::new(conn)))) + Poll::Ready(Some(Ok(WithConnectionGuard { + inner: WithClientIp::new(conn), + connection_id: Uuid::new_v4(), + gauge: Mutex::new(Some( + NUM_CLIENT_CONNECTION_GAUGE + .with_label_values(&[self.protocol]) + .guard(), + )), + }))) + } +} + +pin_project! { + pub struct WithConnectionGuard { + #[pin] + pub inner: T, + pub connection_id: Uuid, + pub gauge: Mutex>, + } +} + +impl AsyncWrite for WithConnectionGuard { + #[inline] + fn poll_write( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &[u8], + ) -> Poll> { + self.project().inner.poll_write(cx, buf) + } + + #[inline] + fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + self.project().inner.poll_flush(cx) + } + + #[inline] + fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + self.project().inner.poll_shutdown(cx) + } + + #[inline] + fn poll_write_vectored( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + bufs: &[io::IoSlice<'_>], + ) -> Poll> { + self.project().inner.poll_write_vectored(cx, bufs) + } + + #[inline] + fn is_write_vectored(&self) -> bool { + self.inner.is_write_vectored() + } +} + +impl AsyncRead for WithConnectionGuard { + fn poll_read( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &mut ReadBuf<'_>, + ) -> Poll> { + self.project().inner.poll_read(cx, buf) } } diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index d94fc67491..7848fc2ac2 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -24,6 +24,7 @@ use crate::{ }; use futures::TryFutureExt; use itertools::Itertools; +use metrics::IntCounterPairGuard; use once_cell::sync::OnceCell; use pq_proto::{BeMessage as Be, StartupMessageParams}; use regex::Regex; @@ -78,10 +79,16 @@ pub async fn task_main( { let (socket, peer_addr) = accept_result?; + let conn_gauge = NUM_CLIENT_CONNECTION_GAUGE + .with_label_values(&["tcp"]) + .guard(); + let session_id = uuid::Uuid::new_v4(); let cancellation_handler = Arc::clone(&cancellation_handler); let endpoint_rate_limiter = endpoint_rate_limiter.clone(); + tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection"); + connections.spawn(async move { let mut socket = WithClientIp::new(socket); let mut peer_addr = peer_addr.ip(); @@ -116,6 +123,7 @@ pub async fn task_main( socket, ClientMode::Tcp, endpoint_rate_limiter, + conn_gauge, ) .instrument(span.clone()) .await; @@ -229,13 +237,11 @@ pub async fn handle_client( stream: S, mode: ClientMode, endpoint_rate_limiter: Arc, + conn_gauge: IntCounterPairGuard, ) -> Result>, ClientRequestError> { info!("handling interactive connection from client"); let proto = ctx.protocol; - let _client_gauge = NUM_CLIENT_CONNECTION_GAUGE - .with_label_values(&[proto]) - .guard(); let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE .with_label_values(&[proto]) .guard(); @@ -325,7 +331,7 @@ pub async fn handle_client( aux: node.aux.clone(), compute: node, req: _request_gauge, - conn: _client_gauge, + conn: conn_gauge, cancel: session, })) } @@ -374,6 +380,11 @@ impl NeonOptions { Self::parse_from_iter(StartupMessageParams::parse_options_raw(options)) } + pub fn is_ephemeral(&self) -> bool { + // Currently, neon endpoint options are all reserved for ephemeral endpoints. + !self.0.is_empty() + } + fn parse_from_iter<'a>(options: impl Iterator) -> Self { let mut options = options .filter_map(neon_option) diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index d866b1820f..5d0340e852 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -20,6 +20,7 @@ use crate::{http, sasl, scram}; use anyhow::{bail, Context}; use async_trait::async_trait; use rstest::rstest; +use rustls::pki_types; use tokio_postgres::config::SslMode; use tokio_postgres::tls::{MakeTlsConnect, NoTls}; use tokio_postgres_rustls::{MakeRustlsConnect, RustlsStream}; @@ -28,7 +29,11 @@ use tokio_postgres_rustls::{MakeRustlsConnect, RustlsStream}; fn generate_certs( hostname: &str, common_name: &str, -) -> anyhow::Result<(rustls::Certificate, rustls::Certificate, rustls::PrivateKey)> { +) -> anyhow::Result<( + pki_types::CertificateDer<'static>, + pki_types::CertificateDer<'static>, + pki_types::PrivateKeyDer<'static>, +)> { let ca = rcgen::Certificate::from_params({ let mut params = rcgen::CertificateParams::default(); params.is_ca = rcgen::IsCa::Ca(rcgen::BasicConstraints::Unconstrained); @@ -45,9 +50,9 @@ fn generate_certs( })?; Ok(( - rustls::Certificate(ca.serialize_der()?), - rustls::Certificate(cert.serialize_der_with_signer(&ca)?), - rustls::PrivateKey(cert.serialize_private_key_der()), + pki_types::CertificateDer::from(ca.serialize_der()?), + pki_types::CertificateDer::from(cert.serialize_der_with_signer(&ca)?), + pki_types::PrivateKeyDer::Pkcs8(cert.serialize_private_key_der().into()), )) } @@ -82,9 +87,8 @@ fn generate_tls_config<'a>( let tls_config = { let config = rustls::ServerConfig::builder() - .with_safe_defaults() .with_no_client_auth() - .with_single_cert(vec![cert.clone()], key.clone())? + .with_single_cert(vec![cert.clone()], key.clone_key())? .into(); let mut cert_resolver = CertResolver::new(); @@ -101,10 +105,9 @@ fn generate_tls_config<'a>( let client_config = { let config = rustls::ClientConfig::builder() - .with_safe_defaults() .with_root_certificates({ let mut store = rustls::RootCertStore::empty(); - store.add(&ca)?; + store.add(ca)?; store }) .with_no_client_auth(); diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index b8297a206c..6ae848c0d2 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -10,6 +10,7 @@ use crate::{ cache::project_info::ProjectInfoCache, cancellation::{CancelMap, CancellationHandler, NotificationsCancellationHandler}, intern::{ProjectIdInt, RoleNameInt}, + metrics::REDIS_BROKEN_MESSAGES, }; const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; @@ -115,6 +116,9 @@ impl< let msg: Notification = match serde_json::from_str(&payload) { Ok(msg) => msg, Err(e) => { + REDIS_BROKEN_MESSAGES + .with_label_values(&[msg.get_channel_name()]) + .inc(); tracing::error!("broken message: {e}"); return Ok(()); } diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index b5806aec53..c81ae03b23 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -6,6 +6,7 @@ mod backend; mod conn_pool; mod json; mod sql_over_http; +pub mod tls_listener; mod websocket; pub use conn_pool::GlobalConnPoolOptions; @@ -20,8 +21,8 @@ pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; use tokio_util::task::TaskTracker; use crate::context::RequestMonitoring; -use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE; -use crate::protocol2::{ProxyProtocolAccept, WithClientIp}; +use crate::metrics::TLS_HANDSHAKE_FAILURES; +use crate::protocol2::{ProxyProtocolAccept, WithClientIp, WithConnectionGuard}; use crate::rate_limiter::EndpointRateLimiter; use crate::serverless::backend::PoolingBackend; use crate::{cancellation::CancellationHandler, config::ProxyConfig}; @@ -98,6 +99,7 @@ pub async fn task_main( let _ = addr_incoming.set_nodelay(true); let addr_incoming = ProxyProtocolAccept { incoming: addr_incoming, + protocol: "http", }; let ws_connections = tokio_util::task::task_tracker::TaskTracker::new(); @@ -105,18 +107,34 @@ pub async fn task_main( let tls_listener = TlsListener::new(tls_acceptor, addr_incoming).filter(|conn| { if let Err(err) = conn { - error!("failed to accept TLS connection for websockets: {err:?}"); + error!( + protocol = "http", + "failed to accept TLS connection: {err:?}" + ); + TLS_HANDSHAKE_FAILURES.inc(); ready(false) } else { + info!(protocol = "http", "accepted new TLS connection"); ready(true) } }); let make_svc = hyper::service::make_service_fn( - |stream: &tokio_rustls::server::TlsStream>| { - let (io, _) = stream.get_ref(); - let client_addr = io.client_addr(); - let remote_addr = io.inner.remote_addr(); + |stream: &tokio_rustls::server::TlsStream< + WithConnectionGuard>, + >| { + let (conn, _) = stream.get_ref(); + + // this is jank. should dissapear with hyper 1.0 migration. + let gauge = conn + .gauge + .lock() + .expect("lock should not be poisoned") + .take() + .expect("gauge should be set on connection start"); + + let client_addr = conn.inner.client_addr(); + let remote_addr = conn.inner.inner.remote_addr(); let backend = backend.clone(); let ws_connections = ws_connections.clone(); let endpoint_rate_limiter = endpoint_rate_limiter.clone(); @@ -127,8 +145,8 @@ pub async fn task_main( None if config.require_client_ip => bail!("missing required client ip"), None => remote_addr, }; - Ok(MetricService::new(hyper::service::service_fn( - move |req: Request| { + Ok(MetricService::new( + hyper::service::service_fn(move |req: Request| { let backend = backend.clone(); let ws_connections = ws_connections.clone(); let endpoint_rate_limiter = endpoint_rate_limiter.clone(); @@ -149,8 +167,9 @@ pub async fn task_main( .map_or_else(|e| e.into_response(), |r| r), ) } - }, - ))) + }), + gauge, + )) } }, ); @@ -172,13 +191,8 @@ struct MetricService { } impl MetricService { - fn new(inner: S) -> MetricService { - MetricService { - inner, - _gauge: NUM_CLIENT_CONNECTION_GAUGE - .with_label_values(&["http"]) - .guard(), - } + fn new(inner: S, _gauge: IntCounterPairGuard) -> MetricService { + MetricService { inner, _gauge } } } diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 7d705ba049..73f213d074 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -43,8 +43,13 @@ impl ConnInfo { (self.dbname.clone(), self.user_info.user.clone()) } - pub fn endpoint_cache_key(&self) -> EndpointCacheKey { - self.user_info.endpoint_cache_key() + pub fn endpoint_cache_key(&self) -> Option { + // We don't want to cache http connections for ephemeral endpoints. + if self.user_info.options.is_ephemeral() { + None + } else { + Some(self.user_info.endpoint_cache_key()) + } } } @@ -360,8 +365,11 @@ impl GlobalConnPool { conn_info: &ConnInfo, ) -> Result>, HttpConnError> { let mut client: Option> = None; + let Some(endpoint) = conn_info.endpoint_cache_key() else { + return Ok(None); + }; - let endpoint_pool = self.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key()); + let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint); if let Some(entry) = endpoint_pool .write() .get_conn_entry(conn_info.db_and_user()) @@ -455,8 +463,10 @@ pub fn poll_client( span.in_scope(|| { info!(%conn_info, %session_id, "new connection"); }); - let pool = - Arc::downgrade(&global_pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key())); + let pool = match conn_info.endpoint_cache_key() { + Some(endpoint) => Arc::downgrade(&global_pool.get_or_create_endpoint_pool(&endpoint)), + None => Weak::new(), + }; let pool_clone = pool.clone(); let db_user = conn_info.db_and_user(); @@ -723,8 +733,9 @@ mod tests { dbname: "dbname".into(), password: "password".as_bytes().into(), }; - let ep_pool = - Arc::downgrade(&pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key())); + let ep_pool = Arc::downgrade( + &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()), + ); { let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); assert_eq!(0, pool.get_global_connections_count()); @@ -780,8 +791,9 @@ mod tests { dbname: "dbname".into(), password: "password".as_bytes().into(), }; - let ep_pool = - Arc::downgrade(&pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key())); + let ep_pool = Arc::downgrade( + &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()), + ); { let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); client.do_drop().unwrap()(); diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 7f51ba82cc..74af985211 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -1,7 +1,6 @@ use std::sync::Arc; use anyhow::bail; -use futures::pin_mut; use futures::StreamExt; use hyper::body::HttpBody; use hyper::header; @@ -531,13 +530,12 @@ async fn query_to_json( ) -> anyhow::Result<(ReadyForQueryStatus, Value)> { info!("executing query"); let query_params = data.params; - let row_stream = client.query_raw_txt(&data.query, query_params).await?; + let mut row_stream = std::pin::pin!(client.query_raw_txt(&data.query, query_params).await?); info!("finished executing query"); // Manually drain the stream into a vector to leave row_stream hanging // around to get a command tag. Also check that the response is not too // big. - pin_mut!(row_stream); let mut rows: Vec = Vec::new(); while let Some(row) = row_stream.next().await { let row = row?; diff --git a/proxy/src/serverless/tls_listener.rs b/proxy/src/serverless/tls_listener.rs new file mode 100644 index 0000000000..6196ff393c --- /dev/null +++ b/proxy/src/serverless/tls_listener.rs @@ -0,0 +1,283 @@ +use std::{ + pin::Pin, + task::{Context, Poll}, + time::Duration, +}; + +use futures::{Future, Stream, StreamExt}; +use pin_project_lite::pin_project; +use thiserror::Error; +use tokio::{ + io::{AsyncRead, AsyncWrite}, + task::JoinSet, + time::timeout, +}; + +/// Default timeout for the TLS handshake. +pub const DEFAULT_HANDSHAKE_TIMEOUT: Duration = Duration::from_secs(10); + +/// Trait for TLS implementation. +/// +/// Implementations are provided by the rustls and native-tls features. +pub trait AsyncTls: Clone { + /// The type of the TLS stream created from the underlying stream. + type Stream: Send + 'static; + /// Error type for completing the TLS handshake + type Error: std::error::Error + Send + 'static; + /// Type of the Future for the TLS stream that is accepted. + type AcceptFuture: Future> + Send + 'static; + + /// Accept a TLS connection on an underlying stream + fn accept(&self, stream: C) -> Self::AcceptFuture; +} + +/// Asynchronously accept connections. +pub trait AsyncAccept { + /// The type of the connection that is accepted. + type Connection: AsyncRead + AsyncWrite; + /// The type of error that may be returned. + type Error; + + /// Poll to accept the next connection. + fn poll_accept( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll>>; + + /// Return a new `AsyncAccept` that stops accepting connections after + /// `ender` completes. + /// + /// Useful for graceful shutdown. + /// + /// See [examples/echo.rs](https://github.com/tmccombs/tls-listener/blob/main/examples/echo.rs) + /// for example of how to use. + fn until(self, ender: F) -> Until + where + Self: Sized, + { + Until { + acceptor: self, + ender, + } + } +} + +pin_project! { + /// + /// Wraps a `Stream` of connections (such as a TCP listener) so that each connection is itself + /// encrypted using TLS. + /// + /// It is similar to: + /// + /// ```ignore + /// tcpListener.and_then(|s| tlsAcceptor.accept(s)) + /// ``` + /// + /// except that it has the ability to accept multiple transport-level connections + /// simultaneously while the TLS handshake is pending for other connections. + /// + /// By default, if a client fails the TLS handshake, that is treated as an error, and the + /// `TlsListener` will return an `Err`. If the `TlsListener` is passed directly to a hyper + /// [`Server`][1], then an invalid handshake can cause the server to stop accepting connections. + /// See [`http-stream.rs`][2] or [`http-low-level`][3] examples, for examples of how to avoid this. + /// + /// Note that if the maximum number of pending connections is greater than 1, the resulting + /// [`T::Stream`][4] connections may come in a different order than the connections produced by the + /// underlying listener. + /// + /// [1]: https://docs.rs/hyper/latest/hyper/server/struct.Server.html + /// [2]: https://github.com/tmccombs/tls-listener/blob/main/examples/http-stream.rs + /// [3]: https://github.com/tmccombs/tls-listener/blob/main/examples/http-low-level.rs + /// [4]: AsyncTls::Stream + /// + #[allow(clippy::type_complexity)] + pub struct TlsListener> { + #[pin] + listener: A, + tls: T, + waiting: JoinSet, tokio::time::error::Elapsed>>, + timeout: Duration, + } +} + +/// Builder for `TlsListener`. +#[derive(Clone)] +pub struct Builder { + tls: T, + handshake_timeout: Duration, +} + +/// Wraps errors from either the listener or the TLS Acceptor +#[derive(Debug, Error)] +pub enum Error { + /// An error that arose from the listener ([AsyncAccept::Error]) + #[error("{0}")] + ListenerError(#[source] LE), + /// An error that occurred during the TLS accept handshake + #[error("{0}")] + TlsAcceptError(#[source] TE), +} + +impl TlsListener +where + T: AsyncTls, +{ + /// Create a `TlsListener` with default options. + pub fn new(tls: T, listener: A) -> Self { + builder(tls).listen(listener) + } +} + +impl TlsListener +where + A: AsyncAccept, + A::Error: std::error::Error, + T: AsyncTls, +{ + /// Accept the next connection + /// + /// This is essentially an alias to `self.next()` with a more domain-appropriate name. + pub async fn accept(&mut self) -> Option<::Item> + where + Self: Unpin, + { + self.next().await + } + + /// Replaces the Tls Acceptor configuration, which will be used for new connections. + /// + /// This can be used to change the certificate used at runtime. + pub fn replace_acceptor(&mut self, acceptor: T) { + self.tls = acceptor; + } + + /// Replaces the Tls Acceptor configuration from a pinned reference to `Self`. + /// + /// This is useful if your listener is `!Unpin`. + /// + /// This can be used to change the certificate used at runtime. + pub fn replace_acceptor_pin(self: Pin<&mut Self>, acceptor: T) { + *self.project().tls = acceptor; + } +} + +impl Stream for TlsListener +where + A: AsyncAccept, + A::Error: std::error::Error, + T: AsyncTls, +{ + type Item = Result>; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let mut this = self.project(); + + loop { + match this.listener.as_mut().poll_accept(cx) { + Poll::Pending => break, + Poll::Ready(Some(Ok(conn))) => { + this.waiting + .spawn(timeout(*this.timeout, this.tls.accept(conn))); + } + Poll::Ready(Some(Err(e))) => { + return Poll::Ready(Some(Err(Error::ListenerError(e)))); + } + Poll::Ready(None) => return Poll::Ready(None), + } + } + + loop { + return match this.waiting.poll_join_next(cx) { + Poll::Ready(Some(Ok(Ok(conn)))) => { + Poll::Ready(Some(conn.map_err(Error::TlsAcceptError))) + } + // The handshake timed out, try getting another connection from the queue + Poll::Ready(Some(Ok(Err(_)))) => continue, + // The handshake panicked + Poll::Ready(Some(Err(e))) if e.is_panic() => { + std::panic::resume_unwind(e.into_panic()) + } + // The handshake was externally aborted + Poll::Ready(Some(Err(_))) => unreachable!("handshake tasks are never aborted"), + _ => Poll::Pending, + }; + } + } +} + +impl AsyncTls for tokio_rustls::TlsAcceptor { + type Stream = tokio_rustls::server::TlsStream; + type Error = std::io::Error; + type AcceptFuture = tokio_rustls::Accept; + + fn accept(&self, conn: C) -> Self::AcceptFuture { + tokio_rustls::TlsAcceptor::accept(self, conn) + } +} + +impl Builder { + /// Set the timeout for handshakes. + /// + /// If a timeout takes longer than `timeout`, then the handshake will be + /// aborted and the underlying connection will be dropped. + /// + /// Defaults to `DEFAULT_HANDSHAKE_TIMEOUT`. + pub fn handshake_timeout(&mut self, timeout: Duration) -> &mut Self { + self.handshake_timeout = timeout; + self + } + + /// Create a `TlsListener` from the builder + /// + /// Actually build the `TlsListener`. The `listener` argument should be + /// an implementation of the `AsyncAccept` trait that accepts new connections + /// that the `TlsListener` will encrypt using TLS. + pub fn listen(&self, listener: A) -> TlsListener + where + T: AsyncTls, + { + TlsListener { + listener, + tls: self.tls.clone(), + waiting: JoinSet::new(), + timeout: self.handshake_timeout, + } + } +} + +/// Create a new Builder for a TlsListener +/// +/// `server_config` will be used to configure the TLS sessions. +pub fn builder(tls: T) -> Builder { + Builder { + tls, + handshake_timeout: DEFAULT_HANDSHAKE_TIMEOUT, + } +} + +pin_project! { + /// See [`AsyncAccept::until`] + pub struct Until { + #[pin] + acceptor: A, + #[pin] + ender: E, + } +} + +impl AsyncAccept for Until { + type Connection = A::Connection; + type Error = A::Error; + + fn poll_accept( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll>> { + let this = self.project(); + + match this.ender.poll(cx) { + Poll::Pending => this.acceptor.poll_accept(cx), + Poll::Ready(_) => Poll::Ready(None), + } + } +} diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 24f2bb7e8c..a72ede6d0a 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -3,6 +3,7 @@ use crate::{ config::ProxyConfig, context::RequestMonitoring, error::{io_error, ReportableError}, + metrics::NUM_CLIENT_CONNECTION_GAUGE, proxy::{handle_client, ClientMode}, rate_limiter::EndpointRateLimiter, }; @@ -138,6 +139,10 @@ pub async fn serve_websocket( endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { let websocket = websocket.await?; + let conn_gauge = NUM_CLIENT_CONNECTION_GAUGE + .with_label_values(&["ws"]) + .guard(); + let res = handle_client( config, &mut ctx, @@ -145,6 +150,7 @@ pub async fn serve_websocket( WebSocketRw::new(websocket), ClientMode::Websockets { hostname }, endpoint_rate_limiter, + conn_gauge, ) .await; diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index 0d639d2c07..b6b7a85659 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -1,5 +1,6 @@ use crate::config::TlsServerEndPoint; use crate::error::{ErrorKind, ReportableError, UserFacingError}; +use crate::metrics::TLS_HANDSHAKE_FAILURES; use bytes::BytesMut; use pq_proto::framed::{ConnectionError, Framed}; @@ -224,7 +225,10 @@ impl Stream { /// If possible, upgrade raw stream into a secure TLS-based stream. pub async fn upgrade(self, cfg: Arc) -> Result, StreamUpgradeError> { match self { - Stream::Raw { raw } => Ok(tokio_rustls::TlsAcceptor::from(cfg).accept(raw).await?), + Stream::Raw { raw } => Ok(tokio_rustls::TlsAcceptor::from(cfg) + .accept(raw) + .await + .inspect_err(|_| TLS_HANDSHAKE_FAILURES.inc())?), Stream::Tls { .. } => Err(StreamUpgradeError::AlreadyTls), } } diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs index 7b9f96dce3..7c0f699958 100644 --- a/s3_scrubber/src/checks.rs +++ b/s3_scrubber/src/checks.rs @@ -11,7 +11,7 @@ use utils::id::TimelineId; use crate::cloud_admin_api::BranchData; use crate::metadata_stream::stream_listing; use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId}; -use futures_util::{pin_mut, StreamExt}; +use futures_util::StreamExt; use pageserver::tenant::remote_timeline_client::parse_remote_index_path; use pageserver::tenant::storage_layer::LayerFileName; use pageserver::tenant::IndexPart; @@ -285,8 +285,7 @@ pub(crate) async fn list_timeline_blobs( let mut index_parts: Vec = Vec::new(); let mut initdb_archive: bool = false; - let stream = stream_listing(s3_client, &timeline_dir_target); - pin_mut!(stream); + let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target)); while let Some(obj) = stream.next().await { let obj = obj?; let key = obj.key(); diff --git a/s3_scrubber/src/garbage.rs b/s3_scrubber/src/garbage.rs index 93bb115883..7a08dffc66 100644 --- a/s3_scrubber/src/garbage.rs +++ b/s3_scrubber/src/garbage.rs @@ -12,7 +12,7 @@ use aws_sdk_s3::{ types::{Delete, ObjectIdentifier}, Client, }; -use futures_util::{pin_mut, TryStreamExt}; +use futures_util::TryStreamExt; use pageserver_api::shard::TenantShardId; use serde::{Deserialize, Serialize}; use tokio_stream::StreamExt; @@ -199,12 +199,12 @@ async fn find_garbage_inner( } } }); - let tenants_checked = tenants_checked.try_buffer_unordered(CONSOLE_CONCURRENCY); + let mut tenants_checked = + std::pin::pin!(tenants_checked.try_buffer_unordered(CONSOLE_CONCURRENCY)); // Process the results of Tenant checks. If a Tenant is garbage, it goes into // the `GarbageList`. Else it goes into `active_tenants` for more detailed timeline // checks if they are enabled by the `depth` parameter. - pin_mut!(tenants_checked); let mut garbage = GarbageList::new(node_kind, bucket_config); let mut active_tenants: Vec = vec![]; let mut counter = 0; @@ -267,10 +267,10 @@ async fn find_garbage_inner( .map(|r| (ttid, r)) } }); - let timelines_checked = timelines_checked.try_buffer_unordered(CONSOLE_CONCURRENCY); + let mut timelines_checked = + std::pin::pin!(timelines_checked.try_buffer_unordered(CONSOLE_CONCURRENCY)); // Update the GarbageList with any timelines which appear not to exist. - pin_mut!(timelines_checked); while let Some(result) = timelines_checked.next().await { let (ttid, console_result) = result?; if garbage.maybe_append(GarbageEntity::Timeline(ttid), console_result) { @@ -425,9 +425,9 @@ pub async fn purge_garbage( } } }); - let get_objects_results = get_objects_results.try_buffer_unordered(S3_CONCURRENCY); + let mut get_objects_results = + std::pin::pin!(get_objects_results.try_buffer_unordered(S3_CONCURRENCY)); - pin_mut!(get_objects_results); let mut objects_to_delete = Vec::new(); while let Some(result) = get_objects_results.next().await { let mut object_list = result?; diff --git a/s3_scrubber/src/scan_metadata.rs b/s3_scrubber/src/scan_metadata.rs index 4b63bb3884..6ff9783875 100644 --- a/s3_scrubber/src/scan_metadata.rs +++ b/s3_scrubber/src/scan_metadata.rs @@ -7,7 +7,7 @@ use crate::checks::{ use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; use aws_sdk_s3::Client; -use futures_util::{pin_mut, StreamExt, TryStreamExt}; +use futures_util::{StreamExt, TryStreamExt}; use histogram::Histogram; use pageserver::tenant::remote_timeline_client::remote_layer_path; use pageserver::tenant::IndexPart; @@ -226,7 +226,7 @@ pub async fn scan_metadata( Ok((ttid, data)) } let timelines = timelines.map_ok(|ttid| report_on_timeline(&s3_client, &target, ttid)); - let timelines = timelines.try_buffered(CONCURRENCY); + let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); // We must gather all the TenantShardTimelineId->S3TimelineBlobData for each tenant, because different // shards in the same tenant might refer to one anothers' keys if a shard split has happened. @@ -309,7 +309,6 @@ pub async fn scan_metadata( // all results for the same tenant will be adjacent. We accumulate these, // and then call `analyze_tenant` to flush, when we see the next tenant ID. let mut summary = MetadataSummary::new(); - pin_mut!(timelines); while let Some(i) = timelines.next().await { let (ttid, data) = i?; summary.update_data(&data); diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index bceaad1e16..4a97eb3993 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -68,7 +68,7 @@ async fn handle_socket( // is not Unpin, and all pgbackend/framed/tokio dependencies require stream // to be Unpin. Which is reasonable, as indeed something like TimeoutReader // shouldn't be moved. - tokio::pin!(socket); + let socket = std::pin::pin!(socket); let traffic_metrics = TrafficMetrics::new(); if let Some(current_az) = conf.availability_zone.as_deref() { diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index 4a692688e0..60591d8d46 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -252,6 +252,16 @@ class S3Storage: log.info(f"deleted {cnt} objects from remote storage") + def tenant_path(self, tenant_id: TenantId) -> str: + return f"{self.prefix_in_bucket}/tenants/{tenant_id}" + + def heatmap_key(self, tenant_id: TenantId) -> str: + return f"{self.tenant_path(tenant_id)}/{TENANT_HEATMAP_FILE_NAME}" + + def heatmap_content(self, tenant_id: TenantId): + r = self.client.get_object(Bucket=self.bucket_name, Key=self.heatmap_key(tenant_id)) + return json.loads(r["Body"].read().decode("utf-8")) + RemoteStorage = Union[LocalFsStorage, S3Storage] diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py index e2e7fffdbe..921b7c5b76 100644 --- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py +++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py @@ -1,5 +1,6 @@ import asyncio import json +import os from pathlib import Path from typing import Any, Dict, Tuple @@ -19,6 +20,10 @@ from performance.pageserver.util import ( @pytest.mark.parametrize("n_tenants", [10]) @pytest.mark.parametrize("get_vectored_impl", ["sequential", "vectored"]) @pytest.mark.timeout(1000) +@pytest.mark.skipif( + os.getenv("CI", "false") == "true", + reason="The test if flaky on CI: https://github.com/neondatabase/neon/issues/7006", +) def test_basebackup_with_high_slru_count( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index 6edcb8f1f2..9777bf6748 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -1,4 +1,5 @@ import random +import re import statistics import threading import time @@ -7,11 +8,14 @@ from contextlib import closing from typing import List import pytest -from fixtures.benchmark_fixture import MetricReport +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker from fixtures.compare_fixtures import NeonCompare from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonPageserver from fixtures.pageserver.utils import wait_for_last_record_lsn from fixtures.types import Lsn +from fixtures.utils import wait_until +from prometheus_client.samples import Sample def _record_branch_creation_durations(neon_compare: NeonCompare, durs: List[float]): @@ -89,11 +93,17 @@ def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int) _record_branch_creation_durations(neon_compare, branch_creation_durations) -@pytest.mark.parametrize("n_branches", [1024]) -# Test measures the latency of branch creation when creating a lot of branches. -def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int): +@pytest.mark.parametrize("n_branches", [500, 1024]) +@pytest.mark.parametrize("shape", ["one_ancestor", "random"]) +def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: str): + """ + Test measures the latency of branch creation when creating a lot of branches. + """ env = neon_compare.env + # seed the prng so we will measure the same structure every time + rng = random.Random("2024-02-29") + env.neon_cli.create_branch("b0") endpoint = env.endpoints.create_start("b0") @@ -102,15 +112,101 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int): branch_creation_durations = [] for i in range(n_branches): - # random a source branch - p = random.randint(0, i) + if shape == "random": + parent = f"b{rng.randint(0, i)}" + elif shape == "one_ancestor": + parent = "b0" + else: + raise RuntimeError(f"unimplemented shape: {shape}") + timer = timeit.default_timer() - env.neon_cli.create_branch("b{}".format(i + 1), "b{}".format(p)) + # each of these uploads to remote storage before completion + env.neon_cli.create_branch(f"b{i + 1}", parent) dur = timeit.default_timer() - timer branch_creation_durations.append(dur) _record_branch_creation_durations(neon_compare, branch_creation_durations) + endpoint.stop_and_destroy() + + with neon_compare.record_duration("shutdown"): + # this sleeps 100ms between polls + env.pageserver.stop() + + startup_line = "INFO version: git(-env)?:" + + # find the first line of the log file so we can find the next start later + _, first_start = wait_until(5, 1, lambda: env.pageserver.assert_log_contains(startup_line)) + + # start without gc so we can time compaction with less noise; use shorter + # period for compaction so it starts earlier + env.pageserver.start( + overrides=( + "--pageserver-config-override=tenant_config={ compaction_period = '3s', gc_period = '0s' }", + ), + # this does print more than we want, but the number should be comparable between runs + extra_env_vars={ + "RUST_LOG": f"[compaction_loop{{tenant_id={env.initial_tenant}}}]=debug,info" + }, + ) + + _, second_start = wait_until( + 5, 1, lambda: env.pageserver.assert_log_contains(startup_line, first_start) + ) + env.pageserver.quiesce_tenants() + + wait_and_record_startup_metrics(env.pageserver, neon_compare.zenbenchmark, "restart_after") + + # wait for compaction to complete, which most likely has already done so multiple times + msg, _ = wait_until( + 30, + 1, + lambda: env.pageserver.assert_log_contains( + f".*tenant_id={env.initial_tenant}.*: compaction iteration complete.*", second_start + ), + ) + needle = re.search(" elapsed_ms=([0-9]+)", msg) + assert needle is not None, "failed to find the elapsed time" + duration = int(needle.group(1)) / 1000.0 + neon_compare.zenbenchmark.record("compaction", duration, "s", MetricReport.LOWER_IS_BETTER) + + +def wait_and_record_startup_metrics( + pageserver: NeonPageserver, target: NeonBenchmarker, prefix: str +): + """ + Waits until all startup metrics have non-zero values on the pageserver, then records them on the target + """ + + client = pageserver.http_client() + + expected_labels = set( + [ + "background_jobs_can_start", + "complete", + "initial", + "initial_tenant_load", + "initial_tenant_load_remote", + ] + ) + + def metrics_are_filled() -> List[Sample]: + m = client.get_metrics() + samples = m.query_all("pageserver_startup_duration_seconds") + # we should not have duplicate labels + matching = [ + x for x in samples if x.labels.get("phase") in expected_labels and x.value > 0.0 + ] + assert len(matching) == len(expected_labels) + return matching + + samples = wait_until(10, 1, metrics_are_filled) + + for sample in samples: + phase = sample.labels["phase"] + name = f"{prefix}.{phase}" + target.record(name, sample.value, "s", MetricReport.LOWER_IS_BETTER) + # Test measures the branch creation time when branching from a timeline with a lot of relations. # diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py index 6bd0d85fa2..9b20954d45 100644 --- a/test_runner/performance/test_layer_map.py +++ b/test_runner/performance/test_layer_map.py @@ -17,10 +17,10 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): tenant, _ = env.neon_cli.create_tenant( conf={ "gc_period": "0s", - "checkpoint_distance": "8192", + "checkpoint_distance": "16384", "compaction_period": "1 s", "compaction_threshold": "1", - "compaction_target_size": "8192", + "compaction_target_size": "16384", } ) diff --git a/test_runner/pg_clients/csharp/npgsql/Dockerfile b/test_runner/pg_clients/csharp/npgsql/Dockerfile index b23eb2e5eb..71717a6006 100644 --- a/test_runner/pg_clients/csharp/npgsql/Dockerfile +++ b/test_runner/pg_clients/csharp/npgsql/Dockerfile @@ -1,4 +1,4 @@ -FROM mcr.microsoft.com/dotnet/sdk:7.0 AS build +FROM mcr.microsoft.com/dotnet/sdk:8.0 AS build WORKDIR /source COPY *.csproj . @@ -7,7 +7,7 @@ RUN dotnet restore COPY . . RUN dotnet publish -c release -o /app --no-restore -FROM mcr.microsoft.com/dotnet/runtime:7.0 +FROM mcr.microsoft.com/dotnet/runtime:8.0 WORKDIR /app COPY --from=build /app . diff --git a/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj index bb4427f2c4..50243e3ea7 100644 --- a/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj +++ b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj @@ -2,13 +2,13 @@ Exe - net7.0 + net8.0 enable enable - + diff --git a/test_runner/pg_clients/java/jdbc/Dockerfile b/test_runner/pg_clients/java/jdbc/Dockerfile index 74eb9bdc32..7e074e07b8 100644 --- a/test_runner/pg_clients/java/jdbc/Dockerfile +++ b/test_runner/pg_clients/java/jdbc/Dockerfile @@ -1,10 +1,10 @@ -FROM openjdk:20 +FROM openjdk:21 WORKDIR /source COPY . . WORKDIR /app -RUN curl --output postgresql.jar https://jdbc.postgresql.org/download/postgresql-42.6.0.jar && \ +RUN curl --output postgresql.jar https://jdbc.postgresql.org/download/postgresql-42.7.2.jar && \ javac -d /app /source/Example.java CMD ["java", "-cp", "/app/postgresql.jar:.", "Example"] diff --git a/test_runner/pg_clients/python/asyncpg/Dockerfile b/test_runner/pg_clients/python/asyncpg/Dockerfile index 8b6d56b8fb..f2cc37a7bb 100644 --- a/test_runner/pg_clients/python/asyncpg/Dockerfile +++ b/test_runner/pg_clients/python/asyncpg/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.11 +FROM python:3.12 WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/python/asyncpg/requirements.txt b/test_runner/pg_clients/python/asyncpg/requirements.txt index b33c21474c..61972959a9 100644 --- a/test_runner/pg_clients/python/asyncpg/requirements.txt +++ b/test_runner/pg_clients/python/asyncpg/requirements.txt @@ -1 +1 @@ -asyncpg==0.27.0 +asyncpg==0.29.0 diff --git a/test_runner/pg_clients/python/pg8000/Dockerfile b/test_runner/pg_clients/python/pg8000/Dockerfile index ebef1f9059..ee1de20da5 100644 --- a/test_runner/pg_clients/python/pg8000/Dockerfile +++ b/test_runner/pg_clients/python/pg8000/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.11 +FROM python:3.12 WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/python/pg8000/requirements.txt b/test_runner/pg_clients/python/pg8000/requirements.txt index a8407c3cb0..e086a937e6 100644 --- a/test_runner/pg_clients/python/pg8000/requirements.txt +++ b/test_runner/pg_clients/python/pg8000/requirements.txt @@ -1,2 +1,2 @@ -pg8000==1.29.8 +pg8000==1.30.5 scramp>=1.4.3 diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock index 3ac0f16e4b..a4a2426b97 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock @@ -19,9 +19,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "async-trait" -version = "0.1.74" +version = "0.1.77" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a66537f1bb974b254c98ed142ff995236e81b9d0fe4db0575f46612cb15eb0f9" +checksum = "c980ee35e870bd1a4d2c8294d4c04d0499e67bca1e4b5cefcc693c2fa00caea9" dependencies = [ "proc-macro2", "quote", @@ -51,9 +51,9 @@ dependencies = [ [[package]] name = "base64" -version = "0.21.4" +version = "0.21.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ba43ea6f343b788c8764558649e08df62f86c6ef251fdaeb1ffd010a9ae50a2" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" [[package]] name = "bitflags" @@ -63,9 +63,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.4.1" +version = "2.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" +checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf" [[package]] name = "block-buffer" @@ -78,9 +78,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.14.0" +version = "3.15.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec" +checksum = "8ea184aa71bb362a1157c896979544cc23974e08fd265f29ea96b59f0b4a555b" [[package]] name = "byteorder" @@ -96,12 +96,9 @@ checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" [[package]] name = "cc" -version = "1.0.83" +version = "1.0.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" -dependencies = [ - "libc", -] +checksum = "a0ba8f7aaa012f30d5b2861462f6708eccd49c3c39863fe083a308035f63d723" [[package]] name = "cfg-if" @@ -111,9 +108,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "core-foundation" -version = "0.9.3" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "194a7a9e6de53fa55116934067c844d9d749312f75c6f6d0980e8c252f8c2146" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" dependencies = [ "core-foundation-sys", "libc", @@ -121,15 +118,15 @@ dependencies = [ [[package]] name = "core-foundation-sys" -version = "0.8.4" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" +checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" [[package]] name = "cpufeatures" -version = "0.2.9" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a17b76ff3a4162b0b27f354a0c87015ddad39d35f9c0c36607a3bdd175dde1f1" +checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504" dependencies = [ "libc", ] @@ -157,12 +154,12 @@ dependencies = [ [[package]] name = "errno" -version = "0.3.5" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3e13f66a2f95e32a39eaa81f6b95d42878ca0e1db0c7543723dfe12557e860" +checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -200,9 +197,9 @@ checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" [[package]] name = "futures" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23342abe12aba583913b2e62f22225ff9c950774065e4bfb61a19cd9770fec40" +checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0" dependencies = [ "futures-channel", "futures-core", @@ -215,9 +212,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2" +checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" dependencies = [ "futures-core", "futures-sink", @@ -225,15 +222,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c" +checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" [[package]] name = "futures-executor" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccecee823288125bd88b4d7f565c9e58e41858e47ab72e8ea2d64e93624386e0" +checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d" dependencies = [ "futures-core", "futures-task", @@ -242,15 +239,15 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964" +checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" [[package]] name = "futures-macro" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" +checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", @@ -259,21 +256,21 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e" +checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" [[package]] name = "futures-task" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65" +checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" [[package]] name = "futures-util" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533" +checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" dependencies = [ "futures-channel", "futures-core", @@ -299,9 +296,9 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.10" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" +checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" dependencies = [ "cfg-if", "libc", @@ -310,9 +307,9 @@ dependencies = [ [[package]] name = "gimli" -version = "0.28.0" +version = "0.28.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fb8d784f27acf97159b40fc4db5ecd8aa23b9ad5ef69cdd136d3bc80665f0c0" +checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" [[package]] name = "hmac" @@ -325,9 +322,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.64" +version = "0.3.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a" +checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" dependencies = [ "wasm-bindgen", ] @@ -340,15 +337,15 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.149" +version = "0.2.153" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b" +checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" [[package]] name = "linux-raw-sys" -version = "0.4.10" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f" +checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" [[package]] name = "lock_api" @@ -362,9 +359,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.20" +version = "0.4.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" +checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" [[package]] name = "md-5" @@ -378,28 +375,28 @@ dependencies = [ [[package]] name = "memchr" -version = "2.6.4" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" +checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" [[package]] name = "miniz_oxide" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7" +checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" dependencies = [ "adler", ] [[package]] name = "mio" -version = "0.8.8" +version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "927a765cd3fc26206e66b296465fa9d3e5ab003e651c1b3c060e7956d96b19d2" +checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" dependencies = [ "libc", "wasi", - "windows-sys", + "windows-sys 0.48.0", ] [[package]] @@ -422,26 +419,26 @@ dependencies = [ [[package]] name = "object" -version = "0.32.1" +version = "0.32.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cf5f9dd3933bd50a9e1f149ec995f39ae2c496d31fd772c1fd45ebc27e902b0" +checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" dependencies = [ "memchr", ] [[package]] name = "once_cell" -version = "1.18.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "openssl" -version = "0.10.60" +version = "0.10.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79a4c6c3a2b158f7f8f2a2fc5a969fa3a068df6fc9dbb4a43845436e3af7c800" +checksum = "95a0481286a310808298130d22dd1fef0fa571e05a8f44ec801801e84b216b1f" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.4.2", "cfg-if", "foreign-types", "libc", @@ -469,9 +466,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.96" +version = "0.9.101" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3812c071ba60da8b5677cc12bcb1d42989a65553772897a7e0355545a819838f" +checksum = "dda2b0f344e78efc2facf7d195d098df0dd72151b26ab98da807afc26c198dff" dependencies = [ "cc", "libc", @@ -497,16 +494,16 @@ checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" dependencies = [ "cfg-if", "libc", - "redox_syscall 0.4.1", + "redox_syscall", "smallvec", - "windows-targets", + "windows-targets 0.48.5", ] [[package]] name = "percent-encoding" -version = "2.3.0" +version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "phf" @@ -540,9 +537,9 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pkg-config" -version = "0.3.27" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" +checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" [[package]] name = "postgres-native-tls" @@ -594,18 +591,18 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "proc-macro2" -version = "1.0.69" +version = "1.0.78" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "134c189feb4956b20f6f547d2cf727d4c0fe06722b20a0eec87ed445a97f92da" +checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.33" +version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" +checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" dependencies = [ "proc-macro2", ] @@ -640,15 +637,6 @@ dependencies = [ "getrandom", ] -[[package]] -name = "redox_syscall" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" -dependencies = [ - "bitflags 1.3.2", -] - [[package]] name = "redox_syscall" version = "0.4.1" @@ -676,24 +664,24 @@ checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" [[package]] name = "rustix" -version = "0.38.19" +version = "0.38.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "745ecfa778e66b2b63c88a61cb36e0eea109e803b0b86bf9879fbc77c70e86ed" +checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.4.2", "errno", "libc", "linux-raw-sys", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] name = "schannel" -version = "0.1.22" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c3733bf4cf7ea0880754e19cb5a462007c4a8c1914bff372ccc95b464f1df88" +checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534" dependencies = [ - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -753,18 +741,18 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.11.1" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "942b4a808e05215192e39f4ab80813e599068285906cc91aa64f923db842bd5a" +checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" [[package]] name = "socket2" -version = "0.5.4" +version = "0.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4031e820eb552adee9295814c0ced9e5cf38ddf1e8b7d566d6de8e2538ea989e" +checksum = "05ffd9c0a93b7543e062e759284fcf5f5e3b098501104bfbdde4d404db792871" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -786,9 +774,9 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" [[package]] name = "syn" -version = "2.0.38" +version = "2.0.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e96b79aaa137db8f61e26363a0c9b47d8b4ec75da28b7d1d614c2303e232408b" +checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07" dependencies = [ "proc-macro2", "quote", @@ -797,15 +785,14 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.8.0" +version = "3.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb94d2f3cc536af71caac6b6fcebf65860b347e7ce0cc9ebe8f70d3e521054ef" +checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" dependencies = [ "cfg-if", "fastrand", - "redox_syscall 0.3.5", "rustix", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -825,9 +812,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.33.0" +version = "1.36.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f38200e3ef7995e5ef13baec2f432a6da0aa9ac495b2c0e8f3b7eec2c92d653" +checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931" dependencies = [ "backtrace", "bytes", @@ -836,14 +823,14 @@ dependencies = [ "pin-project-lite", "socket2", "tokio-macros", - "windows-sys", + "windows-sys 0.48.0", ] [[package]] name = "tokio-macros" -version = "2.1.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" +checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", @@ -888,9 +875,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.9" +version = "0.7.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d68074620f57a0b21594d9735eb2e98ab38b17f80d3fcb189fca266771ca60d" +checksum = "5419f34732d9eb6ee4c3578b7989078579b7f039cbbb9ca2c4da015749371e15" dependencies = [ "bytes", "futures-core", @@ -927,9 +914,9 @@ checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" [[package]] name = "unicode-bidi" -version = "0.3.13" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" +checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" [[package]] name = "unicode-ident" @@ -939,9 +926,9 @@ checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] name = "unicode-normalization" -version = "0.1.22" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" +checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" dependencies = [ "tinyvec", ] @@ -965,10 +952,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] -name = "wasm-bindgen" -version = "0.2.87" +name = "wasite" +version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342" +checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b" + +[[package]] +name = "wasm-bindgen" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -976,9 +969,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.87" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd" +checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" dependencies = [ "bumpalo", "log", @@ -991,9 +984,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.87" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d" +checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -1001,9 +994,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.87" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" +checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", @@ -1014,15 +1007,15 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.87" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1" +checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" [[package]] name = "web-sys" -version = "0.3.64" +version = "0.3.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b85cbef8c220a6abc02aefd892dfc0fc23afb1c6a426316ec33253a3877249b" +checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef" dependencies = [ "js-sys", "wasm-bindgen", @@ -1030,11 +1023,12 @@ dependencies = [ [[package]] name = "whoami" -version = "1.4.1" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22fc3756b8a9133049b26c7f61ab35416c130e8c09b660f5b3958b446f52cc50" +checksum = "0fec781d48b41f8163426ed18e8fc2864c12937df9ce54c88ede7bd47270893e" dependencies = [ - "wasm-bindgen", + "redox_syscall", + "wasite", "web-sys", ] @@ -1044,7 +1038,16 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" dependencies = [ - "windows-targets", + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.4", ] [[package]] @@ -1053,13 +1056,28 @@ version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + +[[package]] +name = "windows-targets" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" +dependencies = [ + "windows_aarch64_gnullvm 0.52.4", + "windows_aarch64_msvc 0.52.4", + "windows_i686_gnu 0.52.4", + "windows_i686_msvc 0.52.4", + "windows_x86_64_gnu 0.52.4", + "windows_x86_64_gnullvm 0.52.4", + "windows_x86_64_msvc 0.52.4", ] [[package]] @@ -1068,38 +1086,80 @@ version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" + [[package]] name = "windows_aarch64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" + [[package]] name = "windows_i686_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" +[[package]] +name = "windows_i686_gnu" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" + [[package]] name = "windows_i686_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" +[[package]] +name = "windows_i686_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" + [[package]] name = "windows_x86_64_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" + [[package]] name = "windows_x86_64_gnullvm" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" + [[package]] name = "windows_x86_64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml b/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml index 6f100aafd5..0f420e5b06 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml @@ -9,7 +9,7 @@ publish = false [dependencies] native-tls = "0.2.11" postgres-native-tls = "0.5.0" -tokio = { version = "1.33", features=["rt", "macros"] } +tokio = { version = "1.36", features=["rt", "macros"] } tokio-postgres = "0.7.10" diff --git a/test_runner/pg_clients/rust/tokio-postgres/Dockerfile b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile index 1d3709803e..8611e66cbb 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Dockerfile +++ b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile @@ -1,4 +1,4 @@ -FROM rust:1.73 +FROM rust:1.76 WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile index 9538cf4ed4..0402838820 100644 --- a/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile @@ -1,11 +1,11 @@ -FROM swift:5.8 AS build +FROM swift:5.9 AS build RUN apt-get -q update && apt-get -q install -y libssl-dev WORKDIR /source COPY . . RUN swift build --configuration release -FROM swift:5.8 +FROM swift:5.9 WORKDIR /app COPY --from=build /source/.build/release . CMD ["/app/PostgresClientKitExample"] diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile b/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile index 61e1d1bba6..9130e0973f 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile @@ -1,10 +1,10 @@ -FROM swift:5.8 AS build +FROM swift:5.9 AS build WORKDIR /source COPY . . RUN swift build --configuration release -FROM swift:5.8 +FROM swift:5.9 WORKDIR /app COPY --from=build /source/.build/release . CMD ["/app/PostgresNIOExample"] diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved b/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved index 9f13106011..023e03a7b1 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved @@ -5,8 +5,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/vapor/postgres-nio.git", "state" : { - "revision" : "061a0836d7c1887e04a975d1d2eaa2ef5fd7dfab", - "version" : "1.16.0" + "revision" : "69ccfdf4c80144d845e3b439961b7ec6cd7ae33f", + "version" : "1.20.2" } }, { @@ -14,8 +14,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-atomics.git", "state" : { - "revision" : "6c89474e62719ddcc1e9614989fff2f68208fe10", - "version" : "1.1.0" + "revision" : "cd142fd2f64be2100422d658e7411e39489da985", + "version" : "1.2.0" } }, { @@ -41,8 +41,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-log.git", "state" : { - "revision" : "32e8d724467f8fe623624570367e3d50c5638e46", - "version" : "1.5.2" + "revision" : "e97a6fcb1ab07462881ac165fdbb37f067e205d5", + "version" : "1.5.4" } }, { @@ -50,8 +50,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-metrics.git", "state" : { - "revision" : "9b39d811a83cf18b79d7d5513b06f8b290198b10", - "version" : "2.3.3" + "revision" : "971ba26378ab69c43737ee7ba967a896cb74c0d1", + "version" : "2.4.1" } }, { @@ -59,8 +59,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-nio.git", "state" : { - "revision" : "6213ba7a06febe8fef60563a4a7d26a4085783cf", - "version" : "2.54.0" + "revision" : "635b2589494c97e48c62514bc8b37ced762e0a62", + "version" : "2.63.0" } }, { @@ -68,8 +68,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-nio-ssl.git", "state" : { - "revision" : "e866a626e105042a6a72a870c88b4c531ba05f83", - "version" : "2.24.0" + "revision" : "7c381eb6083542b124a6c18fae742f55001dc2b5", + "version" : "2.26.0" } }, { @@ -77,8 +77,17 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-nio-transport-services.git", "state" : { - "revision" : "41f4098903878418537020075a4d8a6e20a0b182", - "version" : "1.17.0" + "revision" : "6cbe0ed2b394f21ab0d46b9f0c50c6be964968ce", + "version" : "1.20.1" + } + }, + { + "identity" : "swift-system", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-system.git", + "state" : { + "revision" : "025bcb1165deab2e20d4eaba79967ce73013f496", + "version" : "1.2.1" } } ], diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift b/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift index a80590daa2..637eb4bc9d 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift @@ -1,10 +1,10 @@ -// swift-tools-version:5.8 +// swift-tools-version:5.9 import PackageDescription let package = Package( name: "PostgresNIOExample", dependencies: [ - .package(url: "https://github.com/vapor/postgres-nio.git", from: "1.16.0") + .package(url: "https://github.com/vapor/postgres-nio.git", from: "1.20.2") ], targets: [ .executableTarget( diff --git a/test_runner/pg_clients/typescript/postgresql-client/Dockerfile b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile index 07e98c586b..004b383749 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/Dockerfile +++ b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile @@ -1,4 +1,4 @@ -FROM node:20 +FROM node:21 WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/typescript/postgresql-client/package-lock.json b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json index 4cedf56acd..b4f8587eac 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/package-lock.json +++ b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json @@ -5,24 +5,24 @@ "packages": { "": { "dependencies": { - "postgresql-client": "2.5.9" + "postgresql-client": "2.10.5" } }, "node_modules/doublylinked": { - "version": "2.5.2", - "resolved": "https://registry.npmjs.org/doublylinked/-/doublylinked-2.5.2.tgz", - "integrity": "sha512-TDh0XfQWWDrfvGdAN0hLNIdkTXlw04nVCO5B/37ie4dV0yw1iT9ZrZ6tD+q/0SwXxeI/u6TF9Mxgd7s5/XYV6A==", + "version": "2.5.4", + "resolved": "https://registry.npmjs.org/doublylinked/-/doublylinked-2.5.4.tgz", + "integrity": "sha512-jBCKDnFkEHJRjQvYEl5N9VngRV8ypHgw6a52OK4VN57eV2r2rYvgOx9uABdY78INNoW7S6auULp+KBVm/jfYqw==", "engines": { "node": ">= 10.0" } }, "node_modules/lightning-pool": { - "version": "4.2.1", - "resolved": "https://registry.npmjs.org/lightning-pool/-/lightning-pool-4.2.1.tgz", - "integrity": "sha512-/pUIoGD3nzTH/wI4TYiJM3cLPeUOzGMTfFeBRuxaOAnwL0LZfwvqn5YFqsfyF98M0C3UXxWgfTz+Lu6okkno+g==", + "version": "4.2.2", + "resolved": "https://registry.npmjs.org/lightning-pool/-/lightning-pool-4.2.2.tgz", + "integrity": "sha512-KW0Df0IbjNLxy5wAsdErTKYtHGwefLRQseHNksEctyaL7gtRwJT0nqLa2uiRdNYDwKSnZtqOjSjUNtfxmfH1qw==", "dependencies": { - "doublylinked": "^2.5.2", - "putil-promisify": "^1.8.6" + "doublylinked": "^2.5.3", + "putil-promisify": "^1.10.1" } }, "node_modules/obuf": { @@ -42,16 +42,16 @@ } }, "node_modules/postgresql-client": { - "version": "2.5.9", - "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.5.9.tgz", - "integrity": "sha512-s+kgTN6TfWLzehEyxw4Im4odnxVRCbZ0DEJzWS6SLowPAmB2m1/DOiOvZC0+ZVoi5AfbGE6SBqFxKguSyVAXZg==", + "version": "2.10.5", + "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.10.5.tgz", + "integrity": "sha512-R3EC16pUdbgrzk1J2MQLj7jY2TepWurJHoK90nOeLZj1XTpL/+wL1VCneTmclRVKDuKVjFHr+FASV47KrLpAbw==", "dependencies": { - "doublylinked": "^2.5.2", - "lightning-pool": "^4.2.1", + "doublylinked": "^2.5.4", + "lightning-pool": "^4.2.2", "postgres-bytea": "^3.0.0", - "power-tasks": "^1.7.0", - "putil-merge": "^3.10.3", - "putil-promisify": "^1.10.0", + "power-tasks": "^1.7.3", + "putil-merge": "^3.12.1", + "putil-promisify": "^1.10.1", "putil-varhelpers": "^1.6.5" }, "engines": { @@ -60,30 +60,29 @@ } }, "node_modules/power-tasks": { - "version": "1.7.0", - "resolved": "https://registry.npmjs.org/power-tasks/-/power-tasks-1.7.0.tgz", - "integrity": "sha512-rndZXCDxhuIDjPUJJvQwBDHaYagCkjvbPF/NA+omh/Ef4rAI9KtnvdA0k98dyiGpn1zXOpc6c2c0JWzg/xAhJg==", + "version": "1.7.3", + "resolved": "https://registry.npmjs.org/power-tasks/-/power-tasks-1.7.3.tgz", + "integrity": "sha512-EnkjLfaX4PxFYHbUWyWzlE4I8SgctaW9jx4qQXrVRoELlqBXrxIMtuhHzRwsHv2qs1tO7efOcZa6/wDCdCjRfA==", "dependencies": { - "doublylinked": "^2.5.2", - "strict-typed-events": "^2.3.1" + "doublylinked": "^2.5.4", + "strict-typed-events": "^2.3.3" }, "engines": { - "node": ">=14.0", - "npm": ">=7.0.0" + "node": ">=16.0" } }, "node_modules/putil-merge": { - "version": "3.10.3", - "resolved": "https://registry.npmjs.org/putil-merge/-/putil-merge-3.10.3.tgz", - "integrity": "sha512-B18CYi0/SmBYl9+fgowYWkgzJM/8XcLSeafHrFrGzwySQuOzLW0sOGx0CdFVp9zqaxgLctexUdGoSPpm6CPM6A==", + "version": "3.12.1", + "resolved": "https://registry.npmjs.org/putil-merge/-/putil-merge-3.12.1.tgz", + "integrity": "sha512-4clPyRkJPrd5zl98AP7I3JamyXbx0ixe2CnfvGwoTyWSr7Kslcv8weoKjfU4BMBifkWIRL54l4OrNe97pYcDwQ==", "engines": { "node": ">= 10.0" } }, "node_modules/putil-promisify": { - "version": "1.10.0", - "resolved": "https://registry.npmjs.org/putil-promisify/-/putil-promisify-1.10.0.tgz", - "integrity": "sha512-zYPoAoMxmf8pC+I75kRkYkVMwU4ZbZl82aTGema175bmhQ06BEJuuOlzOy1buQK9G+hCyQ+BFpzMTKAJhD8rZw==", + "version": "1.10.1", + "resolved": "https://registry.npmjs.org/putil-promisify/-/putil-promisify-1.10.1.tgz", + "integrity": "sha512-1jm0egJNrj5eBDRj15Cg08RNHDV91OVEHeeYjAFRcs663PXxFokndxcJAGbaO6CSErCTp8eTgC8vuOF+fvXIAA==", "engines": { "node": ">= 14.0" } @@ -97,21 +96,21 @@ } }, "node_modules/strict-typed-events": { - "version": "2.3.1", - "resolved": "https://registry.npmjs.org/strict-typed-events/-/strict-typed-events-2.3.1.tgz", - "integrity": "sha512-Z1h8KpVbrVg34Vwy/VwTD/tS9tFebH2h1Kvw4xnPkKpkISMwUpnqwU44rMfkKMpXbFCybIgDt7ARoCGTzURZhQ==", + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/strict-typed-events/-/strict-typed-events-2.3.3.tgz", + "integrity": "sha512-Vc8/N5giCVpO2n5BCskqDD9ns7RkdEq0pFd4yQk1ROULusJDbjORNvbtyEPxxK7Xqn9/NdW8XHLxv/PvUTgFsA==", "dependencies": { - "putil-promisify": "^1.8.5", - "ts-gems": "^2.2.0" + "putil-promisify": "^1.10.1", + "ts-gems": "^3.1.0" }, "engines": { "node": ">=16.0" } }, "node_modules/ts-gems": { - "version": "2.4.0", - "resolved": "https://registry.npmjs.org/ts-gems/-/ts-gems-2.4.0.tgz", - "integrity": "sha512-SdugYAXoWvbqrxLodIObzxhEKacDxh5LfAJIiIkiH7q5thvuuCzdmkdTVQYf7uEDrEpPhfx4tokDMamdO3be9A==" + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/ts-gems/-/ts-gems-3.1.1.tgz", + "integrity": "sha512-Li1Z44FnxN06c1lBwFepb932jPYT+4eOvOmoiC30lOTkvOJOERr9xZFg3UA9y19OYO9CrW3ZSqNL66DUSuwFTw==" } } } diff --git a/test_runner/pg_clients/typescript/postgresql-client/package.json b/test_runner/pg_clients/typescript/postgresql-client/package.json index 12703ce89f..07ec100d0d 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/package.json +++ b/test_runner/pg_clients/typescript/postgresql-client/package.json @@ -1,6 +1,6 @@ { "type": "module", "dependencies": { - "postgresql-client": "2.5.9" + "postgresql-client": "2.10.5" } } diff --git a/test_runner/pg_clients/typescript/serverless-driver/Dockerfile b/test_runner/pg_clients/typescript/serverless-driver/Dockerfile index 07e98c586b..004b383749 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/Dockerfile +++ b/test_runner/pg_clients/typescript/serverless-driver/Dockerfile @@ -1,4 +1,4 @@ -FROM node:20 +FROM node:21 WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/typescript/serverless-driver/package-lock.json b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json index 72cc452817..5a3ad3c238 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/package-lock.json +++ b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json @@ -5,14 +5,14 @@ "packages": { "": { "dependencies": { - "@neondatabase/serverless": "0.4.18", - "ws": "8.13.0" + "@neondatabase/serverless": "0.9.0", + "ws": "8.16.0" } }, "node_modules/@neondatabase/serverless": { - "version": "0.4.18", - "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.4.18.tgz", - "integrity": "sha512-2TZnIyRGC/+0fjZ8TKCzaSTPUD94PM7NBGuantGZbUrbWyqBwGnUoRtdZAQ95qBKVHqORLVfymlv2NE+HQMFeA==", + "version": "0.9.0", + "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.9.0.tgz", + "integrity": "sha512-mmJnUAzlzvxNSZuuhI6kgJjH+JgFdBMYUWxihtq/nj0Tjt+Y5UU3W+SvRFoucnd5NObYkuLYQzk+zV5DGFKGJg==", "dependencies": { "@types/pg": "8.6.6" } @@ -96,9 +96,9 @@ } }, "node_modules/ws": { - "version": "8.13.0", - "resolved": "https://registry.npmjs.org/ws/-/ws-8.13.0.tgz", - "integrity": "sha512-x9vcZYTrFPC7aSIbj7sRCYo7L/Xb8Iy+pW0ng0wt2vCJv7M9HOMy0UoN3rr+IFC7hb7vXoqS+P9ktyLLLhO+LA==", + "version": "8.16.0", + "resolved": "https://registry.npmjs.org/ws/-/ws-8.16.0.tgz", + "integrity": "sha512-HS0c//TP7Ina87TfiPUz1rQzMhHrl/SG2guqRcTOIUYD2q8uhUdNHZYJUaQ8aTGPzCh+c6oawMKW35nFl1dxyQ==", "engines": { "node": ">=10.0.0" }, diff --git a/test_runner/pg_clients/typescript/serverless-driver/package.json b/test_runner/pg_clients/typescript/serverless-driver/package.json index 840c7a5c4c..9d9da0f42c 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/package.json +++ b/test_runner/pg_clients/typescript/serverless-driver/package.json @@ -1,7 +1,7 @@ { "type": "module", "dependencies": { - "@neondatabase/serverless": "0.4.18", - "ws": "8.13.0" + "@neondatabase/serverless": "0.9.0", + "ws": "8.16.0" } } diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py index 3f626c5c7c..526ae14b87 100644 --- a/test_runner/regress/test_migrations.py +++ b/test_runner/regress/test_migrations.py @@ -15,7 +15,7 @@ def test_migrations(neon_simple_env: NeonEnv): endpoint.wait_for_migrations() - num_migrations = 8 + num_migrations = 9 with endpoint.cursor() as cur: cur.execute("SELECT id FROM neon_migration.migration_id") diff --git a/test_runner/regress/test_neon_extension.py b/test_runner/regress/test_neon_extension.py index 1179a3afe9..e31e1cab51 100644 --- a/test_runner/regress/test_neon_extension.py +++ b/test_runner/regress/test_neon_extension.py @@ -29,3 +29,34 @@ def test_neon_extension(neon_env_builder: NeonEnvBuilder): log.info(res) assert len(res) == 1 assert len(res[0]) == 5 + + +# Verify that the neon extension can be upgraded/downgraded. +def test_neon_extension_compatibility(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_neon_extension_compatibility") + + endpoint_main = env.endpoints.create("test_neon_extension_compatibility") + # don't skip pg_catalog updates - it runs CREATE EXTENSION neon + endpoint_main.respec(skip_pg_catalog_updates=False) + endpoint_main.start() + + with closing(endpoint_main.connect()) as conn: + with conn.cursor() as cur: + all_versions = ["1.3", "1.2", "1.1", "1.0"] + current_version = "1.3" + for idx, begin_version in enumerate(all_versions): + for target_version in all_versions[idx + 1 :]: + if current_version != begin_version: + cur.execute( + f"ALTER EXTENSION neon UPDATE TO '{begin_version}'; -- {current_version}->{begin_version}" + ) + current_version = begin_version + # downgrade + cur.execute( + f"ALTER EXTENSION neon UPDATE TO '{target_version}'; -- {begin_version}->{target_version}" + ) + # upgrade + cur.execute( + f"ALTER EXTENSION neon UPDATE TO '{begin_version}'; -- {target_version}->{begin_version}" + ) diff --git a/test_runner/regress/test_neon_superuser.py b/test_runner/regress/test_neon_superuser.py index e0364dd13f..fd31df84da 100644 --- a/test_runner/regress/test_neon_superuser.py +++ b/test_runner/regress/test_neon_superuser.py @@ -1,12 +1,9 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv -from fixtures.pg_version import PgVersion, skip_on_postgres +from fixtures.pg_version import PgVersion from fixtures.utils import wait_until -@skip_on_postgres( - PgVersion.V15, reason="skip on pg15 due to https://github.com/neondatabase/neon/issues/6969" -) def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion): env = neon_simple_env env.neon_cli.create_branch("test_neon_superuser_publisher", "empty") @@ -97,3 +94,6 @@ def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion): assert cur.fetchall()[0][0] != "" cur.execute("RESET ROLE") cur.execute("DROP ROLE not_a_superuser") + query = "DROP SUBSCRIPTION sub CASCADE" + log.info(f"Dropping subscription: {query}") + cur.execute(query) diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py new file mode 100644 index 0000000000..42cc28efee --- /dev/null +++ b/test_runner/regress/test_pageserver_getpage_throttle.py @@ -0,0 +1,118 @@ +import json +import uuid + +from anyio import Path +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder, PgBin +from fixtures.pg_version import PgVersion +from fixtures.types import TenantId, TimelineId +from fixtures.utils import wait_until + + +def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): + env = neon_env_builder.init_start() + + env.pageserver.tenant_detach(env.initial_tenant) + + env.pageserver.allowed_errors.append( + # https://github.com/neondatabase/neon/issues/6925 + r".*query handler for.*pagestream.*failed: unexpected message: CopyFail during COPY.*" + ) + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + + rate_limit_rps = 100 + compaction_period = 5 + env.pageserver.tenant_create( + tenant_id, + conf={ + "compaction_period": f"{compaction_period}s", + "timeline_get_throttle": { + "task_kinds": ["PageRequestHandler"], + "initial": 0, + "refill_interval": "100ms", + "refill_amount": int(rate_limit_rps / 10), + "max": int(rate_limit_rps / 10), + "fair": True, + }, + }, + ) + + ps_http = env.pageserver.http_client() + + ps_http.timeline_create(PgVersion.V16, tenant_id, timeline_id) + + def run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs: int): + cmd = [ + str(env.neon_binpath / "pagebench"), + "get-page-latest-lsn", + "--mgmt-api-endpoint", + ps_http.base_url, + "--page-service-connstring", + env.pageserver.connstr(password=None), + "--runtime", + f"{duration_secs}s", + f"{tenant_id}/{timeline_id}", + ] + + basepath = pg_bin.run_capture(cmd, with_command_header=False) + results_path = Path(basepath + ".stdout") + log.info(f"Benchmark results at: {results_path}") + + with open(results_path, "r") as f: + results = json.load(f) + log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}") + return int(results["total"]["request_count"]) + + log.info("warmup / make sure metrics are present") + run_pagebench_at_max_speed_and_get_total_requests_completed(2) + metrics_query = { + "tenant_id": str(tenant_id), + "timeline_id": str(timeline_id), + "smgr_query_type": "get_page_at_lsn", + } + metric_name = "pageserver_smgr_query_seconds_sum" + smgr_query_seconds_pre = ps_http.get_metric_value(metric_name, metrics_query) + assert smgr_query_seconds_pre is not None + + marker = uuid.uuid4().hex + ps_http.post_tracing_event("info", marker) + _, marker_offset = wait_until( + 10, 0.5, lambda: env.pageserver.assert_log_contains(marker, offset=None) + ) + + log.info("run pagebench") + duration_secs = 10 + actual_ncompleted = run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs) + + log.info("validate the client is capped at the configured rps limit") + expect_ncompleted = duration_secs * rate_limit_rps + delta_abs = abs(expect_ncompleted - actual_ncompleted) + threshold = 0.05 * expect_ncompleted + assert ( + threshold / rate_limit_rps < 0.1 * duration_secs + ), "test self-test: unrealistic expecations regarding precision in this test" + assert ( + delta_abs < 0.05 * expect_ncompleted + ), "the throttling deviates more than 5percent from the expectation" + + log.info("validate that we logged the throttling") + + wait_until( + 10, + compaction_period / 10, + lambda: env.pageserver.assert_log_contains( + f".*{tenant_id}.*shard was throttled in the last n_seconds.*", + offset=marker_offset, + ), + ) + + log.info("validate that the metric doesn't include throttle wait time") + smgr_query_seconds_post = ps_http.get_metric_value(metric_name, metrics_query) + assert smgr_query_seconds_post is not None + actual_smgr_query_seconds = smgr_query_seconds_post - smgr_query_seconds_pre + + assert ( + duration_secs >= 10 * actual_smgr_query_seconds + ), "smgr metrics should not include throttle wait time" diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 8f694de2e1..8ba9d767dd 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -1,3 +1,4 @@ +import json import random from pathlib import Path from typing import Any, Dict, Optional @@ -10,7 +11,7 @@ from fixtures.pageserver.utils import ( poll_for_remote_storage_iterations, tenant_delete_wait_completed, ) -from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind +from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage from fixtures.types import TenantId, TimelineId from fixtures.utils import wait_until from fixtures.workload import Workload @@ -436,6 +437,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ) env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) assert env.attachment_service is not None + assert isinstance(env.pageserver_remote_storage, S3Storage) # Satisfy linter tenant_id = env.initial_tenant timeline_id = env.initial_timeline @@ -491,18 +493,35 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): # Do evictions on attached pageserver, check secondary follows along # ================================================================== - log.info("Evicting a layer...") - layer_to_evict = list_layers(ps_attached, tenant_id, timeline_id)[0] - ps_attached.http_client().evict_layer(tenant_id, timeline_id, layer_name=layer_to_evict.name) + try: + log.info("Evicting a layer...") + layer_to_evict = list_layers(ps_attached, tenant_id, timeline_id)[0] + some_other_layer = list_layers(ps_attached, tenant_id, timeline_id)[1] + log.info(f"Victim layer: {layer_to_evict.name}") + ps_attached.http_client().evict_layer( + tenant_id, timeline_id, layer_name=layer_to_evict.name + ) - log.info("Synchronizing after eviction...") - ps_attached.http_client().tenant_heatmap_upload(tenant_id) - ps_secondary.http_client().tenant_secondary_download(tenant_id) + log.info("Synchronizing after eviction...") + ps_attached.http_client().tenant_heatmap_upload(tenant_id) + heatmap_after_eviction = env.pageserver_remote_storage.heatmap_content(tenant_id) + heatmap_layers = set( + layer["name"] for layer in heatmap_after_eviction["timelines"][0]["layers"] + ) + assert layer_to_evict.name not in heatmap_layers + assert some_other_layer.name in heatmap_layers - assert layer_to_evict not in list_layers(ps_attached, tenant_id, timeline_id) - assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( - ps_secondary, tenant_id, timeline_id - ) + ps_secondary.http_client().tenant_secondary_download(tenant_id) + + assert layer_to_evict not in list_layers(ps_attached, tenant_id, timeline_id) + assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( + ps_secondary, tenant_id, timeline_id + ) + except: + # On assertion failures, log some details to help with debugging + heatmap = env.pageserver_remote_storage.heatmap_content(tenant_id) + log.warn(f"heatmap contents: {json.dumps(heatmap,indent=2)}") + raise # Scrub the remote storage # ======================== diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index f8a0bef954..06c13cc07d 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -329,14 +329,15 @@ def test_remote_storage_upload_queue_retries( churn_while_failpoints_active_thread.start() # wait for churn thread's data to get stuck in the upload queue - wait_until(10, 0.5, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0)) - wait_until(10, 0.5, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 2)) - wait_until(10, 0.5, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="delete"), 0)) + # Exponential back-off in upload queue, so, gracious timeouts. + + wait_until(30, 1, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0)) + wait_until(30, 1, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 2)) + wait_until(30, 1, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="delete"), 0)) # unblock churn operations configure_storage_sync_failpoints("off") - # ... and wait for them to finish. Exponential back-off in upload queue, so, gracious timeouts. wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0)) wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0)) wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0)) diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py index bc77dfd084..aecc244a47 100644 --- a/test_runner/regress/test_sharding_service.py +++ b/test_runner/regress/test_sharding_service.py @@ -1,7 +1,7 @@ import time from collections import defaultdict from datetime import datetime, timezone -from typing import Any, Dict, List +from typing import Any, Dict, List, Union import pytest from fixtures.log_helper import log @@ -443,10 +443,12 @@ def test_sharding_service_compute_hook( # Initial notification from tenant creation assert len(notifications) == 1 - expect = { + expect: Dict[str, Union[List[Dict[str, int]], str, None, int]] = { "tenant_id": str(env.initial_tenant), + "stripe_size": None, "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}], } + assert notifications[0] == expect env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Offline"}) @@ -460,6 +462,7 @@ def test_sharding_service_compute_hook( log.info(f"notifications: {notifications}") expect = { "tenant_id": str(env.initial_tenant), + "stripe_size": None, "shards": [{"node_id": int(env.pageservers[1].id), "shard_number": 0}], } @@ -475,10 +478,27 @@ def test_sharding_service_compute_hook( def received_restart_notification(): assert len(notifications) == 3 - assert notifications[1] == expect + assert notifications[2] == expect wait_until(10, 1, received_restart_notification) + # Splitting a tenant should cause its stripe size to become visible in the compute notification + env.attachment_service.tenant_shard_split(env.initial_tenant, shard_count=2) + expect = { + "tenant_id": str(env.initial_tenant), + "stripe_size": 32768, + "shards": [ + {"node_id": int(env.pageservers[1].id), "shard_number": 0}, + {"node_id": int(env.pageservers[1].id), "shard_number": 1}, + ], + } + + def received_split_notification(): + assert len(notifications) == 4 + assert notifications[3] == expect + + wait_until(10, 1, received_split_notification) + env.attachment_service.consistency_check() diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index c4b4e5fb77..52de889084 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -190,6 +190,8 @@ def test_delete_tenant_exercise_crash_safety_failpoints( # So by ignoring these instead of waiting for empty upload queue # we execute more distinct code paths. '.*stopping left-over name="remote upload".*', + # an on-demand is cancelled by shutdown + ".*initial size calculation failed: downloading failed, possibly for shutdown", ] ) diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 795110d90b..96a5cc491a 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -213,7 +213,9 @@ def test_delete_timeline_exercise_crash_safety_failpoints( # This happens when timeline remains are cleaned up during loading ".*Timeline dir entry become invalid.*", # In one of the branches we poll for tenant to become active. Polls can generate this log message: - f".*Tenant {env.initial_tenant} is not active*", + f".*Tenant {env.initial_tenant} is not active.*", + # an on-demand is cancelled by shutdown + ".*initial size calculation failed: downloading failed, possibly for shutdown", ] ) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 4520a5fc9c..5b93088303 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -142,6 +142,51 @@ files: query: | select datname, state, count(*) as count from pg_stat_activity where state <> '' group by datname, state; + - metric_name: pg_stats_userdb + type: gauge + help: 'Stats for the oldest non-system db' + key_labels: + - datname + value_label: kind + values: + - db_size + - deadlocks + # Rows + - inserted + - updated + - deleted + # We export stats for only one non-system database. Without this limit + # it is too easy to abuse the system by creating lots of databases. + # We can try lifting this limit in the future after we understand the needs better. + query: | + select pg_database_size(datname) as db_size, deadlocks, + tup_inserted as inserted, tup_updated as updated, tup_deleted as deleted, + datname + from pg_stat_database + where datname IN ( + select datname + from pg_database + where datname <> 'postgres' and not datistemplate + order by oid + limit 1 + ); + + - metric_name: max_cluster_size + type: gauge + help: 'neon.max_cluster_size setting' + key_labels: + values: [max_cluster_size] + query: | + select setting::int as max_cluster_size from pg_settings where name = 'neon.max_cluster_size'; + + - metric_name: db_total_size + type: gauge + help: 'Size of all databases' + key_labels: + values: [total] + query: | + select sum(pg_database_size(datname)) as total from pg_database; + build: | # Build cgroup-tools # @@ -176,7 +221,7 @@ build: | # actually build the thing... && make install - FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.0 AS postgres-exporter + FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter FROM burningalchemist/sql_exporter:0.13 AS sql-exporter @@ -193,7 +238,7 @@ build: | pkg-config # Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc) - ENV PGBOUNCER_TAG pgbouncer_1_22_0 + ENV PGBOUNCER_TAG pgbouncer_1_22_1 RUN set -e \ && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \ && cd pgbouncer \ diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index e808fabbe7..8593b752c2 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -60,7 +60,6 @@ regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" } reqwest = { version = "0.11", default-features = false, features = ["blocking", "default-tls", "json", "multipart", "rustls-tls", "stream"] } -ring = { version = "0.16" } rustls = { version = "0.21", features = ["dangerous_configuration"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } @@ -80,6 +79,7 @@ tracing-core = { version = "0.1" } tungstenite = { version = "0.20" } url = { version = "2", features = ["serde"] } uuid = { version = "1", features = ["serde", "v4", "v7"] } +zeroize = { version = "1", features = ["derive"] } zstd = { version = "0.13" } zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] } zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] }