diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 2bcda7cc8e..d27713f083 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1121,10 +1121,16 @@ jobs: run: | if [[ "$GITHUB_REF_NAME" == "main" ]]; then gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false - - # TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging support different compute tag prefixes for different regions - gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \ + -f deployPgSniRouter=false \ + -f deployProxy=false \ + -f deployStorage=true \ + -f deployStorageBroker=true \ + -f branch=main \ + -f dockerTag=${{needs.tag.outputs.build-tag}} \ + -f deployPreprodRegion=true + gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \ -f deployPgSniRouter=false \ -f deployProxy=false \ @@ -1133,6 +1139,15 @@ jobs: -f branch=main \ -f dockerTag=${{needs.tag.outputs.build-tag}} elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then + gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \ + -f deployPgSniRouter=true \ + -f deployProxy=true \ + -f deployStorage=false \ + -f deployStorageBroker=false \ + -f branch=main \ + -f dockerTag=${{needs.tag.outputs.build-tag}} \ + -f deployPreprodRegion=true + gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \ -f deployPgSniRouter=true \ -f deployProxy=true \ diff --git a/CODEOWNERS b/CODEOWNERS index 5b601f0566..9a23e8c958 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,12 +1,13 @@ /compute_tools/ @neondatabase/control-plane @neondatabase/compute /control_plane/attachment_service @neondatabase/storage /libs/pageserver_api/ @neondatabase/storage -/libs/postgres_ffi/ @neondatabase/compute +/libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers /libs/remote_storage/ @neondatabase/storage /libs/safekeeper_api/ @neondatabase/safekeepers /libs/vm_monitor/ @neondatabase/autoscaling /pageserver/ @neondatabase/storage /pgxn/ @neondatabase/compute +/pgxn/neon/ @neondatabase/compute @neondatabase/safekeepers /proxy/ @neondatabase/proxy /safekeeper/ @neondatabase/safekeepers /vendor/ @neondatabase/compute diff --git a/Cargo.lock b/Cargo.lock index c4f925e3c7..6409c79ef9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -277,6 +277,7 @@ dependencies = [ "anyhow", "aws-config", "aws-sdk-secretsmanager", + "bytes", "camino", "clap", "control_plane", @@ -288,6 +289,8 @@ dependencies = [ "hex", "humantime", "hyper", + "lasso", + "measured", "metrics", "once_cell", "pageserver_api", @@ -295,6 +298,7 @@ dependencies = [ "postgres_connection", "r2d2", "reqwest", + "routerify", "serde", "serde_json", "thiserror", @@ -343,9 +347,9 @@ dependencies = [ [[package]] name = "aws-credential-types" -version = "1.1.4" +version = "1.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33cc49dcdd31c8b6e79850a179af4c367669150c7ac0135f176c61bec81a70f7" +checksum = "fa8587ae17c8e967e4b05a62d495be2fb7701bec52a97f7acfe8a29f938384c8" dependencies = [ "aws-smithy-async", "aws-smithy-runtime-api", @@ -355,9 +359,9 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.1.4" +version = 
"1.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb031bff99877c26c28895766f7bb8484a05e24547e370768d6cc9db514662aa" +checksum = "b13dc54b4b49f8288532334bba8f87386a40571c47c37b1304979b556dc613c8" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -377,6 +381,29 @@ dependencies = [ "uuid", ] +[[package]] +name = "aws-sdk-iam" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8ae76026bfb1b80a6aed0bb400c1139cd9c0563e26bce1986cd021c6a968c7b" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-query", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-smithy-xml", + "aws-types", + "http 0.2.9", + "once_cell", + "regex-lite", + "tracing", +] + [[package]] name = "aws-sdk-s3" version = "1.14.0" @@ -498,9 +525,9 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "1.1.4" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c371c6b0ac54d4605eb6f016624fb5c7c2925d315fdf600ac1bf21b19d5f1742" +checksum = "11d6f29688a4be9895c0ba8bef861ad0c0dac5c15e9618b9b7a6c233990fc263" dependencies = [ "aws-credential-types", "aws-smithy-eventstream", @@ -513,7 +540,7 @@ dependencies = [ "hex", "hmac", "http 0.2.9", - "http 1.0.0", + "http 1.1.0", "once_cell", "p256", "percent-encoding", @@ -527,9 +554,9 @@ dependencies = [ [[package]] name = "aws-smithy-async" -version = "1.1.4" +version = "1.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72ee2d09cce0ef3ae526679b522835d63e75fb427aca5413cd371e490d52dcc6" +checksum = "d26ea8fa03025b2face2b3038a63525a10891e3d8829901d502e5384a0d8cd46" dependencies = [ "futures-util", "pin-project-lite", @@ -570,9 +597,9 @@ dependencies = [ [[package]] name = "aws-smithy-http" -version = "0.60.4" +version = "0.60.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dab56aea3cd9e1101a0a999447fb346afb680ab1406cebc44b32346e25b4117d" +checksum = "3f10fa66956f01540051b0aa7ad54574640f748f9839e843442d99b970d3aff9" dependencies = [ "aws-smithy-eventstream", "aws-smithy-runtime-api", @@ -591,18 +618,18 @@ dependencies = [ [[package]] name = "aws-smithy-json" -version = "0.60.4" +version = "0.60.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd3898ca6518f9215f62678870064398f00031912390efd03f1f6ef56d83aa8e" +checksum = "4683df9469ef09468dad3473d129960119a0d3593617542b7d52086c8486f2d6" dependencies = [ "aws-smithy-types", ] [[package]] name = "aws-smithy-query" -version = "0.60.4" +version = "0.60.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bda4b1dfc9810e35fba8a620e900522cd1bd4f9578c446e82f49d1ce41d2e9f9" +checksum = "f2fbd61ceb3fe8a1cb7352e42689cec5335833cd9f94103a61e98f9bb61c64bb" dependencies = [ "aws-smithy-types", "urlencoding", @@ -610,9 +637,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.1.4" +version = "1.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fafdab38f40ad7816e7da5dec279400dd505160780083759f01441af1bbb10ea" +checksum = "ec81002d883e5a7fd2bb063d6fb51c4999eb55d404f4fff3dd878bf4733b9f01" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -635,14 +662,15 @@ dependencies = [ [[package]] name = "aws-smithy-runtime-api" -version = "1.1.4" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "c18276dd28852f34b3bf501f4f3719781f4999a51c7bff1a5c6dc8c4529adc29" +checksum = "9acb931e0adaf5132de878f1398d83f8677f90ba70f01f65ff87f6d7244be1c5" dependencies = [ "aws-smithy-async", "aws-smithy-types", "bytes", "http 0.2.9", + "http 1.1.0", "pin-project-lite", "tokio", "tracing", @@ -651,9 +679,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.1.4" +version = "1.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb3e134004170d3303718baa2a4eb4ca64ee0a1c0a7041dca31b38be0fb414f3" +checksum = "abe14dceea1e70101d38fbf2a99e6a34159477c0fb95e68e05c66bd7ae4c3729" dependencies = [ "base64-simd", "bytes", @@ -674,18 +702,18 @@ dependencies = [ [[package]] name = "aws-smithy-xml" -version = "0.60.4" +version = "0.60.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8604a11b25e9ecaf32f9aa56b9fe253c5e2f606a3477f0071e96d3155a5ed218" +checksum = "872c68cf019c0e4afc5de7753c4f7288ce4b71663212771bf5e4542eb9346ca9" dependencies = [ "xmlparser", ] [[package]] name = "aws-types" -version = "1.1.4" +version = "1.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "789bbe008e65636fe1b6dbbb374c40c8960d1232b96af5ff4aec349f9c4accf4" +checksum = "0dbf2f3da841a8930f159163175cf6a3d16ddde517c1b0fba7aa776822800f40" dependencies = [ "aws-credential-types", "aws-smithy-async", @@ -2392,9 +2420,9 @@ dependencies = [ [[package]] name = "http" -version = "1.0.0" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b32afd38673a8016f7c9ae69e5af41a58f81b1d31689040f2f1959594ce194ea" +checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258" dependencies = [ "bytes", "fnv", @@ -2494,7 +2522,7 @@ dependencies = [ "hyper", "log", "rustls 0.21.9", - "rustls-native-certs", + "rustls-native-certs 0.6.2", "tokio", "tokio-rustls 0.24.0", ] @@ -2880,6 +2908,35 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" +[[package]] +name = "measured" +version = "0.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f246648d027839a34b420e27c7de1165ace96e19ef894985d0a6ff89a7840a9f" +dependencies = [ + "bytes", + "hashbrown 0.14.0", + "itoa", + "lasso", + "measured-derive", + "memchr", + "parking_lot 0.12.1", + "rustc-hash", + "ryu", +] + +[[package]] +name = "measured-derive" +version = "0.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edaa5cc22d99d5d6d7d99c3b5b5f7e7f8034c22f1b5d62a1adecd2ed005d9b80" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.52", +] + [[package]] name = "memchr" version = "2.6.4" @@ -3901,7 +3958,7 @@ dependencies = [ [[package]] name = "postgres" version = "0.19.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "bytes", "fallible-iterator", @@ -3914,7 +3971,7 @@ dependencies = [ [[package]] name = "postgres-native-tls" version = "0.5.0" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "native-tls", "tokio", @@ -3925,7 +3982,7 @@ 
dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "base64 0.20.0", "byteorder", @@ -3938,12 +3995,13 @@ dependencies = [ "rand 0.8.5", "sha2", "stringprep", + "tokio", ] [[package]] name = "postgres-types" version = "0.2.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "bytes", "fallible-iterator", @@ -4165,6 +4223,10 @@ version = "0.1.0" dependencies = [ "anyhow", "async-trait", + "aws-config", + "aws-sdk-iam", + "aws-sigv4", + "aws-types", "base64 0.13.1", "bstr", "bytes", @@ -4175,6 +4237,7 @@ dependencies = [ "consumption_metrics", "dashmap", "env_logger", + "fallible-iterator", "futures", "git-version", "hashbrown 0.13.2", @@ -4182,6 +4245,7 @@ dependencies = [ "hex", "hmac", "hostname", + "http 1.1.0", "humantime", "hyper", "hyper-tungstenite", @@ -4225,6 +4289,7 @@ dependencies = [ "smallvec", "smol_str", "socket2 0.5.5", + "subtle", "sync_wrapper", "task-local-extensions", "thiserror", @@ -4396,9 +4461,9 @@ dependencies = [ [[package]] name = "redis" -version = "0.24.0" +version = "0.25.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c580d9cbbe1d1b479e8d67cf9daf6a62c957e6846048408b80b43ac3f6af84cd" +checksum = "71d64e978fd98a0e6b105d066ba4889a7301fca65aeac850a877d8797343feeb" dependencies = [ "async-trait", "bytes", @@ -4407,15 +4472,15 @@ dependencies = [ "itoa", "percent-encoding", "pin-project-lite", - "rustls 0.21.9", - "rustls-native-certs", - "rustls-pemfile 1.0.2", - "rustls-webpki 0.101.7", + "rustls 0.22.2", + "rustls-native-certs 0.7.0", + "rustls-pemfile 2.1.1", + "rustls-pki-types", "ryu", "sha1_smol", - "socket2 0.4.9", + "socket2 0.5.5", "tokio", - "tokio-rustls 0.24.0", + "tokio-rustls 0.25.0", "tokio-util", "url", ] @@ -4844,6 +4909,19 @@ dependencies = [ "security-framework", ] +[[package]] +name = "rustls-native-certs" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f1fb85efa936c42c6d5fc28d2629bb51e4b2f4b8a5211e297d599cc5a093792" +dependencies = [ + "openssl-probe", + "rustls-pemfile 2.1.1", + "rustls-pki-types", + "schannel", + "security-framework", +] + [[package]] name = "rustls-pemfile" version = "1.0.2" @@ -5346,13 +5424,23 @@ checksum = "ae1a47186c03a32177042e55dbc5fd5aee900b8e0069a8d70fba96a9375cd012" [[package]] name = "sha2" -version = "0.10.6" +version = "0.10.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0" +checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8" dependencies = [ "cfg-if", "cpufeatures", "digest", + "sha2-asm", +] + +[[package]] +name = "sha2-asm" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f27ba7066011e3fb30d808b51affff34f0a66d3a03a58edd787c6e420e40e44e" +dependencies = [ + "cc", ] [[package]] @@ -5935,7 +6023,7 @@ dependencies = [ [[package]] name = "tokio-postgres" version = "0.7.7" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f" 
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "async-trait", "byteorder", @@ -6101,7 +6189,7 @@ dependencies = [ "percent-encoding", "pin-project", "prost", - "rustls-native-certs", + "rustls-native-certs 0.6.2", "rustls-pemfile 1.0.2", "tokio", "tokio-rustls 0.24.0", @@ -6468,6 +6556,7 @@ version = "0.1.0" dependencies = [ "anyhow", "arc-swap", + "async-compression", "async-trait", "bincode", "byteorder", @@ -6506,12 +6595,14 @@ dependencies = [ "thiserror", "tokio", "tokio-stream", + "tokio-tar", "tokio-util", "tracing", "tracing-error", "tracing-subscriber", "url", "uuid", + "walkdir", "workspace_hack", ] @@ -6983,7 +7074,6 @@ dependencies = [ "aws-sigv4", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-runtime-api", "aws-smithy-types", "axum", "base64 0.21.1", @@ -7029,6 +7119,7 @@ dependencies = [ "scopeguard", "serde", "serde_json", + "sha2", "smallvec", "subtle", "syn 1.0.109", diff --git a/Cargo.toml b/Cargo.toml index 76f4ff041c..4dda63ff58 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -53,9 +53,12 @@ async-trait = "0.1" aws-config = { version = "1.1.4", default-features = false, features=["rustls"] } aws-sdk-s3 = "1.14" aws-sdk-secretsmanager = { version = "1.14.0" } +aws-sdk-iam = "1.15.0" aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] } aws-smithy-types = "1.1.4" aws-credential-types = "1.1.4" +aws-sigv4 = { version = "1.2.0", features = ["sign-http"] } +aws-types = "1.1.7" axum = { version = "0.6.20", features = ["ws"] } base64 = "0.13.0" bincode = "1.3" @@ -76,6 +79,7 @@ either = "1.8" enum-map = "2.4.2" enumset = "1.0.12" fail = "0.5.0" +fallible-iterator = "0.2" fs2 = "0.4.3" futures = "0.3" futures-core = "0.3" @@ -88,6 +92,7 @@ hex = "0.4" hex-literal = "0.4" hmac = "0.12.1" hostname = "0.3.1" +http = {version = "1.1.0", features = ["std"]} http-types = { version = "2", default-features = false } humantime = "2.1" humantime-serde = "1.1.1" @@ -101,6 +106,7 @@ lasso = "0.7" leaky-bucket = "1.0.1" libc = "0.2" md5 = "0.7.0" +measured = { version = "0.0.13", features=["default", "lasso"] } memoffset = "0.8" native-tls = "0.2" nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] } @@ -120,7 +126,7 @@ procfs = "0.14" prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency prost = "0.11" rand = "0.8" -redis = { version = "0.24.0", features = ["tokio-rustls-comp", "keep-alive"] } +redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] } regex = "1.10.2" reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] } reqwest-tracing = { version = "0.4.7", features = ["opentelemetry_0_20"] } @@ -148,6 +154,7 @@ smol_str = { version = "0.2.0", features = ["serde"] } socket2 = "0.5" strum = "0.24" strum_macros = "0.24" +"subtle" = "2.5.0" svg_fmt = "0.4.1" sync_wrapper = "0.1.2" tar = "0.4" diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index 3a452fec32..1ed6f87473 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -135,7 +135,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.76.0 +ENV RUSTC_VERSION=1.77.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname 
-m)-unknown-linux-gnu/rustup-init && whoami && \ @@ -149,7 +149,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux cargo install --git https://github.com/paritytech/cachepot && \ cargo install rustfilt && \ cargo install cargo-hakari && \ - cargo install cargo-deny && \ + cargo install cargo-deny --locked && \ cargo install cargo-hack && \ cargo install cargo-nextest && \ rm -rf /home/nonroot/.cargo/registry && \ diff --git a/compute_tools/README.md b/compute_tools/README.md index 22a7de7cb7..8d84031efc 100644 --- a/compute_tools/README.md +++ b/compute_tools/README.md @@ -32,6 +32,29 @@ compute_ctl -D /var/db/postgres/compute \ -b /usr/local/bin/postgres ``` +## State Diagram + +Computes can be in various states. Below is a diagram that details how a +compute moves between states. + +```mermaid +%% https://mermaid.js.org/syntax/stateDiagram.html +stateDiagram-v2 + [*] --> Empty : Compute spawned + Empty --> ConfigurationPending : Waiting for compute spec + ConfigurationPending --> Configuration : Received compute spec + Configuration --> Failed : Failed to configure the compute + Configuration --> Running : Compute has been configured + Empty --> Init : Compute spec is immediately available + Empty --> TerminationPending : Requested termination + Init --> Failed : Failed to start Postgres + Init --> Running : Started Postgres + Running --> TerminationPending : Requested termination + TerminationPending --> Terminated : Terminated compute + Failed --> [*] : Compute exited + Terminated --> [*] : Compute exited +``` + ## Tests Cargo formatter: diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 42b8480211..f1fd8637f5 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -17,6 +17,7 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> { .write(true) .create(true) .append(false) + .truncate(false) .open(path)?; let buf = io::BufReader::new(&file); let mut count: usize = 0; diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml index f78f56c480..34882659e3 100644 --- a/control_plane/attachment_service/Cargo.toml +++ b/control_plane/attachment_service/Cargo.toml @@ -17,6 +17,7 @@ testing = [] anyhow.workspace = true aws-config.workspace = true aws-sdk-secretsmanager.workspace = true +bytes.workspace = true camino.workspace = true clap.workspace = true fail.workspace = true @@ -25,17 +26,20 @@ git-version.workspace = true hex.workspace = true hyper.workspace = true humantime.workspace = true +lasso.workspace = true once_cell.workspace = true pageserver_api.workspace = true pageserver_client.workspace = true postgres_connection.workspace = true reqwest.workspace = true +routerify.workspace = true serde.workspace = true serde_json.workspace = true thiserror.workspace = true tokio.workspace = true tokio-util.workspace = true tracing.workspace = true +measured.workspace = true diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] } diesel_migrations = { version = "2.1.0" } diff --git a/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/down.sql b/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/down.sql new file mode 100644 index 0000000000..897c7e0d01 --- /dev/null +++ b/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/down.sql @@ -0,0 +1,3 @@ + +UPDATE tenant_shards set placement_policy='{"Double": 1}' where placement_policy='{"Attached": 1}'; +UPDATE tenant_shards 
set placement_policy='"Single"' where placement_policy='{"Attached": 0}'; \ No newline at end of file diff --git a/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/up.sql b/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/up.sql new file mode 100644 index 0000000000..c898ac9aee --- /dev/null +++ b/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/up.sql @@ -0,0 +1,3 @@ + +UPDATE tenant_shards set placement_policy='{"Attached": 1}' where placement_policy='{"Double": 1}'; +UPDATE tenant_shards set placement_policy='{"Attached": 0}' where placement_policy='"Single"'; \ No newline at end of file diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs index 45ee354822..036019cd38 100644 --- a/control_plane/attachment_service/src/http.rs +++ b/control_plane/attachment_service/src/http.rs @@ -1,5 +1,11 @@ +use crate::metrics::{ + HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup, + METRICS_REGISTRY, +}; use crate::reconciler::ReconcileError; use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT}; +use futures::Future; +use hyper::header::CONTENT_TYPE; use hyper::{Body, Request, Response}; use hyper::{StatusCode, Uri}; use pageserver_api::models::{ @@ -34,6 +40,8 @@ use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest}; use control_plane::storage_controller::{AttachHookRequest, InspectRequest}; +use routerify::Middleware; + /// State available to HTTP request handlers #[derive(Clone)] pub struct HttpState { @@ -313,7 +321,7 @@ async fn handle_tenant_timeline_passthrough( tracing::info!("Proxying request for tenant {} ({})", tenant_id, path); // Find the node that holds shard zero - let (base_url, tenant_shard_id) = service.tenant_shard0_baseurl(tenant_id)?; + let (node, tenant_shard_id) = service.tenant_shard0_node(tenant_id)?; // Callers will always pass an unsharded tenant ID. Before proxying, we must // rewrite this to a shard-aware shard zero ID. @@ -322,12 +330,39 @@ let tenant_shard_str = format!("{}", tenant_shard_id); let path = path.replace(&tenant_str, &tenant_shard_str); - let client = mgmt_api::Client::new(base_url, service.get_config().jwt_token.as_deref()); + let latency = &METRICS_REGISTRY + .metrics_group + .storage_controller_passthrough_request_latency; + + // This is a bit awkward. We remove the param from the request + // and join the words by '_' to get a label for the request. + let just_path = path.replace(&tenant_shard_str, ""); + let path_label = just_path + .split('/') + .filter(|token| !token.is_empty()) + .collect::<Vec<_>>() + .join("_"); + let labels = PageserverRequestLabelGroup { + pageserver_id: &node.get_id().to_string(), + path: &path_label, + method: crate::metrics::Method::Get, + }; + + let _timer = latency.start_timer(labels.clone()); + + let client = mgmt_api::Client::new(node.base_url(), service.get_config().jwt_token.as_deref()); + let resp = client.get_raw(path).await.map_err(|_e| // FIXME: give ApiError a proper Unavailable variant. We return 503 here because // if we can't successfully send a request to the pageserver, we aren't available. 
ApiError::ShuttingDown)?; + if !resp.status().is_success() { + let error_counter = &METRICS_REGISTRY + .metrics_group + .storage_controller_passthrough_request_error; + error_counter.inc(labels); + } + // We have a reqwest::Response, would like an http::Response let mut builder = hyper::Response::builder() .status(resp.status()) @@ -353,6 +388,16 @@ async fn handle_tenant_locate( json_response(StatusCode::OK, service.tenant_locate(tenant_id)?) } +async fn handle_tenant_describe( + service: Arc<Service>, + req: Request<Body>, +) -> Result<Response<Body>, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + json_response(StatusCode::OK, service.tenant_describe(tenant_id)?) +} + async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> { check_permissions(&req, Scope::Admin)?; @@ -488,7 +533,11 @@ impl From<ReconcileError> for ApiError { /// Common wrapper for request handlers that call into Service and will operate on tenants: they must only /// be allowed to run if Service has finished its initial reconciliation. -async fn tenant_service_handler<R, H>(request: Request<Body>, handler: H) -> R::Output +async fn tenant_service_handler<R, H>( + request: Request<Body>, + handler: H, + request_name: RequestName, +) -> R::Output where R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static, H: FnOnce(Arc<Service>, Request<Body>) -> R + Send + Sync + 'static, @@ -508,9 +557,10 @@ where )); } - request_span( + named_request_span( request, |request| async move { handler(service, request).await }, + request_name, ) .await } @@ -521,11 +571,98 @@ fn check_permissions(request: &Request<Body>, required_scope: Scope) -> Result<(), ApiError> { }) } +#[derive(Clone, Debug)] +struct RequestMeta { + method: hyper::http::Method, + at: Instant, +} + +fn prologue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>( +) -> Middleware<B, ApiError> { + Middleware::pre(move |req| async move { + let meta = RequestMeta { + method: req.method().clone(), + at: Instant::now(), + }; + + req.set_context(meta); + + Ok(req) + }) +} + +fn epilogue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>( +) -> Middleware<B, ApiError> { + Middleware::post_with_info(move |resp, req_info| async move { + let request_name = match req_info.context::<RequestName>() { + Some(name) => name, + None => { + return Ok(resp); + } + }; + + if let Some(meta) = req_info.context::<RequestMeta>() { + let status = &crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_http_request_status; + let latency = &crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_http_request_latency; + + status.inc(HttpRequestStatusLabelGroup { + path: request_name.0, + method: meta.method.clone().into(), + status: crate::metrics::StatusCode(resp.status()), + }); + + latency.observe( + HttpRequestLatencyLabelGroup { + path: request_name.0, + method: meta.method.into(), + }, + meta.at.elapsed().as_secs_f64(), + ); + } + Ok(resp) + }) +} + +pub async fn measured_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> { + pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4"; + + let payload = crate::metrics::METRICS_REGISTRY.encode(); + let response = Response::builder() + .status(200) + .header(CONTENT_TYPE, TEXT_FORMAT) + .body(payload.into()) + .unwrap(); + + Ok(response) +} + +#[derive(Clone)] +struct RequestName(&'static str); + +async fn named_request_span<R, H>( + request: Request<Body>, + handler: H, + name: RequestName, +) -> R::Output +where + R: Future<Output = Result<Response<Body>, ApiError>> + Send + 'static, + H: FnOnce(Request<Body>) -> R + Send + Sync + 'static, +{ + request.set_context(name); + request_span(request, handler).await +} + pub fn make_router( service: Arc<Service>, auth: Option<Arc<SwappableJwtAuth>>, ) -> RouterBuilder<hyper::Body, ApiError> { - let 
mut router = endpoint::make_router(); + let mut router = endpoint::make_router() + .middleware(prologue_metrics_middleware()) + .middleware(epilogue_metrics_middleware()); if auth.is_some() { router = router.middleware(auth_middleware(|request| { let state = get_state(request); @@ -534,96 +671,166 @@ pub fn make_router( } else { state.auth.as_deref() } - })) + })); } router .data(Arc::new(HttpState::new(service, auth))) + .get("/metrics", |r| { + named_request_span(r, measured_metrics_handler, RequestName("metrics")) + }) // Non-prefixed generic endpoints (status, metrics) - .get("/status", |r| request_span(r, handle_status)) - .get("/ready", |r| request_span(r, handle_ready)) + .get("/status", |r| { + named_request_span(r, handle_status, RequestName("status")) + }) + .get("/ready", |r| { + named_request_span(r, handle_ready, RequestName("ready")) + }) // Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix .post("/upcall/v1/re-attach", |r| { - request_span(r, handle_re_attach) + named_request_span(r, handle_re_attach, RequestName("upcall_v1_reattach")) + }) + .post("/upcall/v1/validate", |r| { + named_request_span(r, handle_validate, RequestName("upcall_v1_validate")) }) - .post("/upcall/v1/validate", |r| request_span(r, handle_validate)) // Test/dev/debug endpoints .post("/debug/v1/attach-hook", |r| { - request_span(r, handle_attach_hook) + named_request_span(r, handle_attach_hook, RequestName("debug_v1_attach_hook")) + }) + .post("/debug/v1/inspect", |r| { + named_request_span(r, handle_inspect, RequestName("debug_v1_inspect")) }) - .post("/debug/v1/inspect", |r| request_span(r, handle_inspect)) .post("/debug/v1/tenant/:tenant_id/drop", |r| { - request_span(r, handle_tenant_drop) + named_request_span(r, handle_tenant_drop, RequestName("debug_v1_tenant_drop")) }) .post("/debug/v1/node/:node_id/drop", |r| { - request_span(r, handle_node_drop) + named_request_span(r, handle_node_drop, RequestName("debug_v1_node_drop")) + }) + .get("/debug/v1/tenant", |r| { + named_request_span(r, handle_tenants_dump, RequestName("debug_v1_tenant")) + }) + .get("/debug/v1/tenant/:tenant_id/locate", |r| { + tenant_service_handler( + r, + handle_tenant_locate, + RequestName("debug_v1_tenant_locate"), + ) }) - .get("/debug/v1/tenant", |r| request_span(r, handle_tenants_dump)) .get("/debug/v1/scheduler", |r| { - request_span(r, handle_scheduler_dump) + named_request_span(r, handle_scheduler_dump, RequestName("debug_v1_scheduler")) }) .post("/debug/v1/consistency_check", |r| { - request_span(r, handle_consistency_check) + named_request_span( + r, + handle_consistency_check, + RequestName("debug_v1_consistency_check"), + ) }) .put("/debug/v1/failpoints", |r| { request_span(r, |r| failpoints_handler(r, CancellationToken::new())) }) - .get("/control/v1/tenant/:tenant_id/locate", |r| { - tenant_service_handler(r, handle_tenant_locate) - }) // Node operations .post("/control/v1/node", |r| { - request_span(r, handle_node_register) + named_request_span(r, handle_node_register, RequestName("control_v1_node")) + }) + .get("/control/v1/node", |r| { + named_request_span(r, handle_node_list, RequestName("control_v1_node")) }) - .get("/control/v1/node", |r| request_span(r, handle_node_list)) .put("/control/v1/node/:node_id/config", |r| { - request_span(r, handle_node_configure) + named_request_span( + r, + handle_node_configure, + RequestName("control_v1_node_config"), + ) }) // Tenant Shard operations .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| { - 
tenant_service_handler(r, handle_tenant_shard_migrate) + tenant_service_handler( + r, + handle_tenant_shard_migrate, + RequestName("control_v1_tenant_migrate"), + ) }) .put("/control/v1/tenant/:tenant_id/shard_split", |r| { - tenant_service_handler(r, handle_tenant_shard_split) + tenant_service_handler( + r, + handle_tenant_shard_split, + RequestName("control_v1_tenant_shard_split"), + ) + }) + .get("/control/v1/tenant/:tenant_id", |r| { + tenant_service_handler( + r, + handle_tenant_describe, + RequestName("control_v1_tenant_describe"), + ) }) // Tenant operations // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity. .post("/v1/tenant", |r| { - tenant_service_handler(r, handle_tenant_create) + tenant_service_handler(r, handle_tenant_create, RequestName("v1_tenant")) }) .delete("/v1/tenant/:tenant_id", |r| { - tenant_service_handler(r, handle_tenant_delete) + tenant_service_handler(r, handle_tenant_delete, RequestName("v1_tenant")) }) .put("/v1/tenant/config", |r| { - tenant_service_handler(r, handle_tenant_config_set) + tenant_service_handler(r, handle_tenant_config_set, RequestName("v1_tenant_config")) }) .get("/v1/tenant/:tenant_id/config", |r| { - tenant_service_handler(r, handle_tenant_config_get) + tenant_service_handler(r, handle_tenant_config_get, RequestName("v1_tenant_config")) }) .put("/v1/tenant/:tenant_shard_id/location_config", |r| { - tenant_service_handler(r, handle_tenant_location_config) + tenant_service_handler( + r, + handle_tenant_location_config, + RequestName("v1_tenant_location_config"), + ) }) .put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| { - tenant_service_handler(r, handle_tenant_time_travel_remote_storage) + tenant_service_handler( + r, + handle_tenant_time_travel_remote_storage, + RequestName("v1_tenant_time_travel_remote_storage"), + ) }) .post("/v1/tenant/:tenant_id/secondary/download", |r| { - tenant_service_handler(r, handle_tenant_secondary_download) + tenant_service_handler( + r, + handle_tenant_secondary_download, + RequestName("v1_tenant_secondary_download"), + ) }) // Timeline operations .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { - tenant_service_handler(r, handle_tenant_timeline_delete) + tenant_service_handler( + r, + handle_tenant_timeline_delete, + RequestName("v1_tenant_timeline"), + ) }) .post("/v1/tenant/:tenant_id/timeline", |r| { - tenant_service_handler(r, handle_tenant_timeline_create) + tenant_service_handler( + r, + handle_tenant_timeline_create, + RequestName("v1_tenant_timeline"), + ) }) // Tenant detail GET passthrough to shard zero .get("/v1/tenant/:tenant_id", |r| { - tenant_service_handler(r, handle_tenant_timeline_passthrough) + tenant_service_handler( + r, + handle_tenant_timeline_passthrough, + RequestName("v1_tenant_passthrough"), + ) }) // Timeline GET passthrough to shard zero. Note that the `*` in the URL is a wildcard: any future // timeline GET APIs will be implicitly included. 
.get("/v1/tenant/:tenant_id/timeline*", |r| { - tenant_service_handler(r, handle_tenant_timeline_passthrough) + tenant_service_handler( + r, + handle_tenant_timeline_passthrough, + RequestName("v1_tenant_timeline_passthrough"), + ) }) } diff --git a/control_plane/attachment_service/src/lib.rs b/control_plane/attachment_service/src/lib.rs index 4aff29f15b..8bcd5c0ac4 100644 --- a/control_plane/attachment_service/src/lib.rs +++ b/control_plane/attachment_service/src/lib.rs @@ -8,6 +8,7 @@ pub mod http; mod id_lock_map; pub mod metrics; mod node; +mod pageserver_client; pub mod persistence; mod reconciler; mod scheduler; diff --git a/control_plane/attachment_service/src/metrics.rs b/control_plane/attachment_service/src/metrics.rs index ffe093b9c8..ccf5e9b07c 100644 --- a/control_plane/attachment_service/src/metrics.rs +++ b/control_plane/attachment_service/src/metrics.rs @@ -1,32 +1,284 @@ -use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; +//! +//! This module provides metric definitions for the storage controller. +//! +//! All metrics are grouped in [`StorageControllerMetricGroup`]. [`StorageControllerMetrics`] holds +//! the mentioned metrics and their encoder. It's globally available via the [`METRICS_REGISTRY`] +//! static. +//! +//! The rest of the code defines label group types and deals with converting outer types to labels. +//! +use bytes::Bytes; +use measured::{ + label::{LabelValue, StaticLabelSet}, + FixedCardinalityLabel, MetricGroup, +}; use once_cell::sync::Lazy; +use std::sync::Mutex; -pub(crate) struct ReconcilerMetrics { - pub(crate) spawned: IntCounter, - pub(crate) complete: IntCounterVec, -} +use crate::persistence::{DatabaseError, DatabaseOperation}; -impl ReconcilerMetrics { - // Labels used on [`Self::complete`] - pub(crate) const SUCCESS: &'static str = "ok"; - pub(crate) const ERROR: &'static str = "success"; - pub(crate) const CANCEL: &'static str = "cancel"; -} - -pub(crate) static RECONCILER: Lazy<ReconcilerMetrics> = Lazy::new(|| ReconcilerMetrics { - spawned: register_int_counter!( - "storage_controller_reconcile_spawn", - "Count of how many times we spawn a reconcile task", - ) - .expect("failed to define a metric"), - complete: register_int_counter_vec!( - "storage_controller_reconcile_complete", - "Reconciler tasks completed, broken down by success/failure/cancelled", - &["status"], - ) - .expect("failed to define a metric"), -}); +pub(crate) static METRICS_REGISTRY: Lazy<StorageControllerMetrics> = + Lazy::new(StorageControllerMetrics::default); pub fn preinitialize_metrics() { - Lazy::force(&RECONCILER); + Lazy::force(&METRICS_REGISTRY); +} + +pub(crate) struct StorageControllerMetrics { + pub(crate) metrics_group: StorageControllerMetricGroup, + encoder: Mutex<measured::text::TextEncoder>, +} + +#[derive(measured::MetricGroup)] +pub(crate) struct StorageControllerMetricGroup { + /// Count of how many times we spawn a reconcile task + pub(crate) storage_controller_reconcile_spawn: measured::Counter, + /// Reconciler tasks completed, broken down by success/failure/cancelled + pub(crate) storage_controller_reconcile_complete: + measured::CounterVec<ReconcileCompleteLabelGroupSet>, + + /// HTTP request status counters for handled requests + pub(crate) storage_controller_http_request_status: + measured::CounterVec<HttpRequestStatusLabelGroupSet>, + /// HTTP request handler latency across all status codes + pub(crate) storage_controller_http_request_latency: + measured::HistogramVec<HttpRequestLatencyLabelGroupSet, 5>, + + /// Count of HTTP requests to the pageserver that resulted in an error, + /// broken down by the pageserver node id, request name and method + pub(crate) 
storage_controller_pageserver_request_error: + measured::CounterVec<PageserverRequestLabelGroupSet>, + + /// Latency of HTTP requests to the pageserver, broken down by pageserver + /// node id, request name and method. This includes both successful and unsuccessful + /// requests. + pub(crate) storage_controller_pageserver_request_latency: + measured::HistogramVec<PageserverRequestLabelGroupSet, 5>, + + /// Count of pass-through HTTP requests to the pageserver that resulted in an error, + /// broken down by the pageserver node id, request name and method + pub(crate) storage_controller_passthrough_request_error: + measured::CounterVec<PageserverRequestLabelGroupSet>, + + /// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver + /// node id, request name and method. This includes both successful and unsuccessful + /// requests. + pub(crate) storage_controller_passthrough_request_latency: + measured::HistogramVec<PageserverRequestLabelGroupSet, 5>, + + /// Count of errors in database queries, broken down by error type and operation. + pub(crate) storage_controller_database_query_error: + measured::CounterVec<DatabaseQueryErrorLabelGroupSet>, + + /// Latency of database queries, broken down by operation. + pub(crate) storage_controller_database_query_latency: + measured::HistogramVec<DatabaseQueryLatencyLabelGroupSet, 5>, +} + +impl StorageControllerMetrics { + pub(crate) fn encode(&self) -> Bytes { + let mut encoder = self.encoder.lock().unwrap(); + self.metrics_group.collect_into(&mut *encoder); + encoder.finish() + } +} + +impl Default for StorageControllerMetrics { + fn default() -> Self { + Self { + metrics_group: StorageControllerMetricGroup::new(), + encoder: Mutex::new(measured::text::TextEncoder::new()), + } + } +} + +impl StorageControllerMetricGroup { + pub(crate) fn new() -> Self { + Self { + storage_controller_reconcile_spawn: measured::Counter::new(), + storage_controller_reconcile_complete: measured::CounterVec::new( + ReconcileCompleteLabelGroupSet { + status: StaticLabelSet::new(), + }, + ), + storage_controller_http_request_status: measured::CounterVec::new( + HttpRequestStatusLabelGroupSet { + path: lasso::ThreadedRodeo::new(), + method: StaticLabelSet::new(), + status: StaticLabelSet::new(), + }, + ), + storage_controller_http_request_latency: measured::HistogramVec::new( + measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), + ), + storage_controller_pageserver_request_error: measured::CounterVec::new( + PageserverRequestLabelGroupSet { + pageserver_id: lasso::ThreadedRodeo::new(), + path: lasso::ThreadedRodeo::new(), + method: StaticLabelSet::new(), + }, + ), + storage_controller_pageserver_request_latency: measured::HistogramVec::new( + measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), + ), + storage_controller_passthrough_request_error: measured::CounterVec::new( + PageserverRequestLabelGroupSet { + pageserver_id: lasso::ThreadedRodeo::new(), + path: lasso::ThreadedRodeo::new(), + method: StaticLabelSet::new(), + }, + ), + storage_controller_passthrough_request_latency: measured::HistogramVec::new( + measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), + ), + storage_controller_database_query_error: measured::CounterVec::new( + DatabaseQueryErrorLabelGroupSet { + operation: StaticLabelSet::new(), + error_type: StaticLabelSet::new(), + }, + ), + storage_controller_database_query_latency: measured::HistogramVec::new( + measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), + ), + } + } +} + +#[derive(measured::LabelGroup)] +#[label(set = ReconcileCompleteLabelGroupSet)] +pub(crate) struct ReconcileCompleteLabelGroup { + pub(crate) status: ReconcileOutcome, +} + 
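A note on the `measured` API adopted above: unlike the `metrics`/`prometheus` macros this PR removes, label names and values are types rather than strings. A `#[derive(measured::LabelGroup)]` struct names the labels, the derive generates a `*Set` type (named via `set = ...`) that stores the label values, and each metric vec is parameterised by that set. A minimal self-contained sketch of the pattern, assuming the `measured` 0.0.13 API exactly as used in this diff (the `Outcome` and `ExampleLabelGroup` names are invented for illustration):

```rust
use measured::label::StaticLabelSet;

// Illustrative fixed-cardinality label, mirroring `ReconcileOutcome` above.
#[derive(measured::FixedCardinalityLabel)]
enum Outcome {
    #[label(rename = "ok")]
    Success,
    Error,
}

// The derive generates the `ExampleLabelGroupSet` type named by `set = ...`.
#[derive(measured::LabelGroup)]
#[label(set = ExampleLabelGroupSet)]
struct ExampleLabelGroup {
    status: Outcome,
}

fn main() {
    // A counter vec is parameterised by the label set, not by label-name strings.
    let complete: measured::CounterVec<ExampleLabelGroupSet> =
        measured::CounterVec::new(ExampleLabelGroupSet {
            status: StaticLabelSet::new(),
        });

    // Incrementing takes a typed label group, so a typo in a label name is a
    // compile error rather than an accidental new time series.
    complete.inc(ExampleLabelGroup {
        status: Outcome::Success,
    });
}
```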
+#[derive(measured::LabelGroup)] +#[label(set = HttpRequestStatusLabelGroupSet)] +pub(crate) struct HttpRequestStatusLabelGroup<'a> { + #[label(dynamic_with = lasso::ThreadedRodeo)] + pub(crate) path: &'a str, + pub(crate) method: Method, + pub(crate) status: StatusCode, +} + +#[derive(measured::LabelGroup)] +#[label(set = HttpRequestLatencyLabelGroupSet)] +pub(crate) struct HttpRequestLatencyLabelGroup<'a> { + #[label(dynamic_with = lasso::ThreadedRodeo)] + pub(crate) path: &'a str, + pub(crate) method: Method, +} + +impl Default for HttpRequestLatencyLabelGroupSet { + fn default() -> Self { + Self { + path: lasso::ThreadedRodeo::new(), + method: StaticLabelSet::new(), + } + } +} + +#[derive(measured::LabelGroup, Clone)] +#[label(set = PageserverRequestLabelGroupSet)] +pub(crate) struct PageserverRequestLabelGroup<'a> { + #[label(dynamic_with = lasso::ThreadedRodeo)] + pub(crate) pageserver_id: &'a str, + #[label(dynamic_with = lasso::ThreadedRodeo)] + pub(crate) path: &'a str, + pub(crate) method: Method, +} + +impl Default for PageserverRequestLabelGroupSet { + fn default() -> Self { + Self { + pageserver_id: lasso::ThreadedRodeo::new(), + path: lasso::ThreadedRodeo::new(), + method: StaticLabelSet::new(), + } + } +} + +#[derive(measured::LabelGroup)] +#[label(set = DatabaseQueryErrorLabelGroupSet)] +pub(crate) struct DatabaseQueryErrorLabelGroup { + pub(crate) error_type: DatabaseErrorLabel, + pub(crate) operation: DatabaseOperation, +} + +#[derive(measured::LabelGroup)] +#[label(set = DatabaseQueryLatencyLabelGroupSet)] +pub(crate) struct DatabaseQueryLatencyLabelGroup { + pub(crate) operation: DatabaseOperation, +} + +#[derive(FixedCardinalityLabel)] +pub(crate) enum ReconcileOutcome { + #[label(rename = "ok")] + Success, + Error, + Cancel, +} + +#[derive(FixedCardinalityLabel, Clone)] +pub(crate) enum Method { + Get, + Put, + Post, + Delete, + Other, +} + +impl From<hyper::Method> for Method { + fn from(value: hyper::Method) -> Self { + if value == hyper::Method::GET { + Method::Get + } else if value == hyper::Method::PUT { + Method::Put + } else if value == hyper::Method::POST { + Method::Post + } else if value == hyper::Method::DELETE { + Method::Delete + } else { + Method::Other + } + } +} + +pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode); + +impl LabelValue for StatusCode { + fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output { + v.write_int(self.0.as_u16() as u64) + } +} + +impl FixedCardinalityLabel for StatusCode { + fn cardinality() -> usize { + (100..1000).len() + } + + fn encode(&self) -> usize { + self.0.as_u16() as usize + } + + fn decode(value: usize) -> Self { + Self(hyper::http::StatusCode::from_u16(u16::try_from(value).unwrap()).unwrap()) + } +} + +#[derive(FixedCardinalityLabel)] +pub(crate) enum DatabaseErrorLabel { + Query, + Connection, + ConnectionPool, + Logical, +} + +impl DatabaseError { + pub(crate) fn error_label(&self) -> DatabaseErrorLabel { + match self { + Self::Query(_) => DatabaseErrorLabel::Query, + Self::Connection(_) => DatabaseErrorLabel::Connection, + Self::ConnectionPool(_) => DatabaseErrorLabel::ConnectionPool, + Self::Logical(_) => DatabaseErrorLabel::Logical, + } + } } diff --git a/control_plane/attachment_service/src/node.rs b/control_plane/attachment_service/src/node.rs index 4167782715..df40bff66f 100644 --- a/control_plane/attachment_service/src/node.rs +++ b/control_plane/attachment_service/src/node.rs @@ -12,7 +12,9 @@ use serde::Serialize; use tokio_util::sync::CancellationToken; use utils::{backoff, id::NodeId}; -use 
crate::{ + pageserver_client::PageserverClient, persistence::NodePersistence, scheduler::MaySchedule, +}; /// Represents the in-memory description of a Node. /// @@ -202,7 +204,7 @@ impl Node { cancel: &CancellationToken, ) -> Option<mgmt_api::Result<T>> where - O: FnMut(mgmt_api::Client) -> F, + O: FnMut(PageserverClient) -> F, F: std::future::Future<Output = mgmt_api::Result<T>>, { fn is_fatal(e: &mgmt_api::Error) -> bool { @@ -224,8 +226,12 @@ impl Node { .build() .expect("Failed to construct HTTP client"); - let client = - mgmt_api::Client::from_client(http_client, self.base_url(), jwt.as_deref()); + let client = PageserverClient::from_client( + self.get_id(), + http_client, + self.base_url(), + jwt.as_deref(), + ); let node_cancel_fut = self.cancel.cancelled(); diff --git a/control_plane/attachment_service/src/pageserver_client.rs b/control_plane/attachment_service/src/pageserver_client.rs new file mode 100644 index 0000000000..8237229d7b --- /dev/null +++ b/control_plane/attachment_service/src/pageserver_client.rs @@ -0,0 +1,203 @@ +use pageserver_api::{ + models::{ + LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress, + TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo, + }, + shard::TenantShardId, +}; +use pageserver_client::mgmt_api::{Client, Result}; +use reqwest::StatusCode; +use utils::id::{NodeId, TimelineId}; + +/// Thin wrapper around [`pageserver_client::mgmt_api::Client`]. It allows the storage +/// controller to collect metrics in a non-intrusive manner. +#[derive(Debug, Clone)] +pub(crate) struct PageserverClient { + inner: Client, + node_id_label: String, +} + +macro_rules! measured_request { + ($name:literal, $method:expr, $node_id: expr, $invoke:expr) => {{ + let labels = crate::metrics::PageserverRequestLabelGroup { + pageserver_id: $node_id, + path: $name, + method: $method, + }; + + let latency = &crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_pageserver_request_latency; + let _timer_guard = latency.start_timer(labels.clone()); + + let res = $invoke; + + if res.is_err() { + let error_counters = &crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_pageserver_request_error; + error_counters.inc(labels) + } + + res + }}; +} + +impl PageserverClient { + pub(crate) fn new(node_id: NodeId, mgmt_api_endpoint: String, jwt: Option<&str>) -> Self { + Self { + inner: Client::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt), + node_id_label: node_id.0.to_string(), + } + } + + pub(crate) fn from_client( + node_id: NodeId, + raw_client: reqwest::Client, + mgmt_api_endpoint: String, + jwt: Option<&str>, + ) -> Self { + Self { + inner: Client::from_client(raw_client, mgmt_api_endpoint, jwt), + node_id_label: node_id.0.to_string(), + } + } + + pub(crate) async fn tenant_delete(&self, tenant_shard_id: TenantShardId) -> Result<StatusCode> { + measured_request!( + "tenant", + crate::metrics::Method::Delete, + &self.node_id_label, + self.inner.tenant_delete(tenant_shard_id).await + ) + } + + pub(crate) async fn tenant_time_travel_remote_storage( + &self, + tenant_shard_id: TenantShardId, + timestamp: &str, + done_if_after: &str, + ) -> Result<()> { + measured_request!( + "tenant_time_travel_remote_storage", + crate::metrics::Method::Put, + &self.node_id_label, + self.inner + .tenant_time_travel_remote_storage(tenant_shard_id, timestamp, done_if_after) + .await + ) + } + + pub(crate) async fn tenant_secondary_download( + &self, + tenant_id: TenantShardId, 
+ wait: Option<std::time::Duration>, + ) -> Result<(StatusCode, SecondaryProgress)> { + measured_request!( + "tenant_secondary_download", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.tenant_secondary_download(tenant_id, wait).await + ) + } + + pub(crate) async fn location_config( + &self, + tenant_shard_id: TenantShardId, + config: LocationConfig, + flush_ms: Option<std::time::Duration>, + lazy: bool, + ) -> Result<()> { + measured_request!( + "location_config", + crate::metrics::Method::Put, + &self.node_id_label, + self.inner + .location_config(tenant_shard_id, config, flush_ms, lazy) + .await + ) + } + + pub(crate) async fn list_location_config(&self) -> Result<LocationConfigListResponse> { + measured_request!( + "location_configs", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.list_location_config().await + ) + } + + pub(crate) async fn get_location_config( + &self, + tenant_shard_id: TenantShardId, + ) -> Result<Option<LocationConfig>> { + measured_request!( + "location_config", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.get_location_config(tenant_shard_id).await + ) + } + + pub(crate) async fn timeline_create( + &self, + tenant_shard_id: TenantShardId, + req: &TimelineCreateRequest, + ) -> Result<TimelineInfo> { + measured_request!( + "timeline", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.timeline_create(tenant_shard_id, req).await + ) + } + + pub(crate) async fn timeline_delete( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + ) -> Result<StatusCode> { + measured_request!( + "timeline", + crate::metrics::Method::Delete, + &self.node_id_label, + self.inner + .timeline_delete(tenant_shard_id, timeline_id) + .await + ) + } + + pub(crate) async fn tenant_shard_split( + &self, + tenant_shard_id: TenantShardId, + req: TenantShardSplitRequest, + ) -> Result<TenantShardSplitResponse> { + measured_request!( + "tenant_shard_split", + crate::metrics::Method::Put, + &self.node_id_label, + self.inner.tenant_shard_split(tenant_shard_id, req).await + ) + } + + pub(crate) async fn timeline_list( + &self, + tenant_shard_id: &TenantShardId, + ) -> Result<Vec<TimelineInfo>> { + measured_request!( + "timelines", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.timeline_list(tenant_shard_id).await + ) + } + + pub(crate) async fn get_utilization(&self) -> Result<PageserverUtilization> { + measured_request!( + "utilization", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.get_utilization().await + ) + } +} diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs index 3602cf8b1f..dafd52017b 100644 --- a/control_plane/attachment_service/src/persistence.rs +++ b/control_plane/attachment_service/src/persistence.rs @@ -19,6 +19,9 @@ use serde::{Deserialize, Serialize}; use utils::generation::Generation; use utils::id::{NodeId, TenantId}; +use crate::metrics::{ + DatabaseQueryErrorLabelGroup, DatabaseQueryLatencyLabelGroup, METRICS_REGISTRY, +}; use crate::node::Node; /// ## What do we store? 
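To make the `measured_request!` macro above concrete, here is its expansion for `tenant_delete`, written by hand: start a latency timer keyed by (pageserver id, path, method), run the inner `mgmt_api` call, and bump the error counter if it failed. This is a sketch for illustration only; the `_expanded` name is invented and the real method body is generated by the macro:

```rust
impl PageserverClient {
    // Hand-expanded form of measured_request!("tenant", Method::Delete, ...).
    pub(crate) async fn tenant_delete_expanded(
        &self,
        tenant_shard_id: TenantShardId,
    ) -> Result<StatusCode> {
        let labels = crate::metrics::PageserverRequestLabelGroup {
            pageserver_id: &self.node_id_label,
            path: "tenant",
            method: crate::metrics::Method::Delete,
        };

        // The timer guard records into the latency histogram when dropped,
        // so both the success and error paths are timed.
        let latency = &crate::metrics::METRICS_REGISTRY
            .metrics_group
            .storage_controller_pageserver_request_latency;
        let _timer_guard = latency.start_timer(labels.clone());

        let res = self.inner.tenant_delete(tenant_shard_id).await;

        // Failures additionally bump the per-(pageserver, path, method) counter.
        if res.is_err() {
            let error_counters = &crate::metrics::METRICS_REGISTRY
                .metrics_group
                .storage_controller_pageserver_request_error;
            error_counters.inc(labels)
        }

        res
    }
}
```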
@@ -75,6 +78,25 @@ pub(crate) enum DatabaseError { Logical(String), } +#[derive(measured::FixedCardinalityLabel, Clone)] +pub(crate) enum DatabaseOperation { + InsertNode, + UpdateNode, + DeleteNode, + ListNodes, + BeginShardSplit, + CompleteShardSplit, + AbortShardSplit, + Detach, + ReAttach, + IncrementGeneration, + ListTenantShards, + InsertTenantShards, + UpdateTenantShard, + DeleteTenant, + UpdateTenantConfig, +} + #[must_use] pub(crate) enum AbortShardSplitStatus { /// We aborted the split in the database by reverting to the parent shards @@ -115,6 +137,34 @@ impl Persistence { } } + /// Wraps `with_conn` in order to collect latency and error metrics + async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R> + where + F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static, + R: Send + 'static, + { + let latency = &METRICS_REGISTRY + .metrics_group + .storage_controller_database_query_latency; + let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { + operation: op.clone(), + }); + + let res = self.with_conn(func).await; + + if let Err(err) = &res { + let error_counter = &METRICS_REGISTRY + .metrics_group + .storage_controller_database_query_error; + error_counter.inc(DatabaseQueryErrorLabelGroup { + error_type: err.error_label(), + operation: op, + }) + } + + res + } + /// Call the provided function in a tokio blocking thread, with a Diesel database connection. async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R> where @@ -130,21 +180,27 @@ /// When a node is first registered, persist it before using it for anything pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> { let np = node.to_persistent(); - self.with_conn(move |conn| -> DatabaseResult<()> { - diesel::insert_into(crate::schema::nodes::table) - .values(&np) - .execute(conn)?; - Ok(()) - }) + self.with_measured_conn( + DatabaseOperation::InsertNode, + move |conn| -> DatabaseResult<()> { + diesel::insert_into(crate::schema::nodes::table) + .values(&np) + .execute(conn)?; + Ok(()) + }, + ) .await } /// At startup, populate the list of nodes which our shards may be placed on pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<NodePersistence>> { let nodes: Vec<NodePersistence> = self - .with_conn(move |conn| -> DatabaseResult<_> { - Ok(crate::schema::nodes::table.load::<NodePersistence>(conn)?) - }) + .with_measured_conn( + DatabaseOperation::ListNodes, + move |conn| -> DatabaseResult<_> { + Ok(crate::schema::nodes::table.load::<NodePersistence>(conn)?) + }, + ) .await?; tracing::info!("list_nodes: loaded {} nodes", nodes.len()); @@ -159,7 +215,7 @@ ) -> DatabaseResult<()> { use crate::schema::nodes::dsl::*; let updated = self - .with_conn(move |conn| { + .with_measured_conn(DatabaseOperation::UpdateNode, move |conn| { let updated = diesel::update(nodes) .filter(node_id.eq(input_node_id.0 as i64)) .set((scheduling_policy.eq(String::from(input_scheduling)),)) @@ -181,9 +237,12 @@ /// be enriched at runtime with state discovered on pageservers. pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> { let loaded = self - .with_conn(move |conn| -> DatabaseResult<_> { - Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?) - }) + .with_measured_conn( + DatabaseOperation::ListTenantShards, + move |conn| -> DatabaseResult<_> { + Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?) 
+ }, + ) .await?; if loaded.is_empty() { @@ -211,15 +270,10 @@ let mut decoded = serde_json::from_slice::<JsonPersistence>(&bytes) .map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?; - for (tenant_id, tenant) in &mut decoded.tenants { - // Backward compat: an old attachments.json from before PR #6251, replace - // empty strings with proper defaults. - if tenant.tenant_id.is_empty() { - tenant.tenant_id = tenant_id.to_string(); - tenant.config = serde_json::to_string(&TenantConfig::default()) - .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?; - tenant.placement_policy = serde_json::to_string(&PlacementPolicy::Single) - .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?; + for shard in decoded.tenants.values_mut() { + if shard.placement_policy == "\"Single\"" { + // Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165 + shard.placement_policy = "{\"Attached\":0}".to_string(); } } @@ -265,17 +319,20 @@ shards: Vec<TenantShardPersistence>, ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_conn(move |conn| -> DatabaseResult<()> { - conn.transaction(|conn| -> QueryResult<()> { - for tenant in &shards { - diesel::insert_into(tenant_shards) - .values(tenant) - .execute(conn)?; - } + self.with_measured_conn( + DatabaseOperation::InsertTenantShards, + move |conn| -> DatabaseResult<()> { + conn.transaction(|conn| -> QueryResult<()> { + for tenant in &shards { + diesel::insert_into(tenant_shards) + .values(tenant) + .execute(conn)?; + } + Ok(()) + })?; Ok(()) - })?; - Ok(()) - }) + }, + ) .await } @@ -283,25 +340,31 @@ /// the tenant from memory on this server. pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_conn(move |conn| -> DatabaseResult<()> { - diesel::delete(tenant_shards) - .filter(tenant_id.eq(del_tenant_id.to_string())) - .execute(conn)?; + self.with_measured_conn( + DatabaseOperation::DeleteTenant, + move |conn| -> DatabaseResult<()> { + diesel::delete(tenant_shards) + .filter(tenant_id.eq(del_tenant_id.to_string())) + .execute(conn)?; - Ok(()) - }) + Ok(()) + }, + ) .await } pub(crate) async fn delete_node(&self, del_node_id: NodeId) -> DatabaseResult<()> { use crate::schema::nodes::dsl::*; - self.with_conn(move |conn| -> DatabaseResult<()> { - diesel::delete(nodes) - .filter(node_id.eq(del_node_id.0 as i64)) - .execute(conn)?; + self.with_measured_conn( + DatabaseOperation::DeleteNode, + move |conn| -> DatabaseResult<()> { + diesel::delete(nodes) + .filter(node_id.eq(del_node_id.0 as i64)) + .execute(conn)?; - Ok(()) - }) + Ok(()) + }, + ) .await } @@ -315,7 +378,7 @@ ) -> DatabaseResult<HashMap<TenantShardId, Generation>> { use crate::schema::tenant_shards::dsl::*; let updated = self - .with_conn(move |conn| { + .with_measured_conn(DatabaseOperation::ReAttach, move |conn| { let rows_updated = diesel::update(tenant_shards) .filter(generation_pageserver.eq(node_id.0 as i64)) .set(generation.eq(generation + 1)) @@ -365,7 +428,7 @@ ) -> anyhow::Result<Generation> { use crate::schema::tenant_shards::dsl::*; let updated = self - .with_conn(move |conn| { + .with_measured_conn(DatabaseOperation::IncrementGeneration, move |conn| { let updated = diesel::update(tenant_shards) .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) @@ -409,7 +472,7 @@ ) -> DatabaseResult<()> 
{ use crate::schema::tenant_shards::dsl::*; - self.with_conn(move |conn| { + self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| { let query = diesel::update(tenant_shards) .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) @@ -450,7 +513,7 @@ impl Persistence { ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_conn(move |conn| { + self.with_measured_conn(DatabaseOperation::UpdateTenantConfig, move |conn| { diesel::update(tenant_shards) .filter(tenant_id.eq(input_tenant_id.to_string())) .set((config.eq(serde_json::to_string(&input_config).unwrap()),)) @@ -465,7 +528,7 @@ impl Persistence { pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { use crate::schema::tenant_shards::dsl::*; - self.with_conn(move |conn| { + self.with_measured_conn(DatabaseOperation::Detach, move |conn| { let updated = diesel::update(tenant_shards) .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) @@ -495,7 +558,7 @@ impl Persistence { parent_to_children: Vec<(TenantShardId, Vec)>, ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_conn(move |conn| -> DatabaseResult<()> { + self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| -> DatabaseResult<()> { conn.transaction(|conn| -> DatabaseResult<()> { // Mark parent shards as splitting @@ -559,26 +622,29 @@ impl Persistence { old_shard_count: ShardCount, ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_conn(move |conn| -> DatabaseResult<()> { - conn.transaction(|conn| -> QueryResult<()> { - // Drop parent shards - diesel::delete(tenant_shards) - .filter(tenant_id.eq(split_tenant_id.to_string())) - .filter(shard_count.eq(old_shard_count.literal() as i32)) - .execute(conn)?; + self.with_measured_conn( + DatabaseOperation::CompleteShardSplit, + move |conn| -> DatabaseResult<()> { + conn.transaction(|conn| -> QueryResult<()> { + // Drop parent shards + diesel::delete(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.eq(old_shard_count.literal() as i32)) + .execute(conn)?; - // Clear sharding flag - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(split_tenant_id.to_string())) - .set((splitting.eq(0),)) - .execute(conn)?; - debug_assert!(updated > 0); + // Clear sharding flag + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .set((splitting.eq(0),)) + .execute(conn)?; + debug_assert!(updated > 0); + + Ok(()) + })?; Ok(()) - })?; - - Ok(()) - }) + }, + ) .await } @@ -590,40 +656,44 @@ impl Persistence { new_shard_count: ShardCount, ) -> DatabaseResult { use crate::schema::tenant_shards::dsl::*; - self.with_conn(move |conn| -> DatabaseResult { - let aborted = conn.transaction(|conn| -> DatabaseResult { - // Clear the splitting state on parent shards - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(split_tenant_id.to_string())) - .filter(shard_count.ne(new_shard_count.literal() as i32)) - .set((splitting.eq(0),)) - .execute(conn)?; + self.with_measured_conn( + DatabaseOperation::AbortShardSplit, + move |conn| -> DatabaseResult { + let aborted = + conn.transaction(|conn| -> DatabaseResult { + // Clear the splitting state on parent shards + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + 
.filter(shard_count.ne(new_shard_count.literal() as i32)) + .set((splitting.eq(0),)) + .execute(conn)?; - // Parent shards are already gone: we cannot abort. - if updated == 0 { - return Ok(AbortShardSplitStatus::Complete); - } + // Parent shards are already gone: we cannot abort. + if updated == 0 { + return Ok(AbortShardSplitStatus::Complete); + } - // Sanity check: if parent shards were present, their cardinality should - // be less than the number of child shards. - if updated >= new_shard_count.count() as usize { - return Err(DatabaseError::Logical(format!( - "Unexpected parent shard count {updated} while aborting split to \ + // Sanity check: if parent shards were present, their cardinality should + // be less than the number of child shards. + if updated >= new_shard_count.count() as usize { + return Err(DatabaseError::Logical(format!( + "Unexpected parent shard count {updated} while aborting split to \ count {new_shard_count:?} on tenant {split_tenant_id}" - ))); - } + ))); + } - // Erase child shards - diesel::delete(tenant_shards) - .filter(tenant_id.eq(split_tenant_id.to_string())) - .filter(shard_count.eq(new_shard_count.literal() as i32)) - .execute(conn)?; + // Erase child shards + diesel::delete(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.eq(new_shard_count.literal() as i32)) + .execute(conn)?; - Ok(AbortShardSplitStatus::Aborted) - })?; + Ok(AbortShardSplitStatus::Aborted) + })?; - Ok(aborted) - }) + Ok(aborted) + }, + ) .await } } diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs index 3bf23275bd..a62357f9ac 100644 --- a/control_plane/attachment_service/src/reconciler.rs +++ b/control_plane/attachment_service/src/reconciler.rs @@ -1,3 +1,4 @@ +use crate::pageserver_client::PageserverClient; use crate::persistence::Persistence; use crate::service; use hyper::StatusCode; @@ -117,6 +118,15 @@ impl Reconciler { flush_ms: Option, lazy: bool, ) -> Result<(), ReconcileError> { + if !node.is_available() && config.mode == LocationConfigMode::Detached { + // Attempts to detach from offline nodes may be imitated without doing I/O: a node which is offline + // will get fully reconciled wrt the shard's intent state when it is reactivated, irrespective of + // what we put into `observed`, in [`crate::service::Service::node_activate_reconcile`] + tracing::info!("Node {node} is unavailable during detach: proceeding anyway, it will be detached on next activation"); + self.observed.locations.remove(&node.get_id()); + return Ok(()); + } + self.observed .locations .insert(node.get_id(), ObservedStateLocation { conf: None }); @@ -149,9 +159,16 @@ impl Reconciler { }; tracing::info!("location_config({node}) complete: {:?}", config); - self.observed - .locations - .insert(node.get_id(), ObservedStateLocation { conf: Some(config) }); + match config.mode { + LocationConfigMode::Detached => { + self.observed.locations.remove(&node.get_id()); + } + _ => { + self.observed + .locations + .insert(node.get_id(), ObservedStateLocation { conf: Some(config) }); + } + } Ok(()) } @@ -243,8 +260,11 @@ impl Reconciler { tenant_shard_id: TenantShardId, node: &Node, ) -> anyhow::Result> { - let client = - mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref()); + let client = PageserverClient::new( + node.get_id(), + node.base_url(), + self.service_config.jwt_token.as_deref(), + ); let timelines = client.timeline_list(&tenant_shard_id).await?; Ok(timelines @@ -475,7 +495,7 @@ 
impl Reconciler { } } - // Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Single, then + // Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Attached(0), then // this location will be deleted in the general case reconciliation that runs after this. let origin_secondary_conf = build_location_config( &self.shard, diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index 29f87021b2..aa930014b2 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -7,7 +7,9 @@ use std::{ time::{Duration, Instant}, }; -use crate::{id_lock_map::IdLockMap, persistence::AbortShardSplitStatus}; +use crate::{ + id_lock_map::IdLockMap, persistence::AbortShardSplitStatus, reconciler::ReconcileError, +}; use anyhow::Context; use control_plane::storage_controller::{ AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse, @@ -18,12 +20,14 @@ use hyper::StatusCode; use pageserver_api::{ controller_api::{ NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, - TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse, - TenantShardMigrateRequest, TenantShardMigrateResponse, UtilizationScore, + TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse, + TenantDescribeResponseShard, TenantLocateResponse, TenantShardMigrateRequest, + TenantShardMigrateResponse, UtilizationScore, }, models::{SecondaryProgress, TenantConfigRequest}, }; +use crate::pageserver_client::PageserverClient; use pageserver_api::{ models::{ self, LocationConfig, LocationConfigListResponse, LocationConfigMode, @@ -200,6 +204,30 @@ enum TenantCreateOrUpdate { Update(Vec), } +struct ShardSplitParams { + old_shard_count: ShardCount, + new_shard_count: ShardCount, + new_stripe_size: Option, + targets: Vec, + policy: PlacementPolicy, + config: TenantConfig, + shard_ident: ShardIdentity, +} + +// When preparing for a shard split, we may either choose to proceed with the split, +// or find that the work is already done and return NoOp. +enum ShardSplitAction { + Split(ShardSplitParams), + NoOp(TenantShardSplitResponse), +} + +// A parent shard which will be split +struct ShardSplitTarget { + parent_id: TenantShardId, + node: Node, + child_ids: Vec, +} + /// When we tenant shard split operation fails, we may not be able to clean up immediately, because nodes /// might not be available. We therefore use a queue of abort operations processed in the background. struct TenantShardSplitAbort { @@ -525,7 +553,11 @@ impl Service { break; } - let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); + let client = PageserverClient::new( + node.get_id(), + node.base_url(), + self.config.jwt_token.as_deref(), + ); match client .location_config( tenant_shard_id, @@ -733,7 +765,19 @@ impl Service { tenant.waiter.advance(result.sequence); } Err(e) => { - tracing::warn!("Reconcile error: {}", e); + match e { + ReconcileError::Cancel => { + tracing::info!("Reconciler was cancelled"); + } + ReconcileError::Remote(mgmt_api::Error::Cancelled) => { + // This might be due to the reconciler getting cancelled, or it might + // be due to the `Node` being marked offline. 
+ tracing::info!("Reconciler cancelled during pageserver API call"); + } + _ => { + tracing::warn!("Reconcile error: {}", e); + } + } // Ordering: populate last_error before advancing error_seq, // so that waiters will see the correct error after waiting. @@ -1057,7 +1101,7 @@ impl Service { shard_stripe_size: 0, generation: Some(0), generation_pageserver: None, - placement_policy: serde_json::to_string(&PlacementPolicy::Single).unwrap(), + placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(), config: serde_json::to_string(&TenantConfig::default()).unwrap(), splitting: SplitState::default(), }; @@ -1084,7 +1128,7 @@ impl Service { TenantState::new( attach_req.tenant_shard_id, ShardIdentity::unsharded(), - PlacementPolicy::Single, + PlacementPolicy::Attached(0), ), ); tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id); @@ -1113,7 +1157,7 @@ impl Service { self.persistence .update_tenant_shard( attach_req.tenant_shard_id, - PlacementPolicy::Single, + PlacementPolicy::Attached(0), conf, None, ) @@ -1138,7 +1182,7 @@ impl Service { if let Some(new_generation) = new_generation { tenant_state.generation = Some(new_generation); - tenant_state.policy = PlacementPolicy::Single; + tenant_state.policy = PlacementPolicy::Attached(0); } else { // This is a detach notification. We must update placement policy to avoid re-attaching // during background scheduling/reconciliation, or during storage controller restart. @@ -1350,7 +1394,8 @@ impl Service { incremented_generations.len() ); - // Apply the updated generation to our in-memory state + // Apply the updated generation to our in-memory state, and + // gather discover secondary locations. let mut locked = self.inner.write().unwrap(); let (nodes, tenants, scheduler) = locked.parts_mut(); @@ -1358,62 +1403,65 @@ impl Service { tenants: Vec::new(), }; - for (tenant_shard_id, new_gen) in incremented_generations { - response.tenants.push(ReAttachResponseTenant { - id: tenant_shard_id, - gen: new_gen.into().unwrap(), - }); - // Apply the new generation number to our in-memory state - let shard_state = tenants.get_mut(&tenant_shard_id); - let Some(shard_state) = shard_state else { - // Not fatal. This edge case requires a re-attach to happen - // between inserting a new tenant shard in to the database, and updating our in-memory - // state to know about the shard, _and_ that the state inserted to the database referenced - // a pageserver. Should never happen, but handle it rather than panicking, since it should - // be harmless. - tracing::error!( - "Shard {} is in database for node {} but not in-memory state", - tenant_shard_id, - reattach_req.node_id - ); - continue; - }; + // TODO: cancel/restart any running reconciliation for this tenant, it might be trying + // to call location_conf API with an old generation. Wait for cancellation to complete + // before responding to this request. Requires well implemented CancellationToken logic + // all the way to where we call location_conf. Even then, there can still be a location_conf + // request in flight over the network: TODO handle that by making location_conf API refuse + // to go backward in generations. - // If [`Persistence::re_attach`] selected this shard, it must have alread - // had a generation set. - debug_assert!(shard_state.generation.is_some()); - let Some(old_gen) = shard_state.generation else { - // Should never happen: would only return incremented generation - // for a tenant that already had a non-null generation. 
- return Err(ApiError::InternalServerError(anyhow::anyhow!( - "Generation must be set while re-attaching" - ))); - }; - shard_state.generation = Some(std::cmp::max(old_gen, new_gen)); - if let Some(observed) = shard_state - .observed - .locations - .get_mut(&reattach_req.node_id) - { - if let Some(conf) = observed.conf.as_mut() { - conf.generation = new_gen.into(); + // Scan through all shards, applying updates for ones where we updated generation + // and identifying shards that intend to have a secondary location on this node. + for (tenant_shard_id, shard) in tenants { + if let Some(new_gen) = incremented_generations.get(tenant_shard_id) { + let new_gen = *new_gen; + response.tenants.push(ReAttachResponseTenant { + id: *tenant_shard_id, + gen: Some(new_gen.into().unwrap()), + // A tenant is only put into multi or stale modes in the middle of a [`Reconciler::live_migrate`] + // execution. If a pageserver is restarted during that process, then the reconcile pass will + // fail, and start from scratch, so it doesn't make sense for us to try and preserve + // the stale/multi states at this point. + mode: LocationConfigMode::AttachedSingle, + }); + + shard.generation = std::cmp::max(shard.generation, Some(new_gen)); + if let Some(observed) = shard.observed.locations.get_mut(&reattach_req.node_id) { + // Why can we update `observed` even though we're not sure our response will be received + // by the pageserver? Because the pageserver will not proceed with startup until + // it has processed response: if it loses it, we'll see another request and increment + // generation again, avoiding any uncertainty about dirtiness of tenant's state. + if let Some(conf) = observed.conf.as_mut() { + conf.generation = new_gen.into(); + } + } else { + // This node has no observed state for the shard: perhaps it was offline + // when the pageserver restarted. Insert a None, so that the Reconciler + // will be prompted to learn the location's state before it makes changes. + shard + .observed + .locations + .insert(reattach_req.node_id, ObservedStateLocation { conf: None }); } - } else { - // This node has no observed state for the shard: perhaps it was offline - // when the pageserver restarted. Insert a None, so that the Reconciler - // will be prompted to learn the location's state before it makes changes. - shard_state - .observed - .locations - .insert(reattach_req.node_id, ObservedStateLocation { conf: None }); - } + } else if shard.intent.get_secondary().contains(&reattach_req.node_id) { + // Ordering: pageserver will not accept /location_config requests until it has + // finished processing the response from re-attach. So we can update our in-memory state + // now, and be confident that we are not stamping on the result of some later location config. + // TODO: however, we are not strictly ordered wrt ReconcileResults queue, + // so we might update observed state here, and then get over-written by some racing + // ReconcileResult. The impact is low however, since we have set state on pageserver something + // that matches intent, so worst case if we race then we end up doing a spurious reconcile. - // TODO: cancel/restart any running reconciliation for this tenant, it might be trying - // to call location_conf API with an old generation. Wait for cancellation to complete - // before responding to this request. Requires well implemented CancellationToken logic - // all the way to where we call location_conf. 
Even then, there can still be a location_conf - // request in flight over the network: TODO handle that by making location_conf API refuse - // to go backward in generations. + response.tenants.push(ReAttachResponseTenant { + id: *tenant_shard_id, + gen: None, + mode: LocationConfigMode::Secondary, + }); + + // We must not update observed, because we have no guarantee that our + // response will be received by the pageserver. This could leave it + // falsely dirty, but the resulting reconcile should be idempotent. + } } // We consider a node Active once we have composed a re-attach response, but we @@ -1491,11 +1539,11 @@ impl Service { &self, create_req: TenantCreateRequest, ) -> Result<(TenantCreateResponse, Vec), ApiError> { - // As a default, single is convenient for tests that don't choose a policy. let placement_policy = create_req .placement_policy .clone() - .unwrap_or(PlacementPolicy::Single); + // As a default, zero secondaries is convenient for tests that don't choose a policy. + .unwrap_or(PlacementPolicy::Attached(0)); // This service expects to handle sharding itself: it is an error to try and directly create // a particular shard here. @@ -1705,11 +1753,11 @@ impl Service { | LocationConfigMode::AttachedSingle | LocationConfigMode::AttachedStale => { if nodes.len() > 1 { - PlacementPolicy::Double(1) + PlacementPolicy::Attached(1) } else { // Convenience for dev/test: if we just have one pageserver, import - // tenants into Single mode so that scheduling will succeed. - PlacementPolicy::Single + // tenants into non-HA mode so that scheduling will succeed. + PlacementPolicy::Attached(0) } } }; @@ -2058,8 +2106,11 @@ impl Service { }) .collect::>(); for tenant_shard_id in shard_ids { - let client = - mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); + let client = PageserverClient::new( + node.get_id(), + node.base_url(), + self.config.jwt_token.as_deref(), + ); tracing::info!("Doing time travel recovery for shard {tenant_shard_id}",); @@ -2111,7 +2162,11 @@ impl Service { // Issue concurrent requests to all shards' locations let mut futs = FuturesUnordered::new(); for (tenant_shard_id, node) in targets { - let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); + let client = PageserverClient::new( + node.get_id(), + node.base_url(), + self.config.jwt_token.as_deref(), + ); futs.push(async move { let result = client .tenant_secondary_download(tenant_shard_id, wait) @@ -2204,7 +2259,11 @@ impl Service { // Phase 1: delete on the pageservers let mut any_pending = false; for (tenant_shard_id, node) in targets { - let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); + let client = PageserverClient::new( + node.get_id(), + node.base_url(), + self.config.jwt_token.as_deref(), + ); // TODO: this, like many other places, requires proper retry handling for 503, timeout: those should not // surface immediately as an error to our caller. 
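// [Editor's sketch] The re-attach response now carries `gen: Option<u32>` and a
// `mode` per tenant shard, so a consumer has to branch on mode rather than
// assume an attached generation. A hypothetical handler with trimmed stand-in
// types; the real processing lives in the pageserver, not in this patch.
#[derive(Debug, Clone, Copy)]
enum LocationConfigMode {
    AttachedSingle,
    Secondary,
}

struct ReAttachResponseTenant {
    gen: Option<u32>,
    mode: LocationConfigMode,
}

fn apply(tenant: &ReAttachResponseTenant) {
    match tenant.mode {
        LocationConfigMode::AttachedSingle => {
            // Attached entries always carry a generation in this patch; `gen`
            // is Option only because secondary entries have none to report.
            let gen = tenant.gen.expect("attached mode implies a generation");
            println!("attach with generation {gen}");
        }
        LocationConfigMode::Secondary => {
            // No generation for secondaries: just ensure the location exists.
            assert!(tenant.gen.is_none());
            println!("configure secondary location");
        }
    }
}

fn main() {
    apply(&ReAttachResponseTenant { gen: Some(7), mode: LocationConfigMode::AttachedSingle });
    apply(&ReAttachResponseTenant { gen: None, mode: LocationConfigMode::Secondary });
}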
let status = client.tenant_delete(tenant_shard_id).await.map_err(|e| { @@ -2316,7 +2375,7 @@ impl Service { tenant_shard_id, create_req.new_timeline_id, ); - let client = mgmt_api::Client::new(node.base_url(), jwt.as_deref()); + let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); client .timeline_create(tenant_shard_id, &create_req) @@ -2440,7 +2499,7 @@ impl Service { "Deleting timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}", ); - let client = mgmt_api::Client::new(node.base_url(), jwt.as_deref()); + let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); client .timeline_delete(tenant_shard_id, timeline_id) .await @@ -2481,11 +2540,11 @@ impl Service { } /// When you need to send an HTTP request to the pageserver that holds shard0 of a tenant, this - /// function looks it up and returns the url. If the tenant isn't found, returns Err(ApiError::NotFound) - pub(crate) fn tenant_shard0_baseurl( + /// function looks it up and returns the node. If the tenant isn't found, returns Err(ApiError::NotFound) + pub(crate) fn tenant_shard0_node( &self, tenant_id: TenantId, - ) -> Result<(String, TenantShardId), ApiError> { + ) -> Result<(Node, TenantShardId), ApiError> { let locked = self.inner.read().unwrap(); let Some((tenant_shard_id, shard)) = locked .tenants @@ -2517,7 +2576,7 @@ impl Service { ))); }; - Ok((node.base_url(), *tenant_shard_id)) + Ok((node.clone(), *tenant_shard_id)) } pub(crate) fn tenant_locate( @@ -2527,9 +2586,6 @@ impl Service { let locked = self.inner.read().unwrap(); tracing::info!("Locating shards for tenant {tenant_id}"); - // Take a snapshot of pageservers - let pageservers = locked.nodes.clone(); - let mut result = Vec::new(); let mut shard_params: Option<ShardParameters> = None; @@ -2543,7 +2599,8 @@ impl Service { "Cannot locate a tenant that is not attached" )))?; - let node = pageservers + let node = locked + .nodes .get(&node_id) .expect("Pageservers may not be deleted while referenced"); @@ -2591,6 +2648,47 @@ impl Service { }) } + pub(crate) fn tenant_describe( + &self, + tenant_id: TenantId, + ) -> Result<TenantDescribeResponse, ApiError> { + let locked = self.inner.read().unwrap(); + + let mut shard_zero = None; + let mut shards = Vec::new(); + + for (tenant_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + { + if tenant_shard_id.is_zero() { + shard_zero = Some(shard); + } + + let response_shard = TenantDescribeResponseShard { + tenant_shard_id: *tenant_shard_id, + node_attached: *shard.intent.get_attached(), + node_secondary: shard.intent.get_secondary().to_vec(), + last_error: shard.last_error.lock().unwrap().clone(), + is_reconciling: shard.reconciler.is_some(), + is_pending_compute_notification: shard.pending_compute_notification, + is_splitting: matches!(shard.splitting, SplitState::Splitting), + }; + shards.push(response_shard); + } + + let Some(shard_zero) = shard_zero else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant {tenant_id} not found").into(), + )); + }; + + Ok(TenantDescribeResponse { + shards, + stripe_size: shard_zero.shard.stripe_size, + policy: shard_zero.policy.clone(), + config: shard_zero.config.clone(), + }) + } + #[instrument(skip_all, fields(tenant_id=%op.tenant_id))] async fn abort_tenant_shard_split( &self, @@ -2648,7 +2746,7 @@ impl Service { let detach_locations: Vec<(Node, TenantShardId)> = { let mut detach_locations = Vec::new(); let mut locked = self.inner.write().unwrap(); - let (nodes, tenants, _scheduler) = locked.parts_mut(); + let (nodes, tenants,
scheduler) = locked.parts_mut(); for (tenant_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(op.tenant_id)) @@ -2681,6 +2779,13 @@ impl Service { tracing::info!("Restoring parent shard {tenant_shard_id}"); shard.splitting = SplitState::Idle; + if let Err(e) = shard.schedule(scheduler) { + // If this shard can't be scheduled now (perhaps due to offline nodes or + // capacity issues), that must not prevent us rolling back a split. In this + // case it should be eventually scheduled in the background. + tracing::warn!("Failed to schedule {tenant_shard_id} during shard abort: {e}") + } + self.maybe_reconcile_shard(shard, nodes); } @@ -2772,7 +2877,7 @@ impl Service { .map(|(shard_id, _)| *shard_id) .collect::>(); - let (_nodes, tenants, scheduler) = locked.parts_mut(); + let (nodes, tenants, scheduler) = locked.parts_mut(); for parent_id in parent_ids { let child_ids = parent_id.split(new_shard_count); @@ -2814,7 +2919,7 @@ impl Service { generation, &child_shard, &config, - matches!(policy, PlacementPolicy::Double(n) if n > 0), + matches!(policy, PlacementPolicy::Attached(n) if n > 0), )), }, ); @@ -2839,6 +2944,8 @@ impl Service { // find a secondary (e.g. because cluster is overloaded). tracing::warn!("Failed to schedule child shard {child}: {e}"); } + // In the background, attach secondary locations for the new shards + self.maybe_reconcile_shard(&mut child_state, nodes); tenants.insert(child, child_state); response.new_shards.push(child); @@ -2861,17 +2968,23 @@ impl Service { let new_shard_count = ShardCount::new(split_req.new_shard_count); let new_stripe_size = split_req.new_stripe_size; - let r = self.do_tenant_shard_split(tenant_id, split_req).await; + // Validate the request and construct parameters. This phase is fallible, but does not require + // rollback on errors, as it does no I/O and mutates no state. + let shard_split_params = match self.prepare_tenant_shard_split(tenant_id, split_req)? { + ShardSplitAction::NoOp(resp) => return Ok(resp), + ShardSplitAction::Split(params) => params, + }; + + // Execute this split: this phase mutates state and does remote I/O on pageservers. If it fails, + // we must roll back. + let r = self + .do_tenant_shard_split(tenant_id, shard_split_params) + .await; match r { Ok(r) => Ok(r), - Err(ApiError::BadRequest(_)) => { - // A request validation error does not require rollback: we rejected it before we started making any changes: just - // return the error - r - } Err(e) => { - // General case error handling: split might be part-done, we must do work to abort it. + // Split might be part-done, we must do work to abort it. 
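// [Editor's sketch] The shard-split rework above separates a pure, fallible
// "prepare" phase (no rollback needed, no state mutated) from an effectful
// "execute" phase whose failures enqueue a background abort. Illustrative
// shape only, with stand-in types; the real code threads ShardSplitParams and
// ShardSplitAction through this flow.
enum SplitAction {
    NoOp(&'static str),
    Split(u8), // stands in for ShardSplitParams
}

fn prepare(new_shard_count: u8) -> Result<SplitAction, String> {
    // Validation only: nothing has been mutated, so errors return directly.
    if new_shard_count == 0 {
        return Err("invalid shard count".into());
    }
    if new_shard_count == 1 {
        return Ok(SplitAction::NoOp("children already exist"));
    }
    Ok(SplitAction::Split(new_shard_count))
}

fn execute(params: u8, abort_queue: &mut Vec<u8>) -> Result<(), String> {
    // Imagine database writes and pageserver API calls here. On failure the
    // split may be half-done, so the caller enqueues a background abort rather
    // than attempting an inline rollback against possibly-unavailable nodes.
    if params == 8 {
        abort_queue.push(params);
        return Err("pageserver unavailable mid-split".into());
    }
    Ok(())
}

fn main() {
    let mut abort_queue = Vec::new();
    match prepare(8) {
        Err(e) => println!("rejected: {e}"),
        Ok(SplitAction::NoOp(msg)) => println!("no-op: {msg}"),
        Ok(SplitAction::Split(params)) => {
            if let Err(e) = execute(params, &mut abort_queue) {
                println!("{e}; aborts queued: {abort_queue:?}");
            }
        }
    }
}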
tracing::warn!("Enqueuing background abort of split on {tenant_id}"); self.abort_tx .send(TenantShardSplitAbort { @@ -2887,25 +3000,18 @@ impl Service { } } - pub(crate) async fn do_tenant_shard_split( + fn prepare_tenant_shard_split( &self, tenant_id: TenantId, split_req: TenantShardSplitRequest, - ) -> Result { - let mut policy = None; - let mut shard_ident = None; - - // A parent shard which will be split - struct SplitTarget { - parent_id: TenantShardId, - node: Node, - child_ids: Vec, - } - + ) -> Result { fail::fail_point!("shard-split-validation", |_| Err(ApiError::BadRequest( anyhow::anyhow!("failpoint") ))); + let mut policy = None; + let mut config = None; + let mut shard_ident = None; // Validate input, and calculate which shards we will create let (old_shard_count, targets) = { @@ -2961,6 +3067,9 @@ impl Service { if shard_ident.is_none() { shard_ident = Some(shard.shard); } + if config.is_none() { + config = Some(shard.config.clone()); + } if tenant_shard_id.shard_count.count() == split_req.new_shard_count { tracing::info!( @@ -2979,9 +3088,7 @@ impl Service { .get(&node_id) .expect("Pageservers may not be deleted while referenced"); - // TODO: if any reconciliation is currently in progress for this shard, wait for it. - - targets.push(SplitTarget { + targets.push(ShardSplitTarget { parent_id: *tenant_shard_id, node: node.clone(), child_ids: tenant_shard_id @@ -2991,9 +3098,9 @@ impl Service { if targets.is_empty() { if children_found.len() == split_req.new_shard_count as usize { - return Ok(TenantShardSplitResponse { + return Ok(ShardSplitAction::NoOp(TenantShardSplitResponse { new_shards: children_found, - }); + })); } else { // No shards found to split, and no existing children found: the // tenant doesn't exist at all. @@ -3023,13 +3130,77 @@ impl Service { shard_ident.unwrap() }; let policy = policy.unwrap(); + let config = config.unwrap(); + Ok(ShardSplitAction::Split(ShardSplitParams { + old_shard_count, + new_shard_count: ShardCount::new(split_req.new_shard_count), + new_stripe_size: split_req.new_stripe_size, + targets, + policy, + config, + shard_ident, + })) + } + + async fn do_tenant_shard_split( + &self, + tenant_id: TenantId, + params: ShardSplitParams, + ) -> Result { // FIXME: we have dropped self.inner lock, and not yet written anything to the database: another // request could occur here, deleting or mutating the tenant. begin_shard_split checks that the // parent shards exist as expected, but it would be neater to do the above pre-checks within the // same database transaction rather than pre-check in-memory and then maybe-fail the database write. // (https://github.com/neondatabase/neon/issues/6676) + let ShardSplitParams { + old_shard_count, + new_shard_count, + new_stripe_size, + mut targets, + policy, + config, + shard_ident, + } = params; + + // Drop any secondary locations: pageservers do not support splitting these, and in any case the + // end-state for a split tenant will usually be to have secondary locations on different nodes. + // The reconciliation calls in this block also implicitly cancel+barrier wrt any ongoing reconciliation + // at the time of split. + let waiters = { + let mut locked = self.inner.write().unwrap(); + let mut waiters = Vec::new(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + for target in &mut targets { + let Some(shard) = tenants.get_mut(&target.parent_id) else { + // Paranoia check: this shouldn't happen: we have the oplock for this tenant ID. 
+ return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Shard {} not found", + target.parent_id + ))); + }; + + if shard.intent.get_attached() != &Some(target.node.get_id()) { + // Paranoia check: this shouldn't happen: we have the oplock for this tenant ID. + return Err(ApiError::Conflict(format!( + "Shard {} unexpectedly rescheduled during split", + target.parent_id + ))); + } + + // Irrespective of PlacementPolicy, clear secondary locations from intent + shard.intent.clear_secondary(scheduler); + + // Run Reconciler to execute detach of secondary locations. + if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) { + waiters.push(waiter); + } + } + waiters + }; + self.await_waiters(waiters, RECONCILE_TIMEOUT).await?; + // Before creating any new child shards in memory or on the pageservers, persist them: this // enables us to ensure that we will always be able to clean up if something goes wrong. This also // acts as the protection against two concurrent attempts to split: one of them will get a database @@ -3058,8 +3229,7 @@ impl Service { generation: None, generation_pageserver: Some(target.node.get_id().0 as i64), placement_policy: serde_json::to_string(&policy).unwrap(), - // TODO: get the config out of the map - config: serde_json::to_string(&TenantConfig::default()).unwrap(), + config: serde_json::to_string(&config).unwrap(), splitting: SplitState::Splitting, }); } @@ -3111,18 +3281,22 @@ impl Service { // N>1 shards into M shards -- initially we're usually splitting 1 shard into N). for target in &targets { - let SplitTarget { + let ShardSplitTarget { parent_id, node, child_ids, } = target; - let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); + let client = PageserverClient::new( + node.get_id(), + node.base_url(), + self.config.jwt_token.as_deref(), + ); let response = client .tenant_shard_split( *parent_id, TenantShardSplitRequest { - new_shard_count: split_req.new_shard_count, - new_stripe_size: split_req.new_stripe_size, + new_shard_count: new_shard_count.literal(), + new_stripe_size, }, ) .await @@ -3171,11 +3345,8 @@ impl Service { )); // Replace all the shards we just split with their children: this phase is infallible. - let (response, child_locations) = self.tenant_shard_split_commit_inmem( - tenant_id, - ShardCount::new(split_req.new_shard_count), - split_req.new_stripe_size, - ); + let (response, child_locations) = + self.tenant_shard_split_commit_inmem(tenant_id, new_shard_count, new_stripe_size); // Send compute notifications for all the new shards let mut failed_notifications = Vec::new(); @@ -3240,17 +3411,20 @@ impl Service { let old_attached = *shard.intent.get_attached(); match shard.policy { - PlacementPolicy::Single => { - shard.intent.clear_secondary(scheduler); - shard.intent.set_attached(scheduler, Some(migrate_req.node_id)); - } - PlacementPolicy::Double(_n) => { + PlacementPolicy::Attached(n) => { // If our new attached node was a secondary, it no longer should be.
shard.intent.remove_secondary(scheduler, migrate_req.node_id); // If we were already attached to something, demote that to a secondary if let Some(old_attached) = old_attached { - shard.intent.push_secondary(scheduler, old_attached); + if n > 0 { + // Remove other secondaries to make room for the location we'll demote + while shard.intent.get_secondary().len() >= n { + shard.intent.pop_secondary(scheduler); + } + + shard.intent.push_secondary(scheduler, old_attached); + } } shard.intent.set_attached(scheduler, Some(migrate_req.node_id)); @@ -3276,7 +3450,7 @@ impl Service { if let Some(waiter) = waiter { waiter.wait_timeout(RECONCILE_TIMEOUT).await?; } else { - tracing::warn!("Migration is a no-op"); + tracing::info!("Migration is a no-op"); } Ok(TenantShardMigrateResponse {}) @@ -3631,6 +3805,13 @@ impl Service { observed_loc.conf = None; } + if new_nodes.len() == 1 { + // Special case for single-node cluster: there is no point trying to reschedule + // any tenant shards: avoid doing so, in order to avoid spewing warnings about + // failures to schedule them. + continue; + } + if tenant_state.intent.demote_attached(node_id) { tenant_state.sequence = tenant_state.sequence.next(); match tenant_state.schedule(scheduler) { diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs index 39e557616d..83c921dc58 100644 --- a/control_plane/attachment_service/src/tenant_state.rs +++ b/control_plane/attachment_service/src/tenant_state.rs @@ -4,7 +4,10 @@ use std::{ time::Duration, }; -use crate::{metrics, persistence::TenantShardPersistence}; +use crate::{ + metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome}, + persistence::TenantShardPersistence, +}; use pageserver_api::controller_api::PlacementPolicy; use pageserver_api::{ models::{LocationConfig, LocationConfigMode, TenantConfig}, @@ -457,22 +460,7 @@ impl TenantState { // Add/remove nodes to fulfil policy use PlacementPolicy::*; match self.policy { - Single => { - // Should have exactly one attached, and zero secondaries - if !self.intent.secondary.is_empty() { - self.intent.clear_secondary(scheduler); - modified = true; - } - - let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?; - modified |= modified_attached; - - if !self.intent.secondary.is_empty() { - self.intent.clear_secondary(scheduler); - modified = true; - } - } - Double(secondary_count) => { + Attached(secondary_count) => { let retain_secondaries = if self.intent.attached.is_none() && scheduler.node_preferred(&self.intent.secondary).is_some() { @@ -733,7 +721,10 @@ impl TenantState { let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq, tenant_id=%reconciler.tenant_shard_id.tenant_id, shard_id=%reconciler.tenant_shard_id.shard_slug()); - metrics::RECONCILER.spawned.inc(); + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_reconcile_spawn + .inc(); let result_tx = result_tx.clone(); let join_handle = tokio::task::spawn( async move { @@ -751,10 +742,12 @@ impl TenantState { // TODO: wrap all remote API operations in cancellation check // as well. 
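// [Editor's sketch] The migration arm above keeps the secondary set within the
// budget of PlacementPolicy::Attached(n): before demoting the old attached
// location to a secondary, older secondaries are popped to make room, and with
// n == 0 the old location is simply dropped. Plain-Vec illustration; the real
// code routes every intent mutation through the scheduler.
fn demote_attached(secondaries: &mut Vec<u64>, old_attached: u64, n: usize) {
    if n == 0 {
        return; // no secondaries wanted under Attached(0)
    }
    while secondaries.len() >= n {
        secondaries.pop(); // make room for the demoted location
    }
    secondaries.push(old_attached);
}

fn main() {
    let mut secondaries = vec![2, 3];
    demote_attached(&mut secondaries, 1, 1);
    assert_eq!(secondaries, vec![1]);

    let mut none_wanted = vec![2];
    demote_attached(&mut none_wanted, 1, 0);
    assert_eq!(none_wanted, vec![2]);
}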
if reconciler.cancel.is_cancelled() { - metrics::RECONCILER - .complete - .with_label_values(&[metrics::ReconcilerMetrics::CANCEL]) - .inc(); + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_reconcile_complete + .inc(ReconcileCompleteLabelGroup { + status: ReconcileOutcome::Cancel, + }); return; } @@ -769,18 +762,18 @@ impl TenantState { } // Update result counter - match &result { - Ok(_) => metrics::RECONCILER - .complete - .with_label_values(&[metrics::ReconcilerMetrics::SUCCESS]), - Err(ReconcileError::Cancel) => metrics::RECONCILER - .complete - .with_label_values(&[metrics::ReconcilerMetrics::CANCEL]), - Err(_) => metrics::RECONCILER - .complete - .with_label_values(&[metrics::ReconcilerMetrics::ERROR]), - } - .inc(); + let outcome_label = match &result { + Ok(_) => ReconcileOutcome::Success, + Err(ReconcileError::Cancel) => ReconcileOutcome::Cancel, + Err(_) => ReconcileOutcome::Error, + }; + + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_reconcile_complete + .inc(ReconcileCompleteLabelGroup { + status: outcome_label, + }); result_tx .send(ReconcileResult { @@ -895,7 +888,7 @@ pub(crate) mod tests { let mut scheduler = Scheduler::new(nodes.values()); - let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1)); + let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1)); tenant_state .schedule(&mut scheduler) .expect("we have enough nodes, scheduling should work"); @@ -943,7 +936,7 @@ pub(crate) mod tests { let nodes = make_test_nodes(3); let mut scheduler = Scheduler::new(nodes.values()); - let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1)); + let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1)); tenant_state.observed.locations.insert( NodeId(3), diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 6c722f36b4..401feae706 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -437,7 +437,7 @@ async fn handle_tenant( let placement_policy = match create_match.get_one::("placement-policy") { Some(s) if !s.is_empty() => serde_json::from_str::(s)?, - _ => PlacementPolicy::Single, + _ => PlacementPolicy::Attached(0), }; let tenant_conf = PageServerNode::parse_config(tenant_conf)?; @@ -523,88 +523,6 @@ async fn handle_tenant( .with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?; println!("tenant {tenant_id} successfully configured on the pageserver"); } - Some(("migrate", matches)) => { - let tenant_shard_id = get_tenant_shard_id(matches, env)?; - let new_pageserver = get_pageserver(env, matches)?; - let new_pageserver_id = new_pageserver.conf.id; - - let storage_controller = StorageController::from_env(env); - storage_controller - .tenant_migrate(tenant_shard_id, new_pageserver_id) - .await?; - - println!("tenant {tenant_shard_id} migrated to {}", new_pageserver_id); - } - Some(("status", matches)) => { - let tenant_id = get_tenant_id(matches, env)?; - - let mut shard_table = comfy_table::Table::new(); - shard_table.set_header(["Shard", "Pageserver", "Physical Size"]); - - let mut tenant_synthetic_size = None; - - let storage_controller = StorageController::from_env(env); - for shard in storage_controller.tenant_locate(tenant_id).await?.shards { - let pageserver = - PageServerNode::from_env(env, env.get_pageserver_conf(shard.node_id)?); - - let size = pageserver - .http_client - .tenant_details(shard.shard_id) - .await? 
- .tenant_info - .current_physical_size - .unwrap(); - - shard_table.add_row([ - format!("{}", shard.shard_id.shard_slug()), - format!("{}", shard.node_id.0), - format!("{} MiB", size / (1024 * 1024)), - ]); - - if shard.shard_id.is_zero() { - tenant_synthetic_size = - Some(pageserver.tenant_synthetic_size(shard.shard_id).await?); - } - } - - let Some(synthetic_size) = tenant_synthetic_size else { - bail!("Shard 0 not found") - }; - - let mut tenant_table = comfy_table::Table::new(); - tenant_table.add_row(["Tenant ID".to_string(), tenant_id.to_string()]); - tenant_table.add_row([ - "Synthetic size".to_string(), - format!("{} MiB", synthetic_size.size.unwrap_or(0) / (1024 * 1024)), - ]); - - println!("{tenant_table}"); - println!("{shard_table}"); - } - Some(("shard-split", matches)) => { - let tenant_id = get_tenant_id(matches, env)?; - let shard_count: u8 = matches.get_one::("shard-count").cloned().unwrap_or(0); - let shard_stripe_size: Option = matches - .get_one::>("shard-stripe-size") - .cloned() - .unwrap(); - - let storage_controller = StorageController::from_env(env); - let result = storage_controller - .tenant_split(tenant_id, shard_count, shard_stripe_size) - .await?; - println!( - "Split tenant {} into shards {}", - tenant_id, - result - .new_shards - .iter() - .map(|s| format!("{:?}", s)) - .collect::>() - .join(",") - ); - } Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name), None => bail!("no tenant subcommand provided"), @@ -1578,19 +1496,6 @@ fn cli() -> Command { .subcommand(Command::new("config") .arg(tenant_id_arg.clone()) .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))) - .subcommand(Command::new("migrate") - .about("Migrate a tenant from one pageserver to another") - .arg(tenant_id_arg.clone()) - .arg(pageserver_id_arg.clone())) - .subcommand(Command::new("status") - .about("Human readable summary of the tenant's shards and attachment locations") - .arg(tenant_id_arg.clone())) - .subcommand(Command::new("shard-split") - .about("Increase the number of shards in the tenant") - .arg(tenant_id_arg.clone()) - .arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)")) - .arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages")) - ) ) .subcommand( Command::new("pageserver") diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index c7f22cc8f8..bd3dbef453 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -127,8 +127,8 @@ pub struct PageServerConf { pub pg_auth_type: AuthType, pub http_auth_type: AuthType, - pub(crate) virtual_file_io_engine: String, - pub(crate) get_vectored_impl: String, + pub(crate) virtual_file_io_engine: Option, + pub(crate) get_vectored_impl: Option, } impl Default for PageServerConf { @@ -139,9 +139,8 @@ impl Default for PageServerConf { listen_http_addr: String::new(), pg_auth_type: AuthType::Trust, http_auth_type: AuthType::Trust, - // FIXME: use the ones exposed by pageserver crate - virtual_file_io_engine: "tokio-epoll-uring".to_owned(), - get_vectored_impl: "sequential".to_owned(), + virtual_file_io_engine: None, + get_vectored_impl: None, } } } diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 2603515681..c5eabc46db 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs 
@@ -101,8 +101,16 @@ impl PageServerNode { let pg_auth_type_param = format!("pg_auth_type='{}'", pg_auth_type); let listen_pg_addr_param = format!("listen_pg_addr='{}'", listen_pg_addr); - let virtual_file_io_engine = format!("virtual_file_io_engine='{virtual_file_io_engine}'"); - let get_vectored_impl = format!("get_vectored_impl='{get_vectored_impl}'"); + let virtual_file_io_engine = if let Some(virtual_file_io_engine) = virtual_file_io_engine { + format!("virtual_file_io_engine='{virtual_file_io_engine}'") + } else { + String::new() + }; + let get_vectored_impl = if let Some(get_vectored_impl) = get_vectored_impl { + format!("get_vectored_impl='{get_vectored_impl}'") + } else { + String::new() + }; let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url()); diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 18014adba4..e7697ecac8 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -475,7 +475,7 @@ impl StorageController { pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result { self.dispatch::<(), _>( Method::GET, - format!("control/v1/tenant/{tenant_id}/locate"), + format!("debug/v1/tenant/{tenant_id}/locate"), None, ) .await diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 6053e8b8ed..e33bd0f486 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -6,7 +6,10 @@ use std::str::FromStr; use serde::{Deserialize, Serialize}; use utils::id::NodeId; -use crate::{models::ShardParameters, shard::TenantShardId}; +use crate::{ + models::{ShardParameters, TenantConfig}, + shard::{ShardStripeSize, TenantShardId}, +}; #[derive(Serialize, Deserialize)] pub struct TenantCreateResponseShard { @@ -57,6 +60,31 @@ pub struct TenantLocateResponse { pub shard_params: ShardParameters, } +#[derive(Serialize, Deserialize)] +pub struct TenantDescribeResponse { + pub shards: Vec, + pub stripe_size: ShardStripeSize, + pub policy: PlacementPolicy, + pub config: TenantConfig, +} + +#[derive(Serialize, Deserialize)] +pub struct TenantDescribeResponseShard { + pub tenant_shard_id: TenantShardId, + + pub node_attached: Option, + pub node_secondary: Vec, + + pub last_error: String, + + /// A task is currently running to reconcile this tenant's intent state with the state on pageservers + pub is_reconciling: bool, + /// This shard failed in sending a compute notification to the cloud control plane, and a retry is pending. + pub is_pending_compute_notification: bool, + /// A shard split is currently underway + pub is_splitting: bool, +} + /// Explicitly migrating a particular shard is a low level operation /// TODO: higher level "Reschedule tenant" operation where the request /// specifies some constraints, e.g. asking it to get off particular node(s) @@ -181,11 +209,8 @@ impl From for String { /// to create secondary locations. #[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)] pub enum PlacementPolicy { - /// Cheapest way to attach a tenant: just one pageserver, no secondary - Single, - /// Production-ready way to attach a tenant: one attached pageserver and - /// some number of secondaries. - Double(usize), + /// Normal live state: one attached pageserver and zero or more secondaries. + Attached(usize), /// Create one secondary mode locations. This is useful when onboarding /// a tenant, or for an idle tenant that we might want to bring online quickly. 
Secondary, @@ -207,14 +232,14 @@ mod test { /// Check stability of PlacementPolicy's serialization #[test] fn placement_policy_encoding() -> anyhow::Result<()> { - let v = PlacementPolicy::Double(1); + let v = PlacementPolicy::Attached(1); let encoded = serde_json::to_string(&v)?; - assert_eq!(encoded, "{\"Double\":1}"); + assert_eq!(encoded, "{\"Attached\":1}"); assert_eq!(serde_json::from_str::(&encoded)?, v); - let v = PlacementPolicy::Single; + let v = PlacementPolicy::Detached; let encoded = serde_json::to_string(&v)?; - assert_eq!(encoded, "\"Single\""); + assert_eq!(encoded, "\"Detached\""); assert_eq!(serde_json::from_str::(&encoded)?, v); Ok(()) } diff --git a/libs/pageserver_api/src/upcall_api.rs b/libs/pageserver_api/src/upcall_api.rs index 5472948091..2e88836bd0 100644 --- a/libs/pageserver_api/src/upcall_api.rs +++ b/libs/pageserver_api/src/upcall_api.rs @@ -6,7 +6,9 @@ use serde::{Deserialize, Serialize}; use utils::id::NodeId; -use crate::{controller_api::NodeRegisterRequest, shard::TenantShardId}; +use crate::{ + controller_api::NodeRegisterRequest, models::LocationConfigMode, shard::TenantShardId, +}; /// Upcall message sent by the pageserver to the configured `control_plane_api` on /// startup. @@ -20,12 +22,20 @@ pub struct ReAttachRequest { pub register: Option, } -#[derive(Serialize, Deserialize)] -pub struct ReAttachResponseTenant { - pub id: TenantShardId, - pub gen: u32, +fn default_mode() -> LocationConfigMode { + LocationConfigMode::AttachedSingle } +#[derive(Serialize, Deserialize, Debug)] +pub struct ReAttachResponseTenant { + pub id: TenantShardId, + /// Mandatory if LocationConfigMode is None or set to an Attached* mode + pub gen: Option, + + /// Default value only for backward compat: this field should be set + #[serde(default = "default_mode")] + pub mode: LocationConfigMode, +} #[derive(Serialize, Deserialize)] pub struct ReAttachResponse { pub tenants: Vec, diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 313d8226b1..8cad863731 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -198,6 +198,7 @@ impl LocalFs { fs::OpenOptions::new() .write(true) .create(true) + .truncate(true) .open(&temp_file_path) .await .with_context(|| { diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 983e94d963..c2d9d9d396 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -13,6 +13,7 @@ testing = ["fail/failpoints"] [dependencies] arc-swap.workspace = true sentry.workspace = true +async-compression.workspace = true async-trait.workspace = true anyhow.workspace = true bincode.workspace = true @@ -36,6 +37,7 @@ serde_json.workspace = true signal-hook.workspace = true thiserror.workspace = true tokio.workspace = true +tokio-tar.workspace = true tokio-util.workspace = true tracing.workspace = true tracing-error.workspace = true @@ -46,6 +48,7 @@ strum.workspace = true strum_macros.workspace = true url.workspace = true uuid.workspace = true +walkdir.workspace = true pq_proto.workspace = true postgres_connection.workspace = true diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index a60971abf0..f8a5f68131 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -245,7 +245,7 @@ impl std::io::Write for ChannelWriter { } } -async fn prometheus_metrics_handler(_req: Request) -> Result, ApiError> { +pub async fn prometheus_metrics_handler(_req: Request) -> Result, ApiError> { SERVE_METRICS_COUNT.inc(); let started_at = 
std::time::Instant::now(); @@ -367,7 +367,6 @@ pub fn make_router() -> RouterBuilder { .middleware(Middleware::post_with_info( add_request_id_header_to_response, )) - .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) .err_handler(route_error_handler) } diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 890061dc59..04ce0626c8 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -87,6 +87,8 @@ pub mod failpoint_support; pub mod yielding_loop; +pub mod zstd; + /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// /// we have several cases: diff --git a/libs/utils/src/lock_file.rs b/libs/utils/src/lock_file.rs index 987b9d9ad2..59c66ca757 100644 --- a/libs/utils/src/lock_file.rs +++ b/libs/utils/src/lock_file.rs @@ -63,6 +63,7 @@ impl UnwrittenLockFile { pub fn create_exclusive(lock_file_path: &Utf8Path) -> anyhow::Result { let lock_file = fs::OpenOptions::new() .create(true) // O_CREAT + .truncate(true) .write(true) .open(lock_file_path) .context("open lock file")?; diff --git a/libs/utils/src/pageserver_feedback.rs b/libs/utils/src/pageserver_feedback.rs index bc8fa7362e..3ddfa44f41 100644 --- a/libs/utils/src/pageserver_feedback.rs +++ b/libs/utils/src/pageserver_feedback.rs @@ -29,12 +29,10 @@ pub struct PageserverFeedback { // Serialize with RFC3339 format. #[serde(with = "serde_systemtime")] pub replytime: SystemTime, + /// Used to track feedbacks from different shards. Always zero for unsharded tenants. + pub shard_number: u32, } -// NOTE: Do not forget to increment this number when adding new fields to PageserverFeedback. -// Do not remove previously available fields because this might be backwards incompatible. -pub const PAGESERVER_FEEDBACK_FIELDS_NUMBER: u8 = 5; - impl PageserverFeedback { pub fn empty() -> PageserverFeedback { PageserverFeedback { @@ -43,6 +41,7 @@ impl PageserverFeedback { remote_consistent_lsn: Lsn::INVALID, disk_consistent_lsn: Lsn::INVALID, replytime: *PG_EPOCH, + shard_number: 0, } } @@ -59,17 +58,26 @@ impl PageserverFeedback { // // TODO: change serialized fields names once all computes migrate to rename. pub fn serialize(&self, buf: &mut BytesMut) { - buf.put_u8(PAGESERVER_FEEDBACK_FIELDS_NUMBER); // # of keys + let buf_ptr = buf.len(); + buf.put_u8(0); // # of keys, will be filled later + let mut nkeys = 0; + + nkeys += 1; buf.put_slice(b"current_timeline_size\0"); buf.put_i32(8); buf.put_u64(self.current_timeline_size); + nkeys += 1; buf.put_slice(b"ps_writelsn\0"); buf.put_i32(8); buf.put_u64(self.last_received_lsn.0); + + nkeys += 1; buf.put_slice(b"ps_flushlsn\0"); buf.put_i32(8); buf.put_u64(self.disk_consistent_lsn.0); + + nkeys += 1; buf.put_slice(b"ps_applylsn\0"); buf.put_i32(8); buf.put_u64(self.remote_consistent_lsn.0); @@ -80,9 +88,19 @@ impl PageserverFeedback { .expect("failed to serialize pg_replytime earlier than PG_EPOCH") .as_micros() as i64; + nkeys += 1; buf.put_slice(b"ps_replytime\0"); buf.put_i32(8); buf.put_i64(timestamp); + + if self.shard_number > 0 { + nkeys += 1; + buf.put_slice(b"shard_number\0"); + buf.put_i32(4); + buf.put_u32(self.shard_number); + } + + buf[buf_ptr] = nkeys; } // Deserialize PageserverFeedback message @@ -125,9 +143,8 @@ impl PageserverFeedback { } b"shard_number" => { let len = buf.get_i32(); - // TODO: this will be implemented in the next update, - // for now, we just skip the value. 
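// [Editor's sketch] The rewritten PageserverFeedback::serialize above uses a
// length-prefix backfill: reserve one byte for the key count, append a variable
// number of keys, then patch the count in at the end. A trimmed, self-contained
// version of the same pattern; requires the `bytes` crate, as in the real code.
use bytes::{BufMut, BytesMut};

fn serialize_kv(buf: &mut BytesMut, shard_number: u32) {
    let buf_ptr = buf.len();
    buf.put_u8(0); // number of keys, filled in below
    let mut nkeys = 0u8;

    nkeys += 1;
    buf.put_slice(b"ps_writelsn\0");
    buf.put_i32(8);
    buf.put_u64(0x1234);

    // Optional key: only emitted for sharded tenants, which is why the key
    // count can no longer be a compile-time constant.
    if shard_number > 0 {
        nkeys += 1;
        buf.put_slice(b"shard_number\0");
        buf.put_i32(4);
        buf.put_u32(shard_number);
    }

    buf[buf_ptr] = nkeys;
}

fn main() {
    let mut buf = BytesMut::new();
    serialize_kv(&mut buf, 3);
    assert_eq!(buf[0], 2); // one mandatory key plus the optional shard_number
}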
- buf.advance(len as usize); + assert_eq!(len, 4); + rf.shard_number = buf.get_u32(); } _ => { let len = buf.get_i32(); @@ -200,10 +217,7 @@ mod tests { rf.serialize(&mut data); // Add an extra field to the buffer and adjust number of keys - if let Some(first) = data.first_mut() { - *first = PAGESERVER_FEEDBACK_FIELDS_NUMBER + 1; - } - + data[0] += 1; data.put_slice(b"new_field_one\0"); data.put_i32(8); data.put_u64(42); diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs index a3aee45b58..8eee1f72a6 100644 --- a/libs/utils/src/sync/heavier_once_cell.rs +++ b/libs/utils/src/sync/heavier_once_cell.rs @@ -245,7 +245,7 @@ impl<'a, T> Guard<'a, T> { /// /// The permit will be on a semaphore part of the new internal value, and any following /// [`OnceCell::get_or_init`] will wait on it to complete. - pub fn take_and_deinit(&mut self) -> (T, InitPermit) { + pub fn take_and_deinit(mut self) -> (T, InitPermit) { let mut swapped = Inner::default(); let sem = swapped.init_semaphore.clone(); // acquire and forget right away, moving the control over to InitPermit @@ -543,7 +543,7 @@ mod tests { target.set(42, permit); let (_answer, permit) = { - let mut guard = target + let guard = target .get_or_init(|permit| async { Ok::<_, Infallible>((11, permit)) }) .await .unwrap(); diff --git a/libs/utils/src/vec_map.rs b/libs/utils/src/vec_map.rs index 9953b447c8..18b2af14f1 100644 --- a/libs/utils/src/vec_map.rs +++ b/libs/utils/src/vec_map.rs @@ -1,27 +1,60 @@ use std::{alloc::Layout, cmp::Ordering, ops::RangeBounds}; +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum VecMapOrdering { + Greater, + GreaterOrEqual, +} + /// Ordered map datastructure implemented in a Vec. /// Append only - can only add keys that are larger than the /// current max key. +/// Ordering can be adjusted using [`VecMapOrdering`] +/// during `VecMap` construction. 
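// [Editor's sketch] What the new ordering parameter permits: under the default
// Greater ordering duplicate keys are rejected, while GreaterOrEqual (added
// here) admits runs of equal keys, which the from_iter tests below rely on.
// Trimmed stand-in for the real VecMap, just to keep this self-contained.
enum VecMapOrdering {
    Greater,
    GreaterOrEqual,
}

struct VecMap<K, V> {
    data: Vec<(K, V)>,
    ordering: VecMapOrdering,
}

impl<K: Ord, V> VecMap<K, V> {
    fn new(ordering: VecMapOrdering) -> Self {
        Self { data: Vec::new(), ordering }
    }

    fn append(&mut self, key: K, value: V) -> Result<(), &'static str> {
        if let Some((last, _)) = self.data.last() {
            let ok = match self.ordering {
                VecMapOrdering::Greater => key > *last,
                VecMapOrdering::GreaterOrEqual => key >= *last,
            };
            if !ok {
                return Err("key violates ordering constraint");
            }
        }
        self.data.push((key, value));
        Ok(())
    }
}

fn main() {
    let mut m = VecMap::new(VecMapOrdering::GreaterOrEqual);
    m.append(1, ()).unwrap();
    m.append(1, ()).unwrap(); // equal keys allowed under GreaterOrEqual
    assert!(m.append(0, ()).is_err());
}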
#[derive(Clone, Debug)] -pub struct VecMap<K, V>(Vec<(K, V)>); +pub struct VecMap<K, V> { + data: Vec<(K, V)>, + ordering: VecMapOrdering, +} impl<K, V> Default for VecMap<K, V> { fn default() -> Self { - VecMap(Default::default()) + VecMap { + data: Default::default(), + ordering: VecMapOrdering::Greater, + } } } -#[derive(Debug)] -pub struct InvalidKey; +#[derive(thiserror::Error, Debug)] +pub enum VecMapError { + #[error("Key violates ordering constraint")] + InvalidKey, + #[error("Mismatched ordering constraints")] + ExtendOrderingError, +} impl<K: Ord, V> VecMap<K, V> { + pub fn new(ordering: VecMapOrdering) -> Self { + Self { + data: Vec::new(), + ordering, + } + } + + pub fn with_capacity(capacity: usize, ordering: VecMapOrdering) -> Self { + Self { + data: Vec::with_capacity(capacity), + ordering, + } + } + pub fn is_empty(&self) -> bool { - self.0.is_empty() + self.data.is_empty() } pub fn as_slice(&self) -> &[(K, V)] { - self.0.as_slice() + self.data.as_slice() } /// This function may panic if given a range where the lower bound is @@ -29,7 +62,7 @@ impl<K: Ord, V> VecMap<K, V> { pub fn slice_range<R: RangeBounds<K>>(&self, range: R) -> &[(K, V)] { use std::ops::Bound::*; - let binary_search = |k: &K| self.0.binary_search_by_key(&k, extract_key); + let binary_search = |k: &K| self.data.binary_search_by_key(&k, extract_key); let start_idx = match range.start_bound() { Unbounded => 0, @@ -41,7 +74,7 @@ impl<K: Ord, V> VecMap<K, V> { }; let end_idx = match range.end_bound() { - Unbounded => self.0.len(), + Unbounded => self.data.len(), Included(k) => match binary_search(k) { Ok(idx) => idx + 1, Err(idx) => idx, @@ -49,34 +82,30 @@ impl<K: Ord, V> VecMap<K, V> { Excluded(k) => binary_search(k).unwrap_or_else(std::convert::identity), }; - &self.0[start_idx..end_idx] + &self.data[start_idx..end_idx] } /// Add a key value pair to the map. - /// If `key` is less than or equal to the current maximum key - /// the pair will not be added and InvalidKey error will be returned. - pub fn append(&mut self, key: K, value: V) -> Result<usize, InvalidKey> { - if let Some((last_key, _last_value)) = self.0.last() { - if &key <= last_key { - return Err(InvalidKey); - } - } + /// If `key` does not respect the ordering of `self`, the + /// pair will not be added and an `InvalidKey` error will be returned. + pub fn append(&mut self, key: K, value: V) -> Result<usize, VecMapError> { + self.validate_key_order(&key)?; let delta_size = self.instrument_vec_op(|vec| vec.push((key, value))); Ok(delta_size) } /// Update the maximum key value pair or add a new key value pair to the map. - /// If `key` is less than the current maximum key no updates or additions - /// will occur and InvalidKey error will be returned. + /// If `key` does not respect the ordering of `self`, no updates or additions + /// will occur and an `InvalidKey` error will be returned.
pub fn append_or_update_last( &mut self, key: K, mut value: V, - ) -> Result<(Option<V>, usize), InvalidKey> { - if let Some((last_key, last_value)) = self.0.last_mut() { + ) -> Result<(Option<V>, usize), VecMapError> { + if let Some((last_key, last_value)) = self.data.last_mut() { match key.cmp(last_key) { - Ordering::Less => return Err(InvalidKey), + Ordering::Less => return Err(VecMapError::InvalidKey), Ordering::Equal => { std::mem::swap(last_value, &mut value); const DELTA_SIZE: usize = 0; @@ -100,40 +129,67 @@ impl<K: Ord, V> VecMap<K, V> { V: Clone, { let split_idx = self - .0 + .data .binary_search_by_key(&cutoff, extract_key) .unwrap_or_else(std::convert::identity); ( - VecMap(self.0[..split_idx].to_vec()), - VecMap(self.0[split_idx..].to_vec()), + VecMap { + data: self.data[..split_idx].to_vec(), + ordering: self.ordering, + }, + VecMap { + data: self.data[split_idx..].to_vec(), + ordering: self.ordering, + }, ) } /// Move items from `other` to the end of `self`, leaving `other` empty. - /// If any keys in `other` is less than or equal to any key in `self`, - /// `InvalidKey` error will be returned and no mutation will occur. - pub fn extend(&mut self, other: &mut Self) -> Result<usize, InvalidKey> { - let self_last_opt = self.0.last().map(extract_key); - let other_first_opt = other.0.last().map(extract_key); + /// If the `other` ordering is different from the `self` ordering, an + /// `ExtendOrderingError` will be returned. + /// If any key in `other` does not respect the ordering defined in + /// `self`, an `InvalidKey` error will be returned and no mutation will occur. + pub fn extend(&mut self, other: &mut Self) -> Result<usize, VecMapError> { + if self.ordering != other.ordering { + return Err(VecMapError::ExtendOrderingError); + } - if let (Some(self_last), Some(other_first)) = (self_last_opt, other_first_opt) { - if self_last >= other_first { - return Err(InvalidKey); + let other_first_opt = other.data.last().map(extract_key); + if let Some(other_first) = other_first_opt { + self.validate_key_order(other_first)?; + } + + let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.data)); + Ok(delta_size) + } + + /// Validate the key being inserted against the current last key in + /// `self`, per the order defined in `self`. + fn validate_key_order(&self, key: &K) -> Result<(), VecMapError> { + if let Some(last_key) = self.data.last().map(extract_key) { + match (&self.ordering, &key.cmp(last_key)) { + (VecMapOrdering::Greater, Ordering::Less | Ordering::Equal) => { + return Err(VecMapError::InvalidKey); + } + (VecMapOrdering::Greater, Ordering::Greater) => {} + (VecMapOrdering::GreaterOrEqual, Ordering::Less) => { + return Err(VecMapError::InvalidKey); + } + (VecMapOrdering::GreaterOrEqual, Ordering::Equal | Ordering::Greater) => {} + } } - let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.0)); - Ok(delta_size) + Ok(()) } /// Instrument an operation on the underlying [`Vec`]. /// Will panic if the operation decreases capacity. /// Returns the increase in memory usage caused by the op.
fn instrument_vec_op(&mut self, op: impl FnOnce(&mut Vec<(K, V)>)) -> usize { - let old_cap = self.0.capacity(); - op(&mut self.0); - let new_cap = self.0.capacity(); + let old_cap = self.data.capacity(); + op(&mut self.data); + let new_cap = self.data.capacity(); match old_cap.cmp(&new_cap) { Ordering::Less => { @@ -145,6 +201,36 @@ impl<K: Ord, V> VecMap<K, V> { Ordering::Greater => panic!("VecMap capacity shouldn't ever decrease"), } } + + /// Similar to `from_iter` defined in the `FromIterator` trait, except + /// that it accepts a [`VecMapOrdering`] + pub fn from_iter<I: IntoIterator<Item = (K, V)>>(iter: I, ordering: VecMapOrdering) -> Self { + let iter = iter.into_iter(); + let initial_capacity = { + match iter.size_hint() { + (lower_bound, None) => lower_bound, + (_, Some(upper_bound)) => upper_bound, + } + }; + + let mut vec_map = VecMap::with_capacity(initial_capacity, ordering); + for (key, value) in iter { + vec_map + .append(key, value) + .expect("The passed collection needs to be sorted!"); + } + + vec_map + } +} + +impl<K, V> IntoIterator for VecMap<K, V> { + type Item = (K, V); + type IntoIter = std::vec::IntoIter<(K, V)>; + + fn into_iter(self) -> Self::IntoIter { + self.data.into_iter() + } } fn extract_key<K, V>(entry: &(K, V)) -> &K { @@ -155,7 +241,7 @@ fn extract_key<K, V>(entry: &(K, V)) -> &K { mod tests { use std::{collections::BTreeMap, ops::Bound}; - use super::VecMap; + use super::{VecMap, VecMapOrdering}; #[test] fn unbounded_range() { @@ -310,5 +396,59 @@ mod tests { left.extend(&mut one_map).unwrap_err(); assert_eq!(left.as_slice(), &[(0, ()), (1, ())]); assert_eq!(one_map.as_slice(), &[(1, ())]); + + let mut map_greater_or_equal = VecMap::new(VecMapOrdering::GreaterOrEqual); + map_greater_or_equal.append(2, ()).unwrap(); + map_greater_or_equal.append(2, ()).unwrap(); + + left.extend(&mut map_greater_or_equal).unwrap_err(); + assert_eq!(left.as_slice(), &[(0, ()), (1, ())]); + assert_eq!(map_greater_or_equal.as_slice(), &[(2, ()), (2, ())]); + } + + #[test] + fn extend_with_ordering() { + let mut left = VecMap::new(VecMapOrdering::GreaterOrEqual); + left.append(0, ()).unwrap(); + assert_eq!(left.as_slice(), &[(0, ())]); + + let mut greater_right = VecMap::new(VecMapOrdering::Greater); + greater_right.append(0, ()).unwrap(); + left.extend(&mut greater_right).unwrap_err(); + assert_eq!(left.as_slice(), &[(0, ())]); + + let mut greater_or_equal_right = VecMap::new(VecMapOrdering::GreaterOrEqual); + greater_or_equal_right.append(2, ()).unwrap(); + greater_or_equal_right.append(2, ()).unwrap(); + left.extend(&mut greater_or_equal_right).unwrap(); + assert_eq!(left.as_slice(), &[(0, ()), (2, ()), (2, ())]); + } + + #[test] + fn vec_map_from_sorted() { + let vec = vec![(1, ()), (2, ()), (3, ()), (6, ())]; + let vec_map = VecMap::from_iter(vec, VecMapOrdering::Greater); + assert_eq!(vec_map.as_slice(), &[(1, ()), (2, ()), (3, ()), (6, ())]); + + let vec = vec![(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())]; + let vec_map = VecMap::from_iter(vec, VecMapOrdering::GreaterOrEqual); + assert_eq!( + vec_map.as_slice(), + &[(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())] + ); + } + + #[test] + #[should_panic] + fn vec_map_from_unsorted_greater() { + let vec = vec![(1, ()), (2, ()), (2, ()), (3, ()), (6, ())]; + let _ = VecMap::from_iter(vec, VecMapOrdering::Greater); + } + + #[test] + #[should_panic] + fn vec_map_from_unsorted_greater_or_equal() { + let vec = vec![(1, ()), (2, ()), (3, ()), (6, ()), (5, ())]; + let _ = VecMap::from_iter(vec, VecMapOrdering::GreaterOrEqual); } }
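Taken together, a brief standalone sketch of the ordering-aware API this diff introduces (mirrors the semantics exercised by the tests above; assumes the `utils` crate is in scope):

```rust
use utils::vec_map::{VecMap, VecMapOrdering};

fn main() {
    // Strictly increasing keys: appending an equal key is rejected.
    let mut strict: VecMap<u32, &str> = VecMap::new(VecMapOrdering::Greater);
    strict.append(1, "a").unwrap();
    assert!(strict.append(1, "b").is_err()); // VecMapError::InvalidKey

    // Non-decreasing keys: equal keys may repeat, which is what the
    // LSN-ordered batches in pgdatadir_mapping (later in this diff) rely on.
    let mut lax = VecMap::new(VecMapOrdering::GreaterOrEqual);
    lax.append(1, "a").unwrap();
    lax.append(1, "b").unwrap();
    assert_eq!(lax.as_slice(), &[(1, "a"), (1, "b")]);

    // extend() refuses to mix the two orderings.
    assert!(strict.extend(&mut lax).is_err()); // VecMapError::ExtendOrderingError
}
```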
diff --git a/libs/utils/src/zstd.rs b/libs/utils/src/zstd.rs new file mode 100644 index 0000000000..be2dcc00f5 --- /dev/null +++ b/libs/utils/src/zstd.rs @@ -0,0 +1,78 @@ +use std::io::SeekFrom; + +use anyhow::{Context, Result}; +use async_compression::{ + tokio::{bufread::ZstdDecoder, write::ZstdEncoder}, + zstd::CParameter, + Level, +}; +use camino::Utf8Path; +use nix::NixPath; +use tokio::{ + fs::{File, OpenOptions}, + io::AsyncBufRead, + io::AsyncSeekExt, + io::AsyncWriteExt, +}; +use tokio_tar::{Archive, Builder, HeaderMode}; +use walkdir::WalkDir; + +/// Creates a Zstandard tarball. +pub async fn create_zst_tarball(path: &Utf8Path, tarball: &Utf8Path) -> Result<(File, u64)> { + let file = OpenOptions::new() + .create(true) + .truncate(true) + .read(true) + .write(true) + .open(&tarball) + .await + .with_context(|| format!("tempfile creation {tarball}"))?; + + let mut paths = Vec::new(); + for entry in WalkDir::new(path) { + let entry = entry?; + let metadata = entry.metadata().expect("error getting dir entry metadata"); + // Also allow directories so that we also get empty directories + if !(metadata.is_file() || metadata.is_dir()) { + continue; + } + let path = entry.into_path(); + paths.push(path); + } + // Do a sort to get a more consistent listing + paths.sort_unstable(); + let zstd = ZstdEncoder::with_quality_and_params( + file, + Level::Default, + &[CParameter::enable_long_distance_matching(true)], + ); + let mut builder = Builder::new(zstd); + // Use reproducible header mode + builder.mode(HeaderMode::Deterministic); + for p in paths { + let rel_path = p.strip_prefix(path)?; + if rel_path.is_empty() { + // The top directory should not be compressed, + // the tar crate doesn't like that + continue; + } + builder.append_path_with_name(&p, rel_path).await?; + } + let mut zstd = builder.into_inner().await?; + zstd.shutdown().await?; + let mut compressed = zstd.into_inner(); + let compressed_len = compressed.metadata().await?.len(); + compressed.seek(SeekFrom::Start(0)).await?; + Ok((compressed, compressed_len)) +} +
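A minimal round-trip sketch of these two helpers (paths are hypothetical; `create_zst_tarball` hands back the tarball file already seeked to the start, so it can be fed straight into `extract_zst_tarball`):

```rust
use camino::Utf8Path;
use tokio::io::BufReader;
use utils::zstd::{create_zst_tarball, extract_zst_tarball};

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // Pack a directory into a zstd-compressed tarball (hypothetical paths).
    let (tarball, len) =
        create_zst_tarball(Utf8Path::new("some_dir"), Utf8Path::new("some_dir.tar.zst")).await?;
    println!("wrote {len} bytes");

    // The returned handle is positioned at the start, so unpack it directly.
    extract_zst_tarball(Utf8Path::new("restored_dir"), BufReader::new(tarball)).await?;
    Ok(())
}
```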
+/// Extracts a Zstandard tarball. +pub async fn extract_zst_tarball( + path: &Utf8Path, + tarball: impl AsyncBufRead + Unpin, +) -> Result<()> { + let decoder = Box::pin(ZstdDecoder::new(tarball)); + let mut archive = Archive::new(decoder); + archive.unpack(path).await?; + Ok(()) +} diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index f5ed6ebb97..906302e46e 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -324,11 +324,11 @@ extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) { } } -extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer) { +extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, sk: *mut Safekeeper) { unsafe { let callback_data = (*(*wp).config).callback_data; let api = callback_data as *mut Box<dyn ApiImpl>; - (*api).process_safekeeper_feedback(&mut (*wp)) + (*api).process_safekeeper_feedback(&mut (*wp), &mut (*sk)); } } diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index 734967da3f..14cc3e05a2 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -142,7 +142,7 @@ pub trait ApiImpl { todo!() } - fn process_safekeeper_feedback(&mut self, _wp: &mut WalProposer) { + fn process_safekeeper_feedback(&mut self, _wp: &mut WalProposer, _sk: &mut Safekeeper) { todo!() } diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index 47c8bd75c6..3efad546a6 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -1,160 +1,156 @@ -//! Simple benchmarking around walredo. +//! Quantify a single walredo manager's throughput under N concurrent callers. //! -//! Right now they hope to just set a baseline. Later we can try to expand into latency and -//! throughput after figuring out the coordinated omission problems below. +//! The benchmark implementation ([`bench_impl`]) is parametrized by +//! - `redo_work` => [`Request::short_input`] or [`Request::medium_input`] +//! - `n_redos` => number of times the benchmark shall execute the `redo_work` +//! - `nclients` => number of clients (more on this shortly). //! -//! There are two sets of inputs; `short` and `medium`. They were collected on postgres v14 by -//! logging what happens when a sequential scan is requested on a small table, then picking out two -//! suitable from logs. +//! The benchmark impl sets up a multi-threaded tokio runtime with default parameters. +//! It spawns `nclients` [`client`] tokio tasks. +//! Each task executes the `redo_work` `n_redos/nclients` times. //! +//! We exercise the following combinations: +//! - `redo_work = short / medium` +//! - `nclients = [1, 2, 4, 8, 16, 32, 64, 128]` //! -//! Reference data (git blame to see commit) on an i3en.3xlarge -// ```text -//! short/short/1 time: [39.175 µs 39.348 µs 39.536 µs] -//! short/short/2 time: [51.227 µs 51.487 µs 51.755 µs] -//! short/short/4 time: [76.048 µs 76.362 µs 76.674 µs] -//! short/short/8 time: [128.94 µs 129.82 µs 130.74 µs] -//! short/short/16 time: [227.84 µs 229.00 µs 230.28 µs] -//! short/short/32 time: [455.97 µs 457.81 µs 459.90 µs] -//! short/short/64 time: [902.46 µs 904.84 µs 907.32 µs] -//! short/short/128 time: [1.7416 ms 1.7487 ms 1.7561 ms] -//! `` - -use std::sync::Arc; +//! We let `criterion` determine the `n_redos` using `iter_custom`. +//! The idea is that for each `(redo_work, nclients)` combination, +//! criterion will run the `bench_impl` multiple times with different `n_redos`. +//!
The `bench_impl` reports the aggregate wall clock time from the clients' perspective. +//! Criterion will divide that by `n_redos` to compute the "time per iteration". +//! In our case, "time per iteration" means "time per redo_work execution". +//! +//! NB: the way by which `iter_custom` determines the "number of iterations" +//! is called sampling. Apparently the idea here is to detect outliers. +//! We're not sure whether the current choice of sampling method makes sense. +//! See https://bheisler.github.io/criterion.rs/book/user_guide/command_line_output.html#collecting-samples +//! +//! # Reference Numbers +//! +//! 2024-03-20 on i3en.3xlarge +//! +//! ```text +//! short/1 time: [26.483 µs 26.614 µs 26.767 µs] +//! short/2 time: [32.223 µs 32.465 µs 32.767 µs] +//! short/4 time: [47.203 µs 47.583 µs 47.984 µs] +//! short/8 time: [89.135 µs 89.612 µs 90.139 µs] +//! short/16 time: [190.12 µs 191.52 µs 192.88 µs] +//! short/32 time: [380.96 µs 382.63 µs 384.20 µs] +//! short/64 time: [736.86 µs 741.07 µs 745.03 µs] +//! short/128 time: [1.4106 ms 1.4206 ms 1.4294 ms] +//! medium/1 time: [111.81 µs 112.25 µs 112.79 µs] +//! medium/2 time: [158.26 µs 159.13 µs 160.21 µs] +//! medium/4 time: [334.65 µs 337.14 µs 340.07 µs] +//! medium/8 time: [675.32 µs 679.91 µs 685.25 µs] +//! medium/16 time: [1.2929 ms 1.2996 ms 1.3067 ms] +//! medium/32 time: [2.4295 ms 2.4461 ms 2.4623 ms] +//! medium/64 time: [4.3973 ms 4.4458 ms 4.4875 ms] +//! medium/128 time: [7.5955 ms 7.7847 ms 7.9481 ms] +//! ``` use bytes::{Buf, Bytes}; -use pageserver::{ - config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager, +use criterion::{BenchmarkId, Criterion}; +use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager}; +use pageserver_api::{key::Key, shard::TenantShardId}; +use std::{ + sync::Arc, + time::{Duration, Instant}, }; -use pageserver_api::shard::TenantShardId; -use tokio::task::JoinSet; +use tokio::{sync::Barrier, task::JoinSet}; use utils::{id::TenantId, lsn::Lsn}; -use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; +fn bench(c: &mut Criterion) { + { + let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; + for nclients in nclients { + let mut group = c.benchmark_group("short"); + group.bench_with_input( + BenchmarkId::from_parameter(nclients), + &nclients, + |b, nclients| { + let redo_work = Arc::new(Request::short_input()); + b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients)); + }, + ); + } + } -fn redo_scenarios(c: &mut Criterion) { - // logging should be enabled when adding more inputs, since walredo will only report malformed - // input to the stderr. - // utils::logging::init(utils::logging::LogFormat::Plain).unwrap(); + { + let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; + for nclients in nclients { + let mut group = c.benchmark_group("medium"); + group.bench_with_input( + BenchmarkId::from_parameter(nclients), + &nclients, + |b, nclients| { + let redo_work = Arc::new(Request::medium_input()); + b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients)); + }, + ); + } + } +} +criterion::criterion_group!(benches, bench); +criterion::criterion_main!(benches); +// Returns the sum of each client's wall-clock time spent executing their share of the n_redos. 
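For reference, here is the `iter_custom` contract in isolation (a minimal standalone sketch, not part of this diff): criterion picks the iteration count, the closure returns the total measured wall-clock time, and criterion divides one by the other to report time per iteration.

```rust
use criterion::{criterion_group, criterion_main, Criterion};
use std::time::Instant;

fn bench(c: &mut Criterion) {
    c.bench_function("example", |b| {
        b.iter_custom(|iters| {
            let start = Instant::now();
            for _ in 0..iters {
                // stand-in for one unit of work, e.g. one redo_work execution
                std::hint::black_box(41u64 + 1);
            }
            start.elapsed() // criterion divides this total by `iters`
        })
    });
}

criterion_group!(benches, bench);
criterion_main!(benches);
```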
+fn bench_impl(redo_work: Arc, n_redos: u64, nclients: u64) -> Duration { let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap(); let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); let conf = Box::leak(Box::new(conf)); let tenant_shard_id = TenantShardId::unsharded(TenantId::generate()); - let manager = PostgresRedoManager::new(conf, tenant_shard_id); - - let manager = Arc::new(manager); - - { - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap(); - tracing::info!("executing first"); - rt.block_on(short().execute(&manager)).unwrap(); - tracing::info!("first executed"); - } - - let thread_counts = [1, 2, 4, 8, 16, 32, 64, 128]; - - let mut group = c.benchmark_group("short"); - group.sampling_mode(criterion::SamplingMode::Flat); - - for thread_count in thread_counts { - group.bench_with_input( - BenchmarkId::new("short", thread_count), - &thread_count, - |b, thread_count| { - add_multithreaded_walredo_requesters(b, *thread_count, &manager, short); - }, - ); - } - drop(group); - - let mut group = c.benchmark_group("medium"); - group.sampling_mode(criterion::SamplingMode::Flat); - - for thread_count in thread_counts { - group.bench_with_input( - BenchmarkId::new("medium", thread_count), - &thread_count, - |b, thread_count| { - add_multithreaded_walredo_requesters(b, *thread_count, &manager, medium); - }, - ); - } - drop(group); -} - -/// Sets up a multi-threaded tokio runtime with default worker thread count, -/// then, spawn `requesters` tasks that repeatedly: -/// - get input from `input_factor()` -/// - call `manager.request_redo()` with their input -/// -/// This stress-tests the scalability of a single walredo manager at high tokio-level concurrency. -/// -/// Using tokio's default worker thread count means the results will differ on machines -/// with different core countrs. We don't care about that, the performance will always -/// be different on different hardware. To compare performance of different software versions, -/// use the same hardware. 
-fn add_multithreaded_walredo_requesters( - b: &mut criterion::Bencher, - nrequesters: usize, - manager: &Arc, - input_factory: fn() -> Request, -) { - assert_ne!(nrequesters, 0); - let rt = tokio::runtime::Builder::new_multi_thread() .enable_all() .build() .unwrap(); - let barrier = Arc::new(tokio::sync::Barrier::new(nrequesters + 1)); + let start = Arc::new(Barrier::new(nclients as usize)); - let mut requesters = JoinSet::new(); - for _ in 0..nrequesters { - let _entered = rt.enter(); - let manager = manager.clone(); - let barrier = barrier.clone(); - requesters.spawn(async move { - loop { - let input = input_factory(); - barrier.wait().await; - let page = input.execute(&manager).await.unwrap(); - assert_eq!(page.remaining(), 8192); - barrier.wait().await; - } + let mut tasks = JoinSet::new(); + + let manager = PostgresRedoManager::new(conf, tenant_shard_id); + let manager = Arc::new(manager); + + for _ in 0..nclients { + rt.block_on(async { + tasks.spawn(client( + Arc::clone(&manager), + Arc::clone(&start), + Arc::clone(&redo_work), + // divide the amount of work equally among the clients + n_redos / nclients, + )) }); } - let do_one_iteration = || { - rt.block_on(async { - barrier.wait().await; - // wait for work to complete - barrier.wait().await; - }) - }; - - b.iter_batched( - || { - // warmup - do_one_iteration(); - }, - |()| { - // work loop - do_one_iteration(); - }, - criterion::BatchSize::PerIteration, - ); - - rt.block_on(requesters.shutdown()); + rt.block_on(async move { + let mut total_wallclock_time = std::time::Duration::from_millis(0); + while let Some(res) = tasks.join_next().await { + total_wallclock_time += res.unwrap(); + } + total_wallclock_time + }) } -criterion_group!(benches, redo_scenarios); -criterion_main!(benches); +async fn client( + mgr: Arc, + start: Arc, + redo_work: Arc, + n_redos: u64, +) -> Duration { + start.wait().await; + let start = Instant::now(); + for _ in 0..n_redos { + let page = redo_work.execute(&mgr).await.unwrap(); + assert_eq!(page.remaining(), 8192); + // The real pageserver will rarely if ever do 2 walredos in a row without + // yielding to the executor. + tokio::task::yield_now().await; + } + start.elapsed() +} macro_rules! lsn { ($input:expr) => {{ @@ -166,12 +162,46 @@ macro_rules! lsn { }}; } -/// Short payload, 1132 bytes. -// pg_records are copypasted from log, where they are put with Debug impl of Bytes, which uses \0 -// for null bytes. -#[allow(clippy::octal_escapes)] -fn short() -> Request { - Request { +/// Simple wrapper around `WalRedoManager::request_redo`. +/// +/// In benchmarks this is cloned around. +#[derive(Clone)] +struct Request { + key: Key, + lsn: Lsn, + base_img: Option<(Lsn, Bytes)>, + records: Vec<(Lsn, NeonWalRecord)>, + pg_version: u32, +} + +impl Request { + async fn execute(&self, manager: &PostgresRedoManager) -> anyhow::Result { + let Request { + key, + lsn, + base_img, + records, + pg_version, + } = self; + + // TODO: avoid these clones + manager + .request_redo(*key, *lsn, base_img.clone(), records.clone(), *pg_version) + .await + } + + fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord { + let rec = Bytes::from_static(bytes); + NeonWalRecord::Postgres { will_init, rec } + } + + /// Short payload, 1132 bytes. + // pg_records are copypasted from log, where they are put with Debug impl of Bytes, which uses \0 + // for null bytes. 
+ #[allow(clippy::octal_escapes)] + pub fn short_input() -> Request { + let pg_record = Self::pg_record; + Request { key: Key { field1: 0, field2: 1663, @@ -194,13 +224,14 @@ fn short() -> Request { ], pg_version: 14, } -} + } -/// Medium sized payload, serializes as 26393 bytes. -// see [`short`] -#[allow(clippy::octal_escapes)] -fn medium() -> Request { - Request { + /// Medium sized payload, serializes as 26393 bytes. + // see [`short`] + #[allow(clippy::octal_escapes)] + pub fn medium_input() -> Request { + let pg_record = Self::pg_record; + Request { key: Key { field1: 0, field2: 1663, @@ -442,37 +473,5 @@ fn medium() -> Request { ], pg_version: 14, } -} - -fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord { - let rec = Bytes::from_static(bytes); - NeonWalRecord::Postgres { will_init, rec } -} - -/// Simple wrapper around `WalRedoManager::request_redo`. -/// -/// In benchmarks this is cloned around. -#[derive(Clone)] -struct Request { - key: Key, - lsn: Lsn, - base_img: Option<(Lsn, Bytes)>, - records: Vec<(Lsn, NeonWalRecord)>, - pg_version: u32, -} - -impl Request { - async fn execute(self, manager: &PostgresRedoManager) -> anyhow::Result { - let Request { - key, - lsn, - base_img, - records, - pg_version, - } = self; - - manager - .request_redo(key, lsn, base_img, records, pg_version) - .await } } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 1fd7c775d5..f4a231f217 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -15,9 +15,9 @@ use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp}; use pageserver::control_plane_client::ControlPlaneClient; use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; -use pageserver::task_mgr::WALRECEIVER_RUNTIME; use pageserver::tenant::{secondary, TenantSharedResources}; use remote_storage::GenericRemoteStorage; +use tokio::signal::unix::SignalKind; use tokio::time::Instant; use tracing::*; @@ -28,7 +28,7 @@ use pageserver::{ deletion_queue::DeletionQueue, http, page_cache, page_service, task_mgr, task_mgr::TaskKind, - task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME}, + task_mgr::THE_RUNTIME, tenant::mgr, virtual_file, }; @@ -323,7 +323,7 @@ fn start_pageserver( // Launch broker client // The storage_broker::connect call needs to happen inside a tokio runtime thread. - let broker_client = WALRECEIVER_RUNTIME + let broker_client = THE_RUNTIME .block_on(async { // Note: we do not attempt connecting here (but validate endpoints sanity). storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval) @@ -391,7 +391,7 @@ fn start_pageserver( conf, ); if let Some(deletion_workers) = deletion_workers { - deletion_workers.spawn_with(BACKGROUND_RUNTIME.handle()); + deletion_workers.spawn_with(THE_RUNTIME.handle()); } // Up to this point no significant I/O has been done: this should have been fast. 
Record @@ -423,7 +423,7 @@ fn start_pageserver( // Scan the local 'tenants/' directory and start loading the tenants let deletion_queue_client = deletion_queue.new_client(); - let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr( + let tenant_manager = THE_RUNTIME.block_on(mgr::init_tenant_mgr( conf, TenantSharedResources { broker_client: broker_client.clone(), @@ -435,7 +435,7 @@ fn start_pageserver( ))?; let tenant_manager = Arc::new(tenant_manager); - BACKGROUND_RUNTIME.spawn({ + THE_RUNTIME.spawn({ let shutdown_pageserver = shutdown_pageserver.clone(); let drive_init = async move { // NOTE: unlike many futures in pageserver, this one is cancellation-safe @@ -545,7 +545,7 @@ fn start_pageserver( // Start up the service to handle HTTP mgmt API request. We created the // listener earlier already. { - let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); + let _rt_guard = THE_RUNTIME.enter(); let router_state = Arc::new( http::routes::State::new( @@ -569,7 +569,6 @@ fn start_pageserver( .with_graceful_shutdown(task_mgr::shutdown_watcher()); task_mgr::spawn( - MGMT_REQUEST_RUNTIME.handle(), TaskKind::HttpEndpointListener, None, None, @@ -594,7 +593,6 @@ fn start_pageserver( let local_disk_storage = conf.workdir.join("last_consumption_metrics.json"); task_mgr::spawn( - crate::BACKGROUND_RUNTIME.handle(), TaskKind::MetricsCollection, None, None, @@ -615,6 +613,7 @@ fn start_pageserver( pageserver::consumption_metrics::collect_metrics( metric_collection_endpoint, + &conf.metric_collection_bucket, conf.metric_collection_interval, conf.cached_metric_collection_interval, conf.synthetic_size_calculation_interval, @@ -642,7 +641,6 @@ fn start_pageserver( DownloadBehavior::Error, ); task_mgr::spawn( - COMPUTE_REQUEST_RUNTIME.handle(), TaskKind::LibpqEndpointListener, None, None, @@ -666,42 +664,37 @@ fn start_pageserver( let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard()); // All started up! Now just sit and wait for shutdown signal. - { - use signal_hook::consts::*; - let signal_handler = BACKGROUND_RUNTIME.spawn_blocking(move || { - let mut signals = - signal_hook::iterator::Signals::new([SIGINT, SIGTERM, SIGQUIT]).unwrap(); - return signals - .forever() - .next() - .expect("forever() never returns None unless explicitly closed"); - }); - let signal = BACKGROUND_RUNTIME - .block_on(signal_handler) - .expect("join error"); - match signal { - SIGQUIT => { - info!("Got signal {signal}. Terminating in immediate shutdown mode",); - std::process::exit(111); - } - SIGINT | SIGTERM => { - info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",); - // This cancels the `shutdown_pageserver` cancellation tree. - // Right now that tree doesn't reach very far, and `task_mgr` is used instead. - // The plan is to change that over time. - shutdown_pageserver.take(); - let bg_remote_storage = remote_storage.clone(); - let bg_deletion_queue = deletion_queue.clone(); - BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver( - &tenant_manager, - bg_remote_storage.map(|_| bg_deletion_queue), - 0, - )); - unreachable!() - } - _ => unreachable!(), - } + { + THE_RUNTIME.block_on(async move { + let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap(); + let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap(); + let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap(); + let signal = tokio::select! { + _ = sigquit.recv() => { + info!("Got signal SIGQUIT. 
Terminating in immediate shutdown mode",); + std::process::exit(111); + } + _ = sigint.recv() => { "SIGINT" }, + _ = sigterm.recv() => { "SIGTERM" }, + }; + + info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",); + + // This cancels the `shutdown_pageserver` cancellation tree. + // Right now that tree doesn't reach very far, and `task_mgr` is used instead. + // The plan is to change that over time. + shutdown_pageserver.take(); + let bg_remote_storage = remote_storage.clone(); + let bg_deletion_queue = deletion_queue.clone(); + pageserver::shutdown_pageserver( + &tenant_manager, + bg_remote_storage.map(|_| bg_deletion_queue), + 0, + ) + .await; + unreachable!() + }) } } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 8ad9ade4a9..a29719e36f 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -234,6 +234,7 @@ pub struct PageServerConf { // How often to send unchanged cached metrics to the metrics endpoint. pub cached_metric_collection_interval: Duration, pub metric_collection_endpoint: Option, + pub metric_collection_bucket: Option, pub synthetic_size_calculation_interval: Duration, pub disk_usage_based_eviction: Option, @@ -373,6 +374,7 @@ struct PageServerConfigBuilder { cached_metric_collection_interval: BuilderValue, metric_collection_endpoint: BuilderValue>, synthetic_size_calculation_interval: BuilderValue, + metric_collection_bucket: BuilderValue>, disk_usage_based_eviction: BuilderValue>, @@ -455,6 +457,8 @@ impl PageServerConfigBuilder { .expect("cannot parse default synthetic size calculation interval")), metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT), + metric_collection_bucket: Set(None), + disk_usage_based_eviction: Set(None), test_remote_failures: Set(0), @@ -586,6 +590,13 @@ impl PageServerConfigBuilder { self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint) } + pub fn metric_collection_bucket( + &mut self, + metric_collection_bucket: Option, + ) { + self.metric_collection_bucket = BuilderValue::Set(metric_collection_bucket) + } + pub fn synthetic_size_calculation_interval( &mut self, synthetic_size_calculation_interval: Duration, @@ -694,6 +705,7 @@ impl PageServerConfigBuilder { metric_collection_interval, cached_metric_collection_interval, metric_collection_endpoint, + metric_collection_bucket, synthetic_size_calculation_interval, disk_usage_based_eviction, test_remote_failures, @@ -942,6 +954,9 @@ impl PageServerConf { let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?; builder.metric_collection_endpoint(Some(endpoint)); }, + "metric_collection_bucket" => { + builder.metric_collection_bucket(RemoteStorageConfig::from_toml(item)?) 
+ } "synthetic_size_calculation_interval" => builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?), "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?), @@ -1057,6 +1072,7 @@ impl PageServerConf { metric_collection_interval: Duration::from_secs(60), cached_metric_collection_interval: Duration::from_secs(60 * 60), metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, + metric_collection_bucket: None, synthetic_size_calculation_interval: Duration::from_secs(60), disk_usage_based_eviction: None, test_remote_failures: 0, @@ -1289,6 +1305,7 @@ background_task_maximum_delay = '334 s' defaults::DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL )?, metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, + metric_collection_bucket: None, synthetic_size_calculation_interval: humantime::parse_duration( defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL )?, @@ -1363,6 +1380,7 @@ background_task_maximum_delay = '334 s' metric_collection_interval: Duration::from_secs(222), cached_metric_collection_interval: Duration::from_secs(22200), metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?), + metric_collection_bucket: None, synthetic_size_calculation_interval: Duration::from_secs(333), disk_usage_based_eviction: None, test_remote_failures: 0, diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index c7f9d596c6..c82be8c581 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -1,12 +1,13 @@ //! Periodically collect consumption metrics for all active tenants //! and push them to a HTTP endpoint. use crate::context::{DownloadBehavior, RequestContext}; -use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; +use crate::task_mgr::{self, TaskKind}; use crate::tenant::tasks::BackgroundLoopKind; use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError, Tenant}; use camino::Utf8PathBuf; use consumption_metrics::EventType; use pageserver_api::models::TenantState; +use remote_storage::{GenericRemoteStorage, RemoteStorageConfig}; use reqwest::Url; use std::collections::HashMap; use std::sync::Arc; @@ -41,6 +42,7 @@ type Cache = HashMap; #[allow(clippy::too_many_arguments)] pub async fn collect_metrics( metric_collection_endpoint: &Url, + metric_collection_bucket: &Option, metric_collection_interval: Duration, _cached_metric_collection_interval: Duration, synthetic_size_calculation_interval: Duration, @@ -59,7 +61,6 @@ pub async fn collect_metrics( let worker_ctx = ctx.detached_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download); task_mgr::spawn( - BACKGROUND_RUNTIME.handle(), TaskKind::CalculateSyntheticSize, None, None, @@ -94,6 +95,20 @@ pub async fn collect_metrics( .build() .expect("Failed to create http client with timeout"); + let bucket_client = if let Some(bucket_config) = metric_collection_bucket { + match GenericRemoteStorage::from_config(bucket_config) { + Ok(client) => Some(client), + Err(e) => { + // Non-fatal error: if we were given an invalid config, we will proceed + // with sending metrics over the network, but not to S3. 
+ tracing::warn!("Invalid configuration for metric_collection_bucket: {e}"); + None + } + } + } else { + None + }; + let node_id = node_id.to_string(); loop { @@ -118,10 +133,18 @@ pub async fn collect_metrics( tracing::error!("failed to persist metrics to {path:?}: {e:#}"); } } + + if let Some(bucket_client) = &bucket_client { + let res = + upload::upload_metrics_bucket(bucket_client, &cancel, &node_id, &metrics).await; + if let Err(e) = res { + tracing::error!("failed to upload to S3: {e:#}"); + } + } }; let upload = async { - let res = upload::upload_metrics( + let res = upload::upload_metrics_http( &client, metric_collection_endpoint, &cancel, @@ -132,7 +155,7 @@ pub async fn collect_metrics( .await; if let Err(e) = res { // serialization error which should never happen - tracing::error!("failed to upload due to {e:#}"); + tracing::error!("failed to upload via HTTP due to {e:#}"); } }; diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs index 6b840a3136..4e8283c3e4 100644 --- a/pageserver/src/consumption_metrics/upload.rs +++ b/pageserver/src/consumption_metrics/upload.rs @@ -1,4 +1,9 @@ +use std::time::SystemTime; + +use chrono::{DateTime, Utc}; use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE}; +use remote_storage::{GenericRemoteStorage, RemotePath}; +use tokio::io::AsyncWriteExt; use tokio_util::sync::CancellationToken; use tracing::Instrument; @@ -13,8 +18,9 @@ struct Ids { pub(super) timeline_id: Option, } +/// Serialize and write metrics to an HTTP endpoint #[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))] -pub(super) async fn upload_metrics( +pub(super) async fn upload_metrics_http( client: &reqwest::Client, metric_collection_endpoint: &reqwest::Url, cancel: &CancellationToken, @@ -74,6 +80,60 @@ pub(super) async fn upload_metrics( Ok(()) } +/// Serialize and write metrics to a remote storage object +#[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))] +pub(super) async fn upload_metrics_bucket( + client: &GenericRemoteStorage, + cancel: &CancellationToken, + node_id: &str, + metrics: &[RawMetric], +) -> anyhow::Result<()> { + if metrics.is_empty() { + // Skip uploads if we have no metrics, so that readers don't have to handle the edge case + // of an empty object. 
+ return Ok(()); + } + + // Compose object path + let datetime: DateTime = SystemTime::now().into(); + let ts_prefix = datetime.format("year=%Y/month=%m/day=%d/%H:%M:%SZ"); + let path = RemotePath::from_string(&format!("{ts_prefix}_{node_id}.ndjson.gz"))?; + + // Set up a gzip writer into a buffer + let mut compressed_bytes: Vec = Vec::new(); + let compressed_writer = std::io::Cursor::new(&mut compressed_bytes); + let mut gzip_writer = async_compression::tokio::write::GzipEncoder::new(compressed_writer); + + // Serialize and write into compressed buffer + let started_at = std::time::Instant::now(); + for res in serialize_in_chunks(CHUNK_SIZE, metrics, node_id) { + let (_chunk, body) = res?; + gzip_writer.write_all(&body).await?; + } + gzip_writer.flush().await?; + gzip_writer.shutdown().await?; + let compressed_length = compressed_bytes.len(); + + // Write to remote storage + client + .upload_storage_object( + futures::stream::once(futures::future::ready(Ok(compressed_bytes.into()))), + compressed_length, + &path, + cancel, + ) + .await?; + let elapsed = started_at.elapsed(); + + tracing::info!( + compressed_length, + elapsed_ms = elapsed.as_millis(), + "write metrics bucket at {path}", + ); + + Ok(()) +} + // The return type is quite ugly, but we gain testability in isolation fn serialize_in_chunks<'a, F>( chunk_size: usize, diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index 1b3d76335d..55d80c2966 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -5,7 +5,8 @@ use pageserver_api::{ controller_api::NodeRegisterRequest, shard::TenantShardId, upcall_api::{ - ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse, + ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, + ValidateRequestTenant, ValidateResponse, }, }; use serde::{de::DeserializeOwned, Serialize}; @@ -37,7 +38,9 @@ pub trait ControlPlaneGenerationsApi { fn re_attach( &self, conf: &PageServerConf, - ) -> impl Future, RetryForeverError>> + Send; + ) -> impl Future< + Output = Result, RetryForeverError>, + > + Send; fn validate( &self, tenants: Vec<(TenantShardId, Generation)>, @@ -118,7 +121,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { async fn re_attach( &self, conf: &PageServerConf, - ) -> Result, RetryForeverError> { + ) -> Result, RetryForeverError> { let re_attach_path = self .base_url .join("re-attach") @@ -170,8 +173,6 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { register, }; - fail::fail_point!("control-plane-client-re-attach"); - let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?; tracing::info!( "Received re-attach response with {} tenants", @@ -181,7 +182,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { Ok(response .tenants .into_iter() - .map(|t| (t.id, Generation::new(t.gen))) + .map(|rart| (rart.id, rart)) .collect::>()) } @@ -207,7 +208,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { .collect(), }; - fail::fail_point!("control-plane-client-validate"); + crate::tenant::pausable_failpoint!("control-plane-client-validate"); let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?; diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index b6aea8fae8..e3c11cb299 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -724,8 +724,8 @@ impl DeletionQueue { mod test { use 
camino::Utf8Path; use hex_literal::hex; - use pageserver_api::shard::ShardIndex; - use std::io::ErrorKind; + use pageserver_api::{shard::ShardIndex, upcall_api::ReAttachResponseTenant}; + use std::{io::ErrorKind, time::Duration}; use tracing::info; use remote_storage::{RemoteStorageConfig, RemoteStorageKind}; @@ -834,9 +834,10 @@ mod test { async fn re_attach( &self, _conf: &PageServerConf, - ) -> Result, RetryForeverError> { + ) -> Result, RetryForeverError> { unimplemented!() } + async fn validate( &self, tenants: Vec<(TenantShardId, Generation)>, diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 92c1475aef..6b68acd1c7 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -59,7 +59,7 @@ use utils::{completion, id::TimelineId}; use crate::{ config::PageServerConf, metrics::disk_usage_based_eviction::METRICS, - task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, + task_mgr::{self, TaskKind}, tenant::{ self, mgr::TenantManager, @@ -202,7 +202,6 @@ pub fn launch_disk_usage_global_eviction_task( info!("launching disk usage based eviction task"); task_mgr::spawn( - BACKGROUND_RUNTIME.handle(), TaskKind::DiskUsageEviction, None, None, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 6d98d3f746..26f23fb8c2 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -36,6 +36,7 @@ use tokio_util::sync::CancellationToken; use tracing::*; use utils::auth::JwtAuth; use utils::failpoint_support::failpoints_handler; +use utils::http::endpoint::prometheus_metrics_handler; use utils::http::endpoint::request_span; use utils::http::json::json_request_or_empty_body; use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; @@ -885,14 +886,16 @@ async fn tenant_detach_handler( let state = get_state(&request); let conf = state.conf; - mgr::detach_tenant( - conf, - tenant_shard_id, - detach_ignored.unwrap_or(false), - &state.deletion_queue_client, - ) - .instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug())) - .await?; + state + .tenant_manager + .detach_tenant( + conf, + tenant_shard_id, + detach_ignored.unwrap_or(false), + &state.deletion_queue_client, + ) + .instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug())) + .await?; json_response(StatusCode::OK, ()) } @@ -1403,7 +1406,9 @@ async fn update_tenant_config_handler( TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?; let state = get_state(&request); - mgr::set_new_tenant_config(state.conf, tenant_conf, tenant_id) + state + .tenant_manager + .set_new_tenant_config(tenant_conf, tenant_id) .instrument(info_span!("tenant_config", %tenant_id)) .await?; @@ -1428,13 +1433,14 @@ async fn put_tenant_location_config_handler( // The `Detached` state is special, it doesn't upsert a tenant, it removes // its local disk content and drops it from memory. 
if let LocationConfigMode::Detached = request_data.config.mode { - if let Err(e) = - mgr::detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client) - .instrument(info_span!("tenant_detach", - tenant_id = %tenant_shard_id.tenant_id, - shard_id = %tenant_shard_id.shard_slug() - )) - .await + if let Err(e) = state + .tenant_manager + .detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client) + .instrument(info_span!("tenant_detach", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug() + )) + .await { match e { TenantStateError::SlotError(TenantSlotError::NotFound(_)) => { @@ -1648,8 +1654,7 @@ async fn timeline_gc_handler( let gc_req: TimelineGcRequest = json_request(&mut request).await?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let wait_task_done = - mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?; + let wait_task_done = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx)?; let gc_result = wait_task_done .await .context("wait for gc task") @@ -2262,6 +2267,7 @@ pub fn make_router( Ok(router .data(state) + .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) .get("/v1/status", |r| api_handler(r, status_handler)) .put("/v1/failpoints", |r| { testing_api_handler("manage failpoints", r, failpoints_handler) diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index d66df36b3a..343dec2ca1 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -2,28 +2,20 @@ //! Import data and WAL from a PostgreSQL data directory and WAL segments into //! a neon Timeline. //! -use std::io::SeekFrom; use std::path::{Path, PathBuf}; use anyhow::{bail, ensure, Context, Result}; -use async_compression::tokio::bufread::ZstdDecoder; -use async_compression::{tokio::write::ZstdEncoder, zstd::CParameter, Level}; use bytes::Bytes; use camino::Utf8Path; use futures::StreamExt; -use nix::NixPath; -use tokio::fs::{File, OpenOptions}; -use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}; +use tokio::io::{AsyncRead, AsyncReadExt}; use tokio_tar::Archive; -use tokio_tar::Builder; -use tokio_tar::HeaderMode; use tracing::*; use walkdir::WalkDir; use crate::context::RequestContext; use crate::metrics::WAL_INGEST; use crate::pgdatadir_mapping::*; -use crate::tenant::remote_timeline_client::INITDB_PATH; use crate::tenant::Timeline; use crate::walingest::WalIngest; use crate::walrecord::DecodedWALRecord; @@ -633,65 +625,3 @@ async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result reader.read_to_end(&mut buf).await?; Ok(Bytes::from(buf)) } - -pub async fn create_tar_zst(pgdata_path: &Utf8Path, tmp_path: &Utf8Path) -> Result<(File, u64)> { - let file = OpenOptions::new() - .create(true) - .truncate(true) - .read(true) - .write(true) - .open(&tmp_path) - .await - .with_context(|| format!("tempfile creation {tmp_path}"))?; - - let mut paths = Vec::new(); - for entry in WalkDir::new(pgdata_path) { - let entry = entry?; - let metadata = entry.metadata().expect("error getting dir entry metadata"); - // Also allow directories so that we also get empty directories - if !(metadata.is_file() || metadata.is_dir()) { - continue; - } - let path = entry.into_path(); - paths.push(path); - } - // Do a sort to get a more consistent listing - paths.sort_unstable(); - let zstd = ZstdEncoder::with_quality_and_params( - file, - Level::Default, - 
&[CParameter::enable_long_distance_matching(true)], - ); - let mut builder = Builder::new(zstd); - // Use reproducible header mode - builder.mode(HeaderMode::Deterministic); - for path in paths { - let rel_path = path.strip_prefix(pgdata_path)?; - if rel_path.is_empty() { - // The top directory should not be compressed, - // the tar crate doesn't like that - continue; - } - builder.append_path_with_name(&path, rel_path).await?; - } - let mut zstd = builder.into_inner().await?; - zstd.shutdown().await?; - let mut compressed = zstd.into_inner(); - let compressed_len = compressed.metadata().await?.len(); - const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024; - if compressed_len > INITDB_TAR_ZST_WARN_LIMIT { - warn!("compressed {INITDB_PATH} size of {compressed_len} is above limit {INITDB_TAR_ZST_WARN_LIMIT}."); - } - compressed.seek(SeekFrom::Start(0)).await?; - Ok((compressed, compressed_len)) -} - -pub async fn extract_tar_zst( - pgdata_path: &Utf8Path, - tar_zst: impl AsyncBufRead + Unpin, -) -> Result<()> { - let tar = Box::pin(ZstdDecoder::new(tar_zst)); - let mut archive = Archive::new(tar); - archive.unpack(pgdata_path).await?; - Ok(()) -} diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index f3ceb7d3e6..fa1a0f535b 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -180,7 +180,6 @@ pub async fn libpq_listener_main( // only deal with a particular timeline, but we don't know which one // yet. task_mgr::spawn( - &tokio::runtime::Handle::current(), TaskKind::PageRequestHandler, None, None, diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 727650a5a5..6f7d74bdee 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -34,6 +34,7 @@ use strum::IntoEnumIterator; use tokio_util::sync::CancellationToken; use tracing::{debug, trace, warn}; use utils::bin_ser::DeserializeError; +use utils::vec_map::{VecMap, VecMapOrdering}; use utils::{bin_ser::BeSer, lsn::Lsn}; const MAX_AUX_FILE_DELTAS: usize = 1024; @@ -1546,12 +1547,13 @@ impl<'a> DatadirModification<'a> { if !self.pending_updates.is_empty() { // The put_batch call below expects expects the inputs to be sorted by Lsn, // so we do that first. - let lsn_ordered_batch: Vec<(Key, Lsn, Value)> = self - .pending_updates - .drain() - .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (key, lsn, val))) - .kmerge_by(|lhs, rhs| lhs.1 .0 < rhs.1 .0) - .collect(); + let lsn_ordered_batch: VecMap = VecMap::from_iter( + self.pending_updates + .drain() + .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (lsn, (key, val)))) + .kmerge_by(|lhs, rhs| lhs.0 < rhs.0), + VecMapOrdering::GreaterOrEqual, + ); writer.put_batch(lsn_ordered_batch, ctx).await?; } diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 69e163effa..2d97389982 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -98,42 +98,22 @@ use utils::id::TimelineId; // other operations, if the upload tasks e.g. get blocked on locks. It shouldn't // happen, but still. // -pub static COMPUTE_REQUEST_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("compute request worker") - .enable_all() - .build() - .expect("Failed to create compute request runtime") -}); -pub static MGMT_REQUEST_RUNTIME: Lazy = Lazy::new(|| { +/// The single tokio runtime used by all pageserver code. 
+/// In the past, we had multiple runtimes, and in the future we should weed out +/// remaining references to this global field and rely on ambient runtime instead, +/// i.e., use `tokio::spawn` instead of `THE_RUNTIME.spawn()`, etc. +pub static THE_RUNTIME: Lazy = Lazy::new(|| { tokio::runtime::Builder::new_multi_thread() - .thread_name("mgmt request worker") - .enable_all() - .build() - .expect("Failed to create mgmt request runtime") -}); - -pub static WALRECEIVER_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("walreceiver worker") - .enable_all() - .build() - .expect("Failed to create walreceiver runtime") -}); - -pub static BACKGROUND_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("background op worker") // if you change the number of worker threads please change the constant below .enable_all() .build() .expect("Failed to create background op runtime") }); -pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy = Lazy::new(|| { +pub(crate) static THE_RUNTIME_WORKER_THREADS: Lazy = Lazy::new(|| { // force init and thus panics - let _ = BACKGROUND_RUNTIME.handle(); + let _ = THE_RUNTIME.handle(); // replicates tokio-1.28.1::loom::sys::num_cpus which is not available publicly // tokio would had already panicked for parsing errors or NotUnicode // @@ -325,7 +305,6 @@ struct PageServerTask { /// Note: if shutdown_process_on_error is set to true failure /// of the task will lead to shutdown of entire process pub fn spawn( - runtime: &tokio::runtime::Handle, kind: TaskKind, tenant_shard_id: Option, timeline_id: Option, @@ -354,7 +333,7 @@ where let task_name = name.to_string(); let task_cloned = Arc::clone(&task); - let join_handle = runtime.spawn(task_wrapper( + let join_handle = THE_RUNTIME.spawn(task_wrapper( task_name, task_id, task_cloned, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index ddfb47369b..7bd85b6fd5 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -43,6 +43,8 @@ use utils::sync::gate::Gate; use utils::sync::gate::GateGuard; use utils::timeout::timeout_cancellable; use utils::timeout::TimeoutCancellableError; +use utils::zstd::create_zst_tarball; +use utils::zstd::extract_zst_tarball; use self::config::AttachedLocationConfig; use self::config::AttachmentMode; @@ -142,6 +144,7 @@ macro_rules! pausable_failpoint { } }; } +pub(crate) use pausable_failpoint; pub mod blob_io; pub mod block_io; @@ -200,6 +203,13 @@ pub(super) struct AttachedTenantConf { } impl AttachedTenantConf { + fn new(tenant_conf: TenantConfOpt, location: AttachedLocationConfig) -> Self { + Self { + tenant_conf, + location, + } + } + fn try_from(location_conf: LocationConf) -> anyhow::Result { match &location_conf.mode { LocationMode::Attached(attach_conf) => Ok(Self { @@ -652,7 +662,6 @@ impl Tenant { let tenant_clone = Arc::clone(&tenant); let ctx = ctx.detached_child(TaskKind::Attach, DownloadBehavior::Warn); task_mgr::spawn( - &tokio::runtime::Handle::current(), TaskKind::Attach, Some(tenant_shard_id), None, @@ -676,9 +685,20 @@ impl Tenant { } // Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state. 
+ enum BrokenVerbosity { + Error, + Info + } let make_broken = - |t: &Tenant, err: anyhow::Error| { - error!("attach failed, setting tenant state to Broken: {err:?}"); + |t: &Tenant, err: anyhow::Error, verbosity: BrokenVerbosity| { + match verbosity { + BrokenVerbosity::Info => { + info!("attach cancelled, setting tenant state to Broken: {err}"); + }, + BrokenVerbosity::Error => { + error!("attach failed, setting tenant state to Broken: {err:?}"); + } + } t.state.send_modify(|state| { // The Stopping case is for when we have passed control on to DeleteTenantFlow: // if it errors, we will call make_broken when tenant is already in Stopping. @@ -742,7 +762,7 @@ impl Tenant { // Make the tenant broken so that set_stopping will not hang waiting for it to leave // the Attaching state. This is an over-reaction (nothing really broke, the tenant is // just shutting down), but ensures progress. - make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching")); + make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"), BrokenVerbosity::Info); return Ok(()); }, ) @@ -764,7 +784,7 @@ impl Tenant { match res { Ok(p) => Some(p), Err(e) => { - make_broken(&tenant_clone, anyhow::anyhow!(e)); + make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error); return Ok(()); } } @@ -788,7 +808,7 @@ impl Tenant { { Ok(should_resume_deletion) => should_resume_deletion, Err(err) => { - make_broken(&tenant_clone, anyhow::anyhow!(err)); + make_broken(&tenant_clone, anyhow::anyhow!(err), BrokenVerbosity::Error); return Ok(()); } } @@ -818,7 +838,7 @@ impl Tenant { .await; if let Err(e) = deleted { - make_broken(&tenant_clone, anyhow::anyhow!(e)); + make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error); } return Ok(()); @@ -839,7 +859,7 @@ impl Tenant { tenant_clone.activate(broker_client, None, &ctx); } Err(e) => { - make_broken(&tenant_clone, anyhow::anyhow!(e)); + make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error); } } @@ -3042,8 +3062,13 @@ impl Tenant { } } - let (pgdata_zstd, tar_zst_size) = - import_datadir::create_tar_zst(pgdata_path, &temp_path).await?; + let (pgdata_zstd, tar_zst_size) = create_zst_tarball(pgdata_path, &temp_path).await?; + const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024; + if tar_zst_size > INITDB_TAR_ZST_WARN_LIMIT { + warn!( + "compressed {temp_path} size of {tar_zst_size} is above limit {INITDB_TAR_ZST_WARN_LIMIT}." + ); + } pausable_failpoint!("before-initdb-upload"); @@ -3143,7 +3168,7 @@ impl Tenant { let buf_read = BufReader::with_capacity(remote_timeline_client::BUFFER_SIZE, initdb_tar_zst); - import_datadir::extract_tar_zst(&pgdata_path, buf_read) + extract_zst_tarball(&pgdata_path, buf_read) .await .context("extract initdb tar")?; } else { diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 57fc444cdd..53a8c97e23 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -196,16 +196,17 @@ impl LocationConf { /// For use when attaching/re-attaching: update the generation stored in this /// structure. If we were in a secondary state, promote to attached (posession /// of a fresh generation implies this). 
- pub(crate) fn attach_in_generation(&mut self, generation: Generation) { + pub(crate) fn attach_in_generation(&mut self, mode: AttachmentMode, generation: Generation) { match &mut self.mode { LocationMode::Attached(attach_conf) => { attach_conf.generation = generation; + attach_conf.attach_mode = mode; } LocationMode::Secondary(_) => { // We are promoted to attached by the control plane's re-attach response self.mode = LocationMode::Attached(AttachedLocationConfig { generation, - attach_mode: AttachmentMode::Single, + attach_mode: mode, }) } } diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index cab60c3111..3866136dbd 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -111,6 +111,7 @@ async fn create_local_delete_mark( let _ = std::fs::OpenOptions::new() .write(true) .create(true) + .truncate(true) .open(&marker_path) .with_context(|| format!("could not create delete marker file {marker_path:?}"))?; @@ -481,7 +482,6 @@ impl DeleteTenantFlow { let tenant_shard_id = tenant.tenant_shard_id; task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), TaskKind::TimelineDeletionWorker, Some(tenant_shard_id), None, diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 3aaab6e4ef..34ca43a173 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -2,13 +2,13 @@ //! page server. use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf}; -use futures::stream::StreamExt; use itertools::Itertools; use pageserver_api::key::Key; -use pageserver_api::models::ShardParameters; +use pageserver_api::models::{LocationConfigMode, ShardParameters}; use pageserver_api::shard::{ ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId, }; +use pageserver_api::upcall_api::ReAttachResponseTenant; use rand::{distributions::Alphanumeric, Rng}; use std::borrow::Cow; use std::cmp::Ordering; @@ -125,6 +125,46 @@ pub(crate) enum ShardSelector { Page(Key), } +/// A convenience for use with the re_attach ControlPlaneClient function: rather +/// than the serializable struct, we build this enum that encapsulates +/// the invariant that attached tenants always have generations. +/// +/// This represents the subset of a LocationConfig that we receive during re-attach. +pub(crate) enum TenantStartupMode { + Attached((AttachmentMode, Generation)), + Secondary, +} + +impl TenantStartupMode { + /// Return the generation & mode that should be used when starting + /// this tenant. + /// + /// If this returns None, the re-attach struct is in an invalid state and + /// should be ignored in the response. + fn from_reattach_tenant(rart: ReAttachResponseTenant) -> Option { + match (rart.mode, rart.gen) { + (LocationConfigMode::Detached, _) => None, + (LocationConfigMode::Secondary, _) => Some(Self::Secondary), + (LocationConfigMode::AttachedMulti, Some(g)) => { + Some(Self::Attached((AttachmentMode::Multi, Generation::new(g)))) + } + (LocationConfigMode::AttachedSingle, Some(g)) => { + Some(Self::Attached((AttachmentMode::Single, Generation::new(g)))) + } + (LocationConfigMode::AttachedStale, Some(g)) => { + Some(Self::Attached((AttachmentMode::Stale, Generation::new(g)))) + } + _ => { + tracing::warn!( + "Received invalid re-attach state for tenant {}: {rart:?}", + rart.id + ); + None + } + } + } +} + impl TenantsMap { /// Convenience function for typical usage, where we want to get a `Tenant` object, for /// working with attached tenants. 
 impl TenantsMap {
     /// Convenience function for typical usage, where we want to get a `Tenant` object, for
     /// working with attached tenants. If the TenantId is in the map but in Secondary state,
@@ -271,7 +311,7 @@ pub struct TenantManager {
 
 fn emergency_generations(
     tenant_confs: &HashMap<TenantShardId, anyhow::Result<LocationConf>>,
-) -> HashMap<TenantShardId, Generation> {
+) -> HashMap<TenantShardId, TenantStartupMode> {
     tenant_confs
         .iter()
         .filter_map(|(tid, lc)| {
@@ -279,12 +319,15 @@ fn emergency_generations(
                 Ok(lc) => lc,
                 Err(_) => return None,
             };
-            let gen = match &lc.mode {
-                LocationMode::Attached(alc) => Some(alc.generation),
-                LocationMode::Secondary(_) => None,
-            };
-
-            gen.map(|g| (*tid, g))
+            Some((
+                *tid,
+                match &lc.mode {
+                    LocationMode::Attached(alc) => {
+                        TenantStartupMode::Attached((alc.attach_mode, alc.generation))
+                    }
+                    LocationMode::Secondary(_) => TenantStartupMode::Secondary,
+                },
+            ))
         })
         .collect()
 }
@@ -294,7 +337,7 @@ async fn init_load_generations(
     tenant_confs: &HashMap<TenantShardId, anyhow::Result<LocationConf>>,
     resources: &TenantSharedResources,
     cancel: &CancellationToken,
-) -> anyhow::Result<Option<HashMap<TenantShardId, Generation>>> {
+) -> anyhow::Result<Option<HashMap<TenantShardId, TenantStartupMode>>> {
     let generations = if conf.control_plane_emergency_mode {
         error!(
             "Emergency mode! Tenants will be attached unsafely using their last known generation"
@@ -304,7 +347,12 @@ async fn init_load_generations(
         info!("Calling control plane API to re-attach tenants");
         // If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
         match client.re_attach(conf).await {
-            Ok(tenants) => tenants,
+            Ok(tenants) => tenants
+                .into_iter()
+                .flat_map(|(id, rart)| {
+                    TenantStartupMode::from_reattach_tenant(rart).map(|tsm| (id, tsm))
+                })
+                .collect(),
             Err(RetryForeverError::ShuttingDown) => {
                 anyhow::bail!("Shut down while waiting for control plane re-attach response")
             }
@@ -322,9 +370,17 @@ async fn init_load_generations(
     // Must only do this if remote storage is enabled, otherwise deletion queue
     // is not running and channel push will fail.
     if resources.remote_storage.is_some() {
-        resources
-            .deletion_queue_client
-            .recover(generations.clone())?;
+        let attached_tenants = generations
+            .iter()
+            .flat_map(|(id, start_mode)| {
+                match start_mode {
+                    TenantStartupMode::Attached((_mode, generation)) => Some(generation),
+                    TenantStartupMode::Secondary => None,
+                }
+                .map(|gen| (*id, *gen))
+            })
+            .collect();
+        resources.deletion_queue_client.recover(attached_tenants)?;
     }
 
     Ok(Some(generations))
@@ -490,9 +546,8 @@ pub async fn init_tenant_mgr(
     // Scan local filesystem for attached tenants
     let tenant_configs = init_load_tenant_configs(conf).await?;
 
-    // Determine which tenants are to be attached
-    let tenant_generations =
-        init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;
+    // Determine which tenants are to be secondary or attached, and in which generation
+    let tenant_modes = init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;
 
     tracing::info!(
         "Attaching {} tenants at startup, warming up {} at a time",
@@ -522,97 +577,102 @@ pub async fn init_tenant_mgr(
             }
         };
 
-        let generation = if let Some(generations) = &tenant_generations {
+        // FIXME: if we were attached, and get demoted to secondary on re-attach, we
+        // don't have a place to get a config.
+        // (https://github.com/neondatabase/neon/issues/5377)
+        const DEFAULT_SECONDARY_CONF: SecondaryLocationConfig =
+            SecondaryLocationConfig { warm: true };
+
+        // Update the location config according to the re-attach response
+        if let Some(tenant_modes) = &tenant_modes {
             // We have a generation map: treat it as the authority for whether
             // this tenant is really attached.
-            if let Some(gen) = generations.get(&tenant_shard_id) {
-                if let LocationMode::Attached(attached) = &location_conf.mode {
-                    if attached.generation > *gen {
+            match tenant_modes.get(&tenant_shard_id) {
+                None => {
+                    info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response");
+                    if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
+                        error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+                            "Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}",
+                        );
+                    }
+
+                    // We deleted local content: move on to next tenant, don't try and spawn this one.
+                    continue;
+                }
+                Some(TenantStartupMode::Secondary) => {
+                    if !matches!(location_conf.mode, LocationMode::Secondary(_)) {
+                        location_conf.mode = LocationMode::Secondary(DEFAULT_SECONDARY_CONF);
+                    }
+                }
+                Some(TenantStartupMode::Attached((attach_mode, generation))) => {
+                    let old_gen_higher = match &location_conf.mode {
+                        LocationMode::Attached(AttachedLocationConfig {
+                            generation: old_generation,
+                            attach_mode: _attach_mode,
+                        }) => {
+                            if old_generation > generation {
+                                Some(old_generation)
+                            } else {
+                                None
+                            }
+                        }
+                        _ => None,
+                    };
+                    if let Some(old_generation) = old_gen_higher {
                         tracing::error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
-                            "Control plane gave decreasing generation ({gen:?}) in re-attach response for tenant that was attached in generation {:?}, demoting to secondary",
-                            attached.generation
+                            "Control plane gave decreasing generation ({generation:?}) in re-attach response for tenant that was attached in generation {:?}, demoting to secondary",
+                            old_generation
                         );
 
                         // We cannot safely attach this tenant given a bogus generation number, but let's avoid throwing away
                         // local disk content: demote to secondary rather than detaching.
-                        tenants.insert(
-                            tenant_shard_id,
-                            TenantSlot::Secondary(SecondaryTenant::new(
-                                tenant_shard_id,
-                                location_conf.shard,
-                                location_conf.tenant_conf.clone(),
-                                &SecondaryLocationConfig { warm: false },
-                            )),
-                        );
+                        location_conf.mode = LocationMode::Secondary(DEFAULT_SECONDARY_CONF);
+                    } else {
+                        location_conf.attach_in_generation(*attach_mode, *generation);
                     }
                 }
-
-                *gen
-            } else {
-                match &location_conf.mode {
-                    LocationMode::Secondary(secondary_config) => {
-                        // We do not require the control plane's permission for secondary mode
-                        // tenants, because they do no remote writes and hence require no
-                        // generation number
-                        info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Loaded tenant in secondary mode");
-                        tenants.insert(
-                            tenant_shard_id,
-                            TenantSlot::Secondary(SecondaryTenant::new(
-                                tenant_shard_id,
-                                location_conf.shard,
-                                location_conf.tenant_conf,
-                                secondary_config,
-                            )),
-                        );
-                    }
-                    LocationMode::Attached(_) => {
-                        // TODO: augment re-attach API to enable the control plane to
-                        // instruct us about secondary attachments. That way, instead of throwing
-                        // away local state, we can gracefully fall back to secondary here, if the control
-                        // plane tells us so.
-                        // (https://github.com/neondatabase/neon/issues/5377)
-                        info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response");
-                        if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
-                            error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
-                                "Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}",
-                            );
-                        }
-                    }
-                };
-
-                continue;
             }
         } else {
             // Legacy mode: no generation information, any tenant present
             // on local disk may activate
             info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Starting tenant in legacy mode, no generation",);
-
-            Generation::none()
         };
 
         // Presence of a generation number implies attachment: attach the tenant
         // if it wasn't already, and apply the generation number.
-        location_conf.attach_in_generation(generation);
         Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;
 
         let shard_identity = location_conf.shard;
-        match tenant_spawn(
-            conf,
-            tenant_shard_id,
-            &tenant_dir_path,
-            resources.clone(),
-            AttachedTenantConf::try_from(location_conf)?,
-            shard_identity,
-            Some(init_order.clone()),
-            &TENANTS,
-            SpawnMode::Lazy,
-            &ctx,
-        ) {
-            Ok(tenant) => {
-                tenants.insert(tenant_shard_id, TenantSlot::Attached(tenant));
+        let slot = match location_conf.mode {
+            LocationMode::Attached(attached_conf) => {
+                match tenant_spawn(
+                    conf,
+                    tenant_shard_id,
+                    &tenant_dir_path,
+                    resources.clone(),
+                    AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
+                    shard_identity,
+                    Some(init_order.clone()),
+                    &TENANTS,
+                    SpawnMode::Lazy,
+                    &ctx,
+                ) {
+                    Ok(tenant) => TenantSlot::Attached(tenant),
+                    Err(e) => {
+                        error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}");
+                        continue;
+                    }
+                }
             }
-            Err(e) => {
-                error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}");
-            }
-        }
+            LocationMode::Secondary(secondary_conf) => TenantSlot::Secondary(SecondaryTenant::new(
+                tenant_shard_id,
+                shard_identity,
+                location_conf.tenant_conf,
+                &secondary_conf,
+            )),
+        };
+
+        tenants.insert(tenant_shard_id, slot);
     }
 
     info!("Processed {} local tenants at startup", tenants.len());
@@ -633,7 +693,7 @@ pub async fn init_tenant_mgr(
 /// Wrapper for Tenant::spawn that checks invariants before running, and inserts
 /// a broken tenant in the map if Tenant::spawn fails.
 #[allow(clippy::too_many_arguments)]
-pub(crate) fn tenant_spawn(
+fn tenant_spawn(
     conf: &'static PageServerConf,
     tenant_shard_id: TenantShardId,
     tenant_path: &Utf8Path,
@@ -825,40 +885,6 @@ pub(crate) enum SetNewTenantConfigError {
     Other(anyhow::Error),
 }
 
-pub(crate) async fn set_new_tenant_config(
-    conf: &'static PageServerConf,
-    new_tenant_conf: TenantConfOpt,
-    tenant_id: TenantId,
-) -> Result<(), SetNewTenantConfigError> {
-    // Legacy API: does not support sharding
-    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
-
-    info!("configuring tenant {tenant_id}");
-    let tenant = get_tenant(tenant_shard_id, true)?;
-
-    if !tenant.tenant_shard_id().shard_count.is_unsharded() {
-        // Note that we use ShardParameters::default below.
-        return Err(SetNewTenantConfigError::Other(anyhow::anyhow!(
-            "This API may only be used on single-sharded tenants, use the /location_config API for sharded tenants"
-        )));
-    }
-
-    // This is a legacy API that only operates on attached tenants: the preferred
-    // API to use is the location_config/ endpoint, which lets the caller provide
-    // the full LocationConf.
-    let location_conf = LocationConf::attached_single(
-        new_tenant_conf.clone(),
-        tenant.generation,
-        &ShardParameters::default(),
-    );
-
-    Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf)
-        .await
-        .map_err(SetNewTenantConfigError::Persist)?;
-    tenant.set_new_tenant_config(new_tenant_conf);
-    Ok(())
-}
-
 #[derive(thiserror::Error, Debug)]
 pub(crate) enum UpsertLocationError {
     #[error("Bad config request: {0}")]
@@ -1661,19 +1687,7 @@ impl TenantManager {
         let tmp_path = safe_rename_tenant_dir(&local_tenant_directory)
             .await
             .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?;
-        task_mgr::spawn(
-            task_mgr::BACKGROUND_RUNTIME.handle(),
-            TaskKind::MgmtRequest,
-            None,
-            None,
-            "tenant_files_delete",
-            false,
-            async move {
-                fs::remove_dir_all(tmp_path.as_path())
-                    .await
-                    .with_context(|| format!("tenant directory {:?} deletion", tmp_path))
-            },
-        );
+        self.spawn_background_purge(tmp_path);
 
         fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!(
             "failpoint"
@@ -1708,9 +1722,9 @@ impl TenantManager {
                 .layers
                 .read()
                 .await
-                .resident_layers()
-                .collect::<Vec<_>>()
-                .await;
+                .likely_resident_layers()
+                .collect::<Vec<_>>();
+
             for layer in timeline_layers {
                 let relative_path = layer
                     .local_path()
@@ -1827,6 +1841,133 @@ impl TenantManager {
 
         shutdown_all_tenants0(self.tenants).await
     }
+
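The `spawn_background_purge` helper introduced below follows a rename-then-purge shape: the cheap, atomic rename happens on the request path, and the recursive delete is detached. A standalone sketch of that pattern (paths and the `___deleting` suffix are made up for illustration):

```rust
use std::path::{Path, PathBuf};

// Rename first so the directory disappears from its canonical location
// immediately; the slow recursive delete then runs in a detached task, so
// API callers never block on it.
async fn lazy_delete(dir: &Path) -> std::io::Result<()> {
    let tmp: PathBuf = dir.with_extension("___deleting");
    tokio::fs::rename(dir, &tmp).await?; // fast: same-filesystem rename
    tokio::spawn(async move {
        if let Err(e) = tokio::fs::remove_dir_all(&tmp).await {
            eprintln!("background purge of {} failed: {e}", tmp.display());
        }
    });
    Ok(())
}

#[tokio::main]
async fn main() -> std::io::Result<()> {
    tokio::fs::create_dir_all("scratch/tenant").await?;
    lazy_delete(Path::new("scratch/tenant")).await
}
```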
+    /// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
+    /// the background, and thereby avoid blocking any API requests on this deletion completing.
+    fn spawn_background_purge(&self, tmp_path: Utf8PathBuf) {
+        // Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
+        // After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
+        let task_tenant_id = None;
+
+        task_mgr::spawn(
+            TaskKind::MgmtRequest,
+            task_tenant_id,
+            None,
+            "tenant_files_delete",
+            false,
+            async move {
+                fs::remove_dir_all(tmp_path.as_path())
+                    .await
+                    .with_context(|| format!("tenant directory {:?} deletion", tmp_path))
+            },
+        );
+    }
+
+    pub(crate) async fn detach_tenant(
+        &self,
+        conf: &'static PageServerConf,
+        tenant_shard_id: TenantShardId,
+        detach_ignored: bool,
+        deletion_queue_client: &DeletionQueueClient,
+    ) -> Result<(), TenantStateError> {
+        let tmp_path = self
+            .detach_tenant0(
+                conf,
+                &TENANTS,
+                tenant_shard_id,
+                detach_ignored,
+                deletion_queue_client,
+            )
+            .await?;
+        self.spawn_background_purge(tmp_path);
+
+        Ok(())
+    }
+
+    async fn detach_tenant0(
+        &self,
+        conf: &'static PageServerConf,
+        tenants: &std::sync::RwLock<TenantsMap>,
+        tenant_shard_id: TenantShardId,
+        detach_ignored: bool,
+        deletion_queue_client: &DeletionQueueClient,
+    ) -> Result<Utf8PathBuf, TenantStateError> {
+        let tenant_dir_rename_operation = |tenant_id_to_clean: TenantShardId| async move {
+            let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
+            safe_rename_tenant_dir(&local_tenant_directory)
+                .await
+                .with_context(|| {
+                    format!("local tenant directory {local_tenant_directory:?} rename")
+                })
+        };
+
+        let removal_result = remove_tenant_from_memory(
+            tenants,
+            tenant_shard_id,
+            tenant_dir_rename_operation(tenant_shard_id),
+        )
+        .await;
+
+        // Flush pending deletions, so that they have a good chance of passing validation
+        // before this tenant is potentially re-attached elsewhere.
+        deletion_queue_client.flush_advisory();
+
+        // Ignored tenants are not present in memory and will bail the removal from memory operation.
+        // Before returning the error, check for ignored tenant removal case — we only need to clean its local files then.
+        if detach_ignored
+            && matches!(
+                removal_result,
+                Err(TenantStateError::SlotError(TenantSlotError::NotFound(_)))
+            )
+        {
+            let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
+            if tenant_ignore_mark.exists() {
+                info!("Detaching an ignored tenant");
+                let tmp_path = tenant_dir_rename_operation(tenant_shard_id)
+                    .await
+                    .with_context(|| {
+                        format!("Ignored tenant {tenant_shard_id} local directory rename")
+                    })?;
+                return Ok(tmp_path);
+            }
+        }
+
+        removal_result
+    }
+
+    pub(crate) async fn set_new_tenant_config(
+        &self,
+        new_tenant_conf: TenantConfOpt,
+        tenant_id: TenantId,
+    ) -> Result<(), SetNewTenantConfigError> {
+        // Legacy API: does not support sharding
+        let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+
+        info!("configuring tenant {tenant_id}");
+        let tenant = get_tenant(tenant_shard_id, true)?;
+
+        if !tenant.tenant_shard_id().shard_count.is_unsharded() {
+            // Note that we use ShardParameters::default below.
+            return Err(SetNewTenantConfigError::Other(anyhow::anyhow!(
+                "This API may only be used on single-sharded tenants, use the /location_config API for sharded tenants"
+            )));
+        }
+
+        // This is a legacy API that only operates on attached tenants: the preferred
+        // API to use is the location_config/ endpoint, which lets the caller provide
+        // the full LocationConf.
+        let location_conf = LocationConf::attached_single(
+            new_tenant_conf.clone(),
+            tenant.generation,
+            &ShardParameters::default(),
+        );
+
+        Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &location_conf)
+            .await
+            .map_err(SetNewTenantConfigError::Persist)?;
+        tenant.set_new_tenant_config(new_tenant_conf);
+        Ok(())
+    }
 }
 
 #[derive(Debug, thiserror::Error)]
@@ -2028,87 +2169,6 @@ pub(crate) enum TenantStateError {
     Other(#[from] anyhow::Error),
 }
 
-pub(crate) async fn detach_tenant(
-    conf: &'static PageServerConf,
-    tenant_shard_id: TenantShardId,
-    detach_ignored: bool,
-    deletion_queue_client: &DeletionQueueClient,
-) -> Result<(), TenantStateError> {
-    let tmp_path = detach_tenant0(
-        conf,
-        &TENANTS,
-        tenant_shard_id,
-        detach_ignored,
-        deletion_queue_client,
-    )
-    .await?;
-    // Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
-    // After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
-    let task_tenant_id = None;
-    task_mgr::spawn(
-        task_mgr::BACKGROUND_RUNTIME.handle(),
-        TaskKind::MgmtRequest,
-        task_tenant_id,
-        None,
-        "tenant_files_delete",
-        false,
-        async move {
-            fs::remove_dir_all(tmp_path.as_path())
-                .await
-                .with_context(|| format!("tenant directory {:?} deletion", tmp_path))
-        },
-    );
-    Ok(())
-}
-
-async fn detach_tenant0(
-    conf: &'static PageServerConf,
-    tenants: &std::sync::RwLock<TenantsMap>,
-    tenant_shard_id: TenantShardId,
-    detach_ignored: bool,
-    deletion_queue_client: &DeletionQueueClient,
-) -> Result<Utf8PathBuf, TenantStateError> {
-    let tenant_dir_rename_operation = |tenant_id_to_clean: TenantShardId| async move {
-        let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
-        safe_rename_tenant_dir(&local_tenant_directory)
-            .await
-            .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))
-    };
-
-    let removal_result = remove_tenant_from_memory(
-        tenants,
-        tenant_shard_id,
-        tenant_dir_rename_operation(tenant_shard_id),
-    )
-    .await;
-
-    // Flush pending deletions, so that they have a good chance of passing validation
-    // before this tenant is potentially re-attached elsewhere.
-    deletion_queue_client.flush_advisory();
-
-    // Ignored tenants are not present in memory and will bail the removal from memory operation.
-    // Before returning the error, check for ignored tenant removal case — we only need to clean its local files then.
-    if detach_ignored
-        && matches!(
-            removal_result,
-            Err(TenantStateError::SlotError(TenantSlotError::NotFound(_)))
-        )
-    {
-        let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
-        if tenant_ignore_mark.exists() {
-            info!("Detaching an ignored tenant");
-            let tmp_path = tenant_dir_rename_operation(tenant_shard_id)
-                .await
-                .with_context(|| {
-                    format!("Ignored tenant {tenant_shard_id} local directory rename")
-                })?;
-            return Ok(tmp_path);
-        }
-    }
-
-    removal_result
-}
-
 pub(crate) async fn load_tenant(
     conf: &'static PageServerConf,
     tenant_id: TenantId,
@@ -2142,7 +2202,7 @@ pub(crate) async fn load_tenant(
     let mut location_conf =
         Tenant::load_tenant_config(conf, &tenant_shard_id).map_err(TenantMapInsertError::Other)?;
-    location_conf.attach_in_generation(generation);
+    location_conf.attach_in_generation(AttachmentMode::Single, generation);
 
     Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;
 
@@ -2729,7 +2789,7 @@ use {
     utils::http::error::ApiError,
 };
 
-pub(crate) async fn immediate_gc(
+pub(crate) fn immediate_gc(
     tenant_shard_id: TenantShardId,
     timeline_id: TimelineId,
     gc_req: TimelineGcRequest,
@@ -2751,30 +2811,28 @@ pub(crate) async fn immediate_gc(
     // Run in task_mgr to avoid race with tenant_detach operation
     let ctx = ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
     let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
+    let span = info_span!("manual_gc", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id);
+    // TODO: spawning is redundant now, need to hold the gate
     task_mgr::spawn(
-        &tokio::runtime::Handle::current(),
         TaskKind::GarbageCollector,
         Some(tenant_shard_id),
         Some(timeline_id),
         &format!("timeline_gc_handler garbage collection run for tenant {tenant_shard_id} timeline {timeline_id}"),
         false,
         async move {
-            fail::fail_point!("immediate_gc_task_pre");
-
             #[allow(unused_mut)]
             let mut result = tenant
                 .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx)
-                .instrument(info_span!("manual_gc", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id))
                 .await;
                 // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
                 // better once the types support it.
 
             #[cfg(feature = "testing")]
             {
+                // we need to synchronize with drop completion for python tests without polling for
+                // log messages
                 if let Ok(result) = result.as_mut() {
-                    // why not futures unordered? it seems it needs very much the same task structure
-                    // but would only run on single task.
                     let mut js = tokio::task::JoinSet::new();
                     for layer in std::mem::take(&mut result.doomed_layers) {
                         js.spawn(layer.wait_drop());
                     }
@@ -2790,7 +2848,7 @@ pub(crate) fn immediate_gc(
 
                     if let Some(rtc) = rtc {
                         // layer drops schedule actions on remote timeline client to actually do the
-                        // deletions; don't care just exit fast about the shutdown error
+                        // deletions; don't care about the shutdown error, just exit fast
                         drop(rtc.wait_completion().await);
                     }
                 }
@@ -2801,6 +2859,7 @@ pub(crate) fn immediate_gc(
             }
             Ok(())
         }
+        .instrument(span)
     );
 
     // drop the guard until after we've spawned the task so that timeline shutdown will wait for the task
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 40be2ca8f3..c0a150eb0d 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -223,7 +223,6 @@ use crate::{
     config::PageServerConf,
     task_mgr,
     task_mgr::TaskKind,
-    task_mgr::BACKGROUND_RUNTIME,
     tenant::metadata::TimelineMetadata,
     tenant::upload_queue::{
         UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask,
@@ -307,8 +306,6 @@ pub enum PersistIndexPartWithDeletedFlagError {
 pub struct RemoteTimelineClient {
     conf: &'static PageServerConf,
 
-    runtime: tokio::runtime::Handle,
-
     tenant_shard_id: TenantShardId,
     timeline_id: TimelineId,
     generation: Generation,
@@ -341,12 +338,6 @@ impl RemoteTimelineClient {
     ) -> RemoteTimelineClient {
         RemoteTimelineClient {
             conf,
-            runtime: if cfg!(test) {
-                // remote_timeline_client.rs tests rely on current-thread runtime
-                tokio::runtime::Handle::current()
-            } else {
-                BACKGROUND_RUNTIME.handle().clone()
-            },
             tenant_shard_id,
             timeline_id,
             generation,
@@ -1281,7 +1272,6 @@ impl RemoteTimelineClient {
         let tenant_shard_id = self.tenant_shard_id;
         let timeline_id = self.timeline_id;
         task_mgr::spawn(
-            &self.runtime,
             TaskKind::RemoteUploadTask,
             Some(self.tenant_shard_id),
             Some(self.timeline_id),
@@ -1876,7 +1866,6 @@ mod tests {
     fn build_client(&self, generation: Generation) -> Arc<RemoteTimelineClient> {
         Arc::new(RemoteTimelineClient {
             conf: self.harness.conf,
-            runtime: tokio::runtime::Handle::current(),
             tenant_shard_id: self.harness.tenant_shard_id,
             timeline_id: TIMELINE_ID,
             generation,
diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs
index 19f36c722e..b0babb1308 100644
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -8,7 +8,7 @@ use std::{sync::Arc, time::SystemTime};
 
 use crate::{
     config::PageServerConf,
     disk_usage_eviction_task::DiskUsageEvictionInfo,
-    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
+    task_mgr::{self, TaskKind},
     virtual_file::MaybeFatalIo,
 };
@@ -317,7 +317,6 @@ pub fn spawn_tasks(
         tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
 
     task_mgr::spawn(
-        BACKGROUND_RUNTIME.handle(),
         TaskKind::SecondaryDownloads,
         None,
         None,
@@ -338,7 +337,6 @@ pub fn spawn_tasks(
     );
 
     task_mgr::spawn(
-        BACKGROUND_RUNTIME.handle(),
         TaskKind::SecondaryUploads,
         None,
        None,
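The recurring deletion of runtime-handle arguments in these hunks all follows one shape: the spawn wrapper, not the call site, decides which runtime runs the task. A hedged sketch of that direction, with hypothetical names rather than the real `task_mgr` API:

```rust
use once_cell::sync::Lazy;
use tokio::runtime::Runtime;

// Hypothetical equivalent of a process-wide background runtime.
static BACKGROUND_RUNTIME: Lazy<Runtime> =
    Lazy::new(|| Runtime::new().expect("failed to create background runtime"));

// Call sites pass only the future; the wrapper owns the runtime choice,
// which is what removing the `BACKGROUND_RUNTIME.handle()` arguments buys.
fn spawn<F>(fut: F)
where
    F: std::future::Future<Output = ()> + Send + 'static,
{
    BACKGROUND_RUNTIME.spawn(fut);
}

fn main() {
    spawn(async { println!("runs on the background runtime") });
    std::thread::sleep(std::time::Duration::from_millis(50)); // crude: let it run
}
```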
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index a595096133..8782a9f04e 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -15,6 +15,7 @@ use crate::{
     tenant::{
         config::SecondaryLocationConfig,
         debug_assert_current_span_has_tenant_and_timeline_id,
+        ephemeral_file::is_ephemeral_file,
         remote_timeline_client::{
             index::LayerFileMetadata, is_temp_download_file, FAILED_DOWNLOAD_WARN_THRESHOLD,
             FAILED_REMOTE_OP_RETRIES,
@@ -534,7 +535,11 @@ impl<'a> TenantDownloader<'a> {
             .await
             .maybe_fatal_err(&context_msg)?;
 
-        tracing::debug!("Wrote local heatmap to {}", heatmap_path);
+        tracing::debug!(
+            "Wrote local heatmap to {}, with {} timelines",
+            heatmap_path,
+            heatmap.timelines.len()
+        );
 
         // Clean up any local layers that aren't in the heatmap. We do this first for all timelines, on the general
         // principle that deletions should be done before writes wherever possible, and so that we can use this
@@ -547,6 +552,10 @@ impl<'a> TenantDownloader<'a> {
         // Download the layers in the heatmap
         for timeline in heatmap.timelines {
             if self.secondary_state.cancel.is_cancelled() {
+                tracing::debug!(
+                    "Cancelled before downloading timeline {}",
+                    timeline.timeline_id
+                );
                 return Ok(());
             }
@@ -764,10 +773,13 @@ impl<'a> TenantDownloader<'a> {
             }
         };
 
+        tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
+
         // Download heatmap layers that are not present on local disk, or update their
         // access time if they are already present.
         for layer in timeline.layers {
             if self.secondary_state.cancel.is_cancelled() {
+                tracing::debug!("Cancelled -- dropping out of layer loop");
                 return Ok(());
             }
@@ -950,7 +962,10 @@ async fn init_timeline_state(
             // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
             warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config");
             continue;
-        } else if crate::is_temporary(&file_path) || is_temp_download_file(&file_path) {
+        } else if crate::is_temporary(&file_path)
+            || is_temp_download_file(&file_path)
+            || is_ephemeral_file(file_name)
+        {
             // Temporary files are frequently left behind from restarting during downloads
             tracing::info!("Cleaning up temporary file {file_path}");
             if let Err(e) = tokio::fs::remove_file(&file_path)
diff --git a/pageserver/src/tenant/secondary/scheduler.rs b/pageserver/src/tenant/secondary/scheduler.rs
index 58bdb54161..3bd7be782e 100644
--- a/pageserver/src/tenant/secondary/scheduler.rs
+++ b/pageserver/src/tenant/secondary/scheduler.rs
@@ -300,6 +300,7 @@ where
         let tenant_shard_id = job.get_tenant_shard_id();
         let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
+            tracing::info!("Command already running, waiting for it");
             barrier
         } else {
             let running = self.spawn_now(job);
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 0200ff8cf4..e101a40da4 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -32,6 +32,9 @@ use utils::generation::Generation;
 #[cfg(test)]
 mod tests;
 
+#[cfg(test)]
+mod failpoints;
+
 /// A Layer contains all data in a "rectangle" consisting of a range of keys and
 /// range of LSNs.
 ///
@@ -46,7 +49,41 @@ mod tests;
 /// An image layer is a snapshot of all the data in a key-range, at a single
 /// LSN.
 ///
-/// This type models the on-disk layers, which can be evicted and on-demand downloaded.
+/// This type models the on-disk layers, which can be evicted and on-demand downloaded. As a
+/// general goal, read accesses should always win eviction and eviction should not wait for
+/// download.
+///
+/// ### State transitions
+///
+/// The internal state of `Layer` consists most importantly of the on-filesystem state and the
+/// [`ResidentOrWantedEvicted`] enum. On-filesystem state can be either present (fully downloaded,
+/// right size) or deleted.
+///
+/// Reads will always win requests to evict until `wait_for_turn_and_evict` has acquired the
+/// `heavier_once_cell::InitPermit` and has started to `evict_blocking`. Before the
+/// `heavier_once_cell::InitPermit` has been acquired, any read request
+/// (`get_or_maybe_download`) can "re-initialize" using the existing downloaded file and thus
+/// cancel the eviction.
+///
+/// ```text
+/// +-----------------+   get_or_maybe_download    +--------------------------------+
+/// | not initialized |--------------------------->| Resident(Arc<DownloadedLayer>) |
+/// |     ENOENT      |                         /->|                                |
+/// +-----------------+                         |  +--------------------------------+
+///                 ^                           |                  |             ^
+///                 |     get_or_maybe_download |                  |             | get_or_maybe_download, either:
+///   evict_blocking| /-------------------------/                  |             | - upgrade weak to strong
+///                 | |                                            |             | - re-initialize without download
+///                 | |                              evict_and_wait|             |
+/// +-----------------+                                            v             |
+/// | not initialized |  on_downloaded_layer_drop  +--------------------------------------+
+/// | file is present |<---------------------------| WantedEvicted(Weak<DownloadedLayer>) |
+/// +-----------------+                            +--------------------------------------+
+/// ```
+///
+/// ### Unsupported
+///
+/// - Evicting by the operator deleting files from the filesystem
 ///
 /// [`InMemoryLayer`]: super::inmemory_layer::InMemoryLayer
 #[derive(Clone)]
@@ -211,8 +248,7 @@ impl Layer {
     ///
     /// Timeout is mandatory, because waiting for eviction is only needed for our tests; eviction
     /// will happen regardless of the future returned by this method completing unless there is a
-    /// read access (currently including [`Layer::keep_resident`]) before eviction gets to
-    /// complete.
+    /// read access before eviction gets to complete.
     ///
     /// Technically cancellation safe, but cancelling might shift the viewpoint to a different
     /// generation of the download-evict cycle on retry.
@@ -307,21 +343,28 @@ impl Layer {
     /// Assuming the layer is already downloaded, returns a guard which will prohibit eviction
     /// while the guard exists.
     ///
-    /// Returns None if the layer is currently evicted.
-    pub(crate) async fn keep_resident(&self) -> anyhow::Result<Option<ResidentLayer>> {
-        let downloaded = match self.0.get_or_maybe_download(false, None).await {
-            Ok(d) => d,
-            // technically there are a lot of possible errors, but in practice it should only be
-            // DownloadRequired which is tripped up. could work to improve this situation
-            // statically later.
-            Err(DownloadError::DownloadRequired) => return Ok(None),
-            Err(e) => return Err(e.into()),
-        };
+    /// Returns None if the layer is currently evicted or becoming evicted.
+    #[cfg(test)]
+    pub(crate) async fn keep_resident(&self) -> Option<ResidentLayer> {
+        let downloaded = self.0.inner.get().and_then(|rowe| rowe.get())?;
 
-        Ok(Some(ResidentLayer {
+        Some(ResidentLayer {
             downloaded,
             owner: self.clone(),
-        }))
+        })
+    }
+
+    /// Weak indicator of whether the layer is resident or not. Good enough for eviction, which
+    /// can deal with `EvictionError::NotFound`.
+    ///
+    /// Returns `true` if this layer might be resident, or `false` if it is most likely evicted or
+    /// will be unless a read happens soon.
+    pub(crate) fn is_likely_resident(&self) -> bool {
+        self.0
+            .inner
+            .get()
+            .map(|rowe| rowe.is_likely_resident())
+            .unwrap_or(false)
     }
 
     /// Downloads if necessary and creates a guard, which will keep this layer from being evicted.
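Why `Weak::strong_count` is an acceptable "likely resident" signal for `is_likely_resident`: zero strong references means the downloaded value is gone or going, so eviction is already in flight; anything else means a reader is (probably) holding it. A standalone illustration of just that property:

```rust
use std::sync::{Arc, Weak};

// Same check as ResidentOrWantedEvicted::is_likely_resident performs on its
// Weak<DownloadedLayer>; eviction tolerates the inherent raciness.
fn is_likely_resident<T>(weak: &Weak<T>) -> bool {
    weak.strong_count() > 0
}

fn main() {
    let strong = Arc::new("layer contents");
    let weak = Arc::downgrade(&strong);
    assert!(is_likely_resident(&weak));
    drop(strong);
    assert!(!is_likely_resident(&weak)); // treat as evicted from here on
}
```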
@@ -371,11 +414,11 @@ impl Layer {
     /// separately.
     #[cfg(any(feature = "testing", test))]
     pub(crate) fn wait_drop(&self) -> impl std::future::Future<Output = ()> + 'static {
-        let mut rx = self.0.status.subscribe();
+        let mut rx = self.0.status.as_ref().unwrap().subscribe();
 
         async move {
             loop {
-                if let Err(tokio::sync::broadcast::error::RecvError::Closed) = rx.recv().await {
+                if rx.changed().await.is_err() {
                     break;
                 }
             }
@@ -397,6 +440,32 @@ enum ResidentOrWantedEvicted {
 }
 
 impl ResidentOrWantedEvicted {
+    /// Non-mutating access to a DownloadedLayer, if possible.
+    ///
+    /// This is not used on the read path (anything that calls
+    /// [`LayerInner::get_or_maybe_download`]) because it was decided that reads always win
+    /// evictions, and part of that winning is using [`ResidentOrWantedEvicted::get_and_upgrade`].
+    #[cfg(test)]
+    fn get(&self) -> Option<Arc<DownloadedLayer>> {
+        match self {
+            ResidentOrWantedEvicted::Resident(strong) => Some(strong.clone()),
+            ResidentOrWantedEvicted::WantedEvicted(weak, _) => weak.upgrade(),
+        }
+    }
+
+    /// Best-effort query for residency right now, not as strong a guarantee as receiving a strong
+    /// reference from `ResidentOrWantedEvicted::get`.
+    fn is_likely_resident(&self) -> bool {
+        match self {
+            ResidentOrWantedEvicted::Resident(_) => true,
+            ResidentOrWantedEvicted::WantedEvicted(weak, _) => weak.strong_count() > 0,
+        }
+    }
+
+    /// Upgrades any weak to strong if possible.
+    ///
+    /// Returns a strong reference if possible, along with a boolean telling if an upgrade
+    /// happened.
     fn get_and_upgrade(&mut self) -> Option<(Arc<DownloadedLayer>, bool)> {
         match self {
             ResidentOrWantedEvicted::Resident(strong) => Some((strong.clone(), false)),
@@ -417,7 +486,7 @@ impl ResidentOrWantedEvicted {
     ///
     /// Returns `Some` if this was the first time eviction was requested. Care should be taken to
     /// drop the possibly last strong reference outside of the mutex of
-    /// heavier_once_cell::OnceCell.
+    /// [`heavier_once_cell::OnceCell`].
     fn downgrade(&mut self) -> Option<Arc<DownloadedLayer>> {
         match self {
             ResidentOrWantedEvicted::Resident(strong) => {
@@ -445,6 +514,9 @@ struct LayerInner {
     desc: PersistentLayerDesc,
 
     /// Timeline access is needed for remote timeline client and metrics.
+    ///
+    /// There should not be any access to the timeline for any reason without entering the
+    /// [`Timeline::gate`] at the same time.
     timeline: Weak<Timeline>,
 
     /// Cached knowledge of [`Timeline::remote_client`] being `Some`.
@@ -453,27 +525,38 @@ struct LayerInner {
     access_stats: LayerAccessStats,
 
     /// This custom OnceCell is backed by std mutex, but only held for short time periods.
-    /// Initialization and deinitialization are done while holding a permit.
+    ///
+    /// Filesystem changes (download, evict) are only done while holding a permit which the
+    /// `heavier_once_cell` provides.
+    ///
+    /// A number of fields in `Layer` are meant to only be updated when holding the InitPermit, but
+    /// possibly read while not holding it.
     inner: heavier_once_cell::OnceCell<ResidentOrWantedEvicted>,
 
     /// Do we want to delete this layer locally and remotely when `LayerInner` is dropped
     wanted_deleted: AtomicBool,
 
-    /// Do we want to evict this layer as soon as possible? After being set to `true`, all accesses
-    /// will try to downgrade [`ResidentOrWantedEvicted`], which will eventually trigger
-    /// [`LayerInner::on_downloaded_layer_drop`].
-    wanted_evicted: AtomicBool,
-
-    /// Version is to make sure we will only evict a specific download of a file.
+    /// Version is to make sure we will only evict a specific initialization of the downloaded file.
     ///
-    /// Incremented for each download, stored in `DownloadedLayer::version` or
+    /// Incremented for each initialization, stored in `DownloadedLayer::version` or
     /// `ResidentOrWantedEvicted::WantedEvicted`.
     version: AtomicUsize,
 
-    /// Allow subscribing to when the layer actually gets evicted.
-    status: tokio::sync::broadcast::Sender<Status>,
+    /// Allow subscribing to when the layer actually gets evicted, a non-cancellable download
+    /// starts, or completes.
+    ///
+    /// Updates must only be posted while holding the InitPermit or the heavier_once_cell::Guard.
+    /// Holding the InitPermit is the only time we can do state transitions, but we also need to
+    /// cancel a pending eviction on upgrading a [`ResidentOrWantedEvicted::WantedEvicted`] back to
+    /// [`ResidentOrWantedEvicted::Resident`] on access.
+    ///
+    /// The sender is wrapped in an Option to facilitate moving it out on [`LayerInner::drop`].
+    status: Option<tokio::sync::watch::Sender<Status>>,
 
-    /// Counter for exponential backoff with the download
+    /// Counter for exponential backoff with the download.
+    ///
+    /// This is atomic only for the purposes of having additional data only accessed while holding
+    /// the InitPermit.
     consecutive_failures: AtomicUsize,
 
     /// The generation of this Layer.
@@ -491,7 +574,13 @@ struct LayerInner {
     /// a shard split since the layer was originally written.
     shard: ShardIndex,
 
+    /// When the Layer was last evicted but has not been downloaded since.
+    ///
+    /// This is used solely for updating metrics. See [`LayerImplMetrics::redownload_after`].
     last_evicted_at: std::sync::Mutex<Option<std::time::Instant>>,
+
+    #[cfg(test)]
+    failpoints: std::sync::Mutex<Vec<failpoints::Failpoint>>,
 }
 
 impl std::fmt::Display for LayerInner {
@@ -508,16 +597,16 @@ impl AsLayerDesc for LayerInner {
 
 #[derive(Debug, Clone, Copy)]
 enum Status {
+    Resident,
     Evicted,
-    Downloaded,
+    Downloading,
 }
 
 impl Drop for LayerInner {
     fn drop(&mut self) {
         if !*self.wanted_deleted.get_mut() {
-            // should we try to evict if the last wish was for eviction?
-            // feels like there's some hazard of overcrowding near shutdown near by, but we don't
-            // run drops during shutdown (yet)
+            // should we try to evict if the last wish was for eviction? seems more like a hazard
+            // than a clear win.
             return;
         }
 
@@ -528,9 +617,9 @@ impl Drop for LayerInner {
         let file_size = self.layer_desc().file_size;
         let timeline = self.timeline.clone();
         let meta = self.metadata();
-        let status = self.status.clone();
+        let status = self.status.take();
 
-        crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || {
+        Self::spawn_blocking(move || {
             let _g = span.entered();
 
             // carry this until we are finished for [`Layer::wait_drop`] support
@@ -605,12 +694,16 @@ impl LayerInner {
             .timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id)
             .join(desc.filename().to_string());
 
-        let (inner, version) = if let Some(inner) = downloaded {
+        let (inner, version, init_status) = if let Some(inner) = downloaded {
             let version = inner.version;
             let resident = ResidentOrWantedEvicted::Resident(inner);
-            (heavier_once_cell::OnceCell::new(resident), version)
+            (
+                heavier_once_cell::OnceCell::new(resident),
+                version,
+                Status::Resident,
+            )
         } else {
-            (heavier_once_cell::OnceCell::default(), 0)
+            (heavier_once_cell::OnceCell::default(), 0, Status::Evicted)
         };
 
         LayerInner {
@@ -621,14 +714,15 @@ impl LayerInner {
             have_remote_client: timeline.remote_client.is_some(),
             access_stats,
             wanted_deleted: AtomicBool::new(false),
-            wanted_evicted: AtomicBool::new(false),
             inner,
             version: AtomicUsize::new(version),
-            status: tokio::sync::broadcast::channel(1).0,
+            status: Some(tokio::sync::watch::channel(init_status).0),
             consecutive_failures: AtomicUsize::new(0),
             generation,
             shard,
             last_evicted_at: std::sync::Mutex::default(),
+            #[cfg(test)]
+            failpoints: Default::default(),
         }
     }
 
@@ -644,20 +738,34 @@ impl LayerInner {
     /// Cancellation safe, however dropping the future and calling this method again might result
     /// in a new attempt to evict OR join the previously started attempt.
+    #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, ret, err(level = tracing::Level::DEBUG), fields(layer=%self))]
     pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> {
-        use tokio::sync::broadcast::error::RecvError;
-
         assert!(self.have_remote_client);
 
-        let mut rx = self.status.subscribe();
+        let mut rx = self.status.as_ref().unwrap().subscribe();
+
+        {
+            let current = rx.borrow_and_update();
+            match &*current {
+                Status::Resident => {
+                    // we might get lucky and evict this; continue
+                }
+                Status::Evicted | Status::Downloading => {
+                    // it is already evicted
+                    return Err(EvictionError::NotFound);
+                }
+            }
+        }
 
         let strong = {
             match self.inner.get() {
-                Some(mut either) => {
-                    self.wanted_evicted.store(true, Ordering::Relaxed);
-                    either.downgrade()
+                Some(mut either) => either.downgrade(),
+                None => {
+                    // we already have a scheduled eviction, which just has not gotten to run yet.
+                    // it might still race with a read access, but that could also get cancelled,
+                    // so let's say this is not evictable.
+                    return Err(EvictionError::NotFound);
                 }
-                None => return Err(EvictionError::NotFound),
             }
         };
-                //
-                // use however late (compared to the initial expressing of wanted) as the
-                // "outcome" now
-                LAYER_IMPL_METRICS.inc_broadcast_lagged();
-                match self.inner.get() {
-                    Some(_) => Err(EvictionError::Downloaded),
-                    None => Ok(()),
-                }
-            }
-            Err(_timeout) => Err(EvictionError::Timeout),
+        let changed = rx.changed();
+        let changed = tokio::time::timeout(timeout, changed).await;
+
+        let Ok(changed) = changed else {
+            return Err(EvictionError::Timeout);
+        };
+
+        let _: () = changed.expect("cannot be closed, because we are holding a strong reference");
+
+        let current = rx.borrow_and_update();
+
+        match &*current {
+            // the easiest case
+            Status::Evicted => Ok(()),
+            // it surely was evicted in between, but then there was a new access now; we can't know
+            // if it'll succeed so let's just call it evicted
+            Status::Downloading => Ok(()),
+            // either the download which was started after eviction completed already, or it was
+            // never evicted
+            Status::Resident => Err(EvictionError::Downloaded),
         }
     }
 
@@ -702,181 +810,117 @@ impl LayerInner {
         allow_download: bool,
         ctx: Option<&RequestContext>,
     ) -> Result<Arc<DownloadedLayer>, DownloadError> {
-        let mut init_permit = None;
+        let (weak, permit) = {
+            // get_or_init_detached can:
+            // - be fast (mutex lock) OR uncontested semaphore permit acquire
+            // - be slow (wait for semaphore permit or closing)
+            let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
 
-        loop {
-            let download = move |permit| {
-                async move {
-                    // disable any scheduled but not yet running eviction deletions for this
-                    let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
+            let locked = self
+                .inner
+                .get_or_init_detached()
+                .await
+                .map(|mut guard| guard.get_and_upgrade().ok_or(guard));
 
-                    // no need to make the evict_and_wait wait for the actual download to complete
-                    drop(self.status.send(Status::Downloaded));
+            scopeguard::ScopeGuard::into_inner(init_cancelled);
 
-                    let timeline = self
-                        .timeline
-                        .upgrade()
-                        .ok_or_else(|| DownloadError::TimelineShutdown)?;
-
-                    // count cancellations, which currently remain largely unexpected
-                    let init_cancelled =
-                        scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
-
-                    let can_ever_evict = timeline.remote_client.as_ref().is_some();
-
-                    // check if we really need to be downloaded; could have been already downloaded by a
-                    // cancelled previous attempt.
-                    let needs_download = self
-                        .needs_download()
-                        .await
-                        .map_err(DownloadError::PreStatFailed);
-
-                    let needs_download = match needs_download {
-                        Ok(reason) => reason,
-                        Err(e) => {
-                            scopeguard::ScopeGuard::into_inner(init_cancelled);
-                            return Err(e);
-                        }
-                    };
-
-                    let (permit, downloaded) = if let Some(reason) = needs_download {
-                        if let NeedsDownload::NotFile(ft) = reason {
-                            return Err(DownloadError::NotFile(ft));
-                        }
-
-                        // only reset this after we've decided we really need to download. otherwise it'd
-                        // be impossible to mark cancelled downloads for eviction, like one could imagine
-                        // we would like to do for prefetching which was not needed.
-                        self.wanted_evicted.store(false, Ordering::Release);
-
-                        if !can_ever_evict {
-                            scopeguard::ScopeGuard::into_inner(init_cancelled);
-                            return Err(DownloadError::NoRemoteStorage);
-                        }
-
-                        if let Some(ctx) = ctx {
-                            let res = self.check_expected_download(ctx);
-                            if let Err(e) = res {
-                                scopeguard::ScopeGuard::into_inner(init_cancelled);
-                                return Err(e);
-                            }
-                        }
-
-                        if !allow_download {
-                            // this does look weird, but for LayerInner the "downloading" means also changing
-                            // internal once related state ...
-                            scopeguard::ScopeGuard::into_inner(init_cancelled);
-                            return Err(DownloadError::DownloadRequired);
-                        }
-
-                        tracing::info!(%reason, "downloading on-demand");
-
-                        let permit = self.spawn_download_and_wait(timeline, permit).await;
-
-                        let permit = match permit {
-                            Ok(permit) => permit,
-                            Err(e) => {
-                                scopeguard::ScopeGuard::into_inner(init_cancelled);
-                                return Err(e);
-                            }
-                        };
-
-                        (permit, true)
-                    } else {
-                        // the file is present locally, probably by a previous but cancelled call to
-                        // get_or_maybe_download. alternatively we might be running without remote storage.
-                        LAYER_IMPL_METRICS.inc_init_needed_no_download();
-
-                        (permit, false)
-                    };
-
-                    scopeguard::ScopeGuard::into_inner(init_cancelled);
-
-                    if downloaded {
-                        let since_last_eviction = self
-                            .last_evicted_at
-                            .lock()
-                            .unwrap()
-                            .take()
-                            .map(|ts| ts.elapsed());
-
-                        if let Some(since_last_eviction) = since_last_eviction {
-                            LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
-                        }
-                    }
-
-                    let res = Arc::new(DownloadedLayer {
-                        owner: Arc::downgrade(self),
-                        kind: tokio::sync::OnceCell::default(),
-                        version: next_version,
-                    });
-
-                    self.access_stats.record_residence_event(
-                        LayerResidenceStatus::Resident,
-                        LayerResidenceEventReason::ResidenceChange,
-                    );
-
-                    let waiters = self.inner.initializer_count();
-                    if waiters > 0 {
-                        tracing::info!(
-                            waiters,
-                            "completing the on-demand download for other tasks"
-                        );
-                    }
-
-                    Ok((ResidentOrWantedEvicted::Resident(res), permit))
-                }
-                .instrument(tracing::info_span!("get_or_maybe_download", layer=%self))
-            };
-
-            if let Some(init_permit) = init_permit.take() {
-                // use the already held initialization permit because it is impossible to hit the
-                // below paths anymore essentially limiting the max loop iterations to 2.
-                let (value, init_permit) = download(init_permit).await?;
-                let mut guard = self.inner.set(value, init_permit);
-                let (strong, _upgraded) = guard
-                    .get_and_upgrade()
-                    .expect("init creates strong reference, we held the init permit");
-                return Ok(strong);
-            }
-
-            let (weak, permit) = {
-                let mut locked = self.inner.get_or_init(download).await?;
-
-                if let Some((strong, upgraded)) = locked.get_and_upgrade() {
-                    if upgraded {
-                        // when upgraded back, the Arc is still available, but
-                        // previously a `evict_and_wait` was received.
-                        self.wanted_evicted.store(false, Ordering::Relaxed);
-
-                        // error out any `evict_and_wait`
-                        drop(self.status.send(Status::Downloaded));
-                        LAYER_IMPL_METRICS
-                            .inc_eviction_cancelled(EvictionCancelled::UpgradedBackOnAccess);
-                    }
+            match locked {
+                // this path could have been a RwLock::read
+                Ok(Ok((strong, upgraded))) if !upgraded => return Ok(strong),
+                Ok(Ok((strong, _))) => {
+                    // when upgraded back, the Arc is still available, but
+                    // previously an `evict_and_wait` was received. this is the only place where we
+                    // send out an update without holding the InitPermit.
+                    //
+                    // note that we also have dropped the Guard; this is fine, because we just made
+                    // a state change and are holding a strong reference to be returned.
+                    self.status.as_ref().unwrap().send_replace(Status::Resident);
+                    LAYER_IMPL_METRICS
+                        .inc_eviction_cancelled(EvictionCancelled::UpgradedBackOnAccess);
 
                     return Ok(strong);
-                } else {
-                    // path to here: the evict_blocking is stuck on spawn_blocking queue.
-                    //
-                    // reset the contents, deactivating the eviction and causing a
-                    // EvictionCancelled::LostToDownload or EvictionCancelled::VersionCheckFailed.
-                    locked.take_and_deinit()
                 }
-            };
-
-            // unlock first, then drop the weak, but because upgrade failed, we
-            // know it cannot be a problem.
+                Ok(Err(guard)) => {
+                    // path to here: we won the eviction, the file should still be on the disk.
+                    let (weak, permit) = guard.take_and_deinit();
+                    (Some(weak), permit)
+                }
+                Err(permit) => (None, permit),
+            }
+        };
+
+        if let Some(weak) = weak {
+            // only drop the weak after dropping the heavier_once_cell guard
             assert!(
                 matches!(weak, ResidentOrWantedEvicted::WantedEvicted(..)),
                 "unexpected {weak:?}, ResidentOrWantedEvicted::get_and_upgrade has a bug"
             );
-
-            init_permit = Some(permit);
-
-            LAYER_IMPL_METRICS.inc_retried_get_or_maybe_download();
         }
+
+        let timeline = self
+            .timeline
+            .upgrade()
+            .ok_or_else(|| DownloadError::TimelineShutdown)?;
+
+        // count cancellations, which currently remain largely unexpected
+        let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
+
+        // check if we really need to be downloaded: this can happen if a read access won the
+        // semaphore before eviction.
+        //
+        // if we are cancelled while doing this `stat` the `self.inner` will be uninitialized. a
+        // pending eviction will try to evict even upon finding an uninitialized `self.inner`.
+        let needs_download = self
+            .needs_download()
+            .await
+            .map_err(DownloadError::PreStatFailed);
+
+        scopeguard::ScopeGuard::into_inner(init_cancelled);
+
+        let needs_download = needs_download?;
+
+        let Some(reason) = needs_download else {
+            // the file is present locally because eviction has not had a chance to run yet
+
+            #[cfg(test)]
+            self.failpoint(failpoints::FailpointKind::AfterDeterminingLayerNeedsNoDownload)
+                .await?;
+
+            LAYER_IMPL_METRICS.inc_init_needed_no_download();
+
+            return Ok(self.initialize_after_layer_is_on_disk(permit));
+        };
+
+        // we must download; getting cancelled before spawning the download is not an issue as
+        // any still running eviction would not find anything to evict.
+
+        if let NeedsDownload::NotFile(ft) = reason {
+            return Err(DownloadError::NotFile(ft));
+        }
+
+        if timeline.remote_client.as_ref().is_none() {
+            return Err(DownloadError::NoRemoteStorage);
+        }
+
+        if let Some(ctx) = ctx {
+            self.check_expected_download(ctx)?;
+        }
+
+        if !allow_download {
+            // this is only used from tests, but it is hard to test without the boolean
+            return Err(DownloadError::DownloadRequired);
+        }
+
+        async move {
+            tracing::info!(%reason, "downloading on-demand");
+
+            let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
+            let res = self.download_init_and_wait(timeline, permit).await?;
+            scopeguard::ScopeGuard::into_inner(init_cancelled);
+            Ok(res)
+        }
+        .instrument(tracing::info_span!("get_or_maybe_download", layer=%self))
+        .await
     }
 
     /// Nag or fail per RequestContext policy
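The `init_cancelled` guard used around the `stat` and download steps above is a small scopeguard idiom: the closure fires only if the future is dropped at an await point before the guard is defused. A self-contained illustration (the metric bump here is just a print):

```rust
// Requires the scopeguard crate. The guard's closure runs on drop unless
// ScopeGuard::into_inner defuses it first, which is exactly how cancelled
// initializations get counted without any extra bookkeeping on the happy path.
fn main() {
    let init_cancelled = scopeguard::guard((), |_| println!("init cancelled: bump metric"));
    // ... cancellable work would happen here; dropping the future runs the guard ...
    scopeguard::ScopeGuard::into_inner(init_cancelled); // completed normally: no print
}
```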
@@ -906,11 +950,11 @@ impl LayerInner {
     }
 
     /// Actual download, at most one is executed at a time.
-    async fn spawn_download_and_wait(
+    async fn download_init_and_wait(
         self: &Arc<Self>,
         timeline: Arc<Timeline>,
         permit: heavier_once_cell::InitPermit,
-    ) -> Result<heavier_once_cell::InitPermit, DownloadError> {
+    ) -> Result<Arc<DownloadedLayer>, DownloadError> {
         debug_assert_current_span_has_tenant_and_timeline_id();
 
         let (tx, rx) = tokio::sync::oneshot::channel();
@@ -922,66 +966,35 @@ impl LayerInner {
             .enter()
             .map_err(|_| DownloadError::DownloadCancelled)?;
 
-        tokio::task::spawn(async move {
-
+        Self::spawn(
+            async move {
                 let _guard = guard;
 
-                let client = timeline
-                    .remote_client
+                // now that we have committed to downloading, send out an update to:
+                // - unhang any pending eviction
+                // - break out of evict_and_wait
                 this.status
                     .as_ref()
-                    .expect("checked above with have_remote_client");
+                    .unwrap()
+                    .send_replace(Status::Downloading);
 
-                let result = client.download_layer_file(
-                    &this.desc.filename(),
-                    &this.metadata(),
-                    &timeline.cancel
-                )
-                .await;
+                #[cfg(test)]
+                this.failpoint(failpoints::FailpointKind::WaitBeforeDownloading)
+                    .await
+                    .unwrap();
 
-                let result = match result {
-                    Ok(size) => {
-                        timeline.metrics.resident_physical_size_add(size);
-                        Ok(())
-                    }
-                    Err(e) => {
-                        let consecutive_failures =
-                            this.consecutive_failures.fetch_add(1, Ordering::Relaxed);
+                let res = this.download_and_init(timeline, permit).await;
 
-                        let backoff = utils::backoff::exponential_backoff_duration_seconds(
-                            consecutive_failures.min(u32::MAX as usize) as u32,
-                            1.5,
-                            60.0,
-                        );
-
-                        let backoff = std::time::Duration::from_secs_f64(backoff);
-
-                        tokio::select! {
-                            _ = tokio::time::sleep(backoff) => {},
-                            _ = timeline.cancel.cancelled() => {},
-                        };
-
-                        Err(e)
-                    }
-                };
-
-                if let Err(res) = tx.send((result, permit)) {
+                if let Err(res) = tx.send(res) {
                     match res {
-                        (Ok(()), _) => {
-                            // our caller is cancellation safe so this is fine; if someone
-                            // else requests the layer, they'll find it already downloaded.
-                            //
-                            // See counter [`LayerImplMetrics::inc_init_needed_no_download`]
-                            //
-                            // FIXME(#6028): however, could be that we should consider marking the
-                            // layer for eviction? alas, cannot: because only DownloadedLayer will
-                            // handle that.
-                        },
-                        (Err(e), _) => {
-                            // our caller is cancellation safe, but we might be racing with
-                            // another attempt to initialize. before we have cancellation
-                            // token support: these attempts should converge regardless of
-                            // their completion order.
-                            tracing::error!("layer file download failed, and additionally failed to communicate this to caller: {e:?}");
+                        Ok(_res) => {
+                            tracing::debug!("layer initialized, but caller has been cancelled");
+                            LAYER_IMPL_METRICS.inc_init_completed_without_requester();
+                        }
+                        Err(e) => {
+                            tracing::info!(
+                                "layer file download failed, and caller has been cancelled: {e:?}"
+                            );
                             LAYER_IMPL_METRICS.inc_download_failed_without_requester();
                         }
                    }
                }
            },
        );

        match rx.await {
-            Ok((Ok(()), permit)) => {
-                if let Some(reason) = self
-                    .needs_download()
-                    .await
-                    .map_err(DownloadError::PostStatFailed)?
-                {
-                    // this is really a bug in needs_download or remote timeline client
-                    panic!("post-condition failed: needs_download returned {reason:?}");
-                }
-
-                self.consecutive_failures.store(0, Ordering::Relaxed);
-                tracing::info!(size=%self.desc.file_size, "on-demand download successful");
-
-                Ok(permit)
-            }
-            Ok((Err(e), _permit)) => {
+            Ok(Ok(res)) => Ok(res),
+            Ok(Err(e)) => {
                 // sleep already happened in the spawned task, if it was not cancelled
-                let consecutive_failures = self.consecutive_failures.load(Ordering::Relaxed);
-
                 match e.downcast_ref::<remote_storage::DownloadError>() {
                     // If the download failed due to its cancellation token,
                     // propagate the cancellation error upstream.
                     Some(remote_storage::DownloadError::Cancelled) => {
                         Err(DownloadError::DownloadCancelled)
                     }
-                    _ => {
-                        tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
-                        Err(DownloadError::DownloadFailed)
-                    }
+                    // FIXME: this is not embedding the error because historically it would have
+                    // been output to compute, however that is no longer the case.
+                    _ => Err(DownloadError::DownloadFailed),
                 }
             }
             Err(_gone) => Err(DownloadError::DownloadCancelled),
         }
     }
 
+    async fn download_and_init(
+        self: &Arc<Self>,
+        timeline: Arc<Timeline>,
+        permit: heavier_once_cell::InitPermit,
+    ) -> anyhow::Result<Arc<DownloadedLayer>> {
+        let client = timeline
+            .remote_client
+            .as_ref()
+            .expect("checked before download_init_and_wait");
+
+        let result = client
+            .download_layer_file(&self.desc.filename(), &self.metadata(), &timeline.cancel)
+            .await;
+
+        match result {
+            Ok(size) => {
+                assert_eq!(size, self.desc.file_size);
+
+                match self.needs_download().await {
+                    Ok(Some(reason)) => {
+                        // this is really a bug in needs_download or remote timeline client
+                        panic!("post-condition failed: needs_download returned {reason:?}");
+                    }
+                    Ok(None) => {
+                        // as expected
+                    }
+                    Err(e) => {
+                        panic!("post-condition failed: needs_download errored: {e:?}");
+                    }
+                }
+
+                tracing::info!(size=%self.desc.file_size, "on-demand download successful");
+                timeline
+                    .metrics
+                    .resident_physical_size_add(self.desc.file_size);
+                self.consecutive_failures.store(0, Ordering::Relaxed);
+
+                let since_last_eviction = self
+                    .last_evicted_at
+                    .lock()
+                    .unwrap()
+                    .take()
+                    .map(|ts| ts.elapsed());
+                if let Some(since_last_eviction) = since_last_eviction {
+                    LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
+                }
+
+                self.access_stats.record_residence_event(
+                    LayerResidenceStatus::Resident,
+                    LayerResidenceEventReason::ResidenceChange,
+                );
+
+                Ok(self.initialize_after_layer_is_on_disk(permit))
+            }
+            Err(e) => {
+                let consecutive_failures =
+                    1 + self.consecutive_failures.fetch_add(1, Ordering::Relaxed);
+
+                tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
+
+                let backoff = utils::backoff::exponential_backoff_duration_seconds(
+                    consecutive_failures.min(u32::MAX as usize) as u32,
+                    1.5,
+                    60.0,
+                );
+
+                let backoff = std::time::Duration::from_secs_f64(backoff);
+
+                tokio::select! {
+                    _ = tokio::time::sleep(backoff) => {},
+                    _ = timeline.cancel.cancelled() => {},
+                };
+
+                Err(e)
+            }
+        }
+    }
+
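The failure branch above sleeps for roughly 1.5^n seconds, capped at 60s, before surfacing the error. A sketch of that curve in the spirit of `utils::backoff::exponential_backoff_duration_seconds` (the real helper's exact shape may differ; this is illustrative only):

```rust
// Approximation of the backoff applied after consecutive download failures.
fn backoff_seconds(consecutive_failures: u32) -> f64 {
    1.5f64.powi(consecutive_failures as i32).min(60.0)
}

fn main() {
    for n in [1u32, 3, 10, 30] {
        println!("{n} consecutive failures -> sleep {:.1}s", backoff_seconds(n));
    }
}
```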
+    /// Initializes the `Self::inner` to a "resident" state.
+    ///
+    /// Callers are assumed to ensure that the file is actually on disk with `Self::needs_download`
+    /// before calling this method.
+    ///
+    /// If this method is ever made async, it needs to be cancellation safe so that no state
+    /// changes are made before we can write to the OnceCell in non-cancellable fashion.
+    fn initialize_after_layer_is_on_disk(
+        self: &Arc<Self>,
+        permit: heavier_once_cell::InitPermit,
+    ) -> Arc<DownloadedLayer> {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+
+        // disable any scheduled but not yet running eviction deletions for this initialization
+        let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
+        self.status.as_ref().unwrap().send_replace(Status::Resident);
+
+        let res = Arc::new(DownloadedLayer {
+            owner: Arc::downgrade(self),
+            kind: tokio::sync::OnceCell::default(),
+            version: next_version,
+        });
+
+        let waiters = self.inner.initializer_count();
+        if waiters > 0 {
+            tracing::info!(waiters, "completing layer init for other tasks");
+        }
+
+        let value = ResidentOrWantedEvicted::Resident(res.clone());
+
+        self.inner.set(value, permit);
+
+        res
+    }
+
     async fn needs_download(&self) -> Result<Option<NeedsDownload>, std::io::Error> {
         match tokio::fs::metadata(&self.path).await {
             Ok(m) => Ok(self.is_file_present_and_good_size(&m).err()),
@@ -1059,9 +1168,11 @@ impl LayerInner {
     fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
         let layer_file_name = self.desc.filename().file_name();
 
-        // this is not accurate: we could have the file locally but there was a cancellation
-        // and now we are not in sync, or we are currently downloading it.
-        let remote = self.inner.get().is_none();
+        let resident = self
+            .inner
+            .get()
+            .map(|rowe| rowe.is_likely_resident())
+            .unwrap_or(false);
 
         let access_stats = self.access_stats.as_api_model(reset);
 
@@ -1073,7 +1184,7 @@ impl LayerInner {
                 layer_file_size: self.desc.file_size,
                 lsn_start: lsn_range.start,
                 lsn_end: lsn_range.end,
-                remote,
+                remote: !resident,
                 access_stats,
             }
         } else {
@@ -1083,94 +1194,195 @@ impl LayerInner {
                 layer_file_name,
                 layer_file_size: self.desc.file_size,
                 lsn_start: lsn,
-                remote,
+                remote: !resident,
                 access_stats,
             }
         }
     }
 
     /// `DownloadedLayer` is being dropped, so it calls this method.
-    fn on_downloaded_layer_drop(self: Arc<LayerInner>, version: usize) {
-        let evict = self.wanted_evicted.load(Ordering::Acquire);
+    fn on_downloaded_layer_drop(self: Arc<LayerInner>, only_version: usize) {
         let can_evict = self.have_remote_client;
 
-        if can_evict && evict {
-            let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, %version);
+        // we cannot know without inspecting LayerInner::inner if we should evict or not, even
+        // though here it is very likely
+        let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, version=%only_version);
 
-            // downgrade for queueing, in case there's a tear down already ongoing we should not
-            // hold it alive.
-            let this = Arc::downgrade(&self);
-            drop(self);
-
-            // NOTE: this scope *must* never call `self.inner.get` because evict_and_wait might
-            // drop while the `self.inner` is being locked, leading to a deadlock.
- - crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || { - let _g = span.entered(); - - // if LayerInner is already dropped here, do nothing because the delete on drop - // has already ran while we were in queue - let Some(this) = this.upgrade() else { - LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone); - return; - }; - match this.evict_blocking(version) { - Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(), - Err(reason) => LAYER_IMPL_METRICS.inc_eviction_cancelled(reason), - } + if !can_evict { + // it would be nice to assert this case out, but we are in drop + span.in_scope(|| { + tracing::error!("bug in struct Layer: ResidentOrWantedEvicted has been downgraded while we have no remote storage"); }); + return; } + + // NOTE: this scope *must* never call `self.inner.get` because evict_and_wait might + // drop while the `self.inner` is being locked, leading to a deadlock. + + let start_evicting = async move { + #[cfg(test)] + self.failpoint(failpoints::FailpointKind::WaitBeforeStartingEvicting) + .await + .expect("failpoint should not have errored"); + + tracing::debug!("eviction started"); + + let res = self.wait_for_turn_and_evict(only_version).await; + // metrics: ignore the Ok branch, it is not done yet + if let Err(e) = res { + tracing::debug!(res=?Err::<(), _>(&e), "eviction completed"); + LAYER_IMPL_METRICS.inc_eviction_cancelled(e); + } + }; + + Self::spawn(start_evicting.instrument(span)); } - fn evict_blocking(&self, only_version: usize) -> Result<(), EvictionCancelled> { - // deleted or detached timeline, don't do anything. - let Some(timeline) = self.timeline.upgrade() else { - return Err(EvictionCancelled::TimelineGone); - }; + async fn wait_for_turn_and_evict( + self: Arc, + only_version: usize, + ) -> Result<(), EvictionCancelled> { + fn is_good_to_continue(status: &Status) -> Result<(), EvictionCancelled> { + use Status::*; + match status { + Resident => Ok(()), + Evicted => Err(EvictionCancelled::UnexpectedEvictedState), + Downloading => Err(EvictionCancelled::LostToDownload), + } + } + + let timeline = self + .timeline + .upgrade() + .ok_or(EvictionCancelled::TimelineGone)?; + + let mut rx = self + .status + .as_ref() + .expect("LayerInner cannot be dropped, holding strong ref") + .subscribe(); + + is_good_to_continue(&rx.borrow_and_update())?; let Ok(_gate) = timeline.gate.enter() else { return Err(EvictionCancelled::TimelineGone); }; - // to avoid starting a new download while we evict, keep holding on to the - // permit. - let _permit = { - let maybe_downloaded = self.inner.get(); + let permit = { + // we cannot just `std::fs::remove_file` because there might already be an + // get_or_maybe_download which will inspect filesystem and reinitialize. filesystem + // operations must be done while holding the heavier_once_cell::InitPermit + let mut wait = std::pin::pin!(self.inner.get_or_init_detached()); - let (_weak, permit) = match maybe_downloaded { - Some(mut guard) => { - if let ResidentOrWantedEvicted::WantedEvicted(_weak, version) = &*guard { - if *version == only_version { - guard.take_and_deinit() - } else { - // this was not for us; maybe there's another eviction job - // TODO: does it make any sense to stall here? unique versions do not - // matter, we only want to make sure not to evict a resident, which we - // are not doing. 
-                        return Err(EvictionCancelled::VersionCheckFailed);
-                    }
-                } else {
-                    return Err(EvictionCancelled::AlreadyReinitialized);
+            let waited = loop {
+                // we must race against the start of Downloading, otherwise we would have to wait
+                // until the completion of the download. waiting for the download could be long and
+                // hinder our efforts to alert on "hanging" evictions.
+                tokio::select! {
+                    res = &mut wait => break res,
+                    _ = rx.changed() => {
+                        is_good_to_continue(&rx.borrow_and_update())?;
+                        // two possibilities for Status::Resident:
+                        // - the layer was found locally from disk by a read
+                        // - we missed a bunch of updates and now the layer is
+                        //   again downloaded -- assume we'll fail later on with
+                        //   version check or AlreadyReinitialized
                    }
                }
-                None => {
-                    // already deinitialized, perhaps get_or_maybe_download did this and is
-                    // currently waiting to reinitialize it
-                    return Err(EvictionCancelled::LostToDownload);
+            };
+
+            // re-check now that we have the guard or permit; all updates should have happened
+            // while holding the permit.
+            is_good_to_continue(&rx.borrow_and_update())?;
+
+            // the term "deinitialize" is used here because clearing out the Weak will eventually
+            // lead to deallocating the reference counted value, and the value we
+            // `Guard::take_and_deinit` is likely to be the last because the Weak is never cloned.
+            let (_weak, permit) = match waited {
+                Ok(guard) => {
+                    match &*guard {
+                        ResidentOrWantedEvicted::WantedEvicted(_weak, version)
+                            if *version == only_version =>
+                        {
+                            tracing::debug!(version, "deinitializing matching WantedEvicted");
+                            let (weak, permit) = guard.take_and_deinit();
+                            (Some(weak), permit)
+                        }
+                        ResidentOrWantedEvicted::WantedEvicted(_, version) => {
+                            // if we were not doing the version check, we would need to try to
+                            // upgrade the weak here to see if it really is dropped. version check
+                            // is done instead assuming that it is cheaper.
+                            tracing::debug!(
+                                version,
+                                only_version,
+                                "version mismatch, not deinitializing"
+                            );
+                            return Err(EvictionCancelled::VersionCheckFailed);
+                        }
+                        ResidentOrWantedEvicted::Resident(_) => {
+                            return Err(EvictionCancelled::AlreadyReinitialized);
+                        }
+                    }
+                }
+                Err(permit) => {
+                    tracing::debug!("continuing after cancelled get_or_maybe_download or eviction");
+                    (None, permit)
                }
            };

            permit
        };

-        // now accesses to inner.get_or_init wait on the semaphore or the `_permit`
+        let span = tracing::Span::current();

-        self.access_stats.record_residence_event(
-            LayerResidenceStatus::Evicted,
-            LayerResidenceEventReason::ResidenceChange,
-        );
+        let spawned_at = std::time::Instant::now();

-        let res = match capture_mtime_and_remove(&self.path) {
+        // this is on purpose a detached spawn; we don't need to wait for it
+        //
+        // eviction completion reporting is the only thing hinging on this, and it can be done just
+        // as well from a spawn_blocking thread.
+        //
+        // important to note that now that we've acquired the permit we have made sure the evicted
+        // file is either the exact `WantedEvicted` we wanted to evict, or uninitialized in case
+        // there are multiple evictions. The rest is not cancellable, and we've now committed to
+        // evicting.
+        //
+        // If spawn_blocking has a queue and the maximum number of threads are in use, we could
+        // stall reads. We will need to add cancellation for that if necessary.
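+        //
+        // Note on the permit race above: the `get_or_init_detached()` future is pinned once
+        // with `std::pin::pin!` and then polled by reference, so the `tokio::select!` loop
+        // can retry it without restarting it. The rough shape of the pattern (sketch only):
+        //
+        //     let mut wait = std::pin::pin!(inner.get_or_init_detached());
+        //     loop {
+        //         tokio::select! {
+        //             res = &mut wait => break res,    // Ok(guard) or Err(permit)
+        //             _ = rx.changed() => { /* re-check status; bail if eviction lost */ }
+        //         }
+        //     }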
+ Self::spawn_blocking(move || { + let _span = span.entered(); + + let res = self.evict_blocking(&timeline, &permit); + + let waiters = self.inner.initializer_count(); + + if waiters > 0 { + LAYER_IMPL_METRICS.inc_evicted_with_waiters(); + } + + let completed_in = spawned_at.elapsed(); + LAYER_IMPL_METRICS.record_time_to_evict(completed_in); + + match res { + Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(), + Err(e) => LAYER_IMPL_METRICS.inc_eviction_cancelled(e), + } + + tracing::debug!(?res, elapsed_ms=%completed_in.as_millis(), %waiters, "eviction completed"); + }); + + Ok(()) + } + + /// This is blocking only to do just one spawn_blocking hop compared to multiple via tokio::fs. + fn evict_blocking( + &self, + timeline: &Timeline, + _permit: &heavier_once_cell::InitPermit, + ) -> Result<(), EvictionCancelled> { + // now accesses to `self.inner.get_or_init*` wait on the semaphore or the `_permit` + + match capture_mtime_and_remove(&self.path) { Ok(local_layer_mtime) => { let duration = SystemTime::now().duration_since(local_layer_mtime); match duration { @@ -1194,33 +1406,60 @@ impl LayerInner { timeline .metrics .resident_physical_size_sub(self.desc.file_size); - - Ok(()) } Err(e) if e.kind() == std::io::ErrorKind::NotFound => { tracing::error!( layer_size = %self.desc.file_size, - "failed to evict layer from disk, it was already gone (metrics will be inaccurate)" + "failed to evict layer from disk, it was already gone" ); - Err(EvictionCancelled::FileNotFound) + return Err(EvictionCancelled::FileNotFound); } Err(e) => { + // FIXME: this should probably be an abort tracing::error!("failed to evict file from disk: {e:#}"); - Err(EvictionCancelled::RemoveFailed) + return Err(EvictionCancelled::RemoveFailed); } - }; + } - // we are still holding the permit, so no new spawn_download_and_wait can happen - drop(self.status.send(Status::Evicted)); + self.access_stats.record_residence_event( + LayerResidenceStatus::Evicted, + LayerResidenceEventReason::ResidenceChange, + ); + + self.status.as_ref().unwrap().send_replace(Status::Evicted); *self.last_evicted_at.lock().unwrap() = Some(std::time::Instant::now()); - res + Ok(()) } fn metadata(&self) -> LayerFileMetadata { LayerFileMetadata::new(self.desc.file_size, self.generation, self.shard) } + + /// Needed to use entered runtime in tests, but otherwise use BACKGROUND_RUNTIME. + /// + /// Synchronizing with spawned tasks is very complicated otherwise. + fn spawn(fut: F) + where + F: std::future::Future + Send + 'static, + { + #[cfg(test)] + tokio::task::spawn(fut); + #[cfg(not(test))] + crate::task_mgr::THE_RUNTIME.spawn(fut); + } + + /// Needed to use entered runtime in tests, but otherwise use BACKGROUND_RUNTIME. 
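+    ///
+    /// For example, the unit tests build a current-thread runtime with a single blocking
+    /// thread and a paused clock; routing work through the entered runtime is what lets
+    /// them deterministically clog and drain the pool (a sketch of that setup):
+    ///
+    /// ```ignore
+    /// let rt = tokio::runtime::Builder::new_current_thread()
+    ///     .max_blocking_threads(1)
+    ///     .enable_all()
+    ///     .start_paused(true)
+    ///     .build()?;
+    /// ```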
+    fn spawn_blocking<F>(f: F)
+    where
+        F: FnOnce() + Send + 'static,
+    {
+        #[cfg(test)]
+        tokio::task::spawn_blocking(f);
+        #[cfg(not(test))]
+        crate::task_mgr::THE_RUNTIME.spawn_blocking(f);
+    }
 }

 fn capture_mtime_and_remove(path: &Utf8Path) -> Result {
@@ -1264,8 +1503,10 @@ pub(crate) enum DownloadError {
     DownloadCancelled,
     #[error("pre-condition: stat before download failed")]
     PreStatFailed(#[source] std::io::Error),
-    #[error("post-condition: stat after download failed")]
-    PostStatFailed(#[source] std::io::Error),
+
+    #[cfg(test)]
+    #[error("failpoint: {0:?}")]
+    Failpoint(failpoints::FailpointKind),
 }

 #[derive(Debug, PartialEq)]
@@ -1312,6 +1553,7 @@ impl Drop for DownloadedLayer {
             owner.on_downloaded_layer_drop(self.version);
         } else {
             // no need to do anything, we are shutting down
+            LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
         }
     }
 }
@@ -1484,7 +1726,7 @@ impl ResidentLayer {
     }

     /// Loads all keys stored in the layer. Returns key, lsn and value size.
-    #[tracing::instrument(skip_all, fields(layer=%self))]
+    #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(layer=%self))]
     pub(crate) async fn load_keys<'a>(
         &'a self,
         ctx: &RequestContext,
@@ -1504,9 +1746,9 @@ impl ResidentLayer {
             // while it's being held.
             delta_layer::DeltaLayerInner::load_keys(d, ctx)
                 .await
-                .context("Layer index is corrupted")
+                .with_context(|| format!("Layer index is corrupted for {self}"))
         }
-        Image(_) => anyhow::bail!("cannot load_keys on a image layer"),
+        Image(_) => anyhow::bail!(format!("cannot load_keys on an image layer {self}")),
     }
 }

@@ -1552,6 +1794,7 @@ pub(crate) struct LayerImplMetrics {
     rare_counters: enum_map::EnumMap,
     inits_cancelled: metrics::core::GenericCounter,
     redownload_after: metrics::Histogram,
+    time_to_evict: metrics::Histogram,
 }

 impl Default for LayerImplMetrics {
@@ -1647,6 +1890,13 @@ impl Default for LayerImplMetrics {
             .unwrap()
         };

+        let time_to_evict = metrics::register_histogram!(
+            "pageserver_layer_eviction_held_permit_seconds",
+            "Time eviction held the permit.",
+            vec![0.001, 0.010, 0.100, 0.500, 1.000, 5.000]
+        )
+        .unwrap();
+
         Self {
             started_evictions,
             completed_evictions,
@@ -1659,6 +1909,7 @@ impl Default for LayerImplMetrics {
             rare_counters,
             inits_cancelled,
             redownload_after,
+            time_to_evict,
         }
     }
 }
@@ -1690,9 +1941,10 @@ impl LayerImplMetrics {
         self.rare_counters[RareEvent::RemoveOnDropFailed].inc();
     }

-    /// Expected rare because requires a race with `evict_blocking` and `get_or_maybe_download`.
-    fn inc_retried_get_or_maybe_download(&self) {
-        self.rare_counters[RareEvent::RetriedGetOrMaybeDownload].inc();
+    /// Expected rare just as cancellations are rare, but we could have cancellations separate from
+    /// the single caller which can start the download, so use this counter to separate them.
+    fn inc_init_completed_without_requester(&self) {
+        self.rare_counters[RareEvent::InitCompletedWithoutRequester].inc();
     }

     /// Expected rare because cancellations are unexpected, and failures are unexpected
@@ -1719,10 +1971,6 @@ impl LayerImplMetrics {
         self.rare_counters[RareEvent::PermanentLoadingFailure].inc();
     }

-    fn inc_broadcast_lagged(&self) {
-        self.rare_counters[RareEvent::EvictAndWaitLagged].inc();
-    }
-
     fn inc_init_cancelled(&self) {
         self.inits_cancelled.inc()
     }
@@ -1730,9 +1978,22 @@ impl LayerImplMetrics {
     fn record_redownloaded_after(&self, duration: std::time::Duration) {
         self.redownload_after.observe(duration.as_secs_f64())
     }
+
+    /// This would be bad if it ever happened, or mean extreme disk pressure.
We should probably + /// instead cancel eviction if we would have read waiters. We cannot however separate reads + /// from other evictions, so this could have noise as well. + fn inc_evicted_with_waiters(&self) { + self.rare_counters[RareEvent::EvictedWithWaiters].inc(); + } + + /// Recorded at least initially as the permit is now acquired in async context before + /// spawn_blocking action. + fn record_time_to_evict(&self, duration: std::time::Duration) { + self.time_to_evict.observe(duration.as_secs_f64()) + } } -#[derive(enum_map::Enum)] +#[derive(Debug, Clone, Copy, enum_map::Enum)] enum EvictionCancelled { LayerGone, TimelineGone, @@ -1744,6 +2005,7 @@ enum EvictionCancelled { LostToDownload, /// After eviction, there was a new layer access which cancelled the eviction. UpgradedBackOnAccess, + UnexpectedEvictedState, } impl EvictionCancelled { @@ -1757,6 +2019,7 @@ impl EvictionCancelled { EvictionCancelled::AlreadyReinitialized => "already_reinitialized", EvictionCancelled::LostToDownload => "lost_to_download", EvictionCancelled::UpgradedBackOnAccess => "upgraded_back_on_access", + EvictionCancelled::UnexpectedEvictedState => "unexpected_evicted_state", } } } @@ -1779,12 +2042,12 @@ impl DeleteFailed { #[derive(enum_map::Enum)] enum RareEvent { RemoveOnDropFailed, - RetriedGetOrMaybeDownload, + InitCompletedWithoutRequester, DownloadFailedWithoutRequester, UpgradedWantedEvicted, InitWithoutDownload, PermanentLoadingFailure, - EvictAndWaitLagged, + EvictedWithWaiters, } impl RareEvent { @@ -1793,12 +2056,12 @@ impl RareEvent { match self { RemoveOnDropFailed => "remove_on_drop_failed", - RetriedGetOrMaybeDownload => "retried_gomd", + InitCompletedWithoutRequester => "init_completed_without", DownloadFailedWithoutRequester => "download_failed_without", UpgradedWantedEvicted => "raced_wanted_evicted", InitWithoutDownload => "init_needed_no_download", PermanentLoadingFailure => "permanent_loading_failure", - EvictAndWaitLagged => "broadcast_lagged", + EvictedWithWaiters => "evicted_with_waiters", } } } diff --git a/pageserver/src/tenant/storage_layer/layer/failpoints.rs b/pageserver/src/tenant/storage_layer/layer/failpoints.rs new file mode 100644 index 0000000000..6cedc41d98 --- /dev/null +++ b/pageserver/src/tenant/storage_layer/layer/failpoints.rs @@ -0,0 +1,119 @@ +//! failpoints for unit tests, implying `#[cfg(test)]`. +//! +//! These are not accessible over http. + +use super::*; + +impl Layer { + /// Enable a failpoint from a unit test. + pub(super) fn enable_failpoint(&self, failpoint: Failpoint) { + self.0.failpoints.lock().unwrap().push(failpoint); + } +} + +impl LayerInner { + /// Query if this failpoint is enabled, as in, arrive at a failpoint. + /// + /// Calls to this method need to be `#[cfg(test)]` guarded. + pub(super) async fn failpoint(&self, kind: FailpointKind) -> Result<(), FailpointHit> { + let fut = { + let mut fps = self.failpoints.lock().unwrap(); + // find the *last* failpoint for cases in which we need to use multiple for the same + // thing (two blocked evictions) + let fp = fps.iter_mut().rfind(|x| x.kind() == kind); + + let Some(fp) = fp else { + return Ok(()); + }; + + fp.hit() + }; + + fut.await + } +} + +#[derive(Debug, PartialEq, Eq)] +pub(crate) enum FailpointKind { + /// Failpoint acts as an accurate cancelled by drop here; see the only site of use. 
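+    /// (it returns `Err(FailpointHit)` immediately, at the point where a dropped future
+    /// would have stopped being polled)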
+ AfterDeterminingLayerNeedsNoDownload, + /// Failpoint for stalling eviction starting + WaitBeforeStartingEvicting, + /// Failpoint hit in the spawned task + WaitBeforeDownloading, +} + +pub(crate) enum Failpoint { + AfterDeterminingLayerNeedsNoDownload, + WaitBeforeStartingEvicting( + Option, + utils::completion::Barrier, + ), + WaitBeforeDownloading( + Option, + utils::completion::Barrier, + ), +} + +impl Failpoint { + fn kind(&self) -> FailpointKind { + match self { + Failpoint::AfterDeterminingLayerNeedsNoDownload => { + FailpointKind::AfterDeterminingLayerNeedsNoDownload + } + Failpoint::WaitBeforeStartingEvicting(..) => FailpointKind::WaitBeforeStartingEvicting, + Failpoint::WaitBeforeDownloading(..) => FailpointKind::WaitBeforeDownloading, + } + } + + fn hit(&mut self) -> impl std::future::Future> + 'static { + use futures::future::FutureExt; + + // use boxed futures to avoid Either hurdles + match self { + Failpoint::AfterDeterminingLayerNeedsNoDownload => { + let kind = self.kind(); + + async move { Err(FailpointHit(kind)) }.boxed() + } + Failpoint::WaitBeforeStartingEvicting(arrival, b) + | Failpoint::WaitBeforeDownloading(arrival, b) => { + // first one signals arrival + drop(arrival.take()); + + let b = b.clone(); + + async move { + tracing::trace!("waiting on a failpoint barrier"); + b.wait().await; + tracing::trace!("done waiting on a failpoint barrier"); + Ok(()) + } + .boxed() + } + } + } +} + +impl std::fmt::Display for FailpointKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Debug::fmt(self, f) + } +} + +#[derive(Debug)] +pub(crate) struct FailpointHit(FailpointKind); + +impl std::fmt::Display for FailpointHit { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Debug::fmt(self, f) + } +} + +impl std::error::Error for FailpointHit {} + +impl From for DownloadError { + fn from(value: FailpointHit) -> Self { + DownloadError::Failpoint(value.0) + } +} diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index b43534efd4..247ff123b5 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -1,14 +1,13 @@ -use futures::StreamExt; use pageserver_api::key::CONTROLFILE_KEY; use tokio::task::JoinSet; -use tracing::Instrument; use utils::{ completion::{self, Completion}, id::TimelineId, }; +use super::failpoints::{Failpoint, FailpointKind}; use super::*; -use crate::{context::DownloadBehavior, task_mgr::BACKGROUND_RUNTIME}; +use crate::context::DownloadBehavior; use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness}; /// Used in tests to advance a future to wanted await point, and not futher. @@ -21,7 +20,7 @@ const FOREVER: std::time::Duration = std::time::Duration::from_secs(ADVANCE.as_s /// Demonstrate the API and resident -> evicted -> resident -> deleted transitions. 
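 ///
 /// Roughly, the lifecycle exercised below (a sketch; see the test body for the real calls
 /// and assertions):
 ///
 /// ```ignore
 /// let resident = layer.keep_resident().await;           // starts out resident
 /// layer.evict_and_wait(FOREVER).await?;                 // -> evicted
 /// layer.0.get_or_maybe_download(true, None).await?;     // -> resident again
 /// layer.delete_on_drop();                               // -> deleted with the last drop
 /// ```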
#[tokio::test] async fn smoke_test() { - let handle = BACKGROUND_RUNTIME.handle(); + let handle = tokio::runtime::Handle::current(); let h = TenantHarness::create("smoke_test").unwrap(); let span = h.span(); @@ -38,7 +37,7 @@ async fn smoke_test() { let layer = { let mut layers = { let layers = timeline.layers.read().await; - layers.resident_layers().collect::>().await + layers.likely_resident_layers().collect::>() }; assert_eq!(layers.len(), 1); @@ -88,7 +87,7 @@ async fn smoke_test() { // // ZERO for timeout does not work reliably, so first take up all spawn_blocking slots to // artificially slow it down. - let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await; + let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(&handle).await; match layer .evict_and_wait(std::time::Duration::ZERO) @@ -99,7 +98,7 @@ async fn smoke_test() { // expected, but note that the eviction is "still ongoing" helper.release().await; // exhaust spawn_blocking pool to ensure it is now complete - SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle) + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle) .await; } other => unreachable!("{other:?}"), @@ -108,7 +107,7 @@ async fn smoke_test() { // only way to query if a layer is resident is to acquire a ResidentLayer instance. // Layer::keep_resident never downloads, but it might initialize if the layer file is found // downloaded locally. - let none = layer.keep_resident().await.unwrap(); + let none = layer.keep_resident().await; assert!( none.is_none(), "Expected none, because eviction removed the local file, found: {none:?}" @@ -167,6 +166,7 @@ async fn smoke_test() { rtc.wait_completion().await.unwrap(); assert_eq!(rtc.get_remote_physical_size(), 0); + assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get()) } /// This test demonstrates a previous hang when a eviction and deletion were requested at the same @@ -174,7 +174,7 @@ async fn smoke_test() { #[tokio::test(start_paused = true)] async fn evict_and_wait_on_wanted_deleted() { // this is the runtime on which Layer spawns the blocking tasks on - let handle = BACKGROUND_RUNTIME.handle(); + let handle = tokio::runtime::Handle::current(); let h = TenantHarness::create("evict_and_wait_on_wanted_deleted").unwrap(); utils::logging::replace_panic_hook_with_tracing_panic_hook().forget(); @@ -188,7 +188,7 @@ async fn evict_and_wait_on_wanted_deleted() { let layer = { let mut layers = { let layers = timeline.layers.read().await; - layers.resident_layers().collect::>().await + layers.likely_resident_layers().collect::>() }; assert_eq!(layers.len(), 1); @@ -213,11 +213,11 @@ async fn evict_and_wait_on_wanted_deleted() { drop(resident); // make sure the eviction task gets to run - SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await; + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await; let resident = layer.keep_resident().await; assert!( - matches!(resident, Ok(None)), + resident.is_none(), "keep_resident should not have re-initialized: {resident:?}" ); @@ -235,24 +235,332 @@ async fn evict_and_wait_on_wanted_deleted() { layers.finish_gc_timeline(&[layer]); } - SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await; + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await; assert_eq!(1, LAYER_IMPL_METRICS.started_deletes.get()); assert_eq!(1, 
LAYER_IMPL_METRICS.completed_deletes.get()); assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get()); assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get()); + assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get()) } -/// This test shows that ensures we are able to read the layer while the layer eviction has been -/// started but not completed due to spawn_blocking pool being blocked. -/// -/// Here `Layer::keep_resident` is used to "simulate" reads, because it cannot download. -#[tokio::test(start_paused = true)] -async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() { - // this is the runtime on which Layer spawns the blocking tasks on - let handle = BACKGROUND_RUNTIME.handle(); - let h = TenantHarness::create("residency_check_while_evict_and_wait_on_clogged_spawn_blocking") +/// This test ensures we are able to read the layer while the layer eviction has been +/// started but not completed. +#[test] +fn read_wins_pending_eviction() { + let rt = tokio::runtime::Builder::new_current_thread() + .max_blocking_threads(1) + .enable_all() + .start_paused(true) + .build() .unwrap(); + + rt.block_on(async move { + // this is the runtime on which Layer spawns the blocking tasks on + let handle = tokio::runtime::Handle::current(); + let h = TenantHarness::create("read_wins_pending_eviction").unwrap(); + let (tenant, ctx) = h.load().await; + let span = h.span(); + let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); + + let timeline = tenant + .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .await + .unwrap(); + + let layer = { + let mut layers = { + let layers = timeline.layers.read().await; + layers.likely_resident_layers().collect::>() + }; + + assert_eq!(layers.len(), 1); + + layers.swap_remove(0) + }; + + // setup done + + let resident = layer.keep_resident().await.unwrap(); + + let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER)); + + // drive the future to await on the status channel + tokio::time::timeout(ADVANCE, &mut evict_and_wait) + .await + .expect_err("should had been a timeout since we are holding the layer resident"); + assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get()); + + let (completion, barrier) = utils::completion::channel(); + let (arrival, arrived_at_barrier) = utils::completion::channel(); + layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting( + Some(arrival), + barrier, + )); + + // now the eviction cannot proceed because the threads are consumed while completion exists + drop(resident); + arrived_at_barrier.wait().await; + assert!(!layer.is_likely_resident()); + + // because no actual eviction happened, we get to just reinitialize the DownloadedLayer + layer + .0 + .get_or_maybe_download(false, None) + .instrument(download_span) + .await + .expect("should had reinitialized without downloading"); + + assert!(layer.is_likely_resident()); + + // reinitialization notifies of new resident status, which should error out all evict_and_wait + let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait) + .await + .expect("no timeout, because get_or_maybe_download re-initialized") + .expect_err("eviction should not have succeeded because re-initialized"); + + // works as intended: evictions lose to "downloads" + assert!(matches!(e, EvictionError::Downloaded), "{e:?}"); + assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get()); + + // this is not wrong: the eviction is technically still "on the way" as it's still queued + // because of a failpoint + assert_eq!( 
+ 0, + LAYER_IMPL_METRICS + .cancelled_evictions + .values() + .map(|ctr| ctr.get()) + .sum::() + ); + + drop(completion); + + tokio::time::sleep(ADVANCE).await; + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0(&handle, 1) + .await; + + assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get()); + + // now we finally can observe the original eviction failing + // it would had been possible to observe it earlier, but here it is guaranteed to have + // happened. + assert_eq!( + 1, + LAYER_IMPL_METRICS + .cancelled_evictions + .values() + .map(|ctr| ctr.get()) + .sum::() + ); + + assert_eq!( + 1, + LAYER_IMPL_METRICS.cancelled_evictions[EvictionCancelled::AlreadyReinitialized].get() + ); + + assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get()) + }); +} + +/// Use failpoint to delay an eviction starting to get a VersionCheckFailed. +#[test] +fn multiple_pending_evictions_in_order() { + let name = "multiple_pending_evictions_in_order"; + let in_order = true; + multiple_pending_evictions_scenario(name, in_order); +} + +/// Use failpoint to reorder later eviction before first to get a UnexpectedEvictedState. +#[test] +fn multiple_pending_evictions_out_of_order() { + let name = "multiple_pending_evictions_out_of_order"; + let in_order = false; + multiple_pending_evictions_scenario(name, in_order); +} + +fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) { + let rt = tokio::runtime::Builder::new_current_thread() + .max_blocking_threads(1) + .enable_all() + .start_paused(true) + .build() + .unwrap(); + + rt.block_on(async move { + // this is the runtime on which Layer spawns the blocking tasks on + let handle = tokio::runtime::Handle::current(); + let h = TenantHarness::create(name).unwrap(); + let (tenant, ctx) = h.load().await; + let span = h.span(); + let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); + + let timeline = tenant + .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .await + .unwrap(); + + let layer = { + let mut layers = { + let layers = timeline.layers.read().await; + layers.likely_resident_layers().collect::>() + }; + + assert_eq!(layers.len(), 1); + + layers.swap_remove(0) + }; + + // setup done + + let resident = layer.keep_resident().await.unwrap(); + + let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER)); + + // drive the future to await on the status channel + tokio::time::timeout(ADVANCE, &mut evict_and_wait) + .await + .expect_err("should had been a timeout since we are holding the layer resident"); + assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get()); + + let (completion1, barrier) = utils::completion::channel(); + let mut completion1 = Some(completion1); + let (arrival, arrived_at_barrier) = utils::completion::channel(); + layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting( + Some(arrival), + barrier, + )); + + // now the eviction cannot proceed because we are simulating arbitrary long delay for the + // eviction task start. 
+ drop(resident); + assert!(!layer.is_likely_resident()); + + arrived_at_barrier.wait().await; + + // because no actual eviction happened, we get to just reinitialize the DownloadedLayer + layer + .0 + .get_or_maybe_download(false, None) + .instrument(download_span) + .await + .expect("should had reinitialized without downloading"); + + assert!(layer.is_likely_resident()); + + // reinitialization notifies of new resident status, which should error out all evict_and_wait + let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait) + .await + .expect("no timeout, because get_or_maybe_download re-initialized") + .expect_err("eviction should not have succeeded because re-initialized"); + + // works as intended: evictions lose to "downloads" + assert!(matches!(e, EvictionError::Downloaded), "{e:?}"); + assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get()); + + // this is not wrong: the eviction is technically still "on the way" as it's still queued + // because of a failpoint + assert_eq!( + 0, + LAYER_IMPL_METRICS + .cancelled_evictions + .values() + .map(|ctr| ctr.get()) + .sum::() + ); + + assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get()); + + // configure another failpoint for the second eviction -- evictions are per initialization, + // so now that we've reinitialized the inner, we get to run two of them at the same time. + let (completion2, barrier) = utils::completion::channel(); + let (arrival, arrived_at_barrier) = utils::completion::channel(); + layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting( + Some(arrival), + barrier, + )); + + let mut second_eviction = std::pin::pin!(layer.evict_and_wait(FOREVER)); + + // advance to the wait on the queue + tokio::time::timeout(ADVANCE, &mut second_eviction) + .await + .expect_err("timeout because failpoint is blocking"); + + arrived_at_barrier.wait().await; + + assert_eq!(2, LAYER_IMPL_METRICS.started_evictions.get()); + + let mut release_earlier_eviction = |expected_reason| { + assert_eq!( + 0, + LAYER_IMPL_METRICS.cancelled_evictions[expected_reason].get(), + ); + + drop(completion1.take().unwrap()); + + let handle = &handle; + + async move { + tokio::time::sleep(ADVANCE).await; + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0( + handle, 1, + ) + .await; + + assert_eq!( + 1, + LAYER_IMPL_METRICS.cancelled_evictions[expected_reason].get(), + ); + } + }; + + if in_order { + release_earlier_eviction(EvictionCancelled::VersionCheckFailed).await; + } + + // release the later eviction which is for the current version + drop(completion2); + tokio::time::sleep(ADVANCE).await; + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0(&handle, 1) + .await; + + if !in_order { + release_earlier_eviction(EvictionCancelled::UnexpectedEvictedState).await; + } + + tokio::time::timeout(ADVANCE, &mut second_eviction) + .await + .expect("eviction goes through now that spawn_blocking is unclogged") + .expect("eviction should succeed, because version matches"); + + assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get()); + + // ensure the cancelled are unchanged + assert_eq!( + 1, + LAYER_IMPL_METRICS + .cancelled_evictions + .values() + .map(|ctr| ctr.get()) + .sum::() + ); + + assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get()) + }); +} + +/// The test ensures with a failpoint that a pending eviction is not cancelled by what is currently +/// a `Layer::keep_resident` call. 
+///
+/// This matters because cancelling the eviction would leave us in a state where the file is on
+/// disk but the layer internal state says it has not been initialized. Furthermore, it allows us
+/// to have non-repairing `Layer::is_likely_resident`.
+#[tokio::test(start_paused = true)]
+async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
+    let handle = tokio::runtime::Handle::current();
+    let h =
+        TenantHarness::create("cancelled_get_or_maybe_download_does_not_cancel_eviction").unwrap();
     let (tenant, ctx) = h.load().await;

     let timeline = tenant
@@ -263,7 +571,7 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
     let layer = {
         let mut layers = {
             let layers = timeline.layers.read().await;
-            layers.resident_layers().collect::<Vec<_>>().await
+            layers.likely_resident_layers().collect::<Vec<_>>()
         };

         assert_eq!(layers.len(), 1);
@@ -271,90 +579,154 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
         layers.swap_remove(0)
     };

-    // setup done
+    // this failpoint will simulate the `get_or_maybe_download` becoming cancelled (by returning an
+    // Err) at the right time as in "during" the `LayerInner::needs_download`.
+    layer.enable_failpoint(Failpoint::AfterDeterminingLayerNeedsNoDownload);

-    let resident = layer.keep_resident().await.unwrap();
+    let (completion, barrier) = utils::completion::channel();
+    let (arrival, arrived_at_barrier) = utils::completion::channel();
+
+    layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting(
+        Some(arrival),
+        barrier,
+    ));
+
+    tokio::time::timeout(ADVANCE, layer.evict_and_wait(FOREVER))
+        .await
+        .expect_err("should have advanced to waiting on channel");
+
+    arrived_at_barrier.wait().await;
+
+    // simulate a cancelled read which is cancelled before it gets to re-initialize
+    let e = layer
+        .0
+        .get_or_maybe_download(false, None)
+        .await
+        .unwrap_err();
+    assert!(
+        matches!(
+            e,
+            DownloadError::Failpoint(FailpointKind::AfterDeterminingLayerNeedsNoDownload)
+        ),
+        "{e:?}"
+    );
+
+    assert!(
+        layer.0.needs_download().await.unwrap().is_none(),
+        "file is still on disk"
+    );
+
+    // release the eviction task
+    drop(completion);
+    tokio::time::sleep(ADVANCE).await;
+    SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;
+
+    // failpoint is still enabled, but it is not hit
+    let e = layer
+        .0
+        .get_or_maybe_download(false, None)
+        .await
+        .unwrap_err();
+    assert!(matches!(e, DownloadError::DownloadRequired), "{e:?}");
+
+    // failpoint is not counted as cancellation either
+    assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
+}
+
+#[tokio::test(start_paused = true)]
+async fn evict_and_wait_does_not_wait_for_download() {
+    // let handle = tokio::runtime::Handle::current();
+    let h = TenantHarness::create("evict_and_wait_does_not_wait_for_download").unwrap();
+    let (tenant, ctx) = h.load().await;
+    let span = h.span();
+    let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
+
+    let timeline = tenant
+        .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
+        .await
+        .unwrap();
+
+    let layer = {
+        let mut layers = {
+            let layers = timeline.layers.read().await;
+            layers.likely_resident_layers().collect::<Vec<_>>()
+        };
+
+        assert_eq!(layers.len(), 1);
+
+        layers.swap_remove(0)
+    };
+
+    // kind of forced setup: start an eviction but do not allow it to progress until we are
+    // downloading
+    let (eviction_can_continue, barrier) = utils::completion::channel();
+    let (arrival, eviction_arrived) = 
utils::completion::channel(); + layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting( + Some(arrival), + barrier, + )); let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER)); - // drive the future to await on the status channel + // use this once-awaited other_evict to synchronize with the eviction + let other_evict = layer.evict_and_wait(FOREVER); + tokio::time::timeout(ADVANCE, &mut evict_and_wait) .await - .expect_err("should had been a timeout since we are holding the layer resident"); - assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get()); + .expect_err("should had advanced"); + eviction_arrived.wait().await; + drop(eviction_can_continue); + other_evict.await.unwrap(); - // clog up BACKGROUND_RUNTIME spawn_blocking - let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await; + // now the layer is evicted, and the "evict_and_wait" is waiting on the receiver + assert!(!layer.is_likely_resident()); - // now the eviction cannot proceed because the threads are consumed while completion exists - drop(resident); + // following new evict_and_wait will fail until we've completed the download + let e = layer.evict_and_wait(FOREVER).await.unwrap_err(); + assert!(matches!(e, EvictionError::NotFound), "{e:?}"); - // because no actual eviction happened, we get to just reinitialize the DownloadedLayer - layer - .keep_resident() - .await - .expect("keep_resident should had reinitialized without downloading") - .expect("ResidentLayer"); + let (download_can_continue, barrier) = utils::completion::channel(); + let (arrival, _download_arrived) = utils::completion::channel(); + layer.enable_failpoint(Failpoint::WaitBeforeDownloading(Some(arrival), barrier)); - // because the keep_resident check alters wanted evicted without sending a message, we will - // never get completed - let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait) - .await - .expect("no timeout, because keep_resident re-initialized") - .expect_err("eviction should not have succeeded because re-initialized"); + let mut download = std::pin::pin!(layer + .0 + .get_or_maybe_download(true, None) + .instrument(download_span)); - // works as intended: evictions lose to "downloads" - assert!(matches!(e, EvictionError::Downloaded), "{e:?}"); - assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get()); - - // this is not wrong: the eviction is technically still "on the way" as it's still queued - // because spawn_blocking is clogged up - assert_eq!( - 0, - LAYER_IMPL_METRICS - .cancelled_evictions - .values() - .map(|ctr| ctr.get()) - .sum::() + assert!( + !layer.is_likely_resident(), + "during download layer is evicted" ); - let mut second_eviction = std::pin::pin!(layer.evict_and_wait(FOREVER)); - - // advance to the wait on the queue - tokio::time::timeout(ADVANCE, &mut second_eviction) + tokio::time::timeout(ADVANCE, &mut download) .await - .expect_err("timeout because spawn_blocking is clogged"); + .expect_err("should had timed out because of failpoint"); - // in this case we don't leak started evictions, but I think there is still a chance of that - // happening, because we could have upgrades race multiple evictions while only one of them - // happens? 
- assert_eq!(2, LAYER_IMPL_METRICS.started_evictions.get()); + // now we finally get to continue, and because the latest state is downloading, we deduce that + // original eviction succeeded + evict_and_wait.await.unwrap(); - helper.release().await; + // however a new evict_and_wait will fail + let e = layer.evict_and_wait(FOREVER).await.unwrap_err(); + assert!(matches!(e, EvictionError::NotFound), "{e:?}"); - // the second_eviction gets to run here - // - // synchronize to be *strictly* after the second_eviction spawn_blocking run - SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await; + assert!(!layer.is_likely_resident()); - tokio::time::timeout(ADVANCE, &mut second_eviction) - .await - .expect("eviction goes through now that spawn_blocking is unclogged") - .expect("eviction should succeed, because version matches"); + drop(download_can_continue); + download.await.expect("download should had succeeded"); + assert!(layer.is_likely_resident()); - assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get()); + // only now can we evict + layer.evict_and_wait(FOREVER).await.unwrap(); +} - // now we finally can observe the original spawn_blocking failing - // it would had been possible to observe it earlier, but here it is guaranteed to have - // happened. - assert_eq!( - 1, - LAYER_IMPL_METRICS - .cancelled_evictions - .values() - .map(|ctr| ctr.get()) - .sum::() - ); +#[test] +fn layer_size() { + assert_eq!(std::mem::size_of::(), 2040); + assert_eq!(std::mem::size_of::(), 104); + assert_eq!(std::mem::size_of::(), 2328); + // it also has the utf8 path } struct SpawnBlockingPoolHelper { @@ -371,31 +743,41 @@ impl SpawnBlockingPoolHelper { /// /// This should be no issue nowdays, because nextest runs each test in it's own process. async fn consume_all_spawn_blocking_threads(handle: &tokio::runtime::Handle) -> Self { - let (completion, barrier) = completion::channel(); - let (tx, mut rx) = tokio::sync::mpsc::channel(8); + let default_max_blocking_threads = 512; - let assumed_max_blocking_threads = 512; + Self::consume_all_spawn_blocking_threads0(handle, default_max_blocking_threads).await + } + + async fn consume_all_spawn_blocking_threads0( + handle: &tokio::runtime::Handle, + threads: usize, + ) -> Self { + assert_ne!(threads, 0); + + let (completion, barrier) = completion::channel(); + let (started, starts_completed) = completion::channel(); let mut blocking_tasks = JoinSet::new(); - for _ in 0..assumed_max_blocking_threads { + for _ in 0..threads { let barrier = barrier.clone(); - let tx = tx.clone(); + let started = started.clone(); blocking_tasks.spawn_blocking_on( move || { - tx.blocking_send(()).unwrap(); - drop(tx); + drop(started); tokio::runtime::Handle::current().block_on(barrier.wait()); }, handle, ); } + drop(started); + + starts_completed.wait().await; + drop(barrier); - for _ in 0..assumed_max_blocking_threads { - rx.recv().await.unwrap(); - } + tracing::trace!("consumed all threads"); SpawnBlockingPoolHelper { awaited_by_spawn_blocking_tasks: completion, @@ -415,13 +797,22 @@ impl SpawnBlockingPoolHelper { while let Some(res) = blocking_tasks.join_next().await { res.expect("none of the tasks should had panicked"); } + + tracing::trace!("released all threads"); } /// In the tests it is used as an easy way of making sure something scheduled on the target /// runtimes `spawn_blocking` has completed, because it must've been scheduled and completed /// before our tasks have a chance to schedule and complete. 
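     ///
     /// Rough usage (a sketch; mirrors the call sites in the tests above):
     ///
     /// ```ignore
     /// // once this returns, everything previously queued on spawn_blocking has finished
     /// SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;
     /// ```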
async fn consume_and_release_all_of_spawn_blocking_threads(handle: &tokio::runtime::Handle) { - Self::consume_all_spawn_blocking_threads(handle) + Self::consume_and_release_all_of_spawn_blocking_threads0(handle, 512).await + } + + async fn consume_and_release_all_of_spawn_blocking_threads0( + handle: &tokio::runtime::Handle, + threads: usize, + ) { + Self::consume_all_spawn_blocking_threads0(handle, threads) .await .release() .await @@ -435,7 +826,7 @@ fn spawn_blocking_pool_helper_actually_works() { // because the amount is not configurable for our helper, expect the same amount as // BACKGROUND_RUNTIME using the tokio defaults would have. let rt = tokio::runtime::Builder::new_current_thread() - .max_blocking_threads(512) + .max_blocking_threads(1) .enable_all() .build() .unwrap(); @@ -445,7 +836,8 @@ fn spawn_blocking_pool_helper_actually_works() { rt.block_on(async move { // this will not return until all threads are spun up and actually executing the code // waiting on `consumed` to be `SpawnBlockingPoolHelper::release`'d. - let consumed = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await; + let consumed = + SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads0(handle, 1).await; println!("consumed"); diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index e4f5f75132..db32223a60 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -8,7 +8,7 @@ use std::time::{Duration, Instant}; use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::TENANT_TASK_EVENTS; use crate::task_mgr; -use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; +use crate::task_mgr::TaskKind; use crate::tenant::throttle::Stats; use crate::tenant::timeline::CompactionError; use crate::tenant::{Tenant, TenantState}; @@ -18,7 +18,7 @@ use utils::{backoff, completion}; static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy = once_cell::sync::Lazy::new(|| { - let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS; + let total_threads = *crate::task_mgr::THE_RUNTIME_WORKER_THREADS; let permits = usize::max( 1, // while a lot of the work is done on spawn_blocking, we still do @@ -85,7 +85,6 @@ pub fn start_background_loops( ) { let tenant_shard_id = tenant.tenant_shard_id; task_mgr::spawn( - BACKGROUND_RUNTIME.handle(), TaskKind::Compaction, Some(tenant_shard_id), None, @@ -109,7 +108,6 @@ pub fn start_background_loops( }, ); task_mgr::spawn( - BACKGROUND_RUNTIME.handle(), TaskKind::GarbageCollector, Some(tenant_shard_id), None, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2ab7301cce..289dee75ab 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -13,7 +13,6 @@ use bytes::Bytes; use camino::Utf8Path; use enumset::EnumSet; use fail::fail_point; -use futures::stream::StreamExt; use once_cell::sync::Lazy; use pageserver_api::{ key::AUX_FILES_KEY, @@ -37,6 +36,7 @@ use tracing::*; use utils::{ bin_ser::BeSer, sync::gate::{Gate, GateGuard}, + vec_map::VecMap, }; use std::ops::{Deref, Range}; @@ -1723,7 +1723,6 @@ impl Timeline { initdb_optimization_count: 0, }; task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::LayerFlushTask, Some(self.tenant_shard_id), Some(self.timeline_id), @@ -2086,7 +2085,6 @@ impl Timeline { DownloadBehavior::Download, ); task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::InitialLogicalSizeCalculation, Some(self.tenant_shard_id), Some(self.timeline_id), @@ 
-2264,7 +2262,6 @@ impl Timeline { DownloadBehavior::Download, ); task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::OndemandLogicalSizeCalculation, Some(self.tenant_shard_id), Some(self.timeline_id), @@ -2442,7 +2439,7 @@ impl Timeline { let guard = self.layers.read().await; - let resident = guard.resident_layers().map(|layer| { + let resident = guard.likely_resident_layers().map(|layer| { let last_activity_ts = layer.access_stats().latest_activity_or_now(); HeatMapLayer::new( @@ -2452,7 +2449,7 @@ impl Timeline { ) }); - let layers = resident.collect().await; + let layers = resident.collect(); Some(HeatMapTimeline::new(self.timeline_id, layers)) } @@ -3840,7 +3837,7 @@ impl Timeline { }; let timer = self.metrics.garbage_collect_histo.start_timer(); - fail_point!("before-timeline-gc"); + pausable_failpoint!("before-timeline-gc"); // Is the timeline being deleted? if self.is_stopping() { @@ -4151,7 +4148,6 @@ impl Timeline { let self_clone = Arc::clone(&self); let task_id = task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::DownloadAllRemoteLayers, Some(self.tenant_shard_id), Some(self.timeline_id), @@ -4302,7 +4298,7 @@ impl Timeline { let mut max_layer_size: Option = None; let resident_layers = guard - .resident_layers() + .likely_resident_layers() .map(|layer| { let file_size = layer.layer_desc().file_size; max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size))); @@ -4315,8 +4311,7 @@ impl Timeline { relative_last_activity: finite_f32::FiniteF32::ZERO, } }) - .collect() - .await; + .collect(); DiskUsageEvictionInfo { max_layer_size, @@ -4618,16 +4613,15 @@ impl<'a> TimelineWriter<'a> { } } - /// Put a batch keys at the specified Lsns. + /// Put a batch of keys at the specified Lsns. /// - /// The batch should be sorted by Lsn such that it's safe - /// to roll the open layer mid batch. + /// The batch is sorted by Lsn (enforced by usage of [`utils::vec_map::VecMap`]. pub(crate) async fn put_batch( &mut self, - batch: Vec<(Key, Lsn, Value)>, + batch: VecMap, ctx: &RequestContext, ) -> anyhow::Result<()> { - for (key, lsn, val) in batch { + for (lsn, (key, val)) in batch { self.put(key, lsn, &val, ctx).await? 
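+            // note: VecMap iterates in ascending Lsn order, which is what makes it safe to
+            // roll the open layer in the middle of the batch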
} @@ -4713,7 +4707,6 @@ mod tests { .keep_resident() .await .expect("no download => no downloading errors") - .expect("should had been resident") .drop_eviction_guard(); let forever = std::time::Duration::from_secs(120); @@ -4724,7 +4717,7 @@ mod tests { let (first, second) = tokio::join!(first, second); let res = layer.keep_resident().await; - assert!(matches!(res, Ok(None)), "{res:?}"); + assert!(res.is_none(), "{res:?}"); match (first, second) { (Ok(()), Ok(())) => { diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index a0c9d99196..d2272fc75f 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -443,7 +443,6 @@ impl DeleteTimelineFlow { let timeline_id = timeline.timeline_id; task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), TaskKind::TimelineDeletionWorker, Some(tenant_shard_id), Some(timeline_id), diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index dd603135d2..f84a4b0dac 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -28,7 +28,7 @@ use tracing::{debug, error, info, info_span, instrument, warn, Instrument}; use crate::{ context::{DownloadBehavior, RequestContext}, pgdatadir_mapping::CollectKeySpaceError, - task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, + task_mgr::{self, TaskKind}, tenant::{ tasks::BackgroundLoopKind, timeline::EvictionError, LogicalSizeCalculationCause, Tenant, }, @@ -56,7 +56,6 @@ impl Timeline { let self_clone = Arc::clone(self); let background_tasks_can_start = background_tasks_can_start.cloned(); task_mgr::spawn( - BACKGROUND_RUNTIME.handle(), TaskKind::Eviction, Some(self.tenant_shard_id), Some(self.timeline_id), @@ -225,24 +224,18 @@ impl Timeline { { let guard = self.layers.read().await; let layers = guard.layer_map(); - for hist_layer in layers.iter_historic_layers() { - let hist_layer = guard.get_from_desc(&hist_layer); + for layer in layers.iter_historic_layers() { + let layer = guard.get_from_desc(&layer); // guard against eviction while we inspect it; it might be that eviction_task and // disk_usage_eviction_task both select the same layers to be evicted, and // seemingly free up double the space. both succeeding is of no consequence. - let guard = match hist_layer.keep_resident().await { - Ok(Some(l)) => l, - Ok(None) => continue, - Err(e) => { - // these should not happen, but we cannot make them statically impossible right - // now. 
- tracing::warn!(layer=%hist_layer, "failed to keep the layer resident: {e:#}"); - continue; - } - }; - let last_activity_ts = hist_layer.access_stats().latest_activity_or_now(); + if !layer.is_likely_resident() { + continue; + } + + let last_activity_ts = layer.access_stats().latest_activity_or_now(); let no_activity_for = match now.duration_since(last_activity_ts) { Ok(d) => d, @@ -265,9 +258,8 @@ impl Timeline { continue; } }; - let layer = guard.drop_eviction_guard(); + if no_activity_for > p.threshold { - // this could cause a lot of allocations in some cases js.spawn(async move { layer .evict_and_wait(std::time::Duration::from_secs(5)) diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index ebcdcfdb4d..d54dc1642c 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -1,5 +1,4 @@ use anyhow::{bail, ensure, Context, Result}; -use futures::StreamExt; use pageserver_api::shard::TenantShardId; use std::{collections::HashMap, sync::Arc}; use tracing::trace; @@ -241,29 +240,16 @@ impl LayerManager { layer.delete_on_drop(); } - pub(crate) fn resident_layers(&self) -> impl futures::stream::Stream + '_ { + pub(crate) fn likely_resident_layers(&self) -> impl Iterator + '_ { // for small layer maps, we most likely have all resident, but for larger more are likely // to be evicted assuming lots of layers correlated with longer lifespan. - let layers = self - .layer_map() - .iter_historic_layers() - .map(|desc| self.get_from_desc(&desc)); - - let layers = futures::stream::iter(layers); - - layers.filter_map(|layer| async move { - // TODO(#6028): this query does not really need to see the ResidentLayer - match layer.keep_resident().await { - Ok(Some(layer)) => Some(layer.drop_eviction_guard()), - Ok(None) => None, - Err(e) => { - // these should not happen, but we cannot make them statically impossible right - // now. 
- tracing::warn!(%layer, "failed to keep the layer resident: {e:#}"); - None - } - } + self.layer_map().iter_historic_layers().filter_map(|desc| { + self.layer_fmgr + .0 + .get(&desc.key()) + .filter(|l| l.is_likely_resident()) + .cloned() }) } diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index 2fab6722b8..3592dda8d7 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -24,7 +24,7 @@ mod connection_manager; mod walreceiver_connection; use crate::context::{DownloadBehavior, RequestContext}; -use crate::task_mgr::{self, TaskKind, WALRECEIVER_RUNTIME}; +use crate::task_mgr::{self, TaskKind}; use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::timeline::walreceiver::connection_manager::{ connection_manager_loop_step, ConnectionManagerState, @@ -82,7 +82,6 @@ impl WalReceiver { let loop_status = Arc::new(std::sync::RwLock::new(None)); let manager_status = Arc::clone(&loop_status); task_mgr::spawn( - WALRECEIVER_RUNTIME.handle(), TaskKind::WalReceiverManager, Some(timeline.tenant_shard_id), Some(timeline_id), @@ -181,7 +180,7 @@ impl TaskHandle { let (events_sender, events_receiver) = watch::channel(TaskStateUpdate::Started); let cancellation_clone = cancellation.clone(); - let join_handle = WALRECEIVER_RUNTIME.spawn(async move { + let join_handle = tokio::spawn(async move { events_sender.send(TaskStateUpdate::Started).ok(); task(events_sender, cancellation_clone).await // events_sender is dropped at some point during the .await above. diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 8297ca6563..cf87cc6ce0 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -11,7 +11,6 @@ use std::{ use anyhow::{anyhow, Context}; use bytes::BytesMut; use chrono::{NaiveDateTime, Utc}; -use fail::fail_point; use futures::StreamExt; use postgres::{error::SqlState, SimpleQueryMessage, SimpleQueryRow}; use postgres_ffi::WAL_SEGMENT_SIZE; @@ -27,9 +26,7 @@ use super::TaskStateUpdate; use crate::{ context::RequestContext, metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST}, - task_mgr, - task_mgr::TaskKind, - task_mgr::WALRECEIVER_RUNTIME, + task_mgr::{self, TaskKind}, tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo}, walingest::WalIngest, walrecord::DecodedWALRecord, @@ -163,7 +160,6 @@ pub(super) async fn handle_walreceiver_connection( ); let connection_cancellation = cancellation.clone(); task_mgr::spawn( - WALRECEIVER_RUNTIME.handle(), TaskKind::WalReceiverConnectionPoller, Some(timeline.tenant_shard_id), Some(timeline.timeline_id), @@ -329,7 +325,17 @@ pub(super) async fn handle_walreceiver_connection( filtered_records += 1; } - fail_point!("walreceiver-after-ingest"); + // don't simply use pausable_failpoint here because its spawn_blocking slows + // slows down the tests too much. 
+ fail::fail_point!("walreceiver-after-ingest-blocking"); + if let Err(()) = (|| { + fail::fail_point!("walreceiver-after-ingest-pause-activate", |_| { + Err(()) + }); + Ok(()) + })() { + pausable_failpoint!("walreceiver-after-ingest-pause"); + } last_rec_lsn = lsn; @@ -448,6 +454,7 @@ pub(super) async fn handle_walreceiver_connection( disk_consistent_lsn, remote_consistent_lsn, replytime: ts, + shard_number: timeline.tenant_shard_id.shard_number.0 as u32, }; debug!("neon_status_update {status_update:?}"); diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 63a2b30d09..9c7e8748d5 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -109,6 +109,8 @@ impl WalIngest { self.checkpoint_modified = true; } + failpoint_support::sleep_millis_async!("wal-ingest-record-sleep"); + match decoded.xl_rmid { pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => { // Heap AM records need some special handling, because they modify VM pages diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 6ede78a576..8d236144b5 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -312,7 +312,7 @@ pg_cluster_size(PG_FUNCTION_ARGS) { int64 size; - size = GetZenithCurrentClusterSize(); + size = GetNeonCurrentClusterSize(); if (size == 0) PG_RETURN_NULL(); diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index a0f8c97497..5c653fc6c6 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -26,6 +26,8 @@ extern void pg_init_libpagestore(void); extern void pg_init_walproposer(void); extern uint64 BackpressureThrottlingTime(void); +extern void SetNeonCurrentClusterSize(uint64 size); +extern uint64 GetNeonCurrentClusterSize(void); extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); extern void PGDLLEXPORT WalProposerSync(int argc, char *argv[]); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 0256de2b9a..2d222e3c7c 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -1831,7 +1831,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT && !IsAutoVacuumWorkerProcess()) { - uint64 current_size = GetZenithCurrentClusterSize(); + uint64 current_size = GetNeonCurrentClusterSize(); if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024) ereport(ERROR, @@ -1912,7 +1912,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT && !IsAutoVacuumWorkerProcess()) { - uint64 current_size = GetZenithCurrentClusterSize(); + uint64 current_size = GetNeonCurrentClusterSize(); if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024) ereport(ERROR, diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 9ff0493352..d7987954d4 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -70,7 +70,7 @@ static bool SendAppendRequests(Safekeeper *sk); static bool RecvAppendResponses(Safekeeper *sk); static XLogRecPtr CalculateMinFlushLsn(WalProposer *wp); static XLogRecPtr GetAcknowledgedByQuorumWALPosition(WalProposer *wp); -static void HandleSafekeeperResponse(WalProposer *wp); +static void HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk); static bool AsyncRead(Safekeeper *sk, char **buf, int *buf_size); static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg); static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state); @@ -1405,7 
+1405,6 @@ static bool RecvAppendResponses(Safekeeper *sk) { WalProposer *wp = sk->wp; - XLogRecPtr newCommitLsn; bool readAnything = false; while (true) @@ -1425,6 +1424,8 @@ RecvAppendResponses(Safekeeper *sk) LSN_FORMAT_ARGS(sk->appendResponse.commitLsn), sk->host, sk->port); + readAnything = true; + if (sk->appendResponse.term > wp->propTerm) { /* @@ -1438,35 +1439,28 @@ RecvAppendResponses(Safekeeper *sk) sk->appendResponse.term, wp->propTerm); } - readAnything = true; + HandleSafekeeperResponse(wp, sk); } if (!readAnything) return sk->state == SS_ACTIVE; - /* update commit_lsn */ - newCommitLsn = GetAcknowledgedByQuorumWALPosition(wp); - /* - * Send the new value to all safekeepers. - */ - if (newCommitLsn > wp->commitLsn) - { - wp->commitLsn = newCommitLsn; - BroadcastAppendRequest(wp); - } - - HandleSafekeeperResponse(wp); - return sk->state == SS_ACTIVE; } +#define psfeedback_log(fmt, key, ...) \ + wp_log(DEBUG2, "ParsePageserverFeedbackMessage: %s " fmt, key, __VA_ARGS__) + /* Parse a PageserverFeedback message, or the PageserverFeedback part of an AppendResponse */ static void -ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, PageserverFeedback *rf) +ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, PageserverFeedback *ps_feedback) { uint8 nkeys; int i; - int32 len; + + /* initialize the struct before parsing */ + memset(ps_feedback, 0, sizeof(PageserverFeedback)); + ps_feedback->present = true; /* get number of custom keys */ nkeys = pq_getmsgbyte(reply_message); @@ -1474,66 +1468,52 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese for (i = 0; i < nkeys; i++) { const char *key = pq_getmsgstring(reply_message); + unsigned int value_len = pq_getmsgint(reply_message, sizeof(int32)); if (strcmp(key, "current_timeline_size") == 0) { - pq_getmsgint(reply_message, sizeof(int32)); - /* read value length */ - rf->currentClusterSize = pq_getmsgint64(reply_message); - wp_log(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu", - rf->currentClusterSize); + Assert(value_len == sizeof(int64)); + ps_feedback->currentClusterSize = pq_getmsgint64(reply_message); + psfeedback_log(UINT64_FORMAT, key, ps_feedback->currentClusterSize); } else if ((strcmp(key, "ps_writelsn") == 0) || (strcmp(key, "last_received_lsn") == 0)) { - pq_getmsgint(reply_message, sizeof(int32)); - /* read value length */ - rf->last_received_lsn = pq_getmsgint64(reply_message); - wp_log(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X", - LSN_FORMAT_ARGS(rf->last_received_lsn)); + Assert(value_len == sizeof(int64)); + ps_feedback->last_received_lsn = pq_getmsgint64(reply_message); + psfeedback_log("%X/%X", key, LSN_FORMAT_ARGS(ps_feedback->last_received_lsn)); } else if ((strcmp(key, "ps_flushlsn") == 0) || (strcmp(key, "disk_consistent_lsn") == 0)) { - pq_getmsgint(reply_message, sizeof(int32)); - /* read value length */ - rf->disk_consistent_lsn = pq_getmsgint64(reply_message); - wp_log(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X", - LSN_FORMAT_ARGS(rf->disk_consistent_lsn)); + Assert(value_len == sizeof(int64)); + ps_feedback->disk_consistent_lsn = pq_getmsgint64(reply_message); + psfeedback_log("%X/%X", key, LSN_FORMAT_ARGS(ps_feedback->disk_consistent_lsn)); } else if ((strcmp(key, "ps_applylsn") == 0) || (strcmp(key, "remote_consistent_lsn") == 0)) { - pq_getmsgint(reply_message, sizeof(int32)); - /* read value length */ - rf->remote_consistent_lsn = pq_getmsgint64(reply_message); - 
wp_log(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X", - LSN_FORMAT_ARGS(rf->remote_consistent_lsn)); + Assert(value_len == sizeof(int64)); + ps_feedback->remote_consistent_lsn = pq_getmsgint64(reply_message); + psfeedback_log("%X/%X", key, LSN_FORMAT_ARGS(ps_feedback->remote_consistent_lsn)); } else if ((strcmp(key, "ps_replytime") == 0) || (strcmp(key, "replytime") == 0)) { - pq_getmsgint(reply_message, sizeof(int32)); - /* read value length */ - rf->replytime = pq_getmsgint64(reply_message); - { - char *replyTimeStr; - - /* Copy because timestamptz_to_str returns a static buffer */ - replyTimeStr = pstrdup(timestamptz_to_str(rf->replytime)); - wp_log(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s", - rf->replytime, replyTimeStr); - - pfree(replyTimeStr); - } + Assert(value_len == sizeof(int64)); + ps_feedback->replytime = pq_getmsgint64(reply_message); + psfeedback_log("%s", key, timestamptz_to_str(ps_feedback->replytime)); + } + else if (strcmp(key, "shard_number") == 0) + { + Assert(value_len == sizeof(uint32)); + ps_feedback->shard_number = pq_getmsgint(reply_message, sizeof(uint32)); + psfeedback_log("%u", key, ps_feedback->shard_number); } else { - len = pq_getmsgint(reply_message, sizeof(int32)); - /* read value length */ - /* * Skip unknown keys to support backward compatibile protocol * changes */ - wp_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len); - pq_getmsgbytes(reply_message, len); + wp_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, value_len); + pq_getmsgbytes(reply_message, value_len); }; } } @@ -1630,12 +1610,30 @@ GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn) return donor; } +/* + * Process AppendResponse message from safekeeper. + */ static void -HandleSafekeeperResponse(WalProposer *wp) +HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk) { XLogRecPtr candidateTruncateLsn; + XLogRecPtr newCommitLsn; - wp->api.process_safekeeper_feedback(wp); + newCommitLsn = GetAcknowledgedByQuorumWALPosition(wp); + if (newCommitLsn > wp->commitLsn) + { + wp->commitLsn = newCommitLsn; + /* Send new value to all safekeepers. */ + BroadcastAppendRequest(wp); + } + + /* + * Unlock syncrep waiters, update ps_feedback, CheckGracefulShutdown(). + * The last one will terminate the process if the shutdown is requested + * and WAL is committed by the quorum. BroadcastAppendRequest() should be + * called to notify safekeepers about the new commitLsn. 
+ */ + wp->api.process_safekeeper_feedback(wp, sk); /* * Try to advance truncateLsn -- the last record flushed to all @@ -1811,8 +1809,10 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) msg->hs.ts = pq_getmsgint64_le(&s); msg->hs.xmin.value = pq_getmsgint64_le(&s); msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); - if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE) - ParsePageserverFeedbackMessage(wp, &s, &msg->rf); + if (s.len > s.cursor) + ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback); + else + msg->ps_feedback.present = false; pq_getmsgend(&s); return true; } diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index bc674fd979..69a557fdf2 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -10,6 +10,7 @@ #include "libpqwalproposer.h" #include "neon_walreader.h" +#include "pagestore_client.h" #define SK_MAGIC 0xCafeCeefu #define SK_PROTOCOL_VERSION 2 @@ -269,6 +270,8 @@ typedef struct HotStandbyFeedback typedef struct PageserverFeedback { + /* true if AppendResponse contains this feedback */ + bool present; /* current size of the timeline on pageserver */ uint64 currentClusterSize; /* standby_status_update fields that safekeeper received from pageserver */ @@ -276,14 +279,22 @@ typedef struct PageserverFeedback XLogRecPtr disk_consistent_lsn; XLogRecPtr remote_consistent_lsn; TimestampTz replytime; + uint32 shard_number; } PageserverFeedback; typedef struct WalproposerShmemState { slock_t mutex; - PageserverFeedback feedback; term_t mineLastElectedTerm; pg_atomic_uint64 backpressureThrottlingTime; + pg_atomic_uint64 currentClusterSize; + + /* last feedback from each shard */ + PageserverFeedback shard_ps_feedback[MAX_SHARDS]; + int num_shards; + + /* aggregated feedback with min LSNs across shards */ + PageserverFeedback min_ps_feedback; } WalproposerShmemState; /* @@ -307,12 +318,12 @@ typedef struct AppendResponse /* Feedback received from pageserver includes standby_status_update fields */ /* and custom neon feedback. */ /* This part of the message is extensible. */ - PageserverFeedback rf; + PageserverFeedback ps_feedback; } AppendResponse; /* PageserverFeedback is extensible part of the message that is parsed separately */ /* Other fields are fixed part */ -#define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf) +#define APPENDRESPONSE_FIXEDPART_SIZE 56 struct WalProposer; typedef struct WalProposer WalProposer; @@ -560,11 +571,11 @@ typedef struct walproposer_api void (*finish_sync_safekeepers) (WalProposer *wp, XLogRecPtr lsn); /* - * Called after every new message from the safekeeper. Used to propagate + * Called after every AppendResponse from the safekeeper. Used to propagate * backpressure feedback and to confirm WAL persistence (has been commited * on the quorum of safekeepers). */ - void (*process_safekeeper_feedback) (WalProposer *wp); + void (*process_safekeeper_feedback) (WalProposer *wp, Safekeeper *sk); /* * Write a log message to the internal log processor. 
This is used only diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 8eec2f02c1..7debb6325e 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -63,7 +63,6 @@ char *wal_acceptors_list = ""; int wal_acceptor_reconnect_timeout = 1000; int wal_acceptor_connection_timeout = 10000; -static AppendResponse quorumFeedback; static WalproposerShmemState *walprop_shared; static WalProposerConfig walprop_config; static XLogRecPtr sentPtr = InvalidXLogRecPtr; @@ -71,6 +70,10 @@ static const walproposer_api walprop_pg; static volatile sig_atomic_t got_SIGUSR2 = false; static bool reported_sigusr2 = false; +static XLogRecPtr standby_flush_lsn = InvalidXLogRecPtr; +static XLogRecPtr standby_apply_lsn = InvalidXLogRecPtr; +static HotStandbyFeedback agg_hs_feedback; + static void nwp_shmem_startup_hook(void); static void nwp_register_gucs(void); static void nwp_prepare_shmem(void); @@ -279,6 +282,7 @@ WalproposerShmemInit(void) memset(walprop_shared, 0, WalproposerShmemSize()); SpinLockInit(&walprop_shared->mutex); pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0); + pg_atomic_init_u64(&walprop_shared->currentClusterSize, 0); } LWLockRelease(AddinShmemInitLock); @@ -402,21 +406,58 @@ walprop_pg_get_shmem_state(WalProposer *wp) return walprop_shared; } -static void -replication_feedback_set(PageserverFeedback *rf) +/* + * Record new ps_feedback in the array with shards and update min_feedback. + */ +static PageserverFeedback +record_pageserver_feedback(PageserverFeedback *ps_feedback) { + PageserverFeedback min_feedback; + + Assert(ps_feedback->present); + Assert(ps_feedback->shard_number < MAX_SHARDS); + SpinLockAcquire(&walprop_shared->mutex); - memcpy(&walprop_shared->feedback, rf, sizeof(PageserverFeedback)); + + /* Update the number of shards */ + if (ps_feedback->shard_number + 1 > walprop_shared->num_shards) + walprop_shared->num_shards = ps_feedback->shard_number + 1; + + /* Update the feedback */ + memcpy(&walprop_shared->shard_ps_feedback[ps_feedback->shard_number], ps_feedback, sizeof(PageserverFeedback)); + + /* Calculate min LSNs */ + memcpy(&min_feedback, ps_feedback, sizeof(PageserverFeedback)); + for (int i = 0; i < walprop_shared->num_shards; i++) + { + PageserverFeedback *feedback = &walprop_shared->shard_ps_feedback[i]; + if (feedback->present) + { + if (min_feedback.last_received_lsn == InvalidXLogRecPtr || feedback->last_received_lsn < min_feedback.last_received_lsn) + min_feedback.last_received_lsn = feedback->last_received_lsn; + + if (min_feedback.disk_consistent_lsn == InvalidXLogRecPtr || feedback->disk_consistent_lsn < min_feedback.disk_consistent_lsn) + min_feedback.disk_consistent_lsn = feedback->disk_consistent_lsn; + + if (min_feedback.remote_consistent_lsn == InvalidXLogRecPtr || feedback->remote_consistent_lsn < min_feedback.remote_consistent_lsn) + min_feedback.remote_consistent_lsn = feedback->remote_consistent_lsn; + } + } + /* Copy min_feedback back to shmem */ + memcpy(&walprop_shared->min_ps_feedback, &min_feedback, sizeof(PageserverFeedback)); + SpinLockRelease(&walprop_shared->mutex); + + return min_feedback; } void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn) { SpinLockAcquire(&walprop_shared->mutex); - *writeLsn = walprop_shared->feedback.last_received_lsn; - *flushLsn = walprop_shared->feedback.disk_consistent_lsn; - *applyLsn = walprop_shared->feedback.remote_consistent_lsn; + *writeLsn = walprop_shared->min_ps_feedback.last_received_lsn; + *flushLsn 
= walprop_shared->min_ps_feedback.disk_consistent_lsn; + *applyLsn = walprop_shared->min_ps_feedback.remote_consistent_lsn; SpinLockRelease(&walprop_shared->mutex); } @@ -509,9 +550,10 @@ walprop_pg_init_standalone_sync_safekeepers(void) static void walprop_sigusr2(SIGNAL_ARGS) { + int save_errno = errno; got_SIGUSR2 = true; - SetLatch(MyLatch); + errno = save_errno; } static void @@ -1869,39 +1911,6 @@ CheckGracefulShutdown(WalProposer *wp) } } -/* - * Choose most advanced PageserverFeedback and set it to *rf. - */ -static void -GetLatestNeonFeedback(PageserverFeedback *rf, WalProposer *wp) -{ - int latest_safekeeper = 0; - XLogRecPtr last_received_lsn = InvalidXLogRecPtr; - - for (int i = 0; i < wp->n_safekeepers; i++) - { - if (wp->safekeeper[i].appendResponse.rf.last_received_lsn > last_received_lsn) - { - latest_safekeeper = i; - last_received_lsn = wp->safekeeper[i].appendResponse.rf.last_received_lsn; - } - } - - rf->currentClusterSize = wp->safekeeper[latest_safekeeper].appendResponse.rf.currentClusterSize; - rf->last_received_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.last_received_lsn; - rf->disk_consistent_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.disk_consistent_lsn; - rf->remote_consistent_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.remote_consistent_lsn; - rf->replytime = wp->safekeeper[latest_safekeeper].appendResponse.rf.replytime; - - wpg_log(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu," - " last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu", - rf->currentClusterSize, - LSN_FORMAT_ARGS(rf->last_received_lsn), - LSN_FORMAT_ARGS(rf->disk_consistent_lsn), - LSN_FORMAT_ARGS(rf->remote_consistent_lsn), - rf->replytime); -} - /* * Combine hot standby feedbacks from all safekeepers. */ @@ -1949,26 +1958,38 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp) * None of that is functional in sync-safekeepers. */ static void -walprop_pg_process_safekeeper_feedback(WalProposer *wp) +walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk) { - HotStandbyFeedback hsFeedback; - XLogRecPtr oldDiskConsistentLsn; + HotStandbyFeedback hsFeedback; + bool needToAdvanceSlot = false; if (wp->config->syncSafekeepers) return; - oldDiskConsistentLsn = quorumFeedback.rf.disk_consistent_lsn; - - /* Get PageserverFeedback fields from the most advanced safekeeper */ - GetLatestNeonFeedback(&quorumFeedback.rf, wp); - replication_feedback_set(&quorumFeedback.rf); - SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); - - if (wp->commitLsn > quorumFeedback.flushLsn || oldDiskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn) + /* handle fresh ps_feedback */ + if (sk->appendResponse.ps_feedback.present) { - if (wp->commitLsn > quorumFeedback.flushLsn) - quorumFeedback.flushLsn = wp->commitLsn; + PageserverFeedback min_feedback = record_pageserver_feedback(&sk->appendResponse.ps_feedback); + /* Only one main shard sends non-zero currentClusterSize */ + if (sk->appendResponse.ps_feedback.currentClusterSize > 0) + SetNeonCurrentClusterSize(sk->appendResponse.ps_feedback.currentClusterSize); + + if (min_feedback.disk_consistent_lsn != standby_apply_lsn) + { + standby_apply_lsn = min_feedback.disk_consistent_lsn; + needToAdvanceSlot = true; + } + } + + if (wp->commitLsn > standby_flush_lsn) + { + standby_flush_lsn = wp->commitLsn; + needToAdvanceSlot = true; + } + + if (needToAdvanceSlot) + { /* * Advance the replication slot to commitLsn. 
WAL before it is * hardened and will be fetched from one of safekeepers by @@ -1977,23 +1998,23 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp) * Also wakes up syncrep waiters. */ ProcessStandbyReply( - /* write_lsn - This is what durably stored in WAL service. */ - quorumFeedback.flushLsn, - /* flush_lsn - This is what durably stored in WAL service. */ - quorumFeedback.flushLsn, + /* write_lsn - This is what durably stored in safekeepers quorum. */ + standby_flush_lsn, + /* flush_lsn - This is what durably stored in safekeepers quorum. */ + standby_flush_lsn, /* * apply_lsn - This is what processed and durably saved at* * pageserver. */ - quorumFeedback.rf.disk_consistent_lsn, + standby_apply_lsn, walprop_pg_get_current_timestamp(wp), false); } CombineHotStanbyFeedbacks(&hsFeedback, wp); - if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &quorumFeedback.hs, sizeof hsFeedback) != 0) + if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0) { - quorumFeedback.hs = hsFeedback; + agg_hs_feedback = hsFeedback; ProcessStandbyHSFeedback(hsFeedback.ts, XidFromFullTransactionId(hsFeedback.xmin), EpochFromFullTransactionId(hsFeedback.xmin), @@ -2074,6 +2095,18 @@ GetLogRepRestartLSN(WalProposer *wp) return lrRestartLsn; } +void SetNeonCurrentClusterSize(uint64 size) +{ + pg_atomic_write_u64(&walprop_shared->currentClusterSize, size); +} + +uint64 GetNeonCurrentClusterSize(void) +{ + return pg_atomic_read_u64(&walprop_shared->currentClusterSize); +} +uint64 GetNeonCurrentClusterSize(void); + + static const walproposer_api walprop_pg = { .get_shmem_state = walprop_pg_get_shmem_state, .start_streaming = walprop_pg_start_streaming, diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index d8112c8bf0..57a2736d5b 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -11,6 +11,10 @@ testing = [] [dependencies] anyhow.workspace = true async-trait.workspace = true +aws-config.workspace = true +aws-sdk-iam.workspace = true +aws-sigv4.workspace = true +aws-types.workspace = true base64.workspace = true bstr.workspace = true bytes = { workspace = true, features = ["serde"] } @@ -27,6 +31,7 @@ hashlink.workspace = true hex.workspace = true hmac.workspace = true hostname.workspace = true +http.workspace = true humantime.workspace = true hyper-tungstenite.workspace = true hyper.workspace = true @@ -59,10 +64,11 @@ rustls.workspace = true scopeguard.workspace = true serde.workspace = true serde_json.workspace = true -sha2.workspace = true +sha2 = { workspace = true, features = ["asm"] } smol_str.workspace = true smallvec.workspace = true socket2.workspace = true +subtle.workspace = true sync_wrapper.workspace = true task-local-extensions.workspace = true thiserror.workspace = true @@ -91,6 +97,7 @@ workspace_hack.workspace = true [dev-dependencies] camino-tempfile.workspace = true +fallible-iterator.workspace = true rcgen.workspace = true rstest.workspace = true tokio-postgres-rustls.workspace = true diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 11af85caa4..04fe83d8eb 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -254,7 +254,7 @@ async fn authenticate_with_secret( config: &'static AuthenticationConfig, ) -> auth::Result { if let Some(password) = unauthenticated_password { - let auth_outcome = validate_password_and_exchange(&password, secret)?; + let auth_outcome = validate_password_and_exchange(&password, secret).await?; let keys = match auth_outcome { crate::sasl::Outcome::Success(key) => key, 
crate::sasl::Outcome::Failure(reason) => { @@ -408,3 +408,228 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> { } } } + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use bytes::BytesMut; + use fallible_iterator::FallibleIterator; + use postgres_protocol::{ + authentication::sasl::{ChannelBinding, ScramSha256}, + message::{backend::Message as PgMessage, frontend}, + }; + use provider::AuthSecret; + use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt}; + + use crate::{ + auth::{ComputeUserInfoMaybeEndpoint, IpPattern}, + config::AuthenticationConfig, + console::{ + self, + provider::{self, CachedAllowedIps, CachedRoleSecret}, + CachedNodeInfo, + }, + context::RequestMonitoring, + proxy::NeonOptions, + scram::ServerSecret, + stream::{PqStream, Stream}, + }; + + use super::auth_quirks; + + struct Auth { + ips: Vec, + secret: AuthSecret, + } + + impl console::Api for Auth { + async fn get_role_secret( + &self, + _ctx: &mut RequestMonitoring, + _user_info: &super::ComputeUserInfo, + ) -> Result { + Ok(CachedRoleSecret::new_uncached(Some(self.secret.clone()))) + } + + async fn get_allowed_ips_and_secret( + &self, + _ctx: &mut RequestMonitoring, + _user_info: &super::ComputeUserInfo, + ) -> Result<(CachedAllowedIps, Option), console::errors::GetAuthInfoError> + { + Ok(( + CachedAllowedIps::new_uncached(Arc::new(self.ips.clone())), + Some(CachedRoleSecret::new_uncached(Some(self.secret.clone()))), + )) + } + + async fn wake_compute( + &self, + _ctx: &mut RequestMonitoring, + _user_info: &super::ComputeUserInfo, + ) -> Result { + unimplemented!() + } + } + + static CONFIG: &AuthenticationConfig = &AuthenticationConfig { + scram_protocol_timeout: std::time::Duration::from_secs(5), + }; + + async fn read_message(r: &mut (impl AsyncRead + Unpin), b: &mut BytesMut) -> PgMessage { + loop { + r.read_buf(&mut *b).await.unwrap(); + if let Some(m) = PgMessage::parse(&mut *b).unwrap() { + break m; + } + } + } + + #[tokio::test] + async fn auth_quirks_scram() { + let (mut client, server) = tokio::io::duplex(1024); + let mut stream = PqStream::new(Stream::from_raw(server)); + + let mut ctx = RequestMonitoring::test(); + let api = Auth { + ips: vec![], + secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), + }; + + let user_info = ComputeUserInfoMaybeEndpoint { + user: "conrad".into(), + endpoint_id: Some("endpoint".into()), + options: NeonOptions::default(), + }; + + let handle = tokio::spawn(async move { + let mut scram = ScramSha256::new(b"my-secret-password", ChannelBinding::unsupported()); + + let mut read = BytesMut::new(); + + // server should offer scram + match read_message(&mut client, &mut read).await { + PgMessage::AuthenticationSasl(a) => { + let options: Vec<&str> = a.mechanisms().collect().unwrap(); + assert_eq!(options, ["SCRAM-SHA-256"]); + } + _ => panic!("wrong message"), + } + + // client sends client-first-message + let mut write = BytesMut::new(); + frontend::sasl_initial_response("SCRAM-SHA-256", scram.message(), &mut write).unwrap(); + client.write_all(&write).await.unwrap(); + + // server response with server-first-message + match read_message(&mut client, &mut read).await { + PgMessage::AuthenticationSaslContinue(a) => { + scram.update(a.data()).await.unwrap(); + } + _ => panic!("wrong message"), + } + + // client response with client-final-message + write.clear(); + frontend::sasl_response(scram.message(), &mut write).unwrap(); + client.write_all(&write).await.unwrap(); + + // server response with server-final-message 
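For context on the step that follows: `scram.finish` is the mutual-authentication check. Per RFC 5802, the server-final-message carries `ServerSignature = HMAC(ServerKey, AuthMessage)`, proving the server also holds the password-derived keys. A hedged, self-contained sketch of that verification using the `hmac` and `sha2` crates; `server_key`, `auth_message`, and the helper are illustrative, not proxy APIs:

```rust
use hmac::{Hmac, Mac};
use sha2::Sha256;

/// Verify a SCRAM server signature: HMAC(ServerKey, AuthMessage) must match
/// the signature sent in the server-final-message (RFC 5802).
fn verify_server_signature(server_key: &[u8], auth_message: &[u8], signature: &[u8]) -> bool {
    let mut mac = Hmac::<Sha256>::new_from_slice(server_key).expect("HMAC takes any key length");
    mac.update(auth_message);
    mac.verify_slice(signature).is_ok()
}
```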
+ match read_message(&mut client, &mut read).await { + PgMessage::AuthenticationSaslFinal(a) => { + scram.finish(a.data()).unwrap(); + } + _ => panic!("wrong message"), + } + }); + + let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, false, CONFIG) + .await + .unwrap(); + + handle.await.unwrap(); + } + + #[tokio::test] + async fn auth_quirks_cleartext() { + let (mut client, server) = tokio::io::duplex(1024); + let mut stream = PqStream::new(Stream::from_raw(server)); + + let mut ctx = RequestMonitoring::test(); + let api = Auth { + ips: vec![], + secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), + }; + + let user_info = ComputeUserInfoMaybeEndpoint { + user: "conrad".into(), + endpoint_id: Some("endpoint".into()), + options: NeonOptions::default(), + }; + + let handle = tokio::spawn(async move { + let mut read = BytesMut::new(); + let mut write = BytesMut::new(); + + // server should offer cleartext + match read_message(&mut client, &mut read).await { + PgMessage::AuthenticationCleartextPassword => {} + _ => panic!("wrong message"), + } + + // client responds with password + write.clear(); + frontend::password_message(b"my-secret-password", &mut write).unwrap(); + client.write_all(&write).await.unwrap(); + }); + + let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, CONFIG) + .await + .unwrap(); + + handle.await.unwrap(); + } + + #[tokio::test] + async fn auth_quirks_password_hack() { + let (mut client, server) = tokio::io::duplex(1024); + let mut stream = PqStream::new(Stream::from_raw(server)); + + let mut ctx = RequestMonitoring::test(); + let api = Auth { + ips: vec![], + secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), + }; + + let user_info = ComputeUserInfoMaybeEndpoint { + user: "conrad".into(), + endpoint_id: None, + options: NeonOptions::default(), + }; + + let handle = tokio::spawn(async move { + let mut read = BytesMut::new(); + + // server should offer cleartext + match read_message(&mut client, &mut read).await { + PgMessage::AuthenticationCleartextPassword => {} + _ => panic!("wrong message"), + } + + // client responds with password + let mut write = BytesMut::new(); + frontend::password_message(b"endpoint=my-endpoint;my-secret-password", &mut write) + .unwrap(); + client.write_all(&write).await.unwrap(); + }); + + let creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, CONFIG) + .await + .unwrap(); + + assert_eq!(creds.info.endpoint, "my-endpoint"); + + handle.await.unwrap(); + } +} diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 788381b6c0..45bbad8cb2 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -126,7 +126,7 @@ impl AuthFlow<'_, S, CleartextPassword> { .strip_suffix(&[0]) .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?; - let outcome = validate_password_and_exchange(password, self.state.0)?; + let outcome = validate_password_and_exchange(password, self.state.0).await?; if let sasl::Outcome::Success(_) = &outcome { self.stream.write_message_noflush(&Be::AuthenticationOk)?; @@ -180,7 +180,7 @@ impl AuthFlow<'_, S, Scram<'_>> { } } -pub(crate) fn validate_password_and_exchange( +pub(crate) async fn validate_password_and_exchange( password: &[u8], secret: AuthSecret, ) -> super::Result> { @@ -194,13 +194,7 @@ pub(crate) fn validate_password_and_exchange( } // perform scram authentication as both client and server to validate the keys AuthSecret::Scram(scram_secret) => { - use 
postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256}; - let sasl_client = ScramSha256::new(password, ChannelBinding::unsupported()); - let outcome = crate::scram::exchange( - &scram_secret, - sasl_client, - crate::config::TlsServerEndPoint::Undefined, - )?; + let outcome = crate::scram::exchange(&scram_secret, password).await?; let client_key = match outcome { sasl::Outcome::Success(client_key) => client_key, diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index b3d4fc0411..d38439c2a0 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -1,3 +1,10 @@ +use aws_config::environment::EnvironmentVariableCredentialsProvider; +use aws_config::imds::credentials::ImdsCredentialsProvider; +use aws_config::meta::credentials::CredentialsProviderChain; +use aws_config::meta::region::RegionProviderChain; +use aws_config::profile::ProfileFileCredentialsProvider; +use aws_config::provider_config::ProviderConfig; +use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; use futures::future::Either; use proxy::auth; use proxy::auth::backend::MaybeOwned; @@ -10,11 +17,14 @@ use proxy::config::ProjectInfoCacheOptions; use proxy::console; use proxy::context::parquet::ParquetUploadArgs; use proxy::http; +use proxy::metrics::NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT; use proxy::rate_limiter::EndpointRateLimiter; use proxy::rate_limiter::RateBucketInfo; use proxy::rate_limiter::RateLimiterConfig; +use proxy::redis::cancellation_publisher::RedisPublisherClient; +use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; +use proxy::redis::elasticache; use proxy::redis::notifications; -use proxy::redis::publisher::RedisPublisherClient; use proxy::serverless::GlobalConnPoolOptions; use proxy::usage_metrics; @@ -150,9 +160,24 @@ struct ProxyCliArgs { /// disable ip check for http requests. If it is too time consuming, it could be turned off. #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] disable_ip_check_for_http: bool, - /// redis url for notifications. 
+ /// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections) #[clap(long)] redis_notifications: Option, + /// redis host for streaming connections (might be different from the notifications host) + #[clap(long)] + redis_host: Option, + /// redis port for streaming connections (might be different from the notifications host) + #[clap(long)] + redis_port: Option, + /// redis cluster name, used in aws elasticache + #[clap(long)] + redis_cluster_name: Option, + /// redis user_id, used in aws elasticache + #[clap(long)] + redis_user_id: Option, + /// aws region to retrieve credentials + #[clap(long, default_value_t = String::new())] + aws_region: String, /// cache for `project_info` (use `size=0` to disable) #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)] project_info_cache: String, @@ -216,6 +241,61 @@ async fn main() -> anyhow::Result<()> { let config = build_config(&args)?; info!("Authentication backend: {}", config.auth_backend); + info!("Using region: {}", config.aws_region); + + let region_provider = RegionProviderChain::default_provider().or_else(&*config.aws_region); // Replace with your Redis region if needed + let provider_conf = + ProviderConfig::without_region().with_region(region_provider.region().await); + let aws_credentials_provider = { + // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" + CredentialsProviderChain::first_try("env", EnvironmentVariableCredentialsProvider::new()) + // uses "AWS_PROFILE" / `aws sso login --profile ` + .or_else( + "profile-sso", + ProfileFileCredentialsProvider::builder() + .configure(&provider_conf) + .build(), + ) + // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME" + // needed to access remote extensions bucket + .or_else( + "token", + WebIdentityTokenCredentialsProvider::builder() + .configure(&provider_conf) + .build(), + ) + // uses imds v2 + .or_else("imds", ImdsCredentialsProvider::builder().build()) + }; + let elasticache_credentials_provider = Arc::new(elasticache::CredentialsProvider::new( + elasticache::AWSIRSAConfig::new( + config.aws_region.clone(), + args.redis_cluster_name, + args.redis_user_id, + ), + aws_credentials_provider, + )); + let redis_notifications_client = + match (args.redis_notifications, (args.redis_host, args.redis_port)) { + (Some(url), _) => { + info!("Starting redis notifications listener ({url})"); + Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url)) + } + (None, (Some(host), Some(port))) => Some( + ConnectionWithCredentialsProvider::new_with_credentials_provider( + host, + port, + elasticache_credentials_provider.clone(), + ), + ), + (None, (None, None)) => { + warn!("Redis is disabled"); + None + } + _ => { + bail!("redis-host and redis-port must be specified together"); + } + }; // Check that we can bind to address before further initialization let http_address: SocketAddr = args.http.parse()?; @@ -233,17 +313,22 @@ async fn main() -> anyhow::Result<()> { let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(&config.endpoint_rps_limit)); let cancel_map = CancelMap::default(); - let redis_publisher = match &args.redis_notifications { - Some(url) => Some(Arc::new(Mutex::new(RedisPublisherClient::new( - url, + + // let redis_notifications_client = redis_notifications_client.map(|x| Box::leak(Box::new(x))); + let redis_publisher = match &redis_notifications_client { + Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new( + 
redis_publisher.clone(), args.region.clone(), &config.redis_rps_limit, )?))), None => None, }; - let cancellation_handler = Arc::new(CancellationHandler::new( + let cancellation_handler = Arc::new(CancellationHandler::< + Option<Arc<Mutex<RedisPublisherClient>>>, + >::new( cancel_map.clone(), redis_publisher, + NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT, )); // client facing tasks. these will exit on error or on cancellation @@ -290,17 +375,16 @@ async fn main() -> anyhow::Result<()> { if let auth::BackendType::Console(api, _) = &config.auth_backend { if let proxy::console::provider::ConsoleBackend::Console(api) = &**api { - let cache = api.caches.project_info.clone(); - if let Some(url) = args.redis_notifications { - info!("Starting redis notifications listener ({url})"); + if let Some(redis_notifications_client) = redis_notifications_client { + let cache = api.caches.project_info.clone(); maintenance_tasks.spawn(notifications::task_main( - url.to_owned(), + redis_notifications_client.clone(), cache.clone(), cancel_map.clone(), args.region.clone(), )); + maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); } - maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); } } @@ -445,8 +529,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { endpoint_rps_limit, redis_rps_limit, handshake_timeout: args.handshake_timeout, - // TODO: add this argument region: args.region.clone(), + aws_region: args.aws_region.clone(), })); Ok(config) diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index c9607909b3..6151513614 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -1,4 +1,3 @@ -use async_trait::async_trait; use dashmap::DashMap; use pq_proto::CancelKeyData; use std::{net::SocketAddr, sync::Arc}; @@ -10,18 +9,26 @@ use tracing::info; use uuid::Uuid; use crate::{ - error::ReportableError, metrics::NUM_CANCELLATION_REQUESTS, - redis::publisher::RedisPublisherClient, + error::ReportableError, + metrics::NUM_CANCELLATION_REQUESTS, + redis::cancellation_publisher::{ + CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, + }, }; pub type CancelMap = Arc<DashMap<CancelKeyData, Option<CancelClosure>>>; +pub type CancellationHandlerMain = CancellationHandler<Option<Arc<Mutex<RedisPublisherClient>>>>; +pub type CancellationHandlerMainInternal = Option<Arc<Mutex<RedisPublisherClient>>>; /// Enables serving `CancelRequest`s. /// -/// If there is a `RedisPublisherClient` available, it will be used to publish the cancellation key to other proxy instances. -pub struct CancellationHandler { +/// If a `CancellationPublisher` is available, it will be used to publish the cancellation key to other proxy instances. +pub struct CancellationHandler
<P>
{ map: CancelMap, - redis_client: Option<Arc<Mutex<RedisPublisherClient>>>, + client: P, + /// This field is used for monitoring purposes. + /// Represents the source of the cancellation request. + from: &'static str, } #[derive(Debug, Error)] @@ -44,49 +51,9 @@ impl ReportableError for CancelError { } } -impl CancellationHandler { - pub fn new(map: CancelMap, redis_client: Option<Arc<Mutex<RedisPublisherClient>>>) -> Self { - Self { map, redis_client } - } - /// Cancel a running query for the corresponding connection. - pub async fn cancel_session( - &self, - key: CancelKeyData, - session_id: Uuid, - ) -> Result<(), CancelError> { - let from = "from_client"; - // NB: we should immediately release the lock after cloning the token. - let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else { - tracing::warn!("query cancellation key not found: {key}"); - if let Some(redis_client) = &self.redis_client { - NUM_CANCELLATION_REQUESTS - .with_label_values(&[from, "not_found"]) - .inc(); - info!("publishing cancellation key to Redis"); - match redis_client.lock().await.try_publish(key, session_id).await { - Ok(()) => { - info!("cancellation key successfuly published to Redis"); - } - Err(e) => { - tracing::error!("failed to publish a message: {e}"); - return Err(CancelError::IO(std::io::Error::new( - std::io::ErrorKind::Other, - e.to_string(), - ))); - } - } - } - return Ok(()); - }; - NUM_CANCELLATION_REQUESTS - .with_label_values(&[from, "found"]) - .inc(); - info!("cancelling query per user's request using key {key}"); - cancel_closure.try_cancel_query().await - } - +impl CancellationHandler
<P: CancellationPublisher>
{ /// Run async action within an ephemeral session identified by [`CancelKeyData`]. - pub fn get_session(self: Arc<Self>) -> Session { + pub fn get_session(self: Arc<Self>) -> Session
<P>
{ // HACK: We'd rather get the real backend_pid but tokio_postgres doesn't // expose it and we don't want to do another roundtrip to query // for it. The client will be able to notice that this is not the @@ -112,9 +79,39 @@ impl CancellationHandler { cancellation_handler: self, } } + /// Try to cancel a running query for the corresponding connection. + /// If the cancellation key is not found, it will be published to Redis. + pub async fn cancel_session( + &self, + key: CancelKeyData, + session_id: Uuid, + ) -> Result<(), CancelError> { + // NB: we should immediately release the lock after cloning the token. + let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else { + tracing::warn!("query cancellation key not found: {key}"); + NUM_CANCELLATION_REQUESTS + .with_label_values(&[self.from, "not_found"]) + .inc(); + match self.client.try_publish(key, session_id).await { + Ok(()) => {} // do nothing + Err(e) => { + return Err(CancelError::IO(std::io::Error::new( + std::io::ErrorKind::Other, + e.to_string(), + ))); + } + } + return Ok(()); + }; + NUM_CANCELLATION_REQUESTS + .with_label_values(&[self.from, "found"]) + .inc(); + info!("cancelling query per user's request using key {key}"); + cancel_closure.try_cancel_query().await + } #[cfg(test)] - fn contains(&self, session: &Session) -> bool { + fn contains(&self, session: &Session
<P>
) -> bool { self.map.contains_key(&session.key) } @@ -124,31 +121,19 @@ impl CancellationHandler { } } -#[async_trait] -pub trait NotificationsCancellationHandler { - async fn cancel_session_no_publish(&self, key: CancelKeyData) -> Result<(), CancelError>; +impl CancellationHandler<()> { + pub fn new(map: CancelMap, from: &'static str) -> Self { + Self { + map, + client: (), + from, + } + } } -#[async_trait] -impl NotificationsCancellationHandler for CancellationHandler { - async fn cancel_session_no_publish(&self, key: CancelKeyData) -> Result<(), CancelError> { - let from = "from_redis"; - let cancel_closure = self.map.get(&key).and_then(|x| x.clone()); - match cancel_closure { - Some(cancel_closure) => { - NUM_CANCELLATION_REQUESTS - .with_label_values(&[from, "found"]) - .inc(); - cancel_closure.try_cancel_query().await - } - None => { - NUM_CANCELLATION_REQUESTS - .with_label_values(&[from, "not_found"]) - .inc(); - tracing::warn!("query cancellation key not found: {key}"); - Ok(()) - } - } +impl CancellationHandler<Option<Arc<Mutex<RedisPublisherClient>>>> { + pub fn new(map: CancelMap, client: Option<Arc<Mutex<RedisPublisherClient>>>, from: &'static str) -> Self { + Self { map, client, from } + } } @@ -178,14 +163,14 @@ impl CancelClosure { } /// Helper for registering query cancellation tokens. -pub struct Session { +pub struct Session
<P>
{ /// The user-facing key identifying this session. key: CancelKeyData, /// The [`CancelMap`] this session belongs to. - cancellation_handler: Arc<CancellationHandler>, + cancellation_handler: Arc<CancellationHandler<P>>, } -impl Session { +impl
<P>
Session
<P>
{ /// Store the cancel token for the given session. /// This enables query cancellation in `crate::proxy::prepare_client_connection`. pub fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData { @@ -198,7 +183,7 @@ impl Session { } } -impl Drop for Session { +impl
<P>
Drop for Session
<P>
{ fn drop(&mut self) { self.cancellation_handler.map.remove(&self.key); info!("dropped query cancellation key {}", &self.key); @@ -207,14 +192,16 @@ impl Drop for Session { #[cfg(test)] mod tests { + use crate::metrics::NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS; + use super::*; #[tokio::test] async fn check_session_drop() -> anyhow::Result<()> { - let cancellation_handler = Arc::new(CancellationHandler { - map: CancelMap::default(), - redis_client: None, - }); + let cancellation_handler = Arc::new(CancellationHandler::<()>::new( + CancelMap::default(), + NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS, + )); let session = cancellation_handler.clone().get_session(); assert!(cancellation_handler.contains(&session)); @@ -224,4 +211,19 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn cancel_session_noop_regression() { + let handler = CancellationHandler::<()>::new(Default::default(), "local"); + handler + .cancel_session( + CancelKeyData { + backend_pid: 0, + cancel_key: 0, + }, + Uuid::new_v4(), + ) + .await + .unwrap(); + } } diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index b61c1fb9ef..65153babcb 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -82,14 +82,13 @@ pub type ScramKeys = tokio_postgres::config::ScramKeys<32>; /// A config for establishing a connection to compute node. /// Eventually, `tokio_postgres` will be replaced with something better. /// Newtype allows us to implement methods on top of it. -#[derive(Clone)] -#[repr(transparent)] +#[derive(Clone, Default)] pub struct ConnCfg(Box); /// Creation and initialization routines. impl ConnCfg { pub fn new() -> Self { - Self(Default::default()) + Self::default() } /// Reuse password or auth keys from the other config. @@ -165,12 +164,6 @@ impl std::ops::DerefMut for ConnCfg { } } -impl Default for ConnCfg { - fn default() -> Self { - Self::new() - } -} - impl ConnCfg { /// Establish a raw TCP connection to the compute node. async fn connect_raw(&self, timeout: Duration) -> io::Result<(SocketAddr, TcpStream, &str)> { diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 437ec9f401..45f8d76144 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -28,6 +28,7 @@ pub struct ProxyConfig { pub redis_rps_limit: Vec, pub region: String, pub handshake_timeout: Duration, + pub aws_region: String, } #[derive(Debug)] diff --git a/proxy/src/console.rs b/proxy/src/console.rs index fd3c46b946..ea95e83437 100644 --- a/proxy/src/console.rs +++ b/proxy/src/console.rs @@ -6,7 +6,7 @@ pub mod messages; /// Wrappers for console APIs and their mocks. pub mod provider; -pub use provider::{errors, Api, AuthSecret, CachedNodeInfo, NodeInfo}; +pub(crate) use provider::{errors, Api, AuthSecret, CachedNodeInfo, NodeInfo}; /// Various cache-related types. 
pub mod caches { diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 8609606273..69bfd6b045 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -14,7 +14,6 @@ use crate::{ context::RequestMonitoring, scram, EndpointCacheKey, ProjectId, }; -use async_trait::async_trait; use dashmap::DashMap; use std::{sync::Arc, time::Duration}; use tokio::sync::{OwnedSemaphorePermit, Semaphore}; @@ -326,8 +325,7 @@ pub type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc), } -#[async_trait] impl Api for ConsoleBackend { async fn get_role_secret( &self, diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs index 0579ef6fc4..b759c81373 100644 --- a/proxy/src/console/provider/mock.rs +++ b/proxy/src/console/provider/mock.rs @@ -8,7 +8,6 @@ use crate::console::provider::{CachedAllowedIps, CachedRoleSecret}; use crate::context::RequestMonitoring; use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl}; use crate::{auth::IpPattern, cache::Cached}; -use async_trait::async_trait; use futures::TryFutureExt; use std::{str::FromStr, sync::Arc}; use thiserror::Error; @@ -144,7 +143,6 @@ async fn get_execute_postgres_query( Ok(Some(entry)) } -#[async_trait] impl super::Api for Api { #[tracing::instrument(skip_all)] async fn get_role_secret( diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index b36663518d..89ebfa57f1 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -14,7 +14,6 @@ use crate::{ context::RequestMonitoring, metrics::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER}, }; -use async_trait::async_trait; use futures::TryFutureExt; use std::sync::Arc; use tokio::time::Instant; @@ -168,7 +167,6 @@ impl Api { } } -#[async_trait] impl super::Api for Api { #[tracing::instrument(skip_all)] async fn get_role_secret( diff --git a/proxy/src/http/health_server.rs b/proxy/src/http/health_server.rs index 6186ddde0d..cbb17ebcb7 100644 --- a/proxy/src/http/health_server.rs +++ b/proxy/src/http/health_server.rs @@ -2,14 +2,21 @@ use anyhow::{anyhow, bail}; use hyper::{Body, Request, Response, StatusCode}; use std::{convert::Infallible, net::TcpListener}; use tracing::info; -use utils::http::{endpoint, error::ApiError, json::json_response, RouterBuilder, RouterService}; +use utils::http::{ + endpoint::{self, prometheus_metrics_handler, request_span}, + error::ApiError, + json::json_response, + RouterBuilder, RouterService, +}; async fn status_handler(_: Request) -> Result, ApiError> { json_response(StatusCode::OK, "") } fn make_router() -> RouterBuilder { - endpoint::make_router().get("/v1/status", status_handler) + endpoint::make_router() + .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) + .get("/v1/status", status_handler) } pub async fn task_main(http_listener: TcpListener) -> anyhow::Result { diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 02ebcd6aaa..eed45e421b 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -161,6 +161,9 @@ pub static NUM_CANCELLATION_REQUESTS: Lazy = Lazy::new(|| { .unwrap() }); +pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT: &str = "from_client"; +pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS: &str = "from_redis"; + pub enum Waiting { Cplane, Client, diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index ab5bf5d494..843bfc08cf 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -10,7 +10,7 @@ pub mod 
wake_compute; use crate::{ auth, - cancellation::{self, CancellationHandler}, + cancellation::{self, CancellationHandlerMain, CancellationHandlerMainInternal}, compute, config::{ProxyConfig, TlsConfig}, context::RequestMonitoring, @@ -62,7 +62,7 @@ pub async fn task_main( listener: tokio::net::TcpListener, cancellation_token: CancellationToken, endpoint_rate_limiter: Arc, - cancellation_handler: Arc, + cancellation_handler: Arc, ) -> anyhow::Result<()> { scopeguard::defer! { info!("proxy has shut down"); @@ -233,12 +233,12 @@ impl ReportableError for ClientRequestError { pub async fn handle_client( config: &'static ProxyConfig, ctx: &mut RequestMonitoring, - cancellation_handler: Arc, + cancellation_handler: Arc, stream: S, mode: ClientMode, endpoint_rate_limiter: Arc, conn_gauge: IntCounterPairGuard, -) -> Result>, ClientRequestError> { +) -> Result>, ClientRequestError> { info!("handling interactive connection from client"); let proto = ctx.protocol; @@ -338,9 +338,9 @@ pub async fn handle_client( /// Finish client connection initialization: confirm auth success, send params, etc. #[tracing::instrument(skip_all)] -async fn prepare_client_connection( +async fn prepare_client_connection
<P>
( node: &compute::PostgresConnection, - session: &cancellation::Session, + session: &cancellation::Session
<P>
, stream: &mut PqStream, ) -> Result<(), std::io::Error> { // Register compute's query cancellation token and produce a new, unique one. diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index b2f682fd2f..f6d4314391 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -55,17 +55,17 @@ pub async fn proxy_pass( Ok(()) } -pub struct ProxyPassthrough { +pub struct ProxyPassthrough { pub client: Stream, pub compute: PostgresConnection, pub aux: MetricsAuxInfo, pub req: IntCounterPairGuard, pub conn: IntCounterPairGuard, - pub cancel: cancellation::Session, + pub cancel: cancellation::Session
<P>
, } -impl ProxyPassthrough { +impl ProxyPassthrough { pub async fn proxy_pass(self) -> anyhow::Result<()> { let res = proxy_pass(self.client, self.compute.stream, self.aux).await; self.compute.cancel_closure.try_cancel_query().await?; diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 5d0340e852..9c3be73612 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -135,9 +135,10 @@ impl TestAuth for NoAuth {} struct Scram(scram::ServerSecret); impl Scram { - fn new(password: &str) -> anyhow::Result { - let secret = - scram::ServerSecret::build(password).context("failed to generate scram secret")?; + async fn new(password: &str) -> anyhow::Result { + let secret = scram::ServerSecret::build(password) + .await + .context("failed to generate scram secret")?; Ok(Scram(secret)) } @@ -284,7 +285,7 @@ async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> { let proxy = tokio::spawn(dummy_proxy( client, Some(server_config), - Scram::new(password)?, + Scram::new(password).await?, )); let (_client, _conn) = tokio_postgres::Config::new() @@ -308,7 +309,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> { let proxy = tokio::spawn(dummy_proxy( client, Some(server_config), - Scram::new("password")?, + Scram::new("password").await?, )); let (_client, _conn) = tokio_postgres::Config::new() diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index e0c2d836f4..3b760e5dab 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -148,7 +148,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> { let proxy = tokio::spawn(dummy_proxy( client, Some(server_config), - Scram::new("password")?, + Scram::new("password").await?, )); let _client_err = tokio_postgres::Config::new() @@ -231,7 +231,7 @@ async fn connect_failure( let proxy = tokio::spawn(dummy_proxy( client, Some(server_config), - Scram::new("password")?, + Scram::new("password").await?, )); let _client_err = tokio_postgres::Config::new() diff --git a/proxy/src/redis.rs b/proxy/src/redis.rs index 35d6db074e..a322f0368c 100644 --- a/proxy/src/redis.rs +++ b/proxy/src/redis.rs @@ -1,2 +1,4 @@ +pub mod cancellation_publisher; +pub mod connection_with_credentials_provider; +pub mod elasticache; pub mod notifications; -pub mod publisher; diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs new file mode 100644 index 0000000000..422789813c --- /dev/null +++ b/proxy/src/redis/cancellation_publisher.rs @@ -0,0 +1,161 @@ +use std::sync::Arc; + +use pq_proto::CancelKeyData; +use redis::AsyncCommands; +use tokio::sync::Mutex; +use uuid::Uuid; + +use crate::rate_limiter::{RateBucketInfo, RedisRateLimiter}; + +use super::{ + connection_with_credentials_provider::ConnectionWithCredentialsProvider, + notifications::{CancelSession, Notification, PROXY_CHANNEL_NAME}, +}; + +pub trait CancellationPublisherMut: Send + Sync + 'static { + #[allow(async_fn_in_trait)] + async fn try_publish( + &mut self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()>; +} + +pub trait CancellationPublisher: Send + Sync + 'static { + #[allow(async_fn_in_trait)] + async fn try_publish( + &self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()>; +} + +impl CancellationPublisher for () { + async fn try_publish( + &self, + _cancel_key_data: CancelKeyData, + _session_id: Uuid, + ) -> anyhow::Result<()> { + Ok(()) + } +} + +impl CancellationPublisherMut for P { 
+ async fn try_publish( + &mut self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()> { +
<P as CancellationPublisher>
::try_publish(self, cancel_key_data, session_id).await + } +} + +impl CancellationPublisher for Option
<P>
{ + async fn try_publish( + &self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()> { + if let Some(p) = self { + p.try_publish(cancel_key_data, session_id).await + } else { + Ok(()) + } + } +} + +impl CancellationPublisher for Arc> { + async fn try_publish( + &self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()> { + self.lock() + .await + .try_publish(cancel_key_data, session_id) + .await + } +} + +pub struct RedisPublisherClient { + client: ConnectionWithCredentialsProvider, + region_id: String, + limiter: RedisRateLimiter, +} + +impl RedisPublisherClient { + pub fn new( + client: ConnectionWithCredentialsProvider, + region_id: String, + info: &'static [RateBucketInfo], + ) -> anyhow::Result { + Ok(Self { + client, + region_id, + limiter: RedisRateLimiter::new(info), + }) + } + + async fn publish( + &mut self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()> { + let payload = serde_json::to_string(&Notification::Cancel(CancelSession { + region_id: Some(self.region_id.clone()), + cancel_key_data, + session_id, + }))?; + self.client.publish(PROXY_CHANNEL_NAME, payload).await?; + Ok(()) + } + pub async fn try_connect(&mut self) -> anyhow::Result<()> { + match self.client.connect().await { + Ok(()) => {} + Err(e) => { + tracing::error!("failed to connect to redis: {e}"); + return Err(e); + } + } + Ok(()) + } + async fn try_publish_internal( + &mut self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()> { + if !self.limiter.check() { + tracing::info!("Rate limit exceeded. Skipping cancellation message"); + return Err(anyhow::anyhow!("Rate limit exceeded")); + } + match self.publish(cancel_key_data, session_id).await { + Ok(()) => return Ok(()), + Err(e) => { + tracing::error!("failed to publish a message: {e}"); + } + } + tracing::info!("Publisher is disconnected. Reconnectiong..."); + self.try_connect().await?; + self.publish(cancel_key_data, session_id).await + } +} + +impl CancellationPublisherMut for RedisPublisherClient { + async fn try_publish( + &mut self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()> { + tracing::info!("publishing cancellation key to Redis"); + match self.try_publish_internal(cancel_key_data, session_id).await { + Ok(()) => { + tracing::info!("cancellation key successfuly published to Redis"); + Ok(()) + } + Err(e) => { + tracing::error!("failed to publish a message: {e}"); + Err(e) + } + } + } +} diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs new file mode 100644 index 0000000000..d183abb53a --- /dev/null +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -0,0 +1,225 @@ +use std::{sync::Arc, time::Duration}; + +use futures::FutureExt; +use redis::{ + aio::{ConnectionLike, MultiplexedConnection}, + ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisResult, +}; +use tokio::task::JoinHandle; +use tracing::{error, info}; + +use super::elasticache::CredentialsProvider; + +enum Credentials { + Static(ConnectionInfo), + Dynamic(Arc, redis::ConnectionAddr), +} + +impl Clone for Credentials { + fn clone(&self) -> Self { + match self { + Credentials::Static(info) => Credentials::Static(info.clone()), + Credentials::Dynamic(provider, addr) => { + Credentials::Dynamic(Arc::clone(provider), addr.clone()) + } + } + } +} + +/// A wrapper around `redis::MultiplexedConnection` that automatically refreshes the token. 
+/// Provides PubSub connection without credentials refresh. +pub struct ConnectionWithCredentialsProvider { + credentials: Credentials, + con: Option, + refresh_token_task: Option>, + mutex: tokio::sync::Mutex<()>, +} + +impl Clone for ConnectionWithCredentialsProvider { + fn clone(&self) -> Self { + Self { + credentials: self.credentials.clone(), + con: None, + refresh_token_task: None, + mutex: tokio::sync::Mutex::new(()), + } + } +} + +impl ConnectionWithCredentialsProvider { + pub fn new_with_credentials_provider( + host: String, + port: u16, + credentials_provider: Arc, + ) -> Self { + Self { + credentials: Credentials::Dynamic( + credentials_provider, + redis::ConnectionAddr::TcpTls { + host, + port, + insecure: false, + tls_params: None, + }, + ), + con: None, + refresh_token_task: None, + mutex: tokio::sync::Mutex::new(()), + } + } + + pub fn new_with_static_credentials(params: T) -> Self { + Self { + credentials: Credentials::Static(params.into_connection_info().unwrap()), + con: None, + refresh_token_task: None, + mutex: tokio::sync::Mutex::new(()), + } + } + + pub async fn connect(&mut self) -> anyhow::Result<()> { + let _guard = self.mutex.lock().await; + if let Some(con) = self.con.as_mut() { + match redis::cmd("PING").query_async(con).await { + Ok(()) => { + return Ok(()); + } + Err(e) => { + error!("Error during PING: {e:?}"); + } + } + } else { + info!("Connection is not established"); + } + info!("Establishing a new connection..."); + self.con = None; + if let Some(f) = self.refresh_token_task.take() { + f.abort() + } + let con = self + .get_client() + .await? + .get_multiplexed_tokio_connection() + .await?; + if let Credentials::Dynamic(credentials_provider, _) = &self.credentials { + let credentials_provider = credentials_provider.clone(); + let con2 = con.clone(); + let f = tokio::spawn(async move { + let _ = Self::keep_connection(con2, credentials_provider).await; + }); + self.refresh_token_task = Some(f); + } + self.con = Some(con); + Ok(()) + } + + async fn get_connection_info(&self) -> anyhow::Result { + match &self.credentials { + Credentials::Static(info) => Ok(info.clone()), + Credentials::Dynamic(provider, addr) => { + let (username, password) = provider.provide_credentials().await?; + Ok(ConnectionInfo { + addr: addr.clone(), + redis: RedisConnectionInfo { + db: 0, + username: Some(username), + password: Some(password.clone()), + }, + }) + } + } + } + + async fn get_client(&self) -> anyhow::Result { + let client = redis::Client::open(self.get_connection_info().await?)?; + Ok(client) + } + + // PubSub does not support credentials refresh. + // Requires manual reconnection every 12h. + pub async fn get_async_pubsub(&self) -> anyhow::Result { + Ok(self.get_client().await?.get_async_pubsub().await?) + } + + // The connection lives for 12h. + // It can be prolonged with sending `AUTH` commands with the refreshed token. + // https://docs.aws.amazon.com/AmazonElastiCache/latest/red-ug/auth-iam.html#auth-iam-limits + async fn keep_connection( + mut con: MultiplexedConnection, + credentials_provider: Arc, + ) -> anyhow::Result<()> { + loop { + // The connection lives for 12h, for the sanity check we refresh it every hour. 
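The hourly cadence chosen below is a safety margin rather than a protocol constant: a fresh 15-minute token is minted on each iteration, so the hard limit is only the 12-hour connection lifetime cited above. A small sketch of that constraint with illustrative constants:

```rust
use std::time::Duration;

// Illustrative constants mirroring the AWS limits referenced in this file.
const CONNECTION_LIFETIME_CAP: Duration = Duration::from_secs(12 * 60 * 60);
const REFRESH_PERIOD: Duration = Duration::from_secs(60 * 60);

// Each refresh mints a fresh token, so the refresh period only needs to
// undercut the connection lifetime cap with comfortable headroom.
const _: () = assert!(REFRESH_PERIOD.as_secs() < CONNECTION_LIFETIME_CAP.as_secs());
```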
+            tokio::time::sleep(Duration::from_secs(60 * 60)).await;
+            match Self::refresh_token(&mut con, credentials_provider.clone()).await {
+                Ok(()) => {
+                    info!("Token refreshed");
+                }
+                Err(e) => {
+                    error!("Error during token refresh: {e:?}");
+                }
+            }
+        }
+    }
+    async fn refresh_token(
+        con: &mut MultiplexedConnection,
+        credentials_provider: Arc<CredentialsProvider>,
+    ) -> anyhow::Result<()> {
+        let (user, password) = credentials_provider.provide_credentials().await?;
+        redis::cmd("AUTH")
+            .arg(user)
+            .arg(password)
+            .query_async(con)
+            .await?;
+        Ok(())
+    }
+    /// Sends an already encoded (packed) command into the TCP socket and
+    /// reads the single response from it.
+    pub async fn send_packed_command(&mut self, cmd: &redis::Cmd) -> RedisResult<redis::Value> {
+        // Clone connection to avoid having to lock the ArcSwap in write mode
+        let con = self.con.as_mut().ok_or(redis::RedisError::from((
+            redis::ErrorKind::IoError,
+            "Connection not established",
+        )))?;
+        con.send_packed_command(cmd).await
+    }
+
+    /// Sends multiple already encoded (packed) commands into the TCP socket
+    /// and reads `count` responses from it. This is used to implement
+    /// pipelining.
+    pub async fn send_packed_commands(
+        &mut self,
+        cmd: &redis::Pipeline,
+        offset: usize,
+        count: usize,
+    ) -> RedisResult<Vec<redis::Value>> {
+        // Clone shared connection future to avoid having to lock the ArcSwap in write mode
+        let con = self.con.as_mut().ok_or(redis::RedisError::from((
+            redis::ErrorKind::IoError,
+            "Connection not established",
+        )))?;
+        con.send_packed_commands(cmd, offset, count).await
+    }
+}
+
+impl ConnectionLike for ConnectionWithCredentialsProvider {
+    fn req_packed_command<'a>(
+        &'a mut self,
+        cmd: &'a redis::Cmd,
+    ) -> redis::RedisFuture<'a, redis::Value> {
+        (async move { self.send_packed_command(cmd).await }).boxed()
+    }
+
+    fn req_packed_commands<'a>(
+        &'a mut self,
+        cmd: &'a redis::Pipeline,
+        offset: usize,
+        count: usize,
+    ) -> redis::RedisFuture<'a, Vec<redis::Value>> {
+        (async move { self.send_packed_commands(cmd, offset, count).await }).boxed()
+    }
+
+    fn get_db(&self) -> i64 {
+        0
+    }
+}
diff --git a/proxy/src/redis/elasticache.rs b/proxy/src/redis/elasticache.rs
new file mode 100644
index 0000000000..eded8250af
--- /dev/null
+++ b/proxy/src/redis/elasticache.rs
@@ -0,0 +1,110 @@
+use std::time::{Duration, SystemTime};
+
+use aws_config::meta::credentials::CredentialsProviderChain;
+use aws_sdk_iam::config::ProvideCredentials;
+use aws_sigv4::http_request::{
+    self, SignableBody, SignableRequest, SignatureLocation, SigningSettings,
+};
+use tracing::info;
+
+#[derive(Debug)]
+pub struct AWSIRSAConfig {
+    region: String,
+    service_name: String,
+    cluster_name: String,
+    user_id: String,
+    token_ttl: Duration,
+    action: String,
+}
+
+impl AWSIRSAConfig {
+    pub fn new(region: String, cluster_name: Option<String>, user_id: Option<String>) -> Self {
+        AWSIRSAConfig {
+            region,
+            service_name: "elasticache".to_string(),
+            cluster_name: cluster_name.unwrap_or_default(),
+            user_id: user_id.unwrap_or_default(),
+            // "The IAM authentication token is valid for 15 minutes"
+            // https://docs.aws.amazon.com/memorydb/latest/devguide/auth-iam.html#auth-iam-limits
+            token_ttl: Duration::from_secs(15 * 60),
+            action: "connect".to_string(),
+        }
+    }
+}
+
+/// Credentials provider for AWS elasticache authentication.
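+/// Yields a `(user, token)` pair, where the token is a SigV4-presigned "connect" request for the configured cluster user.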
+///
+/// Official documentation:
+///
+///
+/// Useful resources:
+///
+pub struct CredentialsProvider {
+    config: AWSIRSAConfig,
+    credentials_provider: CredentialsProviderChain,
+}
+
+impl CredentialsProvider {
+    pub fn new(config: AWSIRSAConfig, credentials_provider: CredentialsProviderChain) -> Self {
+        CredentialsProvider {
+            config,
+            credentials_provider,
+        }
+    }
+    pub async fn provide_credentials(&self) -> anyhow::Result<(String, String)> {
+        let aws_credentials = self
+            .credentials_provider
+            .provide_credentials()
+            .await?
+            .into();
+        info!("AWS credentials successfully obtained");
+        info!("Connecting to Redis with configuration: {:?}", self.config);
+        let mut settings = SigningSettings::default();
+        settings.signature_location = SignatureLocation::QueryParams;
+        settings.expires_in = Some(self.config.token_ttl);
+        let signing_params = aws_sigv4::sign::v4::SigningParams::builder()
+            .identity(&aws_credentials)
+            .region(&self.config.region)
+            .name(&self.config.service_name)
+            .time(SystemTime::now())
+            .settings(settings)
+            .build()?
+            .into();
+        let auth_params = [
+            ("Action", &self.config.action),
+            ("User", &self.config.user_id),
+        ];
+        let auth_params = url::form_urlencoded::Serializer::new(String::new())
+            .extend_pairs(auth_params)
+            .finish();
+        let auth_uri = http::Uri::builder()
+            .scheme("http")
+            .authority(self.config.cluster_name.as_bytes())
+            .path_and_query(format!("/?{auth_params}"))
+            .build()?;
+        info!("{}", auth_uri);
+
+        // Convert the HTTP request into a signable request
+        let signable_request = SignableRequest::new(
+            "GET",
+            auth_uri.to_string(),
+            std::iter::empty(),
+            SignableBody::Bytes(&[]),
+        )?;
+
+        // Sign and then apply the signature to the request
+        let (si, _) = http_request::sign(signable_request, &signing_params)?.into_parts();
+        let mut signable_request = http::Request::builder()
+            .method("GET")
+            .uri(auth_uri)
+            .body(())?;
+        si.apply_to_request_http1x(&mut signable_request);
+        Ok((
+            self.config.user_id.clone(),
+            signable_request
+                .uri()
+                .to_string()
+                .replacen("http://", "", 1),
+        ))
+    }
+}
diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs
index 6ae848c0d2..8b7e3e3419 100644
--- a/proxy/src/redis/notifications.rs
+++ b/proxy/src/redis/notifications.rs
@@ -6,11 +6,12 @@
 use redis::aio::PubSub;
 use serde::{Deserialize, Serialize};
 use uuid::Uuid;
 
+use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
 use crate::{
     cache::project_info::ProjectInfoCache,
-    cancellation::{CancelMap, CancellationHandler, NotificationsCancellationHandler},
+    cancellation::{CancelMap, CancellationHandler},
     intern::{ProjectIdInt, RoleNameInt},
-    metrics::REDIS_BROKEN_MESSAGES,
+    metrics::{NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS, REDIS_BROKEN_MESSAGES},
 };
 
 const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates";
@@ -18,23 +19,13 @@ pub(crate) const PROXY_CHANNEL_NAME: &str = "neondb-proxy-to-proxy-updates";
 const RECONNECT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(20);
 const INVALIDATION_LAG: std::time::Duration = std::time::Duration::from_secs(20);
 
-struct RedisConsumerClient {
-    client: redis::Client,
-}
-
-impl RedisConsumerClient {
-    pub fn new(url: &str) -> anyhow::Result<Self> {
-        let client = redis::Client::open(url)?;
-        Ok(Self { client })
-    }
-    async fn try_connect(&self) -> anyhow::Result<PubSub> {
-        let mut conn = self.client.get_async_connection().await?.into_pubsub();
-        tracing::info!("subscribing to a channel `{CPLANE_CHANNEL_NAME}`");
-        conn.subscribe(CPLANE_CHANNEL_NAME).await?;
-        tracing::info!("subscribing to a channel `{PROXY_CHANNEL_NAME}`");
-        conn.subscribe(PROXY_CHANNEL_NAME).await?;
-        Ok(conn)
-    }
+async fn try_connect(client: &ConnectionWithCredentialsProvider) -> anyhow::Result<PubSub> {
+    let mut conn = client.get_async_pubsub().await?;
+    tracing::info!("subscribing to a channel `{CPLANE_CHANNEL_NAME}`");
+    conn.subscribe(CPLANE_CHANNEL_NAME).await?;
+    tracing::info!("subscribing to a channel `{PROXY_CHANNEL_NAME}`");
+    conn.subscribe(PROXY_CHANNEL_NAME).await?;
+    Ok(conn)
 }
 
 #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
@@ -80,21 +71,18 @@ where
     serde_json::from_str(&s).map_err(<D::Error as serde::de::Error>::custom)
 }
 
-struct MessageHandler<
-    C: ProjectInfoCache + Send + Sync + 'static,
-    H: NotificationsCancellationHandler + Send + Sync + 'static,
-> {
+struct MessageHandler<C: ProjectInfoCache + Send + Sync + 'static> {
     cache: Arc<C>,
-    cancellation_handler: Arc<H>,
+    cancellation_handler: Arc<CancellationHandler<()>>,
     region_id: String,
 }
 
-impl<
-        C: ProjectInfoCache + Send + Sync + 'static,
-        H: NotificationsCancellationHandler + Send + Sync + 'static,
-    > MessageHandler<C, H>
-{
-    pub fn new(cache: Arc<C>, cancellation_handler: Arc<H>, region_id: String) -> Self {
+impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
+    pub fn new(
+        cache: Arc<C>,
+        cancellation_handler: Arc<CancellationHandler<()>>,
+        region_id: String,
+    ) -> Self {
         Self {
             cache,
             cancellation_handler,
@@ -139,7 +127,7 @@
             // This instance of cancellation_handler doesn't have a RedisPublisherClient so it can't publish the message.
             match self
                 .cancellation_handler
-                .cancel_session_no_publish(cancel_session.cancel_key_data)
+                .cancel_session(cancel_session.cancel_key_data, uuid::Uuid::nil())
                 .await
             {
                 Ok(()) => {}
@@ -182,7 +170,7 @@ fn invalidate_cache(cache: Arc<C>, msg: Notification) {
 
 /// Handle console's invalidation messages.
 #[tracing::instrument(name = "console_notifications", skip_all)]
 pub async fn task_main<C>(
-    url: String,
+    redis: ConnectionWithCredentialsProvider,
     cache: Arc<C>,
     cancel_map: CancelMap,
     region_id: String,
@@ -193,13 +181,15 @@ where
     cache.enable_ttl();
 
     let handler = MessageHandler::new(
         cache,
-        Arc::new(CancellationHandler::new(cancel_map, None)),
+        Arc::new(CancellationHandler::<()>::new(
+            cancel_map,
+            NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS,
+        )),
         region_id,
     );
 
     loop {
-        let redis = RedisConsumerClient::new(&url)?;
-        let conn = match redis.try_connect().await {
+        let mut conn = match try_connect(&redis).await {
            Ok(conn) => {
                handler.disable_ttl();
                conn
@@ -212,7 +202,7 @@
                continue;
            }
        };
-        let mut stream = conn.into_on_message();
+        let mut stream = conn.on_message();
        while let Some(msg) = stream.next().await {
            match handler.handle_message(msg).await {
                Ok(()) => {}
diff --git a/proxy/src/redis/publisher.rs b/proxy/src/redis/publisher.rs
deleted file mode 100644
index f85593afdd..0000000000
--- a/proxy/src/redis/publisher.rs
+++ /dev/null
@@ -1,80 +0,0 @@
-use pq_proto::CancelKeyData;
-use redis::AsyncCommands;
-use uuid::Uuid;
-
-use crate::rate_limiter::{RateBucketInfo, RedisRateLimiter};
-
-use super::notifications::{CancelSession, Notification, PROXY_CHANNEL_NAME};
-
-pub struct RedisPublisherClient {
-    client: redis::Client,
-    publisher: Option<redis::aio::Connection>,
-    region_id: String,
-    limiter: RedisRateLimiter,
-}
-
-impl RedisPublisherClient {
-    pub fn new(
-        url: &str,
-        region_id: String,
-        info: &'static [RateBucketInfo],
-    ) -> anyhow::Result<Self> {
-        let client = redis::Client::open(url)?;
-        Ok(Self {
-            client,
-            publisher: None,
-            region_id,
-            limiter: RedisRateLimiter::new(info),
-        })
-    }
-    pub async fn try_publish(
-        &mut self,
-        cancel_key_data: CancelKeyData,
-        session_id: Uuid,
-    ) -> anyhow::Result<()> {
-        if !self.limiter.check() {
-            tracing::info!("Rate limit exceeded. Skipping cancellation message");
-            return Err(anyhow::anyhow!("Rate limit exceeded"));
-        }
-        match self.publish(cancel_key_data, session_id).await {
-            Ok(()) => return Ok(()),
-            Err(e) => {
-                tracing::error!("failed to publish a message: {e}");
-                self.publisher = None;
-            }
-        }
-        tracing::info!("Publisher is disconnected. Reconnectiong...");
-        self.try_connect().await?;
-        self.publish(cancel_key_data, session_id).await
-    }
-
-    async fn publish(
-        &mut self,
-        cancel_key_data: CancelKeyData,
-        session_id: Uuid,
-    ) -> anyhow::Result<()> {
-        let conn = self
-            .publisher
-            .as_mut()
-            .ok_or_else(|| anyhow::anyhow!("not connected"))?;
-        let payload = serde_json::to_string(&Notification::Cancel(CancelSession {
-            region_id: Some(self.region_id.clone()),
-            cancel_key_data,
-            session_id,
-        }))?;
-        conn.publish(PROXY_CHANNEL_NAME, payload).await?;
-        Ok(())
-    }
-    pub async fn try_connect(&mut self) -> anyhow::Result<()> {
-        match self.client.get_async_connection().await {
-            Ok(conn) => {
-                self.publisher = Some(conn);
-            }
-            Err(e) => {
-                tracing::error!("failed to connect to redis: {e}");
-                return Err(e.into());
-            }
-        }
-        Ok(())
-    }
-}
diff --git a/proxy/src/sasl.rs b/proxy/src/sasl.rs
index 1cf8b53e11..0811416ca2 100644
--- a/proxy/src/sasl.rs
+++ b/proxy/src/sasl.rs
@@ -33,6 +33,9 @@ pub enum Error {
     #[error("Internal error: missing digest")]
     MissingBinding,
 
+    #[error("could not decode salt: {0}")]
+    Base64(#[from] base64::DecodeError),
+
     #[error(transparent)]
     Io(#[from] io::Error),
 }
@@ -55,6 +58,7 @@ impl ReportableError for Error {
             Error::ChannelBindingBadMethod(_) => crate::error::ErrorKind::User,
             Error::BadClientMessage(_) => crate::error::ErrorKind::User,
             Error::MissingBinding => crate::error::ErrorKind::Service,
+            Error::Base64(_) => crate::error::ErrorKind::ControlPlane,
             Error::Io(_) => crate::error::ErrorKind::ClientDisconnect,
         }
     }
diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs
index a95e734d06..ed80675f8a 100644
--- a/proxy/src/scram.rs
+++ b/proxy/src/scram.rs
@@ -56,8 +56,6 @@ fn sha256<'a>(parts: impl IntoIterator<Item = &'a [u8]>) -> [u8; 32] {
 
 #[cfg(test)]
 mod tests {
-    use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256};
-
     use crate::sasl::{Mechanism, Step};
 
     use super::{Exchange, ServerSecret};
@@ -113,17 +111,11 @@ mod tests {
         );
     }
 
-    fn run_round_trip_test(server_password: &str, client_password: &str) {
-        let scram_secret = ServerSecret::build(server_password).unwrap();
-        let sasl_client =
-            ScramSha256::new(client_password.as_bytes(), ChannelBinding::unsupported());
-
-        let outcome = super::exchange(
-            &scram_secret,
-            sasl_client,
-            crate::config::TlsServerEndPoint::Undefined,
-        )
-        .unwrap();
+    async fn run_round_trip_test(server_password: &str, client_password: &str) {
+        let scram_secret = ServerSecret::build(server_password).await.unwrap();
+        let outcome = super::exchange(&scram_secret, client_password.as_bytes())
+            .await
+            .unwrap();
 
         match outcome {
             crate::sasl::Outcome::Success(_) => {}
@@ -131,14 +123,14 @@ mod tests {
         }
     }
 
-    #[test]
-    fn round_trip() {
-        run_round_trip_test("pencil", "pencil")
+    #[tokio::test]
+    async fn round_trip() {
+        run_round_trip_test("pencil", "pencil").await
     }
 
-    #[test]
+    #[tokio::test]
     #[should_panic(expected = "password doesn't match")]
-    fn failure() {
-        run_round_trip_test("pencil", "eraser")
+    async fn failure() {
+        run_round_trip_test("pencil", "eraser").await
     }
 }
diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs
index 9af7db5201..89dd33e59f 100644
--- a/proxy/src/scram/exchange.rs
+++ b/proxy/src/scram/exchange.rs
@@ -2,13 +2,16 @@
 
 use std::convert::Infallible;
 
-use postgres_protocol::authentication::sasl::ScramSha256;
+use hmac::{Hmac, Mac};
+use sha2::Sha256;
+use tokio::task::yield_now;
 
 use super::messages::{
     ClientFinalMessage, ClientFirstMessage, OwnedServerFirstMessage, SCRAM_RAW_NONCE_LEN,
 };
 use super::secret::ServerSecret;
 use super::signature::SignatureBuilder;
+use super::ScramKey;
 use crate::config;
 use crate::sasl::{self, ChannelBinding, Error as SaslError};
 
@@ -71,40 +74,62 @@ impl<'a> Exchange<'a> {
     }
 }
 
-pub fn exchange(
+// copied from
+async fn pbkdf2(str: &[u8], salt: &[u8], iterations: u32) -> [u8; 32] {
+    let hmac = Hmac::<Sha256>::new_from_slice(str).expect("HMAC is able to accept all key sizes");
+    let mut prev = hmac
+        .clone()
+        .chain_update(salt)
+        .chain_update(1u32.to_be_bytes())
+        .finalize()
+        .into_bytes();
+
+    let mut hi = prev;
+
+    for i in 1..iterations {
+        prev = hmac.clone().chain_update(prev).finalize().into_bytes();
+
+        for (hi, prev) in hi.iter_mut().zip(prev) {
+            *hi ^= prev;
+        }
+        // yield every ~250us
+        // hopefully reduces tail latencies
+        if i % 1024 == 0 {
+            yield_now().await
+        }
+    }
+
+    hi.into()
+}
+
+// copied from
+async fn derive_client_key(password: &[u8], salt: &[u8], iterations: u32) -> ScramKey {
+    let salted_password = pbkdf2(password, salt, iterations).await;
+
+    let make_key = |name| {
+        let key = Hmac::<Sha256>::new_from_slice(&salted_password)
+            .expect("HMAC is able to accept all key sizes")
+            .chain_update(name)
+            .finalize();
+
+        <[u8; 32]>::from(key.into_bytes())
+    };
+
+    make_key(b"Client Key").into()
+}
+
+pub async fn exchange(
     secret: &ServerSecret,
-    mut client: ScramSha256,
-    tls_server_end_point: config::TlsServerEndPoint,
+    password: &[u8],
 ) -> sasl::Result<sasl::Outcome<super::ScramKey>> {
-    use sasl::Step::*;
+    let salt = base64::decode(&secret.salt_base64)?;
+    let client_key = derive_client_key(password, &salt, secret.iterations).await;
 
-    let init = SaslInitial {
-        nonce: rand::random,
-    };
-
-    let client_first = std::str::from_utf8(client.message())
-        .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
-    let sent = match init.transition(secret, &tls_server_end_point, client_first)? {
-        Continue(sent, server_first) => {
-            client.update(server_first.as_bytes())?;
-            sent
-        }
-        Success(x, _) => match x {},
-        Failure(msg) => return Ok(sasl::Outcome::Failure(msg)),
-    };
-
-    let client_final = std::str::from_utf8(client.message())
-        .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
-    let keys = match sent.transition(secret, &tls_server_end_point, client_final)? {
-        Success(keys, server_final) => {
-            client.finish(server_final.as_bytes())?;
-            keys
-        }
-        Continue(x, _) => match x {},
-        Failure(msg) => return Ok(sasl::Outcome::Failure(msg)),
-    };
-
-    Ok(sasl::Outcome::Success(keys))
+    if secret.is_password_invalid(&client_key).into() {
+        Ok(sasl::Outcome::Failure("password doesn't match"))
+    } else {
+        Ok(sasl::Outcome::Success(client_key))
+    }
 }
 
 impl SaslInitial {
@@ -185,7 +210,7 @@ impl SaslSentInner {
             .derive_client_key(&client_final_message.proof);
 
         // Auth fails either if keys don't match or it's pre-determined to fail.
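+        // (the new check below folds both conditions into a single constant-time comparison via `subtle`; see scram/secret.rs)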
- if client_key.sha256() != secret.stored_key || secret.doomed { + if secret.is_password_invalid(&client_key).into() { return Ok(sasl::Step::Failure("password doesn't match")); } diff --git a/proxy/src/scram/key.rs b/proxy/src/scram/key.rs index 973126e729..32a3dbd203 100644 --- a/proxy/src/scram/key.rs +++ b/proxy/src/scram/key.rs @@ -1,17 +1,31 @@ //! Tools for client/server/stored key management. +use subtle::ConstantTimeEq; + /// Faithfully taken from PostgreSQL. pub const SCRAM_KEY_LEN: usize = 32; /// One of the keys derived from the user's password. /// We use the same structure for all keys, i.e. /// `ClientKey`, `StoredKey`, and `ServerKey`. -#[derive(Clone, Default, PartialEq, Eq, Debug)] +#[derive(Clone, Default, Eq, Debug)] #[repr(transparent)] pub struct ScramKey { bytes: [u8; SCRAM_KEY_LEN], } +impl PartialEq for ScramKey { + fn eq(&self, other: &Self) -> bool { + self.ct_eq(other).into() + } +} + +impl ConstantTimeEq for ScramKey { + fn ct_eq(&self, other: &Self) -> subtle::Choice { + self.bytes.ct_eq(&other.bytes) + } +} + impl ScramKey { pub fn sha256(&self) -> Self { super::sha256([self.as_ref()]).into() diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs index b59baec508..f9372540ca 100644 --- a/proxy/src/scram/messages.rs +++ b/proxy/src/scram/messages.rs @@ -206,6 +206,28 @@ mod tests { } } + #[test] + fn parse_client_first_message_with_invalid_gs2_authz() { + assert!(ClientFirstMessage::parse("n,authzid,n=user,r=nonce").is_none()) + } + + #[test] + fn parse_client_first_message_with_extra_params() { + let msg = ClientFirstMessage::parse("n,,n=user,r=nonce,a=foo,b=bar,c=baz").unwrap(); + assert_eq!(msg.bare, "n=user,r=nonce,a=foo,b=bar,c=baz"); + assert_eq!(msg.username, "user"); + assert_eq!(msg.nonce, "nonce"); + assert_eq!(msg.cbind_flag, ChannelBinding::NotSupportedClient); + } + + #[test] + fn parse_client_first_message_with_extra_params_invalid() { + // must be of the form `=<...>` + assert!(ClientFirstMessage::parse("n,,n=user,r=nonce,abc=foo").is_none()); + assert!(ClientFirstMessage::parse("n,,n=user,r=nonce,1=foo").is_none()); + assert!(ClientFirstMessage::parse("n,,n=user,r=nonce,a").is_none()); + } + #[test] fn parse_client_final_message() { let input = [ diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs index fb3c45816e..f3414cb8ec 100644 --- a/proxy/src/scram/secret.rs +++ b/proxy/src/scram/secret.rs @@ -1,5 +1,7 @@ //! Tools for SCRAM server secret management. +use subtle::{Choice, ConstantTimeEq}; + use super::base64_decode_array; use super::key::ScramKey; @@ -40,6 +42,11 @@ impl ServerSecret { Some(secret) } + pub fn is_password_invalid(&self, client_key: &ScramKey) -> Choice { + // constant time to not leak partial key match + client_key.sha256().ct_ne(&self.stored_key) | Choice::from(self.doomed as u8) + } + /// To avoid revealing information to an attacker, we use a /// mocked server secret even if the user doesn't exist. /// See `auth-scram.c : mock_scram_secret` for details. @@ -59,10 +66,8 @@ impl ServerSecret { /// Build a new server secret from the prerequisites. /// XXX: We only use this function in tests. 
 #[cfg(test)]
-    pub fn build(password: &str) -> Option<Self> {
-        Self::parse(&postgres_protocol::password::scram_sha_256(
-            password.as_bytes(),
-        ))
+    pub async fn build(password: &str) -> Option<Self> {
+        Self::parse(&postgres_protocol::password::scram_sha_256(password.as_bytes()).await)
     }
 }
diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs
index be9f90acde..a2010fd613 100644
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -21,11 +21,12 @@ pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
 use tokio_util::task::TaskTracker;
 use tracing::instrument::Instrumented;
 
+use crate::cancellation::CancellationHandlerMain;
+use crate::config::ProxyConfig;
 use crate::context::RequestMonitoring;
 use crate::protocol2::{ProxyProtocolAccept, WithClientIp, WithConnectionGuard};
 use crate::rate_limiter::EndpointRateLimiter;
 use crate::serverless::backend::PoolingBackend;
-use crate::{cancellation::CancellationHandler, config::ProxyConfig};
 use hyper::{
     server::conn::{AddrIncoming, AddrStream},
     Body, Method, Request, Response,
@@ -47,7 +48,7 @@ pub async fn task_main(
     ws_listener: TcpListener,
     cancellation_token: CancellationToken,
     endpoint_rate_limiter: Arc<EndpointRateLimiter>,
-    cancellation_handler: Arc<CancellationHandler>,
+    cancellation_handler: Arc<CancellationHandlerMain>,
 ) -> anyhow::Result<()> {
     scopeguard::defer! {
         info!("websocket server has shut down");
@@ -237,7 +238,7 @@ async fn request_handler(
     config: &'static ProxyConfig,
     backend: Arc<PoolingBackend>,
     ws_connections: TaskTracker,
-    cancellation_handler: Arc<CancellationHandler>,
+    cancellation_handler: Arc<CancellationHandlerMain>,
     peer_addr: IpAddr,
     endpoint_rate_limiter: Arc<EndpointRateLimiter>,
     // used to cancel in-flight HTTP requests. not used to cancel websockets
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index 29ef641265..72b55c45f0 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -50,7 +50,7 @@ impl PoolingBackend {
             }
         };
         let auth_outcome =
-            crate::auth::validate_password_and_exchange(&conn_info.password, secret)?;
+            crate::auth::validate_password_and_exchange(&conn_info.password, secret).await?;
         let res = match auth_outcome {
             crate::sasl::Outcome::Success(key) => Ok(key),
             crate::sasl::Outcome::Failure(reason) => {
diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs
index a72ede6d0a..ada6c974f4 100644
--- a/proxy/src/serverless/websocket.rs
+++ b/proxy/src/serverless/websocket.rs
@@ -1,5 +1,5 @@
 use crate::{
-    cancellation::CancellationHandler,
+    cancellation::CancellationHandlerMain,
     config::ProxyConfig,
     context::RequestMonitoring,
     error::{io_error, ReportableError},
@@ -134,7 +134,7 @@ pub async fn serve_websocket(
     config: &'static ProxyConfig,
     mut ctx: RequestMonitoring,
     websocket: HyperWebsocket,
-    cancellation_handler: Arc<CancellationHandler>,
+    cancellation_handler: Arc<CancellationHandlerMain>,
     hostname: Option<String>,
     endpoint_rate_limiter: Arc<EndpointRateLimiter>,
 ) -> anyhow::Result<()> {
diff --git a/rust-toolchain.toml b/rust-toolchain.toml
index b0949c32b1..50a5a4185b 100644
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -1,5 +1,5 @@
 [toolchain]
-channel = "1.76.0"
+channel = "1.77.0"
 profile = "default"
 # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
 # https://rust-lang.github.io/rustup/concepts/profiles.html
diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs
index 5bc877adbd..3023d4e2cb 100644
--- a/safekeeper/src/copy_timeline.rs
+++ b/safekeeper/src/copy_timeline.rs
@@ -225,6 +225,7 @@ async fn write_segment(
     assert!(from <= to);
     assert!(to <= wal_seg_size);
 
+    #[allow(clippy::suspicious_open_options)]
     let mut file = OpenOptions::new()
         .create(true)
         .write(true)
diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index a0c0c7ca4c..9ce26e6c5d 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -20,7 +20,7 @@ use std::io::Write as _;
 use tokio::sync::mpsc;
 use tokio_stream::wrappers::ReceiverStream;
 use tracing::{info_span, Instrument};
-use utils::http::endpoint::{request_span, ChannelWriter};
+use utils::http::endpoint::{prometheus_metrics_handler, request_span, ChannelWriter};
 
 use crate::debug_dump::TimelineDigestRequest;
 use crate::receive_wal::WalReceiverState;
@@ -515,6 +515,7 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
     router
         .data(Arc::new(conf))
         .data(auth)
+        .get("/metrics", |r| request_span(r, prometheus_metrics_handler))
        .get("/v1/status", |r| request_span(r, status_handler))
        .put("/v1/failpoints", |r| {
            request_span(r, move |r| async {
diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs
index 8bbd95e9e8..147f318b9f 100644
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -221,6 +221,7 @@ impl PhysicalStorage {
         // half initialized segment, first bake it under tmp filename and
         // then rename.
         let tmp_path = self.timeline_dir.join("waltmp");
+        #[allow(clippy::suspicious_open_options)]
         let mut file = OpenOptions::new()
             .create(true)
             .write(true)
diff --git a/safekeeper/tests/walproposer_sim/walproposer_api.rs b/safekeeper/tests/walproposer_sim/walproposer_api.rs
index 5c79e9082b..c49495a4f3 100644
--- a/safekeeper/tests/walproposer_sim/walproposer_api.rs
+++ b/safekeeper/tests/walproposer_sim/walproposer_api.rs
@@ -224,6 +224,16 @@ impl SimulationApi {
             })
             .collect::<Vec<_>>();
 
+        let empty_feedback = PageserverFeedback {
+            present: false,
+            currentClusterSize: 0,
+            last_received_lsn: 0,
+            disk_consistent_lsn: 0,
+            remote_consistent_lsn: 0,
+            replytime: 0,
+            shard_number: 0,
+        };
+
         Self {
             os: args.os,
             safekeepers: RefCell::new(sk_conns),
@@ -232,15 +242,12 @@
             last_logged_commit_lsn: 0,
             shmem: UnsafeCell::new(walproposer::bindings::WalproposerShmemState {
                 mutex: 0,
-                feedback: PageserverFeedback {
-                    currentClusterSize: 0,
-                    last_received_lsn: 0,
-                    disk_consistent_lsn: 0,
-                    remote_consistent_lsn: 0,
-                    replytime: 0,
-                },
                 mineLastElectedTerm: 0,
                 backpressureThrottlingTime: pg_atomic_uint64 { value: 0 },
+                currentClusterSize: pg_atomic_uint64 { value: 0 },
+                shard_ps_feedback: [empty_feedback; 128],
+                num_shards: 0,
+                min_ps_feedback: empty_feedback,
             }),
             config: args.config,
             event_set: RefCell::new(None),
@@ -598,7 +605,11 @@ impl ApiImpl for SimulationApi {
         }
     }
 
-    fn process_safekeeper_feedback(&mut self, wp: &mut walproposer::bindings::WalProposer) {
+    fn process_safekeeper_feedback(
+        &mut self,
+        wp: &mut walproposer::bindings::WalProposer,
+        _sk: &mut walproposer::bindings::Safekeeper,
+    ) {
         debug!("process_safekeeper_feedback, commit_lsn={}", wp.commitLsn);
         if wp.commitLsn > self.last_logged_commit_lsn {
             self.os.log_event(format!("commit_lsn;{}", wp.commitLsn));
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 56b23cef59..f8994a8dcc 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -51,7 +51,7 @@ from fixtures.log_helper import log
 from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
 from fixtures.pageserver.allowed_errors import (
     DEFAULT_PAGESERVER_ALLOWED_ERRORS,
-    scan_pageserver_log_for_errors,
+    DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS,
 )
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.types import IndexPartDump
@@ -77,6 +77,7 @@ from fixtures.utils import (
     ATTACHMENT_NAME_REGEX,
     allure_add_grafana_links,
     allure_attach_from_dir,
+    assert_no_errors,
     get_self_dir,
     subprocess_capture,
     wait_until,
@@ -944,6 +945,8 @@ class NeonEnvBuilder:
         for pageserver in self.env.pageservers:
             pageserver.assert_no_errors()
 
+        self.env.storage_controller.assert_no_errors()
+
         try:
             self.overlay_cleanup_teardown()
         except Exception as e:
@@ -1889,19 +1892,6 @@ class NeonCli(AbstractNeonCli):
 
         return self.raw_cli(args, check_return_code=True)
 
-    def tenant_migrate(
-        self, tenant_shard_id: TenantShardId, new_pageserver: int, timeout_secs: Optional[int]
-    ):
-        args = [
-            "tenant",
-            "migrate",
-            "--tenant-id",
-            str(tenant_shard_id),
-            "--id",
-            str(new_pageserver),
-        ]
-        return self.raw_cli(args, check_return_code=True, timeout=timeout_secs)
-
     def start(self, check_return_code=True) -> "subprocess.CompletedProcess[str]":
         return self.raw_cli(["start"], check_return_code=check_return_code)
 
@@ -1961,6 +1951,7 @@ class NeonStorageController(MetricsGetter):
         self.env = env
         self.running = False
         self.auth_enabled = auth_enabled
+        self.allowed_errors: list[str] = DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS
 
     def start(self):
         assert not self.running
@@ -1985,6 +1976,11 @@ class NeonStorageController(MetricsGetter):
                 msg = ""
             raise StorageControllerApiException(msg, res.status_code) from e
 
+    def assert_no_errors(self):
+        assert_no_errors(
+            self.env.repo_dir / "storage_controller.log", "storage_controller", self.allowed_errors
+        )
+
     def pageserver_api(self) -> PageserverHttpClient:
         """
         The storage controller implements a subset of the pageserver REST API, for mapping
@@ -2147,13 +2143,25 @@
         """
         response = self.request(
             "GET",
-            f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/locate",
+            f"{self.env.storage_controller_api}/debug/v1/tenant/{tenant_id}/locate",
             headers=self.headers(TokenScope.ADMIN),
         )
         body = response.json()
         shards: list[dict[str, Any]] = body["shards"]
         return shards
 
+    def tenant_describe(self, tenant_id: TenantId):
+        """
+        :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr": str, "listen_http_port": int}
+        """
+        response = self.request(
+            "GET",
+            f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}",
+            headers=self.headers(TokenScope.ADMIN),
+        )
+        response.raise_for_status()
+        return response.json()
+
     def tenant_shard_split(
         self, tenant_id: TenantId, shard_count: int, shard_stripe_size: Optional[int] = None
     ) -> list[TenantShardId]:
@@ -2357,18 +2365,9 @@ class NeonPageserver(PgProtocol):
         return self.env.repo_dir / f"pageserver_{self.id}"
 
     def assert_no_errors(self):
-        logfile = self.workdir / "pageserver.log"
-        if not logfile.exists():
-            log.warning(f"Skipping log check: {logfile} does not exist")
-            return
-
-        with logfile.open("r") as f:
-            errors = scan_pageserver_log_for_errors(f, self.allowed_errors)
-
-        for _lineno, error in errors:
-            log.info(f"not allowed error: {error.strip()}")
-
- assert not errors + assert_no_errors( + self.workdir / "pageserver.log", f"pageserver_{self.id}", self.allowed_errors + ) def assert_no_metric_errors(self): """ diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 839d4166c7..ec0f81b380 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -89,6 +89,16 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( ) +DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [ + # Many tests will take pageservers offline, resulting in log warnings on the controller + # failing to connect to them. + ".*Call to node.*management API.*failed.*receive body.*", + ".*Call to node.*management API.*failed.*ReceiveBody.*", + # Many tests will start up with a node offline + ".*startup_reconcile: Could not scan node.*", +] + + def _check_allowed_errors(input): allowed_errors: List[str] = list(DEFAULT_PAGESERVER_ALLOWED_ERRORS) diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 99ec894106..6aebfbc99c 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -626,6 +626,17 @@ class PageserverHttpClient(requests.Session, MetricsGetter): res_json = res.json() return res_json + def timeline_layer_map_info( + self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId + ): + log.info(f"Requesting layer map info of tenant {tenant_id}, timeline {timeline_id}") + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer", + ) + self.verbose_error(res) + res_json = res.json() + return res_json + def timeline_checkpoint( self, tenant_id: Union[TenantId, TenantShardId], diff --git a/test_runner/fixtures/types.py b/test_runner/fixtures/types.py index ea648e460d..80c9b9ce9a 100644 --- a/test_runner/fixtures/types.py +++ b/test_runner/fixtures/types.py @@ -158,6 +158,9 @@ class TenantShardId: def __str__(self): return f"{self.tenant_id}-{self.shard_number:02x}{self.shard_count:02x}" + def __repr__(self): + return self.__str__() + def _tuple(self) -> tuple[TenantId, int, int]: return (self.tenant_id, self.shard_number, self.shard_count) diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 7fc3bae3af..9365d65fc9 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -11,6 +11,7 @@ from typing import ( Any, Callable, Dict, + Iterable, List, Optional, Tuple, @@ -447,3 +448,39 @@ def humantime_to_ms(humantime: str) -> float: ) return round(total_ms, 3) + + +def scan_log_for_errors(input: Iterable[str], allowed_errors: List[str]) -> List[Tuple[int, str]]: + error_or_warn = re.compile(r"\s(ERROR|WARN)") + errors = [] + for lineno, line in enumerate(input, start=1): + if len(line) == 0: + continue + + if error_or_warn.search(line): + # Is this a torn log line? This happens when force-killing a process and restarting + # Example: "2023-10-25T09:38:31.752314Z WARN deletion executo2023-10-25T09:38:31.875947Z INFO version: git-env:0f9452f76e8ccdfc88291bccb3f53e3016f40192" + if re.match("\\d{4}-\\d{2}-\\d{2}T.+\\d{4}-\\d{2}-\\d{2}T.+INFO version.+", line): + continue + + # It's an ERROR or WARN. Is it in the allow-list? 
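+            # (for/else: the `else` branch below runs only when no allow-list pattern matched the line)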
+ for a in allowed_errors: + if re.match(a, line): + break + else: + errors.append((lineno, line)) + return errors + + +def assert_no_errors(log_file, service, allowed_errors): + if not log_file.exists(): + log.warning(f"Skipping {service} log check: {log_file} does not exist") + return + + with log_file.open("r") as f: + errors = scan_log_for_errors(f, allowed_errors) + + for _lineno, error in errors: + log.info(f"not allowed {service} error: {error.strip()}") + + assert not errors, f"Log errors on {service}: {errors[0]}" diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index e852281fcf..ab8717de54 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -1,5 +1,5 @@ import threading -from typing import Optional +from typing import Any, Optional from fixtures.log_helper import log from fixtures.neon_fixtures import ( @@ -32,6 +32,7 @@ class Workload: tenant_id: TenantId, timeline_id: TimelineId, branch_name: Optional[str] = None, + endpoint_opts: Optional[dict[str, Any]] = None, ): self.env = env self.tenant_id = tenant_id @@ -45,6 +46,7 @@ class Workload: self.churn_cursor = 0 self._endpoint: Optional[Endpoint] = None + self._endpoint_opts = endpoint_opts or {} def reconfigure(self): """ @@ -66,6 +68,7 @@ class Workload: tenant_id=self.tenant_id, pageserver_id=pageserver_id, endpoint_id=endpoint_id, + **self._endpoint_opts, ) self._endpoint.start(pageserver_id=pageserver_id) else: diff --git a/test_runner/performance/test_gc_feedback.py b/test_runner/performance/test_gc_feedback.py index 48dd84fb06..be56203b26 100644 --- a/test_runner/performance/test_gc_feedback.py +++ b/test_runner/performance/test_gc_feedback.py @@ -1,3 +1,5 @@ +import json + import pytest from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker from fixtures.log_helper import log @@ -79,3 +81,8 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma zenbenchmark.record( "physical/logical ratio", physical_size / logical_size, "", MetricReport.LOWER_IS_BETTER ) + + layer_map_path = env.repo_dir / "layer-map.json" + log.info(f"Writing layer map to {layer_map_path}") + with layer_map_path.open("w") as f: + f.write(json.dumps(client.timeline_layer_map_info(tenant_id, timeline_id))) diff --git a/test_runner/regress/test_backpressure.py b/test_runner/regress/test_backpressure.py index 819912dd05..af17a2e89d 100644 --- a/test_runner/regress/test_backpressure.py +++ b/test_runner/regress/test_backpressure.py @@ -116,7 +116,7 @@ def test_backpressure_received_lsn_lag(neon_env_builder: NeonEnvBuilder): # Configure failpoint to slow down walreceiver ingest with closing(env.pageserver.connect()) as psconn: with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: - pscur.execute("failpoints walreceiver-after-ingest=sleep(20)") + pscur.execute("failpoints walreceiver-after-ingest-blocking=sleep(20)") # FIXME # Wait for the check thread to start diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index bdc944f352..ddd02238ea 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -120,12 +120,12 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): env = neon_simple_env pageserver_http_client = env.pageserver.http_client() - env.pageserver.allowed_errors.extend( - [ - ".*invalid branch start lsn: less than latest GC cutoff.*", - ".*invalid branch start lsn: less than planned GC cutoff.*", - ] - ) + 
error_regexes = [ + ".*invalid branch start lsn: less than latest GC cutoff.*", + ".*invalid branch start lsn: less than planned GC cutoff.*", + ] + env.pageserver.allowed_errors.extend(error_regexes) + env.storage_controller.allowed_errors.extend(error_regexes) # Disable background GC but set the `pitr_interval` to be small, so GC can delete something tenant, _ = env.neon_cli.create_tenant( diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py index 46c74a26b8..b79cad979f 100644 --- a/test_runner/regress/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ -14,9 +14,12 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" env = neon_env_builder.init_start() - env.pageserver.allowed_errors.extend( - [".*invalid branch start lsn.*", ".*invalid start lsn .* for ancestor timeline.*"] - ) + error_regexes = [ + ".*invalid branch start lsn.*", + ".*invalid start lsn .* for ancestor timeline.*", + ] + env.pageserver.allowed_errors.extend(error_regexes) + env.storage_controller.allowed_errors.extend(error_regexes) # Branch at the point where only 100 rows were inserted branch_behind_timeline_id = env.neon_cli.create_branch("test_branch_behind") diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 5f815d3e6c..e0bb4c2062 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -238,6 +238,10 @@ def test_forward_compatibility( pg_distrib_dir=compatibility_postgres_distrib_dir, ) + # TODO: remove this workaround after release-5090 is no longer the most recent release. + # There was a bug in that code that generates a warning in the storage controller log. + env.storage_controller.allowed_errors.append(".*no tenant_shard_id specified.*") + # Use current neon_local even though we're using old binaries for # everything else: our test code is written for latest CLI args. 
env.neon_local_binpath = neon_local_binpath diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index 0497e1965c..ac3315b86f 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -84,3 +84,21 @@ def test_hot_standby(neon_simple_env: NeonEnv): # clean up if slow_down_send: sk_http.configure_failpoints(("sk-send-wal-replica-sleep", "off")) + + +def test_2_replicas_start(neon_simple_env: NeonEnv): + env = neon_simple_env + + with env.endpoints.create_start( + branch_name="main", + endpoint_id="primary", + ) as primary: + time.sleep(1) + with env.endpoints.new_replica_start( + origin=primary, endpoint_id="secondary1" + ) as secondary1: + with env.endpoints.new_replica_start( + origin=primary, endpoint_id="secondary2" + ) as secondary2: + wait_replica_caughtup(primary, secondary1) + wait_replica_caughtup(primary, secondary2) diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py index 042961baa5..c34ef46d07 100644 --- a/test_runner/regress/test_pageserver_metric_collection.py +++ b/test_runner/regress/test_pageserver_metric_collection.py @@ -1,4 +1,6 @@ +import gzip import json +import os import time from dataclasses import dataclass from pathlib import Path @@ -10,7 +12,11 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, wait_for_last_flush_lsn, ) -from fixtures.remote_storage import RemoteStorageKind +from fixtures.remote_storage import ( + LocalFsStorage, + RemoteStorageKind, + remote_storage_to_toml_inline_table, +) from fixtures.types import TenantId, TimelineId from pytest_httpserver import HTTPServer from werkzeug.wrappers.request import Request @@ -40,6 +46,9 @@ def test_metric_collection( uploads.put((events, is_last == "true")) return Response(status=200) + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) + assert neon_env_builder.pageserver_remote_storage is not None + # Require collecting metrics frequently, since we change # the timeline and want something to be logged about it. # @@ -48,12 +57,11 @@ def test_metric_collection( neon_env_builder.pageserver_config_override = f""" metric_collection_interval="1s" metric_collection_endpoint="{metric_collection_endpoint}" + metric_collection_bucket={remote_storage_to_toml_inline_table(neon_env_builder.pageserver_remote_storage)} cached_metric_collection_interval="0s" synthetic_size_calculation_interval="3s" """ - neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - log.info(f"test_metric_collection endpoint is {metric_collection_endpoint}") # mock http server that returns OK for the metrics @@ -70,6 +78,7 @@ def test_metric_collection( # we have a fast rate of calculation, these can happen at shutdown ".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*", ".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes", + ".*metrics_collection: failed to upload to S3: Failed to upload data of length .* to storage path.*", ] ) @@ -166,6 +175,20 @@ def test_metric_collection( httpserver.check() + # Check that at least one bucket output object is present, and that all + # can be decompressed and decoded. 
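+    # metric_collection_bucket was pointed at the LocalFsStorage configured above, so the uploaded objects can be read straight off disk.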
+ bucket_dumps = {} + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) + for dirpath, _dirs, files in os.walk(env.pageserver_remote_storage.root): + for file in files: + file_path = os.path.join(dirpath, file) + log.info(file_path) + if file.endswith(".gz"): + bucket_dumps[file_path] = json.load(gzip.open(file_path)) + + assert len(bucket_dumps) >= 1 + assert all("events" in data for data in bucket_dumps.values()) + def test_metric_collection_cleans_up_tempfile( httpserver: HTTPServer, diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 8ef75414a3..2e57136607 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -432,6 +432,10 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): - Eviction of layers on the attached location results in deletion on the secondary location as well. """ + + # For debug of https://github.com/neondatabase/neon/issues/6966 + neon_env_builder.rust_log_override = "DEBUG" + neon_env_builder.num_pageservers = 2 neon_env_builder.enable_pageserver_remote_storage( remote_storage_kind=RemoteStorageKind.MOCK_S3, @@ -576,7 +580,7 @@ def test_slow_secondary_downloads(neon_env_builder: NeonEnvBuilder, via_controll timeline_id = TimelineId.generate() env.neon_cli.create_tenant( - tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Double":1}' + tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Attached":1}' ) attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"] diff --git a/test_runner/regress/test_replication_start.py b/test_runner/regress/test_replication_start.py index b4699c7be8..2360745990 100644 --- a/test_runner/regress/test_replication_start.py +++ b/test_runner/regress/test_replication_start.py @@ -1,7 +1,9 @@ +import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, wait_replica_caughtup +@pytest.mark.xfail def test_replication_start(neon_simple_env: NeonEnv): env = neon_simple_env diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index e8511e428e..e6318aff68 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1,4 +1,6 @@ import os +import time +from collections import defaultdict from typing import Dict, List, Optional, Union import pytest @@ -12,7 +14,7 @@ from fixtures.neon_fixtures import ( tenant_get_shards, ) from fixtures.remote_storage import s3_storage -from fixtures.types import Lsn, TenantShardId, TimelineId +from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.utils import wait_until from fixtures.workload import Workload from pytest_httpserver import HTTPServer @@ -158,11 +160,20 @@ def test_sharding_split_smoke( neon_env_builder.preserve_database_files = True - env = neon_env_builder.init_start( - initial_tenant_shard_count=shard_count, initial_tenant_shard_stripe_size=stripe_size + non_default_tenant_config = {"gc_horizon": 77 * 1024 * 1024} + + env = neon_env_builder.init_configs(True) + neon_env_builder.start() + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant( + tenant_id, + timeline_id, + shard_count=shard_count, + shard_stripe_size=stripe_size, + placement_policy='{"Attached": 1}', + conf=non_default_tenant_config, ) - tenant_id = env.initial_tenant - timeline_id = env.initial_timeline workload = Workload(env, tenant_id, timeline_id, 
branch_name="main") workload.init() @@ -222,6 +233,14 @@ def test_sharding_split_smoke( # Before split, old shards exist assert shards_on_disk(old_shard_ids) + # Before split, we have done one reconcile for each shard + assert ( + env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "ok"} + ) + == shard_count + ) + env.storage_controller.tenant_shard_split(tenant_id, shard_count=split_shard_count) post_split_pageserver_ids = [loc["node_id"] for loc in env.storage_controller.locate(tenant_id)] @@ -263,43 +282,66 @@ def test_sharding_split_smoke( destination = migrate_to_pageserver_ids.pop() log.info(f"Migrating shard {migrate_shard} from {ps_id} to {destination}") - env.neon_cli.tenant_migrate(migrate_shard, destination, timeout_secs=10) + env.storage_controller.tenant_shard_migrate(migrate_shard, destination) workload.validate() - # Check that we didn't do any spurious reconciliations. - # Total number of reconciles should have been one per original shard, plus - # one for each shard that was migrated. + # Assert on how many reconciles happened during the process. This is something of an + # implementation detail, but it is useful to detect any bugs that might generate spurious + # extra reconcile iterations. + # + # We'll have: + # - shard_count reconciles for the original setup of the tenant + # - shard_count reconciles for detaching the original secondary locations during split + # - split_shard_count reconciles during shard splitting, for setting up secondaries. + # - shard_count reconciles for the migrations we did to move child shards away from their split location + expect_reconciles = shard_count * 2 + split_shard_count + shard_count reconcile_ok = env.storage_controller.get_metric_value( "storage_controller_reconcile_complete_total", filter={"status": "ok"} ) - assert reconcile_ok == shard_count + split_shard_count // 2 + assert reconcile_ok == expect_reconciles # Check that no cancelled or errored reconciliations occurred: this test does no # failure injection and should run clean. 
- assert ( - env.storage_controller.get_metric_value( - "storage_controller_reconcile_complete_total", filter={"status": "cancel"} - ) - is None + cancelled_reconciles = env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "cancel"} ) - assert ( - env.storage_controller.get_metric_value( - "storage_controller_reconcile_complete_total", filter={"status": "error"} - ) - is None + errored_reconciles = env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "error"} ) + assert cancelled_reconciles is not None and int(cancelled_reconciles) == 0 + assert errored_reconciles is not None and int(errored_reconciles) == 0 env.storage_controller.consistency_check() - # Validate pageserver state - shards_exist: list[TenantShardId] = [] - for pageserver in env.pageservers: - locations = pageserver.http_client().tenant_list_locations() - shards_exist.extend(TenantShardId.parse(s[0]) for s in locations["tenant_shards"]) + def get_node_shard_counts(env: NeonEnv, tenant_ids): + total: defaultdict[int, int] = defaultdict(int) + attached: defaultdict[int, int] = defaultdict(int) + for tid in tenant_ids: + for shard in env.storage_controller.tenant_describe(tid)["shards"]: + log.info( + f"{shard['tenant_shard_id']}: attached={shard['node_attached']}, secondary={shard['node_secondary']} " + ) + for node in shard["node_secondary"]: + total[int(node)] += 1 + attached[int(shard["node_attached"])] += 1 + total[int(shard["node_attached"])] += 1 - log.info("Shards after split: {shards_exist}") - assert len(shards_exist) == split_shard_count + return total, attached + + def check_effective_tenant_config(): + # Expect our custom tenant configs to have survived the split + for shard in env.storage_controller.tenant_describe(tenant_id)["shards"]: + node = env.get_pageserver(int(shard["node_attached"])) + config = node.http_client().tenant_config(TenantShardId.parse(shard["tenant_shard_id"])) + for k, v in non_default_tenant_config.items(): + assert config.effective_config[k] == v + + # Validate pageserver state: expect every child shard to have an attached and secondary location + (total, attached) = get_node_shard_counts(env, tenant_ids=[tenant_id]) + assert sum(attached.values()) == split_shard_count + assert sum(total.values()) == split_shard_count * 2 + check_effective_tenant_config() # Ensure post-split pageserver locations survive a restart (i.e. 
the child shards
     # correctly wrote config to disk, and the storage controller responds correctly
@@ -308,13 +350,11 @@ def test_sharding_split_smoke(
         pageserver.stop()
         pageserver.start()
 
-    shards_exist = []
-    for pageserver in env.pageservers:
-        locations = pageserver.http_client().tenant_list_locations()
-        shards_exist.extend(TenantShardId.parse(s[0]) for s in locations["tenant_shards"])
-
-    log.info("Shards after restart: {shards_exist}")
-    assert len(shards_exist) == split_shard_count
+    # Validate pageserver state: expect every child shard to have an attached and secondary location
+    (total, attached) = get_node_shard_counts(env, tenant_ids=[tenant_id])
+    assert sum(attached.values()) == split_shard_count
+    assert sum(total.values()) == split_shard_count * 2
+    check_effective_tenant_config()
 
     workload.validate()
 
@@ -720,9 +760,32 @@ def test_sharding_split_failures(
     initial_shard_count = 2
     split_shard_count = 4
 
-    env = neon_env_builder.init_start(initial_tenant_shard_count=initial_shard_count)
-    tenant_id = env.initial_tenant
-    timeline_id = env.initial_timeline
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    tenant_id = TenantId.generate()
+    timeline_id = TimelineId.generate()
+
+    # Create a tenant with secondary locations enabled
+    env.neon_cli.create_tenant(
+        tenant_id, timeline_id, shard_count=initial_shard_count, placement_policy='{"Attached":1}'
+    )
+
+    env.storage_controller.allowed_errors.extend(
+        [
+            # All split failures log a warning when they enqueue the abort operation
+            ".*Enqueuing background abort.*",
+            # We exercise failure cases where abort itself will also fail (node offline)
+            ".*abort_tenant_shard_split.*",
+            ".*Failed to abort.*",
+            # Tolerate any error logs that mention a failpoint
+            ".*failpoint.*",
+            # Node offline cases will fail to send requests
+            ".*Reconcile error: receive body: error sending request for url.*",
+            # Node offline cases will fail inside reconciler when detaching secondaries
+            ".*Reconcile error on shard.*: receive body: error sending request for url.*",
+        ]
+    )
 
     for ps in env.pageservers:
         # When we do node failures and abandon a shard, it will de-facto have old generation and
@@ -758,7 +821,8 @@
     # will have succeeded: the net result should be to return to a clean state, including
     # detaching any child shards.
def assert_rolled_back(exclude_ps_id=None) -> None: - count = 0 + secondary_count = 0 + attached_count = 0 for ps in env.pageservers: if exclude_ps_id is not None and ps.id == exclude_ps_id: continue @@ -766,13 +830,25 @@ def test_sharding_split_failures( locations = ps.http_client().tenant_list_locations()["tenant_shards"] for loc in locations: tenant_shard_id = TenantShardId.parse(loc[0]) - log.info(f"Shard {tenant_shard_id} seen on node {ps.id}") + log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}") assert tenant_shard_id.shard_count == initial_shard_count - count += 1 - assert count == initial_shard_count + if loc[1]["mode"] == "Secondary": + secondary_count += 1 + else: + attached_count += 1 + + if exclude_ps_id is not None: + # For a node failure case, we expect there to be a secondary location + # scheduled on the offline node, so expect one fewer secondary in total + assert secondary_count == initial_shard_count - 1 + else: + assert secondary_count == initial_shard_count + + assert attached_count == initial_shard_count def assert_split_done(exclude_ps_id=None) -> None: - count = 0 + secondary_count = 0 + attached_count = 0 for ps in env.pageservers: if exclude_ps_id is not None and ps.id == exclude_ps_id: continue @@ -780,10 +856,14 @@ def test_sharding_split_failures( locations = ps.http_client().tenant_list_locations()["tenant_shards"] for loc in locations: tenant_shard_id = TenantShardId.parse(loc[0]) - log.info(f"Shard {tenant_shard_id} seen on node {ps.id}") + log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}") assert tenant_shard_id.shard_count == split_shard_count - count += 1 - assert count == split_shard_count + if loc[1]["mode"] == "Secondary": + secondary_count += 1 + else: + attached_count += 1 + assert attached_count == split_shard_count + assert secondary_count == split_shard_count def finish_split(): # Having failed+rolled back, we should be able to split again @@ -837,3 +917,130 @@ def test_sharding_split_failures( assert_split_done() env.storage_controller.consistency_check() + + +def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder): + """ + Check a scenario when one of the shards is much slower than others. + Without backpressure, this would lead to the slow shard falling behind + and eventually causing WAL timeouts. + """ + + shard_count = 4 + neon_env_builder.num_pageservers = shard_count + + # 256KiB stripes: enable getting some meaningful data distribution without + # writing large quantities of data in this test. The stripe size is given + # in number of 8KiB pages. 
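+    # (32 pages * 8 KiB = 256 KiB)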
+ stripe_size = 32 + + env = neon_env_builder.init_start( + initial_tenant_shard_count=shard_count, initial_tenant_shard_stripe_size=stripe_size + ) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + pageservers = dict((int(p.id), p) for p in env.pageservers) + shards = env.storage_controller.locate(tenant_id) + + # Slow down one of the shards, around ~1MB/s + pageservers[4].http_client().configure_failpoints(("wal-ingest-record-sleep", "5%sleep(1)")) + + def shards_info(): + infos = [] + for shard in shards: + node_id = int(shard["node_id"]) + pageserver = pageservers[node_id] + shard_info = pageserver.http_client().timeline_detail(shard["shard_id"], timeline_id) + infos.append(shard_info) + last_record_lsn = shard_info["last_record_lsn"] + current_physical_size = shard_info["current_physical_size"] + log.info( + f"Shard on pageserver {node_id}: lsn={last_record_lsn}, size={current_physical_size}" + ) + return infos + + shards_info() + + workload = Workload( + env, + tenant_id, + timeline_id, + branch_name="main", + endpoint_opts={ + "config_lines": [ + # Tip: set to 100MB to make the test fail + "max_replication_write_lag=1MB", + ], + }, + ) + workload.init() + + endpoint = workload.endpoint() + + # on 2024-03-05, the default config on prod was [15MB, 10GB, null] + res = endpoint.safe_psql_many( + [ + "SHOW max_replication_write_lag", + "SHOW max_replication_flush_lag", + "SHOW max_replication_apply_lag", + ] + ) + log.info(f"backpressure config: {res}") + + last_flush_lsn = None + last_timestamp = None + + def update_write_lsn(): + nonlocal last_flush_lsn + nonlocal last_timestamp + + res = endpoint.safe_psql( + """ + SELECT + pg_wal_lsn_diff(pg_current_wal_flush_lsn(), received_lsn) as received_lsn_lag, + received_lsn, + pg_current_wal_flush_lsn() as flush_lsn, + neon.backpressure_throttling_time() as throttling_time + FROM neon.backpressure_lsns(); + """, + dbname="postgres", + )[0] + log.info( + f"received_lsn_lag = {res[0]}, received_lsn = {res[1]}, flush_lsn = {res[2]}, throttling_time = {res[3]}" + ) + + lsn = Lsn(res[2]) + now = time.time() + + if last_timestamp is not None: + delta = now - last_timestamp + delta_bytes = lsn - last_flush_lsn + avg_speed = delta_bytes / delta / 1024 / 1024 + log.info( + f"flush_lsn {lsn}, written {delta_bytes/1024}kb for {delta:.3f}s, avg_speed {avg_speed:.3f} MiB/s" + ) + + last_flush_lsn = lsn + last_timestamp = now + + update_write_lsn() + + workload.write_rows(4096, upload=False) + workload.write_rows(4096, upload=False) + workload.write_rows(4096, upload=False) + workload.write_rows(4096, upload=False) + workload.validate() + + update_write_lsn() + shards_info() + + for _write_iter in range(30): + # approximately 1MB of data + workload.write_rows(8000, upload=False) + update_write_lsn() + infos = shards_info() + min_lsn = min(Lsn(info["last_record_lsn"]) for info in infos) + max_lsn = max(Lsn(info["last_record_lsn"]) for info in infos) + diff = max_lsn - min_lsn + assert diff < 2 * 1024 * 1024, f"LSN diff={diff}, expected diff < 2MB due to backpressure" diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py index 27ea425bb1..b7488cadd6 100644 --- a/test_runner/regress/test_sharding_service.py +++ b/test_runner/regress/test_sharding_service.py @@ -23,7 +23,7 @@ from fixtures.pageserver.utils import ( ) from fixtures.pg_version import PgVersion from fixtures.remote_storage import RemoteStorageKind, s3_storage -from fixtures.types import TenantId, TimelineId +from 
 from fixtures.utils import run_pg_bench_small, wait_until
 from mypy_boto3_s3.type_defs import (
     ObjectTypeDef,
@@ -177,6 +177,7 @@ def test_node_status_after_restart(
     assert len(nodes) == 2

     env.pageservers[1].stop()
+    env.storage_controller.allowed_errors.extend([".*Could not scan node"])
     env.storage_controller.stop()
     env.storage_controller.start()
@@ -681,6 +682,9 @@ def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder):
     tenant_id = TenantId.generate()
     body: Dict[str, Any] = {"new_tenant_id": str(tenant_id)}

+    env.storage_controller.allowed_errors.append(".*Unauthorized.*")
+    env.storage_controller.allowed_errors.append(".*Forbidden.*")
+
     # No token
     with pytest.raises(
         StorageControllerApiException,
@@ -843,6 +847,12 @@ def test_sharding_service_heartbeats(
     env = neon_env_builder.init_configs()
     env.start()

+    # Default log allow list permits connection errors, but this test will use error responses on
+    # the utilization endpoint.
+    env.storage_controller.allowed_errors.append(
+        ".*Call to node.*management API.*failed.*failpoint.*"
+    )
+
     # Initially we have two online pageservers
     nodes = env.storage_controller.node_list()
     assert len(nodes) == 2
@@ -938,3 +948,65 @@ def test_sharding_service_heartbeats(
         env.storage_controller.consistency_check()

     wait_until(10, 1, storage_controller_consistent)
+
+
+def test_sharding_service_re_attach(neon_env_builder: NeonEnvBuilder):
+    """
+    Exercise the behavior of the /re-attach endpoint on pageserver startup when
+    pageservers have a mixture of attached and secondary locations.
+    """
+
+    neon_env_builder.num_pageservers = 2
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    # We'll have two tenants.
+    tenant_a = TenantId.generate()
+    env.neon_cli.create_tenant(tenant_a, placement_policy='{"Attached":1}')
+    tenant_b = TenantId.generate()
+    env.neon_cli.create_tenant(tenant_b, placement_policy='{"Attached":1}')
+
+    # Each pageserver will have one attached and one secondary location
+    env.storage_controller.tenant_shard_migrate(
+        TenantShardId(tenant_a, 0, 0), env.pageservers[0].id
+    )
+    env.storage_controller.tenant_shard_migrate(
+        TenantShardId(tenant_b, 0, 0), env.pageservers[1].id
+    )
+
+    # Hard-fail a pageserver
+    victim_ps = env.pageservers[1]
+    survivor_ps = env.pageservers[0]
+    victim_ps.stop(immediate=True)
+
+    # Heartbeater will notice it's offline, and consequently attachments will move to the other pageserver
+    def failed_over():
+        locations = survivor_ps.http_client().tenant_list_locations()["tenant_shards"]
+        log.info(f"locations: {locations}")
+        assert len(locations) == 2
+        assert all(loc[1]["mode"] == "AttachedSingle" for loc in locations)
+
+    # We could pre-empt this by configuring the node to Offline, but it's preferable to test
+    # the realistic path we would take when a node restarts uncleanly.
+    # The delay here will be ~NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL in neon_local
+    wait_until(30, 1, failed_over)
+
+    reconciles_before_restart = env.storage_controller.get_metric_value(
+        "storage_controller_reconcile_complete_total", filter={"status": "ok"}
+    )
+
+    # Restart the failed pageserver
+    victim_ps.start()
+
+    # We expect that the re-attach call correctly tipped off the pageserver that its locations
+    # are all secondaries now.
+    locations = victim_ps.http_client().tenant_list_locations()["tenant_shards"]
+    assert len(locations) == 2
+    assert all(loc[1]["mode"] == "Secondary" for loc in locations)
+
+    # We expect that this situation resulted from the re_attach call, and not any explicit
+    # Reconciler runs: assert that the reconciliation count has not gone up since we restarted.
+    reconciles_after_restart = env.storage_controller.get_metric_value(
+        "storage_controller_reconcile_complete_total", filter={"status": "ok"}
+    )
+    assert reconciles_after_restart == reconciles_before_restart
diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py
index 1e13a2f20f..f8701b65d7 100644
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -36,7 +36,9 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv):
     )
     [d for d in tenants_dir.iterdir()]

-    neon_simple_env.pageserver.allowed_errors.append(".*tenant-config-before-write.*")
+    error_regexes = [".*tenant-config-before-write.*"]
+    neon_simple_env.pageserver.allowed_errors.extend(error_regexes)
+    neon_simple_env.storage_controller.allowed_errors.extend(error_regexes)

     pageserver_http = neon_simple_env.pageserver.http_client()
     pageserver_http.configure_failpoints(("tenant-config-before-write", "return"))
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index 205ca18050..efd257900d 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -20,6 +20,7 @@ from fixtures.neon_fixtures import (
     VanillaPostgres,
     wait_for_last_flush_lsn,
 )
+from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import (
     assert_tenant_state,
     timeline_delete_wait_completed,
@@ -684,6 +685,13 @@ def assert_physical_size_invariants(sizes: TimelinePhysicalSizeValues):
     # XXX would be nice to assert layer file physical storage utilization here as well, but we can only do that for LocalFS


+def wait_for_tenant_startup_completions(client: PageserverHttpClient, count: int):
+    def condition():
+        assert client.get_metric_value("pageserver_tenant_startup_complete_total") == count
+
+    wait_until(5, 1.0, condition)
+
+
 def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
     """
     Tenants warming up opportunistically will wait for one another's logical size calculations to complete
@@ -767,10 +775,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
     # That one that we successfully accessed is now Active
     expect_activated += 1
     assert pageserver_http.tenant_status(tenant_id=stuck_tenant_id)["state"]["slug"] == "Active"
-    assert (
-        pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total")
-        == expect_activated - 1
-    )
+    wait_for_tenant_startup_completions(pageserver_http, count=expect_activated - 1)

     # The ones we didn't touch are still in Attaching
     assert (
@@ -790,10 +795,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
         == n_tenants - expect_activated
     )

-    assert (
-        pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total")
-        == expect_activated - 1
-    )
+    wait_for_tenant_startup_completions(pageserver_http, count=expect_activated - 1)

     # When we unblock logical size calculation, all tenants should proceed to active state via
     # the warmup route.
@@ -813,7 +815,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): assert ( pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants ) - assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == n_tenants + wait_for_tenant_startup_completions(pageserver_http, count=n_tenants) # Check that tenant deletion/detach proactively wakes tenants: this is done separately to the main # body of the test because it will disrupt tenant counts @@ -929,7 +931,7 @@ def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder): env.pageserver.stop() env.pageserver.start( extra_env_vars={ - "FAILPOINTS": "initial-size-calculation-permit-pause=pause;walreceiver-after-ingest=pause" + "FAILPOINTS": "initial-size-calculation-permit-pause=pause;walreceiver-after-ingest-pause-activate=return(1);walreceiver-after-ingest-pause=pause" } ) @@ -951,7 +953,11 @@ def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder): assert details["current_logical_size_is_accurate"] is True client.configure_failpoints( - [("initial-size-calculation-permit-pause", "off"), ("walreceiver-after-ingest", "off")] + [ + ("initial-size-calculation-permit-pause", "off"), + ("walreceiver-after-ingest-pause-activate", "off"), + ("walreceiver-after-ingest-pause", "off"), + ] ) @@ -981,7 +987,7 @@ def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder): # pause at logical size calculation, also pause before walreceiver can give feedback so it will give priority to logical size calculation env.pageserver.start( extra_env_vars={ - "FAILPOINTS": "timeline-calculate-logical-size-pause=pause;walreceiver-after-ingest=pause" + "FAILPOINTS": "timeline-calculate-logical-size-pause=pause;walreceiver-after-ingest-pause-activate=return(1);walreceiver-after-ingest-pause=pause" } ) @@ -1027,7 +1033,11 @@ def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder): other_is_attaching() client.configure_failpoints( - [("timeline-calculate-logical-size-pause", "off"), ("walreceiver-after-ingest", "off")] + [ + ("timeline-calculate-logical-size-pause", "off"), + ("walreceiver-after-ingest-pause-activate", "off"), + ("walreceiver-after-ingest-pause", "off"), + ] ) @@ -1057,7 +1067,7 @@ def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_met # pause at logical size calculation, also pause before walreceiver can give feedback so it will give priority to logical size calculation env.pageserver.start( extra_env_vars={ - "FAILPOINTS": "timeline-calculate-logical-size-pause=pause;walreceiver-after-ingest=pause" + "FAILPOINTS": "timeline-calculate-logical-size-pause=pause;walreceiver-after-ingest-pause-activate=return(1);walreceiver-after-ingest-pause=pause" } ) @@ -1109,3 +1119,11 @@ def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_met delete_lazy_activating(lazy_tenant, env.pageserver, expect_attaching=True) else: raise RuntimeError(activation_method) + + client.configure_failpoints( + [ + ("timeline-calculate-logical-size-pause", "off"), + ("walreceiver-after-ingest-pause-activate", "off"), + ("walreceiver-after-ingest-pause", "off"), + ] + ) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 3b09894ddb..748643b468 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 3b09894ddb8825b50c963942059eab1a2a0b0a89 +Subproject commit 748643b4683e9fe3b105011a6ba8a687d032cd65 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 
index 80cef885ad..e7651e79c0 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 80cef885add1af6741aa31944c7d2c84d8f9098f +Subproject commit e7651e79c0c27fbddc3c724f5b9553222c28e395 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 9007894722..3946b2e2ea 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 90078947229aa7f9ac5f7ed4527b2c7386d5332b +Subproject commit 3946b2e2ea71d07af092099cb5bcae76a69b90d6 diff --git a/vendor/revisions.json b/vendor/revisions.json index ae524d70b1..3c1b866137 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "postgres-v16": "90078947229aa7f9ac5f7ed4527b2c7386d5332b", - "postgres-v15": "80cef885add1af6741aa31944c7d2c84d8f9098f", - "postgres-v14": "3b09894ddb8825b50c963942059eab1a2a0b0a89" + "postgres-v16": "3946b2e2ea71d07af092099cb5bcae76a69b90d6", + "postgres-v15": "e7651e79c0c27fbddc3c724f5b9553222c28e395", + "postgres-v14": "748643b4683e9fe3b105011a6ba8a687d032cd65" } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 8593b752c2..7b8228a082 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -19,8 +19,7 @@ aws-runtime = { version = "1", default-features = false, features = ["event-stre aws-sigv4 = { version = "1", features = ["http0-compat", "sign-eventstream", "sigv4a"] } aws-smithy-async = { version = "1", default-features = false, features = ["rt-tokio"] } aws-smithy-http = { version = "0.60", default-features = false, features = ["event-stream"] } -aws-smithy-runtime-api = { version = "1", features = ["client", "http-02x", "http-auth"] } -aws-smithy-types = { version = "1", default-features = false, features = ["byte-stream-poll-next", "http-body-0-4-x", "rt-tokio"] } +aws-smithy-types = { version = "1", default-features = false, features = ["byte-stream-poll-next", "http-body-0-4-x", "rt-tokio", "test-util"] } axum = { version = "0.6", features = ["ws"] } base64 = { version = "0.21", features = ["alloc"] } base64ct = { version = "1", default-features = false, features = ["std"] } @@ -64,6 +63,7 @@ rustls = { version = "0.21", features = ["dangerous_configuration"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = ["raw_value"] } +sha2 = { version = "0.10", features = ["asm"] } smallvec = { version = "1", default-features = false, features = ["write"] } subtle = { version = "2" } time = { version = "0.3", features = ["local-offset", "macros", "serde-well-known"] }
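
A possible follow-up for test_sharding_backpressure: the inline min/max bookkeeping over last_record_lsn could be pulled into a reusable helper so other sharding tests can assert the same invariant. The sketch below is illustrative only and not part of this patch; the name assert_shards_in_sync is hypothetical, and it assumes Lsn is importable from fixtures.types (as used by the tests above) and receives the list of timeline_detail dicts that shards_info() returns.

    from typing import Any, Dict, List

    from fixtures.types import Lsn


    def assert_shards_in_sync(
        infos: List[Dict[str, Any]], max_lag_bytes: int = 2 * 1024 * 1024
    ) -> None:
        # With backpressure enabled, the compute should be throttled before the
        # fastest shard's last_record_lsn can get more than roughly
        # max_replication_write_lag ahead of the slowest shard's, so the spread
        # between them should stay bounded.
        lsns = [Lsn(info["last_record_lsn"]) for info in infos]
        spread = max(lsns) - min(lsns)
        assert spread < max_lag_bytes, f"LSN spread {spread} exceeds {max_lag_bytes} bytes"

Called as assert_shards_in_sync(shards_info()) inside the write loop, this would replace the per-iteration min/max arithmetic while keeping the same 2MB bound.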