Merge pull request #7219 from neondatabase/rc/2024-03-25

Release 2024-03-25
This commit is contained in:
John Spray
2024-03-25 12:28:09 +00:00
committed by GitHub
135 changed files with 5689 additions and 2311 deletions

View File

@@ -1121,10 +1121,16 @@ jobs:
run: |
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
# TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging support different compute tag prefixes for different regions
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
-f deployPgSniRouter=false \
-f deployProxy=false \
-f deployStorage=true \
-f deployStorageBroker=true \
-f branch=main \
-f dockerTag=${{needs.tag.outputs.build-tag}} \
-f deployPreprodRegion=true
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
-f deployPgSniRouter=false \
-f deployProxy=false \
@@ -1133,6 +1139,15 @@ jobs:
-f branch=main \
-f dockerTag=${{needs.tag.outputs.build-tag}}
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
-f deployPgSniRouter=true \
-f deployProxy=true \
-f deployStorage=false \
-f deployStorageBroker=false \
-f branch=main \
-f dockerTag=${{needs.tag.outputs.build-tag}} \
-f deployPreprodRegion=true
gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \
-f deployPgSniRouter=true \
-f deployProxy=true \

View File

@@ -1,12 +1,13 @@
/compute_tools/ @neondatabase/control-plane @neondatabase/compute
/control_plane/attachment_service @neondatabase/storage
/libs/pageserver_api/ @neondatabase/storage
/libs/postgres_ffi/ @neondatabase/compute
/libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers
/libs/remote_storage/ @neondatabase/storage
/libs/safekeeper_api/ @neondatabase/safekeepers
/libs/vm_monitor/ @neondatabase/autoscaling
/pageserver/ @neondatabase/storage
/pgxn/ @neondatabase/compute
/pgxn/neon/ @neondatabase/compute @neondatabase/safekeepers
/proxy/ @neondatabase/proxy
/safekeeper/ @neondatabase/safekeepers
/vendor/ @neondatabase/compute

181
Cargo.lock generated
View File

@@ -277,6 +277,7 @@ dependencies = [
"anyhow",
"aws-config",
"aws-sdk-secretsmanager",
"bytes",
"camino",
"clap",
"control_plane",
@@ -288,6 +289,8 @@ dependencies = [
"hex",
"humantime",
"hyper",
"lasso",
"measured",
"metrics",
"once_cell",
"pageserver_api",
@@ -295,6 +298,7 @@ dependencies = [
"postgres_connection",
"r2d2",
"reqwest",
"routerify",
"serde",
"serde_json",
"thiserror",
@@ -343,9 +347,9 @@ dependencies = [
[[package]]
name = "aws-credential-types"
version = "1.1.4"
version = "1.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "33cc49dcdd31c8b6e79850a179af4c367669150c7ac0135f176c61bec81a70f7"
checksum = "fa8587ae17c8e967e4b05a62d495be2fb7701bec52a97f7acfe8a29f938384c8"
dependencies = [
"aws-smithy-async",
"aws-smithy-runtime-api",
@@ -355,9 +359,9 @@ dependencies = [
[[package]]
name = "aws-runtime"
version = "1.1.4"
version = "1.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eb031bff99877c26c28895766f7bb8484a05e24547e370768d6cc9db514662aa"
checksum = "b13dc54b4b49f8288532334bba8f87386a40571c47c37b1304979b556dc613c8"
dependencies = [
"aws-credential-types",
"aws-sigv4",
@@ -377,6 +381,29 @@ dependencies = [
"uuid",
]
[[package]]
name = "aws-sdk-iam"
version = "1.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b8ae76026bfb1b80a6aed0bb400c1139cd9c0563e26bce1986cd021c6a968c7b"
dependencies = [
"aws-credential-types",
"aws-runtime",
"aws-smithy-async",
"aws-smithy-http",
"aws-smithy-json",
"aws-smithy-query",
"aws-smithy-runtime",
"aws-smithy-runtime-api",
"aws-smithy-types",
"aws-smithy-xml",
"aws-types",
"http 0.2.9",
"once_cell",
"regex-lite",
"tracing",
]
[[package]]
name = "aws-sdk-s3"
version = "1.14.0"
@@ -498,9 +525,9 @@ dependencies = [
[[package]]
name = "aws-sigv4"
version = "1.1.4"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c371c6b0ac54d4605eb6f016624fb5c7c2925d315fdf600ac1bf21b19d5f1742"
checksum = "11d6f29688a4be9895c0ba8bef861ad0c0dac5c15e9618b9b7a6c233990fc263"
dependencies = [
"aws-credential-types",
"aws-smithy-eventstream",
@@ -513,7 +540,7 @@ dependencies = [
"hex",
"hmac",
"http 0.2.9",
"http 1.0.0",
"http 1.1.0",
"once_cell",
"p256",
"percent-encoding",
@@ -527,9 +554,9 @@ dependencies = [
[[package]]
name = "aws-smithy-async"
version = "1.1.4"
version = "1.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72ee2d09cce0ef3ae526679b522835d63e75fb427aca5413cd371e490d52dcc6"
checksum = "d26ea8fa03025b2face2b3038a63525a10891e3d8829901d502e5384a0d8cd46"
dependencies = [
"futures-util",
"pin-project-lite",
@@ -570,9 +597,9 @@ dependencies = [
[[package]]
name = "aws-smithy-http"
version = "0.60.4"
version = "0.60.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dab56aea3cd9e1101a0a999447fb346afb680ab1406cebc44b32346e25b4117d"
checksum = "3f10fa66956f01540051b0aa7ad54574640f748f9839e843442d99b970d3aff9"
dependencies = [
"aws-smithy-eventstream",
"aws-smithy-runtime-api",
@@ -591,18 +618,18 @@ dependencies = [
[[package]]
name = "aws-smithy-json"
version = "0.60.4"
version = "0.60.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fd3898ca6518f9215f62678870064398f00031912390efd03f1f6ef56d83aa8e"
checksum = "4683df9469ef09468dad3473d129960119a0d3593617542b7d52086c8486f2d6"
dependencies = [
"aws-smithy-types",
]
[[package]]
name = "aws-smithy-query"
version = "0.60.4"
version = "0.60.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bda4b1dfc9810e35fba8a620e900522cd1bd4f9578c446e82f49d1ce41d2e9f9"
checksum = "f2fbd61ceb3fe8a1cb7352e42689cec5335833cd9f94103a61e98f9bb61c64bb"
dependencies = [
"aws-smithy-types",
"urlencoding",
@@ -610,9 +637,9 @@ dependencies = [
[[package]]
name = "aws-smithy-runtime"
version = "1.1.4"
version = "1.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fafdab38f40ad7816e7da5dec279400dd505160780083759f01441af1bbb10ea"
checksum = "ec81002d883e5a7fd2bb063d6fb51c4999eb55d404f4fff3dd878bf4733b9f01"
dependencies = [
"aws-smithy-async",
"aws-smithy-http",
@@ -635,14 +662,15 @@ dependencies = [
[[package]]
name = "aws-smithy-runtime-api"
version = "1.1.4"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c18276dd28852f34b3bf501f4f3719781f4999a51c7bff1a5c6dc8c4529adc29"
checksum = "9acb931e0adaf5132de878f1398d83f8677f90ba70f01f65ff87f6d7244be1c5"
dependencies = [
"aws-smithy-async",
"aws-smithy-types",
"bytes",
"http 0.2.9",
"http 1.1.0",
"pin-project-lite",
"tokio",
"tracing",
@@ -651,9 +679,9 @@ dependencies = [
[[package]]
name = "aws-smithy-types"
version = "1.1.4"
version = "1.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bb3e134004170d3303718baa2a4eb4ca64ee0a1c0a7041dca31b38be0fb414f3"
checksum = "abe14dceea1e70101d38fbf2a99e6a34159477c0fb95e68e05c66bd7ae4c3729"
dependencies = [
"base64-simd",
"bytes",
@@ -674,18 +702,18 @@ dependencies = [
[[package]]
name = "aws-smithy-xml"
version = "0.60.4"
version = "0.60.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8604a11b25e9ecaf32f9aa56b9fe253c5e2f606a3477f0071e96d3155a5ed218"
checksum = "872c68cf019c0e4afc5de7753c4f7288ce4b71663212771bf5e4542eb9346ca9"
dependencies = [
"xmlparser",
]
[[package]]
name = "aws-types"
version = "1.1.4"
version = "1.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "789bbe008e65636fe1b6dbbb374c40c8960d1232b96af5ff4aec349f9c4accf4"
checksum = "0dbf2f3da841a8930f159163175cf6a3d16ddde517c1b0fba7aa776822800f40"
dependencies = [
"aws-credential-types",
"aws-smithy-async",
@@ -2392,9 +2420,9 @@ dependencies = [
[[package]]
name = "http"
version = "1.0.0"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b32afd38673a8016f7c9ae69e5af41a58f81b1d31689040f2f1959594ce194ea"
checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258"
dependencies = [
"bytes",
"fnv",
@@ -2494,7 +2522,7 @@ dependencies = [
"hyper",
"log",
"rustls 0.21.9",
"rustls-native-certs",
"rustls-native-certs 0.6.2",
"tokio",
"tokio-rustls 0.24.0",
]
@@ -2880,6 +2908,35 @@ version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
[[package]]
name = "measured"
version = "0.0.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f246648d027839a34b420e27c7de1165ace96e19ef894985d0a6ff89a7840a9f"
dependencies = [
"bytes",
"hashbrown 0.14.0",
"itoa",
"lasso",
"measured-derive",
"memchr",
"parking_lot 0.12.1",
"rustc-hash",
"ryu",
]
[[package]]
name = "measured-derive"
version = "0.0.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "edaa5cc22d99d5d6d7d99c3b5b5f7e7f8034c22f1b5d62a1adecd2ed005d9b80"
dependencies = [
"heck",
"proc-macro2",
"quote",
"syn 2.0.52",
]
[[package]]
name = "memchr"
version = "2.6.4"
@@ -3901,7 +3958,7 @@ dependencies = [
[[package]]
name = "postgres"
version = "0.19.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
dependencies = [
"bytes",
"fallible-iterator",
@@ -3914,7 +3971,7 @@ dependencies = [
[[package]]
name = "postgres-native-tls"
version = "0.5.0"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
dependencies = [
"native-tls",
"tokio",
@@ -3925,7 +3982,7 @@ dependencies = [
[[package]]
name = "postgres-protocol"
version = "0.6.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
dependencies = [
"base64 0.20.0",
"byteorder",
@@ -3938,12 +3995,13 @@ dependencies = [
"rand 0.8.5",
"sha2",
"stringprep",
"tokio",
]
[[package]]
name = "postgres-types"
version = "0.2.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
dependencies = [
"bytes",
"fallible-iterator",
@@ -4165,6 +4223,10 @@ version = "0.1.0"
dependencies = [
"anyhow",
"async-trait",
"aws-config",
"aws-sdk-iam",
"aws-sigv4",
"aws-types",
"base64 0.13.1",
"bstr",
"bytes",
@@ -4175,6 +4237,7 @@ dependencies = [
"consumption_metrics",
"dashmap",
"env_logger",
"fallible-iterator",
"futures",
"git-version",
"hashbrown 0.13.2",
@@ -4182,6 +4245,7 @@ dependencies = [
"hex",
"hmac",
"hostname",
"http 1.1.0",
"humantime",
"hyper",
"hyper-tungstenite",
@@ -4225,6 +4289,7 @@ dependencies = [
"smallvec",
"smol_str",
"socket2 0.5.5",
"subtle",
"sync_wrapper",
"task-local-extensions",
"thiserror",
@@ -4396,9 +4461,9 @@ dependencies = [
[[package]]
name = "redis"
version = "0.24.0"
version = "0.25.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c580d9cbbe1d1b479e8d67cf9daf6a62c957e6846048408b80b43ac3f6af84cd"
checksum = "71d64e978fd98a0e6b105d066ba4889a7301fca65aeac850a877d8797343feeb"
dependencies = [
"async-trait",
"bytes",
@@ -4407,15 +4472,15 @@ dependencies = [
"itoa",
"percent-encoding",
"pin-project-lite",
"rustls 0.21.9",
"rustls-native-certs",
"rustls-pemfile 1.0.2",
"rustls-webpki 0.101.7",
"rustls 0.22.2",
"rustls-native-certs 0.7.0",
"rustls-pemfile 2.1.1",
"rustls-pki-types",
"ryu",
"sha1_smol",
"socket2 0.4.9",
"socket2 0.5.5",
"tokio",
"tokio-rustls 0.24.0",
"tokio-rustls 0.25.0",
"tokio-util",
"url",
]
@@ -4844,6 +4909,19 @@ dependencies = [
"security-framework",
]
[[package]]
name = "rustls-native-certs"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f1fb85efa936c42c6d5fc28d2629bb51e4b2f4b8a5211e297d599cc5a093792"
dependencies = [
"openssl-probe",
"rustls-pemfile 2.1.1",
"rustls-pki-types",
"schannel",
"security-framework",
]
[[package]]
name = "rustls-pemfile"
version = "1.0.2"
@@ -5346,13 +5424,23 @@ checksum = "ae1a47186c03a32177042e55dbc5fd5aee900b8e0069a8d70fba96a9375cd012"
[[package]]
name = "sha2"
version = "0.10.6"
version = "0.10.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0"
checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8"
dependencies = [
"cfg-if",
"cpufeatures",
"digest",
"sha2-asm",
]
[[package]]
name = "sha2-asm"
version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f27ba7066011e3fb30d808b51affff34f0a66d3a03a58edd787c6e420e40e44e"
dependencies = [
"cc",
]
[[package]]
@@ -5935,7 +6023,7 @@ dependencies = [
[[package]]
name = "tokio-postgres"
version = "0.7.7"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
dependencies = [
"async-trait",
"byteorder",
@@ -6101,7 +6189,7 @@ dependencies = [
"percent-encoding",
"pin-project",
"prost",
"rustls-native-certs",
"rustls-native-certs 0.6.2",
"rustls-pemfile 1.0.2",
"tokio",
"tokio-rustls 0.24.0",
@@ -6468,6 +6556,7 @@ version = "0.1.0"
dependencies = [
"anyhow",
"arc-swap",
"async-compression",
"async-trait",
"bincode",
"byteorder",
@@ -6506,12 +6595,14 @@ dependencies = [
"thiserror",
"tokio",
"tokio-stream",
"tokio-tar",
"tokio-util",
"tracing",
"tracing-error",
"tracing-subscriber",
"url",
"uuid",
"walkdir",
"workspace_hack",
]
@@ -6983,7 +7074,6 @@ dependencies = [
"aws-sigv4",
"aws-smithy-async",
"aws-smithy-http",
"aws-smithy-runtime-api",
"aws-smithy-types",
"axum",
"base64 0.21.1",
@@ -7029,6 +7119,7 @@ dependencies = [
"scopeguard",
"serde",
"serde_json",
"sha2",
"smallvec",
"subtle",
"syn 1.0.109",

View File

@@ -53,9 +53,12 @@ async-trait = "0.1"
aws-config = { version = "1.1.4", default-features = false, features=["rustls"] }
aws-sdk-s3 = "1.14"
aws-sdk-secretsmanager = { version = "1.14.0" }
aws-sdk-iam = "1.15.0"
aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] }
aws-smithy-types = "1.1.4"
aws-credential-types = "1.1.4"
aws-sigv4 = { version = "1.2.0", features = ["sign-http"] }
aws-types = "1.1.7"
axum = { version = "0.6.20", features = ["ws"] }
base64 = "0.13.0"
bincode = "1.3"
@@ -76,6 +79,7 @@ either = "1.8"
enum-map = "2.4.2"
enumset = "1.0.12"
fail = "0.5.0"
fallible-iterator = "0.2"
fs2 = "0.4.3"
futures = "0.3"
futures-core = "0.3"
@@ -88,6 +92,7 @@ hex = "0.4"
hex-literal = "0.4"
hmac = "0.12.1"
hostname = "0.3.1"
http = {version = "1.1.0", features = ["std"]}
http-types = { version = "2", default-features = false }
humantime = "2.1"
humantime-serde = "1.1.1"
@@ -101,6 +106,7 @@ lasso = "0.7"
leaky-bucket = "1.0.1"
libc = "0.2"
md5 = "0.7.0"
measured = { version = "0.0.13", features=["default", "lasso"] }
memoffset = "0.8"
native-tls = "0.2"
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
@@ -120,7 +126,7 @@ procfs = "0.14"
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
prost = "0.11"
rand = "0.8"
redis = { version = "0.24.0", features = ["tokio-rustls-comp", "keep-alive"] }
redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
regex = "1.10.2"
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
reqwest-tracing = { version = "0.4.7", features = ["opentelemetry_0_20"] }
@@ -148,6 +154,7 @@ smol_str = { version = "0.2.0", features = ["serde"] }
socket2 = "0.5"
strum = "0.24"
strum_macros = "0.24"
"subtle" = "2.5.0"
svg_fmt = "0.4.1"
sync_wrapper = "0.1.2"
tar = "0.4"

View File

@@ -135,7 +135,7 @@ WORKDIR /home/nonroot
# Rust
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
ENV RUSTC_VERSION=1.76.0
ENV RUSTC_VERSION=1.77.0
ENV RUSTUP_HOME="/home/nonroot/.rustup"
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
@@ -149,7 +149,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
cargo install --git https://github.com/paritytech/cachepot && \
cargo install rustfilt && \
cargo install cargo-hakari && \
cargo install cargo-deny && \
cargo install cargo-deny --locked && \
cargo install cargo-hack && \
cargo install cargo-nextest && \
rm -rf /home/nonroot/.cargo/registry && \

View File

@@ -32,6 +32,29 @@ compute_ctl -D /var/db/postgres/compute \
-b /usr/local/bin/postgres
```
## State Diagram
Computes can be in various states. Below is a diagram that details how a
compute moves between states.
```mermaid
%% https://mermaid.js.org/syntax/stateDiagram.html
stateDiagram-v2
[*] --> Empty : Compute spawned
Empty --> ConfigurationPending : Waiting for compute spec
ConfigurationPending --> Configuration : Received compute spec
Configuration --> Failed : Failed to configure the compute
Configuration --> Running : Compute has been configured
Empty --> Init : Compute spec is immediately available
Empty --> TerminationPending : Requested termination
Init --> Failed : Failed to start Postgres
Init --> Running : Started Postgres
Running --> TerminationPending : Requested termination
TerminationPending --> Terminated : Terminated compute
Failed --> [*] : Compute exited
Terminated --> [*] : Compute exited
```
## Tests
Cargo formatter:

View File

@@ -17,6 +17,7 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
.write(true)
.create(true)
.append(false)
.truncate(false)
.open(path)?;
let buf = io::BufReader::new(&file);
let mut count: usize = 0;

View File

@@ -17,6 +17,7 @@ testing = []
anyhow.workspace = true
aws-config.workspace = true
aws-sdk-secretsmanager.workspace = true
bytes.workspace = true
camino.workspace = true
clap.workspace = true
fail.workspace = true
@@ -25,17 +26,20 @@ git-version.workspace = true
hex.workspace = true
hyper.workspace = true
humantime.workspace = true
lasso.workspace = true
once_cell.workspace = true
pageserver_api.workspace = true
pageserver_client.workspace = true
postgres_connection.workspace = true
reqwest.workspace = true
routerify.workspace = true
serde.workspace = true
serde_json.workspace = true
thiserror.workspace = true
tokio.workspace = true
tokio-util.workspace = true
tracing.workspace = true
measured.workspace = true
diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
diesel_migrations = { version = "2.1.0" }

View File

@@ -0,0 +1,3 @@
UPDATE tenant_shards set placement_policy='{"Double": 1}' where placement_policy='{"Attached": 1}';
UPDATE tenant_shards set placement_policy='"Single"' where placement_policy='{"Attached": 0}';

View File

@@ -0,0 +1,3 @@
UPDATE tenant_shards set placement_policy='{"Attached": 1}' where placement_policy='{"Double": 1}';
UPDATE tenant_shards set placement_policy='{"Attached": 0}' where placement_policy='"Single"';

View File

@@ -1,5 +1,11 @@
use crate::metrics::{
HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup,
METRICS_REGISTRY,
};
use crate::reconciler::ReconcileError;
use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
use futures::Future;
use hyper::header::CONTENT_TYPE;
use hyper::{Body, Request, Response};
use hyper::{StatusCode, Uri};
use pageserver_api::models::{
@@ -34,6 +40,8 @@ use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};
use control_plane::storage_controller::{AttachHookRequest, InspectRequest};
use routerify::Middleware;
/// State available to HTTP request handlers
#[derive(Clone)]
pub struct HttpState {
@@ -313,7 +321,7 @@ async fn handle_tenant_timeline_passthrough(
tracing::info!("Proxying request for tenant {} ({})", tenant_id, path);
// Find the node that holds shard zero
let (base_url, tenant_shard_id) = service.tenant_shard0_baseurl(tenant_id)?;
let (node, tenant_shard_id) = service.tenant_shard0_node(tenant_id)?;
// Callers will always pass an unsharded tenant ID. Before proxying, we must
// rewrite this to a shard-aware shard zero ID.
@@ -322,12 +330,39 @@ async fn handle_tenant_timeline_passthrough(
let tenant_shard_str = format!("{}", tenant_shard_id);
let path = path.replace(&tenant_str, &tenant_shard_str);
let client = mgmt_api::Client::new(base_url, service.get_config().jwt_token.as_deref());
let latency = &METRICS_REGISTRY
.metrics_group
.storage_controller_passthrough_request_latency;
// This is a bit awkward. We remove the param from the request
// and join the words by '_' to get a label for the request.
let just_path = path.replace(&tenant_shard_str, "");
let path_label = just_path
.split('/')
.filter(|token| !token.is_empty())
.collect::<Vec<_>>()
.join("_");
let labels = PageserverRequestLabelGroup {
pageserver_id: &node.get_id().to_string(),
path: &path_label,
method: crate::metrics::Method::Get,
};
let _timer = latency.start_timer(labels.clone());
let client = mgmt_api::Client::new(node.base_url(), service.get_config().jwt_token.as_deref());
let resp = client.get_raw(path).await.map_err(|_e|
// FIXME: give APiError a proper Unavailable variant. We return 503 here because
// if we can't successfully send a request to the pageserver, we aren't available.
ApiError::ShuttingDown)?;
if !resp.status().is_success() {
let error_counter = &METRICS_REGISTRY
.metrics_group
.storage_controller_passthrough_request_error;
error_counter.inc(labels);
}
// We have a reqest::Response, would like a http::Response
let mut builder = hyper::Response::builder()
.status(resp.status())
@@ -353,6 +388,16 @@ async fn handle_tenant_locate(
json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
}
async fn handle_tenant_describe(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
}
async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
@@ -488,7 +533,11 @@ impl From<ReconcileError> for ApiError {
/// Common wrapper for request handlers that call into Service and will operate on tenants: they must only
/// be allowed to run if Service has finished its initial reconciliation.
async fn tenant_service_handler<R, H>(request: Request<Body>, handler: H) -> R::Output
async fn tenant_service_handler<R, H>(
request: Request<Body>,
handler: H,
request_name: RequestName,
) -> R::Output
where
R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
H: FnOnce(Arc<Service>, Request<Body>) -> R + Send + Sync + 'static,
@@ -508,9 +557,10 @@ where
));
}
request_span(
named_request_span(
request,
|request| async move { handler(service, request).await },
request_name,
)
.await
}
@@ -521,11 +571,98 @@ fn check_permissions(request: &Request<Body>, required_scope: Scope) -> Result<(
})
}
#[derive(Clone, Debug)]
struct RequestMeta {
method: hyper::http::Method,
at: Instant,
}
fn prologue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
) -> Middleware<B, ApiError> {
Middleware::pre(move |req| async move {
let meta = RequestMeta {
method: req.method().clone(),
at: Instant::now(),
};
req.set_context(meta);
Ok(req)
})
}
fn epilogue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
) -> Middleware<B, ApiError> {
Middleware::post_with_info(move |resp, req_info| async move {
let request_name = match req_info.context::<RequestName>() {
Some(name) => name,
None => {
return Ok(resp);
}
};
if let Some(meta) = req_info.context::<RequestMeta>() {
let status = &crate::metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_http_request_status;
let latency = &crate::metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_http_request_latency;
status.inc(HttpRequestStatusLabelGroup {
path: request_name.0,
method: meta.method.clone().into(),
status: crate::metrics::StatusCode(resp.status()),
});
latency.observe(
HttpRequestLatencyLabelGroup {
path: request_name.0,
method: meta.method.into(),
},
meta.at.elapsed().as_secs_f64(),
);
}
Ok(resp)
})
}
pub async fn measured_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4";
let payload = crate::metrics::METRICS_REGISTRY.encode();
let response = Response::builder()
.status(200)
.header(CONTENT_TYPE, TEXT_FORMAT)
.body(payload.into())
.unwrap();
Ok(response)
}
#[derive(Clone)]
struct RequestName(&'static str);
async fn named_request_span<R, H>(
request: Request<Body>,
handler: H,
name: RequestName,
) -> R::Output
where
R: Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
H: FnOnce(Request<Body>) -> R + Send + Sync + 'static,
{
request.set_context(name);
request_span(request, handler).await
}
pub fn make_router(
service: Arc<Service>,
auth: Option<Arc<SwappableJwtAuth>>,
) -> RouterBuilder<hyper::Body, ApiError> {
let mut router = endpoint::make_router();
let mut router = endpoint::make_router()
.middleware(prologue_metrics_middleware())
.middleware(epilogue_metrics_middleware());
if auth.is_some() {
router = router.middleware(auth_middleware(|request| {
let state = get_state(request);
@@ -534,96 +671,166 @@ pub fn make_router(
} else {
state.auth.as_deref()
}
}))
}));
}
router
.data(Arc::new(HttpState::new(service, auth)))
.get("/metrics", |r| {
named_request_span(r, measured_metrics_handler, RequestName("metrics"))
})
// Non-prefixed generic endpoints (status, metrics)
.get("/status", |r| request_span(r, handle_status))
.get("/ready", |r| request_span(r, handle_ready))
.get("/status", |r| {
named_request_span(r, handle_status, RequestName("status"))
})
.get("/ready", |r| {
named_request_span(r, handle_ready, RequestName("ready"))
})
// Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix
.post("/upcall/v1/re-attach", |r| {
request_span(r, handle_re_attach)
named_request_span(r, handle_re_attach, RequestName("upcall_v1_reattach"))
})
.post("/upcall/v1/validate", |r| {
named_request_span(r, handle_validate, RequestName("upcall_v1_validate"))
})
.post("/upcall/v1/validate", |r| request_span(r, handle_validate))
// Test/dev/debug endpoints
.post("/debug/v1/attach-hook", |r| {
request_span(r, handle_attach_hook)
named_request_span(r, handle_attach_hook, RequestName("debug_v1_attach_hook"))
})
.post("/debug/v1/inspect", |r| {
named_request_span(r, handle_inspect, RequestName("debug_v1_inspect"))
})
.post("/debug/v1/inspect", |r| request_span(r, handle_inspect))
.post("/debug/v1/tenant/:tenant_id/drop", |r| {
request_span(r, handle_tenant_drop)
named_request_span(r, handle_tenant_drop, RequestName("debug_v1_tenant_drop"))
})
.post("/debug/v1/node/:node_id/drop", |r| {
request_span(r, handle_node_drop)
named_request_span(r, handle_node_drop, RequestName("debug_v1_node_drop"))
})
.get("/debug/v1/tenant", |r| {
named_request_span(r, handle_tenants_dump, RequestName("debug_v1_tenant"))
})
.get("/debug/v1/tenant/:tenant_id/locate", |r| {
tenant_service_handler(
r,
handle_tenant_locate,
RequestName("debug_v1_tenant_locate"),
)
})
.get("/debug/v1/tenant", |r| request_span(r, handle_tenants_dump))
.get("/debug/v1/scheduler", |r| {
request_span(r, handle_scheduler_dump)
named_request_span(r, handle_scheduler_dump, RequestName("debug_v1_scheduler"))
})
.post("/debug/v1/consistency_check", |r| {
request_span(r, handle_consistency_check)
named_request_span(
r,
handle_consistency_check,
RequestName("debug_v1_consistency_check"),
)
})
.put("/debug/v1/failpoints", |r| {
request_span(r, |r| failpoints_handler(r, CancellationToken::new()))
})
.get("/control/v1/tenant/:tenant_id/locate", |r| {
tenant_service_handler(r, handle_tenant_locate)
})
// Node operations
.post("/control/v1/node", |r| {
request_span(r, handle_node_register)
named_request_span(r, handle_node_register, RequestName("control_v1_node"))
})
.get("/control/v1/node", |r| {
named_request_span(r, handle_node_list, RequestName("control_v1_node"))
})
.get("/control/v1/node", |r| request_span(r, handle_node_list))
.put("/control/v1/node/:node_id/config", |r| {
request_span(r, handle_node_configure)
named_request_span(
r,
handle_node_configure,
RequestName("control_v1_node_config"),
)
})
// Tenant Shard operations
.put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
tenant_service_handler(r, handle_tenant_shard_migrate)
tenant_service_handler(
r,
handle_tenant_shard_migrate,
RequestName("control_v1_tenant_migrate"),
)
})
.put("/control/v1/tenant/:tenant_id/shard_split", |r| {
tenant_service_handler(r, handle_tenant_shard_split)
tenant_service_handler(
r,
handle_tenant_shard_split,
RequestName("control_v1_tenant_shard_split"),
)
})
.get("/control/v1/tenant/:tenant_id", |r| {
tenant_service_handler(
r,
handle_tenant_describe,
RequestName("control_v1_tenant_describe"),
)
})
// Tenant operations
// The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
// this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
.post("/v1/tenant", |r| {
tenant_service_handler(r, handle_tenant_create)
tenant_service_handler(r, handle_tenant_create, RequestName("v1_tenant"))
})
.delete("/v1/tenant/:tenant_id", |r| {
tenant_service_handler(r, handle_tenant_delete)
tenant_service_handler(r, handle_tenant_delete, RequestName("v1_tenant"))
})
.put("/v1/tenant/config", |r| {
tenant_service_handler(r, handle_tenant_config_set)
tenant_service_handler(r, handle_tenant_config_set, RequestName("v1_tenant_config"))
})
.get("/v1/tenant/:tenant_id/config", |r| {
tenant_service_handler(r, handle_tenant_config_get)
tenant_service_handler(r, handle_tenant_config_get, RequestName("v1_tenant_config"))
})
.put("/v1/tenant/:tenant_shard_id/location_config", |r| {
tenant_service_handler(r, handle_tenant_location_config)
tenant_service_handler(
r,
handle_tenant_location_config,
RequestName("v1_tenant_location_config"),
)
})
.put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| {
tenant_service_handler(r, handle_tenant_time_travel_remote_storage)
tenant_service_handler(
r,
handle_tenant_time_travel_remote_storage,
RequestName("v1_tenant_time_travel_remote_storage"),
)
})
.post("/v1/tenant/:tenant_id/secondary/download", |r| {
tenant_service_handler(r, handle_tenant_secondary_download)
tenant_service_handler(
r,
handle_tenant_secondary_download,
RequestName("v1_tenant_secondary_download"),
)
})
// Timeline operations
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
tenant_service_handler(r, handle_tenant_timeline_delete)
tenant_service_handler(
r,
handle_tenant_timeline_delete,
RequestName("v1_tenant_timeline"),
)
})
.post("/v1/tenant/:tenant_id/timeline", |r| {
tenant_service_handler(r, handle_tenant_timeline_create)
tenant_service_handler(
r,
handle_tenant_timeline_create,
RequestName("v1_tenant_timeline"),
)
})
// Tenant detail GET passthrough to shard zero
.get("/v1/tenant/:tenant_id", |r| {
tenant_service_handler(r, handle_tenant_timeline_passthrough)
tenant_service_handler(
r,
handle_tenant_timeline_passthrough,
RequestName("v1_tenant_passthrough"),
)
})
// Timeline GET passthrough to shard zero. Note that the `*` in the URL is a wildcard: any future
// timeline GET APIs will be implicitly included.
.get("/v1/tenant/:tenant_id/timeline*", |r| {
tenant_service_handler(r, handle_tenant_timeline_passthrough)
tenant_service_handler(
r,
handle_tenant_timeline_passthrough,
RequestName("v1_tenant_timeline_passthrough"),
)
})
}

View File

@@ -8,6 +8,7 @@ pub mod http;
mod id_lock_map;
pub mod metrics;
mod node;
mod pageserver_client;
pub mod persistence;
mod reconciler;
mod scheduler;

View File

@@ -1,32 +1,284 @@
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
//!
//! This module provides metric definitions for the storage controller.
//!
//! All metrics are grouped in [`StorageControllerMetricGroup`]. [`StorageControllerMetrics`] holds
//! the mentioned metrics and their encoder. It's globally available via the [`METRICS_REGISTRY`]
//! constant.
//!
//! The rest of the code defines label group types and deals with converting outer types to labels.
//!
use bytes::Bytes;
use measured::{
label::{LabelValue, StaticLabelSet},
FixedCardinalityLabel, MetricGroup,
};
use once_cell::sync::Lazy;
use std::sync::Mutex;
pub(crate) struct ReconcilerMetrics {
pub(crate) spawned: IntCounter,
pub(crate) complete: IntCounterVec,
}
use crate::persistence::{DatabaseError, DatabaseOperation};
impl ReconcilerMetrics {
// Labels used on [`Self::complete`]
pub(crate) const SUCCESS: &'static str = "ok";
pub(crate) const ERROR: &'static str = "success";
pub(crate) const CANCEL: &'static str = "cancel";
}
pub(crate) static RECONCILER: Lazy<ReconcilerMetrics> = Lazy::new(|| ReconcilerMetrics {
spawned: register_int_counter!(
"storage_controller_reconcile_spawn",
"Count of how many times we spawn a reconcile task",
)
.expect("failed to define a metric"),
complete: register_int_counter_vec!(
"storage_controller_reconcile_complete",
"Reconciler tasks completed, broken down by success/failure/cancelled",
&["status"],
)
.expect("failed to define a metric"),
});
pub(crate) static METRICS_REGISTRY: Lazy<StorageControllerMetrics> =
Lazy::new(StorageControllerMetrics::default);
pub fn preinitialize_metrics() {
Lazy::force(&RECONCILER);
Lazy::force(&METRICS_REGISTRY);
}
pub(crate) struct StorageControllerMetrics {
pub(crate) metrics_group: StorageControllerMetricGroup,
encoder: Mutex<measured::text::TextEncoder>,
}
#[derive(measured::MetricGroup)]
pub(crate) struct StorageControllerMetricGroup {
/// Count of how many times we spawn a reconcile task
pub(crate) storage_controller_reconcile_spawn: measured::Counter,
/// Reconciler tasks completed, broken down by success/failure/cancelled
pub(crate) storage_controller_reconcile_complete:
measured::CounterVec<ReconcileCompleteLabelGroupSet>,
/// HTTP request status counters for handled requests
pub(crate) storage_controller_http_request_status:
measured::CounterVec<HttpRequestStatusLabelGroupSet>,
/// HTTP request handler latency across all status codes
pub(crate) storage_controller_http_request_latency:
measured::HistogramVec<HttpRequestLatencyLabelGroupSet, 5>,
/// Count of HTTP requests to the pageserver that resulted in an error,
/// broken down by the pageserver node id, request name and method
pub(crate) storage_controller_pageserver_request_error:
measured::CounterVec<PageserverRequestLabelGroupSet>,
/// Latency of HTTP requests to the pageserver, broken down by pageserver
/// node id, request name and method. This include both successful and unsuccessful
/// requests.
pub(crate) storage_controller_pageserver_request_latency:
measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
/// Count of pass-through HTTP requests to the pageserver that resulted in an error,
/// broken down by the pageserver node id, request name and method
pub(crate) storage_controller_passthrough_request_error:
measured::CounterVec<PageserverRequestLabelGroupSet>,
/// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver
/// node id, request name and method. This include both successful and unsuccessful
/// requests.
pub(crate) storage_controller_passthrough_request_latency:
measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
/// Count of errors in database queries, broken down by error type and operation.
pub(crate) storage_controller_database_query_error:
measured::CounterVec<DatabaseQueryErrorLabelGroupSet>,
/// Latency of database queries, broken down by operation.
pub(crate) storage_controller_database_query_latency:
measured::HistogramVec<DatabaseQueryLatencyLabelGroupSet, 5>,
}
impl StorageControllerMetrics {
pub(crate) fn encode(&self) -> Bytes {
let mut encoder = self.encoder.lock().unwrap();
self.metrics_group.collect_into(&mut *encoder);
encoder.finish()
}
}
impl Default for StorageControllerMetrics {
fn default() -> Self {
Self {
metrics_group: StorageControllerMetricGroup::new(),
encoder: Mutex::new(measured::text::TextEncoder::new()),
}
}
}
impl StorageControllerMetricGroup {
pub(crate) fn new() -> Self {
Self {
storage_controller_reconcile_spawn: measured::Counter::new(),
storage_controller_reconcile_complete: measured::CounterVec::new(
ReconcileCompleteLabelGroupSet {
status: StaticLabelSet::new(),
},
),
storage_controller_http_request_status: measured::CounterVec::new(
HttpRequestStatusLabelGroupSet {
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
status: StaticLabelSet::new(),
},
),
storage_controller_http_request_latency: measured::HistogramVec::new(
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
),
storage_controller_pageserver_request_error: measured::CounterVec::new(
PageserverRequestLabelGroupSet {
pageserver_id: lasso::ThreadedRodeo::new(),
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
},
),
storage_controller_pageserver_request_latency: measured::HistogramVec::new(
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
),
storage_controller_passthrough_request_error: measured::CounterVec::new(
PageserverRequestLabelGroupSet {
pageserver_id: lasso::ThreadedRodeo::new(),
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
},
),
storage_controller_passthrough_request_latency: measured::HistogramVec::new(
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
),
storage_controller_database_query_error: measured::CounterVec::new(
DatabaseQueryErrorLabelGroupSet {
operation: StaticLabelSet::new(),
error_type: StaticLabelSet::new(),
},
),
storage_controller_database_query_latency: measured::HistogramVec::new(
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
),
}
}
}
#[derive(measured::LabelGroup)]
#[label(set = ReconcileCompleteLabelGroupSet)]
pub(crate) struct ReconcileCompleteLabelGroup {
pub(crate) status: ReconcileOutcome,
}
#[derive(measured::LabelGroup)]
#[label(set = HttpRequestStatusLabelGroupSet)]
pub(crate) struct HttpRequestStatusLabelGroup<'a> {
#[label(dynamic_with = lasso::ThreadedRodeo)]
pub(crate) path: &'a str,
pub(crate) method: Method,
pub(crate) status: StatusCode,
}
#[derive(measured::LabelGroup)]
#[label(set = HttpRequestLatencyLabelGroupSet)]
pub(crate) struct HttpRequestLatencyLabelGroup<'a> {
#[label(dynamic_with = lasso::ThreadedRodeo)]
pub(crate) path: &'a str,
pub(crate) method: Method,
}
impl Default for HttpRequestLatencyLabelGroupSet {
fn default() -> Self {
Self {
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
}
}
}
#[derive(measured::LabelGroup, Clone)]
#[label(set = PageserverRequestLabelGroupSet)]
pub(crate) struct PageserverRequestLabelGroup<'a> {
#[label(dynamic_with = lasso::ThreadedRodeo)]
pub(crate) pageserver_id: &'a str,
#[label(dynamic_with = lasso::ThreadedRodeo)]
pub(crate) path: &'a str,
pub(crate) method: Method,
}
impl Default for PageserverRequestLabelGroupSet {
fn default() -> Self {
Self {
pageserver_id: lasso::ThreadedRodeo::new(),
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
}
}
}
#[derive(measured::LabelGroup)]
#[label(set = DatabaseQueryErrorLabelGroupSet)]
pub(crate) struct DatabaseQueryErrorLabelGroup {
pub(crate) error_type: DatabaseErrorLabel,
pub(crate) operation: DatabaseOperation,
}
#[derive(measured::LabelGroup)]
#[label(set = DatabaseQueryLatencyLabelGroupSet)]
pub(crate) struct DatabaseQueryLatencyLabelGroup {
pub(crate) operation: DatabaseOperation,
}
#[derive(FixedCardinalityLabel)]
pub(crate) enum ReconcileOutcome {
#[label(rename = "ok")]
Success,
Error,
Cancel,
}
#[derive(FixedCardinalityLabel, Clone)]
pub(crate) enum Method {
Get,
Put,
Post,
Delete,
Other,
}
impl From<hyper::Method> for Method {
fn from(value: hyper::Method) -> Self {
if value == hyper::Method::GET {
Method::Get
} else if value == hyper::Method::PUT {
Method::Put
} else if value == hyper::Method::POST {
Method::Post
} else if value == hyper::Method::DELETE {
Method::Delete
} else {
Method::Other
}
}
}
pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode);
impl LabelValue for StatusCode {
fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
v.write_int(self.0.as_u16() as u64)
}
}
impl FixedCardinalityLabel for StatusCode {
fn cardinality() -> usize {
(100..1000).len()
}
fn encode(&self) -> usize {
self.0.as_u16() as usize
}
fn decode(value: usize) -> Self {
Self(hyper::http::StatusCode::from_u16(u16::try_from(value).unwrap()).unwrap())
}
}
#[derive(FixedCardinalityLabel)]
pub(crate) enum DatabaseErrorLabel {
Query,
Connection,
ConnectionPool,
Logical,
}
impl DatabaseError {
pub(crate) fn error_label(&self) -> DatabaseErrorLabel {
match self {
Self::Query(_) => DatabaseErrorLabel::Query,
Self::Connection(_) => DatabaseErrorLabel::Connection,
Self::ConnectionPool(_) => DatabaseErrorLabel::ConnectionPool,
Self::Logical(_) => DatabaseErrorLabel::Logical,
}
}
}

View File

@@ -12,7 +12,9 @@ use serde::Serialize;
use tokio_util::sync::CancellationToken;
use utils::{backoff, id::NodeId};
use crate::{persistence::NodePersistence, scheduler::MaySchedule};
use crate::{
pageserver_client::PageserverClient, persistence::NodePersistence, scheduler::MaySchedule,
};
/// Represents the in-memory description of a Node.
///
@@ -202,7 +204,7 @@ impl Node {
cancel: &CancellationToken,
) -> Option<mgmt_api::Result<T>>
where
O: FnMut(mgmt_api::Client) -> F,
O: FnMut(PageserverClient) -> F,
F: std::future::Future<Output = mgmt_api::Result<T>>,
{
fn is_fatal(e: &mgmt_api::Error) -> bool {
@@ -224,8 +226,12 @@ impl Node {
.build()
.expect("Failed to construct HTTP client");
let client =
mgmt_api::Client::from_client(http_client, self.base_url(), jwt.as_deref());
let client = PageserverClient::from_client(
self.get_id(),
http_client,
self.base_url(),
jwt.as_deref(),
);
let node_cancel_fut = self.cancel.cancelled();

View File

@@ -0,0 +1,203 @@
use pageserver_api::{
models::{
LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress,
TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
},
shard::TenantShardId,
};
use pageserver_client::mgmt_api::{Client, Result};
use reqwest::StatusCode;
use utils::id::{NodeId, TimelineId};
/// Thin wrapper around [`pageserver_client::mgmt_api::Client`]. It allows the storage
/// controller to collect metrics in a non-intrusive manner.
#[derive(Debug, Clone)]
pub(crate) struct PageserverClient {
inner: Client,
node_id_label: String,
}
macro_rules! measured_request {
($name:literal, $method:expr, $node_id: expr, $invoke:expr) => {{
let labels = crate::metrics::PageserverRequestLabelGroup {
pageserver_id: $node_id,
path: $name,
method: $method,
};
let latency = &crate::metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_pageserver_request_latency;
let _timer_guard = latency.start_timer(labels.clone());
let res = $invoke;
if res.is_err() {
let error_counters = &crate::metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_pageserver_request_error;
error_counters.inc(labels)
}
res
}};
}
impl PageserverClient {
pub(crate) fn new(node_id: NodeId, mgmt_api_endpoint: String, jwt: Option<&str>) -> Self {
Self {
inner: Client::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt),
node_id_label: node_id.0.to_string(),
}
}
pub(crate) fn from_client(
node_id: NodeId,
raw_client: reqwest::Client,
mgmt_api_endpoint: String,
jwt: Option<&str>,
) -> Self {
Self {
inner: Client::from_client(raw_client, mgmt_api_endpoint, jwt),
node_id_label: node_id.0.to_string(),
}
}
pub(crate) async fn tenant_delete(&self, tenant_shard_id: TenantShardId) -> Result<StatusCode> {
measured_request!(
"tenant",
crate::metrics::Method::Delete,
&self.node_id_label,
self.inner.tenant_delete(tenant_shard_id).await
)
}
pub(crate) async fn tenant_time_travel_remote_storage(
&self,
tenant_shard_id: TenantShardId,
timestamp: &str,
done_if_after: &str,
) -> Result<()> {
measured_request!(
"tenant_time_travel_remote_storage",
crate::metrics::Method::Put,
&self.node_id_label,
self.inner
.tenant_time_travel_remote_storage(tenant_shard_id, timestamp, done_if_after)
.await
)
}
pub(crate) async fn tenant_secondary_download(
&self,
tenant_id: TenantShardId,
wait: Option<std::time::Duration>,
) -> Result<(StatusCode, SecondaryProgress)> {
measured_request!(
"tenant_secondary_download",
crate::metrics::Method::Post,
&self.node_id_label,
self.inner.tenant_secondary_download(tenant_id, wait).await
)
}
pub(crate) async fn location_config(
&self,
tenant_shard_id: TenantShardId,
config: LocationConfig,
flush_ms: Option<std::time::Duration>,
lazy: bool,
) -> Result<()> {
measured_request!(
"location_config",
crate::metrics::Method::Put,
&self.node_id_label,
self.inner
.location_config(tenant_shard_id, config, flush_ms, lazy)
.await
)
}
pub(crate) async fn list_location_config(&self) -> Result<LocationConfigListResponse> {
measured_request!(
"location_configs",
crate::metrics::Method::Get,
&self.node_id_label,
self.inner.list_location_config().await
)
}
pub(crate) async fn get_location_config(
&self,
tenant_shard_id: TenantShardId,
) -> Result<Option<LocationConfig>> {
measured_request!(
"location_config",
crate::metrics::Method::Get,
&self.node_id_label,
self.inner.get_location_config(tenant_shard_id).await
)
}
pub(crate) async fn timeline_create(
&self,
tenant_shard_id: TenantShardId,
req: &TimelineCreateRequest,
) -> Result<TimelineInfo> {
measured_request!(
"timeline",
crate::metrics::Method::Post,
&self.node_id_label,
self.inner.timeline_create(tenant_shard_id, req).await
)
}
pub(crate) async fn timeline_delete(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
) -> Result<StatusCode> {
measured_request!(
"timeline",
crate::metrics::Method::Delete,
&self.node_id_label,
self.inner
.timeline_delete(tenant_shard_id, timeline_id)
.await
)
}
pub(crate) async fn tenant_shard_split(
&self,
tenant_shard_id: TenantShardId,
req: TenantShardSplitRequest,
) -> Result<TenantShardSplitResponse> {
measured_request!(
"tenant_shard_split",
crate::metrics::Method::Put,
&self.node_id_label,
self.inner.tenant_shard_split(tenant_shard_id, req).await
)
}
pub(crate) async fn timeline_list(
&self,
tenant_shard_id: &TenantShardId,
) -> Result<Vec<TimelineInfo>> {
measured_request!(
"timelines",
crate::metrics::Method::Get,
&self.node_id_label,
self.inner.timeline_list(tenant_shard_id).await
)
}
pub(crate) async fn get_utilization(&self) -> Result<PageserverUtilization> {
measured_request!(
"utilization",
crate::metrics::Method::Get,
&self.node_id_label,
self.inner.get_utilization().await
)
}
}

View File

@@ -19,6 +19,9 @@ use serde::{Deserialize, Serialize};
use utils::generation::Generation;
use utils::id::{NodeId, TenantId};
use crate::metrics::{
DatabaseQueryErrorLabelGroup, DatabaseQueryLatencyLabelGroup, METRICS_REGISTRY,
};
use crate::node::Node;
/// ## What do we store?
@@ -75,6 +78,25 @@ pub(crate) enum DatabaseError {
Logical(String),
}
#[derive(measured::FixedCardinalityLabel, Clone)]
pub(crate) enum DatabaseOperation {
InsertNode,
UpdateNode,
DeleteNode,
ListNodes,
BeginShardSplit,
CompleteShardSplit,
AbortShardSplit,
Detach,
ReAttach,
IncrementGeneration,
ListTenantShards,
InsertTenantShards,
UpdateTenantShard,
DeleteTenant,
UpdateTenantConfig,
}
#[must_use]
pub(crate) enum AbortShardSplitStatus {
/// We aborted the split in the database by reverting to the parent shards
@@ -115,6 +137,34 @@ impl Persistence {
}
}
/// Wraps `with_conn` in order to collect latency and error metrics
async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R>
where
F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
R: Send + 'static,
{
let latency = &METRICS_REGISTRY
.metrics_group
.storage_controller_database_query_latency;
let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup {
operation: op.clone(),
});
let res = self.with_conn(func).await;
if let Err(err) = &res {
let error_counter = &METRICS_REGISTRY
.metrics_group
.storage_controller_database_query_error;
error_counter.inc(DatabaseQueryErrorLabelGroup {
error_type: err.error_label(),
operation: op,
})
}
res
}
/// Call the provided function in a tokio blocking thread, with a Diesel database connection.
async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R>
where
@@ -130,21 +180,27 @@ impl Persistence {
/// When a node is first registered, persist it before using it for anything
pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> {
let np = node.to_persistent();
self.with_conn(move |conn| -> DatabaseResult<()> {
diesel::insert_into(crate::schema::nodes::table)
.values(&np)
.execute(conn)?;
Ok(())
})
self.with_measured_conn(
DatabaseOperation::InsertNode,
move |conn| -> DatabaseResult<()> {
diesel::insert_into(crate::schema::nodes::table)
.values(&np)
.execute(conn)?;
Ok(())
},
)
.await
}
/// At startup, populate the list of nodes which our shards may be placed on
pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<NodePersistence>> {
let nodes: Vec<NodePersistence> = self
.with_conn(move |conn| -> DatabaseResult<_> {
Ok(crate::schema::nodes::table.load::<NodePersistence>(conn)?)
})
.with_measured_conn(
DatabaseOperation::ListNodes,
move |conn| -> DatabaseResult<_> {
Ok(crate::schema::nodes::table.load::<NodePersistence>(conn)?)
},
)
.await?;
tracing::info!("list_nodes: loaded {} nodes", nodes.len());
@@ -159,7 +215,7 @@ impl Persistence {
) -> DatabaseResult<()> {
use crate::schema::nodes::dsl::*;
let updated = self
.with_conn(move |conn| {
.with_measured_conn(DatabaseOperation::UpdateNode, move |conn| {
let updated = diesel::update(nodes)
.filter(node_id.eq(input_node_id.0 as i64))
.set((scheduling_policy.eq(String::from(input_scheduling)),))
@@ -181,9 +237,12 @@ impl Persistence {
/// be enriched at runtime with state discovered on pageservers.
pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
let loaded = self
.with_conn(move |conn| -> DatabaseResult<_> {
Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
})
.with_measured_conn(
DatabaseOperation::ListTenantShards,
move |conn| -> DatabaseResult<_> {
Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
},
)
.await?;
if loaded.is_empty() {
@@ -211,15 +270,10 @@ impl Persistence {
let mut decoded = serde_json::from_slice::<JsonPersistence>(&bytes)
.map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?;
for (tenant_id, tenant) in &mut decoded.tenants {
// Backward compat: an old attachments.json from before PR #6251, replace
// empty strings with proper defaults.
if tenant.tenant_id.is_empty() {
tenant.tenant_id = tenant_id.to_string();
tenant.config = serde_json::to_string(&TenantConfig::default())
.map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
tenant.placement_policy = serde_json::to_string(&PlacementPolicy::Single)
.map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
for shard in decoded.tenants.values_mut() {
if shard.placement_policy == "\"Single\"" {
// Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165
shard.placement_policy = "{\"Attached\":0}".to_string();
}
}
@@ -265,17 +319,20 @@ impl Persistence {
shards: Vec<TenantShardPersistence>,
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_conn(move |conn| -> DatabaseResult<()> {
conn.transaction(|conn| -> QueryResult<()> {
for tenant in &shards {
diesel::insert_into(tenant_shards)
.values(tenant)
.execute(conn)?;
}
self.with_measured_conn(
DatabaseOperation::InsertTenantShards,
move |conn| -> DatabaseResult<()> {
conn.transaction(|conn| -> QueryResult<()> {
for tenant in &shards {
diesel::insert_into(tenant_shards)
.values(tenant)
.execute(conn)?;
}
Ok(())
})?;
Ok(())
})?;
Ok(())
})
},
)
.await
}
@@ -283,25 +340,31 @@ impl Persistence {
/// the tenant from memory on this server.
pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_conn(move |conn| -> DatabaseResult<()> {
diesel::delete(tenant_shards)
.filter(tenant_id.eq(del_tenant_id.to_string()))
.execute(conn)?;
self.with_measured_conn(
DatabaseOperation::DeleteTenant,
move |conn| -> DatabaseResult<()> {
diesel::delete(tenant_shards)
.filter(tenant_id.eq(del_tenant_id.to_string()))
.execute(conn)?;
Ok(())
})
Ok(())
},
)
.await
}
pub(crate) async fn delete_node(&self, del_node_id: NodeId) -> DatabaseResult<()> {
use crate::schema::nodes::dsl::*;
self.with_conn(move |conn| -> DatabaseResult<()> {
diesel::delete(nodes)
.filter(node_id.eq(del_node_id.0 as i64))
.execute(conn)?;
self.with_measured_conn(
DatabaseOperation::DeleteNode,
move |conn| -> DatabaseResult<()> {
diesel::delete(nodes)
.filter(node_id.eq(del_node_id.0 as i64))
.execute(conn)?;
Ok(())
})
Ok(())
},
)
.await
}
@@ -315,7 +378,7 @@ impl Persistence {
) -> DatabaseResult<HashMap<TenantShardId, Generation>> {
use crate::schema::tenant_shards::dsl::*;
let updated = self
.with_conn(move |conn| {
.with_measured_conn(DatabaseOperation::ReAttach, move |conn| {
let rows_updated = diesel::update(tenant_shards)
.filter(generation_pageserver.eq(node_id.0 as i64))
.set(generation.eq(generation + 1))
@@ -365,7 +428,7 @@ impl Persistence {
) -> anyhow::Result<Generation> {
use crate::schema::tenant_shards::dsl::*;
let updated = self
.with_conn(move |conn| {
.with_measured_conn(DatabaseOperation::IncrementGeneration, move |conn| {
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
@@ -409,7 +472,7 @@ impl Persistence {
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_conn(move |conn| {
self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| {
let query = diesel::update(tenant_shards)
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
@@ -450,7 +513,7 @@ impl Persistence {
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_conn(move |conn| {
self.with_measured_conn(DatabaseOperation::UpdateTenantConfig, move |conn| {
diesel::update(tenant_shards)
.filter(tenant_id.eq(input_tenant_id.to_string()))
.set((config.eq(serde_json::to_string(&input_config).unwrap()),))
@@ -465,7 +528,7 @@ impl Persistence {
pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_conn(move |conn| {
self.with_measured_conn(DatabaseOperation::Detach, move |conn| {
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
@@ -495,7 +558,7 @@ impl Persistence {
parent_to_children: Vec<(TenantShardId, Vec<TenantShardPersistence>)>,
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_conn(move |conn| -> DatabaseResult<()> {
self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| -> DatabaseResult<()> {
conn.transaction(|conn| -> DatabaseResult<()> {
// Mark parent shards as splitting
@@ -559,26 +622,29 @@ impl Persistence {
old_shard_count: ShardCount,
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_conn(move |conn| -> DatabaseResult<()> {
conn.transaction(|conn| -> QueryResult<()> {
// Drop parent shards
diesel::delete(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.filter(shard_count.eq(old_shard_count.literal() as i32))
.execute(conn)?;
self.with_measured_conn(
DatabaseOperation::CompleteShardSplit,
move |conn| -> DatabaseResult<()> {
conn.transaction(|conn| -> QueryResult<()> {
// Drop parent shards
diesel::delete(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.filter(shard_count.eq(old_shard_count.literal() as i32))
.execute(conn)?;
// Clear sharding flag
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.set((splitting.eq(0),))
.execute(conn)?;
debug_assert!(updated > 0);
// Clear sharding flag
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.set((splitting.eq(0),))
.execute(conn)?;
debug_assert!(updated > 0);
Ok(())
})?;
Ok(())
})?;
Ok(())
})
},
)
.await
}
@@ -590,40 +656,44 @@ impl Persistence {
new_shard_count: ShardCount,
) -> DatabaseResult<AbortShardSplitStatus> {
use crate::schema::tenant_shards::dsl::*;
self.with_conn(move |conn| -> DatabaseResult<AbortShardSplitStatus> {
let aborted = conn.transaction(|conn| -> DatabaseResult<AbortShardSplitStatus> {
// Clear the splitting state on parent shards
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.filter(shard_count.ne(new_shard_count.literal() as i32))
.set((splitting.eq(0),))
.execute(conn)?;
self.with_measured_conn(
DatabaseOperation::AbortShardSplit,
move |conn| -> DatabaseResult<AbortShardSplitStatus> {
let aborted =
conn.transaction(|conn| -> DatabaseResult<AbortShardSplitStatus> {
// Clear the splitting state on parent shards
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.filter(shard_count.ne(new_shard_count.literal() as i32))
.set((splitting.eq(0),))
.execute(conn)?;
// Parent shards are already gone: we cannot abort.
if updated == 0 {
return Ok(AbortShardSplitStatus::Complete);
}
// Parent shards are already gone: we cannot abort.
if updated == 0 {
return Ok(AbortShardSplitStatus::Complete);
}
// Sanity check: if parent shards were present, their cardinality should
// be less than the number of child shards.
if updated >= new_shard_count.count() as usize {
return Err(DatabaseError::Logical(format!(
"Unexpected parent shard count {updated} while aborting split to \
// Sanity check: if parent shards were present, their cardinality should
// be less than the number of child shards.
if updated >= new_shard_count.count() as usize {
return Err(DatabaseError::Logical(format!(
"Unexpected parent shard count {updated} while aborting split to \
count {new_shard_count:?} on tenant {split_tenant_id}"
)));
}
)));
}
// Erase child shards
diesel::delete(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.filter(shard_count.eq(new_shard_count.literal() as i32))
.execute(conn)?;
// Erase child shards
diesel::delete(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.filter(shard_count.eq(new_shard_count.literal() as i32))
.execute(conn)?;
Ok(AbortShardSplitStatus::Aborted)
})?;
Ok(AbortShardSplitStatus::Aborted)
})?;
Ok(aborted)
})
Ok(aborted)
},
)
.await
}
}

View File

@@ -1,3 +1,4 @@
use crate::pageserver_client::PageserverClient;
use crate::persistence::Persistence;
use crate::service;
use hyper::StatusCode;
@@ -117,6 +118,15 @@ impl Reconciler {
flush_ms: Option<Duration>,
lazy: bool,
) -> Result<(), ReconcileError> {
if !node.is_available() && config.mode == LocationConfigMode::Detached {
// Attempts to detach from offline nodes may be imitated without doing I/O: a node which is offline
// will get fully reconciled wrt the shard's intent state when it is reactivated, irrespective of
// what we put into `observed`, in [`crate::service::Service::node_activate_reconcile`]
tracing::info!("Node {node} is unavailable during detach: proceeding anyway, it will be detached on next activation");
self.observed.locations.remove(&node.get_id());
return Ok(());
}
self.observed
.locations
.insert(node.get_id(), ObservedStateLocation { conf: None });
@@ -149,9 +159,16 @@ impl Reconciler {
};
tracing::info!("location_config({node}) complete: {:?}", config);
self.observed
.locations
.insert(node.get_id(), ObservedStateLocation { conf: Some(config) });
match config.mode {
LocationConfigMode::Detached => {
self.observed.locations.remove(&node.get_id());
}
_ => {
self.observed
.locations
.insert(node.get_id(), ObservedStateLocation { conf: Some(config) });
}
}
Ok(())
}
@@ -243,8 +260,11 @@ impl Reconciler {
tenant_shard_id: TenantShardId,
node: &Node,
) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
let client =
mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());
let client = PageserverClient::new(
node.get_id(),
node.base_url(),
self.service_config.jwt_token.as_deref(),
);
let timelines = client.timeline_list(&tenant_shard_id).await?;
Ok(timelines
@@ -475,7 +495,7 @@ impl Reconciler {
}
}
// Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Single, then
// Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Attached(0), then
// this location will be deleted in the general case reconciliation that runs after this.
let origin_secondary_conf = build_location_config(
&self.shard,

View File

@@ -7,7 +7,9 @@ use std::{
time::{Duration, Instant},
};
use crate::{id_lock_map::IdLockMap, persistence::AbortShardSplitStatus};
use crate::{
id_lock_map::IdLockMap, persistence::AbortShardSplitStatus, reconciler::ReconcileError,
};
use anyhow::Context;
use control_plane::storage_controller::{
AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse,
@@ -18,12 +20,14 @@ use hyper::StatusCode;
use pageserver_api::{
controller_api::{
NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse,
TenantShardMigrateRequest, TenantShardMigrateResponse, UtilizationScore,
TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse,
TenantDescribeResponseShard, TenantLocateResponse, TenantShardMigrateRequest,
TenantShardMigrateResponse, UtilizationScore,
},
models::{SecondaryProgress, TenantConfigRequest},
};
use crate::pageserver_client::PageserverClient;
use pageserver_api::{
models::{
self, LocationConfig, LocationConfigListResponse, LocationConfigMode,
@@ -200,6 +204,30 @@ enum TenantCreateOrUpdate {
Update(Vec<ShardUpdate>),
}
struct ShardSplitParams {
old_shard_count: ShardCount,
new_shard_count: ShardCount,
new_stripe_size: Option<ShardStripeSize>,
targets: Vec<ShardSplitTarget>,
policy: PlacementPolicy,
config: TenantConfig,
shard_ident: ShardIdentity,
}
// When preparing for a shard split, we may either choose to proceed with the split,
// or find that the work is already done and return NoOp.
enum ShardSplitAction {
Split(ShardSplitParams),
NoOp(TenantShardSplitResponse),
}
// A parent shard which will be split
struct ShardSplitTarget {
parent_id: TenantShardId,
node: Node,
child_ids: Vec<TenantShardId>,
}
/// When we tenant shard split operation fails, we may not be able to clean up immediately, because nodes
/// might not be available. We therefore use a queue of abort operations processed in the background.
struct TenantShardSplitAbort {
@@ -525,7 +553,11 @@ impl Service {
break;
}
let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
let client = PageserverClient::new(
node.get_id(),
node.base_url(),
self.config.jwt_token.as_deref(),
);
match client
.location_config(
tenant_shard_id,
@@ -733,7 +765,19 @@ impl Service {
tenant.waiter.advance(result.sequence);
}
Err(e) => {
tracing::warn!("Reconcile error: {}", e);
match e {
ReconcileError::Cancel => {
tracing::info!("Reconciler was cancelled");
}
ReconcileError::Remote(mgmt_api::Error::Cancelled) => {
// This might be due to the reconciler getting cancelled, or it might
// be due to the `Node` being marked offline.
tracing::info!("Reconciler cancelled during pageserver API call");
}
_ => {
tracing::warn!("Reconcile error: {}", e);
}
}
// Ordering: populate last_error before advancing error_seq,
// so that waiters will see the correct error after waiting.
@@ -1057,7 +1101,7 @@ impl Service {
shard_stripe_size: 0,
generation: Some(0),
generation_pageserver: None,
placement_policy: serde_json::to_string(&PlacementPolicy::Single).unwrap(),
placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(),
config: serde_json::to_string(&TenantConfig::default()).unwrap(),
splitting: SplitState::default(),
};
@@ -1084,7 +1128,7 @@ impl Service {
TenantState::new(
attach_req.tenant_shard_id,
ShardIdentity::unsharded(),
PlacementPolicy::Single,
PlacementPolicy::Attached(0),
),
);
tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id);
@@ -1113,7 +1157,7 @@ impl Service {
self.persistence
.update_tenant_shard(
attach_req.tenant_shard_id,
PlacementPolicy::Single,
PlacementPolicy::Attached(0),
conf,
None,
)
@@ -1138,7 +1182,7 @@ impl Service {
if let Some(new_generation) = new_generation {
tenant_state.generation = Some(new_generation);
tenant_state.policy = PlacementPolicy::Single;
tenant_state.policy = PlacementPolicy::Attached(0);
} else {
// This is a detach notification. We must update placement policy to avoid re-attaching
// during background scheduling/reconciliation, or during storage controller restart.
@@ -1350,7 +1394,8 @@ impl Service {
incremented_generations.len()
);
// Apply the updated generation to our in-memory state
// Apply the updated generation to our in-memory state, and
// gather discover secondary locations.
let mut locked = self.inner.write().unwrap();
let (nodes, tenants, scheduler) = locked.parts_mut();
@@ -1358,62 +1403,65 @@ impl Service {
tenants: Vec::new(),
};
for (tenant_shard_id, new_gen) in incremented_generations {
response.tenants.push(ReAttachResponseTenant {
id: tenant_shard_id,
gen: new_gen.into().unwrap(),
});
// Apply the new generation number to our in-memory state
let shard_state = tenants.get_mut(&tenant_shard_id);
let Some(shard_state) = shard_state else {
// Not fatal. This edge case requires a re-attach to happen
// between inserting a new tenant shard in to the database, and updating our in-memory
// state to know about the shard, _and_ that the state inserted to the database referenced
// a pageserver. Should never happen, but handle it rather than panicking, since it should
// be harmless.
tracing::error!(
"Shard {} is in database for node {} but not in-memory state",
tenant_shard_id,
reattach_req.node_id
);
continue;
};
// TODO: cancel/restart any running reconciliation for this tenant, it might be trying
// to call location_conf API with an old generation. Wait for cancellation to complete
// before responding to this request. Requires well implemented CancellationToken logic
// all the way to where we call location_conf. Even then, there can still be a location_conf
// request in flight over the network: TODO handle that by making location_conf API refuse
// to go backward in generations.
// If [`Persistence::re_attach`] selected this shard, it must have alread
// had a generation set.
debug_assert!(shard_state.generation.is_some());
let Some(old_gen) = shard_state.generation else {
// Should never happen: would only return incremented generation
// for a tenant that already had a non-null generation.
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"Generation must be set while re-attaching"
)));
};
shard_state.generation = Some(std::cmp::max(old_gen, new_gen));
if let Some(observed) = shard_state
.observed
.locations
.get_mut(&reattach_req.node_id)
{
if let Some(conf) = observed.conf.as_mut() {
conf.generation = new_gen.into();
// Scan through all shards, applying updates for ones where we updated generation
// and identifying shards that intend to have a secondary location on this node.
for (tenant_shard_id, shard) in tenants {
if let Some(new_gen) = incremented_generations.get(tenant_shard_id) {
let new_gen = *new_gen;
response.tenants.push(ReAttachResponseTenant {
id: *tenant_shard_id,
gen: Some(new_gen.into().unwrap()),
// A tenant is only put into multi or stale modes in the middle of a [`Reconciler::live_migrate`]
// execution. If a pageserver is restarted during that process, then the reconcile pass will
// fail, and start from scratch, so it doesn't make sense for us to try and preserve
// the stale/multi states at this point.
mode: LocationConfigMode::AttachedSingle,
});
shard.generation = std::cmp::max(shard.generation, Some(new_gen));
if let Some(observed) = shard.observed.locations.get_mut(&reattach_req.node_id) {
// Why can we update `observed` even though we're not sure our response will be received
// by the pageserver? Because the pageserver will not proceed with startup until
// it has processed response: if it loses it, we'll see another request and increment
// generation again, avoiding any uncertainty about dirtiness of tenant's state.
if let Some(conf) = observed.conf.as_mut() {
conf.generation = new_gen.into();
}
} else {
// This node has no observed state for the shard: perhaps it was offline
// when the pageserver restarted. Insert a None, so that the Reconciler
// will be prompted to learn the location's state before it makes changes.
shard
.observed
.locations
.insert(reattach_req.node_id, ObservedStateLocation { conf: None });
}
} else {
// This node has no observed state for the shard: perhaps it was offline
// when the pageserver restarted. Insert a None, so that the Reconciler
// will be prompted to learn the location's state before it makes changes.
shard_state
.observed
.locations
.insert(reattach_req.node_id, ObservedStateLocation { conf: None });
}
} else if shard.intent.get_secondary().contains(&reattach_req.node_id) {
// Ordering: pageserver will not accept /location_config requests until it has
// finished processing the response from re-attach. So we can update our in-memory state
// now, and be confident that we are not stamping on the result of some later location config.
// TODO: however, we are not strictly ordered wrt ReconcileResults queue,
// so we might update observed state here, and then get over-written by some racing
// ReconcileResult. The impact is low however, since we have set state on pageserver something
// that matches intent, so worst case if we race then we end up doing a spurious reconcile.
// TODO: cancel/restart any running reconciliation for this tenant, it might be trying
// to call location_conf API with an old generation. Wait for cancellation to complete
// before responding to this request. Requires well implemented CancellationToken logic
// all the way to where we call location_conf. Even then, there can still be a location_conf
// request in flight over the network: TODO handle that by making location_conf API refuse
// to go backward in generations.
response.tenants.push(ReAttachResponseTenant {
id: *tenant_shard_id,
gen: None,
mode: LocationConfigMode::Secondary,
});
// We must not update observed, because we have no guarantee that our
// response will be received by the pageserver. This could leave it
// falsely dirty, but the resulting reconcile should be idempotent.
}
}
// We consider a node Active once we have composed a re-attach response, but we
@@ -1491,11 +1539,11 @@ impl Service {
&self,
create_req: TenantCreateRequest,
) -> Result<(TenantCreateResponse, Vec<ReconcilerWaiter>), ApiError> {
// As a default, single is convenient for tests that don't choose a policy.
let placement_policy = create_req
.placement_policy
.clone()
.unwrap_or(PlacementPolicy::Single);
// As a default, zero secondaries is convenient for tests that don't choose a policy.
.unwrap_or(PlacementPolicy::Attached(0));
// This service expects to handle sharding itself: it is an error to try and directly create
// a particular shard here.
@@ -1705,11 +1753,11 @@ impl Service {
| LocationConfigMode::AttachedSingle
| LocationConfigMode::AttachedStale => {
if nodes.len() > 1 {
PlacementPolicy::Double(1)
PlacementPolicy::Attached(1)
} else {
// Convenience for dev/test: if we just have one pageserver, import
// tenants into Single mode so that scheduling will succeed.
PlacementPolicy::Single
// tenants into non-HA mode so that scheduling will succeed.
PlacementPolicy::Attached(0)
}
}
};
@@ -2058,8 +2106,11 @@ impl Service {
})
.collect::<Vec<_>>();
for tenant_shard_id in shard_ids {
let client =
mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
let client = PageserverClient::new(
node.get_id(),
node.base_url(),
self.config.jwt_token.as_deref(),
);
tracing::info!("Doing time travel recovery for shard {tenant_shard_id}",);
@@ -2111,7 +2162,11 @@ impl Service {
// Issue concurrent requests to all shards' locations
let mut futs = FuturesUnordered::new();
for (tenant_shard_id, node) in targets {
let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
let client = PageserverClient::new(
node.get_id(),
node.base_url(),
self.config.jwt_token.as_deref(),
);
futs.push(async move {
let result = client
.tenant_secondary_download(tenant_shard_id, wait)
@@ -2204,7 +2259,11 @@ impl Service {
// Phase 1: delete on the pageservers
let mut any_pending = false;
for (tenant_shard_id, node) in targets {
let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
let client = PageserverClient::new(
node.get_id(),
node.base_url(),
self.config.jwt_token.as_deref(),
);
// TODO: this, like many other places, requires proper retry handling for 503, timeout: those should not
// surface immediately as an error to our caller.
let status = client.tenant_delete(tenant_shard_id).await.map_err(|e| {
@@ -2316,7 +2375,7 @@ impl Service {
tenant_shard_id,
create_req.new_timeline_id,
);
let client = mgmt_api::Client::new(node.base_url(), jwt.as_deref());
let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref());
client
.timeline_create(tenant_shard_id, &create_req)
@@ -2440,7 +2499,7 @@ impl Service {
"Deleting timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}",
);
let client = mgmt_api::Client::new(node.base_url(), jwt.as_deref());
let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref());
client
.timeline_delete(tenant_shard_id, timeline_id)
.await
@@ -2481,11 +2540,11 @@ impl Service {
}
/// When you need to send an HTTP request to the pageserver that holds shard0 of a tenant, this
/// function looks it up and returns the url. If the tenant isn't found, returns Err(ApiError::NotFound)
pub(crate) fn tenant_shard0_baseurl(
/// function looks up and returns node. If the tenant isn't found, returns Err(ApiError::NotFound)
pub(crate) fn tenant_shard0_node(
&self,
tenant_id: TenantId,
) -> Result<(String, TenantShardId), ApiError> {
) -> Result<(Node, TenantShardId), ApiError> {
let locked = self.inner.read().unwrap();
let Some((tenant_shard_id, shard)) = locked
.tenants
@@ -2517,7 +2576,7 @@ impl Service {
)));
};
Ok((node.base_url(), *tenant_shard_id))
Ok((node.clone(), *tenant_shard_id))
}
pub(crate) fn tenant_locate(
@@ -2527,9 +2586,6 @@ impl Service {
let locked = self.inner.read().unwrap();
tracing::info!("Locating shards for tenant {tenant_id}");
// Take a snapshot of pageservers
let pageservers = locked.nodes.clone();
let mut result = Vec::new();
let mut shard_params: Option<ShardParameters> = None;
@@ -2543,7 +2599,8 @@ impl Service {
"Cannot locate a tenant that is not attached"
)))?;
let node = pageservers
let node = locked
.nodes
.get(&node_id)
.expect("Pageservers may not be deleted while referenced");
@@ -2591,6 +2648,47 @@ impl Service {
})
}
pub(crate) fn tenant_describe(
&self,
tenant_id: TenantId,
) -> Result<TenantDescribeResponse, ApiError> {
let locked = self.inner.read().unwrap();
let mut shard_zero = None;
let mut shards = Vec::new();
for (tenant_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id))
{
if tenant_shard_id.is_zero() {
shard_zero = Some(shard);
}
let response_shard = TenantDescribeResponseShard {
tenant_shard_id: *tenant_shard_id,
node_attached: *shard.intent.get_attached(),
node_secondary: shard.intent.get_secondary().to_vec(),
last_error: shard.last_error.lock().unwrap().clone(),
is_reconciling: shard.reconciler.is_some(),
is_pending_compute_notification: shard.pending_compute_notification,
is_splitting: matches!(shard.splitting, SplitState::Splitting),
};
shards.push(response_shard);
}
let Some(shard_zero) = shard_zero else {
return Err(ApiError::NotFound(
anyhow::anyhow!("Tenant {tenant_id} not found").into(),
));
};
Ok(TenantDescribeResponse {
shards,
stripe_size: shard_zero.shard.stripe_size,
policy: shard_zero.policy.clone(),
config: shard_zero.config.clone(),
})
}
#[instrument(skip_all, fields(tenant_id=%op.tenant_id))]
async fn abort_tenant_shard_split(
&self,
@@ -2648,7 +2746,7 @@ impl Service {
let detach_locations: Vec<(Node, TenantShardId)> = {
let mut detach_locations = Vec::new();
let mut locked = self.inner.write().unwrap();
let (nodes, tenants, _scheduler) = locked.parts_mut();
let (nodes, tenants, scheduler) = locked.parts_mut();
for (tenant_shard_id, shard) in
tenants.range_mut(TenantShardId::tenant_range(op.tenant_id))
@@ -2681,6 +2779,13 @@ impl Service {
tracing::info!("Restoring parent shard {tenant_shard_id}");
shard.splitting = SplitState::Idle;
if let Err(e) = shard.schedule(scheduler) {
// If this shard can't be scheduled now (perhaps due to offline nodes or
// capacity issues), that must not prevent us rolling back a split. In this
// case it should be eventually scheduled in the background.
tracing::warn!("Failed to schedule {tenant_shard_id} during shard abort: {e}")
}
self.maybe_reconcile_shard(shard, nodes);
}
@@ -2772,7 +2877,7 @@ impl Service {
.map(|(shard_id, _)| *shard_id)
.collect::<Vec<_>>();
let (_nodes, tenants, scheduler) = locked.parts_mut();
let (nodes, tenants, scheduler) = locked.parts_mut();
for parent_id in parent_ids {
let child_ids = parent_id.split(new_shard_count);
@@ -2814,7 +2919,7 @@ impl Service {
generation,
&child_shard,
&config,
matches!(policy, PlacementPolicy::Double(n) if n > 0),
matches!(policy, PlacementPolicy::Attached(n) if n > 0),
)),
},
);
@@ -2839,6 +2944,8 @@ impl Service {
// find a secondary (e.g. because cluster is overloaded).
tracing::warn!("Failed to schedule child shard {child}: {e}");
}
// In the background, attach secondary locations for the new shards
self.maybe_reconcile_shard(&mut child_state, nodes);
tenants.insert(child, child_state);
response.new_shards.push(child);
@@ -2861,17 +2968,23 @@ impl Service {
let new_shard_count = ShardCount::new(split_req.new_shard_count);
let new_stripe_size = split_req.new_stripe_size;
let r = self.do_tenant_shard_split(tenant_id, split_req).await;
// Validate the request and construct parameters. This phase is fallible, but does not require
// rollback on errors, as it does no I/O and mutates no state.
let shard_split_params = match self.prepare_tenant_shard_split(tenant_id, split_req)? {
ShardSplitAction::NoOp(resp) => return Ok(resp),
ShardSplitAction::Split(params) => params,
};
// Execute this split: this phase mutates state and does remote I/O on pageservers. If it fails,
// we must roll back.
let r = self
.do_tenant_shard_split(tenant_id, shard_split_params)
.await;
match r {
Ok(r) => Ok(r),
Err(ApiError::BadRequest(_)) => {
// A request validation error does not require rollback: we rejected it before we started making any changes: just
// return the error
r
}
Err(e) => {
// General case error handling: split might be part-done, we must do work to abort it.
// Split might be part-done, we must do work to abort it.
tracing::warn!("Enqueuing background abort of split on {tenant_id}");
self.abort_tx
.send(TenantShardSplitAbort {
@@ -2887,25 +3000,18 @@ impl Service {
}
}
pub(crate) async fn do_tenant_shard_split(
fn prepare_tenant_shard_split(
&self,
tenant_id: TenantId,
split_req: TenantShardSplitRequest,
) -> Result<TenantShardSplitResponse, ApiError> {
let mut policy = None;
let mut shard_ident = None;
// A parent shard which will be split
struct SplitTarget {
parent_id: TenantShardId,
node: Node,
child_ids: Vec<TenantShardId>,
}
) -> Result<ShardSplitAction, ApiError> {
fail::fail_point!("shard-split-validation", |_| Err(ApiError::BadRequest(
anyhow::anyhow!("failpoint")
)));
let mut policy = None;
let mut config = None;
let mut shard_ident = None;
// Validate input, and calculate which shards we will create
let (old_shard_count, targets) =
{
@@ -2961,6 +3067,9 @@ impl Service {
if shard_ident.is_none() {
shard_ident = Some(shard.shard);
}
if config.is_none() {
config = Some(shard.config.clone());
}
if tenant_shard_id.shard_count.count() == split_req.new_shard_count {
tracing::info!(
@@ -2979,9 +3088,7 @@ impl Service {
.get(&node_id)
.expect("Pageservers may not be deleted while referenced");
// TODO: if any reconciliation is currently in progress for this shard, wait for it.
targets.push(SplitTarget {
targets.push(ShardSplitTarget {
parent_id: *tenant_shard_id,
node: node.clone(),
child_ids: tenant_shard_id
@@ -2991,9 +3098,9 @@ impl Service {
if targets.is_empty() {
if children_found.len() == split_req.new_shard_count as usize {
return Ok(TenantShardSplitResponse {
return Ok(ShardSplitAction::NoOp(TenantShardSplitResponse {
new_shards: children_found,
});
}));
} else {
// No shards found to split, and no existing children found: the
// tenant doesn't exist at all.
@@ -3023,13 +3130,77 @@ impl Service {
shard_ident.unwrap()
};
let policy = policy.unwrap();
let config = config.unwrap();
Ok(ShardSplitAction::Split(ShardSplitParams {
old_shard_count,
new_shard_count: ShardCount::new(split_req.new_shard_count),
new_stripe_size: split_req.new_stripe_size,
targets,
policy,
config,
shard_ident,
}))
}
async fn do_tenant_shard_split(
&self,
tenant_id: TenantId,
params: ShardSplitParams,
) -> Result<TenantShardSplitResponse, ApiError> {
// FIXME: we have dropped self.inner lock, and not yet written anything to the database: another
// request could occur here, deleting or mutating the tenant. begin_shard_split checks that the
// parent shards exist as expected, but it would be neater to do the above pre-checks within the
// same database transaction rather than pre-check in-memory and then maybe-fail the database write.
// (https://github.com/neondatabase/neon/issues/6676)
let ShardSplitParams {
old_shard_count,
new_shard_count,
new_stripe_size,
mut targets,
policy,
config,
shard_ident,
} = params;
// Drop any secondary locations: pageservers do not support splitting these, and in any case the
// end-state for a split tenant will usually be to have secondary locations on different nodes.
// The reconciliation calls in this block also implicitly cancel+barrier wrt any ongoing reconciliation
// at the time of split.
let waiters = {
let mut locked = self.inner.write().unwrap();
let mut waiters = Vec::new();
let (nodes, tenants, scheduler) = locked.parts_mut();
for target in &mut targets {
let Some(shard) = tenants.get_mut(&target.parent_id) else {
// Paranoia check: this shouldn't happen: we have the oplock for this tenant ID.
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"Shard {} not found",
target.parent_id
)));
};
if shard.intent.get_attached() != &Some(target.node.get_id()) {
// Paranoia check: this shouldn't happen: we have the oplock for this tenant ID.
return Err(ApiError::Conflict(format!(
"Shard {} unexpectedly rescheduled during split",
target.parent_id
)));
}
// Irrespective of PlacementPolicy, clear secondary locations from intent
shard.intent.clear_secondary(scheduler);
// Run Reconciler to execute detach fo secondary locations.
if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) {
waiters.push(waiter);
}
}
waiters
};
self.await_waiters(waiters, RECONCILE_TIMEOUT).await?;
// Before creating any new child shards in memory or on the pageservers, persist them: this
// enables us to ensure that we will always be able to clean up if something goes wrong. This also
// acts as the protection against two concurrent attempts to split: one of them will get a database
@@ -3058,8 +3229,7 @@ impl Service {
generation: None,
generation_pageserver: Some(target.node.get_id().0 as i64),
placement_policy: serde_json::to_string(&policy).unwrap(),
// TODO: get the config out of the map
config: serde_json::to_string(&TenantConfig::default()).unwrap(),
config: serde_json::to_string(&config).unwrap(),
splitting: SplitState::Splitting,
});
}
@@ -3111,18 +3281,22 @@ impl Service {
// N>1 shards into M shards -- initially we're usually splitting 1 shard into N).
for target in &targets {
let SplitTarget {
let ShardSplitTarget {
parent_id,
node,
child_ids,
} = target;
let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
let client = PageserverClient::new(
node.get_id(),
node.base_url(),
self.config.jwt_token.as_deref(),
);
let response = client
.tenant_shard_split(
*parent_id,
TenantShardSplitRequest {
new_shard_count: split_req.new_shard_count,
new_stripe_size: split_req.new_stripe_size,
new_shard_count: new_shard_count.literal(),
new_stripe_size,
},
)
.await
@@ -3171,11 +3345,8 @@ impl Service {
));
// Replace all the shards we just split with their children: this phase is infallible.
let (response, child_locations) = self.tenant_shard_split_commit_inmem(
tenant_id,
ShardCount::new(split_req.new_shard_count),
split_req.new_stripe_size,
);
let (response, child_locations) =
self.tenant_shard_split_commit_inmem(tenant_id, new_shard_count, new_stripe_size);
// Send compute notifications for all the new shards
let mut failed_notifications = Vec::new();
@@ -3240,17 +3411,20 @@ impl Service {
let old_attached = *shard.intent.get_attached();
match shard.policy {
PlacementPolicy::Single => {
shard.intent.clear_secondary(scheduler);
shard.intent.set_attached(scheduler, Some(migrate_req.node_id));
}
PlacementPolicy::Double(_n) => {
PlacementPolicy::Attached(n) => {
// If our new attached node was a secondary, it no longer should be.
shard.intent.remove_secondary(scheduler, migrate_req.node_id);
// If we were already attached to something, demote that to a secondary
if let Some(old_attached) = old_attached {
shard.intent.push_secondary(scheduler, old_attached);
if n > 0 {
// Remove other secondaries to make room for the location we'll demote
while shard.intent.get_secondary().len() >= n {
shard.intent.pop_secondary(scheduler);
}
shard.intent.push_secondary(scheduler, old_attached);
}
}
shard.intent.set_attached(scheduler, Some(migrate_req.node_id));
@@ -3276,7 +3450,7 @@ impl Service {
if let Some(waiter) = waiter {
waiter.wait_timeout(RECONCILE_TIMEOUT).await?;
} else {
tracing::warn!("Migration is a no-op");
tracing::info!("Migration is a no-op");
}
Ok(TenantShardMigrateResponse {})
@@ -3631,6 +3805,13 @@ impl Service {
observed_loc.conf = None;
}
if new_nodes.len() == 1 {
// Special case for single-node cluster: there is no point trying to reschedule
// any tenant shards: avoid doing so, in order to avoid spewing warnings about
// failures to schedule them.
continue;
}
if tenant_state.intent.demote_attached(node_id) {
tenant_state.sequence = tenant_state.sequence.next();
match tenant_state.schedule(scheduler) {

View File

@@ -4,7 +4,10 @@ use std::{
time::Duration,
};
use crate::{metrics, persistence::TenantShardPersistence};
use crate::{
metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome},
persistence::TenantShardPersistence,
};
use pageserver_api::controller_api::PlacementPolicy;
use pageserver_api::{
models::{LocationConfig, LocationConfigMode, TenantConfig},
@@ -457,22 +460,7 @@ impl TenantState {
// Add/remove nodes to fulfil policy
use PlacementPolicy::*;
match self.policy {
Single => {
// Should have exactly one attached, and zero secondaries
if !self.intent.secondary.is_empty() {
self.intent.clear_secondary(scheduler);
modified = true;
}
let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?;
modified |= modified_attached;
if !self.intent.secondary.is_empty() {
self.intent.clear_secondary(scheduler);
modified = true;
}
}
Double(secondary_count) => {
Attached(secondary_count) => {
let retain_secondaries = if self.intent.attached.is_none()
&& scheduler.node_preferred(&self.intent.secondary).is_some()
{
@@ -733,7 +721,10 @@ impl TenantState {
let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq,
tenant_id=%reconciler.tenant_shard_id.tenant_id,
shard_id=%reconciler.tenant_shard_id.shard_slug());
metrics::RECONCILER.spawned.inc();
metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_reconcile_spawn
.inc();
let result_tx = result_tx.clone();
let join_handle = tokio::task::spawn(
async move {
@@ -751,10 +742,12 @@ impl TenantState {
// TODO: wrap all remote API operations in cancellation check
// as well.
if reconciler.cancel.is_cancelled() {
metrics::RECONCILER
.complete
.with_label_values(&[metrics::ReconcilerMetrics::CANCEL])
.inc();
metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_reconcile_complete
.inc(ReconcileCompleteLabelGroup {
status: ReconcileOutcome::Cancel,
});
return;
}
@@ -769,18 +762,18 @@ impl TenantState {
}
// Update result counter
match &result {
Ok(_) => metrics::RECONCILER
.complete
.with_label_values(&[metrics::ReconcilerMetrics::SUCCESS]),
Err(ReconcileError::Cancel) => metrics::RECONCILER
.complete
.with_label_values(&[metrics::ReconcilerMetrics::CANCEL]),
Err(_) => metrics::RECONCILER
.complete
.with_label_values(&[metrics::ReconcilerMetrics::ERROR]),
}
.inc();
let outcome_label = match &result {
Ok(_) => ReconcileOutcome::Success,
Err(ReconcileError::Cancel) => ReconcileOutcome::Cancel,
Err(_) => ReconcileOutcome::Error,
};
metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_reconcile_complete
.inc(ReconcileCompleteLabelGroup {
status: outcome_label,
});
result_tx
.send(ReconcileResult {
@@ -895,7 +888,7 @@ pub(crate) mod tests {
let mut scheduler = Scheduler::new(nodes.values());
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1));
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
tenant_state
.schedule(&mut scheduler)
.expect("we have enough nodes, scheduling should work");
@@ -943,7 +936,7 @@ pub(crate) mod tests {
let nodes = make_test_nodes(3);
let mut scheduler = Scheduler::new(nodes.values());
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1));
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
tenant_state.observed.locations.insert(
NodeId(3),

View File

@@ -437,7 +437,7 @@ async fn handle_tenant(
let placement_policy = match create_match.get_one::<String>("placement-policy") {
Some(s) if !s.is_empty() => serde_json::from_str::<PlacementPolicy>(s)?,
_ => PlacementPolicy::Single,
_ => PlacementPolicy::Attached(0),
};
let tenant_conf = PageServerNode::parse_config(tenant_conf)?;
@@ -523,88 +523,6 @@ async fn handle_tenant(
.with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
println!("tenant {tenant_id} successfully configured on the pageserver");
}
Some(("migrate", matches)) => {
let tenant_shard_id = get_tenant_shard_id(matches, env)?;
let new_pageserver = get_pageserver(env, matches)?;
let new_pageserver_id = new_pageserver.conf.id;
let storage_controller = StorageController::from_env(env);
storage_controller
.tenant_migrate(tenant_shard_id, new_pageserver_id)
.await?;
println!("tenant {tenant_shard_id} migrated to {}", new_pageserver_id);
}
Some(("status", matches)) => {
let tenant_id = get_tenant_id(matches, env)?;
let mut shard_table = comfy_table::Table::new();
shard_table.set_header(["Shard", "Pageserver", "Physical Size"]);
let mut tenant_synthetic_size = None;
let storage_controller = StorageController::from_env(env);
for shard in storage_controller.tenant_locate(tenant_id).await?.shards {
let pageserver =
PageServerNode::from_env(env, env.get_pageserver_conf(shard.node_id)?);
let size = pageserver
.http_client
.tenant_details(shard.shard_id)
.await?
.tenant_info
.current_physical_size
.unwrap();
shard_table.add_row([
format!("{}", shard.shard_id.shard_slug()),
format!("{}", shard.node_id.0),
format!("{} MiB", size / (1024 * 1024)),
]);
if shard.shard_id.is_zero() {
tenant_synthetic_size =
Some(pageserver.tenant_synthetic_size(shard.shard_id).await?);
}
}
let Some(synthetic_size) = tenant_synthetic_size else {
bail!("Shard 0 not found")
};
let mut tenant_table = comfy_table::Table::new();
tenant_table.add_row(["Tenant ID".to_string(), tenant_id.to_string()]);
tenant_table.add_row([
"Synthetic size".to_string(),
format!("{} MiB", synthetic_size.size.unwrap_or(0) / (1024 * 1024)),
]);
println!("{tenant_table}");
println!("{shard_table}");
}
Some(("shard-split", matches)) => {
let tenant_id = get_tenant_id(matches, env)?;
let shard_count: u8 = matches.get_one::<u8>("shard-count").cloned().unwrap_or(0);
let shard_stripe_size: Option<ShardStripeSize> = matches
.get_one::<Option<ShardStripeSize>>("shard-stripe-size")
.cloned()
.unwrap();
let storage_controller = StorageController::from_env(env);
let result = storage_controller
.tenant_split(tenant_id, shard_count, shard_stripe_size)
.await?;
println!(
"Split tenant {} into shards {}",
tenant_id,
result
.new_shards
.iter()
.map(|s| format!("{:?}", s))
.collect::<Vec<_>>()
.join(",")
);
}
Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
None => bail!("no tenant subcommand provided"),
@@ -1578,19 +1496,6 @@ fn cli() -> Command {
.subcommand(Command::new("config")
.arg(tenant_id_arg.clone())
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
.subcommand(Command::new("migrate")
.about("Migrate a tenant from one pageserver to another")
.arg(tenant_id_arg.clone())
.arg(pageserver_id_arg.clone()))
.subcommand(Command::new("status")
.about("Human readable summary of the tenant's shards and attachment locations")
.arg(tenant_id_arg.clone()))
.subcommand(Command::new("shard-split")
.about("Increase the number of shards in the tenant")
.arg(tenant_id_arg.clone())
.arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
.arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages"))
)
)
.subcommand(
Command::new("pageserver")

View File

@@ -127,8 +127,8 @@ pub struct PageServerConf {
pub pg_auth_type: AuthType,
pub http_auth_type: AuthType,
pub(crate) virtual_file_io_engine: String,
pub(crate) get_vectored_impl: String,
pub(crate) virtual_file_io_engine: Option<String>,
pub(crate) get_vectored_impl: Option<String>,
}
impl Default for PageServerConf {
@@ -139,9 +139,8 @@ impl Default for PageServerConf {
listen_http_addr: String::new(),
pg_auth_type: AuthType::Trust,
http_auth_type: AuthType::Trust,
// FIXME: use the ones exposed by pageserver crate
virtual_file_io_engine: "tokio-epoll-uring".to_owned(),
get_vectored_impl: "sequential".to_owned(),
virtual_file_io_engine: None,
get_vectored_impl: None,
}
}
}

View File

@@ -101,8 +101,16 @@ impl PageServerNode {
let pg_auth_type_param = format!("pg_auth_type='{}'", pg_auth_type);
let listen_pg_addr_param = format!("listen_pg_addr='{}'", listen_pg_addr);
let virtual_file_io_engine = format!("virtual_file_io_engine='{virtual_file_io_engine}'");
let get_vectored_impl = format!("get_vectored_impl='{get_vectored_impl}'");
let virtual_file_io_engine = if let Some(virtual_file_io_engine) = virtual_file_io_engine {
format!("virtual_file_io_engine='{virtual_file_io_engine}'")
} else {
String::new()
};
let get_vectored_impl = if let Some(get_vectored_impl) = get_vectored_impl {
format!("get_vectored_impl='{get_vectored_impl}'")
} else {
String::new()
};
let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());

View File

@@ -475,7 +475,7 @@ impl StorageController {
pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
self.dispatch::<(), _>(
Method::GET,
format!("control/v1/tenant/{tenant_id}/locate"),
format!("debug/v1/tenant/{tenant_id}/locate"),
None,
)
.await

View File

@@ -6,7 +6,10 @@ use std::str::FromStr;
use serde::{Deserialize, Serialize};
use utils::id::NodeId;
use crate::{models::ShardParameters, shard::TenantShardId};
use crate::{
models::{ShardParameters, TenantConfig},
shard::{ShardStripeSize, TenantShardId},
};
#[derive(Serialize, Deserialize)]
pub struct TenantCreateResponseShard {
@@ -57,6 +60,31 @@ pub struct TenantLocateResponse {
pub shard_params: ShardParameters,
}
#[derive(Serialize, Deserialize)]
pub struct TenantDescribeResponse {
pub shards: Vec<TenantDescribeResponseShard>,
pub stripe_size: ShardStripeSize,
pub policy: PlacementPolicy,
pub config: TenantConfig,
}
#[derive(Serialize, Deserialize)]
pub struct TenantDescribeResponseShard {
pub tenant_shard_id: TenantShardId,
pub node_attached: Option<NodeId>,
pub node_secondary: Vec<NodeId>,
pub last_error: String,
/// A task is currently running to reconcile this tenant's intent state with the state on pageservers
pub is_reconciling: bool,
/// This shard failed in sending a compute notification to the cloud control plane, and a retry is pending.
pub is_pending_compute_notification: bool,
/// A shard split is currently underway
pub is_splitting: bool,
}
/// Explicitly migrating a particular shard is a low level operation
/// TODO: higher level "Reschedule tenant" operation where the request
/// specifies some constraints, e.g. asking it to get off particular node(s)
@@ -181,11 +209,8 @@ impl From<NodeSchedulingPolicy> for String {
/// to create secondary locations.
#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
pub enum PlacementPolicy {
/// Cheapest way to attach a tenant: just one pageserver, no secondary
Single,
/// Production-ready way to attach a tenant: one attached pageserver and
/// some number of secondaries.
Double(usize),
/// Normal live state: one attached pageserver and zero or more secondaries.
Attached(usize),
/// Create one secondary mode locations. This is useful when onboarding
/// a tenant, or for an idle tenant that we might want to bring online quickly.
Secondary,
@@ -207,14 +232,14 @@ mod test {
/// Check stability of PlacementPolicy's serialization
#[test]
fn placement_policy_encoding() -> anyhow::Result<()> {
let v = PlacementPolicy::Double(1);
let v = PlacementPolicy::Attached(1);
let encoded = serde_json::to_string(&v)?;
assert_eq!(encoded, "{\"Double\":1}");
assert_eq!(encoded, "{\"Attached\":1}");
assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
let v = PlacementPolicy::Single;
let v = PlacementPolicy::Detached;
let encoded = serde_json::to_string(&v)?;
assert_eq!(encoded, "\"Single\"");
assert_eq!(encoded, "\"Detached\"");
assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
Ok(())
}

View File

@@ -6,7 +6,9 @@
use serde::{Deserialize, Serialize};
use utils::id::NodeId;
use crate::{controller_api::NodeRegisterRequest, shard::TenantShardId};
use crate::{
controller_api::NodeRegisterRequest, models::LocationConfigMode, shard::TenantShardId,
};
/// Upcall message sent by the pageserver to the configured `control_plane_api` on
/// startup.
@@ -20,12 +22,20 @@ pub struct ReAttachRequest {
pub register: Option<NodeRegisterRequest>,
}
#[derive(Serialize, Deserialize)]
pub struct ReAttachResponseTenant {
pub id: TenantShardId,
pub gen: u32,
fn default_mode() -> LocationConfigMode {
LocationConfigMode::AttachedSingle
}
#[derive(Serialize, Deserialize, Debug)]
pub struct ReAttachResponseTenant {
pub id: TenantShardId,
/// Mandatory if LocationConfigMode is None or set to an Attached* mode
pub gen: Option<u32>,
/// Default value only for backward compat: this field should be set
#[serde(default = "default_mode")]
pub mode: LocationConfigMode,
}
#[derive(Serialize, Deserialize)]
pub struct ReAttachResponse {
pub tenants: Vec<ReAttachResponseTenant>,

View File

@@ -198,6 +198,7 @@ impl LocalFs {
fs::OpenOptions::new()
.write(true)
.create(true)
.truncate(true)
.open(&temp_file_path)
.await
.with_context(|| {

View File

@@ -13,6 +13,7 @@ testing = ["fail/failpoints"]
[dependencies]
arc-swap.workspace = true
sentry.workspace = true
async-compression.workspace = true
async-trait.workspace = true
anyhow.workspace = true
bincode.workspace = true
@@ -36,6 +37,7 @@ serde_json.workspace = true
signal-hook.workspace = true
thiserror.workspace = true
tokio.workspace = true
tokio-tar.workspace = true
tokio-util.workspace = true
tracing.workspace = true
tracing-error.workspace = true
@@ -46,6 +48,7 @@ strum.workspace = true
strum_macros.workspace = true
url.workspace = true
uuid.workspace = true
walkdir.workspace = true
pq_proto.workspace = true
postgres_connection.workspace = true

View File

@@ -245,7 +245,7 @@ impl std::io::Write for ChannelWriter {
}
}
async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
SERVE_METRICS_COUNT.inc();
let started_at = std::time::Instant::now();
@@ -367,7 +367,6 @@ pub fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
.middleware(Middleware::post_with_info(
add_request_id_header_to_response,
))
.get("/metrics", |r| request_span(r, prometheus_metrics_handler))
.err_handler(route_error_handler)
}

View File

@@ -87,6 +87,8 @@ pub mod failpoint_support;
pub mod yielding_loop;
pub mod zstd;
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
///
/// we have several cases:

View File

@@ -63,6 +63,7 @@ impl UnwrittenLockFile {
pub fn create_exclusive(lock_file_path: &Utf8Path) -> anyhow::Result<UnwrittenLockFile> {
let lock_file = fs::OpenOptions::new()
.create(true) // O_CREAT
.truncate(true)
.write(true)
.open(lock_file_path)
.context("open lock file")?;

View File

@@ -29,12 +29,10 @@ pub struct PageserverFeedback {
// Serialize with RFC3339 format.
#[serde(with = "serde_systemtime")]
pub replytime: SystemTime,
/// Used to track feedbacks from different shards. Always zero for unsharded tenants.
pub shard_number: u32,
}
// NOTE: Do not forget to increment this number when adding new fields to PageserverFeedback.
// Do not remove previously available fields because this might be backwards incompatible.
pub const PAGESERVER_FEEDBACK_FIELDS_NUMBER: u8 = 5;
impl PageserverFeedback {
pub fn empty() -> PageserverFeedback {
PageserverFeedback {
@@ -43,6 +41,7 @@ impl PageserverFeedback {
remote_consistent_lsn: Lsn::INVALID,
disk_consistent_lsn: Lsn::INVALID,
replytime: *PG_EPOCH,
shard_number: 0,
}
}
@@ -59,17 +58,26 @@ impl PageserverFeedback {
//
// TODO: change serialized fields names once all computes migrate to rename.
pub fn serialize(&self, buf: &mut BytesMut) {
buf.put_u8(PAGESERVER_FEEDBACK_FIELDS_NUMBER); // # of keys
let buf_ptr = buf.len();
buf.put_u8(0); // # of keys, will be filled later
let mut nkeys = 0;
nkeys += 1;
buf.put_slice(b"current_timeline_size\0");
buf.put_i32(8);
buf.put_u64(self.current_timeline_size);
nkeys += 1;
buf.put_slice(b"ps_writelsn\0");
buf.put_i32(8);
buf.put_u64(self.last_received_lsn.0);
nkeys += 1;
buf.put_slice(b"ps_flushlsn\0");
buf.put_i32(8);
buf.put_u64(self.disk_consistent_lsn.0);
nkeys += 1;
buf.put_slice(b"ps_applylsn\0");
buf.put_i32(8);
buf.put_u64(self.remote_consistent_lsn.0);
@@ -80,9 +88,19 @@ impl PageserverFeedback {
.expect("failed to serialize pg_replytime earlier than PG_EPOCH")
.as_micros() as i64;
nkeys += 1;
buf.put_slice(b"ps_replytime\0");
buf.put_i32(8);
buf.put_i64(timestamp);
if self.shard_number > 0 {
nkeys += 1;
buf.put_slice(b"shard_number\0");
buf.put_i32(4);
buf.put_u32(self.shard_number);
}
buf[buf_ptr] = nkeys;
}
// Deserialize PageserverFeedback message
@@ -125,9 +143,8 @@ impl PageserverFeedback {
}
b"shard_number" => {
let len = buf.get_i32();
// TODO: this will be implemented in the next update,
// for now, we just skip the value.
buf.advance(len as usize);
assert_eq!(len, 4);
rf.shard_number = buf.get_u32();
}
_ => {
let len = buf.get_i32();
@@ -200,10 +217,7 @@ mod tests {
rf.serialize(&mut data);
// Add an extra field to the buffer and adjust number of keys
if let Some(first) = data.first_mut() {
*first = PAGESERVER_FEEDBACK_FIELDS_NUMBER + 1;
}
data[0] += 1;
data.put_slice(b"new_field_one\0");
data.put_i32(8);
data.put_u64(42);

View File

@@ -245,7 +245,7 @@ impl<'a, T> Guard<'a, T> {
///
/// The permit will be on a semaphore part of the new internal value, and any following
/// [`OnceCell::get_or_init`] will wait on it to complete.
pub fn take_and_deinit(&mut self) -> (T, InitPermit) {
pub fn take_and_deinit(mut self) -> (T, InitPermit) {
let mut swapped = Inner::default();
let sem = swapped.init_semaphore.clone();
// acquire and forget right away, moving the control over to InitPermit
@@ -543,7 +543,7 @@ mod tests {
target.set(42, permit);
let (_answer, permit) = {
let mut guard = target
let guard = target
.get_or_init(|permit| async { Ok::<_, Infallible>((11, permit)) })
.await
.unwrap();

View File

@@ -1,27 +1,60 @@
use std::{alloc::Layout, cmp::Ordering, ops::RangeBounds};
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum VecMapOrdering {
Greater,
GreaterOrEqual,
}
/// Ordered map datastructure implemented in a Vec.
/// Append only - can only add keys that are larger than the
/// current max key.
/// Ordering can be adjusted using [`VecMapOrdering`]
/// during `VecMap` construction.
#[derive(Clone, Debug)]
pub struct VecMap<K, V>(Vec<(K, V)>);
pub struct VecMap<K, V> {
data: Vec<(K, V)>,
ordering: VecMapOrdering,
}
impl<K, V> Default for VecMap<K, V> {
fn default() -> Self {
VecMap(Default::default())
VecMap {
data: Default::default(),
ordering: VecMapOrdering::Greater,
}
}
}
#[derive(Debug)]
pub struct InvalidKey;
#[derive(thiserror::Error, Debug)]
pub enum VecMapError {
#[error("Key violates ordering constraint")]
InvalidKey,
#[error("Mismatched ordering constraints")]
ExtendOrderingError,
}
impl<K: Ord, V> VecMap<K, V> {
pub fn new(ordering: VecMapOrdering) -> Self {
Self {
data: Vec::new(),
ordering,
}
}
pub fn with_capacity(capacity: usize, ordering: VecMapOrdering) -> Self {
Self {
data: Vec::with_capacity(capacity),
ordering,
}
}
pub fn is_empty(&self) -> bool {
self.0.is_empty()
self.data.is_empty()
}
pub fn as_slice(&self) -> &[(K, V)] {
self.0.as_slice()
self.data.as_slice()
}
/// This function may panic if given a range where the lower bound is
@@ -29,7 +62,7 @@ impl<K: Ord, V> VecMap<K, V> {
pub fn slice_range<R: RangeBounds<K>>(&self, range: R) -> &[(K, V)] {
use std::ops::Bound::*;
let binary_search = |k: &K| self.0.binary_search_by_key(&k, extract_key);
let binary_search = |k: &K| self.data.binary_search_by_key(&k, extract_key);
let start_idx = match range.start_bound() {
Unbounded => 0,
@@ -41,7 +74,7 @@ impl<K: Ord, V> VecMap<K, V> {
};
let end_idx = match range.end_bound() {
Unbounded => self.0.len(),
Unbounded => self.data.len(),
Included(k) => match binary_search(k) {
Ok(idx) => idx + 1,
Err(idx) => idx,
@@ -49,34 +82,30 @@ impl<K: Ord, V> VecMap<K, V> {
Excluded(k) => binary_search(k).unwrap_or_else(std::convert::identity),
};
&self.0[start_idx..end_idx]
&self.data[start_idx..end_idx]
}
/// Add a key value pair to the map.
/// If `key` is less than or equal to the current maximum key
/// the pair will not be added and InvalidKey error will be returned.
pub fn append(&mut self, key: K, value: V) -> Result<usize, InvalidKey> {
if let Some((last_key, _last_value)) = self.0.last() {
if &key <= last_key {
return Err(InvalidKey);
}
}
/// If `key` is not respective of the `self` ordering the
/// pair will not be added and `InvalidKey` error will be returned.
pub fn append(&mut self, key: K, value: V) -> Result<usize, VecMapError> {
self.validate_key_order(&key)?;
let delta_size = self.instrument_vec_op(|vec| vec.push((key, value)));
Ok(delta_size)
}
/// Update the maximum key value pair or add a new key value pair to the map.
/// If `key` is less than the current maximum key no updates or additions
/// will occur and InvalidKey error will be returned.
/// If `key` is not respective of the `self` ordering no updates or additions
/// will occur and `InvalidKey` error will be returned.
pub fn append_or_update_last(
&mut self,
key: K,
mut value: V,
) -> Result<(Option<V>, usize), InvalidKey> {
if let Some((last_key, last_value)) = self.0.last_mut() {
) -> Result<(Option<V>, usize), VecMapError> {
if let Some((last_key, last_value)) = self.data.last_mut() {
match key.cmp(last_key) {
Ordering::Less => return Err(InvalidKey),
Ordering::Less => return Err(VecMapError::InvalidKey),
Ordering::Equal => {
std::mem::swap(last_value, &mut value);
const DELTA_SIZE: usize = 0;
@@ -100,40 +129,67 @@ impl<K: Ord, V> VecMap<K, V> {
V: Clone,
{
let split_idx = self
.0
.data
.binary_search_by_key(&cutoff, extract_key)
.unwrap_or_else(std::convert::identity);
(
VecMap(self.0[..split_idx].to_vec()),
VecMap(self.0[split_idx..].to_vec()),
VecMap {
data: self.data[..split_idx].to_vec(),
ordering: self.ordering,
},
VecMap {
data: self.data[split_idx..].to_vec(),
ordering: self.ordering,
},
)
}
/// Move items from `other` to the end of `self`, leaving `other` empty.
/// If any keys in `other` is less than or equal to any key in `self`,
/// `InvalidKey` error will be returned and no mutation will occur.
pub fn extend(&mut self, other: &mut Self) -> Result<usize, InvalidKey> {
let self_last_opt = self.0.last().map(extract_key);
let other_first_opt = other.0.last().map(extract_key);
/// If the `other` ordering is different from `self` ordering
/// `ExtendOrderingError` error will be returned.
/// If any keys in `other` is not respective of the ordering defined in
/// `self`, `InvalidKey` error will be returned and no mutation will occur.
pub fn extend(&mut self, other: &mut Self) -> Result<usize, VecMapError> {
if self.ordering != other.ordering {
return Err(VecMapError::ExtendOrderingError);
}
if let (Some(self_last), Some(other_first)) = (self_last_opt, other_first_opt) {
if self_last >= other_first {
return Err(InvalidKey);
let other_first_opt = other.data.last().map(extract_key);
if let Some(other_first) = other_first_opt {
self.validate_key_order(other_first)?;
}
let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.data));
Ok(delta_size)
}
/// Validate the current last key in `self` and key being
/// inserted against the order defined in `self`.
fn validate_key_order(&self, key: &K) -> Result<(), VecMapError> {
if let Some(last_key) = self.data.last().map(extract_key) {
match (&self.ordering, &key.cmp(last_key)) {
(VecMapOrdering::Greater, Ordering::Less | Ordering::Equal) => {
return Err(VecMapError::InvalidKey);
}
(VecMapOrdering::Greater, Ordering::Greater) => {}
(VecMapOrdering::GreaterOrEqual, Ordering::Less) => {
return Err(VecMapError::InvalidKey);
}
(VecMapOrdering::GreaterOrEqual, Ordering::Equal | Ordering::Greater) => {}
}
}
let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.0));
Ok(delta_size)
Ok(())
}
/// Instrument an operation on the underlying [`Vec`].
/// Will panic if the operation decreases capacity.
/// Returns the increase in memory usage caused by the op.
fn instrument_vec_op(&mut self, op: impl FnOnce(&mut Vec<(K, V)>)) -> usize {
let old_cap = self.0.capacity();
op(&mut self.0);
let new_cap = self.0.capacity();
let old_cap = self.data.capacity();
op(&mut self.data);
let new_cap = self.data.capacity();
match old_cap.cmp(&new_cap) {
Ordering::Less => {
@@ -145,6 +201,36 @@ impl<K: Ord, V> VecMap<K, V> {
Ordering::Greater => panic!("VecMap capacity shouldn't ever decrease"),
}
}
/// Similar to `from_iter` defined in `FromIter` trait except
/// that it accepts an [`VecMapOrdering`]
pub fn from_iter<I: IntoIterator<Item = (K, V)>>(iter: I, ordering: VecMapOrdering) -> Self {
let iter = iter.into_iter();
let initial_capacity = {
match iter.size_hint() {
(lower_bound, None) => lower_bound,
(_, Some(upper_bound)) => upper_bound,
}
};
let mut vec_map = VecMap::with_capacity(initial_capacity, ordering);
for (key, value) in iter {
vec_map
.append(key, value)
.expect("The passed collection needs to be sorted!");
}
vec_map
}
}
impl<K: Ord, V> IntoIterator for VecMap<K, V> {
type Item = (K, V);
type IntoIter = std::vec::IntoIter<(K, V)>;
fn into_iter(self) -> Self::IntoIter {
self.data.into_iter()
}
}
fn extract_key<K, V>(entry: &(K, V)) -> &K {
@@ -155,7 +241,7 @@ fn extract_key<K, V>(entry: &(K, V)) -> &K {
mod tests {
use std::{collections::BTreeMap, ops::Bound};
use super::VecMap;
use super::{VecMap, VecMapOrdering};
#[test]
fn unbounded_range() {
@@ -310,5 +396,59 @@ mod tests {
left.extend(&mut one_map).unwrap_err();
assert_eq!(left.as_slice(), &[(0, ()), (1, ())]);
assert_eq!(one_map.as_slice(), &[(1, ())]);
let mut map_greater_or_equal = VecMap::new(VecMapOrdering::GreaterOrEqual);
map_greater_or_equal.append(2, ()).unwrap();
map_greater_or_equal.append(2, ()).unwrap();
left.extend(&mut map_greater_or_equal).unwrap_err();
assert_eq!(left.as_slice(), &[(0, ()), (1, ())]);
assert_eq!(map_greater_or_equal.as_slice(), &[(2, ()), (2, ())]);
}
#[test]
fn extend_with_ordering() {
let mut left = VecMap::new(VecMapOrdering::GreaterOrEqual);
left.append(0, ()).unwrap();
assert_eq!(left.as_slice(), &[(0, ())]);
let mut greater_right = VecMap::new(VecMapOrdering::Greater);
greater_right.append(0, ()).unwrap();
left.extend(&mut greater_right).unwrap_err();
assert_eq!(left.as_slice(), &[(0, ())]);
let mut greater_or_equal_right = VecMap::new(VecMapOrdering::GreaterOrEqual);
greater_or_equal_right.append(2, ()).unwrap();
greater_or_equal_right.append(2, ()).unwrap();
left.extend(&mut greater_or_equal_right).unwrap();
assert_eq!(left.as_slice(), &[(0, ()), (2, ()), (2, ())]);
}
#[test]
fn vec_map_from_sorted() {
let vec = vec![(1, ()), (2, ()), (3, ()), (6, ())];
let vec_map = VecMap::from_iter(vec, VecMapOrdering::Greater);
assert_eq!(vec_map.as_slice(), &[(1, ()), (2, ()), (3, ()), (6, ())]);
let vec = vec![(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())];
let vec_map = VecMap::from_iter(vec, VecMapOrdering::GreaterOrEqual);
assert_eq!(
vec_map.as_slice(),
&[(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())]
);
}
#[test]
#[should_panic]
fn vec_map_from_unsorted_greater() {
let vec = vec![(1, ()), (2, ()), (2, ()), (3, ()), (6, ())];
let _ = VecMap::from_iter(vec, VecMapOrdering::Greater);
}
#[test]
#[should_panic]
fn vec_map_from_unsorted_greater_or_equal() {
let vec = vec![(1, ()), (2, ()), (3, ()), (6, ()), (5, ())];
let _ = VecMap::from_iter(vec, VecMapOrdering::GreaterOrEqual);
}
}

78
libs/utils/src/zstd.rs Normal file
View File

@@ -0,0 +1,78 @@
use std::io::SeekFrom;
use anyhow::{Context, Result};
use async_compression::{
tokio::{bufread::ZstdDecoder, write::ZstdEncoder},
zstd::CParameter,
Level,
};
use camino::Utf8Path;
use nix::NixPath;
use tokio::{
fs::{File, OpenOptions},
io::AsyncBufRead,
io::AsyncSeekExt,
io::AsyncWriteExt,
};
use tokio_tar::{Archive, Builder, HeaderMode};
use walkdir::WalkDir;
/// Creates a Zstandard tarball.
pub async fn create_zst_tarball(path: &Utf8Path, tarball: &Utf8Path) -> Result<(File, u64)> {
let file = OpenOptions::new()
.create(true)
.truncate(true)
.read(true)
.write(true)
.open(&tarball)
.await
.with_context(|| format!("tempfile creation {tarball}"))?;
let mut paths = Vec::new();
for entry in WalkDir::new(path) {
let entry = entry?;
let metadata = entry.metadata().expect("error getting dir entry metadata");
// Also allow directories so that we also get empty directories
if !(metadata.is_file() || metadata.is_dir()) {
continue;
}
let path = entry.into_path();
paths.push(path);
}
// Do a sort to get a more consistent listing
paths.sort_unstable();
let zstd = ZstdEncoder::with_quality_and_params(
file,
Level::Default,
&[CParameter::enable_long_distance_matching(true)],
);
let mut builder = Builder::new(zstd);
// Use reproducible header mode
builder.mode(HeaderMode::Deterministic);
for p in paths {
let rel_path = p.strip_prefix(path)?;
if rel_path.is_empty() {
// The top directory should not be compressed,
// the tar crate doesn't like that
continue;
}
builder.append_path_with_name(&p, rel_path).await?;
}
let mut zstd = builder.into_inner().await?;
zstd.shutdown().await?;
let mut compressed = zstd.into_inner();
let compressed_len = compressed.metadata().await?.len();
compressed.seek(SeekFrom::Start(0)).await?;
Ok((compressed, compressed_len))
}
/// Creates a Zstandard tarball.
pub async fn extract_zst_tarball(
path: &Utf8Path,
tarball: impl AsyncBufRead + Unpin,
) -> Result<()> {
let decoder = Box::pin(ZstdDecoder::new(tarball));
let mut archive = Archive::new(decoder);
archive.unpack(path).await?;
Ok(())
}

View File

@@ -324,11 +324,11 @@ extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) {
}
}
extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer) {
extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, sk: *mut Safekeeper) {
unsafe {
let callback_data = (*(*wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).process_safekeeper_feedback(&mut (*wp))
(*api).process_safekeeper_feedback(&mut (*wp), &mut (*sk));
}
}

View File

@@ -142,7 +142,7 @@ pub trait ApiImpl {
todo!()
}
fn process_safekeeper_feedback(&mut self, _wp: &mut WalProposer) {
fn process_safekeeper_feedback(&mut self, _wp: &mut WalProposer, _sk: &mut Safekeeper) {
todo!()
}

View File

@@ -1,160 +1,156 @@
//! Simple benchmarking around walredo.
//! Quantify a single walredo manager's throughput under N concurrent callers.
//!
//! Right now they hope to just set a baseline. Later we can try to expand into latency and
//! throughput after figuring out the coordinated omission problems below.
//! The benchmark implementation ([`bench_impl`]) is parametrized by
//! - `redo_work` => [`Request::short_request`] or [`Request::medium_request`]
//! - `n_redos` => number of times the benchmark shell execute the `redo_work`
//! - `nclients` => number of clients (more on this shortly).
//!
//! There are two sets of inputs; `short` and `medium`. They were collected on postgres v14 by
//! logging what happens when a sequential scan is requested on a small table, then picking out two
//! suitable from logs.
//! The benchmark impl sets up a multi-threaded tokio runtime with default parameters.
//! It spawns `nclients` times [`client`] tokio tasks.
//! Each task executes the `redo_work` `n_redos/nclients` times.
//!
//! We exercise the following combinations:
//! - `redo_work = short / medium``
//! - `nclients = [1, 2, 4, 8, 16, 32, 64, 128]`
//!
//! Reference data (git blame to see commit) on an i3en.3xlarge
// ```text
//! short/short/1 time: [39.175 µs 39.348 µs 39.536 µs]
//! short/short/2 time: [51.227 µs 51.487 µs 51.755 µs]
//! short/short/4 time: [76.048 µs 76.362 µs 76.674 µs]
//! short/short/8 time: [128.94 µs 129.82 µs 130.74 µs]
//! short/short/16 time: [227.84 µs 229.00 µs 230.28 µs]
//! short/short/32 time: [455.97 µs 457.81 µs 459.90 µs]
//! short/short/64 time: [902.46 µs 904.84 µs 907.32 µs]
//! short/short/128 time: [1.7416 ms 1.7487 ms 1.7561 ms]
//! ``
use std::sync::Arc;
//! We let `criterion` determine the `n_redos` using `iter_custom`.
//! The idea is that for each `(redo_work, nclients)` combination,
//! criterion will run the `bench_impl` multiple times with different `n_redos`.
//! The `bench_impl` reports the aggregate wall clock time from the clients' perspective.
//! Criterion will divide that by `n_redos` to compute the "time per iteration".
//! In our case, "time per iteration" means "time per redo_work execution".
//!
//! NB: the way by which `iter_custom` determines the "number of iterations"
//! is called sampling. Apparently the idea here is to detect outliers.
//! We're not sure whether the current choice of sampling method makes sense.
//! See https://bheisler.github.io/criterion.rs/book/user_guide/command_line_output.html#collecting-samples
//!
//! # Reference Numbers
//!
//! 2024-03-20 on i3en.3xlarge
//!
//! ```text
//! short/1 time: [26.483 µs 26.614 µs 26.767 µs]
//! short/2 time: [32.223 µs 32.465 µs 32.767 µs]
//! short/4 time: [47.203 µs 47.583 µs 47.984 µs]
//! short/8 time: [89.135 µs 89.612 µs 90.139 µs]
//! short/16 time: [190.12 µs 191.52 µs 192.88 µs]
//! short/32 time: [380.96 µs 382.63 µs 384.20 µs]
//! short/64 time: [736.86 µs 741.07 µs 745.03 µs]
//! short/128 time: [1.4106 ms 1.4206 ms 1.4294 ms]
//! medium/1 time: [111.81 µs 112.25 µs 112.79 µs]
//! medium/2 time: [158.26 µs 159.13 µs 160.21 µs]
//! medium/4 time: [334.65 µs 337.14 µs 340.07 µs]
//! medium/8 time: [675.32 µs 679.91 µs 685.25 µs]
//! medium/16 time: [1.2929 ms 1.2996 ms 1.3067 ms]
//! medium/32 time: [2.4295 ms 2.4461 ms 2.4623 ms]
//! medium/64 time: [4.3973 ms 4.4458 ms 4.4875 ms]
//! medium/128 time: [7.5955 ms 7.7847 ms 7.9481 ms]
//! ```
use bytes::{Buf, Bytes};
use pageserver::{
config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager,
use criterion::{BenchmarkId, Criterion};
use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
use pageserver_api::{key::Key, shard::TenantShardId};
use std::{
sync::Arc,
time::{Duration, Instant},
};
use pageserver_api::shard::TenantShardId;
use tokio::task::JoinSet;
use tokio::{sync::Barrier, task::JoinSet};
use utils::{id::TenantId, lsn::Lsn};
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
fn bench(c: &mut Criterion) {
{
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
for nclients in nclients {
let mut group = c.benchmark_group("short");
group.bench_with_input(
BenchmarkId::from_parameter(nclients),
&nclients,
|b, nclients| {
let redo_work = Arc::new(Request::short_input());
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
},
);
}
}
fn redo_scenarios(c: &mut Criterion) {
// logging should be enabled when adding more inputs, since walredo will only report malformed
// input to the stderr.
// utils::logging::init(utils::logging::LogFormat::Plain).unwrap();
{
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
for nclients in nclients {
let mut group = c.benchmark_group("medium");
group.bench_with_input(
BenchmarkId::from_parameter(nclients),
&nclients,
|b, nclients| {
let redo_work = Arc::new(Request::medium_input());
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
},
);
}
}
}
criterion::criterion_group!(benches, bench);
criterion::criterion_main!(benches);
// Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration {
let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
let conf = Box::leak(Box::new(conf));
let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
let manager = PostgresRedoManager::new(conf, tenant_shard_id);
let manager = Arc::new(manager);
{
let rt = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.unwrap();
tracing::info!("executing first");
rt.block_on(short().execute(&manager)).unwrap();
tracing::info!("first executed");
}
let thread_counts = [1, 2, 4, 8, 16, 32, 64, 128];
let mut group = c.benchmark_group("short");
group.sampling_mode(criterion::SamplingMode::Flat);
for thread_count in thread_counts {
group.bench_with_input(
BenchmarkId::new("short", thread_count),
&thread_count,
|b, thread_count| {
add_multithreaded_walredo_requesters(b, *thread_count, &manager, short);
},
);
}
drop(group);
let mut group = c.benchmark_group("medium");
group.sampling_mode(criterion::SamplingMode::Flat);
for thread_count in thread_counts {
group.bench_with_input(
BenchmarkId::new("medium", thread_count),
&thread_count,
|b, thread_count| {
add_multithreaded_walredo_requesters(b, *thread_count, &manager, medium);
},
);
}
drop(group);
}
/// Sets up a multi-threaded tokio runtime with default worker thread count,
/// then, spawn `requesters` tasks that repeatedly:
/// - get input from `input_factor()`
/// - call `manager.request_redo()` with their input
///
/// This stress-tests the scalability of a single walredo manager at high tokio-level concurrency.
///
/// Using tokio's default worker thread count means the results will differ on machines
/// with different core countrs. We don't care about that, the performance will always
/// be different on different hardware. To compare performance of different software versions,
/// use the same hardware.
fn add_multithreaded_walredo_requesters(
b: &mut criterion::Bencher,
nrequesters: usize,
manager: &Arc<PostgresRedoManager>,
input_factory: fn() -> Request,
) {
assert_ne!(nrequesters, 0);
let rt = tokio::runtime::Builder::new_multi_thread()
.enable_all()
.build()
.unwrap();
let barrier = Arc::new(tokio::sync::Barrier::new(nrequesters + 1));
let start = Arc::new(Barrier::new(nclients as usize));
let mut requesters = JoinSet::new();
for _ in 0..nrequesters {
let _entered = rt.enter();
let manager = manager.clone();
let barrier = barrier.clone();
requesters.spawn(async move {
loop {
let input = input_factory();
barrier.wait().await;
let page = input.execute(&manager).await.unwrap();
assert_eq!(page.remaining(), 8192);
barrier.wait().await;
}
let mut tasks = JoinSet::new();
let manager = PostgresRedoManager::new(conf, tenant_shard_id);
let manager = Arc::new(manager);
for _ in 0..nclients {
rt.block_on(async {
tasks.spawn(client(
Arc::clone(&manager),
Arc::clone(&start),
Arc::clone(&redo_work),
// divide the amount of work equally among the clients
n_redos / nclients,
))
});
}
let do_one_iteration = || {
rt.block_on(async {
barrier.wait().await;
// wait for work to complete
barrier.wait().await;
})
};
b.iter_batched(
|| {
// warmup
do_one_iteration();
},
|()| {
// work loop
do_one_iteration();
},
criterion::BatchSize::PerIteration,
);
rt.block_on(requesters.shutdown());
rt.block_on(async move {
let mut total_wallclock_time = std::time::Duration::from_millis(0);
while let Some(res) = tasks.join_next().await {
total_wallclock_time += res.unwrap();
}
total_wallclock_time
})
}
criterion_group!(benches, redo_scenarios);
criterion_main!(benches);
async fn client(
mgr: Arc<PostgresRedoManager>,
start: Arc<Barrier>,
redo_work: Arc<Request>,
n_redos: u64,
) -> Duration {
start.wait().await;
let start = Instant::now();
for _ in 0..n_redos {
let page = redo_work.execute(&mgr).await.unwrap();
assert_eq!(page.remaining(), 8192);
// The real pageserver will rarely if ever do 2 walredos in a row without
// yielding to the executor.
tokio::task::yield_now().await;
}
start.elapsed()
}
macro_rules! lsn {
($input:expr) => {{
@@ -166,12 +162,46 @@ macro_rules! lsn {
}};
}
/// Short payload, 1132 bytes.
// pg_records are copypasted from log, where they are put with Debug impl of Bytes, which uses \0
// for null bytes.
#[allow(clippy::octal_escapes)]
fn short() -> Request {
Request {
/// Simple wrapper around `WalRedoManager::request_redo`.
///
/// In benchmarks this is cloned around.
#[derive(Clone)]
struct Request {
key: Key,
lsn: Lsn,
base_img: Option<(Lsn, Bytes)>,
records: Vec<(Lsn, NeonWalRecord)>,
pg_version: u32,
}
impl Request {
async fn execute(&self, manager: &PostgresRedoManager) -> anyhow::Result<Bytes> {
let Request {
key,
lsn,
base_img,
records,
pg_version,
} = self;
// TODO: avoid these clones
manager
.request_redo(*key, *lsn, base_img.clone(), records.clone(), *pg_version)
.await
}
fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord {
let rec = Bytes::from_static(bytes);
NeonWalRecord::Postgres { will_init, rec }
}
/// Short payload, 1132 bytes.
// pg_records are copypasted from log, where they are put with Debug impl of Bytes, which uses \0
// for null bytes.
#[allow(clippy::octal_escapes)]
pub fn short_input() -> Request {
let pg_record = Self::pg_record;
Request {
key: Key {
field1: 0,
field2: 1663,
@@ -194,13 +224,14 @@ fn short() -> Request {
],
pg_version: 14,
}
}
}
/// Medium sized payload, serializes as 26393 bytes.
// see [`short`]
#[allow(clippy::octal_escapes)]
fn medium() -> Request {
Request {
/// Medium sized payload, serializes as 26393 bytes.
// see [`short`]
#[allow(clippy::octal_escapes)]
pub fn medium_input() -> Request {
let pg_record = Self::pg_record;
Request {
key: Key {
field1: 0,
field2: 1663,
@@ -442,37 +473,5 @@ fn medium() -> Request {
],
pg_version: 14,
}
}
fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord {
let rec = Bytes::from_static(bytes);
NeonWalRecord::Postgres { will_init, rec }
}
/// Simple wrapper around `WalRedoManager::request_redo`.
///
/// In benchmarks this is cloned around.
#[derive(Clone)]
struct Request {
key: Key,
lsn: Lsn,
base_img: Option<(Lsn, Bytes)>,
records: Vec<(Lsn, NeonWalRecord)>,
pg_version: u32,
}
impl Request {
async fn execute(self, manager: &PostgresRedoManager) -> anyhow::Result<Bytes> {
let Request {
key,
lsn,
base_img,
records,
pg_version,
} = self;
manager
.request_redo(key, lsn, base_img, records, pg_version)
.await
}
}

View File

@@ -15,9 +15,9 @@ use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
use pageserver::control_plane_client::ControlPlaneClient;
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
use pageserver::tenant::{secondary, TenantSharedResources};
use remote_storage::GenericRemoteStorage;
use tokio::signal::unix::SignalKind;
use tokio::time::Instant;
use tracing::*;
@@ -28,7 +28,7 @@ use pageserver::{
deletion_queue::DeletionQueue,
http, page_cache, page_service, task_mgr,
task_mgr::TaskKind,
task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME},
task_mgr::THE_RUNTIME,
tenant::mgr,
virtual_file,
};
@@ -323,7 +323,7 @@ fn start_pageserver(
// Launch broker client
// The storage_broker::connect call needs to happen inside a tokio runtime thread.
let broker_client = WALRECEIVER_RUNTIME
let broker_client = THE_RUNTIME
.block_on(async {
// Note: we do not attempt connecting here (but validate endpoints sanity).
storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)
@@ -391,7 +391,7 @@ fn start_pageserver(
conf,
);
if let Some(deletion_workers) = deletion_workers {
deletion_workers.spawn_with(BACKGROUND_RUNTIME.handle());
deletion_workers.spawn_with(THE_RUNTIME.handle());
}
// Up to this point no significant I/O has been done: this should have been fast. Record
@@ -423,7 +423,7 @@ fn start_pageserver(
// Scan the local 'tenants/' directory and start loading the tenants
let deletion_queue_client = deletion_queue.new_client();
let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
let tenant_manager = THE_RUNTIME.block_on(mgr::init_tenant_mgr(
conf,
TenantSharedResources {
broker_client: broker_client.clone(),
@@ -435,7 +435,7 @@ fn start_pageserver(
))?;
let tenant_manager = Arc::new(tenant_manager);
BACKGROUND_RUNTIME.spawn({
THE_RUNTIME.spawn({
let shutdown_pageserver = shutdown_pageserver.clone();
let drive_init = async move {
// NOTE: unlike many futures in pageserver, this one is cancellation-safe
@@ -545,7 +545,7 @@ fn start_pageserver(
// Start up the service to handle HTTP mgmt API request. We created the
// listener earlier already.
{
let _rt_guard = MGMT_REQUEST_RUNTIME.enter();
let _rt_guard = THE_RUNTIME.enter();
let router_state = Arc::new(
http::routes::State::new(
@@ -569,7 +569,6 @@ fn start_pageserver(
.with_graceful_shutdown(task_mgr::shutdown_watcher());
task_mgr::spawn(
MGMT_REQUEST_RUNTIME.handle(),
TaskKind::HttpEndpointListener,
None,
None,
@@ -594,7 +593,6 @@ fn start_pageserver(
let local_disk_storage = conf.workdir.join("last_consumption_metrics.json");
task_mgr::spawn(
crate::BACKGROUND_RUNTIME.handle(),
TaskKind::MetricsCollection,
None,
None,
@@ -615,6 +613,7 @@ fn start_pageserver(
pageserver::consumption_metrics::collect_metrics(
metric_collection_endpoint,
&conf.metric_collection_bucket,
conf.metric_collection_interval,
conf.cached_metric_collection_interval,
conf.synthetic_size_calculation_interval,
@@ -642,7 +641,6 @@ fn start_pageserver(
DownloadBehavior::Error,
);
task_mgr::spawn(
COMPUTE_REQUEST_RUNTIME.handle(),
TaskKind::LibpqEndpointListener,
None,
None,
@@ -666,42 +664,37 @@ fn start_pageserver(
let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
// All started up! Now just sit and wait for shutdown signal.
{
use signal_hook::consts::*;
let signal_handler = BACKGROUND_RUNTIME.spawn_blocking(move || {
let mut signals =
signal_hook::iterator::Signals::new([SIGINT, SIGTERM, SIGQUIT]).unwrap();
return signals
.forever()
.next()
.expect("forever() never returns None unless explicitly closed");
});
let signal = BACKGROUND_RUNTIME
.block_on(signal_handler)
.expect("join error");
match signal {
SIGQUIT => {
info!("Got signal {signal}. Terminating in immediate shutdown mode",);
std::process::exit(111);
}
SIGINT | SIGTERM => {
info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",);
// This cancels the `shutdown_pageserver` cancellation tree.
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
// The plan is to change that over time.
shutdown_pageserver.take();
let bg_remote_storage = remote_storage.clone();
let bg_deletion_queue = deletion_queue.clone();
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
&tenant_manager,
bg_remote_storage.map(|_| bg_deletion_queue),
0,
));
unreachable!()
}
_ => unreachable!(),
}
{
THE_RUNTIME.block_on(async move {
let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap();
let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap();
let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap();
let signal = tokio::select! {
_ = sigquit.recv() => {
info!("Got signal SIGQUIT. Terminating in immediate shutdown mode",);
std::process::exit(111);
}
_ = sigint.recv() => { "SIGINT" },
_ = sigterm.recv() => { "SIGTERM" },
};
info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",);
// This cancels the `shutdown_pageserver` cancellation tree.
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
// The plan is to change that over time.
shutdown_pageserver.take();
let bg_remote_storage = remote_storage.clone();
let bg_deletion_queue = deletion_queue.clone();
pageserver::shutdown_pageserver(
&tenant_manager,
bg_remote_storage.map(|_| bg_deletion_queue),
0,
)
.await;
unreachable!()
})
}
}

View File

@@ -234,6 +234,7 @@ pub struct PageServerConf {
// How often to send unchanged cached metrics to the metrics endpoint.
pub cached_metric_collection_interval: Duration,
pub metric_collection_endpoint: Option<Url>,
pub metric_collection_bucket: Option<RemoteStorageConfig>,
pub synthetic_size_calculation_interval: Duration,
pub disk_usage_based_eviction: Option<DiskUsageEvictionTaskConfig>,
@@ -373,6 +374,7 @@ struct PageServerConfigBuilder {
cached_metric_collection_interval: BuilderValue<Duration>,
metric_collection_endpoint: BuilderValue<Option<Url>>,
synthetic_size_calculation_interval: BuilderValue<Duration>,
metric_collection_bucket: BuilderValue<Option<RemoteStorageConfig>>,
disk_usage_based_eviction: BuilderValue<Option<DiskUsageEvictionTaskConfig>>,
@@ -455,6 +457,8 @@ impl PageServerConfigBuilder {
.expect("cannot parse default synthetic size calculation interval")),
metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT),
metric_collection_bucket: Set(None),
disk_usage_based_eviction: Set(None),
test_remote_failures: Set(0),
@@ -586,6 +590,13 @@ impl PageServerConfigBuilder {
self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint)
}
pub fn metric_collection_bucket(
&mut self,
metric_collection_bucket: Option<RemoteStorageConfig>,
) {
self.metric_collection_bucket = BuilderValue::Set(metric_collection_bucket)
}
pub fn synthetic_size_calculation_interval(
&mut self,
synthetic_size_calculation_interval: Duration,
@@ -694,6 +705,7 @@ impl PageServerConfigBuilder {
metric_collection_interval,
cached_metric_collection_interval,
metric_collection_endpoint,
metric_collection_bucket,
synthetic_size_calculation_interval,
disk_usage_based_eviction,
test_remote_failures,
@@ -942,6 +954,9 @@ impl PageServerConf {
let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?;
builder.metric_collection_endpoint(Some(endpoint));
},
"metric_collection_bucket" => {
builder.metric_collection_bucket(RemoteStorageConfig::from_toml(item)?)
}
"synthetic_size_calculation_interval" =>
builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?),
"test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?),
@@ -1057,6 +1072,7 @@ impl PageServerConf {
metric_collection_interval: Duration::from_secs(60),
cached_metric_collection_interval: Duration::from_secs(60 * 60),
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
metric_collection_bucket: None,
synthetic_size_calculation_interval: Duration::from_secs(60),
disk_usage_based_eviction: None,
test_remote_failures: 0,
@@ -1289,6 +1305,7 @@ background_task_maximum_delay = '334 s'
defaults::DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL
)?,
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
metric_collection_bucket: None,
synthetic_size_calculation_interval: humantime::parse_duration(
defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL
)?,
@@ -1363,6 +1380,7 @@ background_task_maximum_delay = '334 s'
metric_collection_interval: Duration::from_secs(222),
cached_metric_collection_interval: Duration::from_secs(22200),
metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
metric_collection_bucket: None,
synthetic_size_calculation_interval: Duration::from_secs(333),
disk_usage_based_eviction: None,
test_remote_failures: 0,

View File

@@ -1,12 +1,13 @@
//! Periodically collect consumption metrics for all active tenants
//! and push them to a HTTP endpoint.
use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
use crate::task_mgr::{self, TaskKind};
use crate::tenant::tasks::BackgroundLoopKind;
use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError, Tenant};
use camino::Utf8PathBuf;
use consumption_metrics::EventType;
use pageserver_api::models::TenantState;
use remote_storage::{GenericRemoteStorage, RemoteStorageConfig};
use reqwest::Url;
use std::collections::HashMap;
use std::sync::Arc;
@@ -41,6 +42,7 @@ type Cache = HashMap<MetricsKey, (EventType, u64)>;
#[allow(clippy::too_many_arguments)]
pub async fn collect_metrics(
metric_collection_endpoint: &Url,
metric_collection_bucket: &Option<RemoteStorageConfig>,
metric_collection_interval: Duration,
_cached_metric_collection_interval: Duration,
synthetic_size_calculation_interval: Duration,
@@ -59,7 +61,6 @@ pub async fn collect_metrics(
let worker_ctx =
ctx.detached_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download);
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::CalculateSyntheticSize,
None,
None,
@@ -94,6 +95,20 @@ pub async fn collect_metrics(
.build()
.expect("Failed to create http client with timeout");
let bucket_client = if let Some(bucket_config) = metric_collection_bucket {
match GenericRemoteStorage::from_config(bucket_config) {
Ok(client) => Some(client),
Err(e) => {
// Non-fatal error: if we were given an invalid config, we will proceed
// with sending metrics over the network, but not to S3.
tracing::warn!("Invalid configuration for metric_collection_bucket: {e}");
None
}
}
} else {
None
};
let node_id = node_id.to_string();
loop {
@@ -118,10 +133,18 @@ pub async fn collect_metrics(
tracing::error!("failed to persist metrics to {path:?}: {e:#}");
}
}
if let Some(bucket_client) = &bucket_client {
let res =
upload::upload_metrics_bucket(bucket_client, &cancel, &node_id, &metrics).await;
if let Err(e) = res {
tracing::error!("failed to upload to S3: {e:#}");
}
}
};
let upload = async {
let res = upload::upload_metrics(
let res = upload::upload_metrics_http(
&client,
metric_collection_endpoint,
&cancel,
@@ -132,7 +155,7 @@ pub async fn collect_metrics(
.await;
if let Err(e) = res {
// serialization error which should never happen
tracing::error!("failed to upload due to {e:#}");
tracing::error!("failed to upload via HTTP due to {e:#}");
}
};

View File

@@ -1,4 +1,9 @@
use std::time::SystemTime;
use chrono::{DateTime, Utc};
use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE};
use remote_storage::{GenericRemoteStorage, RemotePath};
use tokio::io::AsyncWriteExt;
use tokio_util::sync::CancellationToken;
use tracing::Instrument;
@@ -13,8 +18,9 @@ struct Ids {
pub(super) timeline_id: Option<TimelineId>,
}
/// Serialize and write metrics to an HTTP endpoint
#[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))]
pub(super) async fn upload_metrics(
pub(super) async fn upload_metrics_http(
client: &reqwest::Client,
metric_collection_endpoint: &reqwest::Url,
cancel: &CancellationToken,
@@ -74,6 +80,60 @@ pub(super) async fn upload_metrics(
Ok(())
}
/// Serialize and write metrics to a remote storage object
#[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))]
pub(super) async fn upload_metrics_bucket(
client: &GenericRemoteStorage,
cancel: &CancellationToken,
node_id: &str,
metrics: &[RawMetric],
) -> anyhow::Result<()> {
if metrics.is_empty() {
// Skip uploads if we have no metrics, so that readers don't have to handle the edge case
// of an empty object.
return Ok(());
}
// Compose object path
let datetime: DateTime<Utc> = SystemTime::now().into();
let ts_prefix = datetime.format("year=%Y/month=%m/day=%d/%H:%M:%SZ");
let path = RemotePath::from_string(&format!("{ts_prefix}_{node_id}.ndjson.gz"))?;
// Set up a gzip writer into a buffer
let mut compressed_bytes: Vec<u8> = Vec::new();
let compressed_writer = std::io::Cursor::new(&mut compressed_bytes);
let mut gzip_writer = async_compression::tokio::write::GzipEncoder::new(compressed_writer);
// Serialize and write into compressed buffer
let started_at = std::time::Instant::now();
for res in serialize_in_chunks(CHUNK_SIZE, metrics, node_id) {
let (_chunk, body) = res?;
gzip_writer.write_all(&body).await?;
}
gzip_writer.flush().await?;
gzip_writer.shutdown().await?;
let compressed_length = compressed_bytes.len();
// Write to remote storage
client
.upload_storage_object(
futures::stream::once(futures::future::ready(Ok(compressed_bytes.into()))),
compressed_length,
&path,
cancel,
)
.await?;
let elapsed = started_at.elapsed();
tracing::info!(
compressed_length,
elapsed_ms = elapsed.as_millis(),
"write metrics bucket at {path}",
);
Ok(())
}
// The return type is quite ugly, but we gain testability in isolation
fn serialize_in_chunks<'a, F>(
chunk_size: usize,

View File

@@ -5,7 +5,8 @@ use pageserver_api::{
controller_api::NodeRegisterRequest,
shard::TenantShardId,
upcall_api::{
ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest,
ValidateRequestTenant, ValidateResponse,
},
};
use serde::{de::DeserializeOwned, Serialize};
@@ -37,7 +38,9 @@ pub trait ControlPlaneGenerationsApi {
fn re_attach(
&self,
conf: &PageServerConf,
) -> impl Future<Output = Result<HashMap<TenantShardId, Generation>, RetryForeverError>> + Send;
) -> impl Future<
Output = Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError>,
> + Send;
fn validate(
&self,
tenants: Vec<(TenantShardId, Generation)>,
@@ -118,7 +121,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
async fn re_attach(
&self,
conf: &PageServerConf,
) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
) -> Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError> {
let re_attach_path = self
.base_url
.join("re-attach")
@@ -170,8 +173,6 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
register,
};
fail::fail_point!("control-plane-client-re-attach");
let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?;
tracing::info!(
"Received re-attach response with {} tenants",
@@ -181,7 +182,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
Ok(response
.tenants
.into_iter()
.map(|t| (t.id, Generation::new(t.gen)))
.map(|rart| (rart.id, rart))
.collect::<HashMap<_, _>>())
}
@@ -207,7 +208,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
.collect(),
};
fail::fail_point!("control-plane-client-validate");
crate::tenant::pausable_failpoint!("control-plane-client-validate");
let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;

View File

@@ -724,8 +724,8 @@ impl DeletionQueue {
mod test {
use camino::Utf8Path;
use hex_literal::hex;
use pageserver_api::shard::ShardIndex;
use std::io::ErrorKind;
use pageserver_api::{shard::ShardIndex, upcall_api::ReAttachResponseTenant};
use std::{io::ErrorKind, time::Duration};
use tracing::info;
use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
@@ -834,9 +834,10 @@ mod test {
async fn re_attach(
&self,
_conf: &PageServerConf,
) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
) -> Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError> {
unimplemented!()
}
async fn validate(
&self,
tenants: Vec<(TenantShardId, Generation)>,

View File

@@ -59,7 +59,7 @@ use utils::{completion, id::TimelineId};
use crate::{
config::PageServerConf,
metrics::disk_usage_based_eviction::METRICS,
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
task_mgr::{self, TaskKind},
tenant::{
self,
mgr::TenantManager,
@@ -202,7 +202,6 @@ pub fn launch_disk_usage_global_eviction_task(
info!("launching disk usage based eviction task");
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::DiskUsageEviction,
None,
None,

View File

@@ -36,6 +36,7 @@ use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::auth::JwtAuth;
use utils::failpoint_support::failpoints_handler;
use utils::http::endpoint::prometheus_metrics_handler;
use utils::http::endpoint::request_span;
use utils::http::json::json_request_or_empty_body;
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
@@ -885,14 +886,16 @@ async fn tenant_detach_handler(
let state = get_state(&request);
let conf = state.conf;
mgr::detach_tenant(
conf,
tenant_shard_id,
detach_ignored.unwrap_or(false),
&state.deletion_queue_client,
)
.instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug()))
.await?;
state
.tenant_manager
.detach_tenant(
conf,
tenant_shard_id,
detach_ignored.unwrap_or(false),
&state.deletion_queue_client,
)
.instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug()))
.await?;
json_response(StatusCode::OK, ())
}
@@ -1403,7 +1406,9 @@ async fn update_tenant_config_handler(
TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
let state = get_state(&request);
mgr::set_new_tenant_config(state.conf, tenant_conf, tenant_id)
state
.tenant_manager
.set_new_tenant_config(tenant_conf, tenant_id)
.instrument(info_span!("tenant_config", %tenant_id))
.await?;
@@ -1428,13 +1433,14 @@ async fn put_tenant_location_config_handler(
// The `Detached` state is special, it doesn't upsert a tenant, it removes
// its local disk content and drops it from memory.
if let LocationConfigMode::Detached = request_data.config.mode {
if let Err(e) =
mgr::detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
.instrument(info_span!("tenant_detach",
tenant_id = %tenant_shard_id.tenant_id,
shard_id = %tenant_shard_id.shard_slug()
))
.await
if let Err(e) = state
.tenant_manager
.detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
.instrument(info_span!("tenant_detach",
tenant_id = %tenant_shard_id.tenant_id,
shard_id = %tenant_shard_id.shard_slug()
))
.await
{
match e {
TenantStateError::SlotError(TenantSlotError::NotFound(_)) => {
@@ -1648,8 +1654,7 @@ async fn timeline_gc_handler(
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let wait_task_done =
mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?;
let wait_task_done = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx)?;
let gc_result = wait_task_done
.await
.context("wait for gc task")
@@ -2262,6 +2267,7 @@ pub fn make_router(
Ok(router
.data(state)
.get("/metrics", |r| request_span(r, prometheus_metrics_handler))
.get("/v1/status", |r| api_handler(r, status_handler))
.put("/v1/failpoints", |r| {
testing_api_handler("manage failpoints", r, failpoints_handler)

View File

@@ -2,28 +2,20 @@
//! Import data and WAL from a PostgreSQL data directory and WAL segments into
//! a neon Timeline.
//!
use std::io::SeekFrom;
use std::path::{Path, PathBuf};
use anyhow::{bail, ensure, Context, Result};
use async_compression::tokio::bufread::ZstdDecoder;
use async_compression::{tokio::write::ZstdEncoder, zstd::CParameter, Level};
use bytes::Bytes;
use camino::Utf8Path;
use futures::StreamExt;
use nix::NixPath;
use tokio::fs::{File, OpenOptions};
use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt};
use tokio::io::{AsyncRead, AsyncReadExt};
use tokio_tar::Archive;
use tokio_tar::Builder;
use tokio_tar::HeaderMode;
use tracing::*;
use walkdir::WalkDir;
use crate::context::RequestContext;
use crate::metrics::WAL_INGEST;
use crate::pgdatadir_mapping::*;
use crate::tenant::remote_timeline_client::INITDB_PATH;
use crate::tenant::Timeline;
use crate::walingest::WalIngest;
use crate::walrecord::DecodedWALRecord;
@@ -633,65 +625,3 @@ async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result<Bytes>
reader.read_to_end(&mut buf).await?;
Ok(Bytes::from(buf))
}
pub async fn create_tar_zst(pgdata_path: &Utf8Path, tmp_path: &Utf8Path) -> Result<(File, u64)> {
let file = OpenOptions::new()
.create(true)
.truncate(true)
.read(true)
.write(true)
.open(&tmp_path)
.await
.with_context(|| format!("tempfile creation {tmp_path}"))?;
let mut paths = Vec::new();
for entry in WalkDir::new(pgdata_path) {
let entry = entry?;
let metadata = entry.metadata().expect("error getting dir entry metadata");
// Also allow directories so that we also get empty directories
if !(metadata.is_file() || metadata.is_dir()) {
continue;
}
let path = entry.into_path();
paths.push(path);
}
// Do a sort to get a more consistent listing
paths.sort_unstable();
let zstd = ZstdEncoder::with_quality_and_params(
file,
Level::Default,
&[CParameter::enable_long_distance_matching(true)],
);
let mut builder = Builder::new(zstd);
// Use reproducible header mode
builder.mode(HeaderMode::Deterministic);
for path in paths {
let rel_path = path.strip_prefix(pgdata_path)?;
if rel_path.is_empty() {
// The top directory should not be compressed,
// the tar crate doesn't like that
continue;
}
builder.append_path_with_name(&path, rel_path).await?;
}
let mut zstd = builder.into_inner().await?;
zstd.shutdown().await?;
let mut compressed = zstd.into_inner();
let compressed_len = compressed.metadata().await?.len();
const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024;
if compressed_len > INITDB_TAR_ZST_WARN_LIMIT {
warn!("compressed {INITDB_PATH} size of {compressed_len} is above limit {INITDB_TAR_ZST_WARN_LIMIT}.");
}
compressed.seek(SeekFrom::Start(0)).await?;
Ok((compressed, compressed_len))
}
pub async fn extract_tar_zst(
pgdata_path: &Utf8Path,
tar_zst: impl AsyncBufRead + Unpin,
) -> Result<()> {
let tar = Box::pin(ZstdDecoder::new(tar_zst));
let mut archive = Archive::new(tar);
archive.unpack(pgdata_path).await?;
Ok(())
}

View File

@@ -180,7 +180,6 @@ pub async fn libpq_listener_main(
// only deal with a particular timeline, but we don't know which one
// yet.
task_mgr::spawn(
&tokio::runtime::Handle::current(),
TaskKind::PageRequestHandler,
None,
None,

View File

@@ -34,6 +34,7 @@ use strum::IntoEnumIterator;
use tokio_util::sync::CancellationToken;
use tracing::{debug, trace, warn};
use utils::bin_ser::DeserializeError;
use utils::vec_map::{VecMap, VecMapOrdering};
use utils::{bin_ser::BeSer, lsn::Lsn};
const MAX_AUX_FILE_DELTAS: usize = 1024;
@@ -1546,12 +1547,13 @@ impl<'a> DatadirModification<'a> {
if !self.pending_updates.is_empty() {
// The put_batch call below expects expects the inputs to be sorted by Lsn,
// so we do that first.
let lsn_ordered_batch: Vec<(Key, Lsn, Value)> = self
.pending_updates
.drain()
.map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (key, lsn, val)))
.kmerge_by(|lhs, rhs| lhs.1 .0 < rhs.1 .0)
.collect();
let lsn_ordered_batch: VecMap<Lsn, (Key, Value)> = VecMap::from_iter(
self.pending_updates
.drain()
.map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (lsn, (key, val))))
.kmerge_by(|lhs, rhs| lhs.0 < rhs.0),
VecMapOrdering::GreaterOrEqual,
);
writer.put_batch(lsn_ordered_batch, ctx).await?;
}

View File

@@ -98,42 +98,22 @@ use utils::id::TimelineId;
// other operations, if the upload tasks e.g. get blocked on locks. It shouldn't
// happen, but still.
//
pub static COMPUTE_REQUEST_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("compute request worker")
.enable_all()
.build()
.expect("Failed to create compute request runtime")
});
pub static MGMT_REQUEST_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
/// The single tokio runtime used by all pageserver code.
/// In the past, we had multiple runtimes, and in the future we should weed out
/// remaining references to this global field and rely on ambient runtime instead,
/// i.e., use `tokio::spawn` instead of `THE_RUNTIME.spawn()`, etc.
pub static THE_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("mgmt request worker")
.enable_all()
.build()
.expect("Failed to create mgmt request runtime")
});
pub static WALRECEIVER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("walreceiver worker")
.enable_all()
.build()
.expect("Failed to create walreceiver runtime")
});
pub static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("background op worker")
// if you change the number of worker threads please change the constant below
.enable_all()
.build()
.expect("Failed to create background op runtime")
});
pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy<usize> = Lazy::new(|| {
pub(crate) static THE_RUNTIME_WORKER_THREADS: Lazy<usize> = Lazy::new(|| {
// force init and thus panics
let _ = BACKGROUND_RUNTIME.handle();
let _ = THE_RUNTIME.handle();
// replicates tokio-1.28.1::loom::sys::num_cpus which is not available publicly
// tokio would had already panicked for parsing errors or NotUnicode
//
@@ -325,7 +305,6 @@ struct PageServerTask {
/// Note: if shutdown_process_on_error is set to true failure
/// of the task will lead to shutdown of entire process
pub fn spawn<F>(
runtime: &tokio::runtime::Handle,
kind: TaskKind,
tenant_shard_id: Option<TenantShardId>,
timeline_id: Option<TimelineId>,
@@ -354,7 +333,7 @@ where
let task_name = name.to_string();
let task_cloned = Arc::clone(&task);
let join_handle = runtime.spawn(task_wrapper(
let join_handle = THE_RUNTIME.spawn(task_wrapper(
task_name,
task_id,
task_cloned,

View File

@@ -43,6 +43,8 @@ use utils::sync::gate::Gate;
use utils::sync::gate::GateGuard;
use utils::timeout::timeout_cancellable;
use utils::timeout::TimeoutCancellableError;
use utils::zstd::create_zst_tarball;
use utils::zstd::extract_zst_tarball;
use self::config::AttachedLocationConfig;
use self::config::AttachmentMode;
@@ -142,6 +144,7 @@ macro_rules! pausable_failpoint {
}
};
}
pub(crate) use pausable_failpoint;
pub mod blob_io;
pub mod block_io;
@@ -200,6 +203,13 @@ pub(super) struct AttachedTenantConf {
}
impl AttachedTenantConf {
fn new(tenant_conf: TenantConfOpt, location: AttachedLocationConfig) -> Self {
Self {
tenant_conf,
location,
}
}
fn try_from(location_conf: LocationConf) -> anyhow::Result<Self> {
match &location_conf.mode {
LocationMode::Attached(attach_conf) => Ok(Self {
@@ -652,7 +662,6 @@ impl Tenant {
let tenant_clone = Arc::clone(&tenant);
let ctx = ctx.detached_child(TaskKind::Attach, DownloadBehavior::Warn);
task_mgr::spawn(
&tokio::runtime::Handle::current(),
TaskKind::Attach,
Some(tenant_shard_id),
None,
@@ -676,9 +685,20 @@ impl Tenant {
}
// Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state.
enum BrokenVerbosity {
Error,
Info
}
let make_broken =
|t: &Tenant, err: anyhow::Error| {
error!("attach failed, setting tenant state to Broken: {err:?}");
|t: &Tenant, err: anyhow::Error, verbosity: BrokenVerbosity| {
match verbosity {
BrokenVerbosity::Info => {
info!("attach cancelled, setting tenant state to Broken: {err}");
},
BrokenVerbosity::Error => {
error!("attach failed, setting tenant state to Broken: {err:?}");
}
}
t.state.send_modify(|state| {
// The Stopping case is for when we have passed control on to DeleteTenantFlow:
// if it errors, we will call make_broken when tenant is already in Stopping.
@@ -742,7 +762,7 @@ impl Tenant {
// Make the tenant broken so that set_stopping will not hang waiting for it to leave
// the Attaching state. This is an over-reaction (nothing really broke, the tenant is
// just shutting down), but ensures progress.
make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"));
make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"), BrokenVerbosity::Info);
return Ok(());
},
)
@@ -764,7 +784,7 @@ impl Tenant {
match res {
Ok(p) => Some(p),
Err(e) => {
make_broken(&tenant_clone, anyhow::anyhow!(e));
make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
return Ok(());
}
}
@@ -788,7 +808,7 @@ impl Tenant {
{
Ok(should_resume_deletion) => should_resume_deletion,
Err(err) => {
make_broken(&tenant_clone, anyhow::anyhow!(err));
make_broken(&tenant_clone, anyhow::anyhow!(err), BrokenVerbosity::Error);
return Ok(());
}
}
@@ -818,7 +838,7 @@ impl Tenant {
.await;
if let Err(e) = deleted {
make_broken(&tenant_clone, anyhow::anyhow!(e));
make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
}
return Ok(());
@@ -839,7 +859,7 @@ impl Tenant {
tenant_clone.activate(broker_client, None, &ctx);
}
Err(e) => {
make_broken(&tenant_clone, anyhow::anyhow!(e));
make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
}
}
@@ -3042,8 +3062,13 @@ impl Tenant {
}
}
let (pgdata_zstd, tar_zst_size) =
import_datadir::create_tar_zst(pgdata_path, &temp_path).await?;
let (pgdata_zstd, tar_zst_size) = create_zst_tarball(pgdata_path, &temp_path).await?;
const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024;
if tar_zst_size > INITDB_TAR_ZST_WARN_LIMIT {
warn!(
"compressed {temp_path} size of {tar_zst_size} is above limit {INITDB_TAR_ZST_WARN_LIMIT}."
);
}
pausable_failpoint!("before-initdb-upload");
@@ -3143,7 +3168,7 @@ impl Tenant {
let buf_read =
BufReader::with_capacity(remote_timeline_client::BUFFER_SIZE, initdb_tar_zst);
import_datadir::extract_tar_zst(&pgdata_path, buf_read)
extract_zst_tarball(&pgdata_path, buf_read)
.await
.context("extract initdb tar")?;
} else {

View File

@@ -196,16 +196,17 @@ impl LocationConf {
/// For use when attaching/re-attaching: update the generation stored in this
/// structure. If we were in a secondary state, promote to attached (posession
/// of a fresh generation implies this).
pub(crate) fn attach_in_generation(&mut self, generation: Generation) {
pub(crate) fn attach_in_generation(&mut self, mode: AttachmentMode, generation: Generation) {
match &mut self.mode {
LocationMode::Attached(attach_conf) => {
attach_conf.generation = generation;
attach_conf.attach_mode = mode;
}
LocationMode::Secondary(_) => {
// We are promoted to attached by the control plane's re-attach response
self.mode = LocationMode::Attached(AttachedLocationConfig {
generation,
attach_mode: AttachmentMode::Single,
attach_mode: mode,
})
}
}

View File

@@ -111,6 +111,7 @@ async fn create_local_delete_mark(
let _ = std::fs::OpenOptions::new()
.write(true)
.create(true)
.truncate(true)
.open(&marker_path)
.with_context(|| format!("could not create delete marker file {marker_path:?}"))?;
@@ -481,7 +482,6 @@ impl DeleteTenantFlow {
let tenant_shard_id = tenant.tenant_shard_id;
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::TimelineDeletionWorker,
Some(tenant_shard_id),
None,

View File

@@ -2,13 +2,13 @@
//! page server.
use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
use futures::stream::StreamExt;
use itertools::Itertools;
use pageserver_api::key::Key;
use pageserver_api::models::ShardParameters;
use pageserver_api::models::{LocationConfigMode, ShardParameters};
use pageserver_api::shard::{
ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId,
};
use pageserver_api::upcall_api::ReAttachResponseTenant;
use rand::{distributions::Alphanumeric, Rng};
use std::borrow::Cow;
use std::cmp::Ordering;
@@ -125,6 +125,46 @@ pub(crate) enum ShardSelector {
Page(Key),
}
/// A convenience for use with the re_attach ControlPlaneClient function: rather
/// than the serializable struct, we build this enum that encapsulates
/// the invariant that attached tenants always have generations.
///
/// This represents the subset of a LocationConfig that we receive during re-attach.
pub(crate) enum TenantStartupMode {
Attached((AttachmentMode, Generation)),
Secondary,
}
impl TenantStartupMode {
/// Return the generation & mode that should be used when starting
/// this tenant.
///
/// If this returns None, the re-attach struct is in an invalid state and
/// should be ignored in the response.
fn from_reattach_tenant(rart: ReAttachResponseTenant) -> Option<Self> {
match (rart.mode, rart.gen) {
(LocationConfigMode::Detached, _) => None,
(LocationConfigMode::Secondary, _) => Some(Self::Secondary),
(LocationConfigMode::AttachedMulti, Some(g)) => {
Some(Self::Attached((AttachmentMode::Multi, Generation::new(g))))
}
(LocationConfigMode::AttachedSingle, Some(g)) => {
Some(Self::Attached((AttachmentMode::Single, Generation::new(g))))
}
(LocationConfigMode::AttachedStale, Some(g)) => {
Some(Self::Attached((AttachmentMode::Stale, Generation::new(g))))
}
_ => {
tracing::warn!(
"Received invalid re-attach state for tenant {}: {rart:?}",
rart.id
);
None
}
}
}
}
impl TenantsMap {
/// Convenience function for typical usage, where we want to get a `Tenant` object, for
/// working with attached tenants. If the TenantId is in the map but in Secondary state,
@@ -271,7 +311,7 @@ pub struct TenantManager {
fn emergency_generations(
tenant_confs: &HashMap<TenantShardId, anyhow::Result<LocationConf>>,
) -> HashMap<TenantShardId, Generation> {
) -> HashMap<TenantShardId, TenantStartupMode> {
tenant_confs
.iter()
.filter_map(|(tid, lc)| {
@@ -279,12 +319,15 @@ fn emergency_generations(
Ok(lc) => lc,
Err(_) => return None,
};
let gen = match &lc.mode {
LocationMode::Attached(alc) => Some(alc.generation),
LocationMode::Secondary(_) => None,
};
gen.map(|g| (*tid, g))
Some((
*tid,
match &lc.mode {
LocationMode::Attached(alc) => {
TenantStartupMode::Attached((alc.attach_mode, alc.generation))
}
LocationMode::Secondary(_) => TenantStartupMode::Secondary,
},
))
})
.collect()
}
@@ -294,7 +337,7 @@ async fn init_load_generations(
tenant_confs: &HashMap<TenantShardId, anyhow::Result<LocationConf>>,
resources: &TenantSharedResources,
cancel: &CancellationToken,
) -> anyhow::Result<Option<HashMap<TenantShardId, Generation>>> {
) -> anyhow::Result<Option<HashMap<TenantShardId, TenantStartupMode>>> {
let generations = if conf.control_plane_emergency_mode {
error!(
"Emergency mode! Tenants will be attached unsafely using their last known generation"
@@ -304,7 +347,12 @@ async fn init_load_generations(
info!("Calling control plane API to re-attach tenants");
// If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
match client.re_attach(conf).await {
Ok(tenants) => tenants,
Ok(tenants) => tenants
.into_iter()
.flat_map(|(id, rart)| {
TenantStartupMode::from_reattach_tenant(rart).map(|tsm| (id, tsm))
})
.collect(),
Err(RetryForeverError::ShuttingDown) => {
anyhow::bail!("Shut down while waiting for control plane re-attach response")
}
@@ -322,9 +370,17 @@ async fn init_load_generations(
// Must only do this if remote storage is enabled, otherwise deletion queue
// is not running and channel push will fail.
if resources.remote_storage.is_some() {
resources
.deletion_queue_client
.recover(generations.clone())?;
let attached_tenants = generations
.iter()
.flat_map(|(id, start_mode)| {
match start_mode {
TenantStartupMode::Attached((_mode, generation)) => Some(generation),
TenantStartupMode::Secondary => None,
}
.map(|gen| (*id, *gen))
})
.collect();
resources.deletion_queue_client.recover(attached_tenants)?;
}
Ok(Some(generations))
@@ -490,9 +546,8 @@ pub async fn init_tenant_mgr(
// Scan local filesystem for attached tenants
let tenant_configs = init_load_tenant_configs(conf).await?;
// Determine which tenants are to be attached
let tenant_generations =
init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;
// Determine which tenants are to be secondary or attached, and in which generation
let tenant_modes = init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;
tracing::info!(
"Attaching {} tenants at startup, warming up {} at a time",
@@ -522,97 +577,102 @@ pub async fn init_tenant_mgr(
}
};
let generation = if let Some(generations) = &tenant_generations {
// FIXME: if we were attached, and get demoted to secondary on re-attach, we
// don't have a place to get a config.
// (https://github.com/neondatabase/neon/issues/5377)
const DEFAULT_SECONDARY_CONF: SecondaryLocationConfig =
SecondaryLocationConfig { warm: true };
// Update the location config according to the re-attach response
if let Some(tenant_modes) = &tenant_modes {
// We have a generation map: treat it as the authority for whether
// this tenant is really attached.
if let Some(gen) = generations.get(&tenant_shard_id) {
if let LocationMode::Attached(attached) = &location_conf.mode {
if attached.generation > *gen {
match tenant_modes.get(&tenant_shard_id) {
None => {
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response");
if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}",
);
}
// We deleted local content: move on to next tenant, don't try and spawn this one.
continue;
}
Some(TenantStartupMode::Secondary) => {
if !matches!(location_conf.mode, LocationMode::Secondary(_)) {
location_conf.mode = LocationMode::Secondary(DEFAULT_SECONDARY_CONF);
}
}
Some(TenantStartupMode::Attached((attach_mode, generation))) => {
let old_gen_higher = match &location_conf.mode {
LocationMode::Attached(AttachedLocationConfig {
generation: old_generation,
attach_mode: _attach_mode,
}) => {
if old_generation > generation {
Some(old_generation)
} else {
None
}
}
_ => None,
};
if let Some(old_generation) = old_gen_higher {
tracing::error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Control plane gave decreasing generation ({gen:?}) in re-attach response for tenant that was attached in generation {:?}, demoting to secondary",
attached.generation
"Control plane gave decreasing generation ({generation:?}) in re-attach response for tenant that was attached in generation {:?}, demoting to secondary",
old_generation
);
// We cannot safely attach this tenant given a bogus generation number, but let's avoid throwing away
// local disk content: demote to secondary rather than detaching.
tenants.insert(
tenant_shard_id,
TenantSlot::Secondary(SecondaryTenant::new(
tenant_shard_id,
location_conf.shard,
location_conf.tenant_conf.clone(),
&SecondaryLocationConfig { warm: false },
)),
);
location_conf.mode = LocationMode::Secondary(DEFAULT_SECONDARY_CONF);
} else {
location_conf.attach_in_generation(*attach_mode, *generation);
}
}
*gen
} else {
match &location_conf.mode {
LocationMode::Secondary(secondary_config) => {
// We do not require the control plane's permission for secondary mode
// tenants, because they do no remote writes and hence require no
// generation number
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Loaded tenant in secondary mode");
tenants.insert(
tenant_shard_id,
TenantSlot::Secondary(SecondaryTenant::new(
tenant_shard_id,
location_conf.shard,
location_conf.tenant_conf,
secondary_config,
)),
);
}
LocationMode::Attached(_) => {
// TODO: augment re-attach API to enable the control plane to
// instruct us about secondary attachments. That way, instead of throwing
// away local state, we can gracefully fall back to secondary here, if the control
// plane tells us so.
// (https://github.com/neondatabase/neon/issues/5377)
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response");
if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}",
);
}
}
};
continue;
}
} else {
// Legacy mode: no generation information, any tenant present
// on local disk may activate
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Starting tenant in legacy mode, no generation",);
Generation::none()
};
// Presence of a generation number implies attachment: attach the tenant
// if it wasn't already, and apply the generation number.
location_conf.attach_in_generation(generation);
Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;
let shard_identity = location_conf.shard;
match tenant_spawn(
conf,
tenant_shard_id,
&tenant_dir_path,
resources.clone(),
AttachedTenantConf::try_from(location_conf)?,
shard_identity,
Some(init_order.clone()),
&TENANTS,
SpawnMode::Lazy,
&ctx,
) {
Ok(tenant) => {
tenants.insert(tenant_shard_id, TenantSlot::Attached(tenant));
let slot = match location_conf.mode {
LocationMode::Attached(attached_conf) => {
match tenant_spawn(
conf,
tenant_shard_id,
&tenant_dir_path,
resources.clone(),
AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
shard_identity,
Some(init_order.clone()),
&TENANTS,
SpawnMode::Lazy,
&ctx,
) {
Ok(tenant) => TenantSlot::Attached(tenant),
Err(e) => {
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}");
continue;
}
}
}
Err(e) => {
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}");
}
}
LocationMode::Secondary(secondary_conf) => TenantSlot::Secondary(SecondaryTenant::new(
tenant_shard_id,
shard_identity,
location_conf.tenant_conf,
&secondary_conf,
)),
};
tenants.insert(tenant_shard_id, slot);
}
info!("Processed {} local tenants at startup", tenants.len());
@@ -633,7 +693,7 @@ pub async fn init_tenant_mgr(
/// Wrapper for Tenant::spawn that checks invariants before running, and inserts
/// a broken tenant in the map if Tenant::spawn fails.
#[allow(clippy::too_many_arguments)]
pub(crate) fn tenant_spawn(
fn tenant_spawn(
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
tenant_path: &Utf8Path,
@@ -825,40 +885,6 @@ pub(crate) enum SetNewTenantConfigError {
Other(anyhow::Error),
}
pub(crate) async fn set_new_tenant_config(
conf: &'static PageServerConf,
new_tenant_conf: TenantConfOpt,
tenant_id: TenantId,
) -> Result<(), SetNewTenantConfigError> {
// Legacy API: does not support sharding
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
info!("configuring tenant {tenant_id}");
let tenant = get_tenant(tenant_shard_id, true)?;
if !tenant.tenant_shard_id().shard_count.is_unsharded() {
// Note that we use ShardParameters::default below.
return Err(SetNewTenantConfigError::Other(anyhow::anyhow!(
"This API may only be used on single-sharded tenants, use the /location_config API for sharded tenants"
)));
}
// This is a legacy API that only operates on attached tenants: the preferred
// API to use is the location_config/ endpoint, which lets the caller provide
// the full LocationConf.
let location_conf = LocationConf::attached_single(
new_tenant_conf.clone(),
tenant.generation,
&ShardParameters::default(),
);
Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf)
.await
.map_err(SetNewTenantConfigError::Persist)?;
tenant.set_new_tenant_config(new_tenant_conf);
Ok(())
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum UpsertLocationError {
#[error("Bad config request: {0}")]
@@ -1661,19 +1687,7 @@ impl TenantManager {
let tmp_path = safe_rename_tenant_dir(&local_tenant_directory)
.await
.with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?;
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::MgmtRequest,
None,
None,
"tenant_files_delete",
false,
async move {
fs::remove_dir_all(tmp_path.as_path())
.await
.with_context(|| format!("tenant directory {:?} deletion", tmp_path))
},
);
self.spawn_background_purge(tmp_path);
fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!(
"failpoint"
@@ -1708,9 +1722,9 @@ impl TenantManager {
.layers
.read()
.await
.resident_layers()
.collect::<Vec<_>>()
.await;
.likely_resident_layers()
.collect::<Vec<_>>();
for layer in timeline_layers {
let relative_path = layer
.local_path()
@@ -1827,6 +1841,133 @@ impl TenantManager {
shutdown_all_tenants0(self.tenants).await
}
/// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
/// the background, and thereby avoid blocking any API requests on this deletion completing.
fn spawn_background_purge(&self, tmp_path: Utf8PathBuf) {
// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
// After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
let task_tenant_id = None;
task_mgr::spawn(
TaskKind::MgmtRequest,
task_tenant_id,
None,
"tenant_files_delete",
false,
async move {
fs::remove_dir_all(tmp_path.as_path())
.await
.with_context(|| format!("tenant directory {:?} deletion", tmp_path))
},
);
}
pub(crate) async fn detach_tenant(
&self,
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
detach_ignored: bool,
deletion_queue_client: &DeletionQueueClient,
) -> Result<(), TenantStateError> {
let tmp_path = self
.detach_tenant0(
conf,
&TENANTS,
tenant_shard_id,
detach_ignored,
deletion_queue_client,
)
.await?;
self.spawn_background_purge(tmp_path);
Ok(())
}
async fn detach_tenant0(
&self,
conf: &'static PageServerConf,
tenants: &std::sync::RwLock<TenantsMap>,
tenant_shard_id: TenantShardId,
detach_ignored: bool,
deletion_queue_client: &DeletionQueueClient,
) -> Result<Utf8PathBuf, TenantStateError> {
let tenant_dir_rename_operation = |tenant_id_to_clean: TenantShardId| async move {
let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
safe_rename_tenant_dir(&local_tenant_directory)
.await
.with_context(|| {
format!("local tenant directory {local_tenant_directory:?} rename")
})
};
let removal_result = remove_tenant_from_memory(
tenants,
tenant_shard_id,
tenant_dir_rename_operation(tenant_shard_id),
)
.await;
// Flush pending deletions, so that they have a good chance of passing validation
// before this tenant is potentially re-attached elsewhere.
deletion_queue_client.flush_advisory();
// Ignored tenants are not present in memory and will bail the removal from memory operation.
// Before returning the error, check for ignored tenant removal case — we only need to clean its local files then.
if detach_ignored
&& matches!(
removal_result,
Err(TenantStateError::SlotError(TenantSlotError::NotFound(_)))
)
{
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
if tenant_ignore_mark.exists() {
info!("Detaching an ignored tenant");
let tmp_path = tenant_dir_rename_operation(tenant_shard_id)
.await
.with_context(|| {
format!("Ignored tenant {tenant_shard_id} local directory rename")
})?;
return Ok(tmp_path);
}
}
removal_result
}
pub(crate) async fn set_new_tenant_config(
&self,
new_tenant_conf: TenantConfOpt,
tenant_id: TenantId,
) -> Result<(), SetNewTenantConfigError> {
// Legacy API: does not support sharding
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
info!("configuring tenant {tenant_id}");
let tenant = get_tenant(tenant_shard_id, true)?;
if !tenant.tenant_shard_id().shard_count.is_unsharded() {
// Note that we use ShardParameters::default below.
return Err(SetNewTenantConfigError::Other(anyhow::anyhow!(
"This API may only be used on single-sharded tenants, use the /location_config API for sharded tenants"
)));
}
// This is a legacy API that only operates on attached tenants: the preferred
// API to use is the location_config/ endpoint, which lets the caller provide
// the full LocationConf.
let location_conf = LocationConf::attached_single(
new_tenant_conf.clone(),
tenant.generation,
&ShardParameters::default(),
);
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &location_conf)
.await
.map_err(SetNewTenantConfigError::Persist)?;
tenant.set_new_tenant_config(new_tenant_conf);
Ok(())
}
}
#[derive(Debug, thiserror::Error)]
@@ -2028,87 +2169,6 @@ pub(crate) enum TenantStateError {
Other(#[from] anyhow::Error),
}
pub(crate) async fn detach_tenant(
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
detach_ignored: bool,
deletion_queue_client: &DeletionQueueClient,
) -> Result<(), TenantStateError> {
let tmp_path = detach_tenant0(
conf,
&TENANTS,
tenant_shard_id,
detach_ignored,
deletion_queue_client,
)
.await?;
// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
// After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
let task_tenant_id = None;
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::MgmtRequest,
task_tenant_id,
None,
"tenant_files_delete",
false,
async move {
fs::remove_dir_all(tmp_path.as_path())
.await
.with_context(|| format!("tenant directory {:?} deletion", tmp_path))
},
);
Ok(())
}
async fn detach_tenant0(
conf: &'static PageServerConf,
tenants: &std::sync::RwLock<TenantsMap>,
tenant_shard_id: TenantShardId,
detach_ignored: bool,
deletion_queue_client: &DeletionQueueClient,
) -> Result<Utf8PathBuf, TenantStateError> {
let tenant_dir_rename_operation = |tenant_id_to_clean: TenantShardId| async move {
let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
safe_rename_tenant_dir(&local_tenant_directory)
.await
.with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))
};
let removal_result = remove_tenant_from_memory(
tenants,
tenant_shard_id,
tenant_dir_rename_operation(tenant_shard_id),
)
.await;
// Flush pending deletions, so that they have a good chance of passing validation
// before this tenant is potentially re-attached elsewhere.
deletion_queue_client.flush_advisory();
// Ignored tenants are not present in memory and will bail the removal from memory operation.
// Before returning the error, check for ignored tenant removal case — we only need to clean its local files then.
if detach_ignored
&& matches!(
removal_result,
Err(TenantStateError::SlotError(TenantSlotError::NotFound(_)))
)
{
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
if tenant_ignore_mark.exists() {
info!("Detaching an ignored tenant");
let tmp_path = tenant_dir_rename_operation(tenant_shard_id)
.await
.with_context(|| {
format!("Ignored tenant {tenant_shard_id} local directory rename")
})?;
return Ok(tmp_path);
}
}
removal_result
}
pub(crate) async fn load_tenant(
conf: &'static PageServerConf,
tenant_id: TenantId,
@@ -2142,7 +2202,7 @@ pub(crate) async fn load_tenant(
let mut location_conf =
Tenant::load_tenant_config(conf, &tenant_shard_id).map_err(TenantMapInsertError::Other)?;
location_conf.attach_in_generation(generation);
location_conf.attach_in_generation(AttachmentMode::Single, generation);
Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;
@@ -2729,7 +2789,7 @@ use {
utils::http::error::ApiError,
};
pub(crate) async fn immediate_gc(
pub(crate) fn immediate_gc(
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
gc_req: TimelineGcRequest,
@@ -2751,30 +2811,28 @@ pub(crate) async fn immediate_gc(
// Run in task_mgr to avoid race with tenant_detach operation
let ctx = ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
let span = info_span!("manual_gc", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id);
// TODO: spawning is redundant now, need to hold the gate
task_mgr::spawn(
&tokio::runtime::Handle::current(),
TaskKind::GarbageCollector,
Some(tenant_shard_id),
Some(timeline_id),
&format!("timeline_gc_handler garbage collection run for tenant {tenant_shard_id} timeline {timeline_id}"),
false,
async move {
fail::fail_point!("immediate_gc_task_pre");
#[allow(unused_mut)]
let mut result = tenant
.gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx)
.instrument(info_span!("manual_gc", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id))
.await;
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
// better once the types support it.
#[cfg(feature = "testing")]
{
// we need to synchronize with drop completion for python tests without polling for
// log messages
if let Ok(result) = result.as_mut() {
// why not futures unordered? it seems it needs very much the same task structure
// but would only run on single task.
let mut js = tokio::task::JoinSet::new();
for layer in std::mem::take(&mut result.doomed_layers) {
js.spawn(layer.wait_drop());
@@ -2790,7 +2848,7 @@ pub(crate) async fn immediate_gc(
if let Some(rtc) = rtc {
// layer drops schedule actions on remote timeline client to actually do the
// deletions; don't care just exit fast about the shutdown error
// deletions; don't care about the shutdown error, just exit fast
drop(rtc.wait_completion().await);
}
}
@@ -2801,6 +2859,7 @@ pub(crate) async fn immediate_gc(
}
Ok(())
}
.instrument(span)
);
// drop the guard until after we've spawned the task so that timeline shutdown will wait for the task

View File

@@ -223,7 +223,6 @@ use crate::{
config::PageServerConf,
task_mgr,
task_mgr::TaskKind,
task_mgr::BACKGROUND_RUNTIME,
tenant::metadata::TimelineMetadata,
tenant::upload_queue::{
UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask,
@@ -307,8 +306,6 @@ pub enum PersistIndexPartWithDeletedFlagError {
pub struct RemoteTimelineClient {
conf: &'static PageServerConf,
runtime: tokio::runtime::Handle,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
generation: Generation,
@@ -341,12 +338,6 @@ impl RemoteTimelineClient {
) -> RemoteTimelineClient {
RemoteTimelineClient {
conf,
runtime: if cfg!(test) {
// remote_timeline_client.rs tests rely on current-thread runtime
tokio::runtime::Handle::current()
} else {
BACKGROUND_RUNTIME.handle().clone()
},
tenant_shard_id,
timeline_id,
generation,
@@ -1281,7 +1272,6 @@ impl RemoteTimelineClient {
let tenant_shard_id = self.tenant_shard_id;
let timeline_id = self.timeline_id;
task_mgr::spawn(
&self.runtime,
TaskKind::RemoteUploadTask,
Some(self.tenant_shard_id),
Some(self.timeline_id),
@@ -1876,7 +1866,6 @@ mod tests {
fn build_client(&self, generation: Generation) -> Arc<RemoteTimelineClient> {
Arc::new(RemoteTimelineClient {
conf: self.harness.conf,
runtime: tokio::runtime::Handle::current(),
tenant_shard_id: self.harness.tenant_shard_id,
timeline_id: TIMELINE_ID,
generation,

View File

@@ -8,7 +8,7 @@ use std::{sync::Arc, time::SystemTime};
use crate::{
config::PageServerConf,
disk_usage_eviction_task::DiskUsageEvictionInfo,
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
task_mgr::{self, TaskKind},
virtual_file::MaybeFatalIo,
};
@@ -317,7 +317,6 @@ pub fn spawn_tasks(
tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::SecondaryDownloads,
None,
None,
@@ -338,7 +337,6 @@ pub fn spawn_tasks(
);
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::SecondaryUploads,
None,
None,

View File

@@ -15,6 +15,7 @@ use crate::{
tenant::{
config::SecondaryLocationConfig,
debug_assert_current_span_has_tenant_and_timeline_id,
ephemeral_file::is_ephemeral_file,
remote_timeline_client::{
index::LayerFileMetadata, is_temp_download_file, FAILED_DOWNLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
@@ -534,7 +535,11 @@ impl<'a> TenantDownloader<'a> {
.await
.maybe_fatal_err(&context_msg)?;
tracing::debug!("Wrote local heatmap to {}", heatmap_path);
tracing::debug!(
"Wrote local heatmap to {}, with {} timelines",
heatmap_path,
heatmap.timelines.len()
);
// Clean up any local layers that aren't in the heatmap. We do this first for all timelines, on the general
// principle that deletions should be done before writes wherever possible, and so that we can use this
@@ -547,6 +552,10 @@ impl<'a> TenantDownloader<'a> {
// Download the layers in the heatmap
for timeline in heatmap.timelines {
if self.secondary_state.cancel.is_cancelled() {
tracing::debug!(
"Cancelled before downloading timeline {}",
timeline.timeline_id
);
return Ok(());
}
@@ -764,10 +773,13 @@ impl<'a> TenantDownloader<'a> {
}
};
tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
// Download heatmap layers that are not present on local disk, or update their
// access time if they are already present.
for layer in timeline.layers {
if self.secondary_state.cancel.is_cancelled() {
tracing::debug!("Cancelled -- dropping out of layer loop");
return Ok(());
}
@@ -950,7 +962,10 @@ async fn init_timeline_state(
// Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config");
continue;
} else if crate::is_temporary(&file_path) || is_temp_download_file(&file_path) {
} else if crate::is_temporary(&file_path)
|| is_temp_download_file(&file_path)
|| is_ephemeral_file(file_name)
{
// Temporary files are frequently left behind from restarting during downloads
tracing::info!("Cleaning up temporary file {file_path}");
if let Err(e) = tokio::fs::remove_file(&file_path)

View File

@@ -300,6 +300,7 @@ where
let tenant_shard_id = job.get_tenant_shard_id();
let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
tracing::info!("Command already running, waiting for it");
barrier
} else {
let running = self.spawn_now(job);

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,119 @@
//! failpoints for unit tests, implying `#[cfg(test)]`.
//!
//! These are not accessible over http.
use super::*;
impl Layer {
/// Enable a failpoint from a unit test.
pub(super) fn enable_failpoint(&self, failpoint: Failpoint) {
self.0.failpoints.lock().unwrap().push(failpoint);
}
}
impl LayerInner {
/// Query if this failpoint is enabled, as in, arrive at a failpoint.
///
/// Calls to this method need to be `#[cfg(test)]` guarded.
pub(super) async fn failpoint(&self, kind: FailpointKind) -> Result<(), FailpointHit> {
let fut = {
let mut fps = self.failpoints.lock().unwrap();
// find the *last* failpoint for cases in which we need to use multiple for the same
// thing (two blocked evictions)
let fp = fps.iter_mut().rfind(|x| x.kind() == kind);
let Some(fp) = fp else {
return Ok(());
};
fp.hit()
};
fut.await
}
}
#[derive(Debug, PartialEq, Eq)]
pub(crate) enum FailpointKind {
/// Failpoint acts as an accurate cancelled by drop here; see the only site of use.
AfterDeterminingLayerNeedsNoDownload,
/// Failpoint for stalling eviction starting
WaitBeforeStartingEvicting,
/// Failpoint hit in the spawned task
WaitBeforeDownloading,
}
pub(crate) enum Failpoint {
AfterDeterminingLayerNeedsNoDownload,
WaitBeforeStartingEvicting(
Option<utils::completion::Completion>,
utils::completion::Barrier,
),
WaitBeforeDownloading(
Option<utils::completion::Completion>,
utils::completion::Barrier,
),
}
impl Failpoint {
fn kind(&self) -> FailpointKind {
match self {
Failpoint::AfterDeterminingLayerNeedsNoDownload => {
FailpointKind::AfterDeterminingLayerNeedsNoDownload
}
Failpoint::WaitBeforeStartingEvicting(..) => FailpointKind::WaitBeforeStartingEvicting,
Failpoint::WaitBeforeDownloading(..) => FailpointKind::WaitBeforeDownloading,
}
}
fn hit(&mut self) -> impl std::future::Future<Output = Result<(), FailpointHit>> + 'static {
use futures::future::FutureExt;
// use boxed futures to avoid Either hurdles
match self {
Failpoint::AfterDeterminingLayerNeedsNoDownload => {
let kind = self.kind();
async move { Err(FailpointHit(kind)) }.boxed()
}
Failpoint::WaitBeforeStartingEvicting(arrival, b)
| Failpoint::WaitBeforeDownloading(arrival, b) => {
// first one signals arrival
drop(arrival.take());
let b = b.clone();
async move {
tracing::trace!("waiting on a failpoint barrier");
b.wait().await;
tracing::trace!("done waiting on a failpoint barrier");
Ok(())
}
.boxed()
}
}
}
}
impl std::fmt::Display for FailpointKind {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
std::fmt::Debug::fmt(self, f)
}
}
#[derive(Debug)]
pub(crate) struct FailpointHit(FailpointKind);
impl std::fmt::Display for FailpointHit {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
std::fmt::Debug::fmt(self, f)
}
}
impl std::error::Error for FailpointHit {}
impl From<FailpointHit> for DownloadError {
fn from(value: FailpointHit) -> Self {
DownloadError::Failpoint(value.0)
}
}

View File

@@ -1,14 +1,13 @@
use futures::StreamExt;
use pageserver_api::key::CONTROLFILE_KEY;
use tokio::task::JoinSet;
use tracing::Instrument;
use utils::{
completion::{self, Completion},
id::TimelineId,
};
use super::failpoints::{Failpoint, FailpointKind};
use super::*;
use crate::{context::DownloadBehavior, task_mgr::BACKGROUND_RUNTIME};
use crate::context::DownloadBehavior;
use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness};
/// Used in tests to advance a future to wanted await point, and not futher.
@@ -21,7 +20,7 @@ const FOREVER: std::time::Duration = std::time::Duration::from_secs(ADVANCE.as_s
/// Demonstrate the API and resident -> evicted -> resident -> deleted transitions.
#[tokio::test]
async fn smoke_test() {
let handle = BACKGROUND_RUNTIME.handle();
let handle = tokio::runtime::Handle::current();
let h = TenantHarness::create("smoke_test").unwrap();
let span = h.span();
@@ -38,7 +37,7 @@ async fn smoke_test() {
let layer = {
let mut layers = {
let layers = timeline.layers.read().await;
layers.resident_layers().collect::<Vec<_>>().await
layers.likely_resident_layers().collect::<Vec<_>>()
};
assert_eq!(layers.len(), 1);
@@ -88,7 +87,7 @@ async fn smoke_test() {
//
// ZERO for timeout does not work reliably, so first take up all spawn_blocking slots to
// artificially slow it down.
let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await;
let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(&handle).await;
match layer
.evict_and_wait(std::time::Duration::ZERO)
@@ -99,7 +98,7 @@ async fn smoke_test() {
// expected, but note that the eviction is "still ongoing"
helper.release().await;
// exhaust spawn_blocking pool to ensure it is now complete
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle)
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle)
.await;
}
other => unreachable!("{other:?}"),
@@ -108,7 +107,7 @@ async fn smoke_test() {
// only way to query if a layer is resident is to acquire a ResidentLayer instance.
// Layer::keep_resident never downloads, but it might initialize if the layer file is found
// downloaded locally.
let none = layer.keep_resident().await.unwrap();
let none = layer.keep_resident().await;
assert!(
none.is_none(),
"Expected none, because eviction removed the local file, found: {none:?}"
@@ -167,6 +166,7 @@ async fn smoke_test() {
rtc.wait_completion().await.unwrap();
assert_eq!(rtc.get_remote_physical_size(), 0);
assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
}
/// This test demonstrates a previous hang when a eviction and deletion were requested at the same
@@ -174,7 +174,7 @@ async fn smoke_test() {
#[tokio::test(start_paused = true)]
async fn evict_and_wait_on_wanted_deleted() {
// this is the runtime on which Layer spawns the blocking tasks on
let handle = BACKGROUND_RUNTIME.handle();
let handle = tokio::runtime::Handle::current();
let h = TenantHarness::create("evict_and_wait_on_wanted_deleted").unwrap();
utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
@@ -188,7 +188,7 @@ async fn evict_and_wait_on_wanted_deleted() {
let layer = {
let mut layers = {
let layers = timeline.layers.read().await;
layers.resident_layers().collect::<Vec<_>>().await
layers.likely_resident_layers().collect::<Vec<_>>()
};
assert_eq!(layers.len(), 1);
@@ -213,11 +213,11 @@ async fn evict_and_wait_on_wanted_deleted() {
drop(resident);
// make sure the eviction task gets to run
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await;
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;
let resident = layer.keep_resident().await;
assert!(
matches!(resident, Ok(None)),
resident.is_none(),
"keep_resident should not have re-initialized: {resident:?}"
);
@@ -235,24 +235,332 @@ async fn evict_and_wait_on_wanted_deleted() {
layers.finish_gc_timeline(&[layer]);
}
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await;
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;
assert_eq!(1, LAYER_IMPL_METRICS.started_deletes.get());
assert_eq!(1, LAYER_IMPL_METRICS.completed_deletes.get());
assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get());
assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
}
/// This test shows that ensures we are able to read the layer while the layer eviction has been
/// started but not completed due to spawn_blocking pool being blocked.
///
/// Here `Layer::keep_resident` is used to "simulate" reads, because it cannot download.
#[tokio::test(start_paused = true)]
async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
// this is the runtime on which Layer spawns the blocking tasks on
let handle = BACKGROUND_RUNTIME.handle();
let h = TenantHarness::create("residency_check_while_evict_and_wait_on_clogged_spawn_blocking")
/// This test ensures we are able to read the layer while the layer eviction has been
/// started but not completed.
#[test]
fn read_wins_pending_eviction() {
let rt = tokio::runtime::Builder::new_current_thread()
.max_blocking_threads(1)
.enable_all()
.start_paused(true)
.build()
.unwrap();
rt.block_on(async move {
// this is the runtime on which Layer spawns the blocking tasks on
let handle = tokio::runtime::Handle::current();
let h = TenantHarness::create("read_wins_pending_eviction").unwrap();
let (tenant, ctx) = h.load().await;
let span = h.span();
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
let timeline = tenant
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
.await
.unwrap();
let layer = {
let mut layers = {
let layers = timeline.layers.read().await;
layers.likely_resident_layers().collect::<Vec<_>>()
};
assert_eq!(layers.len(), 1);
layers.swap_remove(0)
};
// setup done
let resident = layer.keep_resident().await.unwrap();
let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER));
// drive the future to await on the status channel
tokio::time::timeout(ADVANCE, &mut evict_and_wait)
.await
.expect_err("should had been a timeout since we are holding the layer resident");
assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
let (completion, barrier) = utils::completion::channel();
let (arrival, arrived_at_barrier) = utils::completion::channel();
layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting(
Some(arrival),
barrier,
));
// now the eviction cannot proceed because the threads are consumed while completion exists
drop(resident);
arrived_at_barrier.wait().await;
assert!(!layer.is_likely_resident());
// because no actual eviction happened, we get to just reinitialize the DownloadedLayer
layer
.0
.get_or_maybe_download(false, None)
.instrument(download_span)
.await
.expect("should had reinitialized without downloading");
assert!(layer.is_likely_resident());
// reinitialization notifies of new resident status, which should error out all evict_and_wait
let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait)
.await
.expect("no timeout, because get_or_maybe_download re-initialized")
.expect_err("eviction should not have succeeded because re-initialized");
// works as intended: evictions lose to "downloads"
assert!(matches!(e, EvictionError::Downloaded), "{e:?}");
assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
// this is not wrong: the eviction is technically still "on the way" as it's still queued
// because of a failpoint
assert_eq!(
0,
LAYER_IMPL_METRICS
.cancelled_evictions
.values()
.map(|ctr| ctr.get())
.sum::<u64>()
);
drop(completion);
tokio::time::sleep(ADVANCE).await;
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0(&handle, 1)
.await;
assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
// now we finally can observe the original eviction failing
// it would had been possible to observe it earlier, but here it is guaranteed to have
// happened.
assert_eq!(
1,
LAYER_IMPL_METRICS
.cancelled_evictions
.values()
.map(|ctr| ctr.get())
.sum::<u64>()
);
assert_eq!(
1,
LAYER_IMPL_METRICS.cancelled_evictions[EvictionCancelled::AlreadyReinitialized].get()
);
assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
});
}
/// Use failpoint to delay an eviction starting to get a VersionCheckFailed.
#[test]
fn multiple_pending_evictions_in_order() {
let name = "multiple_pending_evictions_in_order";
let in_order = true;
multiple_pending_evictions_scenario(name, in_order);
}
/// Use failpoint to reorder later eviction before first to get a UnexpectedEvictedState.
#[test]
fn multiple_pending_evictions_out_of_order() {
let name = "multiple_pending_evictions_out_of_order";
let in_order = false;
multiple_pending_evictions_scenario(name, in_order);
}
fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
let rt = tokio::runtime::Builder::new_current_thread()
.max_blocking_threads(1)
.enable_all()
.start_paused(true)
.build()
.unwrap();
rt.block_on(async move {
// this is the runtime on which Layer spawns the blocking tasks on
let handle = tokio::runtime::Handle::current();
let h = TenantHarness::create(name).unwrap();
let (tenant, ctx) = h.load().await;
let span = h.span();
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
let timeline = tenant
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
.await
.unwrap();
let layer = {
let mut layers = {
let layers = timeline.layers.read().await;
layers.likely_resident_layers().collect::<Vec<_>>()
};
assert_eq!(layers.len(), 1);
layers.swap_remove(0)
};
// setup done
let resident = layer.keep_resident().await.unwrap();
let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER));
// drive the future to await on the status channel
tokio::time::timeout(ADVANCE, &mut evict_and_wait)
.await
.expect_err("should had been a timeout since we are holding the layer resident");
assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
let (completion1, barrier) = utils::completion::channel();
let mut completion1 = Some(completion1);
let (arrival, arrived_at_barrier) = utils::completion::channel();
layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting(
Some(arrival),
barrier,
));
// now the eviction cannot proceed because we are simulating arbitrary long delay for the
// eviction task start.
drop(resident);
assert!(!layer.is_likely_resident());
arrived_at_barrier.wait().await;
// because no actual eviction happened, we get to just reinitialize the DownloadedLayer
layer
.0
.get_or_maybe_download(false, None)
.instrument(download_span)
.await
.expect("should had reinitialized without downloading");
assert!(layer.is_likely_resident());
// reinitialization notifies of new resident status, which should error out all evict_and_wait
let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait)
.await
.expect("no timeout, because get_or_maybe_download re-initialized")
.expect_err("eviction should not have succeeded because re-initialized");
// works as intended: evictions lose to "downloads"
assert!(matches!(e, EvictionError::Downloaded), "{e:?}");
assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
// this is not wrong: the eviction is technically still "on the way" as it's still queued
// because of a failpoint
assert_eq!(
0,
LAYER_IMPL_METRICS
.cancelled_evictions
.values()
.map(|ctr| ctr.get())
.sum::<u64>()
);
assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
// configure another failpoint for the second eviction -- evictions are per initialization,
// so now that we've reinitialized the inner, we get to run two of them at the same time.
let (completion2, barrier) = utils::completion::channel();
let (arrival, arrived_at_barrier) = utils::completion::channel();
layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting(
Some(arrival),
barrier,
));
let mut second_eviction = std::pin::pin!(layer.evict_and_wait(FOREVER));
// advance to the wait on the queue
tokio::time::timeout(ADVANCE, &mut second_eviction)
.await
.expect_err("timeout because failpoint is blocking");
arrived_at_barrier.wait().await;
assert_eq!(2, LAYER_IMPL_METRICS.started_evictions.get());
let mut release_earlier_eviction = |expected_reason| {
assert_eq!(
0,
LAYER_IMPL_METRICS.cancelled_evictions[expected_reason].get(),
);
drop(completion1.take().unwrap());
let handle = &handle;
async move {
tokio::time::sleep(ADVANCE).await;
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0(
handle, 1,
)
.await;
assert_eq!(
1,
LAYER_IMPL_METRICS.cancelled_evictions[expected_reason].get(),
);
}
};
if in_order {
release_earlier_eviction(EvictionCancelled::VersionCheckFailed).await;
}
// release the later eviction which is for the current version
drop(completion2);
tokio::time::sleep(ADVANCE).await;
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0(&handle, 1)
.await;
if !in_order {
release_earlier_eviction(EvictionCancelled::UnexpectedEvictedState).await;
}
tokio::time::timeout(ADVANCE, &mut second_eviction)
.await
.expect("eviction goes through now that spawn_blocking is unclogged")
.expect("eviction should succeed, because version matches");
assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get());
// ensure the cancelled are unchanged
assert_eq!(
1,
LAYER_IMPL_METRICS
.cancelled_evictions
.values()
.map(|ctr| ctr.get())
.sum::<u64>()
);
assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
});
}
/// The test ensures with a failpoint that a pending eviction is not cancelled by what is currently
/// a `Layer::keep_resident` call.
///
/// This matters because cancelling the eviction would leave us in a state where the file is on
/// disk but the layer internal state says it has not been initialized. Futhermore, it allows us to
/// have non-repairing `Layer::is_likely_resident`.
#[tokio::test(start_paused = true)]
async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
let handle = tokio::runtime::Handle::current();
let h =
TenantHarness::create("cancelled_get_or_maybe_download_does_not_cancel_eviction").unwrap();
let (tenant, ctx) = h.load().await;
let timeline = tenant
@@ -263,7 +571,7 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
let layer = {
let mut layers = {
let layers = timeline.layers.read().await;
layers.resident_layers().collect::<Vec<_>>().await
layers.likely_resident_layers().collect::<Vec<_>>()
};
assert_eq!(layers.len(), 1);
@@ -271,90 +579,154 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
layers.swap_remove(0)
};
// setup done
// this failpoint will simulate the `get_or_maybe_download` becoming cancelled (by returning an
// Err) at the right time as in "during" the `LayerInner::needs_download`.
layer.enable_failpoint(Failpoint::AfterDeterminingLayerNeedsNoDownload);
let resident = layer.keep_resident().await.unwrap();
let (completion, barrier) = utils::completion::channel();
let (arrival, arrived_at_barrier) = utils::completion::channel();
layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting(
Some(arrival),
barrier,
));
tokio::time::timeout(ADVANCE, layer.evict_and_wait(FOREVER))
.await
.expect_err("should had advanced to waiting on channel");
arrived_at_barrier.wait().await;
// simulate a cancelled read which is cancelled before it gets to re-initialize
let e = layer
.0
.get_or_maybe_download(false, None)
.await
.unwrap_err();
assert!(
matches!(
e,
DownloadError::Failpoint(FailpointKind::AfterDeterminingLayerNeedsNoDownload)
),
"{e:?}"
);
assert!(
layer.0.needs_download().await.unwrap().is_none(),
"file is still on disk"
);
// release the eviction task
drop(completion);
tokio::time::sleep(ADVANCE).await;
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;
// failpoint is still enabled, but it is not hit
let e = layer
.0
.get_or_maybe_download(false, None)
.await
.unwrap_err();
assert!(matches!(e, DownloadError::DownloadRequired), "{e:?}");
// failpoint is not counted as cancellation either
assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
}
#[tokio::test(start_paused = true)]
async fn evict_and_wait_does_not_wait_for_download() {
// let handle = tokio::runtime::Handle::current();
let h = TenantHarness::create("evict_and_wait_does_not_wait_for_download").unwrap();
let (tenant, ctx) = h.load().await;
let span = h.span();
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
let timeline = tenant
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
.await
.unwrap();
let layer = {
let mut layers = {
let layers = timeline.layers.read().await;
layers.likely_resident_layers().collect::<Vec<_>>()
};
assert_eq!(layers.len(), 1);
layers.swap_remove(0)
};
// kind of forced setup: start an eviction but do not allow it progress until we are
// downloading
let (eviction_can_continue, barrier) = utils::completion::channel();
let (arrival, eviction_arrived) = utils::completion::channel();
layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting(
Some(arrival),
barrier,
));
let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER));
// drive the future to await on the status channel
// use this once-awaited other_evict to synchronize with the eviction
let other_evict = layer.evict_and_wait(FOREVER);
tokio::time::timeout(ADVANCE, &mut evict_and_wait)
.await
.expect_err("should had been a timeout since we are holding the layer resident");
assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
.expect_err("should had advanced");
eviction_arrived.wait().await;
drop(eviction_can_continue);
other_evict.await.unwrap();
// clog up BACKGROUND_RUNTIME spawn_blocking
let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await;
// now the layer is evicted, and the "evict_and_wait" is waiting on the receiver
assert!(!layer.is_likely_resident());
// now the eviction cannot proceed because the threads are consumed while completion exists
drop(resident);
// following new evict_and_wait will fail until we've completed the download
let e = layer.evict_and_wait(FOREVER).await.unwrap_err();
assert!(matches!(e, EvictionError::NotFound), "{e:?}");
// because no actual eviction happened, we get to just reinitialize the DownloadedLayer
layer
.keep_resident()
.await
.expect("keep_resident should had reinitialized without downloading")
.expect("ResidentLayer");
let (download_can_continue, barrier) = utils::completion::channel();
let (arrival, _download_arrived) = utils::completion::channel();
layer.enable_failpoint(Failpoint::WaitBeforeDownloading(Some(arrival), barrier));
// because the keep_resident check alters wanted evicted without sending a message, we will
// never get completed
let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait)
.await
.expect("no timeout, because keep_resident re-initialized")
.expect_err("eviction should not have succeeded because re-initialized");
let mut download = std::pin::pin!(layer
.0
.get_or_maybe_download(true, None)
.instrument(download_span));
// works as intended: evictions lose to "downloads"
assert!(matches!(e, EvictionError::Downloaded), "{e:?}");
assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
// this is not wrong: the eviction is technically still "on the way" as it's still queued
// because spawn_blocking is clogged up
assert_eq!(
0,
LAYER_IMPL_METRICS
.cancelled_evictions
.values()
.map(|ctr| ctr.get())
.sum::<u64>()
assert!(
!layer.is_likely_resident(),
"during download layer is evicted"
);
let mut second_eviction = std::pin::pin!(layer.evict_and_wait(FOREVER));
// advance to the wait on the queue
tokio::time::timeout(ADVANCE, &mut second_eviction)
tokio::time::timeout(ADVANCE, &mut download)
.await
.expect_err("timeout because spawn_blocking is clogged");
.expect_err("should had timed out because of failpoint");
// in this case we don't leak started evictions, but I think there is still a chance of that
// happening, because we could have upgrades race multiple evictions while only one of them
// happens?
assert_eq!(2, LAYER_IMPL_METRICS.started_evictions.get());
// now we finally get to continue, and because the latest state is downloading, we deduce that
// original eviction succeeded
evict_and_wait.await.unwrap();
helper.release().await;
// however a new evict_and_wait will fail
let e = layer.evict_and_wait(FOREVER).await.unwrap_err();
assert!(matches!(e, EvictionError::NotFound), "{e:?}");
// the second_eviction gets to run here
//
// synchronize to be *strictly* after the second_eviction spawn_blocking run
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await;
assert!(!layer.is_likely_resident());
tokio::time::timeout(ADVANCE, &mut second_eviction)
.await
.expect("eviction goes through now that spawn_blocking is unclogged")
.expect("eviction should succeed, because version matches");
drop(download_can_continue);
download.await.expect("download should had succeeded");
assert!(layer.is_likely_resident());
assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get());
// only now can we evict
layer.evict_and_wait(FOREVER).await.unwrap();
}
// now we finally can observe the original spawn_blocking failing
// it would had been possible to observe it earlier, but here it is guaranteed to have
// happened.
assert_eq!(
1,
LAYER_IMPL_METRICS
.cancelled_evictions
.values()
.map(|ctr| ctr.get())
.sum::<u64>()
);
#[test]
fn layer_size() {
assert_eq!(std::mem::size_of::<LayerAccessStats>(), 2040);
assert_eq!(std::mem::size_of::<PersistentLayerDesc>(), 104);
assert_eq!(std::mem::size_of::<LayerInner>(), 2328);
// it also has the utf8 path
}
struct SpawnBlockingPoolHelper {
@@ -371,31 +743,41 @@ impl SpawnBlockingPoolHelper {
///
/// This should be no issue nowdays, because nextest runs each test in it's own process.
async fn consume_all_spawn_blocking_threads(handle: &tokio::runtime::Handle) -> Self {
let (completion, barrier) = completion::channel();
let (tx, mut rx) = tokio::sync::mpsc::channel(8);
let default_max_blocking_threads = 512;
let assumed_max_blocking_threads = 512;
Self::consume_all_spawn_blocking_threads0(handle, default_max_blocking_threads).await
}
async fn consume_all_spawn_blocking_threads0(
handle: &tokio::runtime::Handle,
threads: usize,
) -> Self {
assert_ne!(threads, 0);
let (completion, barrier) = completion::channel();
let (started, starts_completed) = completion::channel();
let mut blocking_tasks = JoinSet::new();
for _ in 0..assumed_max_blocking_threads {
for _ in 0..threads {
let barrier = barrier.clone();
let tx = tx.clone();
let started = started.clone();
blocking_tasks.spawn_blocking_on(
move || {
tx.blocking_send(()).unwrap();
drop(tx);
drop(started);
tokio::runtime::Handle::current().block_on(barrier.wait());
},
handle,
);
}
drop(started);
starts_completed.wait().await;
drop(barrier);
for _ in 0..assumed_max_blocking_threads {
rx.recv().await.unwrap();
}
tracing::trace!("consumed all threads");
SpawnBlockingPoolHelper {
awaited_by_spawn_blocking_tasks: completion,
@@ -415,13 +797,22 @@ impl SpawnBlockingPoolHelper {
while let Some(res) = blocking_tasks.join_next().await {
res.expect("none of the tasks should had panicked");
}
tracing::trace!("released all threads");
}
/// In the tests it is used as an easy way of making sure something scheduled on the target
/// runtimes `spawn_blocking` has completed, because it must've been scheduled and completed
/// before our tasks have a chance to schedule and complete.
async fn consume_and_release_all_of_spawn_blocking_threads(handle: &tokio::runtime::Handle) {
Self::consume_all_spawn_blocking_threads(handle)
Self::consume_and_release_all_of_spawn_blocking_threads0(handle, 512).await
}
async fn consume_and_release_all_of_spawn_blocking_threads0(
handle: &tokio::runtime::Handle,
threads: usize,
) {
Self::consume_all_spawn_blocking_threads0(handle, threads)
.await
.release()
.await
@@ -435,7 +826,7 @@ fn spawn_blocking_pool_helper_actually_works() {
// because the amount is not configurable for our helper, expect the same amount as
// BACKGROUND_RUNTIME using the tokio defaults would have.
let rt = tokio::runtime::Builder::new_current_thread()
.max_blocking_threads(512)
.max_blocking_threads(1)
.enable_all()
.build()
.unwrap();
@@ -445,7 +836,8 @@ fn spawn_blocking_pool_helper_actually_works() {
rt.block_on(async move {
// this will not return until all threads are spun up and actually executing the code
// waiting on `consumed` to be `SpawnBlockingPoolHelper::release`'d.
let consumed = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await;
let consumed =
SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads0(handle, 1).await;
println!("consumed");

View File

@@ -8,7 +8,7 @@ use std::time::{Duration, Instant};
use crate::context::{DownloadBehavior, RequestContext};
use crate::metrics::TENANT_TASK_EVENTS;
use crate::task_mgr;
use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
use crate::task_mgr::TaskKind;
use crate::tenant::throttle::Stats;
use crate::tenant::timeline::CompactionError;
use crate::tenant::{Tenant, TenantState};
@@ -18,7 +18,7 @@ use utils::{backoff, completion};
static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
once_cell::sync::Lazy::new(|| {
let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS;
let total_threads = *crate::task_mgr::THE_RUNTIME_WORKER_THREADS;
let permits = usize::max(
1,
// while a lot of the work is done on spawn_blocking, we still do
@@ -85,7 +85,6 @@ pub fn start_background_loops(
) {
let tenant_shard_id = tenant.tenant_shard_id;
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::Compaction,
Some(tenant_shard_id),
None,
@@ -109,7 +108,6 @@ pub fn start_background_loops(
},
);
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::GarbageCollector,
Some(tenant_shard_id),
None,

View File

@@ -13,7 +13,6 @@ use bytes::Bytes;
use camino::Utf8Path;
use enumset::EnumSet;
use fail::fail_point;
use futures::stream::StreamExt;
use once_cell::sync::Lazy;
use pageserver_api::{
key::AUX_FILES_KEY,
@@ -37,6 +36,7 @@ use tracing::*;
use utils::{
bin_ser::BeSer,
sync::gate::{Gate, GateGuard},
vec_map::VecMap,
};
use std::ops::{Deref, Range};
@@ -1723,7 +1723,6 @@ impl Timeline {
initdb_optimization_count: 0,
};
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
task_mgr::TaskKind::LayerFlushTask,
Some(self.tenant_shard_id),
Some(self.timeline_id),
@@ -2086,7 +2085,6 @@ impl Timeline {
DownloadBehavior::Download,
);
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
task_mgr::TaskKind::InitialLogicalSizeCalculation,
Some(self.tenant_shard_id),
Some(self.timeline_id),
@@ -2264,7 +2262,6 @@ impl Timeline {
DownloadBehavior::Download,
);
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
task_mgr::TaskKind::OndemandLogicalSizeCalculation,
Some(self.tenant_shard_id),
Some(self.timeline_id),
@@ -2442,7 +2439,7 @@ impl Timeline {
let guard = self.layers.read().await;
let resident = guard.resident_layers().map(|layer| {
let resident = guard.likely_resident_layers().map(|layer| {
let last_activity_ts = layer.access_stats().latest_activity_or_now();
HeatMapLayer::new(
@@ -2452,7 +2449,7 @@ impl Timeline {
)
});
let layers = resident.collect().await;
let layers = resident.collect();
Some(HeatMapTimeline::new(self.timeline_id, layers))
}
@@ -3840,7 +3837,7 @@ impl Timeline {
};
let timer = self.metrics.garbage_collect_histo.start_timer();
fail_point!("before-timeline-gc");
pausable_failpoint!("before-timeline-gc");
// Is the timeline being deleted?
if self.is_stopping() {
@@ -4151,7 +4148,6 @@ impl Timeline {
let self_clone = Arc::clone(&self);
let task_id = task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
task_mgr::TaskKind::DownloadAllRemoteLayers,
Some(self.tenant_shard_id),
Some(self.timeline_id),
@@ -4302,7 +4298,7 @@ impl Timeline {
let mut max_layer_size: Option<u64> = None;
let resident_layers = guard
.resident_layers()
.likely_resident_layers()
.map(|layer| {
let file_size = layer.layer_desc().file_size;
max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size)));
@@ -4315,8 +4311,7 @@ impl Timeline {
relative_last_activity: finite_f32::FiniteF32::ZERO,
}
})
.collect()
.await;
.collect();
DiskUsageEvictionInfo {
max_layer_size,
@@ -4618,16 +4613,15 @@ impl<'a> TimelineWriter<'a> {
}
}
/// Put a batch keys at the specified Lsns.
/// Put a batch of keys at the specified Lsns.
///
/// The batch should be sorted by Lsn such that it's safe
/// to roll the open layer mid batch.
/// The batch is sorted by Lsn (enforced by usage of [`utils::vec_map::VecMap`].
pub(crate) async fn put_batch(
&mut self,
batch: Vec<(Key, Lsn, Value)>,
batch: VecMap<Lsn, (Key, Value)>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
for (key, lsn, val) in batch {
for (lsn, (key, val)) in batch {
self.put(key, lsn, &val, ctx).await?
}
@@ -4713,7 +4707,6 @@ mod tests {
.keep_resident()
.await
.expect("no download => no downloading errors")
.expect("should had been resident")
.drop_eviction_guard();
let forever = std::time::Duration::from_secs(120);
@@ -4724,7 +4717,7 @@ mod tests {
let (first, second) = tokio::join!(first, second);
let res = layer.keep_resident().await;
assert!(matches!(res, Ok(None)), "{res:?}");
assert!(res.is_none(), "{res:?}");
match (first, second) {
(Ok(()), Ok(())) => {

View File

@@ -443,7 +443,6 @@ impl DeleteTimelineFlow {
let timeline_id = timeline.timeline_id;
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::TimelineDeletionWorker,
Some(tenant_shard_id),
Some(timeline_id),

View File

@@ -28,7 +28,7 @@ use tracing::{debug, error, info, info_span, instrument, warn, Instrument};
use crate::{
context::{DownloadBehavior, RequestContext},
pgdatadir_mapping::CollectKeySpaceError,
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
task_mgr::{self, TaskKind},
tenant::{
tasks::BackgroundLoopKind, timeline::EvictionError, LogicalSizeCalculationCause, Tenant,
},
@@ -56,7 +56,6 @@ impl Timeline {
let self_clone = Arc::clone(self);
let background_tasks_can_start = background_tasks_can_start.cloned();
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::Eviction,
Some(self.tenant_shard_id),
Some(self.timeline_id),
@@ -225,24 +224,18 @@ impl Timeline {
{
let guard = self.layers.read().await;
let layers = guard.layer_map();
for hist_layer in layers.iter_historic_layers() {
let hist_layer = guard.get_from_desc(&hist_layer);
for layer in layers.iter_historic_layers() {
let layer = guard.get_from_desc(&layer);
// guard against eviction while we inspect it; it might be that eviction_task and
// disk_usage_eviction_task both select the same layers to be evicted, and
// seemingly free up double the space. both succeeding is of no consequence.
let guard = match hist_layer.keep_resident().await {
Ok(Some(l)) => l,
Ok(None) => continue,
Err(e) => {
// these should not happen, but we cannot make them statically impossible right
// now.
tracing::warn!(layer=%hist_layer, "failed to keep the layer resident: {e:#}");
continue;
}
};
let last_activity_ts = hist_layer.access_stats().latest_activity_or_now();
if !layer.is_likely_resident() {
continue;
}
let last_activity_ts = layer.access_stats().latest_activity_or_now();
let no_activity_for = match now.duration_since(last_activity_ts) {
Ok(d) => d,
@@ -265,9 +258,8 @@ impl Timeline {
continue;
}
};
let layer = guard.drop_eviction_guard();
if no_activity_for > p.threshold {
// this could cause a lot of allocations in some cases
js.spawn(async move {
layer
.evict_and_wait(std::time::Duration::from_secs(5))

View File

@@ -1,5 +1,4 @@
use anyhow::{bail, ensure, Context, Result};
use futures::StreamExt;
use pageserver_api::shard::TenantShardId;
use std::{collections::HashMap, sync::Arc};
use tracing::trace;
@@ -241,29 +240,16 @@ impl LayerManager {
layer.delete_on_drop();
}
pub(crate) fn resident_layers(&self) -> impl futures::stream::Stream<Item = Layer> + '_ {
pub(crate) fn likely_resident_layers(&self) -> impl Iterator<Item = Layer> + '_ {
// for small layer maps, we most likely have all resident, but for larger more are likely
// to be evicted assuming lots of layers correlated with longer lifespan.
let layers = self
.layer_map()
.iter_historic_layers()
.map(|desc| self.get_from_desc(&desc));
let layers = futures::stream::iter(layers);
layers.filter_map(|layer| async move {
// TODO(#6028): this query does not really need to see the ResidentLayer
match layer.keep_resident().await {
Ok(Some(layer)) => Some(layer.drop_eviction_guard()),
Ok(None) => None,
Err(e) => {
// these should not happen, but we cannot make them statically impossible right
// now.
tracing::warn!(%layer, "failed to keep the layer resident: {e:#}");
None
}
}
self.layer_map().iter_historic_layers().filter_map(|desc| {
self.layer_fmgr
.0
.get(&desc.key())
.filter(|l| l.is_likely_resident())
.cloned()
})
}

View File

@@ -24,7 +24,7 @@ mod connection_manager;
mod walreceiver_connection;
use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::{self, TaskKind, WALRECEIVER_RUNTIME};
use crate::task_mgr::{self, TaskKind};
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::timeline::walreceiver::connection_manager::{
connection_manager_loop_step, ConnectionManagerState,
@@ -82,7 +82,6 @@ impl WalReceiver {
let loop_status = Arc::new(std::sync::RwLock::new(None));
let manager_status = Arc::clone(&loop_status);
task_mgr::spawn(
WALRECEIVER_RUNTIME.handle(),
TaskKind::WalReceiverManager,
Some(timeline.tenant_shard_id),
Some(timeline_id),
@@ -181,7 +180,7 @@ impl<E: Clone> TaskHandle<E> {
let (events_sender, events_receiver) = watch::channel(TaskStateUpdate::Started);
let cancellation_clone = cancellation.clone();
let join_handle = WALRECEIVER_RUNTIME.spawn(async move {
let join_handle = tokio::spawn(async move {
events_sender.send(TaskStateUpdate::Started).ok();
task(events_sender, cancellation_clone).await
// events_sender is dropped at some point during the .await above.

View File

@@ -11,7 +11,6 @@ use std::{
use anyhow::{anyhow, Context};
use bytes::BytesMut;
use chrono::{NaiveDateTime, Utc};
use fail::fail_point;
use futures::StreamExt;
use postgres::{error::SqlState, SimpleQueryMessage, SimpleQueryRow};
use postgres_ffi::WAL_SEGMENT_SIZE;
@@ -27,9 +26,7 @@ use super::TaskStateUpdate;
use crate::{
context::RequestContext,
metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
task_mgr,
task_mgr::TaskKind,
task_mgr::WALRECEIVER_RUNTIME,
task_mgr::{self, TaskKind},
tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
walingest::WalIngest,
walrecord::DecodedWALRecord,
@@ -163,7 +160,6 @@ pub(super) async fn handle_walreceiver_connection(
);
let connection_cancellation = cancellation.clone();
task_mgr::spawn(
WALRECEIVER_RUNTIME.handle(),
TaskKind::WalReceiverConnectionPoller,
Some(timeline.tenant_shard_id),
Some(timeline.timeline_id),
@@ -329,7 +325,17 @@ pub(super) async fn handle_walreceiver_connection(
filtered_records += 1;
}
fail_point!("walreceiver-after-ingest");
// don't simply use pausable_failpoint here because its spawn_blocking slows
// slows down the tests too much.
fail::fail_point!("walreceiver-after-ingest-blocking");
if let Err(()) = (|| {
fail::fail_point!("walreceiver-after-ingest-pause-activate", |_| {
Err(())
});
Ok(())
})() {
pausable_failpoint!("walreceiver-after-ingest-pause");
}
last_rec_lsn = lsn;
@@ -448,6 +454,7 @@ pub(super) async fn handle_walreceiver_connection(
disk_consistent_lsn,
remote_consistent_lsn,
replytime: ts,
shard_number: timeline.tenant_shard_id.shard_number.0 as u32,
};
debug!("neon_status_update {status_update:?}");

View File

@@ -109,6 +109,8 @@ impl WalIngest {
self.checkpoint_modified = true;
}
failpoint_support::sleep_millis_async!("wal-ingest-record-sleep");
match decoded.xl_rmid {
pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => {
// Heap AM records need some special handling, because they modify VM pages

View File

@@ -312,7 +312,7 @@ pg_cluster_size(PG_FUNCTION_ARGS)
{
int64 size;
size = GetZenithCurrentClusterSize();
size = GetNeonCurrentClusterSize();
if (size == 0)
PG_RETURN_NULL();

View File

@@ -26,6 +26,8 @@ extern void pg_init_libpagestore(void);
extern void pg_init_walproposer(void);
extern uint64 BackpressureThrottlingTime(void);
extern void SetNeonCurrentClusterSize(uint64 size);
extern uint64 GetNeonCurrentClusterSize(void);
extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);
extern void PGDLLEXPORT WalProposerSync(int argc, char *argv[]);

View File

@@ -1831,7 +1831,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT &&
!IsAutoVacuumWorkerProcess())
{
uint64 current_size = GetZenithCurrentClusterSize();
uint64 current_size = GetNeonCurrentClusterSize();
if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024)
ereport(ERROR,
@@ -1912,7 +1912,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT &&
!IsAutoVacuumWorkerProcess())
{
uint64 current_size = GetZenithCurrentClusterSize();
uint64 current_size = GetNeonCurrentClusterSize();
if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024)
ereport(ERROR,

View File

@@ -70,7 +70,7 @@ static bool SendAppendRequests(Safekeeper *sk);
static bool RecvAppendResponses(Safekeeper *sk);
static XLogRecPtr CalculateMinFlushLsn(WalProposer *wp);
static XLogRecPtr GetAcknowledgedByQuorumWALPosition(WalProposer *wp);
static void HandleSafekeeperResponse(WalProposer *wp);
static void HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk);
static bool AsyncRead(Safekeeper *sk, char **buf, int *buf_size);
static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg);
static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state);
@@ -1405,7 +1405,6 @@ static bool
RecvAppendResponses(Safekeeper *sk)
{
WalProposer *wp = sk->wp;
XLogRecPtr newCommitLsn;
bool readAnything = false;
while (true)
@@ -1425,6 +1424,8 @@ RecvAppendResponses(Safekeeper *sk)
LSN_FORMAT_ARGS(sk->appendResponse.commitLsn),
sk->host, sk->port);
readAnything = true;
if (sk->appendResponse.term > wp->propTerm)
{
/*
@@ -1438,35 +1439,28 @@ RecvAppendResponses(Safekeeper *sk)
sk->appendResponse.term, wp->propTerm);
}
readAnything = true;
HandleSafekeeperResponse(wp, sk);
}
if (!readAnything)
return sk->state == SS_ACTIVE;
/* update commit_lsn */
newCommitLsn = GetAcknowledgedByQuorumWALPosition(wp);
/*
* Send the new value to all safekeepers.
*/
if (newCommitLsn > wp->commitLsn)
{
wp->commitLsn = newCommitLsn;
BroadcastAppendRequest(wp);
}
HandleSafekeeperResponse(wp);
return sk->state == SS_ACTIVE;
}
#define psfeedback_log(fmt, key, ...) \
wp_log(DEBUG2, "ParsePageserverFeedbackMessage: %s " fmt, key, __VA_ARGS__)
/* Parse a PageserverFeedback message, or the PageserverFeedback part of an AppendResponse */
static void
ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, PageserverFeedback *rf)
ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, PageserverFeedback *ps_feedback)
{
uint8 nkeys;
int i;
int32 len;
/* initialize the struct before parsing */
memset(ps_feedback, 0, sizeof(PageserverFeedback));
ps_feedback->present = true;
/* get number of custom keys */
nkeys = pq_getmsgbyte(reply_message);
@@ -1474,66 +1468,52 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
for (i = 0; i < nkeys; i++)
{
const char *key = pq_getmsgstring(reply_message);
unsigned int value_len = pq_getmsgint(reply_message, sizeof(int32));
if (strcmp(key, "current_timeline_size") == 0)
{
pq_getmsgint(reply_message, sizeof(int32));
/* read value length */
rf->currentClusterSize = pq_getmsgint64(reply_message);
wp_log(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu",
rf->currentClusterSize);
Assert(value_len == sizeof(int64));
ps_feedback->currentClusterSize = pq_getmsgint64(reply_message);
psfeedback_log(UINT64_FORMAT, key, ps_feedback->currentClusterSize);
}
else if ((strcmp(key, "ps_writelsn") == 0) || (strcmp(key, "last_received_lsn") == 0))
{
pq_getmsgint(reply_message, sizeof(int32));
/* read value length */
rf->last_received_lsn = pq_getmsgint64(reply_message);
wp_log(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X",
LSN_FORMAT_ARGS(rf->last_received_lsn));
Assert(value_len == sizeof(int64));
ps_feedback->last_received_lsn = pq_getmsgint64(reply_message);
psfeedback_log("%X/%X", key, LSN_FORMAT_ARGS(ps_feedback->last_received_lsn));
}
else if ((strcmp(key, "ps_flushlsn") == 0) || (strcmp(key, "disk_consistent_lsn") == 0))
{
pq_getmsgint(reply_message, sizeof(int32));
/* read value length */
rf->disk_consistent_lsn = pq_getmsgint64(reply_message);
wp_log(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X",
LSN_FORMAT_ARGS(rf->disk_consistent_lsn));
Assert(value_len == sizeof(int64));
ps_feedback->disk_consistent_lsn = pq_getmsgint64(reply_message);
psfeedback_log("%X/%X", key, LSN_FORMAT_ARGS(ps_feedback->disk_consistent_lsn));
}
else if ((strcmp(key, "ps_applylsn") == 0) || (strcmp(key, "remote_consistent_lsn") == 0))
{
pq_getmsgint(reply_message, sizeof(int32));
/* read value length */
rf->remote_consistent_lsn = pq_getmsgint64(reply_message);
wp_log(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X",
LSN_FORMAT_ARGS(rf->remote_consistent_lsn));
Assert(value_len == sizeof(int64));
ps_feedback->remote_consistent_lsn = pq_getmsgint64(reply_message);
psfeedback_log("%X/%X", key, LSN_FORMAT_ARGS(ps_feedback->remote_consistent_lsn));
}
else if ((strcmp(key, "ps_replytime") == 0) || (strcmp(key, "replytime") == 0))
{
pq_getmsgint(reply_message, sizeof(int32));
/* read value length */
rf->replytime = pq_getmsgint64(reply_message);
{
char *replyTimeStr;
/* Copy because timestamptz_to_str returns a static buffer */
replyTimeStr = pstrdup(timestamptz_to_str(rf->replytime));
wp_log(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s",
rf->replytime, replyTimeStr);
pfree(replyTimeStr);
}
Assert(value_len == sizeof(int64));
ps_feedback->replytime = pq_getmsgint64(reply_message);
psfeedback_log("%s", key, timestamptz_to_str(ps_feedback->replytime));
}
else if (strcmp(key, "shard_number") == 0)
{
Assert(value_len == sizeof(uint32));
ps_feedback->shard_number = pq_getmsgint(reply_message, sizeof(uint32));
psfeedback_log("%u", key, ps_feedback->shard_number);
}
else
{
len = pq_getmsgint(reply_message, sizeof(int32));
/* read value length */
/*
* Skip unknown keys to support backward compatibile protocol
* changes
*/
wp_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len);
pq_getmsgbytes(reply_message, len);
wp_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, value_len);
pq_getmsgbytes(reply_message, value_len);
};
}
}
@@ -1630,12 +1610,30 @@ GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn)
return donor;
}
/*
* Process AppendResponse message from safekeeper.
*/
static void
HandleSafekeeperResponse(WalProposer *wp)
HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk)
{
XLogRecPtr candidateTruncateLsn;
XLogRecPtr newCommitLsn;
wp->api.process_safekeeper_feedback(wp);
newCommitLsn = GetAcknowledgedByQuorumWALPosition(wp);
if (newCommitLsn > wp->commitLsn)
{
wp->commitLsn = newCommitLsn;
/* Send new value to all safekeepers. */
BroadcastAppendRequest(wp);
}
/*
* Unlock syncrep waiters, update ps_feedback, CheckGracefulShutdown().
* The last one will terminate the process if the shutdown is requested
* and WAL is committed by the quorum. BroadcastAppendRequest() should be
* called to notify safekeepers about the new commitLsn.
*/
wp->api.process_safekeeper_feedback(wp, sk);
/*
* Try to advance truncateLsn -- the last record flushed to all
@@ -1811,8 +1809,10 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg)
msg->hs.ts = pq_getmsgint64_le(&s);
msg->hs.xmin.value = pq_getmsgint64_le(&s);
msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s);
if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE)
ParsePageserverFeedbackMessage(wp, &s, &msg->rf);
if (s.len > s.cursor)
ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback);
else
msg->ps_feedback.present = false;
pq_getmsgend(&s);
return true;
}

View File

@@ -10,6 +10,7 @@
#include "libpqwalproposer.h"
#include "neon_walreader.h"
#include "pagestore_client.h"
#define SK_MAGIC 0xCafeCeefu
#define SK_PROTOCOL_VERSION 2
@@ -269,6 +270,8 @@ typedef struct HotStandbyFeedback
typedef struct PageserverFeedback
{
/* true if AppendResponse contains this feedback */
bool present;
/* current size of the timeline on pageserver */
uint64 currentClusterSize;
/* standby_status_update fields that safekeeper received from pageserver */
@@ -276,14 +279,22 @@ typedef struct PageserverFeedback
XLogRecPtr disk_consistent_lsn;
XLogRecPtr remote_consistent_lsn;
TimestampTz replytime;
uint32 shard_number;
} PageserverFeedback;
typedef struct WalproposerShmemState
{
slock_t mutex;
PageserverFeedback feedback;
term_t mineLastElectedTerm;
pg_atomic_uint64 backpressureThrottlingTime;
pg_atomic_uint64 currentClusterSize;
/* last feedback from each shard */
PageserverFeedback shard_ps_feedback[MAX_SHARDS];
int num_shards;
/* aggregated feedback with min LSNs across shards */
PageserverFeedback min_ps_feedback;
} WalproposerShmemState;
/*
@@ -307,12 +318,12 @@ typedef struct AppendResponse
/* Feedback received from pageserver includes standby_status_update fields */
/* and custom neon feedback. */
/* This part of the message is extensible. */
PageserverFeedback rf;
PageserverFeedback ps_feedback;
} AppendResponse;
/* PageserverFeedback is extensible part of the message that is parsed separately */
/* Other fields are fixed part */
#define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf)
#define APPENDRESPONSE_FIXEDPART_SIZE 56
struct WalProposer;
typedef struct WalProposer WalProposer;
@@ -560,11 +571,11 @@ typedef struct walproposer_api
void (*finish_sync_safekeepers) (WalProposer *wp, XLogRecPtr lsn);
/*
* Called after every new message from the safekeeper. Used to propagate
* Called after every AppendResponse from the safekeeper. Used to propagate
* backpressure feedback and to confirm WAL persistence (has been commited
* on the quorum of safekeepers).
*/
void (*process_safekeeper_feedback) (WalProposer *wp);
void (*process_safekeeper_feedback) (WalProposer *wp, Safekeeper *sk);
/*
* Write a log message to the internal log processor. This is used only

View File

@@ -63,7 +63,6 @@ char *wal_acceptors_list = "";
int wal_acceptor_reconnect_timeout = 1000;
int wal_acceptor_connection_timeout = 10000;
static AppendResponse quorumFeedback;
static WalproposerShmemState *walprop_shared;
static WalProposerConfig walprop_config;
static XLogRecPtr sentPtr = InvalidXLogRecPtr;
@@ -71,6 +70,10 @@ static const walproposer_api walprop_pg;
static volatile sig_atomic_t got_SIGUSR2 = false;
static bool reported_sigusr2 = false;
static XLogRecPtr standby_flush_lsn = InvalidXLogRecPtr;
static XLogRecPtr standby_apply_lsn = InvalidXLogRecPtr;
static HotStandbyFeedback agg_hs_feedback;
static void nwp_shmem_startup_hook(void);
static void nwp_register_gucs(void);
static void nwp_prepare_shmem(void);
@@ -279,6 +282,7 @@ WalproposerShmemInit(void)
memset(walprop_shared, 0, WalproposerShmemSize());
SpinLockInit(&walprop_shared->mutex);
pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0);
pg_atomic_init_u64(&walprop_shared->currentClusterSize, 0);
}
LWLockRelease(AddinShmemInitLock);
@@ -402,21 +406,58 @@ walprop_pg_get_shmem_state(WalProposer *wp)
return walprop_shared;
}
static void
replication_feedback_set(PageserverFeedback *rf)
/*
* Record new ps_feedback in the array with shards and update min_feedback.
*/
static PageserverFeedback
record_pageserver_feedback(PageserverFeedback *ps_feedback)
{
PageserverFeedback min_feedback;
Assert(ps_feedback->present);
Assert(ps_feedback->shard_number < MAX_SHARDS);
SpinLockAcquire(&walprop_shared->mutex);
memcpy(&walprop_shared->feedback, rf, sizeof(PageserverFeedback));
/* Update the number of shards */
if (ps_feedback->shard_number + 1 > walprop_shared->num_shards)
walprop_shared->num_shards = ps_feedback->shard_number + 1;
/* Update the feedback */
memcpy(&walprop_shared->shard_ps_feedback[ps_feedback->shard_number], ps_feedback, sizeof(PageserverFeedback));
/* Calculate min LSNs */
memcpy(&min_feedback, ps_feedback, sizeof(PageserverFeedback));
for (int i = 0; i < walprop_shared->num_shards; i++)
{
PageserverFeedback *feedback = &walprop_shared->shard_ps_feedback[i];
if (feedback->present)
{
if (min_feedback.last_received_lsn == InvalidXLogRecPtr || feedback->last_received_lsn < min_feedback.last_received_lsn)
min_feedback.last_received_lsn = feedback->last_received_lsn;
if (min_feedback.disk_consistent_lsn == InvalidXLogRecPtr || feedback->disk_consistent_lsn < min_feedback.disk_consistent_lsn)
min_feedback.disk_consistent_lsn = feedback->disk_consistent_lsn;
if (min_feedback.remote_consistent_lsn == InvalidXLogRecPtr || feedback->remote_consistent_lsn < min_feedback.remote_consistent_lsn)
min_feedback.remote_consistent_lsn = feedback->remote_consistent_lsn;
}
}
/* Copy min_feedback back to shmem */
memcpy(&walprop_shared->min_ps_feedback, &min_feedback, sizeof(PageserverFeedback));
SpinLockRelease(&walprop_shared->mutex);
return min_feedback;
}
void
replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn)
{
SpinLockAcquire(&walprop_shared->mutex);
*writeLsn = walprop_shared->feedback.last_received_lsn;
*flushLsn = walprop_shared->feedback.disk_consistent_lsn;
*applyLsn = walprop_shared->feedback.remote_consistent_lsn;
*writeLsn = walprop_shared->min_ps_feedback.last_received_lsn;
*flushLsn = walprop_shared->min_ps_feedback.disk_consistent_lsn;
*applyLsn = walprop_shared->min_ps_feedback.remote_consistent_lsn;
SpinLockRelease(&walprop_shared->mutex);
}
@@ -509,9 +550,10 @@ walprop_pg_init_standalone_sync_safekeepers(void)
static void
walprop_sigusr2(SIGNAL_ARGS)
{
int save_errno = errno;
got_SIGUSR2 = true;
SetLatch(MyLatch);
errno = save_errno;
}
static void
@@ -1869,39 +1911,6 @@ CheckGracefulShutdown(WalProposer *wp)
}
}
/*
* Choose most advanced PageserverFeedback and set it to *rf.
*/
static void
GetLatestNeonFeedback(PageserverFeedback *rf, WalProposer *wp)
{
int latest_safekeeper = 0;
XLogRecPtr last_received_lsn = InvalidXLogRecPtr;
for (int i = 0; i < wp->n_safekeepers; i++)
{
if (wp->safekeeper[i].appendResponse.rf.last_received_lsn > last_received_lsn)
{
latest_safekeeper = i;
last_received_lsn = wp->safekeeper[i].appendResponse.rf.last_received_lsn;
}
}
rf->currentClusterSize = wp->safekeeper[latest_safekeeper].appendResponse.rf.currentClusterSize;
rf->last_received_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.last_received_lsn;
rf->disk_consistent_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.disk_consistent_lsn;
rf->remote_consistent_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.remote_consistent_lsn;
rf->replytime = wp->safekeeper[latest_safekeeper].appendResponse.rf.replytime;
wpg_log(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu,"
" last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu",
rf->currentClusterSize,
LSN_FORMAT_ARGS(rf->last_received_lsn),
LSN_FORMAT_ARGS(rf->disk_consistent_lsn),
LSN_FORMAT_ARGS(rf->remote_consistent_lsn),
rf->replytime);
}
/*
* Combine hot standby feedbacks from all safekeepers.
*/
@@ -1949,26 +1958,38 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp)
* None of that is functional in sync-safekeepers.
*/
static void
walprop_pg_process_safekeeper_feedback(WalProposer *wp)
walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk)
{
HotStandbyFeedback hsFeedback;
XLogRecPtr oldDiskConsistentLsn;
HotStandbyFeedback hsFeedback;
bool needToAdvanceSlot = false;
if (wp->config->syncSafekeepers)
return;
oldDiskConsistentLsn = quorumFeedback.rf.disk_consistent_lsn;
/* Get PageserverFeedback fields from the most advanced safekeeper */
GetLatestNeonFeedback(&quorumFeedback.rf, wp);
replication_feedback_set(&quorumFeedback.rf);
SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize);
if (wp->commitLsn > quorumFeedback.flushLsn || oldDiskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn)
/* handle fresh ps_feedback */
if (sk->appendResponse.ps_feedback.present)
{
if (wp->commitLsn > quorumFeedback.flushLsn)
quorumFeedback.flushLsn = wp->commitLsn;
PageserverFeedback min_feedback = record_pageserver_feedback(&sk->appendResponse.ps_feedback);
/* Only one main shard sends non-zero currentClusterSize */
if (sk->appendResponse.ps_feedback.currentClusterSize > 0)
SetNeonCurrentClusterSize(sk->appendResponse.ps_feedback.currentClusterSize);
if (min_feedback.disk_consistent_lsn != standby_apply_lsn)
{
standby_apply_lsn = min_feedback.disk_consistent_lsn;
needToAdvanceSlot = true;
}
}
if (wp->commitLsn > standby_flush_lsn)
{
standby_flush_lsn = wp->commitLsn;
needToAdvanceSlot = true;
}
if (needToAdvanceSlot)
{
/*
* Advance the replication slot to commitLsn. WAL before it is
* hardened and will be fetched from one of safekeepers by
@@ -1977,23 +1998,23 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp)
* Also wakes up syncrep waiters.
*/
ProcessStandbyReply(
/* write_lsn - This is what durably stored in WAL service. */
quorumFeedback.flushLsn,
/* flush_lsn - This is what durably stored in WAL service. */
quorumFeedback.flushLsn,
/* write_lsn - This is what durably stored in safekeepers quorum. */
standby_flush_lsn,
/* flush_lsn - This is what durably stored in safekeepers quorum. */
standby_flush_lsn,
/*
* apply_lsn - This is what processed and durably saved at*
* pageserver.
*/
quorumFeedback.rf.disk_consistent_lsn,
standby_apply_lsn,
walprop_pg_get_current_timestamp(wp), false);
}
CombineHotStanbyFeedbacks(&hsFeedback, wp);
if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &quorumFeedback.hs, sizeof hsFeedback) != 0)
if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0)
{
quorumFeedback.hs = hsFeedback;
agg_hs_feedback = hsFeedback;
ProcessStandbyHSFeedback(hsFeedback.ts,
XidFromFullTransactionId(hsFeedback.xmin),
EpochFromFullTransactionId(hsFeedback.xmin),
@@ -2074,6 +2095,18 @@ GetLogRepRestartLSN(WalProposer *wp)
return lrRestartLsn;
}
void SetNeonCurrentClusterSize(uint64 size)
{
pg_atomic_write_u64(&walprop_shared->currentClusterSize, size);
}
uint64 GetNeonCurrentClusterSize(void)
{
return pg_atomic_read_u64(&walprop_shared->currentClusterSize);
}
uint64 GetNeonCurrentClusterSize(void);
static const walproposer_api walprop_pg = {
.get_shmem_state = walprop_pg_get_shmem_state,
.start_streaming = walprop_pg_start_streaming,

View File

@@ -11,6 +11,10 @@ testing = []
[dependencies]
anyhow.workspace = true
async-trait.workspace = true
aws-config.workspace = true
aws-sdk-iam.workspace = true
aws-sigv4.workspace = true
aws-types.workspace = true
base64.workspace = true
bstr.workspace = true
bytes = { workspace = true, features = ["serde"] }
@@ -27,6 +31,7 @@ hashlink.workspace = true
hex.workspace = true
hmac.workspace = true
hostname.workspace = true
http.workspace = true
humantime.workspace = true
hyper-tungstenite.workspace = true
hyper.workspace = true
@@ -59,10 +64,11 @@ rustls.workspace = true
scopeguard.workspace = true
serde.workspace = true
serde_json.workspace = true
sha2.workspace = true
sha2 = { workspace = true, features = ["asm"] }
smol_str.workspace = true
smallvec.workspace = true
socket2.workspace = true
subtle.workspace = true
sync_wrapper.workspace = true
task-local-extensions.workspace = true
thiserror.workspace = true
@@ -91,6 +97,7 @@ workspace_hack.workspace = true
[dev-dependencies]
camino-tempfile.workspace = true
fallible-iterator.workspace = true
rcgen.workspace = true
rstest.workspace = true
tokio-postgres-rustls.workspace = true

View File

@@ -254,7 +254,7 @@ async fn authenticate_with_secret(
config: &'static AuthenticationConfig,
) -> auth::Result<ComputeCredentials> {
if let Some(password) = unauthenticated_password {
let auth_outcome = validate_password_and_exchange(&password, secret)?;
let auth_outcome = validate_password_and_exchange(&password, secret).await?;
let keys = match auth_outcome {
crate::sasl::Outcome::Success(key) => key,
crate::sasl::Outcome::Failure(reason) => {
@@ -408,3 +408,228 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> {
}
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use bytes::BytesMut;
use fallible_iterator::FallibleIterator;
use postgres_protocol::{
authentication::sasl::{ChannelBinding, ScramSha256},
message::{backend::Message as PgMessage, frontend},
};
use provider::AuthSecret;
use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt};
use crate::{
auth::{ComputeUserInfoMaybeEndpoint, IpPattern},
config::AuthenticationConfig,
console::{
self,
provider::{self, CachedAllowedIps, CachedRoleSecret},
CachedNodeInfo,
},
context::RequestMonitoring,
proxy::NeonOptions,
scram::ServerSecret,
stream::{PqStream, Stream},
};
use super::auth_quirks;
struct Auth {
ips: Vec<IpPattern>,
secret: AuthSecret,
}
impl console::Api for Auth {
async fn get_role_secret(
&self,
_ctx: &mut RequestMonitoring,
_user_info: &super::ComputeUserInfo,
) -> Result<CachedRoleSecret, console::errors::GetAuthInfoError> {
Ok(CachedRoleSecret::new_uncached(Some(self.secret.clone())))
}
async fn get_allowed_ips_and_secret(
&self,
_ctx: &mut RequestMonitoring,
_user_info: &super::ComputeUserInfo,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), console::errors::GetAuthInfoError>
{
Ok((
CachedAllowedIps::new_uncached(Arc::new(self.ips.clone())),
Some(CachedRoleSecret::new_uncached(Some(self.secret.clone()))),
))
}
async fn wake_compute(
&self,
_ctx: &mut RequestMonitoring,
_user_info: &super::ComputeUserInfo,
) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
unimplemented!()
}
}
static CONFIG: &AuthenticationConfig = &AuthenticationConfig {
scram_protocol_timeout: std::time::Duration::from_secs(5),
};
async fn read_message(r: &mut (impl AsyncRead + Unpin), b: &mut BytesMut) -> PgMessage {
loop {
r.read_buf(&mut *b).await.unwrap();
if let Some(m) = PgMessage::parse(&mut *b).unwrap() {
break m;
}
}
}
#[tokio::test]
async fn auth_quirks_scram() {
let (mut client, server) = tokio::io::duplex(1024);
let mut stream = PqStream::new(Stream::from_raw(server));
let mut ctx = RequestMonitoring::test();
let api = Auth {
ips: vec![],
secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
};
let user_info = ComputeUserInfoMaybeEndpoint {
user: "conrad".into(),
endpoint_id: Some("endpoint".into()),
options: NeonOptions::default(),
};
let handle = tokio::spawn(async move {
let mut scram = ScramSha256::new(b"my-secret-password", ChannelBinding::unsupported());
let mut read = BytesMut::new();
// server should offer scram
match read_message(&mut client, &mut read).await {
PgMessage::AuthenticationSasl(a) => {
let options: Vec<&str> = a.mechanisms().collect().unwrap();
assert_eq!(options, ["SCRAM-SHA-256"]);
}
_ => panic!("wrong message"),
}
// client sends client-first-message
let mut write = BytesMut::new();
frontend::sasl_initial_response("SCRAM-SHA-256", scram.message(), &mut write).unwrap();
client.write_all(&write).await.unwrap();
// server response with server-first-message
match read_message(&mut client, &mut read).await {
PgMessage::AuthenticationSaslContinue(a) => {
scram.update(a.data()).await.unwrap();
}
_ => panic!("wrong message"),
}
// client response with client-final-message
write.clear();
frontend::sasl_response(scram.message(), &mut write).unwrap();
client.write_all(&write).await.unwrap();
// server response with server-final-message
match read_message(&mut client, &mut read).await {
PgMessage::AuthenticationSaslFinal(a) => {
scram.finish(a.data()).unwrap();
}
_ => panic!("wrong message"),
}
});
let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, false, CONFIG)
.await
.unwrap();
handle.await.unwrap();
}
#[tokio::test]
async fn auth_quirks_cleartext() {
let (mut client, server) = tokio::io::duplex(1024);
let mut stream = PqStream::new(Stream::from_raw(server));
let mut ctx = RequestMonitoring::test();
let api = Auth {
ips: vec![],
secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
};
let user_info = ComputeUserInfoMaybeEndpoint {
user: "conrad".into(),
endpoint_id: Some("endpoint".into()),
options: NeonOptions::default(),
};
let handle = tokio::spawn(async move {
let mut read = BytesMut::new();
let mut write = BytesMut::new();
// server should offer cleartext
match read_message(&mut client, &mut read).await {
PgMessage::AuthenticationCleartextPassword => {}
_ => panic!("wrong message"),
}
// client responds with password
write.clear();
frontend::password_message(b"my-secret-password", &mut write).unwrap();
client.write_all(&write).await.unwrap();
});
let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, CONFIG)
.await
.unwrap();
handle.await.unwrap();
}
#[tokio::test]
async fn auth_quirks_password_hack() {
let (mut client, server) = tokio::io::duplex(1024);
let mut stream = PqStream::new(Stream::from_raw(server));
let mut ctx = RequestMonitoring::test();
let api = Auth {
ips: vec![],
secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
};
let user_info = ComputeUserInfoMaybeEndpoint {
user: "conrad".into(),
endpoint_id: None,
options: NeonOptions::default(),
};
let handle = tokio::spawn(async move {
let mut read = BytesMut::new();
// server should offer cleartext
match read_message(&mut client, &mut read).await {
PgMessage::AuthenticationCleartextPassword => {}
_ => panic!("wrong message"),
}
// client responds with password
let mut write = BytesMut::new();
frontend::password_message(b"endpoint=my-endpoint;my-secret-password", &mut write)
.unwrap();
client.write_all(&write).await.unwrap();
});
let creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, CONFIG)
.await
.unwrap();
assert_eq!(creds.info.endpoint, "my-endpoint");
handle.await.unwrap();
}
}

View File

@@ -126,7 +126,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, CleartextPassword> {
.strip_suffix(&[0])
.ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?;
let outcome = validate_password_and_exchange(password, self.state.0)?;
let outcome = validate_password_and_exchange(password, self.state.0).await?;
if let sasl::Outcome::Success(_) = &outcome {
self.stream.write_message_noflush(&Be::AuthenticationOk)?;
@@ -180,7 +180,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
}
}
pub(crate) fn validate_password_and_exchange(
pub(crate) async fn validate_password_and_exchange(
password: &[u8],
secret: AuthSecret,
) -> super::Result<sasl::Outcome<ComputeCredentialKeys>> {
@@ -194,13 +194,7 @@ pub(crate) fn validate_password_and_exchange(
}
// perform scram authentication as both client and server to validate the keys
AuthSecret::Scram(scram_secret) => {
use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256};
let sasl_client = ScramSha256::new(password, ChannelBinding::unsupported());
let outcome = crate::scram::exchange(
&scram_secret,
sasl_client,
crate::config::TlsServerEndPoint::Undefined,
)?;
let outcome = crate::scram::exchange(&scram_secret, password).await?;
let client_key = match outcome {
sasl::Outcome::Success(client_key) => client_key,

View File

@@ -1,3 +1,10 @@
use aws_config::environment::EnvironmentVariableCredentialsProvider;
use aws_config::imds::credentials::ImdsCredentialsProvider;
use aws_config::meta::credentials::CredentialsProviderChain;
use aws_config::meta::region::RegionProviderChain;
use aws_config::profile::ProfileFileCredentialsProvider;
use aws_config::provider_config::ProviderConfig;
use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider;
use futures::future::Either;
use proxy::auth;
use proxy::auth::backend::MaybeOwned;
@@ -10,11 +17,14 @@ use proxy::config::ProjectInfoCacheOptions;
use proxy::console;
use proxy::context::parquet::ParquetUploadArgs;
use proxy::http;
use proxy::metrics::NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT;
use proxy::rate_limiter::EndpointRateLimiter;
use proxy::rate_limiter::RateBucketInfo;
use proxy::rate_limiter::RateLimiterConfig;
use proxy::redis::cancellation_publisher::RedisPublisherClient;
use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
use proxy::redis::elasticache;
use proxy::redis::notifications;
use proxy::redis::publisher::RedisPublisherClient;
use proxy::serverless::GlobalConnPoolOptions;
use proxy::usage_metrics;
@@ -150,9 +160,24 @@ struct ProxyCliArgs {
/// disable ip check for http requests. If it is too time consuming, it could be turned off.
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
disable_ip_check_for_http: bool,
/// redis url for notifications.
/// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections)
#[clap(long)]
redis_notifications: Option<String>,
/// redis host for streaming connections (might be different from the notifications host)
#[clap(long)]
redis_host: Option<String>,
/// redis port for streaming connections (might be different from the notifications host)
#[clap(long)]
redis_port: Option<u16>,
/// redis cluster name, used in aws elasticache
#[clap(long)]
redis_cluster_name: Option<String>,
/// redis user_id, used in aws elasticache
#[clap(long)]
redis_user_id: Option<String>,
/// aws region to retrieve credentials
#[clap(long, default_value_t = String::new())]
aws_region: String,
/// cache for `project_info` (use `size=0` to disable)
#[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)]
project_info_cache: String,
@@ -216,6 +241,61 @@ async fn main() -> anyhow::Result<()> {
let config = build_config(&args)?;
info!("Authentication backend: {}", config.auth_backend);
info!("Using region: {}", config.aws_region);
let region_provider = RegionProviderChain::default_provider().or_else(&*config.aws_region); // Replace with your Redis region if needed
let provider_conf =
ProviderConfig::without_region().with_region(region_provider.region().await);
let aws_credentials_provider = {
// uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
CredentialsProviderChain::first_try("env", EnvironmentVariableCredentialsProvider::new())
// uses "AWS_PROFILE" / `aws sso login --profile <profile>`
.or_else(
"profile-sso",
ProfileFileCredentialsProvider::builder()
.configure(&provider_conf)
.build(),
)
// uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
// needed to access remote extensions bucket
.or_else(
"token",
WebIdentityTokenCredentialsProvider::builder()
.configure(&provider_conf)
.build(),
)
// uses imds v2
.or_else("imds", ImdsCredentialsProvider::builder().build())
};
let elasticache_credentials_provider = Arc::new(elasticache::CredentialsProvider::new(
elasticache::AWSIRSAConfig::new(
config.aws_region.clone(),
args.redis_cluster_name,
args.redis_user_id,
),
aws_credentials_provider,
));
let redis_notifications_client =
match (args.redis_notifications, (args.redis_host, args.redis_port)) {
(Some(url), _) => {
info!("Starting redis notifications listener ({url})");
Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url))
}
(None, (Some(host), Some(port))) => Some(
ConnectionWithCredentialsProvider::new_with_credentials_provider(
host,
port,
elasticache_credentials_provider.clone(),
),
),
(None, (None, None)) => {
warn!("Redis is disabled");
None
}
_ => {
bail!("redis-host and redis-port must be specified together");
}
};
// Check that we can bind to address before further initialization
let http_address: SocketAddr = args.http.parse()?;
@@ -233,17 +313,22 @@ async fn main() -> anyhow::Result<()> {
let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(&config.endpoint_rps_limit));
let cancel_map = CancelMap::default();
let redis_publisher = match &args.redis_notifications {
Some(url) => Some(Arc::new(Mutex::new(RedisPublisherClient::new(
url,
// let redis_notifications_client = redis_notifications_client.map(|x| Box::leak(Box::new(x)));
let redis_publisher = match &redis_notifications_client {
Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new(
redis_publisher.clone(),
args.region.clone(),
&config.redis_rps_limit,
)?))),
None => None,
};
let cancellation_handler = Arc::new(CancellationHandler::new(
let cancellation_handler = Arc::new(CancellationHandler::<
Option<Arc<tokio::sync::Mutex<RedisPublisherClient>>>,
>::new(
cancel_map.clone(),
redis_publisher,
NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT,
));
// client facing tasks. these will exit on error or on cancellation
@@ -290,17 +375,16 @@ async fn main() -> anyhow::Result<()> {
if let auth::BackendType::Console(api, _) = &config.auth_backend {
if let proxy::console::provider::ConsoleBackend::Console(api) = &**api {
let cache = api.caches.project_info.clone();
if let Some(url) = args.redis_notifications {
info!("Starting redis notifications listener ({url})");
if let Some(redis_notifications_client) = redis_notifications_client {
let cache = api.caches.project_info.clone();
maintenance_tasks.spawn(notifications::task_main(
url.to_owned(),
redis_notifications_client.clone(),
cache.clone(),
cancel_map.clone(),
args.region.clone(),
));
maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
}
maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
}
}
@@ -445,8 +529,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
endpoint_rps_limit,
redis_rps_limit,
handshake_timeout: args.handshake_timeout,
// TODO: add this argument
region: args.region.clone(),
aws_region: args.aws_region.clone(),
}));
Ok(config)

View File

@@ -1,4 +1,3 @@
use async_trait::async_trait;
use dashmap::DashMap;
use pq_proto::CancelKeyData;
use std::{net::SocketAddr, sync::Arc};
@@ -10,18 +9,26 @@ use tracing::info;
use uuid::Uuid;
use crate::{
error::ReportableError, metrics::NUM_CANCELLATION_REQUESTS,
redis::publisher::RedisPublisherClient,
error::ReportableError,
metrics::NUM_CANCELLATION_REQUESTS,
redis::cancellation_publisher::{
CancellationPublisher, CancellationPublisherMut, RedisPublisherClient,
},
};
pub type CancelMap = Arc<DashMap<CancelKeyData, Option<CancelClosure>>>;
pub type CancellationHandlerMain = CancellationHandler<Option<Arc<Mutex<RedisPublisherClient>>>>;
pub type CancellationHandlerMainInternal = Option<Arc<Mutex<RedisPublisherClient>>>;
/// Enables serving `CancelRequest`s.
///
/// If there is a `RedisPublisherClient` available, it will be used to publish the cancellation key to other proxy instances.
pub struct CancellationHandler {
/// If `CancellationPublisher` is available, cancel request will be used to publish the cancellation key to other proxy instances.
pub struct CancellationHandler<P> {
map: CancelMap,
redis_client: Option<Arc<Mutex<RedisPublisherClient>>>,
client: P,
/// This field used for the monitoring purposes.
/// Represents the source of the cancellation request.
from: &'static str,
}
#[derive(Debug, Error)]
@@ -44,49 +51,9 @@ impl ReportableError for CancelError {
}
}
impl CancellationHandler {
pub fn new(map: CancelMap, redis_client: Option<Arc<Mutex<RedisPublisherClient>>>) -> Self {
Self { map, redis_client }
}
/// Cancel a running query for the corresponding connection.
pub async fn cancel_session(
&self,
key: CancelKeyData,
session_id: Uuid,
) -> Result<(), CancelError> {
let from = "from_client";
// NB: we should immediately release the lock after cloning the token.
let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else {
tracing::warn!("query cancellation key not found: {key}");
if let Some(redis_client) = &self.redis_client {
NUM_CANCELLATION_REQUESTS
.with_label_values(&[from, "not_found"])
.inc();
info!("publishing cancellation key to Redis");
match redis_client.lock().await.try_publish(key, session_id).await {
Ok(()) => {
info!("cancellation key successfuly published to Redis");
}
Err(e) => {
tracing::error!("failed to publish a message: {e}");
return Err(CancelError::IO(std::io::Error::new(
std::io::ErrorKind::Other,
e.to_string(),
)));
}
}
}
return Ok(());
};
NUM_CANCELLATION_REQUESTS
.with_label_values(&[from, "found"])
.inc();
info!("cancelling query per user's request using key {key}");
cancel_closure.try_cancel_query().await
}
impl<P: CancellationPublisher> CancellationHandler<P> {
/// Run async action within an ephemeral session identified by [`CancelKeyData`].
pub fn get_session(self: Arc<Self>) -> Session {
pub fn get_session(self: Arc<Self>) -> Session<P> {
// HACK: We'd rather get the real backend_pid but tokio_postgres doesn't
// expose it and we don't want to do another roundtrip to query
// for it. The client will be able to notice that this is not the
@@ -112,9 +79,39 @@ impl CancellationHandler {
cancellation_handler: self,
}
}
/// Try to cancel a running query for the corresponding connection.
/// If the cancellation key is not found, it will be published to Redis.
pub async fn cancel_session(
&self,
key: CancelKeyData,
session_id: Uuid,
) -> Result<(), CancelError> {
// NB: we should immediately release the lock after cloning the token.
let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else {
tracing::warn!("query cancellation key not found: {key}");
NUM_CANCELLATION_REQUESTS
.with_label_values(&[self.from, "not_found"])
.inc();
match self.client.try_publish(key, session_id).await {
Ok(()) => {} // do nothing
Err(e) => {
return Err(CancelError::IO(std::io::Error::new(
std::io::ErrorKind::Other,
e.to_string(),
)));
}
}
return Ok(());
};
NUM_CANCELLATION_REQUESTS
.with_label_values(&[self.from, "found"])
.inc();
info!("cancelling query per user's request using key {key}");
cancel_closure.try_cancel_query().await
}
#[cfg(test)]
fn contains(&self, session: &Session) -> bool {
fn contains(&self, session: &Session<P>) -> bool {
self.map.contains_key(&session.key)
}
@@ -124,31 +121,19 @@ impl CancellationHandler {
}
}
#[async_trait]
pub trait NotificationsCancellationHandler {
async fn cancel_session_no_publish(&self, key: CancelKeyData) -> Result<(), CancelError>;
impl CancellationHandler<()> {
pub fn new(map: CancelMap, from: &'static str) -> Self {
Self {
map,
client: (),
from,
}
}
}
#[async_trait]
impl NotificationsCancellationHandler for CancellationHandler {
async fn cancel_session_no_publish(&self, key: CancelKeyData) -> Result<(), CancelError> {
let from = "from_redis";
let cancel_closure = self.map.get(&key).and_then(|x| x.clone());
match cancel_closure {
Some(cancel_closure) => {
NUM_CANCELLATION_REQUESTS
.with_label_values(&[from, "found"])
.inc();
cancel_closure.try_cancel_query().await
}
None => {
NUM_CANCELLATION_REQUESTS
.with_label_values(&[from, "not_found"])
.inc();
tracing::warn!("query cancellation key not found: {key}");
Ok(())
}
}
impl<P: CancellationPublisherMut> CancellationHandler<Option<Arc<Mutex<P>>>> {
pub fn new(map: CancelMap, client: Option<Arc<Mutex<P>>>, from: &'static str) -> Self {
Self { map, client, from }
}
}
@@ -178,14 +163,14 @@ impl CancelClosure {
}
/// Helper for registering query cancellation tokens.
pub struct Session {
pub struct Session<P> {
/// The user-facing key identifying this session.
key: CancelKeyData,
/// The [`CancelMap`] this session belongs to.
cancellation_handler: Arc<CancellationHandler>,
cancellation_handler: Arc<CancellationHandler<P>>,
}
impl Session {
impl<P> Session<P> {
/// Store the cancel token for the given session.
/// This enables query cancellation in `crate::proxy::prepare_client_connection`.
pub fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData {
@@ -198,7 +183,7 @@ impl Session {
}
}
impl Drop for Session {
impl<P> Drop for Session<P> {
fn drop(&mut self) {
self.cancellation_handler.map.remove(&self.key);
info!("dropped query cancellation key {}", &self.key);
@@ -207,14 +192,16 @@ impl Drop for Session {
#[cfg(test)]
mod tests {
use crate::metrics::NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS;
use super::*;
#[tokio::test]
async fn check_session_drop() -> anyhow::Result<()> {
let cancellation_handler = Arc::new(CancellationHandler {
map: CancelMap::default(),
redis_client: None,
});
let cancellation_handler = Arc::new(CancellationHandler::<()>::new(
CancelMap::default(),
NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS,
));
let session = cancellation_handler.clone().get_session();
assert!(cancellation_handler.contains(&session));
@@ -224,4 +211,19 @@ mod tests {
Ok(())
}
#[tokio::test]
async fn cancel_session_noop_regression() {
let handler = CancellationHandler::<()>::new(Default::default(), "local");
handler
.cancel_session(
CancelKeyData {
backend_pid: 0,
cancel_key: 0,
},
Uuid::new_v4(),
)
.await
.unwrap();
}
}

View File

@@ -82,14 +82,13 @@ pub type ScramKeys = tokio_postgres::config::ScramKeys<32>;
/// A config for establishing a connection to compute node.
/// Eventually, `tokio_postgres` will be replaced with something better.
/// Newtype allows us to implement methods on top of it.
#[derive(Clone)]
#[repr(transparent)]
#[derive(Clone, Default)]
pub struct ConnCfg(Box<tokio_postgres::Config>);
/// Creation and initialization routines.
impl ConnCfg {
pub fn new() -> Self {
Self(Default::default())
Self::default()
}
/// Reuse password or auth keys from the other config.
@@ -165,12 +164,6 @@ impl std::ops::DerefMut for ConnCfg {
}
}
impl Default for ConnCfg {
fn default() -> Self {
Self::new()
}
}
impl ConnCfg {
/// Establish a raw TCP connection to the compute node.
async fn connect_raw(&self, timeout: Duration) -> io::Result<(SocketAddr, TcpStream, &str)> {

View File

@@ -28,6 +28,7 @@ pub struct ProxyConfig {
pub redis_rps_limit: Vec<RateBucketInfo>,
pub region: String,
pub handshake_timeout: Duration,
pub aws_region: String,
}
#[derive(Debug)]

View File

@@ -6,7 +6,7 @@ pub mod messages;
/// Wrappers for console APIs and their mocks.
pub mod provider;
pub use provider::{errors, Api, AuthSecret, CachedNodeInfo, NodeInfo};
pub(crate) use provider::{errors, Api, AuthSecret, CachedNodeInfo, NodeInfo};
/// Various cache-related types.
pub mod caches {

View File

@@ -14,7 +14,6 @@ use crate::{
context::RequestMonitoring,
scram, EndpointCacheKey, ProjectId,
};
use async_trait::async_trait;
use dashmap::DashMap;
use std::{sync::Arc, time::Duration};
use tokio::sync::{OwnedSemaphorePermit, Semaphore};
@@ -326,8 +325,7 @@ pub type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc<Vec<IpPatt
/// This will allocate per each call, but the http requests alone
/// already require a few allocations, so it should be fine.
#[async_trait]
pub trait Api {
pub(crate) trait Api {
/// Get the client's auth secret for authentication.
/// Returns option because user not found situation is special.
/// We still have to mock the scram to avoid leaking information that user doesn't exist.
@@ -363,7 +361,6 @@ pub enum ConsoleBackend {
Test(Box<dyn crate::auth::backend::TestBackend>),
}
#[async_trait]
impl Api for ConsoleBackend {
async fn get_role_secret(
&self,

View File

@@ -8,7 +8,6 @@ use crate::console::provider::{CachedAllowedIps, CachedRoleSecret};
use crate::context::RequestMonitoring;
use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl};
use crate::{auth::IpPattern, cache::Cached};
use async_trait::async_trait;
use futures::TryFutureExt;
use std::{str::FromStr, sync::Arc};
use thiserror::Error;
@@ -144,7 +143,6 @@ async fn get_execute_postgres_query(
Ok(Some(entry))
}
#[async_trait]
impl super::Api for Api {
#[tracing::instrument(skip_all)]
async fn get_role_secret(

View File

@@ -14,7 +14,6 @@ use crate::{
context::RequestMonitoring,
metrics::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER},
};
use async_trait::async_trait;
use futures::TryFutureExt;
use std::sync::Arc;
use tokio::time::Instant;
@@ -168,7 +167,6 @@ impl Api {
}
}
#[async_trait]
impl super::Api for Api {
#[tracing::instrument(skip_all)]
async fn get_role_secret(

View File

@@ -2,14 +2,21 @@ use anyhow::{anyhow, bail};
use hyper::{Body, Request, Response, StatusCode};
use std::{convert::Infallible, net::TcpListener};
use tracing::info;
use utils::http::{endpoint, error::ApiError, json::json_response, RouterBuilder, RouterService};
use utils::http::{
endpoint::{self, prometheus_metrics_handler, request_span},
error::ApiError,
json::json_response,
RouterBuilder, RouterService,
};
async fn status_handler(_: Request<Body>) -> Result<Response<Body>, ApiError> {
json_response(StatusCode::OK, "")
}
fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
endpoint::make_router().get("/v1/status", status_handler)
endpoint::make_router()
.get("/metrics", |r| request_span(r, prometheus_metrics_handler))
.get("/v1/status", status_handler)
}
pub async fn task_main(http_listener: TcpListener) -> anyhow::Result<Infallible> {

View File

@@ -161,6 +161,9 @@ pub static NUM_CANCELLATION_REQUESTS: Lazy<IntCounterVec> = Lazy::new(|| {
.unwrap()
});
pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT: &str = "from_client";
pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS: &str = "from_redis";
pub enum Waiting {
Cplane,
Client,

View File

@@ -10,7 +10,7 @@ pub mod wake_compute;
use crate::{
auth,
cancellation::{self, CancellationHandler},
cancellation::{self, CancellationHandlerMain, CancellationHandlerMainInternal},
compute,
config::{ProxyConfig, TlsConfig},
context::RequestMonitoring,
@@ -62,7 +62,7 @@ pub async fn task_main(
listener: tokio::net::TcpListener,
cancellation_token: CancellationToken,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
cancellation_handler: Arc<CancellationHandler>,
cancellation_handler: Arc<CancellationHandlerMain>,
) -> anyhow::Result<()> {
scopeguard::defer! {
info!("proxy has shut down");
@@ -233,12 +233,12 @@ impl ReportableError for ClientRequestError {
pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
config: &'static ProxyConfig,
ctx: &mut RequestMonitoring,
cancellation_handler: Arc<CancellationHandler>,
cancellation_handler: Arc<CancellationHandlerMain>,
stream: S,
mode: ClientMode,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
conn_gauge: IntCounterPairGuard,
) -> Result<Option<ProxyPassthrough<S>>, ClientRequestError> {
) -> Result<Option<ProxyPassthrough<CancellationHandlerMainInternal, S>>, ClientRequestError> {
info!("handling interactive connection from client");
let proto = ctx.protocol;
@@ -338,9 +338,9 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
/// Finish client connection initialization: confirm auth success, send params, etc.
#[tracing::instrument(skip_all)]
async fn prepare_client_connection(
async fn prepare_client_connection<P>(
node: &compute::PostgresConnection,
session: &cancellation::Session,
session: &cancellation::Session<P>,
stream: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
) -> Result<(), std::io::Error> {
// Register compute's query cancellation token and produce a new, unique one.

View File

@@ -55,17 +55,17 @@ pub async fn proxy_pass(
Ok(())
}
pub struct ProxyPassthrough<S> {
pub struct ProxyPassthrough<P, S> {
pub client: Stream<S>,
pub compute: PostgresConnection,
pub aux: MetricsAuxInfo,
pub req: IntCounterPairGuard,
pub conn: IntCounterPairGuard,
pub cancel: cancellation::Session,
pub cancel: cancellation::Session<P>,
}
impl<S: AsyncRead + AsyncWrite + Unpin> ProxyPassthrough<S> {
impl<P, S: AsyncRead + AsyncWrite + Unpin> ProxyPassthrough<P, S> {
pub async fn proxy_pass(self) -> anyhow::Result<()> {
let res = proxy_pass(self.client, self.compute.stream, self.aux).await;
self.compute.cancel_closure.try_cancel_query().await?;

View File

@@ -135,9 +135,10 @@ impl TestAuth for NoAuth {}
struct Scram(scram::ServerSecret);
impl Scram {
fn new(password: &str) -> anyhow::Result<Self> {
let secret =
scram::ServerSecret::build(password).context("failed to generate scram secret")?;
async fn new(password: &str) -> anyhow::Result<Self> {
let secret = scram::ServerSecret::build(password)
.await
.context("failed to generate scram secret")?;
Ok(Scram(secret))
}
@@ -284,7 +285,7 @@ async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> {
let proxy = tokio::spawn(dummy_proxy(
client,
Some(server_config),
Scram::new(password)?,
Scram::new(password).await?,
));
let (_client, _conn) = tokio_postgres::Config::new()
@@ -308,7 +309,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> {
let proxy = tokio::spawn(dummy_proxy(
client,
Some(server_config),
Scram::new("password")?,
Scram::new("password").await?,
));
let (_client, _conn) = tokio_postgres::Config::new()

View File

@@ -148,7 +148,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> {
let proxy = tokio::spawn(dummy_proxy(
client,
Some(server_config),
Scram::new("password")?,
Scram::new("password").await?,
));
let _client_err = tokio_postgres::Config::new()
@@ -231,7 +231,7 @@ async fn connect_failure(
let proxy = tokio::spawn(dummy_proxy(
client,
Some(server_config),
Scram::new("password")?,
Scram::new("password").await?,
));
let _client_err = tokio_postgres::Config::new()

View File

@@ -1,2 +1,4 @@
pub mod cancellation_publisher;
pub mod connection_with_credentials_provider;
pub mod elasticache;
pub mod notifications;
pub mod publisher;

View File

@@ -0,0 +1,161 @@
use std::sync::Arc;
use pq_proto::CancelKeyData;
use redis::AsyncCommands;
use tokio::sync::Mutex;
use uuid::Uuid;
use crate::rate_limiter::{RateBucketInfo, RedisRateLimiter};
use super::{
connection_with_credentials_provider::ConnectionWithCredentialsProvider,
notifications::{CancelSession, Notification, PROXY_CHANNEL_NAME},
};
pub trait CancellationPublisherMut: Send + Sync + 'static {
#[allow(async_fn_in_trait)]
async fn try_publish(
&mut self,
cancel_key_data: CancelKeyData,
session_id: Uuid,
) -> anyhow::Result<()>;
}
pub trait CancellationPublisher: Send + Sync + 'static {
#[allow(async_fn_in_trait)]
async fn try_publish(
&self,
cancel_key_data: CancelKeyData,
session_id: Uuid,
) -> anyhow::Result<()>;
}
impl CancellationPublisher for () {
async fn try_publish(
&self,
_cancel_key_data: CancelKeyData,
_session_id: Uuid,
) -> anyhow::Result<()> {
Ok(())
}
}
impl<P: CancellationPublisher> CancellationPublisherMut for P {
async fn try_publish(
&mut self,
cancel_key_data: CancelKeyData,
session_id: Uuid,
) -> anyhow::Result<()> {
<P as CancellationPublisher>::try_publish(self, cancel_key_data, session_id).await
}
}
impl<P: CancellationPublisher> CancellationPublisher for Option<P> {
async fn try_publish(
&self,
cancel_key_data: CancelKeyData,
session_id: Uuid,
) -> anyhow::Result<()> {
if let Some(p) = self {
p.try_publish(cancel_key_data, session_id).await
} else {
Ok(())
}
}
}
impl<P: CancellationPublisherMut> CancellationPublisher for Arc<Mutex<P>> {
async fn try_publish(
&self,
cancel_key_data: CancelKeyData,
session_id: Uuid,
) -> anyhow::Result<()> {
self.lock()
.await
.try_publish(cancel_key_data, session_id)
.await
}
}
pub struct RedisPublisherClient {
client: ConnectionWithCredentialsProvider,
region_id: String,
limiter: RedisRateLimiter,
}
impl RedisPublisherClient {
pub fn new(
client: ConnectionWithCredentialsProvider,
region_id: String,
info: &'static [RateBucketInfo],
) -> anyhow::Result<Self> {
Ok(Self {
client,
region_id,
limiter: RedisRateLimiter::new(info),
})
}
async fn publish(
&mut self,
cancel_key_data: CancelKeyData,
session_id: Uuid,
) -> anyhow::Result<()> {
let payload = serde_json::to_string(&Notification::Cancel(CancelSession {
region_id: Some(self.region_id.clone()),
cancel_key_data,
session_id,
}))?;
self.client.publish(PROXY_CHANNEL_NAME, payload).await?;
Ok(())
}
pub async fn try_connect(&mut self) -> anyhow::Result<()> {
match self.client.connect().await {
Ok(()) => {}
Err(e) => {
tracing::error!("failed to connect to redis: {e}");
return Err(e);
}
}
Ok(())
}
async fn try_publish_internal(
&mut self,
cancel_key_data: CancelKeyData,
session_id: Uuid,
) -> anyhow::Result<()> {
if !self.limiter.check() {
tracing::info!("Rate limit exceeded. Skipping cancellation message");
return Err(anyhow::anyhow!("Rate limit exceeded"));
}
match self.publish(cancel_key_data, session_id).await {
Ok(()) => return Ok(()),
Err(e) => {
tracing::error!("failed to publish a message: {e}");
}
}
tracing::info!("Publisher is disconnected. Reconnectiong...");
self.try_connect().await?;
self.publish(cancel_key_data, session_id).await
}
}
impl CancellationPublisherMut for RedisPublisherClient {
async fn try_publish(
&mut self,
cancel_key_data: CancelKeyData,
session_id: Uuid,
) -> anyhow::Result<()> {
tracing::info!("publishing cancellation key to Redis");
match self.try_publish_internal(cancel_key_data, session_id).await {
Ok(()) => {
tracing::info!("cancellation key successfuly published to Redis");
Ok(())
}
Err(e) => {
tracing::error!("failed to publish a message: {e}");
Err(e)
}
}
}
}

View File

@@ -0,0 +1,225 @@
use std::{sync::Arc, time::Duration};
use futures::FutureExt;
use redis::{
aio::{ConnectionLike, MultiplexedConnection},
ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisResult,
};
use tokio::task::JoinHandle;
use tracing::{error, info};
use super::elasticache::CredentialsProvider;
enum Credentials {
Static(ConnectionInfo),
Dynamic(Arc<CredentialsProvider>, redis::ConnectionAddr),
}
impl Clone for Credentials {
fn clone(&self) -> Self {
match self {
Credentials::Static(info) => Credentials::Static(info.clone()),
Credentials::Dynamic(provider, addr) => {
Credentials::Dynamic(Arc::clone(provider), addr.clone())
}
}
}
}
/// A wrapper around `redis::MultiplexedConnection` that automatically refreshes the token.
/// Provides PubSub connection without credentials refresh.
pub struct ConnectionWithCredentialsProvider {
credentials: Credentials,
con: Option<MultiplexedConnection>,
refresh_token_task: Option<JoinHandle<()>>,
mutex: tokio::sync::Mutex<()>,
}
impl Clone for ConnectionWithCredentialsProvider {
fn clone(&self) -> Self {
Self {
credentials: self.credentials.clone(),
con: None,
refresh_token_task: None,
mutex: tokio::sync::Mutex::new(()),
}
}
}
impl ConnectionWithCredentialsProvider {
pub fn new_with_credentials_provider(
host: String,
port: u16,
credentials_provider: Arc<CredentialsProvider>,
) -> Self {
Self {
credentials: Credentials::Dynamic(
credentials_provider,
redis::ConnectionAddr::TcpTls {
host,
port,
insecure: false,
tls_params: None,
},
),
con: None,
refresh_token_task: None,
mutex: tokio::sync::Mutex::new(()),
}
}
pub fn new_with_static_credentials<T: IntoConnectionInfo>(params: T) -> Self {
Self {
credentials: Credentials::Static(params.into_connection_info().unwrap()),
con: None,
refresh_token_task: None,
mutex: tokio::sync::Mutex::new(()),
}
}
pub async fn connect(&mut self) -> anyhow::Result<()> {
let _guard = self.mutex.lock().await;
if let Some(con) = self.con.as_mut() {
match redis::cmd("PING").query_async(con).await {
Ok(()) => {
return Ok(());
}
Err(e) => {
error!("Error during PING: {e:?}");
}
}
} else {
info!("Connection is not established");
}
info!("Establishing a new connection...");
self.con = None;
if let Some(f) = self.refresh_token_task.take() {
f.abort()
}
let con = self
.get_client()
.await?
.get_multiplexed_tokio_connection()
.await?;
if let Credentials::Dynamic(credentials_provider, _) = &self.credentials {
let credentials_provider = credentials_provider.clone();
let con2 = con.clone();
let f = tokio::spawn(async move {
let _ = Self::keep_connection(con2, credentials_provider).await;
});
self.refresh_token_task = Some(f);
}
self.con = Some(con);
Ok(())
}
async fn get_connection_info(&self) -> anyhow::Result<ConnectionInfo> {
match &self.credentials {
Credentials::Static(info) => Ok(info.clone()),
Credentials::Dynamic(provider, addr) => {
let (username, password) = provider.provide_credentials().await?;
Ok(ConnectionInfo {
addr: addr.clone(),
redis: RedisConnectionInfo {
db: 0,
username: Some(username),
password: Some(password.clone()),
},
})
}
}
}
async fn get_client(&self) -> anyhow::Result<redis::Client> {
let client = redis::Client::open(self.get_connection_info().await?)?;
Ok(client)
}
// PubSub does not support credentials refresh.
// Requires manual reconnection every 12h.
pub async fn get_async_pubsub(&self) -> anyhow::Result<redis::aio::PubSub> {
Ok(self.get_client().await?.get_async_pubsub().await?)
}
// The connection lives for 12h.
// It can be prolonged with sending `AUTH` commands with the refreshed token.
// https://docs.aws.amazon.com/AmazonElastiCache/latest/red-ug/auth-iam.html#auth-iam-limits
async fn keep_connection(
mut con: MultiplexedConnection,
credentials_provider: Arc<CredentialsProvider>,
) -> anyhow::Result<()> {
loop {
// The connection lives for 12h, for the sanity check we refresh it every hour.
tokio::time::sleep(Duration::from_secs(60 * 60)).await;
match Self::refresh_token(&mut con, credentials_provider.clone()).await {
Ok(()) => {
info!("Token refreshed");
}
Err(e) => {
error!("Error during token refresh: {e:?}");
}
}
}
}
async fn refresh_token(
con: &mut MultiplexedConnection,
credentials_provider: Arc<CredentialsProvider>,
) -> anyhow::Result<()> {
let (user, password) = credentials_provider.provide_credentials().await?;
redis::cmd("AUTH")
.arg(user)
.arg(password)
.query_async(con)
.await?;
Ok(())
}
/// Sends an already encoded (packed) command into the TCP socket and
/// reads the single response from it.
pub async fn send_packed_command(&mut self, cmd: &redis::Cmd) -> RedisResult<redis::Value> {
// Clone connection to avoid having to lock the ArcSwap in write mode
let con = self.con.as_mut().ok_or(redis::RedisError::from((
redis::ErrorKind::IoError,
"Connection not established",
)))?;
con.send_packed_command(cmd).await
}
/// Sends multiple already encoded (packed) command into the TCP socket
/// and reads `count` responses from it. This is used to implement
/// pipelining.
pub async fn send_packed_commands(
&mut self,
cmd: &redis::Pipeline,
offset: usize,
count: usize,
) -> RedisResult<Vec<redis::Value>> {
// Clone shared connection future to avoid having to lock the ArcSwap in write mode
let con = self.con.as_mut().ok_or(redis::RedisError::from((
redis::ErrorKind::IoError,
"Connection not established",
)))?;
con.send_packed_commands(cmd, offset, count).await
}
}
impl ConnectionLike for ConnectionWithCredentialsProvider {
fn req_packed_command<'a>(
&'a mut self,
cmd: &'a redis::Cmd,
) -> redis::RedisFuture<'a, redis::Value> {
(async move { self.send_packed_command(cmd).await }).boxed()
}
fn req_packed_commands<'a>(
&'a mut self,
cmd: &'a redis::Pipeline,
offset: usize,
count: usize,
) -> redis::RedisFuture<'a, Vec<redis::Value>> {
(async move { self.send_packed_commands(cmd, offset, count).await }).boxed()
}
fn get_db(&self) -> i64 {
0
}
}

View File

@@ -0,0 +1,110 @@
use std::time::{Duration, SystemTime};
use aws_config::meta::credentials::CredentialsProviderChain;
use aws_sdk_iam::config::ProvideCredentials;
use aws_sigv4::http_request::{
self, SignableBody, SignableRequest, SignatureLocation, SigningSettings,
};
use tracing::info;
#[derive(Debug)]
pub struct AWSIRSAConfig {
region: String,
service_name: String,
cluster_name: String,
user_id: String,
token_ttl: Duration,
action: String,
}
impl AWSIRSAConfig {
pub fn new(region: String, cluster_name: Option<String>, user_id: Option<String>) -> Self {
AWSIRSAConfig {
region,
service_name: "elasticache".to_string(),
cluster_name: cluster_name.unwrap_or_default(),
user_id: user_id.unwrap_or_default(),
// "The IAM authentication token is valid for 15 minutes"
// https://docs.aws.amazon.com/memorydb/latest/devguide/auth-iam.html#auth-iam-limits
token_ttl: Duration::from_secs(15 * 60),
action: "connect".to_string(),
}
}
}
/// Credentials provider for AWS elasticache authentication.
///
/// Official documentation:
/// <https://docs.aws.amazon.com/AmazonElastiCache/latest/red-ug/auth-iam.html>
///
/// Useful resources:
/// <https://aws.amazon.com/blogs/database/simplify-managing-access-to-amazon-elasticache-for-redis-clusters-with-iam/>
pub struct CredentialsProvider {
config: AWSIRSAConfig,
credentials_provider: CredentialsProviderChain,
}
impl CredentialsProvider {
pub fn new(config: AWSIRSAConfig, credentials_provider: CredentialsProviderChain) -> Self {
CredentialsProvider {
config,
credentials_provider,
}
}
pub async fn provide_credentials(&self) -> anyhow::Result<(String, String)> {
let aws_credentials = self
.credentials_provider
.provide_credentials()
.await?
.into();
info!("AWS credentials successfully obtained");
info!("Connecting to Redis with configuration: {:?}", self.config);
let mut settings = SigningSettings::default();
settings.signature_location = SignatureLocation::QueryParams;
settings.expires_in = Some(self.config.token_ttl);
let signing_params = aws_sigv4::sign::v4::SigningParams::builder()
.identity(&aws_credentials)
.region(&self.config.region)
.name(&self.config.service_name)
.time(SystemTime::now())
.settings(settings)
.build()?
.into();
let auth_params = [
("Action", &self.config.action),
("User", &self.config.user_id),
];
let auth_params = url::form_urlencoded::Serializer::new(String::new())
.extend_pairs(auth_params)
.finish();
let auth_uri = http::Uri::builder()
.scheme("http")
.authority(self.config.cluster_name.as_bytes())
.path_and_query(format!("/?{auth_params}"))
.build()?;
info!("{}", auth_uri);
// Convert the HTTP request into a signable request
let signable_request = SignableRequest::new(
"GET",
auth_uri.to_string(),
std::iter::empty(),
SignableBody::Bytes(&[]),
)?;
// Sign and then apply the signature to the request
let (si, _) = http_request::sign(signable_request, &signing_params)?.into_parts();
let mut signable_request = http::Request::builder()
.method("GET")
.uri(auth_uri)
.body(())?;
si.apply_to_request_http1x(&mut signable_request);
Ok((
self.config.user_id.clone(),
signable_request
.uri()
.to_string()
.replacen("http://", "", 1),
))
}
}

View File

@@ -6,11 +6,12 @@ use redis::aio::PubSub;
use serde::{Deserialize, Serialize};
use uuid::Uuid;
use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
use crate::{
cache::project_info::ProjectInfoCache,
cancellation::{CancelMap, CancellationHandler, NotificationsCancellationHandler},
cancellation::{CancelMap, CancellationHandler},
intern::{ProjectIdInt, RoleNameInt},
metrics::REDIS_BROKEN_MESSAGES,
metrics::{NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS, REDIS_BROKEN_MESSAGES},
};
const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates";
@@ -18,23 +19,13 @@ pub(crate) const PROXY_CHANNEL_NAME: &str = "neondb-proxy-to-proxy-updates";
const RECONNECT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(20);
const INVALIDATION_LAG: std::time::Duration = std::time::Duration::from_secs(20);
struct RedisConsumerClient {
client: redis::Client,
}
impl RedisConsumerClient {
pub fn new(url: &str) -> anyhow::Result<Self> {
let client = redis::Client::open(url)?;
Ok(Self { client })
}
async fn try_connect(&self) -> anyhow::Result<PubSub> {
let mut conn = self.client.get_async_connection().await?.into_pubsub();
tracing::info!("subscribing to a channel `{CPLANE_CHANNEL_NAME}`");
conn.subscribe(CPLANE_CHANNEL_NAME).await?;
tracing::info!("subscribing to a channel `{PROXY_CHANNEL_NAME}`");
conn.subscribe(PROXY_CHANNEL_NAME).await?;
Ok(conn)
}
async fn try_connect(client: &ConnectionWithCredentialsProvider) -> anyhow::Result<PubSub> {
let mut conn = client.get_async_pubsub().await?;
tracing::info!("subscribing to a channel `{CPLANE_CHANNEL_NAME}`");
conn.subscribe(CPLANE_CHANNEL_NAME).await?;
tracing::info!("subscribing to a channel `{PROXY_CHANNEL_NAME}`");
conn.subscribe(PROXY_CHANNEL_NAME).await?;
Ok(conn)
}
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
@@ -80,21 +71,18 @@ where
serde_json::from_str(&s).map_err(<D::Error as serde::de::Error>::custom)
}
struct MessageHandler<
C: ProjectInfoCache + Send + Sync + 'static,
H: NotificationsCancellationHandler + Send + Sync + 'static,
> {
struct MessageHandler<C: ProjectInfoCache + Send + Sync + 'static> {
cache: Arc<C>,
cancellation_handler: Arc<H>,
cancellation_handler: Arc<CancellationHandler<()>>,
region_id: String,
}
impl<
C: ProjectInfoCache + Send + Sync + 'static,
H: NotificationsCancellationHandler + Send + Sync + 'static,
> MessageHandler<C, H>
{
pub fn new(cache: Arc<C>, cancellation_handler: Arc<H>, region_id: String) -> Self {
impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
pub fn new(
cache: Arc<C>,
cancellation_handler: Arc<CancellationHandler<()>>,
region_id: String,
) -> Self {
Self {
cache,
cancellation_handler,
@@ -139,7 +127,7 @@ impl<
// This instance of cancellation_handler doesn't have a RedisPublisherClient so it can't publish the message.
match self
.cancellation_handler
.cancel_session_no_publish(cancel_session.cancel_key_data)
.cancel_session(cancel_session.cancel_key_data, uuid::Uuid::nil())
.await
{
Ok(()) => {}
@@ -182,7 +170,7 @@ fn invalidate_cache<C: ProjectInfoCache>(cache: Arc<C>, msg: Notification) {
/// Handle console's invalidation messages.
#[tracing::instrument(name = "console_notifications", skip_all)]
pub async fn task_main<C>(
url: String,
redis: ConnectionWithCredentialsProvider,
cache: Arc<C>,
cancel_map: CancelMap,
region_id: String,
@@ -193,13 +181,15 @@ where
cache.enable_ttl();
let handler = MessageHandler::new(
cache,
Arc::new(CancellationHandler::new(cancel_map, None)),
Arc::new(CancellationHandler::<()>::new(
cancel_map,
NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS,
)),
region_id,
);
loop {
let redis = RedisConsumerClient::new(&url)?;
let conn = match redis.try_connect().await {
let mut conn = match try_connect(&redis).await {
Ok(conn) => {
handler.disable_ttl();
conn
@@ -212,7 +202,7 @@ where
continue;
}
};
let mut stream = conn.into_on_message();
let mut stream = conn.on_message();
while let Some(msg) = stream.next().await {
match handler.handle_message(msg).await {
Ok(()) => {}

View File

@@ -1,80 +0,0 @@
use pq_proto::CancelKeyData;
use redis::AsyncCommands;
use uuid::Uuid;
use crate::rate_limiter::{RateBucketInfo, RedisRateLimiter};
use super::notifications::{CancelSession, Notification, PROXY_CHANNEL_NAME};
pub struct RedisPublisherClient {
client: redis::Client,
publisher: Option<redis::aio::Connection>,
region_id: String,
limiter: RedisRateLimiter,
}
impl RedisPublisherClient {
pub fn new(
url: &str,
region_id: String,
info: &'static [RateBucketInfo],
) -> anyhow::Result<Self> {
let client = redis::Client::open(url)?;
Ok(Self {
client,
publisher: None,
region_id,
limiter: RedisRateLimiter::new(info),
})
}
pub async fn try_publish(
&mut self,
cancel_key_data: CancelKeyData,
session_id: Uuid,
) -> anyhow::Result<()> {
if !self.limiter.check() {
tracing::info!("Rate limit exceeded. Skipping cancellation message");
return Err(anyhow::anyhow!("Rate limit exceeded"));
}
match self.publish(cancel_key_data, session_id).await {
Ok(()) => return Ok(()),
Err(e) => {
tracing::error!("failed to publish a message: {e}");
self.publisher = None;
}
}
tracing::info!("Publisher is disconnected. Reconnectiong...");
self.try_connect().await?;
self.publish(cancel_key_data, session_id).await
}
async fn publish(
&mut self,
cancel_key_data: CancelKeyData,
session_id: Uuid,
) -> anyhow::Result<()> {
let conn = self
.publisher
.as_mut()
.ok_or_else(|| anyhow::anyhow!("not connected"))?;
let payload = serde_json::to_string(&Notification::Cancel(CancelSession {
region_id: Some(self.region_id.clone()),
cancel_key_data,
session_id,
}))?;
conn.publish(PROXY_CHANNEL_NAME, payload).await?;
Ok(())
}
pub async fn try_connect(&mut self) -> anyhow::Result<()> {
match self.client.get_async_connection().await {
Ok(conn) => {
self.publisher = Some(conn);
}
Err(e) => {
tracing::error!("failed to connect to redis: {e}");
return Err(e.into());
}
}
Ok(())
}
}

View File

@@ -33,6 +33,9 @@ pub enum Error {
#[error("Internal error: missing digest")]
MissingBinding,
#[error("could not decode salt: {0}")]
Base64(#[from] base64::DecodeError),
#[error(transparent)]
Io(#[from] io::Error),
}
@@ -55,6 +58,7 @@ impl ReportableError for Error {
Error::ChannelBindingBadMethod(_) => crate::error::ErrorKind::User,
Error::BadClientMessage(_) => crate::error::ErrorKind::User,
Error::MissingBinding => crate::error::ErrorKind::Service,
Error::Base64(_) => crate::error::ErrorKind::ControlPlane,
Error::Io(_) => crate::error::ErrorKind::ClientDisconnect,
}
}

View File

@@ -56,8 +56,6 @@ fn sha256<'a>(parts: impl IntoIterator<Item = &'a [u8]>) -> [u8; 32] {
#[cfg(test)]
mod tests {
use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256};
use crate::sasl::{Mechanism, Step};
use super::{Exchange, ServerSecret};
@@ -113,17 +111,11 @@ mod tests {
);
}
fn run_round_trip_test(server_password: &str, client_password: &str) {
let scram_secret = ServerSecret::build(server_password).unwrap();
let sasl_client =
ScramSha256::new(client_password.as_bytes(), ChannelBinding::unsupported());
let outcome = super::exchange(
&scram_secret,
sasl_client,
crate::config::TlsServerEndPoint::Undefined,
)
.unwrap();
async fn run_round_trip_test(server_password: &str, client_password: &str) {
let scram_secret = ServerSecret::build(server_password).await.unwrap();
let outcome = super::exchange(&scram_secret, client_password.as_bytes())
.await
.unwrap();
match outcome {
crate::sasl::Outcome::Success(_) => {}
@@ -131,14 +123,14 @@ mod tests {
}
}
#[test]
fn round_trip() {
run_round_trip_test("pencil", "pencil")
#[tokio::test]
async fn round_trip() {
run_round_trip_test("pencil", "pencil").await
}
#[test]
#[tokio::test]
#[should_panic(expected = "password doesn't match")]
fn failure() {
run_round_trip_test("pencil", "eraser")
async fn failure() {
run_round_trip_test("pencil", "eraser").await
}
}

View File

@@ -2,13 +2,16 @@
use std::convert::Infallible;
use postgres_protocol::authentication::sasl::ScramSha256;
use hmac::{Hmac, Mac};
use sha2::Sha256;
use tokio::task::yield_now;
use super::messages::{
ClientFinalMessage, ClientFirstMessage, OwnedServerFirstMessage, SCRAM_RAW_NONCE_LEN,
};
use super::secret::ServerSecret;
use super::signature::SignatureBuilder;
use super::ScramKey;
use crate::config;
use crate::sasl::{self, ChannelBinding, Error as SaslError};
@@ -71,40 +74,62 @@ impl<'a> Exchange<'a> {
}
}
pub fn exchange(
// copied from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L36-L61>
async fn pbkdf2(str: &[u8], salt: &[u8], iterations: u32) -> [u8; 32] {
let hmac = Hmac::<Sha256>::new_from_slice(str).expect("HMAC is able to accept all key sizes");
let mut prev = hmac
.clone()
.chain_update(salt)
.chain_update(1u32.to_be_bytes())
.finalize()
.into_bytes();
let mut hi = prev;
for i in 1..iterations {
prev = hmac.clone().chain_update(prev).finalize().into_bytes();
for (hi, prev) in hi.iter_mut().zip(prev) {
*hi ^= prev;
}
// yield every ~250us
// hopefully reduces tail latencies
if i % 1024 == 0 {
yield_now().await
}
}
hi.into()
}
// copied from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L236-L248>
async fn derive_client_key(password: &[u8], salt: &[u8], iterations: u32) -> ScramKey {
let salted_password = pbkdf2(password, salt, iterations).await;
let make_key = |name| {
let key = Hmac::<Sha256>::new_from_slice(&salted_password)
.expect("HMAC is able to accept all key sizes")
.chain_update(name)
.finalize();
<[u8; 32]>::from(key.into_bytes())
};
make_key(b"Client Key").into()
}
pub async fn exchange(
secret: &ServerSecret,
mut client: ScramSha256,
tls_server_end_point: config::TlsServerEndPoint,
password: &[u8],
) -> sasl::Result<sasl::Outcome<super::ScramKey>> {
use sasl::Step::*;
let salt = base64::decode(&secret.salt_base64)?;
let client_key = derive_client_key(password, &salt, secret.iterations).await;
let init = SaslInitial {
nonce: rand::random,
};
let client_first = std::str::from_utf8(client.message())
.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
let sent = match init.transition(secret, &tls_server_end_point, client_first)? {
Continue(sent, server_first) => {
client.update(server_first.as_bytes())?;
sent
}
Success(x, _) => match x {},
Failure(msg) => return Ok(sasl::Outcome::Failure(msg)),
};
let client_final = std::str::from_utf8(client.message())
.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
let keys = match sent.transition(secret, &tls_server_end_point, client_final)? {
Success(keys, server_final) => {
client.finish(server_final.as_bytes())?;
keys
}
Continue(x, _) => match x {},
Failure(msg) => return Ok(sasl::Outcome::Failure(msg)),
};
Ok(sasl::Outcome::Success(keys))
if secret.is_password_invalid(&client_key).into() {
Ok(sasl::Outcome::Failure("password doesn't match"))
} else {
Ok(sasl::Outcome::Success(client_key))
}
}
impl SaslInitial {
@@ -185,7 +210,7 @@ impl SaslSentInner {
.derive_client_key(&client_final_message.proof);
// Auth fails either if keys don't match or it's pre-determined to fail.
if client_key.sha256() != secret.stored_key || secret.doomed {
if secret.is_password_invalid(&client_key).into() {
return Ok(sasl::Step::Failure("password doesn't match"));
}

Some files were not shown because too many files have changed in this diff Show More