mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-20 22:50:38 +00:00
Compare commits
274 Commits
revert-718
...
release-pr
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5bad8126dc | ||
|
|
27bc242085 | ||
|
|
192b49cc6d | ||
|
|
e1b60f3693 | ||
|
|
2804f5323b | ||
|
|
676adc6b32 | ||
|
|
96a4e8de66 | ||
|
|
01180666b0 | ||
|
|
6c94269c32 | ||
|
|
edc691647d | ||
|
|
855d7b4781 | ||
|
|
c49c9707ce | ||
|
|
2227540a0d | ||
|
|
f1347f2417 | ||
|
|
30b295b017 | ||
|
|
1cef395266 | ||
|
|
78d160f76d | ||
|
|
b9238059d6 | ||
|
|
d0cb4b88c8 | ||
|
|
1ec3e39d4e | ||
|
|
a1a74eef2c | ||
|
|
90e689adda | ||
|
|
f0b2d4b053 | ||
|
|
299d9474c9 | ||
|
|
7234208b36 | ||
|
|
93450f11f5 | ||
|
|
2f0f9edf33 | ||
|
|
d424f2b7c8 | ||
|
|
21315e80bc | ||
|
|
483b66d383 | ||
|
|
aa72a22661 | ||
|
|
5c0264b591 | ||
|
|
9f13277729 | ||
|
|
54aa319805 | ||
|
|
4a227484bf | ||
|
|
2f83f85291 | ||
|
|
d6cfcb0d93 | ||
|
|
392843ad2a | ||
|
|
bd4dae8f4a | ||
|
|
b05fe53cfd | ||
|
|
c13a2f0df1 | ||
|
|
39be366fc5 | ||
|
|
6eda0a3158 | ||
|
|
306c7a1813 | ||
|
|
80be423a58 | ||
|
|
5dcfef82f2 | ||
|
|
e67b8f69c0 | ||
|
|
e546872ab4 | ||
|
|
322ea1cf7c | ||
|
|
3633742de9 | ||
|
|
079d3a37ba | ||
|
|
a46e77b476 | ||
|
|
a92702b01e | ||
|
|
8ff3253f20 | ||
|
|
04b82c92a7 | ||
|
|
e5bf423e68 | ||
|
|
60af392e45 | ||
|
|
661fc41e71 | ||
|
|
702c488f32 | ||
|
|
45c5122754 | ||
|
|
558394f710 | ||
|
|
73b0898608 | ||
|
|
e65be4c2dc | ||
|
|
40087b8164 | ||
|
|
c762b59483 | ||
|
|
5d71601ca9 | ||
|
|
a113c3e433 | ||
|
|
e81fc598f4 | ||
|
|
48b845fa76 | ||
|
|
27096858dc | ||
|
|
4430d0ae7d | ||
|
|
6e183aa0de | ||
|
|
fd6d0b7635 | ||
|
|
3710c32aae | ||
|
|
be83bee49d | ||
|
|
cf28e5922a | ||
|
|
7d384d6953 | ||
|
|
4b3b37b912 | ||
|
|
1d8d200f4d | ||
|
|
0d80d6ce18 | ||
|
|
f653ee039f | ||
|
|
e614a95853 | ||
|
|
850db4cc13 | ||
|
|
8a316b1277 | ||
|
|
4d13bae449 | ||
|
|
49377abd98 | ||
|
|
a6b2f4e54e | ||
|
|
face60d50b | ||
|
|
9768aa27f2 | ||
|
|
96b2e575e1 | ||
|
|
7222777784 | ||
|
|
5469fdede0 | ||
|
|
72aa6b9fdd | ||
|
|
ae0634b7be | ||
|
|
70711f32fa | ||
|
|
52a88af0aa | ||
|
|
b7a43bf817 | ||
|
|
dce91b33a4 | ||
|
|
23ee4f3050 | ||
|
|
46857e8282 | ||
|
|
368ab0ce54 | ||
|
|
a5987eebfd | ||
|
|
6686ede30f | ||
|
|
373c7057cc | ||
|
|
7d6ec16166 | ||
|
|
0e6fdc8a58 | ||
|
|
521438a5c6 | ||
|
|
07d7874bc8 | ||
|
|
1804111a02 | ||
|
|
cd0178efed | ||
|
|
333574be57 | ||
|
|
79a799a143 | ||
|
|
9da06af6c9 | ||
|
|
ce1753d036 | ||
|
|
67db8432b4 | ||
|
|
4e2e44e524 | ||
|
|
ed786104f3 | ||
|
|
84b74f2bd1 | ||
|
|
fec2ad6283 | ||
|
|
98eebd4682 | ||
|
|
2f74287c9b | ||
|
|
aee1bf95e3 | ||
|
|
b9de9d75ff | ||
|
|
7943b709e6 | ||
|
|
d7d066d493 | ||
|
|
e78ac22107 | ||
|
|
76a8f2bb44 | ||
|
|
8d59a8581f | ||
|
|
b1ddd01289 | ||
|
|
6eae4fc9aa | ||
|
|
765455bca2 | ||
|
|
4204960942 | ||
|
|
67345d66ea | ||
|
|
2266ee5971 | ||
|
|
b58445d855 | ||
|
|
36050e7f3d | ||
|
|
33360ed96d | ||
|
|
39a28d1108 | ||
|
|
efa6aa134f | ||
|
|
2c724e56e2 | ||
|
|
feff887c6f | ||
|
|
353d915fcf | ||
|
|
2e38098cbc | ||
|
|
a6fe5ea1ac | ||
|
|
05b0aed0c1 | ||
|
|
cd1705357d | ||
|
|
6bc7561290 | ||
|
|
fbd3ac14b5 | ||
|
|
e437787c8f | ||
|
|
3460dbf90b | ||
|
|
6b89d99677 | ||
|
|
6cc8ea86e4 | ||
|
|
e62a492d6f | ||
|
|
a475cdf642 | ||
|
|
7002c79a47 | ||
|
|
ee6cf357b4 | ||
|
|
e5c2086b5f | ||
|
|
5f1208296a | ||
|
|
88e8e473cd | ||
|
|
b0a77844f6 | ||
|
|
1baf464307 | ||
|
|
e9b8e81cea | ||
|
|
85d6194aa4 | ||
|
|
333a7a68ef | ||
|
|
6aa4e41bee | ||
|
|
840183e51f | ||
|
|
cbccc94b03 | ||
|
|
fce227df22 | ||
|
|
bd787e800f | ||
|
|
4a7704b4a3 | ||
|
|
ff1119da66 | ||
|
|
4c3ba1627b | ||
|
|
1407174fb2 | ||
|
|
ec9dcb1889 | ||
|
|
d11d781afc | ||
|
|
4e44565b71 | ||
|
|
4ed51ad33b | ||
|
|
1c1ebe5537 | ||
|
|
c19cb7f386 | ||
|
|
4b97d31b16 | ||
|
|
923ade3dd7 | ||
|
|
b04e711975 | ||
|
|
afd0a6b39a | ||
|
|
99752286d8 | ||
|
|
15df93363c | ||
|
|
bc0ab741af | ||
|
|
51d9dfeaa3 | ||
|
|
f63cb18155 | ||
|
|
0de603d88e | ||
|
|
240913912a | ||
|
|
91a4ea0de2 | ||
|
|
8608704f49 | ||
|
|
efef68ce99 | ||
|
|
8daefd24da | ||
|
|
46cc8b7982 | ||
|
|
38cd90dd0c | ||
|
|
a51b269f15 | ||
|
|
43bf6d0a0f | ||
|
|
15273a9b66 | ||
|
|
78aca668d0 | ||
|
|
acbf4148ea | ||
|
|
6508540561 | ||
|
|
a41b5244a8 | ||
|
|
2b3189be95 | ||
|
|
248563c595 | ||
|
|
14cd6ca933 | ||
|
|
eb36403e71 | ||
|
|
3c6f779698 | ||
|
|
f67f0c1c11 | ||
|
|
edb02d3299 | ||
|
|
664a69e65b | ||
|
|
478322ebf9 | ||
|
|
802f174072 | ||
|
|
47f9890bae | ||
|
|
262265daad | ||
|
|
300da5b872 | ||
|
|
7b22b5c433 | ||
|
|
ffca97bc1e | ||
|
|
cb356f3259 | ||
|
|
c85374295f | ||
|
|
4992160677 | ||
|
|
bd535b3371 | ||
|
|
d90c5a03af | ||
|
|
2d02cc9079 | ||
|
|
49ad94b99f | ||
|
|
948a217398 | ||
|
|
125381eae7 | ||
|
|
cd01bbc715 | ||
|
|
d8b5e3b88d | ||
|
|
06d25f2186 | ||
|
|
f759b561f3 | ||
|
|
ece0555600 | ||
|
|
73ea0a0b01 | ||
|
|
d8f6d6fd6f | ||
|
|
d24de169a7 | ||
|
|
0816168296 | ||
|
|
277b44d57a | ||
|
|
68c2c3880e | ||
|
|
49da498f65 | ||
|
|
2c76ba3dd7 | ||
|
|
dbe3dc69ad | ||
|
|
8e5bb3ed49 | ||
|
|
ab0be7b8da | ||
|
|
b4c55f5d24 | ||
|
|
ede70d833c | ||
|
|
70c3d18bb0 | ||
|
|
7a491f52c4 | ||
|
|
323c4ecb4f | ||
|
|
3d2466607e | ||
|
|
ed478b39f4 | ||
|
|
91585a558d | ||
|
|
93467eae1f | ||
|
|
f3aac81d19 | ||
|
|
979ad60c19 | ||
|
|
9316cb1b1f | ||
|
|
e7939a527a | ||
|
|
36d26665e1 | ||
|
|
873347f977 | ||
|
|
e814ac16f9 | ||
|
|
ad3055d386 | ||
|
|
94e03eb452 | ||
|
|
380f26ef79 | ||
|
|
3c5b7f59d7 | ||
|
|
fee89f80b5 | ||
|
|
41cce8eaf1 | ||
|
|
f88fe0218d | ||
|
|
cc856eca85 | ||
|
|
cf350c6002 | ||
|
|
0ce6b6a0a3 | ||
|
|
73f247d537 | ||
|
|
960be82183 | ||
|
|
806e5a6c19 | ||
|
|
8d5df07cce | ||
|
|
df7a9d1407 |
@@ -1,13 +1,12 @@
|
||||
/compute_tools/ @neondatabase/control-plane @neondatabase/compute
|
||||
/control_plane/attachment_service @neondatabase/storage
|
||||
/libs/pageserver_api/ @neondatabase/storage
|
||||
/libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers
|
||||
/libs/postgres_ffi/ @neondatabase/compute
|
||||
/libs/remote_storage/ @neondatabase/storage
|
||||
/libs/safekeeper_api/ @neondatabase/safekeepers
|
||||
/libs/vm_monitor/ @neondatabase/autoscaling
|
||||
/pageserver/ @neondatabase/storage
|
||||
/pgxn/ @neondatabase/compute
|
||||
/pgxn/neon/ @neondatabase/compute @neondatabase/safekeepers
|
||||
/proxy/ @neondatabase/proxy
|
||||
/safekeeper/ @neondatabase/safekeepers
|
||||
/vendor/ @neondatabase/compute
|
||||
|
||||
64
Cargo.lock
generated
64
Cargo.lock
generated
@@ -277,7 +277,6 @@ dependencies = [
|
||||
"anyhow",
|
||||
"aws-config",
|
||||
"aws-sdk-secretsmanager",
|
||||
"bytes",
|
||||
"camino",
|
||||
"clap",
|
||||
"control_plane",
|
||||
@@ -289,8 +288,6 @@ dependencies = [
|
||||
"hex",
|
||||
"humantime",
|
||||
"hyper",
|
||||
"lasso",
|
||||
"measured",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"pageserver_api",
|
||||
@@ -298,7 +295,6 @@ dependencies = [
|
||||
"postgres_connection",
|
||||
"r2d2",
|
||||
"reqwest",
|
||||
"routerify",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"thiserror",
|
||||
@@ -2884,35 +2880,6 @@ version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
|
||||
|
||||
[[package]]
|
||||
name = "measured"
|
||||
version = "0.0.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f246648d027839a34b420e27c7de1165ace96e19ef894985d0a6ff89a7840a9f"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"hashbrown 0.14.0",
|
||||
"itoa",
|
||||
"lasso",
|
||||
"measured-derive",
|
||||
"memchr",
|
||||
"parking_lot 0.12.1",
|
||||
"rustc-hash",
|
||||
"ryu",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "measured-derive"
|
||||
version = "0.0.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "edaa5cc22d99d5d6d7d99c3b5b5f7e7f8034c22f1b5d62a1adecd2ed005d9b80"
|
||||
dependencies = [
|
||||
"heck",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.6.4"
|
||||
@@ -3934,7 +3901,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres"
|
||||
version = "0.19.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -3947,7 +3914,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-native-tls"
|
||||
version = "0.5.0"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
|
||||
dependencies = [
|
||||
"native-tls",
|
||||
"tokio",
|
||||
@@ -3958,7 +3925,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-protocol"
|
||||
version = "0.6.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
|
||||
dependencies = [
|
||||
"base64 0.20.0",
|
||||
"byteorder",
|
||||
@@ -3971,13 +3938,12 @@ dependencies = [
|
||||
"rand 0.8.5",
|
||||
"sha2",
|
||||
"stringprep",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "postgres-types"
|
||||
version = "0.2.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -4209,7 +4175,6 @@ dependencies = [
|
||||
"consumption_metrics",
|
||||
"dashmap",
|
||||
"env_logger",
|
||||
"fallible-iterator",
|
||||
"futures",
|
||||
"git-version",
|
||||
"hashbrown 0.13.2",
|
||||
@@ -4260,7 +4225,6 @@ dependencies = [
|
||||
"smallvec",
|
||||
"smol_str",
|
||||
"socket2 0.5.5",
|
||||
"subtle",
|
||||
"sync_wrapper",
|
||||
"task-local-extensions",
|
||||
"thiserror",
|
||||
@@ -5382,23 +5346,13 @@ checksum = "ae1a47186c03a32177042e55dbc5fd5aee900b8e0069a8d70fba96a9375cd012"
|
||||
|
||||
[[package]]
|
||||
name = "sha2"
|
||||
version = "0.10.8"
|
||||
version = "0.10.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8"
|
||||
checksum = "82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"cpufeatures",
|
||||
"digest",
|
||||
"sha2-asm",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sha2-asm"
|
||||
version = "0.6.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f27ba7066011e3fb30d808b51affff34f0a66d3a03a58edd787c6e420e40e44e"
|
||||
dependencies = [
|
||||
"cc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5981,7 +5935,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "tokio-postgres"
|
||||
version = "0.7.7"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
@@ -6514,7 +6468,6 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"arc-swap",
|
||||
"async-compression",
|
||||
"async-trait",
|
||||
"bincode",
|
||||
"byteorder",
|
||||
@@ -6553,14 +6506,12 @@ dependencies = [
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tokio-tar",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
"tracing-error",
|
||||
"tracing-subscriber",
|
||||
"url",
|
||||
"uuid",
|
||||
"walkdir",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
@@ -7078,7 +7029,6 @@ dependencies = [
|
||||
"scopeguard",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"sha2",
|
||||
"smallvec",
|
||||
"subtle",
|
||||
"syn 1.0.109",
|
||||
|
||||
@@ -76,7 +76,6 @@ either = "1.8"
|
||||
enum-map = "2.4.2"
|
||||
enumset = "1.0.12"
|
||||
fail = "0.5.0"
|
||||
fallible-iterator = "0.2"
|
||||
fs2 = "0.4.3"
|
||||
futures = "0.3"
|
||||
futures-core = "0.3"
|
||||
@@ -102,7 +101,6 @@ lasso = "0.7"
|
||||
leaky-bucket = "1.0.1"
|
||||
libc = "0.2"
|
||||
md5 = "0.7.0"
|
||||
measured = { version = "0.0.13", features=["default", "lasso"] }
|
||||
memoffset = "0.8"
|
||||
native-tls = "0.2"
|
||||
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
|
||||
@@ -150,7 +148,6 @@ smol_str = { version = "0.2.0", features = ["serde"] }
|
||||
socket2 = "0.5"
|
||||
strum = "0.24"
|
||||
strum_macros = "0.24"
|
||||
"subtle" = "2.5.0"
|
||||
svg_fmt = "0.4.1"
|
||||
sync_wrapper = "0.1.2"
|
||||
tar = "0.4"
|
||||
|
||||
@@ -135,7 +135,7 @@ WORKDIR /home/nonroot
|
||||
|
||||
# Rust
|
||||
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
|
||||
ENV RUSTC_VERSION=1.77.0
|
||||
ENV RUSTC_VERSION=1.76.0
|
||||
ENV RUSTUP_HOME="/home/nonroot/.rustup"
|
||||
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
|
||||
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
|
||||
@@ -149,7 +149,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
|
||||
cargo install --git https://github.com/paritytech/cachepot && \
|
||||
cargo install rustfilt && \
|
||||
cargo install cargo-hakari && \
|
||||
cargo install cargo-deny --locked && \
|
||||
cargo install cargo-deny && \
|
||||
cargo install cargo-hack && \
|
||||
cargo install cargo-nextest && \
|
||||
rm -rf /home/nonroot/.cargo/registry && \
|
||||
|
||||
@@ -32,29 +32,6 @@ compute_ctl -D /var/db/postgres/compute \
|
||||
-b /usr/local/bin/postgres
|
||||
```
|
||||
|
||||
## State Diagram
|
||||
|
||||
Computes can be in various states. Below is a diagram that details how a
|
||||
compute moves between states.
|
||||
|
||||
```mermaid
|
||||
%% https://mermaid.js.org/syntax/stateDiagram.html
|
||||
stateDiagram-v2
|
||||
[*] --> Empty : Compute spawned
|
||||
Empty --> ConfigurationPending : Waiting for compute spec
|
||||
ConfigurationPending --> Configuration : Received compute spec
|
||||
Configuration --> Failed : Failed to configure the compute
|
||||
Configuration --> Running : Compute has been configured
|
||||
Empty --> Init : Compute spec is immediately available
|
||||
Empty --> TerminationPending : Requested termination
|
||||
Init --> Failed : Failed to start Postgres
|
||||
Init --> Running : Started Postgres
|
||||
Running --> TerminationPending : Requested termination
|
||||
TerminationPending --> Terminated : Terminated compute
|
||||
Failed --> [*] : Compute exited
|
||||
Terminated --> [*] : Compute exited
|
||||
```
|
||||
|
||||
## Tests
|
||||
|
||||
Cargo formatter:
|
||||
|
||||
@@ -17,7 +17,6 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
|
||||
.write(true)
|
||||
.create(true)
|
||||
.append(false)
|
||||
.truncate(false)
|
||||
.open(path)?;
|
||||
let buf = io::BufReader::new(&file);
|
||||
let mut count: usize = 0;
|
||||
|
||||
@@ -17,7 +17,6 @@ testing = []
|
||||
anyhow.workspace = true
|
||||
aws-config.workspace = true
|
||||
aws-sdk-secretsmanager.workspace = true
|
||||
bytes.workspace = true
|
||||
camino.workspace = true
|
||||
clap.workspace = true
|
||||
fail.workspace = true
|
||||
@@ -26,20 +25,17 @@ git-version.workspace = true
|
||||
hex.workspace = true
|
||||
hyper.workspace = true
|
||||
humantime.workspace = true
|
||||
lasso.workspace = true
|
||||
once_cell.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
pageserver_client.workspace = true
|
||||
postgres_connection.workspace = true
|
||||
reqwest.workspace = true
|
||||
routerify.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-util.workspace = true
|
||||
tracing.workspace = true
|
||||
measured.workspace = true
|
||||
|
||||
diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
|
||||
diesel_migrations = { version = "2.1.0" }
|
||||
|
||||
@@ -1,3 +0,0 @@
|
||||
|
||||
UPDATE tenant_shards set placement_policy='{"Double": 1}' where placement_policy='{"Attached": 1}';
|
||||
UPDATE tenant_shards set placement_policy='"Single"' where placement_policy='{"Attached": 0}';
|
||||
@@ -1,3 +0,0 @@
|
||||
|
||||
UPDATE tenant_shards set placement_policy='{"Attached": 1}' where placement_policy='{"Double": 1}';
|
||||
UPDATE tenant_shards set placement_policy='{"Attached": 0}' where placement_policy='"Single"';
|
||||
@@ -1,11 +1,5 @@
|
||||
use crate::metrics::{
|
||||
HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup,
|
||||
METRICS_REGISTRY,
|
||||
};
|
||||
use crate::reconciler::ReconcileError;
|
||||
use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
|
||||
use futures::Future;
|
||||
use hyper::header::CONTENT_TYPE;
|
||||
use hyper::{Body, Request, Response};
|
||||
use hyper::{StatusCode, Uri};
|
||||
use pageserver_api::models::{
|
||||
@@ -40,8 +34,6 @@ use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};
|
||||
|
||||
use control_plane::storage_controller::{AttachHookRequest, InspectRequest};
|
||||
|
||||
use routerify::Middleware;
|
||||
|
||||
/// State available to HTTP request handlers
|
||||
#[derive(Clone)]
|
||||
pub struct HttpState {
|
||||
@@ -321,7 +313,7 @@ async fn handle_tenant_timeline_passthrough(
|
||||
tracing::info!("Proxying request for tenant {} ({})", tenant_id, path);
|
||||
|
||||
// Find the node that holds shard zero
|
||||
let (node, tenant_shard_id) = service.tenant_shard0_node(tenant_id)?;
|
||||
let (base_url, tenant_shard_id) = service.tenant_shard0_baseurl(tenant_id)?;
|
||||
|
||||
// Callers will always pass an unsharded tenant ID. Before proxying, we must
|
||||
// rewrite this to a shard-aware shard zero ID.
|
||||
@@ -330,39 +322,12 @@ async fn handle_tenant_timeline_passthrough(
|
||||
let tenant_shard_str = format!("{}", tenant_shard_id);
|
||||
let path = path.replace(&tenant_str, &tenant_shard_str);
|
||||
|
||||
let latency = &METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_passthrough_request_latency;
|
||||
|
||||
// This is a bit awkward. We remove the param from the request
|
||||
// and join the words by '_' to get a label for the request.
|
||||
let just_path = path.replace(&tenant_shard_str, "");
|
||||
let path_label = just_path
|
||||
.split('/')
|
||||
.filter(|token| !token.is_empty())
|
||||
.collect::<Vec<_>>()
|
||||
.join("_");
|
||||
let labels = PageserverRequestLabelGroup {
|
||||
pageserver_id: &node.get_id().to_string(),
|
||||
path: &path_label,
|
||||
method: crate::metrics::Method::Get,
|
||||
};
|
||||
|
||||
let _timer = latency.start_timer(labels.clone());
|
||||
|
||||
let client = mgmt_api::Client::new(node.base_url(), service.get_config().jwt_token.as_deref());
|
||||
let client = mgmt_api::Client::new(base_url, service.get_config().jwt_token.as_deref());
|
||||
let resp = client.get_raw(path).await.map_err(|_e|
|
||||
// FIXME: give APiError a proper Unavailable variant. We return 503 here because
|
||||
// if we can't successfully send a request to the pageserver, we aren't available.
|
||||
ApiError::ShuttingDown)?;
|
||||
|
||||
if !resp.status().is_success() {
|
||||
let error_counter = &METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_passthrough_request_error;
|
||||
error_counter.inc(labels);
|
||||
}
|
||||
|
||||
// We have a reqest::Response, would like a http::Response
|
||||
let mut builder = hyper::Response::builder()
|
||||
.status(resp.status())
|
||||
@@ -388,16 +353,6 @@ async fn handle_tenant_locate(
|
||||
json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
|
||||
}
|
||||
|
||||
async fn handle_tenant_describe(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
|
||||
}
|
||||
|
||||
async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
@@ -533,11 +488,7 @@ impl From<ReconcileError> for ApiError {
|
||||
|
||||
/// Common wrapper for request handlers that call into Service and will operate on tenants: they must only
|
||||
/// be allowed to run if Service has finished its initial reconciliation.
|
||||
async fn tenant_service_handler<R, H>(
|
||||
request: Request<Body>,
|
||||
handler: H,
|
||||
request_name: RequestName,
|
||||
) -> R::Output
|
||||
async fn tenant_service_handler<R, H>(request: Request<Body>, handler: H) -> R::Output
|
||||
where
|
||||
R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
|
||||
H: FnOnce(Arc<Service>, Request<Body>) -> R + Send + Sync + 'static,
|
||||
@@ -557,10 +508,9 @@ where
|
||||
));
|
||||
}
|
||||
|
||||
named_request_span(
|
||||
request_span(
|
||||
request,
|
||||
|request| async move { handler(service, request).await },
|
||||
request_name,
|
||||
)
|
||||
.await
|
||||
}
|
||||
@@ -571,98 +521,11 @@ fn check_permissions(request: &Request<Body>, required_scope: Scope) -> Result<(
|
||||
})
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
struct RequestMeta {
|
||||
method: hyper::http::Method,
|
||||
at: Instant,
|
||||
}
|
||||
|
||||
fn prologue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
|
||||
) -> Middleware<B, ApiError> {
|
||||
Middleware::pre(move |req| async move {
|
||||
let meta = RequestMeta {
|
||||
method: req.method().clone(),
|
||||
at: Instant::now(),
|
||||
};
|
||||
|
||||
req.set_context(meta);
|
||||
|
||||
Ok(req)
|
||||
})
|
||||
}
|
||||
|
||||
fn epilogue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
|
||||
) -> Middleware<B, ApiError> {
|
||||
Middleware::post_with_info(move |resp, req_info| async move {
|
||||
let request_name = match req_info.context::<RequestName>() {
|
||||
Some(name) => name,
|
||||
None => {
|
||||
return Ok(resp);
|
||||
}
|
||||
};
|
||||
|
||||
if let Some(meta) = req_info.context::<RequestMeta>() {
|
||||
let status = &crate::metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_http_request_status;
|
||||
let latency = &crate::metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_http_request_latency;
|
||||
|
||||
status.inc(HttpRequestStatusLabelGroup {
|
||||
path: request_name.0,
|
||||
method: meta.method.clone().into(),
|
||||
status: crate::metrics::StatusCode(resp.status()),
|
||||
});
|
||||
|
||||
latency.observe(
|
||||
HttpRequestLatencyLabelGroup {
|
||||
path: request_name.0,
|
||||
method: meta.method.into(),
|
||||
},
|
||||
meta.at.elapsed().as_secs_f64(),
|
||||
);
|
||||
}
|
||||
Ok(resp)
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn measured_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4";
|
||||
|
||||
let payload = crate::metrics::METRICS_REGISTRY.encode();
|
||||
let response = Response::builder()
|
||||
.status(200)
|
||||
.header(CONTENT_TYPE, TEXT_FORMAT)
|
||||
.body(payload.into())
|
||||
.unwrap();
|
||||
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct RequestName(&'static str);
|
||||
|
||||
async fn named_request_span<R, H>(
|
||||
request: Request<Body>,
|
||||
handler: H,
|
||||
name: RequestName,
|
||||
) -> R::Output
|
||||
where
|
||||
R: Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
|
||||
H: FnOnce(Request<Body>) -> R + Send + Sync + 'static,
|
||||
{
|
||||
request.set_context(name);
|
||||
request_span(request, handler).await
|
||||
}
|
||||
|
||||
pub fn make_router(
|
||||
service: Arc<Service>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
) -> RouterBuilder<hyper::Body, ApiError> {
|
||||
let mut router = endpoint::make_router()
|
||||
.middleware(prologue_metrics_middleware())
|
||||
.middleware(epilogue_metrics_middleware());
|
||||
let mut router = endpoint::make_router();
|
||||
if auth.is_some() {
|
||||
router = router.middleware(auth_middleware(|request| {
|
||||
let state = get_state(request);
|
||||
@@ -671,166 +534,96 @@ pub fn make_router(
|
||||
} else {
|
||||
state.auth.as_deref()
|
||||
}
|
||||
}));
|
||||
}))
|
||||
}
|
||||
|
||||
router
|
||||
.data(Arc::new(HttpState::new(service, auth)))
|
||||
.get("/metrics", |r| {
|
||||
named_request_span(r, measured_metrics_handler, RequestName("metrics"))
|
||||
})
|
||||
// Non-prefixed generic endpoints (status, metrics)
|
||||
.get("/status", |r| {
|
||||
named_request_span(r, handle_status, RequestName("status"))
|
||||
})
|
||||
.get("/ready", |r| {
|
||||
named_request_span(r, handle_ready, RequestName("ready"))
|
||||
})
|
||||
.get("/status", |r| request_span(r, handle_status))
|
||||
.get("/ready", |r| request_span(r, handle_ready))
|
||||
// Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix
|
||||
.post("/upcall/v1/re-attach", |r| {
|
||||
named_request_span(r, handle_re_attach, RequestName("upcall_v1_reattach"))
|
||||
})
|
||||
.post("/upcall/v1/validate", |r| {
|
||||
named_request_span(r, handle_validate, RequestName("upcall_v1_validate"))
|
||||
request_span(r, handle_re_attach)
|
||||
})
|
||||
.post("/upcall/v1/validate", |r| request_span(r, handle_validate))
|
||||
// Test/dev/debug endpoints
|
||||
.post("/debug/v1/attach-hook", |r| {
|
||||
named_request_span(r, handle_attach_hook, RequestName("debug_v1_attach_hook"))
|
||||
})
|
||||
.post("/debug/v1/inspect", |r| {
|
||||
named_request_span(r, handle_inspect, RequestName("debug_v1_inspect"))
|
||||
request_span(r, handle_attach_hook)
|
||||
})
|
||||
.post("/debug/v1/inspect", |r| request_span(r, handle_inspect))
|
||||
.post("/debug/v1/tenant/:tenant_id/drop", |r| {
|
||||
named_request_span(r, handle_tenant_drop, RequestName("debug_v1_tenant_drop"))
|
||||
request_span(r, handle_tenant_drop)
|
||||
})
|
||||
.post("/debug/v1/node/:node_id/drop", |r| {
|
||||
named_request_span(r, handle_node_drop, RequestName("debug_v1_node_drop"))
|
||||
})
|
||||
.get("/debug/v1/tenant", |r| {
|
||||
named_request_span(r, handle_tenants_dump, RequestName("debug_v1_tenant"))
|
||||
})
|
||||
.get("/debug/v1/tenant/:tenant_id/locate", |r| {
|
||||
tenant_service_handler(
|
||||
r,
|
||||
handle_tenant_locate,
|
||||
RequestName("debug_v1_tenant_locate"),
|
||||
)
|
||||
request_span(r, handle_node_drop)
|
||||
})
|
||||
.get("/debug/v1/tenant", |r| request_span(r, handle_tenants_dump))
|
||||
.get("/debug/v1/scheduler", |r| {
|
||||
named_request_span(r, handle_scheduler_dump, RequestName("debug_v1_scheduler"))
|
||||
request_span(r, handle_scheduler_dump)
|
||||
})
|
||||
.post("/debug/v1/consistency_check", |r| {
|
||||
named_request_span(
|
||||
r,
|
||||
handle_consistency_check,
|
||||
RequestName("debug_v1_consistency_check"),
|
||||
)
|
||||
request_span(r, handle_consistency_check)
|
||||
})
|
||||
.put("/debug/v1/failpoints", |r| {
|
||||
request_span(r, |r| failpoints_handler(r, CancellationToken::new()))
|
||||
})
|
||||
.get("/control/v1/tenant/:tenant_id/locate", |r| {
|
||||
tenant_service_handler(r, handle_tenant_locate)
|
||||
})
|
||||
// Node operations
|
||||
.post("/control/v1/node", |r| {
|
||||
named_request_span(r, handle_node_register, RequestName("control_v1_node"))
|
||||
})
|
||||
.get("/control/v1/node", |r| {
|
||||
named_request_span(r, handle_node_list, RequestName("control_v1_node"))
|
||||
request_span(r, handle_node_register)
|
||||
})
|
||||
.get("/control/v1/node", |r| request_span(r, handle_node_list))
|
||||
.put("/control/v1/node/:node_id/config", |r| {
|
||||
named_request_span(
|
||||
r,
|
||||
handle_node_configure,
|
||||
RequestName("control_v1_node_config"),
|
||||
)
|
||||
request_span(r, handle_node_configure)
|
||||
})
|
||||
// Tenant Shard operations
|
||||
.put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
|
||||
tenant_service_handler(
|
||||
r,
|
||||
handle_tenant_shard_migrate,
|
||||
RequestName("control_v1_tenant_migrate"),
|
||||
)
|
||||
tenant_service_handler(r, handle_tenant_shard_migrate)
|
||||
})
|
||||
.put("/control/v1/tenant/:tenant_id/shard_split", |r| {
|
||||
tenant_service_handler(
|
||||
r,
|
||||
handle_tenant_shard_split,
|
||||
RequestName("control_v1_tenant_shard_split"),
|
||||
)
|
||||
})
|
||||
.get("/control/v1/tenant/:tenant_id", |r| {
|
||||
tenant_service_handler(
|
||||
r,
|
||||
handle_tenant_describe,
|
||||
RequestName("control_v1_tenant_describe"),
|
||||
)
|
||||
tenant_service_handler(r, handle_tenant_shard_split)
|
||||
})
|
||||
// Tenant operations
|
||||
// The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
|
||||
// this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
|
||||
.post("/v1/tenant", |r| {
|
||||
tenant_service_handler(r, handle_tenant_create, RequestName("v1_tenant"))
|
||||
tenant_service_handler(r, handle_tenant_create)
|
||||
})
|
||||
.delete("/v1/tenant/:tenant_id", |r| {
|
||||
tenant_service_handler(r, handle_tenant_delete, RequestName("v1_tenant"))
|
||||
tenant_service_handler(r, handle_tenant_delete)
|
||||
})
|
||||
.put("/v1/tenant/config", |r| {
|
||||
tenant_service_handler(r, handle_tenant_config_set, RequestName("v1_tenant_config"))
|
||||
tenant_service_handler(r, handle_tenant_config_set)
|
||||
})
|
||||
.get("/v1/tenant/:tenant_id/config", |r| {
|
||||
tenant_service_handler(r, handle_tenant_config_get, RequestName("v1_tenant_config"))
|
||||
tenant_service_handler(r, handle_tenant_config_get)
|
||||
})
|
||||
.put("/v1/tenant/:tenant_shard_id/location_config", |r| {
|
||||
tenant_service_handler(
|
||||
r,
|
||||
handle_tenant_location_config,
|
||||
RequestName("v1_tenant_location_config"),
|
||||
)
|
||||
tenant_service_handler(r, handle_tenant_location_config)
|
||||
})
|
||||
.put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| {
|
||||
tenant_service_handler(
|
||||
r,
|
||||
handle_tenant_time_travel_remote_storage,
|
||||
RequestName("v1_tenant_time_travel_remote_storage"),
|
||||
)
|
||||
tenant_service_handler(r, handle_tenant_time_travel_remote_storage)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_id/secondary/download", |r| {
|
||||
tenant_service_handler(
|
||||
r,
|
||||
handle_tenant_secondary_download,
|
||||
RequestName("v1_tenant_secondary_download"),
|
||||
)
|
||||
tenant_service_handler(r, handle_tenant_secondary_download)
|
||||
})
|
||||
// Timeline operations
|
||||
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
|
||||
tenant_service_handler(
|
||||
r,
|
||||
handle_tenant_timeline_delete,
|
||||
RequestName("v1_tenant_timeline"),
|
||||
)
|
||||
tenant_service_handler(r, handle_tenant_timeline_delete)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_id/timeline", |r| {
|
||||
tenant_service_handler(
|
||||
r,
|
||||
handle_tenant_timeline_create,
|
||||
RequestName("v1_tenant_timeline"),
|
||||
)
|
||||
tenant_service_handler(r, handle_tenant_timeline_create)
|
||||
})
|
||||
// Tenant detail GET passthrough to shard zero
|
||||
.get("/v1/tenant/:tenant_id", |r| {
|
||||
tenant_service_handler(
|
||||
r,
|
||||
handle_tenant_timeline_passthrough,
|
||||
RequestName("v1_tenant_passthrough"),
|
||||
)
|
||||
tenant_service_handler(r, handle_tenant_timeline_passthrough)
|
||||
})
|
||||
// Timeline GET passthrough to shard zero. Note that the `*` in the URL is a wildcard: any future
|
||||
// timeline GET APIs will be implicitly included.
|
||||
.get("/v1/tenant/:tenant_id/timeline*", |r| {
|
||||
tenant_service_handler(
|
||||
r,
|
||||
handle_tenant_timeline_passthrough,
|
||||
RequestName("v1_tenant_timeline_passthrough"),
|
||||
)
|
||||
tenant_service_handler(r, handle_tenant_timeline_passthrough)
|
||||
})
|
||||
}
|
||||
|
||||
@@ -8,7 +8,6 @@ pub mod http;
|
||||
mod id_lock_map;
|
||||
pub mod metrics;
|
||||
mod node;
|
||||
mod pageserver_client;
|
||||
pub mod persistence;
|
||||
mod reconciler;
|
||||
mod scheduler;
|
||||
|
||||
@@ -1,284 +1,32 @@
|
||||
//!
|
||||
//! This module provides metric definitions for the storage controller.
|
||||
//!
|
||||
//! All metrics are grouped in [`StorageControllerMetricGroup`]. [`StorageControllerMetrics`] holds
|
||||
//! the mentioned metrics and their encoder. It's globally available via the [`METRICS_REGISTRY`]
|
||||
//! constant.
|
||||
//!
|
||||
//! The rest of the code defines label group types and deals with converting outer types to labels.
|
||||
//!
|
||||
use bytes::Bytes;
|
||||
use measured::{
|
||||
label::{LabelValue, StaticLabelSet},
|
||||
FixedCardinalityLabel, MetricGroup,
|
||||
};
|
||||
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
|
||||
use once_cell::sync::Lazy;
|
||||
use std::sync::Mutex;
|
||||
|
||||
use crate::persistence::{DatabaseError, DatabaseOperation};
|
||||
pub(crate) struct ReconcilerMetrics {
|
||||
pub(crate) spawned: IntCounter,
|
||||
pub(crate) complete: IntCounterVec,
|
||||
}
|
||||
|
||||
pub(crate) static METRICS_REGISTRY: Lazy<StorageControllerMetrics> =
|
||||
Lazy::new(StorageControllerMetrics::default);
|
||||
impl ReconcilerMetrics {
|
||||
// Labels used on [`Self::complete`]
|
||||
pub(crate) const SUCCESS: &'static str = "ok";
|
||||
pub(crate) const ERROR: &'static str = "success";
|
||||
pub(crate) const CANCEL: &'static str = "cancel";
|
||||
}
|
||||
|
||||
pub(crate) static RECONCILER: Lazy<ReconcilerMetrics> = Lazy::new(|| ReconcilerMetrics {
|
||||
spawned: register_int_counter!(
|
||||
"storage_controller_reconcile_spawn",
|
||||
"Count of how many times we spawn a reconcile task",
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
complete: register_int_counter_vec!(
|
||||
"storage_controller_reconcile_complete",
|
||||
"Reconciler tasks completed, broken down by success/failure/cancelled",
|
||||
&["status"],
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
});
|
||||
|
||||
pub fn preinitialize_metrics() {
|
||||
Lazy::force(&METRICS_REGISTRY);
|
||||
}
|
||||
|
||||
pub(crate) struct StorageControllerMetrics {
|
||||
pub(crate) metrics_group: StorageControllerMetricGroup,
|
||||
encoder: Mutex<measured::text::TextEncoder>,
|
||||
}
|
||||
|
||||
#[derive(measured::MetricGroup)]
|
||||
pub(crate) struct StorageControllerMetricGroup {
|
||||
/// Count of how many times we spawn a reconcile task
|
||||
pub(crate) storage_controller_reconcile_spawn: measured::Counter,
|
||||
/// Reconciler tasks completed, broken down by success/failure/cancelled
|
||||
pub(crate) storage_controller_reconcile_complete:
|
||||
measured::CounterVec<ReconcileCompleteLabelGroupSet>,
|
||||
|
||||
/// HTTP request status counters for handled requests
|
||||
pub(crate) storage_controller_http_request_status:
|
||||
measured::CounterVec<HttpRequestStatusLabelGroupSet>,
|
||||
/// HTTP request handler latency across all status codes
|
||||
pub(crate) storage_controller_http_request_latency:
|
||||
measured::HistogramVec<HttpRequestLatencyLabelGroupSet, 5>,
|
||||
|
||||
/// Count of HTTP requests to the pageserver that resulted in an error,
|
||||
/// broken down by the pageserver node id, request name and method
|
||||
pub(crate) storage_controller_pageserver_request_error:
|
||||
measured::CounterVec<PageserverRequestLabelGroupSet>,
|
||||
|
||||
/// Latency of HTTP requests to the pageserver, broken down by pageserver
|
||||
/// node id, request name and method. This include both successful and unsuccessful
|
||||
/// requests.
|
||||
pub(crate) storage_controller_pageserver_request_latency:
|
||||
measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
|
||||
|
||||
/// Count of pass-through HTTP requests to the pageserver that resulted in an error,
|
||||
/// broken down by the pageserver node id, request name and method
|
||||
pub(crate) storage_controller_passthrough_request_error:
|
||||
measured::CounterVec<PageserverRequestLabelGroupSet>,
|
||||
|
||||
/// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver
|
||||
/// node id, request name and method. This include both successful and unsuccessful
|
||||
/// requests.
|
||||
pub(crate) storage_controller_passthrough_request_latency:
|
||||
measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
|
||||
|
||||
/// Count of errors in database queries, broken down by error type and operation.
|
||||
pub(crate) storage_controller_database_query_error:
|
||||
measured::CounterVec<DatabaseQueryErrorLabelGroupSet>,
|
||||
|
||||
/// Latency of database queries, broken down by operation.
|
||||
pub(crate) storage_controller_database_query_latency:
|
||||
measured::HistogramVec<DatabaseQueryLatencyLabelGroupSet, 5>,
|
||||
}
|
||||
|
||||
impl StorageControllerMetrics {
|
||||
pub(crate) fn encode(&self) -> Bytes {
|
||||
let mut encoder = self.encoder.lock().unwrap();
|
||||
self.metrics_group.collect_into(&mut *encoder);
|
||||
encoder.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for StorageControllerMetrics {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
metrics_group: StorageControllerMetricGroup::new(),
|
||||
encoder: Mutex::new(measured::text::TextEncoder::new()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl StorageControllerMetricGroup {
|
||||
pub(crate) fn new() -> Self {
|
||||
Self {
|
||||
storage_controller_reconcile_spawn: measured::Counter::new(),
|
||||
storage_controller_reconcile_complete: measured::CounterVec::new(
|
||||
ReconcileCompleteLabelGroupSet {
|
||||
status: StaticLabelSet::new(),
|
||||
},
|
||||
),
|
||||
storage_controller_http_request_status: measured::CounterVec::new(
|
||||
HttpRequestStatusLabelGroupSet {
|
||||
path: lasso::ThreadedRodeo::new(),
|
||||
method: StaticLabelSet::new(),
|
||||
status: StaticLabelSet::new(),
|
||||
},
|
||||
),
|
||||
storage_controller_http_request_latency: measured::HistogramVec::new(
|
||||
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
|
||||
),
|
||||
storage_controller_pageserver_request_error: measured::CounterVec::new(
|
||||
PageserverRequestLabelGroupSet {
|
||||
pageserver_id: lasso::ThreadedRodeo::new(),
|
||||
path: lasso::ThreadedRodeo::new(),
|
||||
method: StaticLabelSet::new(),
|
||||
},
|
||||
),
|
||||
storage_controller_pageserver_request_latency: measured::HistogramVec::new(
|
||||
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
|
||||
),
|
||||
storage_controller_passthrough_request_error: measured::CounterVec::new(
|
||||
PageserverRequestLabelGroupSet {
|
||||
pageserver_id: lasso::ThreadedRodeo::new(),
|
||||
path: lasso::ThreadedRodeo::new(),
|
||||
method: StaticLabelSet::new(),
|
||||
},
|
||||
),
|
||||
storage_controller_passthrough_request_latency: measured::HistogramVec::new(
|
||||
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
|
||||
),
|
||||
storage_controller_database_query_error: measured::CounterVec::new(
|
||||
DatabaseQueryErrorLabelGroupSet {
|
||||
operation: StaticLabelSet::new(),
|
||||
error_type: StaticLabelSet::new(),
|
||||
},
|
||||
),
|
||||
storage_controller_database_query_latency: measured::HistogramVec::new(
|
||||
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(measured::LabelGroup)]
|
||||
#[label(set = ReconcileCompleteLabelGroupSet)]
|
||||
pub(crate) struct ReconcileCompleteLabelGroup {
|
||||
pub(crate) status: ReconcileOutcome,
|
||||
}
|
||||
|
||||
#[derive(measured::LabelGroup)]
|
||||
#[label(set = HttpRequestStatusLabelGroupSet)]
|
||||
pub(crate) struct HttpRequestStatusLabelGroup<'a> {
|
||||
#[label(dynamic_with = lasso::ThreadedRodeo)]
|
||||
pub(crate) path: &'a str,
|
||||
pub(crate) method: Method,
|
||||
pub(crate) status: StatusCode,
|
||||
}
|
||||
|
||||
#[derive(measured::LabelGroup)]
|
||||
#[label(set = HttpRequestLatencyLabelGroupSet)]
|
||||
pub(crate) struct HttpRequestLatencyLabelGroup<'a> {
|
||||
#[label(dynamic_with = lasso::ThreadedRodeo)]
|
||||
pub(crate) path: &'a str,
|
||||
pub(crate) method: Method,
|
||||
}
|
||||
|
||||
impl Default for HttpRequestLatencyLabelGroupSet {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
path: lasso::ThreadedRodeo::new(),
|
||||
method: StaticLabelSet::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(measured::LabelGroup, Clone)]
|
||||
#[label(set = PageserverRequestLabelGroupSet)]
|
||||
pub(crate) struct PageserverRequestLabelGroup<'a> {
|
||||
#[label(dynamic_with = lasso::ThreadedRodeo)]
|
||||
pub(crate) pageserver_id: &'a str,
|
||||
#[label(dynamic_with = lasso::ThreadedRodeo)]
|
||||
pub(crate) path: &'a str,
|
||||
pub(crate) method: Method,
|
||||
}
|
||||
|
||||
impl Default for PageserverRequestLabelGroupSet {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
pageserver_id: lasso::ThreadedRodeo::new(),
|
||||
path: lasso::ThreadedRodeo::new(),
|
||||
method: StaticLabelSet::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(measured::LabelGroup)]
|
||||
#[label(set = DatabaseQueryErrorLabelGroupSet)]
|
||||
pub(crate) struct DatabaseQueryErrorLabelGroup {
|
||||
pub(crate) error_type: DatabaseErrorLabel,
|
||||
pub(crate) operation: DatabaseOperation,
|
||||
}
|
||||
|
||||
#[derive(measured::LabelGroup)]
|
||||
#[label(set = DatabaseQueryLatencyLabelGroupSet)]
|
||||
pub(crate) struct DatabaseQueryLatencyLabelGroup {
|
||||
pub(crate) operation: DatabaseOperation,
|
||||
}
|
||||
|
||||
#[derive(FixedCardinalityLabel)]
|
||||
pub(crate) enum ReconcileOutcome {
|
||||
#[label(rename = "ok")]
|
||||
Success,
|
||||
Error,
|
||||
Cancel,
|
||||
}
|
||||
|
||||
#[derive(FixedCardinalityLabel, Clone)]
|
||||
pub(crate) enum Method {
|
||||
Get,
|
||||
Put,
|
||||
Post,
|
||||
Delete,
|
||||
Other,
|
||||
}
|
||||
|
||||
impl From<hyper::Method> for Method {
|
||||
fn from(value: hyper::Method) -> Self {
|
||||
if value == hyper::Method::GET {
|
||||
Method::Get
|
||||
} else if value == hyper::Method::PUT {
|
||||
Method::Put
|
||||
} else if value == hyper::Method::POST {
|
||||
Method::Post
|
||||
} else if value == hyper::Method::DELETE {
|
||||
Method::Delete
|
||||
} else {
|
||||
Method::Other
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode);
|
||||
|
||||
impl LabelValue for StatusCode {
|
||||
fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
|
||||
v.write_int(self.0.as_u16() as u64)
|
||||
}
|
||||
}
|
||||
|
||||
impl FixedCardinalityLabel for StatusCode {
|
||||
fn cardinality() -> usize {
|
||||
(100..1000).len()
|
||||
}
|
||||
|
||||
fn encode(&self) -> usize {
|
||||
self.0.as_u16() as usize
|
||||
}
|
||||
|
||||
fn decode(value: usize) -> Self {
|
||||
Self(hyper::http::StatusCode::from_u16(u16::try_from(value).unwrap()).unwrap())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(FixedCardinalityLabel)]
|
||||
pub(crate) enum DatabaseErrorLabel {
|
||||
Query,
|
||||
Connection,
|
||||
ConnectionPool,
|
||||
Logical,
|
||||
}
|
||||
|
||||
impl DatabaseError {
|
||||
pub(crate) fn error_label(&self) -> DatabaseErrorLabel {
|
||||
match self {
|
||||
Self::Query(_) => DatabaseErrorLabel::Query,
|
||||
Self::Connection(_) => DatabaseErrorLabel::Connection,
|
||||
Self::ConnectionPool(_) => DatabaseErrorLabel::ConnectionPool,
|
||||
Self::Logical(_) => DatabaseErrorLabel::Logical,
|
||||
}
|
||||
}
|
||||
Lazy::force(&RECONCILER);
|
||||
}
|
||||
|
||||
@@ -12,9 +12,7 @@ use serde::Serialize;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::{backoff, id::NodeId};
|
||||
|
||||
use crate::{
|
||||
pageserver_client::PageserverClient, persistence::NodePersistence, scheduler::MaySchedule,
|
||||
};
|
||||
use crate::{persistence::NodePersistence, scheduler::MaySchedule};
|
||||
|
||||
/// Represents the in-memory description of a Node.
|
||||
///
|
||||
@@ -204,7 +202,7 @@ impl Node {
|
||||
cancel: &CancellationToken,
|
||||
) -> Option<mgmt_api::Result<T>>
|
||||
where
|
||||
O: FnMut(PageserverClient) -> F,
|
||||
O: FnMut(mgmt_api::Client) -> F,
|
||||
F: std::future::Future<Output = mgmt_api::Result<T>>,
|
||||
{
|
||||
fn is_fatal(e: &mgmt_api::Error) -> bool {
|
||||
@@ -226,12 +224,8 @@ impl Node {
|
||||
.build()
|
||||
.expect("Failed to construct HTTP client");
|
||||
|
||||
let client = PageserverClient::from_client(
|
||||
self.get_id(),
|
||||
http_client,
|
||||
self.base_url(),
|
||||
jwt.as_deref(),
|
||||
);
|
||||
let client =
|
||||
mgmt_api::Client::from_client(http_client, self.base_url(), jwt.as_deref());
|
||||
|
||||
let node_cancel_fut = self.cancel.cancelled();
|
||||
|
||||
|
||||
@@ -1,203 +0,0 @@
|
||||
use pageserver_api::{
|
||||
models::{
|
||||
LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress,
|
||||
TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
|
||||
},
|
||||
shard::TenantShardId,
|
||||
};
|
||||
use pageserver_client::mgmt_api::{Client, Result};
|
||||
use reqwest::StatusCode;
|
||||
use utils::id::{NodeId, TimelineId};
|
||||
|
||||
/// Thin wrapper around [`pageserver_client::mgmt_api::Client`]. It allows the storage
|
||||
/// controller to collect metrics in a non-intrusive manner.
|
||||
#[derive(Debug, Clone)]
|
||||
pub(crate) struct PageserverClient {
|
||||
inner: Client,
|
||||
node_id_label: String,
|
||||
}
|
||||
|
||||
macro_rules! measured_request {
|
||||
($name:literal, $method:expr, $node_id: expr, $invoke:expr) => {{
|
||||
let labels = crate::metrics::PageserverRequestLabelGroup {
|
||||
pageserver_id: $node_id,
|
||||
path: $name,
|
||||
method: $method,
|
||||
};
|
||||
|
||||
let latency = &crate::metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_pageserver_request_latency;
|
||||
let _timer_guard = latency.start_timer(labels.clone());
|
||||
|
||||
let res = $invoke;
|
||||
|
||||
if res.is_err() {
|
||||
let error_counters = &crate::metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_pageserver_request_error;
|
||||
error_counters.inc(labels)
|
||||
}
|
||||
|
||||
res
|
||||
}};
|
||||
}
|
||||
|
||||
impl PageserverClient {
|
||||
pub(crate) fn new(node_id: NodeId, mgmt_api_endpoint: String, jwt: Option<&str>) -> Self {
|
||||
Self {
|
||||
inner: Client::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt),
|
||||
node_id_label: node_id.0.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn from_client(
|
||||
node_id: NodeId,
|
||||
raw_client: reqwest::Client,
|
||||
mgmt_api_endpoint: String,
|
||||
jwt: Option<&str>,
|
||||
) -> Self {
|
||||
Self {
|
||||
inner: Client::from_client(raw_client, mgmt_api_endpoint, jwt),
|
||||
node_id_label: node_id.0.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn tenant_delete(&self, tenant_shard_id: TenantShardId) -> Result<StatusCode> {
|
||||
measured_request!(
|
||||
"tenant",
|
||||
crate::metrics::Method::Delete,
|
||||
&self.node_id_label,
|
||||
self.inner.tenant_delete(tenant_shard_id).await
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) async fn tenant_time_travel_remote_storage(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timestamp: &str,
|
||||
done_if_after: &str,
|
||||
) -> Result<()> {
|
||||
measured_request!(
|
||||
"tenant_time_travel_remote_storage",
|
||||
crate::metrics::Method::Put,
|
||||
&self.node_id_label,
|
||||
self.inner
|
||||
.tenant_time_travel_remote_storage(tenant_shard_id, timestamp, done_if_after)
|
||||
.await
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) async fn tenant_secondary_download(
|
||||
&self,
|
||||
tenant_id: TenantShardId,
|
||||
wait: Option<std::time::Duration>,
|
||||
) -> Result<(StatusCode, SecondaryProgress)> {
|
||||
measured_request!(
|
||||
"tenant_secondary_download",
|
||||
crate::metrics::Method::Post,
|
||||
&self.node_id_label,
|
||||
self.inner.tenant_secondary_download(tenant_id, wait).await
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) async fn location_config(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
config: LocationConfig,
|
||||
flush_ms: Option<std::time::Duration>,
|
||||
lazy: bool,
|
||||
) -> Result<()> {
|
||||
measured_request!(
|
||||
"location_config",
|
||||
crate::metrics::Method::Put,
|
||||
&self.node_id_label,
|
||||
self.inner
|
||||
.location_config(tenant_shard_id, config, flush_ms, lazy)
|
||||
.await
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) async fn list_location_config(&self) -> Result<LocationConfigListResponse> {
|
||||
measured_request!(
|
||||
"location_configs",
|
||||
crate::metrics::Method::Get,
|
||||
&self.node_id_label,
|
||||
self.inner.list_location_config().await
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) async fn get_location_config(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
) -> Result<Option<LocationConfig>> {
|
||||
measured_request!(
|
||||
"location_config",
|
||||
crate::metrics::Method::Get,
|
||||
&self.node_id_label,
|
||||
self.inner.get_location_config(tenant_shard_id).await
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) async fn timeline_create(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
req: &TimelineCreateRequest,
|
||||
) -> Result<TimelineInfo> {
|
||||
measured_request!(
|
||||
"timeline",
|
||||
crate::metrics::Method::Post,
|
||||
&self.node_id_label,
|
||||
self.inner.timeline_create(tenant_shard_id, req).await
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) async fn timeline_delete(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<StatusCode> {
|
||||
measured_request!(
|
||||
"timeline",
|
||||
crate::metrics::Method::Delete,
|
||||
&self.node_id_label,
|
||||
self.inner
|
||||
.timeline_delete(tenant_shard_id, timeline_id)
|
||||
.await
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) async fn tenant_shard_split(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
req: TenantShardSplitRequest,
|
||||
) -> Result<TenantShardSplitResponse> {
|
||||
measured_request!(
|
||||
"tenant_shard_split",
|
||||
crate::metrics::Method::Put,
|
||||
&self.node_id_label,
|
||||
self.inner.tenant_shard_split(tenant_shard_id, req).await
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) async fn timeline_list(
|
||||
&self,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
) -> Result<Vec<TimelineInfo>> {
|
||||
measured_request!(
|
||||
"timelines",
|
||||
crate::metrics::Method::Get,
|
||||
&self.node_id_label,
|
||||
self.inner.timeline_list(tenant_shard_id).await
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) async fn get_utilization(&self) -> Result<PageserverUtilization> {
|
||||
measured_request!(
|
||||
"utilization",
|
||||
crate::metrics::Method::Get,
|
||||
&self.node_id_label,
|
||||
self.inner.get_utilization().await
|
||||
)
|
||||
}
|
||||
}
|
||||
@@ -19,9 +19,6 @@ use serde::{Deserialize, Serialize};
|
||||
use utils::generation::Generation;
|
||||
use utils::id::{NodeId, TenantId};
|
||||
|
||||
use crate::metrics::{
|
||||
DatabaseQueryErrorLabelGroup, DatabaseQueryLatencyLabelGroup, METRICS_REGISTRY,
|
||||
};
|
||||
use crate::node::Node;
|
||||
|
||||
/// ## What do we store?
|
||||
@@ -78,25 +75,6 @@ pub(crate) enum DatabaseError {
|
||||
Logical(String),
|
||||
}
|
||||
|
||||
#[derive(measured::FixedCardinalityLabel, Clone)]
|
||||
pub(crate) enum DatabaseOperation {
|
||||
InsertNode,
|
||||
UpdateNode,
|
||||
DeleteNode,
|
||||
ListNodes,
|
||||
BeginShardSplit,
|
||||
CompleteShardSplit,
|
||||
AbortShardSplit,
|
||||
Detach,
|
||||
ReAttach,
|
||||
IncrementGeneration,
|
||||
ListTenantShards,
|
||||
InsertTenantShards,
|
||||
UpdateTenantShard,
|
||||
DeleteTenant,
|
||||
UpdateTenantConfig,
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub(crate) enum AbortShardSplitStatus {
|
||||
/// We aborted the split in the database by reverting to the parent shards
|
||||
@@ -137,34 +115,6 @@ impl Persistence {
|
||||
}
|
||||
}
|
||||
|
||||
/// Wraps `with_conn` in order to collect latency and error metrics
|
||||
async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R>
|
||||
where
|
||||
F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
|
||||
R: Send + 'static,
|
||||
{
|
||||
let latency = &METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_database_query_latency;
|
||||
let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup {
|
||||
operation: op.clone(),
|
||||
});
|
||||
|
||||
let res = self.with_conn(func).await;
|
||||
|
||||
if let Err(err) = &res {
|
||||
let error_counter = &METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_database_query_error;
|
||||
error_counter.inc(DatabaseQueryErrorLabelGroup {
|
||||
error_type: err.error_label(),
|
||||
operation: op,
|
||||
})
|
||||
}
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
/// Call the provided function in a tokio blocking thread, with a Diesel database connection.
|
||||
async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R>
|
||||
where
|
||||
@@ -180,27 +130,21 @@ impl Persistence {
|
||||
/// When a node is first registered, persist it before using it for anything
|
||||
pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> {
|
||||
let np = node.to_persistent();
|
||||
self.with_measured_conn(
|
||||
DatabaseOperation::InsertNode,
|
||||
move |conn| -> DatabaseResult<()> {
|
||||
diesel::insert_into(crate::schema::nodes::table)
|
||||
.values(&np)
|
||||
.execute(conn)?;
|
||||
Ok(())
|
||||
},
|
||||
)
|
||||
self.with_conn(move |conn| -> DatabaseResult<()> {
|
||||
diesel::insert_into(crate::schema::nodes::table)
|
||||
.values(&np)
|
||||
.execute(conn)?;
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
/// At startup, populate the list of nodes which our shards may be placed on
|
||||
pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<NodePersistence>> {
|
||||
let nodes: Vec<NodePersistence> = self
|
||||
.with_measured_conn(
|
||||
DatabaseOperation::ListNodes,
|
||||
move |conn| -> DatabaseResult<_> {
|
||||
Ok(crate::schema::nodes::table.load::<NodePersistence>(conn)?)
|
||||
},
|
||||
)
|
||||
.with_conn(move |conn| -> DatabaseResult<_> {
|
||||
Ok(crate::schema::nodes::table.load::<NodePersistence>(conn)?)
|
||||
})
|
||||
.await?;
|
||||
|
||||
tracing::info!("list_nodes: loaded {} nodes", nodes.len());
|
||||
@@ -215,7 +159,7 @@ impl Persistence {
|
||||
) -> DatabaseResult<()> {
|
||||
use crate::schema::nodes::dsl::*;
|
||||
let updated = self
|
||||
.with_measured_conn(DatabaseOperation::UpdateNode, move |conn| {
|
||||
.with_conn(move |conn| {
|
||||
let updated = diesel::update(nodes)
|
||||
.filter(node_id.eq(input_node_id.0 as i64))
|
||||
.set((scheduling_policy.eq(String::from(input_scheduling)),))
|
||||
@@ -237,12 +181,9 @@ impl Persistence {
|
||||
/// be enriched at runtime with state discovered on pageservers.
|
||||
pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
|
||||
let loaded = self
|
||||
.with_measured_conn(
|
||||
DatabaseOperation::ListTenantShards,
|
||||
move |conn| -> DatabaseResult<_> {
|
||||
Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
|
||||
},
|
||||
)
|
||||
.with_conn(move |conn| -> DatabaseResult<_> {
|
||||
Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
|
||||
})
|
||||
.await?;
|
||||
|
||||
if loaded.is_empty() {
|
||||
@@ -270,10 +211,15 @@ impl Persistence {
|
||||
|
||||
let mut decoded = serde_json::from_slice::<JsonPersistence>(&bytes)
|
||||
.map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?;
|
||||
for shard in decoded.tenants.values_mut() {
|
||||
if shard.placement_policy == "\"Single\"" {
|
||||
// Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165
|
||||
shard.placement_policy = "{\"Attached\":0}".to_string();
|
||||
for (tenant_id, tenant) in &mut decoded.tenants {
|
||||
// Backward compat: an old attachments.json from before PR #6251, replace
|
||||
// empty strings with proper defaults.
|
||||
if tenant.tenant_id.is_empty() {
|
||||
tenant.tenant_id = tenant_id.to_string();
|
||||
tenant.config = serde_json::to_string(&TenantConfig::default())
|
||||
.map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
|
||||
tenant.placement_policy = serde_json::to_string(&PlacementPolicy::Single)
|
||||
.map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -319,20 +265,17 @@ impl Persistence {
|
||||
shards: Vec<TenantShardPersistence>,
|
||||
) -> DatabaseResult<()> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
self.with_measured_conn(
|
||||
DatabaseOperation::InsertTenantShards,
|
||||
move |conn| -> DatabaseResult<()> {
|
||||
conn.transaction(|conn| -> QueryResult<()> {
|
||||
for tenant in &shards {
|
||||
diesel::insert_into(tenant_shards)
|
||||
.values(tenant)
|
||||
.execute(conn)?;
|
||||
}
|
||||
Ok(())
|
||||
})?;
|
||||
self.with_conn(move |conn| -> DatabaseResult<()> {
|
||||
conn.transaction(|conn| -> QueryResult<()> {
|
||||
for tenant in &shards {
|
||||
diesel::insert_into(tenant_shards)
|
||||
.values(tenant)
|
||||
.execute(conn)?;
|
||||
}
|
||||
Ok(())
|
||||
},
|
||||
)
|
||||
})?;
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -340,31 +283,25 @@ impl Persistence {
|
||||
/// the tenant from memory on this server.
|
||||
pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
self.with_measured_conn(
|
||||
DatabaseOperation::DeleteTenant,
|
||||
move |conn| -> DatabaseResult<()> {
|
||||
diesel::delete(tenant_shards)
|
||||
.filter(tenant_id.eq(del_tenant_id.to_string()))
|
||||
.execute(conn)?;
|
||||
self.with_conn(move |conn| -> DatabaseResult<()> {
|
||||
diesel::delete(tenant_shards)
|
||||
.filter(tenant_id.eq(del_tenant_id.to_string()))
|
||||
.execute(conn)?;
|
||||
|
||||
Ok(())
|
||||
},
|
||||
)
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) async fn delete_node(&self, del_node_id: NodeId) -> DatabaseResult<()> {
|
||||
use crate::schema::nodes::dsl::*;
|
||||
self.with_measured_conn(
|
||||
DatabaseOperation::DeleteNode,
|
||||
move |conn| -> DatabaseResult<()> {
|
||||
diesel::delete(nodes)
|
||||
.filter(node_id.eq(del_node_id.0 as i64))
|
||||
.execute(conn)?;
|
||||
self.with_conn(move |conn| -> DatabaseResult<()> {
|
||||
diesel::delete(nodes)
|
||||
.filter(node_id.eq(del_node_id.0 as i64))
|
||||
.execute(conn)?;
|
||||
|
||||
Ok(())
|
||||
},
|
||||
)
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -378,7 +315,7 @@ impl Persistence {
|
||||
) -> DatabaseResult<HashMap<TenantShardId, Generation>> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
let updated = self
|
||||
.with_measured_conn(DatabaseOperation::ReAttach, move |conn| {
|
||||
.with_conn(move |conn| {
|
||||
let rows_updated = diesel::update(tenant_shards)
|
||||
.filter(generation_pageserver.eq(node_id.0 as i64))
|
||||
.set(generation.eq(generation + 1))
|
||||
@@ -428,7 +365,7 @@ impl Persistence {
|
||||
) -> anyhow::Result<Generation> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
let updated = self
|
||||
.with_measured_conn(DatabaseOperation::IncrementGeneration, move |conn| {
|
||||
.with_conn(move |conn| {
|
||||
let updated = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
||||
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
||||
@@ -472,7 +409,7 @@ impl Persistence {
|
||||
) -> DatabaseResult<()> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
|
||||
self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| {
|
||||
self.with_conn(move |conn| {
|
||||
let query = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
||||
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
||||
@@ -513,7 +450,7 @@ impl Persistence {
|
||||
) -> DatabaseResult<()> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
|
||||
self.with_measured_conn(DatabaseOperation::UpdateTenantConfig, move |conn| {
|
||||
self.with_conn(move |conn| {
|
||||
diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(input_tenant_id.to_string()))
|
||||
.set((config.eq(serde_json::to_string(&input_config).unwrap()),))
|
||||
@@ -528,7 +465,7 @@ impl Persistence {
|
||||
|
||||
pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
self.with_measured_conn(DatabaseOperation::Detach, move |conn| {
|
||||
self.with_conn(move |conn| {
|
||||
let updated = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
||||
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
||||
@@ -558,7 +495,7 @@ impl Persistence {
|
||||
parent_to_children: Vec<(TenantShardId, Vec<TenantShardPersistence>)>,
|
||||
) -> DatabaseResult<()> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| -> DatabaseResult<()> {
|
||||
self.with_conn(move |conn| -> DatabaseResult<()> {
|
||||
conn.transaction(|conn| -> DatabaseResult<()> {
|
||||
// Mark parent shards as splitting
|
||||
|
||||
@@ -622,29 +559,26 @@ impl Persistence {
|
||||
old_shard_count: ShardCount,
|
||||
) -> DatabaseResult<()> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
self.with_measured_conn(
|
||||
DatabaseOperation::CompleteShardSplit,
|
||||
move |conn| -> DatabaseResult<()> {
|
||||
conn.transaction(|conn| -> QueryResult<()> {
|
||||
// Drop parent shards
|
||||
diesel::delete(tenant_shards)
|
||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||
.filter(shard_count.eq(old_shard_count.literal() as i32))
|
||||
.execute(conn)?;
|
||||
self.with_conn(move |conn| -> DatabaseResult<()> {
|
||||
conn.transaction(|conn| -> QueryResult<()> {
|
||||
// Drop parent shards
|
||||
diesel::delete(tenant_shards)
|
||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||
.filter(shard_count.eq(old_shard_count.literal() as i32))
|
||||
.execute(conn)?;
|
||||
|
||||
// Clear sharding flag
|
||||
let updated = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||
.set((splitting.eq(0),))
|
||||
.execute(conn)?;
|
||||
debug_assert!(updated > 0);
|
||||
|
||||
Ok(())
|
||||
})?;
|
||||
// Clear sharding flag
|
||||
let updated = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||
.set((splitting.eq(0),))
|
||||
.execute(conn)?;
|
||||
debug_assert!(updated > 0);
|
||||
|
||||
Ok(())
|
||||
},
|
||||
)
|
||||
})?;
|
||||
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -656,44 +590,40 @@ impl Persistence {
|
||||
new_shard_count: ShardCount,
|
||||
) -> DatabaseResult<AbortShardSplitStatus> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
self.with_measured_conn(
|
||||
DatabaseOperation::AbortShardSplit,
|
||||
move |conn| -> DatabaseResult<AbortShardSplitStatus> {
|
||||
let aborted =
|
||||
conn.transaction(|conn| -> DatabaseResult<AbortShardSplitStatus> {
|
||||
// Clear the splitting state on parent shards
|
||||
let updated = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||
.filter(shard_count.ne(new_shard_count.literal() as i32))
|
||||
.set((splitting.eq(0),))
|
||||
.execute(conn)?;
|
||||
self.with_conn(move |conn| -> DatabaseResult<AbortShardSplitStatus> {
|
||||
let aborted = conn.transaction(|conn| -> DatabaseResult<AbortShardSplitStatus> {
|
||||
// Clear the splitting state on parent shards
|
||||
let updated = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||
.filter(shard_count.ne(new_shard_count.literal() as i32))
|
||||
.set((splitting.eq(0),))
|
||||
.execute(conn)?;
|
||||
|
||||
// Parent shards are already gone: we cannot abort.
|
||||
if updated == 0 {
|
||||
return Ok(AbortShardSplitStatus::Complete);
|
||||
}
|
||||
// Parent shards are already gone: we cannot abort.
|
||||
if updated == 0 {
|
||||
return Ok(AbortShardSplitStatus::Complete);
|
||||
}
|
||||
|
||||
// Sanity check: if parent shards were present, their cardinality should
|
||||
// be less than the number of child shards.
|
||||
if updated >= new_shard_count.count() as usize {
|
||||
return Err(DatabaseError::Logical(format!(
|
||||
"Unexpected parent shard count {updated} while aborting split to \
|
||||
// Sanity check: if parent shards were present, their cardinality should
|
||||
// be less than the number of child shards.
|
||||
if updated >= new_shard_count.count() as usize {
|
||||
return Err(DatabaseError::Logical(format!(
|
||||
"Unexpected parent shard count {updated} while aborting split to \
|
||||
count {new_shard_count:?} on tenant {split_tenant_id}"
|
||||
)));
|
||||
}
|
||||
)));
|
||||
}
|
||||
|
||||
// Erase child shards
|
||||
diesel::delete(tenant_shards)
|
||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||
.filter(shard_count.eq(new_shard_count.literal() as i32))
|
||||
.execute(conn)?;
|
||||
// Erase child shards
|
||||
diesel::delete(tenant_shards)
|
||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||
.filter(shard_count.eq(new_shard_count.literal() as i32))
|
||||
.execute(conn)?;
|
||||
|
||||
Ok(AbortShardSplitStatus::Aborted)
|
||||
})?;
|
||||
Ok(AbortShardSplitStatus::Aborted)
|
||||
})?;
|
||||
|
||||
Ok(aborted)
|
||||
},
|
||||
)
|
||||
Ok(aborted)
|
||||
})
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
use crate::pageserver_client::PageserverClient;
|
||||
use crate::persistence::Persistence;
|
||||
use crate::service;
|
||||
use hyper::StatusCode;
|
||||
@@ -118,15 +117,6 @@ impl Reconciler {
|
||||
flush_ms: Option<Duration>,
|
||||
lazy: bool,
|
||||
) -> Result<(), ReconcileError> {
|
||||
if !node.is_available() && config.mode == LocationConfigMode::Detached {
|
||||
// Attempts to detach from offline nodes may be imitated without doing I/O: a node which is offline
|
||||
// will get fully reconciled wrt the shard's intent state when it is reactivated, irrespective of
|
||||
// what we put into `observed`, in [`crate::service::Service::node_activate_reconcile`]
|
||||
tracing::info!("Node {node} is unavailable during detach: proceeding anyway, it will be detached on next activation");
|
||||
self.observed.locations.remove(&node.get_id());
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
self.observed
|
||||
.locations
|
||||
.insert(node.get_id(), ObservedStateLocation { conf: None });
|
||||
@@ -159,16 +149,9 @@ impl Reconciler {
|
||||
};
|
||||
tracing::info!("location_config({node}) complete: {:?}", config);
|
||||
|
||||
match config.mode {
|
||||
LocationConfigMode::Detached => {
|
||||
self.observed.locations.remove(&node.get_id());
|
||||
}
|
||||
_ => {
|
||||
self.observed
|
||||
.locations
|
||||
.insert(node.get_id(), ObservedStateLocation { conf: Some(config) });
|
||||
}
|
||||
}
|
||||
self.observed
|
||||
.locations
|
||||
.insert(node.get_id(), ObservedStateLocation { conf: Some(config) });
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -260,11 +243,8 @@ impl Reconciler {
|
||||
tenant_shard_id: TenantShardId,
|
||||
node: &Node,
|
||||
) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
|
||||
let client = PageserverClient::new(
|
||||
node.get_id(),
|
||||
node.base_url(),
|
||||
self.service_config.jwt_token.as_deref(),
|
||||
);
|
||||
let client =
|
||||
mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());
|
||||
|
||||
let timelines = client.timeline_list(&tenant_shard_id).await?;
|
||||
Ok(timelines
|
||||
@@ -495,7 +475,7 @@ impl Reconciler {
|
||||
}
|
||||
}
|
||||
|
||||
// Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Attached(0), then
|
||||
// Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Single, then
|
||||
// this location will be deleted in the general case reconciliation that runs after this.
|
||||
let origin_secondary_conf = build_location_config(
|
||||
&self.shard,
|
||||
|
||||
@@ -7,9 +7,7 @@ use std::{
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
|
||||
use crate::{
|
||||
id_lock_map::IdLockMap, persistence::AbortShardSplitStatus, reconciler::ReconcileError,
|
||||
};
|
||||
use crate::{id_lock_map::IdLockMap, persistence::AbortShardSplitStatus};
|
||||
use anyhow::Context;
|
||||
use control_plane::storage_controller::{
|
||||
AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse,
|
||||
@@ -20,14 +18,12 @@ use hyper::StatusCode;
|
||||
use pageserver_api::{
|
||||
controller_api::{
|
||||
NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
|
||||
TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse,
|
||||
TenantDescribeResponseShard, TenantLocateResponse, TenantShardMigrateRequest,
|
||||
TenantShardMigrateResponse, UtilizationScore,
|
||||
TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse,
|
||||
TenantShardMigrateRequest, TenantShardMigrateResponse, UtilizationScore,
|
||||
},
|
||||
models::{SecondaryProgress, TenantConfigRequest},
|
||||
};
|
||||
|
||||
use crate::pageserver_client::PageserverClient;
|
||||
use pageserver_api::{
|
||||
models::{
|
||||
self, LocationConfig, LocationConfigListResponse, LocationConfigMode,
|
||||
@@ -204,30 +200,6 @@ enum TenantCreateOrUpdate {
|
||||
Update(Vec<ShardUpdate>),
|
||||
}
|
||||
|
||||
struct ShardSplitParams {
|
||||
old_shard_count: ShardCount,
|
||||
new_shard_count: ShardCount,
|
||||
new_stripe_size: Option<ShardStripeSize>,
|
||||
targets: Vec<ShardSplitTarget>,
|
||||
policy: PlacementPolicy,
|
||||
config: TenantConfig,
|
||||
shard_ident: ShardIdentity,
|
||||
}
|
||||
|
||||
// When preparing for a shard split, we may either choose to proceed with the split,
|
||||
// or find that the work is already done and return NoOp.
|
||||
enum ShardSplitAction {
|
||||
Split(ShardSplitParams),
|
||||
NoOp(TenantShardSplitResponse),
|
||||
}
|
||||
|
||||
// A parent shard which will be split
|
||||
struct ShardSplitTarget {
|
||||
parent_id: TenantShardId,
|
||||
node: Node,
|
||||
child_ids: Vec<TenantShardId>,
|
||||
}
|
||||
|
||||
/// When we tenant shard split operation fails, we may not be able to clean up immediately, because nodes
|
||||
/// might not be available. We therefore use a queue of abort operations processed in the background.
|
||||
struct TenantShardSplitAbort {
|
||||
@@ -553,11 +525,7 @@ impl Service {
|
||||
break;
|
||||
}
|
||||
|
||||
let client = PageserverClient::new(
|
||||
node.get_id(),
|
||||
node.base_url(),
|
||||
self.config.jwt_token.as_deref(),
|
||||
);
|
||||
let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
|
||||
match client
|
||||
.location_config(
|
||||
tenant_shard_id,
|
||||
@@ -765,19 +733,7 @@ impl Service {
|
||||
tenant.waiter.advance(result.sequence);
|
||||
}
|
||||
Err(e) => {
|
||||
match e {
|
||||
ReconcileError::Cancel => {
|
||||
tracing::info!("Reconciler was cancelled");
|
||||
}
|
||||
ReconcileError::Remote(mgmt_api::Error::Cancelled) => {
|
||||
// This might be due to the reconciler getting cancelled, or it might
|
||||
// be due to the `Node` being marked offline.
|
||||
tracing::info!("Reconciler cancelled during pageserver API call");
|
||||
}
|
||||
_ => {
|
||||
tracing::warn!("Reconcile error: {}", e);
|
||||
}
|
||||
}
|
||||
tracing::warn!("Reconcile error: {}", e);
|
||||
|
||||
// Ordering: populate last_error before advancing error_seq,
|
||||
// so that waiters will see the correct error after waiting.
|
||||
@@ -1101,7 +1057,7 @@ impl Service {
|
||||
shard_stripe_size: 0,
|
||||
generation: Some(0),
|
||||
generation_pageserver: None,
|
||||
placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(),
|
||||
placement_policy: serde_json::to_string(&PlacementPolicy::Single).unwrap(),
|
||||
config: serde_json::to_string(&TenantConfig::default()).unwrap(),
|
||||
splitting: SplitState::default(),
|
||||
};
|
||||
@@ -1128,7 +1084,7 @@ impl Service {
|
||||
TenantState::new(
|
||||
attach_req.tenant_shard_id,
|
||||
ShardIdentity::unsharded(),
|
||||
PlacementPolicy::Attached(0),
|
||||
PlacementPolicy::Single,
|
||||
),
|
||||
);
|
||||
tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id);
|
||||
@@ -1157,7 +1113,7 @@ impl Service {
|
||||
self.persistence
|
||||
.update_tenant_shard(
|
||||
attach_req.tenant_shard_id,
|
||||
PlacementPolicy::Attached(0),
|
||||
PlacementPolicy::Single,
|
||||
conf,
|
||||
None,
|
||||
)
|
||||
@@ -1182,7 +1138,7 @@ impl Service {
|
||||
|
||||
if let Some(new_generation) = new_generation {
|
||||
tenant_state.generation = Some(new_generation);
|
||||
tenant_state.policy = PlacementPolicy::Attached(0);
|
||||
tenant_state.policy = PlacementPolicy::Single;
|
||||
} else {
|
||||
// This is a detach notification. We must update placement policy to avoid re-attaching
|
||||
// during background scheduling/reconciliation, or during storage controller restart.
|
||||
@@ -1394,8 +1350,7 @@ impl Service {
|
||||
incremented_generations.len()
|
||||
);
|
||||
|
||||
// Apply the updated generation to our in-memory state, and
|
||||
// gather discover secondary locations.
|
||||
// Apply the updated generation to our in-memory state
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let (nodes, tenants, scheduler) = locked.parts_mut();
|
||||
|
||||
@@ -1403,65 +1358,62 @@ impl Service {
|
||||
tenants: Vec::new(),
|
||||
};
|
||||
|
||||
// TODO: cancel/restart any running reconciliation for this tenant, it might be trying
|
||||
// to call location_conf API with an old generation. Wait for cancellation to complete
|
||||
// before responding to this request. Requires well implemented CancellationToken logic
|
||||
// all the way to where we call location_conf. Even then, there can still be a location_conf
|
||||
// request in flight over the network: TODO handle that by making location_conf API refuse
|
||||
// to go backward in generations.
|
||||
for (tenant_shard_id, new_gen) in incremented_generations {
|
||||
response.tenants.push(ReAttachResponseTenant {
|
||||
id: tenant_shard_id,
|
||||
gen: new_gen.into().unwrap(),
|
||||
});
|
||||
// Apply the new generation number to our in-memory state
|
||||
let shard_state = tenants.get_mut(&tenant_shard_id);
|
||||
let Some(shard_state) = shard_state else {
|
||||
// Not fatal. This edge case requires a re-attach to happen
|
||||
// between inserting a new tenant shard in to the database, and updating our in-memory
|
||||
// state to know about the shard, _and_ that the state inserted to the database referenced
|
||||
// a pageserver. Should never happen, but handle it rather than panicking, since it should
|
||||
// be harmless.
|
||||
tracing::error!(
|
||||
"Shard {} is in database for node {} but not in-memory state",
|
||||
tenant_shard_id,
|
||||
reattach_req.node_id
|
||||
);
|
||||
continue;
|
||||
};
|
||||
|
||||
// Scan through all shards, applying updates for ones where we updated generation
|
||||
// and identifying shards that intend to have a secondary location on this node.
|
||||
for (tenant_shard_id, shard) in tenants {
|
||||
if let Some(new_gen) = incremented_generations.get(tenant_shard_id) {
|
||||
let new_gen = *new_gen;
|
||||
response.tenants.push(ReAttachResponseTenant {
|
||||
id: *tenant_shard_id,
|
||||
gen: Some(new_gen.into().unwrap()),
|
||||
// A tenant is only put into multi or stale modes in the middle of a [`Reconciler::live_migrate`]
|
||||
// execution. If a pageserver is restarted during that process, then the reconcile pass will
|
||||
// fail, and start from scratch, so it doesn't make sense for us to try and preserve
|
||||
// the stale/multi states at this point.
|
||||
mode: LocationConfigMode::AttachedSingle,
|
||||
});
|
||||
|
||||
shard.generation = std::cmp::max(shard.generation, Some(new_gen));
|
||||
if let Some(observed) = shard.observed.locations.get_mut(&reattach_req.node_id) {
|
||||
// Why can we update `observed` even though we're not sure our response will be received
|
||||
// by the pageserver? Because the pageserver will not proceed with startup until
|
||||
// it has processed response: if it loses it, we'll see another request and increment
|
||||
// generation again, avoiding any uncertainty about dirtiness of tenant's state.
|
||||
if let Some(conf) = observed.conf.as_mut() {
|
||||
conf.generation = new_gen.into();
|
||||
}
|
||||
} else {
|
||||
// This node has no observed state for the shard: perhaps it was offline
|
||||
// when the pageserver restarted. Insert a None, so that the Reconciler
|
||||
// will be prompted to learn the location's state before it makes changes.
|
||||
shard
|
||||
.observed
|
||||
.locations
|
||||
.insert(reattach_req.node_id, ObservedStateLocation { conf: None });
|
||||
// If [`Persistence::re_attach`] selected this shard, it must have alread
|
||||
// had a generation set.
|
||||
debug_assert!(shard_state.generation.is_some());
|
||||
let Some(old_gen) = shard_state.generation else {
|
||||
// Should never happen: would only return incremented generation
|
||||
// for a tenant that already had a non-null generation.
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"Generation must be set while re-attaching"
|
||||
)));
|
||||
};
|
||||
shard_state.generation = Some(std::cmp::max(old_gen, new_gen));
|
||||
if let Some(observed) = shard_state
|
||||
.observed
|
||||
.locations
|
||||
.get_mut(&reattach_req.node_id)
|
||||
{
|
||||
if let Some(conf) = observed.conf.as_mut() {
|
||||
conf.generation = new_gen.into();
|
||||
}
|
||||
} else if shard.intent.get_secondary().contains(&reattach_req.node_id) {
|
||||
// Ordering: pageserver will not accept /location_config requests until it has
|
||||
// finished processing the response from re-attach. So we can update our in-memory state
|
||||
// now, and be confident that we are not stamping on the result of some later location config.
|
||||
// TODO: however, we are not strictly ordered wrt ReconcileResults queue,
|
||||
// so we might update observed state here, and then get over-written by some racing
|
||||
// ReconcileResult. The impact is low however, since we have set state on pageserver something
|
||||
// that matches intent, so worst case if we race then we end up doing a spurious reconcile.
|
||||
|
||||
response.tenants.push(ReAttachResponseTenant {
|
||||
id: *tenant_shard_id,
|
||||
gen: None,
|
||||
mode: LocationConfigMode::Secondary,
|
||||
});
|
||||
|
||||
// We must not update observed, because we have no guarantee that our
|
||||
// response will be received by the pageserver. This could leave it
|
||||
// falsely dirty, but the resulting reconcile should be idempotent.
|
||||
} else {
|
||||
// This node has no observed state for the shard: perhaps it was offline
|
||||
// when the pageserver restarted. Insert a None, so that the Reconciler
|
||||
// will be prompted to learn the location's state before it makes changes.
|
||||
shard_state
|
||||
.observed
|
||||
.locations
|
||||
.insert(reattach_req.node_id, ObservedStateLocation { conf: None });
|
||||
}
|
||||
|
||||
// TODO: cancel/restart any running reconciliation for this tenant, it might be trying
|
||||
// to call location_conf API with an old generation. Wait for cancellation to complete
|
||||
// before responding to this request. Requires well implemented CancellationToken logic
|
||||
// all the way to where we call location_conf. Even then, there can still be a location_conf
|
||||
// request in flight over the network: TODO handle that by making location_conf API refuse
|
||||
// to go backward in generations.
|
||||
}
|
||||
|
||||
// We consider a node Active once we have composed a re-attach response, but we
|
||||
@@ -1539,11 +1491,11 @@ impl Service {
|
||||
&self,
|
||||
create_req: TenantCreateRequest,
|
||||
) -> Result<(TenantCreateResponse, Vec<ReconcilerWaiter>), ApiError> {
|
||||
// As a default, single is convenient for tests that don't choose a policy.
|
||||
let placement_policy = create_req
|
||||
.placement_policy
|
||||
.clone()
|
||||
// As a default, zero secondaries is convenient for tests that don't choose a policy.
|
||||
.unwrap_or(PlacementPolicy::Attached(0));
|
||||
.unwrap_or(PlacementPolicy::Single);
|
||||
|
||||
// This service expects to handle sharding itself: it is an error to try and directly create
|
||||
// a particular shard here.
|
||||
@@ -1753,11 +1705,11 @@ impl Service {
|
||||
| LocationConfigMode::AttachedSingle
|
||||
| LocationConfigMode::AttachedStale => {
|
||||
if nodes.len() > 1 {
|
||||
PlacementPolicy::Attached(1)
|
||||
PlacementPolicy::Double(1)
|
||||
} else {
|
||||
// Convenience for dev/test: if we just have one pageserver, import
|
||||
// tenants into non-HA mode so that scheduling will succeed.
|
||||
PlacementPolicy::Attached(0)
|
||||
// tenants into Single mode so that scheduling will succeed.
|
||||
PlacementPolicy::Single
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -2106,11 +2058,8 @@ impl Service {
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
for tenant_shard_id in shard_ids {
|
||||
let client = PageserverClient::new(
|
||||
node.get_id(),
|
||||
node.base_url(),
|
||||
self.config.jwt_token.as_deref(),
|
||||
);
|
||||
let client =
|
||||
mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
|
||||
|
||||
tracing::info!("Doing time travel recovery for shard {tenant_shard_id}",);
|
||||
|
||||
@@ -2162,11 +2111,7 @@ impl Service {
|
||||
// Issue concurrent requests to all shards' locations
|
||||
let mut futs = FuturesUnordered::new();
|
||||
for (tenant_shard_id, node) in targets {
|
||||
let client = PageserverClient::new(
|
||||
node.get_id(),
|
||||
node.base_url(),
|
||||
self.config.jwt_token.as_deref(),
|
||||
);
|
||||
let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
|
||||
futs.push(async move {
|
||||
let result = client
|
||||
.tenant_secondary_download(tenant_shard_id, wait)
|
||||
@@ -2259,11 +2204,7 @@ impl Service {
|
||||
// Phase 1: delete on the pageservers
|
||||
let mut any_pending = false;
|
||||
for (tenant_shard_id, node) in targets {
|
||||
let client = PageserverClient::new(
|
||||
node.get_id(),
|
||||
node.base_url(),
|
||||
self.config.jwt_token.as_deref(),
|
||||
);
|
||||
let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
|
||||
// TODO: this, like many other places, requires proper retry handling for 503, timeout: those should not
|
||||
// surface immediately as an error to our caller.
|
||||
let status = client.tenant_delete(tenant_shard_id).await.map_err(|e| {
|
||||
@@ -2375,7 +2316,7 @@ impl Service {
|
||||
tenant_shard_id,
|
||||
create_req.new_timeline_id,
|
||||
);
|
||||
let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref());
|
||||
let client = mgmt_api::Client::new(node.base_url(), jwt.as_deref());
|
||||
|
||||
client
|
||||
.timeline_create(tenant_shard_id, &create_req)
|
||||
@@ -2499,7 +2440,7 @@ impl Service {
|
||||
"Deleting timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}",
|
||||
);
|
||||
|
||||
let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref());
|
||||
let client = mgmt_api::Client::new(node.base_url(), jwt.as_deref());
|
||||
client
|
||||
.timeline_delete(tenant_shard_id, timeline_id)
|
||||
.await
|
||||
@@ -2540,11 +2481,11 @@ impl Service {
|
||||
}
|
||||
|
||||
/// When you need to send an HTTP request to the pageserver that holds shard0 of a tenant, this
|
||||
/// function looks up and returns node. If the tenant isn't found, returns Err(ApiError::NotFound)
|
||||
pub(crate) fn tenant_shard0_node(
|
||||
/// function looks it up and returns the url. If the tenant isn't found, returns Err(ApiError::NotFound)
|
||||
pub(crate) fn tenant_shard0_baseurl(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<(Node, TenantShardId), ApiError> {
|
||||
) -> Result<(String, TenantShardId), ApiError> {
|
||||
let locked = self.inner.read().unwrap();
|
||||
let Some((tenant_shard_id, shard)) = locked
|
||||
.tenants
|
||||
@@ -2576,7 +2517,7 @@ impl Service {
|
||||
)));
|
||||
};
|
||||
|
||||
Ok((node.clone(), *tenant_shard_id))
|
||||
Ok((node.base_url(), *tenant_shard_id))
|
||||
}
|
||||
|
||||
pub(crate) fn tenant_locate(
|
||||
@@ -2586,6 +2527,9 @@ impl Service {
|
||||
let locked = self.inner.read().unwrap();
|
||||
tracing::info!("Locating shards for tenant {tenant_id}");
|
||||
|
||||
// Take a snapshot of pageservers
|
||||
let pageservers = locked.nodes.clone();
|
||||
|
||||
let mut result = Vec::new();
|
||||
let mut shard_params: Option<ShardParameters> = None;
|
||||
|
||||
@@ -2599,8 +2543,7 @@ impl Service {
|
||||
"Cannot locate a tenant that is not attached"
|
||||
)))?;
|
||||
|
||||
let node = locked
|
||||
.nodes
|
||||
let node = pageservers
|
||||
.get(&node_id)
|
||||
.expect("Pageservers may not be deleted while referenced");
|
||||
|
||||
@@ -2648,47 +2591,6 @@ impl Service {
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn tenant_describe(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<TenantDescribeResponse, ApiError> {
|
||||
let locked = self.inner.read().unwrap();
|
||||
|
||||
let mut shard_zero = None;
|
||||
let mut shards = Vec::new();
|
||||
|
||||
for (tenant_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id))
|
||||
{
|
||||
if tenant_shard_id.is_zero() {
|
||||
shard_zero = Some(shard);
|
||||
}
|
||||
|
||||
let response_shard = TenantDescribeResponseShard {
|
||||
tenant_shard_id: *tenant_shard_id,
|
||||
node_attached: *shard.intent.get_attached(),
|
||||
node_secondary: shard.intent.get_secondary().to_vec(),
|
||||
last_error: shard.last_error.lock().unwrap().clone(),
|
||||
is_reconciling: shard.reconciler.is_some(),
|
||||
is_pending_compute_notification: shard.pending_compute_notification,
|
||||
is_splitting: matches!(shard.splitting, SplitState::Splitting),
|
||||
};
|
||||
shards.push(response_shard);
|
||||
}
|
||||
|
||||
let Some(shard_zero) = shard_zero else {
|
||||
return Err(ApiError::NotFound(
|
||||
anyhow::anyhow!("Tenant {tenant_id} not found").into(),
|
||||
));
|
||||
};
|
||||
|
||||
Ok(TenantDescribeResponse {
|
||||
shards,
|
||||
stripe_size: shard_zero.shard.stripe_size,
|
||||
policy: shard_zero.policy.clone(),
|
||||
config: shard_zero.config.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id=%op.tenant_id))]
|
||||
async fn abort_tenant_shard_split(
|
||||
&self,
|
||||
@@ -2746,7 +2648,7 @@ impl Service {
|
||||
let detach_locations: Vec<(Node, TenantShardId)> = {
|
||||
let mut detach_locations = Vec::new();
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let (nodes, tenants, scheduler) = locked.parts_mut();
|
||||
let (nodes, tenants, _scheduler) = locked.parts_mut();
|
||||
|
||||
for (tenant_shard_id, shard) in
|
||||
tenants.range_mut(TenantShardId::tenant_range(op.tenant_id))
|
||||
@@ -2779,13 +2681,6 @@ impl Service {
|
||||
|
||||
tracing::info!("Restoring parent shard {tenant_shard_id}");
|
||||
shard.splitting = SplitState::Idle;
|
||||
if let Err(e) = shard.schedule(scheduler) {
|
||||
// If this shard can't be scheduled now (perhaps due to offline nodes or
|
||||
// capacity issues), that must not prevent us rolling back a split. In this
|
||||
// case it should be eventually scheduled in the background.
|
||||
tracing::warn!("Failed to schedule {tenant_shard_id} during shard abort: {e}")
|
||||
}
|
||||
|
||||
self.maybe_reconcile_shard(shard, nodes);
|
||||
}
|
||||
|
||||
@@ -2877,7 +2772,7 @@ impl Service {
|
||||
.map(|(shard_id, _)| *shard_id)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let (nodes, tenants, scheduler) = locked.parts_mut();
|
||||
let (_nodes, tenants, scheduler) = locked.parts_mut();
|
||||
for parent_id in parent_ids {
|
||||
let child_ids = parent_id.split(new_shard_count);
|
||||
|
||||
@@ -2919,7 +2814,7 @@ impl Service {
|
||||
generation,
|
||||
&child_shard,
|
||||
&config,
|
||||
matches!(policy, PlacementPolicy::Attached(n) if n > 0),
|
||||
matches!(policy, PlacementPolicy::Double(n) if n > 0),
|
||||
)),
|
||||
},
|
||||
);
|
||||
@@ -2944,8 +2839,6 @@ impl Service {
|
||||
// find a secondary (e.g. because cluster is overloaded).
|
||||
tracing::warn!("Failed to schedule child shard {child}: {e}");
|
||||
}
|
||||
// In the background, attach secondary locations for the new shards
|
||||
self.maybe_reconcile_shard(&mut child_state, nodes);
|
||||
|
||||
tenants.insert(child, child_state);
|
||||
response.new_shards.push(child);
|
||||
@@ -2968,23 +2861,17 @@ impl Service {
|
||||
let new_shard_count = ShardCount::new(split_req.new_shard_count);
|
||||
let new_stripe_size = split_req.new_stripe_size;
|
||||
|
||||
// Validate the request and construct parameters. This phase is fallible, but does not require
|
||||
// rollback on errors, as it does no I/O and mutates no state.
|
||||
let shard_split_params = match self.prepare_tenant_shard_split(tenant_id, split_req)? {
|
||||
ShardSplitAction::NoOp(resp) => return Ok(resp),
|
||||
ShardSplitAction::Split(params) => params,
|
||||
};
|
||||
|
||||
// Execute this split: this phase mutates state and does remote I/O on pageservers. If it fails,
|
||||
// we must roll back.
|
||||
let r = self
|
||||
.do_tenant_shard_split(tenant_id, shard_split_params)
|
||||
.await;
|
||||
let r = self.do_tenant_shard_split(tenant_id, split_req).await;
|
||||
|
||||
match r {
|
||||
Ok(r) => Ok(r),
|
||||
Err(ApiError::BadRequest(_)) => {
|
||||
// A request validation error does not require rollback: we rejected it before we started making any changes: just
|
||||
// return the error
|
||||
r
|
||||
}
|
||||
Err(e) => {
|
||||
// Split might be part-done, we must do work to abort it.
|
||||
// General case error handling: split might be part-done, we must do work to abort it.
|
||||
tracing::warn!("Enqueuing background abort of split on {tenant_id}");
|
||||
self.abort_tx
|
||||
.send(TenantShardSplitAbort {
|
||||
@@ -3000,18 +2887,25 @@ impl Service {
|
||||
}
|
||||
}
|
||||
|
||||
fn prepare_tenant_shard_split(
|
||||
pub(crate) async fn do_tenant_shard_split(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
split_req: TenantShardSplitRequest,
|
||||
) -> Result<ShardSplitAction, ApiError> {
|
||||
) -> Result<TenantShardSplitResponse, ApiError> {
|
||||
let mut policy = None;
|
||||
let mut shard_ident = None;
|
||||
|
||||
// A parent shard which will be split
|
||||
struct SplitTarget {
|
||||
parent_id: TenantShardId,
|
||||
node: Node,
|
||||
child_ids: Vec<TenantShardId>,
|
||||
}
|
||||
|
||||
fail::fail_point!("shard-split-validation", |_| Err(ApiError::BadRequest(
|
||||
anyhow::anyhow!("failpoint")
|
||||
)));
|
||||
|
||||
let mut policy = None;
|
||||
let mut config = None;
|
||||
let mut shard_ident = None;
|
||||
// Validate input, and calculate which shards we will create
|
||||
let (old_shard_count, targets) =
|
||||
{
|
||||
@@ -3067,9 +2961,6 @@ impl Service {
|
||||
if shard_ident.is_none() {
|
||||
shard_ident = Some(shard.shard);
|
||||
}
|
||||
if config.is_none() {
|
||||
config = Some(shard.config.clone());
|
||||
}
|
||||
|
||||
if tenant_shard_id.shard_count.count() == split_req.new_shard_count {
|
||||
tracing::info!(
|
||||
@@ -3088,7 +2979,9 @@ impl Service {
|
||||
.get(&node_id)
|
||||
.expect("Pageservers may not be deleted while referenced");
|
||||
|
||||
targets.push(ShardSplitTarget {
|
||||
// TODO: if any reconciliation is currently in progress for this shard, wait for it.
|
||||
|
||||
targets.push(SplitTarget {
|
||||
parent_id: *tenant_shard_id,
|
||||
node: node.clone(),
|
||||
child_ids: tenant_shard_id
|
||||
@@ -3098,9 +2991,9 @@ impl Service {
|
||||
|
||||
if targets.is_empty() {
|
||||
if children_found.len() == split_req.new_shard_count as usize {
|
||||
return Ok(ShardSplitAction::NoOp(TenantShardSplitResponse {
|
||||
return Ok(TenantShardSplitResponse {
|
||||
new_shards: children_found,
|
||||
}));
|
||||
});
|
||||
} else {
|
||||
// No shards found to split, and no existing children found: the
|
||||
// tenant doesn't exist at all.
|
||||
@@ -3130,77 +3023,13 @@ impl Service {
|
||||
shard_ident.unwrap()
|
||||
};
|
||||
let policy = policy.unwrap();
|
||||
let config = config.unwrap();
|
||||
|
||||
Ok(ShardSplitAction::Split(ShardSplitParams {
|
||||
old_shard_count,
|
||||
new_shard_count: ShardCount::new(split_req.new_shard_count),
|
||||
new_stripe_size: split_req.new_stripe_size,
|
||||
targets,
|
||||
policy,
|
||||
config,
|
||||
shard_ident,
|
||||
}))
|
||||
}
|
||||
|
||||
async fn do_tenant_shard_split(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
params: ShardSplitParams,
|
||||
) -> Result<TenantShardSplitResponse, ApiError> {
|
||||
// FIXME: we have dropped self.inner lock, and not yet written anything to the database: another
|
||||
// request could occur here, deleting or mutating the tenant. begin_shard_split checks that the
|
||||
// parent shards exist as expected, but it would be neater to do the above pre-checks within the
|
||||
// same database transaction rather than pre-check in-memory and then maybe-fail the database write.
|
||||
// (https://github.com/neondatabase/neon/issues/6676)
|
||||
|
||||
let ShardSplitParams {
|
||||
old_shard_count,
|
||||
new_shard_count,
|
||||
new_stripe_size,
|
||||
mut targets,
|
||||
policy,
|
||||
config,
|
||||
shard_ident,
|
||||
} = params;
|
||||
|
||||
// Drop any secondary locations: pageservers do not support splitting these, and in any case the
|
||||
// end-state for a split tenant will usually be to have secondary locations on different nodes.
|
||||
// The reconciliation calls in this block also implicitly cancel+barrier wrt any ongoing reconciliation
|
||||
// at the time of split.
|
||||
let waiters = {
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let mut waiters = Vec::new();
|
||||
let (nodes, tenants, scheduler) = locked.parts_mut();
|
||||
for target in &mut targets {
|
||||
let Some(shard) = tenants.get_mut(&target.parent_id) else {
|
||||
// Paranoia check: this shouldn't happen: we have the oplock for this tenant ID.
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"Shard {} not found",
|
||||
target.parent_id
|
||||
)));
|
||||
};
|
||||
|
||||
if shard.intent.get_attached() != &Some(target.node.get_id()) {
|
||||
// Paranoia check: this shouldn't happen: we have the oplock for this tenant ID.
|
||||
return Err(ApiError::Conflict(format!(
|
||||
"Shard {} unexpectedly rescheduled during split",
|
||||
target.parent_id
|
||||
)));
|
||||
}
|
||||
|
||||
// Irrespective of PlacementPolicy, clear secondary locations from intent
|
||||
shard.intent.clear_secondary(scheduler);
|
||||
|
||||
// Run Reconciler to execute detach fo secondary locations.
|
||||
if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) {
|
||||
waiters.push(waiter);
|
||||
}
|
||||
}
|
||||
waiters
|
||||
};
|
||||
self.await_waiters(waiters, RECONCILE_TIMEOUT).await?;
|
||||
|
||||
// Before creating any new child shards in memory or on the pageservers, persist them: this
|
||||
// enables us to ensure that we will always be able to clean up if something goes wrong. This also
|
||||
// acts as the protection against two concurrent attempts to split: one of them will get a database
|
||||
@@ -3229,7 +3058,8 @@ impl Service {
|
||||
generation: None,
|
||||
generation_pageserver: Some(target.node.get_id().0 as i64),
|
||||
placement_policy: serde_json::to_string(&policy).unwrap(),
|
||||
config: serde_json::to_string(&config).unwrap(),
|
||||
// TODO: get the config out of the map
|
||||
config: serde_json::to_string(&TenantConfig::default()).unwrap(),
|
||||
splitting: SplitState::Splitting,
|
||||
});
|
||||
}
|
||||
@@ -3281,22 +3111,18 @@ impl Service {
|
||||
// N>1 shards into M shards -- initially we're usually splitting 1 shard into N).
|
||||
|
||||
for target in &targets {
|
||||
let ShardSplitTarget {
|
||||
let SplitTarget {
|
||||
parent_id,
|
||||
node,
|
||||
child_ids,
|
||||
} = target;
|
||||
let client = PageserverClient::new(
|
||||
node.get_id(),
|
||||
node.base_url(),
|
||||
self.config.jwt_token.as_deref(),
|
||||
);
|
||||
let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
|
||||
let response = client
|
||||
.tenant_shard_split(
|
||||
*parent_id,
|
||||
TenantShardSplitRequest {
|
||||
new_shard_count: new_shard_count.literal(),
|
||||
new_stripe_size,
|
||||
new_shard_count: split_req.new_shard_count,
|
||||
new_stripe_size: split_req.new_stripe_size,
|
||||
},
|
||||
)
|
||||
.await
|
||||
@@ -3345,8 +3171,11 @@ impl Service {
|
||||
));
|
||||
|
||||
// Replace all the shards we just split with their children: this phase is infallible.
|
||||
let (response, child_locations) =
|
||||
self.tenant_shard_split_commit_inmem(tenant_id, new_shard_count, new_stripe_size);
|
||||
let (response, child_locations) = self.tenant_shard_split_commit_inmem(
|
||||
tenant_id,
|
||||
ShardCount::new(split_req.new_shard_count),
|
||||
split_req.new_stripe_size,
|
||||
);
|
||||
|
||||
// Send compute notifications for all the new shards
|
||||
let mut failed_notifications = Vec::new();
|
||||
@@ -3411,20 +3240,17 @@ impl Service {
|
||||
let old_attached = *shard.intent.get_attached();
|
||||
|
||||
match shard.policy {
|
||||
PlacementPolicy::Attached(n) => {
|
||||
PlacementPolicy::Single => {
|
||||
shard.intent.clear_secondary(scheduler);
|
||||
shard.intent.set_attached(scheduler, Some(migrate_req.node_id));
|
||||
}
|
||||
PlacementPolicy::Double(_n) => {
|
||||
// If our new attached node was a secondary, it no longer should be.
|
||||
shard.intent.remove_secondary(scheduler, migrate_req.node_id);
|
||||
|
||||
// If we were already attached to something, demote that to a secondary
|
||||
if let Some(old_attached) = old_attached {
|
||||
if n > 0 {
|
||||
// Remove other secondaries to make room for the location we'll demote
|
||||
while shard.intent.get_secondary().len() >= n {
|
||||
shard.intent.pop_secondary(scheduler);
|
||||
}
|
||||
|
||||
shard.intent.push_secondary(scheduler, old_attached);
|
||||
}
|
||||
shard.intent.push_secondary(scheduler, old_attached);
|
||||
}
|
||||
|
||||
shard.intent.set_attached(scheduler, Some(migrate_req.node_id));
|
||||
@@ -3450,7 +3276,7 @@ impl Service {
|
||||
if let Some(waiter) = waiter {
|
||||
waiter.wait_timeout(RECONCILE_TIMEOUT).await?;
|
||||
} else {
|
||||
tracing::info!("Migration is a no-op");
|
||||
tracing::warn!("Migration is a no-op");
|
||||
}
|
||||
|
||||
Ok(TenantShardMigrateResponse {})
|
||||
@@ -3805,13 +3631,6 @@ impl Service {
|
||||
observed_loc.conf = None;
|
||||
}
|
||||
|
||||
if new_nodes.len() == 1 {
|
||||
// Special case for single-node cluster: there is no point trying to reschedule
|
||||
// any tenant shards: avoid doing so, in order to avoid spewing warnings about
|
||||
// failures to schedule them.
|
||||
continue;
|
||||
}
|
||||
|
||||
if tenant_state.intent.demote_attached(node_id) {
|
||||
tenant_state.sequence = tenant_state.sequence.next();
|
||||
match tenant_state.schedule(scheduler) {
|
||||
|
||||
@@ -4,10 +4,7 @@ use std::{
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use crate::{
|
||||
metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome},
|
||||
persistence::TenantShardPersistence,
|
||||
};
|
||||
use crate::{metrics, persistence::TenantShardPersistence};
|
||||
use pageserver_api::controller_api::PlacementPolicy;
|
||||
use pageserver_api::{
|
||||
models::{LocationConfig, LocationConfigMode, TenantConfig},
|
||||
@@ -460,7 +457,22 @@ impl TenantState {
|
||||
// Add/remove nodes to fulfil policy
|
||||
use PlacementPolicy::*;
|
||||
match self.policy {
|
||||
Attached(secondary_count) => {
|
||||
Single => {
|
||||
// Should have exactly one attached, and zero secondaries
|
||||
if !self.intent.secondary.is_empty() {
|
||||
self.intent.clear_secondary(scheduler);
|
||||
modified = true;
|
||||
}
|
||||
|
||||
let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?;
|
||||
modified |= modified_attached;
|
||||
|
||||
if !self.intent.secondary.is_empty() {
|
||||
self.intent.clear_secondary(scheduler);
|
||||
modified = true;
|
||||
}
|
||||
}
|
||||
Double(secondary_count) => {
|
||||
let retain_secondaries = if self.intent.attached.is_none()
|
||||
&& scheduler.node_preferred(&self.intent.secondary).is_some()
|
||||
{
|
||||
@@ -721,10 +733,7 @@ impl TenantState {
|
||||
let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq,
|
||||
tenant_id=%reconciler.tenant_shard_id.tenant_id,
|
||||
shard_id=%reconciler.tenant_shard_id.shard_slug());
|
||||
metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_reconcile_spawn
|
||||
.inc();
|
||||
metrics::RECONCILER.spawned.inc();
|
||||
let result_tx = result_tx.clone();
|
||||
let join_handle = tokio::task::spawn(
|
||||
async move {
|
||||
@@ -742,12 +751,10 @@ impl TenantState {
|
||||
// TODO: wrap all remote API operations in cancellation check
|
||||
// as well.
|
||||
if reconciler.cancel.is_cancelled() {
|
||||
metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_reconcile_complete
|
||||
.inc(ReconcileCompleteLabelGroup {
|
||||
status: ReconcileOutcome::Cancel,
|
||||
});
|
||||
metrics::RECONCILER
|
||||
.complete
|
||||
.with_label_values(&[metrics::ReconcilerMetrics::CANCEL])
|
||||
.inc();
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -762,18 +769,18 @@ impl TenantState {
|
||||
}
|
||||
|
||||
// Update result counter
|
||||
let outcome_label = match &result {
|
||||
Ok(_) => ReconcileOutcome::Success,
|
||||
Err(ReconcileError::Cancel) => ReconcileOutcome::Cancel,
|
||||
Err(_) => ReconcileOutcome::Error,
|
||||
};
|
||||
|
||||
metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_reconcile_complete
|
||||
.inc(ReconcileCompleteLabelGroup {
|
||||
status: outcome_label,
|
||||
});
|
||||
match &result {
|
||||
Ok(_) => metrics::RECONCILER
|
||||
.complete
|
||||
.with_label_values(&[metrics::ReconcilerMetrics::SUCCESS]),
|
||||
Err(ReconcileError::Cancel) => metrics::RECONCILER
|
||||
.complete
|
||||
.with_label_values(&[metrics::ReconcilerMetrics::CANCEL]),
|
||||
Err(_) => metrics::RECONCILER
|
||||
.complete
|
||||
.with_label_values(&[metrics::ReconcilerMetrics::ERROR]),
|
||||
}
|
||||
.inc();
|
||||
|
||||
result_tx
|
||||
.send(ReconcileResult {
|
||||
@@ -888,7 +895,7 @@ pub(crate) mod tests {
|
||||
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
|
||||
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
||||
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1));
|
||||
tenant_state
|
||||
.schedule(&mut scheduler)
|
||||
.expect("we have enough nodes, scheduling should work");
|
||||
@@ -936,7 +943,7 @@ pub(crate) mod tests {
|
||||
let nodes = make_test_nodes(3);
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
|
||||
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
||||
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1));
|
||||
|
||||
tenant_state.observed.locations.insert(
|
||||
NodeId(3),
|
||||
|
||||
@@ -437,7 +437,7 @@ async fn handle_tenant(
|
||||
|
||||
let placement_policy = match create_match.get_one::<String>("placement-policy") {
|
||||
Some(s) if !s.is_empty() => serde_json::from_str::<PlacementPolicy>(s)?,
|
||||
_ => PlacementPolicy::Attached(0),
|
||||
_ => PlacementPolicy::Single,
|
||||
};
|
||||
|
||||
let tenant_conf = PageServerNode::parse_config(tenant_conf)?;
|
||||
@@ -523,6 +523,88 @@ async fn handle_tenant(
|
||||
.with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
|
||||
println!("tenant {tenant_id} successfully configured on the pageserver");
|
||||
}
|
||||
Some(("migrate", matches)) => {
|
||||
let tenant_shard_id = get_tenant_shard_id(matches, env)?;
|
||||
let new_pageserver = get_pageserver(env, matches)?;
|
||||
let new_pageserver_id = new_pageserver.conf.id;
|
||||
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
storage_controller
|
||||
.tenant_migrate(tenant_shard_id, new_pageserver_id)
|
||||
.await?;
|
||||
|
||||
println!("tenant {tenant_shard_id} migrated to {}", new_pageserver_id);
|
||||
}
|
||||
Some(("status", matches)) => {
|
||||
let tenant_id = get_tenant_id(matches, env)?;
|
||||
|
||||
let mut shard_table = comfy_table::Table::new();
|
||||
shard_table.set_header(["Shard", "Pageserver", "Physical Size"]);
|
||||
|
||||
let mut tenant_synthetic_size = None;
|
||||
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
for shard in storage_controller.tenant_locate(tenant_id).await?.shards {
|
||||
let pageserver =
|
||||
PageServerNode::from_env(env, env.get_pageserver_conf(shard.node_id)?);
|
||||
|
||||
let size = pageserver
|
||||
.http_client
|
||||
.tenant_details(shard.shard_id)
|
||||
.await?
|
||||
.tenant_info
|
||||
.current_physical_size
|
||||
.unwrap();
|
||||
|
||||
shard_table.add_row([
|
||||
format!("{}", shard.shard_id.shard_slug()),
|
||||
format!("{}", shard.node_id.0),
|
||||
format!("{} MiB", size / (1024 * 1024)),
|
||||
]);
|
||||
|
||||
if shard.shard_id.is_zero() {
|
||||
tenant_synthetic_size =
|
||||
Some(pageserver.tenant_synthetic_size(shard.shard_id).await?);
|
||||
}
|
||||
}
|
||||
|
||||
let Some(synthetic_size) = tenant_synthetic_size else {
|
||||
bail!("Shard 0 not found")
|
||||
};
|
||||
|
||||
let mut tenant_table = comfy_table::Table::new();
|
||||
tenant_table.add_row(["Tenant ID".to_string(), tenant_id.to_string()]);
|
||||
tenant_table.add_row([
|
||||
"Synthetic size".to_string(),
|
||||
format!("{} MiB", synthetic_size.size.unwrap_or(0) / (1024 * 1024)),
|
||||
]);
|
||||
|
||||
println!("{tenant_table}");
|
||||
println!("{shard_table}");
|
||||
}
|
||||
Some(("shard-split", matches)) => {
|
||||
let tenant_id = get_tenant_id(matches, env)?;
|
||||
let shard_count: u8 = matches.get_one::<u8>("shard-count").cloned().unwrap_or(0);
|
||||
let shard_stripe_size: Option<ShardStripeSize> = matches
|
||||
.get_one::<Option<ShardStripeSize>>("shard-stripe-size")
|
||||
.cloned()
|
||||
.unwrap();
|
||||
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
let result = storage_controller
|
||||
.tenant_split(tenant_id, shard_count, shard_stripe_size)
|
||||
.await?;
|
||||
println!(
|
||||
"Split tenant {} into shards {}",
|
||||
tenant_id,
|
||||
result
|
||||
.new_shards
|
||||
.iter()
|
||||
.map(|s| format!("{:?}", s))
|
||||
.collect::<Vec<_>>()
|
||||
.join(",")
|
||||
);
|
||||
}
|
||||
|
||||
Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
|
||||
None => bail!("no tenant subcommand provided"),
|
||||
@@ -1496,6 +1578,19 @@ fn cli() -> Command {
|
||||
.subcommand(Command::new("config")
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
|
||||
.subcommand(Command::new("migrate")
|
||||
.about("Migrate a tenant from one pageserver to another")
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(pageserver_id_arg.clone()))
|
||||
.subcommand(Command::new("status")
|
||||
.about("Human readable summary of the tenant's shards and attachment locations")
|
||||
.arg(tenant_id_arg.clone()))
|
||||
.subcommand(Command::new("shard-split")
|
||||
.about("Increase the number of shards in the tenant")
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
|
||||
.arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages"))
|
||||
)
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("pageserver")
|
||||
|
||||
@@ -127,8 +127,8 @@ pub struct PageServerConf {
|
||||
pub pg_auth_type: AuthType,
|
||||
pub http_auth_type: AuthType,
|
||||
|
||||
pub(crate) virtual_file_io_engine: Option<String>,
|
||||
pub(crate) get_vectored_impl: Option<String>,
|
||||
pub(crate) virtual_file_io_engine: String,
|
||||
pub(crate) get_vectored_impl: String,
|
||||
}
|
||||
|
||||
impl Default for PageServerConf {
|
||||
@@ -139,8 +139,9 @@ impl Default for PageServerConf {
|
||||
listen_http_addr: String::new(),
|
||||
pg_auth_type: AuthType::Trust,
|
||||
http_auth_type: AuthType::Trust,
|
||||
virtual_file_io_engine: None,
|
||||
get_vectored_impl: None,
|
||||
// FIXME: use the ones exposed by pageserver crate
|
||||
virtual_file_io_engine: "tokio-epoll-uring".to_owned(),
|
||||
get_vectored_impl: "sequential".to_owned(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -101,16 +101,8 @@ impl PageServerNode {
|
||||
|
||||
let pg_auth_type_param = format!("pg_auth_type='{}'", pg_auth_type);
|
||||
let listen_pg_addr_param = format!("listen_pg_addr='{}'", listen_pg_addr);
|
||||
let virtual_file_io_engine = if let Some(virtual_file_io_engine) = virtual_file_io_engine {
|
||||
format!("virtual_file_io_engine='{virtual_file_io_engine}'")
|
||||
} else {
|
||||
String::new()
|
||||
};
|
||||
let get_vectored_impl = if let Some(get_vectored_impl) = get_vectored_impl {
|
||||
format!("get_vectored_impl='{get_vectored_impl}'")
|
||||
} else {
|
||||
String::new()
|
||||
};
|
||||
let virtual_file_io_engine = format!("virtual_file_io_engine='{virtual_file_io_engine}'");
|
||||
let get_vectored_impl = format!("get_vectored_impl='{get_vectored_impl}'");
|
||||
|
||||
let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());
|
||||
|
||||
|
||||
@@ -475,7 +475,7 @@ impl StorageController {
|
||||
pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
|
||||
self.dispatch::<(), _>(
|
||||
Method::GET,
|
||||
format!("debug/v1/tenant/{tenant_id}/locate"),
|
||||
format!("control/v1/tenant/{tenant_id}/locate"),
|
||||
None,
|
||||
)
|
||||
.await
|
||||
|
||||
@@ -6,10 +6,7 @@ use std::str::FromStr;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::NodeId;
|
||||
|
||||
use crate::{
|
||||
models::{ShardParameters, TenantConfig},
|
||||
shard::{ShardStripeSize, TenantShardId},
|
||||
};
|
||||
use crate::{models::ShardParameters, shard::TenantShardId};
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantCreateResponseShard {
|
||||
@@ -60,31 +57,6 @@ pub struct TenantLocateResponse {
|
||||
pub shard_params: ShardParameters,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantDescribeResponse {
|
||||
pub shards: Vec<TenantDescribeResponseShard>,
|
||||
pub stripe_size: ShardStripeSize,
|
||||
pub policy: PlacementPolicy,
|
||||
pub config: TenantConfig,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantDescribeResponseShard {
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
|
||||
pub node_attached: Option<NodeId>,
|
||||
pub node_secondary: Vec<NodeId>,
|
||||
|
||||
pub last_error: String,
|
||||
|
||||
/// A task is currently running to reconcile this tenant's intent state with the state on pageservers
|
||||
pub is_reconciling: bool,
|
||||
/// This shard failed in sending a compute notification to the cloud control plane, and a retry is pending.
|
||||
pub is_pending_compute_notification: bool,
|
||||
/// A shard split is currently underway
|
||||
pub is_splitting: bool,
|
||||
}
|
||||
|
||||
/// Explicitly migrating a particular shard is a low level operation
|
||||
/// TODO: higher level "Reschedule tenant" operation where the request
|
||||
/// specifies some constraints, e.g. asking it to get off particular node(s)
|
||||
@@ -209,8 +181,11 @@ impl From<NodeSchedulingPolicy> for String {
|
||||
/// to create secondary locations.
|
||||
#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
|
||||
pub enum PlacementPolicy {
|
||||
/// Normal live state: one attached pageserver and zero or more secondaries.
|
||||
Attached(usize),
|
||||
/// Cheapest way to attach a tenant: just one pageserver, no secondary
|
||||
Single,
|
||||
/// Production-ready way to attach a tenant: one attached pageserver and
|
||||
/// some number of secondaries.
|
||||
Double(usize),
|
||||
/// Create one secondary mode locations. This is useful when onboarding
|
||||
/// a tenant, or for an idle tenant that we might want to bring online quickly.
|
||||
Secondary,
|
||||
@@ -232,14 +207,14 @@ mod test {
|
||||
/// Check stability of PlacementPolicy's serialization
|
||||
#[test]
|
||||
fn placement_policy_encoding() -> anyhow::Result<()> {
|
||||
let v = PlacementPolicy::Attached(1);
|
||||
let v = PlacementPolicy::Double(1);
|
||||
let encoded = serde_json::to_string(&v)?;
|
||||
assert_eq!(encoded, "{\"Attached\":1}");
|
||||
assert_eq!(encoded, "{\"Double\":1}");
|
||||
assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
|
||||
|
||||
let v = PlacementPolicy::Detached;
|
||||
let v = PlacementPolicy::Single;
|
||||
let encoded = serde_json::to_string(&v)?;
|
||||
assert_eq!(encoded, "\"Detached\"");
|
||||
assert_eq!(encoded, "\"Single\"");
|
||||
assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -6,9 +6,7 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::NodeId;
|
||||
|
||||
use crate::{
|
||||
controller_api::NodeRegisterRequest, models::LocationConfigMode, shard::TenantShardId,
|
||||
};
|
||||
use crate::{controller_api::NodeRegisterRequest, shard::TenantShardId};
|
||||
|
||||
/// Upcall message sent by the pageserver to the configured `control_plane_api` on
|
||||
/// startup.
|
||||
@@ -22,20 +20,12 @@ pub struct ReAttachRequest {
|
||||
pub register: Option<NodeRegisterRequest>,
|
||||
}
|
||||
|
||||
fn default_mode() -> LocationConfigMode {
|
||||
LocationConfigMode::AttachedSingle
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ReAttachResponseTenant {
|
||||
pub id: TenantShardId,
|
||||
/// Mandatory if LocationConfigMode is None or set to an Attached* mode
|
||||
pub gen: Option<u32>,
|
||||
|
||||
/// Default value only for backward compat: this field should be set
|
||||
#[serde(default = "default_mode")]
|
||||
pub mode: LocationConfigMode,
|
||||
pub gen: u32,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ReAttachResponse {
|
||||
pub tenants: Vec<ReAttachResponseTenant>,
|
||||
|
||||
@@ -198,7 +198,6 @@ impl LocalFs {
|
||||
fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.create(true)
|
||||
.truncate(true)
|
||||
.open(&temp_file_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
|
||||
@@ -13,7 +13,6 @@ testing = ["fail/failpoints"]
|
||||
[dependencies]
|
||||
arc-swap.workspace = true
|
||||
sentry.workspace = true
|
||||
async-compression.workspace = true
|
||||
async-trait.workspace = true
|
||||
anyhow.workspace = true
|
||||
bincode.workspace = true
|
||||
@@ -37,7 +36,6 @@ serde_json.workspace = true
|
||||
signal-hook.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-tar.workspace = true
|
||||
tokio-util.workspace = true
|
||||
tracing.workspace = true
|
||||
tracing-error.workspace = true
|
||||
@@ -48,7 +46,6 @@ strum.workspace = true
|
||||
strum_macros.workspace = true
|
||||
url.workspace = true
|
||||
uuid.workspace = true
|
||||
walkdir.workspace = true
|
||||
|
||||
pq_proto.workspace = true
|
||||
postgres_connection.workspace = true
|
||||
|
||||
@@ -245,7 +245,7 @@ impl std::io::Write for ChannelWriter {
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
SERVE_METRICS_COUNT.inc();
|
||||
|
||||
let started_at = std::time::Instant::now();
|
||||
@@ -367,6 +367,7 @@ pub fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
|
||||
.middleware(Middleware::post_with_info(
|
||||
add_request_id_header_to_response,
|
||||
))
|
||||
.get("/metrics", |r| request_span(r, prometheus_metrics_handler))
|
||||
.err_handler(route_error_handler)
|
||||
}
|
||||
|
||||
|
||||
@@ -87,8 +87,6 @@ pub mod failpoint_support;
|
||||
|
||||
pub mod yielding_loop;
|
||||
|
||||
pub mod zstd;
|
||||
|
||||
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
|
||||
///
|
||||
/// we have several cases:
|
||||
|
||||
@@ -63,7 +63,6 @@ impl UnwrittenLockFile {
|
||||
pub fn create_exclusive(lock_file_path: &Utf8Path) -> anyhow::Result<UnwrittenLockFile> {
|
||||
let lock_file = fs::OpenOptions::new()
|
||||
.create(true) // O_CREAT
|
||||
.truncate(true)
|
||||
.write(true)
|
||||
.open(lock_file_path)
|
||||
.context("open lock file")?;
|
||||
|
||||
@@ -245,7 +245,7 @@ impl<'a, T> Guard<'a, T> {
|
||||
///
|
||||
/// The permit will be on a semaphore part of the new internal value, and any following
|
||||
/// [`OnceCell::get_or_init`] will wait on it to complete.
|
||||
pub fn take_and_deinit(mut self) -> (T, InitPermit) {
|
||||
pub fn take_and_deinit(&mut self) -> (T, InitPermit) {
|
||||
let mut swapped = Inner::default();
|
||||
let sem = swapped.init_semaphore.clone();
|
||||
// acquire and forget right away, moving the control over to InitPermit
|
||||
@@ -543,7 +543,7 @@ mod tests {
|
||||
target.set(42, permit);
|
||||
|
||||
let (_answer, permit) = {
|
||||
let guard = target
|
||||
let mut guard = target
|
||||
.get_or_init(|permit| async { Ok::<_, Infallible>((11, permit)) })
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
@@ -1,60 +1,27 @@
|
||||
use std::{alloc::Layout, cmp::Ordering, ops::RangeBounds};
|
||||
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
|
||||
pub enum VecMapOrdering {
|
||||
Greater,
|
||||
GreaterOrEqual,
|
||||
}
|
||||
|
||||
/// Ordered map datastructure implemented in a Vec.
|
||||
/// Append only - can only add keys that are larger than the
|
||||
/// current max key.
|
||||
/// Ordering can be adjusted using [`VecMapOrdering`]
|
||||
/// during `VecMap` construction.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct VecMap<K, V> {
|
||||
data: Vec<(K, V)>,
|
||||
ordering: VecMapOrdering,
|
||||
}
|
||||
pub struct VecMap<K, V>(Vec<(K, V)>);
|
||||
|
||||
impl<K, V> Default for VecMap<K, V> {
|
||||
fn default() -> Self {
|
||||
VecMap {
|
||||
data: Default::default(),
|
||||
ordering: VecMapOrdering::Greater,
|
||||
}
|
||||
VecMap(Default::default())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum VecMapError {
|
||||
#[error("Key violates ordering constraint")]
|
||||
InvalidKey,
|
||||
#[error("Mismatched ordering constraints")]
|
||||
ExtendOrderingError,
|
||||
}
|
||||
#[derive(Debug)]
|
||||
pub struct InvalidKey;
|
||||
|
||||
impl<K: Ord, V> VecMap<K, V> {
|
||||
pub fn new(ordering: VecMapOrdering) -> Self {
|
||||
Self {
|
||||
data: Vec::new(),
|
||||
ordering,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_capacity(capacity: usize, ordering: VecMapOrdering) -> Self {
|
||||
Self {
|
||||
data: Vec::with_capacity(capacity),
|
||||
ordering,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.data.is_empty()
|
||||
self.0.is_empty()
|
||||
}
|
||||
|
||||
pub fn as_slice(&self) -> &[(K, V)] {
|
||||
self.data.as_slice()
|
||||
self.0.as_slice()
|
||||
}
|
||||
|
||||
/// This function may panic if given a range where the lower bound is
|
||||
@@ -62,7 +29,7 @@ impl<K: Ord, V> VecMap<K, V> {
|
||||
pub fn slice_range<R: RangeBounds<K>>(&self, range: R) -> &[(K, V)] {
|
||||
use std::ops::Bound::*;
|
||||
|
||||
let binary_search = |k: &K| self.data.binary_search_by_key(&k, extract_key);
|
||||
let binary_search = |k: &K| self.0.binary_search_by_key(&k, extract_key);
|
||||
|
||||
let start_idx = match range.start_bound() {
|
||||
Unbounded => 0,
|
||||
@@ -74,7 +41,7 @@ impl<K: Ord, V> VecMap<K, V> {
|
||||
};
|
||||
|
||||
let end_idx = match range.end_bound() {
|
||||
Unbounded => self.data.len(),
|
||||
Unbounded => self.0.len(),
|
||||
Included(k) => match binary_search(k) {
|
||||
Ok(idx) => idx + 1,
|
||||
Err(idx) => idx,
|
||||
@@ -82,30 +49,34 @@ impl<K: Ord, V> VecMap<K, V> {
|
||||
Excluded(k) => binary_search(k).unwrap_or_else(std::convert::identity),
|
||||
};
|
||||
|
||||
&self.data[start_idx..end_idx]
|
||||
&self.0[start_idx..end_idx]
|
||||
}
|
||||
|
||||
/// Add a key value pair to the map.
|
||||
/// If `key` is not respective of the `self` ordering the
|
||||
/// pair will not be added and `InvalidKey` error will be returned.
|
||||
pub fn append(&mut self, key: K, value: V) -> Result<usize, VecMapError> {
|
||||
self.validate_key_order(&key)?;
|
||||
/// If `key` is less than or equal to the current maximum key
|
||||
/// the pair will not be added and InvalidKey error will be returned.
|
||||
pub fn append(&mut self, key: K, value: V) -> Result<usize, InvalidKey> {
|
||||
if let Some((last_key, _last_value)) = self.0.last() {
|
||||
if &key <= last_key {
|
||||
return Err(InvalidKey);
|
||||
}
|
||||
}
|
||||
|
||||
let delta_size = self.instrument_vec_op(|vec| vec.push((key, value)));
|
||||
Ok(delta_size)
|
||||
}
|
||||
|
||||
/// Update the maximum key value pair or add a new key value pair to the map.
|
||||
/// If `key` is not respective of the `self` ordering no updates or additions
|
||||
/// will occur and `InvalidKey` error will be returned.
|
||||
/// If `key` is less than the current maximum key no updates or additions
|
||||
/// will occur and InvalidKey error will be returned.
|
||||
pub fn append_or_update_last(
|
||||
&mut self,
|
||||
key: K,
|
||||
mut value: V,
|
||||
) -> Result<(Option<V>, usize), VecMapError> {
|
||||
if let Some((last_key, last_value)) = self.data.last_mut() {
|
||||
) -> Result<(Option<V>, usize), InvalidKey> {
|
||||
if let Some((last_key, last_value)) = self.0.last_mut() {
|
||||
match key.cmp(last_key) {
|
||||
Ordering::Less => return Err(VecMapError::InvalidKey),
|
||||
Ordering::Less => return Err(InvalidKey),
|
||||
Ordering::Equal => {
|
||||
std::mem::swap(last_value, &mut value);
|
||||
const DELTA_SIZE: usize = 0;
|
||||
@@ -129,67 +100,40 @@ impl<K: Ord, V> VecMap<K, V> {
|
||||
V: Clone,
|
||||
{
|
||||
let split_idx = self
|
||||
.data
|
||||
.0
|
||||
.binary_search_by_key(&cutoff, extract_key)
|
||||
.unwrap_or_else(std::convert::identity);
|
||||
|
||||
(
|
||||
VecMap {
|
||||
data: self.data[..split_idx].to_vec(),
|
||||
ordering: self.ordering,
|
||||
},
|
||||
VecMap {
|
||||
data: self.data[split_idx..].to_vec(),
|
||||
ordering: self.ordering,
|
||||
},
|
||||
VecMap(self.0[..split_idx].to_vec()),
|
||||
VecMap(self.0[split_idx..].to_vec()),
|
||||
)
|
||||
}
|
||||
|
||||
/// Move items from `other` to the end of `self`, leaving `other` empty.
|
||||
/// If the `other` ordering is different from `self` ordering
|
||||
/// `ExtendOrderingError` error will be returned.
|
||||
/// If any keys in `other` is not respective of the ordering defined in
|
||||
/// `self`, `InvalidKey` error will be returned and no mutation will occur.
|
||||
pub fn extend(&mut self, other: &mut Self) -> Result<usize, VecMapError> {
|
||||
if self.ordering != other.ordering {
|
||||
return Err(VecMapError::ExtendOrderingError);
|
||||
}
|
||||
/// If any keys in `other` is less than or equal to any key in `self`,
|
||||
/// `InvalidKey` error will be returned and no mutation will occur.
|
||||
pub fn extend(&mut self, other: &mut Self) -> Result<usize, InvalidKey> {
|
||||
let self_last_opt = self.0.last().map(extract_key);
|
||||
let other_first_opt = other.0.last().map(extract_key);
|
||||
|
||||
let other_first_opt = other.data.last().map(extract_key);
|
||||
if let Some(other_first) = other_first_opt {
|
||||
self.validate_key_order(other_first)?;
|
||||
}
|
||||
|
||||
let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.data));
|
||||
Ok(delta_size)
|
||||
}
|
||||
|
||||
/// Validate the current last key in `self` and key being
|
||||
/// inserted against the order defined in `self`.
|
||||
fn validate_key_order(&self, key: &K) -> Result<(), VecMapError> {
|
||||
if let Some(last_key) = self.data.last().map(extract_key) {
|
||||
match (&self.ordering, &key.cmp(last_key)) {
|
||||
(VecMapOrdering::Greater, Ordering::Less | Ordering::Equal) => {
|
||||
return Err(VecMapError::InvalidKey);
|
||||
}
|
||||
(VecMapOrdering::Greater, Ordering::Greater) => {}
|
||||
(VecMapOrdering::GreaterOrEqual, Ordering::Less) => {
|
||||
return Err(VecMapError::InvalidKey);
|
||||
}
|
||||
(VecMapOrdering::GreaterOrEqual, Ordering::Equal | Ordering::Greater) => {}
|
||||
if let (Some(self_last), Some(other_first)) = (self_last_opt, other_first_opt) {
|
||||
if self_last >= other_first {
|
||||
return Err(InvalidKey);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.0));
|
||||
Ok(delta_size)
|
||||
}
|
||||
|
||||
/// Instrument an operation on the underlying [`Vec`].
|
||||
/// Will panic if the operation decreases capacity.
|
||||
/// Returns the increase in memory usage caused by the op.
|
||||
fn instrument_vec_op(&mut self, op: impl FnOnce(&mut Vec<(K, V)>)) -> usize {
|
||||
let old_cap = self.data.capacity();
|
||||
op(&mut self.data);
|
||||
let new_cap = self.data.capacity();
|
||||
let old_cap = self.0.capacity();
|
||||
op(&mut self.0);
|
||||
let new_cap = self.0.capacity();
|
||||
|
||||
match old_cap.cmp(&new_cap) {
|
||||
Ordering::Less => {
|
||||
@@ -201,36 +145,6 @@ impl<K: Ord, V> VecMap<K, V> {
|
||||
Ordering::Greater => panic!("VecMap capacity shouldn't ever decrease"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Similar to `from_iter` defined in `FromIter` trait except
|
||||
/// that it accepts an [`VecMapOrdering`]
|
||||
pub fn from_iter<I: IntoIterator<Item = (K, V)>>(iter: I, ordering: VecMapOrdering) -> Self {
|
||||
let iter = iter.into_iter();
|
||||
let initial_capacity = {
|
||||
match iter.size_hint() {
|
||||
(lower_bound, None) => lower_bound,
|
||||
(_, Some(upper_bound)) => upper_bound,
|
||||
}
|
||||
};
|
||||
|
||||
let mut vec_map = VecMap::with_capacity(initial_capacity, ordering);
|
||||
for (key, value) in iter {
|
||||
vec_map
|
||||
.append(key, value)
|
||||
.expect("The passed collection needs to be sorted!");
|
||||
}
|
||||
|
||||
vec_map
|
||||
}
|
||||
}
|
||||
|
||||
impl<K: Ord, V> IntoIterator for VecMap<K, V> {
|
||||
type Item = (K, V);
|
||||
type IntoIter = std::vec::IntoIter<(K, V)>;
|
||||
|
||||
fn into_iter(self) -> Self::IntoIter {
|
||||
self.data.into_iter()
|
||||
}
|
||||
}
|
||||
|
||||
fn extract_key<K, V>(entry: &(K, V)) -> &K {
|
||||
@@ -241,7 +155,7 @@ fn extract_key<K, V>(entry: &(K, V)) -> &K {
|
||||
mod tests {
|
||||
use std::{collections::BTreeMap, ops::Bound};
|
||||
|
||||
use super::{VecMap, VecMapOrdering};
|
||||
use super::VecMap;
|
||||
|
||||
#[test]
|
||||
fn unbounded_range() {
|
||||
@@ -396,59 +310,5 @@ mod tests {
|
||||
left.extend(&mut one_map).unwrap_err();
|
||||
assert_eq!(left.as_slice(), &[(0, ()), (1, ())]);
|
||||
assert_eq!(one_map.as_slice(), &[(1, ())]);
|
||||
|
||||
let mut map_greater_or_equal = VecMap::new(VecMapOrdering::GreaterOrEqual);
|
||||
map_greater_or_equal.append(2, ()).unwrap();
|
||||
map_greater_or_equal.append(2, ()).unwrap();
|
||||
|
||||
left.extend(&mut map_greater_or_equal).unwrap_err();
|
||||
assert_eq!(left.as_slice(), &[(0, ()), (1, ())]);
|
||||
assert_eq!(map_greater_or_equal.as_slice(), &[(2, ()), (2, ())]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn extend_with_ordering() {
|
||||
let mut left = VecMap::new(VecMapOrdering::GreaterOrEqual);
|
||||
left.append(0, ()).unwrap();
|
||||
assert_eq!(left.as_slice(), &[(0, ())]);
|
||||
|
||||
let mut greater_right = VecMap::new(VecMapOrdering::Greater);
|
||||
greater_right.append(0, ()).unwrap();
|
||||
left.extend(&mut greater_right).unwrap_err();
|
||||
assert_eq!(left.as_slice(), &[(0, ())]);
|
||||
|
||||
let mut greater_or_equal_right = VecMap::new(VecMapOrdering::GreaterOrEqual);
|
||||
greater_or_equal_right.append(2, ()).unwrap();
|
||||
greater_or_equal_right.append(2, ()).unwrap();
|
||||
left.extend(&mut greater_or_equal_right).unwrap();
|
||||
assert_eq!(left.as_slice(), &[(0, ()), (2, ()), (2, ())]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn vec_map_from_sorted() {
|
||||
let vec = vec![(1, ()), (2, ()), (3, ()), (6, ())];
|
||||
let vec_map = VecMap::from_iter(vec, VecMapOrdering::Greater);
|
||||
assert_eq!(vec_map.as_slice(), &[(1, ()), (2, ()), (3, ()), (6, ())]);
|
||||
|
||||
let vec = vec![(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())];
|
||||
let vec_map = VecMap::from_iter(vec, VecMapOrdering::GreaterOrEqual);
|
||||
assert_eq!(
|
||||
vec_map.as_slice(),
|
||||
&[(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn vec_map_from_unsorted_greater() {
|
||||
let vec = vec![(1, ()), (2, ()), (2, ()), (3, ()), (6, ())];
|
||||
let _ = VecMap::from_iter(vec, VecMapOrdering::Greater);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn vec_map_from_unsorted_greater_or_equal() {
|
||||
let vec = vec![(1, ()), (2, ()), (3, ()), (6, ()), (5, ())];
|
||||
let _ = VecMap::from_iter(vec, VecMapOrdering::GreaterOrEqual);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,78 +0,0 @@
|
||||
use std::io::SeekFrom;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use async_compression::{
|
||||
tokio::{bufread::ZstdDecoder, write::ZstdEncoder},
|
||||
zstd::CParameter,
|
||||
Level,
|
||||
};
|
||||
use camino::Utf8Path;
|
||||
use nix::NixPath;
|
||||
use tokio::{
|
||||
fs::{File, OpenOptions},
|
||||
io::AsyncBufRead,
|
||||
io::AsyncSeekExt,
|
||||
io::AsyncWriteExt,
|
||||
};
|
||||
use tokio_tar::{Archive, Builder, HeaderMode};
|
||||
use walkdir::WalkDir;
|
||||
|
||||
/// Creates a Zstandard tarball.
|
||||
pub async fn create_zst_tarball(path: &Utf8Path, tarball: &Utf8Path) -> Result<(File, u64)> {
|
||||
let file = OpenOptions::new()
|
||||
.create(true)
|
||||
.truncate(true)
|
||||
.read(true)
|
||||
.write(true)
|
||||
.open(&tarball)
|
||||
.await
|
||||
.with_context(|| format!("tempfile creation {tarball}"))?;
|
||||
|
||||
let mut paths = Vec::new();
|
||||
for entry in WalkDir::new(path) {
|
||||
let entry = entry?;
|
||||
let metadata = entry.metadata().expect("error getting dir entry metadata");
|
||||
// Also allow directories so that we also get empty directories
|
||||
if !(metadata.is_file() || metadata.is_dir()) {
|
||||
continue;
|
||||
}
|
||||
let path = entry.into_path();
|
||||
paths.push(path);
|
||||
}
|
||||
// Do a sort to get a more consistent listing
|
||||
paths.sort_unstable();
|
||||
let zstd = ZstdEncoder::with_quality_and_params(
|
||||
file,
|
||||
Level::Default,
|
||||
&[CParameter::enable_long_distance_matching(true)],
|
||||
);
|
||||
let mut builder = Builder::new(zstd);
|
||||
// Use reproducible header mode
|
||||
builder.mode(HeaderMode::Deterministic);
|
||||
for p in paths {
|
||||
let rel_path = p.strip_prefix(path)?;
|
||||
if rel_path.is_empty() {
|
||||
// The top directory should not be compressed,
|
||||
// the tar crate doesn't like that
|
||||
continue;
|
||||
}
|
||||
builder.append_path_with_name(&p, rel_path).await?;
|
||||
}
|
||||
let mut zstd = builder.into_inner().await?;
|
||||
zstd.shutdown().await?;
|
||||
let mut compressed = zstd.into_inner();
|
||||
let compressed_len = compressed.metadata().await?.len();
|
||||
compressed.seek(SeekFrom::Start(0)).await?;
|
||||
Ok((compressed, compressed_len))
|
||||
}
|
||||
|
||||
/// Creates a Zstandard tarball.
|
||||
pub async fn extract_zst_tarball(
|
||||
path: &Utf8Path,
|
||||
tarball: impl AsyncBufRead + Unpin,
|
||||
) -> Result<()> {
|
||||
let decoder = Box::pin(ZstdDecoder::new(tarball));
|
||||
let mut archive = Archive::new(decoder);
|
||||
archive.unpack(path).await?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,156 +1,160 @@
|
||||
//! Quantify a single walredo manager's throughput under N concurrent callers.
|
||||
//! Simple benchmarking around walredo.
|
||||
//!
|
||||
//! The benchmark implementation ([`bench_impl`]) is parametrized by
|
||||
//! - `redo_work` => [`Request::short_request`] or [`Request::medium_request`]
|
||||
//! - `n_redos` => number of times the benchmark shell execute the `redo_work`
|
||||
//! - `nclients` => number of clients (more on this shortly).
|
||||
//! Right now they hope to just set a baseline. Later we can try to expand into latency and
|
||||
//! throughput after figuring out the coordinated omission problems below.
|
||||
//!
|
||||
//! The benchmark impl sets up a multi-threaded tokio runtime with default parameters.
|
||||
//! It spawns `nclients` times [`client`] tokio tasks.
|
||||
//! Each task executes the `redo_work` `n_redos/nclients` times.
|
||||
//! There are two sets of inputs; `short` and `medium`. They were collected on postgres v14 by
|
||||
//! logging what happens when a sequential scan is requested on a small table, then picking out two
|
||||
//! suitable from logs.
|
||||
//!
|
||||
//! We exercise the following combinations:
|
||||
//! - `redo_work = short / medium``
|
||||
//! - `nclients = [1, 2, 4, 8, 16, 32, 64, 128]`
|
||||
//!
|
||||
//! We let `criterion` determine the `n_redos` using `iter_custom`.
|
||||
//! The idea is that for each `(redo_work, nclients)` combination,
|
||||
//! criterion will run the `bench_impl` multiple times with different `n_redos`.
|
||||
//! The `bench_impl` reports the aggregate wall clock time from the clients' perspective.
|
||||
//! Criterion will divide that by `n_redos` to compute the "time per iteration".
|
||||
//! In our case, "time per iteration" means "time per redo_work execution".
|
||||
//!
|
||||
//! NB: the way by which `iter_custom` determines the "number of iterations"
|
||||
//! is called sampling. Apparently the idea here is to detect outliers.
|
||||
//! We're not sure whether the current choice of sampling method makes sense.
|
||||
//! See https://bheisler.github.io/criterion.rs/book/user_guide/command_line_output.html#collecting-samples
|
||||
//!
|
||||
//! # Reference Numbers
|
||||
//!
|
||||
//! 2024-03-20 on i3en.3xlarge
|
||||
//!
|
||||
//! ```text
|
||||
//! short/1 time: [26.483 µs 26.614 µs 26.767 µs]
|
||||
//! short/2 time: [32.223 µs 32.465 µs 32.767 µs]
|
||||
//! short/4 time: [47.203 µs 47.583 µs 47.984 µs]
|
||||
//! short/8 time: [89.135 µs 89.612 µs 90.139 µs]
|
||||
//! short/16 time: [190.12 µs 191.52 µs 192.88 µs]
|
||||
//! short/32 time: [380.96 µs 382.63 µs 384.20 µs]
|
||||
//! short/64 time: [736.86 µs 741.07 µs 745.03 µs]
|
||||
//! short/128 time: [1.4106 ms 1.4206 ms 1.4294 ms]
|
||||
//! medium/1 time: [111.81 µs 112.25 µs 112.79 µs]
|
||||
//! medium/2 time: [158.26 µs 159.13 µs 160.21 µs]
|
||||
//! medium/4 time: [334.65 µs 337.14 µs 340.07 µs]
|
||||
//! medium/8 time: [675.32 µs 679.91 µs 685.25 µs]
|
||||
//! medium/16 time: [1.2929 ms 1.2996 ms 1.3067 ms]
|
||||
//! medium/32 time: [2.4295 ms 2.4461 ms 2.4623 ms]
|
||||
//! medium/64 time: [4.3973 ms 4.4458 ms 4.4875 ms]
|
||||
//! medium/128 time: [7.5955 ms 7.7847 ms 7.9481 ms]
|
||||
//! ```
|
||||
//! Reference data (git blame to see commit) on an i3en.3xlarge
|
||||
// ```text
|
||||
//! short/short/1 time: [39.175 µs 39.348 µs 39.536 µs]
|
||||
//! short/short/2 time: [51.227 µs 51.487 µs 51.755 µs]
|
||||
//! short/short/4 time: [76.048 µs 76.362 µs 76.674 µs]
|
||||
//! short/short/8 time: [128.94 µs 129.82 µs 130.74 µs]
|
||||
//! short/short/16 time: [227.84 µs 229.00 µs 230.28 µs]
|
||||
//! short/short/32 time: [455.97 µs 457.81 µs 459.90 µs]
|
||||
//! short/short/64 time: [902.46 µs 904.84 µs 907.32 µs]
|
||||
//! short/short/128 time: [1.7416 ms 1.7487 ms 1.7561 ms]
|
||||
//! ``
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use bytes::{Buf, Bytes};
|
||||
use criterion::{BenchmarkId, Criterion};
|
||||
use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
|
||||
use pageserver_api::{key::Key, shard::TenantShardId};
|
||||
use std::{
|
||||
sync::Arc,
|
||||
time::{Duration, Instant},
|
||||
use pageserver::{
|
||||
config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager,
|
||||
};
|
||||
use tokio::{sync::Barrier, task::JoinSet};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use tokio::task::JoinSet;
|
||||
use utils::{id::TenantId, lsn::Lsn};
|
||||
|
||||
fn bench(c: &mut Criterion) {
|
||||
{
|
||||
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||
for nclients in nclients {
|
||||
let mut group = c.benchmark_group("short");
|
||||
group.bench_with_input(
|
||||
BenchmarkId::from_parameter(nclients),
|
||||
&nclients,
|
||||
|b, nclients| {
|
||||
let redo_work = Arc::new(Request::short_input());
|
||||
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
|
||||
|
||||
{
|
||||
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||
for nclients in nclients {
|
||||
let mut group = c.benchmark_group("medium");
|
||||
group.bench_with_input(
|
||||
BenchmarkId::from_parameter(nclients),
|
||||
&nclients,
|
||||
|b, nclients| {
|
||||
let redo_work = Arc::new(Request::medium_input());
|
||||
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
criterion::criterion_group!(benches, bench);
|
||||
criterion::criterion_main!(benches);
|
||||
fn redo_scenarios(c: &mut Criterion) {
|
||||
// logging should be enabled when adding more inputs, since walredo will only report malformed
|
||||
// input to the stderr.
|
||||
// utils::logging::init(utils::logging::LogFormat::Plain).unwrap();
|
||||
|
||||
// Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
|
||||
fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration {
|
||||
let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();
|
||||
|
||||
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
|
||||
let conf = Box::leak(Box::new(conf));
|
||||
let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
|
||||
|
||||
let manager = PostgresRedoManager::new(conf, tenant_shard_id);
|
||||
|
||||
let manager = Arc::new(manager);
|
||||
|
||||
{
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
tracing::info!("executing first");
|
||||
rt.block_on(short().execute(&manager)).unwrap();
|
||||
tracing::info!("first executed");
|
||||
}
|
||||
|
||||
let thread_counts = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||
|
||||
let mut group = c.benchmark_group("short");
|
||||
group.sampling_mode(criterion::SamplingMode::Flat);
|
||||
|
||||
for thread_count in thread_counts {
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("short", thread_count),
|
||||
&thread_count,
|
||||
|b, thread_count| {
|
||||
add_multithreaded_walredo_requesters(b, *thread_count, &manager, short);
|
||||
},
|
||||
);
|
||||
}
|
||||
drop(group);
|
||||
|
||||
let mut group = c.benchmark_group("medium");
|
||||
group.sampling_mode(criterion::SamplingMode::Flat);
|
||||
|
||||
for thread_count in thread_counts {
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("medium", thread_count),
|
||||
&thread_count,
|
||||
|b, thread_count| {
|
||||
add_multithreaded_walredo_requesters(b, *thread_count, &manager, medium);
|
||||
},
|
||||
);
|
||||
}
|
||||
drop(group);
|
||||
}
|
||||
|
||||
/// Sets up a multi-threaded tokio runtime with default worker thread count,
|
||||
/// then, spawn `requesters` tasks that repeatedly:
|
||||
/// - get input from `input_factor()`
|
||||
/// - call `manager.request_redo()` with their input
|
||||
///
|
||||
/// This stress-tests the scalability of a single walredo manager at high tokio-level concurrency.
|
||||
///
|
||||
/// Using tokio's default worker thread count means the results will differ on machines
|
||||
/// with different core countrs. We don't care about that, the performance will always
|
||||
/// be different on different hardware. To compare performance of different software versions,
|
||||
/// use the same hardware.
|
||||
fn add_multithreaded_walredo_requesters(
|
||||
b: &mut criterion::Bencher,
|
||||
nrequesters: usize,
|
||||
manager: &Arc<PostgresRedoManager>,
|
||||
input_factory: fn() -> Request,
|
||||
) {
|
||||
assert_ne!(nrequesters, 0);
|
||||
|
||||
let rt = tokio::runtime::Builder::new_multi_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let start = Arc::new(Barrier::new(nclients as usize));
|
||||
let barrier = Arc::new(tokio::sync::Barrier::new(nrequesters + 1));
|
||||
|
||||
let mut tasks = JoinSet::new();
|
||||
|
||||
let manager = PostgresRedoManager::new(conf, tenant_shard_id);
|
||||
let manager = Arc::new(manager);
|
||||
|
||||
for _ in 0..nclients {
|
||||
rt.block_on(async {
|
||||
tasks.spawn(client(
|
||||
Arc::clone(&manager),
|
||||
Arc::clone(&start),
|
||||
Arc::clone(&redo_work),
|
||||
// divide the amount of work equally among the clients
|
||||
n_redos / nclients,
|
||||
))
|
||||
let mut requesters = JoinSet::new();
|
||||
for _ in 0..nrequesters {
|
||||
let _entered = rt.enter();
|
||||
let manager = manager.clone();
|
||||
let barrier = barrier.clone();
|
||||
requesters.spawn(async move {
|
||||
loop {
|
||||
let input = input_factory();
|
||||
barrier.wait().await;
|
||||
let page = input.execute(&manager).await.unwrap();
|
||||
assert_eq!(page.remaining(), 8192);
|
||||
barrier.wait().await;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
rt.block_on(async move {
|
||||
let mut total_wallclock_time = std::time::Duration::from_millis(0);
|
||||
while let Some(res) = tasks.join_next().await {
|
||||
total_wallclock_time += res.unwrap();
|
||||
}
|
||||
total_wallclock_time
|
||||
})
|
||||
let do_one_iteration = || {
|
||||
rt.block_on(async {
|
||||
barrier.wait().await;
|
||||
// wait for work to complete
|
||||
barrier.wait().await;
|
||||
})
|
||||
};
|
||||
|
||||
b.iter_batched(
|
||||
|| {
|
||||
// warmup
|
||||
do_one_iteration();
|
||||
},
|
||||
|()| {
|
||||
// work loop
|
||||
do_one_iteration();
|
||||
},
|
||||
criterion::BatchSize::PerIteration,
|
||||
);
|
||||
|
||||
rt.block_on(requesters.shutdown());
|
||||
}
|
||||
|
||||
async fn client(
|
||||
mgr: Arc<PostgresRedoManager>,
|
||||
start: Arc<Barrier>,
|
||||
redo_work: Arc<Request>,
|
||||
n_redos: u64,
|
||||
) -> Duration {
|
||||
start.wait().await;
|
||||
let start = Instant::now();
|
||||
for _ in 0..n_redos {
|
||||
let page = redo_work.execute(&mgr).await.unwrap();
|
||||
assert_eq!(page.remaining(), 8192);
|
||||
// The real pageserver will rarely if ever do 2 walredos in a row without
|
||||
// yielding to the executor.
|
||||
tokio::task::yield_now().await;
|
||||
}
|
||||
start.elapsed()
|
||||
}
|
||||
criterion_group!(benches, redo_scenarios);
|
||||
criterion_main!(benches);
|
||||
|
||||
macro_rules! lsn {
|
||||
($input:expr) => {{
|
||||
@@ -162,46 +166,12 @@ macro_rules! lsn {
|
||||
}};
|
||||
}
|
||||
|
||||
/// Simple wrapper around `WalRedoManager::request_redo`.
|
||||
///
|
||||
/// In benchmarks this is cloned around.
|
||||
#[derive(Clone)]
|
||||
struct Request {
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
base_img: Option<(Lsn, Bytes)>,
|
||||
records: Vec<(Lsn, NeonWalRecord)>,
|
||||
pg_version: u32,
|
||||
}
|
||||
|
||||
impl Request {
|
||||
async fn execute(&self, manager: &PostgresRedoManager) -> anyhow::Result<Bytes> {
|
||||
let Request {
|
||||
key,
|
||||
lsn,
|
||||
base_img,
|
||||
records,
|
||||
pg_version,
|
||||
} = self;
|
||||
|
||||
// TODO: avoid these clones
|
||||
manager
|
||||
.request_redo(*key, *lsn, base_img.clone(), records.clone(), *pg_version)
|
||||
.await
|
||||
}
|
||||
|
||||
fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord {
|
||||
let rec = Bytes::from_static(bytes);
|
||||
NeonWalRecord::Postgres { will_init, rec }
|
||||
}
|
||||
|
||||
/// Short payload, 1132 bytes.
|
||||
// pg_records are copypasted from log, where they are put with Debug impl of Bytes, which uses \0
|
||||
// for null bytes.
|
||||
#[allow(clippy::octal_escapes)]
|
||||
pub fn short_input() -> Request {
|
||||
let pg_record = Self::pg_record;
|
||||
Request {
|
||||
/// Short payload, 1132 bytes.
|
||||
// pg_records are copypasted from log, where they are put with Debug impl of Bytes, which uses \0
|
||||
// for null bytes.
|
||||
#[allow(clippy::octal_escapes)]
|
||||
fn short() -> Request {
|
||||
Request {
|
||||
key: Key {
|
||||
field1: 0,
|
||||
field2: 1663,
|
||||
@@ -224,14 +194,13 @@ impl Request {
|
||||
],
|
||||
pg_version: 14,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Medium sized payload, serializes as 26393 bytes.
|
||||
// see [`short`]
|
||||
#[allow(clippy::octal_escapes)]
|
||||
pub fn medium_input() -> Request {
|
||||
let pg_record = Self::pg_record;
|
||||
Request {
|
||||
/// Medium sized payload, serializes as 26393 bytes.
|
||||
// see [`short`]
|
||||
#[allow(clippy::octal_escapes)]
|
||||
fn medium() -> Request {
|
||||
Request {
|
||||
key: Key {
|
||||
field1: 0,
|
||||
field2: 1663,
|
||||
@@ -473,5 +442,37 @@ impl Request {
|
||||
],
|
||||
pg_version: 14,
|
||||
}
|
||||
}
|
||||
|
||||
fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord {
|
||||
let rec = Bytes::from_static(bytes);
|
||||
NeonWalRecord::Postgres { will_init, rec }
|
||||
}
|
||||
|
||||
/// Simple wrapper around `WalRedoManager::request_redo`.
|
||||
///
|
||||
/// In benchmarks this is cloned around.
|
||||
#[derive(Clone)]
|
||||
struct Request {
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
base_img: Option<(Lsn, Bytes)>,
|
||||
records: Vec<(Lsn, NeonWalRecord)>,
|
||||
pg_version: u32,
|
||||
}
|
||||
|
||||
impl Request {
|
||||
async fn execute(self, manager: &PostgresRedoManager) -> anyhow::Result<Bytes> {
|
||||
let Request {
|
||||
key,
|
||||
lsn,
|
||||
base_img,
|
||||
records,
|
||||
pg_version,
|
||||
} = self;
|
||||
|
||||
manager
|
||||
.request_redo(key, lsn, base_img, records, pg_version)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,8 +5,7 @@ use pageserver_api::{
|
||||
controller_api::NodeRegisterRequest,
|
||||
shard::TenantShardId,
|
||||
upcall_api::{
|
||||
ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest,
|
||||
ValidateRequestTenant, ValidateResponse,
|
||||
ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
|
||||
},
|
||||
};
|
||||
use serde::{de::DeserializeOwned, Serialize};
|
||||
@@ -38,9 +37,7 @@ pub trait ControlPlaneGenerationsApi {
|
||||
fn re_attach(
|
||||
&self,
|
||||
conf: &PageServerConf,
|
||||
) -> impl Future<
|
||||
Output = Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError>,
|
||||
> + Send;
|
||||
) -> impl Future<Output = Result<HashMap<TenantShardId, Generation>, RetryForeverError>> + Send;
|
||||
fn validate(
|
||||
&self,
|
||||
tenants: Vec<(TenantShardId, Generation)>,
|
||||
@@ -121,7 +118,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
|
||||
async fn re_attach(
|
||||
&self,
|
||||
conf: &PageServerConf,
|
||||
) -> Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError> {
|
||||
) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
|
||||
let re_attach_path = self
|
||||
.base_url
|
||||
.join("re-attach")
|
||||
@@ -184,7 +181,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
|
||||
Ok(response
|
||||
.tenants
|
||||
.into_iter()
|
||||
.map(|rart| (rart.id, rart))
|
||||
.map(|t| (t.id, Generation::new(t.gen)))
|
||||
.collect::<HashMap<_, _>>())
|
||||
}
|
||||
|
||||
|
||||
@@ -724,8 +724,8 @@ impl DeletionQueue {
|
||||
mod test {
|
||||
use camino::Utf8Path;
|
||||
use hex_literal::hex;
|
||||
use pageserver_api::{shard::ShardIndex, upcall_api::ReAttachResponseTenant};
|
||||
use std::{io::ErrorKind, time::Duration};
|
||||
use pageserver_api::shard::ShardIndex;
|
||||
use std::io::ErrorKind;
|
||||
use tracing::info;
|
||||
|
||||
use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
|
||||
@@ -834,10 +834,9 @@ mod test {
|
||||
async fn re_attach(
|
||||
&self,
|
||||
_conf: &PageServerConf,
|
||||
) -> Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError> {
|
||||
) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
async fn validate(
|
||||
&self,
|
||||
tenants: Vec<(TenantShardId, Generation)>,
|
||||
|
||||
@@ -36,7 +36,6 @@ use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::auth::JwtAuth;
|
||||
use utils::failpoint_support::failpoints_handler;
|
||||
use utils::http::endpoint::prometheus_metrics_handler;
|
||||
use utils::http::endpoint::request_span;
|
||||
use utils::http::json::json_request_or_empty_body;
|
||||
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
|
||||
@@ -2267,7 +2266,6 @@ pub fn make_router(
|
||||
|
||||
Ok(router
|
||||
.data(state)
|
||||
.get("/metrics", |r| request_span(r, prometheus_metrics_handler))
|
||||
.get("/v1/status", |r| api_handler(r, status_handler))
|
||||
.put("/v1/failpoints", |r| {
|
||||
testing_api_handler("manage failpoints", r, failpoints_handler)
|
||||
|
||||
@@ -2,20 +2,28 @@
|
||||
//! Import data and WAL from a PostgreSQL data directory and WAL segments into
|
||||
//! a neon Timeline.
|
||||
//!
|
||||
use std::io::SeekFrom;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use async_compression::tokio::bufread::ZstdDecoder;
|
||||
use async_compression::{tokio::write::ZstdEncoder, zstd::CParameter, Level};
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8Path;
|
||||
use futures::StreamExt;
|
||||
use tokio::io::{AsyncRead, AsyncReadExt};
|
||||
use nix::NixPath;
|
||||
use tokio::fs::{File, OpenOptions};
|
||||
use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt};
|
||||
use tokio_tar::Archive;
|
||||
use tokio_tar::Builder;
|
||||
use tokio_tar::HeaderMode;
|
||||
use tracing::*;
|
||||
use walkdir::WalkDir;
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use crate::metrics::WAL_INGEST;
|
||||
use crate::pgdatadir_mapping::*;
|
||||
use crate::tenant::remote_timeline_client::INITDB_PATH;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::walingest::WalIngest;
|
||||
use crate::walrecord::DecodedWALRecord;
|
||||
@@ -625,3 +633,65 @@ async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result<Bytes>
|
||||
reader.read_to_end(&mut buf).await?;
|
||||
Ok(Bytes::from(buf))
|
||||
}
|
||||
|
||||
pub async fn create_tar_zst(pgdata_path: &Utf8Path, tmp_path: &Utf8Path) -> Result<(File, u64)> {
|
||||
let file = OpenOptions::new()
|
||||
.create(true)
|
||||
.truncate(true)
|
||||
.read(true)
|
||||
.write(true)
|
||||
.open(&tmp_path)
|
||||
.await
|
||||
.with_context(|| format!("tempfile creation {tmp_path}"))?;
|
||||
|
||||
let mut paths = Vec::new();
|
||||
for entry in WalkDir::new(pgdata_path) {
|
||||
let entry = entry?;
|
||||
let metadata = entry.metadata().expect("error getting dir entry metadata");
|
||||
// Also allow directories so that we also get empty directories
|
||||
if !(metadata.is_file() || metadata.is_dir()) {
|
||||
continue;
|
||||
}
|
||||
let path = entry.into_path();
|
||||
paths.push(path);
|
||||
}
|
||||
// Do a sort to get a more consistent listing
|
||||
paths.sort_unstable();
|
||||
let zstd = ZstdEncoder::with_quality_and_params(
|
||||
file,
|
||||
Level::Default,
|
||||
&[CParameter::enable_long_distance_matching(true)],
|
||||
);
|
||||
let mut builder = Builder::new(zstd);
|
||||
// Use reproducible header mode
|
||||
builder.mode(HeaderMode::Deterministic);
|
||||
for path in paths {
|
||||
let rel_path = path.strip_prefix(pgdata_path)?;
|
||||
if rel_path.is_empty() {
|
||||
// The top directory should not be compressed,
|
||||
// the tar crate doesn't like that
|
||||
continue;
|
||||
}
|
||||
builder.append_path_with_name(&path, rel_path).await?;
|
||||
}
|
||||
let mut zstd = builder.into_inner().await?;
|
||||
zstd.shutdown().await?;
|
||||
let mut compressed = zstd.into_inner();
|
||||
let compressed_len = compressed.metadata().await?.len();
|
||||
const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024;
|
||||
if compressed_len > INITDB_TAR_ZST_WARN_LIMIT {
|
||||
warn!("compressed {INITDB_PATH} size of {compressed_len} is above limit {INITDB_TAR_ZST_WARN_LIMIT}.");
|
||||
}
|
||||
compressed.seek(SeekFrom::Start(0)).await?;
|
||||
Ok((compressed, compressed_len))
|
||||
}
|
||||
|
||||
pub async fn extract_tar_zst(
|
||||
pgdata_path: &Utf8Path,
|
||||
tar_zst: impl AsyncBufRead + Unpin,
|
||||
) -> Result<()> {
|
||||
let tar = Box::pin(ZstdDecoder::new(tar_zst));
|
||||
let mut archive = Archive::new(tar);
|
||||
archive.unpack(pgdata_path).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -34,7 +34,6 @@ use strum::IntoEnumIterator;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, trace, warn};
|
||||
use utils::bin_ser::DeserializeError;
|
||||
use utils::vec_map::{VecMap, VecMapOrdering};
|
||||
use utils::{bin_ser::BeSer, lsn::Lsn};
|
||||
|
||||
const MAX_AUX_FILE_DELTAS: usize = 1024;
|
||||
@@ -1547,13 +1546,12 @@ impl<'a> DatadirModification<'a> {
|
||||
if !self.pending_updates.is_empty() {
|
||||
// The put_batch call below expects expects the inputs to be sorted by Lsn,
|
||||
// so we do that first.
|
||||
let lsn_ordered_batch: VecMap<Lsn, (Key, Value)> = VecMap::from_iter(
|
||||
self.pending_updates
|
||||
.drain()
|
||||
.map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (lsn, (key, val))))
|
||||
.kmerge_by(|lhs, rhs| lhs.0 < rhs.0),
|
||||
VecMapOrdering::GreaterOrEqual,
|
||||
);
|
||||
let lsn_ordered_batch: Vec<(Key, Lsn, Value)> = self
|
||||
.pending_updates
|
||||
.drain()
|
||||
.map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (key, lsn, val)))
|
||||
.kmerge_by(|lhs, rhs| lhs.1 .0 < rhs.1 .0)
|
||||
.collect();
|
||||
|
||||
writer.put_batch(lsn_ordered_batch, ctx).await?;
|
||||
}
|
||||
|
||||
@@ -43,8 +43,6 @@ use utils::sync::gate::Gate;
|
||||
use utils::sync::gate::GateGuard;
|
||||
use utils::timeout::timeout_cancellable;
|
||||
use utils::timeout::TimeoutCancellableError;
|
||||
use utils::zstd::create_zst_tarball;
|
||||
use utils::zstd::extract_zst_tarball;
|
||||
|
||||
use self::config::AttachedLocationConfig;
|
||||
use self::config::AttachmentMode;
|
||||
@@ -202,13 +200,6 @@ pub(super) struct AttachedTenantConf {
|
||||
}
|
||||
|
||||
impl AttachedTenantConf {
|
||||
fn new(tenant_conf: TenantConfOpt, location: AttachedLocationConfig) -> Self {
|
||||
Self {
|
||||
tenant_conf,
|
||||
location,
|
||||
}
|
||||
}
|
||||
|
||||
fn try_from(location_conf: LocationConf) -> anyhow::Result<Self> {
|
||||
match &location_conf.mode {
|
||||
LocationMode::Attached(attach_conf) => Ok(Self {
|
||||
@@ -685,20 +676,9 @@ impl Tenant {
|
||||
}
|
||||
|
||||
// Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state.
|
||||
enum BrokenVerbosity {
|
||||
Error,
|
||||
Info
|
||||
}
|
||||
let make_broken =
|
||||
|t: &Tenant, err: anyhow::Error, verbosity: BrokenVerbosity| {
|
||||
match verbosity {
|
||||
BrokenVerbosity::Info => {
|
||||
info!("attach cancelled, setting tenant state to Broken: {err}");
|
||||
},
|
||||
BrokenVerbosity::Error => {
|
||||
error!("attach failed, setting tenant state to Broken: {err:?}");
|
||||
}
|
||||
}
|
||||
|t: &Tenant, err: anyhow::Error| {
|
||||
error!("attach failed, setting tenant state to Broken: {err:?}");
|
||||
t.state.send_modify(|state| {
|
||||
// The Stopping case is for when we have passed control on to DeleteTenantFlow:
|
||||
// if it errors, we will call make_broken when tenant is already in Stopping.
|
||||
@@ -762,7 +742,7 @@ impl Tenant {
|
||||
// Make the tenant broken so that set_stopping will not hang waiting for it to leave
|
||||
// the Attaching state. This is an over-reaction (nothing really broke, the tenant is
|
||||
// just shutting down), but ensures progress.
|
||||
make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"), BrokenVerbosity::Info);
|
||||
make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"));
|
||||
return Ok(());
|
||||
},
|
||||
)
|
||||
@@ -784,7 +764,7 @@ impl Tenant {
|
||||
match res {
|
||||
Ok(p) => Some(p),
|
||||
Err(e) => {
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(e));
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
@@ -808,7 +788,7 @@ impl Tenant {
|
||||
{
|
||||
Ok(should_resume_deletion) => should_resume_deletion,
|
||||
Err(err) => {
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(err), BrokenVerbosity::Error);
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(err));
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
@@ -838,7 +818,7 @@ impl Tenant {
|
||||
.await;
|
||||
|
||||
if let Err(e) = deleted {
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(e));
|
||||
}
|
||||
|
||||
return Ok(());
|
||||
@@ -859,7 +839,7 @@ impl Tenant {
|
||||
tenant_clone.activate(broker_client, None, &ctx);
|
||||
}
|
||||
Err(e) => {
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(e));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3062,13 +3042,8 @@ impl Tenant {
|
||||
}
|
||||
}
|
||||
|
||||
let (pgdata_zstd, tar_zst_size) = create_zst_tarball(pgdata_path, &temp_path).await?;
|
||||
const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024;
|
||||
if tar_zst_size > INITDB_TAR_ZST_WARN_LIMIT {
|
||||
warn!(
|
||||
"compressed {temp_path} size of {tar_zst_size} is above limit {INITDB_TAR_ZST_WARN_LIMIT}."
|
||||
);
|
||||
}
|
||||
let (pgdata_zstd, tar_zst_size) =
|
||||
import_datadir::create_tar_zst(pgdata_path, &temp_path).await?;
|
||||
|
||||
pausable_failpoint!("before-initdb-upload");
|
||||
|
||||
@@ -3168,7 +3143,7 @@ impl Tenant {
|
||||
|
||||
let buf_read =
|
||||
BufReader::with_capacity(remote_timeline_client::BUFFER_SIZE, initdb_tar_zst);
|
||||
extract_zst_tarball(&pgdata_path, buf_read)
|
||||
import_datadir::extract_tar_zst(&pgdata_path, buf_read)
|
||||
.await
|
||||
.context("extract initdb tar")?;
|
||||
} else {
|
||||
|
||||
@@ -196,17 +196,16 @@ impl LocationConf {
|
||||
/// For use when attaching/re-attaching: update the generation stored in this
|
||||
/// structure. If we were in a secondary state, promote to attached (posession
|
||||
/// of a fresh generation implies this).
|
||||
pub(crate) fn attach_in_generation(&mut self, mode: AttachmentMode, generation: Generation) {
|
||||
pub(crate) fn attach_in_generation(&mut self, generation: Generation) {
|
||||
match &mut self.mode {
|
||||
LocationMode::Attached(attach_conf) => {
|
||||
attach_conf.generation = generation;
|
||||
attach_conf.attach_mode = mode;
|
||||
}
|
||||
LocationMode::Secondary(_) => {
|
||||
// We are promoted to attached by the control plane's re-attach response
|
||||
self.mode = LocationMode::Attached(AttachedLocationConfig {
|
||||
generation,
|
||||
attach_mode: mode,
|
||||
attach_mode: AttachmentMode::Single,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -111,7 +111,6 @@ async fn create_local_delete_mark(
|
||||
let _ = std::fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.create(true)
|
||||
.truncate(true)
|
||||
.open(&marker_path)
|
||||
.with_context(|| format!("could not create delete marker file {marker_path:?}"))?;
|
||||
|
||||
|
||||
@@ -2,13 +2,13 @@
|
||||
//! page server.
|
||||
|
||||
use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
|
||||
use futures::stream::StreamExt;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::models::{LocationConfigMode, ShardParameters};
|
||||
use pageserver_api::models::ShardParameters;
|
||||
use pageserver_api::shard::{
|
||||
ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId,
|
||||
};
|
||||
use pageserver_api::upcall_api::ReAttachResponseTenant;
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use std::borrow::Cow;
|
||||
use std::cmp::Ordering;
|
||||
@@ -125,46 +125,6 @@ pub(crate) enum ShardSelector {
|
||||
Page(Key),
|
||||
}
|
||||
|
||||
/// A convenience for use with the re_attach ControlPlaneClient function: rather
|
||||
/// than the serializable struct, we build this enum that encapsulates
|
||||
/// the invariant that attached tenants always have generations.
|
||||
///
|
||||
/// This represents the subset of a LocationConfig that we receive during re-attach.
|
||||
pub(crate) enum TenantStartupMode {
|
||||
Attached((AttachmentMode, Generation)),
|
||||
Secondary,
|
||||
}
|
||||
|
||||
impl TenantStartupMode {
|
||||
/// Return the generation & mode that should be used when starting
|
||||
/// this tenant.
|
||||
///
|
||||
/// If this returns None, the re-attach struct is in an invalid state and
|
||||
/// should be ignored in the response.
|
||||
fn from_reattach_tenant(rart: ReAttachResponseTenant) -> Option<Self> {
|
||||
match (rart.mode, rart.gen) {
|
||||
(LocationConfigMode::Detached, _) => None,
|
||||
(LocationConfigMode::Secondary, _) => Some(Self::Secondary),
|
||||
(LocationConfigMode::AttachedMulti, Some(g)) => {
|
||||
Some(Self::Attached((AttachmentMode::Multi, Generation::new(g))))
|
||||
}
|
||||
(LocationConfigMode::AttachedSingle, Some(g)) => {
|
||||
Some(Self::Attached((AttachmentMode::Single, Generation::new(g))))
|
||||
}
|
||||
(LocationConfigMode::AttachedStale, Some(g)) => {
|
||||
Some(Self::Attached((AttachmentMode::Stale, Generation::new(g))))
|
||||
}
|
||||
_ => {
|
||||
tracing::warn!(
|
||||
"Received invalid re-attach state for tenant {}: {rart:?}",
|
||||
rart.id
|
||||
);
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TenantsMap {
|
||||
/// Convenience function for typical usage, where we want to get a `Tenant` object, for
|
||||
/// working with attached tenants. If the TenantId is in the map but in Secondary state,
|
||||
@@ -311,7 +271,7 @@ pub struct TenantManager {
|
||||
|
||||
fn emergency_generations(
|
||||
tenant_confs: &HashMap<TenantShardId, anyhow::Result<LocationConf>>,
|
||||
) -> HashMap<TenantShardId, TenantStartupMode> {
|
||||
) -> HashMap<TenantShardId, Generation> {
|
||||
tenant_confs
|
||||
.iter()
|
||||
.filter_map(|(tid, lc)| {
|
||||
@@ -319,15 +279,12 @@ fn emergency_generations(
|
||||
Ok(lc) => lc,
|
||||
Err(_) => return None,
|
||||
};
|
||||
Some((
|
||||
*tid,
|
||||
match &lc.mode {
|
||||
LocationMode::Attached(alc) => {
|
||||
TenantStartupMode::Attached((alc.attach_mode, alc.generation))
|
||||
}
|
||||
LocationMode::Secondary(_) => TenantStartupMode::Secondary,
|
||||
},
|
||||
))
|
||||
let gen = match &lc.mode {
|
||||
LocationMode::Attached(alc) => Some(alc.generation),
|
||||
LocationMode::Secondary(_) => None,
|
||||
};
|
||||
|
||||
gen.map(|g| (*tid, g))
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
@@ -337,7 +294,7 @@ async fn init_load_generations(
|
||||
tenant_confs: &HashMap<TenantShardId, anyhow::Result<LocationConf>>,
|
||||
resources: &TenantSharedResources,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<Option<HashMap<TenantShardId, TenantStartupMode>>> {
|
||||
) -> anyhow::Result<Option<HashMap<TenantShardId, Generation>>> {
|
||||
let generations = if conf.control_plane_emergency_mode {
|
||||
error!(
|
||||
"Emergency mode! Tenants will be attached unsafely using their last known generation"
|
||||
@@ -347,12 +304,7 @@ async fn init_load_generations(
|
||||
info!("Calling control plane API to re-attach tenants");
|
||||
// If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
|
||||
match client.re_attach(conf).await {
|
||||
Ok(tenants) => tenants
|
||||
.into_iter()
|
||||
.flat_map(|(id, rart)| {
|
||||
TenantStartupMode::from_reattach_tenant(rart).map(|tsm| (id, tsm))
|
||||
})
|
||||
.collect(),
|
||||
Ok(tenants) => tenants,
|
||||
Err(RetryForeverError::ShuttingDown) => {
|
||||
anyhow::bail!("Shut down while waiting for control plane re-attach response")
|
||||
}
|
||||
@@ -370,17 +322,9 @@ async fn init_load_generations(
|
||||
// Must only do this if remote storage is enabled, otherwise deletion queue
|
||||
// is not running and channel push will fail.
|
||||
if resources.remote_storage.is_some() {
|
||||
let attached_tenants = generations
|
||||
.iter()
|
||||
.flat_map(|(id, start_mode)| {
|
||||
match start_mode {
|
||||
TenantStartupMode::Attached((_mode, generation)) => Some(generation),
|
||||
TenantStartupMode::Secondary => None,
|
||||
}
|
||||
.map(|gen| (*id, *gen))
|
||||
})
|
||||
.collect();
|
||||
resources.deletion_queue_client.recover(attached_tenants)?;
|
||||
resources
|
||||
.deletion_queue_client
|
||||
.recover(generations.clone())?;
|
||||
}
|
||||
|
||||
Ok(Some(generations))
|
||||
@@ -546,8 +490,9 @@ pub async fn init_tenant_mgr(
|
||||
// Scan local filesystem for attached tenants
|
||||
let tenant_configs = init_load_tenant_configs(conf).await?;
|
||||
|
||||
// Determine which tenants are to be secondary or attached, and in which generation
|
||||
let tenant_modes = init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;
|
||||
// Determine which tenants are to be attached
|
||||
let tenant_generations =
|
||||
init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;
|
||||
|
||||
tracing::info!(
|
||||
"Attaching {} tenants at startup, warming up {} at a time",
|
||||
@@ -577,102 +522,97 @@ pub async fn init_tenant_mgr(
|
||||
}
|
||||
};
|
||||
|
||||
// FIXME: if we were attached, and get demoted to secondary on re-attach, we
|
||||
// don't have a place to get a config.
|
||||
// (https://github.com/neondatabase/neon/issues/5377)
|
||||
const DEFAULT_SECONDARY_CONF: SecondaryLocationConfig =
|
||||
SecondaryLocationConfig { warm: true };
|
||||
|
||||
// Update the location config according to the re-attach response
|
||||
if let Some(tenant_modes) = &tenant_modes {
|
||||
let generation = if let Some(generations) = &tenant_generations {
|
||||
// We have a generation map: treat it as the authority for whether
|
||||
// this tenant is really attached.
|
||||
match tenant_modes.get(&tenant_shard_id) {
|
||||
None => {
|
||||
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response");
|
||||
if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
|
||||
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
|
||||
"Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}",
|
||||
);
|
||||
}
|
||||
|
||||
// We deleted local content: move on to next tenant, don't try and spawn this one.
|
||||
continue;
|
||||
}
|
||||
Some(TenantStartupMode::Secondary) => {
|
||||
if !matches!(location_conf.mode, LocationMode::Secondary(_)) {
|
||||
location_conf.mode = LocationMode::Secondary(DEFAULT_SECONDARY_CONF);
|
||||
}
|
||||
}
|
||||
Some(TenantStartupMode::Attached((attach_mode, generation))) => {
|
||||
let old_gen_higher = match &location_conf.mode {
|
||||
LocationMode::Attached(AttachedLocationConfig {
|
||||
generation: old_generation,
|
||||
attach_mode: _attach_mode,
|
||||
}) => {
|
||||
if old_generation > generation {
|
||||
Some(old_generation)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
_ => None,
|
||||
};
|
||||
if let Some(old_generation) = old_gen_higher {
|
||||
if let Some(gen) = generations.get(&tenant_shard_id) {
|
||||
if let LocationMode::Attached(attached) = &location_conf.mode {
|
||||
if attached.generation > *gen {
|
||||
tracing::error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
|
||||
"Control plane gave decreasing generation ({generation:?}) in re-attach response for tenant that was attached in generation {:?}, demoting to secondary",
|
||||
old_generation
|
||||
"Control plane gave decreasing generation ({gen:?}) in re-attach response for tenant that was attached in generation {:?}, demoting to secondary",
|
||||
attached.generation
|
||||
);
|
||||
|
||||
// We cannot safely attach this tenant given a bogus generation number, but let's avoid throwing away
|
||||
// local disk content: demote to secondary rather than detaching.
|
||||
location_conf.mode = LocationMode::Secondary(DEFAULT_SECONDARY_CONF);
|
||||
} else {
|
||||
location_conf.attach_in_generation(*attach_mode, *generation);
|
||||
tenants.insert(
|
||||
tenant_shard_id,
|
||||
TenantSlot::Secondary(SecondaryTenant::new(
|
||||
tenant_shard_id,
|
||||
location_conf.shard,
|
||||
location_conf.tenant_conf.clone(),
|
||||
&SecondaryLocationConfig { warm: false },
|
||||
)),
|
||||
);
|
||||
}
|
||||
}
|
||||
*gen
|
||||
} else {
|
||||
match &location_conf.mode {
|
||||
LocationMode::Secondary(secondary_config) => {
|
||||
// We do not require the control plane's permission for secondary mode
|
||||
// tenants, because they do no remote writes and hence require no
|
||||
// generation number
|
||||
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Loaded tenant in secondary mode");
|
||||
tenants.insert(
|
||||
tenant_shard_id,
|
||||
TenantSlot::Secondary(SecondaryTenant::new(
|
||||
tenant_shard_id,
|
||||
location_conf.shard,
|
||||
location_conf.tenant_conf,
|
||||
secondary_config,
|
||||
)),
|
||||
);
|
||||
}
|
||||
LocationMode::Attached(_) => {
|
||||
// TODO: augment re-attach API to enable the control plane to
|
||||
// instruct us about secondary attachments. That way, instead of throwing
|
||||
// away local state, we can gracefully fall back to secondary here, if the control
|
||||
// plane tells us so.
|
||||
// (https://github.com/neondatabase/neon/issues/5377)
|
||||
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response");
|
||||
if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
|
||||
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
|
||||
"Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}",
|
||||
);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
// Legacy mode: no generation information, any tenant present
|
||||
// on local disk may activate
|
||||
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Starting tenant in legacy mode, no generation",);
|
||||
Generation::none()
|
||||
};
|
||||
|
||||
// Presence of a generation number implies attachment: attach the tenant
|
||||
// if it wasn't already, and apply the generation number.
|
||||
location_conf.attach_in_generation(generation);
|
||||
Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;
|
||||
|
||||
let shard_identity = location_conf.shard;
|
||||
let slot = match location_conf.mode {
|
||||
LocationMode::Attached(attached_conf) => {
|
||||
match tenant_spawn(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
&tenant_dir_path,
|
||||
resources.clone(),
|
||||
AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
|
||||
shard_identity,
|
||||
Some(init_order.clone()),
|
||||
&TENANTS,
|
||||
SpawnMode::Lazy,
|
||||
&ctx,
|
||||
) {
|
||||
Ok(tenant) => TenantSlot::Attached(tenant),
|
||||
Err(e) => {
|
||||
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}");
|
||||
continue;
|
||||
}
|
||||
}
|
||||
match tenant_spawn(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
&tenant_dir_path,
|
||||
resources.clone(),
|
||||
AttachedTenantConf::try_from(location_conf)?,
|
||||
shard_identity,
|
||||
Some(init_order.clone()),
|
||||
&TENANTS,
|
||||
SpawnMode::Lazy,
|
||||
&ctx,
|
||||
) {
|
||||
Ok(tenant) => {
|
||||
tenants.insert(tenant_shard_id, TenantSlot::Attached(tenant));
|
||||
}
|
||||
LocationMode::Secondary(secondary_conf) => TenantSlot::Secondary(SecondaryTenant::new(
|
||||
tenant_shard_id,
|
||||
shard_identity,
|
||||
location_conf.tenant_conf,
|
||||
&secondary_conf,
|
||||
)),
|
||||
};
|
||||
|
||||
tenants.insert(tenant_shard_id, slot);
|
||||
Err(e) => {
|
||||
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
info!("Processed {} local tenants at startup", tenants.len());
|
||||
@@ -1722,9 +1662,9 @@ impl TenantManager {
|
||||
.layers
|
||||
.read()
|
||||
.await
|
||||
.likely_resident_layers()
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
.resident_layers()
|
||||
.collect::<Vec<_>>()
|
||||
.await;
|
||||
for layer in timeline_layers {
|
||||
let relative_path = layer
|
||||
.local_path()
|
||||
@@ -2203,7 +2143,7 @@ pub(crate) async fn load_tenant(
|
||||
|
||||
let mut location_conf =
|
||||
Tenant::load_tenant_config(conf, &tenant_shard_id).map_err(TenantMapInsertError::Other)?;
|
||||
location_conf.attach_in_generation(AttachmentMode::Single, generation);
|
||||
location_conf.attach_in_generation(generation);
|
||||
|
||||
Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;
|
||||
|
||||
|
||||
@@ -15,7 +15,6 @@ use crate::{
|
||||
tenant::{
|
||||
config::SecondaryLocationConfig,
|
||||
debug_assert_current_span_has_tenant_and_timeline_id,
|
||||
ephemeral_file::is_ephemeral_file,
|
||||
remote_timeline_client::{
|
||||
index::LayerFileMetadata, is_temp_download_file, FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
@@ -535,11 +534,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
.await
|
||||
.maybe_fatal_err(&context_msg)?;
|
||||
|
||||
tracing::debug!(
|
||||
"Wrote local heatmap to {}, with {} timelines",
|
||||
heatmap_path,
|
||||
heatmap.timelines.len()
|
||||
);
|
||||
tracing::debug!("Wrote local heatmap to {}", heatmap_path);
|
||||
|
||||
// Clean up any local layers that aren't in the heatmap. We do this first for all timelines, on the general
|
||||
// principle that deletions should be done before writes wherever possible, and so that we can use this
|
||||
@@ -552,10 +547,6 @@ impl<'a> TenantDownloader<'a> {
|
||||
// Download the layers in the heatmap
|
||||
for timeline in heatmap.timelines {
|
||||
if self.secondary_state.cancel.is_cancelled() {
|
||||
tracing::debug!(
|
||||
"Cancelled before downloading timeline {}",
|
||||
timeline.timeline_id
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
@@ -773,13 +764,10 @@ impl<'a> TenantDownloader<'a> {
|
||||
}
|
||||
};
|
||||
|
||||
tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
|
||||
|
||||
// Download heatmap layers that are not present on local disk, or update their
|
||||
// access time if they are already present.
|
||||
for layer in timeline.layers {
|
||||
if self.secondary_state.cancel.is_cancelled() {
|
||||
tracing::debug!("Cancelled -- dropping out of layer loop");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
@@ -962,10 +950,7 @@ async fn init_timeline_state(
|
||||
// Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
|
||||
warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config");
|
||||
continue;
|
||||
} else if crate::is_temporary(&file_path)
|
||||
|| is_temp_download_file(&file_path)
|
||||
|| is_ephemeral_file(file_name)
|
||||
{
|
||||
} else if crate::is_temporary(&file_path) || is_temp_download_file(&file_path) {
|
||||
// Temporary files are frequently left behind from restarting during downloads
|
||||
tracing::info!("Cleaning up temporary file {file_path}");
|
||||
if let Err(e) = tokio::fs::remove_file(&file_path)
|
||||
|
||||
@@ -300,7 +300,6 @@ where
|
||||
|
||||
let tenant_shard_id = job.get_tenant_shard_id();
|
||||
let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
|
||||
tracing::info!("Command already running, waiting for it");
|
||||
barrier
|
||||
} else {
|
||||
let running = self.spawn_now(job);
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,119 +0,0 @@
|
||||
//! failpoints for unit tests, implying `#[cfg(test)]`.
|
||||
//!
|
||||
//! These are not accessible over http.
|
||||
|
||||
use super::*;
|
||||
|
||||
impl Layer {
|
||||
/// Enable a failpoint from a unit test.
|
||||
pub(super) fn enable_failpoint(&self, failpoint: Failpoint) {
|
||||
self.0.failpoints.lock().unwrap().push(failpoint);
|
||||
}
|
||||
}
|
||||
|
||||
impl LayerInner {
|
||||
/// Query if this failpoint is enabled, as in, arrive at a failpoint.
|
||||
///
|
||||
/// Calls to this method need to be `#[cfg(test)]` guarded.
|
||||
pub(super) async fn failpoint(&self, kind: FailpointKind) -> Result<(), FailpointHit> {
|
||||
let fut = {
|
||||
let mut fps = self.failpoints.lock().unwrap();
|
||||
// find the *last* failpoint for cases in which we need to use multiple for the same
|
||||
// thing (two blocked evictions)
|
||||
let fp = fps.iter_mut().rfind(|x| x.kind() == kind);
|
||||
|
||||
let Some(fp) = fp else {
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
fp.hit()
|
||||
};
|
||||
|
||||
fut.await
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub(crate) enum FailpointKind {
|
||||
/// Failpoint acts as an accurate cancelled by drop here; see the only site of use.
|
||||
AfterDeterminingLayerNeedsNoDownload,
|
||||
/// Failpoint for stalling eviction starting
|
||||
WaitBeforeStartingEvicting,
|
||||
/// Failpoint hit in the spawned task
|
||||
WaitBeforeDownloading,
|
||||
}
|
||||
|
||||
pub(crate) enum Failpoint {
|
||||
AfterDeterminingLayerNeedsNoDownload,
|
||||
WaitBeforeStartingEvicting(
|
||||
Option<utils::completion::Completion>,
|
||||
utils::completion::Barrier,
|
||||
),
|
||||
WaitBeforeDownloading(
|
||||
Option<utils::completion::Completion>,
|
||||
utils::completion::Barrier,
|
||||
),
|
||||
}
|
||||
|
||||
impl Failpoint {
|
||||
fn kind(&self) -> FailpointKind {
|
||||
match self {
|
||||
Failpoint::AfterDeterminingLayerNeedsNoDownload => {
|
||||
FailpointKind::AfterDeterminingLayerNeedsNoDownload
|
||||
}
|
||||
Failpoint::WaitBeforeStartingEvicting(..) => FailpointKind::WaitBeforeStartingEvicting,
|
||||
Failpoint::WaitBeforeDownloading(..) => FailpointKind::WaitBeforeDownloading,
|
||||
}
|
||||
}
|
||||
|
||||
fn hit(&mut self) -> impl std::future::Future<Output = Result<(), FailpointHit>> + 'static {
|
||||
use futures::future::FutureExt;
|
||||
|
||||
// use boxed futures to avoid Either hurdles
|
||||
match self {
|
||||
Failpoint::AfterDeterminingLayerNeedsNoDownload => {
|
||||
let kind = self.kind();
|
||||
|
||||
async move { Err(FailpointHit(kind)) }.boxed()
|
||||
}
|
||||
Failpoint::WaitBeforeStartingEvicting(arrival, b)
|
||||
| Failpoint::WaitBeforeDownloading(arrival, b) => {
|
||||
// first one signals arrival
|
||||
drop(arrival.take());
|
||||
|
||||
let b = b.clone();
|
||||
|
||||
async move {
|
||||
tracing::trace!("waiting on a failpoint barrier");
|
||||
b.wait().await;
|
||||
tracing::trace!("done waiting on a failpoint barrier");
|
||||
Ok(())
|
||||
}
|
||||
.boxed()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for FailpointKind {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
std::fmt::Debug::fmt(self, f)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct FailpointHit(FailpointKind);
|
||||
|
||||
impl std::fmt::Display for FailpointHit {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
std::fmt::Debug::fmt(self, f)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::error::Error for FailpointHit {}
|
||||
|
||||
impl From<FailpointHit> for DownloadError {
|
||||
fn from(value: FailpointHit) -> Self {
|
||||
DownloadError::Failpoint(value.0)
|
||||
}
|
||||
}
|
||||
@@ -1,13 +1,14 @@
|
||||
use futures::StreamExt;
|
||||
use pageserver_api::key::CONTROLFILE_KEY;
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::Instrument;
|
||||
use utils::{
|
||||
completion::{self, Completion},
|
||||
id::TimelineId,
|
||||
};
|
||||
|
||||
use super::failpoints::{Failpoint, FailpointKind};
|
||||
use super::*;
|
||||
use crate::context::DownloadBehavior;
|
||||
use crate::{context::DownloadBehavior, task_mgr::BACKGROUND_RUNTIME};
|
||||
use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness};
|
||||
|
||||
/// Used in tests to advance a future to wanted await point, and not futher.
|
||||
@@ -20,7 +21,7 @@ const FOREVER: std::time::Duration = std::time::Duration::from_secs(ADVANCE.as_s
|
||||
/// Demonstrate the API and resident -> evicted -> resident -> deleted transitions.
|
||||
#[tokio::test]
|
||||
async fn smoke_test() {
|
||||
let handle = tokio::runtime::Handle::current();
|
||||
let handle = BACKGROUND_RUNTIME.handle();
|
||||
|
||||
let h = TenantHarness::create("smoke_test").unwrap();
|
||||
let span = h.span();
|
||||
@@ -37,7 +38,7 @@ async fn smoke_test() {
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
layers.likely_resident_layers().collect::<Vec<_>>()
|
||||
layers.resident_layers().collect::<Vec<_>>().await
|
||||
};
|
||||
|
||||
assert_eq!(layers.len(), 1);
|
||||
@@ -87,7 +88,7 @@ async fn smoke_test() {
|
||||
//
|
||||
// ZERO for timeout does not work reliably, so first take up all spawn_blocking slots to
|
||||
// artificially slow it down.
|
||||
let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(&handle).await;
|
||||
let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await;
|
||||
|
||||
match layer
|
||||
.evict_and_wait(std::time::Duration::ZERO)
|
||||
@@ -98,7 +99,7 @@ async fn smoke_test() {
|
||||
// expected, but note that the eviction is "still ongoing"
|
||||
helper.release().await;
|
||||
// exhaust spawn_blocking pool to ensure it is now complete
|
||||
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle)
|
||||
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle)
|
||||
.await;
|
||||
}
|
||||
other => unreachable!("{other:?}"),
|
||||
@@ -107,7 +108,7 @@ async fn smoke_test() {
|
||||
// only way to query if a layer is resident is to acquire a ResidentLayer instance.
|
||||
// Layer::keep_resident never downloads, but it might initialize if the layer file is found
|
||||
// downloaded locally.
|
||||
let none = layer.keep_resident().await;
|
||||
let none = layer.keep_resident().await.unwrap();
|
||||
assert!(
|
||||
none.is_none(),
|
||||
"Expected none, because eviction removed the local file, found: {none:?}"
|
||||
@@ -166,7 +167,6 @@ async fn smoke_test() {
|
||||
rtc.wait_completion().await.unwrap();
|
||||
|
||||
assert_eq!(rtc.get_remote_physical_size(), 0);
|
||||
assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
|
||||
}
|
||||
|
||||
/// This test demonstrates a previous hang when a eviction and deletion were requested at the same
|
||||
@@ -174,7 +174,7 @@ async fn smoke_test() {
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn evict_and_wait_on_wanted_deleted() {
|
||||
// this is the runtime on which Layer spawns the blocking tasks on
|
||||
let handle = tokio::runtime::Handle::current();
|
||||
let handle = BACKGROUND_RUNTIME.handle();
|
||||
|
||||
let h = TenantHarness::create("evict_and_wait_on_wanted_deleted").unwrap();
|
||||
utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
|
||||
@@ -188,7 +188,7 @@ async fn evict_and_wait_on_wanted_deleted() {
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
layers.likely_resident_layers().collect::<Vec<_>>()
|
||||
layers.resident_layers().collect::<Vec<_>>().await
|
||||
};
|
||||
|
||||
assert_eq!(layers.len(), 1);
|
||||
@@ -213,11 +213,11 @@ async fn evict_and_wait_on_wanted_deleted() {
|
||||
drop(resident);
|
||||
|
||||
// make sure the eviction task gets to run
|
||||
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;
|
||||
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await;
|
||||
|
||||
let resident = layer.keep_resident().await;
|
||||
assert!(
|
||||
resident.is_none(),
|
||||
matches!(resident, Ok(None)),
|
||||
"keep_resident should not have re-initialized: {resident:?}"
|
||||
);
|
||||
|
||||
@@ -235,332 +235,24 @@ async fn evict_and_wait_on_wanted_deleted() {
|
||||
layers.finish_gc_timeline(&[layer]);
|
||||
}
|
||||
|
||||
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;
|
||||
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await;
|
||||
|
||||
assert_eq!(1, LAYER_IMPL_METRICS.started_deletes.get());
|
||||
assert_eq!(1, LAYER_IMPL_METRICS.completed_deletes.get());
|
||||
assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
|
||||
assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get());
|
||||
assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
|
||||
}
|
||||
|
||||
/// This test ensures we are able to read the layer while the layer eviction has been
|
||||
/// started but not completed.
|
||||
#[test]
|
||||
fn read_wins_pending_eviction() {
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.max_blocking_threads(1)
|
||||
.enable_all()
|
||||
.start_paused(true)
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
rt.block_on(async move {
|
||||
// this is the runtime on which Layer spawns the blocking tasks on
|
||||
let handle = tokio::runtime::Handle::current();
|
||||
let h = TenantHarness::create("read_wins_pending_eviction").unwrap();
|
||||
let (tenant, ctx) = h.load().await;
|
||||
let span = h.span();
|
||||
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
|
||||
|
||||
let timeline = tenant
|
||||
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
layers.likely_resident_layers().collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
assert_eq!(layers.len(), 1);
|
||||
|
||||
layers.swap_remove(0)
|
||||
};
|
||||
|
||||
// setup done
|
||||
|
||||
let resident = layer.keep_resident().await.unwrap();
|
||||
|
||||
let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER));
|
||||
|
||||
// drive the future to await on the status channel
|
||||
tokio::time::timeout(ADVANCE, &mut evict_and_wait)
|
||||
.await
|
||||
.expect_err("should had been a timeout since we are holding the layer resident");
|
||||
assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
|
||||
|
||||
let (completion, barrier) = utils::completion::channel();
|
||||
let (arrival, arrived_at_barrier) = utils::completion::channel();
|
||||
layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting(
|
||||
Some(arrival),
|
||||
barrier,
|
||||
));
|
||||
|
||||
// now the eviction cannot proceed because the threads are consumed while completion exists
|
||||
drop(resident);
|
||||
arrived_at_barrier.wait().await;
|
||||
assert!(!layer.is_likely_resident());
|
||||
|
||||
// because no actual eviction happened, we get to just reinitialize the DownloadedLayer
|
||||
layer
|
||||
.0
|
||||
.get_or_maybe_download(false, None)
|
||||
.instrument(download_span)
|
||||
.await
|
||||
.expect("should had reinitialized without downloading");
|
||||
|
||||
assert!(layer.is_likely_resident());
|
||||
|
||||
// reinitialization notifies of new resident status, which should error out all evict_and_wait
|
||||
let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait)
|
||||
.await
|
||||
.expect("no timeout, because get_or_maybe_download re-initialized")
|
||||
.expect_err("eviction should not have succeeded because re-initialized");
|
||||
|
||||
// works as intended: evictions lose to "downloads"
|
||||
assert!(matches!(e, EvictionError::Downloaded), "{e:?}");
|
||||
assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
|
||||
|
||||
// this is not wrong: the eviction is technically still "on the way" as it's still queued
|
||||
// because of a failpoint
|
||||
assert_eq!(
|
||||
0,
|
||||
LAYER_IMPL_METRICS
|
||||
.cancelled_evictions
|
||||
.values()
|
||||
.map(|ctr| ctr.get())
|
||||
.sum::<u64>()
|
||||
);
|
||||
|
||||
drop(completion);
|
||||
|
||||
tokio::time::sleep(ADVANCE).await;
|
||||
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0(&handle, 1)
|
||||
.await;
|
||||
|
||||
assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
|
||||
|
||||
// now we finally can observe the original eviction failing
|
||||
// it would had been possible to observe it earlier, but here it is guaranteed to have
|
||||
// happened.
|
||||
assert_eq!(
|
||||
1,
|
||||
LAYER_IMPL_METRICS
|
||||
.cancelled_evictions
|
||||
.values()
|
||||
.map(|ctr| ctr.get())
|
||||
.sum::<u64>()
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
1,
|
||||
LAYER_IMPL_METRICS.cancelled_evictions[EvictionCancelled::AlreadyReinitialized].get()
|
||||
);
|
||||
|
||||
assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
|
||||
});
|
||||
}
|
||||
|
||||
/// Use failpoint to delay an eviction starting to get a VersionCheckFailed.
|
||||
#[test]
|
||||
fn multiple_pending_evictions_in_order() {
|
||||
let name = "multiple_pending_evictions_in_order";
|
||||
let in_order = true;
|
||||
multiple_pending_evictions_scenario(name, in_order);
|
||||
}
|
||||
|
||||
/// Use failpoint to reorder later eviction before first to get a UnexpectedEvictedState.
|
||||
#[test]
|
||||
fn multiple_pending_evictions_out_of_order() {
|
||||
let name = "multiple_pending_evictions_out_of_order";
|
||||
let in_order = false;
|
||||
multiple_pending_evictions_scenario(name, in_order);
|
||||
}
|
||||
|
||||
fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.max_blocking_threads(1)
|
||||
.enable_all()
|
||||
.start_paused(true)
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
rt.block_on(async move {
|
||||
// this is the runtime on which Layer spawns the blocking tasks on
|
||||
let handle = tokio::runtime::Handle::current();
|
||||
let h = TenantHarness::create(name).unwrap();
|
||||
let (tenant, ctx) = h.load().await;
|
||||
let span = h.span();
|
||||
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
|
||||
|
||||
let timeline = tenant
|
||||
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
layers.likely_resident_layers().collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
assert_eq!(layers.len(), 1);
|
||||
|
||||
layers.swap_remove(0)
|
||||
};
|
||||
|
||||
// setup done
|
||||
|
||||
let resident = layer.keep_resident().await.unwrap();
|
||||
|
||||
let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER));
|
||||
|
||||
// drive the future to await on the status channel
|
||||
tokio::time::timeout(ADVANCE, &mut evict_and_wait)
|
||||
.await
|
||||
.expect_err("should had been a timeout since we are holding the layer resident");
|
||||
assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
|
||||
|
||||
let (completion1, barrier) = utils::completion::channel();
|
||||
let mut completion1 = Some(completion1);
|
||||
let (arrival, arrived_at_barrier) = utils::completion::channel();
|
||||
layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting(
|
||||
Some(arrival),
|
||||
barrier,
|
||||
));
|
||||
|
||||
// now the eviction cannot proceed because we are simulating arbitrary long delay for the
|
||||
// eviction task start.
|
||||
drop(resident);
|
||||
assert!(!layer.is_likely_resident());
|
||||
|
||||
arrived_at_barrier.wait().await;
|
||||
|
||||
// because no actual eviction happened, we get to just reinitialize the DownloadedLayer
|
||||
layer
|
||||
.0
|
||||
.get_or_maybe_download(false, None)
|
||||
.instrument(download_span)
|
||||
.await
|
||||
.expect("should had reinitialized without downloading");
|
||||
|
||||
assert!(layer.is_likely_resident());
|
||||
|
||||
// reinitialization notifies of new resident status, which should error out all evict_and_wait
|
||||
let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait)
|
||||
.await
|
||||
.expect("no timeout, because get_or_maybe_download re-initialized")
|
||||
.expect_err("eviction should not have succeeded because re-initialized");
|
||||
|
||||
// works as intended: evictions lose to "downloads"
|
||||
assert!(matches!(e, EvictionError::Downloaded), "{e:?}");
|
||||
assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
|
||||
|
||||
// this is not wrong: the eviction is technically still "on the way" as it's still queued
|
||||
// because of a failpoint
|
||||
assert_eq!(
|
||||
0,
|
||||
LAYER_IMPL_METRICS
|
||||
.cancelled_evictions
|
||||
.values()
|
||||
.map(|ctr| ctr.get())
|
||||
.sum::<u64>()
|
||||
);
|
||||
|
||||
assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
|
||||
|
||||
// configure another failpoint for the second eviction -- evictions are per initialization,
|
||||
// so now that we've reinitialized the inner, we get to run two of them at the same time.
|
||||
let (completion2, barrier) = utils::completion::channel();
|
||||
let (arrival, arrived_at_barrier) = utils::completion::channel();
|
||||
layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting(
|
||||
Some(arrival),
|
||||
barrier,
|
||||
));
|
||||
|
||||
let mut second_eviction = std::pin::pin!(layer.evict_and_wait(FOREVER));
|
||||
|
||||
// advance to the wait on the queue
|
||||
tokio::time::timeout(ADVANCE, &mut second_eviction)
|
||||
.await
|
||||
.expect_err("timeout because failpoint is blocking");
|
||||
|
||||
arrived_at_barrier.wait().await;
|
||||
|
||||
assert_eq!(2, LAYER_IMPL_METRICS.started_evictions.get());
|
||||
|
||||
let mut release_earlier_eviction = |expected_reason| {
|
||||
assert_eq!(
|
||||
0,
|
||||
LAYER_IMPL_METRICS.cancelled_evictions[expected_reason].get(),
|
||||
);
|
||||
|
||||
drop(completion1.take().unwrap());
|
||||
|
||||
let handle = &handle;
|
||||
|
||||
async move {
|
||||
tokio::time::sleep(ADVANCE).await;
|
||||
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0(
|
||||
handle, 1,
|
||||
)
|
||||
.await;
|
||||
|
||||
assert_eq!(
|
||||
1,
|
||||
LAYER_IMPL_METRICS.cancelled_evictions[expected_reason].get(),
|
||||
);
|
||||
}
|
||||
};
|
||||
|
||||
if in_order {
|
||||
release_earlier_eviction(EvictionCancelled::VersionCheckFailed).await;
|
||||
}
|
||||
|
||||
// release the later eviction which is for the current version
|
||||
drop(completion2);
|
||||
tokio::time::sleep(ADVANCE).await;
|
||||
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0(&handle, 1)
|
||||
.await;
|
||||
|
||||
if !in_order {
|
||||
release_earlier_eviction(EvictionCancelled::UnexpectedEvictedState).await;
|
||||
}
|
||||
|
||||
tokio::time::timeout(ADVANCE, &mut second_eviction)
|
||||
.await
|
||||
.expect("eviction goes through now that spawn_blocking is unclogged")
|
||||
.expect("eviction should succeed, because version matches");
|
||||
|
||||
assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get());
|
||||
|
||||
// ensure the cancelled are unchanged
|
||||
assert_eq!(
|
||||
1,
|
||||
LAYER_IMPL_METRICS
|
||||
.cancelled_evictions
|
||||
.values()
|
||||
.map(|ctr| ctr.get())
|
||||
.sum::<u64>()
|
||||
);
|
||||
|
||||
assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
|
||||
});
|
||||
}
|
||||
|
||||
/// The test ensures with a failpoint that a pending eviction is not cancelled by what is currently
|
||||
/// a `Layer::keep_resident` call.
|
||||
/// This test shows that ensures we are able to read the layer while the layer eviction has been
|
||||
/// started but not completed due to spawn_blocking pool being blocked.
|
||||
///
|
||||
/// This matters because cancelling the eviction would leave us in a state where the file is on
|
||||
/// disk but the layer internal state says it has not been initialized. Futhermore, it allows us to
|
||||
/// have non-repairing `Layer::is_likely_resident`.
|
||||
/// Here `Layer::keep_resident` is used to "simulate" reads, because it cannot download.
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
|
||||
let handle = tokio::runtime::Handle::current();
|
||||
let h =
|
||||
TenantHarness::create("cancelled_get_or_maybe_download_does_not_cancel_eviction").unwrap();
|
||||
async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
|
||||
// this is the runtime on which Layer spawns the blocking tasks on
|
||||
let handle = BACKGROUND_RUNTIME.handle();
|
||||
let h = TenantHarness::create("residency_check_while_evict_and_wait_on_clogged_spawn_blocking")
|
||||
.unwrap();
|
||||
let (tenant, ctx) = h.load().await;
|
||||
|
||||
let timeline = tenant
|
||||
@@ -571,7 +263,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
layers.likely_resident_layers().collect::<Vec<_>>()
|
||||
layers.resident_layers().collect::<Vec<_>>().await
|
||||
};
|
||||
|
||||
assert_eq!(layers.len(), 1);
|
||||
@@ -579,154 +271,90 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
|
||||
layers.swap_remove(0)
|
||||
};
|
||||
|
||||
// this failpoint will simulate the `get_or_maybe_download` becoming cancelled (by returning an
|
||||
// Err) at the right time as in "during" the `LayerInner::needs_download`.
|
||||
layer.enable_failpoint(Failpoint::AfterDeterminingLayerNeedsNoDownload);
|
||||
// setup done
|
||||
|
||||
let (completion, barrier) = utils::completion::channel();
|
||||
let (arrival, arrived_at_barrier) = utils::completion::channel();
|
||||
|
||||
layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting(
|
||||
Some(arrival),
|
||||
barrier,
|
||||
));
|
||||
|
||||
tokio::time::timeout(ADVANCE, layer.evict_and_wait(FOREVER))
|
||||
.await
|
||||
.expect_err("should had advanced to waiting on channel");
|
||||
|
||||
arrived_at_barrier.wait().await;
|
||||
|
||||
// simulate a cancelled read which is cancelled before it gets to re-initialize
|
||||
let e = layer
|
||||
.0
|
||||
.get_or_maybe_download(false, None)
|
||||
.await
|
||||
.unwrap_err();
|
||||
assert!(
|
||||
matches!(
|
||||
e,
|
||||
DownloadError::Failpoint(FailpointKind::AfterDeterminingLayerNeedsNoDownload)
|
||||
),
|
||||
"{e:?}"
|
||||
);
|
||||
|
||||
assert!(
|
||||
layer.0.needs_download().await.unwrap().is_none(),
|
||||
"file is still on disk"
|
||||
);
|
||||
|
||||
// release the eviction task
|
||||
drop(completion);
|
||||
tokio::time::sleep(ADVANCE).await;
|
||||
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;
|
||||
|
||||
// failpoint is still enabled, but it is not hit
|
||||
let e = layer
|
||||
.0
|
||||
.get_or_maybe_download(false, None)
|
||||
.await
|
||||
.unwrap_err();
|
||||
assert!(matches!(e, DownloadError::DownloadRequired), "{e:?}");
|
||||
|
||||
// failpoint is not counted as cancellation either
|
||||
assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn evict_and_wait_does_not_wait_for_download() {
|
||||
// let handle = tokio::runtime::Handle::current();
|
||||
let h = TenantHarness::create("evict_and_wait_does_not_wait_for_download").unwrap();
|
||||
let (tenant, ctx) = h.load().await;
|
||||
let span = h.span();
|
||||
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
|
||||
|
||||
let timeline = tenant
|
||||
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
layers.likely_resident_layers().collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
assert_eq!(layers.len(), 1);
|
||||
|
||||
layers.swap_remove(0)
|
||||
};
|
||||
|
||||
// kind of forced setup: start an eviction but do not allow it progress until we are
|
||||
// downloading
|
||||
let (eviction_can_continue, barrier) = utils::completion::channel();
|
||||
let (arrival, eviction_arrived) = utils::completion::channel();
|
||||
layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting(
|
||||
Some(arrival),
|
||||
barrier,
|
||||
));
|
||||
let resident = layer.keep_resident().await.unwrap();
|
||||
|
||||
let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER));
|
||||
|
||||
// use this once-awaited other_evict to synchronize with the eviction
|
||||
let other_evict = layer.evict_and_wait(FOREVER);
|
||||
|
||||
// drive the future to await on the status channel
|
||||
tokio::time::timeout(ADVANCE, &mut evict_and_wait)
|
||||
.await
|
||||
.expect_err("should had advanced");
|
||||
eviction_arrived.wait().await;
|
||||
drop(eviction_can_continue);
|
||||
other_evict.await.unwrap();
|
||||
.expect_err("should had been a timeout since we are holding the layer resident");
|
||||
assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
|
||||
|
||||
// now the layer is evicted, and the "evict_and_wait" is waiting on the receiver
|
||||
assert!(!layer.is_likely_resident());
|
||||
// clog up BACKGROUND_RUNTIME spawn_blocking
|
||||
let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await;
|
||||
|
||||
// following new evict_and_wait will fail until we've completed the download
|
||||
let e = layer.evict_and_wait(FOREVER).await.unwrap_err();
|
||||
assert!(matches!(e, EvictionError::NotFound), "{e:?}");
|
||||
// now the eviction cannot proceed because the threads are consumed while completion exists
|
||||
drop(resident);
|
||||
|
||||
let (download_can_continue, barrier) = utils::completion::channel();
|
||||
let (arrival, _download_arrived) = utils::completion::channel();
|
||||
layer.enable_failpoint(Failpoint::WaitBeforeDownloading(Some(arrival), barrier));
|
||||
// because no actual eviction happened, we get to just reinitialize the DownloadedLayer
|
||||
layer
|
||||
.keep_resident()
|
||||
.await
|
||||
.expect("keep_resident should had reinitialized without downloading")
|
||||
.expect("ResidentLayer");
|
||||
|
||||
let mut download = std::pin::pin!(layer
|
||||
.0
|
||||
.get_or_maybe_download(true, None)
|
||||
.instrument(download_span));
|
||||
// because the keep_resident check alters wanted evicted without sending a message, we will
|
||||
// never get completed
|
||||
let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait)
|
||||
.await
|
||||
.expect("no timeout, because keep_resident re-initialized")
|
||||
.expect_err("eviction should not have succeeded because re-initialized");
|
||||
|
||||
assert!(
|
||||
!layer.is_likely_resident(),
|
||||
"during download layer is evicted"
|
||||
// works as intended: evictions lose to "downloads"
|
||||
assert!(matches!(e, EvictionError::Downloaded), "{e:?}");
|
||||
assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
|
||||
|
||||
// this is not wrong: the eviction is technically still "on the way" as it's still queued
|
||||
// because spawn_blocking is clogged up
|
||||
assert_eq!(
|
||||
0,
|
||||
LAYER_IMPL_METRICS
|
||||
.cancelled_evictions
|
||||
.values()
|
||||
.map(|ctr| ctr.get())
|
||||
.sum::<u64>()
|
||||
);
|
||||
|
||||
tokio::time::timeout(ADVANCE, &mut download)
|
||||
let mut second_eviction = std::pin::pin!(layer.evict_and_wait(FOREVER));
|
||||
|
||||
// advance to the wait on the queue
|
||||
tokio::time::timeout(ADVANCE, &mut second_eviction)
|
||||
.await
|
||||
.expect_err("should had timed out because of failpoint");
|
||||
.expect_err("timeout because spawn_blocking is clogged");
|
||||
|
||||
// now we finally get to continue, and because the latest state is downloading, we deduce that
|
||||
// original eviction succeeded
|
||||
evict_and_wait.await.unwrap();
|
||||
// in this case we don't leak started evictions, but I think there is still a chance of that
|
||||
// happening, because we could have upgrades race multiple evictions while only one of them
|
||||
// happens?
|
||||
assert_eq!(2, LAYER_IMPL_METRICS.started_evictions.get());
|
||||
|
||||
// however a new evict_and_wait will fail
|
||||
let e = layer.evict_and_wait(FOREVER).await.unwrap_err();
|
||||
assert!(matches!(e, EvictionError::NotFound), "{e:?}");
|
||||
helper.release().await;
|
||||
|
||||
assert!(!layer.is_likely_resident());
|
||||
// the second_eviction gets to run here
|
||||
//
|
||||
// synchronize to be *strictly* after the second_eviction spawn_blocking run
|
||||
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await;
|
||||
|
||||
drop(download_can_continue);
|
||||
download.await.expect("download should had succeeded");
|
||||
assert!(layer.is_likely_resident());
|
||||
tokio::time::timeout(ADVANCE, &mut second_eviction)
|
||||
.await
|
||||
.expect("eviction goes through now that spawn_blocking is unclogged")
|
||||
.expect("eviction should succeed, because version matches");
|
||||
|
||||
// only now can we evict
|
||||
layer.evict_and_wait(FOREVER).await.unwrap();
|
||||
}
|
||||
assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get());
|
||||
|
||||
#[test]
|
||||
fn layer_size() {
|
||||
assert_eq!(std::mem::size_of::<LayerAccessStats>(), 2040);
|
||||
assert_eq!(std::mem::size_of::<PersistentLayerDesc>(), 104);
|
||||
assert_eq!(std::mem::size_of::<LayerInner>(), 2328);
|
||||
// it also has the utf8 path
|
||||
// now we finally can observe the original spawn_blocking failing
|
||||
// it would had been possible to observe it earlier, but here it is guaranteed to have
|
||||
// happened.
|
||||
assert_eq!(
|
||||
1,
|
||||
LAYER_IMPL_METRICS
|
||||
.cancelled_evictions
|
||||
.values()
|
||||
.map(|ctr| ctr.get())
|
||||
.sum::<u64>()
|
||||
);
|
||||
}
|
||||
|
||||
struct SpawnBlockingPoolHelper {
|
||||
@@ -743,41 +371,31 @@ impl SpawnBlockingPoolHelper {
|
||||
///
|
||||
/// This should be no issue nowdays, because nextest runs each test in it's own process.
|
||||
async fn consume_all_spawn_blocking_threads(handle: &tokio::runtime::Handle) -> Self {
|
||||
let default_max_blocking_threads = 512;
|
||||
|
||||
Self::consume_all_spawn_blocking_threads0(handle, default_max_blocking_threads).await
|
||||
}
|
||||
|
||||
async fn consume_all_spawn_blocking_threads0(
|
||||
handle: &tokio::runtime::Handle,
|
||||
threads: usize,
|
||||
) -> Self {
|
||||
assert_ne!(threads, 0);
|
||||
|
||||
let (completion, barrier) = completion::channel();
|
||||
let (started, starts_completed) = completion::channel();
|
||||
let (tx, mut rx) = tokio::sync::mpsc::channel(8);
|
||||
|
||||
let assumed_max_blocking_threads = 512;
|
||||
|
||||
let mut blocking_tasks = JoinSet::new();
|
||||
|
||||
for _ in 0..threads {
|
||||
for _ in 0..assumed_max_blocking_threads {
|
||||
let barrier = barrier.clone();
|
||||
let started = started.clone();
|
||||
let tx = tx.clone();
|
||||
blocking_tasks.spawn_blocking_on(
|
||||
move || {
|
||||
drop(started);
|
||||
tx.blocking_send(()).unwrap();
|
||||
drop(tx);
|
||||
tokio::runtime::Handle::current().block_on(barrier.wait());
|
||||
},
|
||||
handle,
|
||||
);
|
||||
}
|
||||
|
||||
drop(started);
|
||||
|
||||
starts_completed.wait().await;
|
||||
|
||||
drop(barrier);
|
||||
|
||||
tracing::trace!("consumed all threads");
|
||||
for _ in 0..assumed_max_blocking_threads {
|
||||
rx.recv().await.unwrap();
|
||||
}
|
||||
|
||||
SpawnBlockingPoolHelper {
|
||||
awaited_by_spawn_blocking_tasks: completion,
|
||||
@@ -797,22 +415,13 @@ impl SpawnBlockingPoolHelper {
|
||||
while let Some(res) = blocking_tasks.join_next().await {
|
||||
res.expect("none of the tasks should had panicked");
|
||||
}
|
||||
|
||||
tracing::trace!("released all threads");
|
||||
}
|
||||
|
||||
/// In the tests it is used as an easy way of making sure something scheduled on the target
|
||||
/// runtimes `spawn_blocking` has completed, because it must've been scheduled and completed
|
||||
/// before our tasks have a chance to schedule and complete.
|
||||
async fn consume_and_release_all_of_spawn_blocking_threads(handle: &tokio::runtime::Handle) {
|
||||
Self::consume_and_release_all_of_spawn_blocking_threads0(handle, 512).await
|
||||
}
|
||||
|
||||
async fn consume_and_release_all_of_spawn_blocking_threads0(
|
||||
handle: &tokio::runtime::Handle,
|
||||
threads: usize,
|
||||
) {
|
||||
Self::consume_all_spawn_blocking_threads0(handle, threads)
|
||||
Self::consume_all_spawn_blocking_threads(handle)
|
||||
.await
|
||||
.release()
|
||||
.await
|
||||
@@ -826,7 +435,7 @@ fn spawn_blocking_pool_helper_actually_works() {
|
||||
// because the amount is not configurable for our helper, expect the same amount as
|
||||
// BACKGROUND_RUNTIME using the tokio defaults would have.
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.max_blocking_threads(1)
|
||||
.max_blocking_threads(512)
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
@@ -836,8 +445,7 @@ fn spawn_blocking_pool_helper_actually_works() {
|
||||
rt.block_on(async move {
|
||||
// this will not return until all threads are spun up and actually executing the code
|
||||
// waiting on `consumed` to be `SpawnBlockingPoolHelper::release`'d.
|
||||
let consumed =
|
||||
SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads0(handle, 1).await;
|
||||
let consumed = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await;
|
||||
|
||||
println!("consumed");
|
||||
|
||||
|
||||
@@ -13,6 +13,7 @@ use bytes::Bytes;
|
||||
use camino::Utf8Path;
|
||||
use enumset::EnumSet;
|
||||
use fail::fail_point;
|
||||
use futures::stream::StreamExt;
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::{
|
||||
key::AUX_FILES_KEY,
|
||||
@@ -36,7 +37,6 @@ use tracing::*;
|
||||
use utils::{
|
||||
bin_ser::BeSer,
|
||||
sync::gate::{Gate, GateGuard},
|
||||
vec_map::VecMap,
|
||||
};
|
||||
|
||||
use std::ops::{Deref, Range};
|
||||
@@ -2442,7 +2442,7 @@ impl Timeline {
|
||||
|
||||
let guard = self.layers.read().await;
|
||||
|
||||
let resident = guard.likely_resident_layers().map(|layer| {
|
||||
let resident = guard.resident_layers().map(|layer| {
|
||||
let last_activity_ts = layer.access_stats().latest_activity_or_now();
|
||||
|
||||
HeatMapLayer::new(
|
||||
@@ -2452,7 +2452,7 @@ impl Timeline {
|
||||
)
|
||||
});
|
||||
|
||||
let layers = resident.collect();
|
||||
let layers = resident.collect().await;
|
||||
|
||||
Some(HeatMapTimeline::new(self.timeline_id, layers))
|
||||
}
|
||||
@@ -4302,7 +4302,7 @@ impl Timeline {
|
||||
let mut max_layer_size: Option<u64> = None;
|
||||
|
||||
let resident_layers = guard
|
||||
.likely_resident_layers()
|
||||
.resident_layers()
|
||||
.map(|layer| {
|
||||
let file_size = layer.layer_desc().file_size;
|
||||
max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size)));
|
||||
@@ -4315,7 +4315,8 @@ impl Timeline {
|
||||
relative_last_activity: finite_f32::FiniteF32::ZERO,
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
.collect()
|
||||
.await;
|
||||
|
||||
DiskUsageEvictionInfo {
|
||||
max_layer_size,
|
||||
@@ -4617,15 +4618,16 @@ impl<'a> TimelineWriter<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Put a batch of keys at the specified Lsns.
|
||||
/// Put a batch keys at the specified Lsns.
|
||||
///
|
||||
/// The batch is sorted by Lsn (enforced by usage of [`utils::vec_map::VecMap`].
|
||||
/// The batch should be sorted by Lsn such that it's safe
|
||||
/// to roll the open layer mid batch.
|
||||
pub(crate) async fn put_batch(
|
||||
&mut self,
|
||||
batch: VecMap<Lsn, (Key, Value)>,
|
||||
batch: Vec<(Key, Lsn, Value)>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
for (lsn, (key, val)) in batch {
|
||||
for (key, lsn, val) in batch {
|
||||
self.put(key, lsn, &val, ctx).await?
|
||||
}
|
||||
|
||||
@@ -4711,6 +4713,7 @@ mod tests {
|
||||
.keep_resident()
|
||||
.await
|
||||
.expect("no download => no downloading errors")
|
||||
.expect("should had been resident")
|
||||
.drop_eviction_guard();
|
||||
|
||||
let forever = std::time::Duration::from_secs(120);
|
||||
@@ -4721,7 +4724,7 @@ mod tests {
|
||||
let (first, second) = tokio::join!(first, second);
|
||||
|
||||
let res = layer.keep_resident().await;
|
||||
assert!(res.is_none(), "{res:?}");
|
||||
assert!(matches!(res, Ok(None)), "{res:?}");
|
||||
|
||||
match (first, second) {
|
||||
(Ok(()), Ok(())) => {
|
||||
|
||||
@@ -225,18 +225,24 @@ impl Timeline {
|
||||
{
|
||||
let guard = self.layers.read().await;
|
||||
let layers = guard.layer_map();
|
||||
for layer in layers.iter_historic_layers() {
|
||||
let layer = guard.get_from_desc(&layer);
|
||||
for hist_layer in layers.iter_historic_layers() {
|
||||
let hist_layer = guard.get_from_desc(&hist_layer);
|
||||
|
||||
// guard against eviction while we inspect it; it might be that eviction_task and
|
||||
// disk_usage_eviction_task both select the same layers to be evicted, and
|
||||
// seemingly free up double the space. both succeeding is of no consequence.
|
||||
let guard = match hist_layer.keep_resident().await {
|
||||
Ok(Some(l)) => l,
|
||||
Ok(None) => continue,
|
||||
Err(e) => {
|
||||
// these should not happen, but we cannot make them statically impossible right
|
||||
// now.
|
||||
tracing::warn!(layer=%hist_layer, "failed to keep the layer resident: {e:#}");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
if !layer.is_likely_resident() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let last_activity_ts = layer.access_stats().latest_activity_or_now();
|
||||
let last_activity_ts = hist_layer.access_stats().latest_activity_or_now();
|
||||
|
||||
let no_activity_for = match now.duration_since(last_activity_ts) {
|
||||
Ok(d) => d,
|
||||
@@ -259,8 +265,9 @@ impl Timeline {
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
let layer = guard.drop_eviction_guard();
|
||||
if no_activity_for > p.threshold {
|
||||
// this could cause a lot of allocations in some cases
|
||||
js.spawn(async move {
|
||||
layer
|
||||
.evict_and_wait(std::time::Duration::from_secs(5))
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use futures::StreamExt;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
use tracing::trace;
|
||||
@@ -240,16 +241,29 @@ impl LayerManager {
|
||||
layer.delete_on_drop();
|
||||
}
|
||||
|
||||
pub(crate) fn likely_resident_layers(&self) -> impl Iterator<Item = Layer> + '_ {
|
||||
pub(crate) fn resident_layers(&self) -> impl futures::stream::Stream<Item = Layer> + '_ {
|
||||
// for small layer maps, we most likely have all resident, but for larger more are likely
|
||||
// to be evicted assuming lots of layers correlated with longer lifespan.
|
||||
|
||||
self.layer_map().iter_historic_layers().filter_map(|desc| {
|
||||
self.layer_fmgr
|
||||
.0
|
||||
.get(&desc.key())
|
||||
.filter(|l| l.is_likely_resident())
|
||||
.cloned()
|
||||
let layers = self
|
||||
.layer_map()
|
||||
.iter_historic_layers()
|
||||
.map(|desc| self.get_from_desc(&desc));
|
||||
|
||||
let layers = futures::stream::iter(layers);
|
||||
|
||||
layers.filter_map(|layer| async move {
|
||||
// TODO(#6028): this query does not really need to see the ResidentLayer
|
||||
match layer.keep_resident().await {
|
||||
Ok(Some(layer)) => Some(layer.drop_eviction_guard()),
|
||||
Ok(None) => None,
|
||||
Err(e) => {
|
||||
// these should not happen, but we cannot make them statically impossible right
|
||||
// now.
|
||||
tracing::warn!(%layer, "failed to keep the layer resident: {e:#}");
|
||||
None
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
@@ -549,10 +549,9 @@ walprop_pg_init_standalone_sync_safekeepers(void)
|
||||
static void
|
||||
walprop_sigusr2(SIGNAL_ARGS)
|
||||
{
|
||||
int save_errno = errno;
|
||||
got_SIGUSR2 = true;
|
||||
|
||||
SetLatch(MyLatch);
|
||||
errno = save_errno;
|
||||
}
|
||||
|
||||
static void
|
||||
|
||||
@@ -59,11 +59,10 @@ rustls.workspace = true
|
||||
scopeguard.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
sha2 = { workspace = true, features = ["asm"] }
|
||||
sha2.workspace = true
|
||||
smol_str.workspace = true
|
||||
smallvec.workspace = true
|
||||
socket2.workspace = true
|
||||
subtle.workspace = true
|
||||
sync_wrapper.workspace = true
|
||||
task-local-extensions.workspace = true
|
||||
thiserror.workspace = true
|
||||
@@ -92,7 +91,6 @@ workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
camino-tempfile.workspace = true
|
||||
fallible-iterator.workspace = true
|
||||
rcgen.workspace = true
|
||||
rstest.workspace = true
|
||||
tokio-postgres-rustls.workspace = true
|
||||
|
||||
@@ -408,228 +408,3 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::sync::Arc;
|
||||
|
||||
use bytes::BytesMut;
|
||||
use fallible_iterator::FallibleIterator;
|
||||
use postgres_protocol::{
|
||||
authentication::sasl::{ChannelBinding, ScramSha256},
|
||||
message::{backend::Message as PgMessage, frontend},
|
||||
};
|
||||
use provider::AuthSecret;
|
||||
use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt};
|
||||
|
||||
use crate::{
|
||||
auth::{ComputeUserInfoMaybeEndpoint, IpPattern},
|
||||
config::AuthenticationConfig,
|
||||
console::{
|
||||
self,
|
||||
provider::{self, CachedAllowedIps, CachedRoleSecret},
|
||||
CachedNodeInfo,
|
||||
},
|
||||
context::RequestMonitoring,
|
||||
proxy::NeonOptions,
|
||||
scram::ServerSecret,
|
||||
stream::{PqStream, Stream},
|
||||
};
|
||||
|
||||
use super::auth_quirks;
|
||||
|
||||
struct Auth {
|
||||
ips: Vec<IpPattern>,
|
||||
secret: AuthSecret,
|
||||
}
|
||||
|
||||
impl console::Api for Auth {
|
||||
async fn get_role_secret(
|
||||
&self,
|
||||
_ctx: &mut RequestMonitoring,
|
||||
_user_info: &super::ComputeUserInfo,
|
||||
) -> Result<CachedRoleSecret, console::errors::GetAuthInfoError> {
|
||||
Ok(CachedRoleSecret::new_uncached(Some(self.secret.clone())))
|
||||
}
|
||||
|
||||
async fn get_allowed_ips_and_secret(
|
||||
&self,
|
||||
_ctx: &mut RequestMonitoring,
|
||||
_user_info: &super::ComputeUserInfo,
|
||||
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), console::errors::GetAuthInfoError>
|
||||
{
|
||||
Ok((
|
||||
CachedAllowedIps::new_uncached(Arc::new(self.ips.clone())),
|
||||
Some(CachedRoleSecret::new_uncached(Some(self.secret.clone()))),
|
||||
))
|
||||
}
|
||||
|
||||
async fn wake_compute(
|
||||
&self,
|
||||
_ctx: &mut RequestMonitoring,
|
||||
_user_info: &super::ComputeUserInfo,
|
||||
) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
|
||||
unimplemented!()
|
||||
}
|
||||
}
|
||||
|
||||
static CONFIG: &AuthenticationConfig = &AuthenticationConfig {
|
||||
scram_protocol_timeout: std::time::Duration::from_secs(5),
|
||||
};
|
||||
|
||||
async fn read_message(r: &mut (impl AsyncRead + Unpin), b: &mut BytesMut) -> PgMessage {
|
||||
loop {
|
||||
r.read_buf(&mut *b).await.unwrap();
|
||||
if let Some(m) = PgMessage::parse(&mut *b).unwrap() {
|
||||
break m;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn auth_quirks_scram() {
|
||||
let (mut client, server) = tokio::io::duplex(1024);
|
||||
let mut stream = PqStream::new(Stream::from_raw(server));
|
||||
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let api = Auth {
|
||||
ips: vec![],
|
||||
secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
|
||||
};
|
||||
|
||||
let user_info = ComputeUserInfoMaybeEndpoint {
|
||||
user: "conrad".into(),
|
||||
endpoint_id: Some("endpoint".into()),
|
||||
options: NeonOptions::default(),
|
||||
};
|
||||
|
||||
let handle = tokio::spawn(async move {
|
||||
let mut scram = ScramSha256::new(b"my-secret-password", ChannelBinding::unsupported());
|
||||
|
||||
let mut read = BytesMut::new();
|
||||
|
||||
// server should offer scram
|
||||
match read_message(&mut client, &mut read).await {
|
||||
PgMessage::AuthenticationSasl(a) => {
|
||||
let options: Vec<&str> = a.mechanisms().collect().unwrap();
|
||||
assert_eq!(options, ["SCRAM-SHA-256"]);
|
||||
}
|
||||
_ => panic!("wrong message"),
|
||||
}
|
||||
|
||||
// client sends client-first-message
|
||||
let mut write = BytesMut::new();
|
||||
frontend::sasl_initial_response("SCRAM-SHA-256", scram.message(), &mut write).unwrap();
|
||||
client.write_all(&write).await.unwrap();
|
||||
|
||||
// server response with server-first-message
|
||||
match read_message(&mut client, &mut read).await {
|
||||
PgMessage::AuthenticationSaslContinue(a) => {
|
||||
scram.update(a.data()).await.unwrap();
|
||||
}
|
||||
_ => panic!("wrong message"),
|
||||
}
|
||||
|
||||
// client response with client-final-message
|
||||
write.clear();
|
||||
frontend::sasl_response(scram.message(), &mut write).unwrap();
|
||||
client.write_all(&write).await.unwrap();
|
||||
|
||||
// server response with server-final-message
|
||||
match read_message(&mut client, &mut read).await {
|
||||
PgMessage::AuthenticationSaslFinal(a) => {
|
||||
scram.finish(a.data()).unwrap();
|
||||
}
|
||||
_ => panic!("wrong message"),
|
||||
}
|
||||
});
|
||||
|
||||
let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, false, CONFIG)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
handle.await.unwrap();
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn auth_quirks_cleartext() {
|
||||
let (mut client, server) = tokio::io::duplex(1024);
|
||||
let mut stream = PqStream::new(Stream::from_raw(server));
|
||||
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let api = Auth {
|
||||
ips: vec![],
|
||||
secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
|
||||
};
|
||||
|
||||
let user_info = ComputeUserInfoMaybeEndpoint {
|
||||
user: "conrad".into(),
|
||||
endpoint_id: Some("endpoint".into()),
|
||||
options: NeonOptions::default(),
|
||||
};
|
||||
|
||||
let handle = tokio::spawn(async move {
|
||||
let mut read = BytesMut::new();
|
||||
let mut write = BytesMut::new();
|
||||
|
||||
// server should offer cleartext
|
||||
match read_message(&mut client, &mut read).await {
|
||||
PgMessage::AuthenticationCleartextPassword => {}
|
||||
_ => panic!("wrong message"),
|
||||
}
|
||||
|
||||
// client responds with password
|
||||
write.clear();
|
||||
frontend::password_message(b"my-secret-password", &mut write).unwrap();
|
||||
client.write_all(&write).await.unwrap();
|
||||
});
|
||||
|
||||
let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, CONFIG)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
handle.await.unwrap();
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn auth_quirks_password_hack() {
|
||||
let (mut client, server) = tokio::io::duplex(1024);
|
||||
let mut stream = PqStream::new(Stream::from_raw(server));
|
||||
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let api = Auth {
|
||||
ips: vec![],
|
||||
secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
|
||||
};
|
||||
|
||||
let user_info = ComputeUserInfoMaybeEndpoint {
|
||||
user: "conrad".into(),
|
||||
endpoint_id: None,
|
||||
options: NeonOptions::default(),
|
||||
};
|
||||
|
||||
let handle = tokio::spawn(async move {
|
||||
let mut read = BytesMut::new();
|
||||
|
||||
// server should offer cleartext
|
||||
match read_message(&mut client, &mut read).await {
|
||||
PgMessage::AuthenticationCleartextPassword => {}
|
||||
_ => panic!("wrong message"),
|
||||
}
|
||||
|
||||
// client responds with password
|
||||
let mut write = BytesMut::new();
|
||||
frontend::password_message(b"endpoint=my-endpoint;my-secret-password", &mut write)
|
||||
.unwrap();
|
||||
client.write_all(&write).await.unwrap();
|
||||
});
|
||||
|
||||
let creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, CONFIG)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(creds.info.endpoint, "my-endpoint");
|
||||
|
||||
handle.await.unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -194,7 +194,14 @@ pub(crate) async fn validate_password_and_exchange(
|
||||
}
|
||||
// perform scram authentication as both client and server to validate the keys
|
||||
AuthSecret::Scram(scram_secret) => {
|
||||
let outcome = crate::scram::exchange(&scram_secret, password).await?;
|
||||
use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256};
|
||||
let sasl_client = ScramSha256::new(password, ChannelBinding::unsupported());
|
||||
let outcome = crate::scram::exchange(
|
||||
&scram_secret,
|
||||
sasl_client,
|
||||
crate::config::TlsServerEndPoint::Undefined,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let client_key = match outcome {
|
||||
sasl::Outcome::Success(client_key) => client_key,
|
||||
|
||||
@@ -82,13 +82,14 @@ pub type ScramKeys = tokio_postgres::config::ScramKeys<32>;
|
||||
/// A config for establishing a connection to compute node.
|
||||
/// Eventually, `tokio_postgres` will be replaced with something better.
|
||||
/// Newtype allows us to implement methods on top of it.
|
||||
#[derive(Clone, Default)]
|
||||
#[derive(Clone)]
|
||||
#[repr(transparent)]
|
||||
pub struct ConnCfg(Box<tokio_postgres::Config>);
|
||||
|
||||
/// Creation and initialization routines.
|
||||
impl ConnCfg {
|
||||
pub fn new() -> Self {
|
||||
Self::default()
|
||||
Self(Default::default())
|
||||
}
|
||||
|
||||
/// Reuse password or auth keys from the other config.
|
||||
@@ -164,6 +165,12 @@ impl std::ops::DerefMut for ConnCfg {
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ConnCfg {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl ConnCfg {
|
||||
/// Establish a raw TCP connection to the compute node.
|
||||
async fn connect_raw(&self, timeout: Duration) -> io::Result<(SocketAddr, TcpStream, &str)> {
|
||||
|
||||
@@ -6,7 +6,7 @@ pub mod messages;
|
||||
|
||||
/// Wrappers for console APIs and their mocks.
|
||||
pub mod provider;
|
||||
pub(crate) use provider::{errors, Api, AuthSecret, CachedNodeInfo, NodeInfo};
|
||||
pub use provider::{errors, Api, AuthSecret, CachedNodeInfo, NodeInfo};
|
||||
|
||||
/// Various cache-related types.
|
||||
pub mod caches {
|
||||
|
||||
@@ -14,6 +14,7 @@ use crate::{
|
||||
context::RequestMonitoring,
|
||||
scram, EndpointCacheKey, ProjectId,
|
||||
};
|
||||
use async_trait::async_trait;
|
||||
use dashmap::DashMap;
|
||||
use std::{sync::Arc, time::Duration};
|
||||
use tokio::sync::{OwnedSemaphorePermit, Semaphore};
|
||||
@@ -325,7 +326,8 @@ pub type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc<Vec<IpPatt
|
||||
|
||||
/// This will allocate per each call, but the http requests alone
|
||||
/// already require a few allocations, so it should be fine.
|
||||
pub(crate) trait Api {
|
||||
#[async_trait]
|
||||
pub trait Api {
|
||||
/// Get the client's auth secret for authentication.
|
||||
/// Returns option because user not found situation is special.
|
||||
/// We still have to mock the scram to avoid leaking information that user doesn't exist.
|
||||
@@ -361,6 +363,7 @@ pub enum ConsoleBackend {
|
||||
Test(Box<dyn crate::auth::backend::TestBackend>),
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Api for ConsoleBackend {
|
||||
async fn get_role_secret(
|
||||
&self,
|
||||
|
||||
@@ -8,6 +8,7 @@ use crate::console::provider::{CachedAllowedIps, CachedRoleSecret};
|
||||
use crate::context::RequestMonitoring;
|
||||
use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl};
|
||||
use crate::{auth::IpPattern, cache::Cached};
|
||||
use async_trait::async_trait;
|
||||
use futures::TryFutureExt;
|
||||
use std::{str::FromStr, sync::Arc};
|
||||
use thiserror::Error;
|
||||
@@ -143,6 +144,7 @@ async fn get_execute_postgres_query(
|
||||
Ok(Some(entry))
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl super::Api for Api {
|
||||
#[tracing::instrument(skip_all)]
|
||||
async fn get_role_secret(
|
||||
|
||||
@@ -14,6 +14,7 @@ use crate::{
|
||||
context::RequestMonitoring,
|
||||
metrics::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER},
|
||||
};
|
||||
use async_trait::async_trait;
|
||||
use futures::TryFutureExt;
|
||||
use std::sync::Arc;
|
||||
use tokio::time::Instant;
|
||||
@@ -167,6 +168,7 @@ impl Api {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl super::Api for Api {
|
||||
#[tracing::instrument(skip_all)]
|
||||
async fn get_role_secret(
|
||||
|
||||
@@ -2,21 +2,14 @@ use anyhow::{anyhow, bail};
|
||||
use hyper::{Body, Request, Response, StatusCode};
|
||||
use std::{convert::Infallible, net::TcpListener};
|
||||
use tracing::info;
|
||||
use utils::http::{
|
||||
endpoint::{self, prometheus_metrics_handler, request_span},
|
||||
error::ApiError,
|
||||
json::json_response,
|
||||
RouterBuilder, RouterService,
|
||||
};
|
||||
use utils::http::{endpoint, error::ApiError, json::json_response, RouterBuilder, RouterService};
|
||||
|
||||
async fn status_handler(_: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
json_response(StatusCode::OK, "")
|
||||
}
|
||||
|
||||
fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
|
||||
endpoint::make_router()
|
||||
.get("/metrics", |r| request_span(r, prometheus_metrics_handler))
|
||||
.get("/v1/status", status_handler)
|
||||
endpoint::make_router().get("/v1/status", status_handler)
|
||||
}
|
||||
|
||||
pub async fn task_main(http_listener: TcpListener) -> anyhow::Result<Infallible> {
|
||||
|
||||
@@ -135,10 +135,9 @@ impl TestAuth for NoAuth {}
|
||||
struct Scram(scram::ServerSecret);
|
||||
|
||||
impl Scram {
|
||||
async fn new(password: &str) -> anyhow::Result<Self> {
|
||||
let secret = scram::ServerSecret::build(password)
|
||||
.await
|
||||
.context("failed to generate scram secret")?;
|
||||
fn new(password: &str) -> anyhow::Result<Self> {
|
||||
let secret =
|
||||
scram::ServerSecret::build(password).context("failed to generate scram secret")?;
|
||||
Ok(Scram(secret))
|
||||
}
|
||||
|
||||
@@ -285,7 +284,7 @@ async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> {
|
||||
let proxy = tokio::spawn(dummy_proxy(
|
||||
client,
|
||||
Some(server_config),
|
||||
Scram::new(password).await?,
|
||||
Scram::new(password)?,
|
||||
));
|
||||
|
||||
let (_client, _conn) = tokio_postgres::Config::new()
|
||||
@@ -309,7 +308,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> {
|
||||
let proxy = tokio::spawn(dummy_proxy(
|
||||
client,
|
||||
Some(server_config),
|
||||
Scram::new("password").await?,
|
||||
Scram::new("password")?,
|
||||
));
|
||||
|
||||
let (_client, _conn) = tokio_postgres::Config::new()
|
||||
|
||||
@@ -148,7 +148,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> {
|
||||
let proxy = tokio::spawn(dummy_proxy(
|
||||
client,
|
||||
Some(server_config),
|
||||
Scram::new("password").await?,
|
||||
Scram::new("password")?,
|
||||
));
|
||||
|
||||
let _client_err = tokio_postgres::Config::new()
|
||||
@@ -231,7 +231,7 @@ async fn connect_failure(
|
||||
let proxy = tokio::spawn(dummy_proxy(
|
||||
client,
|
||||
Some(server_config),
|
||||
Scram::new("password").await?,
|
||||
Scram::new("password")?,
|
||||
));
|
||||
|
||||
let _client_err = tokio_postgres::Config::new()
|
||||
|
||||
@@ -33,9 +33,6 @@ pub enum Error {
|
||||
#[error("Internal error: missing digest")]
|
||||
MissingBinding,
|
||||
|
||||
#[error("could not decode salt: {0}")]
|
||||
Base64(#[from] base64::DecodeError),
|
||||
|
||||
#[error(transparent)]
|
||||
Io(#[from] io::Error),
|
||||
}
|
||||
@@ -58,7 +55,6 @@ impl ReportableError for Error {
|
||||
Error::ChannelBindingBadMethod(_) => crate::error::ErrorKind::User,
|
||||
Error::BadClientMessage(_) => crate::error::ErrorKind::User,
|
||||
Error::MissingBinding => crate::error::ErrorKind::Service,
|
||||
Error::Base64(_) => crate::error::ErrorKind::ControlPlane,
|
||||
Error::Io(_) => crate::error::ErrorKind::ClientDisconnect,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -56,6 +56,8 @@ fn sha256<'a>(parts: impl IntoIterator<Item = &'a [u8]>) -> [u8; 32] {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256};
|
||||
|
||||
use crate::sasl::{Mechanism, Step};
|
||||
|
||||
use super::{Exchange, ServerSecret};
|
||||
@@ -112,10 +114,17 @@ mod tests {
|
||||
}
|
||||
|
||||
async fn run_round_trip_test(server_password: &str, client_password: &str) {
|
||||
let scram_secret = ServerSecret::build(server_password).await.unwrap();
|
||||
let outcome = super::exchange(&scram_secret, client_password.as_bytes())
|
||||
.await
|
||||
.unwrap();
|
||||
let scram_secret = ServerSecret::build(server_password).unwrap();
|
||||
let sasl_client =
|
||||
ScramSha256::new(client_password.as_bytes(), ChannelBinding::unsupported());
|
||||
|
||||
let outcome = super::exchange(
|
||||
&scram_secret,
|
||||
sasl_client,
|
||||
crate::config::TlsServerEndPoint::Undefined,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
match outcome {
|
||||
crate::sasl::Outcome::Success(_) => {}
|
||||
|
||||
@@ -2,16 +2,13 @@
|
||||
|
||||
use std::convert::Infallible;
|
||||
|
||||
use hmac::{Hmac, Mac};
|
||||
use sha2::Sha256;
|
||||
use tokio::task::yield_now;
|
||||
use postgres_protocol::authentication::sasl::ScramSha256;
|
||||
|
||||
use super::messages::{
|
||||
ClientFinalMessage, ClientFirstMessage, OwnedServerFirstMessage, SCRAM_RAW_NONCE_LEN,
|
||||
};
|
||||
use super::secret::ServerSecret;
|
||||
use super::signature::SignatureBuilder;
|
||||
use super::ScramKey;
|
||||
use crate::config;
|
||||
use crate::sasl::{self, ChannelBinding, Error as SaslError};
|
||||
|
||||
@@ -74,62 +71,47 @@ impl<'a> Exchange<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
// copied from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L36-L61>
|
||||
async fn pbkdf2(str: &[u8], salt: &[u8], iterations: u32) -> [u8; 32] {
|
||||
let hmac = Hmac::<Sha256>::new_from_slice(str).expect("HMAC is able to accept all key sizes");
|
||||
let mut prev = hmac
|
||||
.clone()
|
||||
.chain_update(salt)
|
||||
.chain_update(1u32.to_be_bytes())
|
||||
.finalize()
|
||||
.into_bytes();
|
||||
|
||||
let mut hi = prev;
|
||||
|
||||
for i in 1..iterations {
|
||||
prev = hmac.clone().chain_update(prev).finalize().into_bytes();
|
||||
|
||||
for (hi, prev) in hi.iter_mut().zip(prev) {
|
||||
*hi ^= prev;
|
||||
}
|
||||
// yield every ~250us
|
||||
// hopefully reduces tail latencies
|
||||
if i % 1024 == 0 {
|
||||
yield_now().await
|
||||
}
|
||||
}
|
||||
|
||||
hi.into()
|
||||
}
|
||||
|
||||
// copied from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L236-L248>
|
||||
async fn derive_client_key(password: &[u8], salt: &[u8], iterations: u32) -> ScramKey {
|
||||
let salted_password = pbkdf2(password, salt, iterations).await;
|
||||
|
||||
let make_key = |name| {
|
||||
let key = Hmac::<Sha256>::new_from_slice(&salted_password)
|
||||
.expect("HMAC is able to accept all key sizes")
|
||||
.chain_update(name)
|
||||
.finalize();
|
||||
|
||||
<[u8; 32]>::from(key.into_bytes())
|
||||
};
|
||||
|
||||
make_key(b"Client Key").into()
|
||||
}
|
||||
|
||||
pub async fn exchange(
|
||||
secret: &ServerSecret,
|
||||
password: &[u8],
|
||||
mut client: ScramSha256,
|
||||
tls_server_end_point: config::TlsServerEndPoint,
|
||||
) -> sasl::Result<sasl::Outcome<super::ScramKey>> {
|
||||
let salt = base64::decode(&secret.salt_base64)?;
|
||||
let client_key = derive_client_key(password, &salt, secret.iterations).await;
|
||||
use sasl::Step::*;
|
||||
|
||||
if secret.is_password_invalid(&client_key).into() {
|
||||
Ok(sasl::Outcome::Failure("password doesn't match"))
|
||||
} else {
|
||||
Ok(sasl::Outcome::Success(client_key))
|
||||
}
|
||||
let init = SaslInitial {
|
||||
nonce: rand::random,
|
||||
};
|
||||
|
||||
let client_first = std::str::from_utf8(client.message())
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
|
||||
let sent = match init.transition(secret, &tls_server_end_point, client_first)? {
|
||||
Continue(sent, server_first) => {
|
||||
// `client.update` might perform `pbkdf2(pw)`, best to spawn it in a blocking thread.
|
||||
// TODO(conrad): take this code from tokio-postgres and make an async-aware pbkdf2 impl
|
||||
client = tokio::task::spawn_blocking(move || {
|
||||
client.update(server_first.as_bytes())?;
|
||||
Ok::<ScramSha256, std::io::Error>(client)
|
||||
})
|
||||
.await
|
||||
.expect("should not panic while performing password hash")?;
|
||||
sent
|
||||
}
|
||||
Success(x, _) => match x {},
|
||||
Failure(msg) => return Ok(sasl::Outcome::Failure(msg)),
|
||||
};
|
||||
|
||||
let client_final = std::str::from_utf8(client.message())
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
|
||||
let keys = match sent.transition(secret, &tls_server_end_point, client_final)? {
|
||||
Success(keys, server_final) => {
|
||||
client.finish(server_final.as_bytes())?;
|
||||
keys
|
||||
}
|
||||
Continue(x, _) => match x {},
|
||||
Failure(msg) => return Ok(sasl::Outcome::Failure(msg)),
|
||||
};
|
||||
|
||||
Ok(sasl::Outcome::Success(keys))
|
||||
}
|
||||
|
||||
impl SaslInitial {
|
||||
@@ -210,7 +192,7 @@ impl SaslSentInner {
|
||||
.derive_client_key(&client_final_message.proof);
|
||||
|
||||
// Auth fails either if keys don't match or it's pre-determined to fail.
|
||||
if secret.is_password_invalid(&client_key).into() {
|
||||
if client_key.sha256() != secret.stored_key || secret.doomed {
|
||||
return Ok(sasl::Step::Failure("password doesn't match"));
|
||||
}
|
||||
|
||||
|
||||
@@ -1,31 +1,17 @@
|
||||
//! Tools for client/server/stored key management.
|
||||
|
||||
use subtle::ConstantTimeEq;
|
||||
|
||||
/// Faithfully taken from PostgreSQL.
|
||||
pub const SCRAM_KEY_LEN: usize = 32;
|
||||
|
||||
/// One of the keys derived from the user's password.
|
||||
/// We use the same structure for all keys, i.e.
|
||||
/// `ClientKey`, `StoredKey`, and `ServerKey`.
|
||||
#[derive(Clone, Default, Eq, Debug)]
|
||||
#[derive(Clone, Default, PartialEq, Eq, Debug)]
|
||||
#[repr(transparent)]
|
||||
pub struct ScramKey {
|
||||
bytes: [u8; SCRAM_KEY_LEN],
|
||||
}
|
||||
|
||||
impl PartialEq for ScramKey {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.ct_eq(other).into()
|
||||
}
|
||||
}
|
||||
|
||||
impl ConstantTimeEq for ScramKey {
|
||||
fn ct_eq(&self, other: &Self) -> subtle::Choice {
|
||||
self.bytes.ct_eq(&other.bytes)
|
||||
}
|
||||
}
|
||||
|
||||
impl ScramKey {
|
||||
pub fn sha256(&self) -> Self {
|
||||
super::sha256([self.as_ref()]).into()
|
||||
|
||||
@@ -206,28 +206,6 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_client_first_message_with_invalid_gs2_authz() {
|
||||
assert!(ClientFirstMessage::parse("n,authzid,n=user,r=nonce").is_none())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_client_first_message_with_extra_params() {
|
||||
let msg = ClientFirstMessage::parse("n,,n=user,r=nonce,a=foo,b=bar,c=baz").unwrap();
|
||||
assert_eq!(msg.bare, "n=user,r=nonce,a=foo,b=bar,c=baz");
|
||||
assert_eq!(msg.username, "user");
|
||||
assert_eq!(msg.nonce, "nonce");
|
||||
assert_eq!(msg.cbind_flag, ChannelBinding::NotSupportedClient);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_client_first_message_with_extra_params_invalid() {
|
||||
// must be of the form `<ascii letter>=<...>`
|
||||
assert!(ClientFirstMessage::parse("n,,n=user,r=nonce,abc=foo").is_none());
|
||||
assert!(ClientFirstMessage::parse("n,,n=user,r=nonce,1=foo").is_none());
|
||||
assert!(ClientFirstMessage::parse("n,,n=user,r=nonce,a").is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_client_final_message() {
|
||||
let input = [
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
//! Tools for SCRAM server secret management.
|
||||
|
||||
use subtle::{Choice, ConstantTimeEq};
|
||||
|
||||
use super::base64_decode_array;
|
||||
use super::key::ScramKey;
|
||||
|
||||
@@ -42,11 +40,6 @@ impl ServerSecret {
|
||||
Some(secret)
|
||||
}
|
||||
|
||||
pub fn is_password_invalid(&self, client_key: &ScramKey) -> Choice {
|
||||
// constant time to not leak partial key match
|
||||
client_key.sha256().ct_ne(&self.stored_key) | Choice::from(self.doomed as u8)
|
||||
}
|
||||
|
||||
/// To avoid revealing information to an attacker, we use a
|
||||
/// mocked server secret even if the user doesn't exist.
|
||||
/// See `auth-scram.c : mock_scram_secret` for details.
|
||||
@@ -66,8 +59,10 @@ impl ServerSecret {
|
||||
/// Build a new server secret from the prerequisites.
|
||||
/// XXX: We only use this function in tests.
|
||||
#[cfg(test)]
|
||||
pub async fn build(password: &str) -> Option<Self> {
|
||||
Self::parse(&postgres_protocol::password::scram_sha_256(password.as_bytes()).await)
|
||||
pub fn build(password: &str) -> Option<Self> {
|
||||
Self::parse(&postgres_protocol::password::scram_sha_256(
|
||||
password.as_bytes(),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[toolchain]
|
||||
channel = "1.77.0"
|
||||
channel = "1.76.0"
|
||||
profile = "default"
|
||||
# The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
|
||||
# https://rust-lang.github.io/rustup/concepts/profiles.html
|
||||
|
||||
@@ -225,7 +225,6 @@ async fn write_segment(
|
||||
assert!(from <= to);
|
||||
assert!(to <= wal_seg_size);
|
||||
|
||||
#[allow(clippy::suspicious_open_options)]
|
||||
let mut file = OpenOptions::new()
|
||||
.create(true)
|
||||
.write(true)
|
||||
|
||||
@@ -20,7 +20,7 @@ use std::io::Write as _;
|
||||
use tokio::sync::mpsc;
|
||||
use tokio_stream::wrappers::ReceiverStream;
|
||||
use tracing::{info_span, Instrument};
|
||||
use utils::http::endpoint::{prometheus_metrics_handler, request_span, ChannelWriter};
|
||||
use utils::http::endpoint::{request_span, ChannelWriter};
|
||||
|
||||
use crate::debug_dump::TimelineDigestRequest;
|
||||
use crate::receive_wal::WalReceiverState;
|
||||
@@ -515,7 +515,6 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
|
||||
router
|
||||
.data(Arc::new(conf))
|
||||
.data(auth)
|
||||
.get("/metrics", |r| request_span(r, prometheus_metrics_handler))
|
||||
.get("/v1/status", |r| request_span(r, status_handler))
|
||||
.put("/v1/failpoints", |r| {
|
||||
request_span(r, move |r| async {
|
||||
|
||||
@@ -221,7 +221,6 @@ impl PhysicalStorage {
|
||||
// half initialized segment, first bake it under tmp filename and
|
||||
// then rename.
|
||||
let tmp_path = self.timeline_dir.join("waltmp");
|
||||
#[allow(clippy::suspicious_open_options)]
|
||||
let mut file = OpenOptions::new()
|
||||
.create(true)
|
||||
.write(true)
|
||||
|
||||
@@ -51,7 +51,7 @@ from fixtures.log_helper import log
|
||||
from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
|
||||
from fixtures.pageserver.allowed_errors import (
|
||||
DEFAULT_PAGESERVER_ALLOWED_ERRORS,
|
||||
DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS,
|
||||
scan_pageserver_log_for_errors,
|
||||
)
|
||||
from fixtures.pageserver.http import PageserverHttpClient
|
||||
from fixtures.pageserver.types import IndexPartDump
|
||||
@@ -77,7 +77,6 @@ from fixtures.utils import (
|
||||
ATTACHMENT_NAME_REGEX,
|
||||
allure_add_grafana_links,
|
||||
allure_attach_from_dir,
|
||||
assert_no_errors,
|
||||
get_self_dir,
|
||||
subprocess_capture,
|
||||
wait_until,
|
||||
@@ -945,8 +944,6 @@ class NeonEnvBuilder:
|
||||
for pageserver in self.env.pageservers:
|
||||
pageserver.assert_no_errors()
|
||||
|
||||
self.env.storage_controller.assert_no_errors()
|
||||
|
||||
try:
|
||||
self.overlay_cleanup_teardown()
|
||||
except Exception as e:
|
||||
@@ -1892,6 +1889,19 @@ class NeonCli(AbstractNeonCli):
|
||||
|
||||
return self.raw_cli(args, check_return_code=True)
|
||||
|
||||
def tenant_migrate(
|
||||
self, tenant_shard_id: TenantShardId, new_pageserver: int, timeout_secs: Optional[int]
|
||||
):
|
||||
args = [
|
||||
"tenant",
|
||||
"migrate",
|
||||
"--tenant-id",
|
||||
str(tenant_shard_id),
|
||||
"--id",
|
||||
str(new_pageserver),
|
||||
]
|
||||
return self.raw_cli(args, check_return_code=True, timeout=timeout_secs)
|
||||
|
||||
def start(self, check_return_code=True) -> "subprocess.CompletedProcess[str]":
|
||||
return self.raw_cli(["start"], check_return_code=check_return_code)
|
||||
|
||||
@@ -1951,7 +1961,6 @@ class NeonStorageController(MetricsGetter):
|
||||
self.env = env
|
||||
self.running = False
|
||||
self.auth_enabled = auth_enabled
|
||||
self.allowed_errors: list[str] = DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS
|
||||
|
||||
def start(self):
|
||||
assert not self.running
|
||||
@@ -1976,11 +1985,6 @@ class NeonStorageController(MetricsGetter):
|
||||
msg = ""
|
||||
raise StorageControllerApiException(msg, res.status_code) from e
|
||||
|
||||
def assert_no_errors(self):
|
||||
assert_no_errors(
|
||||
self.env.repo_dir / "storage_controller.log", "storage_controller", self.allowed_errors
|
||||
)
|
||||
|
||||
def pageserver_api(self) -> PageserverHttpClient:
|
||||
"""
|
||||
The storage controller implements a subset of the pageserver REST API, for mapping
|
||||
@@ -2143,25 +2147,13 @@ class NeonStorageController(MetricsGetter):
|
||||
"""
|
||||
response = self.request(
|
||||
"GET",
|
||||
f"{self.env.storage_controller_api}/debug/v1/tenant/{tenant_id}/locate",
|
||||
f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/locate",
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
)
|
||||
body = response.json()
|
||||
shards: list[dict[str, Any]] = body["shards"]
|
||||
return shards
|
||||
|
||||
def tenant_describe(self, tenant_id: TenantId):
|
||||
"""
|
||||
:return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int}
|
||||
"""
|
||||
response = self.request(
|
||||
"GET",
|
||||
f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}",
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
)
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
|
||||
def tenant_shard_split(
|
||||
self, tenant_id: TenantId, shard_count: int, shard_stripe_size: Optional[int] = None
|
||||
) -> list[TenantShardId]:
|
||||
@@ -2365,9 +2357,18 @@ class NeonPageserver(PgProtocol):
|
||||
return self.env.repo_dir / f"pageserver_{self.id}"
|
||||
|
||||
def assert_no_errors(self):
|
||||
assert_no_errors(
|
||||
self.workdir / "pageserver.log", f"pageserver_{self.id}", self.allowed_errors
|
||||
)
|
||||
logfile = self.workdir / "pageserver.log"
|
||||
if not logfile.exists():
|
||||
log.warning(f"Skipping log check: {logfile} does not exist")
|
||||
return
|
||||
|
||||
with logfile.open("r") as f:
|
||||
errors = scan_pageserver_log_for_errors(f, self.allowed_errors)
|
||||
|
||||
for _lineno, error in errors:
|
||||
log.info(f"not allowed error: {error.strip()}")
|
||||
|
||||
assert not errors
|
||||
|
||||
def assert_no_metric_errors(self):
|
||||
"""
|
||||
|
||||
@@ -89,16 +89,6 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
|
||||
)
|
||||
|
||||
|
||||
DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [
|
||||
# Many tests will take pageservers offline, resulting in log warnings on the controller
|
||||
# failing to connect to them.
|
||||
".*Call to node.*management API.*failed.*receive body.*",
|
||||
".*Call to node.*management API.*failed.*ReceiveBody.*",
|
||||
# Many tests will start up with a node offline
|
||||
".*startup_reconcile: Could not scan node.*",
|
||||
]
|
||||
|
||||
|
||||
def _check_allowed_errors(input):
|
||||
allowed_errors: List[str] = list(DEFAULT_PAGESERVER_ALLOWED_ERRORS)
|
||||
|
||||
|
||||
@@ -626,17 +626,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
|
||||
res_json = res.json()
|
||||
return res_json
|
||||
|
||||
def timeline_layer_map_info(
|
||||
self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
|
||||
):
|
||||
log.info(f"Requesting layer map info of tenant {tenant_id}, timeline {timeline_id}")
|
||||
res = self.get(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer",
|
||||
)
|
||||
self.verbose_error(res)
|
||||
res_json = res.json()
|
||||
return res_json
|
||||
|
||||
def timeline_checkpoint(
|
||||
self,
|
||||
tenant_id: Union[TenantId, TenantShardId],
|
||||
|
||||
@@ -158,9 +158,6 @@ class TenantShardId:
|
||||
def __str__(self):
|
||||
return f"{self.tenant_id}-{self.shard_number:02x}{self.shard_count:02x}"
|
||||
|
||||
def __repr__(self):
|
||||
return self.__str__()
|
||||
|
||||
def _tuple(self) -> tuple[TenantId, int, int]:
|
||||
return (self.tenant_id, self.shard_number, self.shard_count)
|
||||
|
||||
|
||||
@@ -11,7 +11,6 @@ from typing import (
|
||||
Any,
|
||||
Callable,
|
||||
Dict,
|
||||
Iterable,
|
||||
List,
|
||||
Optional,
|
||||
Tuple,
|
||||
@@ -448,39 +447,3 @@ def humantime_to_ms(humantime: str) -> float:
|
||||
)
|
||||
|
||||
return round(total_ms, 3)
|
||||
|
||||
|
||||
def scan_log_for_errors(input: Iterable[str], allowed_errors: List[str]) -> List[Tuple[int, str]]:
|
||||
error_or_warn = re.compile(r"\s(ERROR|WARN)")
|
||||
errors = []
|
||||
for lineno, line in enumerate(input, start=1):
|
||||
if len(line) == 0:
|
||||
continue
|
||||
|
||||
if error_or_warn.search(line):
|
||||
# Is this a torn log line? This happens when force-killing a process and restarting
|
||||
# Example: "2023-10-25T09:38:31.752314Z WARN deletion executo2023-10-25T09:38:31.875947Z INFO version: git-env:0f9452f76e8ccdfc88291bccb3f53e3016f40192"
|
||||
if re.match("\\d{4}-\\d{2}-\\d{2}T.+\\d{4}-\\d{2}-\\d{2}T.+INFO version.+", line):
|
||||
continue
|
||||
|
||||
# It's an ERROR or WARN. Is it in the allow-list?
|
||||
for a in allowed_errors:
|
||||
if re.match(a, line):
|
||||
break
|
||||
else:
|
||||
errors.append((lineno, line))
|
||||
return errors
|
||||
|
||||
|
||||
def assert_no_errors(log_file, service, allowed_errors):
|
||||
if not log_file.exists():
|
||||
log.warning(f"Skipping {service} log check: {log_file} does not exist")
|
||||
return
|
||||
|
||||
with log_file.open("r") as f:
|
||||
errors = scan_log_for_errors(f, allowed_errors)
|
||||
|
||||
for _lineno, error in errors:
|
||||
log.info(f"not allowed {service} error: {error.strip()}")
|
||||
|
||||
assert not errors, f"Log errors on {service}: {errors[0]}"
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
import json
|
||||
|
||||
import pytest
|
||||
from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
|
||||
from fixtures.log_helper import log
|
||||
@@ -81,8 +79,3 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma
|
||||
zenbenchmark.record(
|
||||
"physical/logical ratio", physical_size / logical_size, "", MetricReport.LOWER_IS_BETTER
|
||||
)
|
||||
|
||||
layer_map_path = env.repo_dir / "layer-map.json"
|
||||
log.info(f"Writing layer map to {layer_map_path}")
|
||||
with layer_map_path.open("w") as f:
|
||||
f.write(json.dumps(client.timeline_layer_map_info(tenant_id, timeline_id)))
|
||||
|
||||
@@ -120,12 +120,12 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv):
|
||||
env = neon_simple_env
|
||||
pageserver_http_client = env.pageserver.http_client()
|
||||
|
||||
error_regexes = [
|
||||
".*invalid branch start lsn: less than latest GC cutoff.*",
|
||||
".*invalid branch start lsn: less than planned GC cutoff.*",
|
||||
]
|
||||
env.pageserver.allowed_errors.extend(error_regexes)
|
||||
env.storage_controller.allowed_errors.extend(error_regexes)
|
||||
env.pageserver.allowed_errors.extend(
|
||||
[
|
||||
".*invalid branch start lsn: less than latest GC cutoff.*",
|
||||
".*invalid branch start lsn: less than planned GC cutoff.*",
|
||||
]
|
||||
)
|
||||
|
||||
# Disable background GC but set the `pitr_interval` to be small, so GC can delete something
|
||||
tenant, _ = env.neon_cli.create_tenant(
|
||||
|
||||
@@ -14,12 +14,9 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder):
|
||||
neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
error_regexes = [
|
||||
".*invalid branch start lsn.*",
|
||||
".*invalid start lsn .* for ancestor timeline.*",
|
||||
]
|
||||
env.pageserver.allowed_errors.extend(error_regexes)
|
||||
env.storage_controller.allowed_errors.extend(error_regexes)
|
||||
env.pageserver.allowed_errors.extend(
|
||||
[".*invalid branch start lsn.*", ".*invalid start lsn .* for ancestor timeline.*"]
|
||||
)
|
||||
|
||||
# Branch at the point where only 100 rows were inserted
|
||||
branch_behind_timeline_id = env.neon_cli.create_branch("test_branch_behind")
|
||||
|
||||
@@ -238,10 +238,6 @@ def test_forward_compatibility(
|
||||
pg_distrib_dir=compatibility_postgres_distrib_dir,
|
||||
)
|
||||
|
||||
# TODO: remove this workaround after release-5090 is no longer the most recent release.
|
||||
# There was a bug in that code that generates a warning in the storage controller log.
|
||||
env.storage_controller.allowed_errors.append(".*no tenant_shard_id specified.*")
|
||||
|
||||
# Use current neon_local even though we're using old binaries for
|
||||
# everything else: our test code is written for latest CLI args.
|
||||
env.neon_local_binpath = neon_local_binpath
|
||||
|
||||
@@ -70,7 +70,6 @@ def test_metric_collection(
|
||||
# we have a fast rate of calculation, these can happen at shutdown
|
||||
".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*",
|
||||
".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes",
|
||||
".*metrics_collection: failed to upload to S3: Failed to upload data of length .* to storage path.*",
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
@@ -432,10 +432,6 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
|
||||
- Eviction of layers on the attached location results in deletion
|
||||
on the secondary location as well.
|
||||
"""
|
||||
|
||||
# For debug of https://github.com/neondatabase/neon/issues/6966
|
||||
neon_env_builder.rust_log_override = "DEBUG"
|
||||
|
||||
neon_env_builder.num_pageservers = 2
|
||||
neon_env_builder.enable_pageserver_remote_storage(
|
||||
remote_storage_kind=RemoteStorageKind.MOCK_S3,
|
||||
@@ -580,7 +576,7 @@ def test_slow_secondary_downloads(neon_env_builder: NeonEnvBuilder, via_controll
|
||||
timeline_id = TimelineId.generate()
|
||||
|
||||
env.neon_cli.create_tenant(
|
||||
tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Attached":1}'
|
||||
tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Double":1}'
|
||||
)
|
||||
|
||||
attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"]
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
import os
|
||||
import time
|
||||
from collections import defaultdict
|
||||
from typing import Dict, List, Optional, Union
|
||||
|
||||
import pytest
|
||||
@@ -14,7 +13,7 @@ from fixtures.neon_fixtures import (
|
||||
tenant_get_shards,
|
||||
)
|
||||
from fixtures.remote_storage import s3_storage
|
||||
from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
|
||||
from fixtures.types import Lsn, TenantShardId, TimelineId
|
||||
from fixtures.utils import wait_until
|
||||
from fixtures.workload import Workload
|
||||
from pytest_httpserver import HTTPServer
|
||||
@@ -160,20 +159,11 @@ def test_sharding_split_smoke(
|
||||
|
||||
neon_env_builder.preserve_database_files = True
|
||||
|
||||
non_default_tenant_config = {"gc_horizon": 77 * 1024 * 1024}
|
||||
|
||||
env = neon_env_builder.init_configs(True)
|
||||
neon_env_builder.start()
|
||||
tenant_id = TenantId.generate()
|
||||
timeline_id = TimelineId.generate()
|
||||
env.neon_cli.create_tenant(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
shard_count=shard_count,
|
||||
shard_stripe_size=stripe_size,
|
||||
placement_policy='{"Attached": 1}',
|
||||
conf=non_default_tenant_config,
|
||||
env = neon_env_builder.init_start(
|
||||
initial_tenant_shard_count=shard_count, initial_tenant_shard_stripe_size=stripe_size
|
||||
)
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.initial_timeline
|
||||
workload = Workload(env, tenant_id, timeline_id, branch_name="main")
|
||||
workload.init()
|
||||
|
||||
@@ -233,14 +223,6 @@ def test_sharding_split_smoke(
|
||||
# Before split, old shards exist
|
||||
assert shards_on_disk(old_shard_ids)
|
||||
|
||||
# Before split, we have done one reconcile for each shard
|
||||
assert (
|
||||
env.storage_controller.get_metric_value(
|
||||
"storage_controller_reconcile_complete_total", filter={"status": "ok"}
|
||||
)
|
||||
== shard_count
|
||||
)
|
||||
|
||||
env.storage_controller.tenant_shard_split(tenant_id, shard_count=split_shard_count)
|
||||
|
||||
post_split_pageserver_ids = [loc["node_id"] for loc in env.storage_controller.locate(tenant_id)]
|
||||
@@ -282,66 +264,43 @@ def test_sharding_split_smoke(
|
||||
destination = migrate_to_pageserver_ids.pop()
|
||||
|
||||
log.info(f"Migrating shard {migrate_shard} from {ps_id} to {destination}")
|
||||
env.storage_controller.tenant_shard_migrate(migrate_shard, destination)
|
||||
env.neon_cli.tenant_migrate(migrate_shard, destination, timeout_secs=10)
|
||||
|
||||
workload.validate()
|
||||
|
||||
# Assert on how many reconciles happened during the process. This is something of an
|
||||
# implementation detail, but it is useful to detect any bugs that might generate spurious
|
||||
# extra reconcile iterations.
|
||||
#
|
||||
# We'll have:
|
||||
# - shard_count reconciles for the original setup of the tenant
|
||||
# - shard_count reconciles for detaching the original secondary locations during split
|
||||
# - split_shard_count reconciles during shard splitting, for setting up secondaries.
|
||||
# - shard_count reconciles for the migrations we did to move child shards away from their split location
|
||||
expect_reconciles = shard_count * 2 + split_shard_count + shard_count
|
||||
# Check that we didn't do any spurious reconciliations.
|
||||
# Total number of reconciles should have been one per original shard, plus
|
||||
# one for each shard that was migrated.
|
||||
reconcile_ok = env.storage_controller.get_metric_value(
|
||||
"storage_controller_reconcile_complete_total", filter={"status": "ok"}
|
||||
)
|
||||
assert reconcile_ok == expect_reconciles
|
||||
assert reconcile_ok == shard_count + split_shard_count // 2
|
||||
|
||||
# Check that no cancelled or errored reconciliations occurred: this test does no
|
||||
# failure injection and should run clean.
|
||||
cancelled_reconciles = env.storage_controller.get_metric_value(
|
||||
"storage_controller_reconcile_complete_total", filter={"status": "cancel"}
|
||||
assert (
|
||||
env.storage_controller.get_metric_value(
|
||||
"storage_controller_reconcile_complete_total", filter={"status": "cancel"}
|
||||
)
|
||||
is None
|
||||
)
|
||||
errored_reconciles = env.storage_controller.get_metric_value(
|
||||
"storage_controller_reconcile_complete_total", filter={"status": "error"}
|
||||
assert (
|
||||
env.storage_controller.get_metric_value(
|
||||
"storage_controller_reconcile_complete_total", filter={"status": "error"}
|
||||
)
|
||||
is None
|
||||
)
|
||||
assert cancelled_reconciles is not None and int(cancelled_reconciles) == 0
|
||||
assert errored_reconciles is not None and int(errored_reconciles) == 0
|
||||
|
||||
env.storage_controller.consistency_check()
|
||||
|
||||
def get_node_shard_counts(env: NeonEnv, tenant_ids):
|
||||
total: defaultdict[int, int] = defaultdict(int)
|
||||
attached: defaultdict[int, int] = defaultdict(int)
|
||||
for tid in tenant_ids:
|
||||
for shard in env.storage_controller.tenant_describe(tid)["shards"]:
|
||||
log.info(
|
||||
f"{shard['tenant_shard_id']}: attached={shard['node_attached']}, secondary={shard['node_secondary']} "
|
||||
)
|
||||
for node in shard["node_secondary"]:
|
||||
total[int(node)] += 1
|
||||
attached[int(shard["node_attached"])] += 1
|
||||
total[int(shard["node_attached"])] += 1
|
||||
# Validate pageserver state
|
||||
shards_exist: list[TenantShardId] = []
|
||||
for pageserver in env.pageservers:
|
||||
locations = pageserver.http_client().tenant_list_locations()
|
||||
shards_exist.extend(TenantShardId.parse(s[0]) for s in locations["tenant_shards"])
|
||||
|
||||
return total, attached
|
||||
|
||||
def check_effective_tenant_config():
|
||||
# Expect our custom tenant configs to have survived the split
|
||||
for shard in env.storage_controller.tenant_describe(tenant_id)["shards"]:
|
||||
node = env.get_pageserver(int(shard["node_attached"]))
|
||||
config = node.http_client().tenant_config(TenantShardId.parse(shard["tenant_shard_id"]))
|
||||
for k, v in non_default_tenant_config.items():
|
||||
assert config.effective_config[k] == v
|
||||
|
||||
# Validate pageserver state: expect every child shard to have an attached and secondary location
|
||||
(total, attached) = get_node_shard_counts(env, tenant_ids=[tenant_id])
|
||||
assert sum(attached.values()) == split_shard_count
|
||||
assert sum(total.values()) == split_shard_count * 2
|
||||
check_effective_tenant_config()
|
||||
log.info("Shards after split: {shards_exist}")
|
||||
assert len(shards_exist) == split_shard_count
|
||||
|
||||
# Ensure post-split pageserver locations survive a restart (i.e. the child shards
|
||||
# correctly wrote config to disk, and the storage controller responds correctly
|
||||
@@ -350,11 +309,13 @@ def test_sharding_split_smoke(
|
||||
pageserver.stop()
|
||||
pageserver.start()
|
||||
|
||||
# Validate pageserver state: expect every child shard to have an attached and secondary location
|
||||
(total, attached) = get_node_shard_counts(env, tenant_ids=[tenant_id])
|
||||
assert sum(attached.values()) == split_shard_count
|
||||
assert sum(total.values()) == split_shard_count * 2
|
||||
check_effective_tenant_config()
|
||||
shards_exist = []
|
||||
for pageserver in env.pageservers:
|
||||
locations = pageserver.http_client().tenant_list_locations()
|
||||
shards_exist.extend(TenantShardId.parse(s[0]) for s in locations["tenant_shards"])
|
||||
|
||||
log.info("Shards after restart: {shards_exist}")
|
||||
assert len(shards_exist) == split_shard_count
|
||||
|
||||
workload.validate()
|
||||
|
||||
@@ -760,32 +721,9 @@ def test_sharding_split_failures(
|
||||
initial_shard_count = 2
|
||||
split_shard_count = 4
|
||||
|
||||
env = neon_env_builder.init_configs()
|
||||
env.start()
|
||||
|
||||
tenant_id = TenantId.generate()
|
||||
timeline_id = TimelineId.generate()
|
||||
|
||||
# Create a tenant with secondary locations enabled
|
||||
env.neon_cli.create_tenant(
|
||||
tenant_id, timeline_id, shard_count=initial_shard_count, placement_policy='{"Attached":1}'
|
||||
)
|
||||
|
||||
env.storage_controller.allowed_errors.extend(
|
||||
[
|
||||
# All split failures log a warning when then enqueue the abort operation
|
||||
".*Enqueuing background abort.*",
|
||||
# We exercise failure cases where abort itself will also fail (node offline)
|
||||
".*abort_tenant_shard_split.*",
|
||||
".*Failed to abort.*",
|
||||
# Tolerate any error lots that mention a failpoint
|
||||
".*failpoint.*",
|
||||
# Node offline cases will fail to send requests
|
||||
".*Reconcile error: receive body: error sending request for url.*",
|
||||
# Node offline cases will fail inside reconciler when detaching secondaries
|
||||
".*Reconcile error on shard.*: receive body: error sending request for url.*",
|
||||
]
|
||||
)
|
||||
env = neon_env_builder.init_start(initial_tenant_shard_count=initial_shard_count)
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.initial_timeline
|
||||
|
||||
for ps in env.pageservers:
|
||||
# When we do node failures and abandon a shard, it will de-facto have old generation and
|
||||
@@ -821,8 +759,7 @@ def test_sharding_split_failures(
|
||||
# will have succeeded: the net result should be to return to a clean state, including
|
||||
# detaching any child shards.
|
||||
def assert_rolled_back(exclude_ps_id=None) -> None:
|
||||
secondary_count = 0
|
||||
attached_count = 0
|
||||
count = 0
|
||||
for ps in env.pageservers:
|
||||
if exclude_ps_id is not None and ps.id == exclude_ps_id:
|
||||
continue
|
||||
@@ -830,25 +767,13 @@ def test_sharding_split_failures(
|
||||
locations = ps.http_client().tenant_list_locations()["tenant_shards"]
|
||||
for loc in locations:
|
||||
tenant_shard_id = TenantShardId.parse(loc[0])
|
||||
log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}")
|
||||
log.info(f"Shard {tenant_shard_id} seen on node {ps.id}")
|
||||
assert tenant_shard_id.shard_count == initial_shard_count
|
||||
if loc[1]["mode"] == "Secondary":
|
||||
secondary_count += 1
|
||||
else:
|
||||
attached_count += 1
|
||||
|
||||
if exclude_ps_id is not None:
|
||||
# For a node failure case, we expect there to be a secondary location
|
||||
# scheduled on the offline node, so expect one fewer secondary in total
|
||||
assert secondary_count == initial_shard_count - 1
|
||||
else:
|
||||
assert secondary_count == initial_shard_count
|
||||
|
||||
assert attached_count == initial_shard_count
|
||||
count += 1
|
||||
assert count == initial_shard_count
|
||||
|
||||
def assert_split_done(exclude_ps_id=None) -> None:
|
||||
secondary_count = 0
|
||||
attached_count = 0
|
||||
count = 0
|
||||
for ps in env.pageservers:
|
||||
if exclude_ps_id is not None and ps.id == exclude_ps_id:
|
||||
continue
|
||||
@@ -856,14 +781,10 @@ def test_sharding_split_failures(
|
||||
locations = ps.http_client().tenant_list_locations()["tenant_shards"]
|
||||
for loc in locations:
|
||||
tenant_shard_id = TenantShardId.parse(loc[0])
|
||||
log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}")
|
||||
log.info(f"Shard {tenant_shard_id} seen on node {ps.id}")
|
||||
assert tenant_shard_id.shard_count == split_shard_count
|
||||
if loc[1]["mode"] == "Secondary":
|
||||
secondary_count += 1
|
||||
else:
|
||||
attached_count += 1
|
||||
assert attached_count == split_shard_count
|
||||
assert secondary_count == split_shard_count
|
||||
count += 1
|
||||
assert count == split_shard_count
|
||||
|
||||
def finish_split():
|
||||
# Having failed+rolled back, we should be able to split again
|
||||
|
||||
@@ -23,7 +23,7 @@ from fixtures.pageserver.utils import (
|
||||
)
|
||||
from fixtures.pg_version import PgVersion
|
||||
from fixtures.remote_storage import RemoteStorageKind, s3_storage
|
||||
from fixtures.types import TenantId, TenantShardId, TimelineId
|
||||
from fixtures.types import TenantId, TimelineId
|
||||
from fixtures.utils import run_pg_bench_small, wait_until
|
||||
from mypy_boto3_s3.type_defs import (
|
||||
ObjectTypeDef,
|
||||
@@ -177,7 +177,6 @@ def test_node_status_after_restart(
|
||||
assert len(nodes) == 2
|
||||
|
||||
env.pageservers[1].stop()
|
||||
env.storage_controller.allowed_errors.extend([".*Could not scan node"])
|
||||
|
||||
env.storage_controller.stop()
|
||||
env.storage_controller.start()
|
||||
@@ -682,9 +681,6 @@ def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder):
|
||||
tenant_id = TenantId.generate()
|
||||
body: Dict[str, Any] = {"new_tenant_id": str(tenant_id)}
|
||||
|
||||
env.storage_controller.allowed_errors.append(".*Unauthorized.*")
|
||||
env.storage_controller.allowed_errors.append(".*Forbidden.*")
|
||||
|
||||
# No token
|
||||
with pytest.raises(
|
||||
StorageControllerApiException,
|
||||
@@ -847,12 +843,6 @@ def test_sharding_service_heartbeats(
|
||||
env = neon_env_builder.init_configs()
|
||||
env.start()
|
||||
|
||||
# Default log allow list permits connection errors, but this test will use error responses on
|
||||
# the utilization endpoint.
|
||||
env.storage_controller.allowed_errors.append(
|
||||
".*Call to node.*management API.*failed.*failpoint.*"
|
||||
)
|
||||
|
||||
# Initially we have two online pageservers
|
||||
nodes = env.storage_controller.node_list()
|
||||
assert len(nodes) == 2
|
||||
@@ -948,65 +938,3 @@ def test_sharding_service_heartbeats(
|
||||
env.storage_controller.consistency_check()
|
||||
|
||||
wait_until(10, 1, storage_controller_consistent)
|
||||
|
||||
|
||||
def test_sharding_service_re_attach(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
Exercise the behavior of the /re-attach endpoint on pageserver startup when
|
||||
pageservers have a mixture of attached and secondary locations
|
||||
"""
|
||||
|
||||
neon_env_builder.num_pageservers = 2
|
||||
env = neon_env_builder.init_configs()
|
||||
env.start()
|
||||
|
||||
# We'll have two tenants.
|
||||
tenant_a = TenantId.generate()
|
||||
env.neon_cli.create_tenant(tenant_a, placement_policy='{"Attached":1}')
|
||||
tenant_b = TenantId.generate()
|
||||
env.neon_cli.create_tenant(tenant_b, placement_policy='{"Attached":1}')
|
||||
|
||||
# Each pageserver will have one attached and one secondary location
|
||||
env.storage_controller.tenant_shard_migrate(
|
||||
TenantShardId(tenant_a, 0, 0), env.pageservers[0].id
|
||||
)
|
||||
env.storage_controller.tenant_shard_migrate(
|
||||
TenantShardId(tenant_b, 0, 0), env.pageservers[1].id
|
||||
)
|
||||
|
||||
# Hard-fail a pageserver
|
||||
victim_ps = env.pageservers[1]
|
||||
survivor_ps = env.pageservers[0]
|
||||
victim_ps.stop(immediate=True)
|
||||
|
||||
# Heatbeater will notice it's offline, and consequently attachments move to the other pageserver
|
||||
def failed_over():
|
||||
locations = survivor_ps.http_client().tenant_list_locations()["tenant_shards"]
|
||||
log.info(f"locations: {locations}")
|
||||
assert len(locations) == 2
|
||||
assert all(loc[1]["mode"] == "AttachedSingle" for loc in locations)
|
||||
|
||||
# We could pre-empty this by configuring the node to Offline, but it's preferable to test
|
||||
# the realistic path we would take when a node restarts uncleanly.
|
||||
# The delay here will be ~NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL in neon_local
|
||||
wait_until(30, 1, failed_over)
|
||||
|
||||
reconciles_before_restart = env.storage_controller.get_metric_value(
|
||||
"storage_controller_reconcile_complete_total", filter={"status": "ok"}
|
||||
)
|
||||
|
||||
# Restart the failed pageserver
|
||||
victim_ps.start()
|
||||
|
||||
# We expect that the re-attach call correctly tipped off the pageserver that its locations
|
||||
# are all secondaries now.
|
||||
locations = victim_ps.http_client().tenant_list_locations()["tenant_shards"]
|
||||
assert len(locations) == 2
|
||||
assert all(loc[1]["mode"] == "Secondary" for loc in locations)
|
||||
|
||||
# We expect that this situation resulted from the re_attach call, and not any explicit
|
||||
# Reconciler runs: assert that the reconciliation count has not gone up since we restarted.
|
||||
reconciles_after_restart = env.storage_controller.get_metric_value(
|
||||
"storage_controller_reconcile_complete_total", filter={"status": "ok"}
|
||||
)
|
||||
assert reconciles_after_restart == reconciles_before_restart
|
||||
|
||||
@@ -36,9 +36,7 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv):
|
||||
)
|
||||
[d for d in tenants_dir.iterdir()]
|
||||
|
||||
error_regexes = [".*tenant-config-before-write.*"]
|
||||
neon_simple_env.pageserver.allowed_errors.extend(error_regexes)
|
||||
neon_simple_env.storage_controller.allowed_errors.extend(error_regexes)
|
||||
neon_simple_env.pageserver.allowed_errors.append(".*tenant-config-before-write.*")
|
||||
|
||||
pageserver_http = neon_simple_env.pageserver.http_client()
|
||||
pageserver_http.configure_failpoints(("tenant-config-before-write", "return"))
|
||||
|
||||
@@ -20,7 +20,6 @@ from fixtures.neon_fixtures import (
|
||||
VanillaPostgres,
|
||||
wait_for_last_flush_lsn,
|
||||
)
|
||||
from fixtures.pageserver.http import PageserverHttpClient
|
||||
from fixtures.pageserver.utils import (
|
||||
assert_tenant_state,
|
||||
timeline_delete_wait_completed,
|
||||
@@ -685,13 +684,6 @@ def assert_physical_size_invariants(sizes: TimelinePhysicalSizeValues):
|
||||
# XXX would be nice to assert layer file physical storage utilization here as well, but we can only do that for LocalFS
|
||||
|
||||
|
||||
def wait_for_tenant_startup_completions(client: PageserverHttpClient, count: int):
|
||||
def condition():
|
||||
assert client.get_metric_value("pageserver_tenant_startup_complete_total") == count
|
||||
|
||||
wait_until(5, 1.0, condition)
|
||||
|
||||
|
||||
def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
Tenants warmuping up opportunistically will wait for one another's logical size calculations to complete
|
||||
@@ -775,7 +767,10 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
|
||||
# That one that we successfully accessed is now Active
|
||||
expect_activated += 1
|
||||
assert pageserver_http.tenant_status(tenant_id=stuck_tenant_id)["state"]["slug"] == "Active"
|
||||
wait_for_tenant_startup_completions(pageserver_http, count=expect_activated - 1)
|
||||
assert (
|
||||
pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total")
|
||||
== expect_activated - 1
|
||||
)
|
||||
|
||||
# The ones we didn't touch are still in Attaching
|
||||
assert (
|
||||
@@ -795,7 +790,10 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
|
||||
== n_tenants - expect_activated
|
||||
)
|
||||
|
||||
wait_for_tenant_startup_completions(pageserver_http, count=expect_activated - 1)
|
||||
assert (
|
||||
pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total")
|
||||
== expect_activated - 1
|
||||
)
|
||||
|
||||
# When we unblock logical size calculation, all tenants should proceed to active state via
|
||||
# the warmup route.
|
||||
@@ -815,7 +813,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
|
||||
assert (
|
||||
pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants
|
||||
)
|
||||
wait_for_tenant_startup_completions(pageserver_http, count=n_tenants)
|
||||
assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == n_tenants
|
||||
|
||||
# Check that tenant deletion/detach proactively wakes tenants: this is done separately to the main
|
||||
# body of the test because it will disrupt tenant counts
|
||||
|
||||
@@ -64,7 +64,6 @@ rustls = { version = "0.21", features = ["dangerous_configuration"] }
|
||||
scopeguard = { version = "1" }
|
||||
serde = { version = "1", features = ["alloc", "derive"] }
|
||||
serde_json = { version = "1", features = ["raw_value"] }
|
||||
sha2 = { version = "0.10", features = ["asm"] }
|
||||
smallvec = { version = "1", default-features = false, features = ["write"] }
|
||||
subtle = { version = "2" }
|
||||
time = { version = "0.3", features = ["local-offset", "macros", "serde-well-known"] }
|
||||
|
||||
Reference in New Issue
Block a user