Merge commit '108f7ec54' into problame/standby-horizon-leases

Christian Schwarz
2025-08-06 17:55:56 +02:00
59 changed files with 2030 additions and 356 deletions

Cargo.lock generated

@@ -1097,7 +1097,7 @@ checksum = "975982cdb7ad6a142be15bdf84aea7ec6a9e5d4d797c004d43185b24cfe4e684"
dependencies = [
"clap",
"heck 0.5.0",
"indexmap 2.9.0",
"indexmap 2.10.0",
"log",
"proc-macro2",
"quote",
@@ -1296,8 +1296,14 @@ dependencies = [
name = "communicator"
version = "0.1.0"
dependencies = [
"axum",
"cbindgen",
"neon-shmem",
"http 1.3.1",
"measured",
"tokio",
"tracing",
"tracing-subscriber",
"utils",
"workspace_hack",
]
@@ -1307,7 +1313,7 @@ version = "0.1.0"
dependencies = [
"anyhow",
"chrono",
"indexmap 2.9.0",
"indexmap 2.10.0",
"jsonwebtoken",
"regex",
"remote_storage",
@@ -1341,7 +1347,10 @@ dependencies = [
"futures",
"hostname-validator",
"http 1.3.1",
"indexmap 2.9.0",
"http-body-util",
"hyper 1.4.1",
"hyper-util",
"indexmap 2.10.0",
"itertools 0.10.5",
"jsonwebtoken",
"metrics",
@@ -1363,6 +1372,7 @@ dependencies = [
"ring",
"rlimit",
"rust-ini",
"scopeguard",
"serde",
"serde_json",
"serde_with",
@@ -1373,7 +1383,7 @@ dependencies = [
"tokio-postgres",
"tokio-stream",
"tokio-util",
"tonic 0.13.1",
"tonic",
"tower 0.5.2",
"tower-http",
"tower-otel",
@@ -2649,7 +2659,7 @@ dependencies = [
"futures-sink",
"futures-util",
"http 0.2.9",
"indexmap 2.9.0",
"indexmap 2.10.0",
"slab",
"tokio",
"tokio-util",
@@ -2668,7 +2678,7 @@ dependencies = [
"futures-sink",
"futures-util",
"http 1.3.1",
"indexmap 2.9.0",
"indexmap 2.10.0",
"slab",
"tokio",
"tokio-util",
@@ -2927,7 +2937,7 @@ dependencies = [
"pprof",
"regex",
"routerify",
"rustls 0.23.27",
"rustls 0.23.29",
"rustls-pemfile 2.1.1",
"serde",
"serde_json",
@@ -3264,9 +3274,9 @@ dependencies = [
[[package]]
name = "indexmap"
version = "2.9.0"
version = "2.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e"
checksum = "fe4cd85333e22411419a0bcae1297d25e58c9443848b11dc6a86fefe8c78a661"
dependencies = [
"equivalent",
"hashbrown 0.15.2",
@@ -3292,7 +3302,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "232929e1d75fe899576a3d5c7416ad0d88dbfbb3c3d6aa00873a7408a50ddb88"
dependencies = [
"ahash",
"indexmap 2.9.0",
"indexmap 2.10.0",
"is-terminal",
"itoa",
"log",
@@ -3315,7 +3325,7 @@ dependencies = [
"crossbeam-utils",
"dashmap 6.1.0",
"env_logger",
"indexmap 2.9.0",
"indexmap 2.10.0",
"itoa",
"log",
"num-format",
@@ -4152,23 +4162,23 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
[[package]]
name = "opentelemetry"
version = "0.27.1"
version = "0.30.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ab70038c28ed37b97d8ed414b6429d343a8bbf44c9f79ec854f3a643029ba6d7"
checksum = "aaf416e4cb72756655126f7dd7bb0af49c674f4c1b9903e80c009e0c37e552e6"
dependencies = [
"futures-core",
"futures-sink",
"js-sys",
"pin-project-lite",
"thiserror 1.0.69",
"thiserror 2.0.11",
"tracing",
]
[[package]]
name = "opentelemetry-http"
version = "0.27.0"
version = "0.30.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "10a8a7f5f6ba7c1b286c2fbca0454eaba116f63bbe69ed250b642d36fbb04d80"
checksum = "50f6639e842a97dbea8886e3439710ae463120091e2e064518ba8e716e6ac36d"
dependencies = [
"async-trait",
"bytes",
@@ -4179,12 +4189,10 @@ dependencies = [
[[package]]
name = "opentelemetry-otlp"
version = "0.27.0"
version = "0.30.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91cf61a1868dacc576bf2b2a1c3e9ab150af7272909e80085c3173384fe11f76"
checksum = "dbee664a43e07615731afc539ca60c6d9f1a9425e25ca09c57bc36c87c55852b"
dependencies = [
"async-trait",
"futures-core",
"http 1.3.1",
"opentelemetry",
"opentelemetry-http",
@@ -4192,46 +4200,43 @@ dependencies = [
"opentelemetry_sdk",
"prost 0.13.5",
"reqwest",
"thiserror 1.0.69",
"thiserror 2.0.11",
]
[[package]]
name = "opentelemetry-proto"
version = "0.27.0"
version = "0.30.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a6e05acbfada5ec79023c85368af14abd0b307c015e9064d249b2a950ef459a6"
checksum = "2e046fd7660710fe5a05e8748e70d9058dc15c94ba914e7c4faa7c728f0e8ddc"
dependencies = [
"opentelemetry",
"opentelemetry_sdk",
"prost 0.13.5",
"tonic 0.12.3",
"tonic",
]
[[package]]
name = "opentelemetry-semantic-conventions"
version = "0.27.0"
version = "0.30.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc1b6902ff63b32ef6c489e8048c5e253e2e4a803ea3ea7e783914536eb15c52"
checksum = "83d059a296a47436748557a353c5e6c5705b9470ef6c95cfc52c21a8814ddac2"
[[package]]
name = "opentelemetry_sdk"
version = "0.27.1"
version = "0.30.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "231e9d6ceef9b0b2546ddf52335785ce41252bc7474ee8ba05bfad277be13ab8"
checksum = "11f644aa9e5e31d11896e024305d7e3c98a88884d9f8919dbf37a9991bc47a4b"
dependencies = [
"async-trait",
"futures-channel",
"futures-executor",
"futures-util",
"glob",
"opentelemetry",
"percent-encoding",
"rand 0.8.5",
"rand 0.9.1",
"serde_json",
"thiserror 1.0.69",
"thiserror 2.0.11",
"tokio",
"tokio-stream",
"tracing",
]
[[package]]
@@ -4358,7 +4363,7 @@ dependencies = [
"tokio",
"tokio-stream",
"tokio-util",
"tonic 0.13.1",
"tonic",
"tracing",
"url",
"utils",
@@ -4455,7 +4460,7 @@ dependencies = [
"reqwest",
"rpds",
"rstest",
"rustls 0.23.27",
"rustls 0.23.29",
"scopeguard",
"send-future",
"serde",
@@ -4479,7 +4484,7 @@ dependencies = [
"tokio-tar",
"tokio-util",
"toml_edit",
"tonic 0.13.1",
"tonic",
"tonic-reflection",
"tower 0.5.2",
"tracing",
@@ -4565,7 +4570,7 @@ dependencies = [
"tokio",
"tokio-stream",
"tokio-util",
"tonic 0.13.1",
"tonic",
"tracing",
"utils",
"workspace_hack",
@@ -4611,7 +4616,7 @@ dependencies = [
"thiserror 1.0.69",
"tokio",
"tokio-util",
"tonic 0.13.1",
"tonic",
"tonic-build",
"utils",
"workspace_hack",
@@ -4993,7 +4998,7 @@ dependencies = [
"bytes",
"once_cell",
"pq_proto",
"rustls 0.23.27",
"rustls 0.23.29",
"rustls-pemfile 2.1.1",
"serde",
"thiserror 1.0.69",
@@ -5392,7 +5397,7 @@ dependencies = [
"hyper 0.14.30",
"hyper 1.4.1",
"hyper-util",
"indexmap 2.9.0",
"indexmap 2.10.0",
"ipnet",
"itertools 0.10.5",
"itoa",
@@ -5429,7 +5434,7 @@ dependencies = [
"rsa",
"rstest",
"rustc-hash 2.1.1",
"rustls 0.23.27",
"rustls 0.23.29",
"rustls-native-certs 0.8.0",
"rustls-pemfile 2.1.1",
"scopeguard",
@@ -5708,7 +5713,7 @@ dependencies = [
"num-bigint",
"percent-encoding",
"pin-project-lite",
"rustls 0.23.27",
"rustls 0.23.29",
"rustls-native-certs 0.8.0",
"ryu",
"sha1_smol",
@@ -5937,9 +5942,9 @@ dependencies = [
[[package]]
name = "reqwest-tracing"
version = "0.5.5"
version = "0.5.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73e6153390585f6961341b50e5a1931d6be6dee4292283635903c26ef9d980d2"
checksum = "d70ea85f131b2ee9874f0b160ac5976f8af75f3c9badfe0d955880257d10bd83"
dependencies = [
"anyhow",
"async-trait",
@@ -6164,15 +6169,15 @@ dependencies = [
[[package]]
name = "rustls"
version = "0.23.27"
version = "0.23.29"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "730944ca083c1c233a75c09f199e973ca499344a2b7ba9e755c457e86fb4a321"
checksum = "2491382039b29b9b11ff08b76ff6c97cf287671dbb74f0be44bda389fffe9bd1"
dependencies = [
"log",
"once_cell",
"ring",
"rustls-pki-types",
"rustls-webpki 0.103.3",
"rustls-webpki 0.103.4",
"subtle",
"zeroize",
]
@@ -6236,9 +6241,12 @@ dependencies = [
[[package]]
name = "rustls-pki-types"
version = "1.11.0"
version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "917ce264624a4b4db1c364dcc35bfca9ded014d0a958cd47ad3e960e988ea51c"
checksum = "229a4a4c221013e7e1f1a043678c5cc39fe5171437c88fb47151a21e6f5b5c79"
dependencies = [
"zeroize",
]
[[package]]
name = "rustls-webpki"
@@ -6263,9 +6271,9 @@ dependencies = [
[[package]]
name = "rustls-webpki"
version = "0.103.3"
version = "0.103.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e4a72fe2bcf7a6ac6fd7d0b9e5cb68aeb7d4c0a0271730218b3e92d43b4eb435"
checksum = "0a17884ae0c1b773f1ccd2bd4a8c72f16da897310a98b0e84bf349ad5ead92fc"
dependencies = [
"ring",
"rustls-pki-types",
@@ -6326,7 +6334,7 @@ dependencies = [
"regex",
"remote_storage",
"reqwest",
"rustls 0.23.27",
"rustls 0.23.29",
"safekeeper_api",
"safekeeper_client",
"scopeguard",
@@ -6516,7 +6524,7 @@ checksum = "255914a8e53822abd946e2ce8baa41d4cded6b8e938913b7f7b9da5b7ab44335"
dependencies = [
"httpdate",
"reqwest",
"rustls 0.23.27",
"rustls 0.23.29",
"sentry-backtrace",
"sentry-contexts",
"sentry-core",
@@ -6648,7 +6656,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9d2de91cf02bbc07cde38891769ccd5d4f073d22a40683aa4bc7a95781aaa2c4"
dependencies = [
"form_urlencoded",
"indexmap 2.9.0",
"indexmap 2.10.0",
"itoa",
"ryu",
"serde",
@@ -6729,7 +6737,7 @@ dependencies = [
"chrono",
"hex",
"indexmap 1.9.3",
"indexmap 2.9.0",
"indexmap 2.10.0",
"serde",
"serde_derive",
"serde_json",
@@ -6972,10 +6980,10 @@ dependencies = [
"once_cell",
"parking_lot 0.12.1",
"prost 0.13.5",
"rustls 0.23.27",
"rustls 0.23.29",
"tokio",
"tokio-rustls 0.26.2",
"tonic 0.13.1",
"tonic",
"tonic-build",
"tracing",
"utils",
@@ -7020,7 +7028,7 @@ dependencies = [
"regex",
"reqwest",
"routerify",
"rustls 0.23.27",
"rustls 0.23.29",
"rustls-native-certs 0.8.0",
"safekeeper_api",
"safekeeper_client",
@@ -7074,7 +7082,7 @@ dependencies = [
"postgres_ffi",
"remote_storage",
"reqwest",
"rustls 0.23.27",
"rustls 0.23.29",
"rustls-native-certs 0.8.0",
"serde",
"serde_json",
@@ -7613,7 +7621,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04fb792ccd6bbcd4bba408eb8a292f70fc4a3589e5d793626f45190e6454b6ab"
dependencies = [
"ring",
"rustls 0.23.27",
"rustls 0.23.29",
"tokio",
"tokio-postgres",
"tokio-rustls 0.26.2",
@@ -7664,7 +7672,7 @@ version = "0.26.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e727b36a1a0e8b74c376ac2211e40c2c8af09fb4013c60d910495810f008e9b"
dependencies = [
"rustls 0.23.27",
"rustls 0.23.29",
"tokio",
]
@@ -7763,34 +7771,13 @@ version = "0.22.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f21c7aaf97f1bd9ca9d4f9e73b0a6c74bd5afef56f2bc931943a6e1c37e04e38"
dependencies = [
"indexmap 2.9.0",
"indexmap 2.10.0",
"serde",
"serde_spanned",
"toml_datetime",
"winnow",
]
[[package]]
name = "tonic"
version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52"
dependencies = [
"async-trait",
"base64 0.22.1",
"bytes",
"http 1.3.1",
"http-body 1.0.0",
"http-body-util",
"percent-encoding",
"pin-project",
"prost 0.13.5",
"tokio-stream",
"tower-layer",
"tower-service",
"tracing",
]
[[package]]
name = "tonic"
version = "0.13.1"
@@ -7848,7 +7835,7 @@ dependencies = [
"prost-types 0.13.5",
"tokio",
"tokio-stream",
"tonic 0.13.1",
"tonic",
]
[[package]]
@@ -7874,7 +7861,7 @@ checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9"
dependencies = [
"futures-core",
"futures-util",
"indexmap 2.9.0",
"indexmap 2.10.0",
"pin-project-lite",
"slab",
"sync_wrapper 1.0.1",
@@ -7912,10 +7899,14 @@ checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e"
[[package]]
name = "tower-otel"
version = "0.2.0"
source = "git+https://github.com/mattiapenati/tower-otel?rev=56a7321053bcb72443888257b622ba0d43a11fcd#56a7321053bcb72443888257b622ba0d43a11fcd"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "345000ea5ae33222624a8ccfdd88892c30db4d413a39c2d4bd714b77e0a4b23c"
dependencies = [
"axum",
"cfg-if",
"http 1.3.1",
"http-body 1.0.0",
"opentelemetry",
"pin-project",
"tower-layer",
@@ -7997,9 +7988,9 @@ dependencies = [
[[package]]
name = "tracing-opentelemetry"
version = "0.28.0"
version = "0.31.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97a971f6058498b5c0f1affa23e7ea202057a7301dbff68e968b2d578bcbd053"
checksum = "ddcf5959f39507d0d04d6413119c04f33b623f4f951ebcbdddddfad2d0623a9c"
dependencies = [
"js-sys",
"once_cell",
@@ -8207,7 +8198,7 @@ dependencies = [
"base64 0.22.1",
"log",
"once_cell",
"rustls 0.23.27",
"rustls 0.23.29",
"rustls-pki-types",
"url",
"webpki-roots",
@@ -8879,7 +8870,7 @@ dependencies = [
"hyper 0.14.30",
"hyper 1.4.1",
"hyper-util",
"indexmap 2.9.0",
"indexmap 2.10.0",
"itertools 0.12.1",
"lazy_static",
"libc",
@@ -8902,14 +8893,14 @@ dependencies = [
"proc-macro2",
"prost 0.13.5",
"quote",
"rand 0.8.5",
"rand 0.9.1",
"regex",
"regex-automata 0.4.9",
"regex-syntax 0.8.5",
"reqwest",
"rustls 0.23.27",
"rustls 0.23.29",
"rustls-pki-types",
"rustls-webpki 0.103.3",
"rustls-webpki 0.103.4",
"scopeguard",
"sec1 0.7.3",
"serde",
@@ -8922,6 +8913,7 @@ dependencies = [
"subtle",
"syn 2.0.100",
"sync_wrapper 0.1.2",
"thiserror 2.0.11",
"tikv-jemalloc-ctl",
"tikv-jemalloc-sys",
"time",
@@ -8931,6 +8923,7 @@ dependencies = [
"tokio-stream",
"tokio-util",
"toml_edit",
"tonic",
"tower 0.5.2",
"tracing",
"tracing-core",


@@ -143,10 +143,10 @@ notify = "6.0.0"
num_cpus = "1.15"
num-traits = "0.2.19"
once_cell = "1.13"
opentelemetry = "0.27"
opentelemetry_sdk = "0.27"
opentelemetry-otlp = { version = "0.27", default-features = false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions = "0.27"
opentelemetry = "0.30"
opentelemetry_sdk = "0.30"
opentelemetry-otlp = { version = "0.30", default-features = false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions = "0.30"
parking_lot = "0.12"
parquet = { version = "53", default-features = false, features = ["zstd"] }
parquet_derive = "53"
@@ -164,7 +164,7 @@ rand_core = "=0.6"
redis = { version = "0.29.2", features = ["tokio-rustls-comp", "keep-alive"] }
regex = "1.10.2"
reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] }
reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_27"] }
reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_30"] }
reqwest-middleware = "0.4"
reqwest-retry = "0.7"
routerify = "3"
@@ -214,15 +214,12 @@ tonic = { version = "0.13.1", default-features = false, features = ["channel", "
tonic-reflection = { version = "0.13.1", features = ["server"] }
tower = { version = "0.5.2", default-features = false }
tower-http = { version = "0.6.2", features = ["auth", "request-id", "trace"] }
# This revision uses opentelemetry 0.27. There's no tag for it.
tower-otel = { git = "https://github.com/mattiapenati/tower-otel", rev = "56a7321053bcb72443888257b622ba0d43a11fcd" }
tower-otel = { version = "0.6", features = ["axum"] }
tower-service = "0.3.3"
tracing = "0.1"
tracing-error = "0.2"
tracing-log = "0.2"
tracing-opentelemetry = "0.28"
tracing-opentelemetry = "0.31"
tracing-serde = "0.2.0"
tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
try-lock = "0.2.5"


@@ -27,7 +27,10 @@ fail.workspace = true
flate2.workspace = true
futures.workspace = true
http.workspace = true
http-body-util.workspace = true
hostname-validator = "1.1"
hyper.workspace = true
hyper-util.workspace = true
indexmap.workspace = true
itertools.workspace = true
jsonwebtoken.workspace = true
@@ -44,6 +47,7 @@ postgres.workspace = true
regex.workspace = true
reqwest = { workspace = true, features = ["json"] }
ring = "0.17"
scopeguard.workspace = true
serde.workspace = true
serde_with.workspace = true
serde_json.workspace = true


@@ -188,7 +188,7 @@ fn main() -> Result<()> {
.build()?;
let _rt_guard = runtime.enter();
runtime.block_on(init(cli.dev))?;
let tracing_provider = init(cli.dev)?;
// enable core dumping for all child processes
setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?;
@@ -227,11 +227,11 @@ fn main() -> Result<()> {
scenario.teardown();
deinit_and_exit(exit_code);
deinit_and_exit(tracing_provider, exit_code);
}
async fn init(dev_mode: bool) -> Result<()> {
init_tracing_and_logging(DEFAULT_LOG_LEVEL).await?;
fn init(dev_mode: bool) -> Result<Option<tracing_utils::Provider>> {
let provider = init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
thread::spawn(move || {
@@ -242,7 +242,7 @@ async fn init(dev_mode: bool) -> Result<()> {
info!("compute build_tag: {}", &BUILD_TAG.to_string());
Ok(())
Ok(provider)
}
fn get_config(cli: &Cli) -> Result<ComputeConfig> {
@@ -267,25 +267,27 @@ fn get_config(cli: &Cli) -> Result<ComputeConfig> {
}
}
fn deinit_and_exit(exit_code: Option<i32>) -> ! {
// Shutdown trace pipeline gracefully, so that it has a chance to send any
// pending traces before we exit. Shutting down OTEL tracing provider may
// hang for quite some time, see, for example:
// - https://github.com/open-telemetry/opentelemetry-rust/issues/868
// - and our problems with staging https://github.com/neondatabase/cloud/issues/3707#issuecomment-1493983636
//
// Yet, we want computes to shut down fast enough, as we may need a new one
// for the same timeline ASAP. So wait no longer than 2s for the shutdown to
// complete, then just error out and exit the main thread.
info!("shutting down tracing");
let (sender, receiver) = mpsc::channel();
let _ = thread::spawn(move || {
tracing_utils::shutdown_tracing();
sender.send(()).ok()
});
let shutdown_res = receiver.recv_timeout(Duration::from_millis(2000));
if shutdown_res.is_err() {
error!("timed out while shutting down tracing, exiting anyway");
fn deinit_and_exit(tracing_provider: Option<tracing_utils::Provider>, exit_code: Option<i32>) -> ! {
if let Some(p) = tracing_provider {
// Shutdown trace pipeline gracefully, so that it has a chance to send any
// pending traces before we exit. Shutting down OTEL tracing provider may
// hang for quite some time, see, for example:
// - https://github.com/open-telemetry/opentelemetry-rust/issues/868
// - and our problems with staging https://github.com/neondatabase/cloud/issues/3707#issuecomment-1493983636
//
// Yet, we want computes to shut down fast enough, as we may need a new one
// for the same timeline ASAP. So wait no longer than 2s for the shutdown to
// complete, then just error out and exit the main thread.
info!("shutting down tracing");
let (sender, receiver) = mpsc::channel();
let _ = thread::spawn(move || {
_ = p.shutdown();
sender.send(()).ok()
});
let shutdown_res = receiver.recv_timeout(Duration::from_millis(2000));
if shutdown_res.is_err() {
error!("timed out while shutting down tracing, exiting anyway");
}
}
info!("shutting down");


@@ -0,0 +1,98 @@
//! Client for making requests to a running Postgres server's communicator control socket.
//!
//! The storage communicator process that runs inside Postgres exposes an HTTP endpoint on
//! a Unix Domain Socket in the Postgres data directory. This module provides access to it.
use std::path::Path;
use anyhow::Context;
use hyper::client::conn::http1::SendRequest;
use hyper_util::rt::TokioIo;
/// Name of the socket within the Postgres data directory. This must match the name
/// defined in `pgxn/neon/communicator/src/lib.rs`.
const NEON_COMMUNICATOR_SOCKET_NAME: &str = "neon-communicator.socket";
/// Open a connection to the communicator's control socket, prepare to send requests to it
/// with hyper.
pub async fn connect_communicator_socket<B>(pgdata: &Path) -> anyhow::Result<SendRequest<B>>
where
B: hyper::body::Body + 'static + Send,
B::Data: Send,
B::Error: Into<Box<dyn std::error::Error + Send + Sync>>,
{
let socket_path = pgdata.join(NEON_COMMUNICATOR_SOCKET_NAME);
let socket_path_len = socket_path.display().to_string().len();
// There is a limit of around 100 bytes (108 on Linux?) on the length of the path to a
// Unix Domain socket. The limit is on the connect(2) function used to open the
// socket, not on the absolute path itself. Postgres changes the current directory to
// the data directory and uses a relative path to bind to the socket, and the relative
// path "./neon-communicator.socket" is always short, but when compute_ctl needs to
// open the socket, we need to use a full path, which can be arbitrarily long.
//
// There are a few ways we could work around this:
//
// 1. Change the current directory to the Postgres data directory and use a relative
// path in the connect(2) call. That's problematic because the current directory
// applies to the whole process. We could change the current directory early in
// compute_ctl startup, and that might be a good idea anyway for other reasons too:
// it would be more robust if the data directory is moved around or unlinked for
// some reason, and you would be less likely to accidentally litter other parts of
// the filesystem with e.g. temporary files. However, that's a pretty invasive
// change.
//
// 2. On Linux, you could open() the data directory, and refer to the socket
// inside it as "/proc/self/fd/<fd>/neon-communicator.socket". But that's
// Linux-only.
//
// 3. Create a symbolic link to the socket with a shorter path, and use that.
//
// We use the symbolic link approach here. Hopefully the paths we use in production
// are short enough that we can open the socket directly, and this hack is only
// needed in development.
let connect_result = if socket_path_len < 100 {
// We can open the path directly with no hacks.
tokio::net::UnixStream::connect(socket_path).await
} else {
// The path to the socket is too long. Create a symlink to it with a shorter path.
let short_path = std::env::temp_dir().join(format!(
"compute_ctl.short-socket.{}.{}",
std::process::id(),
tokio::task::id()
));
std::os::unix::fs::symlink(&socket_path, &short_path)?;
// Delete the symlink as soon as we have connected to it. There's a small chance
// of leaking if the process dies before we remove it, so try to keep that window
// as small as possible.
scopeguard::defer! {
if let Err(err) = std::fs::remove_file(&short_path) {
tracing::warn!("could not remove symlink \"{}\" created for socket: {}",
short_path.display(), err);
}
}
tracing::info!(
"created symlink \"{}\" for socket \"{}\", opening it now",
short_path.display(),
socket_path.display()
);
tokio::net::UnixStream::connect(&short_path).await
};
let stream = connect_result.context("connecting to communicator control socket")?;
let io = TokioIo::new(stream);
let (request_sender, connection) = hyper::client::conn::http1::handshake(io).await?;
// spawn a task to poll the connection and drive the HTTP state
tokio::spawn(async move {
if let Err(err) = connection.await {
eprintln!("Error in connection: {err}");
}
});
Ok(request_sender)
}


@@ -1,10 +1,18 @@
use std::path::Path;
use std::sync::Arc;
use anyhow::Context;
use axum::body::Body;
use axum::extract::State;
use axum::response::Response;
use http::StatusCode;
use http::header::CONTENT_TYPE;
use http_body_util::BodyExt;
use hyper::{Request, StatusCode};
use metrics::proto::MetricFamily;
use metrics::{Encoder, TextEncoder};
use crate::communicator_socket_client::connect_communicator_socket;
use crate::compute::ComputeNode;
use crate::http::JsonResponse;
use crate::metrics::collect;
@@ -31,3 +39,42 @@ pub(in crate::http) async fn get_metrics() -> Response {
.body(Body::from(buffer))
.unwrap()
}
/// Fetch and forward metrics from the Postgres neon extension's metrics
/// exporter that are used by autoscaling-agent.
///
/// The neon extension exposes these metrics over a Unix domain socket
/// in the data directory. That socket is not directly accessible from the outside
/// world, so compute_ctl exposes this endpoint to forward the metrics.
pub(in crate::http) async fn get_autoscaling_metrics(
State(compute): State<Arc<ComputeNode>>,
) -> Result<Response, Response> {
let pgdata = Path::new(&compute.params.pgdata);
// Connect to the communicator process's metrics socket
let mut metrics_client = connect_communicator_socket(pgdata)
.await
.map_err(|e| JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, format!("{e:#}")))?;
// Make a request for /autoscaling_metrics
let request = Request::builder()
.method("GET")
.uri("/autoscaling_metrics")
.header("Host", "localhost") // hyper requires Host, even though the server won't care
.body(Body::from(""))
.unwrap();
let resp = metrics_client
.send_request(request)
.await
.context("fetching metrics from Postgres metrics service")
.map_err(|e| JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, format!("{e:#}")))?;
// Build a response that just forwards the response we got.
let mut response = Response::builder();
response = response.status(resp.status());
if let Some(content_type) = resp.headers().get(CONTENT_TYPE) {
response = response.header(CONTENT_TYPE, content_type);
}
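// Forward the upstream body as a stream instead of buffering it in memory:
// `into_data_stream()` (from http_body_util::BodyExt) yields the data frames,
// and AxumBody wraps them into an axum-compatible response body.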
let body = tonic::service::AxumBody::from_stream(resp.into_body().into_data_stream());
Ok(response.body(body).unwrap())
}


@@ -81,8 +81,12 @@ impl From<&Server> for Router<Arc<ComputeNode>> {
Server::External {
config, compute_id, ..
} => {
let unauthenticated_router =
Router::<Arc<ComputeNode>>::new().route("/metrics", get(metrics::get_metrics));
let unauthenticated_router = Router::<Arc<ComputeNode>>::new()
.route("/metrics", get(metrics::get_metrics))
.route(
"/autoscaling_metrics",
get(metrics::get_autoscaling_metrics),
);
let authenticated_router = Router::<Arc<ComputeNode>>::new()
.route("/lfc/prewarm", get(lfc::prewarm_state).post(lfc::prewarm))


@@ -4,6 +4,7 @@
#![deny(clippy::undocumented_unsafe_blocks)]
pub mod checker;
pub mod communicator_socket_client;
pub mod config;
pub mod configurator;
pub mod http;


@@ -13,7 +13,9 @@ use tracing_subscriber::prelude::*;
/// set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`. See
/// `tracing-utils` package description.
///
pub async fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> {
pub fn init_tracing_and_logging(
default_log_level: &str,
) -> anyhow::Result<Option<tracing_utils::Provider>> {
// Initialize Logging
let env_filter = tracing_subscriber::EnvFilter::try_from_default_env()
.unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_log_level));
@@ -24,8 +26,9 @@ pub async fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result
.with_writer(std::io::stderr);
// Initialize OpenTelemetry
let otlp_layer =
tracing_utils::init_tracing("compute_ctl", tracing_utils::ExportConfig::default()).await;
let provider =
tracing_utils::init_tracing("compute_ctl", tracing_utils::ExportConfig::default());
let otlp_layer = provider.as_ref().map(tracing_utils::layer);
// Put it all together
tracing_subscriber::registry()
@@ -37,7 +40,7 @@ pub async fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result
utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
Ok(())
Ok(provider)
}
/// Replace all newline characters with a special character to make it


@@ -407,6 +407,12 @@ struct StorageControllerStartCmdArgs {
help = "Base port for the storage controller instance idenfified by instance-id (defaults to pageserver cplane api)"
)]
base_port: Option<u16>,
#[clap(
long,
help = "Whether the storage controller should handle pageserver-reported local disk loss events."
)]
handle_ps_local_disk_loss: Option<bool>,
}
#[derive(clap::Args)]
@@ -1823,6 +1829,7 @@ async fn handle_storage_controller(
instance_id: args.instance_id,
base_port: args.base_port,
start_timeout: args.start_timeout,
handle_ps_local_disk_loss: args.handle_ps_local_disk_loss,
};
if let Err(e) = svc.start(start_args).await {


@@ -56,6 +56,7 @@ pub struct NeonStorageControllerStartArgs {
pub instance_id: u8,
pub base_port: Option<u16>,
pub start_timeout: humantime::Duration,
pub handle_ps_local_disk_loss: Option<bool>,
}
impl NeonStorageControllerStartArgs {
@@ -64,6 +65,7 @@ impl NeonStorageControllerStartArgs {
instance_id: 1,
base_port: None,
start_timeout,
handle_ps_local_disk_loss: None,
}
}
}
@@ -669,6 +671,10 @@ impl StorageController {
println!("Starting storage controller at {scheme}://{host}:{listen_port}");
if start_args.handle_ps_local_disk_loss.unwrap_or_default() {
args.push("--handle-ps-local-disk-loss".to_string());
}
background_process::start_process(
COMMAND,
&instance_dir,


@@ -394,7 +394,7 @@ impl From<&OtelExporterConfig> for tracing_utils::ExportConfig {
tracing_utils::ExportConfig {
endpoint: Some(val.endpoint.clone()),
protocol: val.protocol.into(),
timeout: val.timeout,
timeout: Some(val.timeout),
}
}
}


@@ -21,6 +21,14 @@ pub struct ReAttachRequest {
/// if the node already has a node_id set.
#[serde(skip_serializing_if = "Option::is_none", default)]
pub register: Option<NodeRegisterRequest>,
/// Hadron: Optional flag to indicate whether the node is starting with an empty local disk.
/// Will be set to true if the node couldn't find any local tenant data on startup, which could
/// be due to the node starting for the first time or to a local SSD failure/disk wipe event.
/// The flag may be used by the storage controller to update its observed state of the world
/// to make sure that it sends explicit location_config calls to the node following the
/// re-attach request.
pub empty_local_disk: Option<bool>,
}
#[derive(Serialize, Deserialize, Debug)]


@@ -1,11 +1,5 @@
//! Helper functions to set up OpenTelemetry tracing.
//!
//! This comes in two variants, depending on whether you have a Tokio runtime available.
//! If you do, call `init_tracing()`. It sets up the trace processor and exporter to use
//! the current tokio runtime. If you don't have a runtime available, or you don't want
//! to share the runtime with the tracing tasks, call `init_tracing_without_runtime()`
//! instead. It sets up a dedicated single-threaded Tokio runtime for the tracing tasks.
//!
//! Example:
//!
//! ```rust,no_run
@@ -21,7 +15,8 @@
//! .with_writer(std::io::stderr);
//!
//! // Initialize OpenTelemetry. Exports tracing spans as OpenTelemetry traces
//! let otlp_layer = tracing_utils::init_tracing("my_application", tracing_utils::ExportConfig::default()).await;
//! let provider = tracing_utils::init_tracing("my_application", tracing_utils::ExportConfig::default());
//! let otlp_layer = provider.as_ref().map(tracing_utils::layer);
//!
//! // Put it all together
//! tracing_subscriber::registry()
@@ -36,16 +31,18 @@
pub mod http;
pub mod perf_span;
use opentelemetry::KeyValue;
use opentelemetry::trace::TracerProvider;
use opentelemetry_otlp::WithExportConfig;
pub use opentelemetry_otlp::{ExportConfig, Protocol};
use opentelemetry_sdk::trace::SdkTracerProvider;
use tracing::level_filters::LevelFilter;
use tracing::{Dispatch, Subscriber};
use tracing_subscriber::Layer;
use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::registry::LookupSpan;
pub type Provider = SdkTracerProvider;
/// Set up OpenTelemetry exporter, using configuration from environment variables.
///
/// `service_name` is set as the OpenTelemetry 'service.name' resource (see
@@ -70,16 +67,7 @@ use tracing_subscriber::registry::LookupSpan;
/// If you need some other setting, please test if it works first. And perhaps
/// add a comment in the list above to save the effort of testing for the next
/// person.
///
/// This doesn't block, but is marked as 'async' to hint that this must be called in
/// asynchronous execution context.
pub async fn init_tracing<S>(
service_name: &str,
export_config: ExportConfig,
) -> Option<impl Layer<S>>
where
S: Subscriber + for<'span> LookupSpan<'span>,
{
pub fn init_tracing(service_name: &str, export_config: ExportConfig) -> Option<Provider> {
if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) {
return None;
};
@@ -89,52 +77,14 @@ where
))
}
/// Like `init_tracing`, but creates a separate tokio Runtime for the tracing
/// tasks.
pub fn init_tracing_without_runtime<S>(
service_name: &str,
export_config: ExportConfig,
) -> Option<impl Layer<S>>
pub fn layer<S>(p: &Provider) -> impl Layer<S>
where
S: Subscriber + for<'span> LookupSpan<'span>,
{
if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) {
return None;
};
// The opentelemetry batch processor and the OTLP exporter needs a Tokio
// runtime. Create a dedicated runtime for them. One thread should be
// enough.
//
// (Alternatively, instead of batching, we could use the "simple
// processor", which doesn't need Tokio, and use "reqwest-blocking"
// feature for the OTLP exporter, which also doesn't need Tokio. However,
// batching is considered best practice, and also I have the feeling that
// the non-Tokio codepaths in the opentelemetry crate are less used and
// might be more buggy, so better to stay on the well-beaten path.)
//
// We leak the runtime so that it keeps running after we exit the
// function.
let runtime = Box::leak(Box::new(
tokio::runtime::Builder::new_multi_thread()
.enable_all()
.thread_name("otlp runtime thread")
.worker_threads(1)
.build()
.unwrap(),
));
let _guard = runtime.enter();
Some(init_tracing_internal(
service_name.to_string(),
export_config,
))
tracing_opentelemetry::layer().with_tracer(p.tracer("global"))
}
fn init_tracing_internal<S>(service_name: String, export_config: ExportConfig) -> impl Layer<S>
where
S: Subscriber + for<'span> LookupSpan<'span>,
{
fn init_tracing_internal(service_name: String, export_config: ExportConfig) -> Provider {
// Sets up exporter from the provided [`ExportConfig`] parameter.
// If the endpoint is not specified, it is loaded from the
// OTEL_EXPORTER_OTLP_ENDPOINT environment variable.
@@ -153,22 +103,14 @@ where
opentelemetry_sdk::propagation::TraceContextPropagator::new(),
);
let tracer = opentelemetry_sdk::trace::TracerProvider::builder()
.with_batch_exporter(exporter, opentelemetry_sdk::runtime::Tokio)
.with_resource(opentelemetry_sdk::Resource::new(vec![KeyValue::new(
opentelemetry_semantic_conventions::resource::SERVICE_NAME,
service_name,
)]))
Provider::builder()
.with_batch_exporter(exporter)
.with_resource(
opentelemetry_sdk::Resource::builder()
.with_service_name(service_name)
.build(),
)
.build()
.tracer("global");
tracing_opentelemetry::layer().with_tracer(tracer)
}
// Shutdown trace pipeline gracefully, so that it has a chance to send any
// pending traces before we exit.
pub fn shutdown_tracing() {
opentelemetry::global::shutdown_tracer_provider();
}
pub enum OtelEnablement {
@@ -176,17 +118,17 @@ pub enum OtelEnablement {
Enabled {
service_name: String,
export_config: ExportConfig,
runtime: &'static tokio::runtime::Runtime,
},
}
pub struct OtelGuard {
provider: Provider,
pub dispatch: Dispatch,
}
impl Drop for OtelGuard {
fn drop(&mut self) {
shutdown_tracing();
_ = self.provider.shutdown();
}
}
@@ -199,22 +141,19 @@ impl Drop for OtelGuard {
/// The lifetime of the guard should match that of the application. On drop, it tears down the
/// OTEL infra.
pub fn init_performance_tracing(otel_enablement: OtelEnablement) -> Option<OtelGuard> {
let otel_subscriber = match otel_enablement {
match otel_enablement {
OtelEnablement::Disabled => None,
OtelEnablement::Enabled {
service_name,
export_config,
runtime,
} => {
let otel_layer = runtime
.block_on(init_tracing(&service_name, export_config))
.with_filter(LevelFilter::INFO);
let provider = init_tracing(&service_name, export_config)?;
let otel_layer = layer(&provider).with_filter(LevelFilter::INFO);
let otel_subscriber = tracing_subscriber::registry().with(otel_layer);
let otel_dispatch = Dispatch::new(otel_subscriber);
let dispatch = Dispatch::new(otel_subscriber);
Some(otel_dispatch)
Some(OtelGuard { dispatch, provider })
}
};
otel_subscriber.map(|dispatch| OtelGuard { dispatch })
}
}
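For reference, here is a minimal sketch (not part of this commit) of the initialization and shutdown pattern that callers such as compute_ctl and the pageserver follow after this change; the service name and the surrounding `main` are illustrative:

```rust
use tracing_subscriber::prelude::*;

fn main() {
    // init_tracing() is now a plain synchronous function; it returns None when
    // OTEL_SDK_DISABLED=true, otherwise the SdkTracerProvider.
    let provider =
        tracing_utils::init_tracing("my_service", tracing_utils::ExportConfig::default());

    // Build the OTLP layer from the provider. Option<Layer> is itself a Layer,
    // so this works whether or not tracing is enabled.
    let otlp_layer = provider.as_ref().map(tracing_utils::layer);
    tracing_subscriber::registry().with(otlp_layer).init();

    // ... run the application ...

    // On exit, shut the provider down explicitly to flush any pending spans.
    if let Some(p) = provider {
        let _ = p.shutdown();
    }
}
```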


@@ -126,7 +126,6 @@ fn main() -> anyhow::Result<()> {
Some(cfg) => tracing_utils::OtelEnablement::Enabled {
service_name: "pageserver".to_string(),
export_config: (&cfg.export_config).into(),
runtime: *COMPUTE_REQUEST_RUNTIME,
},
None => tracing_utils::OtelEnablement::Disabled,
};


@@ -42,6 +42,7 @@ pub trait StorageControllerUpcallApi {
fn re_attach(
&self,
conf: &PageServerConf,
empty_local_disk: bool,
) -> impl Future<
Output = Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError>,
> + Send;
@@ -155,6 +156,7 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
async fn re_attach(
&self,
conf: &PageServerConf,
empty_local_disk: bool,
) -> Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError> {
let url = self
.base_url
@@ -226,6 +228,7 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
let request = ReAttachRequest {
node_id: self.node_id,
register: register.clone(),
empty_local_disk: Some(empty_local_disk),
};
let response: ReAttachResponse = self


@@ -768,6 +768,7 @@ mod test {
async fn re_attach(
&self,
_conf: &PageServerConf,
_empty_local_disk: bool,
) -> Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError> {
unimplemented!()
}


@@ -352,7 +352,8 @@ async fn init_load_generations(
let client = StorageControllerUpcallClient::new(conf, cancel);
info!("Calling {} API to re-attach tenants", client.base_url());
// If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
match client.re_attach(conf).await {
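// Report whether we found any tenant configs on local disk; the storage controller
// can use `empty_local_disk` to detect a possible local disk loss event
// (see `ReAttachRequest::empty_local_disk`).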
let empty_local_disk = tenant_confs.is_empty();
match client.re_attach(conf, empty_local_disk).await {
Ok(tenants) => tenants
.into_iter()
.flat_map(|(id, rart)| {


@@ -449,6 +449,7 @@ pub struct Timeline {
/// A channel to send async requests to prepare a basebackup for the basebackup cache.
basebackup_cache: Arc<BasebackupCache>,
#[expect(dead_code)]
feature_resolver: Arc<TenantFeatureResolver>,
}


@@ -1326,13 +1326,7 @@ impl Timeline {
.max()
};
let (partition_mode, partition_lsn) = if cfg!(test)
|| cfg!(feature = "testing")
|| self
.feature_resolver
.evaluate_boolean("image-compaction-boundary")
.is_ok()
{
let (partition_mode, partition_lsn) = {
let last_repartition_lsn = self.partitioning.read().1;
let lsn = match l0_l1_boundary_lsn {
Some(boundary) => gc_cutoff
@@ -1348,8 +1342,6 @@ impl Timeline {
} else {
("l0_l1_boundary", lsn)
}
} else {
("latest_record", self.get_last_record_lsn())
};
// 2. Repartition and create image layers if necessary


@@ -5,6 +5,7 @@ MODULE_big = neon
OBJS = \
$(WIN32RES) \
communicator.o \
communicator_process.o \
extension_server.o \
file_cache.o \
hll.o \
@@ -29,6 +30,11 @@ PG_CPPFLAGS = -I$(libpq_srcdir)
SHLIB_LINK_INTERNAL = $(libpq)
SHLIB_LINK = -lcurl
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S), Darwin)
SHLIB_LINK += -framework Security -framework CoreFoundation -framework SystemConfiguration
endif
EXTENSION = neon
DATA = \
neon--1.0.sql \
@@ -57,7 +63,8 @@ WALPROP_OBJS = \
# libcommunicator.a is built by cargo from the Rust sources under communicator/
# subdirectory. `cargo build` also generates communicator_bindings.h.
neon.o: communicator/communicator_bindings.h
communicator_process.o: communicator/communicator_bindings.h
file_cache.o: communicator/communicator_bindings.h
$(NEON_CARGO_ARTIFACT_TARGET_DIR)/libcommunicator.a communicator/communicator_bindings.h &:
(cd $(srcdir)/communicator && cargo build $(CARGO_BUILD_FLAGS) $(CARGO_PROFILE))


@@ -1820,12 +1820,12 @@ nm_to_string(NeonMessage *msg)
}
case T_NeonGetPageResponse:
{
#if 0
NeonGetPageResponse *msg_resp = (NeonGetPageResponse *) msg;
#endif
appendStringInfoString(&s, "{\"type\": \"NeonGetPageResponse\"");
appendStringInfo(&s, ", \"page\": \"XXX\"}");
appendStringInfo(&s, ", \"rinfo\": %u/%u/%u", RelFileInfoFmt(msg_resp->req.rinfo));
appendStringInfo(&s, ", \"forknum\": %d", msg_resp->req.forknum);
appendStringInfo(&s, ", \"blkno\": %u", msg_resp->req.blkno);
appendStringInfoChar(&s, '}');
break;
}


@@ -16,7 +16,14 @@ testing = []
rest_broker = []
[dependencies]
neon-shmem.workspace = true
axum.workspace = true
http.workspace = true
tokio = { workspace = true, features = ["macros", "net", "io-util", "rt", "rt-multi-thread"] }
tracing.workspace = true
tracing-subscriber.workspace = true
measured.workspace = true
utils.workspace = true
workspace_hack = { version = "0.1", path = "../../../workspace_hack" }
[build-dependencies]


@@ -1,7 +1,22 @@
This package will evolve into a "compute-pageserver communicator"
process and machinery. For now, it's just a dummy that doesn't do
anything interesting, but it allows us to test the compilation and
linking of Rust code into the Postgres extensions.
# Communicator
This package provides the so-called "compute-pageserver communicator",
or just "communicator" for short. The communicator is a separate
background worker process that runs in the PostgreSQL server. It's
part of the neon extension. Currently, it only provides an HTTP
endpoint for metrics, but in the future it will evolve to handle all
communications with the pageservers.
## Source code view
pgxn/neon/communicator_process.c
Contains code needed to start up the communicator process, and
the glue that interacts with PostgreSQL code and the Rust
code in the communicator process.
pgxn/neon/communicator/src/worker_process/
Worker process main loop and glue code
At compilation time, pgxn/neon/communicator/ produces a static
library, libcommunicator.a. It is linked to the neon.so extension


@@ -1,6 +1,5 @@
/// dummy function, just to test linking Rust functions into the C
/// extension
#[unsafe(no_mangle)]
pub extern "C" fn communicator_dummy(arg: u32) -> u32 {
arg + 1
}
mod worker_process;
/// Name of the Unix Domain Socket that serves the metrics (and other APIs in the
/// future). The socket lives within the Postgres data directory.
const NEON_COMMUNICATOR_SOCKET_NAME: &str = "neon-communicator.socket";


@@ -0,0 +1,51 @@
//! C callbacks to PostgreSQL facilities that the neon extension needs to provide. These
//! are implemented in `pgxn/neon/communicator_process.c`. The function signatures here
//! must match the definitions there!
//!
//! These are called from the communicator threads! Be careful what you do: most Postgres
//! functions are not safe to call in that context.
#[cfg(not(test))]
unsafe extern "C" {
pub fn callback_set_my_latch_unsafe();
pub fn callback_get_lfc_metrics_unsafe() -> LfcMetrics;
}
// Compile unit tests with dummy versions of the functions. Unit tests cannot call back
// into the C code. (As of this writing, no unit tests even exist in the communicator
// package, but the code coverage build still builds these and tries to link with the
// external C code.)
#[cfg(test)]
unsafe fn callback_set_my_latch_unsafe() {
panic!("not usable in unit tests");
}
#[cfg(test)]
unsafe fn callback_get_lfc_metrics_unsafe() -> LfcMetrics {
panic!("not usable in unit tests");
}
// safe wrappers
pub(super) fn callback_set_my_latch() {
unsafe { callback_set_my_latch_unsafe() };
}
pub(super) fn callback_get_lfc_metrics() -> LfcMetrics {
unsafe { callback_get_lfc_metrics_unsafe() }
}
/// Return type of the callback_get_lfc_metrics() function.
#[repr(C)]
pub struct LfcMetrics {
pub lfc_cache_size_limit: i64,
pub lfc_hits: i64,
pub lfc_misses: i64,
pub lfc_used: i64,
pub lfc_writes: i64,
// Working set size, looking back 1..60 minutes.
//
// Index 0 is the size of the working set accessed within the last 1 minute,
// index 59 is the size of the working set accessed within the last 60 minutes.
pub lfc_approximate_working_set_size_windows: [i64; 60],
}


@@ -0,0 +1,102 @@
//! Communicator control socket.
//!
//! Currently, the control socket is used to provide information about the communicator
//! process, the file cache, etc. as Prometheus metrics. In the future, it can be used to
//! expose more things.
//!
//! The exporter speaks HTTP and listens on a Unix Domain Socket under the Postgres
//! data directory. For debugging, you can access it with curl:
//!
//! ```sh
//! curl --unix-socket neon-communicator.socket http://localhost/metrics
//! ```
//!
use axum::Router;
use axum::body::Body;
use axum::extract::State;
use axum::response::Response;
use http::StatusCode;
use http::header::CONTENT_TYPE;
use measured::MetricGroup;
use measured::text::BufferedTextEncoder;
use std::io::ErrorKind;
use tokio::net::UnixListener;
use crate::NEON_COMMUNICATOR_SOCKET_NAME;
use crate::worker_process::main_loop::CommunicatorWorkerProcessStruct;
impl CommunicatorWorkerProcessStruct {
/// Launch the listener
pub(crate) async fn launch_control_socket_listener(
&'static self,
) -> Result<(), std::io::Error> {
use axum::routing::get;
let app = Router::new()
.route("/metrics", get(get_metrics))
.route("/autoscaling_metrics", get(get_autoscaling_metrics))
.route("/debug/panic", get(handle_debug_panic))
.with_state(self);
// If the server is restarted, there might be an old socket still
// lying around. Remove it first.
match std::fs::remove_file(NEON_COMMUNICATOR_SOCKET_NAME) {
Ok(()) => {
tracing::warn!("removed stale control socket");
}
Err(e) if e.kind() == ErrorKind::NotFound => {}
Err(e) => {
tracing::error!("could not remove stale control socket: {e:#}");
// Try to proceed anyway. It will likely fail below though.
}
};
// Create the unix domain socket and start listening on it
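// NEON_COMMUNICATOR_SOCKET_NAME is a relative path: Postgres runs with the data
// directory as its current directory, so the socket ends up inside the data directory.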
let listener = UnixListener::bind(NEON_COMMUNICATOR_SOCKET_NAME)?;
tokio::spawn(async {
tracing::info!("control socket listener spawned");
axum::serve(listener, app)
.await
.expect("axum::serve never returns")
});
Ok(())
}
}
/// Expose all Prometheus metrics.
async fn get_metrics(State(state): State<&CommunicatorWorkerProcessStruct>) -> Response {
tracing::trace!("/metrics requested");
metrics_to_response(&state).await
}
/// Expose Prometheus metrics, for use by the autoscaling agent.
///
/// This is a subset of all the metrics.
async fn get_autoscaling_metrics(
State(state): State<&CommunicatorWorkerProcessStruct>,
) -> Response {
tracing::trace!("/metrics requested");
metrics_to_response(&state.lfc_metrics).await
}
async fn handle_debug_panic(State(_state): State<&CommunicatorWorkerProcessStruct>) -> Response {
panic!("test HTTP handler task panic");
}
/// Helper function to convert prometheus metrics to a text response
async fn metrics_to_response(metrics: &(dyn MetricGroup<BufferedTextEncoder> + Sync)) -> Response {
let mut enc = BufferedTextEncoder::new();
metrics
.collect_group_into(&mut enc)
.unwrap_or_else(|never| match never {});
Response::builder()
.status(StatusCode::OK)
.header(CONTENT_TYPE, "application/text")
.body(Body::from(enc.finish()))
.unwrap()
}


@@ -0,0 +1,83 @@
use measured::{
FixedCardinalityLabel, Gauge, GaugeVec, LabelGroup, MetricGroup,
label::{LabelName, LabelValue, StaticLabelSet},
metric::{MetricEncoding, gauge::GaugeState, group::Encoding},
};
use super::callbacks::callback_get_lfc_metrics;
pub(crate) struct LfcMetricsCollector;
#[derive(MetricGroup)]
#[metric(new())]
struct LfcMetricsGroup {
/// LFC cache size limit in bytes
lfc_cache_size_limit: Gauge,
/// LFC cache hits
lfc_hits: Gauge,
/// LFC cache misses
lfc_misses: Gauge,
/// LFC chunks used (chunk = 1MB)
lfc_used: Gauge,
/// LFC cache writes
lfc_writes: Gauge,
/// Approximate working set size in pages of 8192 bytes
#[metric(init = GaugeVec::dense())]
lfc_approximate_working_set_size_windows: GaugeVec<StaticLabelSet<MinuteAsSeconds>>,
}
impl<T: Encoding> MetricGroup<T> for LfcMetricsCollector
where
GaugeState: MetricEncoding<T>,
{
fn collect_group_into(&self, enc: &mut T) -> Result<(), <T as Encoding>::Err> {
let g = LfcMetricsGroup::new();
let lfc_metrics = callback_get_lfc_metrics();
g.lfc_cache_size_limit.set(lfc_metrics.lfc_cache_size_limit);
g.lfc_hits.set(lfc_metrics.lfc_hits);
g.lfc_misses.set(lfc_metrics.lfc_misses);
g.lfc_used.set(lfc_metrics.lfc_used);
g.lfc_writes.set(lfc_metrics.lfc_writes);
for i in 0..60 {
let val = lfc_metrics.lfc_approximate_working_set_size_windows[i];
g.lfc_approximate_working_set_size_windows
.set(MinuteAsSeconds(i), val);
}
g.collect_group_into(enc)
}
}
/// This stores the values in the range 0..60 and
/// encodes them as seconds (60, 120, 180, ..., 3600).
#[derive(Clone, Copy)]
struct MinuteAsSeconds(usize);
impl FixedCardinalityLabel for MinuteAsSeconds {
fn cardinality() -> usize {
60
}
fn encode(&self) -> usize {
self.0
}
fn decode(value: usize) -> Self {
Self(value)
}
}
impl LabelValue for MinuteAsSeconds {
fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
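// Index i covers the trailing (i + 1) minutes; report it in seconds, so
// index 0 is labelled 60 and index 59 is labelled 3600.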
v.write_int((self.0 + 1) as i64 * 60)
}
}
impl LabelGroup for MinuteAsSeconds {
fn visit_values(&self, v: &mut impl measured::label::LabelGroupVisitor) {
v.write_value(LabelName::from_str("duration_seconds"), self);
}
}


@@ -0,0 +1,250 @@
//! Glue code to hook up Rust logging with the `tracing` crate to the PostgreSQL log
//!
//! In the Rust threads, the log messages are written to an mpsc channel, and the Postgres
//! process latch is raised. That wakes up the loop in the main thread, see
//! `communicator_new_bgworker_main()`. It reads the message from the channel and
//! ereport()s it. This ensures that only one thread, the main thread, calls the
//! PostgreSQL logging routines at any time.
use std::ffi::c_char;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::mpsc::sync_channel;
use std::sync::mpsc::{Receiver, SyncSender};
use std::sync::mpsc::{TryRecvError, TrySendError};
use tracing::info;
use tracing::{Event, Level, Metadata, Subscriber};
use tracing_subscriber::filter::LevelFilter;
use tracing_subscriber::fmt::format::Writer;
use tracing_subscriber::fmt::{FmtContext, FormatEvent, FormatFields, FormattedFields, MakeWriter};
use tracing_subscriber::registry::LookupSpan;
use crate::worker_process::callbacks::callback_set_my_latch;
/// This handle is passed to the C code, and used by [`communicator_worker_poll_logging`]
pub struct LoggingReceiver {
receiver: Receiver<FormattedEventWithMeta>,
}
/// This is passed to `tracing`
struct LoggingSender {
sender: SyncSender<FormattedEventWithMeta>,
}
static DROPPED_EVENT_COUNT: AtomicU64 = AtomicU64::new(0);
/// Called once, at worker process startup. The returned `LoggingReceiver` is passed back
/// in subsequent calls to `communicator_worker_poll_logging`. It is opaque to the C code.
#[unsafe(no_mangle)]
pub extern "C" fn communicator_worker_configure_logging() -> Box<LoggingReceiver> {
let (sender, receiver) = sync_channel(1000);
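// Bounded channel: if the main thread falls behind, send_event() drops messages
// instead of blocking the tokio threads (see DROPPED_EVENT_COUNT below).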
let receiver = LoggingReceiver { receiver };
let sender = LoggingSender { sender };
use tracing_subscriber::prelude::*;
let r = tracing_subscriber::registry();
let r = r.with(
tracing_subscriber::fmt::layer()
.with_ansi(false)
.event_format(SimpleFormatter)
.with_writer(sender)
// TODO: derive this from log_min_messages? Currently the code in
// communicator_process.c forces log_min_messages='INFO'.
.with_filter(LevelFilter::from_level(Level::INFO)),
);
r.init();
info!("communicator process logging started");
Box::new(receiver)
}
/// Read one message from the logging queue. This is essentially a wrapper around `Receiver`,
/// with a C-friendly signature.
///
/// The message is copied into *errbuf, which is a caller-supplied buffer of size
/// `errbuf_len`. If the message doesn't fit in the buffer, it is truncated. It is always
/// NULL-terminated.
///
/// The error level is returned in *elevel_p. It's one of the PostgreSQL error levels, see
/// elog.h.
///
/// If there was a message, *dropped_event_count_p is also updated with a counter of how
/// many log messages in total have been dropped. By comparing that with the value from the
/// previous call, you can tell how many were dropped since the last call.
///
/// Returns:
///
/// 0 if there were no messages
/// 1 if there was a message. The message and its level are returned in
/// *errbuf and *elevel_p. *dropped_event_count_p is also updated.
/// -1 on error, i.e the other end of the queue was disconnected
#[unsafe(no_mangle)]
pub extern "C" fn communicator_worker_poll_logging(
state: &mut LoggingReceiver,
errbuf: *mut c_char,
errbuf_len: u32,
elevel_p: &mut i32,
dropped_event_count_p: &mut u64,
) -> i32 {
let msg = match state.receiver.try_recv() {
Err(TryRecvError::Empty) => return 0,
Err(TryRecvError::Disconnected) => return -1,
Ok(msg) => msg,
};
let src: &[u8] = &msg.message;
let dst: *mut u8 = errbuf.cast();
let len = std::cmp::min(src.len(), errbuf_len as usize - 1);
unsafe {
std::ptr::copy_nonoverlapping(src.as_ptr(), dst, len);
*(dst.add(len)) = b'\0'; // NULL terminator
}
// Map the tracing Level to PostgreSQL elevel.
//
// XXX: These levels are copied from PostgreSQL's elog.h. Introduce another enum to
// hide these?
*elevel_p = match msg.level {
Level::TRACE => 10, // DEBUG5
Level::DEBUG => 14, // DEBUG1
Level::INFO => 17, // INFO
Level::WARN => 19, // WARNING
Level::ERROR => 21, // ERROR
};
*dropped_event_count_p = DROPPED_EVENT_COUNT.load(Ordering::Relaxed);
1
}
//---- The following functions can be called from any thread ----
#[derive(Clone)]
struct FormattedEventWithMeta {
message: Vec<u8>,
level: tracing::Level,
}
impl Default for FormattedEventWithMeta {
fn default() -> Self {
FormattedEventWithMeta {
message: Vec::new(),
level: tracing::Level::DEBUG,
}
}
}
struct EventBuilder<'a> {
event: FormattedEventWithMeta,
sender: &'a LoggingSender,
}
impl std::io::Write for EventBuilder<'_> {
fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
self.event.message.write(buf)
}
fn flush(&mut self) -> std::io::Result<()> {
self.sender.send_event(self.event.clone());
Ok(())
}
}
impl Drop for EventBuilder<'_> {
fn drop(&mut self) {
let sender = self.sender;
let event = std::mem::take(&mut self.event);
sender.send_event(event);
}
}
impl<'a> MakeWriter<'a> for LoggingSender {
type Writer = EventBuilder<'a>;
fn make_writer(&'a self) -> Self::Writer {
panic!("not expected to be called when make_writer_for is implemented");
}
fn make_writer_for(&'a self, meta: &Metadata<'_>) -> Self::Writer {
EventBuilder {
event: FormattedEventWithMeta {
message: Vec::new(),
level: *meta.level(),
},
sender: self,
}
}
}
impl LoggingSender {
fn send_event(&self, e: FormattedEventWithMeta) {
match self.sender.try_send(e) {
Ok(()) => {
// notify the main thread
callback_set_my_latch();
}
Err(TrySendError::Disconnected(_)) => {}
Err(TrySendError::Full(_)) => {
// The queue is full, cannot send any more. To avoid blocking the tokio
// thread, simply drop the message. Better to lose some logs than get
// stuck if there's a problem with the logging.
//
// Record the fact that a message was dropped by incrementing the
// counter.
DROPPED_EVENT_COUNT.fetch_add(1, Ordering::Relaxed);
}
}
}
}
/// Simple formatter implementation for tracing_subscriber, which prints the log spans and
/// the message like the default formatter, but without a timestamp or error level. The error
/// level is captured separately in `FormattedEventWithMeta`, and when the event is
/// printed by the main thread with PostgreSQL ereport(), it gets a timestamp at that
/// point. (The printed timestamp will therefore lag behind the timestamp of the event
/// here, if the main thread doesn't process the log message promptly.)
struct SimpleFormatter;
impl<S, N> FormatEvent<S, N> for SimpleFormatter
where
S: Subscriber + for<'a> LookupSpan<'a>,
N: for<'a> FormatFields<'a> + 'static,
{
fn format_event(
&self,
ctx: &FmtContext<'_, S, N>,
mut writer: Writer<'_>,
event: &Event<'_>,
) -> std::fmt::Result {
// Format all the spans in the event's span context.
if let Some(scope) = ctx.event_scope() {
for span in scope.from_root() {
write!(writer, "{}", span.name())?;
// `FormattedFields` is a formatted representation of the span's fields,
// which is stored in its extensions by the `fmt` layer's `new_span`
// method. The fields will have been formatted by the same field formatter
// that's provided to the event formatter in the `FmtContext`.
let ext = span.extensions();
let fields = &ext
.get::<FormattedFields<N>>()
.expect("will never be `None`");
// Skip formatting the fields if the span had no fields.
if !fields.is_empty() {
write!(writer, "{{{fields}}}")?;
}
write!(writer, ": ")?;
}
}
// Write fields on the event
ctx.field_format().format_fields(writer.by_ref(), event)?;
Ok(())
}
}


@@ -0,0 +1,66 @@
use std::str::FromStr as _;
use crate::worker_process::lfc_metrics::LfcMetricsCollector;
use measured::MetricGroup;
use measured::metric::MetricEncoding;
use measured::metric::gauge::GaugeState;
use measured::metric::group::Encoding;
use utils::id::{TenantId, TimelineId};
pub struct CommunicatorWorkerProcessStruct {
runtime: tokio::runtime::Runtime,
/*** Metrics ***/
pub(crate) lfc_metrics: LfcMetricsCollector,
}
/// Launch the communicator process's Rust subsystems
pub(super) fn init(
tenant_id: Option<&str>,
timeline_id: Option<&str>,
) -> Result<&'static CommunicatorWorkerProcessStruct, String> {
// The caller validated these already
let _tenant_id = tenant_id
.map(TenantId::from_str)
.transpose()
.map_err(|e| format!("invalid tenant ID: {e}"))?;
let _timeline_id = timeline_id
.map(TimelineId::from_str)
.transpose()
.map_err(|e| format!("invalid timeline ID: {e}"))?;
let runtime = tokio::runtime::Builder::new_multi_thread()
.enable_all()
.thread_name("communicator thread")
.build()
.unwrap();
let worker_struct = CommunicatorWorkerProcessStruct {
// Note: it's important to not drop the runtime, or all the tasks are dropped
// too. Including it in the returned struct is one way to keep it around.
runtime,
// metrics
lfc_metrics: LfcMetricsCollector,
};
let worker_struct = Box::leak(Box::new(worker_struct));
// Start the listener on the control socket
worker_struct
.runtime
.block_on(worker_struct.launch_control_socket_listener())
.map_err(|e| e.to_string())?;
Ok(worker_struct)
}
impl<T> MetricGroup<T> for CommunicatorWorkerProcessStruct
where
T: Encoding,
GaugeState: MetricEncoding<T>,
{
fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
self.lfc_metrics.collect_group_into(enc)
}
}
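As an aside on the note above about not dropping the runtime: the Box::leak trick hands the C caller a &'static handle while keeping the runtime alive for the whole process. A minimal standalone sketch of that pattern (names are illustrative, not the ones in this file; assumes tokio = { version = "1", features = ["full"] }):

struct WorkerHandle {
    runtime: tokio::runtime::Runtime,
}

fn launch() -> &'static WorkerHandle {
    let runtime = tokio::runtime::Builder::new_multi_thread()
        .enable_all()
        .build()
        .expect("failed to build tokio runtime");

    // Dropping the runtime would tear down all spawned tasks, so leak the
    // struct instead: the process exits before it would ever be freed.
    Box::leak(Box::new(WorkerHandle { runtime }))
}

fn main() {
    let handle = launch();
    handle.runtime.block_on(async {
        println!("background subsystems running");
    });
}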

View File

@@ -0,0 +1,13 @@
//! This code runs in the communicator worker process. It provides
//! the glue code to:
//!
//! - launch the main loop,
//! - receive IO requests from backends and process them,
//! - write results back to backends.
mod callbacks;
mod control_socket;
mod lfc_metrics;
mod logging;
mod main_loop;
mod worker_interface;

View File

@@ -0,0 +1,60 @@
//! Functions called from the C code in the worker process
use std::ffi::{CStr, CString, c_char};
use crate::worker_process::main_loop;
use crate::worker_process::main_loop::CommunicatorWorkerProcessStruct;
/// Launch the communicator's tokio tasks, which do most of the work.
///
/// The caller has initialized the process as a regular PostgreSQL background worker
/// process.
///
/// Inputs:
/// `tenant_id` and `timeline_id` can be NULL, if we've been launched in "non-Neon" mode,
/// where we use local storage instead of connecting to remote neon storage. That's
/// currently only used in some unit tests.
///
/// Result:
/// Returns a pointer to CommunicatorWorkerProcessStruct, which is a handle to running
/// Rust tasks. The C code can use it to interact with the Rust parts. On failure, returns
/// None/NULL, and an error message is returned in *error_p
///
/// This is called only once in the process, so the returned struct, and error message in
/// case of failure, are simply leaked.
#[unsafe(no_mangle)]
pub extern "C" fn communicator_worker_launch(
tenant_id: *const c_char,
timeline_id: *const c_char,
error_p: *mut *const c_char,
) -> Option<&'static CommunicatorWorkerProcessStruct> {
// Convert the arguments into more convenient Rust types
let tenant_id = if tenant_id.is_null() {
None
} else {
let cstr = unsafe { CStr::from_ptr(tenant_id) };
Some(cstr.to_str().expect("assume UTF-8"))
};
let timeline_id = if timeline_id.is_null() {
None
} else {
let cstr = unsafe { CStr::from_ptr(timeline_id) };
Some(cstr.to_str().expect("assume UTF-8"))
};
// The `init` function does all the work.
let result = main_loop::init(tenant_id, timeline_id);
// On failure, return the error message to the C caller in *error_p.
match result {
Ok(worker_struct) => Some(worker_struct),
Err(errmsg) => {
let errmsg = CString::new(errmsg).expect("no nuls within error message");
let errmsg = Box::leak(errmsg.into_boxed_c_str());
let p: *const c_char = errmsg.as_ptr();
unsafe { *error_p = p };
None
}
}
}

View File

@@ -0,0 +1,273 @@
/*-------------------------------------------------------------------------
*
* communicator_process.c
* Functions for starting up the communicator background worker process.
*
* Currently, the communicator process only functions as a metrics
* exporter. It provides an HTTP endpoint for polling a limited set of
* metrics. TODO: In the future, it will do much more, i.e. handle all
* the communications with the pageservers.
*
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <unistd.h>
#include "miscadmin.h"
#include "postmaster/bgworker.h"
#include "postmaster/interrupt.h"
#include "postmaster/postmaster.h"
#include "replication/walsender.h"
#include "storage/ipc.h"
#include "storage/latch.h"
#include "storage/pmsignal.h"
#include "storage/procsignal.h"
#include "tcop/tcopprot.h"
#include "utils/timestamp.h"
#include "communicator_process.h"
#include "file_cache.h"
#include "neon.h"
#include "neon_perf_counters.h"
/* the rust bindings, generated by cbindgen */
#include "communicator/communicator_bindings.h"
static void pump_logging(struct LoggingReceiver *logging);
PGDLLEXPORT void communicator_new_bgworker_main(Datum main_arg);
/**** Initialization functions. These run in postmaster ****/
void
pg_init_communicator_process(void)
{
BackgroundWorker bgw;
/* Initialize the background worker process */
memset(&bgw, 0, sizeof(bgw));
bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
bgw.bgw_start_time = BgWorkerStart_PostmasterStart;
snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon");
snprintf(bgw.bgw_function_name, BGW_MAXLEN, "communicator_new_bgworker_main");
snprintf(bgw.bgw_name, BGW_MAXLEN, "Storage communicator process");
snprintf(bgw.bgw_type, BGW_MAXLEN, "Storage communicator process");
bgw.bgw_restart_time = 5;
bgw.bgw_notify_pid = 0;
bgw.bgw_main_arg = (Datum) 0;
RegisterBackgroundWorker(&bgw);
}
/**** Worker process functions. These run in the communicator worker process ****/
/*
* Entry point for the communicator bgworker process
*/
void
communicator_new_bgworker_main(Datum main_arg)
{
struct LoggingReceiver *logging;
const char *errmsg = NULL;
const struct CommunicatorWorkerProcessStruct *proc_handle;
/*
* Pretend that this process is a WAL sender. That affects the shutdown
* sequence: WAL senders are shut down last, after the final checkpoint
* has been written. That's what we want for the communicator process too.
*/
am_walsender = true;
MarkPostmasterChildWalSender();
/* Establish signal handlers. */
pqsignal(SIGUSR1, procsignal_sigusr1_handler);
/*
* Postmaster sends us SIGUSR2 when all regular backends and bgworkers
* have exited, and it's time for us to exit too
*/
pqsignal(SIGUSR2, die);
pqsignal(SIGHUP, SignalHandlerForConfigReload);
pqsignal(SIGTERM, die);
BackgroundWorkerUnblockSignals();
/*
* By default, INFO messages are not printed to the log. We want
* `tracing::info!` messages emitted from the communicator to be printed,
* however, so increase the log level.
*
* XXX: This overrides any user-set value from the config file. That's not
* great, but on the other hand, there should be little reason for the user to
* control the verbosity of the communicator. It's not too verbose by
* default.
*/
SetConfigOption("log_min_messages", "INFO", PGC_SUSET, PGC_S_OVERRIDE);
logging = communicator_worker_configure_logging();
proc_handle = communicator_worker_launch(
neon_tenant[0] == '\0' ? NULL : neon_tenant,
neon_timeline[0] == '\0' ? NULL : neon_timeline,
&errmsg
);
if (proc_handle == NULL)
{
/*
* Something went wrong. Before exiting, forward any log messages that
* might've been generated during the failed launch.
*/
pump_logging(logging);
elog(PANIC, "%s", errmsg);
}
/*
* The Rust tokio runtime has been launched, and it's running in the
* background now. This loop in the main thread handles any interactions
* we need with the rest of PostgreSQL.
*
* NB: This process is now multi-threaded! The Rust threads do not call
* into any Postgres functions, but it's not entirely clear which Postgres
* functions are safe to call from this main thread either. Be very
* careful about adding anything non-trivial here.
*
* Also note that we try to react quickly to any log messages arriving
* from the Rust thread. Be careful to not do anything too expensive here
* that might cause delays.
*/
elog(LOG, "communicator threads started");
for (;;)
{
TimestampTz before;
long duration;
ResetLatch(MyLatch);
/*
* Forward any log messages from the Rust threads into the normal
* Postgres logging facility.
*/
pump_logging(logging);
/*
* Check interrupts like system shutdown or config reload
*
* We mustn't block for too long within this loop, or we risk the log
* queue filling up and messages being lost. Also, even if we can keep
* up, if there's a long delay between sending a message and printing
* it to the log, the timestamps on the messages get skewed, which is
* confusing.
*
* We expect processing interrupts to happen fast enough that it's OK,
* but measure it just in case, and print a warning if it takes longer
* than 100 ms.
*/
#define LOG_SKEW_WARNING_MS 100
before = GetCurrentTimestamp();
CHECK_FOR_INTERRUPTS();
if (ConfigReloadPending)
{
ConfigReloadPending = false;
ProcessConfigFile(PGC_SIGHUP);
}
duration = TimestampDifferenceMilliseconds(before, GetCurrentTimestamp());
if (duration > LOG_SKEW_WARNING_MS)
elog(WARNING, "handling interrupts took %ld ms, communicator log timestamps might be skewed", duration);
/*
* Wait until we are woken up. The rust threads will set the latch
* when there's a log message to forward.
*/
(void) WaitLatch(MyLatch,
WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
0,
PG_WAIT_EXTENSION);
}
}
static void
pump_logging(struct LoggingReceiver *logging)
{
char errbuf[1000];
int elevel;
int32 rc;
static uint64_t last_dropped_event_count = 0;
uint64_t dropped_event_count;
uint64_t dropped_now;
for (;;)
{
rc = communicator_worker_poll_logging(logging,
errbuf,
sizeof(errbuf),
&elevel,
&dropped_event_count);
if (rc == 0)
{
/* nothing to do */
break;
}
else if (rc == 1)
{
/* Because we don't want to exit on error */
if (message_level_is_interesting(elevel))
{
/*
* Prevent interrupts while cleaning up.
*
* (Not sure if this is required, but all the error handlers
* in Postgres that are installed as sigsetjmp() targets do
* this, so let's follow the example)
*/
HOLD_INTERRUPTS();
errstart(elevel, TEXTDOMAIN);
errmsg_internal("[COMMUNICATOR] %s", errbuf);
EmitErrorReport();
FlushErrorState();
/* Now we can allow interrupts again */
RESUME_INTERRUPTS();
}
}
else if (rc == -1)
{
elog(ERROR, "logging channel was closed unexpectedly");
}
}
/*
* If the queue was full at any time since the last time we reported it,
* report how many messages were lost. We do this outside the loop, so
* that if the logging system is clogged, we don't exacerbate it by
* printing lots of warnings about dropped messages.
*/
dropped_now = dropped_event_count - last_dropped_event_count;
if (dropped_now != 0)
{
elog(WARNING, "%lu communicator log messages were dropped because the log buffer was full",
(unsigned long) dropped_now);
last_dropped_event_count = dropped_event_count;
}
}
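The return-code protocol this loop relies on (0 = nothing queued, 1 = one message copied out, -1 = channel closed) maps naturally onto a non-blocking receive on the Rust side. A hedged sketch of that shape, using std::sync::mpsc as a stand-in for whatever channel LoggingReceiver actually wraps:

use std::sync::mpsc::{Receiver, TryRecvError};

// Mirrors the codes expected by pump_logging() above: 0 = empty, 1 = got a
// message, -1 = closed. (The real FFI function also copies the text into a
// caller-provided buffer and reports the elevel and dropped-event count.)
fn poll_one(rx: &Receiver<String>, out: &mut String) -> i32 {
    match rx.try_recv() {
        Ok(msg) => {
            *out = msg;
            1
        }
        Err(TryRecvError::Empty) => 0,
        Err(TryRecvError::Disconnected) => -1,
    }
}

fn main() {
    let (tx, rx) = std::sync::mpsc::channel();
    tx.send("hello from the tokio side".to_string()).unwrap();
    drop(tx);

    let mut buf = String::new();
    assert_eq!(poll_one(&rx, &mut buf), 1);
    assert_eq!(poll_one(&rx, &mut buf), -1); // sender gone => channel closed
    println!("forwarded: {buf}");
}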
/****
* Callbacks from the rust code, in the communicator process.
*
* NOTE: These must be thread-safe! It's very limited which PostgreSQL
* functions you can use!!!
*
* The signatures of these need to match those in the Rust code.
*/
void
callback_set_my_latch_unsafe(void)
{
SetLatch(MyLatch);
}

View File

@@ -0,0 +1,17 @@
/*-------------------------------------------------------------------------
*
* communicator_process.h
* Communicator process
*
*
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*-------------------------------------------------------------------------
*/
#ifndef COMMUNICATOR_PROCESS_H
#define COMMUNICATOR_PROCESS_H
extern void pg_init_communicator_process(void);
#endif /* COMMUNICATOR_PROCESS_H */

View File

@@ -52,6 +52,8 @@
#include "pagestore_client.h"
#include "communicator.h"
#include "communicator/communicator_bindings.h"
#define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "LFC: assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0)
/*
@@ -2156,6 +2158,38 @@ lfc_approximate_working_set_size_seconds(time_t duration, bool reset)
return dc;
}
/*
* Get metrics, for the built-in metrics exporter that's part of the communicator
* process.
*
* NB: This is called from a Rust tokio task inside the communicator process.
* Acquiring lwlocks, elog(), allocating memory or anything else non-trivial
* is strictly prohibited here!
*/
struct LfcMetrics
callback_get_lfc_metrics_unsafe(void)
{
struct LfcMetrics result = {
.lfc_cache_size_limit = (int64) lfc_size_limit * 1024 * 1024,
.lfc_hits = lfc_ctl ? lfc_ctl->hits : 0,
.lfc_misses = lfc_ctl ? lfc_ctl->misses : 0,
.lfc_used = lfc_ctl ? lfc_ctl->used : 0,
.lfc_writes = lfc_ctl ? lfc_ctl->writes : 0,
};
if (lfc_ctl)
{
for (int minutes = 1; minutes <= 60; minutes++)
{
result.lfc_approximate_working_set_size_windows[minutes - 1] =
lfc_approximate_working_set_size_seconds(minutes * 60, false);
}
}
return result;
}
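For reference, a hypothetical Rust-side mirror of the struct populated above (the cbindgen-generated definition is not shown in this diff, and the field types here are assumptions). The exporter would publish each per-minute window as a labelled gauge, e.g. duration_seconds="60" for index 0, which is what the autoscaling metrics test further down queries:

// Hypothetical mirror of `struct LfcMetrics`; the real layout may differ.
#[repr(C)]
#[derive(Clone, Copy)]
pub struct LfcMetrics {
    pub lfc_cache_size_limit: i64,
    pub lfc_hits: i64,
    pub lfc_misses: i64,
    pub lfc_used: i64,
    pub lfc_writes: i64,
    /// Working-set estimates for the 1..=60 minute windows; index 0 = 1 minute.
    pub lfc_approximate_working_set_size_windows: [i64; 60],
}

fn main() {
    // Stand-in for what callback_get_lfc_metrics_unsafe() would return.
    let m = LfcMetrics {
        lfc_cache_size_limit: 1024 * 1024 * 1024,
        lfc_hits: 10,
        lfc_misses: 2,
        lfc_used: 5,
        lfc_writes: 7,
        lfc_approximate_working_set_size_windows: [0; 60],
    };
    println!(
        "1-minute working set estimate: {}",
        m.lfc_approximate_working_set_size_windows[0]
    );
}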
PG_FUNCTION_INFO_V1(get_local_cache_state);
Datum

View File

@@ -31,6 +31,7 @@
#include "utils/guc_tables.h"
#include "communicator.h"
#include "communicator_process.h"
#include "extension_server.h"
#include "file_cache.h"
#include "neon.h"
@@ -44,9 +45,6 @@
#include "storage/ipc.h"
#endif
/* the rust bindings, generated by cbindgen */
#include "communicator/communicator_bindings.h"
PG_MODULE_MAGIC;
void _PG_init(void);
@@ -457,9 +455,6 @@ _PG_init(void)
load_file("$libdir/neon_rmgr", false);
#endif
/* dummy call to a Rust function in the communicator library, to check that it works */
(void) communicator_dummy(123);
/*
* Initializing a pre-loaded Postgres extension happens in three stages:
*
@@ -497,6 +492,8 @@ _PG_init(void)
pg_init_walproposer();
init_lwlsncache();
pg_init_communicator_process();
pg_init_communicator();
Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;

19
poetry.lock generated
View File

@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand.
# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand.
[[package]]
name = "aiohappyeyeballs"
@@ -3068,6 +3068,21 @@ urllib3 = ">=1.21.1,<3"
socks = ["PySocks (>=1.5.6,!=1.5.7)"]
use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
[[package]]
name = "requests-unixsocket"
version = "0.4.1"
description = "Use requests to talk HTTP via a UNIX domain socket"
optional = false
python-versions = ">=3.9"
groups = ["main"]
files = [
{file = "requests_unixsocket-0.4.1-py3-none-any.whl", hash = "sha256:60c4942e9dbecc2f64d611039fb1dfc25da382083c6434ac0316dca3ff908f4d"},
{file = "requests_unixsocket-0.4.1.tar.gz", hash = "sha256:b2596158c356ecee68d27ba469a52211230ac6fb0cde8b66afb19f0ed47a1995"},
]
[package.dependencies]
requests = ">=1.1"
[[package]]
name = "responses"
version = "0.25.3"
@@ -3844,4 +3859,4 @@ cffi = ["cffi (>=1.11)"]
[metadata]
lock-version = "2.1"
python-versions = "^3.11"
content-hash = "6a1e8ba06b8194bf28d87fd5e184e2ddc2b4a19dffcbe3953b26da3d55c9212f"
content-hash = "b08aba407631b0341d2ef8bf9acffd733bfc7d32b12d344717ab4c7fef697625"

View File

@@ -76,7 +76,7 @@ fn cli() -> clap::Command {
}
pub async fn run() -> anyhow::Result<()> {
let _logging_guard = crate::logging::init().await?;
let _logging_guard = crate::logging::init()?;
let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);

View File

@@ -334,7 +334,7 @@ struct PgSniRouterArgs {
}
pub async fn run() -> anyhow::Result<()> {
let _logging_guard = crate::logging::init().await?;
let _logging_guard = crate::logging::init()?;
let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);

View File

@@ -26,7 +26,7 @@ use crate::metrics::Metrics;
/// configuration from environment variables. For example, to change the
/// destination, set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`.
/// See <https://opentelemetry.io/docs/reference/specification/sdk-environment-variables>
pub async fn init() -> anyhow::Result<LoggingGuard> {
pub fn init() -> anyhow::Result<LoggingGuard> {
let logfmt = LogFormat::from_env()?;
let env_filter = EnvFilter::builder()
@@ -43,8 +43,8 @@ pub async fn init() -> anyhow::Result<LoggingGuard> {
.expect("this should be a valid filter directive"),
);
let otlp_layer =
tracing_utils::init_tracing("proxy", tracing_utils::ExportConfig::default()).await;
let provider = tracing_utils::init_tracing("proxy", tracing_utils::ExportConfig::default());
let otlp_layer = provider.as_ref().map(tracing_utils::layer);
let json_log_layer = if logfmt == LogFormat::Json {
Some(JsonLoggingLayer::new(
@@ -76,7 +76,7 @@ pub async fn init() -> anyhow::Result<LoggingGuard> {
.with(text_log_layer)
.try_init()?;
Ok(LoggingGuard)
Ok(LoggingGuard(provider))
}
/// Initialize logging for local_proxy with log prefix and no opentelemetry.
@@ -97,7 +97,7 @@ pub fn init_local_proxy() -> anyhow::Result<LoggingGuard> {
.with(fmt_layer)
.try_init()?;
Ok(LoggingGuard)
Ok(LoggingGuard(None))
}
pub struct LocalProxyFormatter(Format<Full, SystemTime>);
@@ -118,14 +118,16 @@ where
}
}
pub struct LoggingGuard;
pub struct LoggingGuard(Option<tracing_utils::Provider>);
impl Drop for LoggingGuard {
fn drop(&mut self) {
// Shutdown trace pipeline gracefully, so that it has a chance to send any
// pending traces before we exit.
tracing::info!("shutting down the tracing machinery");
tracing_utils::shutdown_tracing();
if let Some(p) = &self.0 {
// Shutdown trace pipeline gracefully, so that it has a chance to send any
// pending traces before we exit.
tracing::info!("shutting down the tracing machinery");
drop(p.shutdown());
}
}
}

View File

@@ -50,6 +50,7 @@ types-pyyaml = "^6.0.12.20240917"
testcontainers = "^4.9.0"
# Install a release candidate of `jsonnet`, as it supports Python 3.13
jsonnet = "^0.21.0-rc2"
requests-unixsocket = "^0.4.1"
[tool.poetry.group.dev.dependencies]
mypy = "==1.13.0"

View File

@@ -427,6 +427,9 @@ impl From<TimelineError> for ApiError {
TimelineError::NotFound(ttid) => {
ApiError::NotFound(anyhow!("timeline {} not found", ttid).into())
}
TimelineError::Deleted(ttid) => {
ApiError::NotFound(anyhow!("timeline {} deleted", ttid).into())
}
_ => ApiError::InternalServerError(anyhow!("{}", te)),
}
}

View File

@@ -0,0 +1 @@
ALTER TABLE timelines DROP sk_set_notified_generation;

View File

@@ -0,0 +1 @@
ALTER TABLE timelines ADD sk_set_notified_generation INTEGER NOT NULL DEFAULT 1;

View File

@@ -225,6 +225,10 @@ struct Cli {
#[arg(long)]
shard_split_request_timeout: Option<humantime::Duration>,
/// **Feature Flag** Whether the storage controller should act to rectify pageserver-reported local disk loss.
#[arg(long, default_value = "false")]
handle_ps_local_disk_loss: bool,
}
enum StrictMode {
@@ -477,6 +481,7 @@ async fn async_main() -> anyhow::Result<()> {
.shard_split_request_timeout
.map(humantime::Duration::into)
.unwrap_or(Duration::MAX),
handle_ps_local_disk_loss: args.handle_ps_local_disk_loss,
};
// Validate that we can connect to the database

View File

@@ -131,6 +131,8 @@ pub(crate) enum DatabaseOperation {
InsertTimeline,
UpdateTimeline,
UpdateTimelineMembership,
UpdateCplaneNotifiedGeneration,
UpdateSkSetNotifiedGeneration,
GetTimeline,
InsertTimelineReconcile,
RemoveTimelineReconcile,
@@ -1497,6 +1499,8 @@ impl Persistence {
/// Update timeline membership configuration in the database.
/// Perform a compare-and-swap (CAS) operation on the timeline's generation.
/// The `new_generation` must be the next (+1) generation after the one in the database.
/// Also inserts reconcile_requests to safekeeper_timeline_pending_ops table in the same
/// transaction.
pub(crate) async fn update_timeline_membership(
&self,
tenant_id: TenantId,
@@ -1504,8 +1508,11 @@ impl Persistence {
new_generation: SafekeeperGeneration,
sk_set: &[NodeId],
new_sk_set: Option<&[NodeId]>,
reconcile_requests: &[TimelinePendingOpPersistence],
) -> DatabaseResult<()> {
use crate::schema::timelines::dsl;
use crate::schema::safekeeper_timeline_pending_ops as stpo;
use crate::schema::timelines;
use diesel::query_dsl::methods::FilterDsl;
let prev_generation = new_generation.previous().unwrap();
@@ -1513,14 +1520,15 @@ impl Persistence {
let timeline_id = &timeline_id;
self.with_measured_conn(DatabaseOperation::UpdateTimelineMembership, move |conn| {
Box::pin(async move {
let updated = diesel::update(dsl::timelines)
.filter(dsl::tenant_id.eq(&tenant_id.to_string()))
.filter(dsl::timeline_id.eq(&timeline_id.to_string()))
.filter(dsl::generation.eq(prev_generation.into_inner() as i32))
let updated = diesel::update(timelines::table)
.filter(timelines::tenant_id.eq(&tenant_id.to_string()))
.filter(timelines::timeline_id.eq(&timeline_id.to_string()))
.filter(timelines::generation.eq(prev_generation.into_inner() as i32))
.set((
dsl::generation.eq(new_generation.into_inner() as i32),
dsl::sk_set.eq(sk_set.iter().map(|id| id.0 as i64).collect::<Vec<_>>()),
dsl::new_sk_set.eq(new_sk_set
timelines::generation.eq(new_generation.into_inner() as i32),
timelines::sk_set
.eq(sk_set.iter().map(|id| id.0 as i64).collect::<Vec<_>>()),
timelines::new_sk_set.eq(new_sk_set
.map(|set| set.iter().map(|id| id.0 as i64).collect::<Vec<_>>())),
))
.execute(conn)
@@ -1530,20 +1538,123 @@ impl Persistence {
0 => {
// TODO(diko): It makes sense to select the current generation
// and include it in the error message for better debuggability.
Err(DatabaseError::Cas(
return Err(DatabaseError::Cas(
"Failed to update membership configuration".to_string(),
))
));
}
1 => {}
_ => {
return Err(DatabaseError::Logical(format!(
"unexpected number of rows ({updated})"
)));
}
};
for req in reconcile_requests {
let inserted_updated = diesel::insert_into(stpo::table)
.values(req)
.on_conflict((stpo::tenant_id, stpo::timeline_id, stpo::sk_id))
.do_update()
.set(req)
.filter(stpo::generation.lt(req.generation))
.execute(conn)
.await?;
if inserted_updated > 1 {
return Err(DatabaseError::Logical(format!(
"unexpected number of rows ({inserted_updated})"
)));
}
1 => Ok(()),
_ => Err(DatabaseError::Logical(format!(
"unexpected number of rows ({updated})"
))),
}
Ok(())
})
})
.await
}
/// Update the cplane notified generation for a timeline.
/// Perform a compare-and-swap (CAS) operation on the timeline's cplane notified generation.
/// The update will fail if the specified generation is less than the cplane notified generation
/// in the database.
pub(crate) async fn update_cplane_notified_generation(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
generation: SafekeeperGeneration,
) -> DatabaseResult<()> {
use crate::schema::timelines::dsl;
let tenant_id = &tenant_id;
let timeline_id = &timeline_id;
self.with_measured_conn(
DatabaseOperation::UpdateCplaneNotifiedGeneration,
move |conn| {
Box::pin(async move {
let updated = diesel::update(dsl::timelines)
.filter(dsl::tenant_id.eq(&tenant_id.to_string()))
.filter(dsl::timeline_id.eq(&timeline_id.to_string()))
.filter(dsl::cplane_notified_generation.le(generation.into_inner() as i32))
.set(dsl::cplane_notified_generation.eq(generation.into_inner() as i32))
.execute(conn)
.await?;
match updated {
0 => Err(DatabaseError::Cas(
"Failed to update cplane notified generation".to_string(),
)),
1 => Ok(()),
_ => Err(DatabaseError::Logical(format!(
"unexpected number of rows ({updated})"
))),
}
})
},
)
.await
}
/// Update the sk set notified generation for a timeline.
/// Perform a compare-and-swap (CAS) operation on the timeline's sk set notified generation.
/// The update will fail if the specified generation is less than the sk set notified generation
/// in the database.
pub(crate) async fn update_sk_set_notified_generation(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
generation: SafekeeperGeneration,
) -> DatabaseResult<()> {
use crate::schema::timelines::dsl;
let tenant_id = &tenant_id;
let timeline_id = &timeline_id;
self.with_measured_conn(
DatabaseOperation::UpdateSkSetNotifiedGeneration,
move |conn| {
Box::pin(async move {
let updated = diesel::update(dsl::timelines)
.filter(dsl::tenant_id.eq(&tenant_id.to_string()))
.filter(dsl::timeline_id.eq(&timeline_id.to_string()))
.filter(dsl::sk_set_notified_generation.le(generation.into_inner() as i32))
.set(dsl::sk_set_notified_generation.eq(generation.into_inner() as i32))
.execute(conn)
.await?;
match updated {
0 => Err(DatabaseError::Cas(
"Failed to update sk set notified generation".to_string(),
)),
1 => Ok(()),
_ => Err(DatabaseError::Logical(format!(
"unexpected number of rows ({updated})"
))),
}
})
},
)
.await
}
/// Load timeline from db. Returns `None` if not present.
pub(crate) async fn get_timeline(
&self,
@@ -2493,6 +2604,7 @@ pub(crate) struct TimelinePersistence {
pub(crate) new_sk_set: Option<Vec<i64>>,
pub(crate) cplane_notified_generation: i32,
pub(crate) deleted_at: Option<chrono::DateTime<chrono::Utc>>,
pub(crate) sk_set_notified_generation: i32,
}
/// This is separate from [TimelinePersistence] only because postgres allows NULLs
@@ -2511,6 +2623,7 @@ pub(crate) struct TimelineFromDb {
pub(crate) new_sk_set: Option<Vec<Option<i64>>>,
pub(crate) cplane_notified_generation: i32,
pub(crate) deleted_at: Option<chrono::DateTime<chrono::Utc>>,
pub(crate) sk_set_notified_generation: i32,
}
impl TimelineFromDb {
@@ -2530,6 +2643,7 @@ impl TimelineFromDb {
new_sk_set,
cplane_notified_generation: self.cplane_notified_generation,
deleted_at: self.deleted_at,
sk_set_notified_generation: self.sk_set_notified_generation,
}
}
}

View File

@@ -118,6 +118,7 @@ diesel::table! {
new_sk_set -> Nullable<Array<Nullable<Int8>>>,
cplane_notified_generation -> Int4,
deleted_at -> Nullable<Timestamptz>,
sk_set_notified_generation -> Int4,
}
}

View File

@@ -487,6 +487,9 @@ pub struct Config {
/// Timeout used for HTTP client of split requests. [`Duration::MAX`] if None.
pub shard_split_request_timeout: Duration,
// Feature flag: Whether the storage controller should act to rectify pageserver-reported local disk loss.
pub handle_ps_local_disk_loss: bool,
}
impl From<DatabaseError> for ApiError {
@@ -2388,6 +2391,33 @@ impl Service {
tenants: Vec::new(),
};
// [Hadron] If the pageserver reports in the reattach message that it has an empty disk, it's possible that it just
// recovered from a local disk failure. The response of the reattach request will contain a list of tenants but it
// will not be honored by the pageserver in this case (disk failure). We should make sure we clear any observed
// locations of tenants attached to the node so that the reconciler will discover the discrepancy and reconfigure the
// missing tenants on the node properly.
if self.config.handle_ps_local_disk_loss && reattach_req.empty_local_disk.unwrap_or(false) {
tracing::info!(
"Pageserver {node_id} reports empty local disk, clearing observed locations referencing the pageserver for all tenants",
node_id = reattach_req.node_id
);
let mut num_tenant_shards_affected = 0;
for (tenant_shard_id, shard) in tenants.iter_mut() {
if shard
.observed
.locations
.remove(&reattach_req.node_id)
.is_some()
{
tracing::info!("Cleared observed location for tenant shard {tenant_shard_id}");
num_tenant_shards_affected += 1;
}
}
tracing::info!(
"Cleared observed locations for {num_tenant_shards_affected} tenant shards"
);
}
// TODO: cancel/restart any running reconciliation for this tenant, it might be trying
// to call location_conf API with an old generation. Wait for cancellation to complete
// before responding to this request. Requires well implemented CancellationToken logic

View File

@@ -312,6 +312,7 @@ impl Service {
new_sk_set: None,
cplane_notified_generation: 0,
deleted_at: None,
sk_set_notified_generation: 0,
};
let inserted = self
.persistence
@@ -461,6 +462,7 @@ impl Service {
new_sk_set: None,
cplane_notified_generation: 1,
deleted_at: None,
sk_set_notified_generation: 1,
};
let inserted = self
.persistence
@@ -894,17 +896,21 @@ impl Service {
/// If min_position is not None, validates that a majority of safekeepers
/// reached at least min_position.
///
/// If update_notified_generation is set, also updates sk_set_notified_generation
/// in the timelines table.
///
/// Return responses from safekeepers in the input order.
async fn tenant_timeline_set_membership_quorum(
self: &Arc<Self>,
tenant_id: TenantId,
timeline_id: TimelineId,
safekeepers: &[Safekeeper],
config: &membership::Configuration,
mconf: &membership::Configuration,
min_position: Option<(Term, Lsn)>,
update_notified_generation: bool,
) -> Result<Vec<mgmt_api::Result<TimelineMembershipSwitchResponse>>, ApiError> {
let req = TimelineMembershipSwitchRequest {
mconf: config.clone(),
mconf: mconf.clone(),
};
const SK_SET_MEM_TIMELINE_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
@@ -945,28 +951,34 @@ impl Service {
.await?;
for res in results.iter().flatten() {
if res.current_conf.generation > config.generation {
if res.current_conf.generation > mconf.generation {
// Another switch_membership raced us.
return Err(ApiError::Conflict(format!(
"received configuration with generation {} from safekeeper, but expected {}",
res.current_conf.generation, config.generation
res.current_conf.generation, mconf.generation
)));
} else if res.current_conf.generation < config.generation {
} else if res.current_conf.generation < mconf.generation {
// Note: should never happen.
// If we get a response, it should be at least the sent generation.
tracing::error!(
"received configuration with generation {} from safekeeper, but expected {}",
res.current_conf.generation,
config.generation
mconf.generation
);
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"received configuration with generation {} from safekeeper, but expected {}",
res.current_conf.generation,
config.generation
mconf.generation
)));
}
}
if update_notified_generation {
self.persistence
.update_sk_set_notified_generation(tenant_id, timeline_id, mconf.generation)
.await?;
}
Ok(results)
}
@@ -1035,17 +1047,22 @@ impl Service {
}
/// Exclude a timeline from safekeepers in parallel with retries.
/// If an exclude request is unsuccessful, it will be added to
/// the reconciler, and after that the function will succeed.
async fn tenant_timeline_safekeeper_exclude(
///
/// Assumes that the exclude requests have already been persisted in the database.
///
/// The function is best-effort: if an exclude request is unsuccessful,
/// it will be added to the in-memory reconciler, and the function will succeed anyway.
///
/// Might fail if there is an error accessing the database.
async fn tenant_timeline_safekeeper_exclude_reconcile(
self: &Arc<Self>,
tenant_id: TenantId,
timeline_id: TimelineId,
safekeepers: &[Safekeeper],
config: &membership::Configuration,
mconf: &membership::Configuration,
) -> Result<(), ApiError> {
let req = TimelineMembershipSwitchRequest {
mconf: config.clone(),
mconf: mconf.clone(),
};
const SK_EXCLUDE_TIMELINE_TIMEOUT: Duration = Duration::from_secs(30);
@@ -1063,25 +1080,32 @@ impl Service {
let mut reconcile_requests = Vec::new();
for (idx, res) in results.iter().enumerate() {
if res.is_err() {
let sk_id = safekeepers[idx].skp.id;
let pending_op = TimelinePendingOpPersistence {
tenant_id: tenant_id.to_string(),
timeline_id: timeline_id.to_string(),
generation: config.generation.into_inner() as i32,
op_kind: SafekeeperTimelineOpKind::Exclude,
sk_id,
};
tracing::info!("writing pending exclude op for sk id {sk_id}");
self.persistence.insert_pending_op(pending_op).await?;
fail::fail_point!("sk-migration-step-9-mid-exclude", |_| {
Err(ApiError::BadRequest(anyhow::anyhow!(
"failpoint sk-migration-step-9-mid-exclude"
)))
});
for (idx, res) in results.iter().enumerate() {
let sk_id = safekeepers[idx].skp.id;
let generation = mconf.generation.into_inner();
if res.is_ok() {
self.persistence
.remove_pending_op(
tenant_id,
Some(timeline_id),
NodeId(sk_id as u64),
generation,
)
.await?;
} else {
let req = ScheduleRequest {
safekeeper: Box::new(safekeepers[idx].clone()),
host_list: Vec::new(),
tenant_id,
timeline_id: Some(timeline_id),
generation: config.generation.into_inner(),
generation,
kind: SafekeeperTimelineOpKind::Exclude,
};
reconcile_requests.push(req);
@@ -1208,6 +1232,22 @@ impl Service {
}
// If it is the same new_sk_set, we can continue the migration (retry).
} else {
let prev_finished = timeline.cplane_notified_generation == timeline.generation
&& timeline.sk_set_notified_generation == timeline.generation;
if !prev_finished {
// The previous migration is committed, but the finish step failed.
// Safekeepers/cplane might not know about the last membership configuration.
// Retry the finish step to ensure smooth migration.
self.finish_safekeeper_migration_retry(tenant_id, timeline_id, &timeline)
.await?;
}
if cur_sk_set == new_sk_set {
tracing::info!("timeline is already at the desired safekeeper set");
return Ok(());
}
// 3. No active migration yet.
// Increment current generation and put desired_set to new_sk_set.
generation = generation.next();
@@ -1219,8 +1259,15 @@ impl Service {
generation,
&cur_sk_set,
Some(&new_sk_set),
&[],
)
.await?;
fail::fail_point!("sk-migration-after-step-3", |_| {
Err(ApiError::BadRequest(anyhow::anyhow!(
"failpoint sk-migration-after-step-3"
)))
});
}
let cur_safekeepers = self.get_safekeepers(&timeline.sk_set)?;
@@ -1249,6 +1296,7 @@ impl Service {
&cur_safekeepers,
&joint_config,
None, // no min position
true, // update notified generation
)
.await?;
@@ -1266,6 +1314,12 @@ impl Service {
"safekeepers set membership updated",
);
fail::fail_point!("sk-migration-after-step-4", |_| {
Err(ApiError::BadRequest(anyhow::anyhow!(
"failpoint sk-migration-after-step-4"
)))
});
// 5. Initialize timeline on safekeeper(s) from new_sk_set where it doesn't exist yet
// by doing pull_timeline from the majority of the current set.
@@ -1285,6 +1339,12 @@ impl Service {
)
.await?;
fail::fail_point!("sk-migration-after-step-5", |_| {
Err(ApiError::BadRequest(anyhow::anyhow!(
"failpoint sk-migration-after-step-5"
)))
});
// 6. Call POST bump_term(sync_term) on safekeepers from the new set. Success on majority is enough.
// TODO(diko): do we need to bump timeline term?
@@ -1300,9 +1360,16 @@ impl Service {
&new_safekeepers,
&joint_config,
Some(sync_position),
false, // we're just waiting for sync position, don't update notified generation
)
.await?;
fail::fail_point!("sk-migration-after-step-7", |_| {
Err(ApiError::BadRequest(anyhow::anyhow!(
"failpoint sk-migration-after-step-7"
)))
});
// 8. Create new_conf: Configuration incrementing joint_conf generation and
// having new safekeeper set as sk_set and None new_sk_set.
@@ -1314,45 +1381,55 @@ impl Service {
new_members: None,
};
self.persistence
.update_timeline_membership(tenant_id, timeline_id, generation, &new_sk_set, None)
.await?;
// TODO(diko): at this point we have already updated the timeline in the database,
// but we still need to notify safekeepers and cplane about the new configuration,
// and put deletion of the timeline from the old safekeepers into the reconciler.
// Ideally it should be done atomically, but now it's not.
// Worst case: the timeline is not deleted from old safekeepers,
// the compute may require both quorums till the migration is retried and completed.
self.tenant_timeline_set_membership_quorum(
tenant_id,
timeline_id,
&new_safekeepers,
&new_conf,
None, // no min position
)
.await?;
let new_ids: HashSet<NodeId> = new_safekeepers.iter().map(|sk| sk.get_id()).collect();
let exclude_safekeepers = cur_safekeepers
.into_iter()
.filter(|sk| !new_ids.contains(&sk.get_id()))
.collect::<Vec<_>>();
self.tenant_timeline_safekeeper_exclude(
let exclude_requests = exclude_safekeepers
.iter()
.map(|sk| TimelinePendingOpPersistence {
sk_id: sk.skp.id,
tenant_id: tenant_id.to_string(),
timeline_id: timeline_id.to_string(),
generation: generation.into_inner() as i32,
op_kind: SafekeeperTimelineOpKind::Exclude,
})
.collect::<Vec<_>>();
self.persistence
.update_timeline_membership(
tenant_id,
timeline_id,
generation,
&new_sk_set,
None,
&exclude_requests,
)
.await?;
fail::fail_point!("sk-migration-after-step-8", |_| {
Err(ApiError::BadRequest(anyhow::anyhow!(
"failpoint sk-migration-after-step-8"
)))
});
// At this point we have already updated the timeline in the database, so the final
// membership configuration is committed and the migration is not abortable anymore.
// But safekeepers and cplane/compute still need to be notified about the new configuration.
// The [`Self::finish_safekeeper_migration`] does exactly that: notifies everyone about
// the new configuration and reconciles excluded safekeepers.
// If it fails, the safekeeper migration call should be retried.
self.finish_safekeeper_migration(
tenant_id,
timeline_id,
&exclude_safekeepers,
&new_safekeepers,
&new_conf,
&exclude_safekeepers,
)
.await?;
// Notify cplane/compute about the membership change AFTER changing the membership on safekeepers.
// This way the compute will stop talking to excluded safekeepers only after we stop requiring to
// collect a quorum from them.
self.cplane_notify_safekeepers(tenant_id, timeline_id, &new_conf)
.await?;
Ok(())
}
@@ -1396,6 +1473,130 @@ impl Service {
ApiError::InternalServerError(anyhow::anyhow!(
"failed to notify cplane about safekeeper membership change: {err}"
))
})
})?;
self.persistence
.update_cplane_notified_generation(tenant_id, timeline_id, mconf.generation)
.await?;
Ok(())
}
/// Finish safekeeper migration.
///
/// It is the last step of the safekeeper migration.
///
/// Notifies safekeepers and cplane about the final membership configuration,
/// reconciles excluded safekeepers and updates *_notified_generation in the database.
async fn finish_safekeeper_migration(
self: &Arc<Self>,
tenant_id: TenantId,
timeline_id: TimelineId,
new_safekeepers: &[Safekeeper],
new_conf: &membership::Configuration,
exclude_safekeepers: &[Safekeeper],
) -> Result<(), ApiError> {
// 9. Call PUT configuration on safekeepers from the new set, delivering them new_conf.
// Also try to exclude safekeepers and notify cplane about the membership change.
self.tenant_timeline_set_membership_quorum(
tenant_id,
timeline_id,
new_safekeepers,
new_conf,
None, // no min position
true, // update notified generation
)
.await?;
fail::fail_point!("sk-migration-step-9-after-set-membership", |_| {
Err(ApiError::BadRequest(anyhow::anyhow!(
"failpoint sk-migration-step-9-after-set-membership"
)))
});
self.tenant_timeline_safekeeper_exclude_reconcile(
tenant_id,
timeline_id,
exclude_safekeepers,
new_conf,
)
.await?;
fail::fail_point!("sk-migration-step-9-after-exclude", |_| {
Err(ApiError::BadRequest(anyhow::anyhow!(
"failpoint sk-migration-step-9-after-exclude"
)))
});
// Notify cplane/compute about the membership change AFTER changing the membership on safekeepers.
// This way the compute will stop talking to excluded safekeepers only after we stop requiring to
// collect a quorum from them.
self.cplane_notify_safekeepers(tenant_id, timeline_id, new_conf)
.await?;
fail::fail_point!("sk-migration-after-step-9", |_| {
Err(ApiError::BadRequest(anyhow::anyhow!(
"failpoint sk-migration-after-step-9"
)))
});
Ok(())
}
/// Same as [`Self::finish_safekeeper_migration`], but restores the migration state from the database.
/// It's used when the migration failed during the finish step and we need to retry it.
async fn finish_safekeeper_migration_retry(
self: &Arc<Self>,
tenant_id: TenantId,
timeline_id: TimelineId,
timeline: &TimelinePersistence,
) -> Result<(), ApiError> {
if timeline.new_sk_set.is_some() {
// Logical error, should never happen.
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"can't finish timeline migration for {tenant_id}/{timeline_id}: new_sk_set is not None"
)));
}
let cur_safekeepers = self.get_safekeepers(&timeline.sk_set)?;
let cur_sk_member_set =
Self::make_member_set(&cur_safekeepers).map_err(ApiError::InternalServerError)?;
let mconf = membership::Configuration {
generation: SafekeeperGeneration::new(timeline.generation as u32),
members: cur_sk_member_set,
new_members: None,
};
// We might have failed between committing reconciliation requests and adding them to the in-memory reconciler.
// Reload them from the database.
let pending_ops = self
.persistence
.list_pending_ops_for_timeline(tenant_id, timeline_id)
.await?;
let mut exclude_sk_ids = Vec::new();
for op in pending_ops {
if op.op_kind == SafekeeperTimelineOpKind::Exclude
&& op.generation == timeline.generation
{
exclude_sk_ids.push(op.sk_id);
}
}
let exclude_safekeepers = self.get_safekeepers(&exclude_sk_ids)?;
self.finish_safekeeper_migration(
tenant_id,
timeline_id,
&cur_safekeepers,
&mconf,
&exclude_safekeepers,
)
.await?;
Ok(())
}
}

View File

@@ -66,6 +66,12 @@ class EndpointHttpClient(requests.Session):
res.raise_for_status()
return res.json()
def autoscaling_metrics(self):
res = self.get(f"http://localhost:{self.external_port}/autoscaling_metrics")
res.raise_for_status()
log.debug("raw compute metrics: %s", res.text)
return res.text
def prewarm_lfc_status(self) -> dict[str, str]:
res = self.get(self.prewarm_url)
res.raise_for_status()

View File

@@ -24,6 +24,7 @@ def connection_parameters_to_env(params: dict[str, str]) -> dict[str, str]:
# Some API calls not yet implemented.
# You may want to copy not-yet-implemented methods from the PR https://github.com/neondatabase/neon/pull/11305
@final
class NeonAPI:
def __init__(self, neon_api_key: str, neon_api_base_url: str):
self.__neon_api_key = neon_api_key
@@ -170,7 +171,7 @@ class NeonAPI:
protected: bool | None = None,
archived: bool | None = None,
init_source: str | None = None,
add_endpoint=True,
add_endpoint: bool = True,
) -> dict[str, Any]:
data: dict[str, Any] = {}
if add_endpoint:

View File

@@ -400,6 +400,7 @@ class NeonLocalCli(AbstractNeonCli):
timeout_in_seconds: int | None = None,
instance_id: int | None = None,
base_port: int | None = None,
handle_ps_local_disk_loss: bool | None = None,
):
cmd = ["storage_controller", "start"]
if timeout_in_seconds is not None:
@@ -408,6 +409,10 @@ class NeonLocalCli(AbstractNeonCli):
cmd.append(f"--instance-id={instance_id}")
if base_port is not None:
cmd.append(f"--base-port={base_port}")
if handle_ps_local_disk_loss is not None:
cmd.append(
f"--handle-ps-local-disk-loss={'true' if handle_ps_local_disk_loss else 'false'}"
)
return self.raw_cli(cmd)
def storage_controller_stop(self, immediate: bool, instance_id: int | None = None):

View File

@@ -1940,9 +1940,12 @@ class NeonStorageController(MetricsGetter, LogUtils):
timeout_in_seconds: int | None = None,
instance_id: int | None = None,
base_port: int | None = None,
handle_ps_local_disk_loss: bool | None = None,
) -> Self:
assert not self.running
self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port)
self.env.neon_cli.storage_controller_start(
timeout_in_seconds, instance_id, base_port, handle_ps_local_disk_loss
)
self.running = True
return self
@@ -2840,10 +2843,13 @@ class NeonProxiedStorageController(NeonStorageController):
timeout_in_seconds: int | None = None,
instance_id: int | None = None,
base_port: int | None = None,
handle_ps_local_disk_loss: bool | None = None,
) -> Self:
assert instance_id is not None and base_port is not None
self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port)
self.env.neon_cli.storage_controller_start(
timeout_in_seconds, instance_id, base_port, handle_ps_local_disk_loss
)
self.instances[instance_id] = {"running": True}
self.running = True
@@ -5799,6 +5805,7 @@ SKIP_FILES = frozenset(
"postmaster.pid",
"pg_control",
"pg_dynshmem",
"neon-communicator.socket",
)
)

View File

@@ -0,0 +1,54 @@
from __future__ import annotations
import os
from typing import TYPE_CHECKING
import pytest
import requests
import requests_unixsocket # type: ignore [import-untyped]
from fixtures.metrics import parse_metrics
if TYPE_CHECKING:
from fixtures.neon_fixtures import NeonEnv
NEON_COMMUNICATOR_SOCKET_NAME = "neon-communicator.socket"
def test_communicator_metrics(neon_simple_env: NeonEnv):
"""
Test the communicator's built-in HTTP prometheus exporter
"""
env = neon_simple_env
endpoint = env.endpoints.create("main")
endpoint.start()
# Change current directory to the data directory, so that we can use
# a short relative path to refer to the socket. (There's a 100 char
# limitation on the path.)
os.chdir(str(endpoint.pgdata_dir))
session = requests_unixsocket.Session()
r = session.get(f"http+unix://{NEON_COMMUNICATOR_SOCKET_NAME}/metrics")
assert r.status_code == 200, f"got response {r.status_code}: {r.text}"
# quick test that the endpoint returned something expected. (We don't validate
# that the metrics returned are sensible.)
m = parse_metrics(r.text)
m.query_one("lfc_hits")
m.query_one("lfc_misses")
# Test panic handling. The /debug/panic endpoint raises a Rust panic. It's
# expected to unwind and drop the HTTP connection without response, but not
# kill the process or the server.
with pytest.raises(
requests.ConnectionError, match="Remote end closed connection without response"
):
r = session.get(f"http+unix://{NEON_COMMUNICATOR_SOCKET_NAME}/debug/panic")
assert r.status_code == 500
# Test that subsequent requests after the panic still work.
r = session.get(f"http+unix://{NEON_COMMUNICATOR_SOCKET_NAME}/metrics")
assert r.status_code == 200, f"got response {r.status_code}: {r.text}"
m = parse_metrics(r.text)
m.query_one("lfc_hits")
m.query_one("lfc_misses")

View File

@@ -197,7 +197,7 @@ def test_create_snapshot(
shutil.copytree(
test_output_dir,
new_compatibility_snapshot_dir,
ignore=shutil.ignore_patterns("pg_dynshmem"),
ignore=shutil.ignore_patterns("pg_dynshmem", "neon-communicator.socket"),
)
log.info(f"Copied new compatibility snapshot dir to: {new_compatibility_snapshot_dir}")

View File

@@ -0,0 +1,47 @@
import shutil
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.utils import query_scalar
def test_hcc_handling_ps_data_loss(
neon_env_builder: NeonEnvBuilder,
):
"""
Test that following a pageserver local data loss event, the system can recover automatically (i.e.
rehydrating the restarted pageserver from remote storage) without manual intervention. The
pageserver indicates to the storage controller that it has restarted without any local tenant
data in its "reattach" request and the storage controller uses this information to detect the
data loss condition and reconfigure the pageserver as necessary.
"""
env = neon_env_builder.init_configs()
env.broker.start()
env.storage_controller.start(handle_ps_local_disk_loss=True)
env.pageserver.start()
for sk in env.safekeepers:
sk.start()
# create a new tenant
tenant_id, _ = env.create_tenant(shard_count=4)
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
with endpoint.cursor() as cur:
cur.execute("SELECT pg_logical_emit_message(false, 'neon-test', 'between inserts')")
cur.execute("CREATE DATABASE testdb")
with endpoint.cursor(dbname="testdb") as cur:
cur.execute("CREATE TABLE tbl_one_hundred_rows AS SELECT generate_series(1,100)")
endpoint.stop()
# Kill the pageserver, remove the `tenants/` directory, and restart. This simulates a pageserver
# that restarted with the same ID but has lost all its local disk data.
env.pageserver.stop(immediate=True)
shutil.rmtree(env.pageserver.tenant_dir())
env.pageserver.start()
# Test that the endpoint can start and query the database after the pageserver restarts. This
# indirectly tests that the pageserver was able to rehydrate the tenant data it lost from remote
# storage automatically.
endpoint.start()
with endpoint.cursor(dbname="testdb") as cur:
assert query_scalar(cur, "SELECT count(*) FROM tbl_one_hundred_rows") == 100

View File

@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING
import pytest
from fixtures.log_helper import log
from fixtures.metrics import parse_metrics
from fixtures.utils import USE_LFC, query_scalar
if TYPE_CHECKING:
@@ -75,10 +76,24 @@ WITH (fillfactor='100');
cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 104242")
cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 204242")
# verify working set size after some index access of a few select pages only
blocks = query_scalar(cur, "select approximate_working_set_size(true)")
blocks = query_scalar(cur, "select approximate_working_set_size(false)")
log.info(f"working set size after some index access of a few select pages only {blocks}")
assert blocks < 20
# Also test the metrics from the /autoscaling_metrics endpoint
autoscaling_metrics = endpoint.http_client().autoscaling_metrics()
log.debug(f"Raw metrics: {autoscaling_metrics}")
m = parse_metrics(autoscaling_metrics)
http_estimate = m.query_one(
"lfc_approximate_working_set_size_windows",
{
"duration_seconds": "60",
},
).value
log.info(f"http estimate: {http_estimate}, blocks: {blocks}")
assert http_estimate > 0 and http_estimate < 20
@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
def test_sliding_working_set_approximation(neon_simple_env: NeonEnv):

View File

@@ -3,11 +3,22 @@ from __future__ import annotations
from typing import TYPE_CHECKING
import pytest
import requests
from fixtures.log_helper import log
from fixtures.neon_fixtures import StorageControllerApiException
if TYPE_CHECKING:
from fixtures.neon_fixtures import NeonEnvBuilder
# TODO(diko): pageserver spams with various errors during safekeeper migration.
# Fix the code so it handles the migration better.
ALLOWED_PAGESERVER_ERRORS = [
".*Timeline .* was cancelled and cannot be used anymore.*",
".*Timeline .* has been deleted.*",
".*Timeline .* was not found in global map.*",
".*wal receiver task finished with an error.*",
]
def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder):
"""
@@ -24,16 +35,7 @@ def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder):
"timeline_safekeeper_count": 1,
}
env = neon_env_builder.init_start()
# TODO(diko): pageserver spams with various errors during safekeeper migration.
# Fix the code so it handles the migration better.
env.pageserver.allowed_errors.extend(
[
".*Timeline .* was cancelled and cannot be used anymore.*",
".*Timeline .* has been deleted.*",
".*Timeline .* was not found in global map.*",
".*wal receiver task finished with an error.*",
]
)
env.pageserver.allowed_errors.extend(ALLOWED_PAGESERVER_ERRORS)
ep = env.endpoints.create("main", tenant_id=env.initial_tenant)
@@ -42,15 +44,23 @@ def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder):
assert len(mconf["sk_set"]) == 1
assert mconf["generation"] == 1
current_sk = mconf["sk_set"][0]
ep.start(safekeeper_generation=1, safekeepers=mconf["sk_set"])
ep.safe_psql("CREATE EXTENSION neon_test_utils;")
ep.safe_psql("CREATE TABLE t(a int)")
expected_gen = 1
for active_sk in range(1, 4):
env.storage_controller.migrate_safekeepers(
env.initial_tenant, env.initial_timeline, [active_sk]
)
if active_sk != current_sk:
expected_gen += 2
current_sk = active_sk
other_sks = [sk for sk in range(1, 4) if sk != active_sk]
for sk in other_sks:
@@ -65,9 +75,6 @@ def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder):
assert ep.safe_psql("SELECT * FROM t") == [(i,) for i in range(1, 4)]
# 1 initial generation + 2 migrations on each loop iteration.
expected_gen = 1 + 2 * 3
mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
assert mconf["generation"] == expected_gen
@@ -113,3 +120,79 @@ def test_new_sk_set_validation(neon_env_builder: NeonEnvBuilder):
env.storage_controller.safekeeper_scheduling_policy(decom_sk, "Decomissioned")
expect_fail([sk_set[0], decom_sk], "decomissioned")
def test_safekeeper_migration_common_set_failpoints(neon_env_builder: NeonEnvBuilder):
"""
Test that safekeeper migration handles failures well.
Two main conditions are checked:
1. safekeeper migration handler can be retried on different failures.
2. writes do not get stuck if sk_set and new_sk_set have a quorum in common.
"""
neon_env_builder.num_safekeepers = 4
neon_env_builder.storage_controller_config = {
"timelines_onto_safekeepers": True,
"timeline_safekeeper_count": 3,
}
env = neon_env_builder.init_start()
env.pageserver.allowed_errors.extend(ALLOWED_PAGESERVER_ERRORS)
mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
assert len(mconf["sk_set"]) == 3
assert mconf["generation"] == 1
ep = env.endpoints.create("main", tenant_id=env.initial_tenant)
ep.start(safekeeper_generation=1, safekeepers=mconf["sk_set"])
ep.safe_psql("CREATE EXTENSION neon_test_utils;")
ep.safe_psql("CREATE TABLE t(a int)")
excluded_sk = mconf["sk_set"][-1]
added_sk = [sk.id for sk in env.safekeepers if sk.id not in mconf["sk_set"]][0]
new_sk_set = mconf["sk_set"][:-1] + [added_sk]
log.info(f"migrating sk set from {mconf['sk_set']} to {new_sk_set}")
failpoints = [
"sk-migration-after-step-3",
"sk-migration-after-step-4",
"sk-migration-after-step-5",
"sk-migration-after-step-7",
"sk-migration-after-step-8",
"sk-migration-step-9-after-set-membership",
"sk-migration-step-9-mid-exclude",
"sk-migration-step-9-after-exclude",
"sk-migration-after-step-9",
]
for i, fp in enumerate(failpoints):
env.storage_controller.configure_failpoints((fp, "return(1)"))
with pytest.raises(StorageControllerApiException, match=f"failpoint {fp}"):
env.storage_controller.migrate_safekeepers(
env.initial_tenant, env.initial_timeline, new_sk_set
)
ep.safe_psql(f"INSERT INTO t VALUES ({i})")
env.storage_controller.configure_failpoints((fp, "off"))
# No failpoints, migration should succeed.
env.storage_controller.migrate_safekeepers(env.initial_tenant, env.initial_timeline, new_sk_set)
mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
assert mconf["new_sk_set"] is None
assert mconf["sk_set"] == new_sk_set
assert mconf["generation"] == 3
ep.clear_buffers()
assert ep.safe_psql("SELECT * FROM t") == [(i,) for i in range(len(failpoints))]
assert ep.safe_psql("SHOW neon.safekeepers")[0][0].startswith("g#3:")
# Check that we didn't forget to remove the timeline on the excluded safekeeper.
with pytest.raises(requests.exceptions.HTTPError) as exc:
env.safekeepers[excluded_sk - 1].http_client().timeline_status(
env.initial_tenant, env.initial_timeline
)
assert exc.value.response.status_code == 404
assert (
f"timeline {env.initial_tenant}/{env.initial_timeline} deleted" in exc.value.response.text
)

View File

@@ -74,7 +74,7 @@ once_cell = { version = "1" }
p256 = { version = "0.13", features = ["jwk"] }
parquet = { version = "53", default-features = false, features = ["zstd"] }
prost = { version = "0.13", features = ["no-recursion-limit", "prost-derive"] }
rand = { version = "0.8", features = ["small_rng"] }
rand = { version = "0.9" }
regex = { version = "1" }
regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] }
regex-syntax = { version = "0.8" }
@@ -93,6 +93,7 @@ spki = { version = "0.7", default-features = false, features = ["pem", "std"] }
stable_deref_trait = { version = "1" }
subtle = { version = "2" }
sync_wrapper = { version = "0.1", default-features = false, features = ["futures"] }
thiserror = { version = "2" }
tikv-jemalloc-ctl = { version = "0.6", features = ["stats", "use_std"] }
tikv-jemalloc-sys = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] }
time = { version = "0.3", features = ["macros", "serde-well-known"] }
@@ -101,6 +102,7 @@ tokio-rustls = { version = "0.26", default-features = false, features = ["loggin
tokio-stream = { version = "0.1", features = ["net", "sync"] }
tokio-util = { version = "0.7", features = ["codec", "compat", "io-util", "rt"] }
toml_edit = { version = "0.22", features = ["serde"] }
tonic = { version = "0.13", default-features = false, features = ["codegen", "gzip", "prost", "router", "server", "tls-native-roots", "tls-ring", "zstd"] }
tower = { version = "0.5", default-features = false, features = ["balance", "buffer", "limit", "log"] }
tracing = { version = "0.1", features = ["log"] }
tracing-core = { version = "0.1" }