Merge pull request #7357 from neondatabase/rc/proxy/2024-04-11

Proxy release 2024-04-11
This commit is contained in:
Conrad Ludgate
2024-04-11 11:49:45 +01:00
committed by GitHub
120 changed files with 2520 additions and 1356 deletions

View File

@@ -22,6 +22,7 @@
!s3_scrubber/
!safekeeper/
!storage_broker/
!storage_controller/
!trace/
!vendor/postgres-*/
!workspace_hack/

View File

@@ -10,7 +10,7 @@ inputs:
required: true
api_host:
desctiption: 'Neon API host'
default: console.stage.neon.tech
default: console-stage.neon.build
outputs:
dsn:
description: 'Created Branch DSN (for main database)'

View File

@@ -13,7 +13,7 @@ inputs:
required: true
api_host:
desctiption: 'Neon API host'
default: console.stage.neon.tech
default: console-stage.neon.build
runs:
using: "composite"

View File

@@ -13,7 +13,7 @@ inputs:
default: 15
api_host:
desctiption: 'Neon API host'
default: console.stage.neon.tech
default: console-stage.neon.build
provisioner:
desctiption: 'k8s-pod or k8s-neonvm'
default: 'k8s-pod'

View File

@@ -10,7 +10,7 @@ inputs:
required: true
api_host:
desctiption: 'Neon API host'
default: console.stage.neon.tech
default: console-stage.neon.build
runs:
using: "composite"

View File

@@ -1,5 +1,5 @@
/compute_tools/ @neondatabase/control-plane @neondatabase/compute
/control_plane/attachment_service @neondatabase/storage
/storage_controller @neondatabase/storage
/libs/pageserver_api/ @neondatabase/storage
/libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers
/libs/remote_storage/ @neondatabase/storage

480
Cargo.lock generated
View File

@@ -271,43 +271,10 @@ dependencies = [
]
[[package]]
name = "attachment_service"
version = "0.1.0"
dependencies = [
"anyhow",
"aws-config",
"bytes",
"camino",
"clap",
"control_plane",
"diesel",
"diesel_migrations",
"fail",
"futures",
"git-version",
"hex",
"humantime",
"hyper",
"itertools",
"lasso",
"measured",
"metrics",
"once_cell",
"pageserver_api",
"pageserver_client",
"postgres_connection",
"r2d2",
"reqwest",
"routerify",
"serde",
"serde_json",
"thiserror",
"tokio",
"tokio-util",
"tracing",
"utils",
"workspace_hack",
]
name = "atomic-take"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8ab6b55fe97976e46f91ddbed8d147d966475dc29b2032757ba47e02376fbc3"
[[package]]
name = "autocfg"
@@ -337,7 +304,7 @@ dependencies = [
"fastrand 2.0.0",
"hex",
"http 0.2.9",
"hyper",
"hyper 0.14.26",
"ring 0.17.6",
"time",
"tokio",
@@ -374,7 +341,7 @@ dependencies = [
"bytes",
"fastrand 2.0.0",
"http 0.2.9",
"http-body",
"http-body 0.4.5",
"percent-encoding",
"pin-project-lite",
"tracing",
@@ -425,7 +392,7 @@ dependencies = [
"aws-types",
"bytes",
"http 0.2.9",
"http-body",
"http-body 0.4.5",
"once_cell",
"percent-encoding",
"regex-lite",
@@ -553,7 +520,7 @@ dependencies = [
"crc32fast",
"hex",
"http 0.2.9",
"http-body",
"http-body 0.4.5",
"md-5",
"pin-project-lite",
"sha1",
@@ -585,7 +552,7 @@ dependencies = [
"bytes-utils",
"futures-core",
"http 0.2.9",
"http-body",
"http-body 0.4.5",
"once_cell",
"percent-encoding",
"pin-project-lite",
@@ -624,10 +591,10 @@ dependencies = [
"aws-smithy-types",
"bytes",
"fastrand 2.0.0",
"h2",
"h2 0.3.26",
"http 0.2.9",
"http-body",
"hyper",
"http-body 0.4.5",
"hyper 0.14.26",
"hyper-rustls",
"once_cell",
"pin-project-lite",
@@ -665,7 +632,7 @@ dependencies = [
"bytes-utils",
"futures-core",
"http 0.2.9",
"http-body",
"http-body 0.4.5",
"itoa",
"num-integer",
"pin-project-lite",
@@ -714,8 +681,8 @@ dependencies = [
"bytes",
"futures-util",
"http 0.2.9",
"http-body",
"hyper",
"http-body 0.4.5",
"hyper 0.14.26",
"itoa",
"matchit",
"memchr",
@@ -730,7 +697,7 @@ dependencies = [
"sha1",
"sync_wrapper",
"tokio",
"tokio-tungstenite",
"tokio-tungstenite 0.20.0",
"tower",
"tower-layer",
"tower-service",
@@ -746,7 +713,7 @@ dependencies = [
"bytes",
"futures-util",
"http 0.2.9",
"http-body",
"http-body 0.4.5",
"mime",
"rustversion",
"tower-layer",
@@ -1163,7 +1130,7 @@ version = "4.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "191d9573962933b4027f932c600cd252ce27a8ad5979418fe78e43c07996f27b"
dependencies = [
"heck",
"heck 0.4.1",
"proc-macro2",
"quote",
"syn 2.0.52",
@@ -1235,7 +1202,7 @@ dependencies = [
"compute_api",
"flate2",
"futures",
"hyper",
"hyper 0.14.26",
"nix 0.27.1",
"notify",
"num_cpus",
@@ -1352,7 +1319,7 @@ dependencies = [
"git-version",
"hex",
"humantime",
"hyper",
"hyper 0.14.26",
"nix 0.27.1",
"once_cell",
"pageserver_api",
@@ -1501,12 +1468,9 @@ dependencies = [
[[package]]
name = "crossbeam-utils"
version = "0.8.15"
version = "0.8.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c063cd8cc95f5c377ed0d4b49a4b21f632396ff690e8470c29b3359b346984b"
dependencies = [
"cfg-if",
]
checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345"
[[package]]
name = "crossterm"
@@ -1879,23 +1843,12 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
[[package]]
name = "errno"
version = "0.3.1"
version = "0.3.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a"
checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245"
dependencies = [
"errno-dragonfly",
"libc",
"windows-sys 0.48.0",
]
[[package]]
name = "errno-dragonfly"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf"
dependencies = [
"cc",
"libc",
"windows-sys 0.52.0",
]
[[package]]
@@ -2252,6 +2205,25 @@ dependencies = [
"tracing",
]
[[package]]
name = "h2"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "816ec7294445779408f36fe57bc5b7fc1cf59664059096c65f905c1c61f58069"
dependencies = [
"bytes",
"fnv",
"futures-core",
"futures-sink",
"futures-util",
"http 1.1.0",
"indexmap 2.0.1",
"slab",
"tokio",
"tokio-util",
"tracing",
]
[[package]]
name = "half"
version = "1.8.2"
@@ -2333,6 +2305,12 @@ version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
[[package]]
name = "heck"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "hermit-abi"
version = "0.3.3"
@@ -2417,6 +2395,29 @@ dependencies = [
"pin-project-lite",
]
[[package]]
name = "http-body"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1cac85db508abc24a2e48553ba12a996e87244a0395ce011e62b37158745d643"
dependencies = [
"bytes",
"http 1.1.0",
]
[[package]]
name = "http-body-util"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41cb79eb393015dadd30fc252023adb0b2400a0caee0fa2a077e6e21a551e840"
dependencies = [
"bytes",
"futures-util",
"http 1.1.0",
"http-body 1.0.0",
"pin-project-lite",
]
[[package]]
name = "http-types"
version = "2.12.0"
@@ -2475,9 +2476,9 @@ dependencies = [
"futures-channel",
"futures-core",
"futures-util",
"h2",
"h2 0.3.26",
"http 0.2.9",
"http-body",
"http-body 0.4.5",
"httparse",
"httpdate",
"itoa",
@@ -2489,6 +2490,26 @@ dependencies = [
"want",
]
[[package]]
name = "hyper"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "186548d73ac615b32a73aafe38fb4f56c0d340e110e5a200bcadbaf2e199263a"
dependencies = [
"bytes",
"futures-channel",
"futures-util",
"h2 0.4.4",
"http 1.1.0",
"http-body 1.0.0",
"httparse",
"httpdate",
"itoa",
"pin-project-lite",
"smallvec",
"tokio",
]
[[package]]
name = "hyper-rustls"
version = "0.24.0"
@@ -2496,7 +2517,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0646026eb1b3eea4cd9ba47912ea5ce9cc07713d105b1a14698f4e6433d348b7"
dependencies = [
"http 0.2.9",
"hyper",
"hyper 0.14.26",
"log",
"rustls 0.21.9",
"rustls-native-certs 0.6.2",
@@ -2510,7 +2531,7 @@ version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1"
dependencies = [
"hyper",
"hyper 0.14.26",
"pin-project-lite",
"tokio",
"tokio-io-timeout",
@@ -2523,7 +2544,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905"
dependencies = [
"bytes",
"hyper",
"hyper 0.14.26",
"native-tls",
"tokio",
"tokio-native-tls",
@@ -2531,15 +2552,33 @@ dependencies = [
[[package]]
name = "hyper-tungstenite"
version = "0.11.1"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7cc7dcb1ab67cd336f468a12491765672e61a3b6b148634dbfe2fe8acd3fe7d9"
checksum = "7a343d17fe7885302ed7252767dc7bb83609a874b6ff581142241ec4b73957ad"
dependencies = [
"hyper",
"http-body-util",
"hyper 1.2.0",
"hyper-util",
"pin-project-lite",
"tokio",
"tokio-tungstenite",
"tungstenite",
"tokio-tungstenite 0.21.0",
"tungstenite 0.21.0",
]
[[package]]
name = "hyper-util"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca38ef113da30126bbff9cd1705f9273e15d45498615d138b0c20279ac7a76aa"
dependencies = [
"bytes",
"futures-util",
"http 1.1.0",
"http-body 1.0.0",
"hyper 1.2.0",
"pin-project-lite",
"socket2 0.5.5",
"tokio",
]
[[package]]
@@ -2833,6 +2872,12 @@ version = "0.3.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"
[[package]]
name = "linux-raw-sys"
version = "0.4.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c"
[[package]]
name = "lock_api"
version = "0.4.10"
@@ -2887,11 +2932,12 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
[[package]]
name = "measured"
version = "0.0.13"
version = "0.0.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f246648d027839a34b420e27c7de1165ace96e19ef894985d0a6ff89a7840a9f"
checksum = "3cbf033874bea03565f2449572c8640ca37ec26300455faf36001f24755da452"
dependencies = [
"bytes",
"crossbeam-utils",
"hashbrown 0.14.0",
"itoa",
"lasso",
@@ -2904,16 +2950,27 @@ dependencies = [
[[package]]
name = "measured-derive"
version = "0.0.13"
version = "0.0.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "edaa5cc22d99d5d6d7d99c3b5b5f7e7f8034c22f1b5d62a1adecd2ed005d9b80"
checksum = "be9e29b682b38f8af2a89f960455054ab1a9f5a06822f6f3500637ad9fa57def"
dependencies = [
"heck",
"heck 0.5.0",
"proc-macro2",
"quote",
"syn 2.0.52",
]
[[package]]
name = "measured-process"
version = "0.0.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a20849acdd04c5d6a88f565559044546904648a1842a2937cfff0b48b4ca7ef2"
dependencies = [
"libc",
"measured",
"procfs 0.16.0",
]
[[package]]
name = "memchr"
version = "2.6.4"
@@ -2953,8 +3010,10 @@ version = "0.1.0"
dependencies = [
"chrono",
"libc",
"measured",
"measured-process",
"once_cell",
"procfs",
"procfs 0.14.2",
"prometheus",
"rand 0.8.5",
"rand_distr",
@@ -3504,12 +3563,17 @@ dependencies = [
"camino",
"clap",
"git-version",
"humantime",
"pageserver",
"pageserver_api",
"postgres_ffi",
"remote_storage",
"serde",
"serde_json",
"svg_fmt",
"tokio",
"tokio-util",
"toml_edit",
"utils",
"workspace_hack",
]
@@ -3545,7 +3609,7 @@ dependencies = [
"hex-literal",
"humantime",
"humantime-serde",
"hyper",
"hyper 0.14.26",
"itertools",
"leaky-bucket",
"md5",
@@ -3564,7 +3628,7 @@ dependencies = [
"postgres_connection",
"postgres_ffi",
"pq_proto",
"procfs",
"procfs 0.14.2",
"rand 0.8.5",
"regex",
"remote_storage",
@@ -3655,7 +3719,6 @@ dependencies = [
"anyhow",
"async-compression",
"async-stream",
"async-trait",
"byteorder",
"bytes",
"chrono",
@@ -4125,6 +4188,29 @@ dependencies = [
"rustix 0.36.16",
]
[[package]]
name = "procfs"
version = "0.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4"
dependencies = [
"bitflags 2.4.1",
"hex",
"lazy_static",
"procfs-core",
"rustix 0.38.28",
]
[[package]]
name = "procfs-core"
version = "0.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29"
dependencies = [
"bitflags 2.4.1",
"hex",
]
[[package]]
name = "prometheus"
version = "0.13.3"
@@ -4137,7 +4223,7 @@ dependencies = [
"libc",
"memchr",
"parking_lot 0.12.1",
"procfs",
"procfs 0.14.2",
"thiserror",
]
@@ -4158,7 +4244,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270"
dependencies = [
"bytes",
"heck",
"heck 0.4.1",
"itertools",
"lazy_static",
"log",
@@ -4202,6 +4288,7 @@ dependencies = [
"anyhow",
"async-compression",
"async-trait",
"atomic-take",
"aws-config",
"aws-sdk-iam",
"aws-sigv4",
@@ -4225,9 +4312,12 @@ dependencies = [
"hmac",
"hostname",
"http 1.1.0",
"http-body-util",
"humantime",
"hyper",
"hyper 0.14.26",
"hyper 1.2.0",
"hyper-tungstenite",
"hyper-util",
"ipnet",
"itertools",
"lasso",
@@ -4560,7 +4650,7 @@ dependencies = [
"futures-util",
"http-types",
"humantime",
"hyper",
"hyper 0.14.26",
"itertools",
"metrics",
"once_cell",
@@ -4590,10 +4680,10 @@ dependencies = [
"encoding_rs",
"futures-core",
"futures-util",
"h2",
"h2 0.3.26",
"http 0.2.9",
"http-body",
"hyper",
"http-body 0.4.5",
"hyper 0.14.26",
"hyper-rustls",
"hyper-tls",
"ipnet",
@@ -4651,7 +4741,7 @@ dependencies = [
"futures",
"getrandom 0.2.11",
"http 0.2.9",
"hyper",
"hyper 0.14.26",
"parking_lot 0.11.2",
"reqwest",
"reqwest-middleware",
@@ -4738,7 +4828,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "496c1d3718081c45ba9c31fbfc07417900aa96f4070ff90dc29961836b7a9945"
dependencies = [
"http 0.2.9",
"hyper",
"hyper 0.14.26",
"lazy_static",
"percent-encoding",
"regex",
@@ -4850,6 +4940,19 @@ dependencies = [
"windows-sys 0.48.0",
]
[[package]]
name = "rustix"
version = "0.38.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316"
dependencies = [
"bitflags 2.4.1",
"errno",
"libc",
"linux-raw-sys 0.4.13",
"windows-sys 0.52.0",
]
[[package]]
name = "rustls"
version = "0.21.9"
@@ -5030,7 +5133,7 @@ dependencies = [
"git-version",
"hex",
"humantime",
"hyper",
"hyper 0.14.26",
"metrics",
"once_cell",
"parking_lot 0.12.1",
@@ -5515,9 +5618,9 @@ dependencies = [
[[package]]
name = "smallvec"
version = "1.11.0"
version = "1.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9"
checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7"
[[package]]
name = "smol_str"
@@ -5609,7 +5712,7 @@ dependencies = [
"futures-util",
"git-version",
"humantime",
"hyper",
"hyper 0.14.26",
"metrics",
"once_cell",
"parking_lot 0.12.1",
@@ -5623,6 +5726,45 @@ dependencies = [
"workspace_hack",
]
[[package]]
name = "storage_controller"
version = "0.1.0"
dependencies = [
"anyhow",
"aws-config",
"bytes",
"camino",
"clap",
"control_plane",
"diesel",
"diesel_migrations",
"fail",
"futures",
"git-version",
"hex",
"humantime",
"hyper 0.14.26",
"itertools",
"lasso",
"measured",
"metrics",
"once_cell",
"pageserver_api",
"pageserver_client",
"postgres_connection",
"r2d2",
"reqwest",
"routerify",
"serde",
"serde_json",
"thiserror",
"tokio",
"tokio-util",
"tracing",
"utils",
"workspace_hack",
]
[[package]]
name = "storcon_cli"
version = "0.1.0"
@@ -5630,7 +5772,7 @@ dependencies = [
"anyhow",
"clap",
"comfy-table",
"hyper",
"hyper 0.14.26",
"pageserver_api",
"pageserver_client",
"reqwest",
@@ -5671,7 +5813,7 @@ version = "0.24.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59"
dependencies = [
"heck",
"heck 0.4.1",
"proc-macro2",
"quote",
"rustversion",
@@ -5799,23 +5941,23 @@ dependencies = [
[[package]]
name = "test-context"
version = "0.1.4"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "055831a02a4f5aa28fede67f2902014273eb8c21b958ac5ebbd59b71ef30dbc3"
checksum = "6676ab8513edfd2601a108621103fdb45cac9098305ca25ec93f7023b06b05d9"
dependencies = [
"async-trait",
"futures",
"test-context-macros",
]
[[package]]
name = "test-context-macros"
version = "0.1.4"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8901a55b0a7a06ebc4a674dcca925170da8e613fa3b163a1df804ed10afb154d"
checksum = "78ea17a2dc368aeca6f554343ced1b1e31f76d63683fa8016e5844bd7a5144a1"
dependencies = [
"proc-macro2",
"quote",
"syn 1.0.109",
"syn 2.0.52",
]
[[package]]
@@ -6113,7 +6255,19 @@ dependencies = [
"futures-util",
"log",
"tokio",
"tungstenite",
"tungstenite 0.20.1",
]
[[package]]
name = "tokio-tungstenite"
version = "0.21.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c83b561d025642014097b66e6c1bb422783339e0909e4429cde4749d1990bc38"
dependencies = [
"futures-util",
"log",
"tokio",
"tungstenite 0.21.0",
]
[[package]]
@@ -6180,10 +6334,10 @@ dependencies = [
"bytes",
"futures-core",
"futures-util",
"h2",
"h2 0.3.26",
"http 0.2.9",
"http-body",
"hyper",
"http-body 0.4.5",
"hyper 0.14.26",
"hyper-timeout",
"percent-encoding",
"pin-project",
@@ -6369,7 +6523,7 @@ dependencies = [
name = "tracing-utils"
version = "0.1.0"
dependencies = [
"hyper",
"hyper 0.14.26",
"opentelemetry",
"opentelemetry-otlp",
"opentelemetry-semantic-conventions",
@@ -6406,6 +6560,25 @@ dependencies = [
"utf-8",
]
[[package]]
name = "tungstenite"
version = "0.21.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ef1a641ea34f399a848dea702823bbecfb4c486f911735368f1f137cb8257e1"
dependencies = [
"byteorder",
"bytes",
"data-encoding",
"http 1.1.0",
"httparse",
"log",
"rand 0.8.5",
"sha1",
"thiserror",
"url",
"utf-8",
]
[[package]]
name = "twox-hash"
version = "1.6.3"
@@ -6570,7 +6743,8 @@ dependencies = [
"heapless",
"hex",
"hex-literal",
"hyper",
"humantime",
"hyper 0.14.26",
"jsonwebtoken",
"leaky-bucket",
"metrics",
@@ -6930,6 +7104,15 @@ dependencies = [
"windows-targets 0.48.0",
]
[[package]]
name = "windows-sys"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
dependencies = [
"windows-targets 0.52.4",
]
[[package]]
name = "windows-targets"
version = "0.42.2"
@@ -6960,6 +7143,21 @@ dependencies = [
"windows_x86_64_msvc 0.48.0",
]
[[package]]
name = "windows-targets"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b"
dependencies = [
"windows_aarch64_gnullvm 0.52.4",
"windows_aarch64_msvc 0.52.4",
"windows_i686_gnu 0.52.4",
"windows_i686_msvc 0.52.4",
"windows_x86_64_gnu 0.52.4",
"windows_x86_64_gnullvm 0.52.4",
"windows_x86_64_msvc 0.52.4",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.42.2"
@@ -6972,6 +7170,12 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc"
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9"
[[package]]
name = "windows_aarch64_msvc"
version = "0.42.2"
@@ -6984,6 +7188,12 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675"
[[package]]
name = "windows_i686_gnu"
version = "0.42.2"
@@ -6996,6 +7206,12 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241"
[[package]]
name = "windows_i686_gnu"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3"
[[package]]
name = "windows_i686_msvc"
version = "0.42.2"
@@ -7008,6 +7224,12 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00"
[[package]]
name = "windows_i686_msvc"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02"
[[package]]
name = "windows_x86_64_gnu"
version = "0.42.2"
@@ -7020,6 +7242,12 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.42.2"
@@ -7032,6 +7260,12 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177"
[[package]]
name = "windows_x86_64_msvc"
version = "0.42.2"
@@ -7044,6 +7278,12 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8"
[[package]]
name = "winnow"
version = "0.4.6"
@@ -7092,11 +7332,10 @@ dependencies = [
"futures-sink",
"futures-util",
"getrandom 0.2.11",
"hashbrown 0.13.2",
"hashbrown 0.14.0",
"hex",
"hmac",
"hyper",
"hyper 0.14.26",
"indexmap 1.9.3",
"itertools",
"libc",
@@ -7134,7 +7373,6 @@ dependencies = [
"tower",
"tracing",
"tracing-core",
"tungstenite",
"url",
"uuid",
"zeroize",

View File

@@ -3,7 +3,6 @@ resolver = "2"
members = [
"compute_tools",
"control_plane",
"control_plane/attachment_service",
"control_plane/storcon_cli",
"pageserver",
"pageserver/compaction",
@@ -13,6 +12,7 @@ members = [
"proxy",
"safekeeper",
"storage_broker",
"storage_controller",
"s3_scrubber",
"workspace_hack",
"trace",
@@ -44,6 +44,7 @@ license = "Apache-2.0"
anyhow = { version = "1.0", features = ["backtrace"] }
arc-swap = "1.6"
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
atomic-take = "1.1.0"
azure_core = "0.18"
azure_identity = "0.18"
azure_storage = "0.18"
@@ -97,7 +98,7 @@ http-types = { version = "2", default-features = false }
humantime = "2.1"
humantime-serde = "1.1.1"
hyper = "0.14"
hyper-tungstenite = "0.11"
hyper-tungstenite = "0.13.0"
inotify = "0.10.2"
ipnet = "2.9.0"
itertools = "0.10"
@@ -106,7 +107,8 @@ lasso = "0.7"
leaky-bucket = "1.0.1"
libc = "0.2"
md5 = "0.7.0"
measured = { version = "0.0.13", features=["default", "lasso"] }
measured = { version = "0.0.20", features=["lasso"] }
measured-process = { version = "0.0.20" }
memoffset = "0.8"
native-tls = "0.2"
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
@@ -159,7 +161,7 @@ svg_fmt = "0.4.1"
sync_wrapper = "0.1.2"
tar = "0.4"
task-local-extensions = "0.1.4"
test-context = "0.1"
test-context = "0.3"
thiserror = "1.0"
tikv-jemallocator = "0.5"
tikv-jemalloc-ctl = "0.5"

View File

@@ -6,8 +6,8 @@ use std::path::Path;
use anyhow::Result;
use crate::pg_helpers::escape_conf_value;
use crate::pg_helpers::PgOptionsSerialize;
use compute_api::spec::{ComputeMode, ComputeSpec};
use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize};
use compute_api::spec::{ComputeMode, ComputeSpec, GenericOption};
/// Check that `line` is inside a text file and put it there if it is not.
/// Create file if it doesn't exist.
@@ -92,6 +92,27 @@ pub fn write_postgres_conf(
}
}
if cfg!(target_os = "linux") {
// Check /proc/sys/vm/overcommit_memory -- if it equals 2 (i.e. linux memory overcommit is
// disabled), then the control plane has enabled swap and we should set
// dynamic_shared_memory_type = 'mmap'.
//
// This is (maybe?) temporary - for more, see https://github.com/neondatabase/cloud/issues/12047.
let overcommit_memory_contents = std::fs::read_to_string("/proc/sys/vm/overcommit_memory")
// ignore any errors - they may be expected to occur under certain situations (e.g. when
// not running in Linux).
.unwrap_or_else(|_| String::new());
if overcommit_memory_contents.trim() == "2" {
let opt = GenericOption {
name: "dynamic_shared_memory_type".to_owned(),
value: Some("mmap".to_owned()),
vartype: "enum".to_owned(),
};
write!(file, "{}", opt.to_pg_setting())?;
}
}
// If there are any extra options in the 'settings' field, append those
if spec.cluster.settings.is_some() {
writeln!(file, "# Managed by compute_ctl: begin")?;

View File

@@ -44,7 +44,7 @@ pub fn escape_conf_value(s: &str) -> String {
format!("'{}'", res)
}
trait GenericOptionExt {
pub trait GenericOptionExt {
fn to_pg_option(&self) -> String;
fn to_pg_setting(&self) -> String;
}

View File

@@ -86,7 +86,10 @@ where
.stdout(process_log_file)
.stderr(same_file_for_stderr)
.args(args);
let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command));
let filled_cmd = fill_env_vars_prefixed_neon(fill_remote_storage_secrets_vars(
fill_rust_env_vars(background_command),
));
filled_cmd.envs(envs);
let pid_file_to_check = match &initial_pid_file {
@@ -268,6 +271,15 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {
cmd
}
fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command {
for (var, val) in std::env::vars() {
if var.starts_with("NEON_PAGESERVER_") {
cmd = cmd.env(var, val);
}
}
cmd
}
/// Add a `pre_exec` to the cmd that, inbetween fork() and exec(),
/// 1. Claims a pidfile with a fcntl lock on it and
/// 2. Sets up the pidfile's file descriptor so that it (and the lock)

View File

@@ -223,7 +223,7 @@ impl Client {
}
}
/// Simple HTTP request wrapper for calling into attachment service
/// Simple HTTP request wrapper for calling into storage controller
async fn dispatch<RQ, RS>(
&self,
method: hyper::Method,

View File

@@ -2,8 +2,8 @@
# see https://diesel.rs/guides/configuring-diesel-cli
[print_schema]
file = "control_plane/attachment_service/src/schema.rs"
file = "storage_controller/src/schema.rs"
custom_type_derives = ["diesel::query_builder::QueryId"]
[migrations_directory]
dir = "control_plane/attachment_service/migrations"
dir = "storage_controller/migrations"

View File

@@ -7,6 +7,11 @@ Below you will find a brief overview of each subdir in the source tree in alphab
Neon storage broker, providing messaging between safekeepers and pageservers.
[storage_broker.md](./storage_broker.md)
`storage_controller`:
Neon storage controller, manages a cluster of pageservers and exposes an API that enables
managing a many-sharded tenant as a single entity.
`/control_plane`:
Local control plane.

View File

@@ -10,11 +10,13 @@ libc.workspace = true
once_cell.workspace = true
chrono.workspace = true
twox-hash.workspace = true
measured.workspace = true
workspace_hack.workspace = true
[target.'cfg(target_os = "linux")'.dependencies]
procfs.workspace = true
measured-process.workspace = true
[dev-dependencies]
rand = "0.8"

View File

@@ -4,6 +4,17 @@
//! a default registry.
#![deny(clippy::undocumented_unsafe_blocks)]
use measured::{
label::{LabelGroupVisitor, LabelName, NoLabels},
metric::{
counter::CounterState,
gauge::GaugeState,
group::{Encoding, MetricValue},
name::{MetricName, MetricNameEncoder},
MetricEncoding, MetricFamilyEncoding,
},
FixedCardinalityLabel, LabelGroup, MetricGroup,
};
use once_cell::sync::Lazy;
use prometheus::core::{
Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec,
@@ -11,6 +22,7 @@ use prometheus::core::{
pub use prometheus::opts;
pub use prometheus::register;
pub use prometheus::Error;
use prometheus::Registry;
pub use prometheus::{core, default_registry, proto};
pub use prometheus::{exponential_buckets, linear_buckets};
pub use prometheus::{register_counter_vec, Counter, CounterVec};
@@ -23,7 +35,6 @@ pub use prometheus::{register_int_counter_vec, IntCounterVec};
pub use prometheus::{register_int_gauge, IntGauge};
pub use prometheus::{register_int_gauge_vec, IntGaugeVec};
pub use prometheus::{Encoder, TextEncoder};
use prometheus::{Registry, Result};
pub mod launch_timestamp;
mod wrappers;
@@ -59,7 +70,7 @@ static INTERNAL_REGISTRY: Lazy<Registry> = Lazy::new(Registry::new);
/// Register a collector in the internal registry. MUST be called before the first call to `gather()`.
/// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector
/// while holding the lock.
pub fn register_internal(c: Box<dyn Collector>) -> Result<()> {
pub fn register_internal(c: Box<dyn Collector>) -> prometheus::Result<()> {
INTERNAL_REGISTRY.register(c)
}
@@ -96,6 +107,127 @@ pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,
];
pub struct BuildInfo {
pub revision: &'static str,
pub build_tag: &'static str,
}
// todo: allow label group without the set
impl LabelGroup for BuildInfo {
fn visit_values(&self, v: &mut impl LabelGroupVisitor) {
const REVISION: &LabelName = LabelName::from_str("revision");
v.write_value(REVISION, &self.revision);
const BUILD_TAG: &LabelName = LabelName::from_str("build_tag");
v.write_value(BUILD_TAG, &self.build_tag);
}
}
impl<T: Encoding> MetricFamilyEncoding<T> for BuildInfo
where
GaugeState: MetricEncoding<T>,
{
fn collect_family_into(
&self,
name: impl measured::metric::name::MetricNameEncoder,
enc: &mut T,
) -> Result<(), T::Err> {
enc.write_help(&name, "Build/version information")?;
GaugeState::write_type(&name, enc)?;
GaugeState {
count: std::sync::atomic::AtomicI64::new(1),
}
.collect_into(&(), self, name, enc)
}
}
#[derive(MetricGroup)]
#[metric(new(build_info: BuildInfo))]
pub struct NeonMetrics {
#[cfg(target_os = "linux")]
#[metric(namespace = "process")]
#[metric(init = measured_process::ProcessCollector::for_self())]
process: measured_process::ProcessCollector,
#[metric(namespace = "libmetrics")]
#[metric(init = LibMetrics::new(build_info))]
libmetrics: LibMetrics,
}
#[derive(MetricGroup)]
#[metric(new(build_info: BuildInfo))]
pub struct LibMetrics {
#[metric(init = build_info)]
build_info: BuildInfo,
#[metric(flatten)]
rusage: Rusage,
serve_count: CollectionCounter,
}
fn write_gauge<Enc: Encoding>(
x: i64,
labels: impl LabelGroup,
name: impl MetricNameEncoder,
enc: &mut Enc,
) -> Result<(), Enc::Err> {
enc.write_metric_value(name, labels, MetricValue::Int(x))
}
#[derive(Default)]
struct Rusage;
#[derive(FixedCardinalityLabel, Clone, Copy)]
#[label(singleton = "io_operation")]
enum IoOp {
Read,
Write,
}
impl<T: Encoding> MetricGroup<T> for Rusage
where
GaugeState: MetricEncoding<T>,
{
fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
const DISK_IO: &MetricName = MetricName::from_str("disk_io_bytes_total");
const MAXRSS: &MetricName = MetricName::from_str("maxrss_kb");
let ru = get_rusage_stats();
enc.write_help(
DISK_IO,
"Bytes written and read from disk, grouped by the operation (read|write)",
)?;
GaugeState::write_type(DISK_IO, enc)?;
write_gauge(ru.ru_inblock * BYTES_IN_BLOCK, IoOp::Read, DISK_IO, enc)?;
write_gauge(ru.ru_oublock * BYTES_IN_BLOCK, IoOp::Write, DISK_IO, enc)?;
enc.write_help(MAXRSS, "Memory usage (Maximum Resident Set Size)")?;
GaugeState::write_type(MAXRSS, enc)?;
write_gauge(ru.ru_maxrss, IoOp::Read, MAXRSS, enc)?;
Ok(())
}
}
#[derive(Default)]
struct CollectionCounter(CounterState);
impl<T: Encoding> MetricFamilyEncoding<T> for CollectionCounter
where
CounterState: MetricEncoding<T>,
{
fn collect_family_into(
&self,
name: impl measured::metric::name::MetricNameEncoder,
enc: &mut T,
) -> Result<(), T::Err> {
self.0.inc();
enc.write_help(&name, "Number of metric requests made")?;
self.0.collect_into(&(), NoLabels, name, enc)
}
}
pub fn set_build_info_metric(revision: &str, build_tag: &str) {
let metric = register_int_gauge_vec!(
"libmetrics_build_info",
@@ -105,6 +237,7 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) {
.expect("Failed to register build info metric");
metric.with_label_values(&[revision, build_tag]).set(1);
}
const BYTES_IN_BLOCK: i64 = 512;
// Records I/O stats in a "cross-platform" way.
// Compiles both on macOS and Linux, but current macOS implementation always returns 0 as values for I/O stats.
@@ -117,7 +250,6 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) {
fn update_rusage_metrics() {
let rusage_stats = get_rusage_stats();
const BYTES_IN_BLOCK: i64 = 512;
DISK_IO_BYTES
.with_label_values(&["read"])
.set(rusage_stats.ru_inblock * BYTES_IN_BLOCK);
@@ -151,6 +283,7 @@ macro_rules! register_int_counter_pair_vec {
}
}};
}
/// Create an [`IntCounterPair`] and registers to default registry.
#[macro_export(local_inner_macros)]
macro_rules! register_int_counter_pair {
@@ -188,7 +321,10 @@ impl<P: Atomic> GenericCounterPairVec<P> {
///
/// An error is returned if the number of label values is not the same as the
/// number of VariableLabels in Desc.
pub fn get_metric_with_label_values(&self, vals: &[&str]) -> Result<GenericCounterPair<P>> {
pub fn get_metric_with_label_values(
&self,
vals: &[&str],
) -> prometheus::Result<GenericCounterPair<P>> {
Ok(GenericCounterPair {
inc: self.inc.get_metric_with_label_values(vals)?,
dec: self.dec.get_metric_with_label_values(vals)?,
@@ -201,7 +337,7 @@ impl<P: Atomic> GenericCounterPairVec<P> {
self.get_metric_with_label_values(vals).unwrap()
}
pub fn remove_label_values(&self, res: &mut [Result<()>; 2], vals: &[&str]) {
pub fn remove_label_values(&self, res: &mut [prometheus::Result<()>; 2], vals: &[&str]) {
res[0] = self.inc.remove_label_values(vals);
res[1] = self.dec.remove_label_values(vals);
}

View File

@@ -2,7 +2,7 @@ use std::str::FromStr;
/// Request/response types for the storage controller
/// API (`/control/v1` prefix). Implemented by the server
/// in [`attachment_service::http`]
/// in [`storage_controller::http`]
use serde::{Deserialize, Serialize};
use utils::id::{NodeId, TenantId};

View File

@@ -20,6 +20,7 @@ use utils::{
history_buffer::HistoryBufferWithDropCounter,
id::{NodeId, TenantId, TimelineId},
lsn::Lsn,
serde_system_time,
};
use crate::controller_api::PlacementPolicy;
@@ -758,11 +759,7 @@ pub struct WalRedoManagerStatus {
#[derive(Default, Debug, Serialize, Deserialize, Clone)]
pub struct SecondaryProgress {
/// The remote storage LastModified time of the heatmap object we last downloaded.
#[serde(
serialize_with = "opt_ser_rfc3339_millis",
deserialize_with = "opt_deser_rfc3339_millis"
)]
pub heatmap_mtime: Option<SystemTime>,
pub heatmap_mtime: Option<serde_system_time::SystemTime>,
/// The number of layers currently on-disk
pub layers_downloaded: usize,
@@ -775,29 +772,6 @@ pub struct SecondaryProgress {
pub bytes_total: u64,
}
fn opt_ser_rfc3339_millis<S: serde::Serializer>(
ts: &Option<SystemTime>,
serializer: S,
) -> Result<S::Ok, S::Error> {
match ts {
Some(ts) => serializer.collect_str(&humantime::format_rfc3339_millis(*ts)),
None => serializer.serialize_none(),
}
}
fn opt_deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<Option<SystemTime>, D::Error>
where
D: serde::de::Deserializer<'de>,
{
let s: Option<String> = serde::de::Deserialize::deserialize(deserializer)?;
match s {
None => Ok(None),
Some(s) => humantime::parse_rfc3339(&s)
.map_err(serde::de::Error::custom)
.map(Some),
}
}
pub mod virtual_file {
#[derive(
Copy,

View File

@@ -1,4 +1,4 @@
use std::time::SystemTime;
use utils::serde_system_time::SystemTime;
/// Pageserver current utilization and scoring for how good candidate the pageserver would be for
/// the next tenant.
@@ -21,28 +21,9 @@ pub struct PageserverUtilization {
/// When was this snapshot captured, pageserver local time.
///
/// Use millis to give confidence that the value is regenerated often enough.
#[serde(
serialize_with = "ser_rfc3339_millis",
deserialize_with = "deser_rfc3339_millis"
)]
pub captured_at: SystemTime,
}
fn ser_rfc3339_millis<S: serde::Serializer>(
ts: &SystemTime,
serializer: S,
) -> Result<S::Ok, S::Error> {
serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
}
fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<SystemTime, D::Error>
where
D: serde::de::Deserializer<'de>,
{
let s: String = serde::de::Deserialize::deserialize(deserializer)?;
humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom)
}
/// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
///
/// Instead of newtype, use this because a newtype would get require handling deserializing values
@@ -69,7 +50,9 @@ mod tests {
disk_usage_bytes: u64::MAX,
free_space_bytes: 0,
utilization_score: u64::MAX,
captured_at: SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
captured_at: SystemTime(
std::time::SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
),
};
let s = serde_json::to_string(&doc).unwrap();

View File

@@ -57,7 +57,6 @@ enum MaybeEnabledStorage {
Disabled,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledStorage {
async fn setup() -> Self {
ensure_logging_ready();
@@ -86,7 +85,6 @@ struct AzureWithTestBlobs {
remote_blobs: HashSet<RemotePath>,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
async fn setup() -> Self {
ensure_logging_ready();
@@ -148,7 +146,6 @@ struct AzureWithSimpleTestBlobs {
remote_blobs: HashSet<RemotePath>,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
async fn setup() -> Self {
ensure_logging_ready();

View File

@@ -219,7 +219,6 @@ enum MaybeEnabledStorage {
Disabled,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledStorage {
async fn setup() -> Self {
ensure_logging_ready();
@@ -248,7 +247,6 @@ struct S3WithTestBlobs {
remote_blobs: HashSet<RemotePath>,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
async fn setup() -> Self {
ensure_logging_ready();
@@ -310,7 +308,6 @@ struct S3WithSimpleTestBlobs {
remote_blobs: HashSet<RemotePath>,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
async fn setup() -> Self {
ensure_logging_ready();

View File

@@ -22,6 +22,7 @@ camino.workspace = true
chrono.workspace = true
heapless.workspace = true
hex = { workspace = true, features = ["serde"] }
humantime.workspace = true
hyper = { workspace = true, features = ["full"] }
fail.workspace = true
futures = { workspace = true}

21
libs/utils/src/env.rs Normal file
View File

@@ -0,0 +1,21 @@
//! Wrapper around `std::env::var` for parsing environment variables.
use std::{fmt::Display, str::FromStr};
pub fn var<V, E>(varname: &str) -> Option<V>
where
V: FromStr<Err = E>,
E: Display,
{
match std::env::var(varname) {
Ok(s) => Some(
s.parse()
.map_err(|e| format!("failed to parse env var {varname}: {e:#}"))
.unwrap(),
),
Err(std::env::VarError::NotPresent) => None,
Err(std::env::VarError::NotUnicode(_)) => {
panic!("env var {varname} is not unicode")
}
}
}

View File

@@ -63,6 +63,7 @@ pub mod measured_stream;
pub mod serde_percent;
pub mod serde_regex;
pub mod serde_system_time;
pub mod pageserver_feedback;
@@ -89,6 +90,8 @@ pub mod yielding_loop;
pub mod zstd;
pub mod env;
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
///
/// we have several cases:

View File

@@ -0,0 +1,55 @@
//! A `serde::{Deserialize,Serialize}` type for SystemTime with RFC3339 format and millisecond precision.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, serde::Serialize, serde::Deserialize)]
#[serde(transparent)]
pub struct SystemTime(
#[serde(
deserialize_with = "deser_rfc3339_millis",
serialize_with = "ser_rfc3339_millis"
)]
pub std::time::SystemTime,
);
fn ser_rfc3339_millis<S: serde::ser::Serializer>(
ts: &std::time::SystemTime,
serializer: S,
) -> Result<S::Ok, S::Error> {
serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
}
fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<std::time::SystemTime, D::Error>
where
D: serde::de::Deserializer<'de>,
{
let s: String = serde::de::Deserialize::deserialize(deserializer)?;
humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom)
}
#[cfg(test)]
mod tests {
use super::*;
/// Helper function to make a SystemTime have millisecond precision by truncating additional nanoseconds.
fn to_millisecond_precision(time: SystemTime) -> SystemTime {
match time.0.duration_since(std::time::SystemTime::UNIX_EPOCH) {
Ok(duration) => {
let total_millis = duration.as_secs() * 1_000 + u64::from(duration.subsec_millis());
SystemTime(
std::time::SystemTime::UNIX_EPOCH
+ std::time::Duration::from_millis(total_millis),
)
}
Err(_) => time,
}
}
#[test]
fn test_serialize_deserialize() {
let input = SystemTime(std::time::SystemTime::now());
let expected_serialized = format!("\"{}\"", humantime::format_rfc3339_millis(input.0));
let serialized = serde_json::to_string(&input).unwrap();
assert_eq!(expected_serialized, serialized);
let deserialized: SystemTime = serde_json::from_str(&expected_serialized).unwrap();
assert_eq!(to_millisecond_precision(input), deserialized);
}
}

View File

@@ -27,25 +27,25 @@
//!
//! # Reference Numbers
//!
//! 2024-03-20 on i3en.3xlarge
//! 2024-04-04 on i3en.3xlarge
//!
//! ```text
//! short/1 time: [26.483 µs 26.614 µs 26.767 µs]
//! short/2 time: [32.223 µs 32.465 µs 32.767 µs]
//! short/4 time: [47.203 µs 47.583 µs 47.984 µs]
//! short/8 time: [89.135 µs 89.612 µs 90.139 µs]
//! short/16 time: [190.12 µs 191.52 µs 192.88 µs]
//! short/32 time: [380.96 µs 382.63 µs 384.20 µs]
//! short/64 time: [736.86 µs 741.07 µs 745.03 µs]
//! short/128 time: [1.4106 ms 1.4206 ms 1.4294 ms]
//! medium/1 time: [111.81 µs 112.25 µs 112.79 µs]
//! medium/2 time: [158.26 µs 159.13 µs 160.21 µs]
//! medium/4 time: [334.65 µs 337.14 µs 340.07 µs]
//! medium/8 time: [675.32 µs 679.91 µs 685.25 µs]
//! medium/16 time: [1.2929 ms 1.2996 ms 1.3067 ms]
//! medium/32 time: [2.4295 ms 2.4461 ms 2.4623 ms]
//! medium/64 time: [4.3973 ms 4.4458 ms 4.4875 ms]
//! medium/128 time: [7.5955 ms 7.7847 ms 7.9481 ms]
//! short/1 time: [25.925 µs 26.060 µs 26.209 µs]
//! short/2 time: [31.277 µs 31.483 µs 31.722 µs]
//! short/4 time: [45.496 µs 45.831 µs 46.182 µs]
//! short/8 time: [84.298 µs 84.920 µs 85.566 µs]
//! short/16 time: [185.04 µs 186.41 µs 187.88 µs]
//! short/32 time: [385.01 µs 386.77 µs 388.70 µs]
//! short/64 time: [770.24 µs 773.04 µs 776.04 µs]
//! short/128 time: [1.5017 ms 1.5064 ms 1.5113 ms]
//! medium/1 time: [106.65 µs 107.20 µs 107.85 µs]
//! medium/2 time: [153.28 µs 154.24 µs 155.56 µs]
//! medium/4 time: [325.67 µs 327.01 µs 328.71 µs]
//! medium/8 time: [646.82 µs 650.17 µs 653.91 µs]
//! medium/16 time: [1.2645 ms 1.2701 ms 1.2762 ms]
//! medium/32 time: [2.4409 ms 2.4550 ms 2.4692 ms]
//! medium/64 time: [4.6814 ms 4.7114 ms 4.7408 ms]
//! medium/128 time: [8.7790 ms 8.9037 ms 9.0282 ms]
//! ```
use bytes::{Buf, Bytes};

View File

@@ -128,12 +128,12 @@ impl Client {
pub async fn timeline_info(
&self,
tenant_id: TenantId,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
force_await_logical_size: ForceAwaitLogicalSize,
) -> Result<pageserver_api::models::TimelineInfo> {
let uri = format!(
"{}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}",
self.mgmt_api_endpoint
);
@@ -151,11 +151,11 @@ impl Client {
pub async fn keyspace(
&self,
tenant_id: TenantId,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
) -> Result<pageserver_api::models::partitioning::Partitioning> {
let uri = format!(
"{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/keyspace",
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/keyspace",
self.mgmt_api_endpoint
);
self.get(&uri)

View File

@@ -11,7 +11,6 @@ default = []
anyhow.workspace = true
async-compression.workspace = true
async-stream.workspace = true
async-trait.workspace = true
byteorder.workspace = true
bytes.workspace = true
chrono = { workspace = true, features = ["serde"] }

View File

@@ -180,7 +180,7 @@ where
match top.deref_mut() {
LazyLoadLayer::Unloaded(ref mut l) => {
let fut = l.load_keys(this.ctx);
this.load_future.set(Some(fut));
this.load_future.set(Some(Box::pin(fut)));
continue;
}
LazyLoadLayer::Loaded(ref mut entries) => {

View File

@@ -3,7 +3,6 @@
//!
//! All the heavy lifting is done by the create_image and create_delta
//! functions that the implementor provides.
use async_trait::async_trait;
use futures::Future;
use pageserver_api::{key::Key, keyspace::key_range_size};
use std::ops::Range;
@@ -141,18 +140,16 @@ pub trait CompactionLayer<K: CompactionKey + ?Sized> {
fn is_delta(&self) -> bool;
}
#[async_trait]
pub trait CompactionDeltaLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {
type DeltaEntry<'a>: CompactionDeltaEntry<'a, E::Key>
where
Self: 'a;
/// Return all keys in this delta layer.
async fn load_keys<'a>(
fn load_keys<'a>(
&self,
ctx: &E::RequestContext,
) -> anyhow::Result<Vec<Self::DeltaEntry<'_>>>;
) -> impl Future<Output = anyhow::Result<Vec<Self::DeltaEntry<'_>>>> + Send;
}
pub trait CompactionImageLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {}

View File

@@ -2,7 +2,6 @@ mod draw;
use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp};
use async_trait::async_trait;
use futures::StreamExt;
use rand::Rng;
use tracing::info;
@@ -139,7 +138,6 @@ impl interface::CompactionLayer<Key> for Arc<MockDeltaLayer> {
}
}
#[async_trait]
impl interface::CompactionDeltaLayer<MockTimeline> for Arc<MockDeltaLayer> {
type DeltaEntry<'a> = MockRecord;

View File

@@ -12,9 +12,14 @@ bytes.workspace = true
camino.workspace = true
clap = { workspace = true, features = ["string"] }
git-version.workspace = true
humantime.workspace = true
pageserver = { path = ".." }
pageserver_api.workspace = true
remote_storage = { path = "../../libs/remote_storage" }
postgres_ffi.workspace = true
tokio.workspace = true
tokio-util.workspace = true
toml_edit.workspace = true
utils.workspace = true
svg_fmt.workspace = true
workspace_hack.workspace = true

View File

@@ -9,6 +9,11 @@ mod index_part;
mod layer_map_analyzer;
mod layers;
use std::{
str::FromStr,
time::{Duration, SystemTime},
};
use camino::{Utf8Path, Utf8PathBuf};
use clap::{Parser, Subcommand};
use index_part::IndexPartCmd;
@@ -20,8 +25,16 @@ use pageserver::{
tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
virtual_file,
};
use pageserver_api::shard::TenantShardId;
use postgres_ffi::ControlFileData;
use utils::{lsn::Lsn, project_git_version};
use remote_storage::{RemotePath, RemoteStorageConfig};
use tokio_util::sync::CancellationToken;
use utils::{
id::TimelineId,
logging::{self, LogFormat, TracingErrorLayerEnablement},
lsn::Lsn,
project_git_version,
};
project_git_version!(GIT_VERSION);
@@ -43,6 +56,7 @@ enum Commands {
#[command(subcommand)]
IndexPart(IndexPartCmd),
PrintLayerFile(PrintLayerFileCmd),
TimeTravelRemotePrefix(TimeTravelRemotePrefixCmd),
DrawTimeline {},
AnalyzeLayerMap(AnalyzeLayerMapCmd),
#[command(subcommand)]
@@ -68,6 +82,26 @@ struct PrintLayerFileCmd {
path: Utf8PathBuf,
}
/// Roll back the time for the specified prefix using S3 history.
///
/// The command is fairly low level and powerful. Validation is only very light,
/// so it is more powerful, and thus potentially more dangerous.
#[derive(Parser)]
struct TimeTravelRemotePrefixCmd {
/// A configuration string for the remote_storage configuration.
///
/// Example: `remote_storage = { bucket_name = "aws-storage-bucket-name", bucket_region = "us-east-2" }`
config_toml_str: String,
/// remote prefix to time travel recover. For safety reasons, we require it to contain
/// a timeline or tenant ID in the prefix.
prefix: String,
/// Timestamp to travel to. Given in format like `2024-01-20T10:45:45Z`. Assumes UTC and second accuracy.
travel_to: String,
/// Timestamp of the start of the operation, must be after any changes we want to roll back and after.
/// You can use a few seconds before invoking the command. Same format as `travel_to`.
done_if_after: Option<String>,
}
#[derive(Parser)]
struct AnalyzeLayerMapCmd {
/// Pageserver data path
@@ -78,6 +112,14 @@ struct AnalyzeLayerMapCmd {
#[tokio::main]
async fn main() -> anyhow::Result<()> {
logging::init(
LogFormat::Plain,
TracingErrorLayerEnablement::EnableWithRustLogFilter,
logging::Output::Stdout,
)?;
logging::replace_panic_hook_with_tracing_panic_hook().forget();
let cli = CliOpts::parse();
match cli.command {
@@ -105,6 +147,42 @@ async fn main() -> anyhow::Result<()> {
print_layerfile(&cmd.path).await?;
}
}
Commands::TimeTravelRemotePrefix(cmd) => {
let timestamp = humantime::parse_rfc3339(&cmd.travel_to)
.map_err(|_e| anyhow::anyhow!("Invalid time for travel_to: '{}'", cmd.travel_to))?;
let done_if_after = if let Some(done_if_after) = &cmd.done_if_after {
humantime::parse_rfc3339(done_if_after).map_err(|_e| {
anyhow::anyhow!("Invalid time for done_if_after: '{}'", done_if_after)
})?
} else {
const SAFETY_MARGIN: Duration = Duration::from_secs(3);
tokio::time::sleep(SAFETY_MARGIN).await;
// Convert to string representation and back to get rid of sub-second values
let done_if_after = SystemTime::now();
tokio::time::sleep(SAFETY_MARGIN).await;
done_if_after
};
let timestamp = strip_subsecond(timestamp);
let done_if_after = strip_subsecond(done_if_after);
let Some(prefix) = validate_prefix(&cmd.prefix) else {
println!("specified prefix '{}' failed validation", cmd.prefix);
return Ok(());
};
let toml_document = toml_edit::Document::from_str(&cmd.config_toml_str)?;
let toml_item = toml_document
.get("remote_storage")
.expect("need remote_storage");
let config = RemoteStorageConfig::from_toml(toml_item)?.expect("incomplete config");
let storage = remote_storage::GenericRemoteStorage::from_config(&config);
let cancel = CancellationToken::new();
storage
.unwrap()
.time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel)
.await?;
}
};
Ok(())
}
@@ -185,3 +263,89 @@ fn handle_metadata(
Ok(())
}
/// Ensures that the given S3 prefix is sufficiently constrained.
/// The command is very risky already and we don't want to expose something
/// that allows usually unintentional and quite catastrophic time travel of
/// an entire bucket, which would be a major catastrophy and away
/// by only one character change (similar to "rm -r /home /username/foobar").
fn validate_prefix(prefix: &str) -> Option<RemotePath> {
if prefix.is_empty() {
// Empty prefix means we want to specify the *whole* bucket
return None;
}
let components = prefix.split('/').collect::<Vec<_>>();
let (last, components) = {
let last = components.last()?;
if last.is_empty() {
(
components.iter().nth_back(1)?,
&components[..(components.len() - 1)],
)
} else {
(last, &components[..])
}
};
'valid: {
if let Ok(_timeline_id) = TimelineId::from_str(last) {
// Ends in either a tenant or timeline ID
break 'valid;
}
if *last == "timelines" {
if let Some(before_last) = components.iter().nth_back(1) {
if let Ok(_tenant_id) = TenantShardId::from_str(before_last) {
// Has a valid tenant id
break 'valid;
}
}
}
return None;
}
RemotePath::from_string(prefix).ok()
}
fn strip_subsecond(timestamp: SystemTime) -> SystemTime {
let ts_str = humantime::format_rfc3339_seconds(timestamp).to_string();
humantime::parse_rfc3339(&ts_str).expect("can't parse just created timestamp")
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_validate_prefix() {
assert_eq!(validate_prefix(""), None);
assert_eq!(validate_prefix("/"), None);
#[track_caller]
fn assert_valid(prefix: &str) {
let remote_path = RemotePath::from_string(prefix).unwrap();
assert_eq!(validate_prefix(prefix), Some(remote_path));
}
assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/");
// Path is not relative but absolute
assert_eq!(
validate_prefix(
"/wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/"
),
None
);
assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/");
// Partial tenant IDs should be invalid, S3 will match all tenants with the specific ID prefix
assert_eq!(validate_prefix("wal/3aa8fcc61f6d357410b7d"), None);
assert_eq!(validate_prefix("wal"), None);
assert_eq!(validate_prefix("/wal/"), None);
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001");
// Partial tenant ID
assert_eq!(
validate_prefix("pageserver/v1/tenants/3aa8fcc61f6d357410b"),
None
);
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines");
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001-0004/timelines");
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/");
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/641e5342083b2235ee3deb8066819683");
assert_eq!(validate_prefix("pageserver/v1/tenants/"), None);
}
}

View File

@@ -1,4 +1,5 @@
use anyhow::Context;
use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api::ForceAwaitLogicalSize;
use pageserver_client::page_service::BasebackupRequest;
@@ -95,7 +96,7 @@ async fn main_impl(
let timeline = *timeline;
let info = mgmt_api_client
.timeline_info(
timeline.tenant_id,
TenantShardId::unsharded(timeline.tenant_id),
timeline.timeline_id,
ForceAwaitLogicalSize::No,
)

View File

@@ -4,6 +4,7 @@ use pageserver_api::key::{is_rel_block_key, key_to_rel_block, Key};
use pageserver_api::keyspace::KeySpaceAccum;
use pageserver_api::models::PagestreamGetPageRequest;
use pageserver_api::shard::TenantShardId;
use tokio_util::sync::CancellationToken;
use utils::id::TenantTimelineId;
use utils::lsn::Lsn;
@@ -173,7 +174,10 @@ async fn main_impl(
let timeline = *timeline;
async move {
let partitioning = mgmt_api_client
.keyspace(timeline.tenant_id, timeline.timeline_id)
.keyspace(
TenantShardId::unsharded(timeline.tenant_id),
timeline.timeline_id,
)
.await?;
let lsn = partitioning.at_lsn;
let start = Instant::now();

View File

@@ -1,6 +1,7 @@
use std::sync::Arc;
use humantime::Duration;
use pageserver_api::shard::TenantShardId;
use tokio::task::JoinSet;
use utils::id::TenantTimelineId;
@@ -59,7 +60,11 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
let mgmt_api_client = Arc::clone(&mgmt_api_client);
js.spawn(async move {
let info = mgmt_api_client
.timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes)
.timeline_info(
TenantShardId::unsharded(tl.tenant_id),
tl.timeline_id,
ForceAwaitLogicalSize::Yes,
)
.await
.unwrap();
@@ -74,7 +79,11 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
while !info.current_logical_size_is_accurate {
ticker.tick().await;
info = mgmt_api_client
.timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes)
.timeline_info(
TenantShardId::unsharded(tl.tenant_id),
tl.timeline_id,
ForceAwaitLogicalSize::Yes,
)
.await
.unwrap();
}

View File

@@ -18,6 +18,7 @@ use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
use pageserver::tenant::{secondary, TenantSharedResources};
use remote_storage::GenericRemoteStorage;
use tokio::signal::unix::SignalKind;
use tokio::time::Instant;
use tracing::*;
@@ -671,42 +672,37 @@ fn start_pageserver(
let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
// All started up! Now just sit and wait for shutdown signal.
{
use signal_hook::consts::*;
let signal_handler = BACKGROUND_RUNTIME.spawn_blocking(move || {
let mut signals =
signal_hook::iterator::Signals::new([SIGINT, SIGTERM, SIGQUIT]).unwrap();
return signals
.forever()
.next()
.expect("forever() never returns None unless explicitly closed");
});
let signal = BACKGROUND_RUNTIME
.block_on(signal_handler)
.expect("join error");
match signal {
SIGQUIT => {
info!("Got signal {signal}. Terminating in immediate shutdown mode",);
std::process::exit(111);
}
SIGINT | SIGTERM => {
info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",);
// This cancels the `shutdown_pageserver` cancellation tree.
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
// The plan is to change that over time.
shutdown_pageserver.take();
let bg_remote_storage = remote_storage.clone();
let bg_deletion_queue = deletion_queue.clone();
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
&tenant_manager,
bg_remote_storage.map(|_| bg_deletion_queue),
0,
));
unreachable!()
}
_ => unreachable!(),
}
{
BACKGROUND_RUNTIME.block_on(async move {
let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap();
let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap();
let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap();
let signal = tokio::select! {
_ = sigquit.recv() => {
info!("Got signal SIGQUIT. Terminating in immediate shutdown mode",);
std::process::exit(111);
}
_ = sigint.recv() => { "SIGINT" },
_ = sigterm.recv() => { "SIGTERM" },
};
info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",);
// This cancels the `shutdown_pageserver` cancellation tree.
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
// The plan is to change that over time.
shutdown_pageserver.take();
let bg_remote_storage = remote_storage.clone();
let bg_deletion_queue = deletion_queue.clone();
pageserver::shutdown_pageserver(
&tenant_manager,
bg_remote_storage.map(|_| bg_deletion_queue),
0,
)
.await;
unreachable!()
})
}
}

View File

@@ -12,7 +12,7 @@ use pageserver_api::{
use serde::{de::DeserializeOwned, Serialize};
use tokio_util::sync::CancellationToken;
use url::Url;
use utils::{backoff, generation::Generation, id::NodeId};
use utils::{backoff, failpoint_support, generation::Generation, id::NodeId};
use crate::{
config::{NodeMetadata, PageServerConf},
@@ -210,7 +210,10 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
.collect(),
};
fail::fail_point!("control-plane-client-validate");
failpoint_support::sleep_millis_async!("control-plane-client-validate-sleep", &self.cancel);
if self.cancel.is_cancelled() {
return Err(RetryForeverError::ShuttingDown);
}
let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;

View File

@@ -1629,7 +1629,7 @@ components:
type: integer
format: int64
minimum: 0
description: The amount of disk space currently utilized by layer files.
description: The amount of disk space currently used.
free_space_bytes:
type: integer
format: int64

View File

@@ -8,6 +8,7 @@ use anyhow::{bail, ensure, Context, Result};
use bytes::Bytes;
use camino::Utf8Path;
use futures::StreamExt;
use pageserver_api::key::rel_block_to_key;
use tokio::io::{AsyncRead, AsyncReadExt};
use tokio_tar::Archive;
use tracing::*;
@@ -170,7 +171,10 @@ async fn import_rel(
let r = reader.read_exact(&mut buf).await;
match r {
Ok(_) => {
modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?;
let key = rel_block_to_key(rel, blknum);
if modification.tline.get_shard_identity().is_key_local(&key) {
modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?;
}
}
// TODO: UnexpectedEof is expected

View File

@@ -2100,6 +2100,7 @@ pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
use futures::Future;
use pin_project_lite::pin_project;
use std::collections::HashMap;
use std::num::NonZeroUsize;
use std::pin::Pin;
use std::sync::{Arc, Mutex};
use std::task::{Context, Poll};
@@ -2669,6 +2670,26 @@ pub(crate) mod disk_usage_based_eviction {
pub(crate) static METRICS: Lazy<Metrics> = Lazy::new(Metrics::default);
}
static TOKIO_EXECUTOR_THREAD_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_tokio_executor_thread_configured_count",
"Total number of configued tokio executor threads in the process.
The `setup` label denotes whether we're running with multiple runtimes or a single runtime.",
&["setup"],
)
.unwrap()
});
pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) {
static SERIALIZE: std::sync::Mutex<()> = std::sync::Mutex::new(());
let _guard = SERIALIZE.lock().unwrap();
TOKIO_EXECUTOR_THREAD_COUNT.reset();
TOKIO_EXECUTOR_THREAD_COUNT
.get_metric_with_label_values(&[setup])
.unwrap()
.set(u64::try_from(num_threads.get()).unwrap());
}
pub fn preinitialize_metrics() {
// Python tests need these and on some we do alerting.
//

View File

@@ -33,13 +33,14 @@
use std::collections::HashMap;
use std::fmt;
use std::future::Future;
use std::num::NonZeroUsize;
use std::panic::AssertUnwindSafe;
use std::str::FromStr;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::{Arc, Mutex};
use futures::FutureExt;
use pageserver_api::shard::TenantShardId;
use tokio::runtime::Runtime;
use tokio::task::JoinHandle;
use tokio::task_local;
use tokio_util::sync::CancellationToken;
@@ -48,8 +49,11 @@ use tracing::{debug, error, info, warn};
use once_cell::sync::Lazy;
use utils::env;
use utils::id::TimelineId;
use crate::metrics::set_tokio_runtime_setup;
//
// There are four runtimes:
//
@@ -98,52 +102,119 @@ use utils::id::TimelineId;
// other operations, if the upload tasks e.g. get blocked on locks. It shouldn't
// happen, but still.
//
pub static COMPUTE_REQUEST_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("compute request worker")
.enable_all()
.build()
.expect("Failed to create compute request runtime")
});
pub static MGMT_REQUEST_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("mgmt request worker")
.enable_all()
.build()
.expect("Failed to create mgmt request runtime")
});
pub static WALRECEIVER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("walreceiver worker")
.enable_all()
.build()
.expect("Failed to create walreceiver runtime")
});
pub static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("background op worker")
// if you change the number of worker threads please change the constant below
.enable_all()
.build()
.expect("Failed to create background op runtime")
});
pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy<usize> = Lazy::new(|| {
// force init and thus panics
let _ = BACKGROUND_RUNTIME.handle();
pub(crate) static TOKIO_WORKER_THREADS: Lazy<NonZeroUsize> = Lazy::new(|| {
// replicates tokio-1.28.1::loom::sys::num_cpus which is not available publicly
// tokio would had already panicked for parsing errors or NotUnicode
//
// this will be wrong if any of the runtimes gets their worker threads configured to something
// else, but that has not been needed in a long time.
std::env::var("TOKIO_WORKER_THREADS")
.map(|s| s.parse::<usize>().unwrap())
.unwrap_or_else(|_e| usize::max(2, num_cpus::get()))
NonZeroUsize::new(
std::env::var("TOKIO_WORKER_THREADS")
.map(|s| s.parse::<usize>().unwrap())
.unwrap_or_else(|_e| usize::max(2, num_cpus::get())),
)
.expect("the max() ensures that this is not zero")
});
enum TokioRuntimeMode {
SingleThreaded,
MultiThreaded { num_workers: NonZeroUsize },
}
impl FromStr for TokioRuntimeMode {
type Err = String;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"current_thread" => Ok(TokioRuntimeMode::SingleThreaded),
s => match s.strip_prefix("multi_thread:") {
Some("default") => Ok(TokioRuntimeMode::MultiThreaded {
num_workers: *TOKIO_WORKER_THREADS,
}),
Some(suffix) => {
let num_workers = suffix.parse::<NonZeroUsize>().map_err(|e| {
format!(
"invalid number of multi-threaded runtime workers ({suffix:?}): {e}",
)
})?;
Ok(TokioRuntimeMode::MultiThreaded { num_workers })
}
None => Err(format!("invalid runtime config: {s:?}")),
},
}
}
}
static ONE_RUNTIME: Lazy<Option<tokio::runtime::Runtime>> = Lazy::new(|| {
let thread_name = "pageserver-tokio";
let Some(mode) = env::var("NEON_PAGESERVER_USE_ONE_RUNTIME") else {
// If the env var is not set, leave this static as None.
set_tokio_runtime_setup(
"multiple-runtimes",
NUM_MULTIPLE_RUNTIMES
.checked_mul(*TOKIO_WORKER_THREADS)
.unwrap(),
);
return None;
};
Some(match mode {
TokioRuntimeMode::SingleThreaded => {
set_tokio_runtime_setup("one-runtime-single-threaded", NonZeroUsize::new(1).unwrap());
tokio::runtime::Builder::new_current_thread()
.thread_name(thread_name)
.enable_all()
.build()
.expect("failed to create one single runtime")
}
TokioRuntimeMode::MultiThreaded { num_workers } => {
set_tokio_runtime_setup("one-runtime-multi-threaded", num_workers);
tokio::runtime::Builder::new_multi_thread()
.thread_name(thread_name)
.enable_all()
.worker_threads(num_workers.get())
.build()
.expect("failed to create one multi-threaded runtime")
}
})
});
/// Declare a lazy static variable named `$varname` that will resolve
/// to a tokio runtime handle. If the env var `NEON_PAGESERVER_USE_ONE_RUNTIME`
/// is set, this will resolve to `ONE_RUNTIME`. Otherwise, the macro invocation
/// declares a separate runtime and the lazy static variable `$varname`
/// will resolve to that separate runtime.
///
/// The result is is that `$varname.spawn()` will use `ONE_RUNTIME` if
/// `NEON_PAGESERVER_USE_ONE_RUNTIME` is set, and will use the separate runtime
/// otherwise.
macro_rules! pageserver_runtime {
($varname:ident, $name:literal) => {
pub static $varname: Lazy<&'static tokio::runtime::Runtime> = Lazy::new(|| {
if let Some(runtime) = &*ONE_RUNTIME {
return runtime;
}
static RUNTIME: Lazy<tokio::runtime::Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name($name)
.worker_threads(TOKIO_WORKER_THREADS.get())
.enable_all()
.build()
.expect(std::concat!("Failed to create runtime ", $name))
});
&*RUNTIME
});
};
}
pageserver_runtime!(COMPUTE_REQUEST_RUNTIME, "compute request worker");
pageserver_runtime!(MGMT_REQUEST_RUNTIME, "mgmt request worker");
pageserver_runtime!(WALRECEIVER_RUNTIME, "walreceiver worker");
pageserver_runtime!(BACKGROUND_RUNTIME, "background op worker");
// Bump this number when adding a new pageserver_runtime!
// SAFETY: it's obviously correct
const NUM_MULTIPLE_RUNTIMES: NonZeroUsize = unsafe { NonZeroUsize::new_unchecked(4) };
#[derive(Debug, Clone, Copy)]
pub struct PageserverTaskId(u64);

View File

@@ -200,6 +200,7 @@ use utils::backoff::{
use std::collections::{HashMap, VecDeque};
use std::sync::atomic::{AtomicU32, Ordering};
use std::sync::{Arc, Mutex};
use std::time::Duration;
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath, TimeoutOrCancel};
use std::ops::DerefMut;
@@ -207,7 +208,7 @@ use tracing::{debug, error, info, instrument, warn};
use tracing::{info_span, Instrument};
use utils::lsn::Lsn;
use crate::deletion_queue::DeletionQueueClient;
use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError};
use crate::metrics::{
MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
@@ -261,6 +262,10 @@ pub(crate) const INITDB_PRESERVED_PATH: &str = "initdb-preserved.tar.zst";
/// Default buffer size when interfacing with [`tokio::fs::File`].
pub(crate) const BUFFER_SIZE: usize = 32 * 1024;
/// Doing non-essential flushes of deletion queue is subject to this timeout, after
/// which we warn and skip.
const DELETION_QUEUE_FLUSH_TIMEOUT: Duration = Duration::from_secs(10);
pub enum MaybeDeletedIndexPart {
IndexPart(IndexPart),
Deleted(IndexPart),
@@ -588,14 +593,14 @@ impl RemoteTimelineClient {
upload_queue: &mut UploadQueueInitialized,
metadata: TimelineMetadata,
) {
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
info!(
"scheduling metadata upload with {} files ({} changed)",
"scheduling metadata upload up to consistent LSN {disk_consistent_lsn} with {} files ({} changed)",
upload_queue.latest_files.len(),
upload_queue.latest_files_changes_since_metadata_upload_scheduled,
);
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
let index_part = IndexPart::new(
upload_queue.latest_files.clone(),
disk_consistent_lsn,
@@ -1050,6 +1055,26 @@ impl RemoteTimelineClient {
Ok(())
}
async fn flush_deletion_queue(&self) -> Result<(), DeletionQueueError> {
match tokio::time::timeout(
DELETION_QUEUE_FLUSH_TIMEOUT,
self.deletion_queue_client.flush_immediate(),
)
.await
{
Ok(result) => result,
Err(_timeout) => {
// Flushing remote deletions is not mandatory: we flush here to make the system easier to test, and
// to ensure that _usually_ objects are really gone after a DELETE is acked. However, in case of deletion
// queue issues (https://github.com/neondatabase/neon/issues/6440), we don't want to wait indefinitely here.
tracing::warn!(
"Timed out waiting for deletion queue flush, acking deletion anyway"
);
Ok(())
}
}
}
/// Prerequisites: UploadQueue should be in stopped state and deleted_at should be successfuly set.
/// The function deletes layer files one by one, then lists the prefix to see if we leaked something
/// deletes leaked files if any and proceeds with deletion of index file at the end.
@@ -1099,7 +1124,7 @@ impl RemoteTimelineClient {
// Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't
// taking the burden of listing all the layers that we already know we should delete.
self.deletion_queue_client.flush_immediate().await?;
self.flush_deletion_queue().await?;
let cancel = shutdown_token();
@@ -1173,7 +1198,7 @@ impl RemoteTimelineClient {
// Timeline deletion is rare and we have probably emitted a reasonably number of objects: wait
// for a flush to a persistent deletion list so that we may be sure deletion will occur.
self.deletion_queue_client.flush_immediate().await?;
self.flush_deletion_queue().await?;
fail::fail_point!("timeline-delete-after-index-delete", |_| {
Err(anyhow::anyhow!(

View File

@@ -51,7 +51,7 @@ use tokio_util::sync::CancellationToken;
use tracing::{info_span, instrument, warn, Instrument};
use utils::{
backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext,
id::TimelineId,
id::TimelineId, serde_system_time,
};
use super::{
@@ -591,7 +591,7 @@ impl<'a> TenantDownloader<'a> {
let mut progress = SecondaryProgress {
layers_total: heatmap_stats.layers,
bytes_total: heatmap_stats.bytes,
heatmap_mtime: Some(heatmap_mtime),
heatmap_mtime: Some(serde_system_time::SystemTime(heatmap_mtime)),
layers_downloaded: 0,
bytes_downloaded: 0,
};

View File

@@ -19,6 +19,7 @@ use pageserver_api::models::InMemoryLayerInfo;
use pageserver_api::shard::TenantShardId;
use std::collections::{BinaryHeap, HashMap, HashSet};
use std::sync::{Arc, OnceLock};
use std::time::Instant;
use tracing::*;
use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
// avoid binding to Write (conflicts with std::io::Write)
@@ -53,6 +54,8 @@ pub struct InMemoryLayer {
/// Writes are only allowed when this is `None`.
end_lsn: OnceLock<Lsn>,
opened_at: Instant,
/// The above fields never change, except for `end_lsn`, which is only set once.
/// All other changing parts are in `inner`, and protected by a mutex.
inner: RwLock<InMemoryLayerInner>,
@@ -460,6 +463,7 @@ impl InMemoryLayer {
tenant_shard_id,
start_lsn,
end_lsn: OnceLock::new(),
opened_at: Instant::now(),
inner: RwLock::new(InMemoryLayerInner {
index: HashMap::new(),
file,
@@ -520,6 +524,10 @@ impl InMemoryLayer {
Ok(())
}
pub(crate) fn get_opened_at(&self) -> Instant {
self.opened_at
}
pub(crate) async fn tick(&self) -> Option<u64> {
let mut inner = self.inner.write().await;
let size = inner.file.len();

View File

@@ -18,7 +18,7 @@ use utils::{backoff, completion};
static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
once_cell::sync::Lazy::new(|| {
let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS;
let total_threads = task_mgr::TOKIO_WORKER_THREADS.get();
let permits = usize::max(
1,
// while a lot of the work is done on spawn_blocking, we still do
@@ -72,6 +72,7 @@ pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation
);
// TODO: assert that we run on BACKGROUND_RUNTIME; requires tokio_unstable Handle::id();
match CONCURRENT_BACKGROUND_TASKS.acquire().await {
Ok(permit) => permit,
Err(_closed) => unreachable!("we never close the semaphore"),

View File

@@ -282,10 +282,12 @@ pub struct Timeline {
pub(super) flush_loop_state: Mutex<FlushLoopState>,
/// layer_flush_start_tx can be used to wake up the layer-flushing task.
/// The value is a counter, incremented every time a new flush cycle is requested.
/// The flush cycle counter is sent back on the layer_flush_done channel when
/// the flush finishes. You can use that to wait for the flush to finish.
layer_flush_start_tx: tokio::sync::watch::Sender<u64>,
/// - The u64 value is a counter, incremented every time a new flush cycle is requested.
/// The flush cycle counter is sent back on the layer_flush_done channel when
/// the flush finishes. You can use that to wait for the flush to finish.
/// - The LSN is updated to max() of its current value and the latest disk_consistent_lsn
/// read by whoever sends an update
layer_flush_start_tx: tokio::sync::watch::Sender<(u64, Lsn)>,
/// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel
layer_flush_done_tx: tokio::sync::watch::Sender<(u64, Result<(), FlushLayerError>)>,
@@ -1169,8 +1171,8 @@ impl Timeline {
/// Flush to disk all data that was written with the put_* functions
#[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> {
self.freeze_inmem_layer(false).await;
self.flush_frozen_layers_and_wait().await
let to_lsn = self.freeze_inmem_layer(false).await;
self.flush_frozen_layers_and_wait(to_lsn).await
}
/// If there is no writer, and conditions for rolling the latest layer are met, then freeze it.
@@ -1190,7 +1192,39 @@ impl Timeline {
};
let Some(open_layer) = &layers_guard.layer_map().open_layer else {
// No open layer, no work to do.
// If there is no open layer, we have no layer freezing to do. However, we might need to generate
// some updates to disk_consistent_lsn and remote_consistent_lsn, in case we ingested some WAL regions
// that didn't result in writes to this shard.
// Must not hold the layers lock while waiting for a flush.
drop(layers_guard);
let last_record_lsn = self.get_last_record_lsn();
let disk_consistent_lsn = self.get_disk_consistent_lsn();
if last_record_lsn > disk_consistent_lsn {
// We have no open layer, but disk_consistent_lsn is behind the last record: this indicates
// we are a sharded tenant and have skipped some WAL
let last_freeze_ts = *self.last_freeze_ts.read().unwrap();
if last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() {
// This should be somewhat rare, so we log it at INFO level.
//
// We checked for checkpoint timeout so that a shard without any
// data ingested (yet) doesn't write a remote index as soon as it
// sees its LSN advance: we only do this if we've been layer-less
// for some time.
tracing::info!(
"Advancing disk_consistent_lsn past WAL ingest gap {} -> {}",
disk_consistent_lsn,
last_record_lsn
);
// The flush loop will update remote consistent LSN as well as disk consistent LSN.
self.flush_frozen_layers_and_wait(last_record_lsn)
.await
.ok();
}
}
return;
};
@@ -1223,7 +1257,7 @@ impl Timeline {
checkpoint_distance,
self.get_last_record_lsn(),
self.last_freeze_at.load(),
*self.last_freeze_ts.read().unwrap(),
open_layer.get_opened_at(),
) {
match open_layer.info() {
InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => {
@@ -1588,7 +1622,7 @@ impl Timeline {
checkpoint_distance: u64,
projected_lsn: Lsn,
last_freeze_at: Lsn,
last_freeze_ts: Instant,
opened_at: Instant,
) -> bool {
let distance = projected_lsn.widening_sub(last_freeze_at);
@@ -1614,13 +1648,13 @@ impl Timeline {
);
true
} else if distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() {
} else if distance > 0 && opened_at.elapsed() >= self.get_checkpoint_timeout() {
info!(
"Will roll layer at {} with layer size {} due to time since last flush ({:?})",
projected_lsn,
layer_size,
last_freeze_ts.elapsed()
);
"Will roll layer at {} with layer size {} due to time since first write to the layer ({:?})",
projected_lsn,
layer_size,
opened_at.elapsed()
);
true
} else {
@@ -1769,7 +1803,7 @@ impl Timeline {
let disk_consistent_lsn = metadata.disk_consistent_lsn();
let (state, _) = watch::channel(state);
let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0);
let (layer_flush_start_tx, _) = tokio::sync::watch::channel((0, disk_consistent_lsn));
let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));
let evictions_low_residence_duration_metric_threshold = {
@@ -3174,7 +3208,9 @@ impl Timeline {
self.last_record_lsn.advance(new_lsn);
}
async fn freeze_inmem_layer(&self, write_lock_held: bool) {
/// Whether there was a layer to freeze or not, return the value of get_last_record_lsn
/// before we attempted the freeze: this guarantees that ingested data is frozen up to this lsn (inclusive).
async fn freeze_inmem_layer(&self, write_lock_held: bool) -> Lsn {
// Freeze the current open in-memory layer. It will be written to disk on next
// iteration.
@@ -3184,7 +3220,9 @@ impl Timeline {
Some(self.write_lock.lock().await)
};
self.freeze_inmem_layer_at(self.get_last_record_lsn()).await;
let to_lsn = self.get_last_record_lsn();
self.freeze_inmem_layer_at(to_lsn).await;
to_lsn
}
async fn freeze_inmem_layer_at(&self, at: Lsn) {
@@ -3197,7 +3235,7 @@ impl Timeline {
/// Layer flusher task's main loop.
async fn flush_loop(
self: &Arc<Self>,
mut layer_flush_start_rx: tokio::sync::watch::Receiver<u64>,
mut layer_flush_start_rx: tokio::sync::watch::Receiver<(u64, Lsn)>,
ctx: &RequestContext,
) {
info!("started flush loop");
@@ -3210,7 +3248,11 @@ impl Timeline {
_ = layer_flush_start_rx.changed() => {}
}
trace!("waking up");
let flush_counter = *layer_flush_start_rx.borrow();
let (flush_counter, frozen_to_lsn) = *layer_flush_start_rx.borrow();
// The highest LSN to which we flushed in the loop over frozen layers
let mut flushed_to_lsn = Lsn(0);
let result = loop {
if self.cancel.is_cancelled() {
info!("dropping out of flush loop for timeline shutdown");
@@ -3231,7 +3273,9 @@ impl Timeline {
break Ok(());
};
match self.flush_frozen_layer(layer_to_flush, ctx).await {
Ok(()) => {}
Ok(this_layer_to_lsn) => {
flushed_to_lsn = std::cmp::max(flushed_to_lsn, this_layer_to_lsn);
}
Err(FlushLayerError::Cancelled) => {
info!("dropping out of flush loop for timeline shutdown");
return;
@@ -3240,11 +3284,36 @@ impl Timeline {
FlushLayerError::Other(_) | FlushLayerError::CreateImageLayersError(_),
) => {
error!("could not flush frozen layer: {err:?}");
break err;
break err.map(|_| ());
}
}
timer.stop_and_record();
};
// Unsharded tenants should never advance their LSN beyond the end of the
// highest layer they write: such gaps between layer data and the frozen LSN
// are only legal on sharded tenants.
debug_assert!(
self.shard_identity.count.count() > 1
|| flushed_to_lsn >= frozen_to_lsn
|| !flushed_to_lsn.is_valid()
);
if flushed_to_lsn < frozen_to_lsn && self.shard_identity.count.count() > 1 {
// If our layer flushes didn't carry disk_consistent_lsn up to the `to_lsn` advertised
// to us via layer_flush_start_rx, then advance it here.
//
// This path is only taken for tenants with multiple shards: single sharded tenants should
// never encounter a gap in the wal.
let old_disk_consistent_lsn = self.disk_consistent_lsn.load();
tracing::debug!("Advancing disk_consistent_lsn across layer gap {old_disk_consistent_lsn}->{frozen_to_lsn}");
if self.set_disk_consistent_lsn(frozen_to_lsn) {
if let Err(e) = self.schedule_uploads(frozen_to_lsn, vec![]) {
tracing::warn!("Failed to schedule metadata upload after updating disk_consistent_lsn: {e}");
}
}
}
// Notify any listeners that we're done
let _ = self
.layer_flush_done_tx
@@ -3252,7 +3321,13 @@ impl Timeline {
}
}
async fn flush_frozen_layers_and_wait(&self) -> anyhow::Result<()> {
/// Request the flush loop to write out all frozen layers up to `to_lsn` as Delta L0 files to disk.
/// The caller is responsible for the freezing, e.g., [`Self::freeze_inmem_layer`].
///
/// `last_record_lsn` may be higher than the highest LSN of a frozen layer: if this is the case,
/// it means no data will be written between the top of the highest frozen layer and to_lsn,
/// e.g. because this tenant shard has ingested up to to_lsn and not written any data locally for that part of the WAL.
async fn flush_frozen_layers_and_wait(&self, last_record_lsn: Lsn) -> anyhow::Result<()> {
let mut rx = self.layer_flush_done_tx.subscribe();
// Increment the flush cycle counter and wake up the flush task.
@@ -3266,9 +3341,10 @@ impl Timeline {
anyhow::bail!("cannot flush frozen layers when flush_loop is not running, state is {flush_loop_state:?}")
}
self.layer_flush_start_tx.send_modify(|counter| {
self.layer_flush_start_tx.send_modify(|(counter, lsn)| {
my_flush_request = *counter + 1;
*counter = my_flush_request;
*lsn = std::cmp::max(last_record_lsn, *lsn);
});
loop {
@@ -3305,16 +3381,22 @@ impl Timeline {
}
fn flush_frozen_layers(&self) {
self.layer_flush_start_tx.send_modify(|val| *val += 1);
self.layer_flush_start_tx.send_modify(|(counter, lsn)| {
*counter += 1;
*lsn = std::cmp::max(*lsn, Lsn(self.last_freeze_at.load().0 - 1));
});
}
/// Flush one frozen in-memory layer to disk, as a new delta layer.
///
/// Return value is the last lsn (inclusive) of the layer that was frozen.
#[instrument(skip_all, fields(layer=%frozen_layer))]
async fn flush_frozen_layer(
self: &Arc<Self>,
frozen_layer: Arc<InMemoryLayer>,
ctx: &RequestContext,
) -> Result<(), FlushLayerError> {
) -> Result<Lsn, FlushLayerError> {
debug_assert_current_span_has_tenant_and_timeline_id();
// As a special case, when we have just imported an image into the repository,
@@ -3389,7 +3471,6 @@ impl Timeline {
}
let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1);
let old_disk_consistent_lsn = self.disk_consistent_lsn.load();
// The new on-disk layers are now in the layer map. We can remove the
// in-memory layer from the map now. The flushed layer is stored in
@@ -3403,10 +3484,7 @@ impl Timeline {
guard.finish_flush_l0_layer(delta_layer_to_add.as_ref(), &frozen_layer, &self.metrics);
if disk_consistent_lsn != old_disk_consistent_lsn {
assert!(disk_consistent_lsn > old_disk_consistent_lsn);
self.disk_consistent_lsn.store(disk_consistent_lsn);
if self.set_disk_consistent_lsn(disk_consistent_lsn) {
// Schedule remote uploads that will reflect our new disk_consistent_lsn
self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?;
}
@@ -3423,7 +3501,22 @@ impl Timeline {
// This failpoint is used by another test case `test_pageserver_recovery`.
fail_point!("flush-frozen-exit");
Ok(())
Ok(Lsn(lsn_range.end.0 - 1))
}
/// Return true if the value changed
///
/// This function must only be used from the layer flush task, and may not be called concurrently.
fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool {
// We do a simple load/store cycle: that's why this function isn't safe for concurrent use.
let old_value = self.disk_consistent_lsn.load();
if new_value != old_value {
assert!(new_value >= old_value);
self.disk_consistent_lsn.store(new_value);
true
} else {
false
}
}
/// Update metadata file
@@ -4610,23 +4703,16 @@ struct TimelineWriterState {
max_lsn: Option<Lsn>,
// Cached details of the last freeze. Avoids going trough the atomic/lock on every put.
cached_last_freeze_at: Lsn,
cached_last_freeze_ts: Instant,
}
impl TimelineWriterState {
fn new(
open_layer: Arc<InMemoryLayer>,
current_size: u64,
last_freeze_at: Lsn,
last_freeze_ts: Instant,
) -> Self {
fn new(open_layer: Arc<InMemoryLayer>, current_size: u64, last_freeze_at: Lsn) -> Self {
Self {
open_layer,
current_size,
prev_lsn: None,
max_lsn: None,
cached_last_freeze_at: last_freeze_at,
cached_last_freeze_ts: last_freeze_ts,
}
}
}
@@ -4725,12 +4811,10 @@ impl<'a> TimelineWriter<'a> {
let initial_size = layer.size().await?;
let last_freeze_at = self.last_freeze_at.load();
let last_freeze_ts = *self.last_freeze_ts.read().unwrap();
self.write_guard.replace(TimelineWriterState::new(
layer,
initial_size,
last_freeze_at,
last_freeze_ts,
));
Ok(())
@@ -4777,7 +4861,7 @@ impl<'a> TimelineWriter<'a> {
self.get_checkpoint_distance(),
lsn,
state.cached_last_freeze_at,
state.cached_last_freeze_ts,
state.open_layer.get_opened_at(),
) {
OpenLayerAction::Roll
} else {

View File

@@ -12,7 +12,6 @@ use super::layer_manager::LayerManager;
use super::{CompactFlags, DurationRecorder, RecordedDuration, Timeline};
use anyhow::{anyhow, Context};
use async_trait::async_trait;
use enumset::EnumSet;
use fail::fail_point;
use itertools::Itertools;
@@ -1122,7 +1121,6 @@ impl CompactionLayer<Key> for ResidentDeltaLayer {
}
}
#[async_trait]
impl CompactionDeltaLayer<TimelineAdaptor> for ResidentDeltaLayer {
type DeltaEntry<'a> = DeltaEntry<'a>;

View File

@@ -120,9 +120,10 @@ impl LayerManager {
/// Called from `freeze_inmem_layer`, returns true if successfully frozen.
pub(crate) async fn try_freeze_in_memory_layer(
&mut self,
Lsn(last_record_lsn): Lsn,
lsn: Lsn,
last_freeze_at: &AtomicLsn,
) {
let Lsn(last_record_lsn) = lsn;
let end_lsn = Lsn(last_record_lsn + 1);
if let Some(open_layer) = &self.layer_map.open_layer {
@@ -135,8 +136,11 @@ impl LayerManager {
self.layer_map.frozen_layers.push_back(open_layer_rc);
self.layer_map.open_layer = None;
self.layer_map.next_open_layer_at = Some(end_lsn);
last_freeze_at.store(end_lsn);
}
// Even if there was no layer to freeze, advance last_freeze_at to last_record_lsn+1: this
// accounts for regions in the LSN range where we might have ingested no data due to sharding.
last_freeze_at.store(end_lsn);
}
/// Add image layers to the layer map, called from `create_image_layers`.

View File

@@ -15,11 +15,23 @@ pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result<PageserverUtiliz
.map_err(std::io::Error::from)
.context("statvfs tenants directory")?;
let blocksz = statvfs.block_size();
// https://unix.stackexchange.com/a/703650
let blocksz = if statvfs.fragment_size() > 0 {
statvfs.fragment_size()
} else {
statvfs.block_size()
};
#[cfg_attr(not(target_os = "macos"), allow(clippy::unnecessary_cast))]
let free = statvfs.blocks_available() as u64 * blocksz;
let used = crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.get();
#[cfg_attr(not(target_os = "macos"), allow(clippy::unnecessary_cast))]
let used = statvfs
.blocks()
// use blocks_free instead of available here to match df in case someone compares
.saturating_sub(statvfs.blocks_free()) as u64
* blocksz;
let captured_at = std::time::SystemTime::now();
let doc = PageserverUtilization {
@@ -29,7 +41,7 @@ pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result<PageserverUtiliz
//
// note that u64::MAX will be output as i64::MAX as u64, but that should not matter
utilization_score: u64::MAX,
captured_at,
captured_at: utils::serde_system_time::SystemTime(captured_at),
};
// TODO: make utilization_score into a metric

View File

@@ -36,11 +36,12 @@ use bytes::{Bytes, BytesMut};
use pageserver_api::key::key_to_rel_block;
use pageserver_api::models::WalRedoManagerStatus;
use pageserver_api::shard::TenantShardId;
use std::sync::{Arc, RwLock};
use std::sync::Arc;
use std::time::Duration;
use std::time::Instant;
use tracing::*;
use utils::lsn::Lsn;
use utils::sync::heavier_once_cell;
///
/// This is the real implementation that uses a Postgres process to
@@ -53,7 +54,19 @@ pub struct PostgresRedoManager {
tenant_shard_id: TenantShardId,
conf: &'static PageServerConf,
last_redo_at: std::sync::Mutex<Option<Instant>>,
redo_process: RwLock<Option<Arc<process::WalRedoProcess>>>,
/// The current [`process::WalRedoProcess`] that is used by new redo requests.
/// We use [`heavier_once_cell`] for coalescing the spawning, but the redo
/// requests don't use the [`heavier_once_cell::Guard`] to keep ahold of the
/// their process object; we use [`Arc::clone`] for that.
/// This is primarily because earlier implementations that didn't use [`heavier_once_cell`]
/// had that behavior; it's probably unnecessary.
/// The only merit of it is that if one walredo process encounters an error,
/// it can take it out of rotation (= using [`heavier_once_cell::Guard::take_and_deinit`].
/// and retry redo, thereby starting the new process, while other redo tasks might
/// still be using the old redo process. But, those other tasks will most likely
/// encounter an error as well, and errors are an unexpected condition anyway.
/// So, probably we could get rid of the `Arc` in the future.
redo_process: heavier_once_cell::OnceCell<Arc<process::WalRedoProcess>>,
}
///
@@ -101,6 +114,7 @@ impl PostgresRedoManager {
self.conf.wal_redo_timeout,
pg_version,
)
.await
};
img = Some(result?);
@@ -121,6 +135,7 @@ impl PostgresRedoManager {
self.conf.wal_redo_timeout,
pg_version,
)
.await
}
}
@@ -134,7 +149,7 @@ impl PostgresRedoManager {
chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?)
})
},
pid: self.redo_process.read().unwrap().as_ref().map(|p| p.id()),
pid: self.redo_process.get().map(|p| p.id()),
})
}
}
@@ -152,7 +167,7 @@ impl PostgresRedoManager {
tenant_shard_id,
conf,
last_redo_at: std::sync::Mutex::default(),
redo_process: RwLock::new(None),
redo_process: heavier_once_cell::OnceCell::default(),
}
}
@@ -164,8 +179,7 @@ impl PostgresRedoManager {
if let Some(last_redo_at) = *g {
if last_redo_at.elapsed() >= idle_timeout {
drop(g);
let mut guard = self.redo_process.write().unwrap();
*guard = None;
drop(self.redo_process.get().map(|guard| guard.take_and_deinit()));
}
}
}
@@ -174,8 +188,11 @@ impl PostgresRedoManager {
///
/// Process one request for WAL redo using wal-redo postgres
///
/// # Cancel-Safety
///
/// Cancellation safe.
#[allow(clippy::too_many_arguments)]
fn apply_batch_postgres(
async fn apply_batch_postgres(
&self,
key: Key,
lsn: Lsn,
@@ -191,42 +208,31 @@ impl PostgresRedoManager {
const MAX_RETRY_ATTEMPTS: u32 = 1;
let mut n_attempts = 0u32;
loop {
// launch the WAL redo process on first use
let proc: Arc<process::WalRedoProcess> = {
let proc_guard = self.redo_process.read().unwrap();
match &*proc_guard {
None => {
// "upgrade" to write lock to launch the process
drop(proc_guard);
let mut proc_guard = self.redo_process.write().unwrap();
match &*proc_guard {
None => {
let start = Instant::now();
let proc = Arc::new(
process::WalRedoProcess::launch(
self.conf,
self.tenant_shard_id,
pg_version,
)
.context("launch walredo process")?,
);
let duration = start.elapsed();
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM
.observe(duration.as_secs_f64());
info!(
duration_ms = duration.as_millis(),
pid = proc.id(),
"launched walredo process"
);
*proc_guard = Some(Arc::clone(&proc));
proc
}
Some(proc) => Arc::clone(proc),
}
let proc: Arc<process::WalRedoProcess> =
match self.redo_process.get_or_init_detached().await {
Ok(guard) => Arc::clone(&guard),
Err(permit) => {
// don't hold poison_guard, the launch code can bail
let start = Instant::now();
let proc = Arc::new(
process::WalRedoProcess::launch(
self.conf,
self.tenant_shard_id,
pg_version,
)
.context("launch walredo process")?,
);
let duration = start.elapsed();
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
info!(
duration_ms = duration.as_millis(),
pid = proc.id(),
"launched walredo process"
);
self.redo_process.set(Arc::clone(&proc), permit);
proc
}
Some(proc) => Arc::clone(proc),
}
};
};
let started_at = std::time::Instant::now();
@@ -272,34 +278,34 @@ impl PostgresRedoManager {
n_attempts,
e,
);
// Avoid concurrent callers hitting the same issue.
// We can't prevent it from happening because we want to enable parallelism.
{
let mut guard = self.redo_process.write().unwrap();
match &*guard {
Some(current_field_value) => {
if Arc::ptr_eq(current_field_value, &proc) {
// We're the first to observe an error from `proc`, it's our job to take it out of rotation.
*guard = None;
}
}
None => {
// Another thread was faster to observe the error, and already took the process out of rotation.
}
}
}
// Avoid concurrent callers hitting the same issue by taking `proc` out of the rotation.
// Note that there may be other tasks concurrent with us that also hold `proc`.
// We have to deal with that here.
// Also read the doc comment on field `self.redo_process`.
//
// NB: there may still be other concurrent threads using `proc`.
// The last one will send SIGKILL when the underlying Arc reaches refcount 0.
// NB: it's important to drop(proc) after drop(guard). Otherwise we'd keep
// holding the lock while waiting for the process to exit.
// NB: the drop impl blocks the current threads with a wait() system call for
// the child process. We dropped the `guard` above so that other threads aren't
// affected. But, it's good that the current thread _does_ block to wait.
// If we instead deferred the waiting into the background / to tokio, it could
// happen that if walredo always fails immediately, we spawn processes faster
//
// NB: the drop impl blocks the dropping thread with a wait() system call for
// the child process. In some ways the blocking is actually good: if we
// deferred the waiting into the background / to tokio if we used `tokio::process`,
// it could happen that if walredo always fails immediately, we spawn processes faster
// than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
// we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
// This probably needs revisiting at some later point.
match self.redo_process.get() {
None => (),
Some(guard) => {
if Arc::ptr_eq(&proc, &*guard) {
// We're the first to observe an error from `proc`, it's our job to take it out of rotation.
guard.take_and_deinit();
} else {
// Another task already spawned another redo process (further up in this method)
// and put it into `redo_process`. Do nothing, our view of the world is behind.
}
}
}
// The last task that does this `drop()` of `proc` will do a blocking `wait()` syscall.
drop(proc);
} else if n_attempts != 0 {
info!(n_attempts, "retried walredo succeeded");

View File

@@ -12,6 +12,7 @@ testing = []
anyhow.workspace = true
async-compression.workspace = true
async-trait.workspace = true
atomic-take.workspace = true
aws-config.workspace = true
aws-sdk-iam.workspace = true
aws-sigv4.workspace = true
@@ -36,6 +37,9 @@ http.workspace = true
humantime.workspace = true
hyper-tungstenite.workspace = true
hyper.workspace = true
hyper1 = { package = "hyper", version = "1.2", features = ["server"] }
hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] }
http-body-util = { version = "0.1" }
ipnet.workspace = true
itertools.workspace = true
lasso = { workspace = true, features = ["multi-threaded"] }

View File

@@ -102,8 +102,7 @@ pub(super) async fn authenticate(
ctx.set_user(db_info.user.into());
ctx.set_project(db_info.aux.clone());
let cold_start_info = db_info.aux.cold_start_info.clone().unwrap_or_default();
info!(?cold_start_info, "woken up a compute node");
info!("woken up a compute node");
// Backwards compatibility. pg_sni_proxy uses "--" in domain names
// while direct connections do not. Once we migrate to pg_sni_proxy

View File

@@ -10,6 +10,7 @@ use itertools::Itertools;
use proxy::config::TlsServerEndPoint;
use proxy::context::RequestMonitoring;
use proxy::proxy::run_until_cancelled;
use proxy::{BranchId, EndpointId, ProjectId};
use rustls::pki_types::PrivateKeyDer;
use tokio::net::TcpListener;
@@ -269,7 +270,12 @@ async fn handle_client(
let client = tokio::net::TcpStream::connect(destination).await?;
let metrics_aux: MetricsAuxInfo = Default::default();
let metrics_aux: MetricsAuxInfo = MetricsAuxInfo {
endpoint_id: (&EndpointId::from("")).into(),
project_id: (&ProjectId::from("")).into(),
branch_id: (&BranchId::from("")).into(),
cold_start_info: proxy::console::messages::ColdStartInfo::Unknown,
};
// doesn't yet matter as pg-sni-router doesn't report analytics logs
ctx.set_success();

View File

@@ -16,7 +16,7 @@ use crate::{
config::ProjectInfoCacheOptions,
console::AuthSecret,
intern::{EndpointIdInt, ProjectIdInt, RoleNameInt},
EndpointId, ProjectId, RoleName,
EndpointId, RoleName,
};
use super::{Cache, Cached};
@@ -214,14 +214,11 @@ impl ProjectInfoCacheImpl {
}
pub fn insert_role_secret(
&self,
project_id: &ProjectId,
endpoint_id: &EndpointId,
role_name: &RoleName,
project_id: ProjectIdInt,
endpoint_id: EndpointIdInt,
role_name: RoleNameInt,
secret: Option<AuthSecret>,
) {
let project_id = ProjectIdInt::from(project_id);
let endpoint_id = EndpointIdInt::from(endpoint_id);
let role_name = RoleNameInt::from(role_name);
if self.cache.len() >= self.config.size {
// If there are too many entries, wait until the next gc cycle.
return;
@@ -234,12 +231,10 @@ impl ProjectInfoCacheImpl {
}
pub fn insert_allowed_ips(
&self,
project_id: &ProjectId,
endpoint_id: &EndpointId,
project_id: ProjectIdInt,
endpoint_id: EndpointIdInt,
allowed_ips: Arc<Vec<IpPattern>>,
) {
let project_id = ProjectIdInt::from(project_id);
let endpoint_id = EndpointIdInt::from(endpoint_id);
if self.cache.len() >= self.config.size {
// If there are too many entries, wait until the next gc cycle.
return;
@@ -358,7 +353,7 @@ impl Cache for ProjectInfoCacheImpl {
#[cfg(test)]
mod tests {
use super::*;
use crate::scram::ServerSecret;
use crate::{scram::ServerSecret, ProjectId};
#[tokio::test]
async fn test_project_info_cache_settings() {
@@ -369,8 +364,8 @@ mod tests {
ttl: Duration::from_secs(1),
gc_interval: Duration::from_secs(600),
});
let project_id = "project".into();
let endpoint_id = "endpoint".into();
let project_id: ProjectId = "project".into();
let endpoint_id: EndpointId = "endpoint".into();
let user1: RoleName = "user1".into();
let user2: RoleName = "user2".into();
let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));
@@ -379,9 +374,23 @@ mod tests {
"127.0.0.1".parse().unwrap(),
"127.0.0.2".parse().unwrap(),
]);
cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone());
cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone());
cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone());
cache.insert_role_secret(
(&project_id).into(),
(&endpoint_id).into(),
(&user1).into(),
secret1.clone(),
);
cache.insert_role_secret(
(&project_id).into(),
(&endpoint_id).into(),
(&user2).into(),
secret2.clone(),
);
cache.insert_allowed_ips(
(&project_id).into(),
(&endpoint_id).into(),
allowed_ips.clone(),
);
let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap();
assert!(cached.cached());
@@ -393,7 +402,12 @@ mod tests {
// Shouldn't add more than 2 roles.
let user3: RoleName = "user3".into();
let secret3 = Some(AuthSecret::Scram(ServerSecret::mock([3; 32])));
cache.insert_role_secret(&project_id, &endpoint_id, &user3, secret3.clone());
cache.insert_role_secret(
(&project_id).into(),
(&endpoint_id).into(),
(&user3).into(),
secret3.clone(),
);
assert!(cache.get_role_secret(&endpoint_id, &user3).is_none());
let cached = cache.get_allowed_ips(&endpoint_id).unwrap();
@@ -421,8 +435,8 @@ mod tests {
cache.clone().disable_ttl();
tokio::time::advance(Duration::from_secs(2)).await;
let project_id = "project".into();
let endpoint_id = "endpoint".into();
let project_id: ProjectId = "project".into();
let endpoint_id: EndpointId = "endpoint".into();
let user1: RoleName = "user1".into();
let user2: RoleName = "user2".into();
let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));
@@ -431,9 +445,23 @@ mod tests {
"127.0.0.1".parse().unwrap(),
"127.0.0.2".parse().unwrap(),
]);
cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone());
cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone());
cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone());
cache.insert_role_secret(
(&project_id).into(),
(&endpoint_id).into(),
(&user1).into(),
secret1.clone(),
);
cache.insert_role_secret(
(&project_id).into(),
(&endpoint_id).into(),
(&user2).into(),
secret2.clone(),
);
cache.insert_allowed_ips(
(&project_id).into(),
(&endpoint_id).into(),
allowed_ips.clone(),
);
tokio::time::advance(Duration::from_secs(2)).await;
// Nothing should be invalidated.
@@ -470,8 +498,8 @@ mod tests {
gc_interval: Duration::from_secs(600),
}));
let project_id = "project".into();
let endpoint_id = "endpoint".into();
let project_id: ProjectId = "project".into();
let endpoint_id: EndpointId = "endpoint".into();
let user1: RoleName = "user1".into();
let user2: RoleName = "user2".into();
let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));
@@ -480,10 +508,20 @@ mod tests {
"127.0.0.1".parse().unwrap(),
"127.0.0.2".parse().unwrap(),
]);
cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone());
cache.insert_role_secret(
(&project_id).into(),
(&endpoint_id).into(),
(&user1).into(),
secret1.clone(),
);
cache.clone().disable_ttl();
tokio::time::advance(Duration::from_millis(100)).await;
cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone());
cache.insert_role_secret(
(&project_id).into(),
(&endpoint_id).into(),
(&user2).into(),
secret2.clone(),
);
// Added before ttl was disabled + ttl should be still cached.
let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap();
@@ -497,7 +535,11 @@ mod tests {
assert!(cache.get_role_secret(&endpoint_id, &user2).is_none());
// Added after ttl was disabled + ttl should not be cached.
cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone());
cache.insert_allowed_ips(
(&project_id).into(),
(&endpoint_id).into(),
allowed_ips.clone(),
);
let cached = cache.get_allowed_ips(&endpoint_id).unwrap();
assert!(!cached.cached());

View File

@@ -276,6 +276,7 @@ impl ConnCfg {
let stream = connection.stream.into_inner();
info!(
cold_start_info = ctx.cold_start_info.as_str(),
"connected to compute node at {host} ({socket_addr}) sslmode={:?}",
self.0.get_ssl_mode()
);

View File

@@ -3,7 +3,7 @@ use std::fmt;
use crate::auth::IpPattern;
use crate::{BranchId, EndpointId, ProjectId};
use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt};
/// Generic error response with human-readable description.
/// Note that we can't always present it to user as is.
@@ -18,7 +18,7 @@ pub struct ConsoleError {
pub struct GetRoleSecret {
pub role_secret: Box<str>,
pub allowed_ips: Option<Vec<IpPattern>>,
pub project_id: Option<ProjectId>,
pub project_id: Option<ProjectIdInt>,
}
// Manually implement debug to omit sensitive info.
@@ -93,22 +93,47 @@ impl fmt::Debug for DatabaseInfo {
/// Various labels for prometheus metrics.
/// Also known as `ProxyMetricsAuxInfo` in the console.
#[derive(Debug, Deserialize, Clone, Default)]
#[derive(Debug, Deserialize, Clone)]
pub struct MetricsAuxInfo {
pub endpoint_id: EndpointId,
pub project_id: ProjectId,
pub branch_id: BranchId,
pub cold_start_info: Option<ColdStartInfo>,
pub endpoint_id: EndpointIdInt,
pub project_id: ProjectIdInt,
pub branch_id: BranchIdInt,
#[serde(default)]
pub cold_start_info: ColdStartInfo,
}
#[derive(Debug, Default, Serialize, Deserialize, Clone)]
#[derive(Debug, Default, Serialize, Deserialize, Clone, Copy)]
#[serde(rename_all = "snake_case")]
pub enum ColdStartInfo {
#[default]
Unknown = 0,
Warm = 1,
PoolHit = 2,
PoolMiss = 3,
Unknown,
/// Compute was already running
Warm,
#[serde(rename = "pool_hit")]
/// Compute was not running but there was an available VM
VmPoolHit,
#[serde(rename = "pool_miss")]
/// Compute was not running and there were no VMs available
VmPoolMiss,
// not provided by control plane
/// Connection available from HTTP pool
HttpPoolHit,
/// Cached connection info
WarmCached,
}
impl ColdStartInfo {
pub fn as_str(&self) -> &'static str {
match self {
ColdStartInfo::Unknown => "unknown",
ColdStartInfo::Warm => "warm",
ColdStartInfo::VmPoolHit => "pool_hit",
ColdStartInfo::VmPoolMiss => "pool_miss",
ColdStartInfo::HttpPoolHit => "http_pool_hit",
ColdStartInfo::WarmCached => "warm_cached",
}
}
}
#[cfg(test)]

View File

@@ -12,7 +12,8 @@ use crate::{
compute,
config::{CacheOptions, ProjectInfoCacheOptions},
context::RequestMonitoring,
scram, EndpointCacheKey, ProjectId,
intern::ProjectIdInt,
scram, EndpointCacheKey,
};
use dashmap::DashMap;
use std::{sync::Arc, time::Duration};
@@ -271,7 +272,7 @@ pub struct AuthInfo {
/// List of IP addresses allowed for the autorization.
pub allowed_ips: Vec<IpPattern>,
/// Project ID. This is used for cache invalidation.
pub project_id: Option<ProjectId>,
pub project_id: Option<ProjectIdInt>,
}
/// Info for establishing a connection to a compute node.

View File

@@ -4,10 +4,16 @@ use super::{
errors::{ApiError, GetAuthInfoError, WakeComputeError},
AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo,
};
use crate::console::provider::{CachedAllowedIps, CachedRoleSecret};
use crate::context::RequestMonitoring;
use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl};
use crate::{auth::IpPattern, cache::Cached};
use crate::{
console::{
messages::MetricsAuxInfo,
provider::{CachedAllowedIps, CachedRoleSecret},
},
BranchId, EndpointId, ProjectId,
};
use futures::TryFutureExt;
use std::{str::FromStr, sync::Arc};
use thiserror::Error;
@@ -114,7 +120,12 @@ impl Api {
let node = NodeInfo {
config,
aux: Default::default(),
aux: MetricsAuxInfo {
endpoint_id: (&EndpointId::from("endpoint")).into(),
project_id: (&ProjectId::from("project")).into(),
branch_id: (&BranchId::from("branch")).into(),
cold_start_info: crate::console::messages::ColdStartInfo::Warm,
},
allow_self_signed_compute: false,
};

View File

@@ -181,15 +181,16 @@ impl super::Api for Api {
}
let auth_info = self.do_get_auth_info(ctx, user_info).await?;
if let Some(project_id) = auth_info.project_id {
let ep_int = ep.into();
self.caches.project_info.insert_role_secret(
&project_id,
ep,
user,
project_id,
ep_int,
user.into(),
auth_info.secret.clone(),
);
self.caches.project_info.insert_allowed_ips(
&project_id,
ep,
project_id,
ep_int,
Arc::new(auth_info.allowed_ips),
);
ctx.set_project_id(project_id);
@@ -217,15 +218,16 @@ impl super::Api for Api {
let allowed_ips = Arc::new(auth_info.allowed_ips);
let user = &user_info.user;
if let Some(project_id) = auth_info.project_id {
let ep_int = ep.into();
self.caches.project_info.insert_role_secret(
&project_id,
ep,
user,
project_id,
ep_int,
user.into(),
auth_info.secret.clone(),
);
self.caches
.project_info
.insert_allowed_ips(&project_id, ep, allowed_ips.clone());
.insert_allowed_ips(project_id, ep_int, allowed_ips.clone());
ctx.set_project_id(project_id);
}
Ok((
@@ -248,8 +250,7 @@ impl super::Api for Api {
// which means that we might cache it to reduce the load and latency.
if let Some(cached) = self.caches.node_info.get(&key) {
info!(key = &*key, "found cached compute node info");
info!("cold_start_info=warm");
ctx.set_cold_start_info(ColdStartInfo::Warm);
ctx.set_project(cached.aux.clone());
return Ok(cached);
}
@@ -260,17 +261,21 @@ impl super::Api for Api {
if permit.should_check_cache() {
if let Some(cached) = self.caches.node_info.get(&key) {
info!(key = &*key, "found cached compute node info");
info!("cold_start_info=warm");
ctx.set_cold_start_info(ColdStartInfo::Warm);
ctx.set_project(cached.aux.clone());
return Ok(cached);
}
}
let node = self.do_wake_compute(ctx, user_info).await?;
let mut node = self.do_wake_compute(ctx, user_info).await?;
ctx.set_project(node.aux.clone());
let cold_start_info = node.aux.cold_start_info.clone().unwrap_or_default();
info!(?cold_start_info, "woken up a compute node");
let (_, cached) = self.caches.node_info.insert(key.clone(), node);
let cold_start_info = node.aux.cold_start_info;
info!("woken up a compute node");
// store the cached node as 'warm'
node.aux.cold_start_info = ColdStartInfo::WarmCached;
let (_, mut cached) = self.caches.node_info.insert(key.clone(), node);
cached.aux.cold_start_info = cold_start_info;
info!(key = &*key, "created a cache entry for compute node info");
Ok(cached)

View File

@@ -11,8 +11,9 @@ use uuid::Uuid;
use crate::{
console::messages::{ColdStartInfo, MetricsAuxInfo},
error::ErrorKind,
intern::{BranchIdInt, ProjectIdInt},
metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND},
BranchId, DbName, EndpointId, ProjectId, RoleName,
DbName, EndpointId, RoleName,
};
use self::parquet::RequestData;
@@ -34,8 +35,8 @@ pub struct RequestMonitoring {
pub span: Span,
// filled in as they are discovered
project: Option<ProjectId>,
branch: Option<BranchId>,
project: Option<ProjectIdInt>,
branch: Option<BranchIdInt>,
endpoint_id: Option<EndpointId>,
dbname: Option<DbName>,
user: Option<RoleName>,
@@ -43,7 +44,7 @@ pub struct RequestMonitoring {
error_kind: Option<ErrorKind>,
pub(crate) auth_method: Option<AuthMethod>,
success: bool,
cold_start_info: Option<ColdStartInfo>,
pub(crate) cold_start_info: ColdStartInfo,
// extra
// This sender is here to keep the request monitoring channel open while requests are taking place.
@@ -92,7 +93,7 @@ impl RequestMonitoring {
error_kind: None,
auth_method: None,
success: false,
cold_start_info: None,
cold_start_info: ColdStartInfo::Unknown,
sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()),
latency_timer: LatencyTimer::new(protocol),
@@ -113,26 +114,31 @@ impl RequestMonitoring {
}
pub fn set_cold_start_info(&mut self, info: ColdStartInfo) {
self.cold_start_info = Some(info);
self.cold_start_info = info;
self.latency_timer.cold_start_info(info);
}
pub fn set_project(&mut self, x: MetricsAuxInfo) {
self.set_endpoint_id(x.endpoint_id);
if self.endpoint_id.is_none() {
self.set_endpoint_id(x.endpoint_id.as_str().into())
}
self.branch = Some(x.branch_id);
self.project = Some(x.project_id);
self.cold_start_info = x.cold_start_info;
self.set_cold_start_info(x.cold_start_info);
}
pub fn set_project_id(&mut self, project_id: ProjectId) {
pub fn set_project_id(&mut self, project_id: ProjectIdInt) {
self.project = Some(project_id);
}
pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) {
self.span.record("ep", display(&endpoint_id));
crate::metrics::CONNECTING_ENDPOINTS
.with_label_values(&[self.protocol])
.measure(&endpoint_id);
self.endpoint_id = Some(endpoint_id);
if self.endpoint_id.is_none() {
self.span.record("ep", display(&endpoint_id));
crate::metrics::CONNECTING_ENDPOINTS
.with_label_values(&[self.protocol])
.measure(&endpoint_id);
self.endpoint_id = Some(endpoint_id);
}
}
pub fn set_application(&mut self, app: Option<SmolStr>) {

View File

@@ -87,7 +87,7 @@ pub struct RequestData {
/// Or if we make it to proxy_pass
success: bool,
/// Indicates if the cplane started the new compute node for this request.
cold_start_info: Option<&'static str>,
cold_start_info: &'static str,
/// Tracks time from session start (HTTP request/libpq TCP handshake)
/// Through to success/failure
duration_us: u64,
@@ -115,12 +115,7 @@ impl From<&RequestMonitoring> for RequestData {
region: value.region,
error: value.error_kind.as_ref().map(|e| e.to_metric_label()),
success: value.success,
cold_start_info: value.cold_start_info.as_ref().map(|x| match x {
crate::console::messages::ColdStartInfo::Unknown => "unknown",
crate::console::messages::ColdStartInfo::Warm => "warm",
crate::console::messages::ColdStartInfo::PoolHit => "pool_hit",
crate::console::messages::ColdStartInfo::PoolMiss => "pool_miss",
}),
cold_start_info: value.cold_start_info.as_str(),
duration_us: SystemTime::from(value.first_packet)
.elapsed()
.unwrap_or_default()
@@ -454,7 +449,7 @@ mod tests {
region: "us-east-1",
error: None,
success: rng.gen(),
cold_start_info: Some("no"),
cold_start_info: "no",
duration_us: rng.gen_range(0..30_000_000),
}
}
@@ -524,15 +519,15 @@ mod tests {
assert_eq!(
file_stats,
[
(1314406, 3, 6000),
(1314399, 3, 6000),
(1314459, 3, 6000),
(1314416, 3, 6000),
(1314546, 3, 6000),
(1314388, 3, 6000),
(1314180, 3, 6000),
(1314416, 3, 6000),
(438359, 1, 2000)
(1314385, 3, 6000),
(1314378, 3, 6000),
(1314438, 3, 6000),
(1314395, 3, 6000),
(1314525, 3, 6000),
(1314367, 3, 6000),
(1314159, 3, 6000),
(1314395, 3, 6000),
(438352, 1, 2000)
]
);
@@ -562,11 +557,11 @@ mod tests {
assert_eq!(
file_stats,
[
(1220668, 5, 10000),
(1226818, 5, 10000),
(1228612, 5, 10000),
(1227974, 5, 10000),
(1219252, 5, 10000)
(1220633, 5, 10000),
(1226783, 5, 10000),
(1228577, 5, 10000),
(1227939, 5, 10000),
(1219217, 5, 10000)
]
);
@@ -598,11 +593,11 @@ mod tests {
assert_eq!(
file_stats,
[
(1206315, 5, 10000),
(1206046, 5, 10000),
(1206339, 5, 10000),
(1206327, 5, 10000),
(1206582, 5, 10000)
(1206280, 5, 10000),
(1206011, 5, 10000),
(1206304, 5, 10000),
(1206292, 5, 10000),
(1206547, 5, 10000)
]
);
@@ -627,15 +622,15 @@ mod tests {
assert_eq!(
file_stats,
[
(1314406, 3, 6000),
(1314399, 3, 6000),
(1314459, 3, 6000),
(1314416, 3, 6000),
(1314546, 3, 6000),
(1314388, 3, 6000),
(1314180, 3, 6000),
(1314416, 3, 6000),
(438359, 1, 2000)
(1314385, 3, 6000),
(1314378, 3, 6000),
(1314438, 3, 6000),
(1314395, 3, 6000),
(1314525, 3, 6000),
(1314367, 3, 6000),
(1314159, 3, 6000),
(1314395, 3, 6000),
(438352, 1, 2000)
]
);
@@ -672,7 +667,7 @@ mod tests {
// files are smaller than the size threshold, but they took too long to fill so were flushed early
assert_eq!(
file_stats,
[(658837, 2, 3001), (658551, 2, 3000), (658347, 2, 2999)]
[(658823, 2, 3001), (658537, 2, 3000), (658333, 2, 2999)]
);
tmpdir.close().unwrap();

View File

@@ -12,6 +12,8 @@ use metrics::{
use once_cell::sync::Lazy;
use tokio::time::{self, Instant};
use crate::console::messages::ColdStartInfo;
pub static NUM_DB_CONNECTIONS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
register_int_counter_pair_vec!(
"proxy_opened_db_connections_total",
@@ -50,8 +52,8 @@ pub static COMPUTE_CONNECTION_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
"proxy_compute_connection_latency_seconds",
"Time it took for proxy to establish a connection to the compute endpoint",
// http/ws/tcp, true/false, true/false, success/failure, client/client_and_cplane
// 3 * 2 * 2 * 2 * 2 = 48 counters
&["protocol", "cache_miss", "pool_miss", "outcome", "excluded"],
// 3 * 6 * 2 * 2 = 72 counters
&["protocol", "cold_start_info", "outcome", "excluded"],
// largest bucket = 2^16 * 0.5ms = 32s
exponential_buckets(0.0005, 2.0, 16).unwrap(),
)
@@ -183,6 +185,20 @@ struct Accumulated {
compute: time::Duration,
}
enum Outcome {
Success,
Failed,
}
impl Outcome {
fn as_str(&self) -> &'static str {
match self {
Outcome::Success => "success",
Outcome::Failed => "failed",
}
}
}
pub struct LatencyTimer {
// time since the stopwatch was started
start: time::Instant,
@@ -192,9 +208,8 @@ pub struct LatencyTimer {
accumulated: Accumulated,
// label data
protocol: &'static str,
cache_miss: bool,
pool_miss: bool,
outcome: &'static str,
cold_start_info: ColdStartInfo,
outcome: Outcome,
}
pub struct LatencyTimerPause<'a> {
@@ -210,11 +225,9 @@ impl LatencyTimer {
stop: None,
accumulated: Accumulated::default(),
protocol,
cache_miss: false,
// by default we don't do pooling
pool_miss: true,
cold_start_info: ColdStartInfo::Unknown,
// assume failed unless otherwise specified
outcome: "failed",
outcome: Outcome::Failed,
}
}
@@ -226,12 +239,8 @@ impl LatencyTimer {
}
}
pub fn cache_miss(&mut self) {
self.cache_miss = true;
}
pub fn pool_hit(&mut self) {
self.pool_miss = false;
pub fn cold_start_info(&mut self, cold_start_info: ColdStartInfo) {
self.cold_start_info = cold_start_info;
}
pub fn success(&mut self) {
@@ -239,7 +248,7 @@ impl LatencyTimer {
self.stop = Some(time::Instant::now());
// success
self.outcome = "success";
self.outcome = Outcome::Success;
}
}
@@ -264,9 +273,8 @@ impl Drop for LatencyTimer {
COMPUTE_CONNECTION_LATENCY
.with_label_values(&[
self.protocol,
bool_to_str(self.cache_miss),
bool_to_str(self.pool_miss),
self.outcome,
self.cold_start_info.as_str(),
self.outcome.as_str(),
"client",
])
.observe((duration.saturating_sub(self.accumulated.client)).as_secs_f64());
@@ -275,9 +283,8 @@ impl Drop for LatencyTimer {
COMPUTE_CONNECTION_LATENCY
.with_label_values(&[
self.protocol,
bool_to_str(self.cache_miss),
bool_to_str(self.pool_miss),
self.outcome,
self.cold_start_info.as_str(),
self.outcome.as_str(),
"client_and_cplane",
])
.observe((duration.saturating_sub(accumulated_total)).as_secs_f64());

View File

@@ -5,19 +5,13 @@ use std::{
io,
net::SocketAddr,
pin::{pin, Pin},
sync::Mutex,
task::{ready, Context, Poll},
};
use bytes::{Buf, BytesMut};
use hyper::server::accept::Accept;
use hyper::server::conn::{AddrIncoming, AddrStream};
use metrics::IntCounterPairGuard;
use hyper::server::conn::AddrIncoming;
use pin_project_lite::pin_project;
use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf};
use uuid::Uuid;
use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE;
pub struct ProxyProtocolAccept {
pub incoming: AddrIncoming,
@@ -331,103 +325,6 @@ impl<T: AsyncRead> AsyncRead for WithClientIp<T> {
}
}
impl Accept for ProxyProtocolAccept {
type Conn = WithConnectionGuard<WithClientIp<AddrStream>>;
type Error = io::Error;
fn poll_accept(
mut self: Pin<&mut Self>,
cx: &mut Context<'_>,
) -> Poll<Option<Result<Self::Conn, Self::Error>>> {
let conn = ready!(Pin::new(&mut self.incoming).poll_accept(cx)?);
let conn_id = uuid::Uuid::new_v4();
let span = tracing::info_span!("http_conn", ?conn_id);
{
let _enter = span.enter();
tracing::info!("accepted new TCP connection");
}
let Some(conn) = conn else {
return Poll::Ready(None);
};
Poll::Ready(Some(Ok(WithConnectionGuard {
inner: WithClientIp::new(conn),
connection_id: Uuid::new_v4(),
gauge: Mutex::new(Some(
NUM_CLIENT_CONNECTION_GAUGE
.with_label_values(&[self.protocol])
.guard(),
)),
span,
})))
}
}
pin_project! {
pub struct WithConnectionGuard<T> {
#[pin]
pub inner: T,
pub connection_id: Uuid,
pub gauge: Mutex<Option<IntCounterPairGuard>>,
pub span: tracing::Span,
}
impl<S> PinnedDrop for WithConnectionGuard<S> {
fn drop(this: Pin<&mut Self>) {
let _enter = this.span.enter();
tracing::info!("HTTP connection closed")
}
}
}
impl<T: AsyncWrite> AsyncWrite for WithConnectionGuard<T> {
#[inline]
fn poll_write(
self: Pin<&mut Self>,
cx: &mut Context<'_>,
buf: &[u8],
) -> Poll<Result<usize, io::Error>> {
self.project().inner.poll_write(cx, buf)
}
#[inline]
fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Result<(), io::Error>> {
self.project().inner.poll_flush(cx)
}
#[inline]
fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Result<(), io::Error>> {
self.project().inner.poll_shutdown(cx)
}
#[inline]
fn poll_write_vectored(
self: Pin<&mut Self>,
cx: &mut Context<'_>,
bufs: &[io::IoSlice<'_>],
) -> Poll<Result<usize, io::Error>> {
self.project().inner.poll_write_vectored(cx, bufs)
}
#[inline]
fn is_write_vectored(&self) -> bool {
self.inner.is_write_vectored()
}
}
impl<T: AsyncRead> AsyncRead for WithConnectionGuard<T> {
fn poll_read(
self: Pin<&mut Self>,
cx: &mut Context<'_>,
buf: &mut ReadBuf<'_>,
) -> Poll<io::Result<()>> {
self.project().inner.poll_read(cx, buf)
}
}
#[cfg(test)]
mod tests {
use std::pin::pin;

View File

@@ -87,7 +87,6 @@ impl ConnectMechanism for TcpMechanism<'_> {
}
/// Try to connect to the compute node, retrying if necessary.
/// This function might update `node_info`, so we take it by `&mut`.
#[tracing::instrument(skip_all)]
pub async fn connect_to_compute<M: ConnectMechanism, B: ComputeConnectBackend>(
ctx: &mut RequestMonitoring,
@@ -132,7 +131,6 @@ where
} else {
// if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
info!("compute node's state has likely changed; requesting a wake-up");
ctx.latency_timer.cache_miss();
let old_node_info = invalidate_cache(node_info);
let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?;
node_info.reuse_settings(old_node_info);

View File

@@ -19,8 +19,8 @@ pub async fn proxy_pass(
aux: MetricsAuxInfo,
) -> anyhow::Result<()> {
let usage = USAGE_METRICS.register(Ids {
endpoint_id: aux.endpoint_id.clone(),
branch_id: aux.branch_id.clone(),
endpoint_id: aux.endpoint_id,
branch_id: aux.branch_id,
});
let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]);

View File

@@ -12,11 +12,12 @@ use crate::auth::backend::{
};
use crate::config::CertResolver;
use crate::console::caches::NodeInfoCache;
use crate::console::messages::MetricsAuxInfo;
use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend};
use crate::console::{self, CachedNodeInfo, NodeInfo};
use crate::error::ErrorKind;
use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT};
use crate::{http, sasl, scram};
use crate::{http, sasl, scram, BranchId, EndpointId, ProjectId};
use anyhow::{bail, Context};
use async_trait::async_trait;
use rstest::rstest;
@@ -512,7 +513,12 @@ impl TestBackend for TestConnectMechanism {
fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo {
let node = NodeInfo {
config: compute::ConnCfg::new(),
aux: Default::default(),
aux: MetricsAuxInfo {
endpoint_id: (&EndpointId::from("endpoint")).into(),
project_id: (&ProjectId::from("project")).into(),
branch_id: (&BranchId::from("branch")).into(),
cold_start_info: crate::console::messages::ColdStartInfo::Warm,
},
allow_self_signed_compute: false,
};
let (_, node) = cache.insert("key".into(), node);

View File

@@ -4,42 +4,48 @@
mod backend;
mod conn_pool;
mod http_util;
mod json;
mod sql_over_http;
pub mod tls_listener;
mod websocket;
use atomic_take::AtomicTake;
use bytes::Bytes;
pub use conn_pool::GlobalConnPoolOptions;
use anyhow::bail;
use hyper::StatusCode;
use metrics::IntCounterPairGuard;
use anyhow::Context;
use futures::future::{select, Either};
use futures::TryFutureExt;
use http::{Method, Response, StatusCode};
use http_body_util::Full;
use hyper1::body::Incoming;
use hyper_util::rt::TokioExecutor;
use hyper_util::server::conn::auto::Builder;
use rand::rngs::StdRng;
use rand::SeedableRng;
pub use reqwest_middleware::{ClientWithMiddleware, Error};
pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
use tokio::time::timeout;
use tokio_rustls::TlsAcceptor;
use tokio_util::task::TaskTracker;
use tracing::instrument::Instrumented;
use crate::cancellation::CancellationHandlerMain;
use crate::config::ProxyConfig;
use crate::context::RequestMonitoring;
use crate::protocol2::{ProxyProtocolAccept, WithClientIp, WithConnectionGuard};
use crate::metrics::{NUM_CLIENT_CONNECTION_GAUGE, TLS_HANDSHAKE_FAILURES};
use crate::protocol2::WithClientIp;
use crate::proxy::run_until_cancelled;
use crate::rate_limiter::EndpointRateLimiter;
use crate::serverless::backend::PoolingBackend;
use hyper::{
server::conn::{AddrIncoming, AddrStream},
Body, Method, Request, Response,
};
use crate::serverless::http_util::{api_error_into_response, json_response};
use std::net::IpAddr;
use std::net::{IpAddr, SocketAddr};
use std::pin::pin;
use std::sync::Arc;
use std::task::Poll;
use tls_listener::TlsListener;
use tokio::net::TcpListener;
use tokio_util::sync::{CancellationToken, DropGuard};
use tokio::net::{TcpListener, TcpStream};
use tokio_util::sync::CancellationToken;
use tracing::{error, info, warn, Instrument};
use utils::http::{error::ApiError, json::json_response};
use utils::http::error::ApiError;
pub const SERVERLESS_DRIVER_SNI: &str = "api";
@@ -91,161 +97,174 @@ pub async fn task_main(
tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()];
let tls_acceptor: tokio_rustls::TlsAcceptor = Arc::new(tls_server_config).into();
let mut addr_incoming = AddrIncoming::from_listener(ws_listener)?;
let _ = addr_incoming.set_nodelay(true);
let addr_incoming = ProxyProtocolAccept {
incoming: addr_incoming,
protocol: "http",
};
let connections = tokio_util::task::task_tracker::TaskTracker::new();
connections.close(); // allows `connections.wait to complete`
let ws_connections = tokio_util::task::task_tracker::TaskTracker::new();
ws_connections.close(); // allows `ws_connections.wait to complete`
let server = Builder::new(hyper_util::rt::TokioExecutor::new());
let tls_listener = TlsListener::new(tls_acceptor, addr_incoming, config.handshake_timeout);
while let Some(res) = run_until_cancelled(ws_listener.accept(), &cancellation_token).await {
let (conn, peer_addr) = res.context("could not accept TCP stream")?;
if let Err(e) = conn.set_nodelay(true) {
tracing::error!("could not set nodelay: {e}");
continue;
}
let conn_id = uuid::Uuid::new_v4();
let http_conn_span = tracing::info_span!("http_conn", ?conn_id);
let make_svc = hyper::service::make_service_fn(
|stream: &tokio_rustls::server::TlsStream<
WithConnectionGuard<WithClientIp<AddrStream>>,
>| {
let (conn, _) = stream.get_ref();
connections.spawn(
connection_handler(
config,
backend.clone(),
connections.clone(),
cancellation_handler.clone(),
endpoint_rate_limiter.clone(),
cancellation_token.clone(),
server.clone(),
tls_acceptor.clone(),
conn,
peer_addr,
)
.instrument(http_conn_span),
);
}
// this is jank. should dissapear with hyper 1.0 migration.
let gauge = conn
.gauge
.lock()
.expect("lock should not be poisoned")
.take()
.expect("gauge should be set on connection start");
// Cancel all current inflight HTTP requests if the HTTP connection is closed.
let http_cancellation_token = CancellationToken::new();
let cancel_connection = http_cancellation_token.clone().drop_guard();
let span = conn.span.clone();
let client_addr = conn.inner.client_addr();
let remote_addr = conn.inner.inner.remote_addr();
let backend = backend.clone();
let ws_connections = ws_connections.clone();
let endpoint_rate_limiter = endpoint_rate_limiter.clone();
let cancellation_handler = cancellation_handler.clone();
async move {
let peer_addr = match client_addr {
Some(addr) => addr,
None if config.require_client_ip => bail!("missing required client ip"),
None => remote_addr,
};
Ok(MetricService::new(
hyper::service::service_fn(move |req: Request<Body>| {
let backend = backend.clone();
let ws_connections2 = ws_connections.clone();
let endpoint_rate_limiter = endpoint_rate_limiter.clone();
let cancellation_handler = cancellation_handler.clone();
let http_cancellation_token = http_cancellation_token.child_token();
// `request_handler` is not cancel safe. It expects to be cancelled only at specific times.
// By spawning the future, we ensure it never gets cancelled until it decides to.
ws_connections.spawn(
async move {
// Cancel the current inflight HTTP request if the requets stream is closed.
// This is slightly different to `_cancel_connection` in that
// h2 can cancel individual requests with a `RST_STREAM`.
let _cancel_session = http_cancellation_token.clone().drop_guard();
let res = request_handler(
req,
config,
backend,
ws_connections2,
cancellation_handler,
peer_addr.ip(),
endpoint_rate_limiter,
http_cancellation_token,
)
.await
.map_or_else(|e| e.into_response(), |r| r);
_cancel_session.disarm();
res
}
.in_current_span(),
)
}),
gauge,
cancel_connection,
span,
))
}
},
);
hyper::Server::builder(tls_listener)
.serve(make_svc)
.with_graceful_shutdown(cancellation_token.cancelled())
.await?;
// await websocket connections
ws_connections.wait().await;
connections.wait().await;
Ok(())
}
struct MetricService<S> {
inner: S,
_gauge: IntCounterPairGuard,
_cancel: DropGuard,
span: tracing::Span,
}
/// Handles the TCP lifecycle.
///
/// 1. Parses PROXY protocol V2
/// 2. Handles TLS handshake
/// 3. Handles HTTP connection
/// 1. With graceful shutdowns
/// 2. With graceful request cancellation with connection failure
/// 3. With websocket upgrade support.
#[allow(clippy::too_many_arguments)]
async fn connection_handler(
config: &'static ProxyConfig,
backend: Arc<PoolingBackend>,
connections: TaskTracker,
cancellation_handler: Arc<CancellationHandlerMain>,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
cancellation_token: CancellationToken,
server: Builder<TokioExecutor>,
tls_acceptor: TlsAcceptor,
conn: TcpStream,
peer_addr: SocketAddr,
) {
let session_id = uuid::Uuid::new_v4();
impl<S> MetricService<S> {
fn new(
inner: S,
_gauge: IntCounterPairGuard,
_cancel: DropGuard,
span: tracing::Span,
) -> MetricService<S> {
MetricService {
inner,
_gauge,
_cancel,
span,
let _gauge = NUM_CLIENT_CONNECTION_GAUGE
.with_label_values(&["http"])
.guard();
// handle PROXY protocol
let mut conn = WithClientIp::new(conn);
let peer = match conn.wait_for_addr().await {
Ok(peer) => peer,
Err(e) => {
tracing::error!(?session_id, %peer_addr, "failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}");
return;
}
}
}
};
impl<S, ReqBody> hyper::service::Service<Request<ReqBody>> for MetricService<S>
where
S: hyper::service::Service<Request<ReqBody>>,
{
type Response = S::Response;
type Error = S::Error;
type Future = Instrumented<S::Future>;
let peer_addr = peer.unwrap_or(peer_addr).ip();
info!(?session_id, %peer_addr, "accepted new TCP connection");
fn poll_ready(&mut self, cx: &mut std::task::Context<'_>) -> Poll<Result<(), Self::Error>> {
self.inner.poll_ready(cx)
}
// try upgrade to TLS, but with a timeout.
let conn = match timeout(config.handshake_timeout, tls_acceptor.accept(conn)).await {
Ok(Ok(conn)) => {
info!(?session_id, %peer_addr, "accepted new TLS connection");
conn
}
// The handshake failed
Ok(Err(e)) => {
TLS_HANDSHAKE_FAILURES.inc();
warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}");
return;
}
// The handshake timed out
Err(e) => {
TLS_HANDSHAKE_FAILURES.inc();
warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}");
return;
}
};
fn call(&mut self, req: Request<ReqBody>) -> Self::Future {
self.span
.in_scope(|| self.inner.call(req))
.instrument(self.span.clone())
let session_id = AtomicTake::new(session_id);
// Cancel all current inflight HTTP requests if the HTTP connection is closed.
let http_cancellation_token = CancellationToken::new();
let _cancel_connection = http_cancellation_token.clone().drop_guard();
let conn = server.serve_connection_with_upgrades(
hyper_util::rt::TokioIo::new(conn),
hyper1::service::service_fn(move |req: hyper1::Request<Incoming>| {
// First HTTP request shares the same session ID
let session_id = session_id.take().unwrap_or_else(uuid::Uuid::new_v4);
// Cancel the current inflight HTTP request if the requets stream is closed.
// This is slightly different to `_cancel_connection` in that
// h2 can cancel individual requests with a `RST_STREAM`.
let http_request_token = http_cancellation_token.child_token();
let cancel_request = http_request_token.clone().drop_guard();
// `request_handler` is not cancel safe. It expects to be cancelled only at specific times.
// By spawning the future, we ensure it never gets cancelled until it decides to.
let handler = connections.spawn(
request_handler(
req,
config,
backend.clone(),
connections.clone(),
cancellation_handler.clone(),
session_id,
peer_addr,
endpoint_rate_limiter.clone(),
http_request_token,
)
.in_current_span()
.map_ok_or_else(api_error_into_response, |r| r),
);
async move {
let res = handler.await;
cancel_request.disarm();
res
}
}),
);
// On cancellation, trigger the HTTP connection handler to shut down.
let res = match select(pin!(cancellation_token.cancelled()), pin!(conn)).await {
Either::Left((_cancelled, mut conn)) => {
conn.as_mut().graceful_shutdown();
conn.await
}
Either::Right((res, _)) => res,
};
match res {
Ok(()) => tracing::info!(%peer_addr, "HTTP connection closed"),
Err(e) => tracing::warn!(%peer_addr, "HTTP connection error {e}"),
}
}
#[allow(clippy::too_many_arguments)]
async fn request_handler(
mut request: Request<Body>,
mut request: hyper1::Request<Incoming>,
config: &'static ProxyConfig,
backend: Arc<PoolingBackend>,
ws_connections: TaskTracker,
cancellation_handler: Arc<CancellationHandlerMain>,
session_id: uuid::Uuid,
peer_addr: IpAddr,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
// used to cancel in-flight HTTP requests. not used to cancel websockets
http_cancellation_token: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let session_id = uuid::Uuid::new_v4();
) -> Result<Response<Full<Bytes>>, ApiError> {
let host = request
.headers()
.get("host")
@@ -282,14 +301,14 @@ async fn request_handler(
// Return the response so the spawned future can continue.
Ok(response)
} else if request.uri().path() == "/sql" && request.method() == Method::POST {
} else if request.uri().path() == "/sql" && *request.method() == Method::POST {
let ctx = RequestMonitoring::new(session_id, peer_addr, "http", &config.region);
let span = ctx.span.clone();
sql_over_http::handle(config, ctx, request, backend, http_cancellation_token)
.instrument(span)
.await
} else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {
} else if request.uri().path() == "/sql" && *request.method() == Method::OPTIONS {
Response::builder()
.header("Allow", "OPTIONS, POST")
.header("Access-Control-Allow-Origin", "*")
@@ -299,7 +318,7 @@ async fn request_handler(
)
.header("Access-Control-Max-Age", "86400" /* 24 hours */)
.status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code
.body(Body::empty())
.body(Full::new(Bytes::new()))
.map_err(|e| ApiError::InternalServerError(e.into()))
} else {
json_response(StatusCode::BAD_REQUEST, "query is not supported")

View File

@@ -9,7 +9,6 @@ use crate::{
config::ProxyConfig,
console::{
errors::{GetAuthInfoError, WakeComputeError},
messages::ColdStartInfo,
CachedNodeInfo,
},
context::RequestMonitoring,
@@ -57,7 +56,10 @@ impl PoolingBackend {
let auth_outcome =
crate::auth::validate_password_and_exchange(&conn_info.password, secret).await?;
let res = match auth_outcome {
crate::sasl::Outcome::Success(key) => Ok(key),
crate::sasl::Outcome::Success(key) => {
info!("user successfully authenticated");
Ok(key)
}
crate::sasl::Outcome::Failure(reason) => {
info!("auth backend failed with an error: {reason}");
Err(AuthError::auth_failed(&*conn_info.user_info.user))
@@ -89,8 +91,6 @@ impl PoolingBackend {
};
if let Some(client) = maybe_client {
info!("cold_start_info=warm");
ctx.set_cold_start_info(ColdStartInfo::Warm);
return Ok(client);
}
let conn_id = uuid::Uuid::new_v4();

View File

@@ -17,7 +17,7 @@ use tokio::time::Instant;
use tokio_postgres::tls::NoTlsStream;
use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket};
use crate::console::messages::MetricsAuxInfo;
use crate::console::messages::{ColdStartInfo, MetricsAuxInfo};
use crate::metrics::{ENDPOINT_POOLS, GC_LATENCY, NUM_OPEN_CLIENTS_IN_HTTP_POOL};
use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS};
use crate::{
@@ -383,9 +383,12 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
"pid",
&tracing::field::display(client.inner.get_process_id()),
);
info!("pool: reusing connection '{conn_info}'");
info!(
cold_start_info = ColdStartInfo::HttpPoolHit.as_str(),
"pool: reusing connection '{conn_info}'"
);
client.session.send(ctx.session_id)?;
ctx.latency_timer.pool_hit();
ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit);
ctx.latency_timer.success();
return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool)));
}
@@ -454,8 +457,9 @@ pub fn poll_client<C: ClientInnerExt>(
let (tx, mut rx) = tokio::sync::watch::channel(session_id);
let span = info_span!(parent: None, "connection", %conn_id);
let cold_start_info = ctx.cold_start_info;
span.in_scope(|| {
info!(%conn_info, %session_id, "new connection");
info!(cold_start_info = cold_start_info.as_str(), %conn_info, %session_id, "new connection");
});
let pool = match conn_info.endpoint_cache_key() {
Some(endpoint) => Arc::downgrade(&global_pool.get_or_create_endpoint_pool(&endpoint)),
@@ -565,8 +569,8 @@ impl<C: ClientInnerExt> Client<C> {
pub fn metrics(&self) -> Arc<MetricCounter> {
let aux = &self.inner.as_ref().unwrap().aux;
USAGE_METRICS.register(Ids {
endpoint_id: aux.endpoint_id.clone(),
branch_id: aux.branch_id.clone(),
endpoint_id: aux.endpoint_id,
branch_id: aux.branch_id,
})
}
}
@@ -666,6 +670,8 @@ impl<C: ClientInnerExt> Drop for Client<C> {
mod tests {
use std::{mem, sync::atomic::AtomicBool};
use crate::{BranchId, EndpointId, ProjectId};
use super::*;
struct MockClient(Arc<AtomicBool>);
@@ -691,7 +697,12 @@ mod tests {
ClientInner {
inner: client,
session: tokio::sync::watch::Sender::new(uuid::Uuid::new_v4()),
aux: Default::default(),
aux: MetricsAuxInfo {
endpoint_id: (&EndpointId::from("endpoint")).into(),
project_id: (&ProjectId::from("project")).into(),
branch_id: (&BranchId::from("branch")).into(),
cold_start_info: crate::console::messages::ColdStartInfo::Warm,
},
conn_id: uuid::Uuid::new_v4(),
}
}

View File

@@ -0,0 +1,92 @@
//! Things stolen from `libs/utils/src/http` to add hyper 1.0 compatibility
//! Will merge back in at some point in the future.
use bytes::Bytes;
use anyhow::Context;
use http::{Response, StatusCode};
use http_body_util::Full;
use serde::Serialize;
use utils::http::error::ApiError;
/// Like [`ApiError::into_response`]
pub fn api_error_into_response(this: ApiError) -> Response<Full<Bytes>> {
match this {
ApiError::BadRequest(err) => HttpErrorBody::response_from_msg_and_status(
format!("{err:#?}"), // use debug printing so that we give the cause
StatusCode::BAD_REQUEST,
),
ApiError::Forbidden(_) => {
HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::FORBIDDEN)
}
ApiError::Unauthorized(_) => {
HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::UNAUTHORIZED)
}
ApiError::NotFound(_) => {
HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::NOT_FOUND)
}
ApiError::Conflict(_) => {
HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::CONFLICT)
}
ApiError::PreconditionFailed(_) => HttpErrorBody::response_from_msg_and_status(
this.to_string(),
StatusCode::PRECONDITION_FAILED,
),
ApiError::ShuttingDown => HttpErrorBody::response_from_msg_and_status(
"Shutting down".to_string(),
StatusCode::SERVICE_UNAVAILABLE,
),
ApiError::ResourceUnavailable(err) => HttpErrorBody::response_from_msg_and_status(
err.to_string(),
StatusCode::SERVICE_UNAVAILABLE,
),
ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status(
err.to_string(),
StatusCode::REQUEST_TIMEOUT,
),
ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
err.to_string(),
StatusCode::INTERNAL_SERVER_ERROR,
),
}
}
/// Same as [`utils::http::error::HttpErrorBody`]
#[derive(Serialize)]
struct HttpErrorBody {
pub msg: String,
}
impl HttpErrorBody {
/// Same as [`utils::http::error::HttpErrorBody::response_from_msg_and_status`]
fn response_from_msg_and_status(msg: String, status: StatusCode) -> Response<Full<Bytes>> {
HttpErrorBody { msg }.to_response(status)
}
/// Same as [`utils::http::error::HttpErrorBody::to_response`]
fn to_response(&self, status: StatusCode) -> Response<Full<Bytes>> {
Response::builder()
.status(status)
.header(http::header::CONTENT_TYPE, "application/json")
// we do not have nested maps with non string keys so serialization shouldn't fail
.body(Full::new(Bytes::from(serde_json::to_string(self).unwrap())))
.unwrap()
}
}
/// Same as [`utils::http::json::json_response`]
pub fn json_response<T: Serialize>(
status: StatusCode,
data: T,
) -> Result<Response<Full<Bytes>>, ApiError> {
let json = serde_json::to_string(&data)
.context("Failed to serialize JSON response")
.map_err(ApiError::InternalServerError)?;
let response = Response::builder()
.status(status)
.header(http::header::CONTENT_TYPE, "application/json")
.body(Full::new(Bytes::from(json)))
.map_err(|e| ApiError::InternalServerError(e.into()))?;
Ok(response)
}

View File

@@ -1,18 +1,22 @@
use std::pin::pin;
use std::sync::Arc;
use bytes::Bytes;
use futures::future::select;
use futures::future::try_join;
use futures::future::Either;
use futures::StreamExt;
use futures::TryFutureExt;
use hyper::body::HttpBody;
use hyper::header;
use hyper::http::HeaderName;
use hyper::http::HeaderValue;
use hyper::Response;
use hyper::StatusCode;
use hyper::{Body, HeaderMap, Request};
use http_body_util::BodyExt;
use http_body_util::Full;
use hyper1::body::Body;
use hyper1::body::Incoming;
use hyper1::header;
use hyper1::http::HeaderName;
use hyper1::http::HeaderValue;
use hyper1::Response;
use hyper1::StatusCode;
use hyper1::{HeaderMap, Request};
use serde_json::json;
use serde_json::Value;
use tokio::time;
@@ -29,7 +33,6 @@ use tracing::error;
use tracing::info;
use url::Url;
use utils::http::error::ApiError;
use utils::http::json::json_response;
use crate::auth::backend::ComputeUserInfo;
use crate::auth::endpoint_sni;
@@ -52,6 +55,7 @@ use crate::RoleName;
use super::backend::PoolingBackend;
use super::conn_pool::Client;
use super::conn_pool::ConnInfo;
use super::http_util::json_response;
use super::json::json_to_pg_text;
use super::json::pg_text_row_to_json;
use super::json::JsonConversionError;
@@ -218,10 +222,10 @@ fn get_conn_info(
pub async fn handle(
config: &'static ProxyConfig,
mut ctx: RequestMonitoring,
request: Request<Body>,
request: Request<Incoming>,
backend: Arc<PoolingBackend>,
cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
) -> Result<Response<Full<Bytes>>, ApiError> {
let result = handle_inner(cancel, config, &mut ctx, request, backend).await;
let mut response = match result {
@@ -332,10 +336,9 @@ pub async fn handle(
}
};
response.headers_mut().insert(
"Access-Control-Allow-Origin",
hyper::http::HeaderValue::from_static("*"),
);
response
.headers_mut()
.insert("Access-Control-Allow-Origin", HeaderValue::from_static("*"));
Ok(response)
}
@@ -396,7 +399,7 @@ impl UserFacingError for SqlOverHttpError {
#[derive(Debug, thiserror::Error)]
pub enum ReadPayloadError {
#[error("could not read the HTTP request body: {0}")]
Read(#[from] hyper::Error),
Read(#[from] hyper1::Error),
#[error("could not parse the HTTP request body: {0}")]
Parse(#[from] serde_json::Error),
}
@@ -437,7 +440,7 @@ struct HttpHeaders {
}
impl HttpHeaders {
fn try_parse(headers: &hyper::http::HeaderMap) -> Result<Self, SqlOverHttpError> {
fn try_parse(headers: &hyper1::http::HeaderMap) -> Result<Self, SqlOverHttpError> {
// Determine the output options. Default behaviour is 'false'. Anything that is not
// strictly 'true' assumed to be false.
let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE);
@@ -488,9 +491,9 @@ async fn handle_inner(
cancel: CancellationToken,
config: &'static ProxyConfig,
ctx: &mut RequestMonitoring,
request: Request<Body>,
request: Request<Incoming>,
backend: Arc<PoolingBackend>,
) -> Result<Response<Body>, SqlOverHttpError> {
) -> Result<Response<Full<Bytes>>, SqlOverHttpError> {
let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE
.with_label_values(&[ctx.protocol])
.guard();
@@ -528,7 +531,7 @@ async fn handle_inner(
}
let fetch_and_process_request = async {
let body = hyper::body::to_bytes(request.into_body()).await?;
let body = request.into_body().collect().await?.to_bytes();
info!(length = body.len(), "request payload read");
let payload: Payload = serde_json::from_slice(&body)?;
Ok::<Payload, ReadPayloadError>(payload) // Adjust error type accordingly
@@ -596,7 +599,7 @@ async fn handle_inner(
let body = serde_json::to_string(&result).expect("json serialization should not fail");
let len = body.len();
let response = response
.body(Body::from(body))
.body(Full::new(Bytes::from(body)))
// only fails if invalid status code or invalid header/values are given.
// these are not user configurable so it cannot fail dynamically
.expect("building response payload should not fail");
@@ -639,6 +642,7 @@ impl QueryData {
}
// The query was cancelled.
Either::Right((_cancelled, query)) => {
tracing::info!("cancelling query");
if let Err(err) = cancel_token.cancel_query(NoTls).await {
tracing::error!(?err, "could not cancel query");
}

View File

@@ -1,123 +0,0 @@
use std::{
convert::Infallible,
pin::Pin,
task::{Context, Poll},
time::Duration,
};
use hyper::server::{accept::Accept, conn::AddrStream};
use pin_project_lite::pin_project;
use tokio::{
io::{AsyncRead, AsyncWrite},
task::JoinSet,
time::timeout,
};
use tokio_rustls::{server::TlsStream, TlsAcceptor};
use tracing::{info, warn, Instrument};
use crate::{
metrics::TLS_HANDSHAKE_FAILURES,
protocol2::{WithClientIp, WithConnectionGuard},
};
pin_project! {
/// Wraps a `Stream` of connections (such as a TCP listener) so that each connection is itself
/// encrypted using TLS.
pub(crate) struct TlsListener<A: Accept> {
#[pin]
listener: A,
tls: TlsAcceptor,
waiting: JoinSet<Option<TlsStream<A::Conn>>>,
timeout: Duration,
}
}
impl<A: Accept> TlsListener<A> {
/// Create a `TlsListener` with default options.
pub(crate) fn new(tls: TlsAcceptor, listener: A, timeout: Duration) -> Self {
TlsListener {
listener,
tls,
waiting: JoinSet::new(),
timeout,
}
}
}
impl<A> Accept for TlsListener<A>
where
A: Accept<Conn = WithConnectionGuard<WithClientIp<AddrStream>>>,
A::Error: std::error::Error,
A::Conn: AsyncRead + AsyncWrite + Unpin + Send + 'static,
{
type Conn = TlsStream<A::Conn>;
type Error = Infallible;
fn poll_accept(
self: Pin<&mut Self>,
cx: &mut Context<'_>,
) -> Poll<Option<Result<Self::Conn, Self::Error>>> {
let mut this = self.project();
loop {
match this.listener.as_mut().poll_accept(cx) {
Poll::Pending => break,
Poll::Ready(Some(Ok(mut conn))) => {
let t = *this.timeout;
let tls = this.tls.clone();
let span = conn.span.clone();
this.waiting.spawn(async move {
let peer_addr = match conn.inner.wait_for_addr().await {
Ok(Some(addr)) => addr,
Err(e) => {
tracing::error!("failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}");
return None;
}
Ok(None) => conn.inner.inner.remote_addr()
};
let accept = tls.accept(conn);
match timeout(t, accept).await {
Ok(Ok(conn)) => {
info!(%peer_addr, "accepted new TLS connection");
Some(conn)
},
// The handshake failed, try getting another connection from the queue
Ok(Err(e)) => {
TLS_HANDSHAKE_FAILURES.inc();
warn!(%peer_addr, "failed to accept TLS connection: {e:?}");
None
}
// The handshake timed out, try getting another connection from the queue
Err(_) => {
TLS_HANDSHAKE_FAILURES.inc();
warn!(%peer_addr, "failed to accept TLS connection: timeout");
None
}
}
}.instrument(span));
}
Poll::Ready(Some(Err(e))) => {
tracing::error!("error accepting TCP connection: {e}");
continue;
}
Poll::Ready(None) => return Poll::Ready(None),
}
}
loop {
return match this.waiting.poll_join_next(cx) {
Poll::Ready(Some(Ok(Some(conn)))) => Poll::Ready(Some(Ok(conn))),
// The handshake failed to complete, try getting another connection from the queue
Poll::Ready(Some(Ok(None))) => continue,
// The handshake panicked or was cancelled. ignore and get another connection
Poll::Ready(Some(Err(e))) => {
tracing::warn!("handshake aborted: {e}");
continue;
}
_ => Poll::Pending,
};
}
}
}

View File

@@ -3,7 +3,8 @@
use crate::{
config::{MetricBackupCollectionConfig, MetricCollectionConfig},
context::parquet::{FAILED_UPLOAD_MAX_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
http, BranchId, EndpointId,
http,
intern::{BranchIdInt, EndpointIdInt},
};
use anyhow::Context;
use async_compression::tokio::write::GzipEncoder;
@@ -43,8 +44,8 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
/// because we enrich the event with project_id in the control-plane endpoint.
#[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)]
pub struct Ids {
pub endpoint_id: EndpointId,
pub branch_id: BranchId,
pub endpoint_id: EndpointIdInt,
pub branch_id: BranchIdInt,
}
pub trait MetricCounterRecorder {
@@ -494,7 +495,7 @@ mod tests {
use url::Url;
use super::*;
use crate::{http, rate_limiter::RateLimiterConfig};
use crate::{http, rate_limiter::RateLimiterConfig, BranchId, EndpointId};
#[tokio::test]
async fn metrics() {
@@ -536,8 +537,8 @@ mod tests {
// register a new counter
let counter = metrics.register(Ids {
endpoint_id: "e1".into(),
branch_id: "b1".into(),
endpoint_id: (&EndpointId::from("e1")).into(),
branch_id: (&BranchId::from("b1")).into(),
});
// the counter should be observed despite 0 egress

View File

@@ -337,6 +337,17 @@ pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
}
}
// if we don't have any data and zero LSNs, wait for something
while flush_lsn_rx.borrow().lsn == Lsn(0) {
tokio::select! {
_ = cancellation_rx.changed() => {
info!("timeline canceled");
return;
}
_ = flush_lsn_rx.changed() => {}
}
}
// fixing the segno and waiting some time to prevent reuploading the same segment too often
let pending_segno = backup.segno(flush_lsn_rx.borrow().lsn);
let timeout = tokio::time::sleep(await_duration);

View File

@@ -15,8 +15,7 @@ FLAKY_TESTS_QUERY = """
DISTINCT parent_suite, suite, name
FROM results
WHERE
started_at > CURRENT_DATE - INTERVAL '10' day
AND started_at > '2024-03-11 14:50:11.845+00' -- we switched the default PAGESERVER_VIRTUAL_FILE_IO_ENGINE to `tokio-epoll-uring` from `std-fs` on this date, we want to ignore the flaky tests for `std-fs`
started_at > CURRENT_DATE - INTERVAL '%s' day
AND (
(status IN ('failed', 'broken') AND reference = 'refs/heads/main')
OR flaky

View File

@@ -22,7 +22,7 @@ parser.add_argument("--safekeeper-host", required=True, type=str)
args = parser.parse_args()
access_key = os.getenv("CONSOLE_API_TOKEN")
endpoint: str = "https://console.stage.neon.tech/api"
endpoint: str = "https://console-stage.neon.build/api"
trash_dir: Path = args.trash_dir
dry_run: bool = args.dry_run

View File

@@ -3,7 +3,7 @@
3. Issue admin token (add/remove .stage from url for staging/prod and setting proper API key):
```
# staging:
AUTH_TOKEN=$(curl https://console.stage.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_STAGING_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt')
AUTH_TOKEN=$(curl https://console-stage.neon.build/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_STAGING_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt')
# prod:
AUTH_TOKEN=$(curl https://console.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_PROD_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt')
# check

View File

@@ -1,5 +1,5 @@
[package]
name = "attachment_service"
name = "storage_controller"
version = "0.1.0"
edition.workspace = true
license.workspace = true
@@ -45,8 +45,8 @@ diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
diesel_migrations = { version = "2.1.0" }
r2d2 = { version = "0.8.10" }
utils = { path = "../../libs/utils/" }
metrics = { path = "../../libs/metrics/" }
control_plane = { path = ".." }
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
utils = { path = "../libs/utils/" }
metrics = { path = "../libs/metrics/" }
control_plane = { path = "../control_plane" }
workspace_hack = { version = "0.1", path = "../workspace_hack" }

View File

@@ -1,3 +1,4 @@
use std::sync::Arc;
use std::{collections::HashMap, time::Duration};
use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
@@ -18,14 +19,26 @@ const SLOWDOWN_DELAY: Duration = Duration::from_secs(5);
pub(crate) const API_CONCURRENCY: usize = 32;
struct UnshardedComputeHookTenant {
// Which node is this tenant attached to
node_id: NodeId,
// Must hold this lock to send a notification.
send_lock: Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>,
}
struct ShardedComputeHookTenant {
stripe_size: ShardStripeSize,
shard_count: ShardCount,
shards: Vec<(ShardNumber, NodeId)>,
// Must hold this lock to send a notification. The contents represent
// the last successfully sent notification, and are used to coalesce multiple
// updates by only sending when there is a chance since our last successful send.
send_lock: Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>,
}
enum ComputeHookTenant {
Unsharded(NodeId),
Unsharded(UnshardedComputeHookTenant),
Sharded(ShardedComputeHookTenant),
}
@@ -37,9 +50,20 @@ impl ComputeHookTenant {
shards: vec![(tenant_shard_id.shard_number, node_id)],
stripe_size,
shard_count: tenant_shard_id.shard_count,
send_lock: Arc::default(),
})
} else {
Self::Unsharded(node_id)
Self::Unsharded(UnshardedComputeHookTenant {
node_id,
send_lock: Arc::default(),
})
}
}
fn get_send_lock(&self) -> &Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>> {
match self {
Self::Unsharded(unsharded_tenant) => &unsharded_tenant.send_lock,
Self::Sharded(sharded_tenant) => &sharded_tenant.send_lock,
}
}
@@ -52,8 +76,8 @@ impl ComputeHookTenant {
node_id: NodeId,
) {
match self {
Self::Unsharded(existing_node_id) if tenant_shard_id.shard_count.count() == 1 => {
*existing_node_id = node_id
Self::Unsharded(unsharded_tenant) if tenant_shard_id.shard_count.count() == 1 => {
unsharded_tenant.node_id = node_id
}
Self::Sharded(sharded_tenant)
if sharded_tenant.stripe_size == stripe_size
@@ -80,14 +104,14 @@ impl ComputeHookTenant {
}
}
#[derive(Serialize, Deserialize, Debug)]
#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)]
struct ComputeHookNotifyRequestShard {
node_id: NodeId,
shard_number: ShardNumber,
}
/// Request body that we send to the control plane to notify it of where a tenant is attached
#[derive(Serialize, Deserialize, Debug)]
#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)]
struct ComputeHookNotifyRequest {
tenant_id: TenantId,
stripe_size: Option<ShardStripeSize>,
@@ -120,14 +144,44 @@ pub(crate) enum NotifyError {
Fatal(StatusCode),
}
enum MaybeSendResult {
// Please send this request while holding the lock, and if you succeed then write
// the request into the lock.
Transmit(
(
ComputeHookNotifyRequest,
tokio::sync::OwnedMutexGuard<Option<ComputeHookNotifyRequest>>,
),
),
// Something requires sending, but you must wait for a current sender then call again
AwaitLock(Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>),
// Nothing requires sending
Noop,
}
impl ComputeHookTenant {
fn maybe_reconfigure(&self, tenant_id: TenantId) -> Option<ComputeHookNotifyRequest> {
match self {
Self::Unsharded(node_id) => Some(ComputeHookNotifyRequest {
fn maybe_send(
&self,
tenant_id: TenantId,
lock: Option<tokio::sync::OwnedMutexGuard<Option<ComputeHookNotifyRequest>>>,
) -> MaybeSendResult {
let locked = match lock {
Some(already_locked) => already_locked,
None => {
// Lock order: this _must_ be only a try_lock, because we are called inside of the [`ComputeHook::state`] lock.
let Ok(locked) = self.get_send_lock().clone().try_lock_owned() else {
return MaybeSendResult::AwaitLock(self.get_send_lock().clone());
};
locked
}
};
let request = match self {
Self::Unsharded(unsharded_tenant) => Some(ComputeHookNotifyRequest {
tenant_id,
shards: vec![ComputeHookNotifyRequestShard {
shard_number: ShardNumber(0),
node_id: *node_id,
node_id: unsharded_tenant.node_id,
}],
stripe_size: None,
}),
@@ -151,12 +205,25 @@ impl ComputeHookTenant {
// Sharded tenant doesn't yet have information for all its shards
tracing::info!(
"ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})",
"ComputeHookTenant::maybe_send: not enough shards ({}/{})",
sharded_tenant.shards.len(),
sharded_tenant.shard_count.count()
);
None
}
};
match request {
None => {
// Not yet ready to emit a notification
tracing::info!("Tenant isn't yet ready to emit a notification");
MaybeSendResult::Noop
}
Some(request) if Some(&request) == locked.as_ref() => {
// No change from the last value successfully sent
MaybeSendResult::Noop
}
Some(request) => MaybeSendResult::Transmit((request, locked)),
}
}
}
@@ -166,8 +233,15 @@ impl ComputeHookTenant {
/// the compute connection string.
pub(super) struct ComputeHook {
config: Config,
state: tokio::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
state: std::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
authorization_header: Option<String>,
// Concurrency limiter, so that we do not overload the cloud control plane when updating
// large numbers of tenants (e.g. when failing over after a node failure)
api_concurrency: tokio::sync::Semaphore,
// This lock is only used in testing enviroments, to serialize calls into neon_lock
neon_local_lock: tokio::sync::Mutex<()>,
}
impl ComputeHook {
@@ -181,14 +255,20 @@ impl ComputeHook {
state: Default::default(),
config,
authorization_header,
neon_local_lock: Default::default(),
api_concurrency: tokio::sync::Semaphore::new(API_CONCURRENCY),
}
}
/// For test environments: use neon_local's LocalEnv to update compute
async fn do_notify_local(
&self,
reconfigure_request: ComputeHookNotifyRequest,
reconfigure_request: &ComputeHookNotifyRequest,
) -> anyhow::Result<()> {
// neon_local updates are not safe to call concurrently, use a lock to serialize
// all calls to this function
let _locked = self.neon_local_lock.lock().await;
let env = match LocalEnv::load_config() {
Ok(e) => e,
Err(e) => {
@@ -205,7 +285,7 @@ impl ComputeHook {
} = reconfigure_request;
let compute_pageservers = shards
.into_iter()
.iter()
.map(|shard| {
let ps_conf = env
.get_pageserver_conf(shard.node_id)
@@ -217,10 +297,10 @@ impl ComputeHook {
.collect::<Vec<_>>();
for (endpoint_name, endpoint) in &cplane.endpoints {
if endpoint.tenant_id == tenant_id && endpoint.status() == EndpointStatus::Running {
if endpoint.tenant_id == *tenant_id && endpoint.status() == EndpointStatus::Running {
tracing::info!("Reconfiguring endpoint {}", endpoint_name,);
endpoint
.reconfigure(compute_pageservers.clone(), stripe_size)
.reconfigure(compute_pageservers.clone(), *stripe_size)
.await?;
}
}
@@ -298,12 +378,23 @@ impl ComputeHook {
async fn do_notify(
&self,
url: &String,
reconfigure_request: ComputeHookNotifyRequest,
reconfigure_request: &ComputeHookNotifyRequest,
cancel: &CancellationToken,
) -> Result<(), NotifyError> {
let client = reqwest::Client::new();
// We hold these semaphore units across all retries, rather than only across each
// HTTP request: this is to preserve fairness and avoid a situation where a retry might
// time out waiting for a semaphore.
let _units = self
.api_concurrency
.acquire()
.await
// Interpret closed semaphore as shutdown
.map_err(|_| NotifyError::ShuttingDown)?;
backoff::retry(
|| self.do_notify_iteration(&client, url, &reconfigure_request, cancel),
|| self.do_notify_iteration(&client, url, reconfigure_request, cancel),
|e| {
matches!(
e,
@@ -343,42 +434,70 @@ impl ComputeHook {
stripe_size: ShardStripeSize,
cancel: &CancellationToken,
) -> Result<(), NotifyError> {
let mut locked = self.state.lock().await;
let maybe_send_result = {
let mut state_locked = self.state.lock().unwrap();
use std::collections::hash_map::Entry;
let tenant = match locked.entry(tenant_shard_id.tenant_id) {
Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
tenant_shard_id,
stripe_size,
node_id,
)),
Entry::Occupied(e) => {
let tenant = e.into_mut();
tenant.update(tenant_shard_id, stripe_size, node_id);
tenant
use std::collections::hash_map::Entry;
let tenant = match state_locked.entry(tenant_shard_id.tenant_id) {
Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
tenant_shard_id,
stripe_size,
node_id,
)),
Entry::Occupied(e) => {
let tenant = e.into_mut();
tenant.update(tenant_shard_id, stripe_size, node_id);
tenant
}
};
tenant.maybe_send(tenant_shard_id.tenant_id, None)
};
// Process result: we may get an update to send, or we may have to wait for a lock
// before trying again.
let (request, mut send_lock_guard) = match maybe_send_result {
MaybeSendResult::Noop => {
return Ok(());
}
MaybeSendResult::AwaitLock(send_lock) => {
let send_locked = send_lock.lock_owned().await;
// Lock order: maybe_send is called within the `[Self::state]` lock, and takes the send lock, but here
// we have acquired the send lock and take `[Self::state]` lock. This is safe because maybe_send only uses
// try_lock.
let state_locked = self.state.lock().unwrap();
let Some(tenant) = state_locked.get(&tenant_shard_id.tenant_id) else {
return Ok(());
};
match tenant.maybe_send(tenant_shard_id.tenant_id, Some(send_locked)) {
MaybeSendResult::AwaitLock(_) => {
unreachable!("We supplied lock guard")
}
MaybeSendResult::Noop => {
return Ok(());
}
MaybeSendResult::Transmit((request, lock)) => (request, lock),
}
}
MaybeSendResult::Transmit((request, lock)) => (request, lock),
};
let reconfigure_request = tenant.maybe_reconfigure(tenant_shard_id.tenant_id);
let Some(reconfigure_request) = reconfigure_request else {
// The tenant doesn't yet have pageservers for all its shards: we won't notify anything
// until it does.
tracing::info!("Tenant isn't yet ready to emit a notification");
return Ok(());
};
if let Some(notify_url) = &self.config.compute_hook_url {
self.do_notify(notify_url, reconfigure_request, cancel)
.await
let result = if let Some(notify_url) = &self.config.compute_hook_url {
self.do_notify(notify_url, &request, cancel).await
} else {
self.do_notify_local(reconfigure_request)
.await
.map_err(|e| {
// This path is for testing only, so munge the error into our prod-style error type.
tracing::error!("Local notification hook failed: {e}");
NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR)
})
self.do_notify_local(&request).await.map_err(|e| {
// This path is for testing only, so munge the error into our prod-style error type.
tracing::error!("Local notification hook failed: {e}");
NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR)
})
};
if result.is_ok() {
// Before dropping the send lock, stash the request we just sent so that
// subsequent callers can avoid redundantly re-sending the same thing.
*send_lock_guard = Some(request);
}
result
}
}
@@ -402,21 +521,22 @@ pub(crate) mod tests {
NodeId(1),
);
// An unsharded tenant is always ready to emit a notification
assert!(tenant_state.maybe_reconfigure(tenant_id).is_some());
assert_eq!(
tenant_state
.maybe_reconfigure(tenant_id)
.unwrap()
.shards
.len(),
1
);
assert!(tenant_state
.maybe_reconfigure(tenant_id)
.unwrap()
.stripe_size
.is_none());
// An unsharded tenant is always ready to emit a notification, but won't
// send the same one twice
let send_result = tenant_state.maybe_send(tenant_id, None);
let MaybeSendResult::Transmit((request, mut guard)) = send_result else {
anyhow::bail!("Wrong send result");
};
assert_eq!(request.shards.len(), 1);
assert!(request.stripe_size.is_none());
// Simulate successful send
*guard = Some(request);
drop(guard);
// Try asking again: this should be a no-op
let send_result = tenant_state.maybe_send(tenant_id, None);
assert!(matches!(send_result, MaybeSendResult::Noop));
// Writing the first shard of a multi-sharded situation (i.e. in a split)
// resets the tenant state and puts it in an non-notifying state (need to
@@ -430,7 +550,10 @@ pub(crate) mod tests {
ShardStripeSize(32768),
NodeId(1),
);
assert!(tenant_state.maybe_reconfigure(tenant_id).is_none());
assert!(matches!(
tenant_state.maybe_send(tenant_id, None),
MaybeSendResult::Noop
));
// Writing the second shard makes it ready to notify
tenant_state.update(
@@ -443,22 +566,16 @@ pub(crate) mod tests {
NodeId(1),
);
assert!(tenant_state.maybe_reconfigure(tenant_id).is_some());
assert_eq!(
tenant_state
.maybe_reconfigure(tenant_id)
.unwrap()
.shards
.len(),
2
);
assert_eq!(
tenant_state
.maybe_reconfigure(tenant_id)
.unwrap()
.stripe_size,
Some(ShardStripeSize(32768))
);
let send_result = tenant_state.maybe_send(tenant_id, None);
let MaybeSendResult::Transmit((request, mut guard)) = send_result else {
anyhow::bail!("Wrong send result");
};
assert_eq!(request.shards.len(), 2);
assert_eq!(request.stripe_size, Some(ShardStripeSize(32768)));
// Simulate successful send
*guard = Some(request);
drop(guard);
Ok(())
}

View File

@@ -8,6 +8,7 @@ use futures::Future;
use hyper::header::CONTENT_TYPE;
use hyper::{Body, Request, Response};
use hyper::{StatusCode, Uri};
use metrics::{BuildInfo, NeonMetrics};
use pageserver_api::models::{
TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
TenantTimeTravelRequest, TimelineCreateRequest,
@@ -44,15 +45,19 @@ use control_plane::storage_controller::{AttachHookRequest, InspectRequest};
use routerify::Middleware;
/// State available to HTTP request handlers
#[derive(Clone)]
pub struct HttpState {
service: Arc<crate::service::Service>,
auth: Option<Arc<SwappableJwtAuth>>,
neon_metrics: NeonMetrics,
allowlist_routes: Vec<Uri>,
}
impl HttpState {
pub fn new(service: Arc<crate::service::Service>, auth: Option<Arc<SwappableJwtAuth>>) -> Self {
pub fn new(
service: Arc<crate::service::Service>,
auth: Option<Arc<SwappableJwtAuth>>,
build_info: BuildInfo,
) -> Self {
let allowlist_routes = ["/status", "/ready", "/metrics"]
.iter()
.map(|v| v.parse().unwrap())
@@ -60,6 +65,7 @@ impl HttpState {
Self {
service,
auth,
neon_metrics: NeonMetrics::new(build_info),
allowlist_routes,
}
}
@@ -602,9 +608,17 @@ where
.await
}
/// Check if the required scope is held in the request's token, or if the request has
/// a token with 'admin' scope then always permit it.
fn check_permissions(request: &Request<Body>, required_scope: Scope) -> Result<(), ApiError> {
check_permission_with(request, |claims| {
crate::auth::check_permission(claims, required_scope)
match crate::auth::check_permission(claims, required_scope) {
Err(e) => match crate::auth::check_permission(claims, Scope::Admin) {
Ok(()) => Ok(()),
Err(_) => Err(e),
},
Ok(()) => Ok(()),
}
})
}
@@ -664,10 +678,11 @@ fn epilogue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>
})
}
pub async fn measured_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
pub async fn measured_metrics_handler(req: Request<Body>) -> Result<Response<Body>, ApiError> {
pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4";
let payload = crate::metrics::METRICS_REGISTRY.encode();
let state = get_state(&req);
let payload = crate::metrics::METRICS_REGISTRY.encode(&state.neon_metrics);
let response = Response::builder()
.status(200)
.header(CONTENT_TYPE, TEXT_FORMAT)
@@ -696,6 +711,7 @@ where
pub fn make_router(
service: Arc<Service>,
auth: Option<Arc<SwappableJwtAuth>>,
build_info: BuildInfo,
) -> RouterBuilder<hyper::Body, ApiError> {
let mut router = endpoint::make_router()
.middleware(prologue_metrics_middleware())
@@ -712,7 +728,7 @@ pub fn make_router(
}
router
.data(Arc::new(HttpState::new(service, auth)))
.data(Arc::new(HttpState::new(service, auth, build_info)))
.get("/metrics", |r| {
named_request_span(r, measured_metrics_handler, RequestName("metrics"))
})

View File

@@ -14,7 +14,7 @@ mod reconciler;
mod scheduler;
mod schema;
pub mod service;
mod tenant_state;
mod tenant_shard;
#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Serialize)]
struct Sequence(u64);

View File

@@ -1,18 +1,20 @@
use anyhow::{anyhow, Context};
use attachment_service::http::make_router;
use attachment_service::metrics::preinitialize_metrics;
use attachment_service::persistence::Persistence;
use attachment_service::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT};
use camino::Utf8PathBuf;
use clap::Parser;
use diesel::Connection;
use metrics::launch_timestamp::LaunchTimestamp;
use metrics::BuildInfo;
use std::sync::Arc;
use storage_controller::http::make_router;
use storage_controller::metrics::preinitialize_metrics;
use storage_controller::persistence::Persistence;
use storage_controller::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT};
use tokio::signal::unix::SignalKind;
use tokio_util::sync::CancellationToken;
use utils::auth::{JwtAuth, SwappableJwtAuth};
use utils::logging::{self, LogFormat};
use utils::sentry_init::init_sentry;
use utils::{project_build_tag, project_git_version, tcp_listener};
project_git_version!(GIT_VERSION);
@@ -50,7 +52,7 @@ struct Cli {
#[arg(short, long)]
path: Option<Utf8PathBuf>,
/// URL to connect to postgres, like postgresql://localhost:1234/attachment_service
/// URL to connect to postgres, like postgresql://localhost:1234/storage_controller
#[arg(long)]
database_url: Option<String>,
@@ -158,6 +160,8 @@ fn main() -> anyhow::Result<()> {
std::process::exit(1);
}));
let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
tokio::runtime::Builder::new_current_thread()
// We use spawn_blocking for database operations, so require approximately
// as many blocking threads as we will open database connections.
@@ -189,6 +193,11 @@ async fn async_main() -> anyhow::Result<()> {
args.listen
);
let build_info = BuildInfo {
revision: GIT_VERSION,
build_tag: BUILD_TAG,
};
let strict_mode = if args.dev {
StrictMode::Dev
} else {
@@ -250,7 +259,7 @@ async fn async_main() -> anyhow::Result<()> {
let auth = secrets
.public_key
.map(|jwt_auth| Arc::new(SwappableJwtAuth::new(jwt_auth)));
let router = make_router(service.clone(), auth)
let router = make_router(service.clone(), auth, build_info)
.build()
.map_err(|err| anyhow!(err))?;
let router_service = utils::http::RouterService::new(router).unwrap();

View File

@@ -8,10 +8,8 @@
//! The rest of the code defines label group types and deals with converting outer types to labels.
//!
use bytes::Bytes;
use measured::{
label::{LabelValue, StaticLabelSet},
FixedCardinalityLabel, MetricGroup,
};
use measured::{label::LabelValue, metric::histogram, FixedCardinalityLabel, MetricGroup};
use metrics::NeonMetrics;
use once_cell::sync::Lazy;
use std::sync::Mutex;
@@ -26,13 +24,15 @@ pub fn preinitialize_metrics() {
pub(crate) struct StorageControllerMetrics {
pub(crate) metrics_group: StorageControllerMetricGroup,
encoder: Mutex<measured::text::TextEncoder>,
encoder: Mutex<measured::text::BufferedTextEncoder>,
}
#[derive(measured::MetricGroup)]
#[metric(new())]
pub(crate) struct StorageControllerMetricGroup {
/// Count of how many times we spawn a reconcile task
pub(crate) storage_controller_reconcile_spawn: measured::Counter,
/// Reconciler tasks completed, broken down by success/failure/cancelled
pub(crate) storage_controller_reconcile_complete:
measured::CounterVec<ReconcileCompleteLabelGroupSet>,
@@ -43,7 +43,9 @@ pub(crate) struct StorageControllerMetricGroup {
/// HTTP request status counters for handled requests
pub(crate) storage_controller_http_request_status:
measured::CounterVec<HttpRequestStatusLabelGroupSet>,
/// HTTP request handler latency across all status codes
#[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
pub(crate) storage_controller_http_request_latency:
measured::HistogramVec<HttpRequestLatencyLabelGroupSet, 5>,
@@ -55,6 +57,7 @@ pub(crate) struct StorageControllerMetricGroup {
/// Latency of HTTP requests to the pageserver, broken down by pageserver
/// node id, request name and method. This include both successful and unsuccessful
/// requests.
#[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
pub(crate) storage_controller_pageserver_request_latency:
measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
@@ -66,6 +69,7 @@ pub(crate) struct StorageControllerMetricGroup {
/// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver
/// node id, request name and method. This include both successful and unsuccessful
/// requests.
#[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
pub(crate) storage_controller_passthrough_request_latency:
measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
@@ -74,76 +78,34 @@ pub(crate) struct StorageControllerMetricGroup {
measured::CounterVec<DatabaseQueryErrorLabelGroupSet>,
/// Latency of database queries, broken down by operation.
#[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
pub(crate) storage_controller_database_query_latency:
measured::HistogramVec<DatabaseQueryLatencyLabelGroupSet, 5>,
}
impl StorageControllerMetrics {
pub(crate) fn encode(&self) -> Bytes {
pub(crate) fn encode(&self, neon_metrics: &NeonMetrics) -> Bytes {
let mut encoder = self.encoder.lock().unwrap();
self.metrics_group.collect_into(&mut *encoder);
neon_metrics
.collect_group_into(&mut *encoder)
.unwrap_or_else(|infallible| match infallible {});
self.metrics_group
.collect_group_into(&mut *encoder)
.unwrap_or_else(|infallible| match infallible {});
encoder.finish()
}
}
impl Default for StorageControllerMetrics {
fn default() -> Self {
Self {
metrics_group: StorageControllerMetricGroup::new(),
encoder: Mutex::new(measured::text::TextEncoder::new()),
}
}
}
let mut metrics_group = StorageControllerMetricGroup::new();
metrics_group
.storage_controller_reconcile_complete
.init_all_dense();
impl StorageControllerMetricGroup {
pub(crate) fn new() -> Self {
Self {
storage_controller_reconcile_spawn: measured::Counter::new(),
storage_controller_reconcile_complete: measured::CounterVec::new(
ReconcileCompleteLabelGroupSet {
status: StaticLabelSet::new(),
},
),
storage_controller_schedule_optimization: measured::Counter::new(),
storage_controller_http_request_status: measured::CounterVec::new(
HttpRequestStatusLabelGroupSet {
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
status: StaticLabelSet::new(),
},
),
storage_controller_http_request_latency: measured::HistogramVec::new(
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
),
storage_controller_pageserver_request_error: measured::CounterVec::new(
PageserverRequestLabelGroupSet {
pageserver_id: lasso::ThreadedRodeo::new(),
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
},
),
storage_controller_pageserver_request_latency: measured::HistogramVec::new(
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
),
storage_controller_passthrough_request_error: measured::CounterVec::new(
PageserverRequestLabelGroupSet {
pageserver_id: lasso::ThreadedRodeo::new(),
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
},
),
storage_controller_passthrough_request_latency: measured::HistogramVec::new(
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
),
storage_controller_database_query_error: measured::CounterVec::new(
DatabaseQueryErrorLabelGroupSet {
operation: StaticLabelSet::new(),
error_type: StaticLabelSet::new(),
},
),
storage_controller_database_query_latency: measured::HistogramVec::new(
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
),
metrics_group,
encoder: Mutex::new(measured::text::BufferedTextEncoder::new()),
}
}
}
@@ -157,7 +119,7 @@ pub(crate) struct ReconcileCompleteLabelGroup {
#[derive(measured::LabelGroup)]
#[label(set = HttpRequestStatusLabelGroupSet)]
pub(crate) struct HttpRequestStatusLabelGroup<'a> {
#[label(dynamic_with = lasso::ThreadedRodeo)]
#[label(dynamic_with = lasso::ThreadedRodeo, default)]
pub(crate) path: &'a str,
pub(crate) method: Method,
pub(crate) status: StatusCode,
@@ -166,40 +128,21 @@ pub(crate) struct HttpRequestStatusLabelGroup<'a> {
#[derive(measured::LabelGroup)]
#[label(set = HttpRequestLatencyLabelGroupSet)]
pub(crate) struct HttpRequestLatencyLabelGroup<'a> {
#[label(dynamic_with = lasso::ThreadedRodeo)]
#[label(dynamic_with = lasso::ThreadedRodeo, default)]
pub(crate) path: &'a str,
pub(crate) method: Method,
}
impl Default for HttpRequestLatencyLabelGroupSet {
fn default() -> Self {
Self {
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
}
}
}
#[derive(measured::LabelGroup, Clone)]
#[label(set = PageserverRequestLabelGroupSet)]
pub(crate) struct PageserverRequestLabelGroup<'a> {
#[label(dynamic_with = lasso::ThreadedRodeo)]
#[label(dynamic_with = lasso::ThreadedRodeo, default)]
pub(crate) pageserver_id: &'a str,
#[label(dynamic_with = lasso::ThreadedRodeo)]
#[label(dynamic_with = lasso::ThreadedRodeo, default)]
pub(crate) path: &'a str,
pub(crate) method: Method,
}
impl Default for PageserverRequestLabelGroupSet {
fn default() -> Self {
Self {
pageserver_id: lasso::ThreadedRodeo::new(),
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
}
}
}
#[derive(measured::LabelGroup)]
#[label(set = DatabaseQueryErrorLabelGroupSet)]
pub(crate) struct DatabaseQueryErrorLabelGroup {
@@ -213,7 +156,7 @@ pub(crate) struct DatabaseQueryLatencyLabelGroup {
pub(crate) operation: DatabaseOperation,
}
#[derive(FixedCardinalityLabel)]
#[derive(FixedCardinalityLabel, Clone, Copy)]
pub(crate) enum ReconcileOutcome {
#[label(rename = "ok")]
Success,
@@ -221,7 +164,7 @@ pub(crate) enum ReconcileOutcome {
Cancel,
}
#[derive(FixedCardinalityLabel, Clone)]
#[derive(FixedCardinalityLabel, Copy, Clone)]
pub(crate) enum Method {
Get,
Put,
@@ -246,11 +189,12 @@ impl From<hyper::Method> for Method {
}
}
#[derive(Clone, Copy)]
pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode);
impl LabelValue for StatusCode {
fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
v.write_int(self.0.as_u16() as u64)
v.write_int(self.0.as_u16() as i64)
}
}
@@ -268,7 +212,7 @@ impl FixedCardinalityLabel for StatusCode {
}
}
#[derive(FixedCardinalityLabel)]
#[derive(FixedCardinalityLabel, Clone, Copy)]
pub(crate) enum DatabaseErrorLabel {
Query,
Connection,

Some files were not shown because too many files have changed in this diff Show More