Merge pull request #7378 from neondatabase/rc/2024-04-15

Release 2024-04-15
This commit is contained in:
Christian Schwarz
2024-04-15 15:48:14 +03:00
committed by GitHub
92 changed files with 2982 additions and 1810 deletions

View File

@@ -150,7 +150,7 @@ runs:
# Use aws s3 cp (instead of aws s3 sync) to keep files from previous runs to make old URLs work,
# and to keep files on the host to upload them to the database
time aws s3 cp --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"
time s5cmd --log error cp "${WORKDIR}/report/*" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}/"
# Generate redirect
cat <<EOF > ${WORKDIR}/index.html

View File

@@ -10,7 +10,7 @@ inputs:
required: true
api_host:
desctiption: 'Neon API host'
default: console.stage.neon.tech
default: console-stage.neon.build
outputs:
dsn:
description: 'Created Branch DSN (for main database)'

View File

@@ -13,7 +13,7 @@ inputs:
required: true
api_host:
desctiption: 'Neon API host'
default: console.stage.neon.tech
default: console-stage.neon.build
runs:
using: "composite"

View File

@@ -13,7 +13,7 @@ inputs:
default: 15
api_host:
desctiption: 'Neon API host'
default: console.stage.neon.tech
default: console-stage.neon.build
provisioner:
desctiption: 'k8s-pod or k8s-neonvm'
default: 'k8s-pod'

View File

@@ -10,7 +10,7 @@ inputs:
required: true
api_host:
desctiption: 'Neon API host'
default: console.stage.neon.tech
default: console-stage.neon.build
runs:
using: "composite"

View File

@@ -18,6 +18,7 @@ on:
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
cancel-in-progress: false
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View File

@@ -21,6 +21,7 @@ defaults:
concurrency:
group: build-build-tools-image-${{ inputs.image-tag }}
cancel-in-progress: false
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
permissions: {}

View File

@@ -20,6 +20,7 @@ defaults:
concurrency:
group: pin-build-tools-image-${{ inputs.from-tag }}
cancel-in-progress: false
permissions: {}

397
Cargo.lock generated
View File

@@ -270,6 +270,12 @@ dependencies = [
"critical-section",
]
[[package]]
name = "atomic-take"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8ab6b55fe97976e46f91ddbed8d147d966475dc29b2032757ba47e02376fbc3"
[[package]]
name = "autocfg"
version = "1.1.0"
@@ -298,7 +304,7 @@ dependencies = [
"fastrand 2.0.0",
"hex",
"http 0.2.9",
"hyper",
"hyper 0.14.26",
"ring 0.17.6",
"time",
"tokio",
@@ -335,7 +341,7 @@ dependencies = [
"bytes",
"fastrand 2.0.0",
"http 0.2.9",
"http-body",
"http-body 0.4.5",
"percent-encoding",
"pin-project-lite",
"tracing",
@@ -386,7 +392,7 @@ dependencies = [
"aws-types",
"bytes",
"http 0.2.9",
"http-body",
"http-body 0.4.5",
"once_cell",
"percent-encoding",
"regex-lite",
@@ -514,7 +520,7 @@ dependencies = [
"crc32fast",
"hex",
"http 0.2.9",
"http-body",
"http-body 0.4.5",
"md-5",
"pin-project-lite",
"sha1",
@@ -546,7 +552,7 @@ dependencies = [
"bytes-utils",
"futures-core",
"http 0.2.9",
"http-body",
"http-body 0.4.5",
"once_cell",
"percent-encoding",
"pin-project-lite",
@@ -585,10 +591,10 @@ dependencies = [
"aws-smithy-types",
"bytes",
"fastrand 2.0.0",
"h2",
"h2 0.3.26",
"http 0.2.9",
"http-body",
"hyper",
"http-body 0.4.5",
"hyper 0.14.26",
"hyper-rustls",
"once_cell",
"pin-project-lite",
@@ -626,7 +632,7 @@ dependencies = [
"bytes-utils",
"futures-core",
"http 0.2.9",
"http-body",
"http-body 0.4.5",
"itoa",
"num-integer",
"pin-project-lite",
@@ -675,8 +681,8 @@ dependencies = [
"bytes",
"futures-util",
"http 0.2.9",
"http-body",
"hyper",
"http-body 0.4.5",
"hyper 0.14.26",
"itoa",
"matchit",
"memchr",
@@ -691,7 +697,7 @@ dependencies = [
"sha1",
"sync_wrapper",
"tokio",
"tokio-tungstenite",
"tokio-tungstenite 0.20.0",
"tower",
"tower-layer",
"tower-service",
@@ -707,7 +713,7 @@ dependencies = [
"bytes",
"futures-util",
"http 0.2.9",
"http-body",
"http-body 0.4.5",
"mime",
"rustversion",
"tower-layer",
@@ -1124,7 +1130,7 @@ version = "4.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "191d9573962933b4027f932c600cd252ce27a8ad5979418fe78e43c07996f27b"
dependencies = [
"heck",
"heck 0.4.1",
"proc-macro2",
"quote",
"syn 2.0.52",
@@ -1196,7 +1202,7 @@ dependencies = [
"compute_api",
"flate2",
"futures",
"hyper",
"hyper 0.14.26",
"nix 0.27.1",
"notify",
"num_cpus",
@@ -1313,7 +1319,7 @@ dependencies = [
"git-version",
"hex",
"humantime",
"hyper",
"hyper 0.14.26",
"nix 0.27.1",
"once_cell",
"pageserver_api",
@@ -1462,12 +1468,9 @@ dependencies = [
[[package]]
name = "crossbeam-utils"
version = "0.8.15"
version = "0.8.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c063cd8cc95f5c377ed0d4b49a4b21f632396ff690e8470c29b3359b346984b"
dependencies = [
"cfg-if",
]
checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345"
[[package]]
name = "crossterm"
@@ -1840,23 +1843,12 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
[[package]]
name = "errno"
version = "0.3.1"
version = "0.3.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a"
checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245"
dependencies = [
"errno-dragonfly",
"libc",
"windows-sys 0.48.0",
]
[[package]]
name = "errno-dragonfly"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf"
dependencies = [
"cc",
"libc",
"windows-sys 0.52.0",
]
[[package]]
@@ -2213,6 +2205,25 @@ dependencies = [
"tracing",
]
[[package]]
name = "h2"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "816ec7294445779408f36fe57bc5b7fc1cf59664059096c65f905c1c61f58069"
dependencies = [
"bytes",
"fnv",
"futures-core",
"futures-sink",
"futures-util",
"http 1.1.0",
"indexmap 2.0.1",
"slab",
"tokio",
"tokio-util",
"tracing",
]
[[package]]
name = "half"
version = "1.8.2"
@@ -2294,6 +2305,12 @@ version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
[[package]]
name = "heck"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "hermit-abi"
version = "0.3.3"
@@ -2378,6 +2395,29 @@ dependencies = [
"pin-project-lite",
]
[[package]]
name = "http-body"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1cac85db508abc24a2e48553ba12a996e87244a0395ce011e62b37158745d643"
dependencies = [
"bytes",
"http 1.1.0",
]
[[package]]
name = "http-body-util"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41cb79eb393015dadd30fc252023adb0b2400a0caee0fa2a077e6e21a551e840"
dependencies = [
"bytes",
"futures-util",
"http 1.1.0",
"http-body 1.0.0",
"pin-project-lite",
]
[[package]]
name = "http-types"
version = "2.12.0"
@@ -2436,9 +2476,9 @@ dependencies = [
"futures-channel",
"futures-core",
"futures-util",
"h2",
"h2 0.3.26",
"http 0.2.9",
"http-body",
"http-body 0.4.5",
"httparse",
"httpdate",
"itoa",
@@ -2450,6 +2490,26 @@ dependencies = [
"want",
]
[[package]]
name = "hyper"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "186548d73ac615b32a73aafe38fb4f56c0d340e110e5a200bcadbaf2e199263a"
dependencies = [
"bytes",
"futures-channel",
"futures-util",
"h2 0.4.4",
"http 1.1.0",
"http-body 1.0.0",
"httparse",
"httpdate",
"itoa",
"pin-project-lite",
"smallvec",
"tokio",
]
[[package]]
name = "hyper-rustls"
version = "0.24.0"
@@ -2457,7 +2517,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0646026eb1b3eea4cd9ba47912ea5ce9cc07713d105b1a14698f4e6433d348b7"
dependencies = [
"http 0.2.9",
"hyper",
"hyper 0.14.26",
"log",
"rustls 0.21.9",
"rustls-native-certs 0.6.2",
@@ -2471,7 +2531,7 @@ version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1"
dependencies = [
"hyper",
"hyper 0.14.26",
"pin-project-lite",
"tokio",
"tokio-io-timeout",
@@ -2484,7 +2544,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905"
dependencies = [
"bytes",
"hyper",
"hyper 0.14.26",
"native-tls",
"tokio",
"tokio-native-tls",
@@ -2492,15 +2552,33 @@ dependencies = [
[[package]]
name = "hyper-tungstenite"
version = "0.11.1"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7cc7dcb1ab67cd336f468a12491765672e61a3b6b148634dbfe2fe8acd3fe7d9"
checksum = "7a343d17fe7885302ed7252767dc7bb83609a874b6ff581142241ec4b73957ad"
dependencies = [
"hyper",
"http-body-util",
"hyper 1.2.0",
"hyper-util",
"pin-project-lite",
"tokio",
"tokio-tungstenite",
"tungstenite",
"tokio-tungstenite 0.21.0",
"tungstenite 0.21.0",
]
[[package]]
name = "hyper-util"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca38ef113da30126bbff9cd1705f9273e15d45498615d138b0c20279ac7a76aa"
dependencies = [
"bytes",
"futures-util",
"http 1.1.0",
"http-body 1.0.0",
"hyper 1.2.0",
"pin-project-lite",
"socket2 0.5.5",
"tokio",
]
[[package]]
@@ -2794,6 +2872,12 @@ version = "0.3.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"
[[package]]
name = "linux-raw-sys"
version = "0.4.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c"
[[package]]
name = "lock_api"
version = "0.4.10"
@@ -2848,11 +2932,12 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
[[package]]
name = "measured"
version = "0.0.13"
version = "0.0.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f246648d027839a34b420e27c7de1165ace96e19ef894985d0a6ff89a7840a9f"
checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5"
dependencies = [
"bytes",
"crossbeam-utils",
"hashbrown 0.14.0",
"itoa",
"lasso",
@@ -2865,16 +2950,27 @@ dependencies = [
[[package]]
name = "measured-derive"
version = "0.0.13"
version = "0.0.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "edaa5cc22d99d5d6d7d99c3b5b5f7e7f8034c22f1b5d62a1adecd2ed005d9b80"
checksum = "6ea497f33e1e856a376c32ad916f69a0bd3c597db1f912a399f842b01a4a685d"
dependencies = [
"heck",
"heck 0.5.0",
"proc-macro2",
"quote",
"syn 2.0.52",
]
[[package]]
name = "measured-process"
version = "0.0.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b364ccb66937a814b6b2ad751d1a2f7a9d5a78c761144036825fb36bb0771000"
dependencies = [
"libc",
"measured",
"procfs 0.16.0",
]
[[package]]
name = "memchr"
version = "2.6.4"
@@ -2914,8 +3010,10 @@ version = "0.1.0"
dependencies = [
"chrono",
"libc",
"measured",
"measured-process",
"once_cell",
"procfs",
"procfs 0.14.2",
"prometheus",
"rand 0.8.5",
"rand_distr",
@@ -3465,12 +3563,17 @@ dependencies = [
"camino",
"clap",
"git-version",
"humantime",
"pageserver",
"pageserver_api",
"postgres_ffi",
"remote_storage",
"serde",
"serde_json",
"svg_fmt",
"tokio",
"tokio-util",
"toml_edit",
"utils",
"workspace_hack",
]
@@ -3506,7 +3609,7 @@ dependencies = [
"hex-literal",
"humantime",
"humantime-serde",
"hyper",
"hyper 0.14.26",
"itertools",
"leaky-bucket",
"md5",
@@ -3525,7 +3628,7 @@ dependencies = [
"postgres_connection",
"postgres_ffi",
"pq_proto",
"procfs",
"procfs 0.14.2",
"rand 0.8.5",
"regex",
"remote_storage",
@@ -3616,7 +3719,6 @@ dependencies = [
"anyhow",
"async-compression",
"async-stream",
"async-trait",
"byteorder",
"bytes",
"chrono",
@@ -4086,6 +4188,29 @@ dependencies = [
"rustix 0.36.16",
]
[[package]]
name = "procfs"
version = "0.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4"
dependencies = [
"bitflags 2.4.1",
"hex",
"lazy_static",
"procfs-core",
"rustix 0.38.28",
]
[[package]]
name = "procfs-core"
version = "0.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29"
dependencies = [
"bitflags 2.4.1",
"hex",
]
[[package]]
name = "prometheus"
version = "0.13.3"
@@ -4098,7 +4223,7 @@ dependencies = [
"libc",
"memchr",
"parking_lot 0.12.1",
"procfs",
"procfs 0.14.2",
"thiserror",
]
@@ -4119,7 +4244,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270"
dependencies = [
"bytes",
"heck",
"heck 0.4.1",
"itertools",
"lazy_static",
"log",
@@ -4163,6 +4288,7 @@ dependencies = [
"anyhow",
"async-compression",
"async-trait",
"atomic-take",
"aws-config",
"aws-sdk-iam",
"aws-sigv4",
@@ -4186,13 +4312,17 @@ dependencies = [
"hmac",
"hostname",
"http 1.1.0",
"http-body-util",
"humantime",
"hyper",
"hyper 0.14.26",
"hyper 1.2.0",
"hyper-tungstenite",
"hyper-util",
"ipnet",
"itertools",
"lasso",
"md5",
"measured",
"metrics",
"native-tls",
"once_cell",
@@ -4521,7 +4651,7 @@ dependencies = [
"futures-util",
"http-types",
"humantime",
"hyper",
"hyper 0.14.26",
"itertools",
"metrics",
"once_cell",
@@ -4551,10 +4681,10 @@ dependencies = [
"encoding_rs",
"futures-core",
"futures-util",
"h2",
"h2 0.3.26",
"http 0.2.9",
"http-body",
"hyper",
"http-body 0.4.5",
"hyper 0.14.26",
"hyper-rustls",
"hyper-tls",
"ipnet",
@@ -4612,7 +4742,7 @@ dependencies = [
"futures",
"getrandom 0.2.11",
"http 0.2.9",
"hyper",
"hyper 0.14.26",
"parking_lot 0.11.2",
"reqwest",
"reqwest-middleware",
@@ -4699,7 +4829,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "496c1d3718081c45ba9c31fbfc07417900aa96f4070ff90dc29961836b7a9945"
dependencies = [
"http 0.2.9",
"hyper",
"hyper 0.14.26",
"lazy_static",
"percent-encoding",
"regex",
@@ -4811,6 +4941,19 @@ dependencies = [
"windows-sys 0.48.0",
]
[[package]]
name = "rustix"
version = "0.38.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316"
dependencies = [
"bitflags 2.4.1",
"errno",
"libc",
"linux-raw-sys 0.4.13",
"windows-sys 0.52.0",
]
[[package]]
name = "rustls"
version = "0.21.9"
@@ -4991,7 +5134,7 @@ dependencies = [
"git-version",
"hex",
"humantime",
"hyper",
"hyper 0.14.26",
"metrics",
"once_cell",
"parking_lot 0.12.1",
@@ -5476,9 +5619,9 @@ dependencies = [
[[package]]
name = "smallvec"
version = "1.11.0"
version = "1.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9"
checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7"
[[package]]
name = "smol_str"
@@ -5570,7 +5713,7 @@ dependencies = [
"futures-util",
"git-version",
"humantime",
"hyper",
"hyper 0.14.26",
"metrics",
"once_cell",
"parking_lot 0.12.1",
@@ -5601,7 +5744,7 @@ dependencies = [
"git-version",
"hex",
"humantime",
"hyper",
"hyper 0.14.26",
"itertools",
"lasso",
"measured",
@@ -5630,7 +5773,7 @@ dependencies = [
"anyhow",
"clap",
"comfy-table",
"hyper",
"hyper 0.14.26",
"pageserver_api",
"pageserver_client",
"reqwest",
@@ -5671,7 +5814,7 @@ version = "0.24.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59"
dependencies = [
"heck",
"heck 0.4.1",
"proc-macro2",
"quote",
"rustversion",
@@ -6113,7 +6256,19 @@ dependencies = [
"futures-util",
"log",
"tokio",
"tungstenite",
"tungstenite 0.20.1",
]
[[package]]
name = "tokio-tungstenite"
version = "0.21.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c83b561d025642014097b66e6c1bb422783339e0909e4429cde4749d1990bc38"
dependencies = [
"futures-util",
"log",
"tokio",
"tungstenite 0.21.0",
]
[[package]]
@@ -6180,10 +6335,10 @@ dependencies = [
"bytes",
"futures-core",
"futures-util",
"h2",
"h2 0.3.26",
"http 0.2.9",
"http-body",
"hyper",
"http-body 0.4.5",
"hyper 0.14.26",
"hyper-timeout",
"percent-encoding",
"pin-project",
@@ -6369,7 +6524,7 @@ dependencies = [
name = "tracing-utils"
version = "0.1.0"
dependencies = [
"hyper",
"hyper 0.14.26",
"opentelemetry",
"opentelemetry-otlp",
"opentelemetry-semantic-conventions",
@@ -6406,6 +6561,25 @@ dependencies = [
"utf-8",
]
[[package]]
name = "tungstenite"
version = "0.21.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ef1a641ea34f399a848dea702823bbecfb4c486f911735368f1f137cb8257e1"
dependencies = [
"byteorder",
"bytes",
"data-encoding",
"http 1.1.0",
"httparse",
"log",
"rand 0.8.5",
"sha1",
"thiserror",
"url",
"utf-8",
]
[[package]]
name = "twox-hash"
version = "1.6.3"
@@ -6570,7 +6744,8 @@ dependencies = [
"heapless",
"hex",
"hex-literal",
"hyper",
"humantime",
"hyper 0.14.26",
"jsonwebtoken",
"leaky-bucket",
"metrics",
@@ -6930,6 +7105,15 @@ dependencies = [
"windows-targets 0.48.0",
]
[[package]]
name = "windows-sys"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
dependencies = [
"windows-targets 0.52.4",
]
[[package]]
name = "windows-targets"
version = "0.42.2"
@@ -6960,6 +7144,21 @@ dependencies = [
"windows_x86_64_msvc 0.48.0",
]
[[package]]
name = "windows-targets"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b"
dependencies = [
"windows_aarch64_gnullvm 0.52.4",
"windows_aarch64_msvc 0.52.4",
"windows_i686_gnu 0.52.4",
"windows_i686_msvc 0.52.4",
"windows_x86_64_gnu 0.52.4",
"windows_x86_64_gnullvm 0.52.4",
"windows_x86_64_msvc 0.52.4",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.42.2"
@@ -6972,6 +7171,12 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc"
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9"
[[package]]
name = "windows_aarch64_msvc"
version = "0.42.2"
@@ -6984,6 +7189,12 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675"
[[package]]
name = "windows_i686_gnu"
version = "0.42.2"
@@ -6996,6 +7207,12 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241"
[[package]]
name = "windows_i686_gnu"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3"
[[package]]
name = "windows_i686_msvc"
version = "0.42.2"
@@ -7008,6 +7225,12 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00"
[[package]]
name = "windows_i686_msvc"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02"
[[package]]
name = "windows_x86_64_gnu"
version = "0.42.2"
@@ -7020,6 +7243,12 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.42.2"
@@ -7032,6 +7261,12 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177"
[[package]]
name = "windows_x86_64_msvc"
version = "0.42.2"
@@ -7044,6 +7279,12 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8"
[[package]]
name = "winnow"
version = "0.4.6"
@@ -7092,11 +7333,10 @@ dependencies = [
"futures-sink",
"futures-util",
"getrandom 0.2.11",
"hashbrown 0.13.2",
"hashbrown 0.14.0",
"hex",
"hmac",
"hyper",
"hyper 0.14.26",
"indexmap 1.9.3",
"itertools",
"libc",
@@ -7134,7 +7374,6 @@ dependencies = [
"tower",
"tracing",
"tracing-core",
"tungstenite",
"url",
"uuid",
"zeroize",

View File

@@ -44,6 +44,7 @@ license = "Apache-2.0"
anyhow = { version = "1.0", features = ["backtrace"] }
arc-swap = "1.6"
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
atomic-take = "1.1.0"
azure_core = "0.18"
azure_identity = "0.18"
azure_storage = "0.18"
@@ -97,7 +98,7 @@ http-types = { version = "2", default-features = false }
humantime = "2.1"
humantime-serde = "1.1.1"
hyper = "0.14"
hyper-tungstenite = "0.11"
hyper-tungstenite = "0.13.0"
inotify = "0.10.2"
ipnet = "2.9.0"
itertools = "0.10"
@@ -106,7 +107,8 @@ lasso = "0.7"
leaky-bucket = "1.0.1"
libc = "0.2"
md5 = "0.7.0"
measured = { version = "0.0.13", features=["default", "lasso"] }
measured = { version = "0.0.21", features=["lasso"] }
measured-process = { version = "0.0.21" }
memoffset = "0.8"
native-tls = "0.2"
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }

View File

@@ -58,6 +58,12 @@ RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v$
&& mv protoc/include/google /usr/local/include/google \
&& rm -rf protoc.zip protoc
# s5cmd
ENV S5CMD_VERSION=2.2.2
RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/s5cmd_${S5CMD_VERSION}_Linux-$(uname -m | sed 's/x86_64/64bit/g' | sed 's/aarch64/arm64/g').tar.gz" | tar zxvf - s5cmd \
&& chmod +x s5cmd \
&& mv s5cmd /usr/local/bin/s5cmd
# LLVM
ENV LLVM_VERSION=17
RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \

View File

@@ -86,7 +86,10 @@ where
.stdout(process_log_file)
.stderr(same_file_for_stderr)
.args(args);
let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command));
let filled_cmd = fill_env_vars_prefixed_neon(fill_remote_storage_secrets_vars(
fill_rust_env_vars(background_command),
));
filled_cmd.envs(envs);
let pid_file_to_check = match &initial_pid_file {
@@ -268,6 +271,15 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {
cmd
}
fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command {
for (var, val) in std::env::vars() {
if var.starts_with("NEON_PAGESERVER_") {
cmd = cmd.env(var, val);
}
}
cmd
}
/// Add a `pre_exec` to the cmd that, inbetween fork() and exec(),
/// 1. Claims a pidfile with a fcntl lock on it and
/// 2. Sets up the pidfile's file descriptor so that it (and the lock)

View File

@@ -1231,7 +1231,7 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
match ComputeControlPlane::load(env.clone()) {
Ok(cplane) => {
for (_k, node) in cplane.endpoints {
if let Err(e) = node.stop(if immediate { "immediate" } else { "fast " }, false) {
if let Err(e) = node.stop(if immediate { "immediate" } else { "fast" }, false) {
eprintln!("postgres stop failed: {e:#}");
}
}

View File

@@ -10,11 +10,13 @@ libc.workspace = true
once_cell.workspace = true
chrono.workspace = true
twox-hash.workspace = true
measured.workspace = true
workspace_hack.workspace = true
[target.'cfg(target_os = "linux")'.dependencies]
procfs.workspace = true
measured-process.workspace = true
[dev-dependencies]
rand = "0.8"

View File

@@ -7,14 +7,19 @@
//! use significantly less memory than this, but can only approximate the cardinality.
use std::{
collections::HashMap,
hash::{BuildHasher, BuildHasherDefault, Hash, Hasher},
sync::{atomic::AtomicU8, Arc, RwLock},
hash::{BuildHasher, BuildHasherDefault, Hash},
sync::atomic::AtomicU8,
};
use prometheus::{
core::{self, Describer},
proto, Opts,
use measured::{
label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor},
metric::{
group::{Encoding, MetricValue},
name::MetricNameEncoder,
Metric, MetricType, MetricVec,
},
text::TextEncoder,
LabelGroup,
};
use twox_hash::xxh3;
@@ -93,203 +98,25 @@ macro_rules! register_hll {
/// ```
///
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
#[derive(Clone)]
pub struct HyperLogLogVec<const N: usize> {
core: Arc<HyperLogLogVecCore<N>>,
pub type HyperLogLogVec<L, const N: usize> = MetricVec<HyperLogLogState<N>, L>;
pub type HyperLogLog<const N: usize> = Metric<HyperLogLogState<N>>;
pub struct HyperLogLogState<const N: usize> {
shards: [AtomicU8; N],
}
struct HyperLogLogVecCore<const N: usize> {
pub children: RwLock<HashMap<u64, HyperLogLog<N>, BuildHasherDefault<xxh3::Hash64>>>,
pub desc: core::Desc,
pub opts: Opts,
}
impl<const N: usize> core::Collector for HyperLogLogVec<N> {
fn desc(&self) -> Vec<&core::Desc> {
vec![&self.core.desc]
}
fn collect(&self) -> Vec<proto::MetricFamily> {
let mut m = proto::MetricFamily::default();
m.set_name(self.core.desc.fq_name.clone());
m.set_help(self.core.desc.help.clone());
m.set_field_type(proto::MetricType::GAUGE);
let mut metrics = Vec::new();
for child in self.core.children.read().unwrap().values() {
child.core.collect_into(&mut metrics);
}
m.set_metric(metrics);
vec![m]
impl<const N: usize> Default for HyperLogLogState<N> {
fn default() -> Self {
#[allow(clippy::declare_interior_mutable_const)]
const ZERO: AtomicU8 = AtomicU8::new(0);
Self { shards: [ZERO; N] }
}
}
impl<const N: usize> HyperLogLogVec<N> {
/// Create a new [`HyperLogLogVec`] based on the provided
/// [`Opts`] and partitioned by the given label names. At least one label name must be
/// provided.
pub fn new(opts: Opts, label_names: &[&str]) -> prometheus::Result<Self> {
assert!(N.is_power_of_two());
let variable_names = label_names.iter().map(|s| (*s).to_owned()).collect();
let opts = opts.variable_labels(variable_names);
let desc = opts.describe()?;
let v = HyperLogLogVecCore {
children: RwLock::new(HashMap::default()),
desc,
opts,
};
Ok(Self { core: Arc::new(v) })
}
/// `get_metric_with_label_values` returns the [`HyperLogLog<P>`] for the given slice
/// of label values (same order as the VariableLabels in Desc). If that combination of
/// label values is accessed for the first time, a new [`HyperLogLog<P>`] is created.
///
/// An error is returned if the number of label values is not the same as the
/// number of VariableLabels in Desc.
pub fn get_metric_with_label_values(
&self,
vals: &[&str],
) -> prometheus::Result<HyperLogLog<N>> {
self.core.get_metric_with_label_values(vals)
}
/// `with_label_values` works as `get_metric_with_label_values`, but panics if an error
/// occurs.
pub fn with_label_values(&self, vals: &[&str]) -> HyperLogLog<N> {
self.get_metric_with_label_values(vals).unwrap()
}
impl<const N: usize> MetricType for HyperLogLogState<N> {
type Metadata = ();
}
impl<const N: usize> HyperLogLogVecCore<N> {
pub fn get_metric_with_label_values(
&self,
vals: &[&str],
) -> prometheus::Result<HyperLogLog<N>> {
let h = self.hash_label_values(vals)?;
if let Some(metric) = self.children.read().unwrap().get(&h).cloned() {
return Ok(metric);
}
self.get_or_create_metric(h, vals)
}
pub(crate) fn hash_label_values(&self, vals: &[&str]) -> prometheus::Result<u64> {
if vals.len() != self.desc.variable_labels.len() {
return Err(prometheus::Error::InconsistentCardinality {
expect: self.desc.variable_labels.len(),
got: vals.len(),
});
}
let mut h = xxh3::Hash64::default();
for val in vals {
h.write(val.as_bytes());
}
Ok(h.finish())
}
fn get_or_create_metric(
&self,
hash: u64,
label_values: &[&str],
) -> prometheus::Result<HyperLogLog<N>> {
let mut children = self.children.write().unwrap();
// Check exist first.
if let Some(metric) = children.get(&hash).cloned() {
return Ok(metric);
}
let metric = HyperLogLog::with_opts_and_label_values(&self.opts, label_values)?;
children.insert(hash, metric.clone());
Ok(metric)
}
}
/// HLL is a probabilistic cardinality measure.
///
/// How to use this time-series for a metric name `my_metrics_total_hll`:
///
/// ```promql
/// # harmonic mean
/// 1 / (
/// sum (
/// 2 ^ -(
/// # HLL merge operation
/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...)
/// )
/// ) without (hll_shard)
/// )
/// * alpha
/// * shards_count
/// * shards_count
/// ```
///
/// If you want an estimate over time, you can use the following query:
///
/// ```promql
/// # harmonic mean
/// 1 / (
/// sum (
/// 2 ^ -(
/// # HLL merge operation
/// max (
/// max_over_time(my_metrics_total_hll{}[$__rate_interval])
/// ) by (hll_shard, other_labels...)
/// )
/// ) without (hll_shard)
/// )
/// * alpha
/// * shards_count
/// * shards_count
/// ```
///
/// In the case of low cardinality, you might want to use the linear counting approximation:
///
/// ```promql
/// # LinearCounting(m, V) = m log (m / V)
/// shards_count * ln(shards_count /
/// # calculate V = how many shards contain a 0
/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard)
/// )
/// ```
///
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
#[derive(Clone)]
pub struct HyperLogLog<const N: usize> {
core: Arc<HyperLogLogCore<N>>,
}
impl<const N: usize> HyperLogLog<N> {
/// Create a [`HyperLogLog`] with the `name` and `help` arguments.
pub fn new<S1: Into<String>, S2: Into<String>>(name: S1, help: S2) -> prometheus::Result<Self> {
assert!(N.is_power_of_two());
let opts = Opts::new(name, help);
Self::with_opts(opts)
}
/// Create a [`HyperLogLog`] with the `opts` options.
pub fn with_opts(opts: Opts) -> prometheus::Result<Self> {
Self::with_opts_and_label_values(&opts, &[])
}
fn with_opts_and_label_values(opts: &Opts, label_values: &[&str]) -> prometheus::Result<Self> {
let desc = opts.describe()?;
let labels = make_label_pairs(&desc, label_values)?;
let v = HyperLogLogCore {
shards: [0; N].map(AtomicU8::new),
desc,
labels,
};
Ok(Self { core: Arc::new(v) })
}
impl<const N: usize> HyperLogLogState<N> {
pub fn measure(&self, item: &impl Hash) {
// changing the hasher will break compatibility with previous measurements.
self.record(BuildHasherDefault::<xxh3::Hash64>::default().hash_one(item));
@@ -299,42 +126,11 @@ impl<const N: usize> HyperLogLog<N> {
let p = N.ilog2() as u8;
let j = hash & (N as u64 - 1);
let rho = (hash >> p).leading_zeros() as u8 + 1 - p;
self.core.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed);
}
}
struct HyperLogLogCore<const N: usize> {
shards: [AtomicU8; N],
desc: core::Desc,
labels: Vec<proto::LabelPair>,
}
impl<const N: usize> core::Collector for HyperLogLog<N> {
fn desc(&self) -> Vec<&core::Desc> {
vec![&self.core.desc]
self.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed);
}
fn collect(&self) -> Vec<proto::MetricFamily> {
let mut m = proto::MetricFamily::default();
m.set_name(self.core.desc.fq_name.clone());
m.set_help(self.core.desc.help.clone());
m.set_field_type(proto::MetricType::GAUGE);
let mut metrics = Vec::new();
self.core.collect_into(&mut metrics);
m.set_metric(metrics);
vec![m]
}
}
impl<const N: usize> HyperLogLogCore<N> {
fn collect_into(&self, metrics: &mut Vec<proto::Metric>) {
self.shards.iter().enumerate().for_each(|(i, x)| {
let mut shard_label = proto::LabelPair::default();
shard_label.set_name("hll_shard".to_owned());
shard_label.set_value(format!("{i}"));
fn take_sample(&self) -> [u8; N] {
self.shards.each_ref().map(|x| {
// We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus.
// This seems like it would be a race condition,
@@ -344,85 +140,90 @@ impl<const N: usize> HyperLogLogCore<N> {
// TODO: maybe we shouldn't reset this on every collect, instead, only after a time window.
// this would mean that a dev port-forwarding the metrics url won't break the sampling.
let v = x.swap(0, std::sync::atomic::Ordering::Relaxed);
let mut m = proto::Metric::default();
let mut c = proto::Gauge::default();
c.set_value(v as f64);
m.set_gauge(c);
let mut labels = Vec::with_capacity(self.labels.len() + 1);
labels.extend_from_slice(&self.labels);
labels.push(shard_label);
m.set_label(labels);
metrics.push(m);
x.swap(0, std::sync::atomic::Ordering::Relaxed)
})
}
}
fn make_label_pairs(
desc: &core::Desc,
label_values: &[&str],
) -> prometheus::Result<Vec<proto::LabelPair>> {
if desc.variable_labels.len() != label_values.len() {
return Err(prometheus::Error::InconsistentCardinality {
expect: desc.variable_labels.len(),
got: label_values.len(),
});
impl<W: std::io::Write, const N: usize> measured::metric::MetricEncoding<TextEncoder<W>>
for HyperLogLogState<N>
{
fn write_type(
name: impl MetricNameEncoder,
enc: &mut TextEncoder<W>,
) -> Result<(), std::io::Error> {
enc.write_type(&name, measured::text::MetricType::Gauge)
}
fn collect_into(
&self,
_: &(),
labels: impl LabelGroup,
name: impl MetricNameEncoder,
enc: &mut TextEncoder<W>,
) -> Result<(), std::io::Error> {
struct I64(i64);
impl LabelValue for I64 {
fn visit<V: LabelVisitor>(&self, v: V) -> V::Output {
v.write_int(self.0)
}
}
let total_len = desc.variable_labels.len() + desc.const_label_pairs.len();
if total_len == 0 {
return Ok(vec![]);
}
struct HllShardLabel {
hll_shard: i64,
}
if desc.variable_labels.is_empty() {
return Ok(desc.const_label_pairs.clone());
}
impl LabelGroup for HllShardLabel {
fn visit_values(&self, v: &mut impl LabelGroupVisitor) {
const LE: &LabelName = LabelName::from_str("hll_shard");
v.write_value(LE, &I64(self.hll_shard));
}
}
let mut label_pairs = Vec::with_capacity(total_len);
for (i, n) in desc.variable_labels.iter().enumerate() {
let mut label_pair = proto::LabelPair::default();
label_pair.set_name(n.clone());
label_pair.set_value(label_values[i].to_owned());
label_pairs.push(label_pair);
self.take_sample()
.into_iter()
.enumerate()
.try_for_each(|(hll_shard, val)| {
enc.write_metric_value(
name.by_ref(),
labels.by_ref().compose_with(HllShardLabel {
hll_shard: hll_shard as i64,
}),
MetricValue::Int(val as i64),
)
})
}
for label_pair in &desc.const_label_pairs {
label_pairs.push(label_pair.clone());
}
label_pairs.sort();
Ok(label_pairs)
}
#[cfg(test)]
mod tests {
use std::collections::HashSet;
use prometheus::{proto, Opts};
use measured::{label::StaticLabelSet, FixedCardinalityLabel};
use rand::{rngs::StdRng, Rng, SeedableRng};
use rand_distr::{Distribution, Zipf};
use crate::HyperLogLogVec;
fn collect(hll: &HyperLogLogVec<32>) -> Vec<proto::Metric> {
let mut metrics = vec![];
hll.core
.children
.read()
.unwrap()
.values()
.for_each(|c| c.core.collect_into(&mut metrics));
metrics
#[derive(FixedCardinalityLabel, Clone, Copy)]
#[label(singleton = "x")]
enum Label {
A,
B,
}
fn get_cardinality(metrics: &[proto::Metric], filter: impl Fn(&proto::Metric) -> bool) -> f64 {
fn collect(hll: &HyperLogLogVec<StaticLabelSet<Label>, 32>) -> ([u8; 32], [u8; 32]) {
// cannot go through the `hll.collect_family_into` interface yet...
// need to see if I can fix the conflicting impls problem in measured.
(
hll.get_metric(hll.with_labels(Label::A)).take_sample(),
hll.get_metric(hll.with_labels(Label::B)).take_sample(),
)
}
fn get_cardinality(samples: &[[u8; 32]]) -> f64 {
let mut buckets = [0.0; 32];
for metric in metrics.chunks_exact(32) {
if filter(&metric[0]) {
for (i, m) in metric.iter().enumerate() {
buckets[i] = f64::max(buckets[i], m.get_gauge().get_value());
}
for &sample in samples {
for (i, m) in sample.into_iter().enumerate() {
buckets[i] = f64::max(buckets[i], m as f64);
}
}
@@ -437,7 +238,7 @@ mod tests {
}
fn test_cardinality(n: usize, dist: impl Distribution<f64>) -> ([usize; 3], [f64; 3]) {
let hll = HyperLogLogVec::<32>::new(Opts::new("foo", "bar"), &["x"]).unwrap();
let hll = HyperLogLogVec::<StaticLabelSet<Label>, 32>::new();
let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist);
let mut set_a = HashSet::new();
@@ -445,18 +246,20 @@ mod tests {
for x in iter.by_ref().take(n) {
set_a.insert(x.to_bits());
hll.with_label_values(&["a"]).measure(&x.to_bits());
hll.get_metric(hll.with_labels(Label::A))
.measure(&x.to_bits());
}
for x in iter.by_ref().take(n) {
set_b.insert(x.to_bits());
hll.with_label_values(&["b"]).measure(&x.to_bits());
hll.get_metric(hll.with_labels(Label::B))
.measure(&x.to_bits());
}
let merge = &set_a | &set_b;
let metrics = collect(&hll);
let len = get_cardinality(&metrics, |_| true);
let len_a = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "a");
let len_b = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "b");
let (a, b) = collect(&hll);
let len = get_cardinality(&[a, b]);
let len_a = get_cardinality(&[a]);
let len_b = get_cardinality(&[b]);
([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b])
}

View File

@@ -4,6 +4,17 @@
//! a default registry.
#![deny(clippy::undocumented_unsafe_blocks)]
use measured::{
label::{LabelGroupSet, LabelGroupVisitor, LabelName, NoLabels},
metric::{
counter::CounterState,
gauge::GaugeState,
group::{Encoding, MetricValue},
name::{MetricName, MetricNameEncoder},
MetricEncoding, MetricFamilyEncoding,
},
FixedCardinalityLabel, LabelGroup, MetricGroup,
};
use once_cell::sync::Lazy;
use prometheus::core::{
Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec,
@@ -11,6 +22,7 @@ use prometheus::core::{
pub use prometheus::opts;
pub use prometheus::register;
pub use prometheus::Error;
use prometheus::Registry;
pub use prometheus::{core, default_registry, proto};
pub use prometheus::{exponential_buckets, linear_buckets};
pub use prometheus::{register_counter_vec, Counter, CounterVec};
@@ -23,13 +35,12 @@ pub use prometheus::{register_int_counter_vec, IntCounterVec};
pub use prometheus::{register_int_gauge, IntGauge};
pub use prometheus::{register_int_gauge_vec, IntGaugeVec};
pub use prometheus::{Encoder, TextEncoder};
use prometheus::{Registry, Result};
pub mod launch_timestamp;
mod wrappers;
pub use wrappers::{CountedReader, CountedWriter};
mod hll;
pub use hll::{HyperLogLog, HyperLogLogVec};
pub use hll::{HyperLogLog, HyperLogLogState, HyperLogLogVec};
#[cfg(target_os = "linux")]
pub mod more_process_metrics;
@@ -59,7 +70,7 @@ static INTERNAL_REGISTRY: Lazy<Registry> = Lazy::new(Registry::new);
/// Register a collector in the internal registry. MUST be called before the first call to `gather()`.
/// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector
/// while holding the lock.
pub fn register_internal(c: Box<dyn Collector>) -> Result<()> {
pub fn register_internal(c: Box<dyn Collector>) -> prometheus::Result<()> {
INTERNAL_REGISTRY.register(c)
}
@@ -96,6 +107,127 @@ pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,
];
pub struct BuildInfo {
pub revision: &'static str,
pub build_tag: &'static str,
}
// todo: allow label group without the set
impl LabelGroup for BuildInfo {
fn visit_values(&self, v: &mut impl LabelGroupVisitor) {
const REVISION: &LabelName = LabelName::from_str("revision");
v.write_value(REVISION, &self.revision);
const BUILD_TAG: &LabelName = LabelName::from_str("build_tag");
v.write_value(BUILD_TAG, &self.build_tag);
}
}
impl<T: Encoding> MetricFamilyEncoding<T> for BuildInfo
where
GaugeState: MetricEncoding<T>,
{
fn collect_family_into(
&self,
name: impl measured::metric::name::MetricNameEncoder,
enc: &mut T,
) -> Result<(), T::Err> {
enc.write_help(&name, "Build/version information")?;
GaugeState::write_type(&name, enc)?;
GaugeState {
count: std::sync::atomic::AtomicI64::new(1),
}
.collect_into(&(), self, name, enc)
}
}
#[derive(MetricGroup)]
#[metric(new(build_info: BuildInfo))]
pub struct NeonMetrics {
#[cfg(target_os = "linux")]
#[metric(namespace = "process")]
#[metric(init = measured_process::ProcessCollector::for_self())]
process: measured_process::ProcessCollector,
#[metric(namespace = "libmetrics")]
#[metric(init = LibMetrics::new(build_info))]
libmetrics: LibMetrics,
}
#[derive(MetricGroup)]
#[metric(new(build_info: BuildInfo))]
pub struct LibMetrics {
#[metric(init = build_info)]
build_info: BuildInfo,
#[metric(flatten)]
rusage: Rusage,
serve_count: CollectionCounter,
}
fn write_gauge<Enc: Encoding>(
x: i64,
labels: impl LabelGroup,
name: impl MetricNameEncoder,
enc: &mut Enc,
) -> Result<(), Enc::Err> {
enc.write_metric_value(name, labels, MetricValue::Int(x))
}
#[derive(Default)]
struct Rusage;
#[derive(FixedCardinalityLabel, Clone, Copy)]
#[label(singleton = "io_operation")]
enum IoOp {
Read,
Write,
}
impl<T: Encoding> MetricGroup<T> for Rusage
where
GaugeState: MetricEncoding<T>,
{
fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
const DISK_IO: &MetricName = MetricName::from_str("disk_io_bytes_total");
const MAXRSS: &MetricName = MetricName::from_str("maxrss_kb");
let ru = get_rusage_stats();
enc.write_help(
DISK_IO,
"Bytes written and read from disk, grouped by the operation (read|write)",
)?;
GaugeState::write_type(DISK_IO, enc)?;
write_gauge(ru.ru_inblock * BYTES_IN_BLOCK, IoOp::Read, DISK_IO, enc)?;
write_gauge(ru.ru_oublock * BYTES_IN_BLOCK, IoOp::Write, DISK_IO, enc)?;
enc.write_help(MAXRSS, "Memory usage (Maximum Resident Set Size)")?;
GaugeState::write_type(MAXRSS, enc)?;
write_gauge(ru.ru_maxrss, IoOp::Read, MAXRSS, enc)?;
Ok(())
}
}
#[derive(Default)]
struct CollectionCounter(CounterState);
impl<T: Encoding> MetricFamilyEncoding<T> for CollectionCounter
where
CounterState: MetricEncoding<T>,
{
fn collect_family_into(
&self,
name: impl measured::metric::name::MetricNameEncoder,
enc: &mut T,
) -> Result<(), T::Err> {
self.0.inc();
enc.write_help(&name, "Number of metric requests made")?;
self.0.collect_into(&(), NoLabels, name, enc)
}
}
pub fn set_build_info_metric(revision: &str, build_tag: &str) {
let metric = register_int_gauge_vec!(
"libmetrics_build_info",
@@ -105,6 +237,7 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) {
.expect("Failed to register build info metric");
metric.with_label_values(&[revision, build_tag]).set(1);
}
const BYTES_IN_BLOCK: i64 = 512;
// Records I/O stats in a "cross-platform" way.
// Compiles both on macOS and Linux, but current macOS implementation always returns 0 as values for I/O stats.
@@ -117,7 +250,6 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) {
fn update_rusage_metrics() {
let rusage_stats = get_rusage_stats();
const BYTES_IN_BLOCK: i64 = 512;
DISK_IO_BYTES
.with_label_values(&["read"])
.set(rusage_stats.ru_inblock * BYTES_IN_BLOCK);
@@ -151,6 +283,7 @@ macro_rules! register_int_counter_pair_vec {
}
}};
}
/// Create an [`IntCounterPair`] and registers to default registry.
#[macro_export(local_inner_macros)]
macro_rules! register_int_counter_pair {
@@ -188,7 +321,10 @@ impl<P: Atomic> GenericCounterPairVec<P> {
///
/// An error is returned if the number of label values is not the same as the
/// number of VariableLabels in Desc.
pub fn get_metric_with_label_values(&self, vals: &[&str]) -> Result<GenericCounterPair<P>> {
pub fn get_metric_with_label_values(
&self,
vals: &[&str],
) -> prometheus::Result<GenericCounterPair<P>> {
Ok(GenericCounterPair {
inc: self.inc.get_metric_with_label_values(vals)?,
dec: self.dec.get_metric_with_label_values(vals)?,
@@ -201,7 +337,7 @@ impl<P: Atomic> GenericCounterPairVec<P> {
self.get_metric_with_label_values(vals).unwrap()
}
pub fn remove_label_values(&self, res: &mut [Result<()>; 2], vals: &[&str]) {
pub fn remove_label_values(&self, res: &mut [prometheus::Result<()>; 2], vals: &[&str]) {
res[0] = self.inc.remove_label_values(vals);
res[1] = self.dec.remove_label_values(vals);
}
@@ -285,3 +421,171 @@ pub type IntCounterPair = GenericCounterPair<AtomicU64>;
/// A guard for [`IntCounterPair`] that will decrement the gauge on drop
pub type IntCounterPairGuard = GenericCounterPairGuard<AtomicU64>;
pub trait CounterPairAssoc {
const INC_NAME: &'static MetricName;
const DEC_NAME: &'static MetricName;
const INC_HELP: &'static str;
const DEC_HELP: &'static str;
type LabelGroupSet: LabelGroupSet;
}
pub struct CounterPairVec<A: CounterPairAssoc> {
vec: measured::metric::MetricVec<MeasuredCounterPairState, A::LabelGroupSet>,
}
impl<A: CounterPairAssoc> Default for CounterPairVec<A>
where
A::LabelGroupSet: Default,
{
fn default() -> Self {
Self {
vec: Default::default(),
}
}
}
impl<A: CounterPairAssoc> CounterPairVec<A> {
pub fn guard(
&self,
labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>,
) -> MeasuredCounterPairGuard<'_, A> {
let id = self.vec.with_labels(labels);
self.vec.get_metric(id).inc.inc();
MeasuredCounterPairGuard { vec: &self.vec, id }
}
pub fn inc(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>) {
let id = self.vec.with_labels(labels);
self.vec.get_metric(id).inc.inc();
}
pub fn dec(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>) {
let id = self.vec.with_labels(labels);
self.vec.get_metric(id).dec.inc();
}
pub fn remove_metric(
&self,
labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>,
) -> Option<MeasuredCounterPairState> {
let id = self.vec.with_labels(labels);
self.vec.remove_metric(id)
}
}
impl<T, A> ::measured::metric::group::MetricGroup<T> for CounterPairVec<A>
where
T: ::measured::metric::group::Encoding,
A: CounterPairAssoc,
::measured::metric::counter::CounterState: ::measured::metric::MetricEncoding<T>,
{
fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
// write decrement first to avoid a race condition where inc - dec < 0
T::write_help(enc, A::DEC_NAME, A::DEC_HELP)?;
self.vec
.collect_family_into(A::DEC_NAME, &mut Dec(&mut *enc))?;
T::write_help(enc, A::INC_NAME, A::INC_HELP)?;
self.vec
.collect_family_into(A::INC_NAME, &mut Inc(&mut *enc))?;
Ok(())
}
}
#[derive(MetricGroup, Default)]
pub struct MeasuredCounterPairState {
pub inc: CounterState,
pub dec: CounterState,
}
impl measured::metric::MetricType for MeasuredCounterPairState {
type Metadata = ();
}
pub struct MeasuredCounterPairGuard<'a, A: CounterPairAssoc> {
vec: &'a measured::metric::MetricVec<MeasuredCounterPairState, A::LabelGroupSet>,
id: measured::metric::LabelId<A::LabelGroupSet>,
}
impl<A: CounterPairAssoc> Drop for MeasuredCounterPairGuard<'_, A> {
fn drop(&mut self) {
self.vec.get_metric(self.id).dec.inc();
}
}
/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the inc counter to the inner encoder.
struct Inc<T>(T);
/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the dec counter to the inner encoder.
struct Dec<T>(T);
impl<T: Encoding> Encoding for Inc<T> {
type Err = T::Err;
fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
self.0.write_help(name, help)
}
fn write_metric_value(
&mut self,
name: impl MetricNameEncoder,
labels: impl LabelGroup,
value: MetricValue,
) -> Result<(), Self::Err> {
self.0.write_metric_value(name, labels, value)
}
}
impl<T: Encoding> MetricEncoding<Inc<T>> for MeasuredCounterPairState
where
CounterState: MetricEncoding<T>,
{
fn write_type(name: impl MetricNameEncoder, enc: &mut Inc<T>) -> Result<(), T::Err> {
CounterState::write_type(name, &mut enc.0)
}
fn collect_into(
&self,
metadata: &(),
labels: impl LabelGroup,
name: impl MetricNameEncoder,
enc: &mut Inc<T>,
) -> Result<(), T::Err> {
self.inc.collect_into(metadata, labels, name, &mut enc.0)
}
}
impl<T: Encoding> Encoding for Dec<T> {
type Err = T::Err;
fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
self.0.write_help(name, help)
}
fn write_metric_value(
&mut self,
name: impl MetricNameEncoder,
labels: impl LabelGroup,
value: MetricValue,
) -> Result<(), Self::Err> {
self.0.write_metric_value(name, labels, value)
}
}
/// Write the dec counter to the encoder
impl<T: Encoding> MetricEncoding<Dec<T>> for MeasuredCounterPairState
where
CounterState: MetricEncoding<T>,
{
fn write_type(name: impl MetricNameEncoder, enc: &mut Dec<T>) -> Result<(), T::Err> {
CounterState::write_type(name, &mut enc.0)
}
fn collect_into(
&self,
metadata: &(),
labels: impl LabelGroup,
name: impl MetricNameEncoder,
enc: &mut Dec<T>,
) -> Result<(), T::Err> {
self.dec.collect_into(metadata, labels, name, &mut enc.0)
}
}

View File

@@ -20,6 +20,7 @@ use utils::{
history_buffer::HistoryBufferWithDropCounter,
id::{NodeId, TenantId, TimelineId},
lsn::Lsn,
serde_system_time,
};
use crate::controller_api::PlacementPolicy;
@@ -758,11 +759,7 @@ pub struct WalRedoManagerStatus {
#[derive(Default, Debug, Serialize, Deserialize, Clone)]
pub struct SecondaryProgress {
/// The remote storage LastModified time of the heatmap object we last downloaded.
#[serde(
serialize_with = "opt_ser_rfc3339_millis",
deserialize_with = "opt_deser_rfc3339_millis"
)]
pub heatmap_mtime: Option<SystemTime>,
pub heatmap_mtime: Option<serde_system_time::SystemTime>,
/// The number of layers currently on-disk
pub layers_downloaded: usize,
@@ -775,29 +772,6 @@ pub struct SecondaryProgress {
pub bytes_total: u64,
}
fn opt_ser_rfc3339_millis<S: serde::Serializer>(
ts: &Option<SystemTime>,
serializer: S,
) -> Result<S::Ok, S::Error> {
match ts {
Some(ts) => serializer.collect_str(&humantime::format_rfc3339_millis(*ts)),
None => serializer.serialize_none(),
}
}
fn opt_deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<Option<SystemTime>, D::Error>
where
D: serde::de::Deserializer<'de>,
{
let s: Option<String> = serde::de::Deserialize::deserialize(deserializer)?;
match s {
None => Ok(None),
Some(s) => humantime::parse_rfc3339(&s)
.map_err(serde::de::Error::custom)
.map(Some),
}
}
pub mod virtual_file {
#[derive(
Copy,

View File

@@ -1,4 +1,4 @@
use std::time::SystemTime;
use utils::serde_system_time::SystemTime;
/// Pageserver current utilization and scoring for how good candidate the pageserver would be for
/// the next tenant.
@@ -21,28 +21,9 @@ pub struct PageserverUtilization {
/// When was this snapshot captured, pageserver local time.
///
/// Use millis to give confidence that the value is regenerated often enough.
#[serde(
serialize_with = "ser_rfc3339_millis",
deserialize_with = "deser_rfc3339_millis"
)]
pub captured_at: SystemTime,
}
fn ser_rfc3339_millis<S: serde::Serializer>(
ts: &SystemTime,
serializer: S,
) -> Result<S::Ok, S::Error> {
serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
}
fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<SystemTime, D::Error>
where
D: serde::de::Deserializer<'de>,
{
let s: String = serde::de::Deserialize::deserialize(deserializer)?;
humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom)
}
/// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
///
/// Instead of newtype, use this because a newtype would get require handling deserializing values
@@ -69,7 +50,9 @@ mod tests {
disk_usage_bytes: u64::MAX,
free_space_bytes: 0,
utilization_score: u64::MAX,
captured_at: SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
captured_at: SystemTime(
std::time::SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
),
};
let s = serde_json::to_string(&doc).unwrap();

View File

@@ -22,6 +22,7 @@ camino.workspace = true
chrono.workspace = true
heapless.workspace = true
hex = { workspace = true, features = ["serde"] }
humantime.workspace = true
hyper = { workspace = true, features = ["full"] }
fail.workspace = true
futures = { workspace = true}

21
libs/utils/src/env.rs Normal file
View File

@@ -0,0 +1,21 @@
//! Wrapper around `std::env::var` for parsing environment variables.
use std::{fmt::Display, str::FromStr};
pub fn var<V, E>(varname: &str) -> Option<V>
where
V: FromStr<Err = E>,
E: Display,
{
match std::env::var(varname) {
Ok(s) => Some(
s.parse()
.map_err(|e| format!("failed to parse env var {varname}: {e:#}"))
.unwrap(),
),
Err(std::env::VarError::NotPresent) => None,
Err(std::env::VarError::NotUnicode(_)) => {
panic!("env var {varname} is not unicode")
}
}
}

View File

@@ -63,6 +63,7 @@ pub mod measured_stream;
pub mod serde_percent;
pub mod serde_regex;
pub mod serde_system_time;
pub mod pageserver_feedback;
@@ -89,6 +90,8 @@ pub mod yielding_loop;
pub mod zstd;
pub mod env;
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
///
/// we have several cases:

View File

@@ -0,0 +1,55 @@
//! A `serde::{Deserialize,Serialize}` type for SystemTime with RFC3339 format and millisecond precision.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, serde::Serialize, serde::Deserialize)]
#[serde(transparent)]
pub struct SystemTime(
#[serde(
deserialize_with = "deser_rfc3339_millis",
serialize_with = "ser_rfc3339_millis"
)]
pub std::time::SystemTime,
);
fn ser_rfc3339_millis<S: serde::ser::Serializer>(
ts: &std::time::SystemTime,
serializer: S,
) -> Result<S::Ok, S::Error> {
serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
}
fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<std::time::SystemTime, D::Error>
where
D: serde::de::Deserializer<'de>,
{
let s: String = serde::de::Deserialize::deserialize(deserializer)?;
humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom)
}
#[cfg(test)]
mod tests {
use super::*;
/// Helper function to make a SystemTime have millisecond precision by truncating additional nanoseconds.
fn to_millisecond_precision(time: SystemTime) -> SystemTime {
match time.0.duration_since(std::time::SystemTime::UNIX_EPOCH) {
Ok(duration) => {
let total_millis = duration.as_secs() * 1_000 + u64::from(duration.subsec_millis());
SystemTime(
std::time::SystemTime::UNIX_EPOCH
+ std::time::Duration::from_millis(total_millis),
)
}
Err(_) => time,
}
}
#[test]
fn test_serialize_deserialize() {
let input = SystemTime(std::time::SystemTime::now());
let expected_serialized = format!("\"{}\"", humantime::format_rfc3339_millis(input.0));
let serialized = serde_json::to_string(&input).unwrap();
assert_eq!(expected_serialized, serialized);
let deserialized: SystemTime = serde_json::from_str(&expected_serialized).unwrap();
assert_eq!(to_millisecond_precision(input), deserialized);
}
}

View File

@@ -11,7 +11,6 @@ default = []
anyhow.workspace = true
async-compression.workspace = true
async-stream.workspace = true
async-trait.workspace = true
byteorder.workspace = true
bytes.workspace = true
chrono = { workspace = true, features = ["serde"] }

View File

@@ -180,7 +180,7 @@ where
match top.deref_mut() {
LazyLoadLayer::Unloaded(ref mut l) => {
let fut = l.load_keys(this.ctx);
this.load_future.set(Some(fut));
this.load_future.set(Some(Box::pin(fut)));
continue;
}
LazyLoadLayer::Loaded(ref mut entries) => {

View File

@@ -3,7 +3,6 @@
//!
//! All the heavy lifting is done by the create_image and create_delta
//! functions that the implementor provides.
use async_trait::async_trait;
use futures::Future;
use pageserver_api::{key::Key, keyspace::key_range_size};
use std::ops::Range;
@@ -141,18 +140,16 @@ pub trait CompactionLayer<K: CompactionKey + ?Sized> {
fn is_delta(&self) -> bool;
}
#[async_trait]
pub trait CompactionDeltaLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {
type DeltaEntry<'a>: CompactionDeltaEntry<'a, E::Key>
where
Self: 'a;
/// Return all keys in this delta layer.
async fn load_keys<'a>(
fn load_keys<'a>(
&self,
ctx: &E::RequestContext,
) -> anyhow::Result<Vec<Self::DeltaEntry<'_>>>;
) -> impl Future<Output = anyhow::Result<Vec<Self::DeltaEntry<'_>>>> + Send;
}
pub trait CompactionImageLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {}

View File

@@ -2,7 +2,6 @@ mod draw;
use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp};
use async_trait::async_trait;
use futures::StreamExt;
use rand::Rng;
use tracing::info;
@@ -139,7 +138,6 @@ impl interface::CompactionLayer<Key> for Arc<MockDeltaLayer> {
}
}
#[async_trait]
impl interface::CompactionDeltaLayer<MockTimeline> for Arc<MockDeltaLayer> {
type DeltaEntry<'a> = MockRecord;

View File

@@ -12,9 +12,14 @@ bytes.workspace = true
camino.workspace = true
clap = { workspace = true, features = ["string"] }
git-version.workspace = true
humantime.workspace = true
pageserver = { path = ".." }
pageserver_api.workspace = true
remote_storage = { path = "../../libs/remote_storage" }
postgres_ffi.workspace = true
tokio.workspace = true
tokio-util.workspace = true
toml_edit.workspace = true
utils.workspace = true
svg_fmt.workspace = true
workspace_hack.workspace = true

View File

@@ -9,6 +9,11 @@ mod index_part;
mod layer_map_analyzer;
mod layers;
use std::{
str::FromStr,
time::{Duration, SystemTime},
};
use camino::{Utf8Path, Utf8PathBuf};
use clap::{Parser, Subcommand};
use index_part::IndexPartCmd;
@@ -20,8 +25,16 @@ use pageserver::{
tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
virtual_file,
};
use pageserver_api::shard::TenantShardId;
use postgres_ffi::ControlFileData;
use utils::{lsn::Lsn, project_git_version};
use remote_storage::{RemotePath, RemoteStorageConfig};
use tokio_util::sync::CancellationToken;
use utils::{
id::TimelineId,
logging::{self, LogFormat, TracingErrorLayerEnablement},
lsn::Lsn,
project_git_version,
};
project_git_version!(GIT_VERSION);
@@ -43,6 +56,7 @@ enum Commands {
#[command(subcommand)]
IndexPart(IndexPartCmd),
PrintLayerFile(PrintLayerFileCmd),
TimeTravelRemotePrefix(TimeTravelRemotePrefixCmd),
DrawTimeline {},
AnalyzeLayerMap(AnalyzeLayerMapCmd),
#[command(subcommand)]
@@ -68,6 +82,26 @@ struct PrintLayerFileCmd {
path: Utf8PathBuf,
}
/// Roll back the time for the specified prefix using S3 history.
///
/// The command is fairly low level and powerful. Validation is only very light,
/// so it is more powerful, and thus potentially more dangerous.
#[derive(Parser)]
struct TimeTravelRemotePrefixCmd {
/// A configuration string for the remote_storage configuration.
///
/// Example: `remote_storage = { bucket_name = "aws-storage-bucket-name", bucket_region = "us-east-2" }`
config_toml_str: String,
/// remote prefix to time travel recover. For safety reasons, we require it to contain
/// a timeline or tenant ID in the prefix.
prefix: String,
/// Timestamp to travel to. Given in format like `2024-01-20T10:45:45Z`. Assumes UTC and second accuracy.
travel_to: String,
/// Timestamp of the start of the operation, must be after any changes we want to roll back and after.
/// You can use a few seconds before invoking the command. Same format as `travel_to`.
done_if_after: Option<String>,
}
#[derive(Parser)]
struct AnalyzeLayerMapCmd {
/// Pageserver data path
@@ -78,6 +112,14 @@ struct AnalyzeLayerMapCmd {
#[tokio::main]
async fn main() -> anyhow::Result<()> {
logging::init(
LogFormat::Plain,
TracingErrorLayerEnablement::EnableWithRustLogFilter,
logging::Output::Stdout,
)?;
logging::replace_panic_hook_with_tracing_panic_hook().forget();
let cli = CliOpts::parse();
match cli.command {
@@ -105,6 +147,42 @@ async fn main() -> anyhow::Result<()> {
print_layerfile(&cmd.path).await?;
}
}
Commands::TimeTravelRemotePrefix(cmd) => {
let timestamp = humantime::parse_rfc3339(&cmd.travel_to)
.map_err(|_e| anyhow::anyhow!("Invalid time for travel_to: '{}'", cmd.travel_to))?;
let done_if_after = if let Some(done_if_after) = &cmd.done_if_after {
humantime::parse_rfc3339(done_if_after).map_err(|_e| {
anyhow::anyhow!("Invalid time for done_if_after: '{}'", done_if_after)
})?
} else {
const SAFETY_MARGIN: Duration = Duration::from_secs(3);
tokio::time::sleep(SAFETY_MARGIN).await;
// Convert to string representation and back to get rid of sub-second values
let done_if_after = SystemTime::now();
tokio::time::sleep(SAFETY_MARGIN).await;
done_if_after
};
let timestamp = strip_subsecond(timestamp);
let done_if_after = strip_subsecond(done_if_after);
let Some(prefix) = validate_prefix(&cmd.prefix) else {
println!("specified prefix '{}' failed validation", cmd.prefix);
return Ok(());
};
let toml_document = toml_edit::Document::from_str(&cmd.config_toml_str)?;
let toml_item = toml_document
.get("remote_storage")
.expect("need remote_storage");
let config = RemoteStorageConfig::from_toml(toml_item)?.expect("incomplete config");
let storage = remote_storage::GenericRemoteStorage::from_config(&config);
let cancel = CancellationToken::new();
storage
.unwrap()
.time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel)
.await?;
}
};
Ok(())
}
@@ -185,3 +263,89 @@ fn handle_metadata(
Ok(())
}
/// Ensures that the given S3 prefix is sufficiently constrained.
/// The command is very risky already and we don't want to expose something
/// that allows usually unintentional and quite catastrophic time travel of
/// an entire bucket, which would be a major catastrophy and away
/// by only one character change (similar to "rm -r /home /username/foobar").
fn validate_prefix(prefix: &str) -> Option<RemotePath> {
if prefix.is_empty() {
// Empty prefix means we want to specify the *whole* bucket
return None;
}
let components = prefix.split('/').collect::<Vec<_>>();
let (last, components) = {
let last = components.last()?;
if last.is_empty() {
(
components.iter().nth_back(1)?,
&components[..(components.len() - 1)],
)
} else {
(last, &components[..])
}
};
'valid: {
if let Ok(_timeline_id) = TimelineId::from_str(last) {
// Ends in either a tenant or timeline ID
break 'valid;
}
if *last == "timelines" {
if let Some(before_last) = components.iter().nth_back(1) {
if let Ok(_tenant_id) = TenantShardId::from_str(before_last) {
// Has a valid tenant id
break 'valid;
}
}
}
return None;
}
RemotePath::from_string(prefix).ok()
}
fn strip_subsecond(timestamp: SystemTime) -> SystemTime {
let ts_str = humantime::format_rfc3339_seconds(timestamp).to_string();
humantime::parse_rfc3339(&ts_str).expect("can't parse just created timestamp")
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_validate_prefix() {
assert_eq!(validate_prefix(""), None);
assert_eq!(validate_prefix("/"), None);
#[track_caller]
fn assert_valid(prefix: &str) {
let remote_path = RemotePath::from_string(prefix).unwrap();
assert_eq!(validate_prefix(prefix), Some(remote_path));
}
assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/");
// Path is not relative but absolute
assert_eq!(
validate_prefix(
"/wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/"
),
None
);
assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/");
// Partial tenant IDs should be invalid, S3 will match all tenants with the specific ID prefix
assert_eq!(validate_prefix("wal/3aa8fcc61f6d357410b7d"), None);
assert_eq!(validate_prefix("wal"), None);
assert_eq!(validate_prefix("/wal/"), None);
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001");
// Partial tenant ID
assert_eq!(
validate_prefix("pageserver/v1/tenants/3aa8fcc61f6d357410b"),
None
);
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines");
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001-0004/timelines");
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/");
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/641e5342083b2235ee3deb8066819683");
assert_eq!(validate_prefix("pageserver/v1/tenants/"), None);
}
}

View File

@@ -18,6 +18,7 @@ use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
use pageserver::tenant::{secondary, TenantSharedResources};
use remote_storage::GenericRemoteStorage;
use tokio::signal::unix::SignalKind;
use tokio::time::Instant;
use tracing::*;
@@ -671,42 +672,37 @@ fn start_pageserver(
let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
// All started up! Now just sit and wait for shutdown signal.
{
use signal_hook::consts::*;
let signal_handler = BACKGROUND_RUNTIME.spawn_blocking(move || {
let mut signals =
signal_hook::iterator::Signals::new([SIGINT, SIGTERM, SIGQUIT]).unwrap();
return signals
.forever()
.next()
.expect("forever() never returns None unless explicitly closed");
});
let signal = BACKGROUND_RUNTIME
.block_on(signal_handler)
.expect("join error");
match signal {
SIGQUIT => {
info!("Got signal {signal}. Terminating in immediate shutdown mode",);
std::process::exit(111);
}
SIGINT | SIGTERM => {
info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",);
// This cancels the `shutdown_pageserver` cancellation tree.
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
// The plan is to change that over time.
shutdown_pageserver.take();
let bg_remote_storage = remote_storage.clone();
let bg_deletion_queue = deletion_queue.clone();
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
&tenant_manager,
bg_remote_storage.map(|_| bg_deletion_queue),
0,
));
unreachable!()
}
_ => unreachable!(),
}
{
BACKGROUND_RUNTIME.block_on(async move {
let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap();
let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap();
let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap();
let signal = tokio::select! {
_ = sigquit.recv() => {
info!("Got signal SIGQUIT. Terminating in immediate shutdown mode",);
std::process::exit(111);
}
_ = sigint.recv() => { "SIGINT" },
_ = sigterm.recv() => { "SIGTERM" },
};
info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",);
// This cancels the `shutdown_pageserver` cancellation tree.
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
// The plan is to change that over time.
shutdown_pageserver.take();
let bg_remote_storage = remote_storage.clone();
let bg_deletion_queue = deletion_queue.clone();
pageserver::shutdown_pageserver(
&tenant_manager,
bg_remote_storage.map(|_| bg_deletion_queue),
0,
)
.await;
unreachable!()
})
}
}

View File

@@ -2100,6 +2100,7 @@ pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
use futures::Future;
use pin_project_lite::pin_project;
use std::collections::HashMap;
use std::num::NonZeroUsize;
use std::pin::Pin;
use std::sync::{Arc, Mutex};
use std::task::{Context, Poll};
@@ -2669,6 +2670,26 @@ pub(crate) mod disk_usage_based_eviction {
pub(crate) static METRICS: Lazy<Metrics> = Lazy::new(Metrics::default);
}
static TOKIO_EXECUTOR_THREAD_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_tokio_executor_thread_configured_count",
"Total number of configued tokio executor threads in the process.
The `setup` label denotes whether we're running with multiple runtimes or a single runtime.",
&["setup"],
)
.unwrap()
});
pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) {
static SERIALIZE: std::sync::Mutex<()> = std::sync::Mutex::new(());
let _guard = SERIALIZE.lock().unwrap();
TOKIO_EXECUTOR_THREAD_COUNT.reset();
TOKIO_EXECUTOR_THREAD_COUNT
.get_metric_with_label_values(&[setup])
.unwrap()
.set(u64::try_from(num_threads.get()).unwrap());
}
pub fn preinitialize_metrics() {
// Python tests need these and on some we do alerting.
//

View File

@@ -33,13 +33,14 @@
use std::collections::HashMap;
use std::fmt;
use std::future::Future;
use std::num::NonZeroUsize;
use std::panic::AssertUnwindSafe;
use std::str::FromStr;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::{Arc, Mutex};
use futures::FutureExt;
use pageserver_api::shard::TenantShardId;
use tokio::runtime::Runtime;
use tokio::task::JoinHandle;
use tokio::task_local;
use tokio_util::sync::CancellationToken;
@@ -48,8 +49,11 @@ use tracing::{debug, error, info, warn};
use once_cell::sync::Lazy;
use utils::env;
use utils::id::TimelineId;
use crate::metrics::set_tokio_runtime_setup;
//
// There are four runtimes:
//
@@ -98,52 +102,119 @@ use utils::id::TimelineId;
// other operations, if the upload tasks e.g. get blocked on locks. It shouldn't
// happen, but still.
//
pub static COMPUTE_REQUEST_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("compute request worker")
.enable_all()
.build()
.expect("Failed to create compute request runtime")
});
pub static MGMT_REQUEST_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("mgmt request worker")
.enable_all()
.build()
.expect("Failed to create mgmt request runtime")
});
pub static WALRECEIVER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("walreceiver worker")
.enable_all()
.build()
.expect("Failed to create walreceiver runtime")
});
pub static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("background op worker")
// if you change the number of worker threads please change the constant below
.enable_all()
.build()
.expect("Failed to create background op runtime")
});
pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy<usize> = Lazy::new(|| {
// force init and thus panics
let _ = BACKGROUND_RUNTIME.handle();
pub(crate) static TOKIO_WORKER_THREADS: Lazy<NonZeroUsize> = Lazy::new(|| {
// replicates tokio-1.28.1::loom::sys::num_cpus which is not available publicly
// tokio would had already panicked for parsing errors or NotUnicode
//
// this will be wrong if any of the runtimes gets their worker threads configured to something
// else, but that has not been needed in a long time.
std::env::var("TOKIO_WORKER_THREADS")
.map(|s| s.parse::<usize>().unwrap())
.unwrap_or_else(|_e| usize::max(2, num_cpus::get()))
NonZeroUsize::new(
std::env::var("TOKIO_WORKER_THREADS")
.map(|s| s.parse::<usize>().unwrap())
.unwrap_or_else(|_e| usize::max(2, num_cpus::get())),
)
.expect("the max() ensures that this is not zero")
});
enum TokioRuntimeMode {
SingleThreaded,
MultiThreaded { num_workers: NonZeroUsize },
}
impl FromStr for TokioRuntimeMode {
type Err = String;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"current_thread" => Ok(TokioRuntimeMode::SingleThreaded),
s => match s.strip_prefix("multi_thread:") {
Some("default") => Ok(TokioRuntimeMode::MultiThreaded {
num_workers: *TOKIO_WORKER_THREADS,
}),
Some(suffix) => {
let num_workers = suffix.parse::<NonZeroUsize>().map_err(|e| {
format!(
"invalid number of multi-threaded runtime workers ({suffix:?}): {e}",
)
})?;
Ok(TokioRuntimeMode::MultiThreaded { num_workers })
}
None => Err(format!("invalid runtime config: {s:?}")),
},
}
}
}
static ONE_RUNTIME: Lazy<Option<tokio::runtime::Runtime>> = Lazy::new(|| {
let thread_name = "pageserver-tokio";
let Some(mode) = env::var("NEON_PAGESERVER_USE_ONE_RUNTIME") else {
// If the env var is not set, leave this static as None.
set_tokio_runtime_setup(
"multiple-runtimes",
NUM_MULTIPLE_RUNTIMES
.checked_mul(*TOKIO_WORKER_THREADS)
.unwrap(),
);
return None;
};
Some(match mode {
TokioRuntimeMode::SingleThreaded => {
set_tokio_runtime_setup("one-runtime-single-threaded", NonZeroUsize::new(1).unwrap());
tokio::runtime::Builder::new_current_thread()
.thread_name(thread_name)
.enable_all()
.build()
.expect("failed to create one single runtime")
}
TokioRuntimeMode::MultiThreaded { num_workers } => {
set_tokio_runtime_setup("one-runtime-multi-threaded", num_workers);
tokio::runtime::Builder::new_multi_thread()
.thread_name(thread_name)
.enable_all()
.worker_threads(num_workers.get())
.build()
.expect("failed to create one multi-threaded runtime")
}
})
});
/// Declare a lazy static variable named `$varname` that will resolve
/// to a tokio runtime handle. If the env var `NEON_PAGESERVER_USE_ONE_RUNTIME`
/// is set, this will resolve to `ONE_RUNTIME`. Otherwise, the macro invocation
/// declares a separate runtime and the lazy static variable `$varname`
/// will resolve to that separate runtime.
///
/// The result is is that `$varname.spawn()` will use `ONE_RUNTIME` if
/// `NEON_PAGESERVER_USE_ONE_RUNTIME` is set, and will use the separate runtime
/// otherwise.
macro_rules! pageserver_runtime {
($varname:ident, $name:literal) => {
pub static $varname: Lazy<&'static tokio::runtime::Runtime> = Lazy::new(|| {
if let Some(runtime) = &*ONE_RUNTIME {
return runtime;
}
static RUNTIME: Lazy<tokio::runtime::Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name($name)
.worker_threads(TOKIO_WORKER_THREADS.get())
.enable_all()
.build()
.expect(std::concat!("Failed to create runtime ", $name))
});
&*RUNTIME
});
};
}
pageserver_runtime!(COMPUTE_REQUEST_RUNTIME, "compute request worker");
pageserver_runtime!(MGMT_REQUEST_RUNTIME, "mgmt request worker");
pageserver_runtime!(WALRECEIVER_RUNTIME, "walreceiver worker");
pageserver_runtime!(BACKGROUND_RUNTIME, "background op worker");
// Bump this number when adding a new pageserver_runtime!
// SAFETY: it's obviously correct
const NUM_MULTIPLE_RUNTIMES: NonZeroUsize = unsafe { NonZeroUsize::new_unchecked(4) };
#[derive(Debug, Clone, Copy)]
pub struct PageserverTaskId(u64);

View File

@@ -51,7 +51,7 @@ use tokio_util::sync::CancellationToken;
use tracing::{info_span, instrument, warn, Instrument};
use utils::{
backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext,
id::TimelineId,
id::TimelineId, serde_system_time,
};
use super::{
@@ -591,7 +591,7 @@ impl<'a> TenantDownloader<'a> {
let mut progress = SecondaryProgress {
layers_total: heatmap_stats.layers,
bytes_total: heatmap_stats.bytes,
heatmap_mtime: Some(heatmap_mtime),
heatmap_mtime: Some(serde_system_time::SystemTime(heatmap_mtime)),
layers_downloaded: 0,
bytes_downloaded: 0,
};

View File

@@ -19,6 +19,7 @@ use pageserver_api::models::InMemoryLayerInfo;
use pageserver_api::shard::TenantShardId;
use std::collections::{BinaryHeap, HashMap, HashSet};
use std::sync::{Arc, OnceLock};
use std::time::Instant;
use tracing::*;
use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
// avoid binding to Write (conflicts with std::io::Write)
@@ -53,6 +54,8 @@ pub struct InMemoryLayer {
/// Writes are only allowed when this is `None`.
end_lsn: OnceLock<Lsn>,
opened_at: Instant,
/// The above fields never change, except for `end_lsn`, which is only set once.
/// All other changing parts are in `inner`, and protected by a mutex.
inner: RwLock<InMemoryLayerInner>,
@@ -460,6 +463,7 @@ impl InMemoryLayer {
tenant_shard_id,
start_lsn,
end_lsn: OnceLock::new(),
opened_at: Instant::now(),
inner: RwLock::new(InMemoryLayerInner {
index: HashMap::new(),
file,
@@ -520,6 +524,10 @@ impl InMemoryLayer {
Ok(())
}
pub(crate) fn get_opened_at(&self) -> Instant {
self.opened_at
}
pub(crate) async fn tick(&self) -> Option<u64> {
let mut inner = self.inner.write().await;
let size = inner.file.len();

View File

@@ -18,7 +18,7 @@ use utils::{backoff, completion};
static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
once_cell::sync::Lazy::new(|| {
let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS;
let total_threads = task_mgr::TOKIO_WORKER_THREADS.get();
let permits = usize::max(
1,
// while a lot of the work is done on spawn_blocking, we still do
@@ -72,6 +72,7 @@ pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation
);
// TODO: assert that we run on BACKGROUND_RUNTIME; requires tokio_unstable Handle::id();
match CONCURRENT_BACKGROUND_TASKS.acquire().await {
Ok(permit) => permit,
Err(_closed) => unreachable!("we never close the semaphore"),

View File

@@ -1257,7 +1257,7 @@ impl Timeline {
checkpoint_distance,
self.get_last_record_lsn(),
self.last_freeze_at.load(),
*self.last_freeze_ts.read().unwrap(),
open_layer.get_opened_at(),
) {
match open_layer.info() {
InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => {
@@ -1622,7 +1622,7 @@ impl Timeline {
checkpoint_distance: u64,
projected_lsn: Lsn,
last_freeze_at: Lsn,
last_freeze_ts: Instant,
opened_at: Instant,
) -> bool {
let distance = projected_lsn.widening_sub(last_freeze_at);
@@ -1648,13 +1648,13 @@ impl Timeline {
);
true
} else if distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() {
} else if distance > 0 && opened_at.elapsed() >= self.get_checkpoint_timeout() {
info!(
"Will roll layer at {} with layer size {} due to time since last flush ({:?})",
projected_lsn,
layer_size,
last_freeze_ts.elapsed()
);
"Will roll layer at {} with layer size {} due to time since first write to the layer ({:?})",
projected_lsn,
layer_size,
opened_at.elapsed()
);
true
} else {
@@ -4703,23 +4703,16 @@ struct TimelineWriterState {
max_lsn: Option<Lsn>,
// Cached details of the last freeze. Avoids going trough the atomic/lock on every put.
cached_last_freeze_at: Lsn,
cached_last_freeze_ts: Instant,
}
impl TimelineWriterState {
fn new(
open_layer: Arc<InMemoryLayer>,
current_size: u64,
last_freeze_at: Lsn,
last_freeze_ts: Instant,
) -> Self {
fn new(open_layer: Arc<InMemoryLayer>, current_size: u64, last_freeze_at: Lsn) -> Self {
Self {
open_layer,
current_size,
prev_lsn: None,
max_lsn: None,
cached_last_freeze_at: last_freeze_at,
cached_last_freeze_ts: last_freeze_ts,
}
}
}
@@ -4818,12 +4811,10 @@ impl<'a> TimelineWriter<'a> {
let initial_size = layer.size().await?;
let last_freeze_at = self.last_freeze_at.load();
let last_freeze_ts = *self.last_freeze_ts.read().unwrap();
self.write_guard.replace(TimelineWriterState::new(
layer,
initial_size,
last_freeze_at,
last_freeze_ts,
));
Ok(())
@@ -4870,7 +4861,7 @@ impl<'a> TimelineWriter<'a> {
self.get_checkpoint_distance(),
lsn,
state.cached_last_freeze_at,
state.cached_last_freeze_ts,
state.open_layer.get_opened_at(),
) {
OpenLayerAction::Roll
} else {

View File

@@ -12,7 +12,6 @@ use super::layer_manager::LayerManager;
use super::{CompactFlags, DurationRecorder, RecordedDuration, Timeline};
use anyhow::{anyhow, Context};
use async_trait::async_trait;
use enumset::EnumSet;
use fail::fail_point;
use itertools::Itertools;
@@ -1122,7 +1121,6 @@ impl CompactionLayer<Key> for ResidentDeltaLayer {
}
}
#[async_trait]
impl CompactionDeltaLayer<TimelineAdaptor> for ResidentDeltaLayer {
type DeltaEntry<'a> = DeltaEntry<'a>;

View File

@@ -41,7 +41,7 @@ pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result<PageserverUtiliz
//
// note that u64::MAX will be output as i64::MAX as u64, but that should not matter
utilization_score: u64::MAX,
captured_at,
captured_at: utils::serde_system_time::SystemTime(captured_at),
};
// TODO: make utilization_score into a metric

19
poetry.lock generated
View File

@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
[[package]]
name = "aiohttp"
@@ -1191,13 +1191,13 @@ files = [
[[package]]
name = "idna"
version = "3.3"
version = "3.7"
description = "Internationalized Domain Names in Applications (IDNA)"
optional = false
python-versions = ">=3.5"
files = [
{file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"},
{file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"},
{file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"},
{file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"},
]
[[package]]
@@ -2182,6 +2182,7 @@ files = [
{file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
{file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
{file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
{file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
{file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
@@ -2652,6 +2653,16 @@ files = [
{file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
{file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
{file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
{file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
{file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
{file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
{file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},

View File

@@ -12,6 +12,7 @@ testing = []
anyhow.workspace = true
async-compression.workspace = true
async-trait.workspace = true
atomic-take.workspace = true
aws-config.workspace = true
aws-sdk-iam.workspace = true
aws-sigv4.workspace = true
@@ -36,10 +37,14 @@ http.workspace = true
humantime.workspace = true
hyper-tungstenite.workspace = true
hyper.workspace = true
hyper1 = { package = "hyper", version = "1.2", features = ["server"] }
hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] }
http-body-util = { version = "0.1" }
ipnet.workspace = true
itertools.workspace = true
lasso = { workspace = true, features = ["multi-threaded"] }
md5.workspace = true
measured = { workspace = true, features = ["lasso"] }
metrics.workspace = true
once_cell.workspace = true
opentelemetry.workspace = true

View File

@@ -13,7 +13,7 @@ use crate::console::provider::{CachedRoleSecret, ConsoleBackend};
use crate::console::{AuthSecret, NodeInfo};
use crate::context::RequestMonitoring;
use crate::intern::EndpointIdInt;
use crate::metrics::{AUTH_RATE_LIMIT_HITS, ENDPOINTS_AUTH_RATE_LIMITED};
use crate::metrics::Metrics;
use crate::proxy::connect_compute::ComputeConnectBackend;
use crate::proxy::NeonOptions;
use crate::stream::Stream;
@@ -27,7 +27,7 @@ use crate::{
},
stream, url,
};
use crate::{scram, EndpointCacheKey, EndpointId, RoleName};
use crate::{scram, EndpointCacheKey, EndpointId, Normalize, RoleName};
use std::sync::Arc;
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{info, warn};
@@ -186,7 +186,7 @@ impl AuthenticationConfig {
is_cleartext: bool,
) -> auth::Result<AuthSecret> {
// we have validated the endpoint exists, so let's intern it.
let endpoint_int = EndpointIdInt::from(endpoint);
let endpoint_int = EndpointIdInt::from(endpoint.normalize());
// only count the full hash count if password hack or websocket flow.
// in other words, if proxy needs to run the hashing
@@ -210,8 +210,12 @@ impl AuthenticationConfig {
enabled = self.rate_limiter_enabled,
"rate limiting authentication"
);
AUTH_RATE_LIMIT_HITS.inc();
ENDPOINTS_AUTH_RATE_LIMITED.measure(endpoint);
Metrics::get().proxy.requests_auth_rate_limits_total.inc();
Metrics::get()
.proxy
.endpoints_auth_rate_limits
.get_metric()
.measure(endpoint);
if self.rate_limiter_enabled {
return Err(auth::AuthError::too_many_connections());

View File

@@ -4,7 +4,7 @@ use crate::{
auth::password_hack::parse_endpoint_param,
context::RequestMonitoring,
error::{ReportableError, UserFacingError},
metrics::NUM_CONNECTION_ACCEPTED_BY_SNI,
metrics::{Metrics, SniKind},
proxy::NeonOptions,
serverless::SERVERLESS_DRIVER_SNI,
EndpointId, RoleName,
@@ -144,21 +144,22 @@ impl ComputeUserInfoMaybeEndpoint {
ctx.set_endpoint_id(ep.clone());
}
let metrics = Metrics::get();
info!(%user, "credentials");
if sni.is_some() {
info!("Connection with sni");
NUM_CONNECTION_ACCEPTED_BY_SNI
.with_label_values(&["sni"])
.inc();
metrics.proxy.accepted_connections_by_sni.inc(SniKind::Sni);
} else if endpoint.is_some() {
NUM_CONNECTION_ACCEPTED_BY_SNI
.with_label_values(&["no_sni"])
.inc();
metrics
.proxy
.accepted_connections_by_sni
.inc(SniKind::NoSni);
info!("Connection without sni");
} else {
NUM_CONNECTION_ACCEPTED_BY_SNI
.with_label_values(&["password_hack"])
.inc();
metrics
.proxy
.accepted_connections_by_sni
.inc(SniKind::PasswordHack);
info!("Connection with password hack");
}

View File

@@ -176,7 +176,12 @@ async fn task_main(
.context("failed to set socket option")?;
info!(%peer_addr, "serving");
let ctx = RequestMonitoring::new(session_id, peer_addr.ip(), "sni_router", "sni");
let ctx = RequestMonitoring::new(
session_id,
peer_addr.ip(),
proxy::metrics::Protocol::SniRouter,
"sni",
);
handle_client(ctx, dest_suffix, tls_config, tls_server_end_point, socket).await
}
.unwrap_or_else(|e| {

View File

@@ -18,7 +18,8 @@ use proxy::config::ProjectInfoCacheOptions;
use proxy::console;
use proxy::context::parquet::ParquetUploadArgs;
use proxy::http;
use proxy::metrics::NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT;
use proxy::http::health_server::AppMetrics;
use proxy::metrics::Metrics;
use proxy::rate_limiter::AuthRateLimiter;
use proxy::rate_limiter::EndpointRateLimiter;
use proxy::rate_limiter::RateBucketInfo;
@@ -189,7 +190,9 @@ struct ProxyCliArgs {
/// cache for `project_info` (use `size=0` to disable)
#[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)]
project_info_cache: String,
/// cache for all valid endpoints
#[clap(long, default_value = config::EndpointCacheConfig::CACHE_DEFAULT_OPTIONS)]
endpoint_cache_config: String,
#[clap(flatten)]
parquet_upload: ParquetUploadArgs,
@@ -249,14 +252,18 @@ async fn main() -> anyhow::Result<()> {
info!("Version: {GIT_VERSION}");
info!("Build_tag: {BUILD_TAG}");
::metrics::set_build_info_metric(GIT_VERSION, BUILD_TAG);
let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo {
revision: GIT_VERSION,
build_tag: BUILD_TAG,
});
match proxy::jemalloc::MetricRecorder::new(prometheus::default_registry()) {
Ok(t) => {
t.start();
let jemalloc = match proxy::jemalloc::MetricRecorder::new() {
Ok(t) => Some(t),
Err(e) => {
tracing::error!(error = ?e, "could not start jemalloc metrics loop");
None
}
Err(e) => tracing::error!(error = ?e, "could not start jemalloc metrics loop"),
}
};
let args = ProxyCliArgs::parse();
let config = build_config(&args)?;
@@ -296,27 +303,27 @@ async fn main() -> anyhow::Result<()> {
),
aws_credentials_provider,
));
let redis_notifications_client =
match (args.redis_notifications, (args.redis_host, args.redis_port)) {
(Some(url), _) => {
info!("Starting redis notifications listener ({url})");
Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url))
}
(None, (Some(host), Some(port))) => Some(
ConnectionWithCredentialsProvider::new_with_credentials_provider(
host,
port,
elasticache_credentials_provider.clone(),
),
let regional_redis_client = match (args.redis_host, args.redis_port) {
(Some(host), Some(port)) => Some(
ConnectionWithCredentialsProvider::new_with_credentials_provider(
host,
port,
elasticache_credentials_provider.clone(),
),
(None, (None, None)) => {
warn!("Redis is disabled");
None
}
_ => {
bail!("redis-host and redis-port must be specified together");
}
};
),
(None, None) => {
warn!("Redis events from console are disabled");
None
}
_ => {
bail!("redis-host and redis-port must be specified together");
}
};
let redis_notifications_client = if let Some(url) = args.redis_notifications {
Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url))
} else {
regional_redis_client.clone()
};
// Check that we can bind to address before further initialization
let http_address: SocketAddr = args.http.parse()?;
@@ -335,8 +342,7 @@ async fn main() -> anyhow::Result<()> {
let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(&config.endpoint_rps_limit));
let cancel_map = CancelMap::default();
// let redis_notifications_client = redis_notifications_client.map(|x| Box::leak(Box::new(x)));
let redis_publisher = match &redis_notifications_client {
let redis_publisher = match &regional_redis_client {
Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new(
redis_publisher.clone(),
args.region.clone(),
@@ -349,7 +355,7 @@ async fn main() -> anyhow::Result<()> {
>::new(
cancel_map.clone(),
redis_publisher,
NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT,
proxy::metrics::CancellationSource::FromClient,
));
// client facing tasks. these will exit on error or on cancellation
@@ -387,7 +393,14 @@ async fn main() -> anyhow::Result<()> {
// maintenance tasks. these never return unless there's an error
let mut maintenance_tasks = JoinSet::new();
maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone()));
maintenance_tasks.spawn(http::health_server::task_main(http_listener));
maintenance_tasks.spawn(http::health_server::task_main(
http_listener,
AppMetrics {
jemalloc,
neon_metrics,
proxy: proxy::metrics::Metrics::get(),
},
));
maintenance_tasks.spawn(console::mgmt::task_main(mgmt_listener));
if let Some(metrics_config) = &config.metric_collection {
@@ -404,13 +417,18 @@ async fn main() -> anyhow::Result<()> {
if let Some(redis_notifications_client) = redis_notifications_client {
let cache = api.caches.project_info.clone();
maintenance_tasks.spawn(notifications::task_main(
redis_notifications_client.clone(),
redis_notifications_client,
cache.clone(),
cancel_map.clone(),
args.region.clone(),
));
maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
}
if let Some(regional_redis_client) = regional_redis_client {
let cache = api.caches.endpoints_cache.clone();
let con = regional_redis_client;
maintenance_tasks.spawn(async move { cache.do_read(con).await });
}
}
}
@@ -489,14 +507,18 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
let project_info_cache_config: ProjectInfoCacheOptions =
args.project_info_cache.parse()?;
let endpoint_cache_config: config::EndpointCacheConfig =
args.endpoint_cache_config.parse()?;
info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}");
info!(
"Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}"
);
info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}");
let caches = Box::leak(Box::new(console::caches::ApiCaches::new(
wake_compute_cache_config,
project_info_cache_config,
endpoint_cache_config,
)));
let config::WakeComputeLockOptions {
@@ -507,10 +529,17 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
} = args.wake_compute_lock.parse()?;
info!(permits, shards, ?epoch, "Using NodeLocks (wake_compute)");
let locks = Box::leak(Box::new(
console::locks::ApiLocks::new("wake_compute_lock", permits, shards, timeout)
.unwrap(),
console::locks::ApiLocks::new(
"wake_compute_lock",
permits,
shards,
timeout,
epoch,
&Metrics::get().wake_compute_lock,
)
.unwrap(),
));
tokio::spawn(locks.garbage_collect_worker(epoch));
tokio::spawn(locks.garbage_collect_worker());
let url = args.auth_endpoint.parse()?;
let endpoint = http::Endpoint::new(url, http::new_client(rate_limiter_config));

View File

@@ -1,4 +1,5 @@
pub mod common;
pub mod endpoints;
pub mod project_info;
mod timed_lru;

226
proxy/src/cache/endpoints.rs vendored Normal file
View File

@@ -0,0 +1,226 @@
use std::{
convert::Infallible,
sync::{
atomic::{AtomicBool, Ordering},
Arc,
},
};
use dashmap::DashSet;
use redis::{
streams::{StreamReadOptions, StreamReadReply},
AsyncCommands, FromRedisValue, Value,
};
use serde::Deserialize;
use tokio::sync::Mutex;
use crate::{
config::EndpointCacheConfig,
context::RequestMonitoring,
intern::{BranchIdInt, EndpointIdInt, ProjectIdInt},
metrics::{Metrics, RedisErrors},
rate_limiter::GlobalRateLimiter,
redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider,
EndpointId,
};
#[derive(Deserialize, Debug, Clone)]
pub struct ControlPlaneEventKey {
endpoint_created: Option<EndpointCreated>,
branch_created: Option<BranchCreated>,
project_created: Option<ProjectCreated>,
}
#[derive(Deserialize, Debug, Clone)]
struct EndpointCreated {
endpoint_id: String,
}
#[derive(Deserialize, Debug, Clone)]
struct BranchCreated {
branch_id: String,
}
#[derive(Deserialize, Debug, Clone)]
struct ProjectCreated {
project_id: String,
}
pub struct EndpointsCache {
config: EndpointCacheConfig,
endpoints: DashSet<EndpointIdInt>,
branches: DashSet<BranchIdInt>,
projects: DashSet<ProjectIdInt>,
ready: AtomicBool,
limiter: Arc<Mutex<GlobalRateLimiter>>,
}
impl EndpointsCache {
pub fn new(config: EndpointCacheConfig) -> Self {
Self {
limiter: Arc::new(Mutex::new(GlobalRateLimiter::new(
config.limiter_info.clone(),
))),
config,
endpoints: DashSet::new(),
branches: DashSet::new(),
projects: DashSet::new(),
ready: AtomicBool::new(false),
}
}
pub async fn is_valid(&self, ctx: &mut RequestMonitoring, endpoint: &EndpointId) -> bool {
if !self.ready.load(Ordering::Acquire) {
return true;
}
// If cache is disabled, just collect the metrics and return.
if self.config.disable_cache {
ctx.set_rejected(self.should_reject(endpoint));
return true;
}
// If the limiter allows, we don't need to check the cache.
if self.limiter.lock().await.check() {
return true;
}
let rejected = self.should_reject(endpoint);
ctx.set_rejected(rejected);
!rejected
}
fn should_reject(&self, endpoint: &EndpointId) -> bool {
if endpoint.is_endpoint() {
!self.endpoints.contains(&EndpointIdInt::from(endpoint))
} else if endpoint.is_branch() {
!self
.branches
.contains(&BranchIdInt::from(&endpoint.as_branch()))
} else {
!self
.projects
.contains(&ProjectIdInt::from(&endpoint.as_project()))
}
}
fn insert_event(&self, key: ControlPlaneEventKey) {
// Do not do normalization here, we expect the events to be normalized.
if let Some(endpoint_created) = key.endpoint_created {
self.endpoints
.insert(EndpointIdInt::from(&endpoint_created.endpoint_id.into()));
}
if let Some(branch_created) = key.branch_created {
self.branches
.insert(BranchIdInt::from(&branch_created.branch_id.into()));
}
if let Some(project_created) = key.project_created {
self.projects
.insert(ProjectIdInt::from(&project_created.project_id.into()));
}
}
pub async fn do_read(
&self,
mut con: ConnectionWithCredentialsProvider,
) -> anyhow::Result<Infallible> {
let mut last_id = "0-0".to_string();
loop {
self.ready.store(false, Ordering::Release);
if let Err(e) = con.connect().await {
tracing::error!("error connecting to redis: {:?}", e);
continue;
}
if let Err(e) = self.read_from_stream(&mut con, &mut last_id).await {
tracing::error!("error reading from redis: {:?}", e);
}
tokio::time::sleep(self.config.retry_interval).await;
}
}
async fn read_from_stream(
&self,
con: &mut ConnectionWithCredentialsProvider,
last_id: &mut String,
) -> anyhow::Result<()> {
tracing::info!("reading endpoints/branches/projects from redis");
self.batch_read(
con,
StreamReadOptions::default().count(self.config.initial_batch_size),
last_id,
true,
)
.await?;
tracing::info!("ready to filter user requests");
self.ready.store(true, Ordering::Release);
self.batch_read(
con,
StreamReadOptions::default()
.count(self.config.default_batch_size)
.block(self.config.xread_timeout.as_millis() as usize),
last_id,
false,
)
.await
}
fn parse_key_value(value: &Value) -> anyhow::Result<ControlPlaneEventKey> {
let s: String = FromRedisValue::from_redis_value(value)?;
Ok(serde_json::from_str(&s)?)
}
async fn batch_read(
&self,
conn: &mut ConnectionWithCredentialsProvider,
opts: StreamReadOptions,
last_id: &mut String,
return_when_finish: bool,
) -> anyhow::Result<()> {
let mut total: usize = 0;
loop {
let mut res: StreamReadReply = conn
.xread_options(&[&self.config.stream_name], &[last_id.as_str()], &opts)
.await?;
if res.keys.is_empty() {
if return_when_finish {
anyhow::bail!(
"Redis stream {} is empty, cannot be used to filter endpoints",
self.config.stream_name
);
}
// If we are not returning when finish, we should wait for more data.
continue;
}
if res.keys.len() != 1 {
anyhow::bail!("Cannot read from redis stream {}", self.config.stream_name);
}
let res = res.keys.pop().expect("Checked length above");
let len = res.ids.len();
for x in res.ids {
total += 1;
for (_, v) in x.map {
let key = match Self::parse_key_value(&v) {
Ok(x) => x,
Err(e) => {
Metrics::get().proxy.redis_errors_total.inc(RedisErrors {
channel: &self.config.stream_name,
});
tracing::error!("error parsing value {v:?}: {e:?}");
continue;
}
};
self.insert_event(key);
}
if total.is_power_of_two() {
tracing::debug!("endpoints read {}", total);
}
*last_id = x.id;
}
if return_when_finish && len <= self.config.default_batch_size {
break;
}
}
tracing::info!("read {} endpoints/branches/projects from redis", total);
Ok(())
}
}
#[cfg(test)]
mod tests {
use super::ControlPlaneEventKey;
#[test]
fn test() {
let s = "{\"branch_created\":null,\"endpoint_created\":{\"endpoint_id\":\"ep-rapid-thunder-w0qqw2q9\"},\"project_created\":null,\"type\":\"endpoint_created\"}";
let _: ControlPlaneEventKey = serde_json::from_str(s).unwrap();
}
}

View File

@@ -10,7 +10,7 @@ use uuid::Uuid;
use crate::{
error::ReportableError,
metrics::NUM_CANCELLATION_REQUESTS,
metrics::{CancellationRequest, CancellationSource, Metrics},
redis::cancellation_publisher::{
CancellationPublisher, CancellationPublisherMut, RedisPublisherClient,
},
@@ -28,7 +28,7 @@ pub struct CancellationHandler<P> {
client: P,
/// This field used for the monitoring purposes.
/// Represents the source of the cancellation request.
from: &'static str,
from: CancellationSource,
}
#[derive(Debug, Error)]
@@ -89,9 +89,13 @@ impl<P: CancellationPublisher> CancellationHandler<P> {
// NB: we should immediately release the lock after cloning the token.
let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else {
tracing::warn!("query cancellation key not found: {key}");
NUM_CANCELLATION_REQUESTS
.with_label_values(&[self.from, "not_found"])
.inc();
Metrics::get()
.proxy
.cancellation_requests_total
.inc(CancellationRequest {
source: self.from,
kind: crate::metrics::CancellationOutcome::NotFound,
});
match self.client.try_publish(key, session_id).await {
Ok(()) => {} // do nothing
Err(e) => {
@@ -103,9 +107,13 @@ impl<P: CancellationPublisher> CancellationHandler<P> {
}
return Ok(());
};
NUM_CANCELLATION_REQUESTS
.with_label_values(&[self.from, "found"])
.inc();
Metrics::get()
.proxy
.cancellation_requests_total
.inc(CancellationRequest {
source: self.from,
kind: crate::metrics::CancellationOutcome::Found,
});
info!("cancelling query per user's request using key {key}");
cancel_closure.try_cancel_query().await
}
@@ -122,7 +130,7 @@ impl<P: CancellationPublisher> CancellationHandler<P> {
}
impl CancellationHandler<()> {
pub fn new(map: CancelMap, from: &'static str) -> Self {
pub fn new(map: CancelMap, from: CancellationSource) -> Self {
Self {
map,
client: (),
@@ -132,7 +140,7 @@ impl CancellationHandler<()> {
}
impl<P: CancellationPublisherMut> CancellationHandler<Option<Arc<Mutex<P>>>> {
pub fn new(map: CancelMap, client: Option<Arc<Mutex<P>>>, from: &'static str) -> Self {
pub fn new(map: CancelMap, client: Option<Arc<Mutex<P>>>, from: CancellationSource) -> Self {
Self { map, client, from }
}
}
@@ -192,15 +200,13 @@ impl<P> Drop for Session<P> {
#[cfg(test)]
mod tests {
use crate::metrics::NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS;
use super::*;
#[tokio::test]
async fn check_session_drop() -> anyhow::Result<()> {
let cancellation_handler = Arc::new(CancellationHandler::<()>::new(
CancelMap::default(),
NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS,
CancellationSource::FromRedis,
));
let session = cancellation_handler.clone().get_session();
@@ -214,7 +220,7 @@ mod tests {
#[tokio::test]
async fn cancel_session_noop_regression() {
let handler = CancellationHandler::<()>::new(Default::default(), "local");
let handler = CancellationHandler::<()>::new(Default::default(), CancellationSource::Local);
handler
.cancel_session(
CancelKeyData {

View File

@@ -4,12 +4,11 @@ use crate::{
console::{errors::WakeComputeError, messages::MetricsAuxInfo},
context::RequestMonitoring,
error::{ReportableError, UserFacingError},
metrics::NUM_DB_CONNECTIONS_GAUGE,
metrics::{Metrics, NumDbConnectionsGuard},
proxy::neon_option,
};
use futures::{FutureExt, TryFutureExt};
use itertools::Itertools;
use metrics::IntCounterPairGuard;
use pq_proto::StartupMessageParams;
use std::{io, net::SocketAddr, time::Duration};
use thiserror::Error;
@@ -249,7 +248,7 @@ pub struct PostgresConnection {
/// Labels for proxy's metrics.
pub aux: MetricsAuxInfo,
_guage: IntCounterPairGuard,
_guage: NumDbConnectionsGuard<'static>,
}
impl ConnCfg {
@@ -295,9 +294,7 @@ impl ConnCfg {
params,
cancel_closure,
aux,
_guage: NUM_DB_CONNECTIONS_GAUGE
.with_label_values(&[ctx.protocol])
.guard(),
_guage: Metrics::get().proxy.db_connections.guard(ctx.protocol),
};
Ok(connection)

View File

@@ -313,6 +313,80 @@ impl CertResolver {
}
}
#[derive(Debug)]
pub struct EndpointCacheConfig {
/// Batch size to receive all endpoints on the startup.
pub initial_batch_size: usize,
/// Batch size to receive endpoints.
pub default_batch_size: usize,
/// Timeouts for the stream read operation.
pub xread_timeout: Duration,
/// Stream name to read from.
pub stream_name: String,
/// Limiter info (to distinguish when to enable cache).
pub limiter_info: Vec<RateBucketInfo>,
/// Disable cache.
/// If true, cache is ignored, but reports all statistics.
pub disable_cache: bool,
/// Retry interval for the stream read operation.
pub retry_interval: Duration,
}
impl EndpointCacheConfig {
/// Default options for [`crate::console::provider::NodeInfoCache`].
/// Notice that by default the limiter is empty, which means that cache is disabled.
pub const CACHE_DEFAULT_OPTIONS: &'static str =
"initial_batch_size=1000,default_batch_size=10,xread_timeout=5m,stream_name=controlPlane,disable_cache=true,limiter_info=1000@1s,retry_interval=1s";
/// Parse cache options passed via cmdline.
/// Example: [`Self::CACHE_DEFAULT_OPTIONS`].
fn parse(options: &str) -> anyhow::Result<Self> {
let mut initial_batch_size = None;
let mut default_batch_size = None;
let mut xread_timeout = None;
let mut stream_name = None;
let mut limiter_info = vec![];
let mut disable_cache = false;
let mut retry_interval = None;
for option in options.split(',') {
let (key, value) = option
.split_once('=')
.with_context(|| format!("bad key-value pair: {option}"))?;
match key {
"initial_batch_size" => initial_batch_size = Some(value.parse()?),
"default_batch_size" => default_batch_size = Some(value.parse()?),
"xread_timeout" => xread_timeout = Some(humantime::parse_duration(value)?),
"stream_name" => stream_name = Some(value.to_string()),
"limiter_info" => limiter_info.push(RateBucketInfo::from_str(value)?),
"disable_cache" => disable_cache = value.parse()?,
"retry_interval" => retry_interval = Some(humantime::parse_duration(value)?),
unknown => bail!("unknown key: {unknown}"),
}
}
RateBucketInfo::validate(&mut limiter_info)?;
Ok(Self {
initial_batch_size: initial_batch_size.context("missing `initial_batch_size`")?,
default_batch_size: default_batch_size.context("missing `default_batch_size`")?,
xread_timeout: xread_timeout.context("missing `xread_timeout`")?,
stream_name: stream_name.context("missing `stream_name`")?,
disable_cache,
limiter_info,
retry_interval: retry_interval.context("missing `retry_interval`")?,
})
}
}
impl FromStr for EndpointCacheConfig {
type Err = anyhow::Error;
fn from_str(options: &str) -> Result<Self, Self::Err> {
let error = || format!("failed to parse endpoint cache options '{options}'");
Self::parse(options).with_context(error)
}
}
#[derive(Debug)]
pub struct MetricBackupCollectionConfig {
pub interval: Duration,

View File

@@ -1,3 +1,4 @@
use measured::FixedCardinalityLabel;
use serde::{Deserialize, Serialize};
use std::fmt;
@@ -102,7 +103,7 @@ pub struct MetricsAuxInfo {
pub cold_start_info: ColdStartInfo,
}
#[derive(Debug, Default, Serialize, Deserialize, Clone, Copy)]
#[derive(Debug, Default, Serialize, Deserialize, Clone, Copy, FixedCardinalityLabel)]
#[serde(rename_all = "snake_case")]
pub enum ColdStartInfo {
#[default]
@@ -110,9 +111,11 @@ pub enum ColdStartInfo {
/// Compute was already running
Warm,
#[serde(rename = "pool_hit")]
#[label(rename = "pool_hit")]
/// Compute was not running but there was an available VM
VmPoolHit,
#[serde(rename = "pool_miss")]
#[label(rename = "pool_miss")]
/// Compute was not running and there were no VMs available
VmPoolMiss,

View File

@@ -8,11 +8,12 @@ use crate::{
backend::{ComputeCredentialKeys, ComputeUserInfo},
IpPattern,
},
cache::{project_info::ProjectInfoCacheImpl, Cached, TimedLru},
cache::{endpoints::EndpointsCache, project_info::ProjectInfoCacheImpl, Cached, TimedLru},
compute,
config::{CacheOptions, ProjectInfoCacheOptions},
config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions},
context::RequestMonitoring,
intern::ProjectIdInt,
metrics::ApiLockMetrics,
scram, EndpointCacheKey,
};
use dashmap::DashMap;
@@ -416,12 +417,15 @@ pub struct ApiCaches {
pub node_info: NodeInfoCache,
/// Cache which stores project_id -> endpoint_ids mapping.
pub project_info: Arc<ProjectInfoCacheImpl>,
/// List of all valid endpoints.
pub endpoints_cache: Arc<EndpointsCache>,
}
impl ApiCaches {
pub fn new(
wake_compute_cache_config: CacheOptions,
project_info_cache_config: ProjectInfoCacheOptions,
endpoint_cache_config: EndpointCacheConfig,
) -> Self {
Self {
node_info: NodeInfoCache::new(
@@ -431,6 +435,7 @@ impl ApiCaches {
true,
),
project_info: Arc::new(ProjectInfoCacheImpl::new(project_info_cache_config)),
endpoints_cache: Arc::new(EndpointsCache::new(endpoint_cache_config)),
}
}
}
@@ -441,10 +446,8 @@ pub struct ApiLocks {
node_locks: DashMap<EndpointCacheKey, Arc<Semaphore>>,
permits: usize,
timeout: Duration,
registered: prometheus::IntCounter,
unregistered: prometheus::IntCounter,
reclamation_lag: prometheus::Histogram,
lock_acquire_lag: prometheus::Histogram,
epoch: std::time::Duration,
metrics: &'static ApiLockMetrics,
}
impl ApiLocks {
@@ -453,54 +456,16 @@ impl ApiLocks {
permits: usize,
shards: usize,
timeout: Duration,
epoch: std::time::Duration,
metrics: &'static ApiLockMetrics,
) -> prometheus::Result<Self> {
let registered = prometheus::IntCounter::with_opts(
prometheus::Opts::new(
"semaphores_registered",
"Number of semaphores registered in this api lock",
)
.namespace(name),
)?;
prometheus::register(Box::new(registered.clone()))?;
let unregistered = prometheus::IntCounter::with_opts(
prometheus::Opts::new(
"semaphores_unregistered",
"Number of semaphores unregistered in this api lock",
)
.namespace(name),
)?;
prometheus::register(Box::new(unregistered.clone()))?;
let reclamation_lag = prometheus::Histogram::with_opts(
prometheus::HistogramOpts::new(
"reclamation_lag_seconds",
"Time it takes to reclaim unused semaphores in the api lock",
)
.namespace(name)
// 1us -> 65ms
// benchmarks on my mac indicate it's usually in the range of 256us and 512us
.buckets(prometheus::exponential_buckets(1e-6, 2.0, 16)?),
)?;
prometheus::register(Box::new(reclamation_lag.clone()))?;
let lock_acquire_lag = prometheus::Histogram::with_opts(
prometheus::HistogramOpts::new(
"semaphore_acquire_seconds",
"Time it takes to reclaim unused semaphores in the api lock",
)
.namespace(name)
// 0.1ms -> 6s
.buckets(prometheus::exponential_buckets(1e-4, 2.0, 16)?),
)?;
prometheus::register(Box::new(lock_acquire_lag.clone()))?;
Ok(Self {
name,
node_locks: DashMap::with_shard_amount(shards),
permits,
timeout,
lock_acquire_lag,
registered,
unregistered,
reclamation_lag,
epoch,
metrics,
})
}
@@ -520,7 +485,7 @@ impl ApiLocks {
self.node_locks
.entry(key.clone())
.or_insert_with(|| {
self.registered.inc();
self.metrics.semaphores_registered.inc();
Arc::new(Semaphore::new(self.permits))
})
.clone()
@@ -528,20 +493,21 @@ impl ApiLocks {
};
let permit = tokio::time::timeout_at(now + self.timeout, semaphore.acquire_owned()).await;
self.lock_acquire_lag
.observe((Instant::now() - now).as_secs_f64());
self.metrics
.semaphore_acquire_seconds
.observe(now.elapsed().as_secs_f64());
Ok(WakeComputePermit {
permit: Some(permit??),
})
}
pub async fn garbage_collect_worker(&self, epoch: std::time::Duration) {
pub async fn garbage_collect_worker(&self) {
if self.permits == 0 {
return;
}
let mut interval = tokio::time::interval(epoch / (self.node_locks.shards().len()) as u32);
let mut interval =
tokio::time::interval(self.epoch / (self.node_locks.shards().len()) as u32);
loop {
for (i, shard) in self.node_locks.shards().iter().enumerate() {
interval.tick().await;
@@ -554,13 +520,13 @@ impl ApiLocks {
"performing epoch reclamation on api lock"
);
let mut lock = shard.write();
let timer = self.reclamation_lag.start_timer();
let timer = self.metrics.reclamation_lag_seconds.start_timer();
let count = lock
.extract_if(|_, semaphore| Arc::strong_count(semaphore.get_mut()) == 1)
.count();
drop(lock);
self.unregistered.inc_by(count as u64);
timer.observe_duration()
self.metrics.semaphores_unregistered.inc_by(count as u64);
timer.observe();
}
}
}

View File

@@ -7,13 +7,14 @@ use super::{
NodeInfo,
};
use crate::{
auth::backend::ComputeUserInfo, compute, console::messages::ColdStartInfo, http, scram,
};
use crate::{
cache::Cached,
context::RequestMonitoring,
metrics::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER},
auth::backend::ComputeUserInfo,
compute,
console::messages::ColdStartInfo,
http,
metrics::{CacheOutcome, Metrics},
scram, Normalize,
};
use crate::{cache::Cached, context::RequestMonitoring};
use futures::TryFutureExt;
use std::sync::Arc;
use tokio::time::Instant;
@@ -23,7 +24,7 @@ use tracing::{error, info, info_span, warn, Instrument};
pub struct Api {
endpoint: http::Endpoint,
pub caches: &'static ApiCaches,
locks: &'static ApiLocks,
pub locks: &'static ApiLocks,
jwt: String,
}
@@ -55,6 +56,15 @@ impl Api {
ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<AuthInfo, GetAuthInfoError> {
if !self
.caches
.endpoints_cache
.is_valid(ctx, &user_info.endpoint.normalize())
.await
{
info!("endpoint is not valid, skipping the request");
return Ok(AuthInfo::default());
}
let request_id = ctx.session_id.to_string();
let application_name = ctx.console_application_name();
async {
@@ -81,7 +91,9 @@ impl Api {
Ok(body) => body,
// Error 404 is special: it's ok not to have a secret.
Err(e) => match e.http_status_code() {
Some(http::StatusCode::NOT_FOUND) => return Ok(AuthInfo::default()),
Some(http::StatusCode::NOT_FOUND) => {
return Ok(AuthInfo::default());
}
_otherwise => return Err(e.into()),
},
};
@@ -95,7 +107,10 @@ impl Api {
Some(secret)
};
let allowed_ips = body.allowed_ips.unwrap_or_default();
ALLOWED_IPS_NUMBER.observe(allowed_ips.len() as f64);
Metrics::get()
.proxy
.allowed_ips_number
.observe(allowed_ips.len() as f64);
Ok(AuthInfo {
secret,
allowed_ips,
@@ -174,23 +189,27 @@ impl super::Api for Api {
ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedRoleSecret, GetAuthInfoError> {
let ep = &user_info.endpoint;
let normalized_ep = &user_info.endpoint.normalize();
let user = &user_info.user;
if let Some(role_secret) = self.caches.project_info.get_role_secret(ep, user) {
if let Some(role_secret) = self
.caches
.project_info
.get_role_secret(normalized_ep, user)
{
return Ok(role_secret);
}
let auth_info = self.do_get_auth_info(ctx, user_info).await?;
if let Some(project_id) = auth_info.project_id {
let ep_int = ep.into();
let normalized_ep_int = normalized_ep.into();
self.caches.project_info.insert_role_secret(
project_id,
ep_int,
normalized_ep_int,
user.into(),
auth_info.secret.clone(),
);
self.caches.project_info.insert_allowed_ips(
project_id,
ep_int,
normalized_ep_int,
Arc::new(auth_info.allowed_ips),
);
ctx.set_project_id(project_id);
@@ -204,30 +223,34 @@ impl super::Api for Api {
ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
let ep = &user_info.endpoint;
if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(ep) {
ALLOWED_IPS_BY_CACHE_OUTCOME
.with_label_values(&["hit"])
.inc();
let normalized_ep = &user_info.endpoint.normalize();
if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) {
Metrics::get()
.proxy
.allowed_ips_cache_misses
.inc(CacheOutcome::Hit);
return Ok((allowed_ips, None));
}
ALLOWED_IPS_BY_CACHE_OUTCOME
.with_label_values(&["miss"])
.inc();
Metrics::get()
.proxy
.allowed_ips_cache_misses
.inc(CacheOutcome::Miss);
let auth_info = self.do_get_auth_info(ctx, user_info).await?;
let allowed_ips = Arc::new(auth_info.allowed_ips);
let user = &user_info.user;
if let Some(project_id) = auth_info.project_id {
let ep_int = ep.into();
let normalized_ep_int = normalized_ep.into();
self.caches.project_info.insert_role_secret(
project_id,
ep_int,
normalized_ep_int,
user.into(),
auth_info.secret.clone(),
);
self.caches
.project_info
.insert_allowed_ips(project_id, ep_int, allowed_ips.clone());
self.caches.project_info.insert_allowed_ips(
project_id,
normalized_ep_int,
allowed_ips.clone(),
);
ctx.set_project_id(project_id);
}
Ok((

View File

@@ -12,7 +12,7 @@ use crate::{
console::messages::{ColdStartInfo, MetricsAuxInfo},
error::ErrorKind,
intern::{BranchIdInt, ProjectIdInt},
metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND},
metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol},
DbName, EndpointId, RoleName,
};
@@ -29,7 +29,7 @@ static LOG_CHAN: OnceCell<mpsc::WeakUnboundedSender<RequestData>> = OnceCell::ne
pub struct RequestMonitoring {
pub peer_addr: IpAddr,
pub session_id: Uuid,
pub protocol: &'static str,
pub protocol: Protocol,
first_packet: chrono::DateTime<Utc>,
region: &'static str,
pub span: Span,
@@ -50,6 +50,8 @@ pub struct RequestMonitoring {
// This sender is here to keep the request monitoring channel open while requests are taking place.
sender: Option<mpsc::UnboundedSender<RequestData>>,
pub latency_timer: LatencyTimer,
// Whether proxy decided that it's not a valid endpoint end rejected it before going to cplane.
rejected: bool,
}
#[derive(Clone, Debug)]
@@ -65,7 +67,7 @@ impl RequestMonitoring {
pub fn new(
session_id: Uuid,
peer_addr: IpAddr,
protocol: &'static str,
protocol: Protocol,
region: &'static str,
) -> Self {
let span = info_span!(
@@ -93,6 +95,7 @@ impl RequestMonitoring {
error_kind: None,
auth_method: None,
success: false,
rejected: false,
cold_start_info: ColdStartInfo::Unknown,
sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()),
@@ -102,7 +105,7 @@ impl RequestMonitoring {
#[cfg(test)]
pub fn test() -> Self {
RequestMonitoring::new(Uuid::now_v7(), [127, 0, 0, 1].into(), "test", "test")
RequestMonitoring::new(Uuid::now_v7(), [127, 0, 0, 1].into(), Protocol::Tcp, "test")
}
pub fn console_application_name(&self) -> String {
@@ -113,6 +116,10 @@ impl RequestMonitoring {
)
}
pub fn set_rejected(&mut self, rejected: bool) {
self.rejected = rejected;
}
pub fn set_cold_start_info(&mut self, info: ColdStartInfo) {
self.cold_start_info = info;
self.latency_timer.cold_start_info(info);
@@ -134,9 +141,9 @@ impl RequestMonitoring {
pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) {
if self.endpoint_id.is_none() {
self.span.record("ep", display(&endpoint_id));
crate::metrics::CONNECTING_ENDPOINTS
.with_label_values(&[self.protocol])
.measure(&endpoint_id);
let metric = &Metrics::get().proxy.connecting_endpoints;
let label = metric.with_labels(self.protocol);
metric.get_metric(label).measure(&endpoint_id);
self.endpoint_id = Some(endpoint_id);
}
}
@@ -158,13 +165,11 @@ impl RequestMonitoring {
}
pub fn set_error_kind(&mut self, kind: ErrorKind) {
ERROR_BY_KIND
.with_label_values(&[kind.to_metric_label()])
.inc();
Metrics::get().proxy.errors_total.inc(kind);
if let Some(ep) = &self.endpoint_id {
ENDPOINT_ERRORS_BY_KIND
.with_label_values(&[kind.to_metric_label()])
.measure(ep);
let metric = &Metrics::get().proxy.endpoints_affected_by_errors;
let label = metric.with_labels(kind);
metric.get_metric(label).measure(ep);
}
self.error_kind = Some(kind);
}
@@ -178,6 +183,19 @@ impl RequestMonitoring {
impl Drop for RequestMonitoring {
fn drop(&mut self) {
let outcome = if self.success {
ConnectOutcome::Success
} else {
ConnectOutcome::Failed
};
Metrics::get()
.proxy
.invalid_endpoints_total
.inc(InvalidEndpointsGroup {
protocol: self.protocol,
rejected: self.rejected.into(),
outcome,
});
if let Some(tx) = self.sender.take() {
let _: Result<(), _> = tx.send(RequestData::from(&*self));
}

View File

@@ -111,7 +111,7 @@ impl From<&RequestMonitoring> for RequestData {
super::AuthMethod::ScramSha256Plus => "scram_sha_256_plus",
super::AuthMethod::Cleartext => "cleartext",
}),
protocol: value.protocol,
protocol: value.protocol.as_str(),
region: value.region,
error: value.error_kind.as_ref().map(|e| e.to_metric_label()),
success: value.success,

View File

@@ -1,5 +1,7 @@
use std::{error::Error as StdError, fmt, io};
use measured::FixedCardinalityLabel;
/// Upcast (almost) any error into an opaque [`io::Error`].
pub fn io_error(e: impl Into<Box<dyn StdError + Send + Sync>>) -> io::Error {
io::Error::new(io::ErrorKind::Other, e)
@@ -29,24 +31,29 @@ pub trait UserFacingError: ReportableError {
}
}
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
#[derive(Copy, Clone, Debug, Eq, PartialEq, FixedCardinalityLabel)]
#[label(singleton = "type")]
pub enum ErrorKind {
/// Wrong password, unknown endpoint, protocol violation, etc...
User,
/// Network error between user and proxy. Not necessarily user error
#[label(rename = "clientdisconnect")]
ClientDisconnect,
/// Proxy self-imposed user rate limits
#[label(rename = "ratelimit")]
RateLimit,
/// Proxy self-imposed service-wise rate limits
#[label(rename = "serviceratelimit")]
ServiceRateLimit,
/// internal errors
Service,
/// Error communicating with control plane
#[label(rename = "controlplane")]
ControlPlane,
/// Postgres error

View File

@@ -13,7 +13,11 @@ pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
use tokio::time::Instant;
use tracing::trace;
use crate::{metrics::CONSOLE_REQUEST_LATENCY, rate_limiter, url::ApiUrl};
use crate::{
metrics::{ConsoleRequest, Metrics},
rate_limiter,
url::ApiUrl,
};
use reqwest_middleware::RequestBuilder;
/// This is the preferred way to create new http clients,
@@ -90,13 +94,14 @@ impl Endpoint {
/// Execute a [request](reqwest::Request).
pub async fn execute(&self, request: Request) -> Result<Response, Error> {
let path = request.url().path().to_string();
let start = Instant::now();
let res = self.client.execute(request).await;
CONSOLE_REQUEST_LATENCY
.with_label_values(&[&path])
.observe(start.elapsed().as_secs_f64());
res
let _timer = Metrics::get()
.proxy
.console_request_latency
.start_timer(ConsoleRequest {
request: request.url().path(),
});
self.client.execute(request).await
}
}

View File

@@ -1,30 +1,49 @@
use anyhow::{anyhow, bail};
use hyper::{Body, Request, Response, StatusCode};
use std::{convert::Infallible, net::TcpListener};
use tracing::info;
use hyper::{header::CONTENT_TYPE, Body, Request, Response, StatusCode};
use measured::{text::BufferedTextEncoder, MetricGroup};
use metrics::NeonMetrics;
use std::{
convert::Infallible,
net::TcpListener,
sync::{Arc, Mutex},
};
use tracing::{info, info_span};
use utils::http::{
endpoint::{self, prometheus_metrics_handler, request_span},
endpoint::{self, request_span},
error::ApiError,
json::json_response,
RouterBuilder, RouterService,
};
use crate::jemalloc;
async fn status_handler(_: Request<Body>) -> Result<Response<Body>, ApiError> {
json_response(StatusCode::OK, "")
}
fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
fn make_router(metrics: AppMetrics) -> RouterBuilder<hyper::Body, ApiError> {
let state = Arc::new(Mutex::new(PrometheusHandler {
encoder: BufferedTextEncoder::new(),
metrics,
}));
endpoint::make_router()
.get("/metrics", |r| request_span(r, prometheus_metrics_handler))
.get("/metrics", move |r| {
let state = state.clone();
request_span(r, move |b| prometheus_metrics_handler(b, state))
})
.get("/v1/status", status_handler)
}
pub async fn task_main(http_listener: TcpListener) -> anyhow::Result<Infallible> {
pub async fn task_main(
http_listener: TcpListener,
metrics: AppMetrics,
) -> anyhow::Result<Infallible> {
scopeguard::defer! {
info!("http has shut down");
}
let service = || RouterService::new(make_router().build()?);
let service = || RouterService::new(make_router(metrics).build()?);
hyper::Server::from_tcp(http_listener)?
.serve(service().map_err(|e| anyhow!(e))?)
@@ -32,3 +51,57 @@ pub async fn task_main(http_listener: TcpListener) -> anyhow::Result<Infallible>
bail!("hyper server without shutdown handling cannot shutdown successfully");
}
struct PrometheusHandler {
encoder: BufferedTextEncoder,
metrics: AppMetrics,
}
#[derive(MetricGroup)]
pub struct AppMetrics {
#[metric(namespace = "jemalloc")]
pub jemalloc: Option<jemalloc::MetricRecorder>,
#[metric(flatten)]
pub neon_metrics: NeonMetrics,
#[metric(flatten)]
pub proxy: &'static crate::metrics::Metrics,
}
async fn prometheus_metrics_handler(
_req: Request<Body>,
state: Arc<Mutex<PrometheusHandler>>,
) -> Result<Response<Body>, ApiError> {
let started_at = std::time::Instant::now();
let span = info_span!("blocking");
let body = tokio::task::spawn_blocking(move || {
let _span = span.entered();
let mut state = state.lock().unwrap();
let PrometheusHandler { encoder, metrics } = &mut *state;
metrics
.collect_group_into(&mut *encoder)
.unwrap_or_else(|infallible| match infallible {});
let body = encoder.finish();
tracing::info!(
bytes = body.len(),
elapsed_ms = started_at.elapsed().as_millis(),
"responded /metrics"
);
body
})
.await
.unwrap();
let response = Response::builder()
.status(200)
.header(CONTENT_TYPE, "text/plain; version=0.0.4")
.body(Body::from(body))
.unwrap();
Ok(response)
}

View File

@@ -160,6 +160,11 @@ impl From<&EndpointId> for EndpointIdInt {
EndpointIdTag::get_interner().get_or_intern(value)
}
}
impl From<EndpointId> for EndpointIdInt {
fn from(value: EndpointId) -> Self {
EndpointIdTag::get_interner().get_or_intern(&value)
}
}
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
pub struct BranchIdTag;
@@ -175,6 +180,11 @@ impl From<&BranchId> for BranchIdInt {
BranchIdTag::get_interner().get_or_intern(value)
}
}
impl From<BranchId> for BranchIdInt {
fn from(value: BranchId) -> Self {
BranchIdTag::get_interner().get_or_intern(&value)
}
}
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
pub struct ProjectIdTag;
@@ -190,6 +200,11 @@ impl From<&ProjectId> for ProjectIdInt {
ProjectIdTag::get_interner().get_or_intern(value)
}
}
impl From<ProjectId> for ProjectIdInt {
fn from(value: ProjectId) -> Self {
ProjectIdTag::get_interner().get_or_intern(&value)
}
}
#[cfg(test)]
mod tests {

View File

@@ -1,27 +1,45 @@
use std::time::Duration;
use std::marker::PhantomData;
use metrics::IntGauge;
use prometheus::{register_int_gauge_with_registry, Registry};
use measured::{
label::NoLabels,
metric::{
gauge::GaugeState, group::Encoding, group::MetricValue, name::MetricNameEncoder,
MetricEncoding, MetricFamilyEncoding, MetricType,
},
text::TextEncoder,
LabelGroup, MetricGroup,
};
use tikv_jemalloc_ctl::{config, epoch, epoch_mib, stats, version};
pub struct MetricRecorder {
epoch: epoch_mib,
active: stats::active_mib,
active_gauge: IntGauge,
allocated: stats::allocated_mib,
allocated_gauge: IntGauge,
mapped: stats::mapped_mib,
mapped_gauge: IntGauge,
metadata: stats::metadata_mib,
metadata_gauge: IntGauge,
resident: stats::resident_mib,
resident_gauge: IntGauge,
retained: stats::retained_mib,
retained_gauge: IntGauge,
inner: Metrics,
}
#[derive(MetricGroup)]
struct Metrics {
active_bytes: JemallocGaugeFamily<stats::active_mib>,
allocated_bytes: JemallocGaugeFamily<stats::allocated_mib>,
mapped_bytes: JemallocGaugeFamily<stats::mapped_mib>,
metadata_bytes: JemallocGaugeFamily<stats::metadata_mib>,
resident_bytes: JemallocGaugeFamily<stats::resident_mib>,
retained_bytes: JemallocGaugeFamily<stats::retained_mib>,
}
impl<Enc: Encoding> MetricGroup<Enc> for MetricRecorder
where
Metrics: MetricGroup<Enc>,
{
fn collect_group_into(&self, enc: &mut Enc) -> Result<(), Enc::Err> {
if self.epoch.advance().is_ok() {
self.inner.collect_group_into(enc)?;
}
Ok(())
}
}
impl MetricRecorder {
pub fn new(registry: &Registry) -> Result<Self, anyhow::Error> {
pub fn new() -> Result<Self, anyhow::Error> {
tracing::info!(
config = config::malloc_conf::read()?,
version = version::read()?,
@@ -30,71 +48,69 @@ impl MetricRecorder {
Ok(Self {
epoch: epoch::mib()?,
active: stats::active::mib()?,
active_gauge: register_int_gauge_with_registry!(
"jemalloc_active_bytes",
"Total number of bytes in active pages allocated by the process",
registry
)?,
allocated: stats::allocated::mib()?,
allocated_gauge: register_int_gauge_with_registry!(
"jemalloc_allocated_bytes",
"Total number of bytes allocated by the process",
registry
)?,
mapped: stats::mapped::mib()?,
mapped_gauge: register_int_gauge_with_registry!(
"jemalloc_mapped_bytes",
"Total number of bytes in active extents mapped by the allocator",
registry
)?,
metadata: stats::metadata::mib()?,
metadata_gauge: register_int_gauge_with_registry!(
"jemalloc_metadata_bytes",
"Total number of bytes dedicated to jemalloc metadata",
registry
)?,
resident: stats::resident::mib()?,
resident_gauge: register_int_gauge_with_registry!(
"jemalloc_resident_bytes",
"Total number of bytes in physically resident data pages mapped by the allocator",
registry
)?,
retained: stats::retained::mib()?,
retained_gauge: register_int_gauge_with_registry!(
"jemalloc_retained_bytes",
"Total number of bytes in virtual memory mappings that were retained rather than being returned to the operating system",
registry
)?,
})
}
fn _poll(&self) -> Result<(), anyhow::Error> {
self.epoch.advance()?;
self.active_gauge.set(self.active.read()? as i64);
self.allocated_gauge.set(self.allocated.read()? as i64);
self.mapped_gauge.set(self.mapped.read()? as i64);
self.metadata_gauge.set(self.metadata.read()? as i64);
self.resident_gauge.set(self.resident.read()? as i64);
self.retained_gauge.set(self.retained.read()? as i64);
Ok(())
}
#[inline]
pub fn poll(&self) {
if let Err(error) = self._poll() {
tracing::warn!(%error, "Failed to poll jemalloc stats");
}
}
pub fn start(self) -> tokio::task::JoinHandle<()> {
tokio::task::spawn(async move {
let mut interval = tokio::time::interval(Duration::from_secs(15));
interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
loop {
self.poll();
interval.tick().await;
}
inner: Metrics {
active_bytes: JemallocGaugeFamily(stats::active::mib()?),
allocated_bytes: JemallocGaugeFamily(stats::allocated::mib()?),
mapped_bytes: JemallocGaugeFamily(stats::mapped::mib()?),
metadata_bytes: JemallocGaugeFamily(stats::metadata::mib()?),
resident_bytes: JemallocGaugeFamily(stats::resident::mib()?),
retained_bytes: JemallocGaugeFamily(stats::retained::mib()?),
},
})
}
}
struct JemallocGauge<T>(PhantomData<T>);
impl<T> Default for JemallocGauge<T> {
fn default() -> Self {
JemallocGauge(PhantomData)
}
}
impl<T> MetricType for JemallocGauge<T> {
type Metadata = T;
}
struct JemallocGaugeFamily<T>(T);
impl<M, T: Encoding> MetricFamilyEncoding<T> for JemallocGaugeFamily<M>
where
JemallocGauge<M>: MetricEncoding<T, Metadata = M>,
{
fn collect_family_into(&self, name: impl MetricNameEncoder, enc: &mut T) -> Result<(), T::Err> {
JemallocGauge::write_type(&name, enc)?;
JemallocGauge(PhantomData).collect_into(&self.0, NoLabels, name, enc)
}
}
macro_rules! jemalloc_gauge {
($stat:ident, $mib:ident) => {
impl<W: std::io::Write> MetricEncoding<TextEncoder<W>> for JemallocGauge<stats::$mib> {
fn write_type(
name: impl MetricNameEncoder,
enc: &mut TextEncoder<W>,
) -> Result<(), std::io::Error> {
GaugeState::write_type(name, enc)
}
fn collect_into(
&self,
mib: &stats::$mib,
labels: impl LabelGroup,
name: impl MetricNameEncoder,
enc: &mut TextEncoder<W>,
) -> Result<(), std::io::Error> {
if let Ok(v) = mib.read() {
enc.write_metric_value(name, labels, MetricValue::Int(v as i64))?;
}
Ok(())
}
}
};
}
jemalloc_gauge!(active, active_mib);
jemalloc_gauge!(allocated, allocated_mib);
jemalloc_gauge!(mapped, mapped_mib);
jemalloc_gauge!(metadata, metadata_mib);
jemalloc_gauge!(resident, resident_mib);
jemalloc_gauge!(retained, retained_mib);

View File

@@ -127,6 +127,24 @@ macro_rules! smol_str_wrapper {
};
}
const POOLER_SUFFIX: &str = "-pooler";
pub trait Normalize {
fn normalize(&self) -> Self;
}
impl<S: Clone + AsRef<str> + From<String>> Normalize for S {
fn normalize(&self) -> Self {
if self.as_ref().ends_with(POOLER_SUFFIX) {
let mut s = self.as_ref().to_string();
s.truncate(s.len() - POOLER_SUFFIX.len());
s.into()
} else {
self.clone()
}
}
}
// 90% of role name strings are 20 characters or less.
smol_str_wrapper!(RoleName);
// 50% of endpoint strings are 23 characters or less.
@@ -140,3 +158,22 @@ smol_str_wrapper!(ProjectId);
smol_str_wrapper!(EndpointCacheKey);
smol_str_wrapper!(DbName);
// Endpoints are a bit tricky. Rare they might be branches or projects.
impl EndpointId {
pub fn is_endpoint(&self) -> bool {
self.0.starts_with("ep-")
}
pub fn is_branch(&self) -> bool {
self.0.starts_with("br-")
}
pub fn is_project(&self) -> bool {
!self.is_endpoint() && !self.is_branch()
}
pub fn as_branch(&self) -> BranchId {
BranchId(self.0.clone())
}
pub fn as_project(&self) -> ProjectId {
ProjectId(self.0.clone())
}
}

View File

@@ -1,176 +1,359 @@
use ::metrics::{
exponential_buckets, register_histogram, register_histogram_vec, register_hll_vec,
register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge,
register_int_gauge_vec, Histogram, HistogramVec, HyperLogLogVec, IntCounterPairVec,
IntCounterVec, IntGauge, IntGaugeVec,
};
use metrics::{
register_hll, register_int_counter, register_int_counter_pair, HyperLogLog, IntCounter,
IntCounterPair,
};
use std::sync::OnceLock;
use lasso::ThreadedRodeo;
use measured::{
label::StaticLabelSet,
metric::{histogram::Thresholds, name::MetricName},
Counter, CounterVec, FixedCardinalityLabel, Gauge, GaugeVec, Histogram, HistogramVec,
LabelGroup, MetricGroup,
};
use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec};
use once_cell::sync::Lazy;
use tokio::time::{self, Instant};
use crate::console::messages::ColdStartInfo;
pub static NUM_DB_CONNECTIONS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
register_int_counter_pair_vec!(
"proxy_opened_db_connections_total",
"Number of opened connections to a database.",
"proxy_closed_db_connections_total",
"Number of closed connections to a database.",
&["protocol"],
)
.unwrap()
});
#[derive(MetricGroup)]
pub struct Metrics {
#[metric(namespace = "proxy")]
pub proxy: ProxyMetrics,
pub static NUM_CLIENT_CONNECTION_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
register_int_counter_pair_vec!(
"proxy_opened_client_connections_total",
"Number of opened connections from a client.",
"proxy_closed_client_connections_total",
"Number of closed connections from a client.",
&["protocol"],
)
.unwrap()
});
#[metric(namespace = "wake_compute_lock")]
pub wake_compute_lock: ApiLockMetrics,
pub static NUM_CONNECTION_REQUESTS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
register_int_counter_pair_vec!(
"proxy_accepted_connections_total",
"Number of client connections accepted.",
"proxy_closed_connections_total",
"Number of client connections closed.",
&["protocol"],
)
.unwrap()
});
// the one metric not called proxy_....
pub semaphore_control_plane_limit: GaugeVec<StaticLabelSet<RateLimit>>,
}
pub static COMPUTE_CONNECTION_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"proxy_compute_connection_latency_seconds",
"Time it took for proxy to establish a connection to the compute endpoint",
// http/ws/tcp, true/false, true/false, success/failure, client/client_and_cplane
// 3 * 6 * 2 * 2 = 72 counters
&["protocol", "cold_start_info", "outcome", "excluded"],
// largest bucket = 2^16 * 0.5ms = 32s
exponential_buckets(0.0005, 2.0, 16).unwrap(),
)
.unwrap()
});
impl Metrics {
pub fn get() -> &'static Self {
static SELF: OnceLock<Metrics> = OnceLock::new();
SELF.get_or_init(|| Metrics {
proxy: ProxyMetrics::default(),
wake_compute_lock: ApiLockMetrics::new(),
semaphore_control_plane_limit: GaugeVec::default(),
})
}
}
pub static CONSOLE_REQUEST_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"proxy_console_request_latency",
"Time it took for proxy to establish a connection to the compute endpoint",
// proxy_wake_compute/proxy_get_role_info
&["request"],
#[derive(MetricGroup)]
#[metric(new())]
pub struct ProxyMetrics {
#[metric(flatten)]
pub db_connections: CounterPairVec<NumDbConnectionsGauge>,
#[metric(flatten)]
pub client_connections: CounterPairVec<NumClientConnectionsGauge>,
#[metric(flatten)]
pub connection_requests: CounterPairVec<NumConnectionRequestsGauge>,
#[metric(flatten)]
pub http_endpoint_pools: HttpEndpointPools,
/// Time it took for proxy to establish a connection to the compute endpoint.
// largest bucket = 2^16 * 0.5ms = 32s
#[metric(metadata = Thresholds::exponential_buckets(0.0005, 2.0))]
pub compute_connection_latency_seconds: HistogramVec<ComputeConnectionLatencySet, 16>,
/// Time it took for proxy to receive a response from control plane.
#[metric(
// largest bucket = 2^16 * 0.2ms = 13s
exponential_buckets(0.0002, 2.0, 16).unwrap(),
)
.unwrap()
});
metadata = Thresholds::exponential_buckets(0.0002, 2.0),
)]
pub console_request_latency: HistogramVec<ConsoleRequestSet, 16>,
pub static ALLOWED_IPS_BY_CACHE_OUTCOME: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_allowed_ips_cache_misses",
"Number of cache hits/misses for allowed ips",
// hit/miss
&["outcome"],
)
.unwrap()
});
/// Time it takes to acquire a token to call console plane.
// largest bucket = 3^16 * 0.05ms = 2.15s
#[metric(metadata = Thresholds::exponential_buckets(0.00005, 3.0))]
pub control_plane_token_acquire_seconds: Histogram<16>,
pub static RATE_LIMITER_ACQUIRE_LATENCY: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"proxy_control_plane_token_acquire_seconds",
"Time it took for proxy to establish a connection to the compute endpoint",
// largest bucket = 3^16 * 0.05ms = 2.15s
exponential_buckets(0.00005, 3.0, 16).unwrap(),
)
.unwrap()
});
/// Size of the HTTP request body lengths.
// smallest bucket = 16 bytes
// largest bucket = 4^12 * 16 bytes = 256MB
#[metric(metadata = Thresholds::exponential_buckets(16.0, 4.0))]
pub http_conn_content_length_bytes: HistogramVec<StaticLabelSet<HttpDirection>, 12>,
pub static RATE_LIMITER_LIMIT: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"semaphore_control_plane_limit",
"Current limit of the semaphore control plane",
&["limit"], // 2 counters
)
.unwrap()
});
/// Time it takes to reclaim unused connection pools.
#[metric(metadata = Thresholds::exponential_buckets(1e-6, 2.0))]
pub http_pool_reclaimation_lag_seconds: Histogram<16>,
pub static NUM_CONNECTION_ACCEPTED_BY_SNI: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_accepted_connections_by_sni",
"Number of connections (per sni).",
&["kind"],
)
.unwrap()
});
/// Number of opened connections to a database.
pub http_pool_opened_connections: Gauge,
pub static ALLOWED_IPS_NUMBER: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"proxy_allowed_ips_number",
"Number of allowed ips",
vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0],
)
.unwrap()
});
/// Number of cache hits/misses for allowed ips.
pub allowed_ips_cache_misses: CounterVec<StaticLabelSet<CacheOutcome>>,
pub static HTTP_CONTENT_LENGTH: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"proxy_http_conn_content_length_bytes",
"Number of bytes the HTTP response content consumes",
// request/response
&["direction"],
// smallest bucket = 16 bytes
// largest bucket = 4^12 * 16 bytes = 256MB
exponential_buckets(16.0, 4.0, 12).unwrap()
)
.unwrap()
});
/// Number of allowed ips
#[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))]
pub allowed_ips_number: Histogram<10>,
pub static GC_LATENCY: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"proxy_http_pool_reclaimation_lag_seconds",
"Time it takes to reclaim unused connection pools",
// 1us -> 65ms
exponential_buckets(1e-6, 2.0, 16).unwrap(),
)
.unwrap()
});
/// Number of connections (per sni).
pub accepted_connections_by_sni: CounterVec<StaticLabelSet<SniKind>>,
pub static ENDPOINT_POOLS: Lazy<IntCounterPair> = Lazy::new(|| {
register_int_counter_pair!(
"proxy_http_pool_endpoints_registered_total",
"Number of endpoints we have registered pools for",
"proxy_http_pool_endpoints_unregistered_total",
"Number of endpoints we have unregistered pools for",
)
.unwrap()
});
/// Number of connection failures (per kind).
pub connection_failures_total: CounterVec<StaticLabelSet<ConnectionFailureKind>>,
pub static NUM_OPEN_CLIENTS_IN_HTTP_POOL: Lazy<IntGauge> = Lazy::new(|| {
register_int_gauge!(
"proxy_http_pool_opened_connections",
"Number of opened connections to a database.",
)
.unwrap()
});
/// Number of wake-up failures (per kind).
pub connection_failures_breakdown: CounterVec<ConnectionFailuresBreakdownSet>,
pub static NUM_CANCELLATION_REQUESTS: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_cancellation_requests_total",
"Number of cancellation requests (per found/not_found).",
&["source", "kind"],
)
.unwrap()
});
/// Number of bytes sent/received between all clients and backends.
pub io_bytes: CounterVec<StaticLabelSet<Direction>>,
pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT: &str = "from_client";
pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS: &str = "from_redis";
/// Number of errors by a given classification.
pub errors_total: CounterVec<StaticLabelSet<crate::error::ErrorKind>>,
/// Number of cancellation requests (per found/not_found).
pub cancellation_requests_total: CounterVec<CancellationRequestSet>,
/// Number of errors by a given classification
pub redis_errors_total: CounterVec<RedisErrorsSet>,
/// Number of TLS handshake failures
pub tls_handshake_failures: Counter,
/// Number of connection requests affected by authentication rate limits
pub requests_auth_rate_limits_total: Counter,
/// HLL approximate cardinality of endpoints that are connecting
pub connecting_endpoints: HyperLogLogVec<StaticLabelSet<Protocol>, 32>,
/// Number of endpoints affected by errors of a given classification
pub endpoints_affected_by_errors: HyperLogLogVec<StaticLabelSet<crate::error::ErrorKind>, 32>,
/// Number of endpoints affected by authentication rate limits
pub endpoints_auth_rate_limits: HyperLogLog<32>,
/// Number of invalid endpoints (per protocol, per rejected).
pub invalid_endpoints_total: CounterVec<InvalidEndpointsSet>,
}
#[derive(MetricGroup)]
#[metric(new())]
pub struct ApiLockMetrics {
/// Number of semaphores registered in this api lock
pub semaphores_registered: Counter,
/// Number of semaphores unregistered in this api lock
pub semaphores_unregistered: Counter,
/// Time it takes to reclaim unused semaphores in the api lock
#[metric(metadata = Thresholds::exponential_buckets(1e-6, 2.0))]
pub reclamation_lag_seconds: Histogram<16>,
/// Time it takes to acquire a semaphore lock
#[metric(metadata = Thresholds::exponential_buckets(1e-4, 2.0))]
pub semaphore_acquire_seconds: Histogram<16>,
}
impl Default for ProxyMetrics {
fn default() -> Self {
Self::new()
}
}
#[derive(FixedCardinalityLabel, Copy, Clone)]
#[label(singleton = "direction")]
pub enum HttpDirection {
Request,
Response,
}
#[derive(FixedCardinalityLabel, Copy, Clone)]
#[label(singleton = "direction")]
pub enum Direction {
Tx,
Rx,
}
#[derive(FixedCardinalityLabel, Clone, Copy, Debug)]
#[label(singleton = "protocol")]
pub enum Protocol {
Http,
Ws,
Tcp,
SniRouter,
}
impl Protocol {
pub fn as_str(&self) -> &'static str {
match self {
Protocol::Http => "http",
Protocol::Ws => "ws",
Protocol::Tcp => "tcp",
Protocol::SniRouter => "sni_router",
}
}
}
impl std::fmt::Display for Protocol {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str(self.as_str())
}
}
#[derive(FixedCardinalityLabel, Copy, Clone)]
pub enum Bool {
True,
False,
}
#[derive(FixedCardinalityLabel, Copy, Clone)]
#[label(singleton = "outcome")]
pub enum Outcome {
Success,
Failed,
}
#[derive(FixedCardinalityLabel, Copy, Clone)]
#[label(singleton = "outcome")]
pub enum CacheOutcome {
Hit,
Miss,
}
#[derive(LabelGroup)]
#[label(set = ConsoleRequestSet)]
pub struct ConsoleRequest<'a> {
#[label(dynamic_with = ThreadedRodeo, default)]
pub request: &'a str,
}
#[derive(MetricGroup, Default)]
pub struct HttpEndpointPools {
/// Number of endpoints we have registered pools for
pub http_pool_endpoints_registered_total: Counter,
/// Number of endpoints we have unregistered pools for
pub http_pool_endpoints_unregistered_total: Counter,
}
pub struct HttpEndpointPoolsGuard<'a> {
dec: &'a Counter,
}
impl Drop for HttpEndpointPoolsGuard<'_> {
fn drop(&mut self) {
self.dec.inc();
}
}
impl HttpEndpointPools {
pub fn guard(&self) -> HttpEndpointPoolsGuard {
self.http_pool_endpoints_registered_total.inc();
HttpEndpointPoolsGuard {
dec: &self.http_pool_endpoints_unregistered_total,
}
}
}
pub struct NumDbConnectionsGauge;
impl CounterPairAssoc for NumDbConnectionsGauge {
const INC_NAME: &'static MetricName = MetricName::from_str("opened_db_connections_total");
const DEC_NAME: &'static MetricName = MetricName::from_str("closed_db_connections_total");
const INC_HELP: &'static str = "Number of opened connections to a database.";
const DEC_HELP: &'static str = "Number of closed connections to a database.";
type LabelGroupSet = StaticLabelSet<Protocol>;
}
pub type NumDbConnectionsGuard<'a> = metrics::MeasuredCounterPairGuard<'a, NumDbConnectionsGauge>;
pub struct NumClientConnectionsGauge;
impl CounterPairAssoc for NumClientConnectionsGauge {
const INC_NAME: &'static MetricName = MetricName::from_str("opened_client_connections_total");
const DEC_NAME: &'static MetricName = MetricName::from_str("closed_client_connections_total");
const INC_HELP: &'static str = "Number of opened connections from a client.";
const DEC_HELP: &'static str = "Number of closed connections from a client.";
type LabelGroupSet = StaticLabelSet<Protocol>;
}
pub type NumClientConnectionsGuard<'a> =
metrics::MeasuredCounterPairGuard<'a, NumClientConnectionsGauge>;
pub struct NumConnectionRequestsGauge;
impl CounterPairAssoc for NumConnectionRequestsGauge {
const INC_NAME: &'static MetricName = MetricName::from_str("accepted_connections_total");
const DEC_NAME: &'static MetricName = MetricName::from_str("closed_connections_total");
const INC_HELP: &'static str = "Number of client connections accepted.";
const DEC_HELP: &'static str = "Number of client connections closed.";
type LabelGroupSet = StaticLabelSet<Protocol>;
}
pub type NumConnectionRequestsGuard<'a> =
metrics::MeasuredCounterPairGuard<'a, NumConnectionRequestsGauge>;
#[derive(LabelGroup)]
#[label(set = ComputeConnectionLatencySet)]
pub struct ComputeConnectionLatencyGroup {
protocol: Protocol,
cold_start_info: ColdStartInfo,
outcome: ConnectOutcome,
excluded: LatencyExclusions,
}
#[derive(FixedCardinalityLabel, Copy, Clone)]
pub enum LatencyExclusions {
Client,
ClientAndCplane,
}
#[derive(FixedCardinalityLabel, Copy, Clone)]
#[label(singleton = "limit")]
pub enum RateLimit {
Actual,
Expected,
}
#[derive(FixedCardinalityLabel, Copy, Clone)]
#[label(singleton = "kind")]
pub enum SniKind {
Sni,
NoSni,
PasswordHack,
}
#[derive(FixedCardinalityLabel, Copy, Clone)]
#[label(singleton = "kind")]
pub enum ConnectionFailureKind {
ComputeCached,
ComputeUncached,
}
#[derive(FixedCardinalityLabel, Copy, Clone)]
#[label(singleton = "kind")]
pub enum WakeupFailureKind {
BadComputeAddress,
ApiTransportError,
QuotaExceeded,
ApiConsoleLocked,
ApiConsoleBadRequest,
ApiConsoleOtherServerError,
ApiConsoleOtherError,
TimeoutError,
}
#[derive(LabelGroup)]
#[label(set = ConnectionFailuresBreakdownSet)]
pub struct ConnectionFailuresBreakdownGroup {
pub kind: WakeupFailureKind,
pub retry: Bool,
}
#[derive(LabelGroup, Copy, Clone)]
#[label(set = RedisErrorsSet)]
pub struct RedisErrors<'a> {
#[label(dynamic_with = ThreadedRodeo, default)]
pub channel: &'a str,
}
#[derive(FixedCardinalityLabel, Copy, Clone)]
pub enum CancellationSource {
FromClient,
FromRedis,
Local,
}
#[derive(FixedCardinalityLabel, Copy, Clone)]
pub enum CancellationOutcome {
NotFound,
Found,
}
#[derive(LabelGroup)]
#[label(set = CancellationRequestSet)]
pub struct CancellationRequest {
pub source: CancellationSource,
pub kind: CancellationOutcome,
}
pub enum Waiting {
Cplane,
@@ -185,20 +368,6 @@ struct Accumulated {
compute: time::Duration,
}
enum Outcome {
Success,
Failed,
}
impl Outcome {
fn as_str(&self) -> &'static str {
match self {
Outcome::Success => "success",
Outcome::Failed => "failed",
}
}
}
pub struct LatencyTimer {
// time since the stopwatch was started
start: time::Instant,
@@ -207,9 +376,9 @@ pub struct LatencyTimer {
// accumulated time on the stopwatch
accumulated: Accumulated,
// label data
protocol: &'static str,
protocol: Protocol,
cold_start_info: ColdStartInfo,
outcome: Outcome,
outcome: ConnectOutcome,
}
pub struct LatencyTimerPause<'a> {
@@ -219,7 +388,7 @@ pub struct LatencyTimerPause<'a> {
}
impl LatencyTimer {
pub fn new(protocol: &'static str) -> Self {
pub fn new(protocol: Protocol) -> Self {
Self {
start: time::Instant::now(),
stop: None,
@@ -227,7 +396,7 @@ impl LatencyTimer {
protocol,
cold_start_info: ColdStartInfo::Unknown,
// assume failed unless otherwise specified
outcome: Outcome::Failed,
outcome: ConnectOutcome::Failed,
}
}
@@ -248,7 +417,7 @@ impl LatencyTimer {
self.stop = Some(time::Instant::now());
// success
self.outcome = Outcome::Success;
self.outcome = ConnectOutcome::Success;
}
}
@@ -263,128 +432,62 @@ impl Drop for LatencyTimerPause<'_> {
}
}
#[derive(FixedCardinalityLabel, Clone, Copy, Debug)]
pub enum ConnectOutcome {
Success,
Failed,
}
impl Drop for LatencyTimer {
fn drop(&mut self) {
let duration = self
.stop
.unwrap_or_else(time::Instant::now)
.duration_since(self.start);
// Excluding cplane communication from the accumulated time.
COMPUTE_CONNECTION_LATENCY
.with_label_values(&[
self.protocol,
self.cold_start_info.as_str(),
self.outcome.as_str(),
"client",
])
.observe((duration.saturating_sub(self.accumulated.client)).as_secs_f64());
let metric = &Metrics::get().proxy.compute_connection_latency_seconds;
// Excluding client communication from the accumulated time.
metric.observe(
ComputeConnectionLatencyGroup {
protocol: self.protocol,
cold_start_info: self.cold_start_info,
outcome: self.outcome,
excluded: LatencyExclusions::Client,
},
duration
.saturating_sub(self.accumulated.client)
.as_secs_f64(),
);
// Exclude client and cplane communication from the accumulated time.
let accumulated_total = self.accumulated.client + self.accumulated.cplane;
COMPUTE_CONNECTION_LATENCY
.with_label_values(&[
self.protocol,
self.cold_start_info.as_str(),
self.outcome.as_str(),
"client_and_cplane",
])
.observe((duration.saturating_sub(accumulated_total)).as_secs_f64());
metric.observe(
ComputeConnectionLatencyGroup {
protocol: self.protocol,
cold_start_info: self.cold_start_info,
outcome: self.outcome,
excluded: LatencyExclusions::ClientAndCplane,
},
duration.saturating_sub(accumulated_total).as_secs_f64(),
);
}
}
pub static NUM_CONNECTION_FAILURES: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_connection_failures_total",
"Number of connection failures (per kind).",
&["kind"],
)
.unwrap()
});
pub static NUM_WAKEUP_FAILURES: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_connection_failures_breakdown",
"Number of wake-up failures (per kind).",
&["retry", "kind"],
)
.unwrap()
});
pub static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_io_bytes",
"Number of bytes sent/received between all clients and backends.",
&["direction"],
)
.unwrap()
});
pub const fn bool_to_str(x: bool) -> &'static str {
if x {
"true"
} else {
"false"
impl From<bool> for Bool {
fn from(value: bool) -> Self {
if value {
Bool::True
} else {
Bool::False
}
}
}
pub static CONNECTING_ENDPOINTS: Lazy<HyperLogLogVec<32>> = Lazy::new(|| {
register_hll_vec!(
32,
"proxy_connecting_endpoints",
"HLL approximate cardinality of endpoints that are connecting",
&["protocol"],
)
.unwrap()
});
pub static ERROR_BY_KIND: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_errors_total",
"Number of errors by a given classification",
&["type"],
)
.unwrap()
});
pub static ENDPOINT_ERRORS_BY_KIND: Lazy<HyperLogLogVec<32>> = Lazy::new(|| {
register_hll_vec!(
32,
"proxy_endpoints_affected_by_errors",
"Number of endpoints affected by errors of a given classification",
&["type"],
)
.unwrap()
});
pub static REDIS_BROKEN_MESSAGES: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_redis_errors_total",
"Number of errors by a given classification",
&["channel"],
)
.unwrap()
});
pub static TLS_HANDSHAKE_FAILURES: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"proxy_tls_handshake_failures",
"Number of TLS handshake failures",
)
.unwrap()
});
pub static ENDPOINTS_AUTH_RATE_LIMITED: Lazy<HyperLogLog<32>> = Lazy::new(|| {
register_hll!(
32,
"proxy_endpoints_auth_rate_limits",
"Number of endpoints affected by authentication rate limits",
)
.unwrap()
});
pub static AUTH_RATE_LIMIT_HITS: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"proxy_requests_auth_rate_limits_total",
"Number of connection requests affected by authentication rate limits",
)
.unwrap()
});
#[derive(LabelGroup)]
#[label(set = InvalidEndpointsSet)]
pub struct InvalidEndpointsGroup {
pub protocol: Protocol,
pub rejected: Bool,
pub outcome: ConnectOutcome,
}

View File

@@ -5,19 +5,13 @@ use std::{
io,
net::SocketAddr,
pin::{pin, Pin},
sync::Mutex,
task::{ready, Context, Poll},
};
use bytes::{Buf, BytesMut};
use hyper::server::accept::Accept;
use hyper::server::conn::{AddrIncoming, AddrStream};
use metrics::IntCounterPairGuard;
use hyper::server::conn::AddrIncoming;
use pin_project_lite::pin_project;
use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf};
use uuid::Uuid;
use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE;
pub struct ProxyProtocolAccept {
pub incoming: AddrIncoming,
@@ -331,103 +325,6 @@ impl<T: AsyncRead> AsyncRead for WithClientIp<T> {
}
}
impl Accept for ProxyProtocolAccept {
type Conn = WithConnectionGuard<WithClientIp<AddrStream>>;
type Error = io::Error;
fn poll_accept(
mut self: Pin<&mut Self>,
cx: &mut Context<'_>,
) -> Poll<Option<Result<Self::Conn, Self::Error>>> {
let conn = ready!(Pin::new(&mut self.incoming).poll_accept(cx)?);
let conn_id = uuid::Uuid::new_v4();
let span = tracing::info_span!("http_conn", ?conn_id);
{
let _enter = span.enter();
tracing::info!("accepted new TCP connection");
}
let Some(conn) = conn else {
return Poll::Ready(None);
};
Poll::Ready(Some(Ok(WithConnectionGuard {
inner: WithClientIp::new(conn),
connection_id: Uuid::new_v4(),
gauge: Mutex::new(Some(
NUM_CLIENT_CONNECTION_GAUGE
.with_label_values(&[self.protocol])
.guard(),
)),
span,
})))
}
}
pin_project! {
pub struct WithConnectionGuard<T> {
#[pin]
pub inner: T,
pub connection_id: Uuid,
pub gauge: Mutex<Option<IntCounterPairGuard>>,
pub span: tracing::Span,
}
impl<S> PinnedDrop for WithConnectionGuard<S> {
fn drop(this: Pin<&mut Self>) {
let _enter = this.span.enter();
tracing::info!("HTTP connection closed")
}
}
}
impl<T: AsyncWrite> AsyncWrite for WithConnectionGuard<T> {
#[inline]
fn poll_write(
self: Pin<&mut Self>,
cx: &mut Context<'_>,
buf: &[u8],
) -> Poll<Result<usize, io::Error>> {
self.project().inner.poll_write(cx, buf)
}
#[inline]
fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Result<(), io::Error>> {
self.project().inner.poll_flush(cx)
}
#[inline]
fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Result<(), io::Error>> {
self.project().inner.poll_shutdown(cx)
}
#[inline]
fn poll_write_vectored(
self: Pin<&mut Self>,
cx: &mut Context<'_>,
bufs: &[io::IoSlice<'_>],
) -> Poll<Result<usize, io::Error>> {
self.project().inner.poll_write_vectored(cx, bufs)
}
#[inline]
fn is_write_vectored(&self) -> bool {
self.inner.is_write_vectored()
}
}
impl<T: AsyncRead> AsyncRead for WithConnectionGuard<T> {
fn poll_read(
self: Pin<&mut Self>,
cx: &mut Context<'_>,
buf: &mut ReadBuf<'_>,
) -> Poll<io::Result<()>> {
self.project().inner.poll_read(cx, buf)
}
}
#[cfg(test)]
mod tests {
use std::pin::pin;

View File

@@ -15,16 +15,15 @@ use crate::{
config::{ProxyConfig, TlsConfig},
context::RequestMonitoring,
error::ReportableError,
metrics::{NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE},
metrics::{Metrics, NumClientConnectionsGuard},
protocol2::WithClientIp,
proxy::handshake::{handshake, HandshakeData},
rate_limiter::EndpointRateLimiter,
stream::{PqStream, Stream},
EndpointCacheKey,
EndpointCacheKey, Normalize,
};
use futures::TryFutureExt;
use itertools::Itertools;
use metrics::IntCounterPairGuard;
use once_cell::sync::OnceCell;
use pq_proto::{BeMessage as Be, StartupMessageParams};
use regex::Regex;
@@ -79,9 +78,10 @@ pub async fn task_main(
{
let (socket, peer_addr) = accept_result?;
let conn_gauge = NUM_CLIENT_CONNECTION_GAUGE
.with_label_values(&["tcp"])
.guard();
let conn_gauge = Metrics::get()
.proxy
.client_connections
.guard(crate::metrics::Protocol::Tcp);
let session_id = uuid::Uuid::new_v4();
let cancellation_handler = Arc::clone(&cancellation_handler);
@@ -113,7 +113,12 @@ pub async fn task_main(
},
};
let mut ctx = RequestMonitoring::new(session_id, peer_addr, "tcp", &config.region);
let mut ctx = RequestMonitoring::new(
session_id,
peer_addr,
crate::metrics::Protocol::Tcp,
&config.region,
);
let span = ctx.span.clone();
let res = handle_client(
@@ -237,14 +242,17 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
stream: S,
mode: ClientMode,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
conn_gauge: IntCounterPairGuard,
conn_gauge: NumClientConnectionsGuard<'static>,
) -> Result<Option<ProxyPassthrough<CancellationHandlerMainInternal, S>>, ClientRequestError> {
info!("handling interactive connection from client");
info!(
protocol = %ctx.protocol,
"handling interactive connection from client"
);
let metrics = &Metrics::get().proxy;
let proto = ctx.protocol;
let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE
.with_label_values(&[proto])
.guard();
// let _client_gauge = metrics.client_connections.guard(proto);
let _request_gauge = metrics.connection_requests.guard(proto);
let tls = config.tls_config.as_ref();
@@ -280,7 +288,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
// check rate limit
if let Some(ep) = user_info.get_endpoint() {
if !endpoint_rate_limiter.check(ep, 1) {
if !endpoint_rate_limiter.check(ep.normalize(), 1) {
return stream
.throw_error(auth::AuthError::too_many_connections())
.await?;

View File

@@ -4,7 +4,7 @@ use crate::{
console::{self, errors::WakeComputeError, CachedNodeInfo, NodeInfo},
context::RequestMonitoring,
error::ReportableError,
metrics::NUM_CONNECTION_FAILURES,
metrics::{ConnectionFailureKind, Metrics},
proxy::{
retry::{retry_after, ShouldRetry},
wake_compute::wake_compute,
@@ -27,10 +27,10 @@ pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> NodeInfo {
warn!("invalidating stalled compute node info cache entry");
}
let label = match is_cached {
true => "compute_cached",
false => "compute_uncached",
true => ConnectionFailureKind::ComputeCached,
false => ConnectionFailureKind::ComputeUncached,
};
NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc();
Metrics::get().proxy.connection_failures_total.inc(label);
node_info.invalidate()
}

View File

@@ -2,11 +2,10 @@ use crate::{
cancellation,
compute::PostgresConnection,
console::messages::MetricsAuxInfo,
metrics::NUM_BYTES_PROXIED_COUNTER,
metrics::{Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard},
stream::Stream,
usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS},
};
use metrics::IntCounterPairGuard;
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::info;
use utils::measured_stream::MeasuredStream;
@@ -23,24 +22,25 @@ pub async fn proxy_pass(
branch_id: aux.branch_id,
});
let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]);
let metrics = &Metrics::get().proxy.io_bytes;
let m_sent = metrics.with_labels(Direction::Tx);
let mut client = MeasuredStream::new(
client,
|_| {},
|cnt| {
// Number of bytes we sent to the client (outbound).
m_sent.inc_by(cnt as u64);
metrics.get_metric(m_sent).inc_by(cnt as u64);
usage.record_egress(cnt as u64);
},
);
let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx"]);
let m_recv = metrics.with_labels(Direction::Rx);
let mut compute = MeasuredStream::new(
compute,
|_| {},
|cnt| {
// Number of bytes the client sent to the compute node (inbound).
m_recv.inc_by(cnt as u64);
metrics.get_metric(m_recv).inc_by(cnt as u64);
},
);
@@ -60,8 +60,8 @@ pub struct ProxyPassthrough<P, S> {
pub compute: PostgresConnection,
pub aux: MetricsAuxInfo,
pub req: IntCounterPairGuard,
pub conn: IntCounterPairGuard,
pub req: NumConnectionRequestsGuard<'static>,
pub conn: NumClientConnectionsGuard<'static>,
pub cancel: cancellation::Session<P>,
}

View File

@@ -1,6 +1,6 @@
use crate::console::{errors::WakeComputeError, provider::CachedNodeInfo};
use crate::context::RequestMonitoring;
use crate::metrics::{bool_to_str, NUM_WAKEUP_FAILURES};
use crate::metrics::{ConnectionFailuresBreakdownGroup, Metrics, WakeupFailureKind};
use crate::proxy::retry::retry_after;
use hyper::StatusCode;
use std::ops::ControlFlow;
@@ -57,39 +57,46 @@ pub fn handle_try_wake(
fn report_error(e: &WakeComputeError, retry: bool) {
use crate::console::errors::ApiError;
let retry = bool_to_str(retry);
let kind = match e {
WakeComputeError::BadComputeAddress(_) => "bad_compute_address",
WakeComputeError::ApiError(ApiError::Transport(_)) => "api_transport_error",
WakeComputeError::BadComputeAddress(_) => WakeupFailureKind::BadComputeAddress,
WakeComputeError::ApiError(ApiError::Transport(_)) => WakeupFailureKind::ApiTransportError,
WakeComputeError::ApiError(ApiError::Console {
status: StatusCode::LOCKED,
ref text,
}) if text.contains("written data quota exceeded")
|| text.contains("the limit for current plan reached") =>
{
"quota_exceeded"
WakeupFailureKind::QuotaExceeded
}
WakeComputeError::ApiError(ApiError::Console {
status: StatusCode::UNPROCESSABLE_ENTITY,
ref text,
}) if text.contains("compute time quota of non-primary branches is exceeded") => {
"quota_exceeded"
WakeupFailureKind::QuotaExceeded
}
WakeComputeError::ApiError(ApiError::Console {
status: StatusCode::LOCKED,
..
}) => "api_console_locked",
}) => WakeupFailureKind::ApiConsoleLocked,
WakeComputeError::ApiError(ApiError::Console {
status: StatusCode::BAD_REQUEST,
..
}) => "api_console_bad_request",
}) => WakeupFailureKind::ApiConsoleBadRequest,
WakeComputeError::ApiError(ApiError::Console { status, .. })
if status.is_server_error() =>
{
"api_console_other_server_error"
WakeupFailureKind::ApiConsoleOtherServerError
}
WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error",
WakeComputeError::TimeoutError => "timeout_error",
WakeComputeError::ApiError(ApiError::Console { .. }) => {
WakeupFailureKind::ApiConsoleOtherError
}
WakeComputeError::TimeoutError => WakeupFailureKind::TimeoutError,
};
NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc();
Metrics::get()
.proxy
.connection_failures_breakdown
.inc(ConnectionFailuresBreakdownGroup {
kind,
retry: retry.into(),
});
}

View File

@@ -4,4 +4,4 @@ mod limiter;
pub use aimd::Aimd;
pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig};
pub use limiter::Limiter;
pub use limiter::{AuthRateLimiter, EndpointRateLimiter, RateBucketInfo, RedisRateLimiter};
pub use limiter::{AuthRateLimiter, EndpointRateLimiter, GlobalRateLimiter, RateBucketInfo};

View File

@@ -17,20 +17,26 @@ use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit};
use tokio::time::{timeout, Duration, Instant};
use tracing::info;
use crate::{intern::EndpointIdInt, EndpointId};
use crate::{
intern::EndpointIdInt,
{
metrics::{Metrics, RateLimit},
EndpointId,
},
};
use super::{
limit_algorithm::{LimitAlgorithm, Sample},
RateLimiterConfig,
};
pub struct RedisRateLimiter {
pub struct GlobalRateLimiter {
data: Vec<RateBucket>,
info: &'static [RateBucketInfo],
info: Vec<RateBucketInfo>,
}
impl RedisRateLimiter {
pub fn new(info: &'static [RateBucketInfo]) -> Self {
impl GlobalRateLimiter {
pub fn new(info: Vec<RateBucketInfo>) -> Self {
Self {
data: vec![
RateBucket {
@@ -50,7 +56,7 @@ impl RedisRateLimiter {
let should_allow_request = self
.data
.iter_mut()
.zip(self.info)
.zip(&self.info)
.all(|(bucket, info)| bucket.should_allow_request(info, now, 1));
if should_allow_request {
@@ -457,12 +463,9 @@ impl Limiter {
}
new_limit
};
crate::metrics::RATE_LIMITER_LIMIT
.with_label_values(&["expected"])
.set(new_limit as i64);
crate::metrics::RATE_LIMITER_LIMIT
.with_label_values(&["actual"])
.set(actual_limit as i64);
let metric = &Metrics::get().semaphore_control_plane_limit;
metric.set(RateLimit::Expected, new_limit as i64);
metric.set(RateLimit::Actual, actual_limit as i64);
self.limits.store(new_limit, Ordering::Release);
#[cfg(test)]
if let Some(n) = &self.notifier {
@@ -519,7 +522,10 @@ impl reqwest_middleware::Middleware for Limiter {
extensions: &mut task_local_extensions::Extensions,
next: reqwest_middleware::Next<'_>,
) -> reqwest_middleware::Result<reqwest::Response> {
let start = Instant::now();
let timer = Metrics::get()
.proxy
.control_plane_token_acquire_seconds
.start_timer();
let token = self
.acquire_timeout(self.config.timeout)
.await
@@ -533,8 +539,12 @@ impl reqwest_middleware::Middleware for Limiter {
.into(),
)
})?;
info!(duration = ?start.elapsed(), "waiting for token to connect to the control plane");
crate::metrics::RATE_LIMITER_ACQUIRE_LATENCY.observe(start.elapsed().as_secs_f64());
let duration = timer.observe();
info!(
?duration,
"waiting for token to connect to the control plane"
);
match next.run(req, extensions).await {
Ok(response) => {
self.release(token, Some(Outcome::from_reqwest_response(&response)))

View File

@@ -5,7 +5,7 @@ use redis::AsyncCommands;
use tokio::sync::Mutex;
use uuid::Uuid;
use crate::rate_limiter::{RateBucketInfo, RedisRateLimiter};
use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo};
use super::{
connection_with_credentials_provider::ConnectionWithCredentialsProvider,
@@ -80,7 +80,7 @@ impl<P: CancellationPublisherMut> CancellationPublisher for Arc<Mutex<P>> {
pub struct RedisPublisherClient {
client: ConnectionWithCredentialsProvider,
region_id: String,
limiter: RedisRateLimiter,
limiter: GlobalRateLimiter,
}
impl RedisPublisherClient {
@@ -92,7 +92,7 @@ impl RedisPublisherClient {
Ok(Self {
client,
region_id,
limiter: RedisRateLimiter::new(info),
limiter: GlobalRateLimiter::new(info.into()),
})
}

View File

@@ -11,7 +11,7 @@ use crate::{
cache::project_info::ProjectInfoCache,
cancellation::{CancelMap, CancellationHandler},
intern::{ProjectIdInt, RoleNameInt},
metrics::{NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS, REDIS_BROKEN_MESSAGES},
metrics::{Metrics, RedisErrors},
};
const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates";
@@ -104,9 +104,9 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
let msg: Notification = match serde_json::from_str(&payload) {
Ok(msg) => msg,
Err(e) => {
REDIS_BROKEN_MESSAGES
.with_label_values(&[msg.get_channel_name()])
.inc();
Metrics::get().proxy.redis_errors_total.inc(RedisErrors {
channel: msg.get_channel_name(),
});
tracing::error!("broken message: {e}");
return Ok(());
}
@@ -183,7 +183,7 @@ where
cache,
Arc::new(CancellationHandler::<()>::new(
cancel_map,
NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS,
crate::metrics::CancellationSource::FromRedis,
)),
region_id,
);

View File

@@ -4,42 +4,48 @@
mod backend;
mod conn_pool;
mod http_util;
mod json;
mod sql_over_http;
pub mod tls_listener;
mod websocket;
use atomic_take::AtomicTake;
use bytes::Bytes;
pub use conn_pool::GlobalConnPoolOptions;
use anyhow::bail;
use hyper::StatusCode;
use metrics::IntCounterPairGuard;
use anyhow::Context;
use futures::future::{select, Either};
use futures::TryFutureExt;
use http::{Method, Response, StatusCode};
use http_body_util::Full;
use hyper1::body::Incoming;
use hyper_util::rt::TokioExecutor;
use hyper_util::server::conn::auto::Builder;
use rand::rngs::StdRng;
use rand::SeedableRng;
pub use reqwest_middleware::{ClientWithMiddleware, Error};
pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
use tokio::time::timeout;
use tokio_rustls::TlsAcceptor;
use tokio_util::task::TaskTracker;
use tracing::instrument::Instrumented;
use crate::cancellation::CancellationHandlerMain;
use crate::config::ProxyConfig;
use crate::context::RequestMonitoring;
use crate::protocol2::{ProxyProtocolAccept, WithClientIp, WithConnectionGuard};
use crate::metrics::Metrics;
use crate::protocol2::WithClientIp;
use crate::proxy::run_until_cancelled;
use crate::rate_limiter::EndpointRateLimiter;
use crate::serverless::backend::PoolingBackend;
use hyper::{
server::conn::{AddrIncoming, AddrStream},
Body, Method, Request, Response,
};
use crate::serverless::http_util::{api_error_into_response, json_response};
use std::net::IpAddr;
use std::net::{IpAddr, SocketAddr};
use std::pin::pin;
use std::sync::Arc;
use std::task::Poll;
use tls_listener::TlsListener;
use tokio::net::TcpListener;
use tokio_util::sync::{CancellationToken, DropGuard};
use tokio::net::{TcpListener, TcpStream};
use tokio_util::sync::CancellationToken;
use tracing::{error, info, warn, Instrument};
use utils::http::{error::ApiError, json::json_response};
use utils::http::error::ApiError;
pub const SERVERLESS_DRIVER_SNI: &str = "api";
@@ -91,161 +97,175 @@ pub async fn task_main(
tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()];
let tls_acceptor: tokio_rustls::TlsAcceptor = Arc::new(tls_server_config).into();
let mut addr_incoming = AddrIncoming::from_listener(ws_listener)?;
let _ = addr_incoming.set_nodelay(true);
let addr_incoming = ProxyProtocolAccept {
incoming: addr_incoming,
protocol: "http",
};
let connections = tokio_util::task::task_tracker::TaskTracker::new();
connections.close(); // allows `connections.wait to complete`
let ws_connections = tokio_util::task::task_tracker::TaskTracker::new();
ws_connections.close(); // allows `ws_connections.wait to complete`
let server = Builder::new(hyper_util::rt::TokioExecutor::new());
let tls_listener = TlsListener::new(tls_acceptor, addr_incoming, config.handshake_timeout);
while let Some(res) = run_until_cancelled(ws_listener.accept(), &cancellation_token).await {
let (conn, peer_addr) = res.context("could not accept TCP stream")?;
if let Err(e) = conn.set_nodelay(true) {
tracing::error!("could not set nodelay: {e}");
continue;
}
let conn_id = uuid::Uuid::new_v4();
let http_conn_span = tracing::info_span!("http_conn", ?conn_id);
let make_svc = hyper::service::make_service_fn(
|stream: &tokio_rustls::server::TlsStream<
WithConnectionGuard<WithClientIp<AddrStream>>,
>| {
let (conn, _) = stream.get_ref();
connections.spawn(
connection_handler(
config,
backend.clone(),
connections.clone(),
cancellation_handler.clone(),
endpoint_rate_limiter.clone(),
cancellation_token.clone(),
server.clone(),
tls_acceptor.clone(),
conn,
peer_addr,
)
.instrument(http_conn_span),
);
}
// this is jank. should dissapear with hyper 1.0 migration.
let gauge = conn
.gauge
.lock()
.expect("lock should not be poisoned")
.take()
.expect("gauge should be set on connection start");
// Cancel all current inflight HTTP requests if the HTTP connection is closed.
let http_cancellation_token = CancellationToken::new();
let cancel_connection = http_cancellation_token.clone().drop_guard();
let span = conn.span.clone();
let client_addr = conn.inner.client_addr();
let remote_addr = conn.inner.inner.remote_addr();
let backend = backend.clone();
let ws_connections = ws_connections.clone();
let endpoint_rate_limiter = endpoint_rate_limiter.clone();
let cancellation_handler = cancellation_handler.clone();
async move {
let peer_addr = match client_addr {
Some(addr) => addr,
None if config.require_client_ip => bail!("missing required client ip"),
None => remote_addr,
};
Ok(MetricService::new(
hyper::service::service_fn(move |req: Request<Body>| {
let backend = backend.clone();
let ws_connections2 = ws_connections.clone();
let endpoint_rate_limiter = endpoint_rate_limiter.clone();
let cancellation_handler = cancellation_handler.clone();
let http_cancellation_token = http_cancellation_token.child_token();
// `request_handler` is not cancel safe. It expects to be cancelled only at specific times.
// By spawning the future, we ensure it never gets cancelled until it decides to.
ws_connections.spawn(
async move {
// Cancel the current inflight HTTP request if the requets stream is closed.
// This is slightly different to `_cancel_connection` in that
// h2 can cancel individual requests with a `RST_STREAM`.
let _cancel_session = http_cancellation_token.clone().drop_guard();
let res = request_handler(
req,
config,
backend,
ws_connections2,
cancellation_handler,
peer_addr.ip(),
endpoint_rate_limiter,
http_cancellation_token,
)
.await
.map_or_else(|e| e.into_response(), |r| r);
_cancel_session.disarm();
res
}
.in_current_span(),
)
}),
gauge,
cancel_connection,
span,
))
}
},
);
hyper::Server::builder(tls_listener)
.serve(make_svc)
.with_graceful_shutdown(cancellation_token.cancelled())
.await?;
// await websocket connections
ws_connections.wait().await;
connections.wait().await;
Ok(())
}
struct MetricService<S> {
inner: S,
_gauge: IntCounterPairGuard,
_cancel: DropGuard,
span: tracing::Span,
}
/// Handles the TCP lifecycle.
///
/// 1. Parses PROXY protocol V2
/// 2. Handles TLS handshake
/// 3. Handles HTTP connection
/// 1. With graceful shutdowns
/// 2. With graceful request cancellation with connection failure
/// 3. With websocket upgrade support.
#[allow(clippy::too_many_arguments)]
async fn connection_handler(
config: &'static ProxyConfig,
backend: Arc<PoolingBackend>,
connections: TaskTracker,
cancellation_handler: Arc<CancellationHandlerMain>,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
cancellation_token: CancellationToken,
server: Builder<TokioExecutor>,
tls_acceptor: TlsAcceptor,
conn: TcpStream,
peer_addr: SocketAddr,
) {
let session_id = uuid::Uuid::new_v4();
impl<S> MetricService<S> {
fn new(
inner: S,
_gauge: IntCounterPairGuard,
_cancel: DropGuard,
span: tracing::Span,
) -> MetricService<S> {
MetricService {
inner,
_gauge,
_cancel,
span,
let _gauge = Metrics::get()
.proxy
.client_connections
.guard(crate::metrics::Protocol::Http);
// handle PROXY protocol
let mut conn = WithClientIp::new(conn);
let peer = match conn.wait_for_addr().await {
Ok(peer) => peer,
Err(e) => {
tracing::error!(?session_id, %peer_addr, "failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}");
return;
}
}
}
};
impl<S, ReqBody> hyper::service::Service<Request<ReqBody>> for MetricService<S>
where
S: hyper::service::Service<Request<ReqBody>>,
{
type Response = S::Response;
type Error = S::Error;
type Future = Instrumented<S::Future>;
let peer_addr = peer.unwrap_or(peer_addr).ip();
info!(?session_id, %peer_addr, "accepted new TCP connection");
fn poll_ready(&mut self, cx: &mut std::task::Context<'_>) -> Poll<Result<(), Self::Error>> {
self.inner.poll_ready(cx)
}
// try upgrade to TLS, but with a timeout.
let conn = match timeout(config.handshake_timeout, tls_acceptor.accept(conn)).await {
Ok(Ok(conn)) => {
info!(?session_id, %peer_addr, "accepted new TLS connection");
conn
}
// The handshake failed
Ok(Err(e)) => {
Metrics::get().proxy.tls_handshake_failures.inc();
warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}");
return;
}
// The handshake timed out
Err(e) => {
Metrics::get().proxy.tls_handshake_failures.inc();
warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}");
return;
}
};
fn call(&mut self, req: Request<ReqBody>) -> Self::Future {
self.span
.in_scope(|| self.inner.call(req))
.instrument(self.span.clone())
let session_id = AtomicTake::new(session_id);
// Cancel all current inflight HTTP requests if the HTTP connection is closed.
let http_cancellation_token = CancellationToken::new();
let _cancel_connection = http_cancellation_token.clone().drop_guard();
let conn = server.serve_connection_with_upgrades(
hyper_util::rt::TokioIo::new(conn),
hyper1::service::service_fn(move |req: hyper1::Request<Incoming>| {
// First HTTP request shares the same session ID
let session_id = session_id.take().unwrap_or_else(uuid::Uuid::new_v4);
// Cancel the current inflight HTTP request if the requets stream is closed.
// This is slightly different to `_cancel_connection` in that
// h2 can cancel individual requests with a `RST_STREAM`.
let http_request_token = http_cancellation_token.child_token();
let cancel_request = http_request_token.clone().drop_guard();
// `request_handler` is not cancel safe. It expects to be cancelled only at specific times.
// By spawning the future, we ensure it never gets cancelled until it decides to.
let handler = connections.spawn(
request_handler(
req,
config,
backend.clone(),
connections.clone(),
cancellation_handler.clone(),
session_id,
peer_addr,
endpoint_rate_limiter.clone(),
http_request_token,
)
.in_current_span()
.map_ok_or_else(api_error_into_response, |r| r),
);
async move {
let res = handler.await;
cancel_request.disarm();
res
}
}),
);
// On cancellation, trigger the HTTP connection handler to shut down.
let res = match select(pin!(cancellation_token.cancelled()), pin!(conn)).await {
Either::Left((_cancelled, mut conn)) => {
conn.as_mut().graceful_shutdown();
conn.await
}
Either::Right((res, _)) => res,
};
match res {
Ok(()) => tracing::info!(%peer_addr, "HTTP connection closed"),
Err(e) => tracing::warn!(%peer_addr, "HTTP connection error {e}"),
}
}
#[allow(clippy::too_many_arguments)]
async fn request_handler(
mut request: Request<Body>,
mut request: hyper1::Request<Incoming>,
config: &'static ProxyConfig,
backend: Arc<PoolingBackend>,
ws_connections: TaskTracker,
cancellation_handler: Arc<CancellationHandlerMain>,
session_id: uuid::Uuid,
peer_addr: IpAddr,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
// used to cancel in-flight HTTP requests. not used to cancel websockets
http_cancellation_token: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let session_id = uuid::Uuid::new_v4();
) -> Result<Response<Full<Bytes>>, ApiError> {
let host = request
.headers()
.get("host")
@@ -255,7 +275,13 @@ async fn request_handler(
// Check if the request is a websocket upgrade request.
if hyper_tungstenite::is_upgrade_request(&request) {
let ctx = RequestMonitoring::new(session_id, peer_addr, "ws", &config.region);
let ctx = RequestMonitoring::new(
session_id,
peer_addr,
crate::metrics::Protocol::Ws,
&config.region,
);
let span = ctx.span.clone();
info!(parent: &span, "performing websocket upgrade");
@@ -282,14 +308,19 @@ async fn request_handler(
// Return the response so the spawned future can continue.
Ok(response)
} else if request.uri().path() == "/sql" && request.method() == Method::POST {
let ctx = RequestMonitoring::new(session_id, peer_addr, "http", &config.region);
} else if request.uri().path() == "/sql" && *request.method() == Method::POST {
let ctx = RequestMonitoring::new(
session_id,
peer_addr,
crate::metrics::Protocol::Http,
&config.region,
);
let span = ctx.span.clone();
sql_over_http::handle(config, ctx, request, backend, http_cancellation_token)
.instrument(span)
.await
} else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {
} else if request.uri().path() == "/sql" && *request.method() == Method::OPTIONS {
Response::builder()
.header("Allow", "OPTIONS, POST")
.header("Access-Control-Allow-Origin", "*")
@@ -299,7 +330,7 @@ async fn request_handler(
)
.header("Access-Control-Max-Age", "86400" /* 24 hours */)
.status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code
.body(Body::empty())
.body(Full::new(Bytes::new()))
.map_err(|e| ApiError::InternalServerError(e.into()))
} else {
json_response(StatusCode::BAD_REQUEST, "query is not supported")

View File

@@ -1,6 +1,5 @@
use dashmap::DashMap;
use futures::{future::poll_fn, Future};
use metrics::IntCounterPairGuard;
use parking_lot::RwLock;
use rand::Rng;
use smallvec::SmallVec;
@@ -16,13 +15,13 @@ use std::{
use tokio::time::Instant;
use tokio_postgres::tls::NoTlsStream;
use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket};
use tokio_util::sync::CancellationToken;
use crate::console::messages::{ColdStartInfo, MetricsAuxInfo};
use crate::metrics::{ENDPOINT_POOLS, GC_LATENCY, NUM_OPEN_CLIENTS_IN_HTTP_POOL};
use crate::metrics::{HttpEndpointPoolsGuard, Metrics};
use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS};
use crate::{
auth::backend::ComputeUserInfo, context::RequestMonitoring, metrics::NUM_DB_CONNECTIONS_GAUGE,
DbName, EndpointCacheKey, RoleName,
auth::backend::ComputeUserInfo, context::RequestMonitoring, DbName, EndpointCacheKey, RoleName,
};
use tracing::{debug, error, warn, Span};
@@ -78,7 +77,7 @@ pub struct EndpointConnPool<C: ClientInnerExt> {
pools: HashMap<(DbName, RoleName), DbUserConnPool<C>>,
total_conns: usize,
max_conns: usize,
_guard: IntCounterPairGuard,
_guard: HttpEndpointPoolsGuard<'static>,
global_connections_count: Arc<AtomicUsize>,
global_pool_size_max_conns: usize,
}
@@ -110,7 +109,11 @@ impl<C: ClientInnerExt> EndpointConnPool<C> {
let removed = old_len - new_len;
if removed > 0 {
global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed);
NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(removed as i64);
Metrics::get()
.proxy
.http_pool_opened_connections
.get_metric()
.dec_by(removed as i64);
}
*total_conns -= removed;
removed > 0
@@ -156,7 +159,11 @@ impl<C: ClientInnerExt> EndpointConnPool<C> {
pool.total_conns += 1;
pool.global_connections_count
.fetch_add(1, atomic::Ordering::Relaxed);
NUM_OPEN_CLIENTS_IN_HTTP_POOL.inc();
Metrics::get()
.proxy
.http_pool_opened_connections
.get_metric()
.inc();
}
pool.total_conns
@@ -176,7 +183,11 @@ impl<C: ClientInnerExt> Drop for EndpointConnPool<C> {
if self.total_conns > 0 {
self.global_connections_count
.fetch_sub(self.total_conns, atomic::Ordering::Relaxed);
NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(self.total_conns as i64);
Metrics::get()
.proxy
.http_pool_opened_connections
.get_metric()
.dec_by(self.total_conns as i64);
}
}
}
@@ -215,7 +226,11 @@ impl<C: ClientInnerExt> DbUserConnPool<C> {
removed += 1;
}
global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed);
NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(removed as i64);
Metrics::get()
.proxy
.http_pool_opened_connections
.get_metric()
.dec_by(removed as i64);
conn
}
}
@@ -303,7 +318,10 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
// acquire a random shard lock
let mut shard = self.global_pool.shards()[shard].write();
let timer = GC_LATENCY.start_timer();
let timer = Metrics::get()
.proxy
.http_pool_reclaimation_lag_seconds
.start_timer();
let current_len = shard.len();
let mut clients_removed = 0;
shard.retain(|endpoint, x| {
@@ -331,7 +349,7 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
let new_len = shard.len();
drop(shard);
timer.observe_duration();
timer.observe();
// Do logging outside of the lock.
if clients_removed > 0 {
@@ -339,7 +357,11 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
.global_connections_count
.fetch_sub(clients_removed, atomic::Ordering::Relaxed)
- clients_removed;
NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(clients_removed as i64);
Metrics::get()
.proxy
.http_pool_opened_connections
.get_metric()
.dec_by(clients_removed as i64);
info!("pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}");
}
let removed = current_len - new_len;
@@ -410,7 +432,7 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
pools: HashMap::new(),
total_conns: 0,
max_conns: self.config.pool_options.max_conns_per_endpoint,
_guard: ENDPOINT_POOLS.guard(),
_guard: Metrics::get().proxy.http_endpoint_pools.guard(),
global_connections_count: self.global_connections_count.clone(),
global_pool_size_max_conns: self.config.pool_options.max_total_conns,
}));
@@ -450,9 +472,7 @@ pub fn poll_client<C: ClientInnerExt>(
conn_id: uuid::Uuid,
aux: MetricsAuxInfo,
) -> Client<C> {
let conn_gauge = NUM_DB_CONNECTIONS_GAUGE
.with_label_values(&[ctx.protocol])
.guard();
let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol);
let mut session_id = ctx.session_id;
let (tx, mut rx) = tokio::sync::watch::channel(session_id);
@@ -469,15 +489,32 @@ pub fn poll_client<C: ClientInnerExt>(
let db_user = conn_info.db_and_user();
let idle = global_pool.get_idle_timeout();
let cancel = CancellationToken::new();
let cancelled = cancel.clone().cancelled_owned();
tokio::spawn(
async move {
let _conn_gauge = conn_gauge;
let mut idle_timeout = pin!(tokio::time::sleep(idle));
let mut cancelled = pin!(cancelled);
poll_fn(move |cx| {
if matches!(rx.has_changed(), Ok(true)) {
session_id = *rx.borrow_and_update();
info!(%session_id, "changed session");
idle_timeout.as_mut().reset(Instant::now() + idle);
if cancelled.as_mut().poll(cx).is_ready() {
info!("connection dropped");
return Poll::Ready(())
}
match rx.has_changed() {
Ok(true) => {
session_id = *rx.borrow_and_update();
info!(%session_id, "changed session");
idle_timeout.as_mut().reset(Instant::now() + idle);
}
Err(_) => {
info!("connection dropped");
return Poll::Ready(())
}
_ => {}
}
// 5 minute idle connection timeout
@@ -532,6 +569,7 @@ pub fn poll_client<C: ClientInnerExt>(
let inner = ClientInner {
inner: client,
session: tx,
cancel,
aux,
conn_id,
};
@@ -541,10 +579,18 @@ pub fn poll_client<C: ClientInnerExt>(
struct ClientInner<C: ClientInnerExt> {
inner: C,
session: tokio::sync::watch::Sender<uuid::Uuid>,
cancel: CancellationToken,
aux: MetricsAuxInfo,
conn_id: uuid::Uuid,
}
impl<C: ClientInnerExt> Drop for ClientInner<C> {
fn drop(&mut self) {
// on client drop, tell the conn to shut down
self.cancel.cancel();
}
}
pub trait ClientInnerExt: Sync + Send + 'static {
fn is_closed(&self) -> bool;
fn get_process_id(&self) -> i32;
@@ -697,6 +743,7 @@ mod tests {
ClientInner {
inner: client,
session: tokio::sync::watch::Sender::new(uuid::Uuid::new_v4()),
cancel: CancellationToken::new(),
aux: MetricsAuxInfo {
endpoint_id: (&EndpointId::from("endpoint")).into(),
project_id: (&ProjectId::from("project")).into(),

View File

@@ -0,0 +1,92 @@
//! Things stolen from `libs/utils/src/http` to add hyper 1.0 compatibility
//! Will merge back in at some point in the future.
use bytes::Bytes;
use anyhow::Context;
use http::{Response, StatusCode};
use http_body_util::Full;
use serde::Serialize;
use utils::http::error::ApiError;
/// Like [`ApiError::into_response`]
pub fn api_error_into_response(this: ApiError) -> Response<Full<Bytes>> {
match this {
ApiError::BadRequest(err) => HttpErrorBody::response_from_msg_and_status(
format!("{err:#?}"), // use debug printing so that we give the cause
StatusCode::BAD_REQUEST,
),
ApiError::Forbidden(_) => {
HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::FORBIDDEN)
}
ApiError::Unauthorized(_) => {
HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::UNAUTHORIZED)
}
ApiError::NotFound(_) => {
HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::NOT_FOUND)
}
ApiError::Conflict(_) => {
HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::CONFLICT)
}
ApiError::PreconditionFailed(_) => HttpErrorBody::response_from_msg_and_status(
this.to_string(),
StatusCode::PRECONDITION_FAILED,
),
ApiError::ShuttingDown => HttpErrorBody::response_from_msg_and_status(
"Shutting down".to_string(),
StatusCode::SERVICE_UNAVAILABLE,
),
ApiError::ResourceUnavailable(err) => HttpErrorBody::response_from_msg_and_status(
err.to_string(),
StatusCode::SERVICE_UNAVAILABLE,
),
ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status(
err.to_string(),
StatusCode::REQUEST_TIMEOUT,
),
ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
err.to_string(),
StatusCode::INTERNAL_SERVER_ERROR,
),
}
}
/// Same as [`utils::http::error::HttpErrorBody`]
#[derive(Serialize)]
struct HttpErrorBody {
pub msg: String,
}
impl HttpErrorBody {
/// Same as [`utils::http::error::HttpErrorBody::response_from_msg_and_status`]
fn response_from_msg_and_status(msg: String, status: StatusCode) -> Response<Full<Bytes>> {
HttpErrorBody { msg }.to_response(status)
}
/// Same as [`utils::http::error::HttpErrorBody::to_response`]
fn to_response(&self, status: StatusCode) -> Response<Full<Bytes>> {
Response::builder()
.status(status)
.header(http::header::CONTENT_TYPE, "application/json")
// we do not have nested maps with non string keys so serialization shouldn't fail
.body(Full::new(Bytes::from(serde_json::to_string(self).unwrap())))
.unwrap()
}
}
/// Same as [`utils::http::json::json_response`]
pub fn json_response<T: Serialize>(
status: StatusCode,
data: T,
) -> Result<Response<Full<Bytes>>, ApiError> {
let json = serde_json::to_string(&data)
.context("Failed to serialize JSON response")
.map_err(ApiError::InternalServerError)?;
let response = Response::builder()
.status(status)
.header(http::header::CONTENT_TYPE, "application/json")
.body(Full::new(Bytes::from(json)))
.map_err(|e| ApiError::InternalServerError(e.into()))?;
Ok(response)
}

View File

@@ -1,18 +1,22 @@
use std::pin::pin;
use std::sync::Arc;
use bytes::Bytes;
use futures::future::select;
use futures::future::try_join;
use futures::future::Either;
use futures::StreamExt;
use futures::TryFutureExt;
use hyper::body::HttpBody;
use hyper::header;
use hyper::http::HeaderName;
use hyper::http::HeaderValue;
use hyper::Response;
use hyper::StatusCode;
use hyper::{Body, HeaderMap, Request};
use http_body_util::BodyExt;
use http_body_util::Full;
use hyper1::body::Body;
use hyper1::body::Incoming;
use hyper1::header;
use hyper1::http::HeaderName;
use hyper1::http::HeaderValue;
use hyper1::Response;
use hyper1::StatusCode;
use hyper1::{HeaderMap, Request};
use serde_json::json;
use serde_json::Value;
use tokio::time;
@@ -29,7 +33,6 @@ use tracing::error;
use tracing::info;
use url::Url;
use utils::http::error::ApiError;
use utils::http::json::json_response;
use crate::auth::backend::ComputeUserInfo;
use crate::auth::endpoint_sni;
@@ -40,8 +43,8 @@ use crate::context::RequestMonitoring;
use crate::error::ErrorKind;
use crate::error::ReportableError;
use crate::error::UserFacingError;
use crate::metrics::HTTP_CONTENT_LENGTH;
use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE;
use crate::metrics::HttpDirection;
use crate::metrics::Metrics;
use crate::proxy::run_until_cancelled;
use crate::proxy::NeonOptions;
use crate::serverless::backend::HttpConnError;
@@ -52,6 +55,7 @@ use crate::RoleName;
use super::backend::PoolingBackend;
use super::conn_pool::Client;
use super::conn_pool::ConnInfo;
use super::http_util::json_response;
use super::json::json_to_pg_text;
use super::json::pg_text_row_to_json;
use super::json::JsonConversionError;
@@ -218,10 +222,10 @@ fn get_conn_info(
pub async fn handle(
config: &'static ProxyConfig,
mut ctx: RequestMonitoring,
request: Request<Body>,
request: Request<Incoming>,
backend: Arc<PoolingBackend>,
cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
) -> Result<Response<Full<Bytes>>, ApiError> {
let result = handle_inner(cancel, config, &mut ctx, request, backend).await;
let mut response = match result {
@@ -332,10 +336,9 @@ pub async fn handle(
}
};
response.headers_mut().insert(
"Access-Control-Allow-Origin",
hyper::http::HeaderValue::from_static("*"),
);
response
.headers_mut()
.insert("Access-Control-Allow-Origin", HeaderValue::from_static("*"));
Ok(response)
}
@@ -396,7 +399,7 @@ impl UserFacingError for SqlOverHttpError {
#[derive(Debug, thiserror::Error)]
pub enum ReadPayloadError {
#[error("could not read the HTTP request body: {0}")]
Read(#[from] hyper::Error),
Read(#[from] hyper1::Error),
#[error("could not parse the HTTP request body: {0}")]
Parse(#[from] serde_json::Error),
}
@@ -437,7 +440,7 @@ struct HttpHeaders {
}
impl HttpHeaders {
fn try_parse(headers: &hyper::http::HeaderMap) -> Result<Self, SqlOverHttpError> {
fn try_parse(headers: &hyper1::http::HeaderMap) -> Result<Self, SqlOverHttpError> {
// Determine the output options. Default behaviour is 'false'. Anything that is not
// strictly 'true' assumed to be false.
let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE);
@@ -488,13 +491,14 @@ async fn handle_inner(
cancel: CancellationToken,
config: &'static ProxyConfig,
ctx: &mut RequestMonitoring,
request: Request<Body>,
request: Request<Incoming>,
backend: Arc<PoolingBackend>,
) -> Result<Response<Body>, SqlOverHttpError> {
let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE
.with_label_values(&[ctx.protocol])
.guard();
info!("handling interactive connection from client");
) -> Result<Response<Full<Bytes>>, SqlOverHttpError> {
let _requeset_gauge = Metrics::get().proxy.connection_requests.guard(ctx.protocol);
info!(
protocol = %ctx.protocol,
"handling interactive connection from client"
);
//
// Determine the destination and connection params
@@ -517,9 +521,10 @@ async fn handle_inner(
None => MAX_REQUEST_SIZE + 1,
};
info!(request_content_length, "request size in bytes");
HTTP_CONTENT_LENGTH
.with_label_values(&["request"])
.observe(request_content_length as f64);
Metrics::get()
.proxy
.http_conn_content_length_bytes
.observe(HttpDirection::Request, request_content_length as f64);
// we don't have a streaming request support yet so this is to prevent OOM
// from a malicious user sending an extremely large request body
@@ -528,7 +533,7 @@ async fn handle_inner(
}
let fetch_and_process_request = async {
let body = hyper::body::to_bytes(request.into_body()).await?;
let body = request.into_body().collect().await?.to_bytes();
info!(length = body.len(), "request payload read");
let payload: Payload = serde_json::from_slice(&body)?;
Ok::<Payload, ReadPayloadError>(payload) // Adjust error type accordingly
@@ -596,7 +601,7 @@ async fn handle_inner(
let body = serde_json::to_string(&result).expect("json serialization should not fail");
let len = body.len();
let response = response
.body(Body::from(body))
.body(Full::new(Bytes::from(body)))
// only fails if invalid status code or invalid header/values are given.
// these are not user configurable so it cannot fail dynamically
.expect("building response payload should not fail");
@@ -604,9 +609,10 @@ async fn handle_inner(
// count the egress bytes - we miss the TLS and header overhead but oh well...
// moving this later in the stack is going to be a lot of effort and ehhhh
metrics.record_egress(len as u64);
HTTP_CONTENT_LENGTH
.with_label_values(&["response"])
.observe(len as f64);
Metrics::get()
.proxy
.http_conn_content_length_bytes
.observe(HttpDirection::Response, len as f64);
Ok(response)
}
@@ -639,6 +645,7 @@ impl QueryData {
}
// The query was cancelled.
Either::Right((_cancelled, query)) => {
tracing::info!("cancelling query");
if let Err(err) = cancel_token.cancel_query(NoTls).await {
tracing::error!(?err, "could not cancel query");
}

View File

@@ -1,123 +0,0 @@
use std::{
convert::Infallible,
pin::Pin,
task::{Context, Poll},
time::Duration,
};
use hyper::server::{accept::Accept, conn::AddrStream};
use pin_project_lite::pin_project;
use tokio::{
io::{AsyncRead, AsyncWrite},
task::JoinSet,
time::timeout,
};
use tokio_rustls::{server::TlsStream, TlsAcceptor};
use tracing::{info, warn, Instrument};
use crate::{
metrics::TLS_HANDSHAKE_FAILURES,
protocol2::{WithClientIp, WithConnectionGuard},
};
pin_project! {
/// Wraps a `Stream` of connections (such as a TCP listener) so that each connection is itself
/// encrypted using TLS.
pub(crate) struct TlsListener<A: Accept> {
#[pin]
listener: A,
tls: TlsAcceptor,
waiting: JoinSet<Option<TlsStream<A::Conn>>>,
timeout: Duration,
}
}
impl<A: Accept> TlsListener<A> {
/// Create a `TlsListener` with default options.
pub(crate) fn new(tls: TlsAcceptor, listener: A, timeout: Duration) -> Self {
TlsListener {
listener,
tls,
waiting: JoinSet::new(),
timeout,
}
}
}
impl<A> Accept for TlsListener<A>
where
A: Accept<Conn = WithConnectionGuard<WithClientIp<AddrStream>>>,
A::Error: std::error::Error,
A::Conn: AsyncRead + AsyncWrite + Unpin + Send + 'static,
{
type Conn = TlsStream<A::Conn>;
type Error = Infallible;
fn poll_accept(
self: Pin<&mut Self>,
cx: &mut Context<'_>,
) -> Poll<Option<Result<Self::Conn, Self::Error>>> {
let mut this = self.project();
loop {
match this.listener.as_mut().poll_accept(cx) {
Poll::Pending => break,
Poll::Ready(Some(Ok(mut conn))) => {
let t = *this.timeout;
let tls = this.tls.clone();
let span = conn.span.clone();
this.waiting.spawn(async move {
let peer_addr = match conn.inner.wait_for_addr().await {
Ok(Some(addr)) => addr,
Err(e) => {
tracing::error!("failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}");
return None;
}
Ok(None) => conn.inner.inner.remote_addr()
};
let accept = tls.accept(conn);
match timeout(t, accept).await {
Ok(Ok(conn)) => {
info!(%peer_addr, "accepted new TLS connection");
Some(conn)
},
// The handshake failed, try getting another connection from the queue
Ok(Err(e)) => {
TLS_HANDSHAKE_FAILURES.inc();
warn!(%peer_addr, "failed to accept TLS connection: {e:?}");
None
}
// The handshake timed out, try getting another connection from the queue
Err(_) => {
TLS_HANDSHAKE_FAILURES.inc();
warn!(%peer_addr, "failed to accept TLS connection: timeout");
None
}
}
}.instrument(span));
}
Poll::Ready(Some(Err(e))) => {
tracing::error!("error accepting TCP connection: {e}");
continue;
}
Poll::Ready(None) => return Poll::Ready(None),
}
}
loop {
return match this.waiting.poll_join_next(cx) {
Poll::Ready(Some(Ok(Some(conn)))) => Poll::Ready(Some(Ok(conn))),
// The handshake failed to complete, try getting another connection from the queue
Poll::Ready(Some(Ok(None))) => continue,
// The handshake panicked or was cancelled. ignore and get another connection
Poll::Ready(Some(Err(e))) => {
tracing::warn!("handshake aborted: {e}");
continue;
}
_ => Poll::Pending,
};
}
}
}

View File

@@ -3,7 +3,7 @@ use crate::{
config::ProxyConfig,
context::RequestMonitoring,
error::{io_error, ReportableError},
metrics::NUM_CLIENT_CONNECTION_GAUGE,
metrics::Metrics,
proxy::{handle_client, ClientMode},
rate_limiter::EndpointRateLimiter,
};
@@ -139,9 +139,10 @@ pub async fn serve_websocket(
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
) -> anyhow::Result<()> {
let websocket = websocket.await?;
let conn_gauge = NUM_CLIENT_CONNECTION_GAUGE
.with_label_values(&["ws"])
.guard();
let conn_gauge = Metrics::get()
.proxy
.client_connections
.guard(crate::metrics::Protocol::Ws);
let res = handle_client(
config,

View File

@@ -1,6 +1,6 @@
use crate::config::TlsServerEndPoint;
use crate::error::{ErrorKind, ReportableError, UserFacingError};
use crate::metrics::TLS_HANDSHAKE_FAILURES;
use crate::metrics::Metrics;
use bytes::BytesMut;
use pq_proto::framed::{ConnectionError, Framed};
@@ -228,7 +228,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Stream<S> {
Stream::Raw { raw } => Ok(tokio_rustls::TlsAcceptor::from(cfg)
.accept(raw)
.await
.inspect_err(|_| TLS_HANDSHAKE_FAILURES.inc())?),
.inspect_err(|_| Metrics::get().proxy.tls_handshake_failures.inc())?),
Stream::Tls { .. } => Err(StreamUpgradeError::AlreadyTls),
}
}

View File

@@ -15,8 +15,7 @@ FLAKY_TESTS_QUERY = """
DISTINCT parent_suite, suite, name
FROM results
WHERE
started_at > CURRENT_DATE - INTERVAL '10' day
AND started_at > '2024-03-11 14:50:11.845+00' -- we switched the default PAGESERVER_VIRTUAL_FILE_IO_ENGINE to `tokio-epoll-uring` from `std-fs` on this date, we want to ignore the flaky tests for `std-fs`
started_at > CURRENT_DATE - INTERVAL '%s' day
AND (
(status IN ('failed', 'broken') AND reference = 'refs/heads/main')
OR flaky

View File

@@ -22,7 +22,7 @@ parser.add_argument("--safekeeper-host", required=True, type=str)
args = parser.parse_args()
access_key = os.getenv("CONSOLE_API_TOKEN")
endpoint: str = "https://console.stage.neon.tech/api"
endpoint: str = "https://console-stage.neon.build/api"
trash_dir: Path = args.trash_dir
dry_run: bool = args.dry_run

View File

@@ -3,7 +3,7 @@
3. Issue admin token (add/remove .stage from url for staging/prod and setting proper API key):
```
# staging:
AUTH_TOKEN=$(curl https://console.stage.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_STAGING_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt')
AUTH_TOKEN=$(curl https://console-stage.neon.build/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_STAGING_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt')
# prod:
AUTH_TOKEN=$(curl https://console.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_PROD_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt')
# check

View File

@@ -17,6 +17,8 @@ use crate::service::Config;
const SLOWDOWN_DELAY: Duration = Duration::from_secs(5);
const NOTIFY_REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
pub(crate) const API_CONCURRENCY: usize = 32;
struct UnshardedComputeHookTenant {
@@ -242,6 +244,10 @@ pub(super) struct ComputeHook {
// This lock is only used in testing enviroments, to serialize calls into neon_lock
neon_local_lock: tokio::sync::Mutex<()>,
// We share a client across all notifications to enable connection re-use etc when
// sending large numbers of notifications
client: reqwest::Client,
}
impl ComputeHook {
@@ -251,12 +257,18 @@ impl ComputeHook {
.clone()
.map(|jwt| format!("Bearer {}", jwt));
let client = reqwest::ClientBuilder::new()
.timeout(NOTIFY_REQUEST_TIMEOUT)
.build()
.expect("Failed to construct HTTP client");
Self {
state: Default::default(),
config,
authorization_header,
neon_local_lock: Default::default(),
api_concurrency: tokio::sync::Semaphore::new(API_CONCURRENCY),
client,
}
}
@@ -310,12 +322,11 @@ impl ComputeHook {
async fn do_notify_iteration(
&self,
client: &reqwest::Client,
url: &String,
reconfigure_request: &ComputeHookNotifyRequest,
cancel: &CancellationToken,
) -> Result<(), NotifyError> {
let req = client.request(Method::PUT, url);
let req = self.client.request(Method::PUT, url);
let req = if let Some(value) = &self.authorization_header {
req.header(reqwest::header::AUTHORIZATION, value)
} else {
@@ -381,8 +392,6 @@ impl ComputeHook {
reconfigure_request: &ComputeHookNotifyRequest,
cancel: &CancellationToken,
) -> Result<(), NotifyError> {
let client = reqwest::Client::new();
// We hold these semaphore units across all retries, rather than only across each
// HTTP request: this is to preserve fairness and avoid a situation where a retry might
// time out waiting for a semaphore.
@@ -394,7 +403,7 @@ impl ComputeHook {
.map_err(|_| NotifyError::ShuttingDown)?;
backoff::retry(
|| self.do_notify_iteration(&client, url, reconfigure_request, cancel),
|| self.do_notify_iteration(url, reconfigure_request, cancel),
|e| {
matches!(
e,

View File

@@ -8,6 +8,7 @@ use futures::Future;
use hyper::header::CONTENT_TYPE;
use hyper::{Body, Request, Response};
use hyper::{StatusCode, Uri};
use metrics::{BuildInfo, NeonMetrics};
use pageserver_api::models::{
TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
TenantTimeTravelRequest, TimelineCreateRequest,
@@ -44,15 +45,19 @@ use control_plane::storage_controller::{AttachHookRequest, InspectRequest};
use routerify::Middleware;
/// State available to HTTP request handlers
#[derive(Clone)]
pub struct HttpState {
service: Arc<crate::service::Service>,
auth: Option<Arc<SwappableJwtAuth>>,
neon_metrics: NeonMetrics,
allowlist_routes: Vec<Uri>,
}
impl HttpState {
pub fn new(service: Arc<crate::service::Service>, auth: Option<Arc<SwappableJwtAuth>>) -> Self {
pub fn new(
service: Arc<crate::service::Service>,
auth: Option<Arc<SwappableJwtAuth>>,
build_info: BuildInfo,
) -> Self {
let allowlist_routes = ["/status", "/ready", "/metrics"]
.iter()
.map(|v| v.parse().unwrap())
@@ -60,6 +65,7 @@ impl HttpState {
Self {
service,
auth,
neon_metrics: NeonMetrics::new(build_info),
allowlist_routes,
}
}
@@ -672,10 +678,11 @@ fn epilogue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>
})
}
pub async fn measured_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
pub async fn measured_metrics_handler(req: Request<Body>) -> Result<Response<Body>, ApiError> {
pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4";
let payload = crate::metrics::METRICS_REGISTRY.encode();
let state = get_state(&req);
let payload = crate::metrics::METRICS_REGISTRY.encode(&state.neon_metrics);
let response = Response::builder()
.status(200)
.header(CONTENT_TYPE, TEXT_FORMAT)
@@ -704,6 +711,7 @@ where
pub fn make_router(
service: Arc<Service>,
auth: Option<Arc<SwappableJwtAuth>>,
build_info: BuildInfo,
) -> RouterBuilder<hyper::Body, ApiError> {
let mut router = endpoint::make_router()
.middleware(prologue_metrics_middleware())
@@ -720,7 +728,7 @@ pub fn make_router(
}
router
.data(Arc::new(HttpState::new(service, auth)))
.data(Arc::new(HttpState::new(service, auth, build_info)))
.get("/metrics", |r| {
named_request_span(r, measured_metrics_handler, RequestName("metrics"))
})

View File

@@ -3,6 +3,7 @@ use camino::Utf8PathBuf;
use clap::Parser;
use diesel::Connection;
use metrics::launch_timestamp::LaunchTimestamp;
use metrics::BuildInfo;
use std::sync::Arc;
use storage_controller::http::make_router;
use storage_controller::metrics::preinitialize_metrics;
@@ -192,6 +193,11 @@ async fn async_main() -> anyhow::Result<()> {
args.listen
);
let build_info = BuildInfo {
revision: GIT_VERSION,
build_tag: BUILD_TAG,
};
let strict_mode = if args.dev {
StrictMode::Dev
} else {
@@ -253,7 +259,7 @@ async fn async_main() -> anyhow::Result<()> {
let auth = secrets
.public_key
.map(|jwt_auth| Arc::new(SwappableJwtAuth::new(jwt_auth)));
let router = make_router(service.clone(), auth)
let router = make_router(service.clone(), auth, build_info)
.build()
.map_err(|err| anyhow!(err))?;
let router_service = utils::http::RouterService::new(router).unwrap();

View File

@@ -8,10 +8,8 @@
//! The rest of the code defines label group types and deals with converting outer types to labels.
//!
use bytes::Bytes;
use measured::{
label::{LabelValue, StaticLabelSet},
FixedCardinalityLabel, MetricGroup,
};
use measured::{label::LabelValue, metric::histogram, FixedCardinalityLabel, MetricGroup};
use metrics::NeonMetrics;
use once_cell::sync::Lazy;
use std::sync::Mutex;
@@ -26,13 +24,15 @@ pub fn preinitialize_metrics() {
pub(crate) struct StorageControllerMetrics {
pub(crate) metrics_group: StorageControllerMetricGroup,
encoder: Mutex<measured::text::TextEncoder>,
encoder: Mutex<measured::text::BufferedTextEncoder>,
}
#[derive(measured::MetricGroup)]
#[metric(new())]
pub(crate) struct StorageControllerMetricGroup {
/// Count of how many times we spawn a reconcile task
pub(crate) storage_controller_reconcile_spawn: measured::Counter,
/// Reconciler tasks completed, broken down by success/failure/cancelled
pub(crate) storage_controller_reconcile_complete:
measured::CounterVec<ReconcileCompleteLabelGroupSet>,
@@ -43,7 +43,9 @@ pub(crate) struct StorageControllerMetricGroup {
/// HTTP request status counters for handled requests
pub(crate) storage_controller_http_request_status:
measured::CounterVec<HttpRequestStatusLabelGroupSet>,
/// HTTP request handler latency across all status codes
#[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
pub(crate) storage_controller_http_request_latency:
measured::HistogramVec<HttpRequestLatencyLabelGroupSet, 5>,
@@ -55,6 +57,7 @@ pub(crate) struct StorageControllerMetricGroup {
/// Latency of HTTP requests to the pageserver, broken down by pageserver
/// node id, request name and method. This include both successful and unsuccessful
/// requests.
#[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
pub(crate) storage_controller_pageserver_request_latency:
measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
@@ -66,6 +69,7 @@ pub(crate) struct StorageControllerMetricGroup {
/// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver
/// node id, request name and method. This include both successful and unsuccessful
/// requests.
#[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
pub(crate) storage_controller_passthrough_request_latency:
measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
@@ -74,76 +78,34 @@ pub(crate) struct StorageControllerMetricGroup {
measured::CounterVec<DatabaseQueryErrorLabelGroupSet>,
/// Latency of database queries, broken down by operation.
#[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
pub(crate) storage_controller_database_query_latency:
measured::HistogramVec<DatabaseQueryLatencyLabelGroupSet, 5>,
}
impl StorageControllerMetrics {
pub(crate) fn encode(&self) -> Bytes {
pub(crate) fn encode(&self, neon_metrics: &NeonMetrics) -> Bytes {
let mut encoder = self.encoder.lock().unwrap();
self.metrics_group.collect_into(&mut *encoder);
neon_metrics
.collect_group_into(&mut *encoder)
.unwrap_or_else(|infallible| match infallible {});
self.metrics_group
.collect_group_into(&mut *encoder)
.unwrap_or_else(|infallible| match infallible {});
encoder.finish()
}
}
impl Default for StorageControllerMetrics {
fn default() -> Self {
Self {
metrics_group: StorageControllerMetricGroup::new(),
encoder: Mutex::new(measured::text::TextEncoder::new()),
}
}
}
let mut metrics_group = StorageControllerMetricGroup::new();
metrics_group
.storage_controller_reconcile_complete
.init_all_dense();
impl StorageControllerMetricGroup {
pub(crate) fn new() -> Self {
Self {
storage_controller_reconcile_spawn: measured::Counter::new(),
storage_controller_reconcile_complete: measured::CounterVec::new(
ReconcileCompleteLabelGroupSet {
status: StaticLabelSet::new(),
},
),
storage_controller_schedule_optimization: measured::Counter::new(),
storage_controller_http_request_status: measured::CounterVec::new(
HttpRequestStatusLabelGroupSet {
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
status: StaticLabelSet::new(),
},
),
storage_controller_http_request_latency: measured::HistogramVec::new(
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
),
storage_controller_pageserver_request_error: measured::CounterVec::new(
PageserverRequestLabelGroupSet {
pageserver_id: lasso::ThreadedRodeo::new(),
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
},
),
storage_controller_pageserver_request_latency: measured::HistogramVec::new(
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
),
storage_controller_passthrough_request_error: measured::CounterVec::new(
PageserverRequestLabelGroupSet {
pageserver_id: lasso::ThreadedRodeo::new(),
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
},
),
storage_controller_passthrough_request_latency: measured::HistogramVec::new(
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
),
storage_controller_database_query_error: measured::CounterVec::new(
DatabaseQueryErrorLabelGroupSet {
operation: StaticLabelSet::new(),
error_type: StaticLabelSet::new(),
},
),
storage_controller_database_query_latency: measured::HistogramVec::new(
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
),
metrics_group,
encoder: Mutex::new(measured::text::BufferedTextEncoder::new()),
}
}
}
@@ -157,7 +119,7 @@ pub(crate) struct ReconcileCompleteLabelGroup {
#[derive(measured::LabelGroup)]
#[label(set = HttpRequestStatusLabelGroupSet)]
pub(crate) struct HttpRequestStatusLabelGroup<'a> {
#[label(dynamic_with = lasso::ThreadedRodeo)]
#[label(dynamic_with = lasso::ThreadedRodeo, default)]
pub(crate) path: &'a str,
pub(crate) method: Method,
pub(crate) status: StatusCode,
@@ -166,40 +128,21 @@ pub(crate) struct HttpRequestStatusLabelGroup<'a> {
#[derive(measured::LabelGroup)]
#[label(set = HttpRequestLatencyLabelGroupSet)]
pub(crate) struct HttpRequestLatencyLabelGroup<'a> {
#[label(dynamic_with = lasso::ThreadedRodeo)]
#[label(dynamic_with = lasso::ThreadedRodeo, default)]
pub(crate) path: &'a str,
pub(crate) method: Method,
}
impl Default for HttpRequestLatencyLabelGroupSet {
fn default() -> Self {
Self {
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
}
}
}
#[derive(measured::LabelGroup, Clone)]
#[label(set = PageserverRequestLabelGroupSet)]
pub(crate) struct PageserverRequestLabelGroup<'a> {
#[label(dynamic_with = lasso::ThreadedRodeo)]
#[label(dynamic_with = lasso::ThreadedRodeo, default)]
pub(crate) pageserver_id: &'a str,
#[label(dynamic_with = lasso::ThreadedRodeo)]
#[label(dynamic_with = lasso::ThreadedRodeo, default)]
pub(crate) path: &'a str,
pub(crate) method: Method,
}
impl Default for PageserverRequestLabelGroupSet {
fn default() -> Self {
Self {
pageserver_id: lasso::ThreadedRodeo::new(),
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
}
}
}
#[derive(measured::LabelGroup)]
#[label(set = DatabaseQueryErrorLabelGroupSet)]
pub(crate) struct DatabaseQueryErrorLabelGroup {
@@ -213,7 +156,7 @@ pub(crate) struct DatabaseQueryLatencyLabelGroup {
pub(crate) operation: DatabaseOperation,
}
#[derive(FixedCardinalityLabel)]
#[derive(FixedCardinalityLabel, Clone, Copy)]
pub(crate) enum ReconcileOutcome {
#[label(rename = "ok")]
Success,
@@ -221,7 +164,7 @@ pub(crate) enum ReconcileOutcome {
Cancel,
}
#[derive(FixedCardinalityLabel, Clone)]
#[derive(FixedCardinalityLabel, Copy, Clone)]
pub(crate) enum Method {
Get,
Put,
@@ -246,11 +189,12 @@ impl From<hyper::Method> for Method {
}
}
#[derive(Clone, Copy)]
pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode);
impl LabelValue for StatusCode {
fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
v.write_int(self.0.as_u16() as u64)
v.write_int(self.0.as_u16() as i64)
}
}
@@ -268,7 +212,7 @@ impl FixedCardinalityLabel for StatusCode {
}
}
#[derive(FixedCardinalityLabel)]
#[derive(FixedCardinalityLabel, Clone, Copy)]
pub(crate) enum DatabaseErrorLabel {
Query,
Connection,

View File

@@ -79,7 +79,7 @@ pub(crate) enum DatabaseError {
Logical(String),
}
#[derive(measured::FixedCardinalityLabel, Clone)]
#[derive(measured::FixedCardinalityLabel, Copy, Clone)]
pub(crate) enum DatabaseOperation {
InsertNode,
UpdateNode,
@@ -153,9 +153,7 @@ impl Persistence {
let latency = &METRICS_REGISTRY
.metrics_group
.storage_controller_database_query_latency;
let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup {
operation: op.clone(),
});
let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { operation: op });
let res = self.with_conn(func).await;

View File

@@ -192,9 +192,6 @@ def test_backward_compatibility(
assert not breaking_changes_allowed, "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"
# Forward compatibility is broken due to https://github.com/neondatabase/neon/pull/6530
# The test is disabled until the next release deployment
@pytest.mark.xfail
@check_ondisk_data_compatibility_if_enabled
@pytest.mark.xdist_group("compatibility")
@pytest.mark.order(after="test_create_snapshot")

View File

@@ -1,6 +1,7 @@
import asyncio
import os
from typing import Tuple
import time
from typing import Optional, Tuple
import psutil
import pytest
@@ -20,20 +21,30 @@ ENTRIES_PER_TIMELINE = 10_000
CHECKPOINT_TIMEOUT_SECONDS = 60
async def run_worker(env: NeonEnv, tenant_conf, entries: int) -> Tuple[TenantId, TimelineId, Lsn]:
tenant, timeline = env.neon_cli.create_tenant(conf=tenant_conf)
async def run_worker_for_tenant(
env: NeonEnv, entries: int, tenant: TenantId, offset: Optional[int] = None
) -> Lsn:
if offset is None:
offset = 0
with env.endpoints.create_start("main", tenant_id=tenant) as ep:
conn = await ep.connect_async()
try:
await conn.execute("CREATE TABLE IF NOT EXISTS t(key serial primary key, value text)")
await conn.execute(
f"INSERT INTO t SELECT i, CONCAT('payload_', i) FROM generate_series(0,{entries}) as i"
f"INSERT INTO t SELECT i, CONCAT('payload_', i) FROM generate_series({offset},{entries}) as i"
)
finally:
await conn.close(timeout=10)
last_flush_lsn = Lsn(ep.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
return tenant, timeline, last_flush_lsn
return last_flush_lsn
async def run_worker(env: NeonEnv, tenant_conf, entries: int) -> Tuple[TenantId, TimelineId, Lsn]:
tenant, timeline = env.neon_cli.create_tenant(conf=tenant_conf)
last_flush_lsn = await run_worker_for_tenant(env, entries, tenant)
return tenant, timeline, last_flush_lsn
async def workload(
@@ -89,7 +100,9 @@ def assert_dirty_bytes(env, v):
def assert_dirty_bytes_nonzero(env):
assert get_dirty_bytes(env) > 0
dirty_bytes = get_dirty_bytes(env)
assert dirty_bytes > 0
return dirty_bytes
@pytest.mark.parametrize("immediate_shutdown", [True, False])
@@ -182,6 +195,31 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder):
log.info("Waiting for background checkpoints...")
wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) # type: ignore
# The code below verifies that we do not flush on the first write
# after an idle period longer than the checkpoint timeout.
# Sit quietly for longer than the checkpoint timeout
time.sleep(CHECKPOINT_TIMEOUT_SECONDS + CHECKPOINT_TIMEOUT_SECONDS / 2)
# Restart the safekeepers and write a bit of extra data into one tenant
for sk in env.safekeepers:
sk.start()
tenant_with_extra_writes = last_flush_lsns[0][0]
asyncio.run(
run_worker_for_tenant(env, 5, tenant_with_extra_writes, offset=ENTRIES_PER_TIMELINE)
)
dirty_after_write = wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) # type: ignore
# We shouldn't flush since we've just opened a new layer
waited_for = 0
while waited_for < CHECKPOINT_TIMEOUT_SECONDS // 4:
time.sleep(5)
waited_for += 5
assert get_dirty_bytes(env) >= dirty_after_write
@pytest.mark.skipif(
# We have to use at least ~100MB of data to hit the lowest limit we can configure, which is

View File

@@ -1,84 +0,0 @@
import asyncio
import time
from pathlib import Path
from typing import Iterator
import pytest
from fixtures.neon_fixtures import (
PSQL,
NeonProxy,
)
from fixtures.port_distributor import PortDistributor
from pytest_httpserver import HTTPServer
from werkzeug.wrappers.response import Response
def waiting_handler(status_code: int) -> Response:
# wait more than timeout to make sure that both (two) connections are open.
# It would be better to use a barrier here, but I don't know how to do that together with pytest-httpserver.
time.sleep(2)
return Response(status=status_code)
@pytest.fixture(scope="function")
def proxy_with_rate_limit(
port_distributor: PortDistributor,
neon_binpath: Path,
httpserver_listen_address,
test_output_dir: Path,
) -> Iterator[NeonProxy]:
"""Neon proxy that routes directly to vanilla postgres."""
proxy_port = port_distributor.get_port()
mgmt_port = port_distributor.get_port()
http_port = port_distributor.get_port()
external_http_port = port_distributor.get_port()
(host, port) = httpserver_listen_address
endpoint = f"http://{host}:{port}/billing/api/v1/usage_events"
with NeonProxy(
neon_binpath=neon_binpath,
test_output_dir=test_output_dir,
proxy_port=proxy_port,
http_port=http_port,
mgmt_port=mgmt_port,
external_http_port=external_http_port,
auth_backend=NeonProxy.Console(endpoint, fixed_rate_limit=5),
) as proxy:
proxy.start()
yield proxy
@pytest.mark.asyncio
async def test_proxy_rate_limit(
httpserver: HTTPServer,
proxy_with_rate_limit: NeonProxy,
):
uri = "/billing/api/v1/usage_events/proxy_get_role_secret"
# mock control plane service
httpserver.expect_ordered_request(uri, method="GET").respond_with_handler(
lambda _: Response(status=200)
)
httpserver.expect_ordered_request(uri, method="GET").respond_with_handler(
lambda _: waiting_handler(429)
)
httpserver.expect_ordered_request(uri, method="GET").respond_with_handler(
lambda _: waiting_handler(500)
)
psql = PSQL(host=proxy_with_rate_limit.host, port=proxy_with_rate_limit.proxy_port)
f = await psql.run("select 42;")
await proxy_with_rate_limit.find_auth_link(uri, f)
# Limit should be 2.
# Run two queries in parallel.
f1, f2 = await asyncio.gather(psql.run("select 42;"), psql.run("select 42;"))
await proxy_with_rate_limit.find_auth_link(uri, f1)
await proxy_with_rate_limit.find_auth_link(uri, f2)
# Now limit should be 0.
f = await psql.run("select 42;")
await proxy_with_rate_limit.find_auth_link(uri, f)
# There last query shouldn't reach the http-server.
assert httpserver.assertions == []

View File

@@ -1,5 +1,5 @@
{
"postgres-v16": "3946b2e2ea71d07af092099cb5bcae76a69b90d6",
"postgres-v15": "64b8c7bccc6b77e04795e2d4cf6ad82dc8d987ed",
"postgres-v14": "a7b4c66156bce00afa60e5592d4284ba9e40b4cf"
"postgres-v16": "261497dd63ace434045058b1453bcbaaa83f23e5",
"postgres-v15": "85d809c124a898847a97d66a211f7d5ef4f8e0cb",
"postgres-v14": "d9149dc59abcbeeb26293707509aef51752db28f"
}

View File

@@ -37,8 +37,7 @@ futures-io = { version = "0.3" }
futures-sink = { version = "0.3" }
futures-util = { version = "0.3", features = ["channel", "io", "sink"] }
getrandom = { version = "0.2", default-features = false, features = ["std"] }
hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", features = ["raw"] }
hashbrown-594e8ee84c453af0 = { package = "hashbrown", version = "0.13", features = ["raw"] }
hashbrown = { version = "0.14", features = ["raw"] }
hex = { version = "0.4", features = ["serde"] }
hmac = { version = "0.12", default-features = false, features = ["reset"] }
hyper = { version = "0.14", features = ["full"] }
@@ -64,7 +63,7 @@ scopeguard = { version = "1" }
serde = { version = "1", features = ["alloc", "derive"] }
serde_json = { version = "1", features = ["raw_value"] }
sha2 = { version = "0.10", features = ["asm"] }
smallvec = { version = "1", default-features = false, features = ["write"] }
smallvec = { version = "1", default-features = false, features = ["const_new", "write"] }
subtle = { version = "2" }
time = { version = "0.3", features = ["local-offset", "macros", "serde-well-known"] }
tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] }
@@ -76,7 +75,6 @@ tonic = { version = "0.9", features = ["tls-roots"] }
tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "timeout", "util"] }
tracing = { version = "0.1", features = ["log"] }
tracing-core = { version = "0.1" }
tungstenite = { version = "0.20" }
url = { version = "2", features = ["serde"] }
uuid = { version = "1", features = ["serde", "v4", "v7"] }
zeroize = { version = "1", features = ["derive"] }
@@ -91,7 +89,7 @@ cc = { version = "1", default-features = false, features = ["parallel"] }
chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] }
either = { version = "1" }
getrandom = { version = "0.2", default-features = false, features = ["std"] }
hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", features = ["raw"] }
hashbrown = { version = "0.14", features = ["raw"] }
indexmap = { version = "1", default-features = false, features = ["std"] }
itertools = { version = "0.10" }
libc = { version = "0.2", features = ["extra_traits", "use_std"] }