diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index 1ecb5ecc7e..f84beff20c 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -150,7 +150,7 @@ runs: # Use aws s3 cp (instead of aws s3 sync) to keep files from previous runs to make old URLs work, # and to keep files on the host to upload them to the database - time aws s3 cp --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}" + time s5cmd --log error cp "${WORKDIR}/report/*" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}/" # Generate redirect cat < ${WORKDIR}/index.html diff --git a/.github/actions/neon-branch-create/action.yml b/.github/actions/neon-branch-create/action.yml index f1eea34ab9..dea3fc2357 100644 --- a/.github/actions/neon-branch-create/action.yml +++ b/.github/actions/neon-branch-create/action.yml @@ -10,7 +10,7 @@ inputs: required: true api_host: desctiption: 'Neon API host' - default: console.stage.neon.tech + default: console-stage.neon.build outputs: dsn: description: 'Created Branch DSN (for main database)' diff --git a/.github/actions/neon-branch-delete/action.yml b/.github/actions/neon-branch-delete/action.yml index f8cd351dd9..8acba7ad00 100644 --- a/.github/actions/neon-branch-delete/action.yml +++ b/.github/actions/neon-branch-delete/action.yml @@ -13,7 +13,7 @@ inputs: required: true api_host: desctiption: 'Neon API host' - default: console.stage.neon.tech + default: console-stage.neon.build runs: using: "composite" diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index ae6464990e..7f0e599b97 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -13,7 +13,7 @@ inputs: default: 15 api_host: desctiption: 'Neon API host' - default: console.stage.neon.tech + default: console-stage.neon.build provisioner: desctiption: 'k8s-pod or k8s-neonvm' default: 'k8s-pod' diff --git a/.github/actions/neon-project-delete/action.yml b/.github/actions/neon-project-delete/action.yml index adc8510a34..b8ec6cac70 100644 --- a/.github/actions/neon-project-delete/action.yml +++ b/.github/actions/neon-project-delete/action.yml @@ -10,7 +10,7 @@ inputs: required: true api_host: desctiption: 'Neon API host' - default: console.stage.neon.tech + default: console-stage.neon.build runs: using: "composite" diff --git a/.github/workflows/approved-for-ci-run.yml b/.github/workflows/approved-for-ci-run.yml index 69c48d86b9..ab616d17e2 100644 --- a/.github/workflows/approved-for-ci-run.yml +++ b/.github/workflows/approved-for-ci-run.yml @@ -18,6 +18,7 @@ on: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number }} + cancel-in-progress: false env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index 251423e701..c527cef1ac 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -21,6 +21,7 @@ defaults: concurrency: group: build-build-tools-image-${{ inputs.image-tag }} + cancel-in-progress: false # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. permissions: {} diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index c941692066..d495a158e8 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -20,6 +20,7 @@ defaults: concurrency: group: pin-build-tools-image-${{ inputs.from-tag }} + cancel-in-progress: false permissions: {} diff --git a/Cargo.lock b/Cargo.lock index dae406e4ae..6faf4b72f0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -270,6 +270,12 @@ dependencies = [ "critical-section", ] +[[package]] +name = "atomic-take" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8ab6b55fe97976e46f91ddbed8d147d966475dc29b2032757ba47e02376fbc3" + [[package]] name = "autocfg" version = "1.1.0" @@ -298,7 +304,7 @@ dependencies = [ "fastrand 2.0.0", "hex", "http 0.2.9", - "hyper", + "hyper 0.14.26", "ring 0.17.6", "time", "tokio", @@ -335,7 +341,7 @@ dependencies = [ "bytes", "fastrand 2.0.0", "http 0.2.9", - "http-body", + "http-body 0.4.5", "percent-encoding", "pin-project-lite", "tracing", @@ -386,7 +392,7 @@ dependencies = [ "aws-types", "bytes", "http 0.2.9", - "http-body", + "http-body 0.4.5", "once_cell", "percent-encoding", "regex-lite", @@ -514,7 +520,7 @@ dependencies = [ "crc32fast", "hex", "http 0.2.9", - "http-body", + "http-body 0.4.5", "md-5", "pin-project-lite", "sha1", @@ -546,7 +552,7 @@ dependencies = [ "bytes-utils", "futures-core", "http 0.2.9", - "http-body", + "http-body 0.4.5", "once_cell", "percent-encoding", "pin-project-lite", @@ -585,10 +591,10 @@ dependencies = [ "aws-smithy-types", "bytes", "fastrand 2.0.0", - "h2", + "h2 0.3.26", "http 0.2.9", - "http-body", - "hyper", + "http-body 0.4.5", + "hyper 0.14.26", "hyper-rustls", "once_cell", "pin-project-lite", @@ -626,7 +632,7 @@ dependencies = [ "bytes-utils", "futures-core", "http 0.2.9", - "http-body", + "http-body 0.4.5", "itoa", "num-integer", "pin-project-lite", @@ -675,8 +681,8 @@ dependencies = [ "bytes", "futures-util", "http 0.2.9", - "http-body", - "hyper", + "http-body 0.4.5", + "hyper 0.14.26", "itoa", "matchit", "memchr", @@ -691,7 +697,7 @@ dependencies = [ "sha1", "sync_wrapper", "tokio", - "tokio-tungstenite", + "tokio-tungstenite 0.20.0", "tower", "tower-layer", "tower-service", @@ -707,7 +713,7 @@ dependencies = [ "bytes", "futures-util", "http 0.2.9", - "http-body", + "http-body 0.4.5", "mime", "rustversion", "tower-layer", @@ -1124,7 +1130,7 @@ version = "4.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "191d9573962933b4027f932c600cd252ce27a8ad5979418fe78e43c07996f27b" dependencies = [ - "heck", + "heck 0.4.1", "proc-macro2", "quote", "syn 2.0.52", @@ -1196,7 +1202,7 @@ dependencies = [ "compute_api", "flate2", "futures", - "hyper", + "hyper 0.14.26", "nix 0.27.1", "notify", "num_cpus", @@ -1313,7 +1319,7 @@ dependencies = [ "git-version", "hex", "humantime", - "hyper", + "hyper 0.14.26", "nix 0.27.1", "once_cell", "pageserver_api", @@ -1462,12 +1468,9 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.15" +version = "0.8.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c063cd8cc95f5c377ed0d4b49a4b21f632396ff690e8470c29b3359b346984b" -dependencies = [ - "cfg-if", -] +checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" [[package]] name = "crossterm" @@ -1840,23 +1843,12 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "errno" -version = "0.3.1" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" +checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" dependencies = [ - "errno-dragonfly", - "libc", - "windows-sys 0.48.0", -] - -[[package]] -name = "errno-dragonfly" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" -dependencies = [ - "cc", "libc", + "windows-sys 0.52.0", ] [[package]] @@ -2213,6 +2205,25 @@ dependencies = [ "tracing", ] +[[package]] +name = "h2" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "816ec7294445779408f36fe57bc5b7fc1cf59664059096c65f905c1c61f58069" +dependencies = [ + "bytes", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http 1.1.0", + "indexmap 2.0.1", + "slab", + "tokio", + "tokio-util", + "tracing", +] + [[package]] name = "half" version = "1.8.2" @@ -2294,6 +2305,12 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + [[package]] name = "hermit-abi" version = "0.3.3" @@ -2378,6 +2395,29 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "http-body" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cac85db508abc24a2e48553ba12a996e87244a0395ce011e62b37158745d643" +dependencies = [ + "bytes", + "http 1.1.0", +] + +[[package]] +name = "http-body-util" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41cb79eb393015dadd30fc252023adb0b2400a0caee0fa2a077e6e21a551e840" +dependencies = [ + "bytes", + "futures-util", + "http 1.1.0", + "http-body 1.0.0", + "pin-project-lite", +] + [[package]] name = "http-types" version = "2.12.0" @@ -2436,9 +2476,9 @@ dependencies = [ "futures-channel", "futures-core", "futures-util", - "h2", + "h2 0.3.26", "http 0.2.9", - "http-body", + "http-body 0.4.5", "httparse", "httpdate", "itoa", @@ -2450,6 +2490,26 @@ dependencies = [ "want", ] +[[package]] +name = "hyper" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "186548d73ac615b32a73aafe38fb4f56c0d340e110e5a200bcadbaf2e199263a" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "h2 0.4.4", + "http 1.1.0", + "http-body 1.0.0", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "smallvec", + "tokio", +] + [[package]] name = "hyper-rustls" version = "0.24.0" @@ -2457,7 +2517,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0646026eb1b3eea4cd9ba47912ea5ce9cc07713d105b1a14698f4e6433d348b7" dependencies = [ "http 0.2.9", - "hyper", + "hyper 0.14.26", "log", "rustls 0.21.9", "rustls-native-certs 0.6.2", @@ -2471,7 +2531,7 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" dependencies = [ - "hyper", + "hyper 0.14.26", "pin-project-lite", "tokio", "tokio-io-timeout", @@ -2484,7 +2544,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905" dependencies = [ "bytes", - "hyper", + "hyper 0.14.26", "native-tls", "tokio", "tokio-native-tls", @@ -2492,15 +2552,33 @@ dependencies = [ [[package]] name = "hyper-tungstenite" -version = "0.11.1" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7cc7dcb1ab67cd336f468a12491765672e61a3b6b148634dbfe2fe8acd3fe7d9" +checksum = "7a343d17fe7885302ed7252767dc7bb83609a874b6ff581142241ec4b73957ad" dependencies = [ - "hyper", + "http-body-util", + "hyper 1.2.0", + "hyper-util", "pin-project-lite", "tokio", - "tokio-tungstenite", - "tungstenite", + "tokio-tungstenite 0.21.0", + "tungstenite 0.21.0", +] + +[[package]] +name = "hyper-util" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca38ef113da30126bbff9cd1705f9273e15d45498615d138b0c20279ac7a76aa" +dependencies = [ + "bytes", + "futures-util", + "http 1.1.0", + "http-body 1.0.0", + "hyper 1.2.0", + "pin-project-lite", + "socket2 0.5.5", + "tokio", ] [[package]] @@ -2794,6 +2872,12 @@ version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" +[[package]] +name = "linux-raw-sys" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" + [[package]] name = "lock_api" version = "0.4.10" @@ -2848,11 +2932,12 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "measured" -version = "0.0.13" +version = "0.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f246648d027839a34b420e27c7de1165ace96e19ef894985d0a6ff89a7840a9f" +checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5" dependencies = [ "bytes", + "crossbeam-utils", "hashbrown 0.14.0", "itoa", "lasso", @@ -2865,16 +2950,27 @@ dependencies = [ [[package]] name = "measured-derive" -version = "0.0.13" +version = "0.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edaa5cc22d99d5d6d7d99c3b5b5f7e7f8034c22f1b5d62a1adecd2ed005d9b80" +checksum = "6ea497f33e1e856a376c32ad916f69a0bd3c597db1f912a399f842b01a4a685d" dependencies = [ - "heck", + "heck 0.5.0", "proc-macro2", "quote", "syn 2.0.52", ] +[[package]] +name = "measured-process" +version = "0.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b364ccb66937a814b6b2ad751d1a2f7a9d5a78c761144036825fb36bb0771000" +dependencies = [ + "libc", + "measured", + "procfs 0.16.0", +] + [[package]] name = "memchr" version = "2.6.4" @@ -2914,8 +3010,10 @@ version = "0.1.0" dependencies = [ "chrono", "libc", + "measured", + "measured-process", "once_cell", - "procfs", + "procfs 0.14.2", "prometheus", "rand 0.8.5", "rand_distr", @@ -3465,12 +3563,17 @@ dependencies = [ "camino", "clap", "git-version", + "humantime", "pageserver", + "pageserver_api", "postgres_ffi", + "remote_storage", "serde", "serde_json", "svg_fmt", "tokio", + "tokio-util", + "toml_edit", "utils", "workspace_hack", ] @@ -3506,7 +3609,7 @@ dependencies = [ "hex-literal", "humantime", "humantime-serde", - "hyper", + "hyper 0.14.26", "itertools", "leaky-bucket", "md5", @@ -3525,7 +3628,7 @@ dependencies = [ "postgres_connection", "postgres_ffi", "pq_proto", - "procfs", + "procfs 0.14.2", "rand 0.8.5", "regex", "remote_storage", @@ -3616,7 +3719,6 @@ dependencies = [ "anyhow", "async-compression", "async-stream", - "async-trait", "byteorder", "bytes", "chrono", @@ -4086,6 +4188,29 @@ dependencies = [ "rustix 0.36.16", ] +[[package]] +name = "procfs" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4" +dependencies = [ + "bitflags 2.4.1", + "hex", + "lazy_static", + "procfs-core", + "rustix 0.38.28", +] + +[[package]] +name = "procfs-core" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29" +dependencies = [ + "bitflags 2.4.1", + "hex", +] + [[package]] name = "prometheus" version = "0.13.3" @@ -4098,7 +4223,7 @@ dependencies = [ "libc", "memchr", "parking_lot 0.12.1", - "procfs", + "procfs 0.14.2", "thiserror", ] @@ -4119,7 +4244,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270" dependencies = [ "bytes", - "heck", + "heck 0.4.1", "itertools", "lazy_static", "log", @@ -4163,6 +4288,7 @@ dependencies = [ "anyhow", "async-compression", "async-trait", + "atomic-take", "aws-config", "aws-sdk-iam", "aws-sigv4", @@ -4186,13 +4312,17 @@ dependencies = [ "hmac", "hostname", "http 1.1.0", + "http-body-util", "humantime", - "hyper", + "hyper 0.14.26", + "hyper 1.2.0", "hyper-tungstenite", + "hyper-util", "ipnet", "itertools", "lasso", "md5", + "measured", "metrics", "native-tls", "once_cell", @@ -4521,7 +4651,7 @@ dependencies = [ "futures-util", "http-types", "humantime", - "hyper", + "hyper 0.14.26", "itertools", "metrics", "once_cell", @@ -4551,10 +4681,10 @@ dependencies = [ "encoding_rs", "futures-core", "futures-util", - "h2", + "h2 0.3.26", "http 0.2.9", - "http-body", - "hyper", + "http-body 0.4.5", + "hyper 0.14.26", "hyper-rustls", "hyper-tls", "ipnet", @@ -4612,7 +4742,7 @@ dependencies = [ "futures", "getrandom 0.2.11", "http 0.2.9", - "hyper", + "hyper 0.14.26", "parking_lot 0.11.2", "reqwest", "reqwest-middleware", @@ -4699,7 +4829,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "496c1d3718081c45ba9c31fbfc07417900aa96f4070ff90dc29961836b7a9945" dependencies = [ "http 0.2.9", - "hyper", + "hyper 0.14.26", "lazy_static", "percent-encoding", "regex", @@ -4811,6 +4941,19 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "rustix" +version = "0.38.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316" +dependencies = [ + "bitflags 2.4.1", + "errno", + "libc", + "linux-raw-sys 0.4.13", + "windows-sys 0.52.0", +] + [[package]] name = "rustls" version = "0.21.9" @@ -4991,7 +5134,7 @@ dependencies = [ "git-version", "hex", "humantime", - "hyper", + "hyper 0.14.26", "metrics", "once_cell", "parking_lot 0.12.1", @@ -5476,9 +5619,9 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.11.0" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9" +checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" [[package]] name = "smol_str" @@ -5570,7 +5713,7 @@ dependencies = [ "futures-util", "git-version", "humantime", - "hyper", + "hyper 0.14.26", "metrics", "once_cell", "parking_lot 0.12.1", @@ -5601,7 +5744,7 @@ dependencies = [ "git-version", "hex", "humantime", - "hyper", + "hyper 0.14.26", "itertools", "lasso", "measured", @@ -5630,7 +5773,7 @@ dependencies = [ "anyhow", "clap", "comfy-table", - "hyper", + "hyper 0.14.26", "pageserver_api", "pageserver_client", "reqwest", @@ -5671,7 +5814,7 @@ version = "0.24.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59" dependencies = [ - "heck", + "heck 0.4.1", "proc-macro2", "quote", "rustversion", @@ -6113,7 +6256,19 @@ dependencies = [ "futures-util", "log", "tokio", - "tungstenite", + "tungstenite 0.20.1", +] + +[[package]] +name = "tokio-tungstenite" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c83b561d025642014097b66e6c1bb422783339e0909e4429cde4749d1990bc38" +dependencies = [ + "futures-util", + "log", + "tokio", + "tungstenite 0.21.0", ] [[package]] @@ -6180,10 +6335,10 @@ dependencies = [ "bytes", "futures-core", "futures-util", - "h2", + "h2 0.3.26", "http 0.2.9", - "http-body", - "hyper", + "http-body 0.4.5", + "hyper 0.14.26", "hyper-timeout", "percent-encoding", "pin-project", @@ -6369,7 +6524,7 @@ dependencies = [ name = "tracing-utils" version = "0.1.0" dependencies = [ - "hyper", + "hyper 0.14.26", "opentelemetry", "opentelemetry-otlp", "opentelemetry-semantic-conventions", @@ -6406,6 +6561,25 @@ dependencies = [ "utf-8", ] +[[package]] +name = "tungstenite" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ef1a641ea34f399a848dea702823bbecfb4c486f911735368f1f137cb8257e1" +dependencies = [ + "byteorder", + "bytes", + "data-encoding", + "http 1.1.0", + "httparse", + "log", + "rand 0.8.5", + "sha1", + "thiserror", + "url", + "utf-8", +] + [[package]] name = "twox-hash" version = "1.6.3" @@ -6570,7 +6744,8 @@ dependencies = [ "heapless", "hex", "hex-literal", - "hyper", + "humantime", + "hyper 0.14.26", "jsonwebtoken", "leaky-bucket", "metrics", @@ -6930,6 +7105,15 @@ dependencies = [ "windows-targets 0.48.0", ] +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.4", +] + [[package]] name = "windows-targets" version = "0.42.2" @@ -6960,6 +7144,21 @@ dependencies = [ "windows_x86_64_msvc 0.48.0", ] +[[package]] +name = "windows-targets" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" +dependencies = [ + "windows_aarch64_gnullvm 0.52.4", + "windows_aarch64_msvc 0.52.4", + "windows_i686_gnu 0.52.4", + "windows_i686_msvc 0.52.4", + "windows_x86_64_gnu 0.52.4", + "windows_x86_64_gnullvm 0.52.4", + "windows_x86_64_msvc 0.52.4", +] + [[package]] name = "windows_aarch64_gnullvm" version = "0.42.2" @@ -6972,6 +7171,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" + [[package]] name = "windows_aarch64_msvc" version = "0.42.2" @@ -6984,6 +7189,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" + [[package]] name = "windows_i686_gnu" version = "0.42.2" @@ -6996,6 +7207,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" +[[package]] +name = "windows_i686_gnu" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" + [[package]] name = "windows_i686_msvc" version = "0.42.2" @@ -7008,6 +7225,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" +[[package]] +name = "windows_i686_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" + [[package]] name = "windows_x86_64_gnu" version = "0.42.2" @@ -7020,6 +7243,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" + [[package]] name = "windows_x86_64_gnullvm" version = "0.42.2" @@ -7032,6 +7261,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" + [[package]] name = "windows_x86_64_msvc" version = "0.42.2" @@ -7044,6 +7279,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" + [[package]] name = "winnow" version = "0.4.6" @@ -7092,11 +7333,10 @@ dependencies = [ "futures-sink", "futures-util", "getrandom 0.2.11", - "hashbrown 0.13.2", "hashbrown 0.14.0", "hex", "hmac", - "hyper", + "hyper 0.14.26", "indexmap 1.9.3", "itertools", "libc", @@ -7134,7 +7374,6 @@ dependencies = [ "tower", "tracing", "tracing-core", - "tungstenite", "url", "uuid", "zeroize", diff --git a/Cargo.toml b/Cargo.toml index 3c6077648e..8310d2d522 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,6 +44,7 @@ license = "Apache-2.0" anyhow = { version = "1.0", features = ["backtrace"] } arc-swap = "1.6" async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] } +atomic-take = "1.1.0" azure_core = "0.18" azure_identity = "0.18" azure_storage = "0.18" @@ -97,7 +98,7 @@ http-types = { version = "2", default-features = false } humantime = "2.1" humantime-serde = "1.1.1" hyper = "0.14" -hyper-tungstenite = "0.11" +hyper-tungstenite = "0.13.0" inotify = "0.10.2" ipnet = "2.9.0" itertools = "0.10" @@ -106,7 +107,8 @@ lasso = "0.7" leaky-bucket = "1.0.1" libc = "0.2" md5 = "0.7.0" -measured = { version = "0.0.13", features=["default", "lasso"] } +measured = { version = "0.0.21", features=["lasso"] } +measured-process = { version = "0.0.21" } memoffset = "0.8" native-tls = "0.2" nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] } diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index 1ed6f87473..a082f15c34 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -58,6 +58,12 @@ RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v$ && mv protoc/include/google /usr/local/include/google \ && rm -rf protoc.zip protoc +# s5cmd +ENV S5CMD_VERSION=2.2.2 +RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/s5cmd_${S5CMD_VERSION}_Linux-$(uname -m | sed 's/x86_64/64bit/g' | sed 's/aarch64/arm64/g').tar.gz" | tar zxvf - s5cmd \ + && chmod +x s5cmd \ + && mv s5cmd /usr/local/bin/s5cmd + # LLVM ENV LLVM_VERSION=17 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index 2fced7d778..94666f2870 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -86,7 +86,10 @@ where .stdout(process_log_file) .stderr(same_file_for_stderr) .args(args); - let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command)); + + let filled_cmd = fill_env_vars_prefixed_neon(fill_remote_storage_secrets_vars( + fill_rust_env_vars(background_command), + )); filled_cmd.envs(envs); let pid_file_to_check = match &initial_pid_file { @@ -268,6 +271,15 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command { cmd } +fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command { + for (var, val) in std::env::vars() { + if var.starts_with("NEON_PAGESERVER_") { + cmd = cmd.env(var, val); + } + } + cmd +} + /// Add a `pre_exec` to the cmd that, inbetween fork() and exec(), /// 1. Claims a pidfile with a fcntl lock on it and /// 2. Sets up the pidfile's file descriptor so that it (and the lock) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 56495dd2da..68a5474c87 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -1231,7 +1231,7 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { match ComputeControlPlane::load(env.clone()) { Ok(cplane) => { for (_k, node) in cplane.endpoints { - if let Err(e) = node.stop(if immediate { "immediate" } else { "fast " }, false) { + if let Err(e) = node.stop(if immediate { "immediate" } else { "fast" }, false) { eprintln!("postgres stop failed: {e:#}"); } } diff --git a/libs/metrics/Cargo.toml b/libs/metrics/Cargo.toml index f6a49a0166..0bd804051c 100644 --- a/libs/metrics/Cargo.toml +++ b/libs/metrics/Cargo.toml @@ -10,11 +10,13 @@ libc.workspace = true once_cell.workspace = true chrono.workspace = true twox-hash.workspace = true +measured.workspace = true workspace_hack.workspace = true [target.'cfg(target_os = "linux")'.dependencies] procfs.workspace = true +measured-process.workspace = true [dev-dependencies] rand = "0.8" diff --git a/libs/metrics/src/hll.rs b/libs/metrics/src/hll.rs index dfb4461ce9..f53511ab5c 100644 --- a/libs/metrics/src/hll.rs +++ b/libs/metrics/src/hll.rs @@ -7,14 +7,19 @@ //! use significantly less memory than this, but can only approximate the cardinality. use std::{ - collections::HashMap, - hash::{BuildHasher, BuildHasherDefault, Hash, Hasher}, - sync::{atomic::AtomicU8, Arc, RwLock}, + hash::{BuildHasher, BuildHasherDefault, Hash}, + sync::atomic::AtomicU8, }; -use prometheus::{ - core::{self, Describer}, - proto, Opts, +use measured::{ + label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor}, + metric::{ + group::{Encoding, MetricValue}, + name::MetricNameEncoder, + Metric, MetricType, MetricVec, + }, + text::TextEncoder, + LabelGroup, }; use twox_hash::xxh3; @@ -93,203 +98,25 @@ macro_rules! register_hll { /// ``` /// /// See for estimates on alpha -#[derive(Clone)] -pub struct HyperLogLogVec { - core: Arc>, +pub type HyperLogLogVec = MetricVec, L>; +pub type HyperLogLog = Metric>; + +pub struct HyperLogLogState { + shards: [AtomicU8; N], } - -struct HyperLogLogVecCore { - pub children: RwLock, BuildHasherDefault>>, - pub desc: core::Desc, - pub opts: Opts, -} - -impl core::Collector for HyperLogLogVec { - fn desc(&self) -> Vec<&core::Desc> { - vec![&self.core.desc] - } - - fn collect(&self) -> Vec { - let mut m = proto::MetricFamily::default(); - m.set_name(self.core.desc.fq_name.clone()); - m.set_help(self.core.desc.help.clone()); - m.set_field_type(proto::MetricType::GAUGE); - - let mut metrics = Vec::new(); - for child in self.core.children.read().unwrap().values() { - child.core.collect_into(&mut metrics); - } - m.set_metric(metrics); - - vec![m] +impl Default for HyperLogLogState { + fn default() -> Self { + #[allow(clippy::declare_interior_mutable_const)] + const ZERO: AtomicU8 = AtomicU8::new(0); + Self { shards: [ZERO; N] } } } -impl HyperLogLogVec { - /// Create a new [`HyperLogLogVec`] based on the provided - /// [`Opts`] and partitioned by the given label names. At least one label name must be - /// provided. - pub fn new(opts: Opts, label_names: &[&str]) -> prometheus::Result { - assert!(N.is_power_of_two()); - let variable_names = label_names.iter().map(|s| (*s).to_owned()).collect(); - let opts = opts.variable_labels(variable_names); - - let desc = opts.describe()?; - let v = HyperLogLogVecCore { - children: RwLock::new(HashMap::default()), - desc, - opts, - }; - - Ok(Self { core: Arc::new(v) }) - } - - /// `get_metric_with_label_values` returns the [`HyperLogLog

`] for the given slice - /// of label values (same order as the VariableLabels in Desc). If that combination of - /// label values is accessed for the first time, a new [`HyperLogLog

`] is created. - /// - /// An error is returned if the number of label values is not the same as the - /// number of VariableLabels in Desc. - pub fn get_metric_with_label_values( - &self, - vals: &[&str], - ) -> prometheus::Result> { - self.core.get_metric_with_label_values(vals) - } - - /// `with_label_values` works as `get_metric_with_label_values`, but panics if an error - /// occurs. - pub fn with_label_values(&self, vals: &[&str]) -> HyperLogLog { - self.get_metric_with_label_values(vals).unwrap() - } +impl MetricType for HyperLogLogState { + type Metadata = (); } -impl HyperLogLogVecCore { - pub fn get_metric_with_label_values( - &self, - vals: &[&str], - ) -> prometheus::Result> { - let h = self.hash_label_values(vals)?; - - if let Some(metric) = self.children.read().unwrap().get(&h).cloned() { - return Ok(metric); - } - - self.get_or_create_metric(h, vals) - } - - pub(crate) fn hash_label_values(&self, vals: &[&str]) -> prometheus::Result { - if vals.len() != self.desc.variable_labels.len() { - return Err(prometheus::Error::InconsistentCardinality { - expect: self.desc.variable_labels.len(), - got: vals.len(), - }); - } - - let mut h = xxh3::Hash64::default(); - for val in vals { - h.write(val.as_bytes()); - } - - Ok(h.finish()) - } - - fn get_or_create_metric( - &self, - hash: u64, - label_values: &[&str], - ) -> prometheus::Result> { - let mut children = self.children.write().unwrap(); - // Check exist first. - if let Some(metric) = children.get(&hash).cloned() { - return Ok(metric); - } - - let metric = HyperLogLog::with_opts_and_label_values(&self.opts, label_values)?; - children.insert(hash, metric.clone()); - Ok(metric) - } -} - -/// HLL is a probabilistic cardinality measure. -/// -/// How to use this time-series for a metric name `my_metrics_total_hll`: -/// -/// ```promql -/// # harmonic mean -/// 1 / ( -/// sum ( -/// 2 ^ -( -/// # HLL merge operation -/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...) -/// ) -/// ) without (hll_shard) -/// ) -/// * alpha -/// * shards_count -/// * shards_count -/// ``` -/// -/// If you want an estimate over time, you can use the following query: -/// -/// ```promql -/// # harmonic mean -/// 1 / ( -/// sum ( -/// 2 ^ -( -/// # HLL merge operation -/// max ( -/// max_over_time(my_metrics_total_hll{}[$__rate_interval]) -/// ) by (hll_shard, other_labels...) -/// ) -/// ) without (hll_shard) -/// ) -/// * alpha -/// * shards_count -/// * shards_count -/// ``` -/// -/// In the case of low cardinality, you might want to use the linear counting approximation: -/// -/// ```promql -/// # LinearCounting(m, V) = m log (m / V) -/// shards_count * ln(shards_count / -/// # calculate V = how many shards contain a 0 -/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard) -/// ) -/// ``` -/// -/// See for estimates on alpha -#[derive(Clone)] -pub struct HyperLogLog { - core: Arc>, -} - -impl HyperLogLog { - /// Create a [`HyperLogLog`] with the `name` and `help` arguments. - pub fn new, S2: Into>(name: S1, help: S2) -> prometheus::Result { - assert!(N.is_power_of_two()); - let opts = Opts::new(name, help); - Self::with_opts(opts) - } - - /// Create a [`HyperLogLog`] with the `opts` options. - pub fn with_opts(opts: Opts) -> prometheus::Result { - Self::with_opts_and_label_values(&opts, &[]) - } - - fn with_opts_and_label_values(opts: &Opts, label_values: &[&str]) -> prometheus::Result { - let desc = opts.describe()?; - let labels = make_label_pairs(&desc, label_values)?; - - let v = HyperLogLogCore { - shards: [0; N].map(AtomicU8::new), - desc, - labels, - }; - Ok(Self { core: Arc::new(v) }) - } - +impl HyperLogLogState { pub fn measure(&self, item: &impl Hash) { // changing the hasher will break compatibility with previous measurements. self.record(BuildHasherDefault::::default().hash_one(item)); @@ -299,42 +126,11 @@ impl HyperLogLog { let p = N.ilog2() as u8; let j = hash & (N as u64 - 1); let rho = (hash >> p).leading_zeros() as u8 + 1 - p; - self.core.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed); - } -} - -struct HyperLogLogCore { - shards: [AtomicU8; N], - desc: core::Desc, - labels: Vec, -} - -impl core::Collector for HyperLogLog { - fn desc(&self) -> Vec<&core::Desc> { - vec![&self.core.desc] + self.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed); } - fn collect(&self) -> Vec { - let mut m = proto::MetricFamily::default(); - m.set_name(self.core.desc.fq_name.clone()); - m.set_help(self.core.desc.help.clone()); - m.set_field_type(proto::MetricType::GAUGE); - - let mut metrics = Vec::new(); - self.core.collect_into(&mut metrics); - m.set_metric(metrics); - - vec![m] - } -} - -impl HyperLogLogCore { - fn collect_into(&self, metrics: &mut Vec) { - self.shards.iter().enumerate().for_each(|(i, x)| { - let mut shard_label = proto::LabelPair::default(); - shard_label.set_name("hll_shard".to_owned()); - shard_label.set_value(format!("{i}")); - + fn take_sample(&self) -> [u8; N] { + self.shards.each_ref().map(|x| { // We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus. // This seems like it would be a race condition, @@ -344,85 +140,90 @@ impl HyperLogLogCore { // TODO: maybe we shouldn't reset this on every collect, instead, only after a time window. // this would mean that a dev port-forwarding the metrics url won't break the sampling. - let v = x.swap(0, std::sync::atomic::Ordering::Relaxed); - - let mut m = proto::Metric::default(); - let mut c = proto::Gauge::default(); - c.set_value(v as f64); - m.set_gauge(c); - - let mut labels = Vec::with_capacity(self.labels.len() + 1); - labels.extend_from_slice(&self.labels); - labels.push(shard_label); - - m.set_label(labels); - metrics.push(m); + x.swap(0, std::sync::atomic::Ordering::Relaxed) }) } } - -fn make_label_pairs( - desc: &core::Desc, - label_values: &[&str], -) -> prometheus::Result> { - if desc.variable_labels.len() != label_values.len() { - return Err(prometheus::Error::InconsistentCardinality { - expect: desc.variable_labels.len(), - got: label_values.len(), - }); +impl measured::metric::MetricEncoding> + for HyperLogLogState +{ + fn write_type( + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + enc.write_type(&name, measured::text::MetricType::Gauge) } + fn collect_into( + &self, + _: &(), + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + struct I64(i64); + impl LabelValue for I64 { + fn visit(&self, v: V) -> V::Output { + v.write_int(self.0) + } + } - let total_len = desc.variable_labels.len() + desc.const_label_pairs.len(); - if total_len == 0 { - return Ok(vec![]); - } + struct HllShardLabel { + hll_shard: i64, + } - if desc.variable_labels.is_empty() { - return Ok(desc.const_label_pairs.clone()); - } + impl LabelGroup for HllShardLabel { + fn visit_values(&self, v: &mut impl LabelGroupVisitor) { + const LE: &LabelName = LabelName::from_str("hll_shard"); + v.write_value(LE, &I64(self.hll_shard)); + } + } - let mut label_pairs = Vec::with_capacity(total_len); - for (i, n) in desc.variable_labels.iter().enumerate() { - let mut label_pair = proto::LabelPair::default(); - label_pair.set_name(n.clone()); - label_pair.set_value(label_values[i].to_owned()); - label_pairs.push(label_pair); + self.take_sample() + .into_iter() + .enumerate() + .try_for_each(|(hll_shard, val)| { + enc.write_metric_value( + name.by_ref(), + labels.by_ref().compose_with(HllShardLabel { + hll_shard: hll_shard as i64, + }), + MetricValue::Int(val as i64), + ) + }) } - - for label_pair in &desc.const_label_pairs { - label_pairs.push(label_pair.clone()); - } - label_pairs.sort(); - Ok(label_pairs) } #[cfg(test)] mod tests { use std::collections::HashSet; - use prometheus::{proto, Opts}; + use measured::{label::StaticLabelSet, FixedCardinalityLabel}; use rand::{rngs::StdRng, Rng, SeedableRng}; use rand_distr::{Distribution, Zipf}; use crate::HyperLogLogVec; - fn collect(hll: &HyperLogLogVec<32>) -> Vec { - let mut metrics = vec![]; - hll.core - .children - .read() - .unwrap() - .values() - .for_each(|c| c.core.collect_into(&mut metrics)); - metrics + #[derive(FixedCardinalityLabel, Clone, Copy)] + #[label(singleton = "x")] + enum Label { + A, + B, } - fn get_cardinality(metrics: &[proto::Metric], filter: impl Fn(&proto::Metric) -> bool) -> f64 { + + fn collect(hll: &HyperLogLogVec, 32>) -> ([u8; 32], [u8; 32]) { + // cannot go through the `hll.collect_family_into` interface yet... + // need to see if I can fix the conflicting impls problem in measured. + ( + hll.get_metric(hll.with_labels(Label::A)).take_sample(), + hll.get_metric(hll.with_labels(Label::B)).take_sample(), + ) + } + + fn get_cardinality(samples: &[[u8; 32]]) -> f64 { let mut buckets = [0.0; 32]; - for metric in metrics.chunks_exact(32) { - if filter(&metric[0]) { - for (i, m) in metric.iter().enumerate() { - buckets[i] = f64::max(buckets[i], m.get_gauge().get_value()); - } + for &sample in samples { + for (i, m) in sample.into_iter().enumerate() { + buckets[i] = f64::max(buckets[i], m as f64); } } @@ -437,7 +238,7 @@ mod tests { } fn test_cardinality(n: usize, dist: impl Distribution) -> ([usize; 3], [f64; 3]) { - let hll = HyperLogLogVec::<32>::new(Opts::new("foo", "bar"), &["x"]).unwrap(); + let hll = HyperLogLogVec::, 32>::new(); let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist); let mut set_a = HashSet::new(); @@ -445,18 +246,20 @@ mod tests { for x in iter.by_ref().take(n) { set_a.insert(x.to_bits()); - hll.with_label_values(&["a"]).measure(&x.to_bits()); + hll.get_metric(hll.with_labels(Label::A)) + .measure(&x.to_bits()); } for x in iter.by_ref().take(n) { set_b.insert(x.to_bits()); - hll.with_label_values(&["b"]).measure(&x.to_bits()); + hll.get_metric(hll.with_labels(Label::B)) + .measure(&x.to_bits()); } let merge = &set_a | &set_b; - let metrics = collect(&hll); - let len = get_cardinality(&metrics, |_| true); - let len_a = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "a"); - let len_b = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "b"); + let (a, b) = collect(&hll); + let len = get_cardinality(&[a, b]); + let len_a = get_cardinality(&[a]); + let len_b = get_cardinality(&[b]); ([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b]) } diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 22b0a18933..2cf3cdeaa7 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -4,6 +4,17 @@ //! a default registry. #![deny(clippy::undocumented_unsafe_blocks)] +use measured::{ + label::{LabelGroupSet, LabelGroupVisitor, LabelName, NoLabels}, + metric::{ + counter::CounterState, + gauge::GaugeState, + group::{Encoding, MetricValue}, + name::{MetricName, MetricNameEncoder}, + MetricEncoding, MetricFamilyEncoding, + }, + FixedCardinalityLabel, LabelGroup, MetricGroup, +}; use once_cell::sync::Lazy; use prometheus::core::{ Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec, @@ -11,6 +22,7 @@ use prometheus::core::{ pub use prometheus::opts; pub use prometheus::register; pub use prometheus::Error; +use prometheus::Registry; pub use prometheus::{core, default_registry, proto}; pub use prometheus::{exponential_buckets, linear_buckets}; pub use prometheus::{register_counter_vec, Counter, CounterVec}; @@ -23,13 +35,12 @@ pub use prometheus::{register_int_counter_vec, IntCounterVec}; pub use prometheus::{register_int_gauge, IntGauge}; pub use prometheus::{register_int_gauge_vec, IntGaugeVec}; pub use prometheus::{Encoder, TextEncoder}; -use prometheus::{Registry, Result}; pub mod launch_timestamp; mod wrappers; pub use wrappers::{CountedReader, CountedWriter}; mod hll; -pub use hll::{HyperLogLog, HyperLogLogVec}; +pub use hll::{HyperLogLog, HyperLogLogState, HyperLogLogVec}; #[cfg(target_os = "linux")] pub mod more_process_metrics; @@ -59,7 +70,7 @@ static INTERNAL_REGISTRY: Lazy = Lazy::new(Registry::new); /// Register a collector in the internal registry. MUST be called before the first call to `gather()`. /// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector /// while holding the lock. -pub fn register_internal(c: Box) -> Result<()> { +pub fn register_internal(c: Box) -> prometheus::Result<()> { INTERNAL_REGISTRY.register(c) } @@ -96,6 +107,127 @@ pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[ 0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5, ]; +pub struct BuildInfo { + pub revision: &'static str, + pub build_tag: &'static str, +} + +// todo: allow label group without the set +impl LabelGroup for BuildInfo { + fn visit_values(&self, v: &mut impl LabelGroupVisitor) { + const REVISION: &LabelName = LabelName::from_str("revision"); + v.write_value(REVISION, &self.revision); + const BUILD_TAG: &LabelName = LabelName::from_str("build_tag"); + v.write_value(BUILD_TAG, &self.build_tag); + } +} + +impl MetricFamilyEncoding for BuildInfo +where + GaugeState: MetricEncoding, +{ + fn collect_family_into( + &self, + name: impl measured::metric::name::MetricNameEncoder, + enc: &mut T, + ) -> Result<(), T::Err> { + enc.write_help(&name, "Build/version information")?; + GaugeState::write_type(&name, enc)?; + GaugeState { + count: std::sync::atomic::AtomicI64::new(1), + } + .collect_into(&(), self, name, enc) + } +} + +#[derive(MetricGroup)] +#[metric(new(build_info: BuildInfo))] +pub struct NeonMetrics { + #[cfg(target_os = "linux")] + #[metric(namespace = "process")] + #[metric(init = measured_process::ProcessCollector::for_self())] + process: measured_process::ProcessCollector, + + #[metric(namespace = "libmetrics")] + #[metric(init = LibMetrics::new(build_info))] + libmetrics: LibMetrics, +} + +#[derive(MetricGroup)] +#[metric(new(build_info: BuildInfo))] +pub struct LibMetrics { + #[metric(init = build_info)] + build_info: BuildInfo, + + #[metric(flatten)] + rusage: Rusage, + + serve_count: CollectionCounter, +} + +fn write_gauge( + x: i64, + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut Enc, +) -> Result<(), Enc::Err> { + enc.write_metric_value(name, labels, MetricValue::Int(x)) +} + +#[derive(Default)] +struct Rusage; + +#[derive(FixedCardinalityLabel, Clone, Copy)] +#[label(singleton = "io_operation")] +enum IoOp { + Read, + Write, +} + +impl MetricGroup for Rusage +where + GaugeState: MetricEncoding, +{ + fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> { + const DISK_IO: &MetricName = MetricName::from_str("disk_io_bytes_total"); + const MAXRSS: &MetricName = MetricName::from_str("maxrss_kb"); + + let ru = get_rusage_stats(); + + enc.write_help( + DISK_IO, + "Bytes written and read from disk, grouped by the operation (read|write)", + )?; + GaugeState::write_type(DISK_IO, enc)?; + write_gauge(ru.ru_inblock * BYTES_IN_BLOCK, IoOp::Read, DISK_IO, enc)?; + write_gauge(ru.ru_oublock * BYTES_IN_BLOCK, IoOp::Write, DISK_IO, enc)?; + + enc.write_help(MAXRSS, "Memory usage (Maximum Resident Set Size)")?; + GaugeState::write_type(MAXRSS, enc)?; + write_gauge(ru.ru_maxrss, IoOp::Read, MAXRSS, enc)?; + + Ok(()) + } +} + +#[derive(Default)] +struct CollectionCounter(CounterState); + +impl MetricFamilyEncoding for CollectionCounter +where + CounterState: MetricEncoding, +{ + fn collect_family_into( + &self, + name: impl measured::metric::name::MetricNameEncoder, + enc: &mut T, + ) -> Result<(), T::Err> { + self.0.inc(); + enc.write_help(&name, "Number of metric requests made")?; + self.0.collect_into(&(), NoLabels, name, enc) + } +} + pub fn set_build_info_metric(revision: &str, build_tag: &str) { let metric = register_int_gauge_vec!( "libmetrics_build_info", @@ -105,6 +237,7 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) { .expect("Failed to register build info metric"); metric.with_label_values(&[revision, build_tag]).set(1); } +const BYTES_IN_BLOCK: i64 = 512; // Records I/O stats in a "cross-platform" way. // Compiles both on macOS and Linux, but current macOS implementation always returns 0 as values for I/O stats. @@ -117,7 +250,6 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) { fn update_rusage_metrics() { let rusage_stats = get_rusage_stats(); - const BYTES_IN_BLOCK: i64 = 512; DISK_IO_BYTES .with_label_values(&["read"]) .set(rusage_stats.ru_inblock * BYTES_IN_BLOCK); @@ -151,6 +283,7 @@ macro_rules! register_int_counter_pair_vec { } }}; } + /// Create an [`IntCounterPair`] and registers to default registry. #[macro_export(local_inner_macros)] macro_rules! register_int_counter_pair { @@ -188,7 +321,10 @@ impl GenericCounterPairVec

{ /// /// An error is returned if the number of label values is not the same as the /// number of VariableLabels in Desc. - pub fn get_metric_with_label_values(&self, vals: &[&str]) -> Result> { + pub fn get_metric_with_label_values( + &self, + vals: &[&str], + ) -> prometheus::Result> { Ok(GenericCounterPair { inc: self.inc.get_metric_with_label_values(vals)?, dec: self.dec.get_metric_with_label_values(vals)?, @@ -201,7 +337,7 @@ impl GenericCounterPairVec

{ self.get_metric_with_label_values(vals).unwrap() } - pub fn remove_label_values(&self, res: &mut [Result<()>; 2], vals: &[&str]) { + pub fn remove_label_values(&self, res: &mut [prometheus::Result<()>; 2], vals: &[&str]) { res[0] = self.inc.remove_label_values(vals); res[1] = self.dec.remove_label_values(vals); } @@ -285,3 +421,171 @@ pub type IntCounterPair = GenericCounterPair; /// A guard for [`IntCounterPair`] that will decrement the gauge on drop pub type IntCounterPairGuard = GenericCounterPairGuard; + +pub trait CounterPairAssoc { + const INC_NAME: &'static MetricName; + const DEC_NAME: &'static MetricName; + + const INC_HELP: &'static str; + const DEC_HELP: &'static str; + + type LabelGroupSet: LabelGroupSet; +} + +pub struct CounterPairVec { + vec: measured::metric::MetricVec, +} + +impl Default for CounterPairVec +where + A::LabelGroupSet: Default, +{ + fn default() -> Self { + Self { + vec: Default::default(), + } + } +} + +impl CounterPairVec { + pub fn guard( + &self, + labels: ::Group<'_>, + ) -> MeasuredCounterPairGuard<'_, A> { + let id = self.vec.with_labels(labels); + self.vec.get_metric(id).inc.inc(); + MeasuredCounterPairGuard { vec: &self.vec, id } + } + pub fn inc(&self, labels: ::Group<'_>) { + let id = self.vec.with_labels(labels); + self.vec.get_metric(id).inc.inc(); + } + pub fn dec(&self, labels: ::Group<'_>) { + let id = self.vec.with_labels(labels); + self.vec.get_metric(id).dec.inc(); + } + pub fn remove_metric( + &self, + labels: ::Group<'_>, + ) -> Option { + let id = self.vec.with_labels(labels); + self.vec.remove_metric(id) + } +} + +impl ::measured::metric::group::MetricGroup for CounterPairVec +where + T: ::measured::metric::group::Encoding, + A: CounterPairAssoc, + ::measured::metric::counter::CounterState: ::measured::metric::MetricEncoding, +{ + fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> { + // write decrement first to avoid a race condition where inc - dec < 0 + T::write_help(enc, A::DEC_NAME, A::DEC_HELP)?; + self.vec + .collect_family_into(A::DEC_NAME, &mut Dec(&mut *enc))?; + + T::write_help(enc, A::INC_NAME, A::INC_HELP)?; + self.vec + .collect_family_into(A::INC_NAME, &mut Inc(&mut *enc))?; + + Ok(()) + } +} + +#[derive(MetricGroup, Default)] +pub struct MeasuredCounterPairState { + pub inc: CounterState, + pub dec: CounterState, +} + +impl measured::metric::MetricType for MeasuredCounterPairState { + type Metadata = (); +} + +pub struct MeasuredCounterPairGuard<'a, A: CounterPairAssoc> { + vec: &'a measured::metric::MetricVec, + id: measured::metric::LabelId, +} + +impl Drop for MeasuredCounterPairGuard<'_, A> { + fn drop(&mut self) { + self.vec.get_metric(self.id).dec.inc(); + } +} + +/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the inc counter to the inner encoder. +struct Inc(T); +/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the dec counter to the inner encoder. +struct Dec(T); + +impl Encoding for Inc { + type Err = T::Err; + + fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> { + self.0.write_help(name, help) + } + + fn write_metric_value( + &mut self, + name: impl MetricNameEncoder, + labels: impl LabelGroup, + value: MetricValue, + ) -> Result<(), Self::Err> { + self.0.write_metric_value(name, labels, value) + } +} + +impl MetricEncoding> for MeasuredCounterPairState +where + CounterState: MetricEncoding, +{ + fn write_type(name: impl MetricNameEncoder, enc: &mut Inc) -> Result<(), T::Err> { + CounterState::write_type(name, &mut enc.0) + } + fn collect_into( + &self, + metadata: &(), + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut Inc, + ) -> Result<(), T::Err> { + self.inc.collect_into(metadata, labels, name, &mut enc.0) + } +} + +impl Encoding for Dec { + type Err = T::Err; + + fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> { + self.0.write_help(name, help) + } + + fn write_metric_value( + &mut self, + name: impl MetricNameEncoder, + labels: impl LabelGroup, + value: MetricValue, + ) -> Result<(), Self::Err> { + self.0.write_metric_value(name, labels, value) + } +} + +/// Write the dec counter to the encoder +impl MetricEncoding> for MeasuredCounterPairState +where + CounterState: MetricEncoding, +{ + fn write_type(name: impl MetricNameEncoder, enc: &mut Dec) -> Result<(), T::Err> { + CounterState::write_type(name, &mut enc.0) + } + fn collect_into( + &self, + metadata: &(), + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut Dec, + ) -> Result<(), T::Err> { + self.dec.collect_into(metadata, labels, name, &mut enc.0) + } +} diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index ad4ca6710d..b4909f247f 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -20,6 +20,7 @@ use utils::{ history_buffer::HistoryBufferWithDropCounter, id::{NodeId, TenantId, TimelineId}, lsn::Lsn, + serde_system_time, }; use crate::controller_api::PlacementPolicy; @@ -758,11 +759,7 @@ pub struct WalRedoManagerStatus { #[derive(Default, Debug, Serialize, Deserialize, Clone)] pub struct SecondaryProgress { /// The remote storage LastModified time of the heatmap object we last downloaded. - #[serde( - serialize_with = "opt_ser_rfc3339_millis", - deserialize_with = "opt_deser_rfc3339_millis" - )] - pub heatmap_mtime: Option, + pub heatmap_mtime: Option, /// The number of layers currently on-disk pub layers_downloaded: usize, @@ -775,29 +772,6 @@ pub struct SecondaryProgress { pub bytes_total: u64, } -fn opt_ser_rfc3339_millis( - ts: &Option, - serializer: S, -) -> Result { - match ts { - Some(ts) => serializer.collect_str(&humantime::format_rfc3339_millis(*ts)), - None => serializer.serialize_none(), - } -} - -fn opt_deser_rfc3339_millis<'de, D>(deserializer: D) -> Result, D::Error> -where - D: serde::de::Deserializer<'de>, -{ - let s: Option = serde::de::Deserialize::deserialize(deserializer)?; - match s { - None => Ok(None), - Some(s) => humantime::parse_rfc3339(&s) - .map_err(serde::de::Error::custom) - .map(Some), - } -} - pub mod virtual_file { #[derive( Copy, diff --git a/libs/pageserver_api/src/models/utilization.rs b/libs/pageserver_api/src/models/utilization.rs index f5984dff5d..e88cab5d6a 100644 --- a/libs/pageserver_api/src/models/utilization.rs +++ b/libs/pageserver_api/src/models/utilization.rs @@ -1,4 +1,4 @@ -use std::time::SystemTime; +use utils::serde_system_time::SystemTime; /// Pageserver current utilization and scoring for how good candidate the pageserver would be for /// the next tenant. @@ -21,28 +21,9 @@ pub struct PageserverUtilization { /// When was this snapshot captured, pageserver local time. /// /// Use millis to give confidence that the value is regenerated often enough. - #[serde( - serialize_with = "ser_rfc3339_millis", - deserialize_with = "deser_rfc3339_millis" - )] pub captured_at: SystemTime, } -fn ser_rfc3339_millis( - ts: &SystemTime, - serializer: S, -) -> Result { - serializer.collect_str(&humantime::format_rfc3339_millis(*ts)) -} - -fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result -where - D: serde::de::Deserializer<'de>, -{ - let s: String = serde::de::Deserialize::deserialize(deserializer)?; - humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom) -} - /// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients. /// /// Instead of newtype, use this because a newtype would get require handling deserializing values @@ -69,7 +50,9 @@ mod tests { disk_usage_bytes: u64::MAX, free_space_bytes: 0, utilization_score: u64::MAX, - captured_at: SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779), + captured_at: SystemTime( + std::time::SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779), + ), }; let s = serde_json::to_string(&doc).unwrap(); diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index c2d9d9d396..a6a081c5c1 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -22,6 +22,7 @@ camino.workspace = true chrono.workspace = true heapless.workspace = true hex = { workspace = true, features = ["serde"] } +humantime.workspace = true hyper = { workspace = true, features = ["full"] } fail.workspace = true futures = { workspace = true} diff --git a/libs/utils/src/env.rs b/libs/utils/src/env.rs new file mode 100644 index 0000000000..b3e326bfd0 --- /dev/null +++ b/libs/utils/src/env.rs @@ -0,0 +1,21 @@ +//! Wrapper around `std::env::var` for parsing environment variables. + +use std::{fmt::Display, str::FromStr}; + +pub fn var(varname: &str) -> Option +where + V: FromStr, + E: Display, +{ + match std::env::var(varname) { + Ok(s) => Some( + s.parse() + .map_err(|e| format!("failed to parse env var {varname}: {e:#}")) + .unwrap(), + ), + Err(std::env::VarError::NotPresent) => None, + Err(std::env::VarError::NotUnicode(_)) => { + panic!("env var {varname} is not unicode") + } + } +} diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 04ce0626c8..b09350d11e 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -63,6 +63,7 @@ pub mod measured_stream; pub mod serde_percent; pub mod serde_regex; +pub mod serde_system_time; pub mod pageserver_feedback; @@ -89,6 +90,8 @@ pub mod yielding_loop; pub mod zstd; +pub mod env; + /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// /// we have several cases: diff --git a/libs/utils/src/serde_system_time.rs b/libs/utils/src/serde_system_time.rs new file mode 100644 index 0000000000..b0f6934e87 --- /dev/null +++ b/libs/utils/src/serde_system_time.rs @@ -0,0 +1,55 @@ +//! A `serde::{Deserialize,Serialize}` type for SystemTime with RFC3339 format and millisecond precision. + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, serde::Serialize, serde::Deserialize)] +#[serde(transparent)] +pub struct SystemTime( + #[serde( + deserialize_with = "deser_rfc3339_millis", + serialize_with = "ser_rfc3339_millis" + )] + pub std::time::SystemTime, +); + +fn ser_rfc3339_millis( + ts: &std::time::SystemTime, + serializer: S, +) -> Result { + serializer.collect_str(&humantime::format_rfc3339_millis(*ts)) +} + +fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result +where + D: serde::de::Deserializer<'de>, +{ + let s: String = serde::de::Deserialize::deserialize(deserializer)?; + humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom) +} + +#[cfg(test)] +mod tests { + use super::*; + + /// Helper function to make a SystemTime have millisecond precision by truncating additional nanoseconds. + fn to_millisecond_precision(time: SystemTime) -> SystemTime { + match time.0.duration_since(std::time::SystemTime::UNIX_EPOCH) { + Ok(duration) => { + let total_millis = duration.as_secs() * 1_000 + u64::from(duration.subsec_millis()); + SystemTime( + std::time::SystemTime::UNIX_EPOCH + + std::time::Duration::from_millis(total_millis), + ) + } + Err(_) => time, + } + } + + #[test] + fn test_serialize_deserialize() { + let input = SystemTime(std::time::SystemTime::now()); + let expected_serialized = format!("\"{}\"", humantime::format_rfc3339_millis(input.0)); + let serialized = serde_json::to_string(&input).unwrap(); + assert_eq!(expected_serialized, serialized); + let deserialized: SystemTime = serde_json::from_str(&expected_serialized).unwrap(); + assert_eq!(to_millisecond_precision(input), deserialized); + } +} diff --git a/pageserver/compaction/Cargo.toml b/pageserver/compaction/Cargo.toml index 47f318db63..0fd1d81845 100644 --- a/pageserver/compaction/Cargo.toml +++ b/pageserver/compaction/Cargo.toml @@ -11,7 +11,6 @@ default = [] anyhow.workspace = true async-compression.workspace = true async-stream.workspace = true -async-trait.workspace = true byteorder.workspace = true bytes.workspace = true chrono = { workspace = true, features = ["serde"] } diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs index 22a410b4af..9de6363d6e 100644 --- a/pageserver/compaction/src/helpers.rs +++ b/pageserver/compaction/src/helpers.rs @@ -180,7 +180,7 @@ where match top.deref_mut() { LazyLoadLayer::Unloaded(ref mut l) => { let fut = l.load_keys(this.ctx); - this.load_future.set(Some(fut)); + this.load_future.set(Some(Box::pin(fut))); continue; } LazyLoadLayer::Loaded(ref mut entries) => { diff --git a/pageserver/compaction/src/interface.rs b/pageserver/compaction/src/interface.rs index 2bb2e749c0..5dc62e506f 100644 --- a/pageserver/compaction/src/interface.rs +++ b/pageserver/compaction/src/interface.rs @@ -3,7 +3,6 @@ //! //! All the heavy lifting is done by the create_image and create_delta //! functions that the implementor provides. -use async_trait::async_trait; use futures::Future; use pageserver_api::{key::Key, keyspace::key_range_size}; use std::ops::Range; @@ -141,18 +140,16 @@ pub trait CompactionLayer { fn is_delta(&self) -> bool; } - -#[async_trait] pub trait CompactionDeltaLayer: CompactionLayer { type DeltaEntry<'a>: CompactionDeltaEntry<'a, E::Key> where Self: 'a; /// Return all keys in this delta layer. - async fn load_keys<'a>( + fn load_keys<'a>( &self, ctx: &E::RequestContext, - ) -> anyhow::Result>>; + ) -> impl Future>>> + Send; } pub trait CompactionImageLayer: CompactionLayer {} diff --git a/pageserver/compaction/src/simulator.rs b/pageserver/compaction/src/simulator.rs index def7983e75..6c00df3a65 100644 --- a/pageserver/compaction/src/simulator.rs +++ b/pageserver/compaction/src/simulator.rs @@ -2,7 +2,6 @@ mod draw; use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp}; -use async_trait::async_trait; use futures::StreamExt; use rand::Rng; use tracing::info; @@ -139,7 +138,6 @@ impl interface::CompactionLayer for Arc { } } -#[async_trait] impl interface::CompactionDeltaLayer for Arc { type DeltaEntry<'a> = MockRecord; diff --git a/pageserver/ctl/Cargo.toml b/pageserver/ctl/Cargo.toml index c5cd451e8d..843f5dd862 100644 --- a/pageserver/ctl/Cargo.toml +++ b/pageserver/ctl/Cargo.toml @@ -12,9 +12,14 @@ bytes.workspace = true camino.workspace = true clap = { workspace = true, features = ["string"] } git-version.workspace = true +humantime.workspace = true pageserver = { path = ".." } +pageserver_api.workspace = true +remote_storage = { path = "../../libs/remote_storage" } postgres_ffi.workspace = true tokio.workspace = true +tokio-util.workspace = true +toml_edit.workspace = true utils.workspace = true svg_fmt.workspace = true workspace_hack.workspace = true diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index e73d961e36..1fb75584fc 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -9,6 +9,11 @@ mod index_part; mod layer_map_analyzer; mod layers; +use std::{ + str::FromStr, + time::{Duration, SystemTime}, +}; + use camino::{Utf8Path, Utf8PathBuf}; use clap::{Parser, Subcommand}; use index_part::IndexPartCmd; @@ -20,8 +25,16 @@ use pageserver::{ tenant::{dump_layerfile_from_path, metadata::TimelineMetadata}, virtual_file, }; +use pageserver_api::shard::TenantShardId; use postgres_ffi::ControlFileData; -use utils::{lsn::Lsn, project_git_version}; +use remote_storage::{RemotePath, RemoteStorageConfig}; +use tokio_util::sync::CancellationToken; +use utils::{ + id::TimelineId, + logging::{self, LogFormat, TracingErrorLayerEnablement}, + lsn::Lsn, + project_git_version, +}; project_git_version!(GIT_VERSION); @@ -43,6 +56,7 @@ enum Commands { #[command(subcommand)] IndexPart(IndexPartCmd), PrintLayerFile(PrintLayerFileCmd), + TimeTravelRemotePrefix(TimeTravelRemotePrefixCmd), DrawTimeline {}, AnalyzeLayerMap(AnalyzeLayerMapCmd), #[command(subcommand)] @@ -68,6 +82,26 @@ struct PrintLayerFileCmd { path: Utf8PathBuf, } +/// Roll back the time for the specified prefix using S3 history. +/// +/// The command is fairly low level and powerful. Validation is only very light, +/// so it is more powerful, and thus potentially more dangerous. +#[derive(Parser)] +struct TimeTravelRemotePrefixCmd { + /// A configuration string for the remote_storage configuration. + /// + /// Example: `remote_storage = { bucket_name = "aws-storage-bucket-name", bucket_region = "us-east-2" }` + config_toml_str: String, + /// remote prefix to time travel recover. For safety reasons, we require it to contain + /// a timeline or tenant ID in the prefix. + prefix: String, + /// Timestamp to travel to. Given in format like `2024-01-20T10:45:45Z`. Assumes UTC and second accuracy. + travel_to: String, + /// Timestamp of the start of the operation, must be after any changes we want to roll back and after. + /// You can use a few seconds before invoking the command. Same format as `travel_to`. + done_if_after: Option, +} + #[derive(Parser)] struct AnalyzeLayerMapCmd { /// Pageserver data path @@ -78,6 +112,14 @@ struct AnalyzeLayerMapCmd { #[tokio::main] async fn main() -> anyhow::Result<()> { + logging::init( + LogFormat::Plain, + TracingErrorLayerEnablement::EnableWithRustLogFilter, + logging::Output::Stdout, + )?; + + logging::replace_panic_hook_with_tracing_panic_hook().forget(); + let cli = CliOpts::parse(); match cli.command { @@ -105,6 +147,42 @@ async fn main() -> anyhow::Result<()> { print_layerfile(&cmd.path).await?; } } + Commands::TimeTravelRemotePrefix(cmd) => { + let timestamp = humantime::parse_rfc3339(&cmd.travel_to) + .map_err(|_e| anyhow::anyhow!("Invalid time for travel_to: '{}'", cmd.travel_to))?; + + let done_if_after = if let Some(done_if_after) = &cmd.done_if_after { + humantime::parse_rfc3339(done_if_after).map_err(|_e| { + anyhow::anyhow!("Invalid time for done_if_after: '{}'", done_if_after) + })? + } else { + const SAFETY_MARGIN: Duration = Duration::from_secs(3); + tokio::time::sleep(SAFETY_MARGIN).await; + // Convert to string representation and back to get rid of sub-second values + let done_if_after = SystemTime::now(); + tokio::time::sleep(SAFETY_MARGIN).await; + done_if_after + }; + + let timestamp = strip_subsecond(timestamp); + let done_if_after = strip_subsecond(done_if_after); + + let Some(prefix) = validate_prefix(&cmd.prefix) else { + println!("specified prefix '{}' failed validation", cmd.prefix); + return Ok(()); + }; + let toml_document = toml_edit::Document::from_str(&cmd.config_toml_str)?; + let toml_item = toml_document + .get("remote_storage") + .expect("need remote_storage"); + let config = RemoteStorageConfig::from_toml(toml_item)?.expect("incomplete config"); + let storage = remote_storage::GenericRemoteStorage::from_config(&config); + let cancel = CancellationToken::new(); + storage + .unwrap() + .time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel) + .await?; + } }; Ok(()) } @@ -185,3 +263,89 @@ fn handle_metadata( Ok(()) } + +/// Ensures that the given S3 prefix is sufficiently constrained. +/// The command is very risky already and we don't want to expose something +/// that allows usually unintentional and quite catastrophic time travel of +/// an entire bucket, which would be a major catastrophy and away +/// by only one character change (similar to "rm -r /home /username/foobar"). +fn validate_prefix(prefix: &str) -> Option { + if prefix.is_empty() { + // Empty prefix means we want to specify the *whole* bucket + return None; + } + let components = prefix.split('/').collect::>(); + let (last, components) = { + let last = components.last()?; + if last.is_empty() { + ( + components.iter().nth_back(1)?, + &components[..(components.len() - 1)], + ) + } else { + (last, &components[..]) + } + }; + 'valid: { + if let Ok(_timeline_id) = TimelineId::from_str(last) { + // Ends in either a tenant or timeline ID + break 'valid; + } + if *last == "timelines" { + if let Some(before_last) = components.iter().nth_back(1) { + if let Ok(_tenant_id) = TenantShardId::from_str(before_last) { + // Has a valid tenant id + break 'valid; + } + } + } + + return None; + } + RemotePath::from_string(prefix).ok() +} + +fn strip_subsecond(timestamp: SystemTime) -> SystemTime { + let ts_str = humantime::format_rfc3339_seconds(timestamp).to_string(); + humantime::parse_rfc3339(&ts_str).expect("can't parse just created timestamp") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_validate_prefix() { + assert_eq!(validate_prefix(""), None); + assert_eq!(validate_prefix("/"), None); + #[track_caller] + fn assert_valid(prefix: &str) { + let remote_path = RemotePath::from_string(prefix).unwrap(); + assert_eq!(validate_prefix(prefix), Some(remote_path)); + } + assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/"); + // Path is not relative but absolute + assert_eq!( + validate_prefix( + "/wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/" + ), + None + ); + assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/"); + // Partial tenant IDs should be invalid, S3 will match all tenants with the specific ID prefix + assert_eq!(validate_prefix("wal/3aa8fcc61f6d357410b7d"), None); + assert_eq!(validate_prefix("wal"), None); + assert_eq!(validate_prefix("/wal/"), None); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001"); + // Partial tenant ID + assert_eq!( + validate_prefix("pageserver/v1/tenants/3aa8fcc61f6d357410b"), + None + ); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines"); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001-0004/timelines"); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/"); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/641e5342083b2235ee3deb8066819683"); + assert_eq!(validate_prefix("pageserver/v1/tenants/"), None); + } +} diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index c80230d4d7..0903b206ff 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -18,6 +18,7 @@ use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; use pageserver::task_mgr::WALRECEIVER_RUNTIME; use pageserver::tenant::{secondary, TenantSharedResources}; use remote_storage::GenericRemoteStorage; +use tokio::signal::unix::SignalKind; use tokio::time::Instant; use tracing::*; @@ -671,42 +672,37 @@ fn start_pageserver( let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard()); // All started up! Now just sit and wait for shutdown signal. - { - use signal_hook::consts::*; - let signal_handler = BACKGROUND_RUNTIME.spawn_blocking(move || { - let mut signals = - signal_hook::iterator::Signals::new([SIGINT, SIGTERM, SIGQUIT]).unwrap(); - return signals - .forever() - .next() - .expect("forever() never returns None unless explicitly closed"); - }); - let signal = BACKGROUND_RUNTIME - .block_on(signal_handler) - .expect("join error"); - match signal { - SIGQUIT => { - info!("Got signal {signal}. Terminating in immediate shutdown mode",); - std::process::exit(111); - } - SIGINT | SIGTERM => { - info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",); - // This cancels the `shutdown_pageserver` cancellation tree. - // Right now that tree doesn't reach very far, and `task_mgr` is used instead. - // The plan is to change that over time. - shutdown_pageserver.take(); - let bg_remote_storage = remote_storage.clone(); - let bg_deletion_queue = deletion_queue.clone(); - BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver( - &tenant_manager, - bg_remote_storage.map(|_| bg_deletion_queue), - 0, - )); - unreachable!() - } - _ => unreachable!(), - } + { + BACKGROUND_RUNTIME.block_on(async move { + let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap(); + let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap(); + let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap(); + let signal = tokio::select! { + _ = sigquit.recv() => { + info!("Got signal SIGQUIT. Terminating in immediate shutdown mode",); + std::process::exit(111); + } + _ = sigint.recv() => { "SIGINT" }, + _ = sigterm.recv() => { "SIGTERM" }, + }; + + info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",); + + // This cancels the `shutdown_pageserver` cancellation tree. + // Right now that tree doesn't reach very far, and `task_mgr` is used instead. + // The plan is to change that over time. + shutdown_pageserver.take(); + let bg_remote_storage = remote_storage.clone(); + let bg_deletion_queue = deletion_queue.clone(); + pageserver::shutdown_pageserver( + &tenant_manager, + bg_remote_storage.map(|_| bg_deletion_queue), + 0, + ) + .await; + unreachable!() + }) } } diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index ab9a2e8509..3160f204e2 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -2100,6 +2100,7 @@ pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) { use futures::Future; use pin_project_lite::pin_project; use std::collections::HashMap; +use std::num::NonZeroUsize; use std::pin::Pin; use std::sync::{Arc, Mutex}; use std::task::{Context, Poll}; @@ -2669,6 +2670,26 @@ pub(crate) mod disk_usage_based_eviction { pub(crate) static METRICS: Lazy = Lazy::new(Metrics::default); } +static TOKIO_EXECUTOR_THREAD_COUNT: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_tokio_executor_thread_configured_count", + "Total number of configued tokio executor threads in the process. + The `setup` label denotes whether we're running with multiple runtimes or a single runtime.", + &["setup"], + ) + .unwrap() +}); + +pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) { + static SERIALIZE: std::sync::Mutex<()> = std::sync::Mutex::new(()); + let _guard = SERIALIZE.lock().unwrap(); + TOKIO_EXECUTOR_THREAD_COUNT.reset(); + TOKIO_EXECUTOR_THREAD_COUNT + .get_metric_with_label_values(&[setup]) + .unwrap() + .set(u64::try_from(num_threads.get()).unwrap()); +} + pub fn preinitialize_metrics() { // Python tests need these and on some we do alerting. // diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 0cc5611a12..9a1e354ecf 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -33,13 +33,14 @@ use std::collections::HashMap; use std::fmt; use std::future::Future; +use std::num::NonZeroUsize; use std::panic::AssertUnwindSafe; +use std::str::FromStr; use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; use futures::FutureExt; use pageserver_api::shard::TenantShardId; -use tokio::runtime::Runtime; use tokio::task::JoinHandle; use tokio::task_local; use tokio_util::sync::CancellationToken; @@ -48,8 +49,11 @@ use tracing::{debug, error, info, warn}; use once_cell::sync::Lazy; +use utils::env; use utils::id::TimelineId; +use crate::metrics::set_tokio_runtime_setup; + // // There are four runtimes: // @@ -98,52 +102,119 @@ use utils::id::TimelineId; // other operations, if the upload tasks e.g. get blocked on locks. It shouldn't // happen, but still. // -pub static COMPUTE_REQUEST_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("compute request worker") - .enable_all() - .build() - .expect("Failed to create compute request runtime") -}); -pub static MGMT_REQUEST_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("mgmt request worker") - .enable_all() - .build() - .expect("Failed to create mgmt request runtime") -}); - -pub static WALRECEIVER_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("walreceiver worker") - .enable_all() - .build() - .expect("Failed to create walreceiver runtime") -}); - -pub static BACKGROUND_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("background op worker") - // if you change the number of worker threads please change the constant below - .enable_all() - .build() - .expect("Failed to create background op runtime") -}); - -pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy = Lazy::new(|| { - // force init and thus panics - let _ = BACKGROUND_RUNTIME.handle(); +pub(crate) static TOKIO_WORKER_THREADS: Lazy = Lazy::new(|| { // replicates tokio-1.28.1::loom::sys::num_cpus which is not available publicly // tokio would had already panicked for parsing errors or NotUnicode // // this will be wrong if any of the runtimes gets their worker threads configured to something // else, but that has not been needed in a long time. - std::env::var("TOKIO_WORKER_THREADS") - .map(|s| s.parse::().unwrap()) - .unwrap_or_else(|_e| usize::max(2, num_cpus::get())) + NonZeroUsize::new( + std::env::var("TOKIO_WORKER_THREADS") + .map(|s| s.parse::().unwrap()) + .unwrap_or_else(|_e| usize::max(2, num_cpus::get())), + ) + .expect("the max() ensures that this is not zero") }); +enum TokioRuntimeMode { + SingleThreaded, + MultiThreaded { num_workers: NonZeroUsize }, +} + +impl FromStr for TokioRuntimeMode { + type Err = String; + + fn from_str(s: &str) -> Result { + match s { + "current_thread" => Ok(TokioRuntimeMode::SingleThreaded), + s => match s.strip_prefix("multi_thread:") { + Some("default") => Ok(TokioRuntimeMode::MultiThreaded { + num_workers: *TOKIO_WORKER_THREADS, + }), + Some(suffix) => { + let num_workers = suffix.parse::().map_err(|e| { + format!( + "invalid number of multi-threaded runtime workers ({suffix:?}): {e}", + ) + })?; + Ok(TokioRuntimeMode::MultiThreaded { num_workers }) + } + None => Err(format!("invalid runtime config: {s:?}")), + }, + } + } +} + +static ONE_RUNTIME: Lazy> = Lazy::new(|| { + let thread_name = "pageserver-tokio"; + let Some(mode) = env::var("NEON_PAGESERVER_USE_ONE_RUNTIME") else { + // If the env var is not set, leave this static as None. + set_tokio_runtime_setup( + "multiple-runtimes", + NUM_MULTIPLE_RUNTIMES + .checked_mul(*TOKIO_WORKER_THREADS) + .unwrap(), + ); + return None; + }; + Some(match mode { + TokioRuntimeMode::SingleThreaded => { + set_tokio_runtime_setup("one-runtime-single-threaded", NonZeroUsize::new(1).unwrap()); + tokio::runtime::Builder::new_current_thread() + .thread_name(thread_name) + .enable_all() + .build() + .expect("failed to create one single runtime") + } + TokioRuntimeMode::MultiThreaded { num_workers } => { + set_tokio_runtime_setup("one-runtime-multi-threaded", num_workers); + tokio::runtime::Builder::new_multi_thread() + .thread_name(thread_name) + .enable_all() + .worker_threads(num_workers.get()) + .build() + .expect("failed to create one multi-threaded runtime") + } + }) +}); + +/// Declare a lazy static variable named `$varname` that will resolve +/// to a tokio runtime handle. If the env var `NEON_PAGESERVER_USE_ONE_RUNTIME` +/// is set, this will resolve to `ONE_RUNTIME`. Otherwise, the macro invocation +/// declares a separate runtime and the lazy static variable `$varname` +/// will resolve to that separate runtime. +/// +/// The result is is that `$varname.spawn()` will use `ONE_RUNTIME` if +/// `NEON_PAGESERVER_USE_ONE_RUNTIME` is set, and will use the separate runtime +/// otherwise. +macro_rules! pageserver_runtime { + ($varname:ident, $name:literal) => { + pub static $varname: Lazy<&'static tokio::runtime::Runtime> = Lazy::new(|| { + if let Some(runtime) = &*ONE_RUNTIME { + return runtime; + } + static RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name($name) + .worker_threads(TOKIO_WORKER_THREADS.get()) + .enable_all() + .build() + .expect(std::concat!("Failed to create runtime ", $name)) + }); + &*RUNTIME + }); + }; +} + +pageserver_runtime!(COMPUTE_REQUEST_RUNTIME, "compute request worker"); +pageserver_runtime!(MGMT_REQUEST_RUNTIME, "mgmt request worker"); +pageserver_runtime!(WALRECEIVER_RUNTIME, "walreceiver worker"); +pageserver_runtime!(BACKGROUND_RUNTIME, "background op worker"); +// Bump this number when adding a new pageserver_runtime! +// SAFETY: it's obviously correct +const NUM_MULTIPLE_RUNTIMES: NonZeroUsize = unsafe { NonZeroUsize::new_unchecked(4) }; + #[derive(Debug, Clone, Copy)] pub struct PageserverTaskId(u64); diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 530e1a3244..5b29c126d1 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -51,7 +51,7 @@ use tokio_util::sync::CancellationToken; use tracing::{info_span, instrument, warn, Instrument}; use utils::{ backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext, - id::TimelineId, + id::TimelineId, serde_system_time, }; use super::{ @@ -591,7 +591,7 @@ impl<'a> TenantDownloader<'a> { let mut progress = SecondaryProgress { layers_total: heatmap_stats.layers, bytes_total: heatmap_stats.bytes, - heatmap_mtime: Some(heatmap_mtime), + heatmap_mtime: Some(serde_system_time::SystemTime(heatmap_mtime)), layers_downloaded: 0, bytes_downloaded: 0, }; diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 43942ba2db..29751641b4 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -19,6 +19,7 @@ use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; use std::collections::{BinaryHeap, HashMap, HashSet}; use std::sync::{Arc, OnceLock}; +use std::time::Instant; use tracing::*; use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap}; // avoid binding to Write (conflicts with std::io::Write) @@ -53,6 +54,8 @@ pub struct InMemoryLayer { /// Writes are only allowed when this is `None`. end_lsn: OnceLock, + opened_at: Instant, + /// The above fields never change, except for `end_lsn`, which is only set once. /// All other changing parts are in `inner`, and protected by a mutex. inner: RwLock, @@ -460,6 +463,7 @@ impl InMemoryLayer { tenant_shard_id, start_lsn, end_lsn: OnceLock::new(), + opened_at: Instant::now(), inner: RwLock::new(InMemoryLayerInner { index: HashMap::new(), file, @@ -520,6 +524,10 @@ impl InMemoryLayer { Ok(()) } + pub(crate) fn get_opened_at(&self) -> Instant { + self.opened_at + } + pub(crate) async fn tick(&self) -> Option { let mut inner = self.inner.write().await; let size = inner.file.len(); diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index e4f5f75132..74ed677ffe 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -18,7 +18,7 @@ use utils::{backoff, completion}; static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy = once_cell::sync::Lazy::new(|| { - let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS; + let total_threads = task_mgr::TOKIO_WORKER_THREADS.get(); let permits = usize::max( 1, // while a lot of the work is done on spawn_blocking, we still do @@ -72,6 +72,7 @@ pub(crate) async fn concurrent_background_tasks_rate_limit_permit( loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation ); + // TODO: assert that we run on BACKGROUND_RUNTIME; requires tokio_unstable Handle::id(); match CONCURRENT_BACKGROUND_TASKS.acquire().await { Ok(permit) => permit, Err(_closed) => unreachable!("we never close the semaphore"), diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d3c8c5f66c..d046a60af4 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1257,7 +1257,7 @@ impl Timeline { checkpoint_distance, self.get_last_record_lsn(), self.last_freeze_at.load(), - *self.last_freeze_ts.read().unwrap(), + open_layer.get_opened_at(), ) { match open_layer.info() { InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => { @@ -1622,7 +1622,7 @@ impl Timeline { checkpoint_distance: u64, projected_lsn: Lsn, last_freeze_at: Lsn, - last_freeze_ts: Instant, + opened_at: Instant, ) -> bool { let distance = projected_lsn.widening_sub(last_freeze_at); @@ -1648,13 +1648,13 @@ impl Timeline { ); true - } else if distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() { + } else if distance > 0 && opened_at.elapsed() >= self.get_checkpoint_timeout() { info!( - "Will roll layer at {} with layer size {} due to time since last flush ({:?})", - projected_lsn, - layer_size, - last_freeze_ts.elapsed() - ); + "Will roll layer at {} with layer size {} due to time since first write to the layer ({:?})", + projected_lsn, + layer_size, + opened_at.elapsed() + ); true } else { @@ -4703,23 +4703,16 @@ struct TimelineWriterState { max_lsn: Option, // Cached details of the last freeze. Avoids going trough the atomic/lock on every put. cached_last_freeze_at: Lsn, - cached_last_freeze_ts: Instant, } impl TimelineWriterState { - fn new( - open_layer: Arc, - current_size: u64, - last_freeze_at: Lsn, - last_freeze_ts: Instant, - ) -> Self { + fn new(open_layer: Arc, current_size: u64, last_freeze_at: Lsn) -> Self { Self { open_layer, current_size, prev_lsn: None, max_lsn: None, cached_last_freeze_at: last_freeze_at, - cached_last_freeze_ts: last_freeze_ts, } } } @@ -4818,12 +4811,10 @@ impl<'a> TimelineWriter<'a> { let initial_size = layer.size().await?; let last_freeze_at = self.last_freeze_at.load(); - let last_freeze_ts = *self.last_freeze_ts.read().unwrap(); self.write_guard.replace(TimelineWriterState::new( layer, initial_size, last_freeze_at, - last_freeze_ts, )); Ok(()) @@ -4870,7 +4861,7 @@ impl<'a> TimelineWriter<'a> { self.get_checkpoint_distance(), lsn, state.cached_last_freeze_at, - state.cached_last_freeze_ts, + state.open_layer.get_opened_at(), ) { OpenLayerAction::Roll } else { diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index ab001bf10d..8075775bbc 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -12,7 +12,6 @@ use super::layer_manager::LayerManager; use super::{CompactFlags, DurationRecorder, RecordedDuration, Timeline}; use anyhow::{anyhow, Context}; -use async_trait::async_trait; use enumset::EnumSet; use fail::fail_point; use itertools::Itertools; @@ -1122,7 +1121,6 @@ impl CompactionLayer for ResidentDeltaLayer { } } -#[async_trait] impl CompactionDeltaLayer for ResidentDeltaLayer { type DeltaEntry<'a> = DeltaEntry<'a>; diff --git a/pageserver/src/utilization.rs b/pageserver/src/utilization.rs index 5eccf185ac..e6c835aa75 100644 --- a/pageserver/src/utilization.rs +++ b/pageserver/src/utilization.rs @@ -41,7 +41,7 @@ pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result auth::Result { // we have validated the endpoint exists, so let's intern it. - let endpoint_int = EndpointIdInt::from(endpoint); + let endpoint_int = EndpointIdInt::from(endpoint.normalize()); // only count the full hash count if password hack or websocket flow. // in other words, if proxy needs to run the hashing @@ -210,8 +210,12 @@ impl AuthenticationConfig { enabled = self.rate_limiter_enabled, "rate limiting authentication" ); - AUTH_RATE_LIMIT_HITS.inc(); - ENDPOINTS_AUTH_RATE_LIMITED.measure(endpoint); + Metrics::get().proxy.requests_auth_rate_limits_total.inc(); + Metrics::get() + .proxy + .endpoints_auth_rate_limits + .get_metric() + .measure(endpoint); if self.rate_limiter_enabled { return Err(auth::AuthError::too_many_connections()); diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 89773aa1ff..783a1a5a21 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -4,7 +4,7 @@ use crate::{ auth::password_hack::parse_endpoint_param, context::RequestMonitoring, error::{ReportableError, UserFacingError}, - metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, + metrics::{Metrics, SniKind}, proxy::NeonOptions, serverless::SERVERLESS_DRIVER_SNI, EndpointId, RoleName, @@ -144,21 +144,22 @@ impl ComputeUserInfoMaybeEndpoint { ctx.set_endpoint_id(ep.clone()); } + let metrics = Metrics::get(); info!(%user, "credentials"); if sni.is_some() { info!("Connection with sni"); - NUM_CONNECTION_ACCEPTED_BY_SNI - .with_label_values(&["sni"]) - .inc(); + metrics.proxy.accepted_connections_by_sni.inc(SniKind::Sni); } else if endpoint.is_some() { - NUM_CONNECTION_ACCEPTED_BY_SNI - .with_label_values(&["no_sni"]) - .inc(); + metrics + .proxy + .accepted_connections_by_sni + .inc(SniKind::NoSni); info!("Connection without sni"); } else { - NUM_CONNECTION_ACCEPTED_BY_SNI - .with_label_values(&["password_hack"]) - .inc(); + metrics + .proxy + .accepted_connections_by_sni + .inc(SniKind::PasswordHack); info!("Connection with password hack"); } diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index c28814b1c8..58737efe46 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -176,7 +176,12 @@ async fn task_main( .context("failed to set socket option")?; info!(%peer_addr, "serving"); - let ctx = RequestMonitoring::new(session_id, peer_addr.ip(), "sni_router", "sni"); + let ctx = RequestMonitoring::new( + session_id, + peer_addr.ip(), + proxy::metrics::Protocol::SniRouter, + "sni", + ); handle_client(ctx, dest_suffix, tls_config, tls_server_end_point, socket).await } .unwrap_or_else(|e| { diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 56a3ef79cd..2e749fc7e8 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -18,7 +18,8 @@ use proxy::config::ProjectInfoCacheOptions; use proxy::console; use proxy::context::parquet::ParquetUploadArgs; use proxy::http; -use proxy::metrics::NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT; +use proxy::http::health_server::AppMetrics; +use proxy::metrics::Metrics; use proxy::rate_limiter::AuthRateLimiter; use proxy::rate_limiter::EndpointRateLimiter; use proxy::rate_limiter::RateBucketInfo; @@ -189,7 +190,9 @@ struct ProxyCliArgs { /// cache for `project_info` (use `size=0` to disable) #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)] project_info_cache: String, - + /// cache for all valid endpoints + #[clap(long, default_value = config::EndpointCacheConfig::CACHE_DEFAULT_OPTIONS)] + endpoint_cache_config: String, #[clap(flatten)] parquet_upload: ParquetUploadArgs, @@ -249,14 +252,18 @@ async fn main() -> anyhow::Result<()> { info!("Version: {GIT_VERSION}"); info!("Build_tag: {BUILD_TAG}"); - ::metrics::set_build_info_metric(GIT_VERSION, BUILD_TAG); + let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo { + revision: GIT_VERSION, + build_tag: BUILD_TAG, + }); - match proxy::jemalloc::MetricRecorder::new(prometheus::default_registry()) { - Ok(t) => { - t.start(); + let jemalloc = match proxy::jemalloc::MetricRecorder::new() { + Ok(t) => Some(t), + Err(e) => { + tracing::error!(error = ?e, "could not start jemalloc metrics loop"); + None } - Err(e) => tracing::error!(error = ?e, "could not start jemalloc metrics loop"), - } + }; let args = ProxyCliArgs::parse(); let config = build_config(&args)?; @@ -296,27 +303,27 @@ async fn main() -> anyhow::Result<()> { ), aws_credentials_provider, )); - let redis_notifications_client = - match (args.redis_notifications, (args.redis_host, args.redis_port)) { - (Some(url), _) => { - info!("Starting redis notifications listener ({url})"); - Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url)) - } - (None, (Some(host), Some(port))) => Some( - ConnectionWithCredentialsProvider::new_with_credentials_provider( - host, - port, - elasticache_credentials_provider.clone(), - ), + let regional_redis_client = match (args.redis_host, args.redis_port) { + (Some(host), Some(port)) => Some( + ConnectionWithCredentialsProvider::new_with_credentials_provider( + host, + port, + elasticache_credentials_provider.clone(), ), - (None, (None, None)) => { - warn!("Redis is disabled"); - None - } - _ => { - bail!("redis-host and redis-port must be specified together"); - } - }; + ), + (None, None) => { + warn!("Redis events from console are disabled"); + None + } + _ => { + bail!("redis-host and redis-port must be specified together"); + } + }; + let redis_notifications_client = if let Some(url) = args.redis_notifications { + Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url)) + } else { + regional_redis_client.clone() + }; // Check that we can bind to address before further initialization let http_address: SocketAddr = args.http.parse()?; @@ -335,8 +342,7 @@ async fn main() -> anyhow::Result<()> { let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(&config.endpoint_rps_limit)); let cancel_map = CancelMap::default(); - // let redis_notifications_client = redis_notifications_client.map(|x| Box::leak(Box::new(x))); - let redis_publisher = match &redis_notifications_client { + let redis_publisher = match ®ional_redis_client { Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new( redis_publisher.clone(), args.region.clone(), @@ -349,7 +355,7 @@ async fn main() -> anyhow::Result<()> { >::new( cancel_map.clone(), redis_publisher, - NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT, + proxy::metrics::CancellationSource::FromClient, )); // client facing tasks. these will exit on error or on cancellation @@ -387,7 +393,14 @@ async fn main() -> anyhow::Result<()> { // maintenance tasks. these never return unless there's an error let mut maintenance_tasks = JoinSet::new(); maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone())); - maintenance_tasks.spawn(http::health_server::task_main(http_listener)); + maintenance_tasks.spawn(http::health_server::task_main( + http_listener, + AppMetrics { + jemalloc, + neon_metrics, + proxy: proxy::metrics::Metrics::get(), + }, + )); maintenance_tasks.spawn(console::mgmt::task_main(mgmt_listener)); if let Some(metrics_config) = &config.metric_collection { @@ -404,13 +417,18 @@ async fn main() -> anyhow::Result<()> { if let Some(redis_notifications_client) = redis_notifications_client { let cache = api.caches.project_info.clone(); maintenance_tasks.spawn(notifications::task_main( - redis_notifications_client.clone(), + redis_notifications_client, cache.clone(), cancel_map.clone(), args.region.clone(), )); maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); } + if let Some(regional_redis_client) = regional_redis_client { + let cache = api.caches.endpoints_cache.clone(); + let con = regional_redis_client; + maintenance_tasks.spawn(async move { cache.do_read(con).await }); + } } } @@ -489,14 +507,18 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let project_info_cache_config: ProjectInfoCacheOptions = args.project_info_cache.parse()?; + let endpoint_cache_config: config::EndpointCacheConfig = + args.endpoint_cache_config.parse()?; info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); info!( "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" ); + info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); let caches = Box::leak(Box::new(console::caches::ApiCaches::new( wake_compute_cache_config, project_info_cache_config, + endpoint_cache_config, ))); let config::WakeComputeLockOptions { @@ -507,10 +529,17 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { } = args.wake_compute_lock.parse()?; info!(permits, shards, ?epoch, "Using NodeLocks (wake_compute)"); let locks = Box::leak(Box::new( - console::locks::ApiLocks::new("wake_compute_lock", permits, shards, timeout) - .unwrap(), + console::locks::ApiLocks::new( + "wake_compute_lock", + permits, + shards, + timeout, + epoch, + &Metrics::get().wake_compute_lock, + ) + .unwrap(), )); - tokio::spawn(locks.garbage_collect_worker(epoch)); + tokio::spawn(locks.garbage_collect_worker()); let url = args.auth_endpoint.parse()?; let endpoint = http::Endpoint::new(url, http::new_client(rate_limiter_config)); diff --git a/proxy/src/cache.rs b/proxy/src/cache.rs index fc5f416395..d1d4087241 100644 --- a/proxy/src/cache.rs +++ b/proxy/src/cache.rs @@ -1,4 +1,5 @@ pub mod common; +pub mod endpoints; pub mod project_info; mod timed_lru; diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs new file mode 100644 index 0000000000..f3f9e9395f --- /dev/null +++ b/proxy/src/cache/endpoints.rs @@ -0,0 +1,226 @@ +use std::{ + convert::Infallible, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, +}; + +use dashmap::DashSet; +use redis::{ + streams::{StreamReadOptions, StreamReadReply}, + AsyncCommands, FromRedisValue, Value, +}; +use serde::Deserialize; +use tokio::sync::Mutex; + +use crate::{ + config::EndpointCacheConfig, + context::RequestMonitoring, + intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}, + metrics::{Metrics, RedisErrors}, + rate_limiter::GlobalRateLimiter, + redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider, + EndpointId, +}; + +#[derive(Deserialize, Debug, Clone)] +pub struct ControlPlaneEventKey { + endpoint_created: Option, + branch_created: Option, + project_created: Option, +} +#[derive(Deserialize, Debug, Clone)] +struct EndpointCreated { + endpoint_id: String, +} +#[derive(Deserialize, Debug, Clone)] +struct BranchCreated { + branch_id: String, +} +#[derive(Deserialize, Debug, Clone)] +struct ProjectCreated { + project_id: String, +} + +pub struct EndpointsCache { + config: EndpointCacheConfig, + endpoints: DashSet, + branches: DashSet, + projects: DashSet, + ready: AtomicBool, + limiter: Arc>, +} + +impl EndpointsCache { + pub fn new(config: EndpointCacheConfig) -> Self { + Self { + limiter: Arc::new(Mutex::new(GlobalRateLimiter::new( + config.limiter_info.clone(), + ))), + config, + endpoints: DashSet::new(), + branches: DashSet::new(), + projects: DashSet::new(), + ready: AtomicBool::new(false), + } + } + pub async fn is_valid(&self, ctx: &mut RequestMonitoring, endpoint: &EndpointId) -> bool { + if !self.ready.load(Ordering::Acquire) { + return true; + } + // If cache is disabled, just collect the metrics and return. + if self.config.disable_cache { + ctx.set_rejected(self.should_reject(endpoint)); + return true; + } + // If the limiter allows, we don't need to check the cache. + if self.limiter.lock().await.check() { + return true; + } + let rejected = self.should_reject(endpoint); + ctx.set_rejected(rejected); + !rejected + } + fn should_reject(&self, endpoint: &EndpointId) -> bool { + if endpoint.is_endpoint() { + !self.endpoints.contains(&EndpointIdInt::from(endpoint)) + } else if endpoint.is_branch() { + !self + .branches + .contains(&BranchIdInt::from(&endpoint.as_branch())) + } else { + !self + .projects + .contains(&ProjectIdInt::from(&endpoint.as_project())) + } + } + fn insert_event(&self, key: ControlPlaneEventKey) { + // Do not do normalization here, we expect the events to be normalized. + if let Some(endpoint_created) = key.endpoint_created { + self.endpoints + .insert(EndpointIdInt::from(&endpoint_created.endpoint_id.into())); + } + if let Some(branch_created) = key.branch_created { + self.branches + .insert(BranchIdInt::from(&branch_created.branch_id.into())); + } + if let Some(project_created) = key.project_created { + self.projects + .insert(ProjectIdInt::from(&project_created.project_id.into())); + } + } + pub async fn do_read( + &self, + mut con: ConnectionWithCredentialsProvider, + ) -> anyhow::Result { + let mut last_id = "0-0".to_string(); + loop { + self.ready.store(false, Ordering::Release); + if let Err(e) = con.connect().await { + tracing::error!("error connecting to redis: {:?}", e); + continue; + } + if let Err(e) = self.read_from_stream(&mut con, &mut last_id).await { + tracing::error!("error reading from redis: {:?}", e); + } + tokio::time::sleep(self.config.retry_interval).await; + } + } + async fn read_from_stream( + &self, + con: &mut ConnectionWithCredentialsProvider, + last_id: &mut String, + ) -> anyhow::Result<()> { + tracing::info!("reading endpoints/branches/projects from redis"); + self.batch_read( + con, + StreamReadOptions::default().count(self.config.initial_batch_size), + last_id, + true, + ) + .await?; + tracing::info!("ready to filter user requests"); + self.ready.store(true, Ordering::Release); + self.batch_read( + con, + StreamReadOptions::default() + .count(self.config.default_batch_size) + .block(self.config.xread_timeout.as_millis() as usize), + last_id, + false, + ) + .await + } + fn parse_key_value(value: &Value) -> anyhow::Result { + let s: String = FromRedisValue::from_redis_value(value)?; + Ok(serde_json::from_str(&s)?) + } + async fn batch_read( + &self, + conn: &mut ConnectionWithCredentialsProvider, + opts: StreamReadOptions, + last_id: &mut String, + return_when_finish: bool, + ) -> anyhow::Result<()> { + let mut total: usize = 0; + loop { + let mut res: StreamReadReply = conn + .xread_options(&[&self.config.stream_name], &[last_id.as_str()], &opts) + .await?; + + if res.keys.is_empty() { + if return_when_finish { + anyhow::bail!( + "Redis stream {} is empty, cannot be used to filter endpoints", + self.config.stream_name + ); + } + // If we are not returning when finish, we should wait for more data. + continue; + } + if res.keys.len() != 1 { + anyhow::bail!("Cannot read from redis stream {}", self.config.stream_name); + } + + let res = res.keys.pop().expect("Checked length above"); + let len = res.ids.len(); + for x in res.ids { + total += 1; + for (_, v) in x.map { + let key = match Self::parse_key_value(&v) { + Ok(x) => x, + Err(e) => { + Metrics::get().proxy.redis_errors_total.inc(RedisErrors { + channel: &self.config.stream_name, + }); + tracing::error!("error parsing value {v:?}: {e:?}"); + continue; + } + }; + self.insert_event(key); + } + if total.is_power_of_two() { + tracing::debug!("endpoints read {}", total); + } + *last_id = x.id; + } + if return_when_finish && len <= self.config.default_batch_size { + break; + } + } + tracing::info!("read {} endpoints/branches/projects from redis", total); + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::ControlPlaneEventKey; + + #[test] + fn test() { + let s = "{\"branch_created\":null,\"endpoint_created\":{\"endpoint_id\":\"ep-rapid-thunder-w0qqw2q9\"},\"project_created\":null,\"type\":\"endpoint_created\"}"; + let _: ControlPlaneEventKey = serde_json::from_str(s).unwrap(); + } +} diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 6151513614..34512e9f5b 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -10,7 +10,7 @@ use uuid::Uuid; use crate::{ error::ReportableError, - metrics::NUM_CANCELLATION_REQUESTS, + metrics::{CancellationRequest, CancellationSource, Metrics}, redis::cancellation_publisher::{ CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, }, @@ -28,7 +28,7 @@ pub struct CancellationHandler

{ client: P, /// This field used for the monitoring purposes. /// Represents the source of the cancellation request. - from: &'static str, + from: CancellationSource, } #[derive(Debug, Error)] @@ -89,9 +89,13 @@ impl CancellationHandler

{ // NB: we should immediately release the lock after cloning the token. let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else { tracing::warn!("query cancellation key not found: {key}"); - NUM_CANCELLATION_REQUESTS - .with_label_values(&[self.from, "not_found"]) - .inc(); + Metrics::get() + .proxy + .cancellation_requests_total + .inc(CancellationRequest { + source: self.from, + kind: crate::metrics::CancellationOutcome::NotFound, + }); match self.client.try_publish(key, session_id).await { Ok(()) => {} // do nothing Err(e) => { @@ -103,9 +107,13 @@ impl CancellationHandler

{ } return Ok(()); }; - NUM_CANCELLATION_REQUESTS - .with_label_values(&[self.from, "found"]) - .inc(); + Metrics::get() + .proxy + .cancellation_requests_total + .inc(CancellationRequest { + source: self.from, + kind: crate::metrics::CancellationOutcome::Found, + }); info!("cancelling query per user's request using key {key}"); cancel_closure.try_cancel_query().await } @@ -122,7 +130,7 @@ impl CancellationHandler

{ } impl CancellationHandler<()> { - pub fn new(map: CancelMap, from: &'static str) -> Self { + pub fn new(map: CancelMap, from: CancellationSource) -> Self { Self { map, client: (), @@ -132,7 +140,7 @@ impl CancellationHandler<()> { } impl CancellationHandler>>> { - pub fn new(map: CancelMap, client: Option>>, from: &'static str) -> Self { + pub fn new(map: CancelMap, client: Option>>, from: CancellationSource) -> Self { Self { map, client, from } } } @@ -192,15 +200,13 @@ impl

Drop for Session

{ #[cfg(test)] mod tests { - use crate::metrics::NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS; - use super::*; #[tokio::test] async fn check_session_drop() -> anyhow::Result<()> { let cancellation_handler = Arc::new(CancellationHandler::<()>::new( CancelMap::default(), - NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS, + CancellationSource::FromRedis, )); let session = cancellation_handler.clone().get_session(); @@ -214,7 +220,7 @@ mod tests { #[tokio::test] async fn cancel_session_noop_regression() { - let handler = CancellationHandler::<()>::new(Default::default(), "local"); + let handler = CancellationHandler::<()>::new(Default::default(), CancellationSource::Local); handler .cancel_session( CancelKeyData { diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index ee33b97fbd..149a619316 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -4,12 +4,11 @@ use crate::{ console::{errors::WakeComputeError, messages::MetricsAuxInfo}, context::RequestMonitoring, error::{ReportableError, UserFacingError}, - metrics::NUM_DB_CONNECTIONS_GAUGE, + metrics::{Metrics, NumDbConnectionsGuard}, proxy::neon_option, }; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; -use metrics::IntCounterPairGuard; use pq_proto::StartupMessageParams; use std::{io, net::SocketAddr, time::Duration}; use thiserror::Error; @@ -249,7 +248,7 @@ pub struct PostgresConnection { /// Labels for proxy's metrics. pub aux: MetricsAuxInfo, - _guage: IntCounterPairGuard, + _guage: NumDbConnectionsGuard<'static>, } impl ConnCfg { @@ -295,9 +294,7 @@ impl ConnCfg { params, cancel_closure, aux, - _guage: NUM_DB_CONNECTIONS_GAUGE - .with_label_values(&[ctx.protocol]) - .guard(), + _guage: Metrics::get().proxy.db_connections.guard(ctx.protocol), }; Ok(connection) diff --git a/proxy/src/config.rs b/proxy/src/config.rs index fc490c7348..b4b2ce8dbd 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -313,6 +313,80 @@ impl CertResolver { } } +#[derive(Debug)] +pub struct EndpointCacheConfig { + /// Batch size to receive all endpoints on the startup. + pub initial_batch_size: usize, + /// Batch size to receive endpoints. + pub default_batch_size: usize, + /// Timeouts for the stream read operation. + pub xread_timeout: Duration, + /// Stream name to read from. + pub stream_name: String, + /// Limiter info (to distinguish when to enable cache). + pub limiter_info: Vec, + /// Disable cache. + /// If true, cache is ignored, but reports all statistics. + pub disable_cache: bool, + /// Retry interval for the stream read operation. + pub retry_interval: Duration, +} + +impl EndpointCacheConfig { + /// Default options for [`crate::console::provider::NodeInfoCache`]. + /// Notice that by default the limiter is empty, which means that cache is disabled. + pub const CACHE_DEFAULT_OPTIONS: &'static str = + "initial_batch_size=1000,default_batch_size=10,xread_timeout=5m,stream_name=controlPlane,disable_cache=true,limiter_info=1000@1s,retry_interval=1s"; + + /// Parse cache options passed via cmdline. + /// Example: [`Self::CACHE_DEFAULT_OPTIONS`]. + fn parse(options: &str) -> anyhow::Result { + let mut initial_batch_size = None; + let mut default_batch_size = None; + let mut xread_timeout = None; + let mut stream_name = None; + let mut limiter_info = vec![]; + let mut disable_cache = false; + let mut retry_interval = None; + + for option in options.split(',') { + let (key, value) = option + .split_once('=') + .with_context(|| format!("bad key-value pair: {option}"))?; + + match key { + "initial_batch_size" => initial_batch_size = Some(value.parse()?), + "default_batch_size" => default_batch_size = Some(value.parse()?), + "xread_timeout" => xread_timeout = Some(humantime::parse_duration(value)?), + "stream_name" => stream_name = Some(value.to_string()), + "limiter_info" => limiter_info.push(RateBucketInfo::from_str(value)?), + "disable_cache" => disable_cache = value.parse()?, + "retry_interval" => retry_interval = Some(humantime::parse_duration(value)?), + unknown => bail!("unknown key: {unknown}"), + } + } + RateBucketInfo::validate(&mut limiter_info)?; + + Ok(Self { + initial_batch_size: initial_batch_size.context("missing `initial_batch_size`")?, + default_batch_size: default_batch_size.context("missing `default_batch_size`")?, + xread_timeout: xread_timeout.context("missing `xread_timeout`")?, + stream_name: stream_name.context("missing `stream_name`")?, + disable_cache, + limiter_info, + retry_interval: retry_interval.context("missing `retry_interval`")?, + }) + } +} + +impl FromStr for EndpointCacheConfig { + type Err = anyhow::Error; + + fn from_str(options: &str) -> Result { + let error = || format!("failed to parse endpoint cache options '{options}'"); + Self::parse(options).with_context(error) + } +} #[derive(Debug)] pub struct MetricBackupCollectionConfig { pub interval: Duration, diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index 45161f5ac8..9869b95768 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -1,3 +1,4 @@ +use measured::FixedCardinalityLabel; use serde::{Deserialize, Serialize}; use std::fmt; @@ -102,7 +103,7 @@ pub struct MetricsAuxInfo { pub cold_start_info: ColdStartInfo, } -#[derive(Debug, Default, Serialize, Deserialize, Clone, Copy)] +#[derive(Debug, Default, Serialize, Deserialize, Clone, Copy, FixedCardinalityLabel)] #[serde(rename_all = "snake_case")] pub enum ColdStartInfo { #[default] @@ -110,9 +111,11 @@ pub enum ColdStartInfo { /// Compute was already running Warm, #[serde(rename = "pool_hit")] + #[label(rename = "pool_hit")] /// Compute was not running but there was an available VM VmPoolHit, #[serde(rename = "pool_miss")] + #[label(rename = "pool_miss")] /// Compute was not running and there were no VMs available VmPoolMiss, diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index f7d621fb12..3fa7221f98 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -8,11 +8,12 @@ use crate::{ backend::{ComputeCredentialKeys, ComputeUserInfo}, IpPattern, }, - cache::{project_info::ProjectInfoCacheImpl, Cached, TimedLru}, + cache::{endpoints::EndpointsCache, project_info::ProjectInfoCacheImpl, Cached, TimedLru}, compute, - config::{CacheOptions, ProjectInfoCacheOptions}, + config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}, context::RequestMonitoring, intern::ProjectIdInt, + metrics::ApiLockMetrics, scram, EndpointCacheKey, }; use dashmap::DashMap; @@ -416,12 +417,15 @@ pub struct ApiCaches { pub node_info: NodeInfoCache, /// Cache which stores project_id -> endpoint_ids mapping. pub project_info: Arc, + /// List of all valid endpoints. + pub endpoints_cache: Arc, } impl ApiCaches { pub fn new( wake_compute_cache_config: CacheOptions, project_info_cache_config: ProjectInfoCacheOptions, + endpoint_cache_config: EndpointCacheConfig, ) -> Self { Self { node_info: NodeInfoCache::new( @@ -431,6 +435,7 @@ impl ApiCaches { true, ), project_info: Arc::new(ProjectInfoCacheImpl::new(project_info_cache_config)), + endpoints_cache: Arc::new(EndpointsCache::new(endpoint_cache_config)), } } } @@ -441,10 +446,8 @@ pub struct ApiLocks { node_locks: DashMap>, permits: usize, timeout: Duration, - registered: prometheus::IntCounter, - unregistered: prometheus::IntCounter, - reclamation_lag: prometheus::Histogram, - lock_acquire_lag: prometheus::Histogram, + epoch: std::time::Duration, + metrics: &'static ApiLockMetrics, } impl ApiLocks { @@ -453,54 +456,16 @@ impl ApiLocks { permits: usize, shards: usize, timeout: Duration, + epoch: std::time::Duration, + metrics: &'static ApiLockMetrics, ) -> prometheus::Result { - let registered = prometheus::IntCounter::with_opts( - prometheus::Opts::new( - "semaphores_registered", - "Number of semaphores registered in this api lock", - ) - .namespace(name), - )?; - prometheus::register(Box::new(registered.clone()))?; - let unregistered = prometheus::IntCounter::with_opts( - prometheus::Opts::new( - "semaphores_unregistered", - "Number of semaphores unregistered in this api lock", - ) - .namespace(name), - )?; - prometheus::register(Box::new(unregistered.clone()))?; - let reclamation_lag = prometheus::Histogram::with_opts( - prometheus::HistogramOpts::new( - "reclamation_lag_seconds", - "Time it takes to reclaim unused semaphores in the api lock", - ) - .namespace(name) - // 1us -> 65ms - // benchmarks on my mac indicate it's usually in the range of 256us and 512us - .buckets(prometheus::exponential_buckets(1e-6, 2.0, 16)?), - )?; - prometheus::register(Box::new(reclamation_lag.clone()))?; - let lock_acquire_lag = prometheus::Histogram::with_opts( - prometheus::HistogramOpts::new( - "semaphore_acquire_seconds", - "Time it takes to reclaim unused semaphores in the api lock", - ) - .namespace(name) - // 0.1ms -> 6s - .buckets(prometheus::exponential_buckets(1e-4, 2.0, 16)?), - )?; - prometheus::register(Box::new(lock_acquire_lag.clone()))?; - Ok(Self { name, node_locks: DashMap::with_shard_amount(shards), permits, timeout, - lock_acquire_lag, - registered, - unregistered, - reclamation_lag, + epoch, + metrics, }) } @@ -520,7 +485,7 @@ impl ApiLocks { self.node_locks .entry(key.clone()) .or_insert_with(|| { - self.registered.inc(); + self.metrics.semaphores_registered.inc(); Arc::new(Semaphore::new(self.permits)) }) .clone() @@ -528,20 +493,21 @@ impl ApiLocks { }; let permit = tokio::time::timeout_at(now + self.timeout, semaphore.acquire_owned()).await; - self.lock_acquire_lag - .observe((Instant::now() - now).as_secs_f64()); + self.metrics + .semaphore_acquire_seconds + .observe(now.elapsed().as_secs_f64()); Ok(WakeComputePermit { permit: Some(permit??), }) } - pub async fn garbage_collect_worker(&self, epoch: std::time::Duration) { + pub async fn garbage_collect_worker(&self) { if self.permits == 0 { return; } - - let mut interval = tokio::time::interval(epoch / (self.node_locks.shards().len()) as u32); + let mut interval = + tokio::time::interval(self.epoch / (self.node_locks.shards().len()) as u32); loop { for (i, shard) in self.node_locks.shards().iter().enumerate() { interval.tick().await; @@ -554,13 +520,13 @@ impl ApiLocks { "performing epoch reclamation on api lock" ); let mut lock = shard.write(); - let timer = self.reclamation_lag.start_timer(); + let timer = self.metrics.reclamation_lag_seconds.start_timer(); let count = lock .extract_if(|_, semaphore| Arc::strong_count(semaphore.get_mut()) == 1) .count(); drop(lock); - self.unregistered.inc_by(count as u64); - timer.observe_duration() + self.metrics.semaphores_unregistered.inc_by(count as u64); + timer.observe(); } } } diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 1a3e2ca795..138acdf578 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -7,13 +7,14 @@ use super::{ NodeInfo, }; use crate::{ - auth::backend::ComputeUserInfo, compute, console::messages::ColdStartInfo, http, scram, -}; -use crate::{ - cache::Cached, - context::RequestMonitoring, - metrics::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER}, + auth::backend::ComputeUserInfo, + compute, + console::messages::ColdStartInfo, + http, + metrics::{CacheOutcome, Metrics}, + scram, Normalize, }; +use crate::{cache::Cached, context::RequestMonitoring}; use futures::TryFutureExt; use std::sync::Arc; use tokio::time::Instant; @@ -23,7 +24,7 @@ use tracing::{error, info, info_span, warn, Instrument}; pub struct Api { endpoint: http::Endpoint, pub caches: &'static ApiCaches, - locks: &'static ApiLocks, + pub locks: &'static ApiLocks, jwt: String, } @@ -55,6 +56,15 @@ impl Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { + if !self + .caches + .endpoints_cache + .is_valid(ctx, &user_info.endpoint.normalize()) + .await + { + info!("endpoint is not valid, skipping the request"); + return Ok(AuthInfo::default()); + } let request_id = ctx.session_id.to_string(); let application_name = ctx.console_application_name(); async { @@ -81,7 +91,9 @@ impl Api { Ok(body) => body, // Error 404 is special: it's ok not to have a secret. Err(e) => match e.http_status_code() { - Some(http::StatusCode::NOT_FOUND) => return Ok(AuthInfo::default()), + Some(http::StatusCode::NOT_FOUND) => { + return Ok(AuthInfo::default()); + } _otherwise => return Err(e.into()), }, }; @@ -95,7 +107,10 @@ impl Api { Some(secret) }; let allowed_ips = body.allowed_ips.unwrap_or_default(); - ALLOWED_IPS_NUMBER.observe(allowed_ips.len() as f64); + Metrics::get() + .proxy + .allowed_ips_number + .observe(allowed_ips.len() as f64); Ok(AuthInfo { secret, allowed_ips, @@ -174,23 +189,27 @@ impl super::Api for Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { - let ep = &user_info.endpoint; + let normalized_ep = &user_info.endpoint.normalize(); let user = &user_info.user; - if let Some(role_secret) = self.caches.project_info.get_role_secret(ep, user) { + if let Some(role_secret) = self + .caches + .project_info + .get_role_secret(normalized_ep, user) + { return Ok(role_secret); } let auth_info = self.do_get_auth_info(ctx, user_info).await?; if let Some(project_id) = auth_info.project_id { - let ep_int = ep.into(); + let normalized_ep_int = normalized_ep.into(); self.caches.project_info.insert_role_secret( project_id, - ep_int, + normalized_ep_int, user.into(), auth_info.secret.clone(), ); self.caches.project_info.insert_allowed_ips( project_id, - ep_int, + normalized_ep_int, Arc::new(auth_info.allowed_ips), ); ctx.set_project_id(project_id); @@ -204,30 +223,34 @@ impl super::Api for Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { - let ep = &user_info.endpoint; - if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(ep) { - ALLOWED_IPS_BY_CACHE_OUTCOME - .with_label_values(&["hit"]) - .inc(); + let normalized_ep = &user_info.endpoint.normalize(); + if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) { + Metrics::get() + .proxy + .allowed_ips_cache_misses + .inc(CacheOutcome::Hit); return Ok((allowed_ips, None)); } - ALLOWED_IPS_BY_CACHE_OUTCOME - .with_label_values(&["miss"]) - .inc(); + Metrics::get() + .proxy + .allowed_ips_cache_misses + .inc(CacheOutcome::Miss); let auth_info = self.do_get_auth_info(ctx, user_info).await?; let allowed_ips = Arc::new(auth_info.allowed_ips); let user = &user_info.user; if let Some(project_id) = auth_info.project_id { - let ep_int = ep.into(); + let normalized_ep_int = normalized_ep.into(); self.caches.project_info.insert_role_secret( project_id, - ep_int, + normalized_ep_int, user.into(), auth_info.secret.clone(), ); - self.caches - .project_info - .insert_allowed_ips(project_id, ep_int, allowed_ips.clone()); + self.caches.project_info.insert_allowed_ips( + project_id, + normalized_ep_int, + allowed_ips.clone(), + ); ctx.set_project_id(project_id); } Ok(( diff --git a/proxy/src/context.rs b/proxy/src/context.rs index fec95f4722..dc475d57ed 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -12,7 +12,7 @@ use crate::{ console::messages::{ColdStartInfo, MetricsAuxInfo}, error::ErrorKind, intern::{BranchIdInt, ProjectIdInt}, - metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND}, + metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol}, DbName, EndpointId, RoleName, }; @@ -29,7 +29,7 @@ static LOG_CHAN: OnceCell> = OnceCell::ne pub struct RequestMonitoring { pub peer_addr: IpAddr, pub session_id: Uuid, - pub protocol: &'static str, + pub protocol: Protocol, first_packet: chrono::DateTime, region: &'static str, pub span: Span, @@ -50,6 +50,8 @@ pub struct RequestMonitoring { // This sender is here to keep the request monitoring channel open while requests are taking place. sender: Option>, pub latency_timer: LatencyTimer, + // Whether proxy decided that it's not a valid endpoint end rejected it before going to cplane. + rejected: bool, } #[derive(Clone, Debug)] @@ -65,7 +67,7 @@ impl RequestMonitoring { pub fn new( session_id: Uuid, peer_addr: IpAddr, - protocol: &'static str, + protocol: Protocol, region: &'static str, ) -> Self { let span = info_span!( @@ -93,6 +95,7 @@ impl RequestMonitoring { error_kind: None, auth_method: None, success: false, + rejected: false, cold_start_info: ColdStartInfo::Unknown, sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()), @@ -102,7 +105,7 @@ impl RequestMonitoring { #[cfg(test)] pub fn test() -> Self { - RequestMonitoring::new(Uuid::now_v7(), [127, 0, 0, 1].into(), "test", "test") + RequestMonitoring::new(Uuid::now_v7(), [127, 0, 0, 1].into(), Protocol::Tcp, "test") } pub fn console_application_name(&self) -> String { @@ -113,6 +116,10 @@ impl RequestMonitoring { ) } + pub fn set_rejected(&mut self, rejected: bool) { + self.rejected = rejected; + } + pub fn set_cold_start_info(&mut self, info: ColdStartInfo) { self.cold_start_info = info; self.latency_timer.cold_start_info(info); @@ -134,9 +141,9 @@ impl RequestMonitoring { pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) { if self.endpoint_id.is_none() { self.span.record("ep", display(&endpoint_id)); - crate::metrics::CONNECTING_ENDPOINTS - .with_label_values(&[self.protocol]) - .measure(&endpoint_id); + let metric = &Metrics::get().proxy.connecting_endpoints; + let label = metric.with_labels(self.protocol); + metric.get_metric(label).measure(&endpoint_id); self.endpoint_id = Some(endpoint_id); } } @@ -158,13 +165,11 @@ impl RequestMonitoring { } pub fn set_error_kind(&mut self, kind: ErrorKind) { - ERROR_BY_KIND - .with_label_values(&[kind.to_metric_label()]) - .inc(); + Metrics::get().proxy.errors_total.inc(kind); if let Some(ep) = &self.endpoint_id { - ENDPOINT_ERRORS_BY_KIND - .with_label_values(&[kind.to_metric_label()]) - .measure(ep); + let metric = &Metrics::get().proxy.endpoints_affected_by_errors; + let label = metric.with_labels(kind); + metric.get_metric(label).measure(ep); } self.error_kind = Some(kind); } @@ -178,6 +183,19 @@ impl RequestMonitoring { impl Drop for RequestMonitoring { fn drop(&mut self) { + let outcome = if self.success { + ConnectOutcome::Success + } else { + ConnectOutcome::Failed + }; + Metrics::get() + .proxy + .invalid_endpoints_total + .inc(InvalidEndpointsGroup { + protocol: self.protocol, + rejected: self.rejected.into(), + outcome, + }); if let Some(tx) = self.sender.take() { let _: Result<(), _> = tx.send(RequestData::from(&*self)); } diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index eb77409429..e061216d15 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -111,7 +111,7 @@ impl From<&RequestMonitoring> for RequestData { super::AuthMethod::ScramSha256Plus => "scram_sha_256_plus", super::AuthMethod::Cleartext => "cleartext", }), - protocol: value.protocol, + protocol: value.protocol.as_str(), region: value.region, error: value.error_kind.as_ref().map(|e| e.to_metric_label()), success: value.success, diff --git a/proxy/src/error.rs b/proxy/src/error.rs index 4614f3913d..fdfe50a494 100644 --- a/proxy/src/error.rs +++ b/proxy/src/error.rs @@ -1,5 +1,7 @@ use std::{error::Error as StdError, fmt, io}; +use measured::FixedCardinalityLabel; + /// Upcast (almost) any error into an opaque [`io::Error`]. pub fn io_error(e: impl Into>) -> io::Error { io::Error::new(io::ErrorKind::Other, e) @@ -29,24 +31,29 @@ pub trait UserFacingError: ReportableError { } } -#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[derive(Copy, Clone, Debug, Eq, PartialEq, FixedCardinalityLabel)] +#[label(singleton = "type")] pub enum ErrorKind { /// Wrong password, unknown endpoint, protocol violation, etc... User, /// Network error between user and proxy. Not necessarily user error + #[label(rename = "clientdisconnect")] ClientDisconnect, /// Proxy self-imposed user rate limits + #[label(rename = "ratelimit")] RateLimit, /// Proxy self-imposed service-wise rate limits + #[label(rename = "serviceratelimit")] ServiceRateLimit, /// internal errors Service, /// Error communicating with control plane + #[label(rename = "controlplane")] ControlPlane, /// Postgres error diff --git a/proxy/src/http.rs b/proxy/src/http.rs index 59e1492ed4..95ca0ccd5c 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -13,7 +13,11 @@ pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; use tokio::time::Instant; use tracing::trace; -use crate::{metrics::CONSOLE_REQUEST_LATENCY, rate_limiter, url::ApiUrl}; +use crate::{ + metrics::{ConsoleRequest, Metrics}, + rate_limiter, + url::ApiUrl, +}; use reqwest_middleware::RequestBuilder; /// This is the preferred way to create new http clients, @@ -90,13 +94,14 @@ impl Endpoint { /// Execute a [request](reqwest::Request). pub async fn execute(&self, request: Request) -> Result { - let path = request.url().path().to_string(); - let start = Instant::now(); - let res = self.client.execute(request).await; - CONSOLE_REQUEST_LATENCY - .with_label_values(&[&path]) - .observe(start.elapsed().as_secs_f64()); - res + let _timer = Metrics::get() + .proxy + .console_request_latency + .start_timer(ConsoleRequest { + request: request.url().path(), + }); + + self.client.execute(request).await } } diff --git a/proxy/src/http/health_server.rs b/proxy/src/http/health_server.rs index cbb17ebcb7..cae9eb5b97 100644 --- a/proxy/src/http/health_server.rs +++ b/proxy/src/http/health_server.rs @@ -1,30 +1,49 @@ use anyhow::{anyhow, bail}; -use hyper::{Body, Request, Response, StatusCode}; -use std::{convert::Infallible, net::TcpListener}; -use tracing::info; +use hyper::{header::CONTENT_TYPE, Body, Request, Response, StatusCode}; +use measured::{text::BufferedTextEncoder, MetricGroup}; +use metrics::NeonMetrics; +use std::{ + convert::Infallible, + net::TcpListener, + sync::{Arc, Mutex}, +}; +use tracing::{info, info_span}; use utils::http::{ - endpoint::{self, prometheus_metrics_handler, request_span}, + endpoint::{self, request_span}, error::ApiError, json::json_response, RouterBuilder, RouterService, }; +use crate::jemalloc; + async fn status_handler(_: Request) -> Result, ApiError> { json_response(StatusCode::OK, "") } -fn make_router() -> RouterBuilder { +fn make_router(metrics: AppMetrics) -> RouterBuilder { + let state = Arc::new(Mutex::new(PrometheusHandler { + encoder: BufferedTextEncoder::new(), + metrics, + })); + endpoint::make_router() - .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) + .get("/metrics", move |r| { + let state = state.clone(); + request_span(r, move |b| prometheus_metrics_handler(b, state)) + }) .get("/v1/status", status_handler) } -pub async fn task_main(http_listener: TcpListener) -> anyhow::Result { +pub async fn task_main( + http_listener: TcpListener, + metrics: AppMetrics, +) -> anyhow::Result { scopeguard::defer! { info!("http has shut down"); } - let service = || RouterService::new(make_router().build()?); + let service = || RouterService::new(make_router(metrics).build()?); hyper::Server::from_tcp(http_listener)? .serve(service().map_err(|e| anyhow!(e))?) @@ -32,3 +51,57 @@ pub async fn task_main(http_listener: TcpListener) -> anyhow::Result bail!("hyper server without shutdown handling cannot shutdown successfully"); } + +struct PrometheusHandler { + encoder: BufferedTextEncoder, + metrics: AppMetrics, +} + +#[derive(MetricGroup)] +pub struct AppMetrics { + #[metric(namespace = "jemalloc")] + pub jemalloc: Option, + #[metric(flatten)] + pub neon_metrics: NeonMetrics, + #[metric(flatten)] + pub proxy: &'static crate::metrics::Metrics, +} + +async fn prometheus_metrics_handler( + _req: Request, + state: Arc>, +) -> Result, ApiError> { + let started_at = std::time::Instant::now(); + + let span = info_span!("blocking"); + let body = tokio::task::spawn_blocking(move || { + let _span = span.entered(); + + let mut state = state.lock().unwrap(); + let PrometheusHandler { encoder, metrics } = &mut *state; + + metrics + .collect_group_into(&mut *encoder) + .unwrap_or_else(|infallible| match infallible {}); + + let body = encoder.finish(); + + tracing::info!( + bytes = body.len(), + elapsed_ms = started_at.elapsed().as_millis(), + "responded /metrics" + ); + + body + }) + .await + .unwrap(); + + let response = Response::builder() + .status(200) + .header(CONTENT_TYPE, "text/plain; version=0.0.4") + .body(Body::from(body)) + .unwrap(); + + Ok(response) +} diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index a6519bdff9..e38135dd22 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -160,6 +160,11 @@ impl From<&EndpointId> for EndpointIdInt { EndpointIdTag::get_interner().get_or_intern(value) } } +impl From for EndpointIdInt { + fn from(value: EndpointId) -> Self { + EndpointIdTag::get_interner().get_or_intern(&value) + } +} #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct BranchIdTag; @@ -175,6 +180,11 @@ impl From<&BranchId> for BranchIdInt { BranchIdTag::get_interner().get_or_intern(value) } } +impl From for BranchIdInt { + fn from(value: BranchId) -> Self { + BranchIdTag::get_interner().get_or_intern(&value) + } +} #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct ProjectIdTag; @@ -190,6 +200,11 @@ impl From<&ProjectId> for ProjectIdInt { ProjectIdTag::get_interner().get_or_intern(value) } } +impl From for ProjectIdInt { + fn from(value: ProjectId) -> Self { + ProjectIdTag::get_interner().get_or_intern(&value) + } +} #[cfg(test)] mod tests { diff --git a/proxy/src/jemalloc.rs b/proxy/src/jemalloc.rs index ed20798d56..3243e6a140 100644 --- a/proxy/src/jemalloc.rs +++ b/proxy/src/jemalloc.rs @@ -1,27 +1,45 @@ -use std::time::Duration; +use std::marker::PhantomData; -use metrics::IntGauge; -use prometheus::{register_int_gauge_with_registry, Registry}; +use measured::{ + label::NoLabels, + metric::{ + gauge::GaugeState, group::Encoding, group::MetricValue, name::MetricNameEncoder, + MetricEncoding, MetricFamilyEncoding, MetricType, + }, + text::TextEncoder, + LabelGroup, MetricGroup, +}; use tikv_jemalloc_ctl::{config, epoch, epoch_mib, stats, version}; pub struct MetricRecorder { epoch: epoch_mib, - active: stats::active_mib, - active_gauge: IntGauge, - allocated: stats::allocated_mib, - allocated_gauge: IntGauge, - mapped: stats::mapped_mib, - mapped_gauge: IntGauge, - metadata: stats::metadata_mib, - metadata_gauge: IntGauge, - resident: stats::resident_mib, - resident_gauge: IntGauge, - retained: stats::retained_mib, - retained_gauge: IntGauge, + inner: Metrics, +} + +#[derive(MetricGroup)] +struct Metrics { + active_bytes: JemallocGaugeFamily, + allocated_bytes: JemallocGaugeFamily, + mapped_bytes: JemallocGaugeFamily, + metadata_bytes: JemallocGaugeFamily, + resident_bytes: JemallocGaugeFamily, + retained_bytes: JemallocGaugeFamily, +} + +impl MetricGroup for MetricRecorder +where + Metrics: MetricGroup, +{ + fn collect_group_into(&self, enc: &mut Enc) -> Result<(), Enc::Err> { + if self.epoch.advance().is_ok() { + self.inner.collect_group_into(enc)?; + } + Ok(()) + } } impl MetricRecorder { - pub fn new(registry: &Registry) -> Result { + pub fn new() -> Result { tracing::info!( config = config::malloc_conf::read()?, version = version::read()?, @@ -30,71 +48,69 @@ impl MetricRecorder { Ok(Self { epoch: epoch::mib()?, - active: stats::active::mib()?, - active_gauge: register_int_gauge_with_registry!( - "jemalloc_active_bytes", - "Total number of bytes in active pages allocated by the process", - registry - )?, - allocated: stats::allocated::mib()?, - allocated_gauge: register_int_gauge_with_registry!( - "jemalloc_allocated_bytes", - "Total number of bytes allocated by the process", - registry - )?, - mapped: stats::mapped::mib()?, - mapped_gauge: register_int_gauge_with_registry!( - "jemalloc_mapped_bytes", - "Total number of bytes in active extents mapped by the allocator", - registry - )?, - metadata: stats::metadata::mib()?, - metadata_gauge: register_int_gauge_with_registry!( - "jemalloc_metadata_bytes", - "Total number of bytes dedicated to jemalloc metadata", - registry - )?, - resident: stats::resident::mib()?, - resident_gauge: register_int_gauge_with_registry!( - "jemalloc_resident_bytes", - "Total number of bytes in physically resident data pages mapped by the allocator", - registry - )?, - retained: stats::retained::mib()?, - retained_gauge: register_int_gauge_with_registry!( - "jemalloc_retained_bytes", - "Total number of bytes in virtual memory mappings that were retained rather than being returned to the operating system", - registry - )?, - }) - } - - fn _poll(&self) -> Result<(), anyhow::Error> { - self.epoch.advance()?; - self.active_gauge.set(self.active.read()? as i64); - self.allocated_gauge.set(self.allocated.read()? as i64); - self.mapped_gauge.set(self.mapped.read()? as i64); - self.metadata_gauge.set(self.metadata.read()? as i64); - self.resident_gauge.set(self.resident.read()? as i64); - self.retained_gauge.set(self.retained.read()? as i64); - Ok(()) - } - - #[inline] - pub fn poll(&self) { - if let Err(error) = self._poll() { - tracing::warn!(%error, "Failed to poll jemalloc stats"); - } - } - - pub fn start(self) -> tokio::task::JoinHandle<()> { - tokio::task::spawn(async move { - let mut interval = tokio::time::interval(Duration::from_secs(15)); - interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); - loop { - self.poll(); - interval.tick().await; - } + inner: Metrics { + active_bytes: JemallocGaugeFamily(stats::active::mib()?), + allocated_bytes: JemallocGaugeFamily(stats::allocated::mib()?), + mapped_bytes: JemallocGaugeFamily(stats::mapped::mib()?), + metadata_bytes: JemallocGaugeFamily(stats::metadata::mib()?), + resident_bytes: JemallocGaugeFamily(stats::resident::mib()?), + retained_bytes: JemallocGaugeFamily(stats::retained::mib()?), + }, }) } } + +struct JemallocGauge(PhantomData); + +impl Default for JemallocGauge { + fn default() -> Self { + JemallocGauge(PhantomData) + } +} +impl MetricType for JemallocGauge { + type Metadata = T; +} + +struct JemallocGaugeFamily(T); +impl MetricFamilyEncoding for JemallocGaugeFamily +where + JemallocGauge: MetricEncoding, +{ + fn collect_family_into(&self, name: impl MetricNameEncoder, enc: &mut T) -> Result<(), T::Err> { + JemallocGauge::write_type(&name, enc)?; + JemallocGauge(PhantomData).collect_into(&self.0, NoLabels, name, enc) + } +} + +macro_rules! jemalloc_gauge { + ($stat:ident, $mib:ident) => { + impl MetricEncoding> for JemallocGauge { + fn write_type( + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + GaugeState::write_type(name, enc) + } + + fn collect_into( + &self, + mib: &stats::$mib, + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + if let Ok(v) = mib.read() { + enc.write_metric_value(name, labels, MetricValue::Int(v as i64))?; + } + Ok(()) + } + } + }; +} + +jemalloc_gauge!(active, active_mib); +jemalloc_gauge!(allocated, allocated_mib); +jemalloc_gauge!(mapped, mapped_mib); +jemalloc_gauge!(metadata, metadata_mib); +jemalloc_gauge!(resident, resident_mib); +jemalloc_gauge!(retained, retained_mib); diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index da7c7f3ed2..3f6d985fe8 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -127,6 +127,24 @@ macro_rules! smol_str_wrapper { }; } +const POOLER_SUFFIX: &str = "-pooler"; + +pub trait Normalize { + fn normalize(&self) -> Self; +} + +impl + From> Normalize for S { + fn normalize(&self) -> Self { + if self.as_ref().ends_with(POOLER_SUFFIX) { + let mut s = self.as_ref().to_string(); + s.truncate(s.len() - POOLER_SUFFIX.len()); + s.into() + } else { + self.clone() + } + } +} + // 90% of role name strings are 20 characters or less. smol_str_wrapper!(RoleName); // 50% of endpoint strings are 23 characters or less. @@ -140,3 +158,22 @@ smol_str_wrapper!(ProjectId); smol_str_wrapper!(EndpointCacheKey); smol_str_wrapper!(DbName); + +// Endpoints are a bit tricky. Rare they might be branches or projects. +impl EndpointId { + pub fn is_endpoint(&self) -> bool { + self.0.starts_with("ep-") + } + pub fn is_branch(&self) -> bool { + self.0.starts_with("br-") + } + pub fn is_project(&self) -> bool { + !self.is_endpoint() && !self.is_branch() + } + pub fn as_branch(&self) -> BranchId { + BranchId(self.0.clone()) + } + pub fn as_project(&self) -> ProjectId { + ProjectId(self.0.clone()) + } +} diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 59ee899c08..b96950b0a2 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -1,176 +1,359 @@ -use ::metrics::{ - exponential_buckets, register_histogram, register_histogram_vec, register_hll_vec, - register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, - register_int_gauge_vec, Histogram, HistogramVec, HyperLogLogVec, IntCounterPairVec, - IntCounterVec, IntGauge, IntGaugeVec, -}; -use metrics::{ - register_hll, register_int_counter, register_int_counter_pair, HyperLogLog, IntCounter, - IntCounterPair, -}; +use std::sync::OnceLock; + +use lasso::ThreadedRodeo; +use measured::{ + label::StaticLabelSet, + metric::{histogram::Thresholds, name::MetricName}, + Counter, CounterVec, FixedCardinalityLabel, Gauge, GaugeVec, Histogram, HistogramVec, + LabelGroup, MetricGroup, +}; +use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec}; -use once_cell::sync::Lazy; use tokio::time::{self, Instant}; use crate::console::messages::ColdStartInfo; -pub static NUM_DB_CONNECTIONS_GAUGE: Lazy = Lazy::new(|| { - register_int_counter_pair_vec!( - "proxy_opened_db_connections_total", - "Number of opened connections to a database.", - "proxy_closed_db_connections_total", - "Number of closed connections to a database.", - &["protocol"], - ) - .unwrap() -}); +#[derive(MetricGroup)] +pub struct Metrics { + #[metric(namespace = "proxy")] + pub proxy: ProxyMetrics, -pub static NUM_CLIENT_CONNECTION_GAUGE: Lazy = Lazy::new(|| { - register_int_counter_pair_vec!( - "proxy_opened_client_connections_total", - "Number of opened connections from a client.", - "proxy_closed_client_connections_total", - "Number of closed connections from a client.", - &["protocol"], - ) - .unwrap() -}); + #[metric(namespace = "wake_compute_lock")] + pub wake_compute_lock: ApiLockMetrics, -pub static NUM_CONNECTION_REQUESTS_GAUGE: Lazy = Lazy::new(|| { - register_int_counter_pair_vec!( - "proxy_accepted_connections_total", - "Number of client connections accepted.", - "proxy_closed_connections_total", - "Number of client connections closed.", - &["protocol"], - ) - .unwrap() -}); + // the one metric not called proxy_.... + pub semaphore_control_plane_limit: GaugeVec>, +} -pub static COMPUTE_CONNECTION_LATENCY: Lazy = Lazy::new(|| { - register_histogram_vec!( - "proxy_compute_connection_latency_seconds", - "Time it took for proxy to establish a connection to the compute endpoint", - // http/ws/tcp, true/false, true/false, success/failure, client/client_and_cplane - // 3 * 6 * 2 * 2 = 72 counters - &["protocol", "cold_start_info", "outcome", "excluded"], - // largest bucket = 2^16 * 0.5ms = 32s - exponential_buckets(0.0005, 2.0, 16).unwrap(), - ) - .unwrap() -}); +impl Metrics { + pub fn get() -> &'static Self { + static SELF: OnceLock = OnceLock::new(); + SELF.get_or_init(|| Metrics { + proxy: ProxyMetrics::default(), + wake_compute_lock: ApiLockMetrics::new(), + semaphore_control_plane_limit: GaugeVec::default(), + }) + } +} -pub static CONSOLE_REQUEST_LATENCY: Lazy = Lazy::new(|| { - register_histogram_vec!( - "proxy_console_request_latency", - "Time it took for proxy to establish a connection to the compute endpoint", - // proxy_wake_compute/proxy_get_role_info - &["request"], +#[derive(MetricGroup)] +#[metric(new())] +pub struct ProxyMetrics { + #[metric(flatten)] + pub db_connections: CounterPairVec, + #[metric(flatten)] + pub client_connections: CounterPairVec, + #[metric(flatten)] + pub connection_requests: CounterPairVec, + #[metric(flatten)] + pub http_endpoint_pools: HttpEndpointPools, + + /// Time it took for proxy to establish a connection to the compute endpoint. + // largest bucket = 2^16 * 0.5ms = 32s + #[metric(metadata = Thresholds::exponential_buckets(0.0005, 2.0))] + pub compute_connection_latency_seconds: HistogramVec, + + /// Time it took for proxy to receive a response from control plane. + #[metric( // largest bucket = 2^16 * 0.2ms = 13s - exponential_buckets(0.0002, 2.0, 16).unwrap(), - ) - .unwrap() -}); + metadata = Thresholds::exponential_buckets(0.0002, 2.0), + )] + pub console_request_latency: HistogramVec, -pub static ALLOWED_IPS_BY_CACHE_OUTCOME: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_allowed_ips_cache_misses", - "Number of cache hits/misses for allowed ips", - // hit/miss - &["outcome"], - ) - .unwrap() -}); + /// Time it takes to acquire a token to call console plane. + // largest bucket = 3^16 * 0.05ms = 2.15s + #[metric(metadata = Thresholds::exponential_buckets(0.00005, 3.0))] + pub control_plane_token_acquire_seconds: Histogram<16>, -pub static RATE_LIMITER_ACQUIRE_LATENCY: Lazy = Lazy::new(|| { - register_histogram!( - "proxy_control_plane_token_acquire_seconds", - "Time it took for proxy to establish a connection to the compute endpoint", - // largest bucket = 3^16 * 0.05ms = 2.15s - exponential_buckets(0.00005, 3.0, 16).unwrap(), - ) - .unwrap() -}); + /// Size of the HTTP request body lengths. + // smallest bucket = 16 bytes + // largest bucket = 4^12 * 16 bytes = 256MB + #[metric(metadata = Thresholds::exponential_buckets(16.0, 4.0))] + pub http_conn_content_length_bytes: HistogramVec, 12>, -pub static RATE_LIMITER_LIMIT: Lazy = Lazy::new(|| { - register_int_gauge_vec!( - "semaphore_control_plane_limit", - "Current limit of the semaphore control plane", - &["limit"], // 2 counters - ) - .unwrap() -}); + /// Time it takes to reclaim unused connection pools. + #[metric(metadata = Thresholds::exponential_buckets(1e-6, 2.0))] + pub http_pool_reclaimation_lag_seconds: Histogram<16>, -pub static NUM_CONNECTION_ACCEPTED_BY_SNI: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_accepted_connections_by_sni", - "Number of connections (per sni).", - &["kind"], - ) - .unwrap() -}); + /// Number of opened connections to a database. + pub http_pool_opened_connections: Gauge, -pub static ALLOWED_IPS_NUMBER: Lazy = Lazy::new(|| { - register_histogram!( - "proxy_allowed_ips_number", - "Number of allowed ips", - vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0], - ) - .unwrap() -}); + /// Number of cache hits/misses for allowed ips. + pub allowed_ips_cache_misses: CounterVec>, -pub static HTTP_CONTENT_LENGTH: Lazy = Lazy::new(|| { - register_histogram_vec!( - "proxy_http_conn_content_length_bytes", - "Number of bytes the HTTP response content consumes", - // request/response - &["direction"], - // smallest bucket = 16 bytes - // largest bucket = 4^12 * 16 bytes = 256MB - exponential_buckets(16.0, 4.0, 12).unwrap() - ) - .unwrap() -}); + /// Number of allowed ips + #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))] + pub allowed_ips_number: Histogram<10>, -pub static GC_LATENCY: Lazy = Lazy::new(|| { - register_histogram!( - "proxy_http_pool_reclaimation_lag_seconds", - "Time it takes to reclaim unused connection pools", - // 1us -> 65ms - exponential_buckets(1e-6, 2.0, 16).unwrap(), - ) - .unwrap() -}); + /// Number of connections (per sni). + pub accepted_connections_by_sni: CounterVec>, -pub static ENDPOINT_POOLS: Lazy = Lazy::new(|| { - register_int_counter_pair!( - "proxy_http_pool_endpoints_registered_total", - "Number of endpoints we have registered pools for", - "proxy_http_pool_endpoints_unregistered_total", - "Number of endpoints we have unregistered pools for", - ) - .unwrap() -}); + /// Number of connection failures (per kind). + pub connection_failures_total: CounterVec>, -pub static NUM_OPEN_CLIENTS_IN_HTTP_POOL: Lazy = Lazy::new(|| { - register_int_gauge!( - "proxy_http_pool_opened_connections", - "Number of opened connections to a database.", - ) - .unwrap() -}); + /// Number of wake-up failures (per kind). + pub connection_failures_breakdown: CounterVec, -pub static NUM_CANCELLATION_REQUESTS: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_cancellation_requests_total", - "Number of cancellation requests (per found/not_found).", - &["source", "kind"], - ) - .unwrap() -}); + /// Number of bytes sent/received between all clients and backends. + pub io_bytes: CounterVec>, -pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT: &str = "from_client"; -pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS: &str = "from_redis"; + /// Number of errors by a given classification. + pub errors_total: CounterVec>, + + /// Number of cancellation requests (per found/not_found). + pub cancellation_requests_total: CounterVec, + + /// Number of errors by a given classification + pub redis_errors_total: CounterVec, + + /// Number of TLS handshake failures + pub tls_handshake_failures: Counter, + + /// Number of connection requests affected by authentication rate limits + pub requests_auth_rate_limits_total: Counter, + + /// HLL approximate cardinality of endpoints that are connecting + pub connecting_endpoints: HyperLogLogVec, 32>, + + /// Number of endpoints affected by errors of a given classification + pub endpoints_affected_by_errors: HyperLogLogVec, 32>, + + /// Number of endpoints affected by authentication rate limits + pub endpoints_auth_rate_limits: HyperLogLog<32>, + + /// Number of invalid endpoints (per protocol, per rejected). + pub invalid_endpoints_total: CounterVec, +} + +#[derive(MetricGroup)] +#[metric(new())] +pub struct ApiLockMetrics { + /// Number of semaphores registered in this api lock + pub semaphores_registered: Counter, + /// Number of semaphores unregistered in this api lock + pub semaphores_unregistered: Counter, + /// Time it takes to reclaim unused semaphores in the api lock + #[metric(metadata = Thresholds::exponential_buckets(1e-6, 2.0))] + pub reclamation_lag_seconds: Histogram<16>, + /// Time it takes to acquire a semaphore lock + #[metric(metadata = Thresholds::exponential_buckets(1e-4, 2.0))] + pub semaphore_acquire_seconds: Histogram<16>, +} + +impl Default for ProxyMetrics { + fn default() -> Self { + Self::new() + } +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "direction")] +pub enum HttpDirection { + Request, + Response, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "direction")] +pub enum Direction { + Tx, + Rx, +} + +#[derive(FixedCardinalityLabel, Clone, Copy, Debug)] +#[label(singleton = "protocol")] +pub enum Protocol { + Http, + Ws, + Tcp, + SniRouter, +} + +impl Protocol { + pub fn as_str(&self) -> &'static str { + match self { + Protocol::Http => "http", + Protocol::Ws => "ws", + Protocol::Tcp => "tcp", + Protocol::SniRouter => "sni_router", + } + } +} + +impl std::fmt::Display for Protocol { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(self.as_str()) + } +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum Bool { + True, + False, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "outcome")] +pub enum Outcome { + Success, + Failed, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "outcome")] +pub enum CacheOutcome { + Hit, + Miss, +} + +#[derive(LabelGroup)] +#[label(set = ConsoleRequestSet)] +pub struct ConsoleRequest<'a> { + #[label(dynamic_with = ThreadedRodeo, default)] + pub request: &'a str, +} + +#[derive(MetricGroup, Default)] +pub struct HttpEndpointPools { + /// Number of endpoints we have registered pools for + pub http_pool_endpoints_registered_total: Counter, + /// Number of endpoints we have unregistered pools for + pub http_pool_endpoints_unregistered_total: Counter, +} + +pub struct HttpEndpointPoolsGuard<'a> { + dec: &'a Counter, +} + +impl Drop for HttpEndpointPoolsGuard<'_> { + fn drop(&mut self) { + self.dec.inc(); + } +} + +impl HttpEndpointPools { + pub fn guard(&self) -> HttpEndpointPoolsGuard { + self.http_pool_endpoints_registered_total.inc(); + HttpEndpointPoolsGuard { + dec: &self.http_pool_endpoints_unregistered_total, + } + } +} +pub struct NumDbConnectionsGauge; +impl CounterPairAssoc for NumDbConnectionsGauge { + const INC_NAME: &'static MetricName = MetricName::from_str("opened_db_connections_total"); + const DEC_NAME: &'static MetricName = MetricName::from_str("closed_db_connections_total"); + const INC_HELP: &'static str = "Number of opened connections to a database."; + const DEC_HELP: &'static str = "Number of closed connections to a database."; + type LabelGroupSet = StaticLabelSet; +} +pub type NumDbConnectionsGuard<'a> = metrics::MeasuredCounterPairGuard<'a, NumDbConnectionsGauge>; + +pub struct NumClientConnectionsGauge; +impl CounterPairAssoc for NumClientConnectionsGauge { + const INC_NAME: &'static MetricName = MetricName::from_str("opened_client_connections_total"); + const DEC_NAME: &'static MetricName = MetricName::from_str("closed_client_connections_total"); + const INC_HELP: &'static str = "Number of opened connections from a client."; + const DEC_HELP: &'static str = "Number of closed connections from a client."; + type LabelGroupSet = StaticLabelSet; +} +pub type NumClientConnectionsGuard<'a> = + metrics::MeasuredCounterPairGuard<'a, NumClientConnectionsGauge>; + +pub struct NumConnectionRequestsGauge; +impl CounterPairAssoc for NumConnectionRequestsGauge { + const INC_NAME: &'static MetricName = MetricName::from_str("accepted_connections_total"); + const DEC_NAME: &'static MetricName = MetricName::from_str("closed_connections_total"); + const INC_HELP: &'static str = "Number of client connections accepted."; + const DEC_HELP: &'static str = "Number of client connections closed."; + type LabelGroupSet = StaticLabelSet; +} +pub type NumConnectionRequestsGuard<'a> = + metrics::MeasuredCounterPairGuard<'a, NumConnectionRequestsGauge>; + +#[derive(LabelGroup)] +#[label(set = ComputeConnectionLatencySet)] +pub struct ComputeConnectionLatencyGroup { + protocol: Protocol, + cold_start_info: ColdStartInfo, + outcome: ConnectOutcome, + excluded: LatencyExclusions, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum LatencyExclusions { + Client, + ClientAndCplane, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "limit")] +pub enum RateLimit { + Actual, + Expected, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "kind")] +pub enum SniKind { + Sni, + NoSni, + PasswordHack, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "kind")] +pub enum ConnectionFailureKind { + ComputeCached, + ComputeUncached, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "kind")] +pub enum WakeupFailureKind { + BadComputeAddress, + ApiTransportError, + QuotaExceeded, + ApiConsoleLocked, + ApiConsoleBadRequest, + ApiConsoleOtherServerError, + ApiConsoleOtherError, + TimeoutError, +} + +#[derive(LabelGroup)] +#[label(set = ConnectionFailuresBreakdownSet)] +pub struct ConnectionFailuresBreakdownGroup { + pub kind: WakeupFailureKind, + pub retry: Bool, +} + +#[derive(LabelGroup, Copy, Clone)] +#[label(set = RedisErrorsSet)] +pub struct RedisErrors<'a> { + #[label(dynamic_with = ThreadedRodeo, default)] + pub channel: &'a str, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum CancellationSource { + FromClient, + FromRedis, + Local, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum CancellationOutcome { + NotFound, + Found, +} + +#[derive(LabelGroup)] +#[label(set = CancellationRequestSet)] +pub struct CancellationRequest { + pub source: CancellationSource, + pub kind: CancellationOutcome, +} pub enum Waiting { Cplane, @@ -185,20 +368,6 @@ struct Accumulated { compute: time::Duration, } -enum Outcome { - Success, - Failed, -} - -impl Outcome { - fn as_str(&self) -> &'static str { - match self { - Outcome::Success => "success", - Outcome::Failed => "failed", - } - } -} - pub struct LatencyTimer { // time since the stopwatch was started start: time::Instant, @@ -207,9 +376,9 @@ pub struct LatencyTimer { // accumulated time on the stopwatch accumulated: Accumulated, // label data - protocol: &'static str, + protocol: Protocol, cold_start_info: ColdStartInfo, - outcome: Outcome, + outcome: ConnectOutcome, } pub struct LatencyTimerPause<'a> { @@ -219,7 +388,7 @@ pub struct LatencyTimerPause<'a> { } impl LatencyTimer { - pub fn new(protocol: &'static str) -> Self { + pub fn new(protocol: Protocol) -> Self { Self { start: time::Instant::now(), stop: None, @@ -227,7 +396,7 @@ impl LatencyTimer { protocol, cold_start_info: ColdStartInfo::Unknown, // assume failed unless otherwise specified - outcome: Outcome::Failed, + outcome: ConnectOutcome::Failed, } } @@ -248,7 +417,7 @@ impl LatencyTimer { self.stop = Some(time::Instant::now()); // success - self.outcome = Outcome::Success; + self.outcome = ConnectOutcome::Success; } } @@ -263,128 +432,62 @@ impl Drop for LatencyTimerPause<'_> { } } +#[derive(FixedCardinalityLabel, Clone, Copy, Debug)] +pub enum ConnectOutcome { + Success, + Failed, +} + impl Drop for LatencyTimer { fn drop(&mut self) { let duration = self .stop .unwrap_or_else(time::Instant::now) .duration_since(self.start); - // Excluding cplane communication from the accumulated time. - COMPUTE_CONNECTION_LATENCY - .with_label_values(&[ - self.protocol, - self.cold_start_info.as_str(), - self.outcome.as_str(), - "client", - ]) - .observe((duration.saturating_sub(self.accumulated.client)).as_secs_f64()); + + let metric = &Metrics::get().proxy.compute_connection_latency_seconds; + + // Excluding client communication from the accumulated time. + metric.observe( + ComputeConnectionLatencyGroup { + protocol: self.protocol, + cold_start_info: self.cold_start_info, + outcome: self.outcome, + excluded: LatencyExclusions::Client, + }, + duration + .saturating_sub(self.accumulated.client) + .as_secs_f64(), + ); + // Exclude client and cplane communication from the accumulated time. let accumulated_total = self.accumulated.client + self.accumulated.cplane; - COMPUTE_CONNECTION_LATENCY - .with_label_values(&[ - self.protocol, - self.cold_start_info.as_str(), - self.outcome.as_str(), - "client_and_cplane", - ]) - .observe((duration.saturating_sub(accumulated_total)).as_secs_f64()); + metric.observe( + ComputeConnectionLatencyGroup { + protocol: self.protocol, + cold_start_info: self.cold_start_info, + outcome: self.outcome, + excluded: LatencyExclusions::ClientAndCplane, + }, + duration.saturating_sub(accumulated_total).as_secs_f64(), + ); } } -pub static NUM_CONNECTION_FAILURES: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_connection_failures_total", - "Number of connection failures (per kind).", - &["kind"], - ) - .unwrap() -}); - -pub static NUM_WAKEUP_FAILURES: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_connection_failures_breakdown", - "Number of wake-up failures (per kind).", - &["retry", "kind"], - ) - .unwrap() -}); - -pub static NUM_BYTES_PROXIED_COUNTER: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_io_bytes", - "Number of bytes sent/received between all clients and backends.", - &["direction"], - ) - .unwrap() -}); - -pub const fn bool_to_str(x: bool) -> &'static str { - if x { - "true" - } else { - "false" +impl From for Bool { + fn from(value: bool) -> Self { + if value { + Bool::True + } else { + Bool::False + } } } -pub static CONNECTING_ENDPOINTS: Lazy> = Lazy::new(|| { - register_hll_vec!( - 32, - "proxy_connecting_endpoints", - "HLL approximate cardinality of endpoints that are connecting", - &["protocol"], - ) - .unwrap() -}); - -pub static ERROR_BY_KIND: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_errors_total", - "Number of errors by a given classification", - &["type"], - ) - .unwrap() -}); - -pub static ENDPOINT_ERRORS_BY_KIND: Lazy> = Lazy::new(|| { - register_hll_vec!( - 32, - "proxy_endpoints_affected_by_errors", - "Number of endpoints affected by errors of a given classification", - &["type"], - ) - .unwrap() -}); - -pub static REDIS_BROKEN_MESSAGES: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_redis_errors_total", - "Number of errors by a given classification", - &["channel"], - ) - .unwrap() -}); - -pub static TLS_HANDSHAKE_FAILURES: Lazy = Lazy::new(|| { - register_int_counter!( - "proxy_tls_handshake_failures", - "Number of TLS handshake failures", - ) - .unwrap() -}); - -pub static ENDPOINTS_AUTH_RATE_LIMITED: Lazy> = Lazy::new(|| { - register_hll!( - 32, - "proxy_endpoints_auth_rate_limits", - "Number of endpoints affected by authentication rate limits", - ) - .unwrap() -}); - -pub static AUTH_RATE_LIMIT_HITS: Lazy = Lazy::new(|| { - register_int_counter!( - "proxy_requests_auth_rate_limits_total", - "Number of connection requests affected by authentication rate limits", - ) - .unwrap() -}); +#[derive(LabelGroup)] +#[label(set = InvalidEndpointsSet)] +pub struct InvalidEndpointsGroup { + pub protocol: Protocol, + pub rejected: Bool, + pub outcome: ConnectOutcome, +} diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 700c8c8681..70f9b4bfab 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -5,19 +5,13 @@ use std::{ io, net::SocketAddr, pin::{pin, Pin}, - sync::Mutex, task::{ready, Context, Poll}, }; use bytes::{Buf, BytesMut}; -use hyper::server::accept::Accept; -use hyper::server::conn::{AddrIncoming, AddrStream}; -use metrics::IntCounterPairGuard; +use hyper::server::conn::AddrIncoming; use pin_project_lite::pin_project; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf}; -use uuid::Uuid; - -use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE; pub struct ProxyProtocolAccept { pub incoming: AddrIncoming, @@ -331,103 +325,6 @@ impl AsyncRead for WithClientIp { } } -impl Accept for ProxyProtocolAccept { - type Conn = WithConnectionGuard>; - - type Error = io::Error; - - fn poll_accept( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll>> { - let conn = ready!(Pin::new(&mut self.incoming).poll_accept(cx)?); - - let conn_id = uuid::Uuid::new_v4(); - let span = tracing::info_span!("http_conn", ?conn_id); - { - let _enter = span.enter(); - tracing::info!("accepted new TCP connection"); - } - - let Some(conn) = conn else { - return Poll::Ready(None); - }; - - Poll::Ready(Some(Ok(WithConnectionGuard { - inner: WithClientIp::new(conn), - connection_id: Uuid::new_v4(), - gauge: Mutex::new(Some( - NUM_CLIENT_CONNECTION_GAUGE - .with_label_values(&[self.protocol]) - .guard(), - )), - span, - }))) - } -} - -pin_project! { - pub struct WithConnectionGuard { - #[pin] - pub inner: T, - pub connection_id: Uuid, - pub gauge: Mutex>, - pub span: tracing::Span, - } - - impl PinnedDrop for WithConnectionGuard { - fn drop(this: Pin<&mut Self>) { - let _enter = this.span.enter(); - tracing::info!("HTTP connection closed") - } - } -} - -impl AsyncWrite for WithConnectionGuard { - #[inline] - fn poll_write( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - buf: &[u8], - ) -> Poll> { - self.project().inner.poll_write(cx, buf) - } - - #[inline] - fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - self.project().inner.poll_flush(cx) - } - - #[inline] - fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - self.project().inner.poll_shutdown(cx) - } - - #[inline] - fn poll_write_vectored( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - bufs: &[io::IoSlice<'_>], - ) -> Poll> { - self.project().inner.poll_write_vectored(cx, bufs) - } - - #[inline] - fn is_write_vectored(&self) -> bool { - self.inner.is_write_vectored() - } -} - -impl AsyncRead for WithConnectionGuard { - fn poll_read( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - buf: &mut ReadBuf<'_>, - ) -> Poll> { - self.project().inner.poll_read(cx, buf) - } -} - #[cfg(test)] mod tests { use std::pin::pin; diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 6051c0a812..42fb10b326 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -15,16 +15,15 @@ use crate::{ config::{ProxyConfig, TlsConfig}, context::RequestMonitoring, error::ReportableError, - metrics::{NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE}, + metrics::{Metrics, NumClientConnectionsGuard}, protocol2::WithClientIp, proxy::handshake::{handshake, HandshakeData}, rate_limiter::EndpointRateLimiter, stream::{PqStream, Stream}, - EndpointCacheKey, + EndpointCacheKey, Normalize, }; use futures::TryFutureExt; use itertools::Itertools; -use metrics::IntCounterPairGuard; use once_cell::sync::OnceCell; use pq_proto::{BeMessage as Be, StartupMessageParams}; use regex::Regex; @@ -79,9 +78,10 @@ pub async fn task_main( { let (socket, peer_addr) = accept_result?; - let conn_gauge = NUM_CLIENT_CONNECTION_GAUGE - .with_label_values(&["tcp"]) - .guard(); + let conn_gauge = Metrics::get() + .proxy + .client_connections + .guard(crate::metrics::Protocol::Tcp); let session_id = uuid::Uuid::new_v4(); let cancellation_handler = Arc::clone(&cancellation_handler); @@ -113,7 +113,12 @@ pub async fn task_main( }, }; - let mut ctx = RequestMonitoring::new(session_id, peer_addr, "tcp", &config.region); + let mut ctx = RequestMonitoring::new( + session_id, + peer_addr, + crate::metrics::Protocol::Tcp, + &config.region, + ); let span = ctx.span.clone(); let res = handle_client( @@ -237,14 +242,17 @@ pub async fn handle_client( stream: S, mode: ClientMode, endpoint_rate_limiter: Arc, - conn_gauge: IntCounterPairGuard, + conn_gauge: NumClientConnectionsGuard<'static>, ) -> Result>, ClientRequestError> { - info!("handling interactive connection from client"); + info!( + protocol = %ctx.protocol, + "handling interactive connection from client" + ); + let metrics = &Metrics::get().proxy; let proto = ctx.protocol; - let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE - .with_label_values(&[proto]) - .guard(); + // let _client_gauge = metrics.client_connections.guard(proto); + let _request_gauge = metrics.connection_requests.guard(proto); let tls = config.tls_config.as_ref(); @@ -280,7 +288,7 @@ pub async fn handle_client( // check rate limit if let Some(ep) = user_info.get_endpoint() { - if !endpoint_rate_limiter.check(ep, 1) { + if !endpoint_rate_limiter.check(ep.normalize(), 1) { return stream .throw_error(auth::AuthError::too_many_connections()) .await?; diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 4c0d68ce0b..33f394c550 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -4,7 +4,7 @@ use crate::{ console::{self, errors::WakeComputeError, CachedNodeInfo, NodeInfo}, context::RequestMonitoring, error::ReportableError, - metrics::NUM_CONNECTION_FAILURES, + metrics::{ConnectionFailureKind, Metrics}, proxy::{ retry::{retry_after, ShouldRetry}, wake_compute::wake_compute, @@ -27,10 +27,10 @@ pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> NodeInfo { warn!("invalidating stalled compute node info cache entry"); } let label = match is_cached { - true => "compute_cached", - false => "compute_uncached", + true => ConnectionFailureKind::ComputeCached, + false => ConnectionFailureKind::ComputeUncached, }; - NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc(); + Metrics::get().proxy.connection_failures_total.inc(label); node_info.invalidate() } diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index c81a1a8292..62de79946f 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -2,11 +2,10 @@ use crate::{ cancellation, compute::PostgresConnection, console::messages::MetricsAuxInfo, - metrics::NUM_BYTES_PROXIED_COUNTER, + metrics::{Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard}, stream::Stream, usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS}, }; -use metrics::IntCounterPairGuard; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; use utils::measured_stream::MeasuredStream; @@ -23,24 +22,25 @@ pub async fn proxy_pass( branch_id: aux.branch_id, }); - let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]); + let metrics = &Metrics::get().proxy.io_bytes; + let m_sent = metrics.with_labels(Direction::Tx); let mut client = MeasuredStream::new( client, |_| {}, |cnt| { // Number of bytes we sent to the client (outbound). - m_sent.inc_by(cnt as u64); + metrics.get_metric(m_sent).inc_by(cnt as u64); usage.record_egress(cnt as u64); }, ); - let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx"]); + let m_recv = metrics.with_labels(Direction::Rx); let mut compute = MeasuredStream::new( compute, |_| {}, |cnt| { // Number of bytes the client sent to the compute node (inbound). - m_recv.inc_by(cnt as u64); + metrics.get_metric(m_recv).inc_by(cnt as u64); }, ); @@ -60,8 +60,8 @@ pub struct ProxyPassthrough { pub compute: PostgresConnection, pub aux: MetricsAuxInfo, - pub req: IntCounterPairGuard, - pub conn: IntCounterPairGuard, + pub req: NumConnectionRequestsGuard<'static>, + pub conn: NumClientConnectionsGuard<'static>, pub cancel: cancellation::Session

, } diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index bfe4b7ec3a..f8154b1a94 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -1,6 +1,6 @@ use crate::console::{errors::WakeComputeError, provider::CachedNodeInfo}; use crate::context::RequestMonitoring; -use crate::metrics::{bool_to_str, NUM_WAKEUP_FAILURES}; +use crate::metrics::{ConnectionFailuresBreakdownGroup, Metrics, WakeupFailureKind}; use crate::proxy::retry::retry_after; use hyper::StatusCode; use std::ops::ControlFlow; @@ -57,39 +57,46 @@ pub fn handle_try_wake( fn report_error(e: &WakeComputeError, retry: bool) { use crate::console::errors::ApiError; - let retry = bool_to_str(retry); let kind = match e { - WakeComputeError::BadComputeAddress(_) => "bad_compute_address", - WakeComputeError::ApiError(ApiError::Transport(_)) => "api_transport_error", + WakeComputeError::BadComputeAddress(_) => WakeupFailureKind::BadComputeAddress, + WakeComputeError::ApiError(ApiError::Transport(_)) => WakeupFailureKind::ApiTransportError, WakeComputeError::ApiError(ApiError::Console { status: StatusCode::LOCKED, ref text, }) if text.contains("written data quota exceeded") || text.contains("the limit for current plan reached") => { - "quota_exceeded" + WakeupFailureKind::QuotaExceeded } WakeComputeError::ApiError(ApiError::Console { status: StatusCode::UNPROCESSABLE_ENTITY, ref text, }) if text.contains("compute time quota of non-primary branches is exceeded") => { - "quota_exceeded" + WakeupFailureKind::QuotaExceeded } WakeComputeError::ApiError(ApiError::Console { status: StatusCode::LOCKED, .. - }) => "api_console_locked", + }) => WakeupFailureKind::ApiConsoleLocked, WakeComputeError::ApiError(ApiError::Console { status: StatusCode::BAD_REQUEST, .. - }) => "api_console_bad_request", + }) => WakeupFailureKind::ApiConsoleBadRequest, WakeComputeError::ApiError(ApiError::Console { status, .. }) if status.is_server_error() => { - "api_console_other_server_error" + WakeupFailureKind::ApiConsoleOtherServerError } - WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error", - WakeComputeError::TimeoutError => "timeout_error", + WakeComputeError::ApiError(ApiError::Console { .. }) => { + WakeupFailureKind::ApiConsoleOtherError + } + WakeComputeError::TimeoutError => WakeupFailureKind::TimeoutError, }; - NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc(); + Metrics::get() + .proxy + .connection_failures_breakdown + .inc(ConnectionFailuresBreakdownGroup { + kind, + retry: retry.into(), + }); } diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index 13dffffca0..a3b83e5e50 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -4,4 +4,4 @@ mod limiter; pub use aimd::Aimd; pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig}; pub use limiter::Limiter; -pub use limiter::{AuthRateLimiter, EndpointRateLimiter, RateBucketInfo, RedisRateLimiter}; +pub use limiter::{AuthRateLimiter, EndpointRateLimiter, GlobalRateLimiter, RateBucketInfo}; diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index f590896dd9..7e9370f606 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -17,20 +17,26 @@ use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit}; use tokio::time::{timeout, Duration, Instant}; use tracing::info; -use crate::{intern::EndpointIdInt, EndpointId}; +use crate::{ + intern::EndpointIdInt, + { + metrics::{Metrics, RateLimit}, + EndpointId, + }, +}; use super::{ limit_algorithm::{LimitAlgorithm, Sample}, RateLimiterConfig, }; -pub struct RedisRateLimiter { +pub struct GlobalRateLimiter { data: Vec, - info: &'static [RateBucketInfo], + info: Vec, } -impl RedisRateLimiter { - pub fn new(info: &'static [RateBucketInfo]) -> Self { +impl GlobalRateLimiter { + pub fn new(info: Vec) -> Self { Self { data: vec![ RateBucket { @@ -50,7 +56,7 @@ impl RedisRateLimiter { let should_allow_request = self .data .iter_mut() - .zip(self.info) + .zip(&self.info) .all(|(bucket, info)| bucket.should_allow_request(info, now, 1)); if should_allow_request { @@ -457,12 +463,9 @@ impl Limiter { } new_limit }; - crate::metrics::RATE_LIMITER_LIMIT - .with_label_values(&["expected"]) - .set(new_limit as i64); - crate::metrics::RATE_LIMITER_LIMIT - .with_label_values(&["actual"]) - .set(actual_limit as i64); + let metric = &Metrics::get().semaphore_control_plane_limit; + metric.set(RateLimit::Expected, new_limit as i64); + metric.set(RateLimit::Actual, actual_limit as i64); self.limits.store(new_limit, Ordering::Release); #[cfg(test)] if let Some(n) = &self.notifier { @@ -519,7 +522,10 @@ impl reqwest_middleware::Middleware for Limiter { extensions: &mut task_local_extensions::Extensions, next: reqwest_middleware::Next<'_>, ) -> reqwest_middleware::Result { - let start = Instant::now(); + let timer = Metrics::get() + .proxy + .control_plane_token_acquire_seconds + .start_timer(); let token = self .acquire_timeout(self.config.timeout) .await @@ -533,8 +539,12 @@ impl reqwest_middleware::Middleware for Limiter { .into(), ) })?; - info!(duration = ?start.elapsed(), "waiting for token to connect to the control plane"); - crate::metrics::RATE_LIMITER_ACQUIRE_LATENCY.observe(start.elapsed().as_secs_f64()); + let duration = timer.observe(); + info!( + ?duration, + "waiting for token to connect to the control plane" + ); + match next.run(req, extensions).await { Ok(response) => { self.release(token, Some(Outcome::from_reqwest_response(&response))) diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs index 422789813c..7baf104374 100644 --- a/proxy/src/redis/cancellation_publisher.rs +++ b/proxy/src/redis/cancellation_publisher.rs @@ -5,7 +5,7 @@ use redis::AsyncCommands; use tokio::sync::Mutex; use uuid::Uuid; -use crate::rate_limiter::{RateBucketInfo, RedisRateLimiter}; +use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; use super::{ connection_with_credentials_provider::ConnectionWithCredentialsProvider, @@ -80,7 +80,7 @@ impl CancellationPublisher for Arc> { pub struct RedisPublisherClient { client: ConnectionWithCredentialsProvider, region_id: String, - limiter: RedisRateLimiter, + limiter: GlobalRateLimiter, } impl RedisPublisherClient { @@ -92,7 +92,7 @@ impl RedisPublisherClient { Ok(Self { client, region_id, - limiter: RedisRateLimiter::new(info), + limiter: GlobalRateLimiter::new(info.into()), }) } diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 8b7e3e3419..5a38530faf 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -11,7 +11,7 @@ use crate::{ cache::project_info::ProjectInfoCache, cancellation::{CancelMap, CancellationHandler}, intern::{ProjectIdInt, RoleNameInt}, - metrics::{NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS, REDIS_BROKEN_MESSAGES}, + metrics::{Metrics, RedisErrors}, }; const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; @@ -104,9 +104,9 @@ impl MessageHandler { let msg: Notification = match serde_json::from_str(&payload) { Ok(msg) => msg, Err(e) => { - REDIS_BROKEN_MESSAGES - .with_label_values(&[msg.get_channel_name()]) - .inc(); + Metrics::get().proxy.redis_errors_total.inc(RedisErrors { + channel: msg.get_channel_name(), + }); tracing::error!("broken message: {e}"); return Ok(()); } @@ -183,7 +183,7 @@ where cache, Arc::new(CancellationHandler::<()>::new( cancel_map, - NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS, + crate::metrics::CancellationSource::FromRedis, )), region_id, ); diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index a2010fd613..24c94fadd8 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -4,42 +4,48 @@ mod backend; mod conn_pool; +mod http_util; mod json; mod sql_over_http; -pub mod tls_listener; mod websocket; +use atomic_take::AtomicTake; +use bytes::Bytes; pub use conn_pool::GlobalConnPoolOptions; -use anyhow::bail; -use hyper::StatusCode; -use metrics::IntCounterPairGuard; +use anyhow::Context; +use futures::future::{select, Either}; +use futures::TryFutureExt; +use http::{Method, Response, StatusCode}; +use http_body_util::Full; +use hyper1::body::Incoming; +use hyper_util::rt::TokioExecutor; +use hyper_util::server::conn::auto::Builder; use rand::rngs::StdRng; use rand::SeedableRng; pub use reqwest_middleware::{ClientWithMiddleware, Error}; pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; +use tokio::time::timeout; +use tokio_rustls::TlsAcceptor; use tokio_util::task::TaskTracker; -use tracing::instrument::Instrumented; use crate::cancellation::CancellationHandlerMain; use crate::config::ProxyConfig; use crate::context::RequestMonitoring; -use crate::protocol2::{ProxyProtocolAccept, WithClientIp, WithConnectionGuard}; +use crate::metrics::Metrics; +use crate::protocol2::WithClientIp; +use crate::proxy::run_until_cancelled; use crate::rate_limiter::EndpointRateLimiter; use crate::serverless::backend::PoolingBackend; -use hyper::{ - server::conn::{AddrIncoming, AddrStream}, - Body, Method, Request, Response, -}; +use crate::serverless::http_util::{api_error_into_response, json_response}; -use std::net::IpAddr; +use std::net::{IpAddr, SocketAddr}; +use std::pin::pin; use std::sync::Arc; -use std::task::Poll; -use tls_listener::TlsListener; -use tokio::net::TcpListener; -use tokio_util::sync::{CancellationToken, DropGuard}; +use tokio::net::{TcpListener, TcpStream}; +use tokio_util::sync::CancellationToken; use tracing::{error, info, warn, Instrument}; -use utils::http::{error::ApiError, json::json_response}; +use utils::http::error::ApiError; pub const SERVERLESS_DRIVER_SNI: &str = "api"; @@ -91,161 +97,175 @@ pub async fn task_main( tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()]; let tls_acceptor: tokio_rustls::TlsAcceptor = Arc::new(tls_server_config).into(); - let mut addr_incoming = AddrIncoming::from_listener(ws_listener)?; - let _ = addr_incoming.set_nodelay(true); - let addr_incoming = ProxyProtocolAccept { - incoming: addr_incoming, - protocol: "http", - }; + let connections = tokio_util::task::task_tracker::TaskTracker::new(); + connections.close(); // allows `connections.wait to complete` - let ws_connections = tokio_util::task::task_tracker::TaskTracker::new(); - ws_connections.close(); // allows `ws_connections.wait to complete` + let server = Builder::new(hyper_util::rt::TokioExecutor::new()); - let tls_listener = TlsListener::new(tls_acceptor, addr_incoming, config.handshake_timeout); + while let Some(res) = run_until_cancelled(ws_listener.accept(), &cancellation_token).await { + let (conn, peer_addr) = res.context("could not accept TCP stream")?; + if let Err(e) = conn.set_nodelay(true) { + tracing::error!("could not set nodelay: {e}"); + continue; + } + let conn_id = uuid::Uuid::new_v4(); + let http_conn_span = tracing::info_span!("http_conn", ?conn_id); - let make_svc = hyper::service::make_service_fn( - |stream: &tokio_rustls::server::TlsStream< - WithConnectionGuard>, - >| { - let (conn, _) = stream.get_ref(); + connections.spawn( + connection_handler( + config, + backend.clone(), + connections.clone(), + cancellation_handler.clone(), + endpoint_rate_limiter.clone(), + cancellation_token.clone(), + server.clone(), + tls_acceptor.clone(), + conn, + peer_addr, + ) + .instrument(http_conn_span), + ); + } - // this is jank. should dissapear with hyper 1.0 migration. - let gauge = conn - .gauge - .lock() - .expect("lock should not be poisoned") - .take() - .expect("gauge should be set on connection start"); - - // Cancel all current inflight HTTP requests if the HTTP connection is closed. - let http_cancellation_token = CancellationToken::new(); - let cancel_connection = http_cancellation_token.clone().drop_guard(); - - let span = conn.span.clone(); - let client_addr = conn.inner.client_addr(); - let remote_addr = conn.inner.inner.remote_addr(); - let backend = backend.clone(); - let ws_connections = ws_connections.clone(); - let endpoint_rate_limiter = endpoint_rate_limiter.clone(); - let cancellation_handler = cancellation_handler.clone(); - async move { - let peer_addr = match client_addr { - Some(addr) => addr, - None if config.require_client_ip => bail!("missing required client ip"), - None => remote_addr, - }; - Ok(MetricService::new( - hyper::service::service_fn(move |req: Request| { - let backend = backend.clone(); - let ws_connections2 = ws_connections.clone(); - let endpoint_rate_limiter = endpoint_rate_limiter.clone(); - let cancellation_handler = cancellation_handler.clone(); - let http_cancellation_token = http_cancellation_token.child_token(); - - // `request_handler` is not cancel safe. It expects to be cancelled only at specific times. - // By spawning the future, we ensure it never gets cancelled until it decides to. - ws_connections.spawn( - async move { - // Cancel the current inflight HTTP request if the requets stream is closed. - // This is slightly different to `_cancel_connection` in that - // h2 can cancel individual requests with a `RST_STREAM`. - let _cancel_session = http_cancellation_token.clone().drop_guard(); - - let res = request_handler( - req, - config, - backend, - ws_connections2, - cancellation_handler, - peer_addr.ip(), - endpoint_rate_limiter, - http_cancellation_token, - ) - .await - .map_or_else(|e| e.into_response(), |r| r); - - _cancel_session.disarm(); - - res - } - .in_current_span(), - ) - }), - gauge, - cancel_connection, - span, - )) - } - }, - ); - - hyper::Server::builder(tls_listener) - .serve(make_svc) - .with_graceful_shutdown(cancellation_token.cancelled()) - .await?; - - // await websocket connections - ws_connections.wait().await; + connections.wait().await; Ok(()) } -struct MetricService { - inner: S, - _gauge: IntCounterPairGuard, - _cancel: DropGuard, - span: tracing::Span, -} +/// Handles the TCP lifecycle. +/// +/// 1. Parses PROXY protocol V2 +/// 2. Handles TLS handshake +/// 3. Handles HTTP connection +/// 1. With graceful shutdowns +/// 2. With graceful request cancellation with connection failure +/// 3. With websocket upgrade support. +#[allow(clippy::too_many_arguments)] +async fn connection_handler( + config: &'static ProxyConfig, + backend: Arc, + connections: TaskTracker, + cancellation_handler: Arc, + endpoint_rate_limiter: Arc, + cancellation_token: CancellationToken, + server: Builder, + tls_acceptor: TlsAcceptor, + conn: TcpStream, + peer_addr: SocketAddr, +) { + let session_id = uuid::Uuid::new_v4(); -impl MetricService { - fn new( - inner: S, - _gauge: IntCounterPairGuard, - _cancel: DropGuard, - span: tracing::Span, - ) -> MetricService { - MetricService { - inner, - _gauge, - _cancel, - span, + let _gauge = Metrics::get() + .proxy + .client_connections + .guard(crate::metrics::Protocol::Http); + + // handle PROXY protocol + let mut conn = WithClientIp::new(conn); + let peer = match conn.wait_for_addr().await { + Ok(peer) => peer, + Err(e) => { + tracing::error!(?session_id, %peer_addr, "failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}"); + return; } - } -} + }; -impl hyper::service::Service> for MetricService -where - S: hyper::service::Service>, -{ - type Response = S::Response; - type Error = S::Error; - type Future = Instrumented; + let peer_addr = peer.unwrap_or(peer_addr).ip(); + info!(?session_id, %peer_addr, "accepted new TCP connection"); - fn poll_ready(&mut self, cx: &mut std::task::Context<'_>) -> Poll> { - self.inner.poll_ready(cx) - } + // try upgrade to TLS, but with a timeout. + let conn = match timeout(config.handshake_timeout, tls_acceptor.accept(conn)).await { + Ok(Ok(conn)) => { + info!(?session_id, %peer_addr, "accepted new TLS connection"); + conn + } + // The handshake failed + Ok(Err(e)) => { + Metrics::get().proxy.tls_handshake_failures.inc(); + warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}"); + return; + } + // The handshake timed out + Err(e) => { + Metrics::get().proxy.tls_handshake_failures.inc(); + warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}"); + return; + } + }; - fn call(&mut self, req: Request) -> Self::Future { - self.span - .in_scope(|| self.inner.call(req)) - .instrument(self.span.clone()) + let session_id = AtomicTake::new(session_id); + + // Cancel all current inflight HTTP requests if the HTTP connection is closed. + let http_cancellation_token = CancellationToken::new(); + let _cancel_connection = http_cancellation_token.clone().drop_guard(); + + let conn = server.serve_connection_with_upgrades( + hyper_util::rt::TokioIo::new(conn), + hyper1::service::service_fn(move |req: hyper1::Request| { + // First HTTP request shares the same session ID + let session_id = session_id.take().unwrap_or_else(uuid::Uuid::new_v4); + + // Cancel the current inflight HTTP request if the requets stream is closed. + // This is slightly different to `_cancel_connection` in that + // h2 can cancel individual requests with a `RST_STREAM`. + let http_request_token = http_cancellation_token.child_token(); + let cancel_request = http_request_token.clone().drop_guard(); + + // `request_handler` is not cancel safe. It expects to be cancelled only at specific times. + // By spawning the future, we ensure it never gets cancelled until it decides to. + let handler = connections.spawn( + request_handler( + req, + config, + backend.clone(), + connections.clone(), + cancellation_handler.clone(), + session_id, + peer_addr, + endpoint_rate_limiter.clone(), + http_request_token, + ) + .in_current_span() + .map_ok_or_else(api_error_into_response, |r| r), + ); + + async move { + let res = handler.await; + cancel_request.disarm(); + res + } + }), + ); + + // On cancellation, trigger the HTTP connection handler to shut down. + let res = match select(pin!(cancellation_token.cancelled()), pin!(conn)).await { + Either::Left((_cancelled, mut conn)) => { + conn.as_mut().graceful_shutdown(); + conn.await + } + Either::Right((res, _)) => res, + }; + + match res { + Ok(()) => tracing::info!(%peer_addr, "HTTP connection closed"), + Err(e) => tracing::warn!(%peer_addr, "HTTP connection error {e}"), } } #[allow(clippy::too_many_arguments)] async fn request_handler( - mut request: Request, + mut request: hyper1::Request, config: &'static ProxyConfig, backend: Arc, ws_connections: TaskTracker, cancellation_handler: Arc, + session_id: uuid::Uuid, peer_addr: IpAddr, endpoint_rate_limiter: Arc, // used to cancel in-flight HTTP requests. not used to cancel websockets http_cancellation_token: CancellationToken, -) -> Result, ApiError> { - let session_id = uuid::Uuid::new_v4(); - +) -> Result>, ApiError> { let host = request .headers() .get("host") @@ -255,7 +275,13 @@ async fn request_handler( // Check if the request is a websocket upgrade request. if hyper_tungstenite::is_upgrade_request(&request) { - let ctx = RequestMonitoring::new(session_id, peer_addr, "ws", &config.region); + let ctx = RequestMonitoring::new( + session_id, + peer_addr, + crate::metrics::Protocol::Ws, + &config.region, + ); + let span = ctx.span.clone(); info!(parent: &span, "performing websocket upgrade"); @@ -282,14 +308,19 @@ async fn request_handler( // Return the response so the spawned future can continue. Ok(response) - } else if request.uri().path() == "/sql" && request.method() == Method::POST { - let ctx = RequestMonitoring::new(session_id, peer_addr, "http", &config.region); + } else if request.uri().path() == "/sql" && *request.method() == Method::POST { + let ctx = RequestMonitoring::new( + session_id, + peer_addr, + crate::metrics::Protocol::Http, + &config.region, + ); let span = ctx.span.clone(); sql_over_http::handle(config, ctx, request, backend, http_cancellation_token) .instrument(span) .await - } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS { + } else if request.uri().path() == "/sql" && *request.method() == Method::OPTIONS { Response::builder() .header("Allow", "OPTIONS, POST") .header("Access-Control-Allow-Origin", "*") @@ -299,7 +330,7 @@ async fn request_handler( ) .header("Access-Control-Max-Age", "86400" /* 24 hours */) .status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code - .body(Body::empty()) + .body(Full::new(Bytes::new())) .map_err(|e| ApiError::InternalServerError(e.into())) } else { json_response(StatusCode::BAD_REQUEST, "query is not supported") diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 35311facb8..798e488509 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -1,6 +1,5 @@ use dashmap::DashMap; use futures::{future::poll_fn, Future}; -use metrics::IntCounterPairGuard; use parking_lot::RwLock; use rand::Rng; use smallvec::SmallVec; @@ -16,13 +15,13 @@ use std::{ use tokio::time::Instant; use tokio_postgres::tls::NoTlsStream; use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; +use tokio_util::sync::CancellationToken; use crate::console::messages::{ColdStartInfo, MetricsAuxInfo}; -use crate::metrics::{ENDPOINT_POOLS, GC_LATENCY, NUM_OPEN_CLIENTS_IN_HTTP_POOL}; +use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; use crate::{ - auth::backend::ComputeUserInfo, context::RequestMonitoring, metrics::NUM_DB_CONNECTIONS_GAUGE, - DbName, EndpointCacheKey, RoleName, + auth::backend::ComputeUserInfo, context::RequestMonitoring, DbName, EndpointCacheKey, RoleName, }; use tracing::{debug, error, warn, Span}; @@ -78,7 +77,7 @@ pub struct EndpointConnPool { pools: HashMap<(DbName, RoleName), DbUserConnPool>, total_conns: usize, max_conns: usize, - _guard: IntCounterPairGuard, + _guard: HttpEndpointPoolsGuard<'static>, global_connections_count: Arc, global_pool_size_max_conns: usize, } @@ -110,7 +109,11 @@ impl EndpointConnPool { let removed = old_len - new_len; if removed > 0 { global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); - NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(removed as i64); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(removed as i64); } *total_conns -= removed; removed > 0 @@ -156,7 +159,11 @@ impl EndpointConnPool { pool.total_conns += 1; pool.global_connections_count .fetch_add(1, atomic::Ordering::Relaxed); - NUM_OPEN_CLIENTS_IN_HTTP_POOL.inc(); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .inc(); } pool.total_conns @@ -176,7 +183,11 @@ impl Drop for EndpointConnPool { if self.total_conns > 0 { self.global_connections_count .fetch_sub(self.total_conns, atomic::Ordering::Relaxed); - NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(self.total_conns as i64); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(self.total_conns as i64); } } } @@ -215,7 +226,11 @@ impl DbUserConnPool { removed += 1; } global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); - NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(removed as i64); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(removed as i64); conn } } @@ -303,7 +318,10 @@ impl GlobalConnPool { // acquire a random shard lock let mut shard = self.global_pool.shards()[shard].write(); - let timer = GC_LATENCY.start_timer(); + let timer = Metrics::get() + .proxy + .http_pool_reclaimation_lag_seconds + .start_timer(); let current_len = shard.len(); let mut clients_removed = 0; shard.retain(|endpoint, x| { @@ -331,7 +349,7 @@ impl GlobalConnPool { let new_len = shard.len(); drop(shard); - timer.observe_duration(); + timer.observe(); // Do logging outside of the lock. if clients_removed > 0 { @@ -339,7 +357,11 @@ impl GlobalConnPool { .global_connections_count .fetch_sub(clients_removed, atomic::Ordering::Relaxed) - clients_removed; - NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(clients_removed as i64); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(clients_removed as i64); info!("pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}"); } let removed = current_len - new_len; @@ -410,7 +432,7 @@ impl GlobalConnPool { pools: HashMap::new(), total_conns: 0, max_conns: self.config.pool_options.max_conns_per_endpoint, - _guard: ENDPOINT_POOLS.guard(), + _guard: Metrics::get().proxy.http_endpoint_pools.guard(), global_connections_count: self.global_connections_count.clone(), global_pool_size_max_conns: self.config.pool_options.max_total_conns, })); @@ -450,9 +472,7 @@ pub fn poll_client( conn_id: uuid::Uuid, aux: MetricsAuxInfo, ) -> Client { - let conn_gauge = NUM_DB_CONNECTIONS_GAUGE - .with_label_values(&[ctx.protocol]) - .guard(); + let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol); let mut session_id = ctx.session_id; let (tx, mut rx) = tokio::sync::watch::channel(session_id); @@ -469,15 +489,32 @@ pub fn poll_client( let db_user = conn_info.db_and_user(); let idle = global_pool.get_idle_timeout(); + let cancel = CancellationToken::new(); + let cancelled = cancel.clone().cancelled_owned(); + tokio::spawn( async move { let _conn_gauge = conn_gauge; let mut idle_timeout = pin!(tokio::time::sleep(idle)); + let mut cancelled = pin!(cancelled); + poll_fn(move |cx| { - if matches!(rx.has_changed(), Ok(true)) { - session_id = *rx.borrow_and_update(); - info!(%session_id, "changed session"); - idle_timeout.as_mut().reset(Instant::now() + idle); + if cancelled.as_mut().poll(cx).is_ready() { + info!("connection dropped"); + return Poll::Ready(()) + } + + match rx.has_changed() { + Ok(true) => { + session_id = *rx.borrow_and_update(); + info!(%session_id, "changed session"); + idle_timeout.as_mut().reset(Instant::now() + idle); + } + Err(_) => { + info!("connection dropped"); + return Poll::Ready(()) + } + _ => {} } // 5 minute idle connection timeout @@ -532,6 +569,7 @@ pub fn poll_client( let inner = ClientInner { inner: client, session: tx, + cancel, aux, conn_id, }; @@ -541,10 +579,18 @@ pub fn poll_client( struct ClientInner { inner: C, session: tokio::sync::watch::Sender, + cancel: CancellationToken, aux: MetricsAuxInfo, conn_id: uuid::Uuid, } +impl Drop for ClientInner { + fn drop(&mut self) { + // on client drop, tell the conn to shut down + self.cancel.cancel(); + } +} + pub trait ClientInnerExt: Sync + Send + 'static { fn is_closed(&self) -> bool; fn get_process_id(&self) -> i32; @@ -697,6 +743,7 @@ mod tests { ClientInner { inner: client, session: tokio::sync::watch::Sender::new(uuid::Uuid::new_v4()), + cancel: CancellationToken::new(), aux: MetricsAuxInfo { endpoint_id: (&EndpointId::from("endpoint")).into(), project_id: (&ProjectId::from("project")).into(), diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs new file mode 100644 index 0000000000..ab9127b13e --- /dev/null +++ b/proxy/src/serverless/http_util.rs @@ -0,0 +1,92 @@ +//! Things stolen from `libs/utils/src/http` to add hyper 1.0 compatibility +//! Will merge back in at some point in the future. + +use bytes::Bytes; + +use anyhow::Context; +use http::{Response, StatusCode}; +use http_body_util::Full; + +use serde::Serialize; +use utils::http::error::ApiError; + +/// Like [`ApiError::into_response`] +pub fn api_error_into_response(this: ApiError) -> Response> { + match this { + ApiError::BadRequest(err) => HttpErrorBody::response_from_msg_and_status( + format!("{err:#?}"), // use debug printing so that we give the cause + StatusCode::BAD_REQUEST, + ), + ApiError::Forbidden(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::FORBIDDEN) + } + ApiError::Unauthorized(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::UNAUTHORIZED) + } + ApiError::NotFound(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::NOT_FOUND) + } + ApiError::Conflict(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::CONFLICT) + } + ApiError::PreconditionFailed(_) => HttpErrorBody::response_from_msg_and_status( + this.to_string(), + StatusCode::PRECONDITION_FAILED, + ), + ApiError::ShuttingDown => HttpErrorBody::response_from_msg_and_status( + "Shutting down".to_string(), + StatusCode::SERVICE_UNAVAILABLE, + ), + ApiError::ResourceUnavailable(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::SERVICE_UNAVAILABLE, + ), + ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::REQUEST_TIMEOUT, + ), + ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::INTERNAL_SERVER_ERROR, + ), + } +} + +/// Same as [`utils::http::error::HttpErrorBody`] +#[derive(Serialize)] +struct HttpErrorBody { + pub msg: String, +} + +impl HttpErrorBody { + /// Same as [`utils::http::error::HttpErrorBody::response_from_msg_and_status`] + fn response_from_msg_and_status(msg: String, status: StatusCode) -> Response> { + HttpErrorBody { msg }.to_response(status) + } + + /// Same as [`utils::http::error::HttpErrorBody::to_response`] + fn to_response(&self, status: StatusCode) -> Response> { + Response::builder() + .status(status) + .header(http::header::CONTENT_TYPE, "application/json") + // we do not have nested maps with non string keys so serialization shouldn't fail + .body(Full::new(Bytes::from(serde_json::to_string(self).unwrap()))) + .unwrap() + } +} + +/// Same as [`utils::http::json::json_response`] +pub fn json_response( + status: StatusCode, + data: T, +) -> Result>, ApiError> { + let json = serde_json::to_string(&data) + .context("Failed to serialize JSON response") + .map_err(ApiError::InternalServerError)?; + let response = Response::builder() + .status(status) + .header(http::header::CONTENT_TYPE, "application/json") + .body(Full::new(Bytes::from(json))) + .map_err(|e| ApiError::InternalServerError(e.into()))?; + Ok(response) +} diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 00dffd5784..a66edb2c66 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -1,18 +1,22 @@ use std::pin::pin; use std::sync::Arc; +use bytes::Bytes; use futures::future::select; use futures::future::try_join; use futures::future::Either; use futures::StreamExt; use futures::TryFutureExt; -use hyper::body::HttpBody; -use hyper::header; -use hyper::http::HeaderName; -use hyper::http::HeaderValue; -use hyper::Response; -use hyper::StatusCode; -use hyper::{Body, HeaderMap, Request}; +use http_body_util::BodyExt; +use http_body_util::Full; +use hyper1::body::Body; +use hyper1::body::Incoming; +use hyper1::header; +use hyper1::http::HeaderName; +use hyper1::http::HeaderValue; +use hyper1::Response; +use hyper1::StatusCode; +use hyper1::{HeaderMap, Request}; use serde_json::json; use serde_json::Value; use tokio::time; @@ -29,7 +33,6 @@ use tracing::error; use tracing::info; use url::Url; use utils::http::error::ApiError; -use utils::http::json::json_response; use crate::auth::backend::ComputeUserInfo; use crate::auth::endpoint_sni; @@ -40,8 +43,8 @@ use crate::context::RequestMonitoring; use crate::error::ErrorKind; use crate::error::ReportableError; use crate::error::UserFacingError; -use crate::metrics::HTTP_CONTENT_LENGTH; -use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE; +use crate::metrics::HttpDirection; +use crate::metrics::Metrics; use crate::proxy::run_until_cancelled; use crate::proxy::NeonOptions; use crate::serverless::backend::HttpConnError; @@ -52,6 +55,7 @@ use crate::RoleName; use super::backend::PoolingBackend; use super::conn_pool::Client; use super::conn_pool::ConnInfo; +use super::http_util::json_response; use super::json::json_to_pg_text; use super::json::pg_text_row_to_json; use super::json::JsonConversionError; @@ -218,10 +222,10 @@ fn get_conn_info( pub async fn handle( config: &'static ProxyConfig, mut ctx: RequestMonitoring, - request: Request, + request: Request, backend: Arc, cancel: CancellationToken, -) -> Result, ApiError> { +) -> Result>, ApiError> { let result = handle_inner(cancel, config, &mut ctx, request, backend).await; let mut response = match result { @@ -332,10 +336,9 @@ pub async fn handle( } }; - response.headers_mut().insert( - "Access-Control-Allow-Origin", - hyper::http::HeaderValue::from_static("*"), - ); + response + .headers_mut() + .insert("Access-Control-Allow-Origin", HeaderValue::from_static("*")); Ok(response) } @@ -396,7 +399,7 @@ impl UserFacingError for SqlOverHttpError { #[derive(Debug, thiserror::Error)] pub enum ReadPayloadError { #[error("could not read the HTTP request body: {0}")] - Read(#[from] hyper::Error), + Read(#[from] hyper1::Error), #[error("could not parse the HTTP request body: {0}")] Parse(#[from] serde_json::Error), } @@ -437,7 +440,7 @@ struct HttpHeaders { } impl HttpHeaders { - fn try_parse(headers: &hyper::http::HeaderMap) -> Result { + fn try_parse(headers: &hyper1::http::HeaderMap) -> Result { // Determine the output options. Default behaviour is 'false'. Anything that is not // strictly 'true' assumed to be false. let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE); @@ -488,13 +491,14 @@ async fn handle_inner( cancel: CancellationToken, config: &'static ProxyConfig, ctx: &mut RequestMonitoring, - request: Request, + request: Request, backend: Arc, -) -> Result, SqlOverHttpError> { - let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE - .with_label_values(&[ctx.protocol]) - .guard(); - info!("handling interactive connection from client"); +) -> Result>, SqlOverHttpError> { + let _requeset_gauge = Metrics::get().proxy.connection_requests.guard(ctx.protocol); + info!( + protocol = %ctx.protocol, + "handling interactive connection from client" + ); // // Determine the destination and connection params @@ -517,9 +521,10 @@ async fn handle_inner( None => MAX_REQUEST_SIZE + 1, }; info!(request_content_length, "request size in bytes"); - HTTP_CONTENT_LENGTH - .with_label_values(&["request"]) - .observe(request_content_length as f64); + Metrics::get() + .proxy + .http_conn_content_length_bytes + .observe(HttpDirection::Request, request_content_length as f64); // we don't have a streaming request support yet so this is to prevent OOM // from a malicious user sending an extremely large request body @@ -528,7 +533,7 @@ async fn handle_inner( } let fetch_and_process_request = async { - let body = hyper::body::to_bytes(request.into_body()).await?; + let body = request.into_body().collect().await?.to_bytes(); info!(length = body.len(), "request payload read"); let payload: Payload = serde_json::from_slice(&body)?; Ok::(payload) // Adjust error type accordingly @@ -596,7 +601,7 @@ async fn handle_inner( let body = serde_json::to_string(&result).expect("json serialization should not fail"); let len = body.len(); let response = response - .body(Body::from(body)) + .body(Full::new(Bytes::from(body))) // only fails if invalid status code or invalid header/values are given. // these are not user configurable so it cannot fail dynamically .expect("building response payload should not fail"); @@ -604,9 +609,10 @@ async fn handle_inner( // count the egress bytes - we miss the TLS and header overhead but oh well... // moving this later in the stack is going to be a lot of effort and ehhhh metrics.record_egress(len as u64); - HTTP_CONTENT_LENGTH - .with_label_values(&["response"]) - .observe(len as f64); + Metrics::get() + .proxy + .http_conn_content_length_bytes + .observe(HttpDirection::Response, len as f64); Ok(response) } @@ -639,6 +645,7 @@ impl QueryData { } // The query was cancelled. Either::Right((_cancelled, query)) => { + tracing::info!("cancelling query"); if let Err(err) = cancel_token.cancel_query(NoTls).await { tracing::error!(?err, "could not cancel query"); } diff --git a/proxy/src/serverless/tls_listener.rs b/proxy/src/serverless/tls_listener.rs deleted file mode 100644 index 33f194dd59..0000000000 --- a/proxy/src/serverless/tls_listener.rs +++ /dev/null @@ -1,123 +0,0 @@ -use std::{ - convert::Infallible, - pin::Pin, - task::{Context, Poll}, - time::Duration, -}; - -use hyper::server::{accept::Accept, conn::AddrStream}; -use pin_project_lite::pin_project; -use tokio::{ - io::{AsyncRead, AsyncWrite}, - task::JoinSet, - time::timeout, -}; -use tokio_rustls::{server::TlsStream, TlsAcceptor}; -use tracing::{info, warn, Instrument}; - -use crate::{ - metrics::TLS_HANDSHAKE_FAILURES, - protocol2::{WithClientIp, WithConnectionGuard}, -}; - -pin_project! { - /// Wraps a `Stream` of connections (such as a TCP listener) so that each connection is itself - /// encrypted using TLS. - pub(crate) struct TlsListener { - #[pin] - listener: A, - tls: TlsAcceptor, - waiting: JoinSet>>, - timeout: Duration, - } -} - -impl TlsListener { - /// Create a `TlsListener` with default options. - pub(crate) fn new(tls: TlsAcceptor, listener: A, timeout: Duration) -> Self { - TlsListener { - listener, - tls, - waiting: JoinSet::new(), - timeout, - } - } -} - -impl Accept for TlsListener -where - A: Accept>>, - A::Error: std::error::Error, - A::Conn: AsyncRead + AsyncWrite + Unpin + Send + 'static, -{ - type Conn = TlsStream; - - type Error = Infallible; - - fn poll_accept( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll>> { - let mut this = self.project(); - - loop { - match this.listener.as_mut().poll_accept(cx) { - Poll::Pending => break, - Poll::Ready(Some(Ok(mut conn))) => { - let t = *this.timeout; - let tls = this.tls.clone(); - let span = conn.span.clone(); - this.waiting.spawn(async move { - let peer_addr = match conn.inner.wait_for_addr().await { - Ok(Some(addr)) => addr, - Err(e) => { - tracing::error!("failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}"); - return None; - } - Ok(None) => conn.inner.inner.remote_addr() - }; - - let accept = tls.accept(conn); - match timeout(t, accept).await { - Ok(Ok(conn)) => { - info!(%peer_addr, "accepted new TLS connection"); - Some(conn) - }, - // The handshake failed, try getting another connection from the queue - Ok(Err(e)) => { - TLS_HANDSHAKE_FAILURES.inc(); - warn!(%peer_addr, "failed to accept TLS connection: {e:?}"); - None - } - // The handshake timed out, try getting another connection from the queue - Err(_) => { - TLS_HANDSHAKE_FAILURES.inc(); - warn!(%peer_addr, "failed to accept TLS connection: timeout"); - None - } - } - }.instrument(span)); - } - Poll::Ready(Some(Err(e))) => { - tracing::error!("error accepting TCP connection: {e}"); - continue; - } - Poll::Ready(None) => return Poll::Ready(None), - } - } - - loop { - return match this.waiting.poll_join_next(cx) { - Poll::Ready(Some(Ok(Some(conn)))) => Poll::Ready(Some(Ok(conn))), - // The handshake failed to complete, try getting another connection from the queue - Poll::Ready(Some(Ok(None))) => continue, - // The handshake panicked or was cancelled. ignore and get another connection - Poll::Ready(Some(Err(e))) => { - tracing::warn!("handshake aborted: {e}"); - continue; - } - _ => Poll::Pending, - }; - } - } -} diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index ada6c974f4..d054877126 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -3,7 +3,7 @@ use crate::{ config::ProxyConfig, context::RequestMonitoring, error::{io_error, ReportableError}, - metrics::NUM_CLIENT_CONNECTION_GAUGE, + metrics::Metrics, proxy::{handle_client, ClientMode}, rate_limiter::EndpointRateLimiter, }; @@ -139,9 +139,10 @@ pub async fn serve_websocket( endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { let websocket = websocket.await?; - let conn_gauge = NUM_CLIENT_CONNECTION_GAUGE - .with_label_values(&["ws"]) - .guard(); + let conn_gauge = Metrics::get() + .proxy + .client_connections + .guard(crate::metrics::Protocol::Ws); let res = handle_client( config, diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index b6b7a85659..fdd2be3ee5 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -1,6 +1,6 @@ use crate::config::TlsServerEndPoint; use crate::error::{ErrorKind, ReportableError, UserFacingError}; -use crate::metrics::TLS_HANDSHAKE_FAILURES; +use crate::metrics::Metrics; use bytes::BytesMut; use pq_proto::framed::{ConnectionError, Framed}; @@ -228,7 +228,7 @@ impl Stream { Stream::Raw { raw } => Ok(tokio_rustls::TlsAcceptor::from(cfg) .accept(raw) .await - .inspect_err(|_| TLS_HANDSHAKE_FAILURES.inc())?), + .inspect_err(|_| Metrics::get().proxy.tls_handshake_failures.inc())?), Stream::Tls { .. } => Err(StreamUpgradeError::AlreadyTls), } } diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py index 853c67d218..878840fcee 100755 --- a/scripts/flaky_tests.py +++ b/scripts/flaky_tests.py @@ -15,8 +15,7 @@ FLAKY_TESTS_QUERY = """ DISTINCT parent_suite, suite, name FROM results WHERE - started_at > CURRENT_DATE - INTERVAL '10' day - AND started_at > '2024-03-11 14:50:11.845+00' -- we switched the default PAGESERVER_VIRTUAL_FILE_IO_ENGINE to `tokio-epoll-uring` from `std-fs` on this date, we want to ignore the flaky tests for `std-fs` + started_at > CURRENT_DATE - INTERVAL '%s' day AND ( (status IN ('failed', 'broken') AND reference = 'refs/heads/main') OR flaky diff --git a/scripts/sk_cleanup_tenants/script.py b/scripts/sk_cleanup_tenants/script.py index fa22433614..c20a4bb830 100644 --- a/scripts/sk_cleanup_tenants/script.py +++ b/scripts/sk_cleanup_tenants/script.py @@ -22,7 +22,7 @@ parser.add_argument("--safekeeper-host", required=True, type=str) args = parser.parse_args() access_key = os.getenv("CONSOLE_API_TOKEN") -endpoint: str = "https://console.stage.neon.tech/api" +endpoint: str = "https://console-stage.neon.build/api" trash_dir: Path = args.trash_dir dry_run: bool = args.dry_run diff --git a/scripts/sk_collect_dumps/readme.md b/scripts/sk_collect_dumps/readme.md index 7494a6cb78..5ae55e058b 100644 --- a/scripts/sk_collect_dumps/readme.md +++ b/scripts/sk_collect_dumps/readme.md @@ -3,7 +3,7 @@ 3. Issue admin token (add/remove .stage from url for staging/prod and setting proper API key): ``` # staging: -AUTH_TOKEN=$(curl https://console.stage.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_STAGING_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt') +AUTH_TOKEN=$(curl https://console-stage.neon.build/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_STAGING_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt') # prod: AUTH_TOKEN=$(curl https://console.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_PROD_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt') # check diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index eb0c4472e4..1ed8998713 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -17,6 +17,8 @@ use crate::service::Config; const SLOWDOWN_DELAY: Duration = Duration::from_secs(5); +const NOTIFY_REQUEST_TIMEOUT: Duration = Duration::from_secs(10); + pub(crate) const API_CONCURRENCY: usize = 32; struct UnshardedComputeHookTenant { @@ -242,6 +244,10 @@ pub(super) struct ComputeHook { // This lock is only used in testing enviroments, to serialize calls into neon_lock neon_local_lock: tokio::sync::Mutex<()>, + + // We share a client across all notifications to enable connection re-use etc when + // sending large numbers of notifications + client: reqwest::Client, } impl ComputeHook { @@ -251,12 +257,18 @@ impl ComputeHook { .clone() .map(|jwt| format!("Bearer {}", jwt)); + let client = reqwest::ClientBuilder::new() + .timeout(NOTIFY_REQUEST_TIMEOUT) + .build() + .expect("Failed to construct HTTP client"); + Self { state: Default::default(), config, authorization_header, neon_local_lock: Default::default(), api_concurrency: tokio::sync::Semaphore::new(API_CONCURRENCY), + client, } } @@ -310,12 +322,11 @@ impl ComputeHook { async fn do_notify_iteration( &self, - client: &reqwest::Client, url: &String, reconfigure_request: &ComputeHookNotifyRequest, cancel: &CancellationToken, ) -> Result<(), NotifyError> { - let req = client.request(Method::PUT, url); + let req = self.client.request(Method::PUT, url); let req = if let Some(value) = &self.authorization_header { req.header(reqwest::header::AUTHORIZATION, value) } else { @@ -381,8 +392,6 @@ impl ComputeHook { reconfigure_request: &ComputeHookNotifyRequest, cancel: &CancellationToken, ) -> Result<(), NotifyError> { - let client = reqwest::Client::new(); - // We hold these semaphore units across all retries, rather than only across each // HTTP request: this is to preserve fairness and avoid a situation where a retry might // time out waiting for a semaphore. @@ -394,7 +403,7 @@ impl ComputeHook { .map_err(|_| NotifyError::ShuttingDown)?; backoff::retry( - || self.do_notify_iteration(&client, url, reconfigure_request, cancel), + || self.do_notify_iteration(url, reconfigure_request, cancel), |e| { matches!( e, diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index c59bcaa174..2e83bbc5ed 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -8,6 +8,7 @@ use futures::Future; use hyper::header::CONTENT_TYPE; use hyper::{Body, Request, Response}; use hyper::{StatusCode, Uri}; +use metrics::{BuildInfo, NeonMetrics}; use pageserver_api::models::{ TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest, TenantTimeTravelRequest, TimelineCreateRequest, @@ -44,15 +45,19 @@ use control_plane::storage_controller::{AttachHookRequest, InspectRequest}; use routerify::Middleware; /// State available to HTTP request handlers -#[derive(Clone)] pub struct HttpState { service: Arc, auth: Option>, + neon_metrics: NeonMetrics, allowlist_routes: Vec, } impl HttpState { - pub fn new(service: Arc, auth: Option>) -> Self { + pub fn new( + service: Arc, + auth: Option>, + build_info: BuildInfo, + ) -> Self { let allowlist_routes = ["/status", "/ready", "/metrics"] .iter() .map(|v| v.parse().unwrap()) @@ -60,6 +65,7 @@ impl HttpState { Self { service, auth, + neon_metrics: NeonMetrics::new(build_info), allowlist_routes, } } @@ -672,10 +678,11 @@ fn epilogue_metrics_middleware }) } -pub async fn measured_metrics_handler(_req: Request) -> Result, ApiError> { +pub async fn measured_metrics_handler(req: Request) -> Result, ApiError> { pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4"; - let payload = crate::metrics::METRICS_REGISTRY.encode(); + let state = get_state(&req); + let payload = crate::metrics::METRICS_REGISTRY.encode(&state.neon_metrics); let response = Response::builder() .status(200) .header(CONTENT_TYPE, TEXT_FORMAT) @@ -704,6 +711,7 @@ where pub fn make_router( service: Arc, auth: Option>, + build_info: BuildInfo, ) -> RouterBuilder { let mut router = endpoint::make_router() .middleware(prologue_metrics_middleware()) @@ -720,7 +728,7 @@ pub fn make_router( } router - .data(Arc::new(HttpState::new(service, auth))) + .data(Arc::new(HttpState::new(service, auth, build_info))) .get("/metrics", |r| { named_request_span(r, measured_metrics_handler, RequestName("metrics")) }) diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 3c03d6efe8..6466b9f7a3 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -3,6 +3,7 @@ use camino::Utf8PathBuf; use clap::Parser; use diesel::Connection; use metrics::launch_timestamp::LaunchTimestamp; +use metrics::BuildInfo; use std::sync::Arc; use storage_controller::http::make_router; use storage_controller::metrics::preinitialize_metrics; @@ -192,6 +193,11 @@ async fn async_main() -> anyhow::Result<()> { args.listen ); + let build_info = BuildInfo { + revision: GIT_VERSION, + build_tag: BUILD_TAG, + }; + let strict_mode = if args.dev { StrictMode::Dev } else { @@ -253,7 +259,7 @@ async fn async_main() -> anyhow::Result<()> { let auth = secrets .public_key .map(|jwt_auth| Arc::new(SwappableJwtAuth::new(jwt_auth))); - let router = make_router(service.clone(), auth) + let router = make_router(service.clone(), auth, build_info) .build() .map_err(|err| anyhow!(err))?; let router_service = utils::http::RouterService::new(router).unwrap(); diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index cabf416b9f..ac9f22c739 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -8,10 +8,8 @@ //! The rest of the code defines label group types and deals with converting outer types to labels. //! use bytes::Bytes; -use measured::{ - label::{LabelValue, StaticLabelSet}, - FixedCardinalityLabel, MetricGroup, -}; +use measured::{label::LabelValue, metric::histogram, FixedCardinalityLabel, MetricGroup}; +use metrics::NeonMetrics; use once_cell::sync::Lazy; use std::sync::Mutex; @@ -26,13 +24,15 @@ pub fn preinitialize_metrics() { pub(crate) struct StorageControllerMetrics { pub(crate) metrics_group: StorageControllerMetricGroup, - encoder: Mutex, + encoder: Mutex, } #[derive(measured::MetricGroup)] +#[metric(new())] pub(crate) struct StorageControllerMetricGroup { /// Count of how many times we spawn a reconcile task pub(crate) storage_controller_reconcile_spawn: measured::Counter, + /// Reconciler tasks completed, broken down by success/failure/cancelled pub(crate) storage_controller_reconcile_complete: measured::CounterVec, @@ -43,7 +43,9 @@ pub(crate) struct StorageControllerMetricGroup { /// HTTP request status counters for handled requests pub(crate) storage_controller_http_request_status: measured::CounterVec, + /// HTTP request handler latency across all status codes + #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] pub(crate) storage_controller_http_request_latency: measured::HistogramVec, @@ -55,6 +57,7 @@ pub(crate) struct StorageControllerMetricGroup { /// Latency of HTTP requests to the pageserver, broken down by pageserver /// node id, request name and method. This include both successful and unsuccessful /// requests. + #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] pub(crate) storage_controller_pageserver_request_latency: measured::HistogramVec, @@ -66,6 +69,7 @@ pub(crate) struct StorageControllerMetricGroup { /// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver /// node id, request name and method. This include both successful and unsuccessful /// requests. + #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] pub(crate) storage_controller_passthrough_request_latency: measured::HistogramVec, @@ -74,76 +78,34 @@ pub(crate) struct StorageControllerMetricGroup { measured::CounterVec, /// Latency of database queries, broken down by operation. + #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] pub(crate) storage_controller_database_query_latency: measured::HistogramVec, } impl StorageControllerMetrics { - pub(crate) fn encode(&self) -> Bytes { + pub(crate) fn encode(&self, neon_metrics: &NeonMetrics) -> Bytes { let mut encoder = self.encoder.lock().unwrap(); - self.metrics_group.collect_into(&mut *encoder); + neon_metrics + .collect_group_into(&mut *encoder) + .unwrap_or_else(|infallible| match infallible {}); + self.metrics_group + .collect_group_into(&mut *encoder) + .unwrap_or_else(|infallible| match infallible {}); encoder.finish() } } impl Default for StorageControllerMetrics { fn default() -> Self { - Self { - metrics_group: StorageControllerMetricGroup::new(), - encoder: Mutex::new(measured::text::TextEncoder::new()), - } - } -} + let mut metrics_group = StorageControllerMetricGroup::new(); + metrics_group + .storage_controller_reconcile_complete + .init_all_dense(); -impl StorageControllerMetricGroup { - pub(crate) fn new() -> Self { Self { - storage_controller_reconcile_spawn: measured::Counter::new(), - storage_controller_reconcile_complete: measured::CounterVec::new( - ReconcileCompleteLabelGroupSet { - status: StaticLabelSet::new(), - }, - ), - storage_controller_schedule_optimization: measured::Counter::new(), - storage_controller_http_request_status: measured::CounterVec::new( - HttpRequestStatusLabelGroupSet { - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - status: StaticLabelSet::new(), - }, - ), - storage_controller_http_request_latency: measured::HistogramVec::new( - measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), - ), - storage_controller_pageserver_request_error: measured::CounterVec::new( - PageserverRequestLabelGroupSet { - pageserver_id: lasso::ThreadedRodeo::new(), - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - }, - ), - storage_controller_pageserver_request_latency: measured::HistogramVec::new( - measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), - ), - storage_controller_passthrough_request_error: measured::CounterVec::new( - PageserverRequestLabelGroupSet { - pageserver_id: lasso::ThreadedRodeo::new(), - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - }, - ), - storage_controller_passthrough_request_latency: measured::HistogramVec::new( - measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), - ), - storage_controller_database_query_error: measured::CounterVec::new( - DatabaseQueryErrorLabelGroupSet { - operation: StaticLabelSet::new(), - error_type: StaticLabelSet::new(), - }, - ), - storage_controller_database_query_latency: measured::HistogramVec::new( - measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), - ), + metrics_group, + encoder: Mutex::new(measured::text::BufferedTextEncoder::new()), } } } @@ -157,7 +119,7 @@ pub(crate) struct ReconcileCompleteLabelGroup { #[derive(measured::LabelGroup)] #[label(set = HttpRequestStatusLabelGroupSet)] pub(crate) struct HttpRequestStatusLabelGroup<'a> { - #[label(dynamic_with = lasso::ThreadedRodeo)] + #[label(dynamic_with = lasso::ThreadedRodeo, default)] pub(crate) path: &'a str, pub(crate) method: Method, pub(crate) status: StatusCode, @@ -166,40 +128,21 @@ pub(crate) struct HttpRequestStatusLabelGroup<'a> { #[derive(measured::LabelGroup)] #[label(set = HttpRequestLatencyLabelGroupSet)] pub(crate) struct HttpRequestLatencyLabelGroup<'a> { - #[label(dynamic_with = lasso::ThreadedRodeo)] + #[label(dynamic_with = lasso::ThreadedRodeo, default)] pub(crate) path: &'a str, pub(crate) method: Method, } -impl Default for HttpRequestLatencyLabelGroupSet { - fn default() -> Self { - Self { - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - } - } -} - #[derive(measured::LabelGroup, Clone)] #[label(set = PageserverRequestLabelGroupSet)] pub(crate) struct PageserverRequestLabelGroup<'a> { - #[label(dynamic_with = lasso::ThreadedRodeo)] + #[label(dynamic_with = lasso::ThreadedRodeo, default)] pub(crate) pageserver_id: &'a str, - #[label(dynamic_with = lasso::ThreadedRodeo)] + #[label(dynamic_with = lasso::ThreadedRodeo, default)] pub(crate) path: &'a str, pub(crate) method: Method, } -impl Default for PageserverRequestLabelGroupSet { - fn default() -> Self { - Self { - pageserver_id: lasso::ThreadedRodeo::new(), - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - } - } -} - #[derive(measured::LabelGroup)] #[label(set = DatabaseQueryErrorLabelGroupSet)] pub(crate) struct DatabaseQueryErrorLabelGroup { @@ -213,7 +156,7 @@ pub(crate) struct DatabaseQueryLatencyLabelGroup { pub(crate) operation: DatabaseOperation, } -#[derive(FixedCardinalityLabel)] +#[derive(FixedCardinalityLabel, Clone, Copy)] pub(crate) enum ReconcileOutcome { #[label(rename = "ok")] Success, @@ -221,7 +164,7 @@ pub(crate) enum ReconcileOutcome { Cancel, } -#[derive(FixedCardinalityLabel, Clone)] +#[derive(FixedCardinalityLabel, Copy, Clone)] pub(crate) enum Method { Get, Put, @@ -246,11 +189,12 @@ impl From for Method { } } +#[derive(Clone, Copy)] pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode); impl LabelValue for StatusCode { fn visit(&self, v: V) -> V::Output { - v.write_int(self.0.as_u16() as u64) + v.write_int(self.0.as_u16() as i64) } } @@ -268,7 +212,7 @@ impl FixedCardinalityLabel for StatusCode { } } -#[derive(FixedCardinalityLabel)] +#[derive(FixedCardinalityLabel, Clone, Copy)] pub(crate) enum DatabaseErrorLabel { Query, Connection, diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 55fbfd10bc..5312e1e218 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -79,7 +79,7 @@ pub(crate) enum DatabaseError { Logical(String), } -#[derive(measured::FixedCardinalityLabel, Clone)] +#[derive(measured::FixedCardinalityLabel, Copy, Clone)] pub(crate) enum DatabaseOperation { InsertNode, UpdateNode, @@ -153,9 +153,7 @@ impl Persistence { let latency = &METRICS_REGISTRY .metrics_group .storage_controller_database_query_latency; - let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { - operation: op.clone(), - }); + let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { operation: op }); let res = self.with_conn(func).await; diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 208263a22a..ddad98a5fa 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -192,9 +192,6 @@ def test_backward_compatibility( assert not breaking_changes_allowed, "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" -# Forward compatibility is broken due to https://github.com/neondatabase/neon/pull/6530 -# The test is disabled until the next release deployment -@pytest.mark.xfail @check_ondisk_data_compatibility_if_enabled @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py index c7e1e88468..c5dc0f2919 100644 --- a/test_runner/regress/test_pageserver_layer_rolling.py +++ b/test_runner/regress/test_pageserver_layer_rolling.py @@ -1,6 +1,7 @@ import asyncio import os -from typing import Tuple +import time +from typing import Optional, Tuple import psutil import pytest @@ -20,20 +21,30 @@ ENTRIES_PER_TIMELINE = 10_000 CHECKPOINT_TIMEOUT_SECONDS = 60 -async def run_worker(env: NeonEnv, tenant_conf, entries: int) -> Tuple[TenantId, TimelineId, Lsn]: - tenant, timeline = env.neon_cli.create_tenant(conf=tenant_conf) +async def run_worker_for_tenant( + env: NeonEnv, entries: int, tenant: TenantId, offset: Optional[int] = None +) -> Lsn: + if offset is None: + offset = 0 + with env.endpoints.create_start("main", tenant_id=tenant) as ep: conn = await ep.connect_async() try: await conn.execute("CREATE TABLE IF NOT EXISTS t(key serial primary key, value text)") await conn.execute( - f"INSERT INTO t SELECT i, CONCAT('payload_', i) FROM generate_series(0,{entries}) as i" + f"INSERT INTO t SELECT i, CONCAT('payload_', i) FROM generate_series({offset},{entries}) as i" ) finally: await conn.close(timeout=10) last_flush_lsn = Lsn(ep.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) - return tenant, timeline, last_flush_lsn + return last_flush_lsn + + +async def run_worker(env: NeonEnv, tenant_conf, entries: int) -> Tuple[TenantId, TimelineId, Lsn]: + tenant, timeline = env.neon_cli.create_tenant(conf=tenant_conf) + last_flush_lsn = await run_worker_for_tenant(env, entries, tenant) + return tenant, timeline, last_flush_lsn async def workload( @@ -89,7 +100,9 @@ def assert_dirty_bytes(env, v): def assert_dirty_bytes_nonzero(env): - assert get_dirty_bytes(env) > 0 + dirty_bytes = get_dirty_bytes(env) + assert dirty_bytes > 0 + return dirty_bytes @pytest.mark.parametrize("immediate_shutdown", [True, False]) @@ -182,6 +195,31 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder): log.info("Waiting for background checkpoints...") wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) # type: ignore + # The code below verifies that we do not flush on the first write + # after an idle period longer than the checkpoint timeout. + + # Sit quietly for longer than the checkpoint timeout + time.sleep(CHECKPOINT_TIMEOUT_SECONDS + CHECKPOINT_TIMEOUT_SECONDS / 2) + + # Restart the safekeepers and write a bit of extra data into one tenant + for sk in env.safekeepers: + sk.start() + + tenant_with_extra_writes = last_flush_lsns[0][0] + asyncio.run( + run_worker_for_tenant(env, 5, tenant_with_extra_writes, offset=ENTRIES_PER_TIMELINE) + ) + + dirty_after_write = wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) # type: ignore + + # We shouldn't flush since we've just opened a new layer + waited_for = 0 + while waited_for < CHECKPOINT_TIMEOUT_SECONDS // 4: + time.sleep(5) + waited_for += 5 + + assert get_dirty_bytes(env) >= dirty_after_write + @pytest.mark.skipif( # We have to use at least ~100MB of data to hit the lowest limit we can configure, which is diff --git a/test_runner/regress/test_proxy_rate_limiter.py b/test_runner/regress/test_proxy_rate_limiter.py deleted file mode 100644 index f39f0cad07..0000000000 --- a/test_runner/regress/test_proxy_rate_limiter.py +++ /dev/null @@ -1,84 +0,0 @@ -import asyncio -import time -from pathlib import Path -from typing import Iterator - -import pytest -from fixtures.neon_fixtures import ( - PSQL, - NeonProxy, -) -from fixtures.port_distributor import PortDistributor -from pytest_httpserver import HTTPServer -from werkzeug.wrappers.response import Response - - -def waiting_handler(status_code: int) -> Response: - # wait more than timeout to make sure that both (two) connections are open. - # It would be better to use a barrier here, but I don't know how to do that together with pytest-httpserver. - time.sleep(2) - return Response(status=status_code) - - -@pytest.fixture(scope="function") -def proxy_with_rate_limit( - port_distributor: PortDistributor, - neon_binpath: Path, - httpserver_listen_address, - test_output_dir: Path, -) -> Iterator[NeonProxy]: - """Neon proxy that routes directly to vanilla postgres.""" - - proxy_port = port_distributor.get_port() - mgmt_port = port_distributor.get_port() - http_port = port_distributor.get_port() - external_http_port = port_distributor.get_port() - (host, port) = httpserver_listen_address - endpoint = f"http://{host}:{port}/billing/api/v1/usage_events" - - with NeonProxy( - neon_binpath=neon_binpath, - test_output_dir=test_output_dir, - proxy_port=proxy_port, - http_port=http_port, - mgmt_port=mgmt_port, - external_http_port=external_http_port, - auth_backend=NeonProxy.Console(endpoint, fixed_rate_limit=5), - ) as proxy: - proxy.start() - yield proxy - - -@pytest.mark.asyncio -async def test_proxy_rate_limit( - httpserver: HTTPServer, - proxy_with_rate_limit: NeonProxy, -): - uri = "/billing/api/v1/usage_events/proxy_get_role_secret" - # mock control plane service - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: Response(status=200) - ) - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: waiting_handler(429) - ) - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: waiting_handler(500) - ) - - psql = PSQL(host=proxy_with_rate_limit.host, port=proxy_with_rate_limit.proxy_port) - f = await psql.run("select 42;") - await proxy_with_rate_limit.find_auth_link(uri, f) - # Limit should be 2. - - # Run two queries in parallel. - f1, f2 = await asyncio.gather(psql.run("select 42;"), psql.run("select 42;")) - await proxy_with_rate_limit.find_auth_link(uri, f1) - await proxy_with_rate_limit.find_auth_link(uri, f2) - - # Now limit should be 0. - f = await psql.run("select 42;") - await proxy_with_rate_limit.find_auth_link(uri, f) - - # There last query shouldn't reach the http-server. - assert httpserver.assertions == [] diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index a7b4c66156..d9149dc59a 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit a7b4c66156bce00afa60e5592d4284ba9e40b4cf +Subproject commit d9149dc59abcbeeb26293707509aef51752db28f diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 64b8c7bccc..85d809c124 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 64b8c7bccc6b77e04795e2d4cf6ad82dc8d987ed +Subproject commit 85d809c124a898847a97d66a211f7d5ef4f8e0cb diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 3946b2e2ea..261497dd63 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 3946b2e2ea71d07af092099cb5bcae76a69b90d6 +Subproject commit 261497dd63ace434045058b1453bcbaaa83f23e5 diff --git a/vendor/revisions.json b/vendor/revisions.json index 75dc095168..dfc0aa04c3 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "postgres-v16": "3946b2e2ea71d07af092099cb5bcae76a69b90d6", - "postgres-v15": "64b8c7bccc6b77e04795e2d4cf6ad82dc8d987ed", - "postgres-v14": "a7b4c66156bce00afa60e5592d4284ba9e40b4cf" + "postgres-v16": "261497dd63ace434045058b1453bcbaaa83f23e5", + "postgres-v15": "85d809c124a898847a97d66a211f7d5ef4f8e0cb", + "postgres-v14": "d9149dc59abcbeeb26293707509aef51752db28f" } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 7b8228a082..d6e2cc2996 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -37,8 +37,7 @@ futures-io = { version = "0.3" } futures-sink = { version = "0.3" } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } getrandom = { version = "0.2", default-features = false, features = ["std"] } -hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", features = ["raw"] } -hashbrown-594e8ee84c453af0 = { package = "hashbrown", version = "0.13", features = ["raw"] } +hashbrown = { version = "0.14", features = ["raw"] } hex = { version = "0.4", features = ["serde"] } hmac = { version = "0.12", default-features = false, features = ["reset"] } hyper = { version = "0.14", features = ["full"] } @@ -64,7 +63,7 @@ scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = ["raw_value"] } sha2 = { version = "0.10", features = ["asm"] } -smallvec = { version = "1", default-features = false, features = ["write"] } +smallvec = { version = "1", default-features = false, features = ["const_new", "write"] } subtle = { version = "2" } time = { version = "0.3", features = ["local-offset", "macros", "serde-well-known"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] } @@ -76,7 +75,6 @@ tonic = { version = "0.9", features = ["tls-roots"] } tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "timeout", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } -tungstenite = { version = "0.20" } url = { version = "2", features = ["serde"] } uuid = { version = "1", features = ["serde", "v4", "v7"] } zeroize = { version = "1", features = ["derive"] } @@ -91,7 +89,7 @@ cc = { version = "1", default-features = false, features = ["parallel"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] } either = { version = "1" } getrandom = { version = "0.2", default-features = false, features = ["std"] } -hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", features = ["raw"] } +hashbrown = { version = "0.14", features = ["raw"] } indexmap = { version = "1", default-features = false, features = ["std"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits", "use_std"] }